1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "cgraph.h"
51 #include "gimple.h"
52 #include "dwarf2.h"
53 #include "df.h"
54 #include "tm-constrs.h"
55 #include "params.h"
56 #include "cselib.h"
57 #include "debug.h"
58 #include "sched-int.h"
59 #include "sbitmap.h"
60 #include "fibheap.h"
61 #include "opts.h"
62 #include "diagnostic.h"
63
64 enum upper_128bits_state
65 {
66 unknown = 0,
67 unused,
68 used
69 };
70
71 typedef struct block_info_def
72 {
73 /* State of the upper 128bits of AVX registers at exit. */
74 enum upper_128bits_state state;
75 /* TRUE if state of the upper 128bits of AVX registers is unchanged
76 in this block. */
77 bool unchanged;
78 /* TRUE if block has been processed. */
79 bool processed;
80 /* TRUE if block has been scanned. */
81 bool scanned;
82 /* Previous state of the upper 128bits of AVX registers at entry. */
83 enum upper_128bits_state prev;
84 } *block_info;
85
86 #define BLOCK_INFO(B) ((block_info) (B)->aux)
87
88 enum call_avx256_state
89 {
90 /* Callee returns 256bit AVX register. */
91 callee_return_avx256 = -1,
92 /* Callee returns and passes 256bit AVX register. */
93 callee_return_pass_avx256,
94 /* Callee passes 256bit AVX register. */
95 callee_pass_avx256,
 96    /* Callee neither returns nor passes a 256bit AVX register, or there is
 97       no 256bit AVX register in the function return.  */
98 call_no_avx256,
99 /* vzeroupper intrinsic. */
100 vzeroupper_intrinsic
101 };
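
/* Editor's note (informal, inferred from the code below rather than stated
   in the original sources): the code that emits a vzeroupper pattern encodes
   one of the values above as the first operand of its UNSPEC_VOLATILE, and
   move_or_delete_vzeroupper_2 reads it back with
   INTVAL (XVECEXP (pat, 0, 0)) to decide whether that vzeroupper is
   redundant, must be removed, or has to stay.  */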
102
103 /* Check if a 256bit AVX register is referenced in stores. */
104
105 static void
106 check_avx256_stores (rtx dest, const_rtx set, void *data)
107 {
108 if ((REG_P (dest)
109 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
110 || (GET_CODE (set) == SET
111 && REG_P (SET_SRC (set))
112 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
113 {
114 enum upper_128bits_state *state
115 = (enum upper_128bits_state *) data;
116 *state = used;
117 }
118 }
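
/* Editor's note (informal): check_avx256_stores is not called on insns
   directly; it is handed to note_stores, which invokes it once for every
   store expression in an insn pattern, as done further below:

     note_stores (pat, check_avx256_stores, &state);

   so STATE flips to "used" as soon as any store involves a 256bit AVX
   register.  */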
119
120 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
121 in basic block BB. Delete it if upper 128bit AVX registers are
122 unused. If it isn't deleted, move it to just before a jump insn.
123
124 STATE is state of the upper 128bits of AVX registers at entry. */
125
126 static void
127 move_or_delete_vzeroupper_2 (basic_block bb,
128 enum upper_128bits_state state)
129 {
130 rtx insn, bb_end;
131 rtx vzeroupper_insn = NULL_RTX;
132 rtx pat;
133 int avx256;
134 bool unchanged;
135
136 if (BLOCK_INFO (bb)->unchanged)
137 {
138 if (dump_file)
139 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
140 bb->index, state);
141
142 BLOCK_INFO (bb)->state = state;
143 return;
144 }
145
146 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
147 {
148 if (dump_file)
149 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
150 bb->index, BLOCK_INFO (bb)->state);
151 return;
152 }
153
154 BLOCK_INFO (bb)->prev = state;
155
156 if (dump_file)
157 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
158 bb->index, state);
159
160 unchanged = true;
161
162 /* BB_END changes when it is deleted. */
163 bb_end = BB_END (bb);
164 insn = BB_HEAD (bb);
165 while (insn != bb_end)
166 {
167 insn = NEXT_INSN (insn);
168
169 if (!NONDEBUG_INSN_P (insn))
170 continue;
171
172 /* Move vzeroupper before jump/call. */
173 if (JUMP_P (insn) || CALL_P (insn))
174 {
175 if (!vzeroupper_insn)
176 continue;
177
178 if (PREV_INSN (insn) != vzeroupper_insn)
179 {
180 if (dump_file)
181 {
182 fprintf (dump_file, "Move vzeroupper after:\n");
183 print_rtl_single (dump_file, PREV_INSN (insn));
184 fprintf (dump_file, "before:\n");
185 print_rtl_single (dump_file, insn);
186 }
187 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
188 PREV_INSN (insn));
189 }
190 vzeroupper_insn = NULL_RTX;
191 continue;
192 }
193
194 pat = PATTERN (insn);
195
196 /* Check insn for vzeroupper intrinsic. */
197 if (GET_CODE (pat) == UNSPEC_VOLATILE
198 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
199 {
200 if (dump_file)
201 {
202 /* Found vzeroupper intrinsic. */
203 fprintf (dump_file, "Found vzeroupper:\n");
204 print_rtl_single (dump_file, insn);
205 }
206 }
207 else
208 {
209 /* Check insn for vzeroall intrinsic. */
210 if (GET_CODE (pat) == PARALLEL
211 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
212 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
213 {
214 state = unused;
215 unchanged = false;
216
217 /* Delete pending vzeroupper insertion. */
218 if (vzeroupper_insn)
219 {
220 delete_insn (vzeroupper_insn);
221 vzeroupper_insn = NULL_RTX;
222 }
223 }
224 else if (state != used)
225 {
226 note_stores (pat, check_avx256_stores, &state);
227 if (state == used)
228 unchanged = false;
229 }
230 continue;
231 }
232
233 /* Process vzeroupper intrinsic. */
234 avx256 = INTVAL (XVECEXP (pat, 0, 0));
235
236 if (state == unused)
237 {
238 /* Since the upper 128bits are cleared, callee must not pass
239 256bit AVX register. We only need to check if callee
240 returns 256bit AVX register. */
241 if (avx256 == callee_return_avx256)
242 {
243 state = used;
244 unchanged = false;
245 }
246
247 /* Remove unnecessary vzeroupper since upper 128bits are
248 cleared. */
249 if (dump_file)
250 {
251 fprintf (dump_file, "Delete redundant vzeroupper:\n");
252 print_rtl_single (dump_file, insn);
253 }
254 delete_insn (insn);
255 }
256 else
257 {
258 /* Set state to UNUSED if callee doesn't return 256bit AVX
259 register. */
260 if (avx256 != callee_return_pass_avx256)
261 state = unused;
262
263 if (avx256 == callee_return_pass_avx256
264 || avx256 == callee_pass_avx256)
265 {
266 /* Must remove vzeroupper since callee passes in 256bit
267 AVX register. */
268 if (dump_file)
269 {
270 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
271 print_rtl_single (dump_file, insn);
272 }
273 delete_insn (insn);
274 }
275 else
276 {
277 vzeroupper_insn = insn;
278 unchanged = false;
279 }
280 }
281 }
282
283 BLOCK_INFO (bb)->state = state;
284 BLOCK_INFO (bb)->unchanged = unchanged;
285 BLOCK_INFO (bb)->scanned = true;
286
287 if (dump_file)
288 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
289 bb->index, unchanged ? "unchanged" : "changed",
290 state);
291 }
292
293 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
294 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
 295    as UNUSED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
296 state is changed. */
297
298 static bool
299 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
300 {
301 edge e;
302 edge_iterator ei;
303 enum upper_128bits_state state, old_state, new_state;
304 bool seen_unknown;
305
306 if (dump_file)
307 fprintf (dump_file, " Process [bb %i]: status: %d\n",
308 block->index, BLOCK_INFO (block)->processed);
309
310 if (BLOCK_INFO (block)->processed)
311 return false;
312
313 state = unused;
314
315 /* Check all predecessor edges of this block. */
316 seen_unknown = false;
317 FOR_EACH_EDGE (e, ei, block->preds)
318 {
319 if (e->src == block)
320 continue;
321 switch (BLOCK_INFO (e->src)->state)
322 {
323 case unknown:
324 if (!unknown_is_unused)
325 seen_unknown = true;
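	      /* FALLTHRU: an unknown predecessor is otherwise handled
		 exactly like an unused one.  */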
326 case unused:
327 break;
328 case used:
329 state = used;
330 goto done;
331 }
332 }
333
334 if (seen_unknown)
335 state = unknown;
336
337 done:
338 old_state = BLOCK_INFO (block)->state;
339 move_or_delete_vzeroupper_2 (block, state);
340 new_state = BLOCK_INFO (block)->state;
341
342 if (state != unknown || new_state == used)
343 BLOCK_INFO (block)->processed = true;
344
345 /* Need to rescan if the upper 128bits of AVX registers are changed
346 to USED at exit. */
347 if (new_state != old_state)
348 {
349 if (new_state == used)
350 cfun->machine->rescan_vzeroupper_p = 1;
351 return true;
352 }
353 else
354 return false;
355 }
356
357 /* Go through the instruction stream looking for vzeroupper. Delete
358 it if upper 128bit AVX registers are unused. If it isn't deleted,
359 move it to just before a jump insn. */
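
/* Editor's summary of the implementation below (informal, derived from the
   code itself): this is an iterative forward data-flow walk over the CFG.
   Blocks are ranked by reverse completion order; WORKLIST holds the blocks
   of the current round, while PENDING collects blocks whose state changed
   and must be revisited in the next round.  The outer loop stops once a
   round finishes without cfun->machine->rescan_vzeroupper_p being set.  */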
360
361 static void
362 move_or_delete_vzeroupper (void)
363 {
364 edge e;
365 edge_iterator ei;
366 basic_block bb;
367 fibheap_t worklist, pending, fibheap_swap;
368 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
369 int *bb_order;
370 int *rc_order;
371 int i;
372
373 /* Set up block info for each basic block. */
374 alloc_aux_for_blocks (sizeof (struct block_info_def));
375
376 /* Process outgoing edges of entry point. */
377 if (dump_file)
378 fprintf (dump_file, "Process outgoing edges of entry point\n");
379
380 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
381 {
382 move_or_delete_vzeroupper_2 (e->dest,
383 cfun->machine->caller_pass_avx256_p
384 ? used : unused);
385 BLOCK_INFO (e->dest)->processed = true;
386 }
387
388 /* Compute reverse completion order of depth first search of the CFG
389 so that the data-flow runs faster. */
390 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
391 bb_order = XNEWVEC (int, last_basic_block);
392 pre_and_rev_post_order_compute (NULL, rc_order, false);
393 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
394 bb_order[rc_order[i]] = i;
395 free (rc_order);
396
397 worklist = fibheap_new ();
398 pending = fibheap_new ();
399 visited = sbitmap_alloc (last_basic_block);
400 in_worklist = sbitmap_alloc (last_basic_block);
401 in_pending = sbitmap_alloc (last_basic_block);
402 sbitmap_zero (in_worklist);
403
404 /* Don't check outgoing edges of entry point. */
405 sbitmap_ones (in_pending);
406 FOR_EACH_BB (bb)
407 if (BLOCK_INFO (bb)->processed)
408 RESET_BIT (in_pending, bb->index);
409 else
410 {
411 move_or_delete_vzeroupper_1 (bb, false);
412 fibheap_insert (pending, bb_order[bb->index], bb);
413 }
414
415 if (dump_file)
416 fprintf (dump_file, "Check remaining basic blocks\n");
417
418 while (!fibheap_empty (pending))
419 {
420 fibheap_swap = pending;
421 pending = worklist;
422 worklist = fibheap_swap;
423 sbitmap_swap = in_pending;
424 in_pending = in_worklist;
425 in_worklist = sbitmap_swap;
426
427 sbitmap_zero (visited);
428
429 cfun->machine->rescan_vzeroupper_p = 0;
430
431 while (!fibheap_empty (worklist))
432 {
433 bb = (basic_block) fibheap_extract_min (worklist);
434 RESET_BIT (in_worklist, bb->index);
435 gcc_assert (!TEST_BIT (visited, bb->index));
436 if (!TEST_BIT (visited, bb->index))
437 {
438 edge_iterator ei;
439
440 SET_BIT (visited, bb->index);
441
442 if (move_or_delete_vzeroupper_1 (bb, false))
443 FOR_EACH_EDGE (e, ei, bb->succs)
444 {
445 if (e->dest == EXIT_BLOCK_PTR
446 || BLOCK_INFO (e->dest)->processed)
447 continue;
448
449 if (TEST_BIT (visited, e->dest->index))
450 {
451 if (!TEST_BIT (in_pending, e->dest->index))
452 {
453 /* Send E->DEST to next round. */
454 SET_BIT (in_pending, e->dest->index);
455 fibheap_insert (pending,
456 bb_order[e->dest->index],
457 e->dest);
458 }
459 }
460 else if (!TEST_BIT (in_worklist, e->dest->index))
461 {
462 /* Add E->DEST to current round. */
463 SET_BIT (in_worklist, e->dest->index);
464 fibheap_insert (worklist, bb_order[e->dest->index],
465 e->dest);
466 }
467 }
468 }
469 }
470
471 if (!cfun->machine->rescan_vzeroupper_p)
472 break;
473 }
474
475 free (bb_order);
476 fibheap_delete (worklist);
477 fibheap_delete (pending);
478 sbitmap_free (visited);
479 sbitmap_free (in_worklist);
480 sbitmap_free (in_pending);
481
482 if (dump_file)
483 fprintf (dump_file, "Process remaining basic blocks\n");
484
485 FOR_EACH_BB (bb)
486 move_or_delete_vzeroupper_1 (bb, true);
487
488 free_aux_for_blocks ();
489 }
490
491 static rtx legitimize_dllimport_symbol (rtx, bool);
492
493 #ifndef CHECK_STACK_LIMIT
494 #define CHECK_STACK_LIMIT (-1)
495 #endif
496
497 /* Return index of given mode in mult and division cost tables. */
498 #define MODE_INDEX(mode) \
499 ((mode) == QImode ? 0 \
500 : (mode) == HImode ? 1 \
501 : (mode) == SImode ? 2 \
502 : (mode) == DImode ? 3 \
503 : 4)
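
/* Illustrative use only (editor's example; the divide[] member name is
   assumed from struct processor_costs in i386.h): the SImode divide cost of
   the active tuning would be looked up roughly as

     ix86_cost->divide[MODE_INDEX (SImode)]

   i.e. entry 2 of the five-entry array, while index 4 ("other") catches any
   mode that is none of QImode, HImode, SImode or DImode.  */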
504
505 /* Processor costs (relative to an add) */
506 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
507 #define COSTS_N_BYTES(N) ((N) * 2)
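
/* Worked example (editor's note, using the assumption stated above that
   COSTS_N_INSNS (N) expands to (N) * 4): COSTS_N_BYTES (2) == 4
   == COSTS_N_INSNS (1), so a 2-byte add is charged exactly one "average
   insn" unit and a 3-byte lea one and a half.  */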
508
509 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
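
/* Editor's note on the string-operation descriptors used throughout the
   cost tables (an informal reading of the initializers, not a spec): each
   descriptor is a list of {max_size, algorithm} pairs, where the algorithm
   is used for blocks of up to max_size bytes and a max_size of -1 covers
   all larger or unknown sizes; DUMMY_STRINGOP_ALGS appears to be a
   placeholder for the table slot that is not tuned, falling back to a
   libcall for everything.  */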
510
511 const
512 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
513 COSTS_N_BYTES (2), /* cost of an add instruction */
514 COSTS_N_BYTES (3), /* cost of a lea instruction */
515 COSTS_N_BYTES (2), /* variable shift costs */
516 COSTS_N_BYTES (3), /* constant shift costs */
517 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
518 COSTS_N_BYTES (3), /* HI */
519 COSTS_N_BYTES (3), /* SI */
520 COSTS_N_BYTES (3), /* DI */
521 COSTS_N_BYTES (5)}, /* other */
522 0, /* cost of multiply per each bit set */
523 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
524 COSTS_N_BYTES (3), /* HI */
525 COSTS_N_BYTES (3), /* SI */
526 COSTS_N_BYTES (3), /* DI */
527 COSTS_N_BYTES (5)}, /* other */
528 COSTS_N_BYTES (3), /* cost of movsx */
529 COSTS_N_BYTES (3), /* cost of movzx */
530 0, /* "large" insn */
531 2, /* MOVE_RATIO */
532 2, /* cost for loading QImode using movzbl */
533 {2, 2, 2}, /* cost of loading integer registers
534 in QImode, HImode and SImode.
535 Relative to reg-reg move (2). */
536 {2, 2, 2}, /* cost of storing integer registers */
537 2, /* cost of reg,reg fld/fst */
538 {2, 2, 2}, /* cost of loading fp registers
539 in SFmode, DFmode and XFmode */
540 {2, 2, 2}, /* cost of storing fp registers
541 in SFmode, DFmode and XFmode */
542 3, /* cost of moving MMX register */
543 {3, 3}, /* cost of loading MMX registers
544 in SImode and DImode */
545 {3, 3}, /* cost of storing MMX registers
546 in SImode and DImode */
547 3, /* cost of moving SSE register */
548 {3, 3, 3}, /* cost of loading SSE registers
549 in SImode, DImode and TImode */
550 {3, 3, 3}, /* cost of storing SSE registers
551 in SImode, DImode and TImode */
552 3, /* MMX or SSE register to integer */
553 0, /* size of l1 cache */
554 0, /* size of l2 cache */
555 0, /* size of prefetch block */
556 0, /* number of parallel prefetches */
557 2, /* Branch cost */
558 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
559 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
560 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
561 COSTS_N_BYTES (2), /* cost of FABS instruction. */
562 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
563 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
564 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
565 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 1, /* scalar_stmt_cost. */
569 1, /* scalar load_cost. */
570 1, /* scalar_store_cost. */
571 1, /* vec_stmt_cost. */
572 1, /* vec_to_scalar_cost. */
573 1, /* scalar_to_vec_cost. */
574 1, /* vec_align_load_cost. */
575 1, /* vec_unalign_load_cost. */
576 1, /* vec_store_cost. */
577 1, /* cond_taken_branch_cost. */
578 1, /* cond_not_taken_branch_cost. */
579 };
580
581 /* Processor costs (relative to an add) */
582 static const
583 struct processor_costs i386_cost = { /* 386 specific costs */
584 COSTS_N_INSNS (1), /* cost of an add instruction */
585 COSTS_N_INSNS (1), /* cost of a lea instruction */
586 COSTS_N_INSNS (3), /* variable shift costs */
587 COSTS_N_INSNS (2), /* constant shift costs */
588 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
589 COSTS_N_INSNS (6), /* HI */
590 COSTS_N_INSNS (6), /* SI */
591 COSTS_N_INSNS (6), /* DI */
592 COSTS_N_INSNS (6)}, /* other */
593 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
594 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
595 COSTS_N_INSNS (23), /* HI */
596 COSTS_N_INSNS (23), /* SI */
597 COSTS_N_INSNS (23), /* DI */
598 COSTS_N_INSNS (23)}, /* other */
599 COSTS_N_INSNS (3), /* cost of movsx */
600 COSTS_N_INSNS (2), /* cost of movzx */
601 15, /* "large" insn */
602 3, /* MOVE_RATIO */
603 4, /* cost for loading QImode using movzbl */
604 {2, 4, 2}, /* cost of loading integer registers
605 in QImode, HImode and SImode.
606 Relative to reg-reg move (2). */
607 {2, 4, 2}, /* cost of storing integer registers */
608 2, /* cost of reg,reg fld/fst */
609 {8, 8, 8}, /* cost of loading fp registers
610 in SFmode, DFmode and XFmode */
611 {8, 8, 8}, /* cost of storing fp registers
612 in SFmode, DFmode and XFmode */
613 2, /* cost of moving MMX register */
614 {4, 8}, /* cost of loading MMX registers
615 in SImode and DImode */
616 {4, 8}, /* cost of storing MMX registers
617 in SImode and DImode */
618 2, /* cost of moving SSE register */
619 {4, 8, 16}, /* cost of loading SSE registers
620 in SImode, DImode and TImode */
621 {4, 8, 16}, /* cost of storing SSE registers
622 in SImode, DImode and TImode */
623 3, /* MMX or SSE register to integer */
624 0, /* size of l1 cache */
625 0, /* size of l2 cache */
626 0, /* size of prefetch block */
627 0, /* number of parallel prefetches */
628 1, /* Branch cost */
629 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
630 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
631 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
632 COSTS_N_INSNS (22), /* cost of FABS instruction. */
633 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
634 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
635 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
636 DUMMY_STRINGOP_ALGS},
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 1, /* scalar_stmt_cost. */
640 1, /* scalar load_cost. */
641 1, /* scalar_store_cost. */
642 1, /* vec_stmt_cost. */
643 1, /* vec_to_scalar_cost. */
644 1, /* scalar_to_vec_cost. */
645 1, /* vec_align_load_cost. */
646 2, /* vec_unalign_load_cost. */
647 1, /* vec_store_cost. */
648 3, /* cond_taken_branch_cost. */
649 1, /* cond_not_taken_branch_cost. */
650 };
651
652 static const
653 struct processor_costs i486_cost = { /* 486 specific costs */
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (1), /* cost of a lea instruction */
656 COSTS_N_INSNS (3), /* variable shift costs */
657 COSTS_N_INSNS (2), /* constant shift costs */
658 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (12), /* HI */
660 COSTS_N_INSNS (12), /* SI */
661 COSTS_N_INSNS (12), /* DI */
662 COSTS_N_INSNS (12)}, /* other */
663 1, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (40), /* HI */
666 COSTS_N_INSNS (40), /* SI */
667 COSTS_N_INSNS (40), /* DI */
668 COSTS_N_INSNS (40)}, /* other */
669 COSTS_N_INSNS (3), /* cost of movsx */
670 COSTS_N_INSNS (2), /* cost of movzx */
671 15, /* "large" insn */
672 3, /* MOVE_RATIO */
673 4, /* cost for loading QImode using movzbl */
674 {2, 4, 2}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {2, 4, 2}, /* cost of storing integer registers */
678 2, /* cost of reg,reg fld/fst */
679 {8, 8, 8}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {8, 8, 8}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {4, 8}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {4, 8}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {4, 8, 16}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {4, 8, 16}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 3, /* MMX or SSE register to integer */
694 4, /* size of l1 cache. 486 has 8kB cache
695 shared for code and data, so 4kB is
696 not really precise. */
697 4, /* size of l2 cache */
698 0, /* size of prefetch block */
699 0, /* number of parallel prefetches */
700 1, /* Branch cost */
701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
702 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
703 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
704 COSTS_N_INSNS (3), /* cost of FABS instruction. */
705 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
706 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
707 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
708 DUMMY_STRINGOP_ALGS},
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 1, /* scalar_stmt_cost. */
712 1, /* scalar load_cost. */
713 1, /* scalar_store_cost. */
714 1, /* vec_stmt_cost. */
715 1, /* vec_to_scalar_cost. */
716 1, /* scalar_to_vec_cost. */
717 1, /* vec_align_load_cost. */
718 2, /* vec_unalign_load_cost. */
719 1, /* vec_store_cost. */
720 3, /* cond_taken_branch_cost. */
721 1, /* cond_not_taken_branch_cost. */
722 };
723
724 static const
725 struct processor_costs pentium_cost = {
726 COSTS_N_INSNS (1), /* cost of an add instruction */
727 COSTS_N_INSNS (1), /* cost of a lea instruction */
728 COSTS_N_INSNS (4), /* variable shift costs */
729 COSTS_N_INSNS (1), /* constant shift costs */
730 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
731 COSTS_N_INSNS (11), /* HI */
732 COSTS_N_INSNS (11), /* SI */
733 COSTS_N_INSNS (11), /* DI */
734 COSTS_N_INSNS (11)}, /* other */
735 0, /* cost of multiply per each bit set */
736 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
737 COSTS_N_INSNS (25), /* HI */
738 COSTS_N_INSNS (25), /* SI */
739 COSTS_N_INSNS (25), /* DI */
740 COSTS_N_INSNS (25)}, /* other */
741 COSTS_N_INSNS (3), /* cost of movsx */
742 COSTS_N_INSNS (2), /* cost of movzx */
743 8, /* "large" insn */
744 6, /* MOVE_RATIO */
745 6, /* cost for loading QImode using movzbl */
746 {2, 4, 2}, /* cost of loading integer registers
747 in QImode, HImode and SImode.
748 Relative to reg-reg move (2). */
749 {2, 4, 2}, /* cost of storing integer registers */
750 2, /* cost of reg,reg fld/fst */
751 {2, 2, 6}, /* cost of loading fp registers
752 in SFmode, DFmode and XFmode */
753 {4, 4, 6}, /* cost of storing fp registers
754 in SFmode, DFmode and XFmode */
755 8, /* cost of moving MMX register */
756 {8, 8}, /* cost of loading MMX registers
757 in SImode and DImode */
758 {8, 8}, /* cost of storing MMX registers
759 in SImode and DImode */
760 2, /* cost of moving SSE register */
761 {4, 8, 16}, /* cost of loading SSE registers
762 in SImode, DImode and TImode */
763 {4, 8, 16}, /* cost of storing SSE registers
764 in SImode, DImode and TImode */
765 3, /* MMX or SSE register to integer */
766 8, /* size of l1 cache. */
767 8, /* size of l2 cache */
768 0, /* size of prefetch block */
769 0, /* number of parallel prefetches */
770 2, /* Branch cost */
771 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
772 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
773 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
774 COSTS_N_INSNS (1), /* cost of FABS instruction. */
775 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
776 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
777 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
778 DUMMY_STRINGOP_ALGS},
779 {{libcall, {{-1, rep_prefix_4_byte}}},
780 DUMMY_STRINGOP_ALGS},
781 1, /* scalar_stmt_cost. */
782 1, /* scalar load_cost. */
783 1, /* scalar_store_cost. */
784 1, /* vec_stmt_cost. */
785 1, /* vec_to_scalar_cost. */
786 1, /* scalar_to_vec_cost. */
787 1, /* vec_align_load_cost. */
788 2, /* vec_unalign_load_cost. */
789 1, /* vec_store_cost. */
790 3, /* cond_taken_branch_cost. */
791 1, /* cond_not_taken_branch_cost. */
792 };
793
794 static const
795 struct processor_costs pentiumpro_cost = {
796 COSTS_N_INSNS (1), /* cost of an add instruction */
797 COSTS_N_INSNS (1), /* cost of a lea instruction */
798 COSTS_N_INSNS (1), /* variable shift costs */
799 COSTS_N_INSNS (1), /* constant shift costs */
800 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
801 COSTS_N_INSNS (4), /* HI */
802 COSTS_N_INSNS (4), /* SI */
803 COSTS_N_INSNS (4), /* DI */
804 COSTS_N_INSNS (4)}, /* other */
805 0, /* cost of multiply per each bit set */
806 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
807 COSTS_N_INSNS (17), /* HI */
808 COSTS_N_INSNS (17), /* SI */
809 COSTS_N_INSNS (17), /* DI */
810 COSTS_N_INSNS (17)}, /* other */
811 COSTS_N_INSNS (1), /* cost of movsx */
812 COSTS_N_INSNS (1), /* cost of movzx */
813 8, /* "large" insn */
814 6, /* MOVE_RATIO */
815 2, /* cost for loading QImode using movzbl */
816 {4, 4, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 {2, 2, 2}, /* cost of storing integer registers */
820 2, /* cost of reg,reg fld/fst */
821 {2, 2, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 {4, 4, 6}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 2, /* cost of moving MMX register */
826 {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 2, /* cost of moving SSE register */
831 {2, 2, 8}, /* cost of loading SSE registers
832 in SImode, DImode and TImode */
833 {2, 2, 8}, /* cost of storing SSE registers
834 in SImode, DImode and TImode */
835 3, /* MMX or SSE register to integer */
836 8, /* size of l1 cache. */
837 256, /* size of l2 cache */
838 32, /* size of prefetch block */
839 6, /* number of parallel prefetches */
840 2, /* Branch cost */
841 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
842 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
843 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
844 COSTS_N_INSNS (2), /* cost of FABS instruction. */
845 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
846 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
847 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 848      (we ensure the alignment).  For small blocks an inline loop is still a
 849      noticeable win, for bigger blocks either rep movsl or rep movsb is the
 850      way to go.  Rep movsb apparently has a more expensive startup time in the CPU,
851 but after 4K the difference is down in the noise. */
852 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
853 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
854 DUMMY_STRINGOP_ALGS},
855 {{rep_prefix_4_byte, {{1024, unrolled_loop},
856 {8192, rep_prefix_4_byte}, {-1, libcall}}},
857 DUMMY_STRINGOP_ALGS},
858 1, /* scalar_stmt_cost. */
859 1, /* scalar load_cost. */
860 1, /* scalar_store_cost. */
861 1, /* vec_stmt_cost. */
862 1, /* vec_to_scalar_cost. */
863 1, /* scalar_to_vec_cost. */
864 1, /* vec_align_load_cost. */
865 2, /* vec_unalign_load_cost. */
866 1, /* vec_store_cost. */
867 3, /* cond_taken_branch_cost. */
868 1, /* cond_not_taken_branch_cost. */
869 };
870
871 static const
872 struct processor_costs geode_cost = {
873 COSTS_N_INSNS (1), /* cost of an add instruction */
874 COSTS_N_INSNS (1), /* cost of a lea instruction */
875 COSTS_N_INSNS (2), /* variable shift costs */
876 COSTS_N_INSNS (1), /* constant shift costs */
877 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
878 COSTS_N_INSNS (4), /* HI */
879 COSTS_N_INSNS (7), /* SI */
880 COSTS_N_INSNS (7), /* DI */
881 COSTS_N_INSNS (7)}, /* other */
882 0, /* cost of multiply per each bit set */
883 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
884 COSTS_N_INSNS (23), /* HI */
885 COSTS_N_INSNS (39), /* SI */
886 COSTS_N_INSNS (39), /* DI */
887 COSTS_N_INSNS (39)}, /* other */
888 COSTS_N_INSNS (1), /* cost of movsx */
889 COSTS_N_INSNS (1), /* cost of movzx */
890 8, /* "large" insn */
891 4, /* MOVE_RATIO */
892 1, /* cost for loading QImode using movzbl */
893 {1, 1, 1}, /* cost of loading integer registers
894 in QImode, HImode and SImode.
895 Relative to reg-reg move (2). */
896 {1, 1, 1}, /* cost of storing integer registers */
897 1, /* cost of reg,reg fld/fst */
898 {1, 1, 1}, /* cost of loading fp registers
899 in SFmode, DFmode and XFmode */
900 {4, 6, 6}, /* cost of storing fp registers
901 in SFmode, DFmode and XFmode */
902
903 1, /* cost of moving MMX register */
904 {1, 1}, /* cost of loading MMX registers
905 in SImode and DImode */
906 {1, 1}, /* cost of storing MMX registers
907 in SImode and DImode */
908 1, /* cost of moving SSE register */
909 {1, 1, 1}, /* cost of loading SSE registers
910 in SImode, DImode and TImode */
911 {1, 1, 1}, /* cost of storing SSE registers
912 in SImode, DImode and TImode */
913 1, /* MMX or SSE register to integer */
914 64, /* size of l1 cache. */
915 128, /* size of l2 cache. */
916 32, /* size of prefetch block */
917 1, /* number of parallel prefetches */
918 1, /* Branch cost */
919 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
920 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
921 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
922 COSTS_N_INSNS (1), /* cost of FABS instruction. */
923 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
924 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
925 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
926 DUMMY_STRINGOP_ALGS},
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 1, /* scalar_stmt_cost. */
930 1, /* scalar load_cost. */
931 1, /* scalar_store_cost. */
932 1, /* vec_stmt_cost. */
933 1, /* vec_to_scalar_cost. */
934 1, /* scalar_to_vec_cost. */
935 1, /* vec_align_load_cost. */
936 2, /* vec_unalign_load_cost. */
937 1, /* vec_store_cost. */
938 3, /* cond_taken_branch_cost. */
939 1, /* cond_not_taken_branch_cost. */
940 };
941
942 static const
943 struct processor_costs k6_cost = {
944 COSTS_N_INSNS (1), /* cost of an add instruction */
945 COSTS_N_INSNS (2), /* cost of a lea instruction */
946 COSTS_N_INSNS (1), /* variable shift costs */
947 COSTS_N_INSNS (1), /* constant shift costs */
948 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
949 COSTS_N_INSNS (3), /* HI */
950 COSTS_N_INSNS (3), /* SI */
951 COSTS_N_INSNS (3), /* DI */
952 COSTS_N_INSNS (3)}, /* other */
953 0, /* cost of multiply per each bit set */
954 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
955 COSTS_N_INSNS (18), /* HI */
956 COSTS_N_INSNS (18), /* SI */
957 COSTS_N_INSNS (18), /* DI */
958 COSTS_N_INSNS (18)}, /* other */
959 COSTS_N_INSNS (2), /* cost of movsx */
960 COSTS_N_INSNS (2), /* cost of movzx */
961 8, /* "large" insn */
962 4, /* MOVE_RATIO */
963 3, /* cost for loading QImode using movzbl */
964 {4, 5, 4}, /* cost of loading integer registers
965 in QImode, HImode and SImode.
966 Relative to reg-reg move (2). */
967 {2, 3, 2}, /* cost of storing integer registers */
968 4, /* cost of reg,reg fld/fst */
969 {6, 6, 6}, /* cost of loading fp registers
970 in SFmode, DFmode and XFmode */
971 {4, 4, 4}, /* cost of storing fp registers
972 in SFmode, DFmode and XFmode */
973 2, /* cost of moving MMX register */
974 {2, 2}, /* cost of loading MMX registers
975 in SImode and DImode */
976 {2, 2}, /* cost of storing MMX registers
977 in SImode and DImode */
978 2, /* cost of moving SSE register */
979 {2, 2, 8}, /* cost of loading SSE registers
980 in SImode, DImode and TImode */
981 {2, 2, 8}, /* cost of storing SSE registers
982 in SImode, DImode and TImode */
983 6, /* MMX or SSE register to integer */
984 32, /* size of l1 cache. */
985 32, /* size of l2 cache. Some models
986 have integrated l2 cache, but
987 optimizing for k6 is not important
988 enough to worry about that. */
989 32, /* size of prefetch block */
990 1, /* number of parallel prefetches */
991 1, /* Branch cost */
992 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
993 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
994 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
995 COSTS_N_INSNS (2), /* cost of FABS instruction. */
996 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
997 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
998 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
999 DUMMY_STRINGOP_ALGS},
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 1, /* scalar_stmt_cost. */
1003 1, /* scalar load_cost. */
1004 1, /* scalar_store_cost. */
1005 1, /* vec_stmt_cost. */
1006 1, /* vec_to_scalar_cost. */
1007 1, /* scalar_to_vec_cost. */
1008 1, /* vec_align_load_cost. */
1009 2, /* vec_unalign_load_cost. */
1010 1, /* vec_store_cost. */
1011 3, /* cond_taken_branch_cost. */
1012 1, /* cond_not_taken_branch_cost. */
1013 };
1014
1015 static const
1016 struct processor_costs athlon_cost = {
1017 COSTS_N_INSNS (1), /* cost of an add instruction */
1018 COSTS_N_INSNS (2), /* cost of a lea instruction */
1019 COSTS_N_INSNS (1), /* variable shift costs */
1020 COSTS_N_INSNS (1), /* constant shift costs */
1021 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1022 COSTS_N_INSNS (5), /* HI */
1023 COSTS_N_INSNS (5), /* SI */
1024 COSTS_N_INSNS (5), /* DI */
1025 COSTS_N_INSNS (5)}, /* other */
1026 0, /* cost of multiply per each bit set */
1027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1028 COSTS_N_INSNS (26), /* HI */
1029 COSTS_N_INSNS (42), /* SI */
1030 COSTS_N_INSNS (74), /* DI */
1031 COSTS_N_INSNS (74)}, /* other */
1032 COSTS_N_INSNS (1), /* cost of movsx */
1033 COSTS_N_INSNS (1), /* cost of movzx */
1034 8, /* "large" insn */
1035 9, /* MOVE_RATIO */
1036 4, /* cost for loading QImode using movzbl */
1037 {3, 4, 3}, /* cost of loading integer registers
1038 in QImode, HImode and SImode.
1039 Relative to reg-reg move (2). */
1040 {3, 4, 3}, /* cost of storing integer registers */
1041 4, /* cost of reg,reg fld/fst */
1042 {4, 4, 12}, /* cost of loading fp registers
1043 in SFmode, DFmode and XFmode */
1044 {6, 6, 8}, /* cost of storing fp registers
1045 in SFmode, DFmode and XFmode */
1046 2, /* cost of moving MMX register */
1047 {4, 4}, /* cost of loading MMX registers
1048 in SImode and DImode */
1049 {4, 4}, /* cost of storing MMX registers
1050 in SImode and DImode */
1051 2, /* cost of moving SSE register */
1052 {4, 4, 6}, /* cost of loading SSE registers
1053 in SImode, DImode and TImode */
1054 {4, 4, 5}, /* cost of storing SSE registers
1055 in SImode, DImode and TImode */
1056 5, /* MMX or SSE register to integer */
1057 64, /* size of l1 cache. */
1058 256, /* size of l2 cache. */
1059 64, /* size of prefetch block */
1060 6, /* number of parallel prefetches */
1061 5, /* Branch cost */
1062 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1063 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1064 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1065 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1066 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1067 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1068 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1069 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1070 128 bytes for memset. */
1071 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1072 DUMMY_STRINGOP_ALGS},
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 1, /* scalar_stmt_cost. */
1076 1, /* scalar load_cost. */
1077 1, /* scalar_store_cost. */
1078 1, /* vec_stmt_cost. */
1079 1, /* vec_to_scalar_cost. */
1080 1, /* scalar_to_vec_cost. */
1081 1, /* vec_align_load_cost. */
1082 2, /* vec_unalign_load_cost. */
1083 1, /* vec_store_cost. */
1084 3, /* cond_taken_branch_cost. */
1085 1, /* cond_not_taken_branch_cost. */
1086 };
1087
1088 static const
1089 struct processor_costs k8_cost = {
1090 COSTS_N_INSNS (1), /* cost of an add instruction */
1091 COSTS_N_INSNS (2), /* cost of a lea instruction */
1092 COSTS_N_INSNS (1), /* variable shift costs */
1093 COSTS_N_INSNS (1), /* constant shift costs */
1094 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1095 COSTS_N_INSNS (4), /* HI */
1096 COSTS_N_INSNS (3), /* SI */
1097 COSTS_N_INSNS (4), /* DI */
1098 COSTS_N_INSNS (5)}, /* other */
1099 0, /* cost of multiply per each bit set */
1100 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1101 COSTS_N_INSNS (26), /* HI */
1102 COSTS_N_INSNS (42), /* SI */
1103 COSTS_N_INSNS (74), /* DI */
1104 COSTS_N_INSNS (74)}, /* other */
1105 COSTS_N_INSNS (1), /* cost of movsx */
1106 COSTS_N_INSNS (1), /* cost of movzx */
1107 8, /* "large" insn */
1108 9, /* MOVE_RATIO */
1109 4, /* cost for loading QImode using movzbl */
1110 {3, 4, 3}, /* cost of loading integer registers
1111 in QImode, HImode and SImode.
1112 Relative to reg-reg move (2). */
1113 {3, 4, 3}, /* cost of storing integer registers */
1114 4, /* cost of reg,reg fld/fst */
1115 {4, 4, 12}, /* cost of loading fp registers
1116 in SFmode, DFmode and XFmode */
1117 {6, 6, 8}, /* cost of storing fp registers
1118 in SFmode, DFmode and XFmode */
1119 2, /* cost of moving MMX register */
1120 {3, 3}, /* cost of loading MMX registers
1121 in SImode and DImode */
1122 {4, 4}, /* cost of storing MMX registers
1123 in SImode and DImode */
1124 2, /* cost of moving SSE register */
1125 {4, 3, 6}, /* cost of loading SSE registers
1126 in SImode, DImode and TImode */
1127 {4, 4, 5}, /* cost of storing SSE registers
1128 in SImode, DImode and TImode */
1129 5, /* MMX or SSE register to integer */
1130 64, /* size of l1 cache. */
1131 512, /* size of l2 cache. */
1132 64, /* size of prefetch block */
1133 /* New AMD processors never drop prefetches; if they cannot be performed
1134 immediately, they are queued. We set number of simultaneous prefetches
1135 to a large constant to reflect this (it probably is not a good idea not
1136 to limit number of prefetches at all, as their execution also takes some
1137 time). */
1138 100, /* number of parallel prefetches */
1139 3, /* Branch cost */
1140 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1141 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1142 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1143 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1144 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1145 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
 1146   /* K8 has an optimized REP instruction for medium sized blocks, but for very
 1147      small blocks it is better to use a loop.  For large blocks, libcall can
 1148      do nontemporal accesses and beat inline considerably.  */
1149 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1150 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1151 {{libcall, {{8, loop}, {24, unrolled_loop},
1152 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1153 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1154 4, /* scalar_stmt_cost. */
1155 2, /* scalar load_cost. */
1156 2, /* scalar_store_cost. */
1157 5, /* vec_stmt_cost. */
1158 0, /* vec_to_scalar_cost. */
1159 2, /* scalar_to_vec_cost. */
1160 2, /* vec_align_load_cost. */
1161 3, /* vec_unalign_load_cost. */
1162 3, /* vec_store_cost. */
1163 3, /* cond_taken_branch_cost. */
1164 2, /* cond_not_taken_branch_cost. */
1165 };
1166
1167 struct processor_costs amdfam10_cost = {
1168 COSTS_N_INSNS (1), /* cost of an add instruction */
1169 COSTS_N_INSNS (2), /* cost of a lea instruction */
1170 COSTS_N_INSNS (1), /* variable shift costs */
1171 COSTS_N_INSNS (1), /* constant shift costs */
1172 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1173 COSTS_N_INSNS (4), /* HI */
1174 COSTS_N_INSNS (3), /* SI */
1175 COSTS_N_INSNS (4), /* DI */
1176 COSTS_N_INSNS (5)}, /* other */
1177 0, /* cost of multiply per each bit set */
1178 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1179 COSTS_N_INSNS (35), /* HI */
1180 COSTS_N_INSNS (51), /* SI */
1181 COSTS_N_INSNS (83), /* DI */
1182 COSTS_N_INSNS (83)}, /* other */
1183 COSTS_N_INSNS (1), /* cost of movsx */
1184 COSTS_N_INSNS (1), /* cost of movzx */
1185 8, /* "large" insn */
1186 9, /* MOVE_RATIO */
1187 4, /* cost for loading QImode using movzbl */
1188 {3, 4, 3}, /* cost of loading integer registers
1189 in QImode, HImode and SImode.
1190 Relative to reg-reg move (2). */
1191 {3, 4, 3}, /* cost of storing integer registers */
1192 4, /* cost of reg,reg fld/fst */
1193 {4, 4, 12}, /* cost of loading fp registers
1194 in SFmode, DFmode and XFmode */
1195 {6, 6, 8}, /* cost of storing fp registers
1196 in SFmode, DFmode and XFmode */
1197 2, /* cost of moving MMX register */
1198 {3, 3}, /* cost of loading MMX registers
1199 in SImode and DImode */
1200 {4, 4}, /* cost of storing MMX registers
1201 in SImode and DImode */
1202 2, /* cost of moving SSE register */
1203 {4, 4, 3}, /* cost of loading SSE registers
1204 in SImode, DImode and TImode */
1205 {4, 4, 5}, /* cost of storing SSE registers
1206 in SImode, DImode and TImode */
1207 3, /* MMX or SSE register to integer */
1208 /* On K8:
1209 MOVD reg64, xmmreg Double FSTORE 4
1210 MOVD reg32, xmmreg Double FSTORE 4
1211 On AMDFAM10:
1212 MOVD reg64, xmmreg Double FADD 3
1213 1/1 1/1
1214 MOVD reg32, xmmreg Double FADD 3
1215 1/1 1/1 */
1216 64, /* size of l1 cache. */
1217 512, /* size of l2 cache. */
1218 64, /* size of prefetch block */
1219 /* New AMD processors never drop prefetches; if they cannot be performed
1220 immediately, they are queued. We set number of simultaneous prefetches
1221 to a large constant to reflect this (it probably is not a good idea not
1222 to limit number of prefetches at all, as their execution also takes some
1223 time). */
1224 100, /* number of parallel prefetches */
1225 2, /* Branch cost */
1226 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1227 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1228 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1229 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1230 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1231 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1232
 1233   /* AMDFAM10 has an optimized REP instruction for medium sized blocks, but for
 1234      very small blocks it is better to use a loop.  For large blocks, libcall can
 1235      do nontemporal accesses and beat inline considerably.  */
1236 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1237 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1238 {{libcall, {{8, loop}, {24, unrolled_loop},
1239 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1240 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1241 4, /* scalar_stmt_cost. */
1242 2, /* scalar load_cost. */
1243 2, /* scalar_store_cost. */
1244 6, /* vec_stmt_cost. */
1245 0, /* vec_to_scalar_cost. */
1246 2, /* scalar_to_vec_cost. */
1247 2, /* vec_align_load_cost. */
1248 2, /* vec_unalign_load_cost. */
1249 2, /* vec_store_cost. */
1250 2, /* cond_taken_branch_cost. */
1251 1, /* cond_not_taken_branch_cost. */
1252 };
1253
1254 struct processor_costs bdver1_cost = {
1255 COSTS_N_INSNS (1), /* cost of an add instruction */
1256 COSTS_N_INSNS (1), /* cost of a lea instruction */
1257 COSTS_N_INSNS (1), /* variable shift costs */
1258 COSTS_N_INSNS (1), /* constant shift costs */
1259 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1260 COSTS_N_INSNS (4), /* HI */
1261 COSTS_N_INSNS (4), /* SI */
1262 COSTS_N_INSNS (6), /* DI */
1263 COSTS_N_INSNS (6)}, /* other */
1264 0, /* cost of multiply per each bit set */
1265 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1266 COSTS_N_INSNS (35), /* HI */
1267 COSTS_N_INSNS (51), /* SI */
1268 COSTS_N_INSNS (83), /* DI */
1269 COSTS_N_INSNS (83)}, /* other */
1270 COSTS_N_INSNS (1), /* cost of movsx */
1271 COSTS_N_INSNS (1), /* cost of movzx */
1272 8, /* "large" insn */
1273 9, /* MOVE_RATIO */
1274 4, /* cost for loading QImode using movzbl */
1275 {5, 5, 4}, /* cost of loading integer registers
1276 in QImode, HImode and SImode.
1277 Relative to reg-reg move (2). */
1278 {4, 4, 4}, /* cost of storing integer registers */
1279 2, /* cost of reg,reg fld/fst */
1280 {5, 5, 12}, /* cost of loading fp registers
1281 in SFmode, DFmode and XFmode */
1282 {4, 4, 8}, /* cost of storing fp registers
1283 in SFmode, DFmode and XFmode */
1284 2, /* cost of moving MMX register */
1285 {4, 4}, /* cost of loading MMX registers
1286 in SImode and DImode */
1287 {4, 4}, /* cost of storing MMX registers
1288 in SImode and DImode */
1289 2, /* cost of moving SSE register */
1290 {4, 4, 4}, /* cost of loading SSE registers
1291 in SImode, DImode and TImode */
1292 {4, 4, 4}, /* cost of storing SSE registers
1293 in SImode, DImode and TImode */
1294 2, /* MMX or SSE register to integer */
1295 /* On K8:
1296 MOVD reg64, xmmreg Double FSTORE 4
1297 MOVD reg32, xmmreg Double FSTORE 4
1298 On AMDFAM10:
1299 MOVD reg64, xmmreg Double FADD 3
1300 1/1 1/1
1301 MOVD reg32, xmmreg Double FADD 3
1302 1/1 1/1 */
1303 16, /* size of l1 cache. */
1304 2048, /* size of l2 cache. */
1305 64, /* size of prefetch block */
1306 /* New AMD processors never drop prefetches; if they cannot be performed
1307 immediately, they are queued. We set number of simultaneous prefetches
1308 to a large constant to reflect this (it probably is not a good idea not
1309 to limit number of prefetches at all, as their execution also takes some
1310 time). */
1311 100, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1319
 1320   /* BDVER1 has an optimized REP instruction for medium sized blocks, but for
 1321      very small blocks it is better to use a loop.  For large blocks, libcall
 1322      can do nontemporal accesses and beat inline considerably.  */
1323 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1324 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1325 {{libcall, {{8, loop}, {24, unrolled_loop},
1326 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1327 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1328 6, /* scalar_stmt_cost. */
1329 4, /* scalar load_cost. */
1330 4, /* scalar_store_cost. */
1331 6, /* vec_stmt_cost. */
1332 0, /* vec_to_scalar_cost. */
1333 2, /* scalar_to_vec_cost. */
1334 4, /* vec_align_load_cost. */
1335 4, /* vec_unalign_load_cost. */
1336 4, /* vec_store_cost. */
1337 2, /* cond_taken_branch_cost. */
1338 1, /* cond_not_taken_branch_cost. */
1339 };
1340
1341 struct processor_costs bdver2_cost = {
1342 COSTS_N_INSNS (1), /* cost of an add instruction */
1343 COSTS_N_INSNS (1), /* cost of a lea instruction */
1344 COSTS_N_INSNS (1), /* variable shift costs */
1345 COSTS_N_INSNS (1), /* constant shift costs */
1346 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1347 COSTS_N_INSNS (4), /* HI */
1348 COSTS_N_INSNS (4), /* SI */
1349 COSTS_N_INSNS (6), /* DI */
1350 COSTS_N_INSNS (6)}, /* other */
1351 0, /* cost of multiply per each bit set */
1352 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1353 COSTS_N_INSNS (35), /* HI */
1354 COSTS_N_INSNS (51), /* SI */
1355 COSTS_N_INSNS (83), /* DI */
1356 COSTS_N_INSNS (83)}, /* other */
1357 COSTS_N_INSNS (1), /* cost of movsx */
1358 COSTS_N_INSNS (1), /* cost of movzx */
1359 8, /* "large" insn */
1360 9, /* MOVE_RATIO */
1361 4, /* cost for loading QImode using movzbl */
1362 {5, 5, 4}, /* cost of loading integer registers
1363 in QImode, HImode and SImode.
1364 Relative to reg-reg move (2). */
1365 {4, 4, 4}, /* cost of storing integer registers */
1366 2, /* cost of reg,reg fld/fst */
1367 {5, 5, 12}, /* cost of loading fp registers
1368 in SFmode, DFmode and XFmode */
1369 {4, 4, 8}, /* cost of storing fp registers
1370 in SFmode, DFmode and XFmode */
1371 2, /* cost of moving MMX register */
1372 {4, 4}, /* cost of loading MMX registers
1373 in SImode and DImode */
1374 {4, 4}, /* cost of storing MMX registers
1375 in SImode and DImode */
1376 2, /* cost of moving SSE register */
1377 {4, 4, 4}, /* cost of loading SSE registers
1378 in SImode, DImode and TImode */
1379 {4, 4, 4}, /* cost of storing SSE registers
1380 in SImode, DImode and TImode */
1381 2, /* MMX or SSE register to integer */
1382 /* On K8:
1383 MOVD reg64, xmmreg Double FSTORE 4
1384 MOVD reg32, xmmreg Double FSTORE 4
1385 On AMDFAM10:
1386 MOVD reg64, xmmreg Double FADD 3
1387 1/1 1/1
1388 MOVD reg32, xmmreg Double FADD 3
1389 1/1 1/1 */
1390 16, /* size of l1 cache. */
1391 2048, /* size of l2 cache. */
1392 64, /* size of prefetch block */
1393 /* New AMD processors never drop prefetches; if they cannot be performed
1394 immediately, they are queued. We set number of simultaneous prefetches
1395 to a large constant to reflect this (it probably is not a good idea not
1396 to limit number of prefetches at all, as their execution also takes some
1397 time). */
1398 100, /* number of parallel prefetches */
1399 2, /* Branch cost */
1400 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1401 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1402 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1403 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1404 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1405 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1406
 1407   /* BDVER2 has an optimized REP instruction for medium sized blocks, but for
 1408      very small blocks it is better to use a loop.  For large blocks, libcall
 1409      can do nontemporal accesses and beat inline considerably.  */
1410 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1411 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1412 {{libcall, {{8, loop}, {24, unrolled_loop},
1413 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1414 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1415 6, /* scalar_stmt_cost. */
1416 4, /* scalar load_cost. */
1417 4, /* scalar_store_cost. */
1418 6, /* vec_stmt_cost. */
1419 0, /* vec_to_scalar_cost. */
1420 2, /* scalar_to_vec_cost. */
1421 4, /* vec_align_load_cost. */
1422 4, /* vec_unalign_load_cost. */
1423 4, /* vec_store_cost. */
1424 2, /* cond_taken_branch_cost. */
1425 1, /* cond_not_taken_branch_cost. */
1426 };
1427
1428 struct processor_costs btver1_cost = {
1429 COSTS_N_INSNS (1), /* cost of an add instruction */
1430 COSTS_N_INSNS (2), /* cost of a lea instruction */
1431 COSTS_N_INSNS (1), /* variable shift costs */
1432 COSTS_N_INSNS (1), /* constant shift costs */
1433 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1434 COSTS_N_INSNS (4), /* HI */
1435 COSTS_N_INSNS (3), /* SI */
1436 COSTS_N_INSNS (4), /* DI */
1437 COSTS_N_INSNS (5)}, /* other */
1438 0, /* cost of multiply per each bit set */
1439 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1440 COSTS_N_INSNS (35), /* HI */
1441 COSTS_N_INSNS (51), /* SI */
1442 COSTS_N_INSNS (83), /* DI */
1443 COSTS_N_INSNS (83)}, /* other */
1444 COSTS_N_INSNS (1), /* cost of movsx */
1445 COSTS_N_INSNS (1), /* cost of movzx */
1446 8, /* "large" insn */
1447 9, /* MOVE_RATIO */
1448 4, /* cost for loading QImode using movzbl */
1449 {3, 4, 3}, /* cost of loading integer registers
1450 in QImode, HImode and SImode.
1451 Relative to reg-reg move (2). */
1452 {3, 4, 3}, /* cost of storing integer registers */
1453 4, /* cost of reg,reg fld/fst */
1454 {4, 4, 12}, /* cost of loading fp registers
1455 in SFmode, DFmode and XFmode */
1456 {6, 6, 8}, /* cost of storing fp registers
1457 in SFmode, DFmode and XFmode */
1458 2, /* cost of moving MMX register */
1459 {3, 3}, /* cost of loading MMX registers
1460 in SImode and DImode */
1461 {4, 4}, /* cost of storing MMX registers
1462 in SImode and DImode */
1463 2, /* cost of moving SSE register */
1464 {4, 4, 3}, /* cost of loading SSE registers
1465 in SImode, DImode and TImode */
1466 {4, 4, 5}, /* cost of storing SSE registers
1467 in SImode, DImode and TImode */
1468 3, /* MMX or SSE register to integer */
1469 /* On K8:
1470 MOVD reg64, xmmreg Double FSTORE 4
1471 MOVD reg32, xmmreg Double FSTORE 4
1472 On AMDFAM10:
1473 MOVD reg64, xmmreg Double FADD 3
1474 1/1 1/1
1475 MOVD reg32, xmmreg Double FADD 3
1476 1/1 1/1 */
1477 32, /* size of l1 cache. */
1478 512, /* size of l2 cache. */
1479 64, /* size of prefetch block */
1480 100, /* number of parallel prefetches */
1481 2, /* Branch cost */
1482 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1483 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1484 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1485 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1486 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1487 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1488
 1489   /* BTVER1 has an optimized REP instruction for medium sized blocks, but for
 1490      very small blocks it is better to use a loop.  For large blocks, libcall can
 1491      do nontemporal accesses and beat inline considerably.  */
1492 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1493 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1494 {{libcall, {{8, loop}, {24, unrolled_loop},
1495 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1496 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1497 4, /* scalar_stmt_cost. */
1498 2, /* scalar load_cost. */
1499 2, /* scalar_store_cost. */
1500 6, /* vec_stmt_cost. */
1501 0, /* vec_to_scalar_cost. */
1502 2, /* scalar_to_vec_cost. */
1503 2, /* vec_align_load_cost. */
1504 2, /* vec_unalign_load_cost. */
1505 2, /* vec_store_cost. */
1506 2, /* cond_taken_branch_cost. */
1507 1, /* cond_not_taken_branch_cost. */
1508 };
1509
1510 static const
1511 struct processor_costs pentium4_cost = {
1512 COSTS_N_INSNS (1), /* cost of an add instruction */
1513 COSTS_N_INSNS (3), /* cost of a lea instruction */
1514 COSTS_N_INSNS (4), /* variable shift costs */
1515 COSTS_N_INSNS (4), /* constant shift costs */
1516 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1517 COSTS_N_INSNS (15), /* HI */
1518 COSTS_N_INSNS (15), /* SI */
1519 COSTS_N_INSNS (15), /* DI */
1520 COSTS_N_INSNS (15)}, /* other */
1521 0, /* cost of multiply per each bit set */
1522 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1523 COSTS_N_INSNS (56), /* HI */
1524 COSTS_N_INSNS (56), /* SI */
1525 COSTS_N_INSNS (56), /* DI */
1526 COSTS_N_INSNS (56)}, /* other */
1527 COSTS_N_INSNS (1), /* cost of movsx */
1528 COSTS_N_INSNS (1), /* cost of movzx */
1529 16, /* "large" insn */
1530 6, /* MOVE_RATIO */
1531 2, /* cost for loading QImode using movzbl */
1532 {4, 5, 4}, /* cost of loading integer registers
1533 in QImode, HImode and SImode.
1534 Relative to reg-reg move (2). */
1535 {2, 3, 2}, /* cost of storing integer registers */
1536 2, /* cost of reg,reg fld/fst */
1537 {2, 2, 6}, /* cost of loading fp registers
1538 in SFmode, DFmode and XFmode */
1539 {4, 4, 6}, /* cost of storing fp registers
1540 in SFmode, DFmode and XFmode */
1541 2, /* cost of moving MMX register */
1542 {2, 2}, /* cost of loading MMX registers
1543 in SImode and DImode */
1544 {2, 2}, /* cost of storing MMX registers
1545 in SImode and DImode */
1546 12, /* cost of moving SSE register */
1547 {12, 12, 12}, /* cost of loading SSE registers
1548 in SImode, DImode and TImode */
1549 {2, 2, 8}, /* cost of storing SSE registers
1550 in SImode, DImode and TImode */
1551 10, /* MMX or SSE register to integer */
1552 8, /* size of l1 cache. */
1553 256, /* size of l2 cache. */
1554 64, /* size of prefetch block */
1555 6, /* number of parallel prefetches */
1556 2, /* Branch cost */
1557 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1558 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1559 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1560 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1561 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1562 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1563 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1564 DUMMY_STRINGOP_ALGS},
1565 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1566 {-1, libcall}}},
1567 DUMMY_STRINGOP_ALGS},
1568 1, /* scalar_stmt_cost. */
1569 1, /* scalar load_cost. */
1570 1, /* scalar_store_cost. */
1571 1, /* vec_stmt_cost. */
1572 1, /* vec_to_scalar_cost. */
1573 1, /* scalar_to_vec_cost. */
1574 1, /* vec_align_load_cost. */
1575 2, /* vec_unalign_load_cost. */
1576 1, /* vec_store_cost. */
1577 3, /* cond_taken_branch_cost. */
1578 1, /* cond_not_taken_branch_cost. */
1579 };
1580
1581 static const
1582 struct processor_costs nocona_cost = {
1583 COSTS_N_INSNS (1), /* cost of an add instruction */
1584 COSTS_N_INSNS (1), /* cost of a lea instruction */
1585 COSTS_N_INSNS (1), /* variable shift costs */
1586 COSTS_N_INSNS (1), /* constant shift costs */
1587 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1588 COSTS_N_INSNS (10), /* HI */
1589 COSTS_N_INSNS (10), /* SI */
1590 COSTS_N_INSNS (10), /* DI */
1591 COSTS_N_INSNS (10)}, /* other */
1592 0, /* cost of multiply per each bit set */
1593 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1594 COSTS_N_INSNS (66), /* HI */
1595 COSTS_N_INSNS (66), /* SI */
1596 COSTS_N_INSNS (66), /* DI */
1597 COSTS_N_INSNS (66)}, /* other */
1598 COSTS_N_INSNS (1), /* cost of movsx */
1599 COSTS_N_INSNS (1), /* cost of movzx */
1600 16, /* "large" insn */
1601 17, /* MOVE_RATIO */
1602 4, /* cost for loading QImode using movzbl */
1603 {4, 4, 4}, /* cost of loading integer registers
1604 in QImode, HImode and SImode.
1605 Relative to reg-reg move (2). */
1606 {4, 4, 4}, /* cost of storing integer registers */
1607 3, /* cost of reg,reg fld/fst */
1608 {12, 12, 12}, /* cost of loading fp registers
1609 in SFmode, DFmode and XFmode */
1610 {4, 4, 4}, /* cost of storing fp registers
1611 in SFmode, DFmode and XFmode */
1612 6, /* cost of moving MMX register */
1613 {12, 12}, /* cost of loading MMX registers
1614 in SImode and DImode */
1615 {12, 12}, /* cost of storing MMX registers
1616 in SImode and DImode */
1617 6, /* cost of moving SSE register */
1618 {12, 12, 12}, /* cost of loading SSE registers
1619 in SImode, DImode and TImode */
1620 {12, 12, 12}, /* cost of storing SSE registers
1621 in SImode, DImode and TImode */
1622 8, /* MMX or SSE register to integer */
1623 8, /* size of l1 cache. */
1624 1024, /* size of l2 cache. */
1625 128, /* size of prefetch block */
1626 8, /* number of parallel prefetches */
1627 1, /* Branch cost */
1628 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1629 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1630 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1631 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1632 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1633 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1634 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1635 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1636 {100000, unrolled_loop}, {-1, libcall}}}},
1637 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1638 {-1, libcall}}},
1639 {libcall, {{24, loop}, {64, unrolled_loop},
1640 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1641 1, /* scalar_stmt_cost. */
1642 1, /* scalar load_cost. */
1643 1, /* scalar_store_cost. */
1644 1, /* vec_stmt_cost. */
1645 1, /* vec_to_scalar_cost. */
1646 1, /* scalar_to_vec_cost. */
1647 1, /* vec_align_load_cost. */
1648 2, /* vec_unalign_load_cost. */
1649 1, /* vec_store_cost. */
1650 3, /* cond_taken_branch_cost. */
1651 1, /* cond_not_taken_branch_cost. */
1652 };
1653
1654 static const
1655 struct processor_costs atom_cost = {
1656 COSTS_N_INSNS (1), /* cost of an add instruction */
1657 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1658 COSTS_N_INSNS (1), /* variable shift costs */
1659 COSTS_N_INSNS (1), /* constant shift costs */
1660 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1661 COSTS_N_INSNS (4), /* HI */
1662 COSTS_N_INSNS (3), /* SI */
1663 COSTS_N_INSNS (4), /* DI */
1664 COSTS_N_INSNS (2)}, /* other */
1665 0, /* cost of multiply per each bit set */
1666 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1667 COSTS_N_INSNS (26), /* HI */
1668 COSTS_N_INSNS (42), /* SI */
1669 COSTS_N_INSNS (74), /* DI */
1670 COSTS_N_INSNS (74)}, /* other */
1671 COSTS_N_INSNS (1), /* cost of movsx */
1672 COSTS_N_INSNS (1), /* cost of movzx */
1673 8, /* "large" insn */
1674 17, /* MOVE_RATIO */
1675 4, /* cost for loading QImode using movzbl */
1676 {4, 4, 4}, /* cost of loading integer registers
1677 in QImode, HImode and SImode.
1678 Relative to reg-reg move (2). */
1679 {4, 4, 4}, /* cost of storing integer registers */
1680 4, /* cost of reg,reg fld/fst */
1681 {12, 12, 12}, /* cost of loading fp registers
1682 in SFmode, DFmode and XFmode */
1683 {6, 6, 8}, /* cost of storing fp registers
1684 in SFmode, DFmode and XFmode */
1685 2, /* cost of moving MMX register */
1686 {8, 8}, /* cost of loading MMX registers
1687 in SImode and DImode */
1688 {8, 8}, /* cost of storing MMX registers
1689 in SImode and DImode */
1690 2, /* cost of moving SSE register */
1691 {8, 8, 8}, /* cost of loading SSE registers
1692 in SImode, DImode and TImode */
1693 {8, 8, 8}, /* cost of storing SSE registers
1694 in SImode, DImode and TImode */
1695 5, /* MMX or SSE register to integer */
1696 32, /* size of l1 cache. */
1697 256, /* size of l2 cache. */
1698 64, /* size of prefetch block */
1699 6, /* number of parallel prefetches */
1700 3, /* Branch cost */
1701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1702 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1703 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1704 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1705 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1706 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1707 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1708 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1709 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1710 {{libcall, {{8, loop}, {15, unrolled_loop},
1711 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1712 {libcall, {{24, loop}, {32, unrolled_loop},
1713 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1714 1, /* scalar_stmt_cost. */
1715 1, /* scalar load_cost. */
1716 1, /* scalar_store_cost. */
1717 1, /* vec_stmt_cost. */
1718 1, /* vec_to_scalar_cost. */
1719 1, /* scalar_to_vec_cost. */
1720 1, /* vec_align_load_cost. */
1721 2, /* vec_unalign_load_cost. */
1722 1, /* vec_store_cost. */
1723 3, /* cond_taken_branch_cost. */
1724 1, /* cond_not_taken_branch_cost. */
1725 };
1726
1727 /* Generic64 should produce code tuned for Nocona and K8. */
1728 static const
1729 struct processor_costs generic64_cost = {
1730 COSTS_N_INSNS (1), /* cost of an add instruction */
1731 /* On all chips taken into consideration, lea takes 2 cycles or more. With
1732 this cost, however, our current implementation of synth_mult results in
1733 the use of unnecessary temporary registers, causing regressions on several
1734 SPECfp benchmarks. */
1735 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1736 COSTS_N_INSNS (1), /* variable shift costs */
1737 COSTS_N_INSNS (1), /* constant shift costs */
1738 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1739 COSTS_N_INSNS (4), /* HI */
1740 COSTS_N_INSNS (3), /* SI */
1741 COSTS_N_INSNS (4), /* DI */
1742 COSTS_N_INSNS (2)}, /* other */
1743 0, /* cost of multiply per each bit set */
1744 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1745 COSTS_N_INSNS (26), /* HI */
1746 COSTS_N_INSNS (42), /* SI */
1747 COSTS_N_INSNS (74), /* DI */
1748 COSTS_N_INSNS (74)}, /* other */
1749 COSTS_N_INSNS (1), /* cost of movsx */
1750 COSTS_N_INSNS (1), /* cost of movzx */
1751 8, /* "large" insn */
1752 17, /* MOVE_RATIO */
1753 4, /* cost for loading QImode using movzbl */
1754 {4, 4, 4}, /* cost of loading integer registers
1755 in QImode, HImode and SImode.
1756 Relative to reg-reg move (2). */
1757 {4, 4, 4}, /* cost of storing integer registers */
1758 4, /* cost of reg,reg fld/fst */
1759 {12, 12, 12}, /* cost of loading fp registers
1760 in SFmode, DFmode and XFmode */
1761 {6, 6, 8}, /* cost of storing fp registers
1762 in SFmode, DFmode and XFmode */
1763 2, /* cost of moving MMX register */
1764 {8, 8}, /* cost of loading MMX registers
1765 in SImode and DImode */
1766 {8, 8}, /* cost of storing MMX registers
1767 in SImode and DImode */
1768 2, /* cost of moving SSE register */
1769 {8, 8, 8}, /* cost of loading SSE registers
1770 in SImode, DImode and TImode */
1771 {8, 8, 8}, /* cost of storing SSE registers
1772 in SImode, DImode and TImode */
1773 5, /* MMX or SSE register to integer */
1774 32, /* size of l1 cache. */
1775 512, /* size of l2 cache. */
1776 64, /* size of prefetch block */
1777 6, /* number of parallel prefetches */
1778 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1779 value is increased to the perhaps more appropriate value of 5. */
1780 3, /* Branch cost */
1781 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1782 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1783 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1784 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1785 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1786 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1787 {DUMMY_STRINGOP_ALGS,
1788 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1789 {DUMMY_STRINGOP_ALGS,
1790 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 1, /* scalar_stmt_cost. */
1792 1, /* scalar load_cost. */
1793 1, /* scalar_store_cost. */
1794 1, /* vec_stmt_cost. */
1795 1, /* vec_to_scalar_cost. */
1796 1, /* scalar_to_vec_cost. */
1797 1, /* vec_align_load_cost. */
1798 2, /* vec_unalign_load_cost. */
1799 1, /* vec_store_cost. */
1800 3, /* cond_taken_branch_cost. */
1801 1, /* cond_not_taken_branch_cost. */
1802 };
1803
1804 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1805 Athlon and K8. */
1806 static const
1807 struct processor_costs generic32_cost = {
1808 COSTS_N_INSNS (1), /* cost of an add instruction */
1809 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1810 COSTS_N_INSNS (1), /* variable shift costs */
1811 COSTS_N_INSNS (1), /* constant shift costs */
1812 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1813 COSTS_N_INSNS (4), /* HI */
1814 COSTS_N_INSNS (3), /* SI */
1815 COSTS_N_INSNS (4), /* DI */
1816 COSTS_N_INSNS (2)}, /* other */
1817 0, /* cost of multiply per each bit set */
1818 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1819 COSTS_N_INSNS (26), /* HI */
1820 COSTS_N_INSNS (42), /* SI */
1821 COSTS_N_INSNS (74), /* DI */
1822 COSTS_N_INSNS (74)}, /* other */
1823 COSTS_N_INSNS (1), /* cost of movsx */
1824 COSTS_N_INSNS (1), /* cost of movzx */
1825 8, /* "large" insn */
1826 17, /* MOVE_RATIO */
1827 4, /* cost for loading QImode using movzbl */
1828 {4, 4, 4}, /* cost of loading integer registers
1829 in QImode, HImode and SImode.
1830 Relative to reg-reg move (2). */
1831 {4, 4, 4}, /* cost of storing integer registers */
1832 4, /* cost of reg,reg fld/fst */
1833 {12, 12, 12}, /* cost of loading fp registers
1834 in SFmode, DFmode and XFmode */
1835 {6, 6, 8}, /* cost of storing fp registers
1836 in SFmode, DFmode and XFmode */
1837 2, /* cost of moving MMX register */
1838 {8, 8}, /* cost of loading MMX registers
1839 in SImode and DImode */
1840 {8, 8}, /* cost of storing MMX registers
1841 in SImode and DImode */
1842 2, /* cost of moving SSE register */
1843 {8, 8, 8}, /* cost of loading SSE registers
1844 in SImode, DImode and TImode */
1845 {8, 8, 8}, /* cost of storing SSE registers
1846 in SImode, DImode and TImode */
1847 5, /* MMX or SSE register to integer */
1848 32, /* size of l1 cache. */
1849 256, /* size of l2 cache. */
1850 64, /* size of prefetch block */
1851 6, /* number of parallel prefetches */
1852 3, /* Branch cost */
1853 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1854 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1855 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1856 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1857 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1858 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1859 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1860 DUMMY_STRINGOP_ALGS},
1861 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1862 DUMMY_STRINGOP_ALGS},
1863 1, /* scalar_stmt_cost. */
1864 1, /* scalar load_cost. */
1865 1, /* scalar_store_cost. */
1866 1, /* vec_stmt_cost. */
1867 1, /* vec_to_scalar_cost. */
1868 1, /* scalar_to_vec_cost. */
1869 1, /* vec_align_load_cost. */
1870 2, /* vec_unalign_load_cost. */
1871 1, /* vec_store_cost. */
1872 3, /* cond_taken_branch_cost. */
1873 1, /* cond_not_taken_branch_cost. */
1874 };
1875
1876 const struct processor_costs *ix86_cost = &pentium_cost;
1877
1878 /* Processor feature/optimization bitmasks. */
1879 #define m_386 (1<<PROCESSOR_I386)
1880 #define m_486 (1<<PROCESSOR_I486)
1881 #define m_PENT (1<<PROCESSOR_PENTIUM)
1882 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1883 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1884 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1885 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1886 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1887 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1888 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1889 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1890 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1891 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1892 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1893 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1894 #define m_ATOM (1<<PROCESSOR_ATOM)
1895
1896 #define m_GEODE (1<<PROCESSOR_GEODE)
1897 #define m_K6 (1<<PROCESSOR_K6)
1898 #define m_K6_GEODE (m_K6 | m_GEODE)
1899 #define m_K8 (1<<PROCESSOR_K8)
1900 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1901 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1902 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1903 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1904 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1905 #define m_BDVER (m_BDVER1 | m_BDVER2)
1906 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1907 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1908
1909 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1910 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1911
1912 /* Generic instruction choice should be the common subset of supported CPUs
1913 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1914 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1915
1916 /* Feature tests against the various tunings. */
1917 unsigned char ix86_tune_features[X86_TUNE_LAST];
1918
1919 /* Feature tests against the various tunings used to create ix86_tune_features
1920 based on the processor mask. */
1921 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1922 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1923 negatively, so enabling it for Generic64 seems like a good code size
1924 tradeoff. We can't enable it for 32bit generic because it does not
1925 work well with PPro-based chips. */
1926 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1927
1928 /* X86_TUNE_PUSH_MEMORY */
1929 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1930
1931 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1932 m_486 | m_PENT,
1933
1934 /* X86_TUNE_UNROLL_STRLEN */
1935 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1936
1937 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were added to the P4 based
1938 on simulation results. But after the P4 shipped, no performance benefit
1939 was observed from branch hints, and they also increase the code size.
1940 As a result, icc never generates branch hints. */
1941 0,
1942
1943 /* X86_TUNE_DOUBLE_WITH_ADD */
1944 ~m_386,
1945
1946 /* X86_TUNE_USE_SAHF */
1947 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1948
1949 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1950 partial dependencies. */
1951 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1952
1953 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1954 register stalls on the Generic32 compilation setting as well. However,
1955 in the current implementation the partial register stalls are not eliminated
1956 very well - they can be introduced via subregs synthesized by combine
1957 and can happen in caller/callee saving sequences. Because this option
1958 pays back little on PPro-based chips and conflicts with the partial reg
1959 dependencies used by Athlon/P4-based chips, it is better to leave it off
1960 for generic32 for now. */
1961 m_PPRO,
1962
1963 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1964 m_CORE2I7 | m_GENERIC,
1965
1966 /* X86_TUNE_USE_HIMODE_FIOP */
1967 m_386 | m_486 | m_K6_GEODE,
1968
1969 /* X86_TUNE_USE_SIMODE_FIOP */
1970 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1971
1972 /* X86_TUNE_USE_MOV0 */
1973 m_K6,
1974
1975 /* X86_TUNE_USE_CLTD */
1976 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1977
1978 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1979 m_PENT4,
1980
1981 /* X86_TUNE_SPLIT_LONG_MOVES */
1982 m_PPRO,
1983
1984 /* X86_TUNE_READ_MODIFY_WRITE */
1985 ~m_PENT,
1986
1987 /* X86_TUNE_READ_MODIFY */
1988 ~(m_PENT | m_PPRO),
1989
1990 /* X86_TUNE_PROMOTE_QIMODE */
1991 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1992
1993 /* X86_TUNE_FAST_PREFIX */
1994 ~(m_386 | m_486 | m_PENT),
1995
1996 /* X86_TUNE_SINGLE_STRINGOP */
1997 m_386 | m_P4_NOCONA,
1998
1999 /* X86_TUNE_QIMODE_MATH */
2000 ~0,
2001
2002 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2003 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2004 might be considered for Generic32 if our scheme for avoiding partial
2005 stalls were more effective. */
2006 ~m_PPRO,
2007
2008 /* X86_TUNE_PROMOTE_QI_REGS */
2009 0,
2010
2011 /* X86_TUNE_PROMOTE_HI_REGS */
2012 m_PPRO,
2013
2014 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2015 over esp addition. */
2016 m_386 | m_486 | m_PENT | m_PPRO,
2017
2018 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2019 over esp addition. */
2020 m_PENT,
2021
2022 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2023 over esp subtraction. */
2024 m_386 | m_486 | m_PENT | m_K6_GEODE,
2025
2026 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
2027 over esp subtraction. */
2028 m_PENT | m_K6_GEODE,
2029
2030 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2031 for DFmode copies */
2032 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2033
2034 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2035 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2036
2037 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2038 conflict here between PPro/Pentium4-based chips that treat 128bit
2039 SSE registers as single units versus K8-based chips that divide SSE
2040 registers into two 64bit halves. This knob promotes all store destinations
2041 to be 128bit to allow register renaming on 128bit SSE units, but usually
2042 results in one extra micro-op on 64bit SSE units. Experimental results
2043 show that disabling this option on P4 brings a SPECfp regression of over 20%,
2044 while enabling it on K8 brings roughly a 2.4% regression that can be partly
2045 masked by careful scheduling of moves. */
2046 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2047
2048 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2049 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2050
2051 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2052 m_COREI7 | m_BDVER,
2053
2054 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2055 m_BDVER,
2056
2057 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2058 are resolved on SSE register parts instead of whole registers, so we may
2059 maintain just the lower part of scalar values in the proper format, leaving
2060 the upper part undefined. */
2061 m_ATHLON_K8,
2062
2063 /* X86_TUNE_SSE_TYPELESS_STORES */
2064 m_AMD_MULTIPLE,
2065
2066 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2067 m_PPRO | m_P4_NOCONA,
2068
2069 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2070 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2071
2072 /* X86_TUNE_PROLOGUE_USING_MOVE */
2073 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2074
2075 /* X86_TUNE_EPILOGUE_USING_MOVE */
2076 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2077
2078 /* X86_TUNE_SHIFT1 */
2079 ~m_486,
2080
2081 /* X86_TUNE_USE_FFREEP */
2082 m_AMD_MULTIPLE,
2083
2084 /* X86_TUNE_INTER_UNIT_MOVES */
2085 ~(m_AMD_MULTIPLE | m_GENERIC),
2086
2087 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2088 ~(m_AMDFAM10 | m_BDVER),
2089
2090 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2091 than 4 branch instructions in the 16 byte window. */
2092 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2093
2094 /* X86_TUNE_SCHEDULE */
2095 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2096
2097 /* X86_TUNE_USE_BT */
2098 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2099
2100 /* X86_TUNE_USE_INCDEC */
2101 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2102
2103 /* X86_TUNE_PAD_RETURNS */
2104 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2105
2106 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2107 m_ATOM,
2108
2109 /* X86_TUNE_EXT_80387_CONSTANTS */
2110 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2111
2112 /* X86_TUNE_SHORTEN_X87_SSE */
2113 ~m_K8,
2114
2115 /* X86_TUNE_AVOID_VECTOR_DECODE */
2116 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2117
2118 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2119 and SImode multiplies, but the 386 and 486 do HImode multiplies faster. */
2120 ~(m_386 | m_486),
2121
2122 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2123 vector path on AMD machines. */
2124 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2125
2126 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2127 machines. */
2128 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2129
2130 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2131 than a MOV. */
2132 m_PENT,
2133
2134 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2135 but one byte longer. */
2136 m_PENT,
2137
2138 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
2139 operand that cannot be represented using a modRM byte. The XOR
2140 replacement is long decoded, so this split helps here as well. */
2141 m_K6,
2142
2143 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2144 from FP to FP. */
2145 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2146
2147 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2148 from integer to FP. */
2149 m_AMDFAM10,
2150
2151 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2152 with a subsequent conditional jump instruction into a single
2153 compare-and-branch uop. */
2154 m_BDVER,
2155
2156 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2157 will impact LEA instruction selection. */
2158 m_ATOM,
2159
2160 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2161 instructions. */
2162 ~m_ATOM,
2163
2164 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2165 at -O3. For the moment, the prefetching seems badly tuned for Intel
2166 chips. */
2167 m_K6_GEODE | m_AMD_MULTIPLE,
2168
2169 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2170 the auto-vectorizer. */
2171 m_BDVER,
2172
2173 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2174 during reassociation of integer computation. */
2175 m_ATOM,
2176
2177 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2178 during reassociation of fp computation. */
2179 m_ATOM
2180 };
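
/* Illustrative note: the masks above are only the raw per-processor bits;
   they are folded into the ix86_tune_features booleans for the processor
   selected by -mtune.  A minimal sketch of that folding (the real code
   lives in ix86_option_override_internal later in this file and may
   differ in detail):

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; ++i)
       ix86_tune_features[i]
         = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   i386.h then exposes individual entries through convenience macros such
   as TARGET_USE_LEAVE.  */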
2181
2182 /* Feature tests against the various architecture variations. */
2183 unsigned char ix86_arch_features[X86_ARCH_LAST];
2184
2185 /* Feature tests against the various architecture variations, used to create
2186 ix86_arch_features based on the processor mask. */
2187 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2188 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
2189 ~(m_386 | m_486 | m_PENT | m_K6),
2190
2191 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2192 ~m_386,
2193
2194 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2195 ~(m_386 | m_486),
2196
2197 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2198 ~m_386,
2199
2200 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2201 ~m_386,
2202 };
2203
2204 static const unsigned int x86_accumulate_outgoing_args
2205 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2206
2207 static const unsigned int x86_arch_always_fancy_math_387
2208 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2209
2210 static const unsigned int x86_avx256_split_unaligned_load
2211 = m_COREI7 | m_GENERIC;
2212
2213 static const unsigned int x86_avx256_split_unaligned_store
2214 = m_COREI7 | m_BDVER | m_GENERIC;
2215
2216 /* In case the average insn count for a single function invocation is
2217 lower than this constant, emit fast (but longer) prologue and
2218 epilogue code. */
2219 #define FAST_PROLOGUE_INSN_COUNT 20
2220
2221 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2222 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2223 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2224 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2225
2226 /* Array of the smallest class containing reg number REGNO, indexed by
2227 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2228
2229 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2230 {
2231 /* ax, dx, cx, bx */
2232 AREG, DREG, CREG, BREG,
2233 /* si, di, bp, sp */
2234 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2235 /* FP registers */
2236 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2237 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2238 /* arg pointer */
2239 NON_Q_REGS,
2240 /* flags, fpsr, fpcr, frame */
2241 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2242 /* SSE registers */
2243 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2244 SSE_REGS, SSE_REGS,
2245 /* MMX registers */
2246 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2247 MMX_REGS, MMX_REGS,
2248 /* REX registers */
2249 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2250 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251 /* SSE REX registers */
2252 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2253 SSE_REGS, SSE_REGS,
2254 };
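
/* Example use (a sketch; the REGNO_REG_CLASS macro itself is defined in
   i386.h and simply indexes this table):

     REGNO_REG_CLASS (AX_REG) == AREG
     REGNO_REG_CLASS (DI_REG) == DIREG

   i.e. queries for the smallest containing class are plain table lookups
   by hard register number.  */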
2255
2256 /* The "default" register map used in 32bit mode. */
2257
2258 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2259 {
2260 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2261 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2262 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2263 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2264 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2265 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2266 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2267 };
2268
2269 /* The "default" register map used in 64bit mode. */
2270
2271 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2272 {
2273 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2274 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2275 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2276 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2277 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2278 8,9,10,11,12,13,14,15, /* extended integer registers */
2279 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2280 };
2281
2282 /* Define the register numbers to be used in Dwarf debugging information.
2283 The SVR4 reference port C compiler uses the following register numbers
2284 in its Dwarf output code:
2285 0 for %eax (gcc regno = 0)
2286 1 for %ecx (gcc regno = 2)
2287 2 for %edx (gcc regno = 1)
2288 3 for %ebx (gcc regno = 3)
2289 4 for %esp (gcc regno = 7)
2290 5 for %ebp (gcc regno = 6)
2291 6 for %esi (gcc regno = 4)
2292 7 for %edi (gcc regno = 5)
2293 The following three DWARF register numbers are never generated by
2294 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2295 believes these numbers have these meanings.
2296 8 for %eip (no gcc equivalent)
2297 9 for %eflags (gcc regno = 17)
2298 10 for %trapno (no gcc equivalent)
2299 It is not at all clear how we should number the FP stack registers
2300 for the x86 architecture. If the version of SDB on x86/svr4 were
2301 a bit less brain dead with respect to floating-point then we would
2302 have a precedent to follow with respect to DWARF register numbers
2303 for x86 FP registers, but the SDB on x86/svr4 is so completely
2304 broken with respect to FP registers that it is hardly worth thinking
2305 of it as something to strive for compatibility with.
2306 The version of x86/svr4 SDB I have at the moment does (partially)
2307 seem to believe that DWARF register number 11 is associated with
2308 the x86 register %st(0), but that's about all. Higher DWARF
2309 register numbers don't seem to be associated with anything in
2310 particular, and even for DWARF regno 11, SDB only seems to under-
2311 stand that it should say that a variable lives in %st(0) (when
2312 asked via an `=' command) if we said it was in DWARF regno 11,
2313 but SDB still prints garbage when asked for the value of the
2314 variable in question (via a `/' command).
2315 (Also note that the labels SDB prints for various FP stack regs
2316 when doing an `x' command are all wrong.)
2317 Note that these problems generally don't affect the native SVR4
2318 C compiler because it doesn't allow the use of -O with -g and
2319 because when it is *not* optimizing, it allocates a memory
2320 location for each floating-point variable, and the memory
2321 location is what gets described in the DWARF AT_location
2322 attribute for the variable in question.
2323 Regardless of the severe mental illness of the x86/svr4 SDB, we
2324 do something sensible here and we use the following DWARF
2325 register numbers. Note that these are all stack-top-relative
2326 numbers.
2327 11 for %st(0) (gcc regno = 8)
2328 12 for %st(1) (gcc regno = 9)
2329 13 for %st(2) (gcc regno = 10)
2330 14 for %st(3) (gcc regno = 11)
2331 15 for %st(4) (gcc regno = 12)
2332 16 for %st(5) (gcc regno = 13)
2333 17 for %st(6) (gcc regno = 14)
2334 18 for %st(7) (gcc regno = 15)
2335 */
2336 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2337 {
2338 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2339 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2340 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2341 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2342 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2343 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2344 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2345 };
2346
2347 /* Define parameter passing and return registers. */
2348
2349 static int const x86_64_int_parameter_registers[6] =
2350 {
2351 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2352 };
2353
2354 static int const x86_64_ms_abi_int_parameter_registers[4] =
2355 {
2356 CX_REG, DX_REG, R8_REG, R9_REG
2357 };
2358
2359 static int const x86_64_int_return_registers[4] =
2360 {
2361 AX_REG, DX_REG, DI_REG, SI_REG
2362 };
2363
2364 /* Define the structure for the machine field in struct function. */
2365
2366 struct GTY(()) stack_local_entry {
2367 unsigned short mode;
2368 unsigned short n;
2369 rtx rtl;
2370 struct stack_local_entry *next;
2371 };
2372
2373 /* Structure describing stack frame layout.
2374 Stack grows downward:
2375
2376 [arguments]
2377 <- ARG_POINTER
2378 saved pc
2379
2380 saved static chain if ix86_static_chain_on_stack
2381
2382 saved frame pointer if frame_pointer_needed
2383 <- HARD_FRAME_POINTER
2384 [saved regs]
2385 <- regs_save_offset
2386 [padding0]
2387
2388 [saved SSE regs]
2389 <- sse_regs_save_offset
2390 [padding1] |
2391 | <- FRAME_POINTER
2392 [va_arg registers] |
2393 |
2394 [frame] |
2395 |
2396 [padding2] | = to_allocate
2397 <- STACK_POINTER
2398 */
2399 struct ix86_frame
2400 {
2401 int nsseregs;
2402 int nregs;
2403 int va_arg_size;
2404 int red_zone_size;
2405 int outgoing_arguments_size;
2406 HOST_WIDE_INT frame;
2407
2408 /* The offsets relative to ARG_POINTER. */
2409 HOST_WIDE_INT frame_pointer_offset;
2410 HOST_WIDE_INT hard_frame_pointer_offset;
2411 HOST_WIDE_INT stack_pointer_offset;
2412 HOST_WIDE_INT hfp_save_offset;
2413 HOST_WIDE_INT reg_save_offset;
2414 HOST_WIDE_INT sse_reg_save_offset;
2415
2416 /* When save_regs_using_mov is set, emit prologue using
2417 move instead of push instructions. */
2418 bool save_regs_using_mov;
2419 };
2420
2421 /* Which cpu are we scheduling for. */
2422 enum attr_cpu ix86_schedule;
2423
2424 /* Which cpu are we optimizing for. */
2425 enum processor_type ix86_tune;
2426
2427 /* Which instruction set architecture to use. */
2428 enum processor_type ix86_arch;
2429
2430 /* True if the SSE prefetch instruction is not a NOP. */
2431 int x86_prefetch_sse;
2432
2433 /* -mstackrealign option */
2434 static const char ix86_force_align_arg_pointer_string[]
2435 = "force_align_arg_pointer";
2436
2437 static rtx (*ix86_gen_leave) (void);
2438 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2439 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2440 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2441 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2442 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2443 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2445 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2446 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2447 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2448 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2449
2450 /* Preferred alignment for stack boundary in bits. */
2451 unsigned int ix86_preferred_stack_boundary;
2452
2453 /* Alignment for incoming stack boundary in bits specified at
2454 command line. */
2455 static unsigned int ix86_user_incoming_stack_boundary;
2456
2457 /* Default alignment for incoming stack boundary in bits. */
2458 static unsigned int ix86_default_incoming_stack_boundary;
2459
2460 /* Alignment for incoming stack boundary in bits. */
2461 unsigned int ix86_incoming_stack_boundary;
2462
2463 /* Calling-ABI-specific va_list type nodes. */
2464 static GTY(()) tree sysv_va_list_type_node;
2465 static GTY(()) tree ms_va_list_type_node;
2466
2467 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2468 char internal_label_prefix[16];
2469 int internal_label_prefix_len;
2470
2471 /* Fence to use after loop using movnt. */
2472 tree x86_mfence;
2473
2474 /* Register class used for passing a given 64bit part of the argument.
2475 These represent classes as documented by the psABI, with the exception
2476 of the SSESF and SSEDF classes, which are basically the SSE class; gcc
2477 just uses an SF or DFmode move instead of DImode to avoid reformatting penalties.
2478
2479 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2480 whenever possible (the upper half does contain padding). */
2481 enum x86_64_reg_class
2482 {
2483 X86_64_NO_CLASS,
2484 X86_64_INTEGER_CLASS,
2485 X86_64_INTEGERSI_CLASS,
2486 X86_64_SSE_CLASS,
2487 X86_64_SSESF_CLASS,
2488 X86_64_SSEDF_CLASS,
2489 X86_64_SSEUP_CLASS,
2490 X86_64_X87_CLASS,
2491 X86_64_X87UP_CLASS,
2492 X86_64_COMPLEX_X87_CLASS,
2493 X86_64_MEMORY_CLASS
2494 };
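
/* Informal example of the psABI classification these values model (a
   sketch, not code from this file): a hypothetical

     struct s { double d; int i; };

   occupies two eightbytes, classified as X86_64_SSEDF_CLASS and
   X86_64_INTEGERSI_CLASS respectively, so such an argument is passed in
   one SSE register and one integer register.  */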
2495
2496 #define MAX_CLASSES 4
2497
2498 /* Table of constants used by fldpi, fldln2, etc.... */
2499 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2500 static bool ext_80387_constants_init = 0;
2501
2502 \f
2503 static struct machine_function * ix86_init_machine_status (void);
2504 static rtx ix86_function_value (const_tree, const_tree, bool);
2505 static bool ix86_function_value_regno_p (const unsigned int);
2506 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2507 const_tree);
2508 static rtx ix86_static_chain (const_tree, bool);
2509 static int ix86_function_regparm (const_tree, const_tree);
2510 static void ix86_compute_frame_layout (struct ix86_frame *);
2511 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2512 rtx, rtx, int);
2513 static void ix86_add_new_builtins (HOST_WIDE_INT);
2514 static tree ix86_canonical_va_list_type (tree);
2515 static void predict_jump (int);
2516 static unsigned int split_stack_prologue_scratch_regno (void);
2517 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2518
2519 enum ix86_function_specific_strings
2520 {
2521 IX86_FUNCTION_SPECIFIC_ARCH,
2522 IX86_FUNCTION_SPECIFIC_TUNE,
2523 IX86_FUNCTION_SPECIFIC_MAX
2524 };
2525
2526 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2527 const char *, enum fpmath_unit, bool);
2528 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2529 static void ix86_function_specific_save (struct cl_target_option *);
2530 static void ix86_function_specific_restore (struct cl_target_option *);
2531 static void ix86_function_specific_print (FILE *, int,
2532 struct cl_target_option *);
2533 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2534 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2535 struct gcc_options *);
2536 static bool ix86_can_inline_p (tree, tree);
2537 static void ix86_set_current_function (tree);
2538 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2539
2540 static enum calling_abi ix86_function_abi (const_tree);
2541
2542 \f
2543 #ifndef SUBTARGET32_DEFAULT_CPU
2544 #define SUBTARGET32_DEFAULT_CPU "i386"
2545 #endif
2546
2547 /* The svr4 ABI for the i386 says that records and unions are returned
2548 in memory. */
2549 #ifndef DEFAULT_PCC_STRUCT_RETURN
2550 #define DEFAULT_PCC_STRUCT_RETURN 1
2551 #endif
2552
2553 /* Whether -mtune= or -march= were specified */
2554 static int ix86_tune_defaulted;
2555 static int ix86_arch_specified;
2556
2557 /* Vectorization library interface and handlers. */
2558 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2559
2560 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2561 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2562
2563 /* Processor target table, indexed by processor number */
2564 struct ptt
2565 {
2566 const struct processor_costs *cost; /* Processor costs */
2567 const int align_loop; /* Default alignments. */
2568 const int align_loop_max_skip;
2569 const int align_jump;
2570 const int align_jump_max_skip;
2571 const int align_func;
2572 };
2573
2574 static const struct ptt processor_target_table[PROCESSOR_max] =
2575 {
2576 {&i386_cost, 4, 3, 4, 3, 4},
2577 {&i486_cost, 16, 15, 16, 15, 16},
2578 {&pentium_cost, 16, 7, 16, 7, 16},
2579 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2580 {&geode_cost, 0, 0, 0, 0, 0},
2581 {&k6_cost, 32, 7, 32, 7, 32},
2582 {&athlon_cost, 16, 7, 16, 7, 16},
2583 {&pentium4_cost, 0, 0, 0, 0, 0},
2584 {&k8_cost, 16, 7, 16, 7, 16},
2585 {&nocona_cost, 0, 0, 0, 0, 0},
2586 /* Core 2 32-bit. */
2587 {&generic32_cost, 16, 10, 16, 10, 16},
2588 /* Core 2 64-bit. */
2589 {&generic64_cost, 16, 10, 16, 10, 16},
2590 /* Core i7 32-bit. */
2591 {&generic32_cost, 16, 10, 16, 10, 16},
2592 /* Core i7 64-bit. */
2593 {&generic64_cost, 16, 10, 16, 10, 16},
2594 {&generic32_cost, 16, 7, 16, 7, 16},
2595 {&generic64_cost, 16, 10, 16, 10, 16},
2596 {&amdfam10_cost, 32, 24, 32, 7, 32},
2597 {&bdver1_cost, 32, 24, 32, 7, 32},
2598 {&bdver2_cost, 32, 24, 32, 7, 32},
2599 {&btver1_cost, 32, 24, 32, 7, 32},
2600 {&atom_cost, 16, 15, 16, 7, 16}
2601 };
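
/* A minimal sketch of how this table is consumed (the actual selection
   happens in ix86_option_override_internal and may differ in detail):

     ix86_cost = processor_target_table[ix86_tune].cost;
     if (align_loops == 0)
       align_loops = processor_target_table[ix86_tune].align_loop;

   i.e. the row selected by -mtune supplies both the cost model and the
   default alignment parameters.  */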
2602
2603 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2604 {
2605 "generic",
2606 "i386",
2607 "i486",
2608 "pentium",
2609 "pentium-mmx",
2610 "pentiumpro",
2611 "pentium2",
2612 "pentium3",
2613 "pentium4",
2614 "pentium-m",
2615 "prescott",
2616 "nocona",
2617 "core2",
2618 "corei7",
2619 "atom",
2620 "geode",
2621 "k6",
2622 "k6-2",
2623 "k6-3",
2624 "athlon",
2625 "athlon-4",
2626 "k8",
2627 "amdfam10",
2628 "bdver1",
2629 "bdver2",
2630 "btver1"
2631 };
2632 \f
2633 /* Return true if a red-zone is in use. */
2634
2635 static inline bool
2636 ix86_using_red_zone (void)
2637 {
2638 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2639 }
2640 \f
2641 /* Return a string that documents the current -m options. The caller is
2642 responsible for freeing the string. */
2643
2644 static char *
2645 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2646 const char *tune, enum fpmath_unit fpmath,
2647 bool add_nl_p)
2648 {
2649 struct ix86_target_opts
2650 {
2651 const char *option; /* option string */
2652 HOST_WIDE_INT mask; /* isa mask options */
2653 };
2654
2655 /* This table is ordered so that options like -msse4.2, which imply
2656 other ISA options, are matched first. */
2657 static struct ix86_target_opts isa_opts[] =
2658 {
2659 { "-m64", OPTION_MASK_ISA_64BIT },
2660 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2661 { "-mfma", OPTION_MASK_ISA_FMA },
2662 { "-mxop", OPTION_MASK_ISA_XOP },
2663 { "-mlwp", OPTION_MASK_ISA_LWP },
2664 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2665 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2666 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2667 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2668 { "-msse3", OPTION_MASK_ISA_SSE3 },
2669 { "-msse2", OPTION_MASK_ISA_SSE2 },
2670 { "-msse", OPTION_MASK_ISA_SSE },
2671 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2672 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2673 { "-mmmx", OPTION_MASK_ISA_MMX },
2674 { "-mabm", OPTION_MASK_ISA_ABM },
2675 { "-mbmi", OPTION_MASK_ISA_BMI },
2676 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2677 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2678 { "-mtbm", OPTION_MASK_ISA_TBM },
2679 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2680 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2681 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2682 { "-maes", OPTION_MASK_ISA_AES },
2683 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2684 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2685 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2686 { "-mf16c", OPTION_MASK_ISA_F16C },
2687 { "-mrtm", OPTION_MASK_ISA_RTM },
2688 };
2689
2690 /* Flag options. */
2691 static struct ix86_target_opts flag_opts[] =
2692 {
2693 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2694 { "-m80387", MASK_80387 },
2695 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2696 { "-malign-double", MASK_ALIGN_DOUBLE },
2697 { "-mcld", MASK_CLD },
2698 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2699 { "-mieee-fp", MASK_IEEE_FP },
2700 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2701 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2702 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2703 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2704 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2705 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2706 { "-mno-red-zone", MASK_NO_RED_ZONE },
2707 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2708 { "-mrecip", MASK_RECIP },
2709 { "-mrtd", MASK_RTD },
2710 { "-msseregparm", MASK_SSEREGPARM },
2711 { "-mstack-arg-probe", MASK_STACK_PROBE },
2712 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2713 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2714 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2715 { "-mvzeroupper", MASK_VZEROUPPER },
2716 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2717 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2718 { "-mprefer-avx128", MASK_PREFER_AVX128},
2719 };
2720
2721 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2722
2723 char isa_other[40];
2724 char target_other[40];
2725 unsigned num = 0;
2726 unsigned i, j;
2727 char *ret;
2728 char *ptr;
2729 size_t len;
2730 size_t line_len;
2731 size_t sep_len;
2732
2733 memset (opts, '\0', sizeof (opts));
2734
2735 /* Add -march= option. */
2736 if (arch)
2737 {
2738 opts[num][0] = "-march=";
2739 opts[num++][1] = arch;
2740 }
2741
2742 /* Add -mtune= option. */
2743 if (tune)
2744 {
2745 opts[num][0] = "-mtune=";
2746 opts[num++][1] = tune;
2747 }
2748
2749 /* Pick out the ISA options. */
2750 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2751 {
2752 if ((isa & isa_opts[i].mask) != 0)
2753 {
2754 opts[num++][0] = isa_opts[i].option;
2755 isa &= ~ isa_opts[i].mask;
2756 }
2757 }
2758
2759 if (isa && add_nl_p)
2760 {
2761 opts[num++][0] = isa_other;
2762 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2763 isa);
2764 }
2765
2766 /* Add flag options. */
2767 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2768 {
2769 if ((flags & flag_opts[i].mask) != 0)
2770 {
2771 opts[num++][0] = flag_opts[i].option;
2772 flags &= ~ flag_opts[i].mask;
2773 }
2774 }
2775
2776 if (flags && add_nl_p)
2777 {
2778 opts[num++][0] = target_other;
2779 sprintf (target_other, "(other flags: %#x)", flags);
2780 }
2781
2782 /* Add -fpmath= option. */
2783 if (fpmath)
2784 {
2785 opts[num][0] = "-mfpmath=";
2786 switch ((int) fpmath)
2787 {
2788 case FPMATH_387:
2789 opts[num++][1] = "387";
2790 break;
2791
2792 case FPMATH_SSE:
2793 opts[num++][1] = "sse";
2794 break;
2795
2796 case FPMATH_387 | FPMATH_SSE:
2797 opts[num++][1] = "sse+387";
2798 break;
2799
2800 default:
2801 gcc_unreachable ();
2802 }
2803 }
2804
2805 /* Any options? */
2806 if (num == 0)
2807 return NULL;
2808
2809 gcc_assert (num < ARRAY_SIZE (opts));
2810
2811 /* Size the string. */
2812 len = 0;
2813 sep_len = (add_nl_p) ? 3 : 1;
2814 for (i = 0; i < num; i++)
2815 {
2816 len += sep_len;
2817 for (j = 0; j < 2; j++)
2818 if (opts[i][j])
2819 len += strlen (opts[i][j]);
2820 }
2821
2822 /* Build the string. */
2823 ret = ptr = (char *) xmalloc (len);
2824 line_len = 0;
2825
2826 for (i = 0; i < num; i++)
2827 {
2828 size_t len2[2];
2829
2830 for (j = 0; j < 2; j++)
2831 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2832
2833 if (i != 0)
2834 {
2835 *ptr++ = ' ';
2836 line_len++;
2837
2838 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2839 {
2840 *ptr++ = '\\';
2841 *ptr++ = '\n';
2842 line_len = 0;
2843 }
2844 }
2845
2846 for (j = 0; j < 2; j++)
2847 if (opts[i][j])
2848 {
2849 memcpy (ptr, opts[i][j], len2[j]);
2850 ptr += len2[j];
2851 line_len += len2[j];
2852 }
2853 }
2854
2855 *ptr = '\0';
2856 gcc_assert (ret + len >= ptr);
2857
2858 return ret;
2859 }
2860
2861 /* Return true if profiling code should be emitted before the
2862 prologue, and false otherwise. On x86 this is the case only
2863 when -mfentry is in effect. */
2864 static bool
2865 ix86_profile_before_prologue (void)
2866 {
2867 return flag_fentry != 0;
2868 }
2869
2870 /* Function that is callable from the debugger to print the current
2871 options. */
2872 void
2873 ix86_debug_options (void)
2874 {
2875 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2876 ix86_arch_string, ix86_tune_string,
2877 ix86_fpmath, true);
2878
2879 if (opts)
2880 {
2881 fprintf (stderr, "%s\n\n", opts);
2882 free (opts);
2883 }
2884 else
2885 fputs ("<no options>\n\n", stderr);
2886
2887 return;
2888 }
2889 \f
2890 /* Override various settings based on options. If MAIN_ARGS_P, the
2891 options are from the command line, otherwise they are from
2892 attributes. */
2893
2894 static void
2895 ix86_option_override_internal (bool main_args_p)
2896 {
2897 int i;
2898 unsigned int ix86_arch_mask, ix86_tune_mask;
2899 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2900 const char *prefix;
2901 const char *suffix;
2902 const char *sw;
2903
2904 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2905 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2906 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2907 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2908 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2909 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2910 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2911 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2912 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2913 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2914 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2915 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2916 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2917 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2918 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2919 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2920 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2921 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2922 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2923 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2924 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2925 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2926 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2927 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2928 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2929 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2930 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2931 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2932 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2933 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2934 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2935 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2936 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2937 /* If this reaches 64, the flags field in struct pta below needs to be widened. */
2938
2939 static struct pta
2940 {
2941 const char *const name; /* processor name or nickname. */
2942 const enum processor_type processor;
2943 const enum attr_cpu schedule;
2944 const unsigned HOST_WIDE_INT flags;
2945 }
2946 const processor_alias_table[] =
2947 {
2948 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2949 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2950 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2951 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2952 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2953 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2954 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2955 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2956 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2957 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2958 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2959 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2960 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2961 PTA_MMX | PTA_SSE},
2962 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2963 PTA_MMX | PTA_SSE},
2964 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2965 PTA_MMX | PTA_SSE | PTA_SSE2},
2966 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2967 PTA_MMX | PTA_SSE | PTA_SSE2},
2968 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2969 PTA_MMX | PTA_SSE | PTA_SSE2},
2970 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2971 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2972 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2973 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2974 | PTA_CX16 | PTA_NO_SAHF},
2975 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2976 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2977 | PTA_SSSE3 | PTA_CX16},
2978 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2979 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2980 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
2981 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2982 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2983 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2984 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2985 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2986 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2987 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2988 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2989 | PTA_RDRND | PTA_F16C},
2990 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2991 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2992 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2993 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2994 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2995 | PTA_FMA | PTA_MOVBE | PTA_RTM},
2996 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2997 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2998 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
2999 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3000 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3001 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3002 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3003 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3004 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3005 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3006 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3007 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3008 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3009 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3010 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3011 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3012 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3013 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3014 {"x86-64", PROCESSOR_K8, CPU_K8,
3015 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3016 {"k8", PROCESSOR_K8, CPU_K8,
3017 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3018 | PTA_SSE2 | PTA_NO_SAHF},
3019 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3020 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3021 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3022 {"opteron", PROCESSOR_K8, CPU_K8,
3023 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3024 | PTA_SSE2 | PTA_NO_SAHF},
3025 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3026 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3027 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3028 {"athlon64", PROCESSOR_K8, CPU_K8,
3029 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3030 | PTA_SSE2 | PTA_NO_SAHF},
3031 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3032 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3033 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3034 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3035 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3036 | PTA_SSE2 | PTA_NO_SAHF},
3037 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3038 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3039 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3040 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3041 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3042 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3043 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3044 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3045 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3046 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3047 | PTA_XOP | PTA_LWP},
3048 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3049 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3050 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3051 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3052 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3053 | PTA_FMA},
3054 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3055 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3056 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16},
3057 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3058 0 /* flags are only used for -march switch. */ },
3059 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3060 PTA_64BIT /* flags are only used for -march switch. */ },
3061 };
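
  /* For example, the "corei7" entry above means that -march=corei7 enables
     MMX, SSE through SSE4.2 and CX16.  Later in this function each PTA_*
     bit is translated into its ISA flag, roughly like this sketch (the
     real code repeats the pattern for every PTA_* flag and honors
     explicit user overrides):

       if (processor_alias_table[i].flags & PTA_SSE4_2
           && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
         ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
  */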
3062
3063 /* -mrecip options. */
3064 static struct
3065 {
3066 const char *string; /* option name */
3067 unsigned int mask; /* mask bits to set */
3068 }
3069 const recip_options[] =
3070 {
3071 { "all", RECIP_MASK_ALL },
3072 { "none", RECIP_MASK_NONE },
3073 { "div", RECIP_MASK_DIV },
3074 { "sqrt", RECIP_MASK_SQRT },
3075 { "vec-div", RECIP_MASK_VEC_DIV },
3076 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3077 };
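
  /* Sketch of the intended mapping (the parsing of the -mrecip= string
     happens later in this function): an option such as -mrecip=div,vec-sqrt
     would OR RECIP_MASK_DIV | RECIP_MASK_VEC_SQRT into the recip mask.  */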
3078
3079 int const pta_size = ARRAY_SIZE (processor_alias_table);
3080
3081 /* Set up prefix/suffix so the error messages refer to either the command
3082 line argument, or the attribute(target). */
3083 if (main_args_p)
3084 {
3085 prefix = "-m";
3086 suffix = "";
3087 sw = "switch";
3088 }
3089 else
3090 {
3091 prefix = "option(\"";
3092 suffix = "\")";
3093 sw = "attribute";
3094 }
3095
3096 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3097 SUBTARGET_OVERRIDE_OPTIONS;
3098 #endif
3099
3100 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3101 SUBSUBTARGET_OVERRIDE_OPTIONS;
3102 #endif
3103
3104 if (TARGET_X32)
3105 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3106
3107 /* -fPIC is the default for 64-bit Darwin (Mach-O). */
3108 if (TARGET_MACHO && TARGET_64BIT)
3109 flag_pic = 2;
3110
3111 /* Need to check -mtune=generic first. */
3112 if (ix86_tune_string)
3113 {
3114 if (!strcmp (ix86_tune_string, "generic")
3115 || !strcmp (ix86_tune_string, "i686")
3116 /* As special support for cross compilers we read -mtune=native
3117 as -mtune=generic. With native compilers we won't see
3118 -mtune=native, as it was already replaced by the driver. */
3119 || !strcmp (ix86_tune_string, "native"))
3120 {
3121 if (TARGET_64BIT)
3122 ix86_tune_string = "generic64";
3123 else
3124 ix86_tune_string = "generic32";
3125 }
3126 /* If this call is for setting the option attribute, allow the
3127 generic32/generic64 that was previously set. */
3128 else if (!main_args_p
3129 && (!strcmp (ix86_tune_string, "generic32")
3130 || !strcmp (ix86_tune_string, "generic64")))
3131 ;
3132 else if (!strncmp (ix86_tune_string, "generic", 7))
3133 error ("bad value (%s) for %stune=%s %s",
3134 ix86_tune_string, prefix, suffix, sw);
3135 else if (!strcmp (ix86_tune_string, "x86-64"))
3136 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3137 "%stune=k8%s or %stune=generic%s instead as appropriate",
3138 prefix, suffix, prefix, suffix, prefix, suffix);
3139 }
3140 else
3141 {
3142 if (ix86_arch_string)
3143 ix86_tune_string = ix86_arch_string;
3144 if (!ix86_tune_string)
3145 {
3146 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3147 ix86_tune_defaulted = 1;
3148 }
3149
3150 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3151 need to use a sensible tune option. */
3152 if (!strcmp (ix86_tune_string, "generic")
3153 || !strcmp (ix86_tune_string, "x86-64")
3154 || !strcmp (ix86_tune_string, "i686"))
3155 {
3156 if (TARGET_64BIT)
3157 ix86_tune_string = "generic64";
3158 else
3159 ix86_tune_string = "generic32";
3160 }
3161 }
3162
3163 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3164 {
3165 /* rep; movq isn't available in 32-bit code. */
3166 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3167 ix86_stringop_alg = no_stringop;
3168 }
3169
3170 if (!ix86_arch_string)
3171 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3172 else
3173 ix86_arch_specified = 1;
3174
3175 if (global_options_set.x_ix86_pmode)
3176 {
3177 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3178 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3179 error ("address mode %qs not supported in the %s bit mode",
3180 TARGET_64BIT ? "short" : "long",
3181 TARGET_64BIT ? "64" : "32");
3182 }
3183 else
3184 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3185
3186 if (!global_options_set.x_ix86_abi)
3187 ix86_abi = DEFAULT_ABI;
3188
3189 if (global_options_set.x_ix86_cmodel)
3190 {
3191 switch (ix86_cmodel)
3192 {
3193 case CM_SMALL:
3194 case CM_SMALL_PIC:
3195 if (flag_pic)
3196 ix86_cmodel = CM_SMALL_PIC;
3197 if (!TARGET_64BIT)
3198 error ("code model %qs not supported in the %s bit mode",
3199 "small", "32");
3200 break;
3201
3202 case CM_MEDIUM:
3203 case CM_MEDIUM_PIC:
3204 if (flag_pic)
3205 ix86_cmodel = CM_MEDIUM_PIC;
3206 if (!TARGET_64BIT)
3207 error ("code model %qs not supported in the %s bit mode",
3208 "medium", "32");
3209 else if (TARGET_X32)
3210 error ("code model %qs not supported in x32 mode",
3211 "medium");
3212 break;
3213
3214 case CM_LARGE:
3215 case CM_LARGE_PIC:
3216 if (flag_pic)
3217 ix86_cmodel = CM_LARGE_PIC;
3218 if (!TARGET_64BIT)
3219 error ("code model %qs not supported in the %s bit mode",
3220 "large", "32");
3221 else if (TARGET_X32)
3222 error ("code model %qs not supported in x32 mode",
3223 "medium");
3224 break;
3225
3226 case CM_32:
3227 if (flag_pic)
3228 error ("code model %s does not support PIC mode", "32");
3229 if (TARGET_64BIT)
3230 error ("code model %qs not supported in the %s bit mode",
3231 "32", "64");
3232 break;
3233
3234 case CM_KERNEL:
3235 if (flag_pic)
3236 {
3237 error ("code model %s does not support PIC mode", "kernel");
3238 ix86_cmodel = CM_32;
3239 }
3240 if (!TARGET_64BIT)
3241 error ("code model %qs not supported in the %s bit mode",
3242 "kernel", "32");
3243 break;
3244
3245 default:
3246 gcc_unreachable ();
3247 }
3248 }
3249 else
3250 {
3251 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3252 use of rip-relative addressing. This eliminates fixups that
3253 would otherwise be needed if this object is to be placed in a
3254 DLL, and is essentially just as efficient as direct addressing. */
3255 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3256 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3257 else if (TARGET_64BIT)
3258 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3259 else
3260 ix86_cmodel = CM_32;
3261 }
3262 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3263 {
3264 error ("-masm=intel not supported in this configuration");
3265 ix86_asm_dialect = ASM_ATT;
3266 }
3267 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3268 sorry ("%i-bit mode not compiled in",
3269 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3270
3271 for (i = 0; i < pta_size; i++)
3272 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3273 {
3274 ix86_schedule = processor_alias_table[i].schedule;
3275 ix86_arch = processor_alias_table[i].processor;
3276 /* Default cpu tuning to the architecture. */
3277 ix86_tune = ix86_arch;
3278
3279 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3280 error ("CPU you selected does not support x86-64 "
3281 "instruction set");
3282
3283 if (processor_alias_table[i].flags & PTA_MMX
3284 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3285 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3286 if (processor_alias_table[i].flags & PTA_3DNOW
3287 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3288 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3289 if (processor_alias_table[i].flags & PTA_3DNOW_A
3290 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3291 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3292 if (processor_alias_table[i].flags & PTA_SSE
3293 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3294 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3295 if (processor_alias_table[i].flags & PTA_SSE2
3296 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3297 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3298 if (processor_alias_table[i].flags & PTA_SSE3
3299 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3300 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3301 if (processor_alias_table[i].flags & PTA_SSSE3
3302 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3303 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3304 if (processor_alias_table[i].flags & PTA_SSE4_1
3305 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3306 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3307 if (processor_alias_table[i].flags & PTA_SSE4_2
3308 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3309 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3310 if (processor_alias_table[i].flags & PTA_AVX
3311 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3312 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3313 if (processor_alias_table[i].flags & PTA_AVX2
3314 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3315 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3316 if (processor_alias_table[i].flags & PTA_FMA
3317 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3318 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3319 if (processor_alias_table[i].flags & PTA_SSE4A
3320 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3321 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3322 if (processor_alias_table[i].flags & PTA_FMA4
3323 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3324 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3325 if (processor_alias_table[i].flags & PTA_XOP
3326 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3327 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3328 if (processor_alias_table[i].flags & PTA_LWP
3329 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3330 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3331 if (processor_alias_table[i].flags & PTA_ABM
3332 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3333 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3334 if (processor_alias_table[i].flags & PTA_BMI
3335 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3336 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3337 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3338 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3339 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3340 if (processor_alias_table[i].flags & PTA_TBM
3341 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3342 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3343 if (processor_alias_table[i].flags & PTA_BMI2
3344 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3345 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3346 if (processor_alias_table[i].flags & PTA_CX16
3347 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3348 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3349 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3350 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3351 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3352 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3353 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3354 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3355 if (processor_alias_table[i].flags & PTA_MOVBE
3356 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3357 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3358 if (processor_alias_table[i].flags & PTA_AES
3359 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3360 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3361 if (processor_alias_table[i].flags & PTA_PCLMUL
3362 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3363 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3364 if (processor_alias_table[i].flags & PTA_FSGSBASE
3365 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3366 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3367 if (processor_alias_table[i].flags & PTA_RDRND
3368 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3369 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3370 if (processor_alias_table[i].flags & PTA_F16C
3371 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3372 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3373 if (processor_alias_table[i].flags & PTA_RTM
3374 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3375 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3376 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3377 x86_prefetch_sse = true;
3378
3379 break;
3380 }
3381
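/* Illustrative sketch (not part of GCC): the -march loop above promotes each
   PTA_* capability bit of the selected table entry into ix86_isa_flags only
   when the user has not set that ISA explicitly.  A minimal model of the
   pattern, with hypothetical names entry_flags / isa_flags / isa_explicit:

     if ((entry_flags & PTA_BMI)
         && !(isa_explicit & OPTION_MASK_ISA_BMI))
       isa_flags |= OPTION_MASK_ISA_BMI;

   So -march=bdver2 turns on BMI by default, but -march=bdver2 -mno-bmi
   leaves it off, because -mno-bmi records OPTION_MASK_ISA_BMI in the
   explicit mask.  */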
3382 if (!strcmp (ix86_arch_string, "generic"))
3383 error ("generic CPU can be used only for %stune=%s %s",
3384 prefix, suffix, sw);
3385 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3386 error ("bad value (%s) for %sarch=%s %s",
3387 ix86_arch_string, prefix, suffix, sw);
3388
3389 ix86_arch_mask = 1u << ix86_arch;
3390 for (i = 0; i < X86_ARCH_LAST; ++i)
3391 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3392
3393 for (i = 0; i < pta_size; i++)
3394 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3395 {
3396 ix86_schedule = processor_alias_table[i].schedule;
3397 ix86_tune = processor_alias_table[i].processor;
3398 if (TARGET_64BIT)
3399 {
3400 if (!(processor_alias_table[i].flags & PTA_64BIT))
3401 {
3402 if (ix86_tune_defaulted)
3403 {
3404 ix86_tune_string = "x86-64";
3405 for (i = 0; i < pta_size; i++)
3406 if (! strcmp (ix86_tune_string,
3407 processor_alias_table[i].name))
3408 break;
3409 ix86_schedule = processor_alias_table[i].schedule;
3410 ix86_tune = processor_alias_table[i].processor;
3411 }
3412 else
3413 error ("CPU you selected does not support x86-64 "
3414 "instruction set");
3415 }
3416 }
3417 else
3418 {
3419 /* Adjust tuning when compiling for 32-bit ABI. */
3420 switch (ix86_tune)
3421 {
3422 case PROCESSOR_GENERIC64:
3423 ix86_tune = PROCESSOR_GENERIC32;
3424 ix86_schedule = CPU_PENTIUMPRO;
3425 break;
3426
3427 case PROCESSOR_CORE2_64:
3428 ix86_tune = PROCESSOR_CORE2_32;
3429 break;
3430
3431 case PROCESSOR_COREI7_64:
3432 ix86_tune = PROCESSOR_COREI7_32;
3433 break;
3434
3435 default:
3436 break;
3437 }
3438 }
3439 /* Intel CPUs have always interpreted SSE prefetch instructions as
3440 NOPs; so, we can enable SSE prefetch instructions even when
3441 -mtune (rather than -march) points us to a processor that has them.
3442 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3443 higher processors. */
3444 if (TARGET_CMOVE
3445 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3446 x86_prefetch_sse = true;
3447 break;
3448 }
3449
3450 if (ix86_tune_specified && i == pta_size)
3451 error ("bad value (%s) for %stune=%s %s",
3452 ix86_tune_string, prefix, suffix, sw);
3453
3454 ix86_tune_mask = 1u << ix86_tune;
3455 for (i = 0; i < X86_TUNE_LAST; ++i)
3456 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3457
3458 #ifndef USE_IX86_FRAME_POINTER
3459 #define USE_IX86_FRAME_POINTER 0
3460 #endif
3461
3462 #ifndef USE_X86_64_FRAME_POINTER
3463 #define USE_X86_64_FRAME_POINTER 0
3464 #endif
3465
3466 /* Set the default values for switches whose default depends on TARGET_64BIT
3467 in case they weren't overwritten by command line options. */
3468 if (TARGET_64BIT)
3469 {
3470 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3471 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3472 if (flag_asynchronous_unwind_tables == 2)
3473 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3474 if (flag_pcc_struct_return == 2)
3475 flag_pcc_struct_return = 0;
3476 }
3477 else
3478 {
3479 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3480 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3481 if (flag_asynchronous_unwind_tables == 2)
3482 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3483 if (flag_pcc_struct_return == 2)
3484 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3485 }
3486
3487 if (optimize_size)
3488 ix86_cost = &ix86_size_cost;
3489 else
3490 ix86_cost = processor_target_table[ix86_tune].cost;
3491
3492 /* Arrange to set up i386_stack_locals for all functions. */
3493 init_machine_status = ix86_init_machine_status;
3494
3495 /* Validate -mregparm= value. */
3496 if (global_options_set.x_ix86_regparm)
3497 {
3498 if (TARGET_64BIT)
3499 warning (0, "-mregparm is ignored in 64-bit mode");
3500 if (ix86_regparm > REGPARM_MAX)
3501 {
3502 error ("-mregparm=%d is not between 0 and %d",
3503 ix86_regparm, REGPARM_MAX);
3504 ix86_regparm = 0;
3505 }
3506 }
3507 if (TARGET_64BIT)
3508 ix86_regparm = REGPARM_MAX;
3509
3510 /* Default align_* from the processor table. */
3511 if (align_loops == 0)
3512 {
3513 align_loops = processor_target_table[ix86_tune].align_loop;
3514 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3515 }
3516 if (align_jumps == 0)
3517 {
3518 align_jumps = processor_target_table[ix86_tune].align_jump;
3519 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3520 }
3521 if (align_functions == 0)
3522 {
3523 align_functions = processor_target_table[ix86_tune].align_func;
3524 }
3525
3526 /* Provide default for -mbranch-cost= value. */
3527 if (!global_options_set.x_ix86_branch_cost)
3528 ix86_branch_cost = ix86_cost->branch_cost;
3529
3530 if (TARGET_64BIT)
3531 {
3532 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3533
3534 /* Enable by default the SSE and MMX builtins. Do allow the user to
3535 explicitly disable any of these. In particular, disabling SSE and
3536 MMX for kernel code is extremely useful. */
3537 if (!ix86_arch_specified)
3538 ix86_isa_flags
3539 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3540 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3541
3542 if (TARGET_RTD)
3543 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3544 }
3545 else
3546 {
3547 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3548
3549 if (!ix86_arch_specified)
3550 ix86_isa_flags
3551 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3552
3553 /* The i386 ABI does not specify a red zone. It still makes sense to use one
3554 when the programmer takes care to keep the stack from being destroyed. */
3555 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3556 target_flags |= MASK_NO_RED_ZONE;
3557 }
3558
3559 /* Keep nonleaf frame pointers. */
3560 if (flag_omit_frame_pointer)
3561 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3562 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3563 flag_omit_frame_pointer = 1;
3564
3565 /* If we're doing fast math, we don't care about comparison order
3566 wrt NaNs. This lets us use a shorter comparison sequence. */
3567 if (flag_finite_math_only)
3568 target_flags &= ~MASK_IEEE_FP;
3569
3570 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3571 since the insns won't need emulation. */
3572 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3573 target_flags &= ~MASK_NO_FANCY_MATH_387;
3574
3575 /* Likewise, if the target doesn't have a 387, or we've specified
3576 software floating point, don't use 387 inline intrinsics. */
3577 if (!TARGET_80387)
3578 target_flags |= MASK_NO_FANCY_MATH_387;
3579
3580 /* Turn on MMX builtins for -msse. */
3581 if (TARGET_SSE)
3582 {
3583 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3584 x86_prefetch_sse = true;
3585 }
3586
3587 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3588 if (TARGET_SSE4_2 || TARGET_ABM)
3589 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3590
3591 /* Turn on lzcnt instruction for -mabm. */
3592 if (TARGET_ABM)
3593 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3594
3595 /* Validate -mpreferred-stack-boundary= value or default it to
3596 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3597 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3598 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3599 {
3600 int min = (TARGET_64BIT ? 4 : 2);
3601 int max = (TARGET_SEH ? 4 : 12);
3602
3603 if (ix86_preferred_stack_boundary_arg < min
3604 || ix86_preferred_stack_boundary_arg > max)
3605 {
3606 if (min == max)
3607 error ("-mpreferred-stack-boundary is not supported "
3608 "for this target");
3609 else
3610 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3611 ix86_preferred_stack_boundary_arg, min, max);
3612 }
3613 else
3614 ix86_preferred_stack_boundary
3615 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3616 }
3617
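/* Worked example (illustrative): the option value is the log2 of the byte
   alignment, so -mpreferred-stack-boundary=4 yields
   (1 << 4) * BITS_PER_UNIT = 16 * 8 = 128 bits, i.e. a 16-byte aligned
   stack.  The 64-bit minimum of 4 thus matches the psABI's 16-byte
   requirement, and the 32-bit minimum of 2 corresponds to 4-byte
   alignment.  */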
3618 /* Set the default value for -mstackrealign. */
3619 if (ix86_force_align_arg_pointer == -1)
3620 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3621
3622 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3623
3624 /* Validate -mincoming-stack-boundary= value or default it to
3625 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3626 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3627 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3628 {
3629 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3630 || ix86_incoming_stack_boundary_arg > 12)
3631 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3632 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3633 else
3634 {
3635 ix86_user_incoming_stack_boundary
3636 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3637 ix86_incoming_stack_boundary
3638 = ix86_user_incoming_stack_boundary;
3639 }
3640 }
3641
3642 /* Accept -msseregparm only if at least SSE support is enabled. */
3643 if (TARGET_SSEREGPARM
3644 && ! TARGET_SSE)
3645 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3646
3647 if (global_options_set.x_ix86_fpmath)
3648 {
3649 if (ix86_fpmath & FPMATH_SSE)
3650 {
3651 if (!TARGET_SSE)
3652 {
3653 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3654 ix86_fpmath = FPMATH_387;
3655 }
3656 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3657 {
3658 warning (0, "387 instruction set disabled, using SSE arithmetics");
3659 ix86_fpmath = FPMATH_SSE;
3660 }
3661 }
3662 }
3663 else
3664 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3665
3666 /* If the i387 is disabled, then do not return values in it. */
3667 if (!TARGET_80387)
3668 target_flags &= ~MASK_FLOAT_RETURNS;
3669
3670 /* Use external vectorized library in vectorizing intrinsics. */
3671 if (global_options_set.x_ix86_veclibabi_type)
3672 switch (ix86_veclibabi_type)
3673 {
3674 case ix86_veclibabi_type_svml:
3675 ix86_veclib_handler = ix86_veclibabi_svml;
3676 break;
3677
3678 case ix86_veclibabi_type_acml:
3679 ix86_veclib_handler = ix86_veclibabi_acml;
3680 break;
3681
3682 default:
3683 gcc_unreachable ();
3684 }
3685
3686 if ((!USE_IX86_FRAME_POINTER
3687 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3688 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3689 && !optimize_size)
3690 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3691
3692 /* ??? Unwind info is not correct around the CFG unless either a frame
3693 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3694 unwind info generation to be aware of the CFG and propagating states
3695 around edges. */
3696 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3697 || flag_exceptions || flag_non_call_exceptions)
3698 && flag_omit_frame_pointer
3699 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3700 {
3701 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3702 warning (0, "unwind tables currently require either a frame pointer "
3703 "or %saccumulate-outgoing-args%s for correctness",
3704 prefix, suffix);
3705 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3706 }
3707
3708 /* If stack probes are required, the space used for large function
3709 arguments on the stack must also be probed, so enable
3710 -maccumulate-outgoing-args so this happens in the prologue. */
3711 if (TARGET_STACK_PROBE
3712 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3713 {
3714 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3715 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3716 "for correctness", prefix, suffix);
3717 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3718 }
3719
3720 /* For sane SSE instruction set generation we need the fcomi instruction.
3721 It is safe to enable all CMOVE instructions. Also, the RDRAND intrinsic
3722 expands to a sequence that includes a conditional move. */
3723 if (TARGET_SSE || TARGET_RDRND)
3724 TARGET_CMOVE = 1;
3725
3726 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3727 {
3728 char *p;
3729 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3730 p = strchr (internal_label_prefix, 'X');
3731 internal_label_prefix_len = p - internal_label_prefix;
3732 *p = '\0';
3733 }
3734
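/* Worked example (illustrative): on a typical ELF configuration
   ASM_GENERATE_INTERNAL_LABEL produces something like "*.LX0" for prefix
   "LX" and number 0; strchr then finds the 'X', so internal_label_prefix
   becomes "*.L" and internal_label_prefix_len is 3.  The exact string is
   target-dependent.  */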
3735 /* When the scheduling description is not available, disable the scheduler
3736 pass so it won't slow down compilation or make x87 code slower. */
3737 if (!TARGET_SCHEDULE)
3738 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3739
3740 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3741 ix86_cost->simultaneous_prefetches,
3742 global_options.x_param_values,
3743 global_options_set.x_param_values);
3744 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3745 global_options.x_param_values,
3746 global_options_set.x_param_values);
3747 maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3748 global_options.x_param_values,
3749 global_options_set.x_param_values);
3750 maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3751 global_options.x_param_values,
3752 global_options_set.x_param_values);
3753
3754 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3755 if (flag_prefetch_loop_arrays < 0
3756 && HAVE_prefetch
3757 && optimize >= 3
3758 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3759 flag_prefetch_loop_arrays = 1;
3760
3761 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3762 can be optimized to ap = __builtin_next_arg (0). */
3763 if (!TARGET_64BIT && !flag_split_stack)
3764 targetm.expand_builtin_va_start = NULL;
3765
3766 if (TARGET_64BIT)
3767 {
3768 ix86_gen_leave = gen_leave_rex64;
3769 if (Pmode == DImode)
3770 {
3771 ix86_gen_monitor = gen_sse3_monitor64_di;
3772 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3773 ix86_gen_tls_local_dynamic_base_64
3774 = gen_tls_local_dynamic_base_64_di;
3775 }
3776 else
3777 {
3778 ix86_gen_monitor = gen_sse3_monitor64_si;
3779 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3780 ix86_gen_tls_local_dynamic_base_64
3781 = gen_tls_local_dynamic_base_64_si;
3782 }
3783 }
3784 else
3785 {
3786 ix86_gen_leave = gen_leave;
3787 ix86_gen_monitor = gen_sse3_monitor;
3788 }
3789
3790 if (Pmode == DImode)
3791 {
3792 ix86_gen_add3 = gen_adddi3;
3793 ix86_gen_sub3 = gen_subdi3;
3794 ix86_gen_sub3_carry = gen_subdi3_carry;
3795 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3796 ix86_gen_andsp = gen_anddi3;
3797 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3798 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3799 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3800 }
3801 else
3802 {
3803 ix86_gen_add3 = gen_addsi3;
3804 ix86_gen_sub3 = gen_subsi3;
3805 ix86_gen_sub3_carry = gen_subsi3_carry;
3806 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3807 ix86_gen_andsp = gen_andsi3;
3808 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3809 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3810 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3811 }
3812
3813 #ifdef USE_IX86_CLD
3814 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3815 if (!TARGET_64BIT)
3816 target_flags |= MASK_CLD & ~target_flags_explicit;
3817 #endif
3818
3819 if (!TARGET_64BIT && flag_pic)
3820 {
3821 if (flag_fentry > 0)
3822 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3823 "with -fpic");
3824 flag_fentry = 0;
3825 }
3826 else if (TARGET_SEH)
3827 {
3828 if (flag_fentry == 0)
3829 sorry ("-mno-fentry isn%'t compatible with SEH");
3830 flag_fentry = 1;
3831 }
3832 else if (flag_fentry < 0)
3833 {
3834 #if defined(PROFILE_BEFORE_PROLOGUE)
3835 flag_fentry = 1;
3836 #else
3837 flag_fentry = 0;
3838 #endif
3839 }
3840
3841 if (TARGET_AVX)
3842 {
3843 /* When not optimizing for size, enable the vzeroupper optimization for
3844 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3845 AVX unaligned loads/stores. */
3846 if (!optimize_size)
3847 {
3848 if (flag_expensive_optimizations
3849 && !(target_flags_explicit & MASK_VZEROUPPER))
3850 target_flags |= MASK_VZEROUPPER;
3851 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3852 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3853 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3854 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3855 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3856 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3857 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3858 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3859 target_flags |= MASK_PREFER_AVX128;
3860 }
3861 }
3862 else
3863 {
3864 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3865 target_flags &= ~MASK_VZEROUPPER;
3866 }
3867
3868 if (ix86_recip_name)
3869 {
3870 char *p = ASTRDUP (ix86_recip_name);
3871 char *q;
3872 unsigned int mask, i;
3873 bool invert;
3874
3875 while ((q = strtok (p, ",")) != NULL)
3876 {
3877 p = NULL;
3878 if (*q == '!')
3879 {
3880 invert = true;
3881 q++;
3882 }
3883 else
3884 invert = false;
3885
3886 if (!strcmp (q, "default"))
3887 mask = RECIP_MASK_ALL;
3888 else
3889 {
3890 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3891 if (!strcmp (q, recip_options[i].string))
3892 {
3893 mask = recip_options[i].mask;
3894 break;
3895 }
3896
3897 if (i == ARRAY_SIZE (recip_options))
3898 {
3899 error ("unknown option for -mrecip=%s", q);
3900 invert = false;
3901 mask = RECIP_MASK_NONE;
3902 }
3903 }
3904
3905 recip_mask_explicit |= mask;
3906 if (invert)
3907 recip_mask &= ~mask;
3908 else
3909 recip_mask |= mask;
3910 }
3911 }
3912
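/* Worked example (illustrative): for -mrecip=all,!sqrt the loop above first
   handles "all" (mask = RECIP_MASK_ALL, OR'ed into recip_mask), then "!sqrt"
   (invert = true, mask = RECIP_MASK_SQRT, cleared from recip_mask), leaving
   every reciprocal approximation enabled except the scalar sqrt one.  Both
   masks are also recorded in recip_mask_explicit, so the TARGET_RECIP
   defaulting just below cannot re-enable bits that were named explicitly.  */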
3913 if (TARGET_RECIP)
3914 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3915 else if (target_flags_explicit & MASK_RECIP)
3916 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3917
3918 /* Save the initial options in case the user uses function-specific
3919 options. */
3920 if (main_args_p)
3921 target_option_default_node = target_option_current_node
3922 = build_target_option_node ();
3923 }
3924
3925 /* Return TRUE if VAL is passed in a register with a 256bit AVX mode. */
3926
3927 static bool
3928 function_pass_avx256_p (const_rtx val)
3929 {
3930 if (!val)
3931 return false;
3932
3933 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3934 return true;
3935
3936 if (GET_CODE (val) == PARALLEL)
3937 {
3938 int i;
3939 rtx r;
3940
3941 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3942 {
3943 r = XVECEXP (val, 0, i);
3944 if (GET_CODE (r) == EXPR_LIST
3945 && XEXP (r, 0)
3946 && REG_P (XEXP (r, 0))
3947 && (GET_MODE (XEXP (r, 0)) == OImode
3948 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
3949 return true;
3950 }
3951 }
3952
3953 return false;
3954 }
3955
3956 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3957
3958 static void
3959 ix86_option_override (void)
3960 {
3961 ix86_option_override_internal (true);
3962 }
3963
3964 /* Update register usage after having seen the compiler flags. */
3965
3966 static void
3967 ix86_conditional_register_usage (void)
3968 {
3969 int i;
3970 unsigned int j;
3971
3972 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3973 {
3974 if (fixed_regs[i] > 1)
3975 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
3976 if (call_used_regs[i] > 1)
3977 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
3978 }
3979
3980 /* The PIC register, if it exists, is fixed. */
3981 j = PIC_OFFSET_TABLE_REGNUM;
3982 if (j != INVALID_REGNUM)
3983 fixed_regs[j] = call_used_regs[j] = 1;
3984
3985 /* The 64-bit MS_ABI changes the set of call-used registers. */
3986 if (TARGET_64BIT_MS_ABI)
3987 {
3988 call_used_regs[SI_REG] = 0;
3989 call_used_regs[DI_REG] = 0;
3990 call_used_regs[XMM6_REG] = 0;
3991 call_used_regs[XMM7_REG] = 0;
3992 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3993 call_used_regs[i] = 0;
3994 }
3995
3996 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
3997 other call-clobbered regs for 64-bit. */
3998 if (TARGET_64BIT)
3999 {
4000 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4001
4002 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4003 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4004 && call_used_regs[i])
4005 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4006 }
4007
4008 /* If MMX is disabled, squash the registers. */
4009 if (! TARGET_MMX)
4010 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4011 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4012 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4013
4014 /* If SSE is disabled, squash the registers. */
4015 if (! TARGET_SSE)
4016 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4017 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4018 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4019
4020 /* If the FPU is disabled, squash the registers. */
4021 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4022 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4023 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4024 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4025
4026 /* If 32-bit, squash the 64-bit registers. */
4027 if (! TARGET_64BIT)
4028 {
4029 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4030 reg_names[i] = "";
4031 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4032 reg_names[i] = "";
4033 }
4034 }
4035
4036 \f
4037 /* Save the current options */
4038
4039 static void
4040 ix86_function_specific_save (struct cl_target_option *ptr)
4041 {
4042 ptr->arch = ix86_arch;
4043 ptr->schedule = ix86_schedule;
4044 ptr->tune = ix86_tune;
4045 ptr->branch_cost = ix86_branch_cost;
4046 ptr->tune_defaulted = ix86_tune_defaulted;
4047 ptr->arch_specified = ix86_arch_specified;
4048 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4049 ptr->ix86_target_flags_explicit = target_flags_explicit;
4050 ptr->x_recip_mask_explicit = recip_mask_explicit;
4051
4052 /* The fields are char but the variables are not; make sure the
4053 values fit in the fields. */
4054 gcc_assert (ptr->arch == ix86_arch);
4055 gcc_assert (ptr->schedule == ix86_schedule);
4056 gcc_assert (ptr->tune == ix86_tune);
4057 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4058 }
4059
4060 /* Restore the current options */
4061
4062 static void
4063 ix86_function_specific_restore (struct cl_target_option *ptr)
4064 {
4065 enum processor_type old_tune = ix86_tune;
4066 enum processor_type old_arch = ix86_arch;
4067 unsigned int ix86_arch_mask, ix86_tune_mask;
4068 int i;
4069
4070 ix86_arch = (enum processor_type) ptr->arch;
4071 ix86_schedule = (enum attr_cpu) ptr->schedule;
4072 ix86_tune = (enum processor_type) ptr->tune;
4073 ix86_branch_cost = ptr->branch_cost;
4074 ix86_tune_defaulted = ptr->tune_defaulted;
4075 ix86_arch_specified = ptr->arch_specified;
4076 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4077 target_flags_explicit = ptr->ix86_target_flags_explicit;
4078 recip_mask_explicit = ptr->x_recip_mask_explicit;
4079
4080 /* Recreate the arch feature tests if the arch changed */
4081 if (old_arch != ix86_arch)
4082 {
4083 ix86_arch_mask = 1u << ix86_arch;
4084 for (i = 0; i < X86_ARCH_LAST; ++i)
4085 ix86_arch_features[i]
4086 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4087 }
4088
4089 /* Recreate the tune optimization tests */
4090 if (old_tune != ix86_tune)
4091 {
4092 ix86_tune_mask = 1u << ix86_tune;
4093 for (i = 0; i < X86_TUNE_LAST; ++i)
4094 ix86_tune_features[i]
4095 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4096 }
4097 }
4098
4099 /* Print the current options */
4100
4101 static void
4102 ix86_function_specific_print (FILE *file, int indent,
4103 struct cl_target_option *ptr)
4104 {
4105 char *target_string
4106 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4107 NULL, NULL, ptr->x_ix86_fpmath, false);
4108
4109 fprintf (file, "%*sarch = %d (%s)\n",
4110 indent, "",
4111 ptr->arch,
4112 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4113 ? cpu_names[ptr->arch]
4114 : "<unknown>"));
4115
4116 fprintf (file, "%*stune = %d (%s)\n",
4117 indent, "",
4118 ptr->tune,
4119 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4120 ? cpu_names[ptr->tune]
4121 : "<unknown>"));
4122
4123 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4124
4125 if (target_string)
4126 {
4127 fprintf (file, "%*s%s\n", indent, "", target_string);
4128 free (target_string);
4129 }
4130 }
4131
4132 \f
4133 /* Inner function to process the attribute((target(...))), take an argument and
4134 set the current options from the argument. If we have a list, recursively go
4135 over the list. */
4136
4137 static bool
4138 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4139 struct gcc_options *enum_opts_set)
4140 {
4141 char *next_optstr;
4142 bool ret = true;
4143
4144 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4145 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4146 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4147 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4148 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4149
4150 enum ix86_opt_type
4151 {
4152 ix86_opt_unknown,
4153 ix86_opt_yes,
4154 ix86_opt_no,
4155 ix86_opt_str,
4156 ix86_opt_enum,
4157 ix86_opt_isa
4158 };
4159
4160 static const struct
4161 {
4162 const char *string;
4163 size_t len;
4164 enum ix86_opt_type type;
4165 int opt;
4166 int mask;
4167 } attrs[] = {
4168 /* isa options */
4169 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4170 IX86_ATTR_ISA ("abm", OPT_mabm),
4171 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4172 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4173 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4174 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4175 IX86_ATTR_ISA ("aes", OPT_maes),
4176 IX86_ATTR_ISA ("avx", OPT_mavx),
4177 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4178 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4179 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4180 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4181 IX86_ATTR_ISA ("sse", OPT_msse),
4182 IX86_ATTR_ISA ("sse2", OPT_msse2),
4183 IX86_ATTR_ISA ("sse3", OPT_msse3),
4184 IX86_ATTR_ISA ("sse4", OPT_msse4),
4185 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4186 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4187 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4188 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4189 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4190 IX86_ATTR_ISA ("fma", OPT_mfma),
4191 IX86_ATTR_ISA ("xop", OPT_mxop),
4192 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4193 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4194 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4195 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4196 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4197
4198 /* enum options */
4199 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4200
4201 /* string options */
4202 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4203 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4204
4205 /* flag options */
4206 IX86_ATTR_YES ("cld",
4207 OPT_mcld,
4208 MASK_CLD),
4209
4210 IX86_ATTR_NO ("fancy-math-387",
4211 OPT_mfancy_math_387,
4212 MASK_NO_FANCY_MATH_387),
4213
4214 IX86_ATTR_YES ("ieee-fp",
4215 OPT_mieee_fp,
4216 MASK_IEEE_FP),
4217
4218 IX86_ATTR_YES ("inline-all-stringops",
4219 OPT_minline_all_stringops,
4220 MASK_INLINE_ALL_STRINGOPS),
4221
4222 IX86_ATTR_YES ("inline-stringops-dynamically",
4223 OPT_minline_stringops_dynamically,
4224 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4225
4226 IX86_ATTR_NO ("align-stringops",
4227 OPT_mno_align_stringops,
4228 MASK_NO_ALIGN_STRINGOPS),
4229
4230 IX86_ATTR_YES ("recip",
4231 OPT_mrecip,
4232 MASK_RECIP),
4233
4234 };
4235
4236 /* If this is a list, recurse to get the options. */
4237 if (TREE_CODE (args) == TREE_LIST)
4238 {
4239 bool ret = true;
4240
4241 for (; args; args = TREE_CHAIN (args))
4242 if (TREE_VALUE (args)
4243 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4244 p_strings, enum_opts_set))
4245 ret = false;
4246
4247 return ret;
4248 }
4249
4250 else if (TREE_CODE (args) != STRING_CST)
4251 gcc_unreachable ();
4252
4253 /* Handle multiple arguments separated by commas. */
4254 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4255
4256 while (next_optstr && *next_optstr != '\0')
4257 {
4258 char *p = next_optstr;
4259 char *orig_p = p;
4260 char *comma = strchr (next_optstr, ',');
4261 const char *opt_string;
4262 size_t len, opt_len;
4263 int opt;
4264 bool opt_set_p;
4265 char ch;
4266 unsigned i;
4267 enum ix86_opt_type type = ix86_opt_unknown;
4268 int mask = 0;
4269
4270 if (comma)
4271 {
4272 *comma = '\0';
4273 len = comma - next_optstr;
4274 next_optstr = comma + 1;
4275 }
4276 else
4277 {
4278 len = strlen (p);
4279 next_optstr = NULL;
4280 }
4281
4282 /* Recognize no-xxx. */
4283 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4284 {
4285 opt_set_p = false;
4286 p += 3;
4287 len -= 3;
4288 }
4289 else
4290 opt_set_p = true;
4291
4292 /* Find the option. */
4293 ch = *p;
4294 opt = N_OPTS;
4295 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4296 {
4297 type = attrs[i].type;
4298 opt_len = attrs[i].len;
4299 if (ch == attrs[i].string[0]
4300 && ((type != ix86_opt_str && type != ix86_opt_enum)
4301 ? len == opt_len
4302 : len > opt_len)
4303 && memcmp (p, attrs[i].string, opt_len) == 0)
4304 {
4305 opt = attrs[i].opt;
4306 mask = attrs[i].mask;
4307 opt_string = attrs[i].string;
4308 break;
4309 }
4310 }
4311
4312 /* Process the option. */
4313 if (opt == N_OPTS)
4314 {
4315 error ("attribute(target(\"%s\")) is unknown", orig_p);
4316 ret = false;
4317 }
4318
4319 else if (type == ix86_opt_isa)
4320 {
4321 struct cl_decoded_option decoded;
4322
4323 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4324 ix86_handle_option (&global_options, &global_options_set,
4325 &decoded, input_location);
4326 }
4327
4328 else if (type == ix86_opt_yes || type == ix86_opt_no)
4329 {
4330 if (type == ix86_opt_no)
4331 opt_set_p = !opt_set_p;
4332
4333 if (opt_set_p)
4334 target_flags |= mask;
4335 else
4336 target_flags &= ~mask;
4337 }
4338
4339 else if (type == ix86_opt_str)
4340 {
4341 if (p_strings[opt])
4342 {
4343 error ("option(\"%s\") was already specified", opt_string);
4344 ret = false;
4345 }
4346 else
4347 p_strings[opt] = xstrdup (p + opt_len);
4348 }
4349
4350 else if (type == ix86_opt_enum)
4351 {
4352 bool arg_ok;
4353 int value;
4354
4355 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4356 if (arg_ok)
4357 set_option (&global_options, enum_opts_set, opt, value,
4358 p + opt_len, DK_UNSPECIFIED, input_location,
4359 global_dc);
4360 else
4361 {
4362 error ("attribute(target(\"%s\")) is unknown", orig_p);
4363 ret = false;
4364 }
4365 }
4366
4367 else
4368 gcc_unreachable ();
4369 }
4370
4371 return ret;
4372 }
4373
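/* Usage example (illustrative, not part of GCC): the parser above accepts a
   comma-separated list mixing ISA options, "no-" forms, flag options and
   string/enum options, e.g.

     __attribute__((target("sse4.2,no-avx,arch=k8,fpmath=sse")))
     int foo (int x) { return x + 1; }

   Here "sse4.2" and "no-avx" are routed through ix86_handle_option,
   "arch=k8" is saved in p_strings[IX86_FUNCTION_SPECIFIC_ARCH], and
   "fpmath=sse" is resolved via opt_enum_arg_to_value.  */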
4374 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4375
4376 tree
4377 ix86_valid_target_attribute_tree (tree args)
4378 {
4379 const char *orig_arch_string = ix86_arch_string;
4380 const char *orig_tune_string = ix86_tune_string;
4381 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4382 int orig_tune_defaulted = ix86_tune_defaulted;
4383 int orig_arch_specified = ix86_arch_specified;
4384 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4385 tree t = NULL_TREE;
4386 int i;
4387 struct cl_target_option *def
4388 = TREE_TARGET_OPTION (target_option_default_node);
4389 struct gcc_options enum_opts_set;
4390
4391 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4392
4393 /* Process each of the options on the chain. */
4394 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4395 &enum_opts_set))
4396 return NULL_TREE;
4397
4398 /* If the changed options are different from the default, rerun
4399 ix86_option_override_internal, and then save the options away.
4400 The string options are attribute options, and will be undone
4401 when we copy the save structure. */
4402 if (ix86_isa_flags != def->x_ix86_isa_flags
4403 || target_flags != def->x_target_flags
4404 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4405 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4406 || enum_opts_set.x_ix86_fpmath)
4407 {
4408 /* If we are using the default tune= or arch=, undo the string assigned,
4409 and use the default. */
4410 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4411 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4412 else if (!orig_arch_specified)
4413 ix86_arch_string = NULL;
4414
4415 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4416 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4417 else if (orig_tune_defaulted)
4418 ix86_tune_string = NULL;
4419
4420 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4421 if (enum_opts_set.x_ix86_fpmath)
4422 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4423 else if (!TARGET_64BIT && TARGET_SSE)
4424 {
4425 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4426 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4427 }
4428
4429 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4430 ix86_option_override_internal (false);
4431
4432 /* Add any builtin functions with the new isa if any. */
4433 ix86_add_new_builtins (ix86_isa_flags);
4434
4435 /* Save the current options unless we are validating options for
4436 #pragma. */
4437 t = build_target_option_node ();
4438
4439 ix86_arch_string = orig_arch_string;
4440 ix86_tune_string = orig_tune_string;
4441 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4442
4443 /* Free up memory allocated to hold the strings */
4444 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4445 free (option_strings[i]);
4446 }
4447
4448 return t;
4449 }
4450
4451 /* Hook to validate attribute((target("string"))). */
4452
4453 static bool
4454 ix86_valid_target_attribute_p (tree fndecl,
4455 tree ARG_UNUSED (name),
4456 tree args,
4457 int ARG_UNUSED (flags))
4458 {
4459 struct cl_target_option cur_target;
4460 bool ret = true;
4461 tree old_optimize = build_optimization_node ();
4462 tree new_target, new_optimize;
4463 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4464
4465 /* If the function changed the optimization levels as well as setting target
4466 options, start with the optimizations specified. */
4467 if (func_optimize && func_optimize != old_optimize)
4468 cl_optimization_restore (&global_options,
4469 TREE_OPTIMIZATION (func_optimize));
4470
4471 /* The target attributes may also change some optimization flags, so update
4472 the optimization options if necessary. */
4473 cl_target_option_save (&cur_target, &global_options);
4474 new_target = ix86_valid_target_attribute_tree (args);
4475 new_optimize = build_optimization_node ();
4476
4477 if (!new_target)
4478 ret = false;
4479
4480 else if (fndecl)
4481 {
4482 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4483
4484 if (old_optimize != new_optimize)
4485 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4486 }
4487
4488 cl_target_option_restore (&global_options, &cur_target);
4489
4490 if (old_optimize != new_optimize)
4491 cl_optimization_restore (&global_options,
4492 TREE_OPTIMIZATION (old_optimize));
4493
4494 return ret;
4495 }
4496
4497 \f
4498 /* Hook to determine if one function can safely inline another. */
4499
4500 static bool
4501 ix86_can_inline_p (tree caller, tree callee)
4502 {
4503 bool ret = false;
4504 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4505 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4506
4507 /* If callee has no option attributes, then it is ok to inline. */
4508 if (!callee_tree)
4509 ret = true;
4510
4511 /* If caller has no option attributes, but callee does then it is not ok to
4512 inline. */
4513 else if (!caller_tree)
4514 ret = false;
4515
4516 else
4517 {
4518 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4519 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4520
4521 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
4522 function can inline an SSE2 function but an SSE2 function can't inline
4523 an SSE4 function. */
4524 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4525 != callee_opts->x_ix86_isa_flags)
4526 ret = false;
4527
4528 /* See if we have the same non-isa options. */
4529 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4530 ret = false;
4531
4532 /* See if arch, tune, etc. are the same. */
4533 else if (caller_opts->arch != callee_opts->arch)
4534 ret = false;
4535
4536 else if (caller_opts->tune != callee_opts->tune)
4537 ret = false;
4538
4539 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4540 ret = false;
4541
4542 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4543 ret = false;
4544
4545 else
4546 ret = true;
4547 }
4548
4549 return ret;
4550 }
4551
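/* Usage example (illustrative): under the subset rule above, a callee whose
   ISA flags are contained in the caller's may be inlined, e.g.

     static inline int __attribute__((target("sse2")))
     callee (int x) { return x + 1; }

     int __attribute__((target("sse4.2")))
     caller (int x) { return callee (x); }

   -msse4.2 implies SSE2, so callee's x_ix86_isa_flags are a subset of
   caller's and inlining is allowed; swapping the two attributes makes the
   subset test fail and inlining is refused.  */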
4552 \f
4553 /* Remember the last target of ix86_set_current_function. */
4554 static GTY(()) tree ix86_previous_fndecl;
4555
4556 /* Establish appropriate back-end context for processing the function
4557 FNDECL. The argument might be NULL to indicate processing at top
4558 level, outside of any function scope. */
4559 static void
4560 ix86_set_current_function (tree fndecl)
4561 {
4562 /* Only change the context if the function changes. This hook is called
4563 several times in the course of compiling a function, and we don't want to
4564 slow things down too much or call target_reinit when it isn't safe. */
4565 if (fndecl && fndecl != ix86_previous_fndecl)
4566 {
4567 tree old_tree = (ix86_previous_fndecl
4568 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4569 : NULL_TREE);
4570
4571 tree new_tree = (fndecl
4572 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4573 : NULL_TREE);
4574
4575 ix86_previous_fndecl = fndecl;
4576 if (old_tree == new_tree)
4577 ;
4578
4579 else if (new_tree)
4580 {
4581 cl_target_option_restore (&global_options,
4582 TREE_TARGET_OPTION (new_tree));
4583 target_reinit ();
4584 }
4585
4586 else if (old_tree)
4587 {
4588 struct cl_target_option *def
4589 = TREE_TARGET_OPTION (target_option_current_node);
4590
4591 cl_target_option_restore (&global_options, def);
4592 target_reinit ();
4593 }
4594 }
4595 }
4596
4597 \f
4598 /* Return true if this goes in large data/bss. */
4599
4600 static bool
4601 ix86_in_large_data_p (tree exp)
4602 {
4603 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4604 return false;
4605
4606 /* Functions are never large data. */
4607 if (TREE_CODE (exp) == FUNCTION_DECL)
4608 return false;
4609
4610 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4611 {
4612 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4613 if (strcmp (section, ".ldata") == 0
4614 || strcmp (section, ".lbss") == 0)
4615 return true;
4616 return false;
4617 }
4618 else
4619 {
4620 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4621
4622 /* If this is an incomplete type with size 0, then we can't put it
4623 in data because it might be too big when completed. */
4624 if (!size || size > ix86_section_threshold)
4625 return true;
4626 }
4627
4628 return false;
4629 }
4630
4631 /* Switch to the appropriate section for output of DECL.
4632 DECL is either a `VAR_DECL' node or a constant of some sort.
4633 RELOC indicates whether forming the initial value of DECL requires
4634 link-time relocations. */
4635
4636 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4637 ATTRIBUTE_UNUSED;
4638
4639 static section *
4640 x86_64_elf_select_section (tree decl, int reloc,
4641 unsigned HOST_WIDE_INT align)
4642 {
4643 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4644 && ix86_in_large_data_p (decl))
4645 {
4646 const char *sname = NULL;
4647 unsigned int flags = SECTION_WRITE;
4648 switch (categorize_decl_for_section (decl, reloc))
4649 {
4650 case SECCAT_DATA:
4651 sname = ".ldata";
4652 break;
4653 case SECCAT_DATA_REL:
4654 sname = ".ldata.rel";
4655 break;
4656 case SECCAT_DATA_REL_LOCAL:
4657 sname = ".ldata.rel.local";
4658 break;
4659 case SECCAT_DATA_REL_RO:
4660 sname = ".ldata.rel.ro";
4661 break;
4662 case SECCAT_DATA_REL_RO_LOCAL:
4663 sname = ".ldata.rel.ro.local";
4664 break;
4665 case SECCAT_BSS:
4666 sname = ".lbss";
4667 flags |= SECTION_BSS;
4668 break;
4669 case SECCAT_RODATA:
4670 case SECCAT_RODATA_MERGE_STR:
4671 case SECCAT_RODATA_MERGE_STR_INIT:
4672 case SECCAT_RODATA_MERGE_CONST:
4673 sname = ".lrodata";
4674 flags = 0;
4675 break;
4676 case SECCAT_SRODATA:
4677 case SECCAT_SDATA:
4678 case SECCAT_SBSS:
4679 gcc_unreachable ();
4680 case SECCAT_TEXT:
4681 case SECCAT_TDATA:
4682 case SECCAT_TBSS:
4683 /* We don't split these for the medium model. Place them into
4684 default sections and hope for the best. */
4685 break;
4686 }
4687 if (sname)
4688 {
4689 /* We might get called with string constants, but get_named_section
4690 doesn't like them as they are not DECLs. Also, we need to set
4691 flags in that case. */
4692 if (!DECL_P (decl))
4693 return get_section (sname, flags, NULL);
4694 return get_named_section (decl, sname, reloc);
4695 }
4696 }
4697 return default_elf_select_section (decl, reloc, align);
4698 }
4699
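/* Illustrative example (not part of GCC): with -mcmodel=medium, a static
   definition larger than ix86_section_threshold (-mlarge-data-threshold,
   typically 65536 bytes by default), for instance

     static char big_buffer[1 << 20];    (1 MiB, uninitialized)

   is classified as SECCAT_BSS and placed in .lbss by the hook above, while
   objects at or below the threshold keep going through
   default_elf_select_section into the ordinary .bss/.data sections.  */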
4700 /* Build up a unique section name, expressed as a
4701 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4702 RELOC indicates whether the initial value of EXP requires
4703 link-time relocations. */
4704
4705 static void ATTRIBUTE_UNUSED
4706 x86_64_elf_unique_section (tree decl, int reloc)
4707 {
4708 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4709 && ix86_in_large_data_p (decl))
4710 {
4711 const char *prefix = NULL;
4712 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4713 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4714
4715 switch (categorize_decl_for_section (decl, reloc))
4716 {
4717 case SECCAT_DATA:
4718 case SECCAT_DATA_REL:
4719 case SECCAT_DATA_REL_LOCAL:
4720 case SECCAT_DATA_REL_RO:
4721 case SECCAT_DATA_REL_RO_LOCAL:
4722 prefix = one_only ? ".ld" : ".ldata";
4723 break;
4724 case SECCAT_BSS:
4725 prefix = one_only ? ".lb" : ".lbss";
4726 break;
4727 case SECCAT_RODATA:
4728 case SECCAT_RODATA_MERGE_STR:
4729 case SECCAT_RODATA_MERGE_STR_INIT:
4730 case SECCAT_RODATA_MERGE_CONST:
4731 prefix = one_only ? ".lr" : ".lrodata";
4732 break;
4733 case SECCAT_SRODATA:
4734 case SECCAT_SDATA:
4735 case SECCAT_SBSS:
4736 gcc_unreachable ();
4737 case SECCAT_TEXT:
4738 case SECCAT_TDATA:
4739 case SECCAT_TBSS:
4740 /* We don't split these for the medium model. Place them into
4741 default sections and hope for the best. */
4742 break;
4743 }
4744 if (prefix)
4745 {
4746 const char *name, *linkonce;
4747 char *string;
4748
4749 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4750 name = targetm.strip_name_encoding (name);
4751
4752 /* If we're using one_only, then there needs to be a .gnu.linkonce
4753 prefix to the section name. */
4754 linkonce = one_only ? ".gnu.linkonce" : "";
4755
4756 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4757
4758 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4759 return;
4760 }
4761 }
4762 default_unique_section (decl, reloc);
4763 }
4764
4765 #ifdef COMMON_ASM_OP
4766 /* This says how to output assembler code to declare an
4767 uninitialized external linkage data object.
4768
4769 For medium model x86-64 we need to use the .largecomm directive for
4770 large objects. */
4771 void
4772 x86_elf_aligned_common (FILE *file,
4773 const char *name, unsigned HOST_WIDE_INT size,
4774 int align)
4775 {
4776 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4777 && size > (unsigned int)ix86_section_threshold)
4778 fputs (".largecomm\t", file);
4779 else
4780 fputs (COMMON_ASM_OP, file);
4781 assemble_name (file, name);
4782 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4783 size, align / BITS_PER_UNIT);
4784 }
4785 #endif
4786
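/* Illustrative example: for a 128 KiB common symbol emitted under
   -mcmodel=medium the routine above produces, following its format string,

     .largecomm  buf,131072,32

   (name, size in bytes, alignment in bytes, here assuming a 256-bit
   alignment request), whereas objects at or below the threshold are emitted
   with the ordinary COMMON_ASM_OP (typically ".comm").  */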
4787 /* Utility function for targets to use in implementing
4788 ASM_OUTPUT_ALIGNED_BSS. */
4789
4790 void
4791 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4792 const char *name, unsigned HOST_WIDE_INT size,
4793 int align)
4794 {
4795 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4796 && size > (unsigned int)ix86_section_threshold)
4797 switch_to_section (get_named_section (decl, ".lbss", 0));
4798 else
4799 switch_to_section (bss_section);
4800 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4801 #ifdef ASM_DECLARE_OBJECT_NAME
4802 last_assemble_variable_decl = decl;
4803 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4804 #else
4805 /* The standard thing is just to output a label for the object. */
4806 ASM_OUTPUT_LABEL (file, name);
4807 #endif /* ASM_DECLARE_OBJECT_NAME */
4808 ASM_OUTPUT_SKIP (file, size ? size : 1);
4809 }
4810 \f
4811 /* Decide whether we must probe the stack before any space allocation
4812 on this target. It's essentially TARGET_STACK_PROBE except when
4813 -fstack-check causes the stack to be already probed differently. */
4814
4815 bool
4816 ix86_target_stack_probe (void)
4817 {
4818 /* Do not probe the stack twice if static stack checking is enabled. */
4819 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4820 return false;
4821
4822 return TARGET_STACK_PROBE;
4823 }
4824 \f
4825 /* Decide whether we can make a sibling call to a function. DECL is the
4826 declaration of the function being targeted by the call and EXP is the
4827 CALL_EXPR representing the call. */
4828
4829 static bool
4830 ix86_function_ok_for_sibcall (tree decl, tree exp)
4831 {
4832 tree type, decl_or_type;
4833 rtx a, b;
4834
4835 /* If we are generating position-independent code, we cannot sibcall
4836 optimize any indirect call, or a direct call to a global function,
4837 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4838 if (!TARGET_MACHO
4839 && !TARGET_64BIT
4840 && flag_pic
4841 && (!decl || !targetm.binds_local_p (decl)))
4842 return false;
4843
4844 /* If we need to align the outgoing stack, then sibcalling would
4845 unalign the stack, which may break the called function. */
4846 if (ix86_minimum_incoming_stack_boundary (true)
4847 < PREFERRED_STACK_BOUNDARY)
4848 return false;
4849
4850 if (decl)
4851 {
4852 decl_or_type = decl;
4853 type = TREE_TYPE (decl);
4854 }
4855 else
4856 {
4857 /* We're looking at the CALL_EXPR; we need the type of the function. */
4858 type = CALL_EXPR_FN (exp); /* pointer expression */
4859 type = TREE_TYPE (type); /* pointer type */
4860 type = TREE_TYPE (type); /* function type */
4861 decl_or_type = type;
4862 }
4863
4864 /* Check that the return value locations are the same. For example,
4865 if we are returning floats on the 80387 register stack, we cannot
4866 make a sibcall from a function that doesn't return a float to a
4867 function that does or, conversely, from a function that does return
4868 a float to a function that doesn't; the necessary stack adjustment
4869 would not be executed. This is also the place we notice
4870 differences in the return value ABI. Note that it is ok for one
4871 of the functions to have void return type as long as the return
4872 value of the other is passed in a register. */
4873 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4874 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4875 cfun->decl, false);
4876 if (STACK_REG_P (a) || STACK_REG_P (b))
4877 {
4878 if (!rtx_equal_p (a, b))
4879 return false;
4880 }
4881 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4882 {
4883 /* Disable sibcall if we need to generate vzeroupper after
4884 callee returns. */
4885 if (TARGET_VZEROUPPER
4886 && cfun->machine->callee_return_avx256_p
4887 && !cfun->machine->caller_return_avx256_p)
4888 return false;
4889 }
4890 else if (!rtx_equal_p (a, b))
4891 return false;
4892
4893 if (TARGET_64BIT)
4894 {
4895 /* The SYSV ABI has more call-clobbered registers;
4896 disallow sibcalls from MS to SYSV. */
4897 if (cfun->machine->call_abi == MS_ABI
4898 && ix86_function_type_abi (type) == SYSV_ABI)
4899 return false;
4900 }
4901 else
4902 {
4903 /* If this call is indirect, we'll need to be able to use a
4904 call-clobbered register for the address of the target function.
4905 Make sure that all such registers are not used for passing
4906 parameters. Note that DLLIMPORT functions are indirect. */
4907 if (!decl
4908 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4909 {
4910 if (ix86_function_regparm (type, NULL) >= 3)
4911 {
4912 /* ??? Need to count the actual number of registers to be used,
4913 not the possible number of registers. Fix later. */
4914 return false;
4915 }
4916 }
4917 }
4918
4919 /* Otherwise okay. That also includes certain types of indirect calls. */
4920 return true;
4921 }
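/* For example, in 32-bit PIC code a tail call such as

       extern int bar (int);
       int foo (int x) { return bar (x); }

   is not sibcall-optimized, since bar may be resolved through the PLT
   and %ebx has to stay live across the call; likewise an ms_abi caller
   never sibcalls a sysv_abi callee, because the SYSV ABI clobbers
   registers that the MS ABI treats as call-saved.  */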
4922
4923 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4924 and "sseregparm" calling convention attributes;
4925 arguments as in struct attribute_spec.handler. */
4926
4927 static tree
4928 ix86_handle_cconv_attribute (tree *node, tree name,
4929 tree args,
4930 int flags ATTRIBUTE_UNUSED,
4931 bool *no_add_attrs)
4932 {
4933 if (TREE_CODE (*node) != FUNCTION_TYPE
4934 && TREE_CODE (*node) != METHOD_TYPE
4935 && TREE_CODE (*node) != FIELD_DECL
4936 && TREE_CODE (*node) != TYPE_DECL)
4937 {
4938 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4939 name);
4940 *no_add_attrs = true;
4941 return NULL_TREE;
4942 }
4943
4944 /* Can combine regparm with all attributes but fastcall and thiscall. */
4945 if (is_attribute_p ("regparm", name))
4946 {
4947 tree cst;
4948
4949 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4950 {
4951 error ("fastcall and regparm attributes are not compatible");
4952 }
4953
4954 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4955 {
4956 error ("regparm and thiscall attributes are not compatible");
4957 }
4958
4959 cst = TREE_VALUE (args);
4960 if (TREE_CODE (cst) != INTEGER_CST)
4961 {
4962 warning (OPT_Wattributes,
4963 "%qE attribute requires an integer constant argument",
4964 name);
4965 *no_add_attrs = true;
4966 }
4967 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4968 {
4969 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4970 name, REGPARM_MAX);
4971 *no_add_attrs = true;
4972 }
4973
4974 return NULL_TREE;
4975 }
4976
4977 if (TARGET_64BIT)
4978 {
4979 /* Do not warn when emulating the MS ABI. */
4980 if ((TREE_CODE (*node) != FUNCTION_TYPE
4981 && TREE_CODE (*node) != METHOD_TYPE)
4982 || ix86_function_type_abi (*node) != MS_ABI)
4983 warning (OPT_Wattributes, "%qE attribute ignored",
4984 name);
4985 *no_add_attrs = true;
4986 return NULL_TREE;
4987 }
4988
4989 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4990 if (is_attribute_p ("fastcall", name))
4991 {
4992 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4993 {
4994 error ("fastcall and cdecl attributes are not compatible");
4995 }
4996 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4997 {
4998 error ("fastcall and stdcall attributes are not compatible");
4999 }
5000 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5001 {
5002 error ("fastcall and regparm attributes are not compatible");
5003 }
5004 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5005 {
5006 error ("fastcall and thiscall attributes are not compatible");
5007 }
5008 }
5009
5010 /* Can combine stdcall with fastcall (redundant), regparm and
5011 sseregparm. */
5012 else if (is_attribute_p ("stdcall", name))
5013 {
5014 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5015 {
5016 error ("stdcall and cdecl attributes are not compatible");
5017 }
5018 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5019 {
5020 error ("stdcall and fastcall attributes are not compatible");
5021 }
5022 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5023 {
5024 error ("stdcall and thiscall attributes are not compatible");
5025 }
5026 }
5027
5028 /* Can combine cdecl with regparm and sseregparm. */
5029 else if (is_attribute_p ("cdecl", name))
5030 {
5031 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5032 {
5033 error ("stdcall and cdecl attributes are not compatible");
5034 }
5035 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5036 {
5037 error ("fastcall and cdecl attributes are not compatible");
5038 }
5039 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5040 {
5041 error ("cdecl and thiscall attributes are not compatible");
5042 }
5043 }
5044 else if (is_attribute_p ("thiscall", name))
5045 {
5046 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5047 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5048 name);
5049 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5050 {
5051 error ("stdcall and thiscall attributes are not compatible");
5052 }
5053 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5054 {
5055 error ("fastcall and thiscall attributes are not compatible");
5056 }
5057 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5058 {
5059 error ("cdecl and thiscall attributes are not compatible");
5060 }
5061 }
5062
5063 /* Can combine sseregparm with all attributes. */
5064
5065 return NULL_TREE;
5066 }
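/* For example, this handler rejects conflicting conventions such as

       void f (int) __attribute__ ((fastcall, cdecl));

   with an error, warns about an out-of-range count such as

       void g (int) __attribute__ ((regparm (4)));

   when REGPARM_MAX is 3, and on 64-bit targets (outside the MS ABI)
   simply ignores these attributes with a -Wattributes warning.  */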
5067
5068 /* The transactional memory builtins are implicitly regparm or fastcall
5069 depending on the ABI. Override the generic do-nothing attribute that
5070 these builtins were declared with, and replace it with one of the two
5071 attributes that we expect elsewhere. */
5072
5073 static tree
5074 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5075 tree args ATTRIBUTE_UNUSED,
5076 int flags ATTRIBUTE_UNUSED,
5077 bool *no_add_attrs)
5078 {
5079 tree alt;
5080
5081 /* In no case do we want to add the placeholder attribute. */
5082 *no_add_attrs = true;
5083
5084 /* The 64-bit ABI is unchanged for transactional memory. */
5085 if (TARGET_64BIT)
5086 return NULL_TREE;
5087
5088 /* ??? Is there a better way to validate 32-bit Windows? We have
5089 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5090 if (CHECK_STACK_LIMIT > 0)
5091 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5092 else
5093 {
5094 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5095 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5096 }
5097 decl_attributes (node, alt, flags);
5098
5099 return NULL_TREE;
5100 }
5101
5102 /* This function determines the calling convention from TYPE. */
5103
5104 unsigned int
5105 ix86_get_callcvt (const_tree type)
5106 {
5107 unsigned int ret = 0;
5108 bool is_stdarg;
5109 tree attrs;
5110
5111 if (TARGET_64BIT)
5112 return IX86_CALLCVT_CDECL;
5113
5114 attrs = TYPE_ATTRIBUTES (type);
5115 if (attrs != NULL_TREE)
5116 {
5117 if (lookup_attribute ("cdecl", attrs))
5118 ret |= IX86_CALLCVT_CDECL;
5119 else if (lookup_attribute ("stdcall", attrs))
5120 ret |= IX86_CALLCVT_STDCALL;
5121 else if (lookup_attribute ("fastcall", attrs))
5122 ret |= IX86_CALLCVT_FASTCALL;
5123 else if (lookup_attribute ("thiscall", attrs))
5124 ret |= IX86_CALLCVT_THISCALL;
5125
5126 /* Regparm isn't allowed for thiscall and fastcall. */
5127 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5128 {
5129 if (lookup_attribute ("regparm", attrs))
5130 ret |= IX86_CALLCVT_REGPARM;
5131 if (lookup_attribute ("sseregparm", attrs))
5132 ret |= IX86_CALLCVT_SSEREGPARM;
5133 }
5134
5135 if (IX86_BASE_CALLCVT(ret) != 0)
5136 return ret;
5137 }
5138
5139 is_stdarg = stdarg_p (type);
5140 if (TARGET_RTD && !is_stdarg)
5141 return IX86_CALLCVT_STDCALL | ret;
5142
5143 if (ret != 0
5144 || is_stdarg
5145 || TREE_CODE (type) != METHOD_TYPE
5146 || ix86_function_type_abi (type) != MS_ABI)
5147 return IX86_CALLCVT_CDECL | ret;
5148
5149 return IX86_CALLCVT_THISCALL;
5150 }
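/* For example, on a 32-bit target:

       void f (int);                                picks IX86_CALLCVT_CDECL
       void __attribute__ ((stdcall)) g (int);      picks IX86_CALLCVT_STDCALL
       void __attribute__ ((regparm (2))) h (int);  picks CDECL | REGPARM

   and with -mrtd every non-stdarg function that lacks an explicit base
   convention is treated as IX86_CALLCVT_STDCALL instead.  */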
5151
5152 /* Return 0 if the attributes for two types are incompatible, 1 if they
5153 are compatible, and 2 if they are nearly compatible (which causes a
5154 warning to be generated). */
5155
5156 static int
5157 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5158 {
5159 unsigned int ccvt1, ccvt2;
5160
5161 if (TREE_CODE (type1) != FUNCTION_TYPE
5162 && TREE_CODE (type1) != METHOD_TYPE)
5163 return 1;
5164
5165 ccvt1 = ix86_get_callcvt (type1);
5166 ccvt2 = ix86_get_callcvt (type2);
5167 if (ccvt1 != ccvt2)
5168 return 0;
5169 if (ix86_function_regparm (type1, NULL)
5170 != ix86_function_regparm (type2, NULL))
5171 return 0;
5172
5173 return 1;
5174 }
5175 \f
5176 /* Return the regparm value for a function with the indicated TYPE and DECL.
5177 DECL may be NULL when calling function indirectly
5178 or considering a libcall. */
5179
5180 static int
5181 ix86_function_regparm (const_tree type, const_tree decl)
5182 {
5183 tree attr;
5184 int regparm;
5185 unsigned int ccvt;
5186
5187 if (TARGET_64BIT)
5188 return (ix86_function_type_abi (type) == SYSV_ABI
5189 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5190 ccvt = ix86_get_callcvt (type);
5191 regparm = ix86_regparm;
5192
5193 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5194 {
5195 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5196 if (attr)
5197 {
5198 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5199 return regparm;
5200 }
5201 }
5202 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5203 return 2;
5204 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5205 return 1;
5206
5207 /* Use register calling convention for local functions when possible. */
5208 if (decl
5209 && TREE_CODE (decl) == FUNCTION_DECL
5210 && optimize
5211 && !(profile_flag && !flag_fentry))
5212 {
5213 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5214 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5215 if (i && i->local && i->can_change_signature)
5216 {
5217 int local_regparm, globals = 0, regno;
5218
5219 /* Make sure no regparm register is taken by a
5220 fixed register variable. */
5221 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5222 if (fixed_regs[local_regparm])
5223 break;
5224
5225 /* We don't want to use regparm(3) for nested functions as
5226 these use a static chain pointer in the third argument. */
5227 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5228 local_regparm = 2;
5229
5230 /* In 32-bit mode save a register for the split stack. */
5231 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5232 local_regparm = 2;
5233
5234 /* Each fixed register usage increases register pressure,
5235 so fewer registers should be used for argument passing.
5236 This functionality can be overridden by an explicit
5237 regparm value. */
5238 for (regno = 0; regno <= DI_REG; regno++)
5239 if (fixed_regs[regno])
5240 globals++;
5241
5242 local_regparm
5243 = globals < local_regparm ? local_regparm - globals : 0;
5244
5245 if (local_regparm > regparm)
5246 regparm = local_regparm;
5247 }
5248 }
5249
5250 return regparm;
5251 }
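/* For example, a 32-bit local function whose callers are all known,

       static int sum3 (int a, int b, int c) { return a + b + c; }

   can be promoted here to pass its arguments in EAX, EDX and ECX (the
   implicit regparm(3)) even without a regparm attribute, as long as no
   fixed register variable occupies one of those registers and the
   function needs no static chain.  */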
5252
5253 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5254 DFmode (2) arguments in SSE registers for a function with the
5255 indicated TYPE and DECL. DECL may be NULL when calling function
5256 indirectly or considering a libcall. Otherwise return 0. */
5257
5258 static int
5259 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5260 {
5261 gcc_assert (!TARGET_64BIT);
5262
5263 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5264 by the sseregparm attribute. */
5265 if (TARGET_SSEREGPARM
5266 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5267 {
5268 if (!TARGET_SSE)
5269 {
5270 if (warn)
5271 {
5272 if (decl)
5273 error ("calling %qD with attribute sseregparm without "
5274 "SSE/SSE2 enabled", decl);
5275 else
5276 error ("calling %qT with attribute sseregparm without "
5277 "SSE/SSE2 enabled", type);
5278 }
5279 return 0;
5280 }
5281
5282 return 2;
5283 }
5284
5285 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5286 (and DFmode for SSE2) arguments in SSE registers. */
5287 if (decl && TARGET_SSE_MATH && optimize
5288 && !(profile_flag && !flag_fentry))
5289 {
5290 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5291 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5292 if (i && i->local && i->can_change_signature)
5293 return TARGET_SSE2 ? 2 : 1;
5294 }
5295
5296 return 0;
5297 }
5298
5299 /* Return true if EAX is live at the start of the function. Used by
5300 ix86_expand_prologue to determine if we need special help before
5301 calling allocate_stack_worker. */
5302
5303 static bool
5304 ix86_eax_live_at_start_p (void)
5305 {
5306 /* Cheat. Don't bother working forward from ix86_function_regparm
5307 to the function type to whether an actual argument is located in
5308 eax. Instead just look at cfg info, which is still close enough
5309 to correct at this point. This gives false positives for broken
5310 functions that might use uninitialized data that happens to be
5311 allocated in eax, but who cares? */
5312 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5313 }
5314
5315 static bool
5316 ix86_keep_aggregate_return_pointer (tree fntype)
5317 {
5318 tree attr;
5319
5320 if (!TARGET_64BIT)
5321 {
5322 attr = lookup_attribute ("callee_pop_aggregate_return",
5323 TYPE_ATTRIBUTES (fntype));
5324 if (attr)
5325 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5326
5327 /* For the 32-bit MS ABI the default is to keep the aggregate
5328 return pointer. */
5329 if (ix86_function_type_abi (fntype) == MS_ABI)
5330 return true;
5331 }
5332 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5333 }
5334
5335 /* Value is the number of bytes of arguments automatically
5336 popped when returning from a subroutine call.
5337 FUNDECL is the declaration node of the function (as a tree),
5338 FUNTYPE is the data type of the function (as a tree),
5339 or for a library call it is an identifier node for the subroutine name.
5340 SIZE is the number of bytes of arguments passed on the stack.
5341
5342 On the 80386, the RTD insn may be used to pop them if the number
5343 of args is fixed, but if the number is variable then the caller
5344 must pop them all. RTD can't be used for library calls now
5345 because the library is compiled with the Unix compiler.
5346 Use of RTD is a selectable option, since it is incompatible with
5347 standard Unix calling sequences. If the option is not selected,
5348 the caller must always pop the args.
5349
5350 The attribute stdcall is equivalent to RTD on a per module basis. */
5351
5352 static int
5353 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5354 {
5355 unsigned int ccvt;
5356
5357 /* None of the 64-bit ABIs pop arguments. */
5358 if (TARGET_64BIT)
5359 return 0;
5360
5361 ccvt = ix86_get_callcvt (funtype);
5362
5363 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5364 | IX86_CALLCVT_THISCALL)) != 0
5365 && ! stdarg_p (funtype))
5366 return size;
5367
5368 /* Lose any fake structure return argument if it is passed on the stack. */
5369 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5370 && !ix86_keep_aggregate_return_pointer (funtype))
5371 {
5372 int nregs = ix86_function_regparm (funtype, fundecl);
5373 if (nregs == 0)
5374 return GET_MODE_SIZE (Pmode);
5375 }
5376
5377 return 0;
5378 }
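/* For example, on a 32-bit target

       int __attribute__ ((stdcall)) f (int a, int b);

   receives 8 bytes of arguments on the stack and returns with "ret $8",
   so this hook reports 8; a plain cdecl function returns with "ret" and
   its caller pops the arguments instead.  */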
5379 \f
5380 /* Argument support functions. */
5381
5382 /* Return true when register may be used to pass function parameters. */
5383 bool
5384 ix86_function_arg_regno_p (int regno)
5385 {
5386 int i;
5387 const int *parm_regs;
5388
5389 if (!TARGET_64BIT)
5390 {
5391 if (TARGET_MACHO)
5392 return (regno < REGPARM_MAX
5393 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5394 else
5395 return (regno < REGPARM_MAX
5396 || (TARGET_MMX && MMX_REGNO_P (regno)
5397 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5398 || (TARGET_SSE && SSE_REGNO_P (regno)
5399 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5400 }
5401
5402 if (TARGET_MACHO)
5403 {
5404 if (SSE_REGNO_P (regno) && TARGET_SSE)
5405 return true;
5406 }
5407 else
5408 {
5409 if (TARGET_SSE && SSE_REGNO_P (regno)
5410 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5411 return true;
5412 }
5413
5414 /* TODO: The function should depend on current function ABI but
5415 builtins.c would need updating then. Therefore we use the
5416 default ABI. */
5417
5418 /* RAX is used as hidden argument to va_arg functions. */
5419 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5420 return true;
5421
5422 if (ix86_abi == MS_ABI)
5423 parm_regs = x86_64_ms_abi_int_parameter_registers;
5424 else
5425 parm_regs = x86_64_int_parameter_registers;
5426 for (i = 0; i < (ix86_abi == MS_ABI
5427 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5428 if (regno == parm_regs[i])
5429 return true;
5430 return false;
5431 }
5432
5433 /* Return true if we do not know how to pass TYPE solely in registers. */
5434
5435 static bool
5436 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5437 {
5438 if (must_pass_in_stack_var_size_or_pad (mode, type))
5439 return true;
5440
5441 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5442 The layout_type routine is crafty and tries to trick us into passing
5443 currently unsupported vector types on the stack by using TImode. */
5444 return (!TARGET_64BIT && mode == TImode
5445 && type && TREE_CODE (type) != VECTOR_TYPE);
5446 }
5447
5448 /* Return the size, in bytes, of the area reserved for arguments passed
5449 in registers for the function represented by FNDECL, which depends on
5450 the ABI in use. */
5451 int
5452 ix86_reg_parm_stack_space (const_tree fndecl)
5453 {
5454 enum calling_abi call_abi = SYSV_ABI;
5455 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5456 call_abi = ix86_function_abi (fndecl);
5457 else
5458 call_abi = ix86_function_type_abi (fndecl);
5459 if (TARGET_64BIT && call_abi == MS_ABI)
5460 return 32;
5461 return 0;
5462 }
5463
5464 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5465 call ABI used. */
5466 enum calling_abi
5467 ix86_function_type_abi (const_tree fntype)
5468 {
5469 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5470 {
5471 enum calling_abi abi = ix86_abi;
5472 if (abi == SYSV_ABI)
5473 {
5474 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5475 abi = MS_ABI;
5476 }
5477 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5478 abi = SYSV_ABI;
5479 return abi;
5480 }
5481 return ix86_abi;
5482 }
5483
5484 static bool
5485 ix86_function_ms_hook_prologue (const_tree fn)
5486 {
5487 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5488 {
5489 if (decl_function_context (fn) != NULL_TREE)
5490 error_at (DECL_SOURCE_LOCATION (fn),
5491 "ms_hook_prologue is not compatible with nested function");
5492 else
5493 return true;
5494 }
5495 return false;
5496 }
5497
5498 static enum calling_abi
5499 ix86_function_abi (const_tree fndecl)
5500 {
5501 if (! fndecl)
5502 return ix86_abi;
5503 return ix86_function_type_abi (TREE_TYPE (fndecl));
5504 }
5505
5506 /* Return SYSV_ABI or MS_ABI, depending on CFUN, specifying the
5507 call ABI used. */
5508 enum calling_abi
5509 ix86_cfun_abi (void)
5510 {
5511 if (! cfun)
5512 return ix86_abi;
5513 return cfun->machine->call_abi;
5514 }
5515
5516 /* Write the extra assembler code needed to declare a function properly. */
5517
5518 void
5519 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5520 tree decl)
5521 {
5522 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5523
5524 if (is_ms_hook)
5525 {
5526 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5527 unsigned int filler_cc = 0xcccccccc;
5528
5529 for (i = 0; i < filler_count; i += 4)
5530 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5531 }
5532
5533 #ifdef SUBTARGET_ASM_UNWIND_INIT
5534 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5535 #endif
5536
5537 ASM_OUTPUT_LABEL (asm_out_file, fname);
5538
5539 /* Output magic byte marker, if hot-patch attribute is set. */
5540 if (is_ms_hook)
5541 {
5542 if (TARGET_64BIT)
5543 {
5544 /* leaq [%rsp + 0], %rsp */
5545 asm_fprintf (asm_out_file, ASM_BYTE
5546 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5547 }
5548 else
5549 {
5550 /* movl.s %edi, %edi
5551 push %ebp
5552 movl.s %esp, %ebp */
5553 asm_fprintf (asm_out_file, ASM_BYTE
5554 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5555 }
5556 }
5557 }
5558
5559 /* regclass.c */
5560 extern void init_regs (void);
5561
5562 /* Implementation of the call ABI switching target hook. The call
5563 register sets specific to FNDECL are set up here. See also
5564 ix86_conditional_register_usage for more details. */
5565 void
5566 ix86_call_abi_override (const_tree fndecl)
5567 {
5568 if (fndecl == NULL_TREE)
5569 cfun->machine->call_abi = ix86_abi;
5570 else
5571 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5572 }
5573
5574 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5575 Avoid expensive re-initialization of init_regs each time we switch
5576 function context, since this is needed only during RTL expansion. */
5577 static void
5578 ix86_maybe_switch_abi (void)
5579 {
5580 if (TARGET_64BIT
5581 && call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5582 reinit_regs ();
5583 }
5584
5585 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5586 for a call to a function whose data type is FNTYPE.
5587 For a library call, FNTYPE is 0. */
5588
5589 void
5590 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5591 tree fntype, /* tree ptr for function decl */
5592 rtx libname, /* SYMBOL_REF of library name or 0 */
5593 tree fndecl,
5594 int caller)
5595 {
5596 struct cgraph_local_info *i;
5597 tree fnret_type;
5598
5599 memset (cum, 0, sizeof (*cum));
5600
5601 /* Initialize for the current callee. */
5602 if (caller)
5603 {
5604 cfun->machine->callee_pass_avx256_p = false;
5605 cfun->machine->callee_return_avx256_p = false;
5606 }
5607
5608 if (fndecl)
5609 {
5610 i = cgraph_local_info (fndecl);
5611 cum->call_abi = ix86_function_abi (fndecl);
5612 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5613 }
5614 else
5615 {
5616 i = NULL;
5617 cum->call_abi = ix86_function_type_abi (fntype);
5618 if (fntype)
5619 fnret_type = TREE_TYPE (fntype);
5620 else
5621 fnret_type = NULL;
5622 }
5623
5624 if (TARGET_VZEROUPPER && fnret_type)
5625 {
5626 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5627 false);
5628 if (function_pass_avx256_p (fnret_value))
5629 {
5630 /* The return value of this function uses 256bit AVX modes. */
5631 if (caller)
5632 cfun->machine->callee_return_avx256_p = true;
5633 else
5634 cfun->machine->caller_return_avx256_p = true;
5635 }
5636 }
5637
5638 cum->caller = caller;
5639
5640 /* Set up the number of registers to use for passing arguments. */
5641
5642 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5643 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5644 "or subtarget optimization implying it");
5645 cum->nregs = ix86_regparm;
5646 if (TARGET_64BIT)
5647 {
5648 cum->nregs = (cum->call_abi == SYSV_ABI
5649 ? X86_64_REGPARM_MAX
5650 : X86_64_MS_REGPARM_MAX);
5651 }
5652 if (TARGET_SSE)
5653 {
5654 cum->sse_nregs = SSE_REGPARM_MAX;
5655 if (TARGET_64BIT)
5656 {
5657 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5658 ? X86_64_SSE_REGPARM_MAX
5659 : X86_64_MS_SSE_REGPARM_MAX);
5660 }
5661 }
5662 if (TARGET_MMX)
5663 cum->mmx_nregs = MMX_REGPARM_MAX;
5664 cum->warn_avx = true;
5665 cum->warn_sse = true;
5666 cum->warn_mmx = true;
5667
5668 /* Because the type might mismatch between caller and callee, we need to
5669 use the actual type of the function for local calls.
5670 FIXME: cgraph_analyze can be told to actually record if a function uses
5671 va_start, so for local functions maybe_vaarg can be made aggressive,
5672 helping K&R code.
5673 FIXME: once the type system is fixed, we won't need this code anymore. */
5674 if (i && i->local && i->can_change_signature)
5675 fntype = TREE_TYPE (fndecl);
5676 cum->maybe_vaarg = (fntype
5677 ? (!prototype_p (fntype) || stdarg_p (fntype))
5678 : !libname);
5679
5680 if (!TARGET_64BIT)
5681 {
5682 /* If there are variable arguments, then we won't pass anything
5683 in registers in 32-bit mode. */
5684 if (stdarg_p (fntype))
5685 {
5686 cum->nregs = 0;
5687 cum->sse_nregs = 0;
5688 cum->mmx_nregs = 0;
5689 cum->warn_avx = 0;
5690 cum->warn_sse = 0;
5691 cum->warn_mmx = 0;
5692 return;
5693 }
5694
5695 /* Use ecx and edx registers if function has fastcall attribute,
5696 else look for regparm information. */
5697 if (fntype)
5698 {
5699 unsigned int ccvt = ix86_get_callcvt (fntype);
5700 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5701 {
5702 cum->nregs = 1;
5703 cum->fastcall = 1; /* Same first register as in fastcall. */
5704 }
5705 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5706 {
5707 cum->nregs = 2;
5708 cum->fastcall = 1;
5709 }
5710 else
5711 cum->nregs = ix86_function_regparm (fntype, fndecl);
5712 }
5713
5714 /* Set up the number of SSE registers used for passing SFmode
5715 and DFmode arguments. Warn for mismatching ABI. */
5716 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5717 }
5718 }
5719
5720 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5721 But in the case of vector types, it is some vector mode.
5722
5723 When we have only some of our vector isa extensions enabled, then there
5724 are some modes for which vector_mode_supported_p is false. For these
5725 modes, the generic vector support in gcc will choose some non-vector mode
5726 in order to implement the type. By computing the natural mode, we'll
5727 select the proper ABI location for the operand and not depend on whatever
5728 the middle-end decides to do with these vector types.
5729
5730 The middle-end can't deal with vector types larger than 16 bytes. In
5731 this case, we return the original mode and warn about the ABI change
5732 if CUM isn't NULL. */
5733
5734 static enum machine_mode
5735 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5736 {
5737 enum machine_mode mode = TYPE_MODE (type);
5738
5739 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5740 {
5741 HOST_WIDE_INT size = int_size_in_bytes (type);
5742 if ((size == 8 || size == 16 || size == 32)
5743 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5744 && TYPE_VECTOR_SUBPARTS (type) > 1)
5745 {
5746 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5747
5748 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5749 mode = MIN_MODE_VECTOR_FLOAT;
5750 else
5751 mode = MIN_MODE_VECTOR_INT;
5752
5753 /* Get the mode which has this inner mode and number of units. */
5754 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5755 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5756 && GET_MODE_INNER (mode) == innermode)
5757 {
5758 if (size == 32 && !TARGET_AVX)
5759 {
5760 static bool warnedavx;
5761
5762 if (cum
5763 && !warnedavx
5764 && cum->warn_avx)
5765 {
5766 warnedavx = true;
5767 warning (0, "AVX vector argument without AVX "
5768 "enabled changes the ABI");
5769 }
5770 return TYPE_MODE (type);
5771 }
5772 else
5773 return mode;
5774 }
5775
5776 gcc_unreachable ();
5777 }
5778 }
5779
5780 return mode;
5781 }
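/* For example, a generic vector type such as

       typedef float v4sf __attribute__ ((vector_size (16)));

   gets natural mode V4SFmode here even if the vector ISA is disabled
   and TYPE_MODE fell back to a non-vector mode, whereas a 32-byte
   vector compiled without -mavx keeps its original mode and can
   trigger the ABI-change warning above.  */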
5782
5783 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5784 this may not agree with the mode that the type system has chosen for the
5785 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5786 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5787
5788 static rtx
5789 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5790 unsigned int regno)
5791 {
5792 rtx tmp;
5793
5794 if (orig_mode != BLKmode)
5795 tmp = gen_rtx_REG (orig_mode, regno);
5796 else
5797 {
5798 tmp = gen_rtx_REG (mode, regno);
5799 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5800 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5801 }
5802
5803 return tmp;
5804 }
5805
5806 /* x86-64 register passing implementation. See the x86-64 ABI for details.
5807 The goal of this code is to classify each 8-byte chunk (eightbyte) of an
5808 incoming argument by register class and assign registers accordingly. */
5809
5810 /* Return the union class of CLASS1 and CLASS2.
5811 See the x86-64 PS ABI for details. */
5812
5813 static enum x86_64_reg_class
5814 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5815 {
5816 /* Rule #1: If both classes are equal, this is the resulting class. */
5817 if (class1 == class2)
5818 return class1;
5819
5820 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5821 the other class. */
5822 if (class1 == X86_64_NO_CLASS)
5823 return class2;
5824 if (class2 == X86_64_NO_CLASS)
5825 return class1;
5826
5827 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5828 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5829 return X86_64_MEMORY_CLASS;
5830
5831 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5832 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5833 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5834 return X86_64_INTEGERSI_CLASS;
5835 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5836 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5837 return X86_64_INTEGER_CLASS;
5838
5839 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5840 MEMORY is used. */
5841 if (class1 == X86_64_X87_CLASS
5842 || class1 == X86_64_X87UP_CLASS
5843 || class1 == X86_64_COMPLEX_X87_CLASS
5844 || class2 == X86_64_X87_CLASS
5845 || class2 == X86_64_X87UP_CLASS
5846 || class2 == X86_64_COMPLEX_X87_CLASS)
5847 return X86_64_MEMORY_CLASS;
5848
5849 /* Rule #6: Otherwise class SSE is used. */
5850 return X86_64_SSE_CLASS;
5851 }
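/* For example, for a union such as

       union u { long l; double d; };

   the single eightbyte merges INTEGER with SSE and yields INTEGER
   (rule #4 above), so the union is passed in a general-purpose
   register, while any eightbyte that overlaps a long double member
   merges to MEMORY (rule #5) and forces the whole object onto the
   stack.  */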
5852
5853 /* Classify the argument of type TYPE and mode MODE.
5854 CLASSES will be filled by the register class used to pass each word
5855 of the operand. The number of words is returned. In case the parameter
5856 should be passed in memory, 0 is returned. As a special case for zero
5857 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5858
5859 BIT_OFFSET is used internally for handling records and specifies the
5860 offset, in bits modulo 256, to avoid overflow cases.
5861
5862 See the x86-64 PS ABI for details.
5863 */
5864
5865 static int
5866 classify_argument (enum machine_mode mode, const_tree type,
5867 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5868 {
5869 HOST_WIDE_INT bytes =
5870 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5871 int words
5872 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5873
5874 /* Variable sized entities are always passed/returned in memory. */
5875 if (bytes < 0)
5876 return 0;
5877
5878 if (mode != VOIDmode
5879 && targetm.calls.must_pass_in_stack (mode, type))
5880 return 0;
5881
5882 if (type && AGGREGATE_TYPE_P (type))
5883 {
5884 int i;
5885 tree field;
5886 enum x86_64_reg_class subclasses[MAX_CLASSES];
5887
5888 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5889 if (bytes > 32)
5890 return 0;
5891
5892 for (i = 0; i < words; i++)
5893 classes[i] = X86_64_NO_CLASS;
5894
5895 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
5896 signal the memory class, so handle it as a special case. */
5897 if (!words)
5898 {
5899 classes[0] = X86_64_NO_CLASS;
5900 return 1;
5901 }
5902
5903 /* Classify each field of record and merge classes. */
5904 switch (TREE_CODE (type))
5905 {
5906 case RECORD_TYPE:
5907 /* And now merge the fields of structure. */
5908 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5909 {
5910 if (TREE_CODE (field) == FIELD_DECL)
5911 {
5912 int num;
5913
5914 if (TREE_TYPE (field) == error_mark_node)
5915 continue;
5916
5917 /* Bitfields are always classified as integer. Handle them
5918 early, since later code would consider them to be
5919 misaligned integers. */
5920 if (DECL_BIT_FIELD (field))
5921 {
5922 for (i = (int_bit_position (field)
5923 + (bit_offset % 64)) / 8 / 8;
5924 i < ((int_bit_position (field) + (bit_offset % 64))
5925 + tree_low_cst (DECL_SIZE (field), 0)
5926 + 63) / 8 / 8; i++)
5927 classes[i] =
5928 merge_classes (X86_64_INTEGER_CLASS,
5929 classes[i]);
5930 }
5931 else
5932 {
5933 int pos;
5934
5935 type = TREE_TYPE (field);
5936
5937 /* Flexible array member is ignored. */
5938 if (TYPE_MODE (type) == BLKmode
5939 && TREE_CODE (type) == ARRAY_TYPE
5940 && TYPE_SIZE (type) == NULL_TREE
5941 && TYPE_DOMAIN (type) != NULL_TREE
5942 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5943 == NULL_TREE))
5944 {
5945 static bool warned;
5946
5947 if (!warned && warn_psabi)
5948 {
5949 warned = true;
5950 inform (input_location,
5951 "the ABI of passing struct with"
5952 " a flexible array member has"
5953 " changed in GCC 4.4");
5954 }
5955 continue;
5956 }
5957 num = classify_argument (TYPE_MODE (type), type,
5958 subclasses,
5959 (int_bit_position (field)
5960 + bit_offset) % 256);
5961 if (!num)
5962 return 0;
5963 pos = (int_bit_position (field)
5964 + (bit_offset % 64)) / 8 / 8;
5965 for (i = 0; i < num && (i + pos) < words; i++)
5966 classes[i + pos] =
5967 merge_classes (subclasses[i], classes[i + pos]);
5968 }
5969 }
5970 }
5971 break;
5972
5973 case ARRAY_TYPE:
5974 /* Arrays are handled as small records. */
5975 {
5976 int num;
5977 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5978 TREE_TYPE (type), subclasses, bit_offset);
5979 if (!num)
5980 return 0;
5981
5982 /* The partial classes are now full classes. */
5983 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5984 subclasses[0] = X86_64_SSE_CLASS;
5985 if (subclasses[0] == X86_64_INTEGERSI_CLASS
5986 && !((bit_offset % 64) == 0 && bytes == 4))
5987 subclasses[0] = X86_64_INTEGER_CLASS;
5988
5989 for (i = 0; i < words; i++)
5990 classes[i] = subclasses[i % num];
5991
5992 break;
5993 }
5994 case UNION_TYPE:
5995 case QUAL_UNION_TYPE:
5996 /* Unions are similar to RECORD_TYPE but offset is always 0.
5997 */
5998 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5999 {
6000 if (TREE_CODE (field) == FIELD_DECL)
6001 {
6002 int num;
6003
6004 if (TREE_TYPE (field) == error_mark_node)
6005 continue;
6006
6007 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6008 TREE_TYPE (field), subclasses,
6009 bit_offset);
6010 if (!num)
6011 return 0;
6012 for (i = 0; i < num; i++)
6013 classes[i] = merge_classes (subclasses[i], classes[i]);
6014 }
6015 }
6016 break;
6017
6018 default:
6019 gcc_unreachable ();
6020 }
6021
6022 if (words > 2)
6023 {
6024 /* When the size exceeds 16 bytes, if the first eightbyte isn't
6025 X86_64_SSE_CLASS or any of the remaining ones isn't
6026 X86_64_SSEUP_CLASS, everything should be passed in
6027 memory. */
6028 if (classes[0] != X86_64_SSE_CLASS)
6029 return 0;
6030
6031 for (i = 1; i < words; i++)
6032 if (classes[i] != X86_64_SSEUP_CLASS)
6033 return 0;
6034 }
6035
6036 /* Final merger cleanup. */
6037 for (i = 0; i < words; i++)
6038 {
6039 /* If one class is MEMORY, everything should be passed in
6040 memory. */
6041 if (classes[i] == X86_64_MEMORY_CLASS)
6042 return 0;
6043
6044 /* X86_64_SSEUP_CLASS should always be preceded by
6045 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6046 if (classes[i] == X86_64_SSEUP_CLASS
6047 && classes[i - 1] != X86_64_SSE_CLASS
6048 && classes[i - 1] != X86_64_SSEUP_CLASS)
6049 {
6050 /* The first one should never be X86_64_SSEUP_CLASS. */
6051 gcc_assert (i != 0);
6052 classes[i] = X86_64_SSE_CLASS;
6053 }
6054
6055 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6056 everything should be passed in memory. */
6057 if (classes[i] == X86_64_X87UP_CLASS
6058 && (classes[i - 1] != X86_64_X87_CLASS))
6059 {
6060 static bool warned;
6061
6062 /* The first one should never be X86_64_X87UP_CLASS. */
6063 gcc_assert (i != 0);
6064 if (!warned && warn_psabi)
6065 {
6066 warned = true;
6067 inform (input_location,
6068 "the ABI of passing union with long double"
6069 " has changed in GCC 4.4");
6070 }
6071 return 0;
6072 }
6073 }
6074 return words;
6075 }
6076
6077 /* Compute the alignment needed. We align all types to their natural
6078 boundaries, with the exception of XFmode, which is aligned to 64 bits. */
6079 if (mode != VOIDmode && mode != BLKmode)
6080 {
6081 int mode_alignment = GET_MODE_BITSIZE (mode);
6082
6083 if (mode == XFmode)
6084 mode_alignment = 128;
6085 else if (mode == XCmode)
6086 mode_alignment = 256;
6087 if (COMPLEX_MODE_P (mode))
6088 mode_alignment /= 2;
6089 /* Misaligned fields are always returned in memory. */
6090 if (bit_offset % mode_alignment)
6091 return 0;
6092 }
6093
6094 /* For V1xx modes, just use the base mode. */
6095 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6096 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6097 mode = GET_MODE_INNER (mode);
6098
6099 /* Classification of atomic types. */
6100 switch (mode)
6101 {
6102 case SDmode:
6103 case DDmode:
6104 classes[0] = X86_64_SSE_CLASS;
6105 return 1;
6106 case TDmode:
6107 classes[0] = X86_64_SSE_CLASS;
6108 classes[1] = X86_64_SSEUP_CLASS;
6109 return 2;
6110 case DImode:
6111 case SImode:
6112 case HImode:
6113 case QImode:
6114 case CSImode:
6115 case CHImode:
6116 case CQImode:
6117 {
6118 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6119
6120 if (size <= 32)
6121 {
6122 classes[0] = X86_64_INTEGERSI_CLASS;
6123 return 1;
6124 }
6125 else if (size <= 64)
6126 {
6127 classes[0] = X86_64_INTEGER_CLASS;
6128 return 1;
6129 }
6130 else if (size <= 64+32)
6131 {
6132 classes[0] = X86_64_INTEGER_CLASS;
6133 classes[1] = X86_64_INTEGERSI_CLASS;
6134 return 2;
6135 }
6136 else if (size <= 64+64)
6137 {
6138 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6139 return 2;
6140 }
6141 else
6142 gcc_unreachable ();
6143 }
6144 case CDImode:
6145 case TImode:
6146 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6147 return 2;
6148 case COImode:
6149 case OImode:
6150 /* OImode shouldn't be used directly. */
6151 gcc_unreachable ();
6152 case CTImode:
6153 return 0;
6154 case SFmode:
6155 if (!(bit_offset % 64))
6156 classes[0] = X86_64_SSESF_CLASS;
6157 else
6158 classes[0] = X86_64_SSE_CLASS;
6159 return 1;
6160 case DFmode:
6161 classes[0] = X86_64_SSEDF_CLASS;
6162 return 1;
6163 case XFmode:
6164 classes[0] = X86_64_X87_CLASS;
6165 classes[1] = X86_64_X87UP_CLASS;
6166 return 2;
6167 case TFmode:
6168 classes[0] = X86_64_SSE_CLASS;
6169 classes[1] = X86_64_SSEUP_CLASS;
6170 return 2;
6171 case SCmode:
6172 classes[0] = X86_64_SSE_CLASS;
6173 if (!(bit_offset % 64))
6174 return 1;
6175 else
6176 {
6177 static bool warned;
6178
6179 if (!warned && warn_psabi)
6180 {
6181 warned = true;
6182 inform (input_location,
6183 "the ABI of passing structure with complex float"
6184 " member has changed in GCC 4.4");
6185 }
6186 classes[1] = X86_64_SSESF_CLASS;
6187 return 2;
6188 }
6189 case DCmode:
6190 classes[0] = X86_64_SSEDF_CLASS;
6191 classes[1] = X86_64_SSEDF_CLASS;
6192 return 2;
6193 case XCmode:
6194 classes[0] = X86_64_COMPLEX_X87_CLASS;
6195 return 1;
6196 case TCmode:
6197 /* This mode is larger than 16 bytes. */
6198 return 0;
6199 case V8SFmode:
6200 case V8SImode:
6201 case V32QImode:
6202 case V16HImode:
6203 case V4DFmode:
6204 case V4DImode:
6205 classes[0] = X86_64_SSE_CLASS;
6206 classes[1] = X86_64_SSEUP_CLASS;
6207 classes[2] = X86_64_SSEUP_CLASS;
6208 classes[3] = X86_64_SSEUP_CLASS;
6209 return 4;
6210 case V4SFmode:
6211 case V4SImode:
6212 case V16QImode:
6213 case V8HImode:
6214 case V2DFmode:
6215 case V2DImode:
6216 classes[0] = X86_64_SSE_CLASS;
6217 classes[1] = X86_64_SSEUP_CLASS;
6218 return 2;
6219 case V1TImode:
6220 case V1DImode:
6221 case V2SFmode:
6222 case V2SImode:
6223 case V4HImode:
6224 case V8QImode:
6225 classes[0] = X86_64_SSE_CLASS;
6226 return 1;
6227 case BLKmode:
6228 case VOIDmode:
6229 return 0;
6230 default:
6231 gcc_assert (VECTOR_MODE_P (mode));
6232
6233 if (bytes > 16)
6234 return 0;
6235
6236 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6237
6238 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6239 classes[0] = X86_64_INTEGERSI_CLASS;
6240 else
6241 classes[0] = X86_64_INTEGER_CLASS;
6242 classes[1] = X86_64_INTEGER_CLASS;
6243 return 1 + (bytes > 8);
6244 }
6245 }
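/* For example, the 16-byte aggregate

       struct s { double d; int i; };

   classifies as { X86_64_SSEDF_CLASS, X86_64_INTEGERSI_CLASS }, so the
   double travels in an SSE register and the int in a general-purpose
   register, whereas

       struct t { long double ld; };

   classifies as { X86_64_X87_CLASS, X86_64_X87UP_CLASS } and is
   therefore passed in memory (and returned in st(0)).  */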
6246
6247 /* Examine the argument and set *INT_NREGS and *SSE_NREGS to the number
6248 of registers required in each class. Return 0 iff the parameter should be passed in memory. */
6249 static int
6250 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6251 int *int_nregs, int *sse_nregs)
6252 {
6253 enum x86_64_reg_class regclass[MAX_CLASSES];
6254 int n = classify_argument (mode, type, regclass, 0);
6255
6256 *int_nregs = 0;
6257 *sse_nregs = 0;
6258 if (!n)
6259 return 0;
6260 for (n--; n >= 0; n--)
6261 switch (regclass[n])
6262 {
6263 case X86_64_INTEGER_CLASS:
6264 case X86_64_INTEGERSI_CLASS:
6265 (*int_nregs)++;
6266 break;
6267 case X86_64_SSE_CLASS:
6268 case X86_64_SSESF_CLASS:
6269 case X86_64_SSEDF_CLASS:
6270 (*sse_nregs)++;
6271 break;
6272 case X86_64_NO_CLASS:
6273 case X86_64_SSEUP_CLASS:
6274 break;
6275 case X86_64_X87_CLASS:
6276 case X86_64_X87UP_CLASS:
6277 if (!in_return)
6278 return 0;
6279 break;
6280 case X86_64_COMPLEX_X87_CLASS:
6281 return in_return ? 2 : 0;
6282 case X86_64_MEMORY_CLASS:
6283 gcc_unreachable ();
6284 }
6285 return 1;
6286 }
6287
6288 /* Construct a container for the argument used by the GCC interface. See
6289 FUNCTION_ARG for the detailed description. */
6290
6291 static rtx
6292 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6293 const_tree type, int in_return, int nintregs, int nsseregs,
6294 const int *intreg, int sse_regno)
6295 {
6296 /* The following variables hold the static issued_error state. */
6297 static bool issued_sse_arg_error;
6298 static bool issued_sse_ret_error;
6299 static bool issued_x87_ret_error;
6300
6301 enum machine_mode tmpmode;
6302 int bytes =
6303 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6304 enum x86_64_reg_class regclass[MAX_CLASSES];
6305 int n;
6306 int i;
6307 int nexps = 0;
6308 int needed_sseregs, needed_intregs;
6309 rtx exp[MAX_CLASSES];
6310 rtx ret;
6311
6312 n = classify_argument (mode, type, regclass, 0);
6313 if (!n)
6314 return NULL;
6315 if (!examine_argument (mode, type, in_return, &needed_intregs,
6316 &needed_sseregs))
6317 return NULL;
6318 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6319 return NULL;
6320
6321 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6322 some less clueful developer tries to use floating-point anyway. */
6323 if (needed_sseregs && !TARGET_SSE)
6324 {
6325 if (in_return)
6326 {
6327 if (!issued_sse_ret_error)
6328 {
6329 error ("SSE register return with SSE disabled");
6330 issued_sse_ret_error = true;
6331 }
6332 }
6333 else if (!issued_sse_arg_error)
6334 {
6335 error ("SSE register argument with SSE disabled");
6336 issued_sse_arg_error = true;
6337 }
6338 return NULL;
6339 }
6340
6341 /* Likewise, error if the ABI requires us to return values in the
6342 x87 registers and the user specified -mno-80387. */
6343 if (!TARGET_80387 && in_return)
6344 for (i = 0; i < n; i++)
6345 if (regclass[i] == X86_64_X87_CLASS
6346 || regclass[i] == X86_64_X87UP_CLASS
6347 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6348 {
6349 if (!issued_x87_ret_error)
6350 {
6351 error ("x87 register return with x87 disabled");
6352 issued_x87_ret_error = true;
6353 }
6354 return NULL;
6355 }
6356
6357 /* First construct simple cases. Avoid SCmode, since we want to use
6358 single register to pass this type. */
6359 if (n == 1 && mode != SCmode)
6360 switch (regclass[0])
6361 {
6362 case X86_64_INTEGER_CLASS:
6363 case X86_64_INTEGERSI_CLASS:
6364 return gen_rtx_REG (mode, intreg[0]);
6365 case X86_64_SSE_CLASS:
6366 case X86_64_SSESF_CLASS:
6367 case X86_64_SSEDF_CLASS:
6368 if (mode != BLKmode)
6369 return gen_reg_or_parallel (mode, orig_mode,
6370 SSE_REGNO (sse_regno));
6371 break;
6372 case X86_64_X87_CLASS:
6373 case X86_64_COMPLEX_X87_CLASS:
6374 return gen_rtx_REG (mode, FIRST_STACK_REG);
6375 case X86_64_NO_CLASS:
6376 /* Zero sized array, struct or class. */
6377 return NULL;
6378 default:
6379 gcc_unreachable ();
6380 }
6381 if (n == 2
6382 && regclass[0] == X86_64_SSE_CLASS
6383 && regclass[1] == X86_64_SSEUP_CLASS
6384 && mode != BLKmode)
6385 return gen_reg_or_parallel (mode, orig_mode,
6386 SSE_REGNO (sse_regno));
6387 if (n == 4
6388 && regclass[0] == X86_64_SSE_CLASS
6389 && regclass[1] == X86_64_SSEUP_CLASS
6390 && regclass[2] == X86_64_SSEUP_CLASS
6391 && regclass[3] == X86_64_SSEUP_CLASS
6392 && mode != BLKmode)
6393 return gen_reg_or_parallel (mode, orig_mode,
6394 SSE_REGNO (sse_regno));
6395 if (n == 2
6396 && regclass[0] == X86_64_X87_CLASS
6397 && regclass[1] == X86_64_X87UP_CLASS)
6398 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6399
6400 if (n == 2
6401 && regclass[0] == X86_64_INTEGER_CLASS
6402 && regclass[1] == X86_64_INTEGER_CLASS
6403 && (mode == CDImode || mode == TImode || mode == TFmode)
6404 && intreg[0] + 1 == intreg[1])
6405 return gen_rtx_REG (mode, intreg[0]);
6406
6407 /* Otherwise figure out the entries of the PARALLEL. */
6408 for (i = 0; i < n; i++)
6409 {
6410 int pos;
6411
6412 switch (regclass[i])
6413 {
6414 case X86_64_NO_CLASS:
6415 break;
6416 case X86_64_INTEGER_CLASS:
6417 case X86_64_INTEGERSI_CLASS:
6418 /* Merge TImodes on aligned occasions here too. */
6419 if (i * 8 + 8 > bytes)
6420 tmpmode
6421 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6422 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6423 tmpmode = SImode;
6424 else
6425 tmpmode = DImode;
6426 /* We've requested 24 bytes for which we
6427 don't have a mode. Use DImode. */
6428 if (tmpmode == BLKmode)
6429 tmpmode = DImode;
6430 exp [nexps++]
6431 = gen_rtx_EXPR_LIST (VOIDmode,
6432 gen_rtx_REG (tmpmode, *intreg),
6433 GEN_INT (i*8));
6434 intreg++;
6435 break;
6436 case X86_64_SSESF_CLASS:
6437 exp [nexps++]
6438 = gen_rtx_EXPR_LIST (VOIDmode,
6439 gen_rtx_REG (SFmode,
6440 SSE_REGNO (sse_regno)),
6441 GEN_INT (i*8));
6442 sse_regno++;
6443 break;
6444 case X86_64_SSEDF_CLASS:
6445 exp [nexps++]
6446 = gen_rtx_EXPR_LIST (VOIDmode,
6447 gen_rtx_REG (DFmode,
6448 SSE_REGNO (sse_regno)),
6449 GEN_INT (i*8));
6450 sse_regno++;
6451 break;
6452 case X86_64_SSE_CLASS:
6453 pos = i;
6454 switch (n)
6455 {
6456 case 1:
6457 tmpmode = DImode;
6458 break;
6459 case 2:
6460 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6461 {
6462 tmpmode = TImode;
6463 i++;
6464 }
6465 else
6466 tmpmode = DImode;
6467 break;
6468 case 4:
6469 gcc_assert (i == 0
6470 && regclass[1] == X86_64_SSEUP_CLASS
6471 && regclass[2] == X86_64_SSEUP_CLASS
6472 && regclass[3] == X86_64_SSEUP_CLASS);
6473 tmpmode = OImode;
6474 i += 3;
6475 break;
6476 default:
6477 gcc_unreachable ();
6478 }
6479 exp [nexps++]
6480 = gen_rtx_EXPR_LIST (VOIDmode,
6481 gen_rtx_REG (tmpmode,
6482 SSE_REGNO (sse_regno)),
6483 GEN_INT (pos*8));
6484 sse_regno++;
6485 break;
6486 default:
6487 gcc_unreachable ();
6488 }
6489 }
6490
6491 /* Empty aligned struct, union or class. */
6492 if (nexps == 0)
6493 return NULL;
6494
6495 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6496 for (i = 0; i < nexps; i++)
6497 XVECEXP (ret, 0, i) = exp [i];
6498 return ret;
6499 }
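/* Continuing the example above, for struct s { double d; int i; } the
   container built here is a PARALLEL of two EXPR_LIST entries, roughly

       (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                  (expr_list (reg:SI di) (const_int 8))])

   which tells the middle-end which piece of the argument lives in
   which register and at what byte offset.  */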
6500
6501 /* Update the data in CUM to advance over an argument of mode MODE
6502 and data type TYPE. (TYPE is null for libcalls where that information
6503 may not be available.) */
6504
6505 static void
6506 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6507 const_tree type, HOST_WIDE_INT bytes,
6508 HOST_WIDE_INT words)
6509 {
6510 switch (mode)
6511 {
6512 default:
6513 break;
6514
6515 case BLKmode:
6516 if (bytes < 0)
6517 break;
6518 /* FALLTHRU */
6519
6520 case DImode:
6521 case SImode:
6522 case HImode:
6523 case QImode:
6524 cum->words += words;
6525 cum->nregs -= words;
6526 cum->regno += words;
6527
6528 if (cum->nregs <= 0)
6529 {
6530 cum->nregs = 0;
6531 cum->regno = 0;
6532 }
6533 break;
6534
6535 case OImode:
6536 /* OImode shouldn't be used directly. */
6537 gcc_unreachable ();
6538
6539 case DFmode:
6540 if (cum->float_in_sse < 2)
6541 break;
6542 case SFmode:
6543 if (cum->float_in_sse < 1)
6544 break;
6545 /* FALLTHRU */
6546
6547 case V8SFmode:
6548 case V8SImode:
6549 case V32QImode:
6550 case V16HImode:
6551 case V4DFmode:
6552 case V4DImode:
6553 case TImode:
6554 case V16QImode:
6555 case V8HImode:
6556 case V4SImode:
6557 case V2DImode:
6558 case V4SFmode:
6559 case V2DFmode:
6560 if (!type || !AGGREGATE_TYPE_P (type))
6561 {
6562 cum->sse_words += words;
6563 cum->sse_nregs -= 1;
6564 cum->sse_regno += 1;
6565 if (cum->sse_nregs <= 0)
6566 {
6567 cum->sse_nregs = 0;
6568 cum->sse_regno = 0;
6569 }
6570 }
6571 break;
6572
6573 case V8QImode:
6574 case V4HImode:
6575 case V2SImode:
6576 case V2SFmode:
6577 case V1TImode:
6578 case V1DImode:
6579 if (!type || !AGGREGATE_TYPE_P (type))
6580 {
6581 cum->mmx_words += words;
6582 cum->mmx_nregs -= 1;
6583 cum->mmx_regno += 1;
6584 if (cum->mmx_nregs <= 0)
6585 {
6586 cum->mmx_nregs = 0;
6587 cum->mmx_regno = 0;
6588 }
6589 }
6590 break;
6591 }
6592 }
6593
6594 static void
6595 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6596 const_tree type, HOST_WIDE_INT words, bool named)
6597 {
6598 int int_nregs, sse_nregs;
6599
6600 /* Unnamed 256bit vector mode parameters are passed on stack. */
6601 if (!named && VALID_AVX256_REG_MODE (mode))
6602 return;
6603
6604 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6605 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6606 {
6607 cum->nregs -= int_nregs;
6608 cum->sse_nregs -= sse_nregs;
6609 cum->regno += int_nregs;
6610 cum->sse_regno += sse_nregs;
6611 }
6612 else
6613 {
6614 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6615 cum->words = (cum->words + align - 1) & ~(align - 1);
6616 cum->words += words;
6617 }
6618 }
6619
6620 static void
6621 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6622 HOST_WIDE_INT words)
6623 {
6624 /* Otherwise, this should be passed indirectly. */
6625 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6626
6627 cum->words += words;
6628 if (cum->nregs > 0)
6629 {
6630 cum->nregs -= 1;
6631 cum->regno += 1;
6632 }
6633 }
6634
6635 /* Update the data in CUM to advance over an argument of mode MODE and
6636 data type TYPE. (TYPE is null for libcalls where that information
6637 may not be available.) */
6638
6639 static void
6640 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6641 const_tree type, bool named)
6642 {
6643 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6644 HOST_WIDE_INT bytes, words;
6645
6646 if (mode == BLKmode)
6647 bytes = int_size_in_bytes (type);
6648 else
6649 bytes = GET_MODE_SIZE (mode);
6650 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6651
6652 if (type)
6653 mode = type_natural_mode (type, NULL);
6654
6655 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6656 function_arg_advance_ms_64 (cum, bytes, words);
6657 else if (TARGET_64BIT)
6658 function_arg_advance_64 (cum, mode, type, words, named);
6659 else
6660 function_arg_advance_32 (cum, mode, type, bytes, words);
6661 }
6662
6663 /* Define where to put the arguments to a function.
6664 Value is zero to push the argument on the stack,
6665 or a hard register in which to store the argument.
6666
6667 MODE is the argument's machine mode.
6668 TYPE is the data type of the argument (as a tree).
6669 This is null for libcalls where that information may
6670 not be available.
6671 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6672 the preceding args and about the function being called.
6673 NAMED is nonzero if this argument is a named parameter
6674 (otherwise it is an extra parameter matching an ellipsis). */
6675
6676 static rtx
6677 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6678 enum machine_mode orig_mode, const_tree type,
6679 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6680 {
6681 static bool warnedsse, warnedmmx;
6682
6683 /* Avoid the AL settings for the Unix64 ABI. */
6684 if (mode == VOIDmode)
6685 return constm1_rtx;
6686
6687 switch (mode)
6688 {
6689 default:
6690 break;
6691
6692 case BLKmode:
6693 if (bytes < 0)
6694 break;
6695 /* FALLTHRU */
6696 case DImode:
6697 case SImode:
6698 case HImode:
6699 case QImode:
6700 if (words <= cum->nregs)
6701 {
6702 int regno = cum->regno;
6703
6704 /* Fastcall allocates the first two DWORD (SImode) or
6705 smaller arguments to ECX and EDX if it isn't an
6706 aggregate type. */
6707 if (cum->fastcall)
6708 {
6709 if (mode == BLKmode
6710 || mode == DImode
6711 || (type && AGGREGATE_TYPE_P (type)))
6712 break;
6713
6714 /* ECX, not EAX, is the first allocated register. */
6715 if (regno == AX_REG)
6716 regno = CX_REG;
6717 }
6718 return gen_rtx_REG (mode, regno);
6719 }
6720 break;
6721
6722 case DFmode:
6723 if (cum->float_in_sse < 2)
6724 break;
6725 case SFmode:
6726 if (cum->float_in_sse < 1)
6727 break;
6728 /* FALLTHRU */
6729 case TImode:
6730 /* In 32-bit mode, we pass TImode in xmm registers. */
6731 case V16QImode:
6732 case V8HImode:
6733 case V4SImode:
6734 case V2DImode:
6735 case V4SFmode:
6736 case V2DFmode:
6737 if (!type || !AGGREGATE_TYPE_P (type))
6738 {
6739 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6740 {
6741 warnedsse = true;
6742 warning (0, "SSE vector argument without SSE enabled "
6743 "changes the ABI");
6744 }
6745 if (cum->sse_nregs)
6746 return gen_reg_or_parallel (mode, orig_mode,
6747 cum->sse_regno + FIRST_SSE_REG);
6748 }
6749 break;
6750
6751 case OImode:
6752 /* OImode shouldn't be used directly. */
6753 gcc_unreachable ();
6754
6755 case V8SFmode:
6756 case V8SImode:
6757 case V32QImode:
6758 case V16HImode:
6759 case V4DFmode:
6760 case V4DImode:
6761 if (!type || !AGGREGATE_TYPE_P (type))
6762 {
6763 if (cum->sse_nregs)
6764 return gen_reg_or_parallel (mode, orig_mode,
6765 cum->sse_regno + FIRST_SSE_REG);
6766 }
6767 break;
6768
6769 case V8QImode:
6770 case V4HImode:
6771 case V2SImode:
6772 case V2SFmode:
6773 case V1TImode:
6774 case V1DImode:
6775 if (!type || !AGGREGATE_TYPE_P (type))
6776 {
6777 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6778 {
6779 warnedmmx = true;
6780 warning (0, "MMX vector argument without MMX enabled "
6781 "changes the ABI");
6782 }
6783 if (cum->mmx_nregs)
6784 return gen_reg_or_parallel (mode, orig_mode,
6785 cum->mmx_regno + FIRST_MMX_REG);
6786 }
6787 break;
6788 }
6789
6790 return NULL_RTX;
6791 }
6792
6793 static rtx
6794 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6795 enum machine_mode orig_mode, const_tree type, bool named)
6796 {
6797 /* Handle a hidden AL argument containing number of registers
6798 for varargs x86-64 functions. */
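 /* For illustration (an assumption drawn from the x86-64 psABI, not
 original GCC text): for a varargs call such as printf ("%f", 1.0)
 the caller sets %al to the number of vector registers used, here 1;
 that hidden argument is what is materialized below when MODE is
 VOIDmode. */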
6799 if (mode == VOIDmode)
6800 return GEN_INT (cum->maybe_vaarg
6801 ? (cum->sse_nregs < 0
6802 ? X86_64_SSE_REGPARM_MAX
6803 : cum->sse_regno)
6804 : -1);
6805
6806 switch (mode)
6807 {
6808 default:
6809 break;
6810
6811 case V8SFmode:
6812 case V8SImode:
6813 case V32QImode:
6814 case V16HImode:
6815 case V4DFmode:
6816 case V4DImode:
6817 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6818 if (!named)
6819 return NULL;
6820 break;
6821 }
6822
6823 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6824 cum->sse_nregs,
6825 &x86_64_int_parameter_registers [cum->regno],
6826 cum->sse_regno);
6827 }
6828
6829 static rtx
6830 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6831 enum machine_mode orig_mode, bool named,
6832 HOST_WIDE_INT bytes)
6833 {
6834 unsigned int regno;
6835
6836 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6837 We use a value of -2 to specify that the current function call is MS_ABI. */
6838 if (mode == VOIDmode)
6839 return GEN_INT (-2);
6840
6841 /* If we've run out of registers, it goes on the stack. */
6842 if (cum->nregs == 0)
6843 return NULL_RTX;
6844
6845 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6846
6847 /* Only floating point modes are passed in anything but integer regs. */
6848 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6849 {
6850 if (named)
6851 regno = cum->regno + FIRST_SSE_REG;
6852 else
6853 {
6854 rtx t1, t2;
6855
6856 /* Unnamed floating parameters are passed in both the
6857 SSE and integer registers. */
6858 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6859 t2 = gen_rtx_REG (mode, regno);
6860 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6861 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6862 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6863 }
6864 }
6865 /* Handle aggregate types passed in registers. */
6866 if (orig_mode == BLKmode)
6867 {
6868 if (bytes > 0 && bytes <= 8)
6869 mode = (bytes > 4 ? DImode : SImode);
6870 if (mode == BLKmode)
6871 mode = DImode;
6872 }
6873
6874 return gen_reg_or_parallel (mode, orig_mode, regno);
6875 }
6876
6877 /* Return where to put the arguments to a function.
6878 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6879
6880 MODE is the argument's machine mode. TYPE is the data type of the
6881 argument. It is null for libcalls where that information may not be
6882 available. CUM gives information about the preceding args and about
6883 the function being called. NAMED is nonzero if this argument is a
6884 named parameter (otherwise it is an extra parameter matching an
6885 ellipsis). */
6886
6887 static rtx
6888 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6889 const_tree type, bool named)
6890 {
6891 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6892 enum machine_mode mode = omode;
6893 HOST_WIDE_INT bytes, words;
6894 rtx arg;
6895
6896 if (mode == BLKmode)
6897 bytes = int_size_in_bytes (type);
6898 else
6899 bytes = GET_MODE_SIZE (mode);
6900 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6901
6902 /* To simplify the code below, represent vector types with a vector mode
6903 even if MMX/SSE are not active. */
6904 if (type && TREE_CODE (type) == VECTOR_TYPE)
6905 mode = type_natural_mode (type, cum);
6906
6907 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6908 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6909 else if (TARGET_64BIT)
6910 arg = function_arg_64 (cum, mode, omode, type, named);
6911 else
6912 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6913
6914 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6915 {
6916 /* This argument uses 256bit AVX modes. */
6917 if (cum->caller)
6918 cfun->machine->callee_pass_avx256_p = true;
6919 else
6920 cfun->machine->caller_pass_avx256_p = true;
6921 }
6922
6923 return arg;
6924 }
6925
6926 /* A C expression that indicates when an argument must be passed by
6927 reference. If nonzero for an argument, a copy of that argument is
6928 made in memory and a pointer to the argument is passed instead of
6929 the argument itself. The pointer is passed in whatever way is
6930 appropriate for passing a pointer to that type. */
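/* A minimal illustrative example (not part of the original source),
 following the Windows x64 convention checked in the MS_ABI branch below:

 struct s3 { char a, b, c; }; size 3 -> passed by reference
 struct s8 { double d; }; size 8 -> passed by value
 __m128 v; size 16 -> passed by reference

 i.e. only sizes 1, 2, 4 and 8 are passed by value. */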
6931
6932 static bool
6933 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6934 enum machine_mode mode ATTRIBUTE_UNUSED,
6935 const_tree type, bool named ATTRIBUTE_UNUSED)
6936 {
6937 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6938
6939 /* See Windows x64 Software Convention. */
6940 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6941 {
6942 int msize = (int) GET_MODE_SIZE (mode);
6943 if (type)
6944 {
6945 /* Arrays are passed by reference. */
6946 if (TREE_CODE (type) == ARRAY_TYPE)
6947 return true;
6948
6949 if (AGGREGATE_TYPE_P (type))
6950 {
6951 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6952 are passed by reference. */
6953 msize = int_size_in_bytes (type);
6954 }
6955 }
6956
6957 /* __m128 is passed by reference. */
6958 switch (msize) {
6959 case 1: case 2: case 4: case 8:
6960 break;
6961 default:
6962 return true;
6963 }
6964 }
6965 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6966 return 1;
6967
6968 return 0;
6969 }
6970
6971 /* Return true when TYPE should be 128bit aligned for 32bit argument
6972 passing ABI. XXX: This function is obsolete and is only used for
6973 checking psABI compatibility with previous versions of GCC. */
6974
6975 static bool
6976 ix86_compat_aligned_value_p (const_tree type)
6977 {
6978 enum machine_mode mode = TYPE_MODE (type);
6979 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6980 || mode == TDmode
6981 || mode == TFmode
6982 || mode == TCmode)
6983 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6984 return true;
6985 if (TYPE_ALIGN (type) < 128)
6986 return false;
6987
6988 if (AGGREGATE_TYPE_P (type))
6989 {
6990 /* Walk the aggregates recursively. */
6991 switch (TREE_CODE (type))
6992 {
6993 case RECORD_TYPE:
6994 case UNION_TYPE:
6995 case QUAL_UNION_TYPE:
6996 {
6997 tree field;
6998
6999 /* Walk all the structure fields. */
7000 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7001 {
7002 if (TREE_CODE (field) == FIELD_DECL
7003 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7004 return true;
7005 }
7006 break;
7007 }
7008
7009 case ARRAY_TYPE:
7010 /* Just in case some languages pass arrays by value. */
7011 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7012 return true;
7013 break;
7014
7015 default:
7016 gcc_unreachable ();
7017 }
7018 }
7019 return false;
7020 }
7021
7022 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7023 XXX: This function is obsolete and is only used for checking psABI
7024 compatibility with previous versions of GCC. */
7025
7026 static unsigned int
7027 ix86_compat_function_arg_boundary (enum machine_mode mode,
7028 const_tree type, unsigned int align)
7029 {
7030 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7031 natural boundaries. */
7032 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7033 {
7034 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7035 make an exception for SSE modes since these require 128bit
7036 alignment.
7037
7038 The handling here differs from field_alignment. ICC aligns MMX
7039 arguments to 4 byte boundaries, while structure fields are aligned
7040 to 8 byte boundaries. */
7041 if (!type)
7042 {
7043 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7044 align = PARM_BOUNDARY;
7045 }
7046 else
7047 {
7048 if (!ix86_compat_aligned_value_p (type))
7049 align = PARM_BOUNDARY;
7050 }
7051 }
7052 if (align > BIGGEST_ALIGNMENT)
7053 align = BIGGEST_ALIGNMENT;
7054 return align;
7055 }
7056
7057 /* Return true when TYPE should be 128bit aligned for 32bit argument
7058 passing ABI. */
7059
7060 static bool
7061 ix86_contains_aligned_value_p (const_tree type)
7062 {
7063 enum machine_mode mode = TYPE_MODE (type);
7064
7065 if (mode == XFmode || mode == XCmode)
7066 return false;
7067
7068 if (TYPE_ALIGN (type) < 128)
7069 return false;
7070
7071 if (AGGREGATE_TYPE_P (type))
7072 {
7073 /* Walk the aggregates recursively. */
7074 switch (TREE_CODE (type))
7075 {
7076 case RECORD_TYPE:
7077 case UNION_TYPE:
7078 case QUAL_UNION_TYPE:
7079 {
7080 tree field;
7081
7082 /* Walk all the structure fields. */
7083 for (field = TYPE_FIELDS (type);
7084 field;
7085 field = DECL_CHAIN (field))
7086 {
7087 if (TREE_CODE (field) == FIELD_DECL
7088 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7089 return true;
7090 }
7091 break;
7092 }
7093
7094 case ARRAY_TYPE:
7095 /* Just in case some languages pass arrays by value. */
7096 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7097 return true;
7098 break;
7099
7100 default:
7101 gcc_unreachable ();
7102 }
7103 }
7104 else
7105 return TYPE_ALIGN (type) >= 128;
7106
7107 return false;
7108 }
7109
7110 /* Gives the alignment boundary, in bits, of an argument with the
7111 specified mode and type. */
7112
7113 static unsigned int
7114 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7115 {
7116 unsigned int align;
7117 if (type)
7118 {
7119 /* Since the main variant type is used for the call, convert the
7120 type to its main variant. */
7121 type = TYPE_MAIN_VARIANT (type);
7122 align = TYPE_ALIGN (type);
7123 }
7124 else
7125 align = GET_MODE_ALIGNMENT (mode);
7126 if (align < PARM_BOUNDARY)
7127 align = PARM_BOUNDARY;
7128 else
7129 {
7130 static bool warned;
7131 unsigned int saved_align = align;
7132
7133 if (!TARGET_64BIT)
7134 {
7135 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7136 if (!type)
7137 {
7138 if (mode == XFmode || mode == XCmode)
7139 align = PARM_BOUNDARY;
7140 }
7141 else if (!ix86_contains_aligned_value_p (type))
7142 align = PARM_BOUNDARY;
7143
7144 if (align < 128)
7145 align = PARM_BOUNDARY;
7146 }
7147
7148 if (warn_psabi
7149 && !warned
7150 && align != ix86_compat_function_arg_boundary (mode, type,
7151 saved_align))
7152 {
7153 warned = true;
7154 inform (input_location,
7155 "The ABI for passing parameters with %d-byte"
7156 " alignment has changed in GCC 4.6",
7157 align / BITS_PER_UNIT);
7158 }
7159 }
7160
7161 return align;
7162 }
7163
7164 /* Return true if N is a possible register number of function value. */
7165
7166 static bool
7167 ix86_function_value_regno_p (const unsigned int regno)
7168 {
7169 switch (regno)
7170 {
7171 case AX_REG:
7172 return true;
7173
7174 case FIRST_FLOAT_REG:
7175 /* TODO: The function should depend on the current function's ABI,
7176 but builtins.c would then need updating. Therefore we use the
7177 default ABI. */
7178 if (TARGET_64BIT && ix86_abi == MS_ABI)
7179 return false;
7180 return TARGET_FLOAT_RETURNS_IN_80387;
7181
7182 case FIRST_SSE_REG:
7183 return TARGET_SSE;
7184
7185 case FIRST_MMX_REG:
7186 if (TARGET_MACHO || TARGET_64BIT)
7187 return false;
7188 return TARGET_MMX;
7189 }
7190
7191 return false;
7192 }
7193
7194 /* Define how to find the value returned by a function.
7195 VALTYPE is the data type of the value (as a tree).
7196 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7197 otherwise, FUNC is 0. */
7198
7199 static rtx
7200 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7201 const_tree fntype, const_tree fn)
7202 {
7203 unsigned int regno;
7204
7205 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7206 we normally prevent this case when mmx is not available. However
7207 some ABIs may require the result to be returned like DImode. */
7208 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7209 regno = FIRST_MMX_REG;
7210
7211 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7212 we prevent this case when sse is not available. However some ABIs
7213 may require the result to be returned like integer TImode. */
7214 else if (mode == TImode
7215 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7216 regno = FIRST_SSE_REG;
7217
7218 /* 32-byte vector modes in %ymm0. */
7219 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7220 regno = FIRST_SSE_REG;
7221
7222 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7223 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7224 regno = FIRST_FLOAT_REG;
7225 else
7226 /* Most things go in %eax. */
7227 regno = AX_REG;
7228
7229 /* Override FP return register with %xmm0 for local functions when
7230 SSE math is enabled or for functions with sseregparm attribute. */
7231 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7232 {
7233 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7234 if ((sse_level >= 1 && mode == SFmode)
7235 || (sse_level == 2 && mode == DFmode))
7236 regno = FIRST_SSE_REG;
7237 }
7238
7239 /* OImode shouldn't be used directly. */
7240 gcc_assert (mode != OImode);
7241
7242 return gen_rtx_REG (orig_mode, regno);
7243 }
7244
7245 static rtx
7246 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7247 const_tree valtype)
7248 {
7249 rtx ret;
7250
7251 /* Handle libcalls, which don't provide a type node. */
7252 if (valtype == NULL)
7253 {
7254 unsigned int regno;
7255
7256 switch (mode)
7257 {
7258 case SFmode:
7259 case SCmode:
7260 case DFmode:
7261 case DCmode:
7262 case TFmode:
7263 case SDmode:
7264 case DDmode:
7265 case TDmode:
7266 regno = FIRST_SSE_REG;
7267 break;
7268 case XFmode:
7269 case XCmode:
7270 regno = FIRST_FLOAT_REG;
7271 break;
7272 case TCmode:
7273 return NULL;
7274 default:
7275 regno = AX_REG;
7276 }
7277
7278 return gen_rtx_REG (mode, regno);
7279 }
7280 else if (POINTER_TYPE_P (valtype))
7281 {
7282 /* Pointers are always returned in word_mode. */
7283 mode = word_mode;
7284 }
7285
7286 ret = construct_container (mode, orig_mode, valtype, 1,
7287 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7288 x86_64_int_return_registers, 0);
7289
7290 /* For zero-sized structures, construct_container returns NULL, but we
7291 need to keep the rest of the compiler happy by returning a meaningful value. */
7292 if (!ret)
7293 ret = gen_rtx_REG (orig_mode, AX_REG);
7294
7295 return ret;
7296 }
7297
7298 static rtx
7299 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7300 {
7301 unsigned int regno = AX_REG;
7302
7303 if (TARGET_SSE)
7304 {
7305 switch (GET_MODE_SIZE (mode))
7306 {
7307 case 16:
7308 if((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7309 && !COMPLEX_MODE_P (mode))
7310 regno = FIRST_SSE_REG;
7311 break;
7312 case 8:
7313 case 4:
7314 if (mode == SFmode || mode == DFmode)
7315 regno = FIRST_SSE_REG;
7316 break;
7317 default:
7318 break;
7319 }
7320 }
7321 return gen_rtx_REG (orig_mode, regno);
7322 }
7323
7324 static rtx
7325 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7326 enum machine_mode orig_mode, enum machine_mode mode)
7327 {
7328 const_tree fn, fntype;
7329
7330 fn = NULL_TREE;
7331 if (fntype_or_decl && DECL_P (fntype_or_decl))
7332 fn = fntype_or_decl;
7333 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7334
7335 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7336 return function_value_ms_64 (orig_mode, mode);
7337 else if (TARGET_64BIT)
7338 return function_value_64 (orig_mode, mode, valtype);
7339 else
7340 return function_value_32 (orig_mode, mode, fntype, fn);
7341 }
7342
7343 static rtx
7344 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7345 bool outgoing ATTRIBUTE_UNUSED)
7346 {
7347 enum machine_mode mode, orig_mode;
7348
7349 orig_mode = TYPE_MODE (valtype);
7350 mode = type_natural_mode (valtype, NULL);
7351 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7352 }
7353
7354 /* Pointer function arguments and return values are promoted to
7355 word_mode. */
7356
7357 static enum machine_mode
7358 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7359 int *punsignedp, const_tree fntype,
7360 int for_return)
7361 {
7362 if (type != NULL_TREE && POINTER_TYPE_P (type))
7363 {
7364 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7365 return word_mode;
7366 }
7367 return default_promote_function_mode (type, mode, punsignedp, fntype,
7368 for_return);
7369 }
7370
7371 rtx
7372 ix86_libcall_value (enum machine_mode mode)
7373 {
7374 return ix86_function_value_1 (NULL, NULL, mode, mode);
7375 }
7376
7377 /* Return true iff type is returned in memory. */
7378
7379 static bool ATTRIBUTE_UNUSED
7380 return_in_memory_32 (const_tree type, enum machine_mode mode)
7381 {
7382 HOST_WIDE_INT size;
7383
7384 if (mode == BLKmode)
7385 return true;
7386
7387 size = int_size_in_bytes (type);
7388
7389 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7390 return false;
7391
7392 if (VECTOR_MODE_P (mode) || mode == TImode)
7393 {
7394 /* User-created vectors small enough to fit in EAX. */
7395 if (size < 8)
7396 return false;
7397
7398 /* MMX/3dNow values are returned in MM0,
7399 except when it doesn't exist or the ABI prescribes otherwise. */
7400 if (size == 8)
7401 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7402
7403 /* SSE values are returned in XMM0, except when it doesn't exist. */
7404 if (size == 16)
7405 return !TARGET_SSE;
7406
7407 /* AVX values are returned in YMM0, except when it doesn't exist. */
7408 if (size == 32)
7409 return !TARGET_AVX;
7410 }
7411
7412 if (mode == XFmode)
7413 return false;
7414
7415 if (size > 12)
7416 return true;
7417
7418 /* OImode shouldn't be used directly. */
7419 gcc_assert (mode != OImode);
7420
7421 return false;
7422 }
7423
7424 static bool ATTRIBUTE_UNUSED
7425 return_in_memory_64 (const_tree type, enum machine_mode mode)
7426 {
7427 int needed_intregs, needed_sseregs;
7428 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7429 }
7430
7431 static bool ATTRIBUTE_UNUSED
7432 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7433 {
7434 HOST_WIDE_INT size = int_size_in_bytes (type);
7435
7436 /* __m128 is returned in xmm0. */
7437 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7438 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7439 return false;
7440
7441 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7442 return size != 1 && size != 2 && size != 4 && size != 8;
7443 }
7444
7445 static bool
7446 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7447 {
7448 #ifdef SUBTARGET_RETURN_IN_MEMORY
7449 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7450 #else
7451 const enum machine_mode mode = type_natural_mode (type, NULL);
7452
7453 if (TARGET_64BIT)
7454 {
7455 if (ix86_function_type_abi (fntype) == MS_ABI)
7456 return return_in_memory_ms_64 (type, mode);
7457 else
7458 return return_in_memory_64 (type, mode);
7459 }
7460 else
7461 return return_in_memory_32 (type, mode);
7462 #endif
7463 }
7464
7465 /* When returning SSE vector types, we have a choice of either
7466 (1) being abi incompatible with a -march switch, or
7467 (2) generating an error.
7468 Given no good solution, I think the safest thing is one warning.
7469 The user won't be able to use -Werror, but....
7470
7471 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7472 called in response to actually generating a caller or callee that
7473 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7474 via aggregate_value_p for general type probing from tree-ssa. */
7475
7476 static rtx
7477 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7478 {
7479 static bool warnedsse, warnedmmx;
7480
7481 if (!TARGET_64BIT && type)
7482 {
7483 /* Look at the return type of the function, not the function type. */
7484 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7485
7486 if (!TARGET_SSE && !warnedsse)
7487 {
7488 if (mode == TImode
7489 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7490 {
7491 warnedsse = true;
7492 warning (0, "SSE vector return without SSE enabled "
7493 "changes the ABI");
7494 }
7495 }
7496
7497 if (!TARGET_MMX && !warnedmmx)
7498 {
7499 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7500 {
7501 warnedmmx = true;
7502 warning (0, "MMX vector return without MMX enabled "
7503 "changes the ABI");
7504 }
7505 }
7506 }
7507
7508 return NULL;
7509 }
7510
7511 \f
7512 /* Create the va_list data type. */
7513
7514 /* Returns the calling convention specific va_list data type.
7515 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
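/* For illustration only (an equivalent C view of the record built below,
 per the x86-64 SysV psABI; not original GCC text):

 typedef struct __va_list_tag
 {
 unsigned int gp_offset;
 unsigned int fp_offset;
 void *overflow_arg_area;
 void *reg_save_area;
 } __builtin_va_list[1]; */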
7516
7517 static tree
7518 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7519 {
7520 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7521
7522 /* For i386 we use a plain pointer to the argument area. */
7523 if (!TARGET_64BIT || abi == MS_ABI)
7524 return build_pointer_type (char_type_node);
7525
7526 record = lang_hooks.types.make_type (RECORD_TYPE);
7527 type_decl = build_decl (BUILTINS_LOCATION,
7528 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7529
7530 f_gpr = build_decl (BUILTINS_LOCATION,
7531 FIELD_DECL, get_identifier ("gp_offset"),
7532 unsigned_type_node);
7533 f_fpr = build_decl (BUILTINS_LOCATION,
7534 FIELD_DECL, get_identifier ("fp_offset"),
7535 unsigned_type_node);
7536 f_ovf = build_decl (BUILTINS_LOCATION,
7537 FIELD_DECL, get_identifier ("overflow_arg_area"),
7538 ptr_type_node);
7539 f_sav = build_decl (BUILTINS_LOCATION,
7540 FIELD_DECL, get_identifier ("reg_save_area"),
7541 ptr_type_node);
7542
7543 va_list_gpr_counter_field = f_gpr;
7544 va_list_fpr_counter_field = f_fpr;
7545
7546 DECL_FIELD_CONTEXT (f_gpr) = record;
7547 DECL_FIELD_CONTEXT (f_fpr) = record;
7548 DECL_FIELD_CONTEXT (f_ovf) = record;
7549 DECL_FIELD_CONTEXT (f_sav) = record;
7550
7551 TYPE_STUB_DECL (record) = type_decl;
7552 TYPE_NAME (record) = type_decl;
7553 TYPE_FIELDS (record) = f_gpr;
7554 DECL_CHAIN (f_gpr) = f_fpr;
7555 DECL_CHAIN (f_fpr) = f_ovf;
7556 DECL_CHAIN (f_ovf) = f_sav;
7557
7558 layout_type (record);
7559
7560 /* The correct type is an array type of one element. */
7561 return build_array_type (record, build_index_type (size_zero_node));
7562 }
7563
7564 /* Set up the builtin va_list data type and, for 64-bit, the additional
7565 calling convention specific va_list data types. */
7566
7567 static tree
7568 ix86_build_builtin_va_list (void)
7569 {
7570 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7571
7572 /* Initialize abi specific va_list builtin types. */
7573 if (TARGET_64BIT)
7574 {
7575 tree t;
7576 if (ix86_abi == MS_ABI)
7577 {
7578 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7579 if (TREE_CODE (t) != RECORD_TYPE)
7580 t = build_variant_type_copy (t);
7581 sysv_va_list_type_node = t;
7582 }
7583 else
7584 {
7585 t = ret;
7586 if (TREE_CODE (t) != RECORD_TYPE)
7587 t = build_variant_type_copy (t);
7588 sysv_va_list_type_node = t;
7589 }
7590 if (ix86_abi != MS_ABI)
7591 {
7592 t = ix86_build_builtin_va_list_abi (MS_ABI);
7593 if (TREE_CODE (t) != RECORD_TYPE)
7594 t = build_variant_type_copy (t);
7595 ms_va_list_type_node = t;
7596 }
7597 else
7598 {
7599 t = ret;
7600 if (TREE_CODE (t) != RECORD_TYPE)
7601 t = build_variant_type_copy (t);
7602 ms_va_list_type_node = t;
7603 }
7604 }
7605
7606 return ret;
7607 }
7608
7609 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7610
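/* Illustrative layout of the register save area set up below, per the
 x86-64 SysV psABI, assuming both sub-areas are needed (a sketch, not
 original GCC text):

 offset 0.. 47 RDI, RSI, RDX, RCX, R8, R9 (8 bytes each)
 offset 48..175 XMM0..XMM7 (16 bytes each)

 ix86_varargs_gpr_size and ix86_varargs_fpr_size computed below
 correspond to these two sub-areas. */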
7611 static void
7612 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7613 {
7614 rtx save_area, mem;
7615 alias_set_type set;
7616 int i, max;
7617
7618 /* GPR size of varargs save area. */
7619 if (cfun->va_list_gpr_size)
7620 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7621 else
7622 ix86_varargs_gpr_size = 0;
7623
7624 /* FPR size of varargs save area. We don't need it if we don't pass
7625 anything in SSE registers. */
7626 if (TARGET_SSE && cfun->va_list_fpr_size)
7627 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7628 else
7629 ix86_varargs_fpr_size = 0;
7630
7631 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7632 return;
7633
7634 save_area = frame_pointer_rtx;
7635 set = get_varargs_alias_set ();
7636
7637 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7638 if (max > X86_64_REGPARM_MAX)
7639 max = X86_64_REGPARM_MAX;
7640
7641 for (i = cum->regno; i < max; i++)
7642 {
7643 mem = gen_rtx_MEM (word_mode,
7644 plus_constant (save_area, i * UNITS_PER_WORD));
7645 MEM_NOTRAP_P (mem) = 1;
7646 set_mem_alias_set (mem, set);
7647 emit_move_insn (mem,
7648 gen_rtx_REG (word_mode,
7649 x86_64_int_parameter_registers[i]));
7650 }
7651
7652 if (ix86_varargs_fpr_size)
7653 {
7654 enum machine_mode smode;
7655 rtx label, test;
7656
7657 /* Now emit code to save SSE registers. The AX parameter contains the
7658 number of SSE parameter registers used to call this function, though
7659 all we actually check here is the zero/non-zero status. */
7660
7661 label = gen_label_rtx ();
7662 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7663 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7664 label));
7665
7666 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7667 we used movdqa (i.e. TImode) instead? Perhaps even better would
7668 be if we could determine the real mode of the data, via a hook
7669 into pass_stdarg. Ignore all that for now. */
7670 smode = V4SFmode;
7671 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7672 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7673
7674 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7675 if (max > X86_64_SSE_REGPARM_MAX)
7676 max = X86_64_SSE_REGPARM_MAX;
7677
7678 for (i = cum->sse_regno; i < max; ++i)
7679 {
7680 mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7681 mem = gen_rtx_MEM (smode, mem);
7682 MEM_NOTRAP_P (mem) = 1;
7683 set_mem_alias_set (mem, set);
7684 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7685
7686 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7687 }
7688
7689 emit_label (label);
7690 }
7691 }
7692
7693 static void
7694 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7695 {
7696 alias_set_type set = get_varargs_alias_set ();
7697 int i;
7698
7699 /* Reset to zero, as a sysv va_arg might have been used
7700 before. */
7701 ix86_varargs_gpr_size = 0;
7702 ix86_varargs_fpr_size = 0;
7703
7704 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7705 {
7706 rtx reg, mem;
7707
7708 mem = gen_rtx_MEM (Pmode,
7709 plus_constant (virtual_incoming_args_rtx,
7710 i * UNITS_PER_WORD));
7711 MEM_NOTRAP_P (mem) = 1;
7712 set_mem_alias_set (mem, set);
7713
7714 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7715 emit_move_insn (mem, reg);
7716 }
7717 }
7718
7719 static void
7720 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7721 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7722 int no_rtl)
7723 {
7724 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7725 CUMULATIVE_ARGS next_cum;
7726 tree fntype;
7727
7728 /* This argument doesn't appear to be used anymore, which is good,
7729 because the old code here didn't suppress rtl generation. */
7730 gcc_assert (!no_rtl);
7731
7732 if (!TARGET_64BIT)
7733 return;
7734
7735 fntype = TREE_TYPE (current_function_decl);
7736
7737 /* For varargs, we do not want to skip the dummy va_dcl argument.
7738 For stdargs, we do want to skip the last named argument. */
7739 next_cum = *cum;
7740 if (stdarg_p (fntype))
7741 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7742 true);
7743
7744 if (cum->call_abi == MS_ABI)
7745 setup_incoming_varargs_ms_64 (&next_cum);
7746 else
7747 setup_incoming_varargs_64 (&next_cum);
7748 }
7749
7750 /* Checks if TYPE is of kind va_list char *. */
7751
7752 static bool
7753 is_va_list_char_pointer (tree type)
7754 {
7755 tree canonic;
7756
7757 /* For 32-bit it is always true. */
7758 if (!TARGET_64BIT)
7759 return true;
7760 canonic = ix86_canonical_va_list_type (type);
7761 return (canonic == ms_va_list_type_node
7762 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7763 }
7764
7765 /* Implement va_start. */
7766
7767 static void
7768 ix86_va_start (tree valist, rtx nextarg)
7769 {
7770 HOST_WIDE_INT words, n_gpr, n_fpr;
7771 tree f_gpr, f_fpr, f_ovf, f_sav;
7772 tree gpr, fpr, ovf, sav, t;
7773 tree type;
7774 rtx ovf_rtx;
7775
7776 if (flag_split_stack
7777 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7778 {
7779 unsigned int scratch_regno;
7780
7781 /* When we are splitting the stack, we can't refer to the stack
7782 arguments using internal_arg_pointer, because they may be on
7783 the old stack. The split stack prologue will arrange to
7784 leave a pointer to the old stack arguments in a scratch
7785 register, which we here copy to a pseudo-register. The split
7786 stack prologue can't set the pseudo-register directly because
7787 it (the prologue) runs before any registers have been saved. */
7788
7789 scratch_regno = split_stack_prologue_scratch_regno ();
7790 if (scratch_regno != INVALID_REGNUM)
7791 {
7792 rtx reg, seq;
7793
7794 reg = gen_reg_rtx (Pmode);
7795 cfun->machine->split_stack_varargs_pointer = reg;
7796
7797 start_sequence ();
7798 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7799 seq = get_insns ();
7800 end_sequence ();
7801
7802 push_topmost_sequence ();
7803 emit_insn_after (seq, entry_of_function ());
7804 pop_topmost_sequence ();
7805 }
7806 }
7807
7808 /* Only 64bit target needs something special. */
7809 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7810 {
7811 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7812 std_expand_builtin_va_start (valist, nextarg);
7813 else
7814 {
7815 rtx va_r, next;
7816
7817 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7818 next = expand_binop (ptr_mode, add_optab,
7819 cfun->machine->split_stack_varargs_pointer,
7820 crtl->args.arg_offset_rtx,
7821 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7822 convert_move (va_r, next, 0);
7823 }
7824 return;
7825 }
7826
7827 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7828 f_fpr = DECL_CHAIN (f_gpr);
7829 f_ovf = DECL_CHAIN (f_fpr);
7830 f_sav = DECL_CHAIN (f_ovf);
7831
7832 valist = build_simple_mem_ref (valist);
7833 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7834 /* The following should be folded into the MEM_REF offset. */
7835 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7836 f_gpr, NULL_TREE);
7837 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7838 f_fpr, NULL_TREE);
7839 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7840 f_ovf, NULL_TREE);
7841 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7842 f_sav, NULL_TREE);
7843
7844 /* Count number of gp and fp argument registers used. */
7845 words = crtl->args.info.words;
7846 n_gpr = crtl->args.info.regno;
7847 n_fpr = crtl->args.info.sse_regno;
7848
7849 if (cfun->va_list_gpr_size)
7850 {
7851 type = TREE_TYPE (gpr);
7852 t = build2 (MODIFY_EXPR, type,
7853 gpr, build_int_cst (type, n_gpr * 8));
7854 TREE_SIDE_EFFECTS (t) = 1;
7855 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7856 }
7857
7858 if (TARGET_SSE && cfun->va_list_fpr_size)
7859 {
7860 type = TREE_TYPE (fpr);
7861 t = build2 (MODIFY_EXPR, type, fpr,
7862 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7863 TREE_SIDE_EFFECTS (t) = 1;
7864 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7865 }
7866
7867 /* Find the overflow area. */
7868 type = TREE_TYPE (ovf);
7869 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7870 ovf_rtx = crtl->args.internal_arg_pointer;
7871 else
7872 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7873 t = make_tree (type, ovf_rtx);
7874 if (words != 0)
7875 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7876 t = build2 (MODIFY_EXPR, type, ovf, t);
7877 TREE_SIDE_EFFECTS (t) = 1;
7878 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7879
7880 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7881 {
7882 /* Find the register save area.
7883 The function prologue saves it right above the stack frame. */
7884 type = TREE_TYPE (sav);
7885 t = make_tree (type, frame_pointer_rtx);
7886 if (!ix86_varargs_gpr_size)
7887 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7888 t = build2 (MODIFY_EXPR, type, sav, t);
7889 TREE_SIDE_EFFECTS (t) = 1;
7890 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7891 }
7892 }
7893
7894 /* Implement va_arg. */
7895
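/* A rough sketch of the expansion produced below for a simple integer
 argument (illustrative only, using the __va_list_tag field names shown
 earlier; not the exact generated GIMPLE):

 if (ap->gp_offset < 6 * 8)
 {
 addr = ap->reg_save_area + ap->gp_offset;
 ap->gp_offset += 8;
 }
 else
 {
 addr = ap->overflow_arg_area;
 ap->overflow_arg_area += 8;
 }
 result = *(int *) addr; */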
7896 static tree
7897 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7898 gimple_seq *post_p)
7899 {
7900 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7901 tree f_gpr, f_fpr, f_ovf, f_sav;
7902 tree gpr, fpr, ovf, sav, t;
7903 int size, rsize;
7904 tree lab_false, lab_over = NULL_TREE;
7905 tree addr, t2;
7906 rtx container;
7907 int indirect_p = 0;
7908 tree ptrtype;
7909 enum machine_mode nat_mode;
7910 unsigned int arg_boundary;
7911
7912 /* Only 64bit target needs something special. */
7913 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7914 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7915
7916 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7917 f_fpr = DECL_CHAIN (f_gpr);
7918 f_ovf = DECL_CHAIN (f_fpr);
7919 f_sav = DECL_CHAIN (f_ovf);
7920
7921 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7922 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7923 valist = build_va_arg_indirect_ref (valist);
7924 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7925 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7926 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7927
7928 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7929 if (indirect_p)
7930 type = build_pointer_type (type);
7931 size = int_size_in_bytes (type);
7932 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7933
7934 nat_mode = type_natural_mode (type, NULL);
7935 switch (nat_mode)
7936 {
7937 case V8SFmode:
7938 case V8SImode:
7939 case V32QImode:
7940 case V16HImode:
7941 case V4DFmode:
7942 case V4DImode:
7943 /* Unnamed 256bit vector mode parameters are passed on the stack. */
7944 if (!TARGET_64BIT_MS_ABI)
7945 {
7946 container = NULL;
7947 break;
7948 }
7949
7950 default:
7951 container = construct_container (nat_mode, TYPE_MODE (type),
7952 type, 0, X86_64_REGPARM_MAX,
7953 X86_64_SSE_REGPARM_MAX, intreg,
7954 0);
7955 break;
7956 }
7957
7958 /* Pull the value out of the saved registers. */
7959
7960 addr = create_tmp_var (ptr_type_node, "addr");
7961
7962 if (container)
7963 {
7964 int needed_intregs, needed_sseregs;
7965 bool need_temp;
7966 tree int_addr, sse_addr;
7967
7968 lab_false = create_artificial_label (UNKNOWN_LOCATION);
7969 lab_over = create_artificial_label (UNKNOWN_LOCATION);
7970
7971 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7972
7973 need_temp = (!REG_P (container)
7974 && ((needed_intregs && TYPE_ALIGN (type) > 64)
7975 || TYPE_ALIGN (type) > 128));
7976
7977 /* In case we are passing a structure, verify that it is a consecutive
7978 block in the register save area. If not, we need to do moves. */
7979 if (!need_temp && !REG_P (container))
7980 {
7981 /* Verify that all registers are strictly consecutive. */
7982 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7983 {
7984 int i;
7985
7986 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7987 {
7988 rtx slot = XVECEXP (container, 0, i);
7989 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7990 || INTVAL (XEXP (slot, 1)) != i * 16)
7991 need_temp = 1;
7992 }
7993 }
7994 else
7995 {
7996 int i;
7997
7998 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7999 {
8000 rtx slot = XVECEXP (container, 0, i);
8001 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8002 || INTVAL (XEXP (slot, 1)) != i * 8)
8003 need_temp = 1;
8004 }
8005 }
8006 }
8007 if (!need_temp)
8008 {
8009 int_addr = addr;
8010 sse_addr = addr;
8011 }
8012 else
8013 {
8014 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8015 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8016 }
8017
8018 /* First ensure that we fit completely in registers. */
8019 if (needed_intregs)
8020 {
8021 t = build_int_cst (TREE_TYPE (gpr),
8022 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8023 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8024 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8025 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8026 gimplify_and_add (t, pre_p);
8027 }
8028 if (needed_sseregs)
8029 {
8030 t = build_int_cst (TREE_TYPE (fpr),
8031 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8032 + X86_64_REGPARM_MAX * 8);
8033 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8034 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8035 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8036 gimplify_and_add (t, pre_p);
8037 }
8038
8039 /* Compute index to start of area used for integer regs. */
8040 if (needed_intregs)
8041 {
8042 /* int_addr = gpr + sav; */
8043 t = fold_build_pointer_plus (sav, gpr);
8044 gimplify_assign (int_addr, t, pre_p);
8045 }
8046 if (needed_sseregs)
8047 {
8048 /* sse_addr = fpr + sav; */
8049 t = fold_build_pointer_plus (sav, fpr);
8050 gimplify_assign (sse_addr, t, pre_p);
8051 }
8052 if (need_temp)
8053 {
8054 int i, prev_size = 0;
8055 tree temp = create_tmp_var (type, "va_arg_tmp");
8056
8057 /* addr = &temp; */
8058 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8059 gimplify_assign (addr, t, pre_p);
8060
8061 for (i = 0; i < XVECLEN (container, 0); i++)
8062 {
8063 rtx slot = XVECEXP (container, 0, i);
8064 rtx reg = XEXP (slot, 0);
8065 enum machine_mode mode = GET_MODE (reg);
8066 tree piece_type;
8067 tree addr_type;
8068 tree daddr_type;
8069 tree src_addr, src;
8070 int src_offset;
8071 tree dest_addr, dest;
8072 int cur_size = GET_MODE_SIZE (mode);
8073
8074 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8075 prev_size = INTVAL (XEXP (slot, 1));
8076 if (prev_size + cur_size > size)
8077 {
8078 cur_size = size - prev_size;
8079 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8080 if (mode == BLKmode)
8081 mode = QImode;
8082 }
8083 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8084 if (mode == GET_MODE (reg))
8085 addr_type = build_pointer_type (piece_type);
8086 else
8087 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8088 true);
8089 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8090 true);
8091
8092 if (SSE_REGNO_P (REGNO (reg)))
8093 {
8094 src_addr = sse_addr;
8095 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8096 }
8097 else
8098 {
8099 src_addr = int_addr;
8100 src_offset = REGNO (reg) * 8;
8101 }
8102 src_addr = fold_convert (addr_type, src_addr);
8103 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8104
8105 dest_addr = fold_convert (daddr_type, addr);
8106 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8107 if (cur_size == GET_MODE_SIZE (mode))
8108 {
8109 src = build_va_arg_indirect_ref (src_addr);
8110 dest = build_va_arg_indirect_ref (dest_addr);
8111
8112 gimplify_assign (dest, src, pre_p);
8113 }
8114 else
8115 {
8116 tree copy
8117 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8118 3, dest_addr, src_addr,
8119 size_int (cur_size));
8120 gimplify_and_add (copy, pre_p);
8121 }
8122 prev_size += cur_size;
8123 }
8124 }
8125
8126 if (needed_intregs)
8127 {
8128 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8129 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8130 gimplify_assign (gpr, t, pre_p);
8131 }
8132
8133 if (needed_sseregs)
8134 {
8135 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8136 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8137 gimplify_assign (fpr, t, pre_p);
8138 }
8139
8140 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8141
8142 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8143 }
8144
8145 /* ... otherwise out of the overflow area. */
8146
8147 /* When we align a parameter on the stack for the caller, if the
8148 parameter's alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it
8149 will be aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Here we match the
8150 callee with the caller. */
8151 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8152 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8153 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8154
8155 /* Care for on-stack alignment if needed. */
8156 if (arg_boundary <= 64 || size == 0)
8157 t = ovf;
8158 else
8159 {
8160 HOST_WIDE_INT align = arg_boundary / 8;
8161 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8162 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8163 build_int_cst (TREE_TYPE (t), -align));
8164 }
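 /* Worked example (illustrative, not original GCC text): for a
 32-byte-aligned argument, arg_boundary is 256 bits, so align is 32;
 an overflow pointer ending in 0x28 becomes (0x28 + 31) & -32 = 0x40,
 the next 32-byte boundary. */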
8165
8166 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8167 gimplify_assign (addr, t, pre_p);
8168
8169 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8170 gimplify_assign (unshare_expr (ovf), t, pre_p);
8171
8172 if (container)
8173 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8174
8175 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8176 addr = fold_convert (ptrtype, addr);
8177
8178 if (indirect_p)
8179 addr = build_va_arg_indirect_ref (addr);
8180 return build_va_arg_indirect_ref (addr);
8181 }
8182 \f
8183 /* Return true if OPNUM's MEM should be matched
8184 in movabs* patterns. */
8185
8186 bool
8187 ix86_check_movabs (rtx insn, int opnum)
8188 {
8189 rtx set, mem;
8190
8191 set = PATTERN (insn);
8192 if (GET_CODE (set) == PARALLEL)
8193 set = XVECEXP (set, 0, 0);
8194 gcc_assert (GET_CODE (set) == SET);
8195 mem = XEXP (set, opnum);
8196 while (GET_CODE (mem) == SUBREG)
8197 mem = SUBREG_REG (mem);
8198 gcc_assert (MEM_P (mem));
8199 return volatile_ok || !MEM_VOLATILE_P (mem);
8200 }
8201 \f
8202 /* Initialize the table of extra 80387 mathematical constants. */
8203
8204 static void
8205 init_ext_80387_constants (void)
8206 {
8207 static const char * cst[5] =
8208 {
8209 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8210 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8211 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8212 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8213 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8214 };
8215 int i;
8216
8217 for (i = 0; i < 5; i++)
8218 {
8219 real_from_string (&ext_80387_constants_table[i], cst[i]);
8220 /* Ensure each constant is rounded to XFmode precision. */
8221 real_convert (&ext_80387_constants_table[i],
8222 XFmode, &ext_80387_constants_table[i]);
8223 }
8224
8225 ext_80387_constants_init = 1;
8226 }
8227
8228 /* Return non-zero if the constant is something that
8229 can be loaded with a special instruction. */
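/* For example (illustrative, derived from the cases handled below):

 standard_80387_constant_p (CONST0_RTX (XFmode)) == 1 -> "fldz"
 standard_80387_constant_p (CONST1_RTX (XFmode)) == 2 -> "fld1"

 the ext_80387_constants_table entries map to 3..7 (fldlg2, fldln2,
 fldl2e, fldl2t, fldpi), and the split -0.0 / -1.0 cases return 8 / 9. */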
8230
8231 int
8232 standard_80387_constant_p (rtx x)
8233 {
8234 enum machine_mode mode = GET_MODE (x);
8235
8236 REAL_VALUE_TYPE r;
8237
8238 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8239 return -1;
8240
8241 if (x == CONST0_RTX (mode))
8242 return 1;
8243 if (x == CONST1_RTX (mode))
8244 return 2;
8245
8246 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8247
8248 /* For XFmode constants, try to find a special 80387 instruction when
8249 optimizing for size or on those CPUs that benefit from them. */
8250 if (mode == XFmode
8251 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8252 {
8253 int i;
8254
8255 if (! ext_80387_constants_init)
8256 init_ext_80387_constants ();
8257
8258 for (i = 0; i < 5; i++)
8259 if (real_identical (&r, &ext_80387_constants_table[i]))
8260 return i + 3;
8261 }
8262
8263 /* A load of the constant -0.0 or -1.0 will be split into an
8264 fldz;fchs or fld1;fchs sequence. */
8265 if (real_isnegzero (&r))
8266 return 8;
8267 if (real_identical (&r, &dconstm1))
8268 return 9;
8269
8270 return 0;
8271 }
8272
8273 /* Return the opcode of the special instruction to be used to load
8274 the constant X. */
8275
8276 const char *
8277 standard_80387_constant_opcode (rtx x)
8278 {
8279 switch (standard_80387_constant_p (x))
8280 {
8281 case 1:
8282 return "fldz";
8283 case 2:
8284 return "fld1";
8285 case 3:
8286 return "fldlg2";
8287 case 4:
8288 return "fldln2";
8289 case 5:
8290 return "fldl2e";
8291 case 6:
8292 return "fldl2t";
8293 case 7:
8294 return "fldpi";
8295 case 8:
8296 case 9:
8297 return "#";
8298 default:
8299 gcc_unreachable ();
8300 }
8301 }
8302
8303 /* Return the CONST_DOUBLE representing the 80387 constant that is
8304 loaded by the specified special instruction. The argument IDX
8305 matches the return value from standard_80387_constant_p. */
8306
8307 rtx
8308 standard_80387_constant_rtx (int idx)
8309 {
8310 int i;
8311
8312 if (! ext_80387_constants_init)
8313 init_ext_80387_constants ();
8314
8315 switch (idx)
8316 {
8317 case 3:
8318 case 4:
8319 case 5:
8320 case 6:
8321 case 7:
8322 i = idx - 3;
8323 break;
8324
8325 default:
8326 gcc_unreachable ();
8327 }
8328
8329 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8330 XFmode);
8331 }
8332
8333 /* Return 1 if X is all 0s and 2 if X is all 1s
8334 in a supported SSE/AVX vector mode. */
8335
8336 int
8337 standard_sse_constant_p (rtx x)
8338 {
8339 enum machine_mode mode = GET_MODE (x);
8340
8341 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8342 return 1;
8343 if (vector_all_ones_operand (x, mode))
8344 switch (mode)
8345 {
8346 case V16QImode:
8347 case V8HImode:
8348 case V4SImode:
8349 case V2DImode:
8350 if (TARGET_SSE2)
8351 return 2;
8352 case V32QImode:
8353 case V16HImode:
8354 case V8SImode:
8355 case V4DImode:
8356 if (TARGET_AVX2)
8357 return 2;
8358 default:
8359 break;
8360 }
8361
8362 return 0;
8363 }
8364
8365 /* Return the opcode of the special instruction to be used to load
8366 the constant X. */
8367
8368 const char *
8369 standard_sse_constant_opcode (rtx insn, rtx x)
8370 {
8371 switch (standard_sse_constant_p (x))
8372 {
8373 case 1:
8374 switch (get_attr_mode (insn))
8375 {
8376 case MODE_TI:
8377 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8378 return "%vpxor\t%0, %d0";
8379 case MODE_V2DF:
8380 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8381 return "%vxorpd\t%0, %d0";
8382 case MODE_V4SF:
8383 return "%vxorps\t%0, %d0";
8384
8385 case MODE_OI:
8386 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8387 return "vpxor\t%x0, %x0, %x0";
8388 case MODE_V4DF:
8389 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8390 return "vxorpd\t%x0, %x0, %x0";
8391 case MODE_V8SF:
8392 return "vxorps\t%x0, %x0, %x0";
8393
8394 default:
8395 break;
8396 }
8397
8398 case 2:
8399 if (TARGET_AVX)
8400 return "vpcmpeqd\t%0, %0, %0";
8401 else
8402 return "pcmpeqd\t%0, %0";
8403
8404 default:
8405 break;
8406 }
8407 gcc_unreachable ();
8408 }
8409
8410 /* Returns true if OP contains a symbol reference. */
8411
8412 bool
8413 symbolic_reference_mentioned_p (rtx op)
8414 {
8415 const char *fmt;
8416 int i;
8417
8418 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8419 return true;
8420
8421 fmt = GET_RTX_FORMAT (GET_CODE (op));
8422 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8423 {
8424 if (fmt[i] == 'E')
8425 {
8426 int j;
8427
8428 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8429 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8430 return true;
8431 }
8432
8433 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8434 return true;
8435 }
8436
8437 return false;
8438 }
8439
8440 /* Return true if it is appropriate to emit `ret' instructions in the
8441 body of a function. Do this only if the epilogue is simple, needing a
8442 couple of insns. Prior to reloading, we can't tell how many registers
8443 must be saved, so return false then. Return false if there is no frame
8444 marker to de-allocate. */
8445
8446 bool
8447 ix86_can_use_return_insn_p (void)
8448 {
8449 struct ix86_frame frame;
8450
8451 if (! reload_completed || frame_pointer_needed)
8452 return 0;
8453
8454 /* Don't allow more than 32k pop, since that's all we can do
8455 with one instruction. */
8456 if (crtl->args.pops_args && crtl->args.size >= 32768)
8457 return 0;
8458
8459 ix86_compute_frame_layout (&frame);
8460 return (frame.stack_pointer_offset == UNITS_PER_WORD
8461 && (frame.nregs + frame.nsseregs) == 0);
8462 }
8463 \f
8464 /* Value should be nonzero if functions must have frame pointers.
8465 Zero means the frame pointer need not be set up (and parms may
8466 be accessed via the stack pointer) in functions that seem suitable. */
8467
8468 static bool
8469 ix86_frame_pointer_required (void)
8470 {
8471 /* If we accessed previous frames, then the generated code expects
8472 to be able to access the saved ebp value in our frame. */
8473 if (cfun->machine->accesses_prev_frame)
8474 return true;
8475
8476 /* Several x86 OSes need a frame pointer for other reasons,
8477 usually pertaining to setjmp. */
8478 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8479 return true;
8480
8481 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8482 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8483 return true;
8484
8485 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8486 turns off the frame pointer by default. Turn it back on now if
8487 we've not got a leaf function. */
8488 if (TARGET_OMIT_LEAF_FRAME_POINTER
8489 && (!current_function_is_leaf
8490 || ix86_current_function_calls_tls_descriptor))
8491 return true;
8492
8493 if (crtl->profile && !flag_fentry)
8494 return true;
8495
8496 return false;
8497 }
8498
8499 /* Record that the current function accesses previous call frames. */
8500
8501 void
8502 ix86_setup_frame_addresses (void)
8503 {
8504 cfun->machine->accesses_prev_frame = 1;
8505 }
8506 \f
8507 #ifndef USE_HIDDEN_LINKONCE
8508 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8509 # define USE_HIDDEN_LINKONCE 1
8510 # else
8511 # define USE_HIDDEN_LINKONCE 0
8512 # endif
8513 #endif
8514
8515 static int pic_labels_used;
8516
8517 /* Fills in the label name that should be used for a pc thunk for
8518 the given register. */
8519
8520 static void
8521 get_pc_thunk_name (char name[32], unsigned int regno)
8522 {
8523 gcc_assert (!TARGET_64BIT);
8524
8525 if (USE_HIDDEN_LINKONCE)
8526 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8527 else
8528 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8529 }
8530
8531
8532 /* This function generates the pc thunks used by -fpic: each thunk loads
8533 its target register with the return address of the caller and then returns. */
8534
8535 static void
8536 ix86_code_end (void)
8537 {
8538 rtx xops[2];
8539 int regno;
8540
8541 for (regno = AX_REG; regno <= SP_REG; regno++)
8542 {
8543 char name[32];
8544 tree decl;
8545
8546 if (!(pic_labels_used & (1 << regno)))
8547 continue;
8548
8549 get_pc_thunk_name (name, regno);
8550
8551 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8552 get_identifier (name),
8553 build_function_type_list (void_type_node, NULL_TREE));
8554 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8555 NULL_TREE, void_type_node);
8556 TREE_PUBLIC (decl) = 1;
8557 TREE_STATIC (decl) = 1;
8558
8559 #if TARGET_MACHO
8560 if (TARGET_MACHO)
8561 {
8562 switch_to_section (darwin_sections[text_coal_section]);
8563 fputs ("\t.weak_definition\t", asm_out_file);
8564 assemble_name (asm_out_file, name);
8565 fputs ("\n\t.private_extern\t", asm_out_file);
8566 assemble_name (asm_out_file, name);
8567 putc ('\n', asm_out_file);
8568 ASM_OUTPUT_LABEL (asm_out_file, name);
8569 DECL_WEAK (decl) = 1;
8570 }
8571 else
8572 #endif
8573 if (USE_HIDDEN_LINKONCE)
8574 {
8575 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8576
8577 targetm.asm_out.unique_section (decl, 0);
8578 switch_to_section (get_named_section (decl, NULL, 0));
8579
8580 targetm.asm_out.globalize_label (asm_out_file, name);
8581 fputs ("\t.hidden\t", asm_out_file);
8582 assemble_name (asm_out_file, name);
8583 putc ('\n', asm_out_file);
8584 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8585 }
8586 else
8587 {
8588 switch_to_section (text_section);
8589 ASM_OUTPUT_LABEL (asm_out_file, name);
8590 }
8591
8592 DECL_INITIAL (decl) = make_node (BLOCK);
8593 current_function_decl = decl;
8594 init_function_start (decl);
8595 first_function_block_is_cold = false;
8596 /* Make sure unwind info is emitted for the thunk if needed. */
8597 final_start_function (emit_barrier (), asm_out_file, 1);
8598
8599 /* Pad stack IP move with 4 instructions (two NOPs count
8600 as one instruction). */
8601 if (TARGET_PAD_SHORT_FUNCTION)
8602 {
8603 int i = 8;
8604
8605 while (i--)
8606 fputs ("\tnop\n", asm_out_file);
8607 }
8608
8609 xops[0] = gen_rtx_REG (Pmode, regno);
8610 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8611 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8612 fputs ("\tret\n", asm_out_file);
8613 final_end_function ();
8614 init_insn_lengths ();
8615 free_after_compilation (cfun);
8616 set_cfun (NULL);
8617 current_function_decl = NULL;
8618 }
8619
8620 if (flag_split_stack)
8621 file_end_indicate_split_stack ();
8622 }
8623
8624 /* Emit code for the SET_GOT patterns. */
8625
8626 const char *
8627 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8628 {
8629 rtx xops[3];
8630
8631 xops[0] = dest;
8632
8633 if (TARGET_VXWORKS_RTP && flag_pic)
8634 {
8635 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8636 xops[2] = gen_rtx_MEM (Pmode,
8637 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8638 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8639
8640 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8641 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8642 an unadorned address. */
8643 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8644 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8645 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8646 return "";
8647 }
8648
8649 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8650
8651 if (!flag_pic)
8652 {
8653 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8654
8655 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8656
8657 #if TARGET_MACHO
8658 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8659 is what will be referenced by the Mach-O PIC subsystem. */
8660 if (!label)
8661 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8662 #endif
8663
8664 targetm.asm_out.internal_label (asm_out_file, "L",
8665 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8666 }
8667 else
8668 {
8669 char name[32];
8670 get_pc_thunk_name (name, REGNO (dest));
8671 pic_labels_used |= 1 << REGNO (dest);
8672
8673 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8674 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8675 output_asm_insn ("call\t%X2", xops);
8676 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8677 is what will be referenced by the Mach-O PIC subsystem. */
8678 #if TARGET_MACHO
8679 if (!label)
8680 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8681 else
8682 targetm.asm_out.internal_label (asm_out_file, "L",
8683 CODE_LABEL_NUMBER (label));
8684 #endif
8685 }
8686
8687 if (!TARGET_MACHO)
8688 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8689
8690 return "";
8691 }
8692
8693 /* Generate a "push" pattern for input ARG. */
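/* For illustration (a sketch, not from the original source): on x86-64,
 gen_push (reg) yields RTL roughly of the form

 (set (mem:DI (pre_dec:DI (reg:DI sp)))
 (reg:DI <arg>))

 and gen_pop below is the mirror image using post_inc. */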
8694
8695 static rtx
8696 gen_push (rtx arg)
8697 {
8698 struct machine_function *m = cfun->machine;
8699
8700 if (m->fs.cfa_reg == stack_pointer_rtx)
8701 m->fs.cfa_offset += UNITS_PER_WORD;
8702 m->fs.sp_offset += UNITS_PER_WORD;
8703
8704 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8705 arg = gen_rtx_REG (word_mode, REGNO (arg));
8706
8707 return gen_rtx_SET (VOIDmode,
8708 gen_rtx_MEM (word_mode,
8709 gen_rtx_PRE_DEC (Pmode,
8710 stack_pointer_rtx)),
8711 arg);
8712 }
8713
8714 /* Generate a "pop" pattern for input ARG. */
8715
8716 static rtx
8717 gen_pop (rtx arg)
8718 {
8719 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8720 arg = gen_rtx_REG (word_mode, REGNO (arg));
8721
8722 return gen_rtx_SET (VOIDmode,
8723 arg,
8724 gen_rtx_MEM (word_mode,
8725 gen_rtx_POST_INC (Pmode,
8726 stack_pointer_rtx)));
8727 }
8728
8729 /* Return a register number >= 0 if there is an unused call-clobbered
8730 register available for the entire function, otherwise INVALID_REGNUM. */
8731
8732 static unsigned int
8733 ix86_select_alt_pic_regnum (void)
8734 {
8735 if (current_function_is_leaf
8736 && !crtl->profile
8737 && !ix86_current_function_calls_tls_descriptor)
8738 {
8739 int i, drap;
8740 /* Can't use the same register for both PIC and DRAP. */
8741 if (crtl->drap_reg)
8742 drap = REGNO (crtl->drap_reg);
8743 else
8744 drap = -1;
8745 for (i = 2; i >= 0; --i)
8746 if (i != drap && !df_regs_ever_live_p (i))
8747 return i;
8748 }
8749
8750 return INVALID_REGNUM;
8751 }
8752
8753 /* Return TRUE if we need to save REGNO. */
8754
8755 static bool
8756 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8757 {
8758 if (pic_offset_table_rtx
8759 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8760 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8761 || crtl->profile
8762 || crtl->calls_eh_return
8763 || crtl->uses_const_pool))
8764 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8765
8766 if (crtl->calls_eh_return && maybe_eh_return)
8767 {
8768 unsigned i;
8769 for (i = 0; ; i++)
8770 {
8771 unsigned test = EH_RETURN_DATA_REGNO (i);
8772 if (test == INVALID_REGNUM)
8773 break;
8774 if (test == regno)
8775 return true;
8776 }
8777 }
8778
8779 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8780 return true;
8781
8782 return (df_regs_ever_live_p (regno)
8783 && !call_used_regs[regno]
8784 && !fixed_regs[regno]
8785 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8786 }
8787
8788 /* Return the number of saved general purpose registers.  */
8789
8790 static int
8791 ix86_nsaved_regs (void)
8792 {
8793 int nregs = 0;
8794 int regno;
8795
8796 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8797 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8798 nregs ++;
8799 return nregs;
8800 }
8801
8802 /* Return the number of saved SSE registers.  */
8803
8804 static int
8805 ix86_nsaved_sseregs (void)
8806 {
8807 int nregs = 0;
8808 int regno;
8809
8810 if (!TARGET_64BIT_MS_ABI)
8811 return 0;
8812 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8813 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8814 nregs ++;
8815 return nregs;
8816 }
8817
8818 /* Given FROM and TO register numbers, say whether this elimination is
8819 allowed. If stack alignment is needed, we can only replace argument
8820 pointer with hard frame pointer, or replace frame pointer with stack
8821 pointer. Otherwise, frame pointer elimination is automatically
8822 handled and all other eliminations are valid. */
8823
8824 static bool
8825 ix86_can_eliminate (const int from, const int to)
8826 {
8827 if (stack_realign_fp)
8828 return ((from == ARG_POINTER_REGNUM
8829 && to == HARD_FRAME_POINTER_REGNUM)
8830 || (from == FRAME_POINTER_REGNUM
8831 && to == STACK_POINTER_REGNUM));
8832 else
8833 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8834 }
8835
8836 /* Return the offset between two registers, one to be eliminated, and the other
8837 its replacement, at the start of a routine. */
8838
8839 HOST_WIDE_INT
8840 ix86_initial_elimination_offset (int from, int to)
8841 {
8842 struct ix86_frame frame;
8843 ix86_compute_frame_layout (&frame);
8844
8845 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8846 return frame.hard_frame_pointer_offset;
8847 else if (from == FRAME_POINTER_REGNUM
8848 && to == HARD_FRAME_POINTER_REGNUM)
8849 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8850 else
8851 {
8852 gcc_assert (to == STACK_POINTER_REGNUM);
8853
8854 if (from == ARG_POINTER_REGNUM)
8855 return frame.stack_pointer_offset;
8856
8857 gcc_assert (from == FRAME_POINTER_REGNUM);
8858 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8859 }
8860 }
8861
8862 /* In a dynamically-aligned function, we can't know the offset from
8863 stack pointer to frame pointer, so we must ensure that setjmp
8864 eliminates fp against the hard fp (%ebp) rather than trying to
8865 index from %esp up to the top of the frame across a gap that is
8866 of unknown (at compile-time) size. */
8867 static rtx
8868 ix86_builtin_setjmp_frame_value (void)
8869 {
8870 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8871 }
8872
8873 /* When using -fsplit-stack, the allocation routines set a field in
8874 the TCB to the bottom of the stack plus this much space, measured
8875 in bytes. */
8876
8877 #define SPLIT_STACK_AVAILABLE 256
8878
8879 /* Fill structure ix86_frame about frame of currently computed function. */
8880
8881 static void
8882 ix86_compute_frame_layout (struct ix86_frame *frame)
8883 {
8884 unsigned int stack_alignment_needed;
8885 HOST_WIDE_INT offset;
8886 unsigned int preferred_alignment;
8887 HOST_WIDE_INT size = get_frame_size ();
8888 HOST_WIDE_INT to_allocate;
8889
8890 frame->nregs = ix86_nsaved_regs ();
8891 frame->nsseregs = ix86_nsaved_sseregs ();
8892
8893 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8894 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8895
8896   /* The 64-bit MS ABI seems to require stack alignment to always be 16, except
8897      in function prologues and leaf functions.  */
8898 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8899 && (!current_function_is_leaf || cfun->calls_alloca != 0
8900 || ix86_current_function_calls_tls_descriptor))
8901 {
8902 preferred_alignment = 16;
8903 stack_alignment_needed = 16;
8904 crtl->preferred_stack_boundary = 128;
8905 crtl->stack_alignment_needed = 128;
8906 }
8907
8908 gcc_assert (!size || stack_alignment_needed);
8909 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8910 gcc_assert (preferred_alignment <= stack_alignment_needed);
8911
8912 /* For SEH we have to limit the amount of code movement into the prologue.
8913 At present we do this via a BLOCKAGE, at which point there's very little
8914 scheduling that can be done, which means that there's very little point
8915 in doing anything except PUSHs. */
8916 if (TARGET_SEH)
8917 cfun->machine->use_fast_prologue_epilogue = false;
8918
8919   /* During reload iteration the number of saved registers can change.
8920      Recompute the value as needed.  Do not recompute when the number of
8921      registers didn't change, as reload makes multiple calls to the function
8922      and does not expect the decision to change within a single iteration.  */
8923 else if (!optimize_function_for_size_p (cfun)
8924 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8925 {
8926 int count = frame->nregs;
8927 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8928
8929 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8930
8931 /* The fast prologue uses move instead of push to save registers. This
8932 is significantly longer, but also executes faster as modern hardware
8933 can execute the moves in parallel, but can't do that for push/pop.
8934
8935          Be careful about choosing which prologue to emit: when the function
8936          takes many instructions to execute we may use the slow version, as
8937          well as when the function is known to be outside a hot spot (known
8938          only with feedback).  Weight the size of the function by the number
8939          of registers to save, as it is cheap to use one or two push
8940          instructions but very slow to use many of them.  */
8941 if (count)
8942 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8943 if (node->frequency < NODE_FREQUENCY_NORMAL
8944 || (flag_branch_probabilities
8945 && node->frequency < NODE_FREQUENCY_HOT))
8946 cfun->machine->use_fast_prologue_epilogue = false;
8947 else
8948 cfun->machine->use_fast_prologue_epilogue
8949 = !expensive_function_p (count);
8950 }
8951
8952 frame->save_regs_using_mov
8953 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8954 /* If static stack checking is enabled and done with probes,
8955 the registers need to be saved before allocating the frame. */
8956 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8957
8958 /* Skip return address. */
8959 offset = UNITS_PER_WORD;
8960
8961 /* Skip pushed static chain. */
8962 if (ix86_static_chain_on_stack)
8963 offset += UNITS_PER_WORD;
8964
8965 /* Skip saved base pointer. */
8966 if (frame_pointer_needed)
8967 offset += UNITS_PER_WORD;
8968 frame->hfp_save_offset = offset;
8969
8970 /* The traditional frame pointer location is at the top of the frame. */
8971 frame->hard_frame_pointer_offset = offset;
8972
8973 /* Register save area */
8974 offset += frame->nregs * UNITS_PER_WORD;
8975 frame->reg_save_offset = offset;
8976
8977 /* Align and set SSE register save area. */
8978 if (frame->nsseregs)
8979 {
8980 /* The only ABI that has saved SSE registers (Win64) also has a
8981 16-byte aligned default stack, and thus we don't need to be
8982 within the re-aligned local stack frame to save them. */
8983 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
8984 offset = (offset + 16 - 1) & -16;
8985 offset += frame->nsseregs * 16;
8986 }
8987 frame->sse_reg_save_offset = offset;
8988
8989 /* The re-aligned stack starts here. Values before this point are not
8990 directly comparable with values below this point. In order to make
8991 sure that no value happens to be the same before and after, force
8992 the alignment computation below to add a non-zero value. */
8993 if (stack_realign_fp)
8994 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8995
8996 /* Va-arg area */
8997 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8998 offset += frame->va_arg_size;
8999
9000 /* Align start of frame for local function. */
9001 if (stack_realign_fp
9002 || offset != frame->sse_reg_save_offset
9003 || size != 0
9004 || !current_function_is_leaf
9005 || cfun->calls_alloca
9006 || ix86_current_function_calls_tls_descriptor)
9007 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9008
9009 /* Frame pointer points here. */
9010 frame->frame_pointer_offset = offset;
9011
9012 offset += size;
9013
9014 /* Add outgoing arguments area. Can be skipped if we eliminated
9015 all the function calls as dead code.
9016 Skipping is however impossible when function calls alloca. Alloca
9017 expander assumes that last crtl->outgoing_args_size
9018 of stack frame are unused. */
9019 if (ACCUMULATE_OUTGOING_ARGS
9020 && (!current_function_is_leaf || cfun->calls_alloca
9021 || ix86_current_function_calls_tls_descriptor))
9022 {
9023 offset += crtl->outgoing_args_size;
9024 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9025 }
9026 else
9027 frame->outgoing_arguments_size = 0;
9028
9029 /* Align stack boundary. Only needed if we're calling another function
9030 or using alloca. */
9031 if (!current_function_is_leaf || cfun->calls_alloca
9032 || ix86_current_function_calls_tls_descriptor)
9033 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9034
9035 /* We've reached end of stack frame. */
9036 frame->stack_pointer_offset = offset;
9037
9038 /* Size prologue needs to allocate. */
9039 to_allocate = offset - frame->sse_reg_save_offset;
9040
9041 if ((!to_allocate && frame->nregs <= 1)
9042 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9043 frame->save_regs_using_mov = false;
9044
9045 if (ix86_using_red_zone ()
9046 && current_function_sp_is_unchanging
9047 && current_function_is_leaf
9048 && !ix86_current_function_calls_tls_descriptor)
9049 {
9050 frame->red_zone_size = to_allocate;
9051 if (frame->save_regs_using_mov)
9052 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9053 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9054 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9055 }
9056 else
9057 frame->red_zone_size = 0;
9058 frame->stack_pointer_offset -= frame->red_zone_size;
9059
9060 /* The SEH frame pointer location is near the bottom of the frame.
9061 This is enforced by the fact that the difference between the
9062 stack pointer and the frame pointer is limited to 240 bytes in
9063 the unwind data structure. */
9064 if (TARGET_SEH)
9065 {
9066 HOST_WIDE_INT diff;
9067
9068 /* If we can leave the frame pointer where it is, do so. */
9069 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9070 if (diff > 240 || (diff & 15) != 0)
9071 {
9072 /* Ideally we'd determine what portion of the local stack frame
9073 (within the constraint of the lowest 240) is most heavily used.
9074 But without that complication, simply bias the frame pointer
9075 by 128 bytes so as to maximize the amount of the local stack
9076 frame that is addressable with 8-bit offsets. */
9077 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9078 }
9079 }
9080 }
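
/* Informal sketch (added as a reading aid, not part of the original
   sources) of the layout computed above, with offsets growing downward
   from the CFA:

       return address (UNITS_PER_WORD)
       [pushed static chain]
       [saved frame pointer]                      <- hard_frame_pointer_offset
       GP register save area                      <- reg_save_offset
       SSE register save area (16-byte aligned)   <- sse_reg_save_offset
       va_arg register save area
       local variables (SIZE)                     start at frame_pointer_offset
       outgoing argument area
                                                  <- stack_pointer_offset

   When the red zone is usable, stack_pointer_offset is reduced by
   red_zone_size, so the bottom of the frame is left in the red zone below
   the final stack pointer instead of being allocated explicitly.  The
   authoritative values are the fields set in ix86_compute_frame_layout.  */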
9081
9082 /* This is semi-inlined memory_address_length, but simplified
9083 since we know that we're always dealing with reg+offset, and
9084 to avoid having to create and discard all that rtl. */
9085
9086 static inline int
9087 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9088 {
9089 int len = 4;
9090
9091 if (offset == 0)
9092 {
9093 /* EBP and R13 cannot be encoded without an offset. */
9094 len = (regno == BP_REG || regno == R13_REG);
9095 }
9096 else if (IN_RANGE (offset, -128, 127))
9097 len = 1;
9098
9099 /* ESP and R12 must be encoded with a SIB byte. */
9100 if (regno == SP_REG || regno == R12_REG)
9101 len++;
9102
9103 return len;
9104 }
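
/* Worked examples for the length computation above (illustrative only):

     (%eax), offset 0                  -> 0  (no displacement, no SIB)
     (%ebp) or (%r13), offset 0        -> 1  (a disp8 of zero is required)
     100(%ecx), offset in [-128, 127]  -> 1  (disp8)
     512(%ecx), larger offset          -> 4  (disp32)
     16(%esp) or 16(%r12)              -> 2  (disp8 plus the mandatory SIB)

   so e.g. 512(%r12) costs 4 + 1 = 5 bytes of address encoding while
   16(%ebp) costs only 1.  */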
9105
9106 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9107 The valid base registers are taken from CFUN->MACHINE->FS. */
9108
9109 static rtx
9110 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9111 {
9112 const struct machine_function *m = cfun->machine;
9113 rtx base_reg = NULL;
9114 HOST_WIDE_INT base_offset = 0;
9115
9116 if (m->use_fast_prologue_epilogue)
9117 {
9118 /* Choose the base register most likely to allow the most scheduling
9119 	 opportunities.  Generally FP is valid throughout the function,
9120 while DRAP must be reloaded within the epilogue. But choose either
9121 over the SP due to increased encoding size. */
9122
9123 if (m->fs.fp_valid)
9124 {
9125 base_reg = hard_frame_pointer_rtx;
9126 base_offset = m->fs.fp_offset - cfa_offset;
9127 }
9128 else if (m->fs.drap_valid)
9129 {
9130 base_reg = crtl->drap_reg;
9131 base_offset = 0 - cfa_offset;
9132 }
9133 else if (m->fs.sp_valid)
9134 {
9135 base_reg = stack_pointer_rtx;
9136 base_offset = m->fs.sp_offset - cfa_offset;
9137 }
9138 }
9139 else
9140 {
9141 HOST_WIDE_INT toffset;
9142 int len = 16, tlen;
9143
9144 /* Choose the base register with the smallest address encoding.
9145 With a tie, choose FP > DRAP > SP. */
9146 if (m->fs.sp_valid)
9147 {
9148 base_reg = stack_pointer_rtx;
9149 base_offset = m->fs.sp_offset - cfa_offset;
9150 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9151 }
9152 if (m->fs.drap_valid)
9153 {
9154 toffset = 0 - cfa_offset;
9155 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9156 if (tlen <= len)
9157 {
9158 base_reg = crtl->drap_reg;
9159 base_offset = toffset;
9160 len = tlen;
9161 }
9162 }
9163 if (m->fs.fp_valid)
9164 {
9165 toffset = m->fs.fp_offset - cfa_offset;
9166 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9167 if (tlen <= len)
9168 {
9169 base_reg = hard_frame_pointer_rtx;
9170 base_offset = toffset;
9171 len = tlen;
9172 }
9173 }
9174 }
9175 gcc_assert (base_reg != NULL);
9176
9177 return plus_constant (base_reg, base_offset);
9178 }
9179
9180 /* Emit code to save registers in the prologue. */
9181
9182 static void
9183 ix86_emit_save_regs (void)
9184 {
9185 unsigned int regno;
9186 rtx insn;
9187
9188 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9189 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9190 {
9191 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9192 RTX_FRAME_RELATED_P (insn) = 1;
9193 }
9194 }
9195
9196 /* Emit a single register save at CFA - CFA_OFFSET. */
9197
9198 static void
9199 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9200 HOST_WIDE_INT cfa_offset)
9201 {
9202 struct machine_function *m = cfun->machine;
9203 rtx reg = gen_rtx_REG (mode, regno);
9204 rtx mem, addr, base, insn;
9205
9206 addr = choose_baseaddr (cfa_offset);
9207 mem = gen_frame_mem (mode, addr);
9208
9209 /* For SSE saves, we need to indicate the 128-bit alignment. */
9210 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9211
9212 insn = emit_move_insn (mem, reg);
9213 RTX_FRAME_RELATED_P (insn) = 1;
9214
9215 base = addr;
9216 if (GET_CODE (base) == PLUS)
9217 base = XEXP (base, 0);
9218 gcc_checking_assert (REG_P (base));
9219
9220 /* When saving registers into a re-aligned local stack frame, avoid
9221 any tricky guessing by dwarf2out. */
9222 if (m->fs.realigned)
9223 {
9224 gcc_checking_assert (stack_realign_drap);
9225
9226 if (regno == REGNO (crtl->drap_reg))
9227 {
9228 /* A bit of a hack. We force the DRAP register to be saved in
9229 the re-aligned stack frame, which provides us with a copy
9230 of the CFA that will last past the prologue. Install it. */
9231 gcc_checking_assert (cfun->machine->fs.fp_valid);
9232 addr = plus_constant (hard_frame_pointer_rtx,
9233 cfun->machine->fs.fp_offset - cfa_offset);
9234 mem = gen_rtx_MEM (mode, addr);
9235 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9236 }
9237 else
9238 {
9239 /* The frame pointer is a stable reference within the
9240 aligned frame. Use it. */
9241 gcc_checking_assert (cfun->machine->fs.fp_valid);
9242 addr = plus_constant (hard_frame_pointer_rtx,
9243 cfun->machine->fs.fp_offset - cfa_offset);
9244 mem = gen_rtx_MEM (mode, addr);
9245 add_reg_note (insn, REG_CFA_EXPRESSION,
9246 gen_rtx_SET (VOIDmode, mem, reg));
9247 }
9248 }
9249
9250 /* The memory may not be relative to the current CFA register,
9251 which means that we may need to generate a new pattern for
9252 use by the unwind info. */
9253 else if (base != m->fs.cfa_reg)
9254 {
9255 addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9256 mem = gen_rtx_MEM (mode, addr);
9257 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9258 }
9259 }
9260
9261 /* Emit code to save registers using MOV insns.
9262 First register is stored at CFA - CFA_OFFSET. */
9263 static void
9264 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9265 {
9266 unsigned int regno;
9267
9268 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9269 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9270 {
9271 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9272 cfa_offset -= UNITS_PER_WORD;
9273 }
9274 }
9275
9276 /* Emit code to save SSE registers using MOV insns.
9277 First register is stored at CFA - CFA_OFFSET. */
9278 static void
9279 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9280 {
9281 unsigned int regno;
9282
9283 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9284 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9285 {
9286 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9287 cfa_offset -= 16;
9288 }
9289 }
9290
9291 static GTY(()) rtx queued_cfa_restores;
9292
9293 /* Add a REG_CFA_RESTORE REG note to INSN or queue it until the next stack
9294 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9295 Don't add the note if the previously saved value will be left untouched
9296    within the stack red zone until return, as unwinders can find the same value
9297 in the register and on the stack. */
9298
9299 static void
9300 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9301 {
9302 if (!crtl->shrink_wrapped
9303 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9304 return;
9305
9306 if (insn)
9307 {
9308 add_reg_note (insn, REG_CFA_RESTORE, reg);
9309 RTX_FRAME_RELATED_P (insn) = 1;
9310 }
9311 else
9312 queued_cfa_restores
9313 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9314 }
9315
9316 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9317
9318 static void
9319 ix86_add_queued_cfa_restore_notes (rtx insn)
9320 {
9321 rtx last;
9322 if (!queued_cfa_restores)
9323 return;
9324 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9325 ;
9326 XEXP (last, 1) = REG_NOTES (insn);
9327 REG_NOTES (insn) = queued_cfa_restores;
9328 queued_cfa_restores = NULL_RTX;
9329 RTX_FRAME_RELATED_P (insn) = 1;
9330 }
9331
9332 /* Expand prologue or epilogue stack adjustment.
9333    The pattern exists to put a dependency on all ebp-based memory accesses.
9334 STYLE should be negative if instructions should be marked as frame related,
9335 zero if %r11 register is live and cannot be freely used and positive
9336 otherwise. */
9337
9338 static void
9339 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9340 int style, bool set_cfa)
9341 {
9342 struct machine_function *m = cfun->machine;
9343 rtx insn;
9344 bool add_frame_related_expr = false;
9345
9346 if (Pmode == SImode)
9347 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9348 else if (x86_64_immediate_operand (offset, DImode))
9349 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9350 else
9351 {
9352 rtx tmp;
9353 /* r11 is used by indirect sibcall return as well, set before the
9354 epilogue and used after the epilogue. */
9355 if (style)
9356 tmp = gen_rtx_REG (DImode, R11_REG);
9357 else
9358 {
9359 gcc_assert (src != hard_frame_pointer_rtx
9360 && dest != hard_frame_pointer_rtx);
9361 tmp = hard_frame_pointer_rtx;
9362 }
9363 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9364 if (style < 0)
9365 add_frame_related_expr = true;
9366
9367 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9368 }
9369
9370 insn = emit_insn (insn);
9371 if (style >= 0)
9372 ix86_add_queued_cfa_restore_notes (insn);
9373
9374 if (set_cfa)
9375 {
9376 rtx r;
9377
9378 gcc_assert (m->fs.cfa_reg == src);
9379 m->fs.cfa_offset += INTVAL (offset);
9380 m->fs.cfa_reg = dest;
9381
9382 r = gen_rtx_PLUS (Pmode, src, offset);
9383 r = gen_rtx_SET (VOIDmode, dest, r);
9384 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9385 RTX_FRAME_RELATED_P (insn) = 1;
9386 }
9387 else if (style < 0)
9388 {
9389 RTX_FRAME_RELATED_P (insn) = 1;
9390 if (add_frame_related_expr)
9391 {
9392 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9393 r = gen_rtx_SET (VOIDmode, dest, r);
9394 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9395 }
9396 }
9397
9398 if (dest == stack_pointer_rtx)
9399 {
9400 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9401 bool valid = m->fs.sp_valid;
9402
9403 if (src == hard_frame_pointer_rtx)
9404 {
9405 valid = m->fs.fp_valid;
9406 ooffset = m->fs.fp_offset;
9407 }
9408 else if (src == crtl->drap_reg)
9409 {
9410 valid = m->fs.drap_valid;
9411 ooffset = 0;
9412 }
9413 else
9414 {
9415 /* Else there are two possibilities: SP itself, which we set
9416 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9417 	     taken care of by hand along the eh_return path.  */
9418 gcc_checking_assert (src == stack_pointer_rtx
9419 || offset == const0_rtx);
9420 }
9421
9422 m->fs.sp_offset = ooffset - INTVAL (offset);
9423 m->fs.sp_valid = valid;
9424 }
9425 }
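
/* Illustrative use, taken from ix86_expand_prologue further below:

     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
                                GEN_INT (-allocate), -1,
                                m->fs.cfa_reg == stack_pointer_rtx);

   i.e. "subtract ALLOCATE from the stack pointer", with style -1 so the
   insn is marked frame related, and with SET_CFA true only while the CFA
   register is still the stack pointer.  */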
9426
9427 /* Find an available register to be used as dynamic realign argument
9428    pointer register.  Such a register will be written in the prologue and
9429    used at the beginning of the body, so it must not be
9430 1. parameter passing register.
9431 2. GOT pointer.
9432 We reuse static-chain register if it is available. Otherwise, we
9433 use DI for i386 and R13 for x86-64. We chose R13 since it has
9434 shorter encoding.
9435
9436 Return: the regno of chosen register. */
9437
9438 static unsigned int
9439 find_drap_reg (void)
9440 {
9441 tree decl = cfun->decl;
9442
9443 if (TARGET_64BIT)
9444 {
9445       /* Use R13 for a nested function or a function that needs a static chain.
9446 	 Since a function with a tail call may use any caller-saved
9447 	 register in the epilogue, DRAP must not use a caller-saved
9448 	 register in that case.  */
9449 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9450 return R13_REG;
9451
9452 return R10_REG;
9453 }
9454 else
9455 {
9456       /* Use DI for a nested function or a function that needs a static chain.
9457 	 Since a function with a tail call may use any caller-saved
9458 	 register in the epilogue, DRAP must not use a caller-saved
9459 	 register in that case.  */
9460 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9461 return DI_REG;
9462
9463 /* Reuse static chain register if it isn't used for parameter
9464 passing. */
9465 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9466 {
9467 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9468 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9469 return CX_REG;
9470 }
9471 return DI_REG;
9472 }
9473 }
9474
9475 /* Return minimum incoming stack alignment. */
9476
9477 static unsigned int
9478 ix86_minimum_incoming_stack_boundary (bool sibcall)
9479 {
9480 unsigned int incoming_stack_boundary;
9481
9482 /* Prefer the one specified at command line. */
9483 if (ix86_user_incoming_stack_boundary)
9484 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9485   /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9486      when -mstackrealign is used, this isn't a sibcall check, and the
9487      estimated stack alignment is 128 bits.  */
9488 else if (!sibcall
9489 && !TARGET_64BIT
9490 && ix86_force_align_arg_pointer
9491 && crtl->stack_alignment_estimated == 128)
9492 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9493 else
9494 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9495
9496 /* Incoming stack alignment can be changed on individual functions
9497 via force_align_arg_pointer attribute. We use the smallest
9498 incoming stack boundary. */
9499 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9500 && lookup_attribute (ix86_force_align_arg_pointer_string,
9501 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9502 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9503
9504 /* The incoming stack frame has to be aligned at least at
9505 parm_stack_boundary. */
9506 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9507 incoming_stack_boundary = crtl->parm_stack_boundary;
9508
9509 /* Stack at entrance of main is aligned by runtime. We use the
9510 smallest incoming stack boundary. */
9511 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9512 && DECL_NAME (current_function_decl)
9513 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9514 && DECL_FILE_SCOPE_P (current_function_decl))
9515 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9516
9517 return incoming_stack_boundary;
9518 }
9519
9520 /* Update incoming stack boundary and estimated stack alignment. */
9521
9522 static void
9523 ix86_update_stack_boundary (void)
9524 {
9525 ix86_incoming_stack_boundary
9526 = ix86_minimum_incoming_stack_boundary (false);
9527
9528   /* x86_64 varargs needs 16-byte stack alignment for the register save
9529 area. */
9530 if (TARGET_64BIT
9531 && cfun->stdarg
9532 && crtl->stack_alignment_estimated < 128)
9533 crtl->stack_alignment_estimated = 128;
9534 }
9535
9536 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9537 needed or an rtx for DRAP otherwise. */
9538
9539 static rtx
9540 ix86_get_drap_rtx (void)
9541 {
9542 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9543 crtl->need_drap = true;
9544
9545 if (stack_realign_drap)
9546 {
9547       /* Assign DRAP to vDRAP and return vDRAP.  */
9548 unsigned int regno = find_drap_reg ();
9549 rtx drap_vreg;
9550 rtx arg_ptr;
9551 rtx seq, insn;
9552
9553 arg_ptr = gen_rtx_REG (Pmode, regno);
9554 crtl->drap_reg = arg_ptr;
9555
9556 start_sequence ();
9557 drap_vreg = copy_to_reg (arg_ptr);
9558 seq = get_insns ();
9559 end_sequence ();
9560
9561 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9562 if (!optimize)
9563 {
9564 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9565 RTX_FRAME_RELATED_P (insn) = 1;
9566 }
9567 return drap_vreg;
9568 }
9569 else
9570 return NULL;
9571 }
9572
9573 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9574
9575 static rtx
9576 ix86_internal_arg_pointer (void)
9577 {
9578 return virtual_incoming_args_rtx;
9579 }
9580
9581 struct scratch_reg {
9582 rtx reg;
9583 bool saved;
9584 };
9585
9586 /* Return a short-lived scratch register for use on function entry.
9587 In 32-bit mode, it is valid only after the registers are saved
9588 in the prologue. This register must be released by means of
9589 release_scratch_register_on_entry once it is dead. */
9590
9591 static void
9592 get_scratch_register_on_entry (struct scratch_reg *sr)
9593 {
9594 int regno;
9595
9596 sr->saved = false;
9597
9598 if (TARGET_64BIT)
9599 {
9600 /* We always use R11 in 64-bit mode. */
9601 regno = R11_REG;
9602 }
9603 else
9604 {
9605 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9606 bool fastcall_p
9607 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9608 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9609 int regparm = ix86_function_regparm (fntype, decl);
9610 int drap_regno
9611 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9612
9613 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9614 for the static chain register. */
9615 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9616 && drap_regno != AX_REG)
9617 regno = AX_REG;
9618 else if (regparm < 2 && drap_regno != DX_REG)
9619 regno = DX_REG;
9620 /* ecx is the static chain register. */
9621 else if (regparm < 3 && !fastcall_p && !static_chain_p
9622 && drap_regno != CX_REG)
9623 regno = CX_REG;
9624 else if (ix86_save_reg (BX_REG, true))
9625 regno = BX_REG;
9626 /* esi is the static chain register. */
9627 else if (!(regparm == 3 && static_chain_p)
9628 && ix86_save_reg (SI_REG, true))
9629 regno = SI_REG;
9630 else if (ix86_save_reg (DI_REG, true))
9631 regno = DI_REG;
9632 else
9633 {
9634 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9635 sr->saved = true;
9636 }
9637 }
9638
9639 sr->reg = gen_rtx_REG (Pmode, regno);
9640 if (sr->saved)
9641 {
9642 rtx insn = emit_insn (gen_push (sr->reg));
9643 RTX_FRAME_RELATED_P (insn) = 1;
9644 }
9645 }
9646
9647 /* Release a scratch register obtained from the preceding function. */
9648
9649 static void
9650 release_scratch_register_on_entry (struct scratch_reg *sr)
9651 {
9652 if (sr->saved)
9653 {
9654 rtx x, insn = emit_insn (gen_pop (sr->reg));
9655
9656 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9657 RTX_FRAME_RELATED_P (insn) = 1;
9658 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9659 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9660 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9661 }
9662 }
9663
9664 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9665
9666 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9667
9668 static void
9669 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9670 {
9671 /* We skip the probe for the first interval + a small dope of 4 words and
9672 probe that many bytes past the specified size to maintain a protection
9673      area at the bottom of the stack.  */
9674 const int dope = 4 * UNITS_PER_WORD;
9675 rtx size_rtx = GEN_INT (size), last;
9676
9677 /* See if we have a constant small number of probes to generate. If so,
9678 that's the easy case. The run-time loop is made up of 11 insns in the
9679 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9680 for n # of intervals. */
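  /* Worked example (added for clarity): with PROBE_INTERVAL at its usual
     value of 4096 bytes, a 12 KB allocation spans 3 intervals and unrolls
     into 3 + 2*(3-1) = 7 insns, cheaper than the 11-insn run-time loop.
     At 6 intervals the unrolled form costs 3 + 2*5 = 13 insns, so the
     loop wins; hence the 5 * PROBE_INTERVAL cutoff below.  */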
9681 if (size <= 5 * PROBE_INTERVAL)
9682 {
9683 HOST_WIDE_INT i, adjust;
9684 bool first_probe = true;
9685
9686 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9687 values of N from 1 until it exceeds SIZE. If only one probe is
9688 needed, this will not generate any code. Then adjust and probe
9689 to PROBE_INTERVAL + SIZE. */
9690 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9691 {
9692 if (first_probe)
9693 {
9694 adjust = 2 * PROBE_INTERVAL + dope;
9695 first_probe = false;
9696 }
9697 else
9698 adjust = PROBE_INTERVAL;
9699
9700 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9701 plus_constant (stack_pointer_rtx, -adjust)));
9702 emit_stack_probe (stack_pointer_rtx);
9703 }
9704
9705 if (first_probe)
9706 adjust = size + PROBE_INTERVAL + dope;
9707 else
9708 adjust = size + PROBE_INTERVAL - i;
9709
9710 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9711 plus_constant (stack_pointer_rtx, -adjust)));
9712 emit_stack_probe (stack_pointer_rtx);
9713
9714 /* Adjust back to account for the additional first interval. */
9715 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9716 plus_constant (stack_pointer_rtx,
9717 PROBE_INTERVAL + dope)));
9718 }
9719
9720 /* Otherwise, do the same as above, but in a loop. Note that we must be
9721 extra careful with variables wrapping around because we might be at
9722 the very top (or the very bottom) of the address space and we have
9723 to be able to handle this case properly; in particular, we use an
9724 equality test for the loop condition. */
9725 else
9726 {
9727 HOST_WIDE_INT rounded_size;
9728 struct scratch_reg sr;
9729
9730 get_scratch_register_on_entry (&sr);
9731
9732
9733 /* Step 1: round SIZE to the previous multiple of the interval. */
9734
9735 rounded_size = size & -PROBE_INTERVAL;
9736
9737
9738 /* Step 2: compute initial and final value of the loop counter. */
9739
9740 /* SP = SP_0 + PROBE_INTERVAL. */
9741 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9742 plus_constant (stack_pointer_rtx,
9743 - (PROBE_INTERVAL + dope))));
9744
9745 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9746 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9747 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9748 gen_rtx_PLUS (Pmode, sr.reg,
9749 stack_pointer_rtx)));
9750
9751
9752 /* Step 3: the loop
9753
9754 while (SP != LAST_ADDR)
9755 {
9756 SP = SP + PROBE_INTERVAL
9757 probe at SP
9758 }
9759
9760 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9761 values of N from 1 until it is equal to ROUNDED_SIZE. */
9762
9763 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9764
9765
9766 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9767 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9768
9769 if (size != rounded_size)
9770 {
9771 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9772 plus_constant (stack_pointer_rtx,
9773 rounded_size - size)));
9774 emit_stack_probe (stack_pointer_rtx);
9775 }
9776
9777 /* Adjust back to account for the additional first interval. */
9778 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9779 plus_constant (stack_pointer_rtx,
9780 PROBE_INTERVAL + dope)));
9781
9782 release_scratch_register_on_entry (&sr);
9783 }
9784
9785 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9786
9787 /* Even if the stack pointer isn't the CFA register, we need to correctly
9788 describe the adjustments made to it, in particular differentiate the
9789 frame-related ones from the frame-unrelated ones. */
9790 if (size > 0)
9791 {
9792 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9793 XVECEXP (expr, 0, 0)
9794 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9795 plus_constant (stack_pointer_rtx, -size));
9796 XVECEXP (expr, 0, 1)
9797 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9798 plus_constant (stack_pointer_rtx,
9799 PROBE_INTERVAL + dope + size));
9800 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9801 RTX_FRAME_RELATED_P (last) = 1;
9802
9803 cfun->machine->fs.sp_offset += size;
9804 }
9805
9806 /* Make sure nothing is scheduled before we are done. */
9807 emit_insn (gen_blockage ());
9808 }
9809
9810 /* Adjust the stack pointer up to REG while probing it. */
9811
9812 const char *
9813 output_adjust_stack_and_probe (rtx reg)
9814 {
9815 static int labelno = 0;
9816 char loop_lab[32], end_lab[32];
9817 rtx xops[2];
9818
9819 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9820 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9821
9822 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9823
9824 /* Jump to END_LAB if SP == LAST_ADDR. */
9825 xops[0] = stack_pointer_rtx;
9826 xops[1] = reg;
9827 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9828 fputs ("\tje\t", asm_out_file);
9829 assemble_name_raw (asm_out_file, end_lab);
9830 fputc ('\n', asm_out_file);
9831
9832 /* SP = SP + PROBE_INTERVAL. */
9833 xops[1] = GEN_INT (PROBE_INTERVAL);
9834 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9835
9836 /* Probe at SP. */
9837 xops[1] = const0_rtx;
9838 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9839
9840 fprintf (asm_out_file, "\tjmp\t");
9841 assemble_name_raw (asm_out_file, loop_lab);
9842 fputc ('\n', asm_out_file);
9843
9844 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9845
9846 return "";
9847 }
9848
9849 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9850 inclusive. These are offsets from the current stack pointer. */
9851
9852 static void
9853 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9854 {
9855 /* See if we have a constant small number of probes to generate. If so,
9856 that's the easy case. The run-time loop is made up of 7 insns in the
9857 generic case while the compile-time loop is made up of n insns for n #
9858 of intervals. */
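  /* Worked example (added for clarity): probing 6 intervals emits 6
     insns, cheaper than the 7-insn run-time loop, while 8 intervals
     would need 8 insns and the loop becomes the better choice; hence
     the 7 * PROBE_INTERVAL cutoff below.  */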
9859 if (size <= 7 * PROBE_INTERVAL)
9860 {
9861 HOST_WIDE_INT i;
9862
9863 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9864 it exceeds SIZE. If only one probe is needed, this will not
9865 generate any code. Then probe at FIRST + SIZE. */
9866 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9867 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9868
9869 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9870 }
9871
9872 /* Otherwise, do the same as above, but in a loop. Note that we must be
9873 extra careful with variables wrapping around because we might be at
9874 the very top (or the very bottom) of the address space and we have
9875 to be able to handle this case properly; in particular, we use an
9876 equality test for the loop condition. */
9877 else
9878 {
9879 HOST_WIDE_INT rounded_size, last;
9880 struct scratch_reg sr;
9881
9882 get_scratch_register_on_entry (&sr);
9883
9884
9885 /* Step 1: round SIZE to the previous multiple of the interval. */
9886
9887 rounded_size = size & -PROBE_INTERVAL;
9888
9889
9890 /* Step 2: compute initial and final value of the loop counter. */
9891
9892 /* TEST_OFFSET = FIRST. */
9893 emit_move_insn (sr.reg, GEN_INT (-first));
9894
9895 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9896 last = first + rounded_size;
9897
9898
9899 /* Step 3: the loop
9900
9901 while (TEST_ADDR != LAST_ADDR)
9902 {
9903 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9904 probe at TEST_ADDR
9905 }
9906
9907 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9908 until it is equal to ROUNDED_SIZE. */
9909
9910 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9911
9912
9913 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9914 that SIZE is equal to ROUNDED_SIZE. */
9915
9916 if (size != rounded_size)
9917 emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
9918 stack_pointer_rtx,
9919 sr.reg),
9920 rounded_size - size));
9921
9922 release_scratch_register_on_entry (&sr);
9923 }
9924
9925 /* Make sure nothing is scheduled before we are done. */
9926 emit_insn (gen_blockage ());
9927 }
9928
9929 /* Probe a range of stack addresses from REG to END, inclusive. These are
9930 offsets from the current stack pointer. */
9931
9932 const char *
9933 output_probe_stack_range (rtx reg, rtx end)
9934 {
9935 static int labelno = 0;
9936 char loop_lab[32], end_lab[32];
9937 rtx xops[3];
9938
9939 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9940 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9941
9942 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9943
9944 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
9945 xops[0] = reg;
9946 xops[1] = end;
9947 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9948 fputs ("\tje\t", asm_out_file);
9949 assemble_name_raw (asm_out_file, end_lab);
9950 fputc ('\n', asm_out_file);
9951
9952 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
9953 xops[1] = GEN_INT (PROBE_INTERVAL);
9954 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9955
9956 /* Probe at TEST_ADDR. */
9957 xops[0] = stack_pointer_rtx;
9958 xops[1] = reg;
9959 xops[2] = const0_rtx;
9960 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9961
9962 fprintf (asm_out_file, "\tjmp\t");
9963 assemble_name_raw (asm_out_file, loop_lab);
9964 fputc ('\n', asm_out_file);
9965
9966 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9967
9968 return "";
9969 }
9970
9971 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
9972 to be generated in correct form. */
9973 static void
9974 ix86_finalize_stack_realign_flags (void)
9975 {
9976 /* Check if stack realign is really needed after reload, and
9977      store the result in cfun.  */
9978 unsigned int incoming_stack_boundary
9979 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9980 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9981 unsigned int stack_realign = (incoming_stack_boundary
9982 < (current_function_is_leaf
9983 ? crtl->max_used_stack_slot_alignment
9984 : crtl->stack_alignment_needed));
9985
9986 if (crtl->stack_realign_finalized)
9987 {
9988       /* After stack_realign_needed is finalized, we can no longer
9989 change it. */
9990 gcc_assert (crtl->stack_realign_needed == stack_realign);
9991 return;
9992 }
9993
9994 /* If the only reason for frame_pointer_needed is that we conservatively
9995 assumed stack realignment might be needed, but in the end nothing that
9996 needed the stack alignment had been spilled, clear frame_pointer_needed
9997 and say we don't need stack realignment. */
9998 if (stack_realign
9999 && !crtl->need_drap
10000 && frame_pointer_needed
10001 && current_function_is_leaf
10002 && flag_omit_frame_pointer
10003 && current_function_sp_is_unchanging
10004 && !ix86_current_function_calls_tls_descriptor
10005 && !crtl->accesses_prior_frames
10006 && !cfun->calls_alloca
10007 && !crtl->calls_eh_return
10008 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10009 && !ix86_frame_pointer_required ()
10010 && get_frame_size () == 0
10011 && ix86_nsaved_sseregs () == 0
10012 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10013 {
10014 HARD_REG_SET set_up_by_prologue, prologue_used;
10015 basic_block bb;
10016
10017 CLEAR_HARD_REG_SET (prologue_used);
10018 CLEAR_HARD_REG_SET (set_up_by_prologue);
10019 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10020 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10021 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10022 HARD_FRAME_POINTER_REGNUM);
10023 FOR_EACH_BB (bb)
10024 {
10025 rtx insn;
10026 FOR_BB_INSNS (bb, insn)
10027 if (NONDEBUG_INSN_P (insn)
10028 && requires_stack_frame_p (insn, prologue_used,
10029 set_up_by_prologue))
10030 {
10031 crtl->stack_realign_needed = stack_realign;
10032 crtl->stack_realign_finalized = true;
10033 return;
10034 }
10035 }
10036
10037 frame_pointer_needed = false;
10038 stack_realign = false;
10039 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10040 crtl->stack_alignment_needed = incoming_stack_boundary;
10041 crtl->stack_alignment_estimated = incoming_stack_boundary;
10042 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10043 crtl->preferred_stack_boundary = incoming_stack_boundary;
10044 df_finish_pass (true);
10045 df_scan_alloc (NULL);
10046 df_scan_blocks ();
10047 df_compute_regs_ever_live (true);
10048 df_analyze ();
10049 }
10050
10051 crtl->stack_realign_needed = stack_realign;
10052 crtl->stack_realign_finalized = true;
10053 }
10054
10055 /* Expand the prologue into a bunch of separate insns. */
10056
10057 void
10058 ix86_expand_prologue (void)
10059 {
10060 struct machine_function *m = cfun->machine;
10061 rtx insn, t;
10062 bool pic_reg_used;
10063 struct ix86_frame frame;
10064 HOST_WIDE_INT allocate;
10065 bool int_registers_saved;
10066
10067 ix86_finalize_stack_realign_flags ();
10068
10069 /* DRAP should not coexist with stack_realign_fp */
10070 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10071
10072 memset (&m->fs, 0, sizeof (m->fs));
10073
10074 /* Initialize CFA state for before the prologue. */
10075 m->fs.cfa_reg = stack_pointer_rtx;
10076 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10077
10078 /* Track SP offset to the CFA. We continue tracking this after we've
10079 swapped the CFA register away from SP. In the case of re-alignment
10080      this is fudged; we're interested in offsets within the local frame.  */
10081 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10082 m->fs.sp_valid = true;
10083
10084 ix86_compute_frame_layout (&frame);
10085
10086 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10087 {
10088 /* We should have already generated an error for any use of
10089 ms_hook on a nested function. */
10090 gcc_checking_assert (!ix86_static_chain_on_stack);
10091
10092       /* Check if profiling is active and we shall use the profiling before
10093 	 prologue variant.  If so, issue a sorry.  */
10094 if (crtl->profile && flag_fentry != 0)
10095 sorry ("ms_hook_prologue attribute isn%'t compatible "
10096 "with -mfentry for 32-bit");
10097
10098 /* In ix86_asm_output_function_label we emitted:
10099 8b ff movl.s %edi,%edi
10100 55 push %ebp
10101 8b ec movl.s %esp,%ebp
10102
10103 This matches the hookable function prologue in Win32 API
10104 functions in Microsoft Windows XP Service Pack 2 and newer.
10105 Wine uses this to enable Windows apps to hook the Win32 API
10106 functions provided by Wine.
10107
10108 What that means is that we've already set up the frame pointer. */
10109
10110 if (frame_pointer_needed
10111 && !(crtl->drap_reg && crtl->stack_realign_needed))
10112 {
10113 rtx push, mov;
10114
10115 /* We've decided to use the frame pointer already set up.
10116 Describe this to the unwinder by pretending that both
10117 push and mov insns happen right here.
10118
10119 Putting the unwind info here at the end of the ms_hook
10120 is done so that we can make absolutely certain we get
10121 the required byte sequence at the start of the function,
10122 rather than relying on an assembler that can produce
10123 the exact encoding required.
10124
10125 However it does mean (in the unpatched case) that we have
10126 a 1 insn window where the asynchronous unwind info is
10127 incorrect. However, if we placed the unwind info at
10128 its correct location we would have incorrect unwind info
10129 in the patched case. Which is probably all moot since
10130 I don't expect Wine generates dwarf2 unwind info for the
10131 system libraries that use this feature. */
10132
10133 insn = emit_insn (gen_blockage ());
10134
10135 push = gen_push (hard_frame_pointer_rtx);
10136 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10137 stack_pointer_rtx);
10138 RTX_FRAME_RELATED_P (push) = 1;
10139 RTX_FRAME_RELATED_P (mov) = 1;
10140
10141 RTX_FRAME_RELATED_P (insn) = 1;
10142 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10143 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10144
10145 /* Note that gen_push incremented m->fs.cfa_offset, even
10146 though we didn't emit the push insn here. */
10147 m->fs.cfa_reg = hard_frame_pointer_rtx;
10148 m->fs.fp_offset = m->fs.cfa_offset;
10149 m->fs.fp_valid = true;
10150 }
10151 else
10152 {
10153 /* The frame pointer is not needed so pop %ebp again.
10154 This leaves us with a pristine state. */
10155 emit_insn (gen_pop (hard_frame_pointer_rtx));
10156 }
10157 }
10158
10159 /* The first insn of a function that accepts its static chain on the
10160 stack is to push the register that would be filled in by a direct
10161 call. This insn will be skipped by the trampoline. */
10162 else if (ix86_static_chain_on_stack)
10163 {
10164 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10165 emit_insn (gen_blockage ());
10166
10167 /* We don't want to interpret this push insn as a register save,
10168 only as a stack adjustment. The real copy of the register as
10169 a save will be done later, if needed. */
10170 t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
10171 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10172 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10173 RTX_FRAME_RELATED_P (insn) = 1;
10174 }
10175
10176 /* Emit prologue code to adjust stack alignment and setup DRAP, in case
10177      DRAP is needed and stack realignment is really needed after reload.  */
10178 if (stack_realign_drap)
10179 {
10180 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10181
10182 /* Only need to push parameter pointer reg if it is caller saved. */
10183 if (!call_used_regs[REGNO (crtl->drap_reg)])
10184 {
10185 /* Push arg pointer reg */
10186 insn = emit_insn (gen_push (crtl->drap_reg));
10187 RTX_FRAME_RELATED_P (insn) = 1;
10188 }
10189
10190 /* Grab the argument pointer. */
10191 t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
10192 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10193 RTX_FRAME_RELATED_P (insn) = 1;
10194 m->fs.cfa_reg = crtl->drap_reg;
10195 m->fs.cfa_offset = 0;
10196
10197 /* Align the stack. */
10198 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10199 stack_pointer_rtx,
10200 GEN_INT (-align_bytes)));
10201 RTX_FRAME_RELATED_P (insn) = 1;
10202
10203 /* Replicate the return address on the stack so that return
10204 address can be reached via (argp - 1) slot. This is needed
10205 to implement macro RETURN_ADDR_RTX and intrinsic function
10206 expand_builtin_return_addr etc. */
10207 t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
10208 t = gen_frame_mem (word_mode, t);
10209 insn = emit_insn (gen_push (t));
10210 RTX_FRAME_RELATED_P (insn) = 1;
10211
10212 /* For the purposes of frame and register save area addressing,
10213 we've started over with a new frame. */
10214 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10215 m->fs.realigned = true;
10216 }
10217
10218 if (frame_pointer_needed && !m->fs.fp_valid)
10219 {
10220 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10221 slower on all targets. Also sdb doesn't like it. */
10222 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10223 RTX_FRAME_RELATED_P (insn) = 1;
10224
10225 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10226 {
10227 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10228 RTX_FRAME_RELATED_P (insn) = 1;
10229
10230 if (m->fs.cfa_reg == stack_pointer_rtx)
10231 m->fs.cfa_reg = hard_frame_pointer_rtx;
10232 m->fs.fp_offset = m->fs.sp_offset;
10233 m->fs.fp_valid = true;
10234 }
10235 }
10236
10237 int_registers_saved = (frame.nregs == 0);
10238
10239 if (!int_registers_saved)
10240 {
10241 /* If saving registers via PUSH, do so now. */
10242 if (!frame.save_regs_using_mov)
10243 {
10244 ix86_emit_save_regs ();
10245 int_registers_saved = true;
10246 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10247 }
10248
10249       /* When using the red zone we may start register saving before allocating
10250 	 the stack frame, saving one cycle of the prologue.  However, avoid
10251 doing this if we have to probe the stack; at least on x86_64 the
10252 stack probe can turn into a call that clobbers a red zone location. */
10253 else if (ix86_using_red_zone ()
10254 && (! TARGET_STACK_PROBE
10255 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10256 {
10257 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10258 int_registers_saved = true;
10259 }
10260 }
10261
10262 if (stack_realign_fp)
10263 {
10264 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10265 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10266
10267 /* The computation of the size of the re-aligned stack frame means
10268 that we must allocate the size of the register save area before
10269 performing the actual alignment. Otherwise we cannot guarantee
10270 that there's enough storage above the realignment point. */
10271 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10272 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10273 GEN_INT (m->fs.sp_offset
10274 - frame.sse_reg_save_offset),
10275 -1, false);
10276
10277 /* Align the stack. */
10278 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10279 stack_pointer_rtx,
10280 GEN_INT (-align_bytes)));
10281
10282 /* For the purposes of register save area addressing, the stack
10283 pointer is no longer valid. As for the value of sp_offset,
10284 see ix86_compute_frame_layout, which we need to match in order
10285 to pass verification of stack_pointer_offset at the end. */
10286 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10287 m->fs.sp_valid = false;
10288 }
10289
10290 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10291
10292 if (flag_stack_usage_info)
10293 {
10294 /* We start to count from ARG_POINTER. */
10295 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10296
10297 /* If it was realigned, take into account the fake frame. */
10298 if (stack_realign_drap)
10299 {
10300 if (ix86_static_chain_on_stack)
10301 stack_size += UNITS_PER_WORD;
10302
10303 if (!call_used_regs[REGNO (crtl->drap_reg)])
10304 stack_size += UNITS_PER_WORD;
10305
10306 /* This over-estimates by 1 minimal-stack-alignment-unit but
10307 mitigates that by counting in the new return address slot. */
10308 current_function_dynamic_stack_size
10309 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10310 }
10311
10312 current_function_static_stack_size = stack_size;
10313 }
10314
10315 /* The stack has already been decremented by the instruction calling us
10316 so probe if the size is non-negative to preserve the protection area. */
10317 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10318 {
10319 /* We expect the registers to be saved when probes are used. */
10320 gcc_assert (int_registers_saved);
10321
10322 if (STACK_CHECK_MOVING_SP)
10323 {
10324 ix86_adjust_stack_and_probe (allocate);
10325 allocate = 0;
10326 }
10327 else
10328 {
10329 HOST_WIDE_INT size = allocate;
10330
10331 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10332 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10333
10334 if (TARGET_STACK_PROBE)
10335 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10336 else
10337 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10338 }
10339 }
10340
10341 if (allocate == 0)
10342 ;
10343 else if (!ix86_target_stack_probe ()
10344 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10345 {
10346 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10347 GEN_INT (-allocate), -1,
10348 m->fs.cfa_reg == stack_pointer_rtx);
10349 }
10350 else
10351 {
10352 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10353 rtx r10 = NULL;
10354 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10355
10356 bool eax_live = false;
10357 bool r10_live = false;
10358
10359 if (TARGET_64BIT)
10360 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10361 if (!TARGET_64BIT_MS_ABI)
10362 eax_live = ix86_eax_live_at_start_p ();
10363
10364 if (eax_live)
10365 {
10366 emit_insn (gen_push (eax));
10367 allocate -= UNITS_PER_WORD;
10368 }
10369 if (r10_live)
10370 {
10371 r10 = gen_rtx_REG (Pmode, R10_REG);
10372 emit_insn (gen_push (r10));
10373 allocate -= UNITS_PER_WORD;
10374 }
10375
10376 emit_move_insn (eax, GEN_INT (allocate));
10377 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10378
10379 /* Use the fact that AX still contains ALLOCATE. */
10380 adjust_stack_insn = (Pmode == DImode
10381 ? gen_pro_epilogue_adjust_stack_di_sub
10382 : gen_pro_epilogue_adjust_stack_si_sub);
10383
10384 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10385 stack_pointer_rtx, eax));
10386
10387 /* Note that SEH directives need to continue tracking the stack
10388 pointer even after the frame pointer has been set up. */
10389 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10390 {
10391 if (m->fs.cfa_reg == stack_pointer_rtx)
10392 m->fs.cfa_offset += allocate;
10393
10394 RTX_FRAME_RELATED_P (insn) = 1;
10395 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10396 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10397 plus_constant (stack_pointer_rtx,
10398 -allocate)));
10399 }
10400 m->fs.sp_offset += allocate;
10401
10402 if (r10_live && eax_live)
10403 {
10404 t = choose_baseaddr (m->fs.sp_offset - allocate);
10405 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10406 gen_frame_mem (word_mode, t));
10407 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10408 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10409 gen_frame_mem (word_mode, t));
10410 }
10411 else if (eax_live || r10_live)
10412 {
10413 t = choose_baseaddr (m->fs.sp_offset - allocate);
10414 emit_move_insn (gen_rtx_REG (word_mode,
10415 (eax_live ? AX_REG : R10_REG)),
10416 gen_frame_mem (word_mode, t));
10417 }
10418 }
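	  /* For illustration only (the probe worker symbol and register widths
	     depend on the target and ABI), the path above emits roughly:

		push	%eax			; only if EAX is live at function entry
		push	%r10			; 64-bit only, if the static chain is live
		mov	$ALLOCATE, %eax
		call	<allocate/probe worker>
		sub	%eax, %esp		; EAX still holds ALLOCATE
		mov	<slot>(%esp), %r10	; reload any live registers from their
		mov	<slot>(%esp), %eax	; save slots inside the new frame  */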
10419 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10420
10421 /* If we haven't already set up the frame pointer, do so now. */
10422 if (frame_pointer_needed && !m->fs.fp_valid)
10423 {
10424 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10425 GEN_INT (frame.stack_pointer_offset
10426 - frame.hard_frame_pointer_offset));
10427 insn = emit_insn (insn);
10428 RTX_FRAME_RELATED_P (insn) = 1;
10429 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10430
10431 if (m->fs.cfa_reg == stack_pointer_rtx)
10432 m->fs.cfa_reg = hard_frame_pointer_rtx;
10433 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10434 m->fs.fp_valid = true;
10435 }
10436
10437 if (!int_registers_saved)
10438 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10439 if (frame.nsseregs)
10440 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10441
10442 pic_reg_used = false;
10443 if (pic_offset_table_rtx
10444 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10445 || crtl->profile))
10446 {
10447 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10448
10449 if (alt_pic_reg_used != INVALID_REGNUM)
10450 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10451
10452 pic_reg_used = true;
10453 }
10454
10455 if (pic_reg_used)
10456 {
10457 if (TARGET_64BIT)
10458 {
10459 if (ix86_cmodel == CM_LARGE_PIC)
10460 {
10461 rtx label, tmp_reg;
10462
10463 gcc_assert (Pmode == DImode);
10464 label = gen_label_rtx ();
10465 emit_label (label);
10466 LABEL_PRESERVE_P (label) = 1;
10467 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10468 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10469 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10470 label));
10471 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10472 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10473 pic_offset_table_rtx, tmp_reg));
10474 }
10475 else
10476 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10477 }
10478 else
10479 {
10480 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10481 RTX_FRAME_RELATED_P (insn) = 1;
10482 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10483 }
10484 }
10485
10486 /* In the pic_reg_used case, make sure that the got load isn't deleted
10487 when mcount needs it. Blockage to avoid call movement across mcount
10488 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10489 note. */
10490 if (crtl->profile && !flag_fentry && pic_reg_used)
10491 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10492
10493 if (crtl->drap_reg && !crtl->stack_realign_needed)
10494 {
10495 /* vDRAP is set up, but after reload it turns out that stack realignment
10496 isn't necessary; here we emit the prologue to set up DRAP
10497 without the stack realign adjustment. */
10498 t = choose_baseaddr (0);
10499 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10500 }
10501
10502 /* Prevent instructions from being scheduled into the register save push
10503 sequence when access to the red-zone area is done through the frame pointer.
10504 The offset between the frame pointer and the stack pointer is calculated
10505 relative to the value of the stack pointer at the end of the function
10506 prologue, and moving instructions that access the red-zone area via the
10507 frame pointer into the push sequence violates this assumption. */
10508 if (frame_pointer_needed && frame.red_zone_size)
10509 emit_insn (gen_memory_blockage ());
10510
10511 /* Emit cld instruction if stringops are used in the function. */
10512 if (TARGET_CLD && ix86_current_function_needs_cld)
10513 emit_insn (gen_cld ());
10514
10515 /* SEH requires that the prologue end within 256 bytes of the start of
10516 the function. Prevent instruction schedules that would extend that.
10517 Further, prevent alloca modifications to the stack pointer from being
10518 combined with prologue modifications. */
10519 if (TARGET_SEH)
10520 emit_insn (gen_prologue_use (stack_pointer_rtx));
10521 }
10522
10523 /* Emit code to restore REG using a POP insn. */
10524
10525 static void
10526 ix86_emit_restore_reg_using_pop (rtx reg)
10527 {
10528 struct machine_function *m = cfun->machine;
10529 rtx insn = emit_insn (gen_pop (reg));
10530
10531 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10532 m->fs.sp_offset -= UNITS_PER_WORD;
10533
10534 if (m->fs.cfa_reg == crtl->drap_reg
10535 && REGNO (reg) == REGNO (crtl->drap_reg))
10536 {
10537 /* Previously we'd represented the CFA as an expression
10538 like *(%ebp - 8). We've just popped that value from
10539 the stack, which means we need to reset the CFA to
10540 the drap register. This will remain until we restore
10541 the stack pointer. */
10542 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10543 RTX_FRAME_RELATED_P (insn) = 1;
10544
10545 /* This means that the DRAP register is valid for addressing too. */
10546 m->fs.drap_valid = true;
10547 return;
10548 }
10549
10550 if (m->fs.cfa_reg == stack_pointer_rtx)
10551 {
10552 rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10553 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10554 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10555 RTX_FRAME_RELATED_P (insn) = 1;
10556
10557 m->fs.cfa_offset -= UNITS_PER_WORD;
10558 }
10559
10560 /* When the frame pointer is the CFA, and we pop it, we are
10561 swapping back to the stack pointer as the CFA. This happens
10562 for stack frames that don't allocate other data, so we assume
10563 the stack pointer is now pointing at the return address, i.e.
10564 the function entry state, which makes the offset 1 word. */
10565 if (reg == hard_frame_pointer_rtx)
10566 {
10567 m->fs.fp_valid = false;
10568 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10569 {
10570 m->fs.cfa_reg = stack_pointer_rtx;
10571 m->fs.cfa_offset -= UNITS_PER_WORD;
10572
10573 add_reg_note (insn, REG_CFA_DEF_CFA,
10574 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10575 GEN_INT (m->fs.cfa_offset)));
10576 RTX_FRAME_RELATED_P (insn) = 1;
10577 }
10578 }
10579 }
10580
10581 /* Emit code to restore saved registers using POP insns. */
10582
10583 static void
10584 ix86_emit_restore_regs_using_pop (void)
10585 {
10586 unsigned int regno;
10587
10588 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10589 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10590 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10591 }
10592
10593 /* Emit code and notes for the LEAVE instruction. */
10594
10595 static void
10596 ix86_emit_leave (void)
10597 {
10598 struct machine_function *m = cfun->machine;
10599 rtx insn = emit_insn (ix86_gen_leave ());
10600
10601 ix86_add_queued_cfa_restore_notes (insn);
10602
10603 gcc_assert (m->fs.fp_valid);
10604 m->fs.sp_valid = true;
10605 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10606 m->fs.fp_valid = false;
10607
10608 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10609 {
10610 m->fs.cfa_reg = stack_pointer_rtx;
10611 m->fs.cfa_offset = m->fs.sp_offset;
10612
10613 add_reg_note (insn, REG_CFA_DEF_CFA,
10614 plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10615 RTX_FRAME_RELATED_P (insn) = 1;
10616 }
10617 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10618 m->fs.fp_offset);
10619 }
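/* Note: "leave" behaves like "mov %ebp, %esp; pop %ebp" (or the %rbp/%rsp
   forms in 64-bit mode), which is why the function above marks the stack
   pointer valid again at fp_offset - UNITS_PER_WORD and invalidates the
   frame pointer.  */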
10620
10621 /* Emit code to restore saved registers using MOV insns.
10622 First register is restored from CFA - CFA_OFFSET. */
10623 static void
10624 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10625 bool maybe_eh_return)
10626 {
10627 struct machine_function *m = cfun->machine;
10628 unsigned int regno;
10629
10630 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10631 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10632 {
10633 rtx reg = gen_rtx_REG (word_mode, regno);
10634 rtx insn, mem;
10635
10636 mem = choose_baseaddr (cfa_offset);
10637 mem = gen_frame_mem (word_mode, mem);
10638 insn = emit_move_insn (reg, mem);
10639
10640 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10641 {
10642 /* Previously we'd represented the CFA as an expression
10643 like *(%ebp - 8). We've just popped that value from
10644 the stack, which means we need to reset the CFA to
10645 the drap register. This will remain until we restore
10646 the stack pointer. */
10647 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10648 RTX_FRAME_RELATED_P (insn) = 1;
10649
10650 /* This means that the DRAP register is valid for addressing. */
10651 m->fs.drap_valid = true;
10652 }
10653 else
10654 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10655
10656 cfa_offset -= UNITS_PER_WORD;
10657 }
10658 }
10659
10660 /* Emit code to restore saved SSE registers using MOV insns.
10661 First register is restored from CFA - CFA_OFFSET. */
10662 static void
10663 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10664 bool maybe_eh_return)
10665 {
10666 unsigned int regno;
10667
10668 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10669 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10670 {
10671 rtx reg = gen_rtx_REG (V4SFmode, regno);
10672 rtx mem;
10673
10674 mem = choose_baseaddr (cfa_offset);
10675 mem = gen_rtx_MEM (V4SFmode, mem);
10676 set_mem_align (mem, 128);
10677 emit_move_insn (reg, mem);
10678
10679 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10680
10681 cfa_offset -= 16;
10682 }
10683 }
10684
10685 /* Emit vzeroupper if needed. */
10686
10687 void
10688 ix86_maybe_emit_epilogue_vzeroupper (void)
10689 {
10690 if (TARGET_VZEROUPPER
10691 && !TREE_THIS_VOLATILE (cfun->decl)
10692 && !cfun->machine->caller_return_avx256_p)
10693 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10694 }
10695
10696 /* Restore function stack, frame, and registers. */
10697
10698 void
10699 ix86_expand_epilogue (int style)
10700 {
10701 struct machine_function *m = cfun->machine;
10702 struct machine_frame_state frame_state_save = m->fs;
10703 struct ix86_frame frame;
10704 bool restore_regs_via_mov;
10705 bool using_drap;
10706
10707 ix86_finalize_stack_realign_flags ();
10708 ix86_compute_frame_layout (&frame);
10709
10710 m->fs.sp_valid = (!frame_pointer_needed
10711 || (current_function_sp_is_unchanging
10712 && !stack_realign_fp));
10713 gcc_assert (!m->fs.sp_valid
10714 || m->fs.sp_offset == frame.stack_pointer_offset);
10715
10716 /* The FP must be valid if the frame pointer is present. */
10717 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10718 gcc_assert (!m->fs.fp_valid
10719 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10720
10721 /* We must have *some* valid pointer to the stack frame. */
10722 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10723
10724 /* The DRAP is never valid at this point. */
10725 gcc_assert (!m->fs.drap_valid);
10726
10727 /* See the comment about red zone and frame
10728 pointer usage in ix86_expand_prologue. */
10729 if (frame_pointer_needed && frame.red_zone_size)
10730 emit_insn (gen_memory_blockage ());
10731
10732 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10733 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10734
10735 /* Determine the CFA offset of the end of the red-zone. */
10736 m->fs.red_zone_offset = 0;
10737 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10738 {
10739 /* The red-zone begins below the return address. */
10740 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10741
10742 /* When the register save area is in the aligned portion of
10743 the stack, determine the maximum runtime displacement that
10744 matches up with the aligned frame. */
10745 if (stack_realign_drap)
10746 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10747 + UNITS_PER_WORD);
10748 }
10749
10750 /* Special care must be taken for the normal return case of a function
10751 using eh_return: the eax and edx registers are marked as saved, but
10752 not restored along this path. Adjust the save location to match. */
10753 if (crtl->calls_eh_return && style != 2)
10754 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10755
10756 /* EH_RETURN requires the use of moves to function properly. */
10757 if (crtl->calls_eh_return)
10758 restore_regs_via_mov = true;
10759 /* SEH requires the use of pops to identify the epilogue. */
10760 else if (TARGET_SEH)
10761 restore_regs_via_mov = false;
10762 /* If we're only restoring one register and sp is not valid, then use
10763 a move instruction to restore the register, since it's
10764 less work than reloading sp and popping the register. */
10765 else if (!m->fs.sp_valid && frame.nregs <= 1)
10766 restore_regs_via_mov = true;
10767 else if (TARGET_EPILOGUE_USING_MOVE
10768 && cfun->machine->use_fast_prologue_epilogue
10769 && (frame.nregs > 1
10770 || m->fs.sp_offset != frame.reg_save_offset))
10771 restore_regs_via_mov = true;
10772 else if (frame_pointer_needed
10773 && !frame.nregs
10774 && m->fs.sp_offset != frame.reg_save_offset)
10775 restore_regs_via_mov = true;
10776 else if (frame_pointer_needed
10777 && TARGET_USE_LEAVE
10778 && cfun->machine->use_fast_prologue_epilogue
10779 && frame.nregs == 1)
10780 restore_regs_via_mov = true;
10781 else
10782 restore_regs_via_mov = false;
10783
10784 if (restore_regs_via_mov || frame.nsseregs)
10785 {
10786 /* Ensure that the entire register save area is addressable via
10787 the stack pointer, if we will restore via sp. */
10788 if (TARGET_64BIT
10789 && m->fs.sp_offset > 0x7fffffff
10790 && !(m->fs.fp_valid || m->fs.drap_valid)
10791 && (frame.nsseregs + frame.nregs) != 0)
10792 {
10793 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10794 GEN_INT (m->fs.sp_offset
10795 - frame.sse_reg_save_offset),
10796 style,
10797 m->fs.cfa_reg == stack_pointer_rtx);
10798 }
10799 }
10800
10801 /* If there are any SSE registers to restore, then we have to do it
10802 via moves, since there's obviously no pop for SSE regs. */
10803 if (frame.nsseregs)
10804 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10805 style == 2);
10806
10807 if (restore_regs_via_mov)
10808 {
10809 rtx t;
10810
10811 if (frame.nregs)
10812 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10813
10814 /* eh_return epilogues need %ecx added to the stack pointer. */
10815 if (style == 2)
10816 {
10817 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10818
10819 /* Stack align doesn't work with eh_return. */
10820 gcc_assert (!stack_realign_drap);
10821 /* Neither do regparm nested functions. */
10822 gcc_assert (!ix86_static_chain_on_stack);
10823
10824 if (frame_pointer_needed)
10825 {
10826 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10827 t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10828 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10829
10830 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10831 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10832
10833 /* Note that we use SA as a temporary CFA, as the return
10834 address is at the proper place relative to it. We
10835 pretend this happens at the FP restore insn because
10836 prior to this insn the FP would be stored at the wrong
10837 offset relative to SA, and after this insn we have no
10838 other reasonable register to use for the CFA. We don't
10839 bother resetting the CFA to the SP for the duration of
10840 the return insn. */
10841 add_reg_note (insn, REG_CFA_DEF_CFA,
10842 plus_constant (sa, UNITS_PER_WORD));
10843 ix86_add_queued_cfa_restore_notes (insn);
10844 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10845 RTX_FRAME_RELATED_P (insn) = 1;
10846
10847 m->fs.cfa_reg = sa;
10848 m->fs.cfa_offset = UNITS_PER_WORD;
10849 m->fs.fp_valid = false;
10850
10851 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10852 const0_rtx, style, false);
10853 }
10854 else
10855 {
10856 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10857 t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10858 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10859 ix86_add_queued_cfa_restore_notes (insn);
10860
10861 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10862 if (m->fs.cfa_offset != UNITS_PER_WORD)
10863 {
10864 m->fs.cfa_offset = UNITS_PER_WORD;
10865 add_reg_note (insn, REG_CFA_DEF_CFA,
10866 plus_constant (stack_pointer_rtx,
10867 UNITS_PER_WORD));
10868 RTX_FRAME_RELATED_P (insn) = 1;
10869 }
10870 }
10871 m->fs.sp_offset = UNITS_PER_WORD;
10872 m->fs.sp_valid = true;
10873 }
10874 }
10875 else
10876 {
10877 /* SEH requires that the function end with (1) a stack adjustment
10878 if necessary, (2) a sequence of pops, and (3) a return or
10879 jump instruction. Prevent insns from the function body from
10880 being scheduled into this sequence. */
10881 if (TARGET_SEH)
10882 {
10883 /* Prevent a catch region from being adjacent to the standard
10884 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
10885 several other flags that would be interesting to test are
10886 set up yet. */
10887 if (flag_non_call_exceptions)
10888 emit_insn (gen_nops (const1_rtx));
10889 else
10890 emit_insn (gen_blockage ());
10891 }
10892
10893 /* First step is to deallocate the stack frame so that we can
10894 pop the registers. */
10895 if (!m->fs.sp_valid)
10896 {
10897 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10898 GEN_INT (m->fs.fp_offset
10899 - frame.reg_save_offset),
10900 style, false);
10901 }
10902 else if (m->fs.sp_offset != frame.reg_save_offset)
10903 {
10904 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10905 GEN_INT (m->fs.sp_offset
10906 - frame.reg_save_offset),
10907 style,
10908 m->fs.cfa_reg == stack_pointer_rtx);
10909 }
10910
10911 ix86_emit_restore_regs_using_pop ();
10912 }
10913
10914 /* If we used a frame pointer and haven't already got rid of it,
10915 then do so now. */
10916 if (m->fs.fp_valid)
10917 {
10918 /* If the stack pointer is valid and pointing at the frame
10919 pointer store address, then we only need a pop. */
10920 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10921 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10922 /* Leave results in shorter dependency chains on CPUs that are
10923 able to grok it fast. */
10924 else if (TARGET_USE_LEAVE
10925 || optimize_function_for_size_p (cfun)
10926 || !cfun->machine->use_fast_prologue_epilogue)
10927 ix86_emit_leave ();
10928 else
10929 {
10930 pro_epilogue_adjust_stack (stack_pointer_rtx,
10931 hard_frame_pointer_rtx,
10932 const0_rtx, style, !using_drap);
10933 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10934 }
10935 }
10936
10937 if (using_drap)
10938 {
10939 int param_ptr_offset = UNITS_PER_WORD;
10940 rtx insn;
10941
10942 gcc_assert (stack_realign_drap);
10943
10944 if (ix86_static_chain_on_stack)
10945 param_ptr_offset += UNITS_PER_WORD;
10946 if (!call_used_regs[REGNO (crtl->drap_reg)])
10947 param_ptr_offset += UNITS_PER_WORD;
10948
10949 insn = emit_insn (gen_rtx_SET
10950 (VOIDmode, stack_pointer_rtx,
10951 gen_rtx_PLUS (Pmode,
10952 crtl->drap_reg,
10953 GEN_INT (-param_ptr_offset))));
10954 m->fs.cfa_reg = stack_pointer_rtx;
10955 m->fs.cfa_offset = param_ptr_offset;
10956 m->fs.sp_offset = param_ptr_offset;
10957 m->fs.realigned = false;
10958
10959 add_reg_note (insn, REG_CFA_DEF_CFA,
10960 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10961 GEN_INT (param_ptr_offset)));
10962 RTX_FRAME_RELATED_P (insn) = 1;
10963
10964 if (!call_used_regs[REGNO (crtl->drap_reg)])
10965 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10966 }
10967
10968 /* At this point the stack pointer must be valid, and we must have
10969 restored all of the registers. We may not have deallocated the
10970 entire stack frame. We've delayed this until now because it may
10971 be possible to merge the local stack deallocation with the
10972 deallocation forced by ix86_static_chain_on_stack. */
10973 gcc_assert (m->fs.sp_valid);
10974 gcc_assert (!m->fs.fp_valid);
10975 gcc_assert (!m->fs.realigned);
10976 if (m->fs.sp_offset != UNITS_PER_WORD)
10977 {
10978 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10979 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10980 style, true);
10981 }
10982 else
10983 ix86_add_queued_cfa_restore_notes (get_last_insn ());
10984
10985 /* Sibcall epilogues don't want a return instruction. */
10986 if (style == 0)
10987 {
10988 m->fs = frame_state_save;
10989 return;
10990 }
10991
10992 /* Emit vzeroupper if needed. */
10993 ix86_maybe_emit_epilogue_vzeroupper ();
10994
10995 if (crtl->args.pops_args && crtl->args.size)
10996 {
10997 rtx popc = GEN_INT (crtl->args.pops_args);
10998
10999 /* i386 can only pop 64K bytes. If asked to pop more, pop return
11000 address, do explicit add, and jump indirectly to the caller. */
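  /* For example, with N bytes of arguments to pop, the large-count path
     below emits roughly:

	popl	%ecx		; fetch the return address
	addl	$N, %esp	; pop the arguments
	jmp	*%ecx		; return to the caller  */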
11001
11002 if (crtl->args.pops_args >= 65536)
11003 {
11004 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11005 rtx insn;
11006
11007 /* There is no "pascal" calling convention in any 64bit ABI. */
11008 gcc_assert (!TARGET_64BIT);
11009
11010 insn = emit_insn (gen_pop (ecx));
11011 m->fs.cfa_offset -= UNITS_PER_WORD;
11012 m->fs.sp_offset -= UNITS_PER_WORD;
11013
11014 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11015 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11016 add_reg_note (insn, REG_CFA_REGISTER,
11017 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11018 RTX_FRAME_RELATED_P (insn) = 1;
11019
11020 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11021 popc, -1, true);
11022 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11023 }
11024 else
11025 emit_jump_insn (gen_simple_return_pop_internal (popc));
11026 }
11027 else
11028 emit_jump_insn (gen_simple_return_internal ());
11029
11030 /* Restore the state back to the state from the prologue,
11031 so that it's correct for the next epilogue. */
11032 m->fs = frame_state_save;
11033 }
11034
11035 /* Reset from the function's potential modifications. */
11036
11037 static void
11038 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11039 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11040 {
11041 if (pic_offset_table_rtx)
11042 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11043 #if TARGET_MACHO
11044 /* Mach-O doesn't support labels at the end of objects, so if
11045 it looks like we might want one, insert a NOP. */
11046 {
11047 rtx insn = get_last_insn ();
11048 rtx deleted_debug_label = NULL_RTX;
11049 while (insn
11050 && NOTE_P (insn)
11051 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11052 {
11053 /* Don't insert a nop just for NOTE_INSN_DELETED_DEBUG_LABEL
11054 notes; instead set their CODE_LABEL_NUMBER to -1,
11055 otherwise there would be code generation differences
11056 between -g and -g0. */
11057 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11058 deleted_debug_label = insn;
11059 insn = PREV_INSN (insn);
11060 }
11061 if (insn
11062 && (LABEL_P (insn)
11063 || (NOTE_P (insn)
11064 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11065 fputs ("\tnop\n", file);
11066 else if (deleted_debug_label)
11067 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11068 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11069 CODE_LABEL_NUMBER (insn) = -1;
11070 }
11071 #endif
11072
11073 }
11074
11075 /* Return a scratch register to use in the split stack prologue. The
11076 split stack prologue is used for -fsplit-stack. These are the first
11077 instructions in the function, even before the regular prologue.
11078 The scratch register can be any caller-saved register which is not
11079 used for parameters or for the static chain. */
11080
11081 static unsigned int
11082 split_stack_prologue_scratch_regno (void)
11083 {
11084 if (TARGET_64BIT)
11085 return R11_REG;
11086 else
11087 {
11088 bool is_fastcall;
11089 int regparm;
11090
11091 is_fastcall = (lookup_attribute ("fastcall",
11092 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11093 != NULL);
11094 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11095
11096 if (is_fastcall)
11097 {
11098 if (DECL_STATIC_CHAIN (cfun->decl))
11099 {
11100 sorry ("-fsplit-stack does not support fastcall with "
11101 "nested function");
11102 return INVALID_REGNUM;
11103 }
11104 return AX_REG;
11105 }
11106 else if (regparm < 3)
11107 {
11108 if (!DECL_STATIC_CHAIN (cfun->decl))
11109 return CX_REG;
11110 else
11111 {
11112 if (regparm >= 2)
11113 {
11114 sorry ("-fsplit-stack does not support 2 register "
11115 " parameters for a nested function");
11116 return INVALID_REGNUM;
11117 }
11118 return DX_REG;
11119 }
11120 }
11121 else
11122 {
11123 /* FIXME: We could make this work by pushing a register
11124 around the addition and comparison. */
11125 sorry ("-fsplit-stack does not support 3 register parameters");
11126 return INVALID_REGNUM;
11127 }
11128 }
11129 }
11130
11131 /* A SYMBOL_REF for the function which allocates new stack space for
11132 -fsplit-stack. */
11133
11134 static GTY(()) rtx split_stack_fn;
11135
11136 /* A SYMBOL_REF for the more stack function when using the large
11137 model. */
11138
11139 static GTY(()) rtx split_stack_fn_large;
11140
11141 /* Handle -fsplit-stack. These are the first instructions in the
11142 function, even before the regular prologue. */
11143
11144 void
11145 ix86_expand_split_stack_prologue (void)
11146 {
11147 struct ix86_frame frame;
11148 HOST_WIDE_INT allocate;
11149 unsigned HOST_WIDE_INT args_size;
11150 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11151 rtx scratch_reg = NULL_RTX;
11152 rtx varargs_label = NULL_RTX;
11153 rtx fn;
11154
11155 gcc_assert (flag_split_stack && reload_completed);
11156
11157 ix86_finalize_stack_realign_flags ();
11158 ix86_compute_frame_layout (&frame);
11159 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11160
11161 /* This is the label we will branch to if we have enough stack
11162 space. We expect the basic block reordering pass to reverse this
11163 branch if optimizing, so that we branch in the unlikely case. */
11164 label = gen_label_rtx ();
11165
11166 /* We need to compare the stack pointer minus the frame size with
11167 the stack boundary in the TCB. The stack boundary always gives
11168 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11169 can compare directly. Otherwise we need to do an addition. */
11170
11171 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11172 UNSPEC_STACK_CHECK);
11173 limit = gen_rtx_CONST (Pmode, limit);
11174 limit = gen_rtx_MEM (Pmode, limit);
11175 if (allocate < SPLIT_STACK_AVAILABLE)
11176 current = stack_pointer_rtx;
11177 else
11178 {
11179 unsigned int scratch_regno;
11180 rtx offset;
11181
11182 /* We need a scratch register to hold the stack pointer minus
11183 the required frame size. Since this is the very start of the
11184 function, the scratch register can be any caller-saved
11185 register which is not used for parameters. */
11186 offset = GEN_INT (- allocate);
11187 scratch_regno = split_stack_prologue_scratch_regno ();
11188 if (scratch_regno == INVALID_REGNUM)
11189 return;
11190 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11191 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11192 {
11193 /* We don't use ix86_gen_add3 in this case because it will
11194 want to split to lea, but when not optimizing the insn
11195 will not be split after this point. */
11196 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11197 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11198 offset)));
11199 }
11200 else
11201 {
11202 emit_move_insn (scratch_reg, offset);
11203 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11204 stack_pointer_rtx));
11205 }
11206 current = scratch_reg;
11207 }
11208
11209 ix86_expand_branch (GEU, current, limit, label);
11210 jump_insn = get_last_insn ();
11211 JUMP_LABEL (jump_insn) = label;
11212
11213 /* Mark the jump as very likely to be taken. */
11214 add_reg_note (jump_insn, REG_BR_PROB,
11215 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11216
11217 if (split_stack_fn == NULL_RTX)
11218 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11219 fn = split_stack_fn;
11220
11221 /* Get more stack space. We pass in the desired stack space and the
11222 size of the arguments to copy to the new stack. In 32-bit mode
11223 we push the parameters; __morestack will return on a new stack
11224 anyhow. In 64-bit mode we pass the parameters in r10 and
11225 r11. */
11226 allocate_rtx = GEN_INT (allocate);
11227 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11228 call_fusage = NULL_RTX;
11229 if (TARGET_64BIT)
11230 {
11231 rtx reg10, reg11;
11232
11233 reg10 = gen_rtx_REG (Pmode, R10_REG);
11234 reg11 = gen_rtx_REG (Pmode, R11_REG);
11235
11236 /* If this function uses a static chain, it will be in %r10.
11237 Preserve it across the call to __morestack. */
11238 if (DECL_STATIC_CHAIN (cfun->decl))
11239 {
11240 rtx rax;
11241
11242 rax = gen_rtx_REG (word_mode, AX_REG);
11243 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11244 use_reg (&call_fusage, rax);
11245 }
11246
11247 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11248 {
11249 HOST_WIDE_INT argval;
11250
11251 gcc_assert (Pmode == DImode);
11252 /* When using the large model we need to load the address
11253 into a register, and we've run out of registers. So we
11254 switch to a different calling convention, and we call a
11255 different function: __morestack_large_model. We pass the
11256 argument size in the upper 32 bits of r10 and pass the
11257 frame size in the lower 32 bits. */
11258 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11259 gcc_assert ((args_size & 0xffffffff) == args_size);
11260
11261 if (split_stack_fn_large == NULL_RTX)
11262 split_stack_fn_large =
11263 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11264
11265 if (ix86_cmodel == CM_LARGE_PIC)
11266 {
11267 rtx label, x;
11268
11269 label = gen_label_rtx ();
11270 emit_label (label);
11271 LABEL_PRESERVE_P (label) = 1;
11272 emit_insn (gen_set_rip_rex64 (reg10, label));
11273 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11274 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11275 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11276 UNSPEC_GOT);
11277 x = gen_rtx_CONST (Pmode, x);
11278 emit_move_insn (reg11, x);
11279 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11280 x = gen_const_mem (Pmode, x);
11281 emit_move_insn (reg11, x);
11282 }
11283 else
11284 emit_move_insn (reg11, split_stack_fn_large);
11285
11286 fn = reg11;
11287
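	  /* For example, args_size == 0x20 and allocate == 0x1000 give
	     argval == 0x0000002000001000: the argument size in the upper
	     half of r10 and the frame size in the lower half.  */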
11288 argval = ((args_size << 16) << 16) + allocate;
11289 emit_move_insn (reg10, GEN_INT (argval));
11290 }
11291 else
11292 {
11293 emit_move_insn (reg10, allocate_rtx);
11294 emit_move_insn (reg11, GEN_INT (args_size));
11295 use_reg (&call_fusage, reg11);
11296 }
11297
11298 use_reg (&call_fusage, reg10);
11299 }
11300 else
11301 {
11302 emit_insn (gen_push (GEN_INT (args_size)));
11303 emit_insn (gen_push (allocate_rtx));
11304 }
11305 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11306 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11307 NULL_RTX, false);
11308 add_function_usage_to (call_insn, call_fusage);
11309
11310 /* In order to make call/return prediction work right, we now need
11311 to execute a return instruction. See
11312 libgcc/config/i386/morestack.S for the details on how this works.
11313
11314 For flow purposes gcc must not see this as a return
11315 instruction--we need control flow to continue at the subsequent
11316 label. Therefore, we use an unspec. */
11317 gcc_assert (crtl->args.pops_args < 65536);
11318 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11319
11320 /* If we are in 64-bit mode and this function uses a static chain,
11321 we saved %r10 in %rax before calling __morestack. */
11322 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11323 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11324 gen_rtx_REG (word_mode, AX_REG));
11325
11326 /* If this function calls va_start, we need to store a pointer to
11327 the arguments on the old stack, because they may not all have been
11328 copied to the new stack. At this point the old stack can be
11329 found at the frame pointer value used by __morestack, because
11330 __morestack has set that up before calling back to us. Here we
11331 store that pointer in a scratch register, and in
11332 ix86_expand_prologue we store the scratch register in a stack
11333 slot. */
11334 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11335 {
11336 unsigned int scratch_regno;
11337 rtx frame_reg;
11338 int words;
11339
11340 scratch_regno = split_stack_prologue_scratch_regno ();
11341 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11342 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11343
11344 /* 64-bit:
11345 fp -> old fp value
11346 return address within this function
11347 return address of caller of this function
11348 stack arguments
11349 So we add three words to get to the stack arguments.
11350
11351 32-bit:
11352 fp -> old fp value
11353 return address within this function
11354 first argument to __morestack
11355 second argument to __morestack
11356 return address of caller of this function
11357 stack arguments
11358 So we add five words to get to the stack arguments.
11359 */
11360 words = TARGET_64BIT ? 3 : 5;
11361 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11362 gen_rtx_PLUS (Pmode, frame_reg,
11363 GEN_INT (words * UNITS_PER_WORD))));
11364
11365 varargs_label = gen_label_rtx ();
11366 emit_jump_insn (gen_jump (varargs_label));
11367 JUMP_LABEL (get_last_insn ()) = varargs_label;
11368
11369 emit_barrier ();
11370 }
11371
11372 emit_label (label);
11373 LABEL_NUSES (label) = 1;
11374
11375 /* If this function calls va_start, we now have to set the scratch
11376 register for the case where we do not call __morestack. In this
11377 case we need to set it based on the stack pointer. */
11378 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11379 {
11380 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11381 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11382 GEN_INT (UNITS_PER_WORD))));
11383
11384 emit_label (varargs_label);
11385 LABEL_NUSES (varargs_label) = 1;
11386 }
11387 }
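/* A rough sketch (not a literal listing) of the 64-bit small-model code the
   function above emits when no static chain and no va_start are involved;
   <limit> stands for the target-specific TCB slot behind UNSPEC_STACK_CHECK:

	cmp	%fs:<limit>, %rsp	; or a scratch reg holding rsp - frame size
	jae	.Lenough		; expected to be taken
	mov	$FRAME_SIZE, %r10
	mov	$ARGS_SIZE, %r11
	call	__morestack
	ret				; see libgcc/config/i386/morestack.S
   .Lenough:
	...regular prologue continues...  */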
11388
11389 /* We may have to tell the dataflow pass that the split stack prologue
11390 is initializing a scratch register. */
11391
11392 static void
11393 ix86_live_on_entry (bitmap regs)
11394 {
11395 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11396 {
11397 gcc_assert (flag_split_stack);
11398 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11399 }
11400 }
11401 \f
11402 /* Determine if OP is a suitable SUBREG RTX for an address. */
11403
11404 static bool
11405 ix86_address_subreg_operand (rtx op)
11406 {
11407 enum machine_mode mode;
11408
11409 if (!REG_P (op))
11410 return false;
11411
11412 mode = GET_MODE (op);
11413
11414 if (GET_MODE_CLASS (mode) != MODE_INT)
11415 return false;
11416
11417 /* Don't allow SUBREGs that span more than a word. They can lead to spill
11418 failures when the register is one word out of a two-word structure. */
11419 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11420 return false;
11421
11422 /* Allow only SUBREGs of non-eliminable hard registers. */
11423 return register_no_elim_operand (op, mode);
11424 }
11425
11426 /* Extract the parts of an RTL expression that is a valid memory address
11427 for an instruction. Return 0 if the structure of the address is
11428 grossly off. Return -1 if the address contains ASHIFT, so it is not
11429 strictly valid, but is still used for computing the length of a lea instruction. */
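/* For example, the address (plus (plus (mult (reg B) (const_int 4)) (reg A))
   (const_int 8)) -- i.e. 8(A,B,4) -- decomposes into base = A, index = B,
   scale = 4, disp = 8 and seg = SEG_DEFAULT.  */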
11430
11431 int
11432 ix86_decompose_address (rtx addr, struct ix86_address *out)
11433 {
11434 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11435 rtx base_reg, index_reg;
11436 HOST_WIDE_INT scale = 1;
11437 rtx scale_rtx = NULL_RTX;
11438 rtx tmp;
11439 int retval = 1;
11440 enum ix86_address_seg seg = SEG_DEFAULT;
11441
11442 /* Allow zero-extended SImode addresses,
11443 they will be emitted with addr32 prefix. */
11444 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11445 {
11446 if (GET_CODE (addr) == ZERO_EXTEND
11447 && GET_MODE (XEXP (addr, 0)) == SImode)
11448 addr = XEXP (addr, 0);
11449 else if (GET_CODE (addr) == AND
11450 && const_32bit_mask (XEXP (addr, 1), DImode))
11451 {
11452 addr = XEXP (addr, 0);
11453
11454 /* Adjust SUBREGs. */
11455 if (GET_CODE (addr) == SUBREG
11456 && GET_MODE (SUBREG_REG (addr)) == SImode)
11457 addr = SUBREG_REG (addr);
11458 else if (GET_MODE (addr) == DImode)
11459 addr = gen_rtx_SUBREG (SImode, addr, 0);
11460 else
11461 return 0;
11462 }
11463 }
11464
11465 if (REG_P (addr))
11466 base = addr;
11467 else if (GET_CODE (addr) == SUBREG)
11468 {
11469 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11470 base = addr;
11471 else
11472 return 0;
11473 }
11474 else if (GET_CODE (addr) == PLUS)
11475 {
11476 rtx addends[4], op;
11477 int n = 0, i;
11478
11479 op = addr;
11480 do
11481 {
11482 if (n >= 4)
11483 return 0;
11484 addends[n++] = XEXP (op, 1);
11485 op = XEXP (op, 0);
11486 }
11487 while (GET_CODE (op) == PLUS);
11488 if (n >= 4)
11489 return 0;
11490 addends[n] = op;
11491
11492 for (i = n; i >= 0; --i)
11493 {
11494 op = addends[i];
11495 switch (GET_CODE (op))
11496 {
11497 case MULT:
11498 if (index)
11499 return 0;
11500 index = XEXP (op, 0);
11501 scale_rtx = XEXP (op, 1);
11502 break;
11503
11504 case ASHIFT:
11505 if (index)
11506 return 0;
11507 index = XEXP (op, 0);
11508 tmp = XEXP (op, 1);
11509 if (!CONST_INT_P (tmp))
11510 return 0;
11511 scale = INTVAL (tmp);
11512 if ((unsigned HOST_WIDE_INT) scale > 3)
11513 return 0;
11514 scale = 1 << scale;
11515 break;
11516
11517 case ZERO_EXTEND:
11518 op = XEXP (op, 0);
11519 if (GET_CODE (op) != UNSPEC)
11520 return 0;
11521 /* FALLTHRU */
11522
11523 case UNSPEC:
11524 if (XINT (op, 1) == UNSPEC_TP
11525 && TARGET_TLS_DIRECT_SEG_REFS
11526 && seg == SEG_DEFAULT)
11527 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11528 else
11529 return 0;
11530 break;
11531
11532 case SUBREG:
11533 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11534 return 0;
11535 /* FALLTHRU */
11536
11537 case REG:
11538 if (!base)
11539 base = op;
11540 else if (!index)
11541 index = op;
11542 else
11543 return 0;
11544 break;
11545
11546 case CONST:
11547 case CONST_INT:
11548 case SYMBOL_REF:
11549 case LABEL_REF:
11550 if (disp)
11551 return 0;
11552 disp = op;
11553 break;
11554
11555 default:
11556 return 0;
11557 }
11558 }
11559 }
11560 else if (GET_CODE (addr) == MULT)
11561 {
11562 index = XEXP (addr, 0); /* index*scale */
11563 scale_rtx = XEXP (addr, 1);
11564 }
11565 else if (GET_CODE (addr) == ASHIFT)
11566 {
11567 /* We're called for lea too, which implements ashift on occasion. */
11568 index = XEXP (addr, 0);
11569 tmp = XEXP (addr, 1);
11570 if (!CONST_INT_P (tmp))
11571 return 0;
11572 scale = INTVAL (tmp);
11573 if ((unsigned HOST_WIDE_INT) scale > 3)
11574 return 0;
11575 scale = 1 << scale;
11576 retval = -1;
11577 }
11578 else
11579 disp = addr; /* displacement */
11580
11581 if (index)
11582 {
11583 if (REG_P (index))
11584 ;
11585 else if (GET_CODE (index) == SUBREG
11586 && ix86_address_subreg_operand (SUBREG_REG (index)))
11587 ;
11588 else
11589 return 0;
11590 }
11591
11592 /* Address override works only on the (%reg) part of %fs:(%reg). */
11593 if (seg != SEG_DEFAULT
11594 && ((base && GET_MODE (base) != word_mode)
11595 || (index && GET_MODE (index) != word_mode)))
11596 return 0;
11597
11598 /* Extract the integral value of scale. */
11599 if (scale_rtx)
11600 {
11601 if (!CONST_INT_P (scale_rtx))
11602 return 0;
11603 scale = INTVAL (scale_rtx);
11604 }
11605
11606 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11607 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11608
11609 /* Avoid useless 0 displacement. */
11610 if (disp == const0_rtx && (base || index))
11611 disp = NULL_RTX;
11612
11613 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11614 if (base_reg && index_reg && scale == 1
11615 && (index_reg == arg_pointer_rtx
11616 || index_reg == frame_pointer_rtx
11617 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11618 {
11619 rtx tmp;
11620 tmp = base, base = index, index = tmp;
11621 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11622 }
11623
11624 /* Special case: %ebp cannot be encoded as a base without a displacement.
11625 Similarly %r13. */
11626 if (!disp
11627 && base_reg
11628 && (base_reg == hard_frame_pointer_rtx
11629 || base_reg == frame_pointer_rtx
11630 || base_reg == arg_pointer_rtx
11631 || (REG_P (base_reg)
11632 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11633 || REGNO (base_reg) == R13_REG))))
11634 disp = const0_rtx;
11635
11636 /* Special case: on K6, [%esi] forces the instruction to be vector decoded.
11637 Avoid this by transforming to [%esi+0].
11638 Reload calls address legitimization without cfun defined, so we need
11639 to test cfun for being non-NULL. */
11640 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11641 && base_reg && !index_reg && !disp
11642 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11643 disp = const0_rtx;
11644
11645 /* Special case: encode reg+reg instead of reg*2. */
11646 if (!base && index && scale == 2)
11647 base = index, base_reg = index_reg, scale = 1;
11648
11649 /* Special case: scaling cannot be encoded without base or displacement. */
11650 if (!base && !disp && index && scale != 1)
11651 disp = const0_rtx;
11652
11653 out->base = base;
11654 out->index = index;
11655 out->disp = disp;
11656 out->scale = scale;
11657 out->seg = seg;
11658
11659 return retval;
11660 }
11661 \f
11662 /* Return cost of the memory address x.
11663 For i386, it is better to use a complex address than let gcc copy
11664 the address into a reg and make a new pseudo. But not if the address
11665 requires two regs - that would mean more pseudos with longer
11666 lifetimes. */
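/* For instance, with the scheme below an address built from two distinct
   pseudo registers costs 3, one using a single pseudo costs 2, and a purely
   symbolic address or one using only hard registers such as (%esp) costs 1
   (plus the K6 penalty described below where it applies).  */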
11667 static int
11668 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11669 {
11670 struct ix86_address parts;
11671 int cost = 1;
11672 int ok = ix86_decompose_address (x, &parts);
11673
11674 gcc_assert (ok);
11675
11676 if (parts.base && GET_CODE (parts.base) == SUBREG)
11677 parts.base = SUBREG_REG (parts.base);
11678 if (parts.index && GET_CODE (parts.index) == SUBREG)
11679 parts.index = SUBREG_REG (parts.index);
11680
11681 /* Attempt to minimize number of registers in the address. */
11682 if ((parts.base
11683 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11684 || (parts.index
11685 && (!REG_P (parts.index)
11686 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11687 cost++;
11688
11689 if (parts.base
11690 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11691 && parts.index
11692 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11693 && parts.base != parts.index)
11694 cost++;
11695
11696 /* The AMD K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11697 since its predecode logic can't detect the length of such instructions
11698 and decoding degenerates to the vector decoder. Increase the cost of such
11699 addresses here. The penalty is minimally 2 cycles. It may be worthwhile
11700 to split such addresses or even refuse them altogether.
11701
11702 Following addressing modes are affected:
11703 [base+scale*index]
11704 [scale*index+disp]
11705 [base+index]
11706
11707 The first and last cases may be avoidable by explicitly coding the zero into
11708 the memory address, but I don't have an AMD K6 machine handy to check this
11709 theory. */
11710
11711 if (TARGET_K6
11712 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11713 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11714 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11715 cost += 10;
11716
11717 return cost;
11718 }
11719 \f
11720 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11721 this is used to form addresses to local data when -fPIC is in
11722 use. */
11723
11724 static bool
11725 darwin_local_data_pic (rtx disp)
11726 {
11727 return (GET_CODE (disp) == UNSPEC
11728 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11729 }
11730
11731 /* Determine if a given RTX is a valid constant. We already know this
11732 satisfies CONSTANT_P. */
11733
11734 static bool
11735 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11736 {
11737 switch (GET_CODE (x))
11738 {
11739 case CONST:
11740 x = XEXP (x, 0);
11741
11742 if (GET_CODE (x) == PLUS)
11743 {
11744 if (!CONST_INT_P (XEXP (x, 1)))
11745 return false;
11746 x = XEXP (x, 0);
11747 }
11748
11749 if (TARGET_MACHO && darwin_local_data_pic (x))
11750 return true;
11751
11752 /* Only some unspecs are valid as "constants". */
11753 if (GET_CODE (x) == UNSPEC)
11754 switch (XINT (x, 1))
11755 {
11756 case UNSPEC_GOT:
11757 case UNSPEC_GOTOFF:
11758 case UNSPEC_PLTOFF:
11759 return TARGET_64BIT;
11760 case UNSPEC_TPOFF:
11761 case UNSPEC_NTPOFF:
11762 x = XVECEXP (x, 0, 0);
11763 return (GET_CODE (x) == SYMBOL_REF
11764 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11765 case UNSPEC_DTPOFF:
11766 x = XVECEXP (x, 0, 0);
11767 return (GET_CODE (x) == SYMBOL_REF
11768 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11769 default:
11770 return false;
11771 }
11772
11773 /* We must have drilled down to a symbol. */
11774 if (GET_CODE (x) == LABEL_REF)
11775 return true;
11776 if (GET_CODE (x) != SYMBOL_REF)
11777 return false;
11778 /* FALLTHRU */
11779
11780 case SYMBOL_REF:
11781 /* TLS symbols are never valid. */
11782 if (SYMBOL_REF_TLS_MODEL (x))
11783 return false;
11784
11785 /* DLLIMPORT symbols are never valid. */
11786 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11787 && SYMBOL_REF_DLLIMPORT_P (x))
11788 return false;
11789
11790 #if TARGET_MACHO
11791 /* mdynamic-no-pic */
11792 if (MACHO_DYNAMIC_NO_PIC_P)
11793 return machopic_symbol_defined_p (x);
11794 #endif
11795 break;
11796
11797 case CONST_DOUBLE:
11798 if (GET_MODE (x) == TImode
11799 && x != CONST0_RTX (TImode)
11800 && !TARGET_64BIT)
11801 return false;
11802 break;
11803
11804 case CONST_VECTOR:
11805 if (!standard_sse_constant_p (x))
11806 return false;
11807
11808 default:
11809 break;
11810 }
11811
11812 /* Otherwise we handle everything else in the move patterns. */
11813 return true;
11814 }
11815
11816 /* Determine if it's legal to put X into the constant pool. This
11817 is not possible for the address of thread-local symbols, which
11818 is checked above. */
11819
11820 static bool
11821 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11822 {
11823 /* We can always put integral constants and vectors in memory. */
11824 switch (GET_CODE (x))
11825 {
11826 case CONST_INT:
11827 case CONST_DOUBLE:
11828 case CONST_VECTOR:
11829 return false;
11830
11831 default:
11832 break;
11833 }
11834 return !ix86_legitimate_constant_p (mode, x);
11835 }
11836
11837
11838 /* Nonzero if the constant value X is a legitimate general operand
11839 when generating PIC code. It is given that flag_pic is on and
11840 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11841
11842 bool
11843 legitimate_pic_operand_p (rtx x)
11844 {
11845 rtx inner;
11846
11847 switch (GET_CODE (x))
11848 {
11849 case CONST:
11850 inner = XEXP (x, 0);
11851 if (GET_CODE (inner) == PLUS
11852 && CONST_INT_P (XEXP (inner, 1)))
11853 inner = XEXP (inner, 0);
11854
11855 /* Only some unspecs are valid as "constants". */
11856 if (GET_CODE (inner) == UNSPEC)
11857 switch (XINT (inner, 1))
11858 {
11859 case UNSPEC_GOT:
11860 case UNSPEC_GOTOFF:
11861 case UNSPEC_PLTOFF:
11862 return TARGET_64BIT;
11863 case UNSPEC_TPOFF:
11864 x = XVECEXP (inner, 0, 0);
11865 return (GET_CODE (x) == SYMBOL_REF
11866 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11867 case UNSPEC_MACHOPIC_OFFSET:
11868 return legitimate_pic_address_disp_p (x);
11869 default:
11870 return false;
11871 }
11872 /* FALLTHRU */
11873
11874 case SYMBOL_REF:
11875 case LABEL_REF:
11876 return legitimate_pic_address_disp_p (x);
11877
11878 default:
11879 return true;
11880 }
11881 }
11882
11883 /* Determine if a given CONST RTX is a valid memory displacement
11884 in PIC mode. */
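/* For instance, in 32-bit PIC code the displacement
   (const (unspec [(symbol_ref "x")] UNSPEC_GOTOFF)) -- i.e. x@GOTOFF for a
   suitably local symbol -- is accepted below, whereas a bare TLS symbol_ref
   or a PLUS not wrapped in CONST is not.  */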
11885
11886 bool
11887 legitimate_pic_address_disp_p (rtx disp)
11888 {
11889 bool saw_plus;
11890
11891 /* In 64bit mode we can allow direct addresses of symbols and labels
11892 when they are not dynamic symbols. */
11893 if (TARGET_64BIT)
11894 {
11895 rtx op0 = disp, op1;
11896
11897 switch (GET_CODE (disp))
11898 {
11899 case LABEL_REF:
11900 return true;
11901
11902 case CONST:
11903 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11904 break;
11905 op0 = XEXP (XEXP (disp, 0), 0);
11906 op1 = XEXP (XEXP (disp, 0), 1);
11907 if (!CONST_INT_P (op1)
11908 || INTVAL (op1) >= 16*1024*1024
11909 || INTVAL (op1) < -16*1024*1024)
11910 break;
11911 if (GET_CODE (op0) == LABEL_REF)
11912 return true;
11913 if (GET_CODE (op0) == CONST
11914 && GET_CODE (XEXP (op0, 0)) == UNSPEC
11915 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
11916 return true;
11917 if (GET_CODE (op0) == UNSPEC
11918 && XINT (op0, 1) == UNSPEC_PCREL)
11919 return true;
11920 if (GET_CODE (op0) != SYMBOL_REF)
11921 break;
11922 /* FALLTHRU */
11923
11924 case SYMBOL_REF:
11925 /* TLS references should always be enclosed in UNSPEC. */
11926 if (SYMBOL_REF_TLS_MODEL (op0))
11927 return false;
11928 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11929 && ix86_cmodel != CM_LARGE_PIC)
11930 return true;
11931 break;
11932
11933 default:
11934 break;
11935 }
11936 }
11937 if (GET_CODE (disp) != CONST)
11938 return false;
11939 disp = XEXP (disp, 0);
11940
11941 if (TARGET_64BIT)
11942 {
11943 /* It is unsafe to allow PLUS expressions; this limits the allowed
11944 distance of GOT references. We should not need these anyway. */
11945 if (GET_CODE (disp) != UNSPEC
11946 || (XINT (disp, 1) != UNSPEC_GOTPCREL
11947 && XINT (disp, 1) != UNSPEC_GOTOFF
11948 && XINT (disp, 1) != UNSPEC_PCREL
11949 && XINT (disp, 1) != UNSPEC_PLTOFF))
11950 return false;
11951
11952 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11953 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11954 return false;
11955 return true;
11956 }
11957
11958 saw_plus = false;
11959 if (GET_CODE (disp) == PLUS)
11960 {
11961 if (!CONST_INT_P (XEXP (disp, 1)))
11962 return false;
11963 disp = XEXP (disp, 0);
11964 saw_plus = true;
11965 }
11966
11967 if (TARGET_MACHO && darwin_local_data_pic (disp))
11968 return true;
11969
11970 if (GET_CODE (disp) != UNSPEC)
11971 return false;
11972
11973 switch (XINT (disp, 1))
11974 {
11975 case UNSPEC_GOT:
11976 if (saw_plus)
11977 return false;
11978 /* We need to check for both symbols and labels because VxWorks loads
11979 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
11980 details. */
11981 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11982 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
11983 case UNSPEC_GOTOFF:
11984 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
11985 While the ABI also specifies a 32bit relocation, we don't produce it in
11986 the small PIC model at all. */
11987 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11988 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
11989 && !TARGET_64BIT)
11990 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
11991 return false;
11992 case UNSPEC_GOTTPOFF:
11993 case UNSPEC_GOTNTPOFF:
11994 case UNSPEC_INDNTPOFF:
11995 if (saw_plus)
11996 return false;
11997 disp = XVECEXP (disp, 0, 0);
11998 return (GET_CODE (disp) == SYMBOL_REF
11999 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12000 case UNSPEC_NTPOFF:
12001 disp = XVECEXP (disp, 0, 0);
12002 return (GET_CODE (disp) == SYMBOL_REF
12003 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12004 case UNSPEC_DTPOFF:
12005 disp = XVECEXP (disp, 0, 0);
12006 return (GET_CODE (disp) == SYMBOL_REF
12007 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12008 }
12009
12010 return false;
12011 }
12012
12013 /* Recognizes RTL expressions that are valid memory addresses for an
12014 instruction. The MODE argument is the machine mode for the MEM
12015 expression that wants to use this address.
12016
12017 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12018 convert common non-canonical forms to canonical form so that they will
12019 be recognized. */
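/* For example, (plus (reg A) (const_int 8)) and
   (plus (reg A) (mult (reg B) (const_int 4))) are accepted below (given
   suitable registers), while a scale of 3 or a scale without an index is
   rejected.  */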
12020
12021 static bool
12022 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12023 rtx addr, bool strict)
12024 {
12025 struct ix86_address parts;
12026 rtx base, index, disp;
12027 HOST_WIDE_INT scale;
12028
12029 /* Since a constant address in x32 is sign-extended to 64bit,
12030 we have to reject addresses from 0x80000000 to 0xffffffff. */
12031 if (TARGET_X32
12032 && CONST_INT_P (addr)
12033 && INTVAL (addr) < 0)
12034 return false;
12035
12036 if (ix86_decompose_address (addr, &parts) <= 0)
12037 /* Decomposition failed. */
12038 return false;
12039
12040 base = parts.base;
12041 index = parts.index;
12042 disp = parts.disp;
12043 scale = parts.scale;
12044
12045 /* Validate base register. */
12046 if (base)
12047 {
12048 rtx reg;
12049
12050 if (REG_P (base))
12051 reg = base;
12052 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12053 reg = SUBREG_REG (base);
12054 else
12055 /* Base is not a register. */
12056 return false;
12057
12058 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12059 return false;
12060
12061 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12062 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12063 /* Base is not valid. */
12064 return false;
12065 }
12066
12067 /* Validate index register. */
12068 if (index)
12069 {
12070 rtx reg;
12071
12072 if (REG_P (index))
12073 reg = index;
12074 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12075 reg = SUBREG_REG (index);
12076 else
12077 /* Index is not a register. */
12078 return false;
12079
12080 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12081 return false;
12082
12083 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12084 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12085 /* Index is not valid. */
12086 return false;
12087 }
12088
12089 /* Index and base should have the same mode. */
12090 if (base && index
12091 && GET_MODE (base) != GET_MODE (index))
12092 return false;
12093
12094 /* Validate scale factor. */
12095 if (scale != 1)
12096 {
12097 if (!index)
12098 /* Scale without index. */
12099 return false;
12100
12101 if (scale != 2 && scale != 4 && scale != 8)
12102 /* Scale is not a valid multiplier. */
12103 return false;
12104 }
12105
12106 /* Validate displacement. */
12107 if (disp)
12108 {
12109 if (GET_CODE (disp) == CONST
12110 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12111 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12112 switch (XINT (XEXP (disp, 0), 1))
12113 {
12114 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12115 used. While the ABI also specifies 32bit relocations, we don't produce
12116 them at all and use IP-relative addressing instead. */
12117 case UNSPEC_GOT:
12118 case UNSPEC_GOTOFF:
12119 gcc_assert (flag_pic);
12120 if (!TARGET_64BIT)
12121 goto is_legitimate_pic;
12122
12123 /* 64bit address unspec. */
12124 return false;
12125
12126 case UNSPEC_GOTPCREL:
12127 case UNSPEC_PCREL:
12128 gcc_assert (flag_pic);
12129 goto is_legitimate_pic;
12130
12131 case UNSPEC_GOTTPOFF:
12132 case UNSPEC_GOTNTPOFF:
12133 case UNSPEC_INDNTPOFF:
12134 case UNSPEC_NTPOFF:
12135 case UNSPEC_DTPOFF:
12136 break;
12137
12138 case UNSPEC_STACK_CHECK:
12139 gcc_assert (flag_split_stack);
12140 break;
12141
12142 default:
12143 /* Invalid address unspec. */
12144 return false;
12145 }
12146
12147 else if (SYMBOLIC_CONST (disp)
12148 && (flag_pic
12149 || (TARGET_MACHO
12150 #if TARGET_MACHO
12151 && MACHOPIC_INDIRECT
12152 && !machopic_operand_p (disp)
12153 #endif
12154 )))
12155 {
12156
12157 is_legitimate_pic:
12158 if (TARGET_64BIT && (index || base))
12159 {
12160 /* foo@dtpoff(%rX) is ok. */
12161 if (GET_CODE (disp) != CONST
12162 || GET_CODE (XEXP (disp, 0)) != PLUS
12163 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12164 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12165 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12166 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12167 /* Non-constant pic memory reference. */
12168 return false;
12169 }
12170 else if ((!TARGET_MACHO || flag_pic)
12171 && ! legitimate_pic_address_disp_p (disp))
12172 /* Displacement is an invalid pic construct. */
12173 return false;
12174 #if TARGET_MACHO
12175 else if (MACHO_DYNAMIC_NO_PIC_P
12176 && !ix86_legitimate_constant_p (Pmode, disp))
12177 /* Displacement must be referenced via a non_lazy_pointer. */
12178 return false;
12179 #endif
12180
12181 /* This code used to verify that a symbolic pic displacement
12182 includes the pic_offset_table_rtx register.
12183
 12184 	     While this is a good idea, unfortunately these constructs may
 12185 	     be created by the "adds using lea" optimization for incorrect
 12186 	     code like:
12187
12188 int a;
12189 int foo(int i)
12190 {
12191 return *(&a+i);
12192 }
12193
 12194 	     This code is nonsensical, but results in addressing the
 12195 	     GOT table with pic_offset_table_rtx as the base.  We can't
 12196 	     easily refuse it, since it gets matched by the
 12197 	     "addsi3" pattern, which later gets split to lea in case
 12198 	     the output register differs from the input.  While this
 12199 	     could be handled by a separate addsi pattern for this case
 12200 	     that never results in lea, disabling this test seems to be
 12201 	     the easier and correct fix for the crash.  */
12202 }
12203 else if (GET_CODE (disp) != LABEL_REF
12204 && !CONST_INT_P (disp)
12205 && (GET_CODE (disp) != CONST
12206 || !ix86_legitimate_constant_p (Pmode, disp))
12207 && (GET_CODE (disp) != SYMBOL_REF
12208 || !ix86_legitimate_constant_p (Pmode, disp)))
12209 /* Displacement is not constant. */
12210 return false;
12211 else if (TARGET_64BIT
12212 && !x86_64_immediate_operand (disp, VOIDmode))
12213 /* Displacement is out of range. */
12214 return false;
12215 }
12216
12217 /* Everything looks valid. */
12218 return true;
12219 }
12220
12221 /* Determine if a given RTX is a valid constant address. */
12222
12223 bool
12224 constant_address_p (rtx x)
12225 {
12226 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12227 }
12228 \f
12229 /* Return a unique alias set for the GOT. */
12230
12231 static alias_set_type
12232 ix86_GOT_alias_set (void)
12233 {
12234 static alias_set_type set = -1;
12235 if (set == -1)
12236 set = new_alias_set ();
12237 return set;
12238 }
12239
12240 /* Return a legitimate reference for ORIG (an address) using the
12241 register REG. If REG is 0, a new pseudo is generated.
12242
12243 There are two types of references that must be handled:
12244
12245 1. Global data references must load the address from the GOT, via
12246 the PIC reg. An insn is emitted to do this load, and the reg is
12247 returned.
12248
12249 2. Static data references, constant pool addresses, and code labels
12250 compute the address as an offset from the GOT, whose base is in
12251 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12252 differentiate them from global data objects. The returned
12253 address is the PIC reg + an unspec constant.
12254
12255 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12256 reg also appears in the address. */
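/* Illustrative example (added note): with 32-bit PIC code a global "foo"
   is typically loaded from foo@GOT(%ebx), while a file-local "bar" is
   addressed as bar@GOTOFF(%ebx); the UNSPEC_GOT and UNSPEC_GOTOFF
   constants built below print as exactly those relocations (see
   output_pic_addr_const later in this file).  */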
12257
12258 static rtx
12259 legitimize_pic_address (rtx orig, rtx reg)
12260 {
12261 rtx addr = orig;
12262 rtx new_rtx = orig;
12263 rtx base;
12264
12265 #if TARGET_MACHO
12266 if (TARGET_MACHO && !TARGET_64BIT)
12267 {
12268 if (reg == 0)
12269 reg = gen_reg_rtx (Pmode);
12270 /* Use the generic Mach-O PIC machinery. */
12271 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12272 }
12273 #endif
12274
12275 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12276 new_rtx = addr;
12277 else if (TARGET_64BIT
12278 && ix86_cmodel != CM_SMALL_PIC
12279 && gotoff_operand (addr, Pmode))
12280 {
12281 rtx tmpreg;
12282 /* This symbol may be referenced via a displacement from the PIC
12283 base address (@GOTOFF). */
12284
12285 if (reload_in_progress)
12286 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12287 if (GET_CODE (addr) == CONST)
12288 addr = XEXP (addr, 0);
12289 if (GET_CODE (addr) == PLUS)
12290 {
12291 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12292 UNSPEC_GOTOFF);
12293 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12294 }
12295 else
12296 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12297 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12298 if (!reg)
12299 tmpreg = gen_reg_rtx (Pmode);
12300 else
12301 tmpreg = reg;
12302 emit_move_insn (tmpreg, new_rtx);
12303
12304 if (reg != 0)
12305 {
12306 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12307 tmpreg, 1, OPTAB_DIRECT);
12308 new_rtx = reg;
12309 }
12310 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12311 }
12312 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12313 {
12314 /* This symbol may be referenced via a displacement from the PIC
12315 base address (@GOTOFF). */
12316
12317 if (reload_in_progress)
12318 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12319 if (GET_CODE (addr) == CONST)
12320 addr = XEXP (addr, 0);
12321 if (GET_CODE (addr) == PLUS)
12322 {
12323 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12324 UNSPEC_GOTOFF);
12325 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12326 }
12327 else
12328 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12329 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12330 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12331
12332 if (reg != 0)
12333 {
12334 emit_move_insn (reg, new_rtx);
12335 new_rtx = reg;
12336 }
12337 }
12338 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12339 /* We can't use @GOTOFF for text labels on VxWorks;
12340 see gotoff_operand. */
12341 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12342 {
12343 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12344 {
12345 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12346 return legitimize_dllimport_symbol (addr, true);
12347 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12348 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12349 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12350 {
12351 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12352 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12353 }
12354 }
12355
 12356 	  /* For x64 PE-COFF there is no GOT table, so we use the address
 12357 	     directly.  */
12358 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12359 {
12360 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12361 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12362
12363 if (reg == 0)
12364 reg = gen_reg_rtx (Pmode);
12365 emit_move_insn (reg, new_rtx);
12366 new_rtx = reg;
12367 }
12368 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12369 {
12370 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12371 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12372 new_rtx = gen_const_mem (Pmode, new_rtx);
12373 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12374
12375 if (reg == 0)
12376 reg = gen_reg_rtx (Pmode);
 12377 	  /* Use gen_movsi directly, otherwise the address is loaded
 12378 	     into a register for CSE.  We don't want to CSE these addresses;
 12379 	     instead we CSE addresses from the GOT table, so skip this.  */
12380 emit_insn (gen_movsi (reg, new_rtx));
12381 new_rtx = reg;
12382 }
12383 else
12384 {
12385 /* This symbol must be referenced via a load from the
12386 Global Offset Table (@GOT). */
12387
12388 if (reload_in_progress)
12389 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12390 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12391 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12392 if (TARGET_64BIT)
12393 new_rtx = force_reg (Pmode, new_rtx);
12394 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12395 new_rtx = gen_const_mem (Pmode, new_rtx);
12396 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12397
12398 if (reg == 0)
12399 reg = gen_reg_rtx (Pmode);
12400 emit_move_insn (reg, new_rtx);
12401 new_rtx = reg;
12402 }
12403 }
12404 else
12405 {
12406 if (CONST_INT_P (addr)
12407 && !x86_64_immediate_operand (addr, VOIDmode))
12408 {
12409 if (reg)
12410 {
12411 emit_move_insn (reg, addr);
12412 new_rtx = reg;
12413 }
12414 else
12415 new_rtx = force_reg (Pmode, addr);
12416 }
12417 else if (GET_CODE (addr) == CONST)
12418 {
12419 addr = XEXP (addr, 0);
12420
12421 /* We must match stuff we generate before. Assume the only
12422 unspecs that can get here are ours. Not that we could do
12423 anything with them anyway.... */
12424 if (GET_CODE (addr) == UNSPEC
12425 || (GET_CODE (addr) == PLUS
12426 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12427 return orig;
12428 gcc_assert (GET_CODE (addr) == PLUS);
12429 }
12430 if (GET_CODE (addr) == PLUS)
12431 {
12432 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12433
12434 /* Check first to see if this is a constant offset from a @GOTOFF
12435 symbol reference. */
12436 if (gotoff_operand (op0, Pmode)
12437 && CONST_INT_P (op1))
12438 {
12439 if (!TARGET_64BIT)
12440 {
12441 if (reload_in_progress)
12442 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12443 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12444 UNSPEC_GOTOFF);
12445 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12446 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12447 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12448
12449 if (reg != 0)
12450 {
12451 emit_move_insn (reg, new_rtx);
12452 new_rtx = reg;
12453 }
12454 }
12455 else
12456 {
12457 if (INTVAL (op1) < -16*1024*1024
12458 || INTVAL (op1) >= 16*1024*1024)
12459 {
12460 if (!x86_64_immediate_operand (op1, Pmode))
12461 op1 = force_reg (Pmode, op1);
12462 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12463 }
12464 }
12465 }
12466 else
12467 {
12468 base = legitimize_pic_address (XEXP (addr, 0), reg);
12469 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12470 base == reg ? NULL_RTX : reg);
12471
12472 if (CONST_INT_P (new_rtx))
12473 new_rtx = plus_constant (base, INTVAL (new_rtx));
12474 else
12475 {
12476 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12477 {
12478 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12479 new_rtx = XEXP (new_rtx, 1);
12480 }
12481 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12482 }
12483 }
12484 }
12485 }
12486 return new_rtx;
12487 }
12488 \f
12489 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12490
12491 static rtx
12492 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12493 {
12494 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12495
12496 if (GET_MODE (tp) != tp_mode)
12497 {
12498 gcc_assert (GET_MODE (tp) == SImode);
12499 gcc_assert (tp_mode == DImode);
12500
12501 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12502 }
12503
12504 if (to_reg)
12505 tp = copy_to_mode_reg (tp_mode, tp);
12506
12507 return tp;
12508 }
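/* Note (illustrative, assuming the usual GNU/Linux TLS conventions): the
   UNSPEC_TP built above ends up printed as a segment-relative access,
   %gs:0 for 32-bit code and %fs:0 for 64-bit code, which is where the
   thread pointer lives.  */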
12509
12510 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12511
12512 static GTY(()) rtx ix86_tls_symbol;
12513
12514 static rtx
12515 ix86_tls_get_addr (void)
12516 {
12517 if (!ix86_tls_symbol)
12518 {
12519 const char *sym
12520 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12521 ? "___tls_get_addr" : "__tls_get_addr");
12522
12523 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12524 }
12525
12526 return ix86_tls_symbol;
12527 }
12528
12529 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12530
12531 static GTY(()) rtx ix86_tls_module_base_symbol;
12532
12533 rtx
12534 ix86_tls_module_base (void)
12535 {
12536 if (!ix86_tls_module_base_symbol)
12537 {
12538 ix86_tls_module_base_symbol
12539 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12540
12541 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12542 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12543 }
12544
12545 return ix86_tls_module_base_symbol;
12546 }
12547
12548 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12549 false if we expect this to be used for a memory address and true if
12550 we expect to load the address into a register. */
12551
12552 static rtx
12553 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12554 {
12555 rtx dest, base, off;
12556 rtx pic = NULL_RTX, tp = NULL_RTX;
12557 enum machine_mode tp_mode = Pmode;
12558 int type;
12559
12560 switch (model)
12561 {
12562 case TLS_MODEL_GLOBAL_DYNAMIC:
12563 dest = gen_reg_rtx (Pmode);
12564
12565 if (!TARGET_64BIT)
12566 {
12567 if (flag_pic)
12568 pic = pic_offset_table_rtx;
12569 else
12570 {
12571 pic = gen_reg_rtx (Pmode);
12572 emit_insn (gen_set_got (pic));
12573 }
12574 }
12575
12576 if (TARGET_GNU2_TLS)
12577 {
12578 if (TARGET_64BIT)
12579 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12580 else
12581 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12582
12583 tp = get_thread_pointer (Pmode, true);
12584 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12585
12586 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12587 }
12588 else
12589 {
12590 rtx caddr = ix86_tls_get_addr ();
12591
12592 if (TARGET_64BIT)
12593 {
12594 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12595
12596 start_sequence ();
12597 emit_call_insn (ix86_gen_tls_global_dynamic_64 (rax, x,
12598 caddr));
12599 insns = get_insns ();
12600 end_sequence ();
12601
12602 RTL_CONST_CALL_P (insns) = 1;
12603 emit_libcall_block (insns, dest, rax, x);
12604 }
12605 else
12606 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12607 }
12608 break;
12609
12610 case TLS_MODEL_LOCAL_DYNAMIC:
12611 base = gen_reg_rtx (Pmode);
12612
12613 if (!TARGET_64BIT)
12614 {
12615 if (flag_pic)
12616 pic = pic_offset_table_rtx;
12617 else
12618 {
12619 pic = gen_reg_rtx (Pmode);
12620 emit_insn (gen_set_got (pic));
12621 }
12622 }
12623
12624 if (TARGET_GNU2_TLS)
12625 {
12626 rtx tmp = ix86_tls_module_base ();
12627
12628 if (TARGET_64BIT)
12629 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12630 else
12631 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12632
12633 tp = get_thread_pointer (Pmode, true);
12634 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12635 gen_rtx_MINUS (Pmode, tmp, tp));
12636 }
12637 else
12638 {
12639 rtx caddr = ix86_tls_get_addr ();
12640
12641 if (TARGET_64BIT)
12642 {
12643 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12644
12645 start_sequence ();
12646 emit_call_insn (ix86_gen_tls_local_dynamic_base_64 (rax,
12647 caddr));
12648 insns = get_insns ();
12649 end_sequence ();
12650
12651 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12652 share the LD_BASE result with other LD model accesses. */
12653 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12654 UNSPEC_TLS_LD_BASE);
12655
12656 RTL_CONST_CALL_P (insns) = 1;
12657 emit_libcall_block (insns, base, rax, eqv);
12658 }
12659 else
12660 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12661 }
12662
12663 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12664 off = gen_rtx_CONST (Pmode, off);
12665
12666 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12667
12668 if (TARGET_GNU2_TLS)
12669 {
12670 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12671
12672 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12673 }
12674 break;
12675
12676 case TLS_MODEL_INITIAL_EXEC:
12677 if (TARGET_64BIT)
12678 {
12679 if (TARGET_SUN_TLS)
12680 {
 12681 	      /* The Sun linker took the AMD64 TLS spec literally
 12682 		 and can only handle %rax as the destination of the
 12683 		 initial exec code sequence.  */
12684
12685 dest = gen_reg_rtx (Pmode);
12686 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12687 return dest;
12688 }
12689
 12690 	  /* Generate DImode references to avoid %fs:(%reg32)
 12691 	     problems and the linker IE->LE relaxation bug.  */
12692 tp_mode = DImode;
12693 pic = NULL;
12694 type = UNSPEC_GOTNTPOFF;
12695 }
12696 else if (flag_pic)
12697 {
12698 if (reload_in_progress)
12699 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12700 pic = pic_offset_table_rtx;
12701 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12702 }
12703 else if (!TARGET_ANY_GNU_TLS)
12704 {
12705 pic = gen_reg_rtx (Pmode);
12706 emit_insn (gen_set_got (pic));
12707 type = UNSPEC_GOTTPOFF;
12708 }
12709 else
12710 {
12711 pic = NULL;
12712 type = UNSPEC_INDNTPOFF;
12713 }
12714
12715 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12716 off = gen_rtx_CONST (tp_mode, off);
12717 if (pic)
12718 off = gen_rtx_PLUS (tp_mode, pic, off);
12719 off = gen_const_mem (tp_mode, off);
12720 set_mem_alias_set (off, ix86_GOT_alias_set ());
12721
12722 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12723 {
12724 base = get_thread_pointer (tp_mode,
12725 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12726 off = force_reg (tp_mode, off);
12727 return gen_rtx_PLUS (tp_mode, base, off);
12728 }
12729 else
12730 {
12731 base = get_thread_pointer (Pmode, true);
12732 dest = gen_reg_rtx (Pmode);
12733 emit_insn (ix86_gen_sub3 (dest, base, off));
12734 }
12735 break;
12736
12737 case TLS_MODEL_LOCAL_EXEC:
12738 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12739 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12740 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12741 off = gen_rtx_CONST (Pmode, off);
12742
12743 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12744 {
12745 base = get_thread_pointer (Pmode,
12746 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12747 return gen_rtx_PLUS (Pmode, base, off);
12748 }
12749 else
12750 {
12751 base = get_thread_pointer (Pmode, true);
12752 dest = gen_reg_rtx (Pmode);
12753 emit_insn (ix86_gen_sub3 (dest, base, off));
12754 }
12755 break;
12756
12757 default:
12758 gcc_unreachable ();
12759 }
12760
12761 return dest;
12762 }
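/* Summary of the sequences generated above (illustrative, for the common
   GNU/Linux case): global-dynamic and local-dynamic call __tls_get_addr
   with a @tlsgd/@tlsld style argument, initial-exec loads the offset from
   the GOT (@gottpoff/@gotntpoff) and adds the thread pointer, and
   local-exec folds the @tpoff/@ntpoff offset directly into a
   thread-pointer-relative address.  */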
12763
12764 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12765 to symbol DECL. */
12766
12767 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12768 htab_t dllimport_map;
12769
12770 static tree
12771 get_dllimport_decl (tree decl)
12772 {
12773 struct tree_map *h, in;
12774 void **loc;
12775 const char *name;
12776 const char *prefix;
12777 size_t namelen, prefixlen;
12778 char *imp_name;
12779 tree to;
12780 rtx rtl;
12781
12782 if (!dllimport_map)
12783 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12784
12785 in.hash = htab_hash_pointer (decl);
12786 in.base.from = decl;
12787 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12788 h = (struct tree_map *) *loc;
12789 if (h)
12790 return h->to;
12791
12792 *loc = h = ggc_alloc_tree_map ();
12793 h->hash = in.hash;
12794 h->base.from = decl;
12795 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12796 VAR_DECL, NULL, ptr_type_node);
12797 DECL_ARTIFICIAL (to) = 1;
12798 DECL_IGNORED_P (to) = 1;
12799 DECL_EXTERNAL (to) = 1;
12800 TREE_READONLY (to) = 1;
12801
12802 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12803 name = targetm.strip_name_encoding (name);
12804 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12805 ? "*__imp_" : "*__imp__";
12806 namelen = strlen (name);
12807 prefixlen = strlen (prefix);
12808 imp_name = (char *) alloca (namelen + prefixlen + 1);
12809 memcpy (imp_name, prefix, prefixlen);
12810 memcpy (imp_name + prefixlen, name, namelen + 1);
12811
12812 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12813 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12814 SET_SYMBOL_REF_DECL (rtl, to);
12815 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12816
12817 rtl = gen_const_mem (Pmode, rtl);
12818 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12819
12820 SET_DECL_RTL (to, rtl);
12821 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12822
12823 return to;
12824 }
12825
12826 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12827 true if we require the result be a register. */
12828
12829 static rtx
12830 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12831 {
12832 tree imp_decl;
12833 rtx x;
12834
12835 gcc_assert (SYMBOL_REF_DECL (symbol));
12836 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12837
12838 x = DECL_RTL (imp_decl);
12839 if (want_reg)
12840 x = force_reg (Pmode, x);
12841 return x;
12842 }
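/* Example (illustrative): for a dllimport'ed variable "foo" the two
   functions above rewrite references into a load through the import
   stub "__imp__foo" (or "__imp_foo" when there is no user label prefix),
   i.e. the address is fetched from the DLL import table at run time.  */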
12843
12844 /* Try machine-dependent ways of modifying an illegitimate address
12845 to be legitimate. If we find one, return the new, valid address.
 12846    This function is used in only one place: `memory_address' in explow.c.
12847
12848 OLDX is the address as it was before break_out_memory_refs was called.
12849 In some cases it is useful to look at this to decide what needs to be done.
12850
 12851    It is always safe for this function to do nothing.  It exists to recognize
12852 opportunities to optimize the output.
12853
12854 For the 80386, we handle X+REG by loading X into a register R and
12855 using R+REG. R will go in a general reg and indexing will be used.
12856 However, if REG is a broken-out memory address or multiplication,
12857 nothing needs to be done because REG can certainly go in a general reg.
12858
12859 When -fpic is used, special handling is needed for symbolic references.
12860 See comments by legitimize_pic_address in i386.c for details. */
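/* For example (added illustration): an address such as
   (plus (ashift (reg) (const_int 2)) (reg)) is rewritten below into
   (plus (mult (reg) (const_int 4)) (reg)) so that it matches the
   scaled-index form accepted by ix86_legitimate_address_p.  */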
12861
12862 static rtx
12863 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12864 enum machine_mode mode)
12865 {
12866 int changed = 0;
12867 unsigned log;
12868
12869 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12870 if (log)
12871 return legitimize_tls_address (x, (enum tls_model) log, false);
12872 if (GET_CODE (x) == CONST
12873 && GET_CODE (XEXP (x, 0)) == PLUS
12874 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12875 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12876 {
12877 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12878 (enum tls_model) log, false);
12879 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12880 }
12881
12882 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12883 {
12884 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12885 return legitimize_dllimport_symbol (x, true);
12886 if (GET_CODE (x) == CONST
12887 && GET_CODE (XEXP (x, 0)) == PLUS
12888 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12889 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12890 {
12891 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12892 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12893 }
12894 }
12895
12896 if (flag_pic && SYMBOLIC_CONST (x))
12897 return legitimize_pic_address (x, 0);
12898
12899 #if TARGET_MACHO
12900 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12901 return machopic_indirect_data_reference (x, 0);
12902 #endif
12903
 12904   /* Canonicalize shifts by 0, 1, 2, 3 into multiply.  */
12905 if (GET_CODE (x) == ASHIFT
12906 && CONST_INT_P (XEXP (x, 1))
12907 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
12908 {
12909 changed = 1;
12910 log = INTVAL (XEXP (x, 1));
12911 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
12912 GEN_INT (1 << log));
12913 }
12914
12915 if (GET_CODE (x) == PLUS)
12916 {
12917 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12918
12919 if (GET_CODE (XEXP (x, 0)) == ASHIFT
12920 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12921 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
12922 {
12923 changed = 1;
12924 log = INTVAL (XEXP (XEXP (x, 0), 1));
12925 XEXP (x, 0) = gen_rtx_MULT (Pmode,
12926 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
12927 GEN_INT (1 << log));
12928 }
12929
12930 if (GET_CODE (XEXP (x, 1)) == ASHIFT
12931 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
12932 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
12933 {
12934 changed = 1;
12935 log = INTVAL (XEXP (XEXP (x, 1), 1));
12936 XEXP (x, 1) = gen_rtx_MULT (Pmode,
12937 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
12938 GEN_INT (1 << log));
12939 }
12940
12941 /* Put multiply first if it isn't already. */
12942 if (GET_CODE (XEXP (x, 1)) == MULT)
12943 {
12944 rtx tmp = XEXP (x, 0);
12945 XEXP (x, 0) = XEXP (x, 1);
12946 XEXP (x, 1) = tmp;
12947 changed = 1;
12948 }
12949
12950 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
12951 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
12952 created by virtual register instantiation, register elimination, and
12953 similar optimizations. */
12954 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
12955 {
12956 changed = 1;
12957 x = gen_rtx_PLUS (Pmode,
12958 gen_rtx_PLUS (Pmode, XEXP (x, 0),
12959 XEXP (XEXP (x, 1), 0)),
12960 XEXP (XEXP (x, 1), 1));
12961 }
12962
12963 /* Canonicalize
12964 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
12965 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
12966 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
12967 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12968 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
12969 && CONSTANT_P (XEXP (x, 1)))
12970 {
12971 rtx constant;
12972 rtx other = NULL_RTX;
12973
12974 if (CONST_INT_P (XEXP (x, 1)))
12975 {
12976 constant = XEXP (x, 1);
12977 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
12978 }
12979 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
12980 {
12981 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
12982 other = XEXP (x, 1);
12983 }
12984 else
12985 constant = 0;
12986
12987 if (constant)
12988 {
12989 changed = 1;
12990 x = gen_rtx_PLUS (Pmode,
12991 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
12992 XEXP (XEXP (XEXP (x, 0), 1), 0)),
12993 plus_constant (other, INTVAL (constant)));
12994 }
12995 }
12996
12997 if (changed && ix86_legitimate_address_p (mode, x, false))
12998 return x;
12999
13000 if (GET_CODE (XEXP (x, 0)) == MULT)
13001 {
13002 changed = 1;
13003 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13004 }
13005
13006 if (GET_CODE (XEXP (x, 1)) == MULT)
13007 {
13008 changed = 1;
13009 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13010 }
13011
13012 if (changed
13013 && REG_P (XEXP (x, 1))
13014 && REG_P (XEXP (x, 0)))
13015 return x;
13016
13017 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13018 {
13019 changed = 1;
13020 x = legitimize_pic_address (x, 0);
13021 }
13022
13023 if (changed && ix86_legitimate_address_p (mode, x, false))
13024 return x;
13025
13026 if (REG_P (XEXP (x, 0)))
13027 {
13028 rtx temp = gen_reg_rtx (Pmode);
13029 rtx val = force_operand (XEXP (x, 1), temp);
13030 if (val != temp)
13031 {
13032 if (GET_MODE (val) != Pmode)
13033 val = convert_to_mode (Pmode, val, 1);
13034 emit_move_insn (temp, val);
13035 }
13036
13037 XEXP (x, 1) = temp;
13038 return x;
13039 }
13040
13041 else if (REG_P (XEXP (x, 1)))
13042 {
13043 rtx temp = gen_reg_rtx (Pmode);
13044 rtx val = force_operand (XEXP (x, 0), temp);
13045 if (val != temp)
13046 {
13047 if (GET_MODE (val) != Pmode)
13048 val = convert_to_mode (Pmode, val, 1);
13049 emit_move_insn (temp, val);
13050 }
13051
13052 XEXP (x, 0) = temp;
13053 return x;
13054 }
13055 }
13056
13057 return x;
13058 }
13059 \f
13060 /* Print an integer constant expression in assembler syntax. Addition
13061 and subtraction are the only arithmetic that may appear in these
13062 expressions. FILE is the stdio stream to write to, X is the rtx, and
13063 CODE is the operand print code from the output string. */
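/* For instance (illustrative): a SYMBOL_REF wrapped in UNSPEC_GOTOFF
   prints as "foo@GOTOFF", UNSPEC_GOTPCREL prints as "foo@GOTPCREL(%rip)"
   in AT&T syntax, and code 'P' appends "@PLT" to non-local symbols.  */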
13064
13065 static void
13066 output_pic_addr_const (FILE *file, rtx x, int code)
13067 {
13068 char buf[256];
13069
13070 switch (GET_CODE (x))
13071 {
13072 case PC:
13073 gcc_assert (flag_pic);
13074 putc ('.', file);
13075 break;
13076
13077 case SYMBOL_REF:
13078 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13079 output_addr_const (file, x);
13080 else
13081 {
13082 const char *name = XSTR (x, 0);
13083
13084 /* Mark the decl as referenced so that cgraph will
13085 output the function. */
13086 if (SYMBOL_REF_DECL (x))
13087 mark_decl_referenced (SYMBOL_REF_DECL (x));
13088
13089 #if TARGET_MACHO
13090 if (MACHOPIC_INDIRECT
13091 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13092 name = machopic_indirection_name (x, /*stub_p=*/true);
13093 #endif
13094 assemble_name (file, name);
13095 }
13096 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13097 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13098 fputs ("@PLT", file);
13099 break;
13100
13101 case LABEL_REF:
13102 x = XEXP (x, 0);
13103 /* FALLTHRU */
13104 case CODE_LABEL:
13105 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13106 assemble_name (asm_out_file, buf);
13107 break;
13108
13109 case CONST_INT:
13110 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13111 break;
13112
13113 case CONST:
13114 /* This used to output parentheses around the expression,
13115 but that does not work on the 386 (either ATT or BSD assembler). */
13116 output_pic_addr_const (file, XEXP (x, 0), code);
13117 break;
13118
13119 case CONST_DOUBLE:
13120 if (GET_MODE (x) == VOIDmode)
13121 {
13122 /* We can use %d if the number is <32 bits and positive. */
13123 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13124 fprintf (file, "0x%lx%08lx",
13125 (unsigned long) CONST_DOUBLE_HIGH (x),
13126 (unsigned long) CONST_DOUBLE_LOW (x));
13127 else
13128 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13129 }
13130 else
13131 /* We can't handle floating point constants;
13132 TARGET_PRINT_OPERAND must handle them. */
13133 output_operand_lossage ("floating constant misused");
13134 break;
13135
13136 case PLUS:
13137 /* Some assemblers need integer constants to appear first. */
13138 if (CONST_INT_P (XEXP (x, 0)))
13139 {
13140 output_pic_addr_const (file, XEXP (x, 0), code);
13141 putc ('+', file);
13142 output_pic_addr_const (file, XEXP (x, 1), code);
13143 }
13144 else
13145 {
13146 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13147 output_pic_addr_const (file, XEXP (x, 1), code);
13148 putc ('+', file);
13149 output_pic_addr_const (file, XEXP (x, 0), code);
13150 }
13151 break;
13152
13153 case MINUS:
13154 if (!TARGET_MACHO)
13155 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13156 output_pic_addr_const (file, XEXP (x, 0), code);
13157 putc ('-', file);
13158 output_pic_addr_const (file, XEXP (x, 1), code);
13159 if (!TARGET_MACHO)
13160 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13161 break;
13162
13163 case UNSPEC:
13164 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13165 {
13166 bool f = i386_asm_output_addr_const_extra (file, x);
13167 gcc_assert (f);
13168 break;
13169 }
13170
13171 gcc_assert (XVECLEN (x, 0) == 1);
13172 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13173 switch (XINT (x, 1))
13174 {
13175 case UNSPEC_GOT:
13176 fputs ("@GOT", file);
13177 break;
13178 case UNSPEC_GOTOFF:
13179 fputs ("@GOTOFF", file);
13180 break;
13181 case UNSPEC_PLTOFF:
13182 fputs ("@PLTOFF", file);
13183 break;
13184 case UNSPEC_PCREL:
13185 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13186 "(%rip)" : "[rip]", file);
13187 break;
13188 case UNSPEC_GOTPCREL:
13189 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13190 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13191 break;
13192 case UNSPEC_GOTTPOFF:
13193 /* FIXME: This might be @TPOFF in Sun ld too. */
13194 fputs ("@gottpoff", file);
13195 break;
13196 case UNSPEC_TPOFF:
13197 fputs ("@tpoff", file);
13198 break;
13199 case UNSPEC_NTPOFF:
13200 if (TARGET_64BIT)
13201 fputs ("@tpoff", file);
13202 else
13203 fputs ("@ntpoff", file);
13204 break;
13205 case UNSPEC_DTPOFF:
13206 fputs ("@dtpoff", file);
13207 break;
13208 case UNSPEC_GOTNTPOFF:
13209 if (TARGET_64BIT)
13210 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13211 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13212 else
13213 fputs ("@gotntpoff", file);
13214 break;
13215 case UNSPEC_INDNTPOFF:
13216 fputs ("@indntpoff", file);
13217 break;
13218 #if TARGET_MACHO
13219 case UNSPEC_MACHOPIC_OFFSET:
13220 putc ('-', file);
13221 machopic_output_function_base_name (file);
13222 break;
13223 #endif
13224 default:
13225 output_operand_lossage ("invalid UNSPEC as operand");
13226 break;
13227 }
13228 break;
13229
13230 default:
13231 output_operand_lossage ("invalid expression as operand");
13232 }
13233 }
13234
13235 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13236 We need to emit DTP-relative relocations. */
13237
13238 static void ATTRIBUTE_UNUSED
13239 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13240 {
13241 fputs (ASM_LONG, file);
13242 output_addr_const (file, x);
13243 fputs ("@dtpoff", file);
13244 switch (size)
13245 {
13246 case 4:
13247 break;
13248 case 8:
13249 fputs (", 0", file);
13250 break;
13251 default:
13252 gcc_unreachable ();
13253 }
13254 }
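/* Example output (illustrative, assuming ASM_LONG expands to a ".long"
   directive): SIZE 4 emits ".long foo@dtpoff", while SIZE 8 emits
   ".long foo@dtpoff, 0", padding the upper half with zero.  */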
13255
13256 /* Return true if X is a representation of the PIC register. This copes
13257 with calls from ix86_find_base_term, where the register might have
13258 been replaced by a cselib value. */
13259
13260 static bool
13261 ix86_pic_register_p (rtx x)
13262 {
13263 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13264 return (pic_offset_table_rtx
13265 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13266 else
13267 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13268 }
13269
13270 /* Helper function for ix86_delegitimize_address.
13271 Attempt to delegitimize TLS local-exec accesses. */
13272
13273 static rtx
13274 ix86_delegitimize_tls_address (rtx orig_x)
13275 {
13276 rtx x = orig_x, unspec;
13277 struct ix86_address addr;
13278
13279 if (!TARGET_TLS_DIRECT_SEG_REFS)
13280 return orig_x;
13281 if (MEM_P (x))
13282 x = XEXP (x, 0);
13283 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13284 return orig_x;
13285 if (ix86_decompose_address (x, &addr) == 0
13286 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13287 || addr.disp == NULL_RTX
13288 || GET_CODE (addr.disp) != CONST)
13289 return orig_x;
13290 unspec = XEXP (addr.disp, 0);
13291 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13292 unspec = XEXP (unspec, 0);
13293 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13294 return orig_x;
13295 x = XVECEXP (unspec, 0, 0);
13296 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13297 if (unspec != XEXP (addr.disp, 0))
13298 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13299 if (addr.index)
13300 {
13301 rtx idx = addr.index;
13302 if (addr.scale != 1)
13303 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13304 x = gen_rtx_PLUS (Pmode, idx, x);
13305 }
13306 if (addr.base)
13307 x = gen_rtx_PLUS (Pmode, addr.base, x);
13308 if (MEM_P (orig_x))
13309 x = replace_equiv_address_nv (orig_x, x);
13310 return x;
13311 }
13312
13313 /* In the name of slightly smaller debug output, and to cater to
13314 general assembler lossage, recognize PIC+GOTOFF and turn it back
13315 into a direct symbol reference.
13316
13317 On Darwin, this is necessary to avoid a crash, because Darwin
13318 has a different PIC label for each routine but the DWARF debugging
13319 information is not associated with any particular routine, so it's
13320 necessary to remove references to the PIC label from RTL stored by
13321 the DWARF output code. */
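/* Illustrative example (added note): for 32-bit PIC an address of the
   form %ebx + const(foo@GOTOFF) is turned back into plain "foo" here,
   re-adding any register or constant addend that had been folded into
   the original expression.  */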
13322
13323 static rtx
13324 ix86_delegitimize_address (rtx x)
13325 {
13326 rtx orig_x = delegitimize_mem_from_attrs (x);
13327 /* addend is NULL or some rtx if x is something+GOTOFF where
13328 something doesn't include the PIC register. */
13329 rtx addend = NULL_RTX;
13330 /* reg_addend is NULL or a multiple of some register. */
13331 rtx reg_addend = NULL_RTX;
13332 /* const_addend is NULL or a const_int. */
13333 rtx const_addend = NULL_RTX;
13334 /* This is the result, or NULL. */
13335 rtx result = NULL_RTX;
13336
13337 x = orig_x;
13338
13339 if (MEM_P (x))
13340 x = XEXP (x, 0);
13341
13342 if (TARGET_64BIT)
13343 {
13344 if (GET_CODE (x) == CONST
13345 && GET_CODE (XEXP (x, 0)) == PLUS
13346 && GET_MODE (XEXP (x, 0)) == Pmode
13347 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13348 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13349 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13350 {
13351 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13352 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13353 if (MEM_P (orig_x))
13354 x = replace_equiv_address_nv (orig_x, x);
13355 return x;
13356 }
13357 if (GET_CODE (x) != CONST
13358 || GET_CODE (XEXP (x, 0)) != UNSPEC
13359 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13360 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13361 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13362 return ix86_delegitimize_tls_address (orig_x);
13363 x = XVECEXP (XEXP (x, 0), 0, 0);
13364 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13365 {
13366 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13367 GET_MODE (x), 0);
13368 if (x == NULL_RTX)
13369 return orig_x;
13370 }
13371 return x;
13372 }
13373
13374 if (GET_CODE (x) != PLUS
13375 || GET_CODE (XEXP (x, 1)) != CONST)
13376 return ix86_delegitimize_tls_address (orig_x);
13377
13378 if (ix86_pic_register_p (XEXP (x, 0)))
13379 /* %ebx + GOT/GOTOFF */
13380 ;
13381 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13382 {
13383 /* %ebx + %reg * scale + GOT/GOTOFF */
13384 reg_addend = XEXP (x, 0);
13385 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13386 reg_addend = XEXP (reg_addend, 1);
13387 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13388 reg_addend = XEXP (reg_addend, 0);
13389 else
13390 {
13391 reg_addend = NULL_RTX;
13392 addend = XEXP (x, 0);
13393 }
13394 }
13395 else
13396 addend = XEXP (x, 0);
13397
13398 x = XEXP (XEXP (x, 1), 0);
13399 if (GET_CODE (x) == PLUS
13400 && CONST_INT_P (XEXP (x, 1)))
13401 {
13402 const_addend = XEXP (x, 1);
13403 x = XEXP (x, 0);
13404 }
13405
13406 if (GET_CODE (x) == UNSPEC
13407 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13408 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13409 result = XVECEXP (x, 0, 0);
13410
13411 if (TARGET_MACHO && darwin_local_data_pic (x)
13412 && !MEM_P (orig_x))
13413 result = XVECEXP (x, 0, 0);
13414
13415 if (! result)
13416 return ix86_delegitimize_tls_address (orig_x);
13417
13418 if (const_addend)
13419 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13420 if (reg_addend)
13421 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13422 if (addend)
13423 {
13424 /* If the rest of original X doesn't involve the PIC register, add
13425 addend and subtract pic_offset_table_rtx. This can happen e.g.
13426 for code like:
13427 leal (%ebx, %ecx, 4), %ecx
13428 ...
13429 movl foo@GOTOFF(%ecx), %edx
13430 in which case we return (%ecx - %ebx) + foo. */
13431 if (pic_offset_table_rtx)
13432 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13433 pic_offset_table_rtx),
13434 result);
13435 else
13436 return orig_x;
13437 }
13438 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13439 {
13440 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13441 if (result == NULL_RTX)
13442 return orig_x;
13443 }
13444 return result;
13445 }
13446
13447 /* If X is a machine specific address (i.e. a symbol or label being
13448 referenced as a displacement from the GOT implemented using an
13449 UNSPEC), then return the base term. Otherwise return X. */
13450
13451 rtx
13452 ix86_find_base_term (rtx x)
13453 {
13454 rtx term;
13455
13456 if (TARGET_64BIT)
13457 {
13458 if (GET_CODE (x) != CONST)
13459 return x;
13460 term = XEXP (x, 0);
13461 if (GET_CODE (term) == PLUS
13462 && (CONST_INT_P (XEXP (term, 1))
13463 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13464 term = XEXP (term, 0);
13465 if (GET_CODE (term) != UNSPEC
13466 || (XINT (term, 1) != UNSPEC_GOTPCREL
13467 && XINT (term, 1) != UNSPEC_PCREL))
13468 return x;
13469
13470 return XVECEXP (term, 0, 0);
13471 }
13472
13473 return ix86_delegitimize_address (x);
13474 }
13475 \f
13476 static void
13477 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13478 int fp, FILE *file)
13479 {
13480 const char *suffix;
13481
13482 if (mode == CCFPmode || mode == CCFPUmode)
13483 {
13484 code = ix86_fp_compare_code_to_integer (code);
13485 mode = CCmode;
13486 }
13487 if (reverse)
13488 code = reverse_condition (code);
13489
13490 switch (code)
13491 {
13492 case EQ:
13493 switch (mode)
13494 {
13495 case CCAmode:
13496 suffix = "a";
13497 break;
13498
13499 case CCCmode:
13500 suffix = "c";
13501 break;
13502
13503 case CCOmode:
13504 suffix = "o";
13505 break;
13506
13507 case CCSmode:
13508 suffix = "s";
13509 break;
13510
13511 default:
13512 suffix = "e";
13513 }
13514 break;
13515 case NE:
13516 switch (mode)
13517 {
13518 case CCAmode:
13519 suffix = "na";
13520 break;
13521
13522 case CCCmode:
13523 suffix = "nc";
13524 break;
13525
13526 case CCOmode:
13527 suffix = "no";
13528 break;
13529
13530 case CCSmode:
13531 suffix = "ns";
13532 break;
13533
13534 default:
13535 suffix = "ne";
13536 }
13537 break;
13538 case GT:
13539 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13540 suffix = "g";
13541 break;
13542 case GTU:
13543 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13544 Those same assemblers have the same but opposite lossage on cmov. */
13545 if (mode == CCmode)
13546 suffix = fp ? "nbe" : "a";
13547 else if (mode == CCCmode)
13548 suffix = "b";
13549 else
13550 gcc_unreachable ();
13551 break;
13552 case LT:
13553 switch (mode)
13554 {
13555 case CCNOmode:
13556 case CCGOCmode:
13557 suffix = "s";
13558 break;
13559
13560 case CCmode:
13561 case CCGCmode:
13562 suffix = "l";
13563 break;
13564
13565 default:
13566 gcc_unreachable ();
13567 }
13568 break;
13569 case LTU:
13570 gcc_assert (mode == CCmode || mode == CCCmode);
13571 suffix = "b";
13572 break;
13573 case GE:
13574 switch (mode)
13575 {
13576 case CCNOmode:
13577 case CCGOCmode:
13578 suffix = "ns";
13579 break;
13580
13581 case CCmode:
13582 case CCGCmode:
13583 suffix = "ge";
13584 break;
13585
13586 default:
13587 gcc_unreachable ();
13588 }
13589 break;
13590 case GEU:
13591 /* ??? As above. */
13592 gcc_assert (mode == CCmode || mode == CCCmode);
13593 suffix = fp ? "nb" : "ae";
13594 break;
13595 case LE:
13596 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13597 suffix = "le";
13598 break;
13599 case LEU:
13600 /* ??? As above. */
13601 if (mode == CCmode)
13602 suffix = "be";
13603 else if (mode == CCCmode)
13604 suffix = fp ? "nb" : "ae";
13605 else
13606 gcc_unreachable ();
13607 break;
13608 case UNORDERED:
13609 suffix = fp ? "u" : "p";
13610 break;
13611 case ORDERED:
13612 suffix = fp ? "nu" : "np";
13613 break;
13614 default:
13615 gcc_unreachable ();
13616 }
13617 fputs (suffix, file);
13618 }
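/* Example (illustrative): EQ in CCmode prints the suffix "e", so callers
   end up emitting "je", "sete" or "cmove"; GTU in CCmode prints "a"
   ("nbe" for fcmov), the unsigned "above" condition.  */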
13619
13620 /* Print the name of register X to FILE based on its machine mode and number.
13621 If CODE is 'w', pretend the mode is HImode.
13622 If CODE is 'b', pretend the mode is QImode.
13623 If CODE is 'k', pretend the mode is SImode.
13624 If CODE is 'q', pretend the mode is DImode.
13625 If CODE is 'x', pretend the mode is V4SFmode.
13626 If CODE is 't', pretend the mode is V8SFmode.
13627 If CODE is 'h', pretend the reg is the 'high' byte register.
 13628    If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
13629 If CODE is 'd', duplicate the operand for AVX instruction.
13630 */
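/* For example (illustrative): (reg:SI ax) prints as "%eax" in AT&T
   syntax, as "%ax" with code 'w', as "%al" with code 'b', and an
   extended register such as r8 prints as "%r8d" with code 'k'.  */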
13631
13632 void
13633 print_reg (rtx x, int code, FILE *file)
13634 {
13635 const char *reg;
13636 bool duplicated = code == 'd' && TARGET_AVX;
13637
13638 gcc_assert (x == pc_rtx
13639 || (REGNO (x) != ARG_POINTER_REGNUM
13640 && REGNO (x) != FRAME_POINTER_REGNUM
13641 && REGNO (x) != FLAGS_REG
13642 && REGNO (x) != FPSR_REG
13643 && REGNO (x) != FPCR_REG));
13644
13645 if (ASSEMBLER_DIALECT == ASM_ATT)
13646 putc ('%', file);
13647
13648 if (x == pc_rtx)
13649 {
13650 gcc_assert (TARGET_64BIT);
13651 fputs ("rip", file);
13652 return;
13653 }
13654
13655 if (code == 'w' || MMX_REG_P (x))
13656 code = 2;
13657 else if (code == 'b')
13658 code = 1;
13659 else if (code == 'k')
13660 code = 4;
13661 else if (code == 'q')
13662 code = 8;
13663 else if (code == 'y')
13664 code = 3;
13665 else if (code == 'h')
13666 code = 0;
13667 else if (code == 'x')
13668 code = 16;
13669 else if (code == 't')
13670 code = 32;
13671 else
13672 code = GET_MODE_SIZE (GET_MODE (x));
13673
 13674   /* Irritatingly, AMD extended registers use a different naming
 13675      convention from the normal registers: "r%d[bwd]".  */
13676 if (REX_INT_REG_P (x))
13677 {
13678 gcc_assert (TARGET_64BIT);
13679 putc ('r', file);
13680 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
13681 switch (code)
13682 {
13683 case 0:
13684 error ("extended registers have no high halves");
13685 break;
13686 case 1:
13687 putc ('b', file);
13688 break;
13689 case 2:
13690 putc ('w', file);
13691 break;
13692 case 4:
13693 putc ('d', file);
13694 break;
13695 case 8:
13696 /* no suffix */
13697 break;
13698 default:
13699 error ("unsupported operand size for extended register");
13700 break;
13701 }
13702 return;
13703 }
13704
13705 reg = NULL;
13706 switch (code)
13707 {
13708 case 3:
13709 if (STACK_TOP_P (x))
13710 {
13711 reg = "st(0)";
13712 break;
13713 }
13714 /* FALLTHRU */
13715 case 8:
13716 case 4:
13717 case 12:
13718 if (! ANY_FP_REG_P (x))
13719 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13720 /* FALLTHRU */
13721 case 16:
13722 case 2:
13723 normal:
13724 reg = hi_reg_name[REGNO (x)];
13725 break;
13726 case 1:
13727 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13728 goto normal;
13729 reg = qi_reg_name[REGNO (x)];
13730 break;
13731 case 0:
13732 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13733 goto normal;
13734 reg = qi_high_reg_name[REGNO (x)];
13735 break;
13736 case 32:
13737 if (SSE_REG_P (x))
13738 {
13739 gcc_assert (!duplicated);
13740 putc ('y', file);
13741 fputs (hi_reg_name[REGNO (x)] + 1, file);
13742 return;
13743 }
13744 break;
13745 default:
13746 gcc_unreachable ();
13747 }
13748
13749 fputs (reg, file);
13750 if (duplicated)
13751 {
13752 if (ASSEMBLER_DIALECT == ASM_ATT)
13753 fprintf (file, ", %%%s", reg);
13754 else
13755 fprintf (file, ", %s", reg);
13756 }
13757 }
13758
13759 /* Locate some local-dynamic symbol still in use by this function
13760 so that we can print its name in some tls_local_dynamic_base
13761 pattern. */
13762
13763 static int
13764 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13765 {
13766 rtx x = *px;
13767
13768 if (GET_CODE (x) == SYMBOL_REF
13769 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13770 {
13771 cfun->machine->some_ld_name = XSTR (x, 0);
13772 return 1;
13773 }
13774
13775 return 0;
13776 }
13777
13778 static const char *
13779 get_some_local_dynamic_name (void)
13780 {
13781 rtx insn;
13782
13783 if (cfun->machine->some_ld_name)
13784 return cfun->machine->some_ld_name;
13785
13786 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13787 if (NONDEBUG_INSN_P (insn)
13788 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13789 return cfun->machine->some_ld_name;
13790
13791 return NULL;
13792 }
13793
13794 /* Meaning of CODE:
13795 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13796 C -- print opcode suffix for set/cmov insn.
13797 c -- like C, but print reversed condition
13798 F,f -- likewise, but for floating-point.
13799 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13800 otherwise nothing
13801 R -- print the prefix for register names.
13802 z -- print the opcode suffix for the size of the current operand.
13803 Z -- likewise, with special suffixes for x87 instructions.
13804 * -- print a star (in certain assembler syntax)
13805 A -- print an absolute memory reference.
13806 E -- print address with DImode register names if TARGET_64BIT.
13807 w -- print the operand as if it's a "word" (HImode) even if it isn't.
 13808    s -- print a shift double count, followed by the assembler's argument
13809 delimiter.
13810 b -- print the QImode name of the register for the indicated operand.
13811 %b0 would print %al if operands[0] is reg 0.
13812 w -- likewise, print the HImode name of the register.
13813 k -- likewise, print the SImode name of the register.
13814 q -- likewise, print the DImode name of the register.
13815 x -- likewise, print the V4SFmode name of the register.
13816 t -- likewise, print the V8SFmode name of the register.
13817 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13818 y -- print "st(0)" instead of "st" as a register.
13819 d -- print duplicated register operand for AVX instruction.
13820 D -- print condition for SSE cmp instruction.
13821 P -- if PIC, print an @PLT suffix.
13822 p -- print raw symbol name.
13823 X -- don't print any sort of PIC '@' suffix for a symbol.
13824 & -- print some in-use local-dynamic symbol name.
13825 H -- print a memory address offset by 8; used for sse high-parts
13826 Y -- print condition for XOP pcom* instruction.
13827 + -- print a branch hint as 'cs' or 'ds' prefix
13828 ; -- print a semicolon (after prefixes due to bug in older gas).
13829 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13830 @ -- print a segment register of thread base pointer load
13831 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
13832 */
13833
13834 void
13835 ix86_print_operand (FILE *file, rtx x, int code)
13836 {
13837 if (code)
13838 {
13839 switch (code)
13840 {
13841 case '*':
13842 if (ASSEMBLER_DIALECT == ASM_ATT)
13843 putc ('*', file);
13844 return;
13845
13846 case '&':
13847 {
13848 const char *name = get_some_local_dynamic_name ();
13849 if (name == NULL)
13850 output_operand_lossage ("'%%&' used without any "
13851 "local dynamic TLS references");
13852 else
13853 assemble_name (file, name);
13854 return;
13855 }
13856
13857 case 'A':
13858 switch (ASSEMBLER_DIALECT)
13859 {
13860 case ASM_ATT:
13861 putc ('*', file);
13862 break;
13863
13864 case ASM_INTEL:
13865 /* Intel syntax. For absolute addresses, registers should not
 13866 	     be surrounded by brackets.  */
13867 if (!REG_P (x))
13868 {
13869 putc ('[', file);
13870 ix86_print_operand (file, x, 0);
13871 putc (']', file);
13872 return;
13873 }
13874 break;
13875
13876 default:
13877 gcc_unreachable ();
13878 }
13879
13880 ix86_print_operand (file, x, 0);
13881 return;
13882
13883 case 'E':
13884 /* Wrap address in an UNSPEC to declare special handling. */
13885 if (TARGET_64BIT)
13886 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
13887
13888 output_address (x);
13889 return;
13890
13891 case 'L':
13892 if (ASSEMBLER_DIALECT == ASM_ATT)
13893 putc ('l', file);
13894 return;
13895
13896 case 'W':
13897 if (ASSEMBLER_DIALECT == ASM_ATT)
13898 putc ('w', file);
13899 return;
13900
13901 case 'B':
13902 if (ASSEMBLER_DIALECT == ASM_ATT)
13903 putc ('b', file);
13904 return;
13905
13906 case 'Q':
13907 if (ASSEMBLER_DIALECT == ASM_ATT)
13908 putc ('l', file);
13909 return;
13910
13911 case 'S':
13912 if (ASSEMBLER_DIALECT == ASM_ATT)
13913 putc ('s', file);
13914 return;
13915
13916 case 'T':
13917 if (ASSEMBLER_DIALECT == ASM_ATT)
13918 putc ('t', file);
13919 return;
13920
13921 case 'z':
13922 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13923 {
13924 /* Opcodes don't get size suffixes if using Intel opcodes. */
13925 if (ASSEMBLER_DIALECT == ASM_INTEL)
13926 return;
13927
13928 switch (GET_MODE_SIZE (GET_MODE (x)))
13929 {
13930 case 1:
13931 putc ('b', file);
13932 return;
13933
13934 case 2:
13935 putc ('w', file);
13936 return;
13937
13938 case 4:
13939 putc ('l', file);
13940 return;
13941
13942 case 8:
13943 putc ('q', file);
13944 return;
13945
13946 default:
13947 output_operand_lossage
13948 ("invalid operand size for operand code '%c'", code);
13949 return;
13950 }
13951 }
13952
13953 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13954 warning
13955 (0, "non-integer operand used with operand code '%c'", code);
13956 /* FALLTHRU */
13957
13958 case 'Z':
13959 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
13960 if (ASSEMBLER_DIALECT == ASM_INTEL)
13961 return;
13962
13963 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13964 {
13965 switch (GET_MODE_SIZE (GET_MODE (x)))
13966 {
13967 case 2:
13968 #ifdef HAVE_AS_IX86_FILDS
13969 putc ('s', file);
13970 #endif
13971 return;
13972
13973 case 4:
13974 putc ('l', file);
13975 return;
13976
13977 case 8:
13978 #ifdef HAVE_AS_IX86_FILDQ
13979 putc ('q', file);
13980 #else
13981 fputs ("ll", file);
13982 #endif
13983 return;
13984
13985 default:
13986 break;
13987 }
13988 }
13989 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13990 {
13991 /* 387 opcodes don't get size suffixes
13992 if the operands are registers. */
13993 if (STACK_REG_P (x))
13994 return;
13995
13996 switch (GET_MODE_SIZE (GET_MODE (x)))
13997 {
13998 case 4:
13999 putc ('s', file);
14000 return;
14001
14002 case 8:
14003 putc ('l', file);
14004 return;
14005
14006 case 12:
14007 case 16:
14008 putc ('t', file);
14009 return;
14010
14011 default:
14012 break;
14013 }
14014 }
14015 else
14016 {
14017 output_operand_lossage
14018 ("invalid operand type used with operand code '%c'", code);
14019 return;
14020 }
14021
14022 output_operand_lossage
14023 ("invalid operand size for operand code '%c'", code);
14024 return;
14025
14026 case 'd':
14027 case 'b':
14028 case 'w':
14029 case 'k':
14030 case 'q':
14031 case 'h':
14032 case 't':
14033 case 'y':
14034 case 'x':
14035 case 'X':
14036 case 'P':
14037 case 'p':
14038 break;
14039
14040 case 's':
14041 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14042 {
14043 ix86_print_operand (file, x, 0);
14044 fputs (", ", file);
14045 }
14046 return;
14047
14048 case 'D':
 14049 	  /* Little bit of braindamage here.  The SSE compare instructions
 14050 	     use completely different names for the comparisons than the
 14051 	     fp conditional moves do.  */
14052 if (TARGET_AVX)
14053 {
14054 switch (GET_CODE (x))
14055 {
14056 case EQ:
14057 fputs ("eq", file);
14058 break;
14059 case UNEQ:
14060 fputs ("eq_us", file);
14061 break;
14062 case LT:
14063 fputs ("lt", file);
14064 break;
14065 case UNLT:
14066 fputs ("nge", file);
14067 break;
14068 case LE:
14069 fputs ("le", file);
14070 break;
14071 case UNLE:
14072 fputs ("ngt", file);
14073 break;
14074 case UNORDERED:
14075 fputs ("unord", file);
14076 break;
14077 case NE:
14078 fputs ("neq", file);
14079 break;
14080 case LTGT:
14081 fputs ("neq_oq", file);
14082 break;
14083 case GE:
14084 fputs ("ge", file);
14085 break;
14086 case UNGE:
14087 fputs ("nlt", file);
14088 break;
14089 case GT:
14090 fputs ("gt", file);
14091 break;
14092 case UNGT:
14093 fputs ("nle", file);
14094 break;
14095 case ORDERED:
14096 fputs ("ord", file);
14097 break;
14098 default:
14099 output_operand_lossage ("operand is not a condition code, "
14100 "invalid operand code 'D'");
14101 return;
14102 }
14103 }
14104 else
14105 {
14106 switch (GET_CODE (x))
14107 {
14108 case EQ:
14109 case UNEQ:
14110 fputs ("eq", file);
14111 break;
14112 case LT:
14113 case UNLT:
14114 fputs ("lt", file);
14115 break;
14116 case LE:
14117 case UNLE:
14118 fputs ("le", file);
14119 break;
14120 case UNORDERED:
14121 fputs ("unord", file);
14122 break;
14123 case NE:
14124 case LTGT:
14125 fputs ("neq", file);
14126 break;
14127 case UNGE:
14128 case GE:
14129 fputs ("nlt", file);
14130 break;
14131 case UNGT:
14132 case GT:
14133 fputs ("nle", file);
14134 break;
14135 case ORDERED:
14136 fputs ("ord", file);
14137 break;
14138 default:
14139 output_operand_lossage ("operand is not a condition code, "
14140 "invalid operand code 'D'");
14141 return;
14142 }
14143 }
14144 return;
14145 case 'O':
14146 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14147 if (ASSEMBLER_DIALECT == ASM_ATT)
14148 {
14149 switch (GET_MODE (x))
14150 {
14151 case HImode: putc ('w', file); break;
14152 case SImode:
14153 case SFmode: putc ('l', file); break;
14154 case DImode:
14155 case DFmode: putc ('q', file); break;
14156 default: gcc_unreachable ();
14157 }
14158 putc ('.', file);
14159 }
14160 #endif
14161 return;
14162 case 'C':
14163 if (!COMPARISON_P (x))
14164 {
14165 output_operand_lossage ("operand is neither a constant nor a "
14166 "condition code, invalid operand code "
14167 "'C'");
14168 return;
14169 }
14170 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
14171 return;
14172 case 'F':
14173 if (!COMPARISON_P (x))
14174 {
14175 output_operand_lossage ("operand is neither a constant nor a "
14176 "condition code, invalid operand code "
14177 "'F'");
14178 return;
14179 }
14180 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14181 if (ASSEMBLER_DIALECT == ASM_ATT)
14182 putc ('.', file);
14183 #endif
14184 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
14185 return;
14186
14187 /* Like above, but reverse condition */
14188 case 'c':
14189 /* Check to see if argument to %c is really a constant
14190 and not a condition code which needs to be reversed. */
14191 if (!COMPARISON_P (x))
14192 {
14193 output_operand_lossage ("operand is neither a constant nor a "
14194 "condition code, invalid operand "
14195 "code 'c'");
14196 return;
14197 }
14198 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
14199 return;
14200 case 'f':
14201 if (!COMPARISON_P (x))
14202 {
14203 output_operand_lossage ("operand is neither a constant nor a "
14204 "condition code, invalid operand "
14205 "code 'f'");
14206 return;
14207 }
14208 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14209 if (ASSEMBLER_DIALECT == ASM_ATT)
14210 putc ('.', file);
14211 #endif
14212 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
14213 return;
14214
14215 case 'H':
14216 if (!offsettable_memref_p (x))
14217 {
14218 output_operand_lossage ("operand is not an offsettable memory "
14219 "reference, invalid operand "
14220 "code 'H'");
14221 return;
14222 }
14223 /* It doesn't actually matter what mode we use here, as we're
14224 only going to use this for printing. */
14225 x = adjust_address_nv (x, DImode, 8);
14226 break;
14227
14228 case '+':
14229 {
14230 rtx x;
14231
14232 if (!optimize
14233 || optimize_function_for_size_p (cfun)
14234 || !TARGET_BRANCH_PREDICTION_HINTS)
14235 return;
14236
14237 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14238 if (x)
14239 {
14240 int pred_val = INTVAL (XEXP (x, 0));
14241
14242 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14243 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14244 {
14245 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14246 bool cputaken
14247 = final_forward_branch_p (current_output_insn) == 0;
14248
14249 /* Emit hints only when the default branch prediction
14250 heuristics would fail. */
14251 if (taken != cputaken)
14252 {
14253 /* We use 3e (DS) prefix for taken branches and
14254 2e (CS) prefix for not taken branches. */
14255 if (taken)
14256 fputs ("ds ; ", file);
14257 else
14258 fputs ("cs ; ", file);
14259 }
14260 }
14261 }
14262 return;
14263 }
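	/* Illustrative sketch (not from the original source; label is
	   hypothetical): when the profile-based prediction disagrees with
	   the static forward/backward heuristic, the printed branch gets a
	   segment-override prefix acting as a hint, e.g.

	     ds ; jne .L3	(predict taken)
	     cs ; jne .L3	(predict not taken)  */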
14264
14265 case 'Y':
14266 switch (GET_CODE (x))
14267 {
14268 case NE:
14269 fputs ("neq", file);
14270 break;
14271 case EQ:
14272 fputs ("eq", file);
14273 break;
14274 case GE:
14275 case GEU:
14276 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14277 break;
14278 case GT:
14279 case GTU:
14280 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14281 break;
14282 case LE:
14283 case LEU:
14284 fputs ("le", file);
14285 break;
14286 case LT:
14287 case LTU:
14288 fputs ("lt", file);
14289 break;
14290 case UNORDERED:
14291 fputs ("unord", file);
14292 break;
14293 case ORDERED:
14294 fputs ("ord", file);
14295 break;
14296 case UNEQ:
14297 fputs ("ueq", file);
14298 break;
14299 case UNGE:
14300 fputs ("nlt", file);
14301 break;
14302 case UNGT:
14303 fputs ("nle", file);
14304 break;
14305 case UNLE:
14306 fputs ("ule", file);
14307 break;
14308 case UNLT:
14309 fputs ("ult", file);
14310 break;
14311 case LTGT:
14312 fputs ("une", file);
14313 break;
14314 default:
14315 output_operand_lossage ("operand is not a condition code, "
14316 "invalid operand code 'Y'");
14317 return;
14318 }
14319 return;
14320
14321 case ';':
14322 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14323 putc (';', file);
14324 #endif
14325 return;
14326
14327 case '@':
14328 if (ASSEMBLER_DIALECT == ASM_ATT)
14329 putc ('%', file);
14330
14331 /* The kernel uses a different segment register for performance
14332 reasons; a system call would not have to trash the userspace
14333 segment register, which would be expensive. */
14334 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14335 fputs ("fs", file);
14336 else
14337 fputs ("gs", file);
14338 return;
14339
14340 case '~':
14341 putc (TARGET_AVX2 ? 'i' : 'f', file);
14342 return;
14343
14344 case '^':
14345 if (TARGET_64BIT && Pmode != word_mode)
14346 fputs ("addr32 ", file);
14347 return;
14348
14349 default:
14350 output_operand_lossage ("invalid operand code '%c'", code);
14351 }
14352 }
14353
14354 if (REG_P (x))
14355 print_reg (x, code, file);
14356
14357 else if (MEM_P (x))
14358 {
14359 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14360 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14361 && GET_MODE (x) != BLKmode)
14362 {
14363 const char * size;
14364 switch (GET_MODE_SIZE (GET_MODE (x)))
14365 {
14366 case 1: size = "BYTE"; break;
14367 case 2: size = "WORD"; break;
14368 case 4: size = "DWORD"; break;
14369 case 8: size = "QWORD"; break;
14370 case 12: size = "TBYTE"; break;
14371 case 16:
14372 if (GET_MODE (x) == XFmode)
14373 size = "TBYTE";
14374 else
14375 size = "XMMWORD";
14376 break;
14377 case 32: size = "YMMWORD"; break;
14378 default:
14379 gcc_unreachable ();
14380 }
14381
14382 /* Check for explicit size override (codes 'b', 'w', 'k',
14383 'q' and 'x') */
14384 if (code == 'b')
14385 size = "BYTE";
14386 else if (code == 'w')
14387 size = "WORD";
14388 else if (code == 'k')
14389 size = "DWORD";
14390 else if (code == 'q')
14391 size = "QWORD";
14392 else if (code == 'x')
14393 size = "XMMWORD";
14394
14395 fputs (size, file);
14396 fputs (" PTR ", file);
14397 }
14398
14399 x = XEXP (x, 0);
14400 /* Avoid (%rip) for call operands. */
14401 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14402 && !CONST_INT_P (x))
14403 output_addr_const (file, x);
14404 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14405 output_operand_lossage ("invalid constraints for operand");
14406 else
14407 output_address (x);
14408 }
14409
14410 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14411 {
14412 REAL_VALUE_TYPE r;
14413 long l;
14414
14415 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14416 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14417
14418 if (ASSEMBLER_DIALECT == ASM_ATT)
14419 putc ('$', file);
14420 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14421 if (code == 'q')
14422 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14423 else
14424 fprintf (file, "0x%08x", (unsigned int) l);
14425 }
14426
14427 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14428 {
14429 REAL_VALUE_TYPE r;
14430 long l[2];
14431
14432 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14433 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14434
14435 if (ASSEMBLER_DIALECT == ASM_ATT)
14436 putc ('$', file);
14437 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14438 }
14439
14440 /* These float cases don't actually occur as immediate operands. */
14441 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14442 {
14443 char dstr[30];
14444
14445 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14446 fputs (dstr, file);
14447 }
14448
14449 else
14450 {
14451 /* We have patterns that allow zero sets of memory, for instance.
14452 In 64-bit mode, we should probably support all 8-byte vectors,
14453 since we can in fact encode that into an immediate. */
14454 if (GET_CODE (x) == CONST_VECTOR)
14455 {
14456 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14457 x = const0_rtx;
14458 }
14459
14460 if (code != 'P' && code != 'p')
14461 {
14462 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14463 {
14464 if (ASSEMBLER_DIALECT == ASM_ATT)
14465 putc ('$', file);
14466 }
14467 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14468 || GET_CODE (x) == LABEL_REF)
14469 {
14470 if (ASSEMBLER_DIALECT == ASM_ATT)
14471 putc ('$', file);
14472 else
14473 fputs ("OFFSET FLAT:", file);
14474 }
14475 }
14476 if (CONST_INT_P (x))
14477 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14478 else if (flag_pic || MACHOPIC_INDIRECT)
14479 output_pic_addr_const (file, x, code);
14480 else
14481 output_addr_const (file, x);
14482 }
14483 }
14484
14485 static bool
14486 ix86_print_operand_punct_valid_p (unsigned char code)
14487 {
14488 return (code == '@' || code == '*' || code == '+' || code == '&'
14489 || code == ';' || code == '~' || code == '^');
14490 }
14491 \f
14492 /* Print a memory operand whose address is ADDR. */
14493
14494 static void
14495 ix86_print_operand_address (FILE *file, rtx addr)
14496 {
14497 struct ix86_address parts;
14498 rtx base, index, disp;
14499 int scale;
14500 int ok;
14501 bool vsib = false;
14502 int code = 0;
14503
14504 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14505 {
14506 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14507 gcc_assert (parts.index == NULL_RTX);
14508 parts.index = XVECEXP (addr, 0, 1);
14509 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14510 addr = XVECEXP (addr, 0, 0);
14511 vsib = true;
14512 }
14513 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14514 {
14515 gcc_assert (TARGET_64BIT);
14516 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14517 code = 'q';
14518 }
14519 else
14520 ok = ix86_decompose_address (addr, &parts);
14521
14522 gcc_assert (ok);
14523
14524 if (parts.base && GET_CODE (parts.base) == SUBREG)
14525 {
14526 rtx tmp = SUBREG_REG (parts.base);
14527 parts.base = simplify_subreg (GET_MODE (parts.base),
14528 tmp, GET_MODE (tmp), 0);
14529 }
14530
14531 if (parts.index && GET_CODE (parts.index) == SUBREG)
14532 {
14533 rtx tmp = SUBREG_REG (parts.index);
14534 parts.index = simplify_subreg (GET_MODE (parts.index),
14535 tmp, GET_MODE (tmp), 0);
14536 }
14537
14538 base = parts.base;
14539 index = parts.index;
14540 disp = parts.disp;
14541 scale = parts.scale;
14542
14543 switch (parts.seg)
14544 {
14545 case SEG_DEFAULT:
14546 break;
14547 case SEG_FS:
14548 case SEG_GS:
14549 if (ASSEMBLER_DIALECT == ASM_ATT)
14550 putc ('%', file);
14551 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14552 break;
14553 default:
14554 gcc_unreachable ();
14555 }
14556
14557 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14558 if (TARGET_64BIT && !base && !index)
14559 {
14560 rtx symbol = disp;
14561
14562 if (GET_CODE (disp) == CONST
14563 && GET_CODE (XEXP (disp, 0)) == PLUS
14564 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14565 symbol = XEXP (XEXP (disp, 0), 0);
14566
14567 if (GET_CODE (symbol) == LABEL_REF
14568 || (GET_CODE (symbol) == SYMBOL_REF
14569 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14570 base = pc_rtx;
14571 }
14572 if (!base && !index)
14573 {
14574 /* A displacement-only address requires special attention. */
14575
14576 if (CONST_INT_P (disp))
14577 {
14578 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14579 fputs ("ds:", file);
14580 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14581 }
14582 else if (flag_pic)
14583 output_pic_addr_const (file, disp, 0);
14584 else
14585 output_addr_const (file, disp);
14586 }
14587 else
14588 {
14589 /* Print SImode register names for zero-extended
14590 addresses to force addr32 prefix. */
14591 if (TARGET_64BIT
14592 && (GET_CODE (addr) == ZERO_EXTEND
14593 || GET_CODE (addr) == AND))
14594 {
14595 gcc_assert (!code);
14596 code = 'l';
14597 }
14598
14599 if (ASSEMBLER_DIALECT == ASM_ATT)
14600 {
14601 if (disp)
14602 {
14603 if (flag_pic)
14604 output_pic_addr_const (file, disp, 0);
14605 else if (GET_CODE (disp) == LABEL_REF)
14606 output_asm_label (disp);
14607 else
14608 output_addr_const (file, disp);
14609 }
14610
14611 putc ('(', file);
14612 if (base)
14613 print_reg (base, code, file);
14614 if (index)
14615 {
14616 putc (',', file);
14617 print_reg (index, vsib ? 0 : code, file);
14618 if (scale != 1 || vsib)
14619 fprintf (file, ",%d", scale);
14620 }
14621 putc (')', file);
14622 }
14623 else
14624 {
14625 rtx offset = NULL_RTX;
14626
14627 if (disp)
14628 {
14629 /* Pull out the offset of a symbol; print any symbol itself. */
14630 if (GET_CODE (disp) == CONST
14631 && GET_CODE (XEXP (disp, 0)) == PLUS
14632 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14633 {
14634 offset = XEXP (XEXP (disp, 0), 1);
14635 disp = gen_rtx_CONST (VOIDmode,
14636 XEXP (XEXP (disp, 0), 0));
14637 }
14638
14639 if (flag_pic)
14640 output_pic_addr_const (file, disp, 0);
14641 else if (GET_CODE (disp) == LABEL_REF)
14642 output_asm_label (disp);
14643 else if (CONST_INT_P (disp))
14644 offset = disp;
14645 else
14646 output_addr_const (file, disp);
14647 }
14648
14649 putc ('[', file);
14650 if (base)
14651 {
14652 print_reg (base, code, file);
14653 if (offset)
14654 {
14655 if (INTVAL (offset) >= 0)
14656 putc ('+', file);
14657 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14658 }
14659 }
14660 else if (offset)
14661 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14662 else
14663 putc ('0', file);
14664
14665 if (index)
14666 {
14667 putc ('+', file);
14668 print_reg (index, vsib ? 0 : code, file);
14669 if (scale != 1 || vsib)
14670 fprintf (file, "*%d", scale);
14671 }
14672 putc (']', file);
14673 }
14674 }
14675 }
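/* Illustrative example of the two dialects printed above (registers and
   offsets are hypothetical): with base = %rax, index = %rbx, scale = 4
   and a constant displacement of 16, the output is roughly

     AT&T:   16(%rax,%rbx,4)
     Intel:  [rax+16+rbx*4]

   A symbol-only address in 64-bit mode gets base = pc_rtx above, so the
   normal path prints an RIP-relative form such as "sym(%rip)".  */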
14676
14677 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14678
14679 static bool
14680 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14681 {
14682 rtx op;
14683
14684 if (GET_CODE (x) != UNSPEC)
14685 return false;
14686
14687 op = XVECEXP (x, 0, 0);
14688 switch (XINT (x, 1))
14689 {
14690 case UNSPEC_GOTTPOFF:
14691 output_addr_const (file, op);
14692 /* FIXME: This might be @TPOFF in Sun ld. */
14693 fputs ("@gottpoff", file);
14694 break;
14695 case UNSPEC_TPOFF:
14696 output_addr_const (file, op);
14697 fputs ("@tpoff", file);
14698 break;
14699 case UNSPEC_NTPOFF:
14700 output_addr_const (file, op);
14701 if (TARGET_64BIT)
14702 fputs ("@tpoff", file);
14703 else
14704 fputs ("@ntpoff", file);
14705 break;
14706 case UNSPEC_DTPOFF:
14707 output_addr_const (file, op);
14708 fputs ("@dtpoff", file);
14709 break;
14710 case UNSPEC_GOTNTPOFF:
14711 output_addr_const (file, op);
14712 if (TARGET_64BIT)
14713 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14714 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14715 else
14716 fputs ("@gotntpoff", file);
14717 break;
14718 case UNSPEC_INDNTPOFF:
14719 output_addr_const (file, op);
14720 fputs ("@indntpoff", file);
14721 break;
14722 #if TARGET_MACHO
14723 case UNSPEC_MACHOPIC_OFFSET:
14724 output_addr_const (file, op);
14725 putc ('-', file);
14726 machopic_output_function_base_name (file);
14727 break;
14728 #endif
14729
14730 case UNSPEC_STACK_CHECK:
14731 {
14732 int offset;
14733
14734 gcc_assert (flag_split_stack);
14735
14736 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14737 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14738 #else
14739 gcc_unreachable ();
14740 #endif
14741
14742 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14743 }
14744 break;
14745
14746 default:
14747 return false;
14748 }
14749
14750 return true;
14751 }
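/* Illustrative sketch of the output above (symbol name hypothetical):

     UNSPEC_NTPOFF,  !TARGET_64BIT:   x@ntpoff
     UNSPEC_NTPOFF,   TARGET_64BIT:   x@tpoff
     UNSPEC_GOTNTPOFF, TARGET_64BIT:  x@gottpoff(%rip)   (AT&T)
                                      x@gottpoff[rip]    (Intel)

   i.e. the unspec wrapper is turned into the matching TLS relocation
   suffix understood by the assembler.  */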
14752 \f
14753 /* Split one or more double-mode RTL references into pairs of half-mode
14754 references. The RTL can be REG, offsettable MEM, integer constant, or
14755 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14756 split and "num" is its length. lo_half and hi_half are output arrays
14757 that parallel "operands". */
14758
14759 void
14760 split_double_mode (enum machine_mode mode, rtx operands[],
14761 int num, rtx lo_half[], rtx hi_half[])
14762 {
14763 enum machine_mode half_mode;
14764 unsigned int byte;
14765
14766 switch (mode)
14767 {
14768 case TImode:
14769 half_mode = DImode;
14770 break;
14771 case DImode:
14772 half_mode = SImode;
14773 break;
14774 default:
14775 gcc_unreachable ();
14776 }
14777
14778 byte = GET_MODE_SIZE (half_mode);
14779
14780 while (num--)
14781 {
14782 rtx op = operands[num];
14783
14784 /* simplify_subreg refuses to split volatile memory addresses,
14785 but we still have to handle them. */
14786 if (MEM_P (op))
14787 {
14788 lo_half[num] = adjust_address (op, half_mode, 0);
14789 hi_half[num] = adjust_address (op, half_mode, byte);
14790 }
14791 else
14792 {
14793 lo_half[num] = simplify_gen_subreg (half_mode, op,
14794 GET_MODE (op) == VOIDmode
14795 ? mode : GET_MODE (op), 0);
14796 hi_half[num] = simplify_gen_subreg (half_mode, op,
14797 GET_MODE (op) == VOIDmode
14798 ? mode : GET_MODE (op), byte);
14799 }
14800 }
14801 }
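/* Worked example (a sketch, not part of the original code): splitting a
   DImode memory operand (mem:DI (reg:SI %eax)) yields

     lo_half = (mem:SI (reg:SI %eax))
     hi_half = (mem:SI (plus:SI (reg:SI %eax) (const_int 4)))

   while a TImode register operand is split into two DImode subregs at
   byte offsets 0 and 8.  */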
14802 \f
14803 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14804 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14805 is the expression of the binary operation. The output may either be
14806 emitted here, or returned to the caller, like all output_* functions.
14807
14808 There is no guarantee that the operands are the same mode, as they
14809 might be within FLOAT or FLOAT_EXTEND expressions. */
14810
14811 #ifndef SYSV386_COMPAT
14812 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14813 wants to fix the assemblers because that causes incompatibility
14814 with gcc. No-one wants to fix gcc because that causes
14815 incompatibility with assemblers... You can use the option of
14816 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14817 #define SYSV386_COMPAT 1
14818 #endif
14819
14820 const char *
14821 output_387_binary_op (rtx insn, rtx *operands)
14822 {
14823 static char buf[40];
14824 const char *p;
14825 const char *ssep;
14826 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14827
14828 #ifdef ENABLE_CHECKING
14829 /* Even if we do not want to check the inputs, this documents input
14830 constraints, which helps in understanding the following code. */
14831 if (STACK_REG_P (operands[0])
14832 && ((REG_P (operands[1])
14833 && REGNO (operands[0]) == REGNO (operands[1])
14834 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14835 || (REG_P (operands[2])
14836 && REGNO (operands[0]) == REGNO (operands[2])
14837 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14838 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14839 ; /* ok */
14840 else
14841 gcc_assert (is_sse);
14842 #endif
14843
14844 switch (GET_CODE (operands[3]))
14845 {
14846 case PLUS:
14847 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14848 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14849 p = "fiadd";
14850 else
14851 p = "fadd";
14852 ssep = "vadd";
14853 break;
14854
14855 case MINUS:
14856 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14857 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14858 p = "fisub";
14859 else
14860 p = "fsub";
14861 ssep = "vsub";
14862 break;
14863
14864 case MULT:
14865 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14866 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14867 p = "fimul";
14868 else
14869 p = "fmul";
14870 ssep = "vmul";
14871 break;
14872
14873 case DIV:
14874 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14875 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14876 p = "fidiv";
14877 else
14878 p = "fdiv";
14879 ssep = "vdiv";
14880 break;
14881
14882 default:
14883 gcc_unreachable ();
14884 }
14885
14886 if (is_sse)
14887 {
14888 if (TARGET_AVX)
14889 {
14890 strcpy (buf, ssep);
14891 if (GET_MODE (operands[0]) == SFmode)
14892 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14893 else
14894 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14895 }
14896 else
14897 {
14898 strcpy (buf, ssep + 1);
14899 if (GET_MODE (operands[0]) == SFmode)
14900 strcat (buf, "ss\t{%2, %0|%0, %2}");
14901 else
14902 strcat (buf, "sd\t{%2, %0|%0, %2}");
14903 }
14904 return buf;
14905 }
14906 strcpy (buf, p);
14907
14908 switch (GET_CODE (operands[3]))
14909 {
14910 case MULT:
14911 case PLUS:
14912 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
14913 {
14914 rtx temp = operands[2];
14915 operands[2] = operands[1];
14916 operands[1] = temp;
14917 }
14918
14919 /* We now know operands[0] == operands[1]. */
14920
14921 if (MEM_P (operands[2]))
14922 {
14923 p = "%Z2\t%2";
14924 break;
14925 }
14926
14927 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14928 {
14929 if (STACK_TOP_P (operands[0]))
14930 /* How is it that we are storing to a dead operand[2]?
14931 Well, presumably operands[1] is dead too. We can't
14932 store the result to st(0) as st(0) gets popped on this
14933 instruction. Instead store to operands[2] (which I
14934 think has to be st(1)). st(1) will be popped later.
14935 gcc <= 2.8.1 didn't have this check and generated
14936 assembly code that the Unixware assembler rejected. */
14937 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14938 else
14939 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14940 break;
14941 }
14942
14943 if (STACK_TOP_P (operands[0]))
14944 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14945 else
14946 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14947 break;
14948
14949 case MINUS:
14950 case DIV:
14951 if (MEM_P (operands[1]))
14952 {
14953 p = "r%Z1\t%1";
14954 break;
14955 }
14956
14957 if (MEM_P (operands[2]))
14958 {
14959 p = "%Z2\t%2";
14960 break;
14961 }
14962
14963 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14964 {
14965 #if SYSV386_COMPAT
14966 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
14967 derived assemblers, confusingly reverse the direction of
14968 the operation for fsub{r} and fdiv{r} when the
14969 destination register is not st(0). The Intel assembler
14970 doesn't have this brain damage. Read !SYSV386_COMPAT to
14971 figure out what the hardware really does. */
14972 if (STACK_TOP_P (operands[0]))
14973 p = "{p\t%0, %2|rp\t%2, %0}";
14974 else
14975 p = "{rp\t%2, %0|p\t%0, %2}";
14976 #else
14977 if (STACK_TOP_P (operands[0]))
14978 /* As above for fmul/fadd, we can't store to st(0). */
14979 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14980 else
14981 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14982 #endif
14983 break;
14984 }
14985
14986 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
14987 {
14988 #if SYSV386_COMPAT
14989 if (STACK_TOP_P (operands[0]))
14990 p = "{rp\t%0, %1|p\t%1, %0}";
14991 else
14992 p = "{p\t%1, %0|rp\t%0, %1}";
14993 #else
14994 if (STACK_TOP_P (operands[0]))
14995 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
14996 else
14997 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
14998 #endif
14999 break;
15000 }
15001
15002 if (STACK_TOP_P (operands[0]))
15003 {
15004 if (STACK_TOP_P (operands[1]))
15005 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15006 else
15007 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15008 break;
15009 }
15010 else if (STACK_TOP_P (operands[1]))
15011 {
15012 #if SYSV386_COMPAT
15013 p = "{\t%1, %0|r\t%0, %1}";
15014 #else
15015 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15016 #endif
15017 }
15018 else
15019 {
15020 #if SYSV386_COMPAT
15021 p = "{r\t%2, %0|\t%0, %2}";
15022 #else
15023 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15024 #endif
15025 }
15026 break;
15027
15028 default:
15029 gcc_unreachable ();
15030 }
15031
15032 strcat (buf, p);
15033 return buf;
15034 }
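/* Illustrative only: for (set (reg:DF st0) (plus:DF (reg:DF st0)
   (reg:DF st1))) with no SSE register involved and operand 2 not dying,
   the template returned above is roughly

     "fadd\t{%y2, %0|%0, %y2}"

   while the AVX path for scalar DFmode SSE operands returns

     "vaddsd\t{%2, %1, %0|%0, %1, %2}".  */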
15035
15036 /* Return needed mode for entity in optimize_mode_switching pass. */
15037
15038 int
15039 ix86_mode_needed (int entity, rtx insn)
15040 {
15041 enum attr_i387_cw mode;
15042
15043 /* The mode UNINITIALIZED is used to store the control word after a
15044 function call or ASM pattern. The mode ANY specifies that the function
15045 has no requirements on the control word and makes no changes in the
15046 bits we are interested in. */
15047
15048 if (CALL_P (insn)
15049 || (NONJUMP_INSN_P (insn)
15050 && (asm_noperands (PATTERN (insn)) >= 0
15051 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15052 return I387_CW_UNINITIALIZED;
15053
15054 if (recog_memoized (insn) < 0)
15055 return I387_CW_ANY;
15056
15057 mode = get_attr_i387_cw (insn);
15058
15059 switch (entity)
15060 {
15061 case I387_TRUNC:
15062 if (mode == I387_CW_TRUNC)
15063 return mode;
15064 break;
15065
15066 case I387_FLOOR:
15067 if (mode == I387_CW_FLOOR)
15068 return mode;
15069 break;
15070
15071 case I387_CEIL:
15072 if (mode == I387_CW_CEIL)
15073 return mode;
15074 break;
15075
15076 case I387_MASK_PM:
15077 if (mode == I387_CW_MASK_PM)
15078 return mode;
15079 break;
15080
15081 default:
15082 gcc_unreachable ();
15083 }
15084
15085 return I387_CW_ANY;
15086 }
15087
15088 /* Output code to initialize control word copies used by trunc?f?i and
15089 rounding patterns. CURRENT_MODE is set to the current control word,
15090 while NEW_MODE is set to the new control word. */
15091
15092 void
15093 emit_i387_cw_initialization (int mode)
15094 {
15095 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15096 rtx new_mode;
15097
15098 enum ix86_stack_slot slot;
15099
15100 rtx reg = gen_reg_rtx (HImode);
15101
15102 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15103 emit_move_insn (reg, copy_rtx (stored_mode));
15104
15105 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15106 || optimize_function_for_size_p (cfun))
15107 {
15108 switch (mode)
15109 {
15110 case I387_CW_TRUNC:
15111 /* round toward zero (truncate) */
15112 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15113 slot = SLOT_CW_TRUNC;
15114 break;
15115
15116 case I387_CW_FLOOR:
15117 /* round down toward -oo */
15118 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15119 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15120 slot = SLOT_CW_FLOOR;
15121 break;
15122
15123 case I387_CW_CEIL:
15124 /* round up toward +oo */
15125 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15126 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15127 slot = SLOT_CW_CEIL;
15128 break;
15129
15130 case I387_CW_MASK_PM:
15131 /* mask precision exception for nearbyint() */
15132 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15133 slot = SLOT_CW_MASK_PM;
15134 break;
15135
15136 default:
15137 gcc_unreachable ();
15138 }
15139 }
15140 else
15141 {
15142 switch (mode)
15143 {
15144 case I387_CW_TRUNC:
15145 /* round toward zero (truncate) */
15146 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15147 slot = SLOT_CW_TRUNC;
15148 break;
15149
15150 case I387_CW_FLOOR:
15151 /* round down toward -oo */
15152 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15153 slot = SLOT_CW_FLOOR;
15154 break;
15155
15156 case I387_CW_CEIL:
15157 /* round up toward +oo */
15158 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15159 slot = SLOT_CW_CEIL;
15160 break;
15161
15162 case I387_CW_MASK_PM:
15163 /* mask precision exception for nearbyint() */
15164 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15165 slot = SLOT_CW_MASK_PM;
15166 break;
15167
15168 default:
15169 gcc_unreachable ();
15170 }
15171 }
15172
15173 gcc_assert (slot < MAX_386_STACK_LOCALS);
15174
15175 new_mode = assign_386_stack_local (HImode, slot);
15176 emit_move_insn (new_mode, reg);
15177 }
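/* Background note (standard x87 architecture, not derived from this
   file): the rounding-control field occupies bits 10-11 of the control
   word, with 00 = round to nearest, 01 = round down, 10 = round up and
   11 = truncate, and bit 5 masks the precision exception.  That is why
   the code above ORs in 0x0c00 for truncation, clears 0x0c00 and then
   sets 0x0400 / 0x0800 for floor / ceil, and ORs 0x0020 for
   nearbyint().  */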
15178
15179 /* Output code for INSN to convert a float to a signed int. OPERANDS
15180 are the insn operands. The output may be [HSD]Imode and the input
15181 operand may be [SDX]Fmode. */
15182
15183 const char *
15184 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15185 {
15186 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15187 int dimode_p = GET_MODE (operands[0]) == DImode;
15188 int round_mode = get_attr_i387_cw (insn);
15189
15190 /* Jump through a hoop or two for DImode, since the hardware has no
15191 non-popping instruction. We used to do this a different way, but
15192 that was somewhat fragile and broke with post-reload splitters. */
15193 if ((dimode_p || fisttp) && !stack_top_dies)
15194 output_asm_insn ("fld\t%y1", operands);
15195
15196 gcc_assert (STACK_TOP_P (operands[1]));
15197 gcc_assert (MEM_P (operands[0]));
15198 gcc_assert (GET_MODE (operands[1]) != TFmode);
15199
15200 if (fisttp)
15201 output_asm_insn ("fisttp%Z0\t%0", operands);
15202 else
15203 {
15204 if (round_mode != I387_CW_ANY)
15205 output_asm_insn ("fldcw\t%3", operands);
15206 if (stack_top_dies || dimode_p)
15207 output_asm_insn ("fistp%Z0\t%0", operands);
15208 else
15209 output_asm_insn ("fist%Z0\t%0", operands);
15210 if (round_mode != I387_CW_ANY)
15211 output_asm_insn ("fldcw\t%2", operands);
15212 }
15213
15214 return "";
15215 }
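/* Illustrative output (operand numbers as in the pattern; the size
   suffix comes from %Z0): converting to SImode with a control-word
   change typically expands to something like

     fldcw	%3
     fistpl	%0
     fldcw	%2

   whereas the SSE3 path emits a single "fisttpl %0" and needs no
   control-word shuffling.  */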
15216
15217 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15218 have the values zero or one, indicates the ffreep insn's operand
15219 from the OPERANDS array. */
15220
15221 static const char *
15222 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15223 {
15224 if (TARGET_USE_FFREEP)
15225 #ifdef HAVE_AS_IX86_FFREEP
15226 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15227 #else
15228 {
15229 static char retval[32];
15230 int regno = REGNO (operands[opno]);
15231
15232 gcc_assert (FP_REGNO_P (regno));
15233
15234 regno -= FIRST_STACK_REG;
15235
15236 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15237 return retval;
15238 }
15239 #endif
15240
15241 return opno ? "fstp\t%y1" : "fstp\t%y0";
15242 }
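/* Note on the fallback encoding above: "ffreep %st(N)" is the two-byte
   instruction DF C0+N, so emitting it as a little-endian 16-bit word
   gives ASM_SHORT "0xc<N>df", e.g. 0xc1df for %st(1).  The exact
   directive spelled by ASM_SHORT depends on the target headers.  */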
15243
15244
15245 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15246 should be used. UNORDERED_P is true when fucom should be used. */
15247
15248 const char *
15249 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15250 {
15251 int stack_top_dies;
15252 rtx cmp_op0, cmp_op1;
15253 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15254
15255 if (eflags_p)
15256 {
15257 cmp_op0 = operands[0];
15258 cmp_op1 = operands[1];
15259 }
15260 else
15261 {
15262 cmp_op0 = operands[1];
15263 cmp_op1 = operands[2];
15264 }
15265
15266 if (is_sse)
15267 {
15268 if (GET_MODE (operands[0]) == SFmode)
15269 if (unordered_p)
15270 return "%vucomiss\t{%1, %0|%0, %1}";
15271 else
15272 return "%vcomiss\t{%1, %0|%0, %1}";
15273 else
15274 if (unordered_p)
15275 return "%vucomisd\t{%1, %0|%0, %1}";
15276 else
15277 return "%vcomisd\t{%1, %0|%0, %1}";
15278 }
15279
15280 gcc_assert (STACK_TOP_P (cmp_op0));
15281
15282 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15283
15284 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15285 {
15286 if (stack_top_dies)
15287 {
15288 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15289 return output_387_ffreep (operands, 1);
15290 }
15291 else
15292 return "ftst\n\tfnstsw\t%0";
15293 }
15294
15295 if (STACK_REG_P (cmp_op1)
15296 && stack_top_dies
15297 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15298 && REGNO (cmp_op1) != FIRST_STACK_REG)
15299 {
15300 /* If both the top of the 387 stack and the other operand (also a
15301 stack register) die here, then this must be a `fcompp'
15302 float compare */
15303
15304 if (eflags_p)
15305 {
15306 /* There is no double popping fcomi variant. Fortunately,
15307 eflags is immune from the fstp's cc clobbering. */
15308 if (unordered_p)
15309 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15310 else
15311 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15312 return output_387_ffreep (operands, 0);
15313 }
15314 else
15315 {
15316 if (unordered_p)
15317 return "fucompp\n\tfnstsw\t%0";
15318 else
15319 return "fcompp\n\tfnstsw\t%0";
15320 }
15321 }
15322 else
15323 {
15324 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15325
15326 static const char * const alt[16] =
15327 {
15328 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15329 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15330 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15331 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15332
15333 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15334 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15335 NULL,
15336 NULL,
15337
15338 "fcomi\t{%y1, %0|%0, %y1}",
15339 "fcomip\t{%y1, %0|%0, %y1}",
15340 "fucomi\t{%y1, %0|%0, %y1}",
15341 "fucomip\t{%y1, %0|%0, %y1}",
15342
15343 NULL,
15344 NULL,
15345 NULL,
15346 NULL
15347 };
15348
15349 int mask;
15350 const char *ret;
15351
15352 mask = eflags_p << 3;
15353 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15354 mask |= unordered_p << 1;
15355 mask |= stack_top_dies;
15356
15357 gcc_assert (mask < 16);
15358 ret = alt[mask];
15359 gcc_assert (ret);
15360
15361 return ret;
15362 }
15363 }
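/* Worked example of the mask encoding above: eflags_p = 1, an FP (not
   integer) cmp_op1, unordered_p = 1 and stack_top_dies = 1 gives
   mask = 8 + 2 + 1 = 11, which selects

     "fucomip\t{%y1, %0|%0, %y1}"

   from the alt[] table.  */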
15364
15365 void
15366 ix86_output_addr_vec_elt (FILE *file, int value)
15367 {
15368 const char *directive = ASM_LONG;
15369
15370 #ifdef ASM_QUAD
15371 if (TARGET_LP64)
15372 directive = ASM_QUAD;
15373 #else
15374 gcc_assert (!TARGET_64BIT);
15375 #endif
15376
15377 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15378 }
15379
15380 void
15381 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15382 {
15383 const char *directive = ASM_LONG;
15384
15385 #ifdef ASM_QUAD
15386 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15387 directive = ASM_QUAD;
15388 #else
15389 gcc_assert (!TARGET_64BIT);
15390 #endif
15391 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15392 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15393 fprintf (file, "%s%s%d-%s%d\n",
15394 directive, LPREFIX, value, LPREFIX, rel);
15395 else if (HAVE_AS_GOTOFF_IN_DATA)
15396 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15397 #if TARGET_MACHO
15398 else if (TARGET_MACHO)
15399 {
15400 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15401 machopic_output_function_base_name (file);
15402 putc ('\n', file);
15403 }
15404 #endif
15405 else
15406 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15407 GOT_SYMBOL_NAME, LPREFIX, value);
15408 }
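/* Illustrative output (assuming the usual ELF LPREFIX of ".L"; the
   exact directive comes from ASM_LONG / ASM_QUAD):

     64-bit or VxWorks RTP:      .long	.L5-.L2
     32-bit PIC with GOTOFF:     .long	.L5@GOTOFF

   where 5 is VALUE and 2 is REL.  */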
15409 \f
15410 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15411 for the target. */
15412
15413 void
15414 ix86_expand_clear (rtx dest)
15415 {
15416 rtx tmp;
15417
15418 /* We play register width games, which are only valid after reload. */
15419 gcc_assert (reload_completed);
15420
15421 /* Avoid HImode and its attendant prefix byte. */
15422 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15423 dest = gen_rtx_REG (SImode, REGNO (dest));
15424 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15425
15426 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15427 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15428 {
15429 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15430 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15431 }
15432
15433 emit_insn (tmp);
15434 }
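/* Illustrative example: clearing a SImode register normally produces
   the parallel of (set (reg:SI ax) (const_int 0)) with a flags clobber,
   i.e. the short "xorl %eax, %eax" form; only when TARGET_USE_MOV0 is
   set and the insn is optimized for size does the plain
   "movl $0, %eax" survive without the clobber.  */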
15435
15436 /* X is an unchanging MEM. If it is a constant pool reference, return
15437 the constant pool rtx, else NULL. */
15438
15439 rtx
15440 maybe_get_pool_constant (rtx x)
15441 {
15442 x = ix86_delegitimize_address (XEXP (x, 0));
15443
15444 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15445 return get_pool_constant (x);
15446
15447 return NULL_RTX;
15448 }
15449
15450 void
15451 ix86_expand_move (enum machine_mode mode, rtx operands[])
15452 {
15453 rtx op0, op1;
15454 enum tls_model model;
15455
15456 op0 = operands[0];
15457 op1 = operands[1];
15458
15459 if (GET_CODE (op1) == SYMBOL_REF)
15460 {
15461 model = SYMBOL_REF_TLS_MODEL (op1);
15462 if (model)
15463 {
15464 op1 = legitimize_tls_address (op1, model, true);
15465 op1 = force_operand (op1, op0);
15466 if (op1 == op0)
15467 return;
15468 if (GET_MODE (op1) != mode)
15469 op1 = convert_to_mode (mode, op1, 1);
15470 }
15471 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15472 && SYMBOL_REF_DLLIMPORT_P (op1))
15473 op1 = legitimize_dllimport_symbol (op1, false);
15474 }
15475 else if (GET_CODE (op1) == CONST
15476 && GET_CODE (XEXP (op1, 0)) == PLUS
15477 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15478 {
15479 rtx addend = XEXP (XEXP (op1, 0), 1);
15480 rtx symbol = XEXP (XEXP (op1, 0), 0);
15481 rtx tmp = NULL;
15482
15483 model = SYMBOL_REF_TLS_MODEL (symbol);
15484 if (model)
15485 tmp = legitimize_tls_address (symbol, model, true);
15486 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15487 && SYMBOL_REF_DLLIMPORT_P (symbol))
15488 tmp = legitimize_dllimport_symbol (symbol, true);
15489
15490 if (tmp)
15491 {
15492 tmp = force_operand (tmp, NULL);
15493 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15494 op0, 1, OPTAB_DIRECT);
15495 if (tmp == op0)
15496 return;
15497 if (GET_MODE (tmp) != mode)
15498 op1 = convert_to_mode (mode, tmp, 1);
15499 }
15500 }
15501
15502 if ((flag_pic || MACHOPIC_INDIRECT)
15503 && symbolic_operand (op1, mode))
15504 {
15505 if (TARGET_MACHO && !TARGET_64BIT)
15506 {
15507 #if TARGET_MACHO
15508 /* dynamic-no-pic */
15509 if (MACHOPIC_INDIRECT)
15510 {
15511 rtx temp = ((reload_in_progress
15512 || ((op0 && REG_P (op0))
15513 && mode == Pmode))
15514 ? op0 : gen_reg_rtx (Pmode));
15515 op1 = machopic_indirect_data_reference (op1, temp);
15516 if (MACHOPIC_PURE)
15517 op1 = machopic_legitimize_pic_address (op1, mode,
15518 temp == op1 ? 0 : temp);
15519 }
15520 if (op0 != op1 && GET_CODE (op0) != MEM)
15521 {
15522 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15523 emit_insn (insn);
15524 return;
15525 }
15526 if (GET_CODE (op0) == MEM)
15527 op1 = force_reg (Pmode, op1);
15528 else
15529 {
15530 rtx temp = op0;
15531 if (GET_CODE (temp) != REG)
15532 temp = gen_reg_rtx (Pmode);
15533 temp = legitimize_pic_address (op1, temp);
15534 if (temp == op0)
15535 return;
15536 op1 = temp;
15537 }
15538 /* dynamic-no-pic */
15539 #endif
15540 }
15541 else
15542 {
15543 if (MEM_P (op0))
15544 op1 = force_reg (mode, op1);
15545 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15546 {
15547 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15548 op1 = legitimize_pic_address (op1, reg);
15549 if (op0 == op1)
15550 return;
15551 if (GET_MODE (op1) != mode)
15552 op1 = convert_to_mode (mode, op1, 1);
15553 }
15554 }
15555 }
15556 else
15557 {
15558 if (MEM_P (op0)
15559 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15560 || !push_operand (op0, mode))
15561 && MEM_P (op1))
15562 op1 = force_reg (mode, op1);
15563
15564 if (push_operand (op0, mode)
15565 && ! general_no_elim_operand (op1, mode))
15566 op1 = copy_to_mode_reg (mode, op1);
15567
15568 /* Force large constants in 64bit compilation into a register
15569 to get them CSEed. */
15570 if (can_create_pseudo_p ()
15571 && (mode == DImode) && TARGET_64BIT
15572 && immediate_operand (op1, mode)
15573 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15574 && !register_operand (op0, mode)
15575 && optimize)
15576 op1 = copy_to_mode_reg (mode, op1);
15577
15578 if (can_create_pseudo_p ()
15579 && FLOAT_MODE_P (mode)
15580 && GET_CODE (op1) == CONST_DOUBLE)
15581 {
15582 /* If we are loading a floating point constant to a register,
15583 force the value to memory now, since we'll get better code
15584 out the back end. */
15585
15586 op1 = validize_mem (force_const_mem (mode, op1));
15587 if (!register_operand (op0, mode))
15588 {
15589 rtx temp = gen_reg_rtx (mode);
15590 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15591 emit_move_insn (op0, temp);
15592 return;
15593 }
15594 }
15595 }
15596
15597 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15598 }
15599
15600 void
15601 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15602 {
15603 rtx op0 = operands[0], op1 = operands[1];
15604 unsigned int align = GET_MODE_ALIGNMENT (mode);
15605
15606 /* Force constants other than zero into memory. We do not know how
15607 the instructions used to build constants modify the upper 64 bits
15608 of the register; once we have that information, we may be able
15609 to handle some of them more efficiently. */
15610 if (can_create_pseudo_p ()
15611 && register_operand (op0, mode)
15612 && (CONSTANT_P (op1)
15613 || (GET_CODE (op1) == SUBREG
15614 && CONSTANT_P (SUBREG_REG (op1))))
15615 && !standard_sse_constant_p (op1))
15616 op1 = validize_mem (force_const_mem (mode, op1));
15617
15618 /* We need to check memory alignment for SSE mode since an attribute
15619 can make operands unaligned. */
15620 if (can_create_pseudo_p ()
15621 && SSE_REG_MODE_P (mode)
15622 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15623 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15624 {
15625 rtx tmp[2];
15626
15627 /* ix86_expand_vector_move_misalign() does not like constants ... */
15628 if (CONSTANT_P (op1)
15629 || (GET_CODE (op1) == SUBREG
15630 && CONSTANT_P (SUBREG_REG (op1))))
15631 op1 = validize_mem (force_const_mem (mode, op1));
15632
15633 /* ... nor both arguments in memory. */
15634 if (!register_operand (op0, mode)
15635 && !register_operand (op1, mode))
15636 op1 = force_reg (mode, op1);
15637
15638 tmp[0] = op0; tmp[1] = op1;
15639 ix86_expand_vector_move_misalign (mode, tmp);
15640 return;
15641 }
15642
15643 /* Make operand1 a register if it isn't already. */
15644 if (can_create_pseudo_p ()
15645 && !register_operand (op0, mode)
15646 && !register_operand (op1, mode))
15647 {
15648 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15649 return;
15650 }
15651
15652 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15653 }
15654
15655 /* Split 32-byte AVX unaligned load and store if needed. */
15656
15657 static void
15658 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15659 {
15660 rtx m;
15661 rtx (*extract) (rtx, rtx, rtx);
15662 rtx (*move_unaligned) (rtx, rtx);
15663 enum machine_mode mode;
15664
15665 switch (GET_MODE (op0))
15666 {
15667 default:
15668 gcc_unreachable ();
15669 case V32QImode:
15670 extract = gen_avx_vextractf128v32qi;
15671 move_unaligned = gen_avx_movdqu256;
15672 mode = V16QImode;
15673 break;
15674 case V8SFmode:
15675 extract = gen_avx_vextractf128v8sf;
15676 move_unaligned = gen_avx_movups256;
15677 mode = V4SFmode;
15678 break;
15679 case V4DFmode:
15680 extract = gen_avx_vextractf128v4df;
15681 move_unaligned = gen_avx_movupd256;
15682 mode = V2DFmode;
15683 break;
15684 }
15685
15686 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15687 {
15688 rtx r = gen_reg_rtx (mode);
15689 m = adjust_address (op1, mode, 0);
15690 emit_move_insn (r, m);
15691 m = adjust_address (op1, mode, 16);
15692 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15693 emit_move_insn (op0, r);
15694 }
15695 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15696 {
15697 m = adjust_address (op0, mode, 0);
15698 emit_insn (extract (m, op1, const0_rtx));
15699 m = adjust_address (op0, mode, 16);
15700 emit_insn (extract (m, op1, const1_rtx));
15701 }
15702 else
15703 emit_insn (move_unaligned (op0, op1));
15704 }
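/* Illustrative sketch of the split (mnemonics approximate, not taken
   from this file): a 32-byte unaligned V8SF load with
   TARGET_AVX256_SPLIT_UNALIGNED_LOAD becomes two 16-byte halves,
   roughly

     vmovups	(mem), %xmm0
     vinsertf128	$1, 16(mem), %ymm0, %ymm0

   and the store case uses two vextractf128 stores; otherwise a single
   full-width vmovups / vmovupd / vmovdqu is emitted.  */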
15705
15706 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15707 straight to ix86_expand_vector_move. */
15708 /* Code generation for scalar reg-reg moves of single and double precision data:
15709 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15710 movaps reg, reg
15711 else
15712 movss reg, reg
15713 if (x86_sse_partial_reg_dependency == true)
15714 movapd reg, reg
15715 else
15716 movsd reg, reg
15717
15718 Code generation for scalar loads of double precision data:
15719 if (x86_sse_split_regs == true)
15720 movlpd mem, reg (gas syntax)
15721 else
15722 movsd mem, reg
15723
15724 Code generation for unaligned packed loads of single precision data
15725 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15726 if (x86_sse_unaligned_move_optimal)
15727 movups mem, reg
15728
15729 if (x86_sse_partial_reg_dependency == true)
15730 {
15731 xorps reg, reg
15732 movlps mem, reg
15733 movhps mem+8, reg
15734 }
15735 else
15736 {
15737 movlps mem, reg
15738 movhps mem+8, reg
15739 }
15740
15741 Code generation for unaligned packed loads of double precision data
15742 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15743 if (x86_sse_unaligned_move_optimal)
15744 movupd mem, reg
15745
15746 if (x86_sse_split_regs == true)
15747 {
15748 movlpd mem, reg
15749 movhpd mem+8, reg
15750 }
15751 else
15752 {
15753 movsd mem, reg
15754 movhpd mem+8, reg
15755 }
15756 */
15757
15758 void
15759 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15760 {
15761 rtx op0, op1, m;
15762
15763 op0 = operands[0];
15764 op1 = operands[1];
15765
15766 if (TARGET_AVX)
15767 {
15768 switch (GET_MODE_CLASS (mode))
15769 {
15770 case MODE_VECTOR_INT:
15771 case MODE_INT:
15772 switch (GET_MODE_SIZE (mode))
15773 {
15774 case 16:
15775 /* If we're optimizing for size, movups is the smallest. */
15776 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15777 {
15778 op0 = gen_lowpart (V4SFmode, op0);
15779 op1 = gen_lowpart (V4SFmode, op1);
15780 emit_insn (gen_sse_movups (op0, op1));
15781 return;
15782 }
15783 op0 = gen_lowpart (V16QImode, op0);
15784 op1 = gen_lowpart (V16QImode, op1);
15785 emit_insn (gen_sse2_movdqu (op0, op1));
15786 break;
15787 case 32:
15788 op0 = gen_lowpart (V32QImode, op0);
15789 op1 = gen_lowpart (V32QImode, op1);
15790 ix86_avx256_split_vector_move_misalign (op0, op1);
15791 break;
15792 default:
15793 gcc_unreachable ();
15794 }
15795 break;
15796 case MODE_VECTOR_FLOAT:
15797 op0 = gen_lowpart (mode, op0);
15798 op1 = gen_lowpart (mode, op1);
15799
15800 switch (mode)
15801 {
15802 case V4SFmode:
15803 emit_insn (gen_sse_movups (op0, op1));
15804 break;
15805 case V8SFmode:
15806 ix86_avx256_split_vector_move_misalign (op0, op1);
15807 break;
15808 case V2DFmode:
15809 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15810 {
15811 op0 = gen_lowpart (V4SFmode, op0);
15812 op1 = gen_lowpart (V4SFmode, op1);
15813 emit_insn (gen_sse_movups (op0, op1));
15814 return;
15815 }
15816 emit_insn (gen_sse2_movupd (op0, op1));
15817 break;
15818 case V4DFmode:
15819 ix86_avx256_split_vector_move_misalign (op0, op1);
15820 break;
15821 default:
15822 gcc_unreachable ();
15823 }
15824 break;
15825
15826 default:
15827 gcc_unreachable ();
15828 }
15829
15830 return;
15831 }
15832
15833 if (MEM_P (op1))
15834 {
15835 /* If we're optimizing for size, movups is the smallest. */
15836 if (optimize_insn_for_size_p ()
15837 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15838 {
15839 op0 = gen_lowpart (V4SFmode, op0);
15840 op1 = gen_lowpart (V4SFmode, op1);
15841 emit_insn (gen_sse_movups (op0, op1));
15842 return;
15843 }
15844
15845 /* ??? If we have typed data, then it would appear that using
15846 movdqu is the only way to get unaligned data loaded with
15847 integer type. */
15848 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15849 {
15850 op0 = gen_lowpart (V16QImode, op0);
15851 op1 = gen_lowpart (V16QImode, op1);
15852 emit_insn (gen_sse2_movdqu (op0, op1));
15853 return;
15854 }
15855
15856 if (TARGET_SSE2 && mode == V2DFmode)
15857 {
15858 rtx zero;
15859
15860 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15861 {
15862 op0 = gen_lowpart (V2DFmode, op0);
15863 op1 = gen_lowpart (V2DFmode, op1);
15864 emit_insn (gen_sse2_movupd (op0, op1));
15865 return;
15866 }
15867
15868 /* When SSE registers are split into halves, we can avoid
15869 writing to the top half twice. */
15870 if (TARGET_SSE_SPLIT_REGS)
15871 {
15872 emit_clobber (op0);
15873 zero = op0;
15874 }
15875 else
15876 {
15877 /* ??? Not sure about the best option for the Intel chips.
15878 The following would seem to satisfy; the register is
15879 entirely cleared, breaking the dependency chain. We
15880 then store to the upper half, with a dependency depth
15881 of one. A rumor has it that Intel recommends two movsd
15882 followed by an unpacklpd, but this is unconfirmed. And
15883 given that the dependency depth of the unpacklpd would
15884 still be one, I'm not sure why this would be better. */
15885 zero = CONST0_RTX (V2DFmode);
15886 }
15887
15888 m = adjust_address (op1, DFmode, 0);
15889 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15890 m = adjust_address (op1, DFmode, 8);
15891 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15892 }
15893 else
15894 {
15895 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15896 {
15897 op0 = gen_lowpart (V4SFmode, op0);
15898 op1 = gen_lowpart (V4SFmode, op1);
15899 emit_insn (gen_sse_movups (op0, op1));
15900 return;
15901 }
15902
15903 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15904 emit_move_insn (op0, CONST0_RTX (mode));
15905 else
15906 emit_clobber (op0);
15907
15908 if (mode != V4SFmode)
15909 op0 = gen_lowpart (V4SFmode, op0);
15910 m = adjust_address (op1, V2SFmode, 0);
15911 emit_insn (gen_sse_loadlps (op0, op0, m));
15912 m = adjust_address (op1, V2SFmode, 8);
15913 emit_insn (gen_sse_loadhps (op0, op0, m));
15914 }
15915 }
15916 else if (MEM_P (op0))
15917 {
15918 /* If we're optimizing for size, movups is the smallest. */
15919 if (optimize_insn_for_size_p ()
15920 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15921 {
15922 op0 = gen_lowpart (V4SFmode, op0);
15923 op1 = gen_lowpart (V4SFmode, op1);
15924 emit_insn (gen_sse_movups (op0, op1));
15925 return;
15926 }
15927
15928 /* ??? Similar to above, only less clear because of quote
15929 typeless stores unquote. */
15930 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
15931 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15932 {
15933 op0 = gen_lowpart (V16QImode, op0);
15934 op1 = gen_lowpart (V16QImode, op1);
15935 emit_insn (gen_sse2_movdqu (op0, op1));
15936 return;
15937 }
15938
15939 if (TARGET_SSE2 && mode == V2DFmode)
15940 {
15941 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15942 {
15943 op0 = gen_lowpart (V2DFmode, op0);
15944 op1 = gen_lowpart (V2DFmode, op1);
15945 emit_insn (gen_sse2_movupd (op0, op1));
15946 }
15947 else
15948 {
15949 m = adjust_address (op0, DFmode, 0);
15950 emit_insn (gen_sse2_storelpd (m, op1));
15951 m = adjust_address (op0, DFmode, 8);
15952 emit_insn (gen_sse2_storehpd (m, op1));
15953 }
15954 }
15955 else
15956 {
15957 if (mode != V4SFmode)
15958 op1 = gen_lowpart (V4SFmode, op1);
15959
15960 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15961 {
15962 op0 = gen_lowpart (V4SFmode, op0);
15963 emit_insn (gen_sse_movups (op0, op1));
15964 }
15965 else
15966 {
15967 m = adjust_address (op0, V2SFmode, 0);
15968 emit_insn (gen_sse_storelps (m, op1));
15969 m = adjust_address (op0, V2SFmode, 8);
15970 emit_insn (gen_sse_storehps (m, op1));
15971 }
15972 }
15973 }
15974 else
15975 gcc_unreachable ();
15976 }
15977
15978 /* Expand a push in MODE. This is some mode for which we do not support
15979 proper push instructions, at least from the registers that we expect
15980 the value to live in. */
15981
15982 void
15983 ix86_expand_push (enum machine_mode mode, rtx x)
15984 {
15985 rtx tmp;
15986
15987 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
15988 GEN_INT (-GET_MODE_SIZE (mode)),
15989 stack_pointer_rtx, 1, OPTAB_DIRECT);
15990 if (tmp != stack_pointer_rtx)
15991 emit_move_insn (stack_pointer_rtx, tmp);
15992
15993 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
15994
15995 /* When we push an operand onto the stack, it has to be aligned at least
15996 at the function argument boundary. However, since we don't have
15997 the argument type, we can't determine the actual argument
15998 boundary. */
15999 emit_move_insn (tmp, x);
16000 }
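/* Illustrative expansion (a sketch; the register choice and the exact
   move instruction depend on the mode): pushing a 16-byte value looks
   roughly like

     subq	$16, %rsp
     movaps	%xmm0, (%rsp)

   i.e. the stack pointer is adjusted explicitly and the value is then
   stored through a plain MEM, since no real push exists for such
   modes.  */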
16001
16002 /* Helper function of ix86_fixup_binary_operands to canonicalize
16003 operand order. Returns true if the operands should be swapped. */
16004
16005 static bool
16006 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16007 rtx operands[])
16008 {
16009 rtx dst = operands[0];
16010 rtx src1 = operands[1];
16011 rtx src2 = operands[2];
16012
16013 /* If the operation is not commutative, we can't do anything. */
16014 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16015 return false;
16016
16017 /* Highest priority is that src1 should match dst. */
16018 if (rtx_equal_p (dst, src1))
16019 return false;
16020 if (rtx_equal_p (dst, src2))
16021 return true;
16022
16023 /* Next highest priority is that immediate constants come second. */
16024 if (immediate_operand (src2, mode))
16025 return false;
16026 if (immediate_operand (src1, mode))
16027 return true;
16028
16029 /* Lowest priority is that memory references should come second. */
16030 if (MEM_P (src2))
16031 return false;
16032 if (MEM_P (src1))
16033 return true;
16034
16035 return false;
16036 }
16037
16038
16039 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16040 destination to use for the operation. If different from the true
16041 destination in operands[0], a copy operation will be required. */
16042
16043 rtx
16044 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16045 rtx operands[])
16046 {
16047 rtx dst = operands[0];
16048 rtx src1 = operands[1];
16049 rtx src2 = operands[2];
16050
16051 /* Canonicalize operand order. */
16052 if (ix86_swap_binary_operands_p (code, mode, operands))
16053 {
16054 rtx temp;
16055
16056 /* It is invalid to swap operands of different modes. */
16057 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16058
16059 temp = src1;
16060 src1 = src2;
16061 src2 = temp;
16062 }
16063
16064 /* Both source operands cannot be in memory. */
16065 if (MEM_P (src1) && MEM_P (src2))
16066 {
16067 /* Optimization: Only read from memory once. */
16068 if (rtx_equal_p (src1, src2))
16069 {
16070 src2 = force_reg (mode, src2);
16071 src1 = src2;
16072 }
16073 else
16074 src2 = force_reg (mode, src2);
16075 }
16076
16077 /* If the destination is memory, and we do not have matching source
16078 operands, do things in registers. */
16079 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16080 dst = gen_reg_rtx (mode);
16081
16082 /* Source 1 cannot be a constant. */
16083 if (CONSTANT_P (src1))
16084 src1 = force_reg (mode, src1);
16085
16086 /* Source 1 cannot be a non-matching memory. */
16087 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16088 src1 = force_reg (mode, src1);
16089
16090 /* Improve address combine. */
16091 if (code == PLUS
16092 && GET_MODE_CLASS (mode) == MODE_INT
16093 && MEM_P (src2))
16094 src2 = force_reg (mode, src2);
16095
16096 operands[1] = src1;
16097 operands[2] = src2;
16098 return dst;
16099 }
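/* Small example of the canonicalization above (pseudo-RTL): for a
   commutative PLUS where dst happens to equal src2, the two sources
   are swapped so that src1 matches dst and the two-address form of the
   add pattern is satisfied; if both sources are the same MEM, only one
   load is kept and reused for both inputs.  */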
16100
16101 /* Similarly, but assume that the destination has already been
16102 set up properly. */
16103
16104 void
16105 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16106 enum machine_mode mode, rtx operands[])
16107 {
16108 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16109 gcc_assert (dst == operands[0]);
16110 }
16111
16112 /* Attempt to expand a binary operator. Make the expansion closer to the
16113 actual machine than just general_operand, which would allow 3 separate
16114 memory references (one output, two inputs) in a single insn. */
16115
16116 void
16117 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16118 rtx operands[])
16119 {
16120 rtx src1, src2, dst, op, clob;
16121
16122 dst = ix86_fixup_binary_operands (code, mode, operands);
16123 src1 = operands[1];
16124 src2 = operands[2];
16125
16126 /* Emit the instruction. */
16127
16128 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16129 if (reload_in_progress)
16130 {
16131 /* Reload doesn't know about the flags register, and doesn't know that
16132 it doesn't want to clobber it. We can only do this with PLUS. */
16133 gcc_assert (code == PLUS);
16134 emit_insn (op);
16135 }
16136 else if (reload_completed
16137 && code == PLUS
16138 && !rtx_equal_p (dst, src1))
16139 {
16140 /* This is going to be an LEA; avoid splitting it later. */
16141 emit_insn (op);
16142 }
16143 else
16144 {
16145 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16146 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16147 }
16148
16149 /* Fix up the destination if needed. */
16150 if (dst != operands[0])
16151 emit_move_insn (operands[0], dst);
16152 }
16153
16154 /* Return TRUE or FALSE depending on whether the binary operator meets the
16155 appropriate constraints. */
16156
16157 bool
16158 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16159 rtx operands[3])
16160 {
16161 rtx dst = operands[0];
16162 rtx src1 = operands[1];
16163 rtx src2 = operands[2];
16164
16165 /* Both source operands cannot be in memory. */
16166 if (MEM_P (src1) && MEM_P (src2))
16167 return false;
16168
16169 /* Canonicalize operand order for commutative operators. */
16170 if (ix86_swap_binary_operands_p (code, mode, operands))
16171 {
16172 rtx temp = src1;
16173 src1 = src2;
16174 src2 = temp;
16175 }
16176
16177 /* If the destination is memory, we must have a matching source operand. */
16178 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16179 return false;
16180
16181 /* Source 1 cannot be a constant. */
16182 if (CONSTANT_P (src1))
16183 return false;
16184
16185 /* Source 1 cannot be a non-matching memory. */
16186 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16187 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16188 return (code == AND
16189 && (mode == HImode
16190 || mode == SImode
16191 || (TARGET_64BIT && mode == DImode))
16192 && satisfies_constraint_L (src2));
16193
16194 return true;
16195 }
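/* Note on the final case above: a non-matching memory source is still
   accepted for AND with an 0xff / 0xffff style mask (constraint "L"),
   because such an insn is really a zero-extending load; e.g.
   "andl $0xffff, mem -> reg" can be emitted as "movzwl mem, reg".  */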
16196
16197 /* Attempt to expand a unary operator. Make the expansion closer to the
16198 actual machine than just general_operand, which would allow 2 separate
16199 memory references (one output, one input) in a single insn. */
16200
16201 void
16202 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16203 rtx operands[])
16204 {
16205 int matching_memory;
16206 rtx src, dst, op, clob;
16207
16208 dst = operands[0];
16209 src = operands[1];
16210
16211 /* If the destination is memory, and we do not have matching source
16212 operands, do things in registers. */
16213 matching_memory = 0;
16214 if (MEM_P (dst))
16215 {
16216 if (rtx_equal_p (dst, src))
16217 matching_memory = 1;
16218 else
16219 dst = gen_reg_rtx (mode);
16220 }
16221
16222 /* When source operand is memory, destination must match. */
16223 if (MEM_P (src) && !matching_memory)
16224 src = force_reg (mode, src);
16225
16226 /* Emit the instruction. */
16227
16228 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16229 if (reload_in_progress || code == NOT)
16230 {
16231 /* Reload doesn't know about the flags register, and doesn't know that
16232 it doesn't want to clobber it. */
16233 gcc_assert (code == NOT);
16234 emit_insn (op);
16235 }
16236 else
16237 {
16238 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16239 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16240 }
16241
16242 /* Fix up the destination if needed. */
16243 if (dst != operands[0])
16244 emit_move_insn (operands[0], dst);
16245 }
16246
16247 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16248 divisor are within the range [0-255]. */
16249
16250 void
16251 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16252 bool signed_p)
16253 {
16254 rtx end_label, qimode_label;
16255 rtx insn, div, mod;
16256 rtx scratch, tmp0, tmp1, tmp2;
16257 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16258 rtx (*gen_zero_extend) (rtx, rtx);
16259 rtx (*gen_test_ccno_1) (rtx, rtx);
16260
16261 switch (mode)
16262 {
16263 case SImode:
16264 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16265 gen_test_ccno_1 = gen_testsi_ccno_1;
16266 gen_zero_extend = gen_zero_extendqisi2;
16267 break;
16268 case DImode:
16269 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16270 gen_test_ccno_1 = gen_testdi_ccno_1;
16271 gen_zero_extend = gen_zero_extendqidi2;
16272 break;
16273 default:
16274 gcc_unreachable ();
16275 }
16276
16277 end_label = gen_label_rtx ();
16278 qimode_label = gen_label_rtx ();
16279
16280 scratch = gen_reg_rtx (mode);
16281
16282 /* Use 8bit unsigned divmod if dividend and divisor are within
16283 the range [0-255]. */
16284 emit_move_insn (scratch, operands[2]);
16285 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16286 scratch, 1, OPTAB_DIRECT);
16287 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16288 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16289 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16290 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16291 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16292 pc_rtx);
16293 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16294 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16295 JUMP_LABEL (insn) = qimode_label;
16296
16297 /* Generate original signed/unsigned divmod. */
16298 div = gen_divmod4_1 (operands[0], operands[1],
16299 operands[2], operands[3]);
16300 emit_insn (div);
16301
16302 /* Branch to the end. */
16303 emit_jump_insn (gen_jump (end_label));
16304 emit_barrier ();
16305
16306 /* Generate 8bit unsigned divide. */
16307 emit_label (qimode_label);
16308 /* Don't use operands[0] for result of 8bit divide since not all
16309 registers support QImode ZERO_EXTRACT. */
16310 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16311 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16312 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16313 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16314
16315 if (signed_p)
16316 {
16317 div = gen_rtx_DIV (mode, operands[2], operands[3]);
16318 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
16319 }
16320 else
16321 {
16322 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
16323 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
16324 }
16325
16326 /* Extract remainder from AH. */
16327 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16328 if (REG_P (operands[1]))
16329 insn = emit_move_insn (operands[1], tmp1);
16330 else
16331 {
16332 /* Need a new scratch register since the old one has result
16333 of 8bit divide. */
16334 scratch = gen_reg_rtx (mode);
16335 emit_move_insn (scratch, tmp1);
16336 insn = emit_move_insn (operands[1], scratch);
16337 }
16338 set_unique_reg_note (insn, REG_EQUAL, mod);
16339
16340 /* Zero extend quotient from AL. */
16341 tmp1 = gen_lowpart (QImode, tmp0);
16342 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16343 set_unique_reg_note (insn, REG_EQUAL, div);
16344
16345 emit_label (end_label);
16346 }
16347
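/* Editor's illustrative sketch, not part of GCC: a scalar C model of the
   fast path generated above, for the unsigned case.  When dividend and
   divisor both fit in [0, 255], one 8-bit unsigned divide produces the
   quotient (AL) and the remainder (AH) at once, which is what the RTL
   emitted by ix86_split_idivmod extracts.  The helper name is
   hypothetical.  */

static unsigned int
example_divmod_fast_path (unsigned int a, unsigned int b, unsigned int *remp)
{
  if (((a | b) & ~0xffu) == 0)
    {
      /* Both operands fit in 8 bits: the cheap QImode divide.  */
      *remp = (unsigned char) a % (unsigned char) b;
      return (unsigned char) a / (unsigned char) b;
    }

  /* Otherwise fall back to the full-width divide.  */
  *remp = a % b;
  return a / b;
}
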
16348 #define LEA_MAX_STALL (3)
16349 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16350
16351 /* Increase given DISTANCE in half-cycles according to
16352 dependencies between PREV and NEXT instructions.
16353 Add 1 half-cycle if there is no dependency and
16354 go to the next cycle if there is some dependency. */
16355
16356 static unsigned int
16357 increase_distance (rtx prev, rtx next, unsigned int distance)
16358 {
16359 df_ref *use_rec;
16360 df_ref *def_rec;
16361
16362 if (!prev || !next)
16363 return distance + (distance & 1) + 2;
16364
16365 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16366 return distance + 1;
16367
16368 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16369 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16370 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16371 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16372 return distance + (distance & 1) + 2;
16373
16374 return distance + 1;
16375 }
16376
16377 /* Function checks if instruction INSN defines register number
16378 REGNO1 or REGNO2. */
16379
16380 static bool
16381 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16382 rtx insn)
16383 {
16384 df_ref *def_rec;
16385
16386 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16387 if (DF_REF_REG_DEF_P (*def_rec)
16388 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16389 && (regno1 == DF_REF_REGNO (*def_rec)
16390 || regno2 == DF_REF_REGNO (*def_rec)))
16391 {
16392 return true;
16393 }
16394
16395 return false;
16396 }
16397
16398 /* Function checks if instruction INSN uses register number
16399 REGNO as a part of address expression. */
16400
16401 static bool
16402 insn_uses_reg_mem (unsigned int regno, rtx insn)
16403 {
16404 df_ref *use_rec;
16405
16406 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16407 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16408 return true;
16409
16410 return false;
16411 }
16412
16413 /* Search backward for non-agu definition of register number REGNO1
16414 or register number REGNO2 in basic block starting from instruction
16415 START up to head of basic block or instruction INSN.
16416
16417 Function puts a true value into *FOUND if a definition was found
16418 and false otherwise.
16419
16420 Distance in half-cycles between START and found instruction or head
16421 of BB is added to DISTANCE and returned. */
16422
16423 static int
16424 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16425 rtx insn, int distance,
16426 rtx start, bool *found)
16427 {
16428 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16429 rtx prev = start;
16430 rtx next = NULL;
16431
16432 *found = false;
16433
16434 while (prev
16435 && prev != insn
16436 && distance < LEA_SEARCH_THRESHOLD)
16437 {
16438 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16439 {
16440 distance = increase_distance (prev, next, distance);
16441 if (insn_defines_reg (regno1, regno2, prev))
16442 {
16443 if (recog_memoized (prev) < 0
16444 || get_attr_type (prev) != TYPE_LEA)
16445 {
16446 *found = true;
16447 return distance;
16448 }
16449 }
16450
16451 next = prev;
16452 }
16453 if (prev == BB_HEAD (bb))
16454 break;
16455
16456 prev = PREV_INSN (prev);
16457 }
16458
16459 return distance;
16460 }
16461
16462 /* Search backward for non-agu definition of register number REGNO1
16463 or register number REGNO2 in INSN's basic block until
16464 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16465 2. Reach neighbour BBs boundary, or
16466 3. Reach agu definition.
16467 Returns the distance between the non-agu definition point and INSN.
16468 If no definition point, returns -1. */
16469
16470 static int
16471 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16472 rtx insn)
16473 {
16474 basic_block bb = BLOCK_FOR_INSN (insn);
16475 int distance = 0;
16476 bool found = false;
16477
16478 if (insn != BB_HEAD (bb))
16479 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16480 distance, PREV_INSN (insn),
16481 &found);
16482
16483 if (!found && distance < LEA_SEARCH_THRESHOLD)
16484 {
16485 edge e;
16486 edge_iterator ei;
16487 bool simple_loop = false;
16488
16489 FOR_EACH_EDGE (e, ei, bb->preds)
16490 if (e->src == bb)
16491 {
16492 simple_loop = true;
16493 break;
16494 }
16495
16496 if (simple_loop)
16497 distance = distance_non_agu_define_in_bb (regno1, regno2,
16498 insn, distance,
16499 BB_END (bb), &found);
16500 else
16501 {
16502 int shortest_dist = -1;
16503 bool found_in_bb = false;
16504
16505 FOR_EACH_EDGE (e, ei, bb->preds)
16506 {
16507 int bb_dist
16508 = distance_non_agu_define_in_bb (regno1, regno2,
16509 insn, distance,
16510 BB_END (e->src),
16511 &found_in_bb);
16512 if (found_in_bb)
16513 {
16514 if (shortest_dist < 0)
16515 shortest_dist = bb_dist;
16516 else if (bb_dist > 0)
16517 shortest_dist = MIN (bb_dist, shortest_dist);
16518
16519 found = true;
16520 }
16521 }
16522
16523 distance = shortest_dist;
16524 }
16525 }
16526
16527 /* get_attr_type may modify recog data. We want to make sure
16528 that recog data is valid for instruction INSN, on which
16529 distance_non_agu_define is called. INSN is unchanged here. */
16530 extract_insn_cached (insn);
16531
16532 if (!found)
16533 return -1;
16534
16535 return distance >> 1;
16536 }
16537
16538 /* Return the distance in half-cycles between INSN and the next
16539 insn that uses register number REGNO in a memory address, added
16540 to DISTANCE. Return -1 if REGNO is set.
16541
16542 Put true value into *FOUND if register usage was found and
16543 false otherwise.
16544 Put true value into *REDEFINED if register redefinition was
16545 found and false otherwise. */
16546
16547 static int
16548 distance_agu_use_in_bb (unsigned int regno,
16549 rtx insn, int distance, rtx start,
16550 bool *found, bool *redefined)
16551 {
16552 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16553 rtx next = start;
16554 rtx prev = NULL;
16555
16556 *found = false;
16557 *redefined = false;
16558
16559 while (next
16560 && next != insn
16561 && distance < LEA_SEARCH_THRESHOLD)
16562 {
16563 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16564 {
16565 distance = increase_distance (prev, next, distance);
16566 if (insn_uses_reg_mem (regno, next))
16567 {
16568 /* Return DISTANCE if OP0 is used in memory
16569 address in NEXT. */
16570 *found = true;
16571 return distance;
16572 }
16573
16574 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16575 {
16576 /* Return -1 if OP0 is set in NEXT. */
16577 *redefined = true;
16578 return -1;
16579 }
16580
16581 prev = next;
16582 }
16583
16584 if (next == BB_END (bb))
16585 break;
16586
16587 next = NEXT_INSN (next);
16588 }
16589
16590 return distance;
16591 }
16592
16593 /* Return the distance between INSN and the next insn that uses
16594 register number REGNO0 in a memory address. Return -1 if no such
16595 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
16596
16597 static int
16598 distance_agu_use (unsigned int regno0, rtx insn)
16599 {
16600 basic_block bb = BLOCK_FOR_INSN (insn);
16601 int distance = 0;
16602 bool found = false;
16603 bool redefined = false;
16604
16605 if (insn != BB_END (bb))
16606 distance = distance_agu_use_in_bb (regno0, insn, distance,
16607 NEXT_INSN (insn),
16608 &found, &redefined);
16609
16610 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16611 {
16612 edge e;
16613 edge_iterator ei;
16614 bool simple_loop = false;
16615
16616 FOR_EACH_EDGE (e, ei, bb->succs)
16617 if (e->dest == bb)
16618 {
16619 simple_loop = true;
16620 break;
16621 }
16622
16623 if (simple_loop)
16624 distance = distance_agu_use_in_bb (regno0, insn,
16625 distance, BB_HEAD (bb),
16626 &found, &redefined);
16627 else
16628 {
16629 int shortest_dist = -1;
16630 bool found_in_bb = false;
16631 bool redefined_in_bb = false;
16632
16633 FOR_EACH_EDGE (e, ei, bb->succs)
16634 {
16635 int bb_dist
16636 = distance_agu_use_in_bb (regno0, insn,
16637 distance, BB_HEAD (e->dest),
16638 &found_in_bb, &redefined_in_bb);
16639 if (found_in_bb)
16640 {
16641 if (shortest_dist < 0)
16642 shortest_dist = bb_dist;
16643 else if (bb_dist > 0)
16644 shortest_dist = MIN (bb_dist, shortest_dist);
16645
16646 found = true;
16647 }
16648 }
16649
16650 distance = shortest_dist;
16651 }
16652 }
16653
16654 if (!found || redefined)
16655 return -1;
16656
16657 return distance >> 1;
16658 }
16659
16660 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16661 there is a dilemma of choosing LEA or ADD.
16662 Negative value: ADD is preferred over LEA.
16663 Zero: Neutral.
16664 Positive value: LEA is preferred over ADD. */
16665 #define IX86_LEA_PRIORITY 0
16666
16667 /* Return true if usage of lea INSN has a performance advantage
16668 over a sequence of instructions. The instruction sequence has
16669 SPLIT_COST cycles higher latency than the lea latency. */
16670
16671 bool
16672 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16673 unsigned int regno2, unsigned int split_cost)
16674 {
16675 int dist_define, dist_use;
16676
16677 dist_define = distance_non_agu_define (regno1, regno2, insn);
16678 dist_use = distance_agu_use (regno0, insn);
16679
16680 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16681 {
16682 /* If there is no non-AGU operand definition, no AGU
16683 operand usage and the split cost is 0, then both the lea
16684 and non-lea variants have the same priority. Currently
16685 we prefer lea for 64-bit code and non-lea for 32-bit
16686 code. */
16687 if (dist_use < 0 && split_cost == 0)
16688 return TARGET_64BIT || IX86_LEA_PRIORITY;
16689 else
16690 return true;
16691 }
16692
16693 /* With a longer definition distance, lea is preferable.
16694 Here we adjust it to take into account the splitting cost and
16695 lea priority. */
16696 dist_define += split_cost + IX86_LEA_PRIORITY;
16697
16698 /* If there is no use in a memory address then we just check
16699 that the split cost does not exceed the AGU stall. */
16700 if (dist_use < 0)
16701 return dist_define >= LEA_MAX_STALL;
16702
16703 /* If this insn has both backward non-agu dependence and forward
16704 agu dependence, the one with the shorter distance takes effect. */
16705 return dist_define >= dist_use;
16706 }
16707
16708 /* Return true if it is legal to clobber flags by INSN and
16709 false otherwise. */
16710
16711 static bool
16712 ix86_ok_to_clobber_flags (rtx insn)
16713 {
16714 basic_block bb = BLOCK_FOR_INSN (insn);
16715 df_ref *use;
16716 bitmap live;
16717
16718 while (insn)
16719 {
16720 if (NONDEBUG_INSN_P (insn))
16721 {
16722 for (use = DF_INSN_USES (insn); *use; use++)
16723 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16724 return false;
16725
16726 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16727 return true;
16728 }
16729
16730 if (insn == BB_END (bb))
16731 break;
16732
16733 insn = NEXT_INSN (insn);
16734 }
16735
16736 live = df_get_live_out (bb);
16737 return !REGNO_REG_SET_P (live, FLAGS_REG);
16738 }
16739
16740 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16741 move and add to avoid AGU stalls. */
16742
16743 bool
16744 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16745 {
16746 unsigned int regno0 = true_regnum (operands[0]);
16747 unsigned int regno1 = true_regnum (operands[1]);
16748 unsigned int regno2 = true_regnum (operands[2]);
16749
16750 /* Check if we need to optimize. */
16751 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16752 return false;
16753
16754 /* Check it is correct to split here. */
16755 if (!ix86_ok_to_clobber_flags (insn))
16756 return false;
16757
16758 /* We need to split only adds with a non-destructive
16759 destination operand. */
16760 if (regno0 == regno1 || regno0 == regno2)
16761 return false;
16762 else
16763 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16764 }
16765
16766 /* Return true if we should emit lea instruction instead of mov
16767 instruction. */
16768
16769 bool
16770 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16771 {
16772 unsigned int regno0;
16773 unsigned int regno1;
16774
16775 /* Check if we need to optimize. */
16776 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16777 return false;
16778
16779 /* Use lea for reg to reg moves only. */
16780 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16781 return false;
16782
16783 regno0 = true_regnum (operands[0]);
16784 regno1 = true_regnum (operands[1]);
16785
16786 return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
16787 }
16788
16789 /* Return true if we need to split lea into a sequence of
16790 instructions to avoid AGU stalls. */
16791
16792 bool
16793 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16794 {
16795 unsigned int regno0 = true_regnum (operands[0]);
16796 unsigned int regno1 = -1;
16797 unsigned int regno2 = -1;
16798 unsigned int split_cost = 0;
16799 struct ix86_address parts;
16800 int ok;
16801
16802 /* Check we need to optimize. */
16803 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16804 return false;
16805
16806 /* Check it is correct to split here. */
16807 if (!ix86_ok_to_clobber_flags (insn))
16808 return false;
16809
16810 ok = ix86_decompose_address (operands[1], &parts);
16811 gcc_assert (ok);
16812
16813 /* We should not split into add if a non-legitimate pic
16814 operand is used as the displacement. */
16815 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16816 return false;
16817
16818 if (parts.base)
16819 regno1 = true_regnum (parts.base);
16820 if (parts.index)
16821 regno2 = true_regnum (parts.index);
16822
16823 /* Compute how many cycles we will add to execution time
16824 if we split the lea into a sequence of instructions. */
16825 if (parts.base || parts.index)
16826 {
16827 /* Have to use a mov instruction if the non-destructive
16828 destination form is used. */
16829 if (regno1 != regno0 && regno2 != regno0)
16830 split_cost += 1;
16831
16832 /* Have to add index to base if both exist. */
16833 if (parts.base && parts.index)
16834 split_cost += 1;
16835
16836 /* Have to use shift and adds if scale is 2 or greater. */
16837 if (parts.scale > 1)
16838 {
16839 if (regno0 != regno1)
16840 split_cost += 1;
16841 else if (regno2 == regno0)
16842 split_cost += 4;
16843 else
16844 split_cost += parts.scale;
16845 }
16846
16847 /* Have to use an add instruction with an immediate if
16848 disp is nonzero. */
16849 if (parts.disp && parts.disp != const0_rtx)
16850 split_cost += 1;
16851
16852 /* Subtract the price of lea. */
16853 split_cost -= 1;
16854 }
16855
16856 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16857 }
16858
16859 /* Emit x86 binary operator CODE in mode MODE, where the first operand
16860 matches the destination. The RTX includes a clobber of FLAGS_REG. */
16861
16862 static void
16863 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16864 rtx dst, rtx src)
16865 {
16866 rtx op, clob;
16867
16868 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16869 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16870
16871 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16872 }
16873
16874 /* Split lea instructions into a sequence of instructions
16875 which are executed on the ALU to avoid AGU stalls.
16876 It is assumed that it is allowed to clobber the flags register
16877 at the lea position. */
16878
16879 void
16880 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16881 {
16882 unsigned int regno0 = true_regnum (operands[0]);
16883 unsigned int regno1 = INVALID_REGNUM;
16884 unsigned int regno2 = INVALID_REGNUM;
16885 struct ix86_address parts;
16886 rtx tmp;
16887 int ok, adds;
16888
16889 ok = ix86_decompose_address (operands[1], &parts);
16890 gcc_assert (ok);
16891
16892 if (parts.base)
16893 {
16894 if (GET_MODE (parts.base) != mode)
16895 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16896 regno1 = true_regnum (parts.base);
16897 }
16898
16899 if (parts.index)
16900 {
16901 if (GET_MODE (parts.index) != mode)
16902 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16903 regno2 = true_regnum (parts.index);
16904 }
16905
16906 if (parts.scale > 1)
16907 {
16908 /* Case r1 = r1 + ... */
16909 if (regno1 == regno0)
16910 {
16911 /* If we have a case r1 = r1 + C * r1 then we
16912 should use multiplication, which is very
16913 expensive. Assume the cost model is wrong if we
16914 have such a case here. */
16915 gcc_assert (regno2 != regno0);
16916
16917 for (adds = parts.scale; adds > 0; adds--)
16918 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16919 }
16920 else
16921 {
16922 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
16923 if (regno0 != regno2)
16924 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16925
16926 /* Use shift for scaling. */
16927 ix86_emit_binop (ASHIFT, mode, operands[0],
16928 GEN_INT (exact_log2 (parts.scale)));
16929
16930 if (parts.base)
16931 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
16932
16933 if (parts.disp && parts.disp != const0_rtx)
16934 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16935 }
16936 }
16937 else if (!parts.base && !parts.index)
16938 {
16939 gcc_assert (parts.disp);
16940 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
16941 }
16942 else
16943 {
16944 if (!parts.base)
16945 {
16946 if (regno0 != regno2)
16947 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16948 }
16949 else if (!parts.index)
16950 {
16951 if (regno0 != regno1)
16952 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16953 }
16954 else
16955 {
16956 if (regno0 == regno1)
16957 tmp = parts.index;
16958 else if (regno0 == regno2)
16959 tmp = parts.base;
16960 else
16961 {
16962 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16963 tmp = parts.index;
16964 }
16965
16966 ix86_emit_binop (PLUS, mode, operands[0], tmp);
16967 }
16968
16969 if (parts.disp && parts.disp != const0_rtx)
16970 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16971 }
16972 }
16973
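/* Editor's illustrative sketch, not part of GCC: the ALU sequence that the
   splitter above produces for the general base + index * 2^k + disp case,
   written as plain C arithmetic.  The helper name and the use of
   'unsigned long' for a register-sized value are assumptions.  */

static unsigned long
example_lea_as_alu (unsigned long base, unsigned long index,
                    int log2_scale, long disp)
{
  unsigned long r = index;   /* mov  r, index      */
  r <<= log2_scale;          /* shl  r, log2(scale) */
  r += base;                 /* add  r, base       */
  r += disp;                 /* add  r, disp       */
  return r;                  /* replaces: lea r, [base + index * scale + disp] */
}
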
16974 /* Return true if it is ok to optimize an ADD operation to an LEA
16975 operation to avoid flag register consumption. For most processors,
16976 ADD is faster than LEA. For processors like Atom, if the
16977 destination register of LEA holds an actual address which will be
16978 used soon, LEA is better, otherwise ADD is better. */
16979
16980 bool
16981 ix86_lea_for_add_ok (rtx insn, rtx operands[])
16982 {
16983 unsigned int regno0 = true_regnum (operands[0]);
16984 unsigned int regno1 = true_regnum (operands[1]);
16985 unsigned int regno2 = true_regnum (operands[2]);
16986
16987 /* If a = b + c, (a!=b && a!=c), must use lea form. */
16988 if (regno0 != regno1 && regno0 != regno2)
16989 return true;
16990
16991 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16992 return false;
16993
16994 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
16995 }
16996
16997 /* Return true if destination reg of SET_BODY is shift count of
16998 USE_BODY. */
16999
17000 static bool
17001 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17002 {
17003 rtx set_dest;
17004 rtx shift_rtx;
17005 int i;
17006
17007 /* Retrieve destination of SET_BODY. */
17008 switch (GET_CODE (set_body))
17009 {
17010 case SET:
17011 set_dest = SET_DEST (set_body);
17012 if (!set_dest || !REG_P (set_dest))
17013 return false;
17014 break;
17015 case PARALLEL:
17016 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17017 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17018 use_body))
17019 return true;
17020 default:
17021 return false;
17022 break;
17023 }
17024
17025 /* Retrieve shift count of USE_BODY. */
17026 switch (GET_CODE (use_body))
17027 {
17028 case SET:
17029 shift_rtx = XEXP (use_body, 1);
17030 break;
17031 case PARALLEL:
17032 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17033 if (ix86_dep_by_shift_count_body (set_body,
17034 XVECEXP (use_body, 0, i)))
17035 return true;
17036 default:
17037 return false;
17038 break;
17039 }
17040
17041 if (shift_rtx
17042 && (GET_CODE (shift_rtx) == ASHIFT
17043 || GET_CODE (shift_rtx) == LSHIFTRT
17044 || GET_CODE (shift_rtx) == ASHIFTRT
17045 || GET_CODE (shift_rtx) == ROTATE
17046 || GET_CODE (shift_rtx) == ROTATERT))
17047 {
17048 rtx shift_count = XEXP (shift_rtx, 1);
17049
17050 /* Return true if shift count is dest of SET_BODY. */
17051 if (REG_P (shift_count)
17052 && true_regnum (set_dest) == true_regnum (shift_count))
17053 return true;
17054 }
17055
17056 return false;
17057 }
17058
17059 /* Return true if destination reg of SET_INSN is shift count of
17060 USE_INSN. */
17061
17062 bool
17063 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17064 {
17065 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17066 PATTERN (use_insn));
17067 }
17068
17069 /* Return TRUE or FALSE depending on whether the unary operator meets the
17070 appropriate constraints. */
17071
17072 bool
17073 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17074 enum machine_mode mode ATTRIBUTE_UNUSED,
17075 rtx operands[2] ATTRIBUTE_UNUSED)
17076 {
17077 /* If one of operands is memory, source and destination must match. */
17078 if ((MEM_P (operands[0])
17079 || MEM_P (operands[1]))
17080 && ! rtx_equal_p (operands[0], operands[1]))
17081 return false;
17082 return true;
17083 }
17084
17085 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17086 are ok, keeping in mind the possible movddup alternative. */
17087
17088 bool
17089 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17090 {
17091 if (MEM_P (operands[0]))
17092 return rtx_equal_p (operands[0], operands[1 + high]);
17093 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17094 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17095 return true;
17096 }
17097
17098 /* Post-reload splitter for converting an SF or DFmode value in an
17099 SSE register into an unsigned SImode value. */
17100
17101 void
17102 ix86_split_convert_uns_si_sse (rtx operands[])
17103 {
17104 enum machine_mode vecmode;
17105 rtx value, large, zero_or_two31, input, two31, x;
17106
17107 large = operands[1];
17108 zero_or_two31 = operands[2];
17109 input = operands[3];
17110 two31 = operands[4];
17111 vecmode = GET_MODE (large);
17112 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17113
17114 /* Load up the value into the low element. We must ensure that the other
17115 elements are valid floats -- zero is the easiest such value. */
17116 if (MEM_P (input))
17117 {
17118 if (vecmode == V4SFmode)
17119 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17120 else
17121 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17122 }
17123 else
17124 {
17125 input = gen_rtx_REG (vecmode, REGNO (input));
17126 emit_move_insn (value, CONST0_RTX (vecmode));
17127 if (vecmode == V4SFmode)
17128 emit_insn (gen_sse_movss (value, value, input));
17129 else
17130 emit_insn (gen_sse2_movsd (value, value, input));
17131 }
17132
17133 emit_move_insn (large, two31);
17134 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17135
17136 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17137 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17138
17139 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17140 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17141
17142 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17143 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17144
17145 large = gen_rtx_REG (V4SImode, REGNO (large));
17146 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17147
17148 x = gen_rtx_REG (V4SImode, REGNO (value));
17149 if (vecmode == V4SFmode)
17150 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17151 else
17152 emit_insn (gen_sse2_cvttpd2dq (x, value));
17153 value = x;
17154
17155 emit_insn (gen_xorv4si3 (value, value, large));
17156 }
17157
17158 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17159 Expects the 64-bit DImode to be supplied in a pair of integral
17160 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17161 -mfpmath=sse, !optimize_size only. */
17162
17163 void
17164 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17165 {
17166 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17167 rtx int_xmm, fp_xmm;
17168 rtx biases, exponents;
17169 rtx x;
17170
17171 int_xmm = gen_reg_rtx (V4SImode);
17172 if (TARGET_INTER_UNIT_MOVES)
17173 emit_insn (gen_movdi_to_sse (int_xmm, input));
17174 else if (TARGET_SSE_SPLIT_REGS)
17175 {
17176 emit_clobber (int_xmm);
17177 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17178 }
17179 else
17180 {
17181 x = gen_reg_rtx (V2DImode);
17182 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17183 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17184 }
17185
17186 x = gen_rtx_CONST_VECTOR (V4SImode,
17187 gen_rtvec (4, GEN_INT (0x43300000UL),
17188 GEN_INT (0x45300000UL),
17189 const0_rtx, const0_rtx));
17190 exponents = validize_mem (force_const_mem (V4SImode, x));
17191
17192 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17193 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17194
17195 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17196 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17197 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17198 (0x1.0p84 + double(fp_value_hi_xmm)).
17199 Note these exponents differ by 32. */
17200
17201 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17202
17203 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17204 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17205 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17206 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17207 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17208 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17209 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17210 biases = validize_mem (force_const_mem (V2DFmode, biases));
17211 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17212
17213 /* Add the upper and lower DFmode values together. */
17214 if (TARGET_SSE3)
17215 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17216 else
17217 {
17218 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17219 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17220 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17221 }
17222
17223 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17224 }
17225
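/* Editor's illustrative sketch, not part of GCC: a scalar model of the
   exponent-bias trick used above, assuming a little-endian host, IEEE
   doubles and a 64-bit unsigned long long.  The helper name is
   hypothetical.  */

static double
example_uns64_to_double (unsigned long long x)
{
  union { unsigned long long i; double d; } lo, hi;

  /* 0x43300000 ## low32 reads as the double 0x1.0p52 + low32.  */
  lo.i = 0x4330000000000000ULL | (x & 0xffffffffULL);
  /* 0x45300000 ## high32 reads as the double 0x1.0p84 + high32 * 0x1.0p32.  */
  hi.i = 0x4530000000000000ULL | (x >> 32);

  /* Subtract the two biases and add the halves, as the subv2df3 and
     haddv2df3 (or interleave/add) sequence above does.  */
  return (hi.d - 0x1.0p84) + (lo.d - 0x1.0p52);
}
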
17226 /* Not used, but eases macroization of patterns. */
17227 void
17228 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17229 rtx input ATTRIBUTE_UNUSED)
17230 {
17231 gcc_unreachable ();
17232 }
17233
17234 /* Convert an unsigned SImode value into a DFmode. Only currently used
17235 for SSE, but applicable anywhere. */
17236
17237 void
17238 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17239 {
17240 REAL_VALUE_TYPE TWO31r;
17241 rtx x, fp;
17242
17243 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17244 NULL, 1, OPTAB_DIRECT);
17245
17246 fp = gen_reg_rtx (DFmode);
17247 emit_insn (gen_floatsidf2 (fp, x));
17248
17249 real_ldexp (&TWO31r, &dconst1, 31);
17250 x = const_double_from_real_value (TWO31r, DFmode);
17251
17252 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17253 if (x != target)
17254 emit_move_insn (target, x);
17255 }
17256
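/* Editor's illustrative sketch, not part of GCC: scalar equivalent of the
   expansion above, assuming two's-complement wraparound on the
   unsigned-to-signed conversion.  The helper name is hypothetical.  */

static double
example_uns32_to_double (unsigned int x)
{
  /* Adding -2^31 flips the sign bit, giving a signed value in
     [INT_MIN, INT_MAX]; a signed int-to-double conversion plus 0x1.0p31
     then restores the original unsigned value exactly.  */
  int biased = (int) (x + 0x80000000u);
  return (double) biased + 0x1.0p31;
}
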
17257 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17258 32-bit mode; otherwise we have a direct convert instruction. */
17259
17260 void
17261 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17262 {
17263 REAL_VALUE_TYPE TWO32r;
17264 rtx fp_lo, fp_hi, x;
17265
17266 fp_lo = gen_reg_rtx (DFmode);
17267 fp_hi = gen_reg_rtx (DFmode);
17268
17269 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17270
17271 real_ldexp (&TWO32r, &dconst1, 32);
17272 x = const_double_from_real_value (TWO32r, DFmode);
17273 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17274
17275 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17276
17277 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17278 0, OPTAB_DIRECT);
17279 if (x != target)
17280 emit_move_insn (target, x);
17281 }
17282
17283 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17284 For x86_32, -mfpmath=sse, !optimize_size only. */
17285 void
17286 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17287 {
17288 REAL_VALUE_TYPE ONE16r;
17289 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17290
17291 real_ldexp (&ONE16r, &dconst1, 16);
17292 x = const_double_from_real_value (ONE16r, SFmode);
17293 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17294 NULL, 0, OPTAB_DIRECT);
17295 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17296 NULL, 0, OPTAB_DIRECT);
17297 fp_hi = gen_reg_rtx (SFmode);
17298 fp_lo = gen_reg_rtx (SFmode);
17299 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17300 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17301 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17302 0, OPTAB_DIRECT);
17303 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17304 0, OPTAB_DIRECT);
17305 if (!rtx_equal_p (target, fp_hi))
17306 emit_move_insn (target, fp_hi);
17307 }
17308
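/* Editor's illustrative sketch, not part of GCC: scalar form of the
   16-bit split used above.  Each half converts to float exactly, and
   scaling the high half by 2^16 before the final add reproduces the
   correctly rounded result.  The helper name is hypothetical.  */

static float
example_uns32_to_float (unsigned int x)
{
  float lo = (float) (x & 0xffff);   /* exact */
  float hi = (float) (x >> 16);      /* exact */
  return hi * 0x1.0p16f + lo;        /* one rounding, as in the expansion */
}
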
17309 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17310 a vector of unsigned ints VAL to a vector of floats TARGET. */
17311
17312 void
17313 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17314 {
17315 rtx tmp[8];
17316 REAL_VALUE_TYPE TWO16r;
17317 enum machine_mode intmode = GET_MODE (val);
17318 enum machine_mode fltmode = GET_MODE (target);
17319 rtx (*cvt) (rtx, rtx);
17320
17321 if (intmode == V4SImode)
17322 cvt = gen_floatv4siv4sf2;
17323 else
17324 cvt = gen_floatv8siv8sf2;
17325 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17326 tmp[0] = force_reg (intmode, tmp[0]);
17327 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17328 OPTAB_DIRECT);
17329 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17330 NULL_RTX, 1, OPTAB_DIRECT);
17331 tmp[3] = gen_reg_rtx (fltmode);
17332 emit_insn (cvt (tmp[3], tmp[1]));
17333 tmp[4] = gen_reg_rtx (fltmode);
17334 emit_insn (cvt (tmp[4], tmp[2]));
17335 real_ldexp (&TWO16r, &dconst1, 16);
17336 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17337 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17338 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17339 OPTAB_DIRECT);
17340 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17341 OPTAB_DIRECT);
17342 if (tmp[7] != target)
17343 emit_move_insn (target, tmp[7]);
17344 }
17345
17346 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17347 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17348 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17349 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17350
17351 rtx
17352 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17353 {
17354 REAL_VALUE_TYPE TWO31r;
17355 rtx two31r, tmp[4];
17356 enum machine_mode mode = GET_MODE (val);
17357 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17358 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17359 rtx (*cmp) (rtx, rtx, rtx, rtx);
17360 int i;
17361
17362 for (i = 0; i < 3; i++)
17363 tmp[i] = gen_reg_rtx (mode);
17364 real_ldexp (&TWO31r, &dconst1, 31);
17365 two31r = const_double_from_real_value (TWO31r, scalarmode);
17366 two31r = ix86_build_const_vector (mode, 1, two31r);
17367 two31r = force_reg (mode, two31r);
17368 switch (mode)
17369 {
17370 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17371 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17372 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17373 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17374 default: gcc_unreachable ();
17375 }
17376 tmp[3] = gen_rtx_LE (mode, two31r, val);
17377 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17378 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17379 0, OPTAB_DIRECT);
17380 if (intmode == V4SImode || TARGET_AVX2)
17381 *xorp = expand_simple_binop (intmode, ASHIFT,
17382 gen_lowpart (intmode, tmp[0]),
17383 GEN_INT (31), NULL_RTX, 0,
17384 OPTAB_DIRECT);
17385 else
17386 {
17387 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17388 two31 = ix86_build_const_vector (intmode, 1, two31);
17389 *xorp = expand_simple_binop (intmode, AND,
17390 gen_lowpart (intmode, tmp[0]),
17391 two31, NULL_RTX, 0,
17392 OPTAB_DIRECT);
17393 }
17394 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17395 0, OPTAB_DIRECT);
17396 }
17397
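/* Editor's illustrative sketch, not part of GCC: scalar model of the
   adjustment above combined with the later signed truncation and XOR,
   assuming the input is in [0, 0x1.0p32).  The helper name is
   hypothetical.  */

static unsigned int
example_double_to_uns32 (double val)
{
  unsigned int xorv = 0;

  if (val >= 0x1.0p31)
    {
      /* Too large for a signed truncation: subtract 0x1p31 now and put
         the top bit back with XOR after converting.  */
      val -= 0x1.0p31;
      xorv = 0x80000000u;
    }

  return (unsigned int) (int) val ^ xorv;
}
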
17398 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17399 then replicate the value for all elements of the vector
17400 register. */
17401
17402 rtx
17403 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17404 {
17405 int i, n_elt;
17406 rtvec v;
17407 enum machine_mode scalar_mode;
17408
17409 switch (mode)
17410 {
17411 case V32QImode:
17412 case V16QImode:
17413 case V16HImode:
17414 case V8HImode:
17415 case V8SImode:
17416 case V4SImode:
17417 case V4DImode:
17418 case V2DImode:
17419 gcc_assert (vect);
17420 case V8SFmode:
17421 case V4SFmode:
17422 case V4DFmode:
17423 case V2DFmode:
17424 n_elt = GET_MODE_NUNITS (mode);
17425 v = rtvec_alloc (n_elt);
17426 scalar_mode = GET_MODE_INNER (mode);
17427
17428 RTVEC_ELT (v, 0) = value;
17429
17430 for (i = 1; i < n_elt; ++i)
17431 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17432
17433 return gen_rtx_CONST_VECTOR (mode, v);
17434
17435 default:
17436 gcc_unreachable ();
17437 }
17438 }
17439
17440 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17441 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17442 for an SSE register. If VECT is true, then replicate the mask for
17443 all elements of the vector register. If INVERT is true, then create
17444 a mask excluding the sign bit. */
17445
17446 rtx
17447 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17448 {
17449 enum machine_mode vec_mode, imode;
17450 HOST_WIDE_INT hi, lo;
17451 int shift = 63;
17452 rtx v;
17453 rtx mask;
17454
17455 /* Find the sign bit, sign extended to 2*HWI. */
17456 switch (mode)
17457 {
17458 case V8SImode:
17459 case V4SImode:
17460 case V8SFmode:
17461 case V4SFmode:
17462 vec_mode = mode;
17463 mode = GET_MODE_INNER (mode);
17464 imode = SImode;
17465 lo = 0x80000000, hi = lo < 0;
17466 break;
17467
17468 case V4DImode:
17469 case V2DImode:
17470 case V4DFmode:
17471 case V2DFmode:
17472 vec_mode = mode;
17473 mode = GET_MODE_INNER (mode);
17474 imode = DImode;
17475 if (HOST_BITS_PER_WIDE_INT >= 64)
17476 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17477 else
17478 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17479 break;
17480
17481 case TImode:
17482 case TFmode:
17483 vec_mode = VOIDmode;
17484 if (HOST_BITS_PER_WIDE_INT >= 64)
17485 {
17486 imode = TImode;
17487 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17488 }
17489 else
17490 {
17491 rtvec vec;
17492
17493 imode = DImode;
17494 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17495
17496 if (invert)
17497 {
17498 lo = ~lo, hi = ~hi;
17499 v = constm1_rtx;
17500 }
17501 else
17502 v = const0_rtx;
17503
17504 mask = immed_double_const (lo, hi, imode);
17505
17506 vec = gen_rtvec (2, v, mask);
17507 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17508 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17509
17510 return v;
17511 }
17512 break;
17513
17514 default:
17515 gcc_unreachable ();
17516 }
17517
17518 if (invert)
17519 lo = ~lo, hi = ~hi;
17520
17521 /* Force this value into the low part of a fp vector constant. */
17522 mask = immed_double_const (lo, hi, imode);
17523 mask = gen_lowpart (mode, mask);
17524
17525 if (vec_mode == VOIDmode)
17526 return force_reg (mode, mask);
17527
17528 v = ix86_build_const_vector (vec_mode, vect, mask);
17529 return force_reg (vec_mode, v);
17530 }
17531
17532 /* Generate code for floating point ABS or NEG. */
17533
17534 void
17535 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17536 rtx operands[])
17537 {
17538 rtx mask, set, dst, src;
17539 bool use_sse = false;
17540 bool vector_mode = VECTOR_MODE_P (mode);
17541 enum machine_mode vmode = mode;
17542
17543 if (vector_mode)
17544 use_sse = true;
17545 else if (mode == TFmode)
17546 use_sse = true;
17547 else if (TARGET_SSE_MATH)
17548 {
17549 use_sse = SSE_FLOAT_MODE_P (mode);
17550 if (mode == SFmode)
17551 vmode = V4SFmode;
17552 else if (mode == DFmode)
17553 vmode = V2DFmode;
17554 }
17555
17556 /* NEG and ABS performed with SSE use bitwise mask operations.
17557 Create the appropriate mask now. */
17558 if (use_sse)
17559 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17560 else
17561 mask = NULL_RTX;
17562
17563 dst = operands[0];
17564 src = operands[1];
17565
17566 set = gen_rtx_fmt_e (code, mode, src);
17567 set = gen_rtx_SET (VOIDmode, dst, set);
17568
17569 if (mask)
17570 {
17571 rtx use, clob;
17572 rtvec par;
17573
17574 use = gen_rtx_USE (VOIDmode, mask);
17575 if (vector_mode)
17576 par = gen_rtvec (2, set, use);
17577 else
17578 {
17579 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17580 par = gen_rtvec (3, set, use, clob);
17581 }
17582 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17583 }
17584 else
17585 emit_insn (set);
17586 }
17587
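/* Editor's illustrative sketch, not part of GCC: what the mask-based SSE
   ABS and NEG emitted above compute, in scalar form, assuming IEEE
   single precision and a 32-bit unsigned int.  The helper name is
   hypothetical.  */

static float
example_sse_absneg (float x, int do_abs)
{
  union { float f; unsigned int i; } u;

  u.f = x;
  if (do_abs)
    u.i &= 0x7fffffffu;    /* AND with the inverted sign-bit mask.  */
  else
    u.i ^= 0x80000000u;    /* XOR with the sign-bit mask.  */
  return u.f;
}
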
17588 /* Expand a copysign operation. Special case operand 0 being a constant. */
17589
17590 void
17591 ix86_expand_copysign (rtx operands[])
17592 {
17593 enum machine_mode mode, vmode;
17594 rtx dest, op0, op1, mask, nmask;
17595
17596 dest = operands[0];
17597 op0 = operands[1];
17598 op1 = operands[2];
17599
17600 mode = GET_MODE (dest);
17601
17602 if (mode == SFmode)
17603 vmode = V4SFmode;
17604 else if (mode == DFmode)
17605 vmode = V2DFmode;
17606 else
17607 vmode = mode;
17608
17609 if (GET_CODE (op0) == CONST_DOUBLE)
17610 {
17611 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17612
17613 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17614 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17615
17616 if (mode == SFmode || mode == DFmode)
17617 {
17618 if (op0 == CONST0_RTX (mode))
17619 op0 = CONST0_RTX (vmode);
17620 else
17621 {
17622 rtx v = ix86_build_const_vector (vmode, false, op0);
17623
17624 op0 = force_reg (vmode, v);
17625 }
17626 }
17627 else if (op0 != CONST0_RTX (mode))
17628 op0 = force_reg (mode, op0);
17629
17630 mask = ix86_build_signbit_mask (vmode, 0, 0);
17631
17632 if (mode == SFmode)
17633 copysign_insn = gen_copysignsf3_const;
17634 else if (mode == DFmode)
17635 copysign_insn = gen_copysigndf3_const;
17636 else
17637 copysign_insn = gen_copysigntf3_const;
17638
17639 emit_insn (copysign_insn (dest, op0, op1, mask));
17640 }
17641 else
17642 {
17643 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17644
17645 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17646 mask = ix86_build_signbit_mask (vmode, 0, 0);
17647
17648 if (mode == SFmode)
17649 copysign_insn = gen_copysignsf3_var;
17650 else if (mode == DFmode)
17651 copysign_insn = gen_copysigndf3_var;
17652 else
17653 copysign_insn = gen_copysigntf3_var;
17654
17655 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17656 }
17657 }
17658
17659 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17660 be a constant, and so has already been expanded into a vector constant. */
17661
17662 void
17663 ix86_split_copysign_const (rtx operands[])
17664 {
17665 enum machine_mode mode, vmode;
17666 rtx dest, op0, mask, x;
17667
17668 dest = operands[0];
17669 op0 = operands[1];
17670 mask = operands[3];
17671
17672 mode = GET_MODE (dest);
17673 vmode = GET_MODE (mask);
17674
17675 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17676 x = gen_rtx_AND (vmode, dest, mask);
17677 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17678
17679 if (op0 != CONST0_RTX (vmode))
17680 {
17681 x = gen_rtx_IOR (vmode, dest, op0);
17682 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17683 }
17684 }
17685
17686 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17687 so we have to do two masks. */
17688
17689 void
17690 ix86_split_copysign_var (rtx operands[])
17691 {
17692 enum machine_mode mode, vmode;
17693 rtx dest, scratch, op0, op1, mask, nmask, x;
17694
17695 dest = operands[0];
17696 scratch = operands[1];
17697 op0 = operands[2];
17698 op1 = operands[3];
17699 nmask = operands[4];
17700 mask = operands[5];
17701
17702 mode = GET_MODE (dest);
17703 vmode = GET_MODE (mask);
17704
17705 if (rtx_equal_p (op0, op1))
17706 {
17707 /* Shouldn't happen often (it's useless, obviously), but when it does
17708 we'd generate incorrect code if we continue below. */
17709 emit_move_insn (dest, op0);
17710 return;
17711 }
17712
17713 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17714 {
17715 gcc_assert (REGNO (op1) == REGNO (scratch));
17716
17717 x = gen_rtx_AND (vmode, scratch, mask);
17718 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17719
17720 dest = mask;
17721 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17722 x = gen_rtx_NOT (vmode, dest);
17723 x = gen_rtx_AND (vmode, x, op0);
17724 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17725 }
17726 else
17727 {
17728 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17729 {
17730 x = gen_rtx_AND (vmode, scratch, mask);
17731 }
17732 else /* alternative 2,4 */
17733 {
17734 gcc_assert (REGNO (mask) == REGNO (scratch));
17735 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17736 x = gen_rtx_AND (vmode, scratch, op1);
17737 }
17738 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17739
17740 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17741 {
17742 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17743 x = gen_rtx_AND (vmode, dest, nmask);
17744 }
17745 else /* alternative 3,4 */
17746 {
17747 gcc_assert (REGNO (nmask) == REGNO (dest));
17748 dest = nmask;
17749 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17750 x = gen_rtx_AND (vmode, dest, op0);
17751 }
17752 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17753 }
17754
17755 x = gen_rtx_IOR (vmode, dest, scratch);
17756 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17757 }
17758
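/* Editor's illustrative sketch, not part of GCC: the bit-mask form of
   copysign that the two splitters above implement, in scalar form,
   assuming IEEE single precision and a 32-bit unsigned int.  The helper
   name is hypothetical.  */

static float
example_copysignf (float mag, float sgn)
{
  union { float f; unsigned int i; } m, s, r;

  m.f = mag;
  s.f = sgn;
  /* AND the magnitude with the inverted sign mask, AND the sign source
     with the sign mask, then IOR the two, as the AND/AND/IOR splits do.  */
  r.i = (m.i & 0x7fffffffu) | (s.i & 0x80000000u);
  return r.f;
}
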
17759 /* Return TRUE or FALSE depending on whether the first SET in INSN
17760 has source and destination with matching CC modes, and whether the
17761 CC mode is at least as constrained as REQ_MODE. */
17762
17763 bool
17764 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17765 {
17766 rtx set;
17767 enum machine_mode set_mode;
17768
17769 set = PATTERN (insn);
17770 if (GET_CODE (set) == PARALLEL)
17771 set = XVECEXP (set, 0, 0);
17772 gcc_assert (GET_CODE (set) == SET);
17773 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17774
17775 set_mode = GET_MODE (SET_DEST (set));
17776 switch (set_mode)
17777 {
17778 case CCNOmode:
17779 if (req_mode != CCNOmode
17780 && (req_mode != CCmode
17781 || XEXP (SET_SRC (set), 1) != const0_rtx))
17782 return false;
17783 break;
17784 case CCmode:
17785 if (req_mode == CCGCmode)
17786 return false;
17787 /* FALLTHRU */
17788 case CCGCmode:
17789 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17790 return false;
17791 /* FALLTHRU */
17792 case CCGOCmode:
17793 if (req_mode == CCZmode)
17794 return false;
17795 /* FALLTHRU */
17796 case CCZmode:
17797 break;
17798
17799 case CCAmode:
17800 case CCCmode:
17801 case CCOmode:
17802 case CCSmode:
17803 if (set_mode != req_mode)
17804 return false;
17805 break;
17806
17807 default:
17808 gcc_unreachable ();
17809 }
17810
17811 return GET_MODE (SET_SRC (set)) == set_mode;
17812 }
17813
17814 /* Generate insn patterns to do an integer compare of OPERANDS. */
17815
17816 static rtx
17817 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17818 {
17819 enum machine_mode cmpmode;
17820 rtx tmp, flags;
17821
17822 cmpmode = SELECT_CC_MODE (code, op0, op1);
17823 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17824
17825 /* This is very simple, but making the interface the same as in the
17826 FP case makes the rest of the code easier. */
17827 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17828 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17829
17830 /* Return the test that should be put into the flags user, i.e.
17831 the bcc, scc, or cmov instruction. */
17832 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17833 }
17834
17835 /* Figure out whether to use ordered or unordered fp comparisons.
17836 Return the appropriate mode to use. */
17837
17838 enum machine_mode
17839 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17840 {
17841 /* ??? In order to make all comparisons reversible, we do all comparisons
17842 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17843 all forms of trapping and nontrapping comparisons, we can make inequality
17844 comparisons trapping again, since it results in better code when using
17845 FCOM based compares. */
17846 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17847 }
17848
17849 enum machine_mode
17850 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17851 {
17852 enum machine_mode mode = GET_MODE (op0);
17853
17854 if (SCALAR_FLOAT_MODE_P (mode))
17855 {
17856 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17857 return ix86_fp_compare_mode (code);
17858 }
17859
17860 switch (code)
17861 {
17862 /* Only zero flag is needed. */
17863 case EQ: /* ZF=0 */
17864 case NE: /* ZF!=0 */
17865 return CCZmode;
17866 /* Codes needing carry flag. */
17867 case GEU: /* CF=0 */
17868 case LTU: /* CF=1 */
17869 /* Detect overflow checks. They need just the carry flag. */
17870 if (GET_CODE (op0) == PLUS
17871 && rtx_equal_p (op1, XEXP (op0, 0)))
17872 return CCCmode;
17873 else
17874 return CCmode;
17875 case GTU: /* CF=0 & ZF=0 */
17876 case LEU: /* CF=1 | ZF=1 */
17877 /* Detect overflow checks. They need just the carry flag. */
17878 if (GET_CODE (op0) == MINUS
17879 && rtx_equal_p (op1, XEXP (op0, 0)))
17880 return CCCmode;
17881 else
17882 return CCmode;
17883 /* Codes possibly doable only with sign flag when
17884 comparing against zero. */
17885 case GE: /* SF=OF or SF=0 */
17886 case LT: /* SF<>OF or SF=1 */
17887 if (op1 == const0_rtx)
17888 return CCGOCmode;
17889 else
17890 /* For other cases Carry flag is not required. */
17891 return CCGCmode;
17892 /* Codes doable only with sign flag when comparing
17893 against zero, but we miss jump instruction for it
17894 so we need to use relational tests against overflow
17895 that thus needs to be zero. */
17896 case GT: /* ZF=0 & SF=OF */
17897 case LE: /* ZF=1 | SF<>OF */
17898 if (op1 == const0_rtx)
17899 return CCNOmode;
17900 else
17901 return CCGCmode;
17902 /* The strcmp pattern does (use flags), and combine may ask us for the proper
17903 mode. */
17904 case USE:
17905 return CCmode;
17906 default:
17907 gcc_unreachable ();
17908 }
17909 }
17910
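/* Editor's illustrative sketch, not part of GCC: the kind of source-level
   idiom that reaches the CCCmode cases above -- an unsigned overflow
   check where only the carry flag matters.  The helper name is
   hypothetical.  */

static int
example_add_overflows (unsigned int a, unsigned int b)
{
  /* (a + b) LTU a: op0 is a PLUS whose first arm equals op1, so
     ix86_cc_mode selects CCCmode and the test becomes a bare carry check.  */
  return a + b < a;
}
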
17911 /* Return the fixed registers used for condition codes. */
17912
17913 static bool
17914 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17915 {
17916 *p1 = FLAGS_REG;
17917 *p2 = FPSR_REG;
17918 return true;
17919 }
17920
17921 /* If two condition code modes are compatible, return a condition code
17922 mode which is compatible with both. Otherwise, return
17923 VOIDmode. */
17924
17925 static enum machine_mode
17926 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
17927 {
17928 if (m1 == m2)
17929 return m1;
17930
17931 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
17932 return VOIDmode;
17933
17934 if ((m1 == CCGCmode && m2 == CCGOCmode)
17935 || (m1 == CCGOCmode && m2 == CCGCmode))
17936 return CCGCmode;
17937
17938 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
17939 return m2;
17940 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
17941 return m1;
17942
17943 switch (m1)
17944 {
17945 default:
17946 gcc_unreachable ();
17947
17948 case CCmode:
17949 case CCGCmode:
17950 case CCGOCmode:
17951 case CCNOmode:
17952 case CCAmode:
17953 case CCCmode:
17954 case CCOmode:
17955 case CCSmode:
17956 case CCZmode:
17957 switch (m2)
17958 {
17959 default:
17960 return VOIDmode;
17961
17962 case CCmode:
17963 case CCGCmode:
17964 case CCGOCmode:
17965 case CCNOmode:
17966 case CCAmode:
17967 case CCCmode:
17968 case CCOmode:
17969 case CCSmode:
17970 case CCZmode:
17971 return CCmode;
17972 }
17973
17974 case CCFPmode:
17975 case CCFPUmode:
17976 /* These are only compatible with themselves, which we already
17977 checked above. */
17978 return VOIDmode;
17979 }
17980 }
17981
17982
17983 /* Return a comparison we can do that is equivalent to
17984 swap_condition (code), apart possibly from orderedness.
17985 But never change orderedness if TARGET_IEEE_FP, returning
17986 UNKNOWN in that case if necessary. */
17987
17988 static enum rtx_code
17989 ix86_fp_swap_condition (enum rtx_code code)
17990 {
17991 switch (code)
17992 {
17993 case GT: /* GTU - CF=0 & ZF=0 */
17994 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
17995 case GE: /* GEU - CF=0 */
17996 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
17997 case UNLT: /* LTU - CF=1 */
17998 return TARGET_IEEE_FP ? UNKNOWN : GT;
17999 case UNLE: /* LEU - CF=1 | ZF=1 */
18000 return TARGET_IEEE_FP ? UNKNOWN : GE;
18001 default:
18002 return swap_condition (code);
18003 }
18004 }
18005
18006 /* Return the cost of comparison CODE using the best strategy for performance.
18007 All the following functions use the number of instructions as a cost metric.
18008 In the future this should be tweaked to compute bytes for optimize_size and
18009 take into account the performance of various instructions on various CPUs. */
18010
18011 static int
18012 ix86_fp_comparison_cost (enum rtx_code code)
18013 {
18014 int arith_cost;
18015
18016 /* The cost of code using bit-twiddling on %ah. */
18017 switch (code)
18018 {
18019 case UNLE:
18020 case UNLT:
18021 case LTGT:
18022 case GT:
18023 case GE:
18024 case UNORDERED:
18025 case ORDERED:
18026 case UNEQ:
18027 arith_cost = 4;
18028 break;
18029 case LT:
18030 case NE:
18031 case EQ:
18032 case UNGE:
18033 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18034 break;
18035 case LE:
18036 case UNGT:
18037 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18038 break;
18039 default:
18040 gcc_unreachable ();
18041 }
18042
18043 switch (ix86_fp_comparison_strategy (code))
18044 {
18045 case IX86_FPCMP_COMI:
18046 return arith_cost > 4 ? 3 : 2;
18047 case IX86_FPCMP_SAHF:
18048 return arith_cost > 4 ? 4 : 3;
18049 default:
18050 return arith_cost;
18051 }
18052 }
18053
18054 /* Return the strategy to use for floating-point comparisons. We assume that fcomi
18055 is always preferable where available, since that is also true when looking at size
18056 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18057
18058 enum ix86_fpcmp_strategy
18059 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18060 {
18061 /* Do fcomi/sahf based test when profitable. */
18062
18063 if (TARGET_CMOVE)
18064 return IX86_FPCMP_COMI;
18065
18066 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18067 return IX86_FPCMP_SAHF;
18068
18069 return IX86_FPCMP_ARITH;
18070 }
18071
18072 /* Swap, force into registers, or otherwise massage the two operands
18073 to a fp comparison. The operands are updated in place; the new
18074 comparison code is returned. */
18075
18076 static enum rtx_code
18077 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18078 {
18079 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18080 rtx op0 = *pop0, op1 = *pop1;
18081 enum machine_mode op_mode = GET_MODE (op0);
18082 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18083
18084 /* All of the unordered compare instructions only work on registers.
18085 The same is true of the fcomi compare instructions. The XFmode
18086 compare instructions require registers except when comparing
18087 against zero or when converting operand 1 from fixed point to
18088 floating point. */
18089
18090 if (!is_sse
18091 && (fpcmp_mode == CCFPUmode
18092 || (op_mode == XFmode
18093 && ! (standard_80387_constant_p (op0) == 1
18094 || standard_80387_constant_p (op1) == 1)
18095 && GET_CODE (op1) != FLOAT)
18096 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18097 {
18098 op0 = force_reg (op_mode, op0);
18099 op1 = force_reg (op_mode, op1);
18100 }
18101 else
18102 {
18103 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18104 things around if they appear profitable, otherwise force op0
18105 into a register. */
18106
18107 if (standard_80387_constant_p (op0) == 0
18108 || (MEM_P (op0)
18109 && ! (standard_80387_constant_p (op1) == 0
18110 || MEM_P (op1))))
18111 {
18112 enum rtx_code new_code = ix86_fp_swap_condition (code);
18113 if (new_code != UNKNOWN)
18114 {
18115 rtx tmp;
18116 tmp = op0, op0 = op1, op1 = tmp;
18117 code = new_code;
18118 }
18119 }
18120
18121 if (!REG_P (op0))
18122 op0 = force_reg (op_mode, op0);
18123
18124 if (CONSTANT_P (op1))
18125 {
18126 int tmp = standard_80387_constant_p (op1);
18127 if (tmp == 0)
18128 op1 = validize_mem (force_const_mem (op_mode, op1));
18129 else if (tmp == 1)
18130 {
18131 if (TARGET_CMOVE)
18132 op1 = force_reg (op_mode, op1);
18133 }
18134 else
18135 op1 = force_reg (op_mode, op1);
18136 }
18137 }
18138
18139 /* Try to rearrange the comparison to make it cheaper. */
18140 if (ix86_fp_comparison_cost (code)
18141 > ix86_fp_comparison_cost (swap_condition (code))
18142 && (REG_P (op1) || can_create_pseudo_p ()))
18143 {
18144 rtx tmp;
18145 tmp = op0, op0 = op1, op1 = tmp;
18146 code = swap_condition (code);
18147 if (!REG_P (op0))
18148 op0 = force_reg (op_mode, op0);
18149 }
18150
18151 *pop0 = op0;
18152 *pop1 = op1;
18153 return code;
18154 }
18155
18156 /* Convert the comparison codes we use to represent an FP comparison to the
18157 integer code that will result in a proper branch. Return UNKNOWN if no such
18158 code is available. */
18159
18160 enum rtx_code
18161 ix86_fp_compare_code_to_integer (enum rtx_code code)
18162 {
18163 switch (code)
18164 {
18165 case GT:
18166 return GTU;
18167 case GE:
18168 return GEU;
18169 case ORDERED:
18170 case UNORDERED:
18171 return code;
18172 break;
18173 case UNEQ:
18174 return EQ;
18175 break;
18176 case UNLT:
18177 return LTU;
18178 break;
18179 case UNLE:
18180 return LEU;
18181 break;
18182 case LTGT:
18183 return NE;
18184 break;
18185 default:
18186 return UNKNOWN;
18187 }
18188 }
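/* Editor's illustrative sketch (not part of the original i386.c): why the
   mapping above is the *unsigned* one.  fcomi/comiss set ZF, CF and PF so
   that "equal" reads like ZF, "below" reads like CF and "unordered" reads
   like PF -- the same flags an unsigned integer compare would produce.
   The helper name fcomi_flags is hypothetical.  */

#include <stdbool.h>

struct fp_flags { bool zf, cf, pf; };

static struct fp_flags
fcomi_flags (double a, double b)
{
  struct fp_flags f;
  bool unordered = (a != a) || (b != b);   /* either operand is a NaN */

  f.pf = unordered;
  f.zf = unordered || a == b;              /* "equal" behaves like ZF  */
  f.cf = unordered || a < b;               /* "below" behaves like CF  */
  return f;
}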
18189
18190 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18191
18192 static rtx
18193 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18194 {
18195 enum machine_mode fpcmp_mode, intcmp_mode;
18196 rtx tmp, tmp2;
18197
18198 fpcmp_mode = ix86_fp_compare_mode (code);
18199 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18200
18201 /* Do fcomi/sahf based test when profitable. */
18202 switch (ix86_fp_comparison_strategy (code))
18203 {
18204 case IX86_FPCMP_COMI:
18205 intcmp_mode = fpcmp_mode;
18206 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18207 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18208 tmp);
18209 emit_insn (tmp);
18210 break;
18211
18212 case IX86_FPCMP_SAHF:
18213 intcmp_mode = fpcmp_mode;
18214 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18215 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18216 tmp);
18217
18218 if (!scratch)
18219 scratch = gen_reg_rtx (HImode);
18220 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18221 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18222 break;
18223
18224 case IX86_FPCMP_ARITH:
18225 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18226 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18227 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18228 if (!scratch)
18229 scratch = gen_reg_rtx (HImode);
18230 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18231
18232 /* In the unordered case, we have to check C2 for NaNs, which
18233 doesn't happen to work out to anything nice combination-wise.
18234 So do some bit twiddling on the value we've got in AH to come
18235 up with an appropriate set of condition codes. */
18236
18237 intcmp_mode = CCNOmode;
18238 switch (code)
18239 {
18240 case GT:
18241 case UNGT:
18242 if (code == GT || !TARGET_IEEE_FP)
18243 {
18244 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18245 code = EQ;
18246 }
18247 else
18248 {
18249 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18250 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18251 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18252 intcmp_mode = CCmode;
18253 code = GEU;
18254 }
18255 break;
18256 case LT:
18257 case UNLT:
18258 if (code == LT && TARGET_IEEE_FP)
18259 {
18260 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18261 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18262 intcmp_mode = CCmode;
18263 code = EQ;
18264 }
18265 else
18266 {
18267 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18268 code = NE;
18269 }
18270 break;
18271 case GE:
18272 case UNGE:
18273 if (code == GE || !TARGET_IEEE_FP)
18274 {
18275 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18276 code = EQ;
18277 }
18278 else
18279 {
18280 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18281 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18282 code = NE;
18283 }
18284 break;
18285 case LE:
18286 case UNLE:
18287 if (code == LE && TARGET_IEEE_FP)
18288 {
18289 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18290 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18291 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18292 intcmp_mode = CCmode;
18293 code = LTU;
18294 }
18295 else
18296 {
18297 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18298 code = NE;
18299 }
18300 break;
18301 case EQ:
18302 case UNEQ:
18303 if (code == EQ && TARGET_IEEE_FP)
18304 {
18305 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18306 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18307 intcmp_mode = CCmode;
18308 code = EQ;
18309 }
18310 else
18311 {
18312 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18313 code = NE;
18314 }
18315 break;
18316 case NE:
18317 case LTGT:
18318 if (code == NE && TARGET_IEEE_FP)
18319 {
18320 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18321 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18322 GEN_INT (0x40)));
18323 code = NE;
18324 }
18325 else
18326 {
18327 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18328 code = EQ;
18329 }
18330 break;
18331
18332 case UNORDERED:
18333 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18334 code = NE;
18335 break;
18336 case ORDERED:
18337 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18338 code = EQ;
18339 break;
18340
18341 default:
18342 gcc_unreachable ();
18343 }
18344 break;
18345
18346 default:
18347 gcc_unreachable ();
18348 }
18349
18350 /* Return the test that should be put into the flags user, i.e.
18351 the bcc, scc, or cmov instruction. */
18352 return gen_rtx_fmt_ee (code, VOIDmode,
18353 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18354 const0_rtx);
18355 }
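/* Editor's illustrative sketch (not part of the original i386.c): how the
   0x45/0x40/0x04/0x01 masks used above decode an x87 comparison.  After
   "fcom; fnstsw %ax" the condition bits land in %ah as C0 = 0x01,
   C2 = 0x04 and C3 = 0x40.  The helper names are hypothetical.  */

#include <stdbool.h>

static bool x87_unordered (unsigned char ah) { return (ah & 0x04) != 0; }
static bool x87_equal     (unsigned char ah) { return (ah & 0x45) == 0x40; }
static bool x87_less      (unsigned char ah) { return (ah & 0x45) == 0x01; }
static bool x87_greater   (unsigned char ah) { return (ah & 0x45) == 0x00; }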
18356
18357 static rtx
18358 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18359 {
18360 rtx ret;
18361
18362 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18363 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18364
18365 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18366 {
18367 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18368 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18369 }
18370 else
18371 ret = ix86_expand_int_compare (code, op0, op1);
18372
18373 return ret;
18374 }
18375
18376 void
18377 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18378 {
18379 enum machine_mode mode = GET_MODE (op0);
18380 rtx tmp;
18381
18382 switch (mode)
18383 {
18384 case SFmode:
18385 case DFmode:
18386 case XFmode:
18387 case QImode:
18388 case HImode:
18389 case SImode:
18390 simple:
18391 tmp = ix86_expand_compare (code, op0, op1);
18392 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18393 gen_rtx_LABEL_REF (VOIDmode, label),
18394 pc_rtx);
18395 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18396 return;
18397
18398 case DImode:
18399 if (TARGET_64BIT)
18400 goto simple;
18401 case TImode:
18402 /* Expand a double-word (DImode, or TImode on 64-bit) branch into multiple compare+branch. */
18403 {
18404 rtx lo[2], hi[2], label2;
18405 enum rtx_code code1, code2, code3;
18406 enum machine_mode submode;
18407
18408 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18409 {
18410 tmp = op0, op0 = op1, op1 = tmp;
18411 code = swap_condition (code);
18412 }
18413
18414 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18415 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18416
18417 submode = mode == DImode ? SImode : DImode;
18418
18419 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18420 avoid two branches. This costs one extra insn, so disable when
18421 optimizing for size. */
18422
18423 if ((code == EQ || code == NE)
18424 && (!optimize_insn_for_size_p ()
18425 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18426 {
18427 rtx xor0, xor1;
18428
18429 xor1 = hi[0];
18430 if (hi[1] != const0_rtx)
18431 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18432 NULL_RTX, 0, OPTAB_WIDEN);
18433
18434 xor0 = lo[0];
18435 if (lo[1] != const0_rtx)
18436 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18437 NULL_RTX, 0, OPTAB_WIDEN);
18438
18439 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18440 NULL_RTX, 0, OPTAB_WIDEN);
18441
18442 ix86_expand_branch (code, tmp, const0_rtx, label);
18443 return;
18444 }
18445
18446 /* Otherwise, if we are doing a less-than or greater-or-equal comparison,
18447 op1 is a constant and its low word is zero, then we can just
18448 examine the high word. Similarly for a low word of -1 and
18449 less-or-equal or greater-than comparisons. */
18450
18451 if (CONST_INT_P (hi[1]))
18452 switch (code)
18453 {
18454 case LT: case LTU: case GE: case GEU:
18455 if (lo[1] == const0_rtx)
18456 {
18457 ix86_expand_branch (code, hi[0], hi[1], label);
18458 return;
18459 }
18460 break;
18461 case LE: case LEU: case GT: case GTU:
18462 if (lo[1] == constm1_rtx)
18463 {
18464 ix86_expand_branch (code, hi[0], hi[1], label);
18465 return;
18466 }
18467 break;
18468 default:
18469 break;
18470 }
18471
18472 /* Otherwise, we need two or three jumps. */
18473
18474 label2 = gen_label_rtx ();
18475
18476 code1 = code;
18477 code2 = swap_condition (code);
18478 code3 = unsigned_condition (code);
18479
18480 switch (code)
18481 {
18482 case LT: case GT: case LTU: case GTU:
18483 break;
18484
18485 case LE: code1 = LT; code2 = GT; break;
18486 case GE: code1 = GT; code2 = LT; break;
18487 case LEU: code1 = LTU; code2 = GTU; break;
18488 case GEU: code1 = GTU; code2 = LTU; break;
18489
18490 case EQ: code1 = UNKNOWN; code2 = NE; break;
18491 case NE: code2 = UNKNOWN; break;
18492
18493 default:
18494 gcc_unreachable ();
18495 }
18496
18497 /*
18498 * a < b =>
18499 * if (hi(a) < hi(b)) goto true;
18500 * if (hi(a) > hi(b)) goto false;
18501 * if (lo(a) < lo(b)) goto true;
18502 * false:
18503 */
18504
18505 if (code1 != UNKNOWN)
18506 ix86_expand_branch (code1, hi[0], hi[1], label);
18507 if (code2 != UNKNOWN)
18508 ix86_expand_branch (code2, hi[0], hi[1], label2);
18509
18510 ix86_expand_branch (code3, lo[0], lo[1], label);
18511
18512 if (code2 != UNKNOWN)
18513 emit_label (label2);
18514 return;
18515 }
18516
18517 default:
18518 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18519 goto simple;
18520 }
18521 }
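/* Editor's illustrative sketch (not part of the original i386.c): the two
   double-word expansions used in ix86_expand_branch above, written as plain
   C on 32-bit halves.  The helper names are hypothetical.  */

#include <stdint.h>

static int
dw_eq (uint32_t lo0, uint32_t hi0, uint32_t lo1, uint32_t hi1)
{
  /* Equality: (hi0^hi1)|(lo0^lo1) is zero iff both halves match,
     replacing two compare+branch pairs with a single test.  */
  return ((hi0 ^ hi1) | (lo0 ^ lo1)) == 0;
}

static int
dw_ltu (uint32_t lo0, uint32_t hi0, uint32_t lo1, uint32_t hi1)
{
  /* a < b (unsigned): decide on the high words first and fall back to
     an unsigned compare of the low words only when they are equal.  */
  if (hi0 < hi1)
    return 1;
  if (hi0 > hi1)
    return 0;
  return lo0 < lo1;
}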
18522
18523 /* Split branch based on floating point condition. */
18524 void
18525 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18526 rtx target1, rtx target2, rtx tmp, rtx pushed)
18527 {
18528 rtx condition;
18529 rtx i;
18530
18531 if (target2 != pc_rtx)
18532 {
18533 rtx tmp = target2;
18534 code = reverse_condition_maybe_unordered (code);
18535 target2 = target1;
18536 target1 = tmp;
18537 }
18538
18539 condition = ix86_expand_fp_compare (code, op1, op2,
18540 tmp);
18541
18542 /* Remove pushed operand from stack. */
18543 if (pushed)
18544 ix86_free_from_memory (GET_MODE (pushed));
18545
18546 i = emit_jump_insn (gen_rtx_SET
18547 (VOIDmode, pc_rtx,
18548 gen_rtx_IF_THEN_ELSE (VOIDmode,
18549 condition, target1, target2)));
18550 if (split_branch_probability >= 0)
18551 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18552 }
18553
18554 void
18555 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18556 {
18557 rtx ret;
18558
18559 gcc_assert (GET_MODE (dest) == QImode);
18560
18561 ret = ix86_expand_compare (code, op0, op1);
18562 PUT_MODE (ret, QImode);
18563 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18564 }
18565
18566 /* Expand a comparison that sets or clears the carry flag. Return true when
18567 successful and store the comparison in *POP. */
18568 static bool
18569 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18570 {
18571 enum machine_mode mode =
18572 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18573
18574 /* Do not handle double-mode compares, which go through a special path. */
18575 if (mode == (TARGET_64BIT ? TImode : DImode))
18576 return false;
18577
18578 if (SCALAR_FLOAT_MODE_P (mode))
18579 {
18580 rtx compare_op, compare_seq;
18581
18582 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18583
18584 /* Shortcut: the following common codes never translate
18585 into carry flag compares. */
18586 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18587 || code == ORDERED || code == UNORDERED)
18588 return false;
18589
18590 /* These comparisons require the zero flag; swap the operands so they no longer do. */
18591 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18592 && !TARGET_IEEE_FP)
18593 {
18594 rtx tmp = op0;
18595 op0 = op1;
18596 op1 = tmp;
18597 code = swap_condition (code);
18598 }
18599
18600 /* Try to expand the comparison and verify that we end up with
18601 a carry flag based comparison. This fails only when we decide
18602 to expand the comparison using arithmetic, which is not a
18603 common scenario. */
18604 start_sequence ();
18605 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18606 compare_seq = get_insns ();
18607 end_sequence ();
18608
18609 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18610 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18611 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18612 else
18613 code = GET_CODE (compare_op);
18614
18615 if (code != LTU && code != GEU)
18616 return false;
18617
18618 emit_insn (compare_seq);
18619 *pop = compare_op;
18620 return true;
18621 }
18622
18623 if (!INTEGRAL_MODE_P (mode))
18624 return false;
18625
18626 switch (code)
18627 {
18628 case LTU:
18629 case GEU:
18630 break;
18631
18632 /* Convert a==0 into (unsigned)a<1. */
18633 case EQ:
18634 case NE:
18635 if (op1 != const0_rtx)
18636 return false;
18637 op1 = const1_rtx;
18638 code = (code == EQ ? LTU : GEU);
18639 break;
18640
18641 /* Convert a>b into b<a or a>=b+1. */
18642 case GTU:
18643 case LEU:
18644 if (CONST_INT_P (op1))
18645 {
18646 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18647 /* Bail out on overflow. We could still swap the operands, but that
18648 would force the constant to be loaded into a register. */
18649 if (op1 == const0_rtx
18650 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18651 return false;
18652 code = (code == GTU ? GEU : LTU);
18653 }
18654 else
18655 {
18656 rtx tmp = op1;
18657 op1 = op0;
18658 op0 = tmp;
18659 code = (code == GTU ? LTU : GEU);
18660 }
18661 break;
18662
18663 /* Convert a>=0 into (unsigned)a<0x80000000. */
18664 case LT:
18665 case GE:
18666 if (mode == DImode || op1 != const0_rtx)
18667 return false;
18668 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18669 code = (code == LT ? GEU : LTU);
18670 break;
18671 case LE:
18672 case GT:
18673 if (mode == DImode || op1 != constm1_rtx)
18674 return false;
18675 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18676 code = (code == LE ? GEU : LTU);
18677 break;
18678
18679 default:
18680 return false;
18681 }
18682 /* Swapping operands may cause a constant to appear as the first operand. */
18683 if (!nonimmediate_operand (op0, VOIDmode))
18684 {
18685 if (!can_create_pseudo_p ())
18686 return false;
18687 op0 = force_reg (mode, op0);
18688 }
18689 *pop = ix86_expand_compare (code, op0, op1);
18690 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18691 return true;
18692 }
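/* Editor's illustrative sketch (not part of the original i386.c): the
   integer rewrites performed above, shown on 32-bit values.  Each
   right-hand side is a compare whose result is carried entirely in the
   carry flag (LTU/GEU).  The helper names are hypothetical.  */

#include <stdint.h>

static int eq_zero (uint32_t a)             { return a < 1; }             /* a == 0                         */
static int gtu_cst (uint32_t a, uint32_t b) { return a >= b + 1; }        /* a > b, constant b, b+1 no wrap */
static int ge_zero (int32_t a)              { return (uint32_t) a < 0x80000000u; }  /* a >= 0            */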
18693
18694 bool
18695 ix86_expand_int_movcc (rtx operands[])
18696 {
18697 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18698 rtx compare_seq, compare_op;
18699 enum machine_mode mode = GET_MODE (operands[0]);
18700 bool sign_bit_compare_p = false;
18701 rtx op0 = XEXP (operands[1], 0);
18702 rtx op1 = XEXP (operands[1], 1);
18703
18704 start_sequence ();
18705 compare_op = ix86_expand_compare (code, op0, op1);
18706 compare_seq = get_insns ();
18707 end_sequence ();
18708
18709 compare_code = GET_CODE (compare_op);
18710
18711 if ((op1 == const0_rtx && (code == GE || code == LT))
18712 || (op1 == constm1_rtx && (code == GT || code == LE)))
18713 sign_bit_compare_p = true;
18714
18715 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18716 HImode insns, we'd be swallowed in word prefix ops. */
18717
18718 if ((mode != HImode || TARGET_FAST_PREFIX)
18719 && (mode != (TARGET_64BIT ? TImode : DImode))
18720 && CONST_INT_P (operands[2])
18721 && CONST_INT_P (operands[3]))
18722 {
18723 rtx out = operands[0];
18724 HOST_WIDE_INT ct = INTVAL (operands[2]);
18725 HOST_WIDE_INT cf = INTVAL (operands[3]);
18726 HOST_WIDE_INT diff;
18727
18728 diff = ct - cf;
18729 /* Sign bit compares are better done using shifts than by using
18730 sbb. */
18731 if (sign_bit_compare_p
18732 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18733 {
18734 /* Detect overlap between destination and compare sources. */
18735 rtx tmp = out;
18736
18737 if (!sign_bit_compare_p)
18738 {
18739 rtx flags;
18740 bool fpcmp = false;
18741
18742 compare_code = GET_CODE (compare_op);
18743
18744 flags = XEXP (compare_op, 0);
18745
18746 if (GET_MODE (flags) == CCFPmode
18747 || GET_MODE (flags) == CCFPUmode)
18748 {
18749 fpcmp = true;
18750 compare_code
18751 = ix86_fp_compare_code_to_integer (compare_code);
18752 }
18753
18754 /* To simplify the rest of the code, restrict to the GEU case. */
18755 if (compare_code == LTU)
18756 {
18757 HOST_WIDE_INT tmp = ct;
18758 ct = cf;
18759 cf = tmp;
18760 compare_code = reverse_condition (compare_code);
18761 code = reverse_condition (code);
18762 }
18763 else
18764 {
18765 if (fpcmp)
18766 PUT_CODE (compare_op,
18767 reverse_condition_maybe_unordered
18768 (GET_CODE (compare_op)));
18769 else
18770 PUT_CODE (compare_op,
18771 reverse_condition (GET_CODE (compare_op)));
18772 }
18773 diff = ct - cf;
18774
18775 if (reg_overlap_mentioned_p (out, op0)
18776 || reg_overlap_mentioned_p (out, op1))
18777 tmp = gen_reg_rtx (mode);
18778
18779 if (mode == DImode)
18780 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18781 else
18782 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18783 flags, compare_op));
18784 }
18785 else
18786 {
18787 if (code == GT || code == GE)
18788 code = reverse_condition (code);
18789 else
18790 {
18791 HOST_WIDE_INT tmp = ct;
18792 ct = cf;
18793 cf = tmp;
18794 diff = ct - cf;
18795 }
18796 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18797 }
18798
18799 if (diff == 1)
18800 {
18801 /*
18802 * cmpl op0,op1
18803 * sbbl dest,dest
18804 * [addl dest, ct]
18805 *
18806 * Size 5 - 8.
18807 */
18808 if (ct)
18809 tmp = expand_simple_binop (mode, PLUS,
18810 tmp, GEN_INT (ct),
18811 copy_rtx (tmp), 1, OPTAB_DIRECT);
18812 }
18813 else if (cf == -1)
18814 {
18815 /*
18816 * cmpl op0,op1
18817 * sbbl dest,dest
18818 * orl $ct, dest
18819 *
18820 * Size 8.
18821 */
18822 tmp = expand_simple_binop (mode, IOR,
18823 tmp, GEN_INT (ct),
18824 copy_rtx (tmp), 1, OPTAB_DIRECT);
18825 }
18826 else if (diff == -1 && ct)
18827 {
18828 /*
18829 * cmpl op0,op1
18830 * sbbl dest,dest
18831 * notl dest
18832 * [addl dest, cf]
18833 *
18834 * Size 8 - 11.
18835 */
18836 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18837 if (cf)
18838 tmp = expand_simple_binop (mode, PLUS,
18839 copy_rtx (tmp), GEN_INT (cf),
18840 copy_rtx (tmp), 1, OPTAB_DIRECT);
18841 }
18842 else
18843 {
18844 /*
18845 * cmpl op0,op1
18846 * sbbl dest,dest
18847 * [notl dest]
18848 * andl cf - ct, dest
18849 * [addl dest, ct]
18850 *
18851 * Size 8 - 11.
18852 */
18853
18854 if (cf == 0)
18855 {
18856 cf = ct;
18857 ct = 0;
18858 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18859 }
18860
18861 tmp = expand_simple_binop (mode, AND,
18862 copy_rtx (tmp),
18863 gen_int_mode (cf - ct, mode),
18864 copy_rtx (tmp), 1, OPTAB_DIRECT);
18865 if (ct)
18866 tmp = expand_simple_binop (mode, PLUS,
18867 copy_rtx (tmp), GEN_INT (ct),
18868 copy_rtx (tmp), 1, OPTAB_DIRECT);
18869 }
18870
18871 if (!rtx_equal_p (tmp, out))
18872 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18873
18874 return true;
18875 }
18876
18877 if (diff < 0)
18878 {
18879 enum machine_mode cmp_mode = GET_MODE (op0);
18880
18881 HOST_WIDE_INT tmp;
18882 tmp = ct, ct = cf, cf = tmp;
18883 diff = -diff;
18884
18885 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18886 {
18887 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18888
18889 /* We may be reversing an unordered compare to a normal compare, which
18890 is not valid in general (we may convert a non-trapping condition
18891 to a trapping one); however, on i386 we currently emit all
18892 comparisons unordered. */
18893 compare_code = reverse_condition_maybe_unordered (compare_code);
18894 code = reverse_condition_maybe_unordered (code);
18895 }
18896 else
18897 {
18898 compare_code = reverse_condition (compare_code);
18899 code = reverse_condition (code);
18900 }
18901 }
18902
18903 compare_code = UNKNOWN;
18904 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18905 && CONST_INT_P (op1))
18906 {
18907 if (op1 == const0_rtx
18908 && (code == LT || code == GE))
18909 compare_code = code;
18910 else if (op1 == constm1_rtx)
18911 {
18912 if (code == LE)
18913 compare_code = LT;
18914 else if (code == GT)
18915 compare_code = GE;
18916 }
18917 }
18918
18919 /* Optimize dest = (op0 < 0) ? -1 : cf. */
18920 if (compare_code != UNKNOWN
18921 && GET_MODE (op0) == GET_MODE (out)
18922 && (cf == -1 || ct == -1))
18923 {
18924 /* If the lea code below could be used, only optimize
18925 if it results in a 2-insn sequence. */
18926
18927 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
18928 || diff == 3 || diff == 5 || diff == 9)
18929 || (compare_code == LT && ct == -1)
18930 || (compare_code == GE && cf == -1))
18931 {
18932 /*
18933 * notl op1 (if necessary)
18934 * sarl $31, op1
18935 * orl cf, op1
18936 */
18937 if (ct != -1)
18938 {
18939 cf = ct;
18940 ct = -1;
18941 code = reverse_condition (code);
18942 }
18943
18944 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18945
18946 out = expand_simple_binop (mode, IOR,
18947 out, GEN_INT (cf),
18948 out, 1, OPTAB_DIRECT);
18949 if (out != operands[0])
18950 emit_move_insn (operands[0], out);
18951
18952 return true;
18953 }
18954 }
18955
18956
18957 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
18958 || diff == 3 || diff == 5 || diff == 9)
18959 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
18960 && (mode != DImode
18961 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
18962 {
18963 /*
18964 * xorl dest,dest
18965 * cmpl op1,op2
18966 * setcc dest
18967 * lea cf(dest*(ct-cf)),dest
18968 *
18969 * Size 14.
18970 *
18971 * This also catches the degenerate setcc-only case.
18972 */
18973
18974 rtx tmp;
18975 int nops;
18976
18977 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18978
18979 nops = 0;
18980 /* On x86_64 the lea instruction operates on Pmode, so we need
18981 to get the arithmetic done in the proper mode to match. */
18982 if (diff == 1)
18983 tmp = copy_rtx (out);
18984 else
18985 {
18986 rtx out1;
18987 out1 = copy_rtx (out);
18988 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
18989 nops++;
18990 if (diff & 1)
18991 {
18992 tmp = gen_rtx_PLUS (mode, tmp, out1);
18993 nops++;
18994 }
18995 }
18996 if (cf != 0)
18997 {
18998 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
18999 nops++;
19000 }
19001 if (!rtx_equal_p (tmp, out))
19002 {
19003 if (nops == 1)
19004 out = force_operand (tmp, copy_rtx (out));
19005 else
19006 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19007 }
19008 if (!rtx_equal_p (out, operands[0]))
19009 emit_move_insn (operands[0], copy_rtx (out));
19010
19011 return true;
19012 }
19013
19014 /*
19015 * General case: Jumpful:
19016 * xorl dest,dest cmpl op1, op2
19017 * cmpl op1, op2 movl ct, dest
19018 * setcc dest jcc 1f
19019 * decl dest movl cf, dest
19020 * andl (cf-ct),dest 1:
19021 * addl ct,dest
19022 *
19023 * Size 20. Size 14.
19024 *
19025 * This is reasonably steep, but branch mispredict costs are
19026 * high on modern CPUs, so consider failing only if optimizing
19027 * for space.
19028 */
19029
19030 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19031 && BRANCH_COST (optimize_insn_for_speed_p (),
19032 false) >= 2)
19033 {
19034 if (cf == 0)
19035 {
19036 enum machine_mode cmp_mode = GET_MODE (op0);
19037
19038 cf = ct;
19039 ct = 0;
19040
19041 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19042 {
19043 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19044
19045 /* We may be reversing an unordered compare to a normal compare,
19046 which is not valid in general (we may convert a non-trapping
19047 condition to a trapping one); however, on i386 we currently
19048 emit all comparisons unordered. */
19049 code = reverse_condition_maybe_unordered (code);
19050 }
19051 else
19052 {
19053 code = reverse_condition (code);
19054 if (compare_code != UNKNOWN)
19055 compare_code = reverse_condition (compare_code);
19056 }
19057 }
19058
19059 if (compare_code != UNKNOWN)
19060 {
19061 /* notl op1 (if needed)
19062 sarl $31, op1
19063 andl (cf-ct), op1
19064 addl ct, op1
19065
19066 For x < 0 (resp. x <= -1) there will be no notl,
19067 so if possible swap the constants to get rid of the
19068 complement.
19069 True/false will be -1/0 while code below (store flag
19070 followed by decrement) is 0/-1, so the constants need
19071 to be exchanged once more. */
19072
19073 if (compare_code == GE || !cf)
19074 {
19075 code = reverse_condition (code);
19076 compare_code = LT;
19077 }
19078 else
19079 {
19080 HOST_WIDE_INT tmp = cf;
19081 cf = ct;
19082 ct = tmp;
19083 }
19084
19085 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19086 }
19087 else
19088 {
19089 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19090
19091 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19092 constm1_rtx,
19093 copy_rtx (out), 1, OPTAB_DIRECT);
19094 }
19095
19096 out = expand_simple_binop (mode, AND, copy_rtx (out),
19097 gen_int_mode (cf - ct, mode),
19098 copy_rtx (out), 1, OPTAB_DIRECT);
19099 if (ct)
19100 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19101 copy_rtx (out), 1, OPTAB_DIRECT);
19102 if (!rtx_equal_p (out, operands[0]))
19103 emit_move_insn (operands[0], copy_rtx (out));
19104
19105 return true;
19106 }
19107 }
19108
19109 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19110 {
19111 /* Try a few things more with specific constants and a variable. */
19112
19113 optab op;
19114 rtx var, orig_out, out, tmp;
19115
19116 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19117 return false;
19118
19119 /* If one of the two operands is an interesting constant, use the code
19120 above to load a 0/-1 value and mask the variable in with a logical operation. */
19121
19122 if (CONST_INT_P (operands[2]))
19123 {
19124 var = operands[3];
19125 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19126 operands[3] = constm1_rtx, op = and_optab;
19127 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19128 operands[3] = const0_rtx, op = ior_optab;
19129 else
19130 return false;
19131 }
19132 else if (CONST_INT_P (operands[3]))
19133 {
19134 var = operands[2];
19135 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19136 operands[2] = constm1_rtx, op = and_optab;
19137 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19138 operands[2] = const0_rtx, op = ior_optab;
19139 else
19140 return false;
19141 }
19142 else
19143 return false;
19144
19145 orig_out = operands[0];
19146 tmp = gen_reg_rtx (mode);
19147 operands[0] = tmp;
19148
19149 /* Recurse to get the constant loaded. */
19150 if (!ix86_expand_int_movcc (operands))
19151 return false;
19152
19153 /* Mask in the interesting variable. */
19154 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19155 OPTAB_WIDEN);
19156 if (!rtx_equal_p (out, orig_out))
19157 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19158
19159 return true;
19160 }
19161
19162 /*
19163 * For comparison with above,
19164 *
19165 * movl cf,dest
19166 * movl ct,tmp
19167 * cmpl op1,op2
19168 * cmovcc tmp,dest
19169 *
19170 * Size 15.
19171 */
19172
19173 if (! nonimmediate_operand (operands[2], mode))
19174 operands[2] = force_reg (mode, operands[2]);
19175 if (! nonimmediate_operand (operands[3], mode))
19176 operands[3] = force_reg (mode, operands[3]);
19177
19178 if (! register_operand (operands[2], VOIDmode)
19179 && (mode == QImode
19180 || ! register_operand (operands[3], VOIDmode)))
19181 operands[2] = force_reg (mode, operands[2]);
19182
19183 if (mode == QImode
19184 && ! register_operand (operands[3], VOIDmode))
19185 operands[3] = force_reg (mode, operands[3]);
19186
19187 emit_insn (compare_seq);
19188 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19189 gen_rtx_IF_THEN_ELSE (mode,
19190 compare_op, operands[2],
19191 operands[3])));
19192 return true;
19193 }
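/* Editor's illustrative sketch (not part of the original i386.c): the
   branchless selects that the sbb/and/add and setcc/lea sequences above
   compute, in plain C.  FLAG is 1 when the condition holds.  The helper
   names are hypothetical.  */

#include <stdint.h>

static int32_t
select_by_mask (int flag, int32_t ct, int32_t cf)
{
  int32_t mask = -(int32_t) (flag != 0);   /* sbb-style 0 / -1 mask */
  return (mask & (ct - cf)) + cf;          /* ct when FLAG, else cf */
}

static int32_t
select_by_lea (int flag, int32_t ct, int32_t cf)
{
  /* setcc gives 0/1; a single lea then forms cf + flag * (ct - cf).  */
  return cf + flag * (ct - cf);
}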
19194
19195 /* Swap, force into registers, or otherwise massage the two operands
19196 to an sse comparison with a mask result. Thus we differ a bit from
19197 ix86_prepare_fp_compare_args which expects to produce a flags result.
19198
19199 The DEST operand exists to help determine whether to commute commutative
19200 operators. The POP0/POP1 operands are updated in place. The new
19201 comparison code is returned, or UNKNOWN if not implementable. */
19202
19203 static enum rtx_code
19204 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19205 rtx *pop0, rtx *pop1)
19206 {
19207 rtx tmp;
19208
19209 switch (code)
19210 {
19211 case LTGT:
19212 case UNEQ:
19213 /* AVX supports all the needed comparisons. */
19214 if (TARGET_AVX)
19215 break;
19216 /* We have no LTGT as an operator. We could implement it with
19217 NE & ORDERED, but this requires an extra temporary. It's
19218 not clear that it's worth it. */
19219 return UNKNOWN;
19220
19221 case LT:
19222 case LE:
19223 case UNGT:
19224 case UNGE:
19225 /* These are supported directly. */
19226 break;
19227
19228 case EQ:
19229 case NE:
19230 case UNORDERED:
19231 case ORDERED:
19232 /* AVX has 3-operand comparisons; no need to swap anything. */
19233 if (TARGET_AVX)
19234 break;
19235 /* For commutative operators, try to canonicalize the destination
19236 operand to be first in the comparison - this helps reload to
19237 avoid extra moves. */
19238 if (!dest || !rtx_equal_p (dest, *pop1))
19239 break;
19240 /* FALLTHRU */
19241
19242 case GE:
19243 case GT:
19244 case UNLE:
19245 case UNLT:
19246 /* These are not supported directly before AVX, and furthermore
19247 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19248 comparison operands to transform into something that is
19249 supported. */
19250 tmp = *pop0;
19251 *pop0 = *pop1;
19252 *pop1 = tmp;
19253 code = swap_condition (code);
19254 break;
19255
19256 default:
19257 gcc_unreachable ();
19258 }
19259
19260 return code;
19261 }
19262
19263 /* Detect conditional moves that exactly match min/max operational
19264 semantics. Note that this is IEEE safe, as long as we don't
19265 interchange the operands.
19266
19267 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19268 and TRUE if the operation is successful and instructions are emitted. */
19269
19270 static bool
19271 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19272 rtx cmp_op1, rtx if_true, rtx if_false)
19273 {
19274 enum machine_mode mode;
19275 bool is_min;
19276 rtx tmp;
19277
19278 if (code == LT)
19279 ;
19280 else if (code == UNGE)
19281 {
19282 tmp = if_true;
19283 if_true = if_false;
19284 if_false = tmp;
19285 }
19286 else
19287 return false;
19288
19289 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19290 is_min = true;
19291 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19292 is_min = false;
19293 else
19294 return false;
19295
19296 mode = GET_MODE (dest);
19297
19298 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19299 but MODE may be a vector mode and thus not appropriate. */
19300 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19301 {
19302 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19303 rtvec v;
19304
19305 if_true = force_reg (mode, if_true);
19306 v = gen_rtvec (2, if_true, if_false);
19307 tmp = gen_rtx_UNSPEC (mode, v, u);
19308 }
19309 else
19310 {
19311 code = is_min ? SMIN : SMAX;
19312 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19313 }
19314
19315 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19316 return true;
19317 }
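/* Editor's illustrative sketch (not part of the original i386.c): the
   operand-order sensitivity the comment above refers to.  SSE minss
   computes exactly "a < b ? a : b", so a NaN in either operand makes the
   second operand win; swapping the operands is therefore not IEEE safe.
   The helper name is hypothetical.  */

static float
sse_min_semantics (float a, float b)
{
  /* If a or b is a NaN, the compare is false and b is returned.  */
  return a < b ? a : b;
}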
19318
19319 /* Expand an sse vector comparison. Return the register with the result. */
19320
19321 static rtx
19322 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19323 rtx op_true, rtx op_false)
19324 {
19325 enum machine_mode mode = GET_MODE (dest);
19326 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19327 rtx x;
19328
19329 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19330 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19331 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19332
19333 if (optimize
19334 || reg_overlap_mentioned_p (dest, op_true)
19335 || reg_overlap_mentioned_p (dest, op_false))
19336 dest = gen_reg_rtx (mode);
19337
19338 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19339 if (cmp_mode != mode)
19340 {
19341 x = force_reg (cmp_mode, x);
19342 convert_move (dest, x, false);
19343 }
19344 else
19345 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19346
19347 return dest;
19348 }
19349
19350 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19351 operations. This is used for both scalar and vector conditional moves. */
19352
19353 static void
19354 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19355 {
19356 enum machine_mode mode = GET_MODE (dest);
19357 rtx t2, t3, x;
19358
19359 if (vector_all_ones_operand (op_true, mode)
19360 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19361 {
19362 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19363 }
19364 else if (op_false == CONST0_RTX (mode))
19365 {
19366 op_true = force_reg (mode, op_true);
19367 x = gen_rtx_AND (mode, cmp, op_true);
19368 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19369 }
19370 else if (op_true == CONST0_RTX (mode))
19371 {
19372 op_false = force_reg (mode, op_false);
19373 x = gen_rtx_NOT (mode, cmp);
19374 x = gen_rtx_AND (mode, x, op_false);
19375 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19376 }
19377 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19378 {
19379 op_false = force_reg (mode, op_false);
19380 x = gen_rtx_IOR (mode, cmp, op_false);
19381 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19382 }
19383 else if (TARGET_XOP)
19384 {
19385 op_true = force_reg (mode, op_true);
19386
19387 if (!nonimmediate_operand (op_false, mode))
19388 op_false = force_reg (mode, op_false);
19389
19390 emit_insn (gen_rtx_SET (mode, dest,
19391 gen_rtx_IF_THEN_ELSE (mode, cmp,
19392 op_true,
19393 op_false)));
19394 }
19395 else
19396 {
19397 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19398
19399 if (!nonimmediate_operand (op_true, mode))
19400 op_true = force_reg (mode, op_true);
19401
19402 op_false = force_reg (mode, op_false);
19403
19404 switch (mode)
19405 {
19406 case V4SFmode:
19407 if (TARGET_SSE4_1)
19408 gen = gen_sse4_1_blendvps;
19409 break;
19410 case V2DFmode:
19411 if (TARGET_SSE4_1)
19412 gen = gen_sse4_1_blendvpd;
19413 break;
19414 case V16QImode:
19415 case V8HImode:
19416 case V4SImode:
19417 case V2DImode:
19418 if (TARGET_SSE4_1)
19419 {
19420 gen = gen_sse4_1_pblendvb;
19421 dest = gen_lowpart (V16QImode, dest);
19422 op_false = gen_lowpart (V16QImode, op_false);
19423 op_true = gen_lowpart (V16QImode, op_true);
19424 cmp = gen_lowpart (V16QImode, cmp);
19425 }
19426 break;
19427 case V8SFmode:
19428 if (TARGET_AVX)
19429 gen = gen_avx_blendvps256;
19430 break;
19431 case V4DFmode:
19432 if (TARGET_AVX)
19433 gen = gen_avx_blendvpd256;
19434 break;
19435 case V32QImode:
19436 case V16HImode:
19437 case V8SImode:
19438 case V4DImode:
19439 if (TARGET_AVX2)
19440 {
19441 gen = gen_avx2_pblendvb;
19442 dest = gen_lowpart (V32QImode, dest);
19443 op_false = gen_lowpart (V32QImode, op_false);
19444 op_true = gen_lowpart (V32QImode, op_true);
19445 cmp = gen_lowpart (V32QImode, cmp);
19446 }
19447 break;
19448 default:
19449 break;
19450 }
19451
19452 if (gen != NULL)
19453 emit_insn (gen (dest, op_false, op_true, cmp));
19454 else
19455 {
19456 op_true = force_reg (mode, op_true);
19457
19458 t2 = gen_reg_rtx (mode);
19459 if (optimize)
19460 t3 = gen_reg_rtx (mode);
19461 else
19462 t3 = dest;
19463
19464 x = gen_rtx_AND (mode, op_true, cmp);
19465 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19466
19467 x = gen_rtx_NOT (mode, cmp);
19468 x = gen_rtx_AND (mode, x, op_false);
19469 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19470
19471 x = gen_rtx_IOR (mode, t3, t2);
19472 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19473 }
19474 }
19475 }
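/* Editor's illustrative sketch (not part of the original i386.c): the
   and/andnot/or blend emitted above when no blendv-style instruction is
   available.  CMP is an all-ones or all-zeros mask per element.  The
   helper name is hypothetical.  */

#include <stdint.h>

static uint32_t
blend_element (uint32_t cmp, uint32_t op_true, uint32_t op_false)
{
  return (cmp & op_true) | (~cmp & op_false);
}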
19476
19477 /* Expand a floating-point conditional move. Return true if successful. */
19478
19479 bool
19480 ix86_expand_fp_movcc (rtx operands[])
19481 {
19482 enum machine_mode mode = GET_MODE (operands[0]);
19483 enum rtx_code code = GET_CODE (operands[1]);
19484 rtx tmp, compare_op;
19485 rtx op0 = XEXP (operands[1], 0);
19486 rtx op1 = XEXP (operands[1], 1);
19487
19488 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19489 {
19490 enum machine_mode cmode;
19491
19492 /* Since we have no cmove for sse registers, don't force bad register
19493 allocation just to gain access to it. Deny movcc when the
19494 comparison mode doesn't match the move mode. */
19495 cmode = GET_MODE (op0);
19496 if (cmode == VOIDmode)
19497 cmode = GET_MODE (op1);
19498 if (cmode != mode)
19499 return false;
19500
19501 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19502 if (code == UNKNOWN)
19503 return false;
19504
19505 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19506 operands[2], operands[3]))
19507 return true;
19508
19509 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19510 operands[2], operands[3]);
19511 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19512 return true;
19513 }
19514
19515 /* The floating point conditional move instructions don't directly
19516 support conditions resulting from a signed integer comparison. */
19517
19518 compare_op = ix86_expand_compare (code, op0, op1);
19519 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19520 {
19521 tmp = gen_reg_rtx (QImode);
19522 ix86_expand_setcc (tmp, code, op0, op1);
19523
19524 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19525 }
19526
19527 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19528 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19529 operands[2], operands[3])));
19530
19531 return true;
19532 }
19533
19534 /* Expand a floating-point vector conditional move; a vcond operation
19535 rather than a movcc operation. */
19536
19537 bool
19538 ix86_expand_fp_vcond (rtx operands[])
19539 {
19540 enum rtx_code code = GET_CODE (operands[3]);
19541 rtx cmp;
19542
19543 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19544 &operands[4], &operands[5]);
19545 if (code == UNKNOWN)
19546 {
19547 rtx temp;
19548 switch (GET_CODE (operands[3]))
19549 {
19550 case LTGT:
19551 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19552 operands[5], operands[0], operands[0]);
19553 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19554 operands[5], operands[1], operands[2]);
19555 code = AND;
19556 break;
19557 case UNEQ:
19558 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19559 operands[5], operands[0], operands[0]);
19560 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19561 operands[5], operands[1], operands[2]);
19562 code = IOR;
19563 break;
19564 default:
19565 gcc_unreachable ();
19566 }
19567 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19568 OPTAB_DIRECT);
19569 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19570 return true;
19571 }
19572
19573 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19574 operands[5], operands[1], operands[2]))
19575 return true;
19576
19577 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19578 operands[1], operands[2]);
19579 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19580 return true;
19581 }
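/* Editor's illustrative sketch (not part of the original i386.c): the
   per-element predicate decompositions used above for the comparisons SSE
   lacks: LTGT == ORDERED && NE, and UNEQ == UNORDERED || EQ.  The helper
   names are hypothetical.  */

#include <stdbool.h>

static bool
fp_ltgt (double a, double b)
{
  bool unordered = (a != a) || (b != b);
  return !unordered && a != b;
}

static bool
fp_uneq (double a, double b)
{
  bool unordered = (a != a) || (b != b);
  return unordered || a == b;
}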
19582
19583 /* Expand a signed/unsigned integral vector conditional move. */
19584
19585 bool
19586 ix86_expand_int_vcond (rtx operands[])
19587 {
19588 enum machine_mode data_mode = GET_MODE (operands[0]);
19589 enum machine_mode mode = GET_MODE (operands[4]);
19590 enum rtx_code code = GET_CODE (operands[3]);
19591 bool negate = false;
19592 rtx x, cop0, cop1;
19593
19594 cop0 = operands[4];
19595 cop1 = operands[5];
19596
19597 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19598 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
19599 if ((code == LT || code == GE)
19600 && data_mode == mode
19601 && cop1 == CONST0_RTX (mode)
19602 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19603 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19604 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19605 && (GET_MODE_SIZE (data_mode) == 16
19606 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19607 {
19608 rtx negop = operands[2 - (code == LT)];
19609 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19610 if (negop == CONST1_RTX (data_mode))
19611 {
19612 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19613 operands[0], 1, OPTAB_DIRECT);
19614 if (res != operands[0])
19615 emit_move_insn (operands[0], res);
19616 return true;
19617 }
19618 else if (GET_MODE_INNER (data_mode) != DImode
19619 && vector_all_ones_operand (negop, data_mode))
19620 {
19621 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19622 operands[0], 0, OPTAB_DIRECT);
19623 if (res != operands[0])
19624 emit_move_insn (operands[0], res);
19625 return true;
19626 }
19627 }
19628
19629 if (!nonimmediate_operand (cop1, mode))
19630 cop1 = force_reg (mode, cop1);
19631 if (!general_operand (operands[1], data_mode))
19632 operands[1] = force_reg (data_mode, operands[1]);
19633 if (!general_operand (operands[2], data_mode))
19634 operands[2] = force_reg (data_mode, operands[2]);
19635
19636 /* XOP supports all of the comparisons on all 128-bit vector int types. */
19637 if (TARGET_XOP
19638 && (mode == V16QImode || mode == V8HImode
19639 || mode == V4SImode || mode == V2DImode))
19640 ;
19641 else
19642 {
19643 /* Canonicalize the comparison to EQ, GT, GTU. */
19644 switch (code)
19645 {
19646 case EQ:
19647 case GT:
19648 case GTU:
19649 break;
19650
19651 case NE:
19652 case LE:
19653 case LEU:
19654 code = reverse_condition (code);
19655 negate = true;
19656 break;
19657
19658 case GE:
19659 case GEU:
19660 code = reverse_condition (code);
19661 negate = true;
19662 /* FALLTHRU */
19663
19664 case LT:
19665 case LTU:
19666 code = swap_condition (code);
19667 x = cop0, cop0 = cop1, cop1 = x;
19668 break;
19669
19670 default:
19671 gcc_unreachable ();
19672 }
19673
19674 /* Only SSE4.1/SSE4.2 support V2DImode. */
19675 if (mode == V2DImode)
19676 {
19677 switch (code)
19678 {
19679 case EQ:
19680 /* SSE4.1 supports EQ. */
19681 if (!TARGET_SSE4_1)
19682 return false;
19683 break;
19684
19685 case GT:
19686 case GTU:
19687 /* SSE4.2 supports GT/GTU. */
19688 if (!TARGET_SSE4_2)
19689 return false;
19690 break;
19691
19692 default:
19693 gcc_unreachable ();
19694 }
19695 }
19696
19697 /* Unsigned parallel compare is not supported by the hardware.
19698 Play some tricks to turn this into a signed comparison
19699 the hardware can do. */
19700 if (code == GTU)
19701 {
19702 cop0 = force_reg (mode, cop0);
19703
19704 switch (mode)
19705 {
19706 case V8SImode:
19707 case V4DImode:
19708 case V4SImode:
19709 case V2DImode:
19710 {
19711 rtx t1, t2, mask;
19712 rtx (*gen_sub3) (rtx, rtx, rtx);
19713
19714 switch (mode)
19715 {
19716 case V8SImode: gen_sub3 = gen_subv8si3; break;
19717 case V4DImode: gen_sub3 = gen_subv4di3; break;
19718 case V4SImode: gen_sub3 = gen_subv4si3; break;
19719 case V2DImode: gen_sub3 = gen_subv2di3; break;
19720 default:
19721 gcc_unreachable ();
19722 }
19723 /* Subtract (-(INT MAX) - 1) from both operands to make
19724 them signed. */
19725 mask = ix86_build_signbit_mask (mode, true, false);
19726 t1 = gen_reg_rtx (mode);
19727 emit_insn (gen_sub3 (t1, cop0, mask));
19728
19729 t2 = gen_reg_rtx (mode);
19730 emit_insn (gen_sub3 (t2, cop1, mask));
19731
19732 cop0 = t1;
19733 cop1 = t2;
19734 code = GT;
19735 }
19736 break;
19737
19738 case V32QImode:
19739 case V16HImode:
19740 case V16QImode:
19741 case V8HImode:
19742 /* Perform a parallel unsigned saturating subtraction. */
19743 x = gen_reg_rtx (mode);
19744 emit_insn (gen_rtx_SET (VOIDmode, x,
19745 gen_rtx_US_MINUS (mode, cop0, cop1)));
19746
19747 cop0 = x;
19748 cop1 = CONST0_RTX (mode);
19749 code = EQ;
19750 negate = !negate;
19751 break;
19752
19753 default:
19754 gcc_unreachable ();
19755 }
19756 }
19757 }
19758
19759 /* Allow the comparison to be done in one mode, but the movcc to
19760 happen in another mode. */
19761 if (data_mode == mode)
19762 {
19763 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19764 operands[1+negate], operands[2-negate]);
19765 }
19766 else
19767 {
19768 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19769 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19770 code, cop0, cop1,
19771 operands[1+negate], operands[2-negate]);
19772 x = gen_lowpart (data_mode, x);
19773 }
19774
19775 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19776 operands[2-negate]);
19777 return true;
19778 }
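/* Editor's illustrative sketch (not part of the original i386.c): the two
   scalar identities behind the GTU handling above.  The helper names are
   hypothetical; the cast assumes the usual two's-complement conversion,
   which holds on x86.  */

#include <stdint.h>

static int
gtu_via_bias (uint32_t a, uint32_t b)
{
  /* Subtracting INT_MIN from both operands turns unsigned order into
     signed order, so the signed pcmpgt can be used.  */
  return (int32_t) (a - 0x80000000u) > (int32_t) (b - 0x80000000u);
}

static int
gtu_via_ussub (uint16_t a, uint16_t b)
{
  /* An unsigned saturating subtract is non-zero exactly when a > b;
     the vector code tests it for equality with zero and negates.  */
  uint16_t diff = a > b ? (uint16_t) (a - b) : 0;
  return diff != 0;
}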
19779
19780 /* Expand a variable vector permutation. */
19781
19782 void
19783 ix86_expand_vec_perm (rtx operands[])
19784 {
19785 rtx target = operands[0];
19786 rtx op0 = operands[1];
19787 rtx op1 = operands[2];
19788 rtx mask = operands[3];
19789 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19790 enum machine_mode mode = GET_MODE (op0);
19791 enum machine_mode maskmode = GET_MODE (mask);
19792 int w, e, i;
19793 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19794
19795 /* Number of elements in the vector. */
19796 w = GET_MODE_NUNITS (mode);
19797 e = GET_MODE_UNIT_SIZE (mode);
19798 gcc_assert (w <= 32);
19799
19800 if (TARGET_AVX2)
19801 {
19802 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19803 {
19804 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19805 a constant shuffle operand. With a tiny bit of effort we can
19806 use VPERMD instead. A re-interpretation stall for V4DFmode is
19807 unfortunate but there's no avoiding it.
19808 Similarly, for V16HImode we don't have instructions for variable
19809 shuffling, while for V32QImode we can use vpshufb; vpshufb; vpermq;
19810 vpor after preparing suitable masks. */
19811
19812 if (mode == V16HImode)
19813 {
19814 maskmode = mode = V32QImode;
19815 w = 32;
19816 e = 1;
19817 }
19818 else
19819 {
19820 maskmode = mode = V8SImode;
19821 w = 8;
19822 e = 4;
19823 }
19824 t1 = gen_reg_rtx (maskmode);
19825
19826 /* Replicate the low bits of the V4DImode mask into V8SImode:
19827 mask = { A B C D }
19828 t1 = { A A B B C C D D }. */
19829 for (i = 0; i < w / 2; ++i)
19830 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19831 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19832 vt = force_reg (maskmode, vt);
19833 mask = gen_lowpart (maskmode, mask);
19834 if (maskmode == V8SImode)
19835 emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
19836 else
19837 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19838
19839 /* Multiply the shuffle indices by two. */
19840 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19841 OPTAB_DIRECT);
19842
19843 /* Add one to the odd shuffle indices:
19844 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19845 for (i = 0; i < w / 2; ++i)
19846 {
19847 vec[i * 2] = const0_rtx;
19848 vec[i * 2 + 1] = const1_rtx;
19849 }
19850 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19851 vt = force_const_mem (maskmode, vt);
19852 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19853 OPTAB_DIRECT);
19854
19855 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19856 operands[3] = mask = t1;
19857 target = gen_lowpart (mode, target);
19858 op0 = gen_lowpart (mode, op0);
19859 op1 = gen_lowpart (mode, op1);
19860 }
19861
19862 switch (mode)
19863 {
19864 case V8SImode:
19865 /* The VPERMD and VPERMPS instructions already properly ignore
19866 the high bits of the shuffle elements. No need for us to
19867 perform an AND ourselves. */
19868 if (one_operand_shuffle)
19869 emit_insn (gen_avx2_permvarv8si (target, mask, op0));
19870 else
19871 {
19872 t1 = gen_reg_rtx (V8SImode);
19873 t2 = gen_reg_rtx (V8SImode);
19874 emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
19875 emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
19876 goto merge_two;
19877 }
19878 return;
19879
19880 case V8SFmode:
19881 mask = gen_lowpart (V8SFmode, mask);
19882 if (one_operand_shuffle)
19883 emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
19884 else
19885 {
19886 t1 = gen_reg_rtx (V8SFmode);
19887 t2 = gen_reg_rtx (V8SFmode);
19888 emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
19889 emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
19890 goto merge_two;
19891 }
19892 return;
19893
19894 case V4SImode:
19895 /* By combining the two 128-bit input vectors into one 256-bit
19896 input vector, we can use VPERMD and VPERMPS for the full
19897 two-operand shuffle. */
19898 t1 = gen_reg_rtx (V8SImode);
19899 t2 = gen_reg_rtx (V8SImode);
19900 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19901 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19902 emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
19903 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19904 return;
19905
19906 case V4SFmode:
19907 t1 = gen_reg_rtx (V8SFmode);
19908 t2 = gen_reg_rtx (V8SFmode);
19909 mask = gen_lowpart (V4SFmode, mask);
19910 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19911 emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
19912 emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
19913 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19914 return;
19915
19916 case V32QImode:
19917 t1 = gen_reg_rtx (V32QImode);
19918 t2 = gen_reg_rtx (V32QImode);
19919 t3 = gen_reg_rtx (V32QImode);
19920 vt2 = GEN_INT (128);
19921 for (i = 0; i < 32; i++)
19922 vec[i] = vt2;
19923 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19924 vt = force_reg (V32QImode, vt);
19925 for (i = 0; i < 32; i++)
19926 vec[i] = i < 16 ? vt2 : const0_rtx;
19927 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19928 vt2 = force_reg (V32QImode, vt2);
19929 /* From mask create two adjusted masks, which contain the same
19930 bits as mask in the low 7 bits of each vector element.
19931 The first mask will have the most significant bit clear
19932 if it requests element from the same 128-bit lane
19933 and MSB set if it requests element from the other 128-bit lane.
19934 The second mask will have the opposite values of the MSB,
19935 and additionally will have its 128-bit lanes swapped.
19936 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
19937 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
19938 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
19939 stands for other 12 bytes. */
19940 /* The bit that says whether an element is from the same lane or the
19941 other lane is bit 4, so shift it up by 3 to the MSB position. */
19942 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
19943 gen_lowpart (V4DImode, mask),
19944 GEN_INT (3)));
19945 /* Clear MSB bits from the mask just in case it had them set. */
19946 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
19947 /* After this t1 will have the MSB set for elements from the other lane. */
19948 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
19949 /* Clear bits other than MSB. */
19950 emit_insn (gen_andv32qi3 (t1, t1, vt));
19951 /* Or in the lower bits from mask into t3. */
19952 emit_insn (gen_iorv32qi3 (t3, t1, t2));
19953 /* And invert MSB bits in t1, so MSB is set for elements from the same
19954 lane. */
19955 emit_insn (gen_xorv32qi3 (t1, t1, vt));
19956 /* Swap 128-bit lanes in t3. */
19957 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19958 gen_lowpart (V4DImode, t3),
19959 const2_rtx, GEN_INT (3),
19960 const0_rtx, const1_rtx));
19961 /* And or in the lower bits from mask into t1. */
19962 emit_insn (gen_iorv32qi3 (t1, t1, t2));
19963 if (one_operand_shuffle)
19964 {
19965 /* Each of these shuffles will put 0s in places where an
19966 element from the other 128-bit lane is needed; otherwise
19967 it will shuffle in the requested value. */
19968 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
19969 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
19970 /* For t3 the 128-bit lanes are swapped again. */
19971 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19972 gen_lowpart (V4DImode, t3),
19973 const2_rtx, GEN_INT (3),
19974 const0_rtx, const1_rtx));
19975 /* ORing both together leads to the result. */
19976 emit_insn (gen_iorv32qi3 (target, t1, t3));
19977 return;
19978 }
19979
19980 t4 = gen_reg_rtx (V32QImode);
19981 /* Similar to the one_operand_shuffle code above,
19982 just repeated twice, once for each operand. The merge_two:
19983 code will merge the two results together. */
19984 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
19985 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
19986 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
19987 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
19988 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
19989 gen_lowpart (V4DImode, t4),
19990 const2_rtx, GEN_INT (3),
19991 const0_rtx, const1_rtx));
19992 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19993 gen_lowpart (V4DImode, t3),
19994 const2_rtx, GEN_INT (3),
19995 const0_rtx, const1_rtx));
19996 emit_insn (gen_iorv32qi3 (t4, t2, t4));
19997 emit_insn (gen_iorv32qi3 (t3, t1, t3));
19998 t1 = t4;
19999 t2 = t3;
20000 goto merge_two;
20001
20002 default:
20003 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20004 break;
20005 }
20006 }
20007
20008 if (TARGET_XOP)
20009 {
20010 /* The XOP VPPERM insn supports three inputs. By ignoring the
20011 one_operand_shuffle special case, we avoid creating another
20012 set of constant vectors in memory. */
20013 one_operand_shuffle = false;
20014
20015 /* mask = mask & {2*w-1, ...} */
20016 vt = GEN_INT (2*w - 1);
20017 }
20018 else
20019 {
20020 /* mask = mask & {w-1, ...} */
20021 vt = GEN_INT (w - 1);
20022 }
20023
20024 for (i = 0; i < w; i++)
20025 vec[i] = vt;
20026 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20027 mask = expand_simple_binop (maskmode, AND, mask, vt,
20028 NULL_RTX, 0, OPTAB_DIRECT);
20029
20030 /* For non-QImode operations, convert the word permutation control
20031 into a byte permutation control. */
20032 if (mode != V16QImode)
20033 {
20034 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20035 GEN_INT (exact_log2 (e)),
20036 NULL_RTX, 0, OPTAB_DIRECT);
20037
20038 /* Convert mask to vector of chars. */
20039 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20040
20041 /* Replicate each of the input bytes into byte positions:
20042 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20043 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20044 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20045 for (i = 0; i < 16; ++i)
20046 vec[i] = GEN_INT (i/e * e);
20047 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20048 vt = force_const_mem (V16QImode, vt);
20049 if (TARGET_XOP)
20050 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20051 else
20052 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20053
20054 /* Convert it into the byte positions by doing
20055 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20056 for (i = 0; i < 16; ++i)
20057 vec[i] = GEN_INT (i % e);
20058 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20059 vt = force_const_mem (V16QImode, vt);
20060 emit_insn (gen_addv16qi3 (mask, mask, vt));
20061 }
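/* Illustrative worked example (not part of the original sources): for a
   V4SImode shuffle, e == 4, so a word index of 5 in the control is first
   shifted to 5 << 2 == 20; the pshufb above then copies that low byte into
   all four byte slots of its dword, and adding { 0,1,2,3, ... } turns it
   into the byte indices { 20, 21, 22, 23 }.  */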
20062
20063 /* The actual shuffle operations all operate on V16QImode. */
20064 op0 = gen_lowpart (V16QImode, op0);
20065 op1 = gen_lowpart (V16QImode, op1);
20066 target = gen_lowpart (V16QImode, target);
20067
20068 if (TARGET_XOP)
20069 {
20070 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20071 }
20072 else if (one_operand_shuffle)
20073 {
20074 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20075 }
20076 else
20077 {
20078 rtx xops[6];
20079 bool ok;
20080
20081 /* Shuffle the two input vectors independently. */
20082 t1 = gen_reg_rtx (V16QImode);
20083 t2 = gen_reg_rtx (V16QImode);
20084 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20085 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20086
20087 merge_two:
20088 /* Then merge them together. The key is whether any given control
20089 element contained a bit set that indicates the second word. */
20090 mask = operands[3];
20091 vt = GEN_INT (w);
20092 if (maskmode == V2DImode && !TARGET_SSE4_1)
20093 {
 20094 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
 20095 more shuffle to convert the V2DI input mask into a V4SI
 20096 input mask. At that point the masking that expand_int_vcond
 20097 performs will work as desired. */
20098 rtx t3 = gen_reg_rtx (V4SImode);
20099 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20100 const0_rtx, const0_rtx,
20101 const2_rtx, const2_rtx));
20102 mask = t3;
20103 maskmode = V4SImode;
20104 e = w = 4;
20105 }
20106
20107 for (i = 0; i < w; i++)
20108 vec[i] = vt;
20109 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20110 vt = force_reg (maskmode, vt);
20111 mask = expand_simple_binop (maskmode, AND, mask, vt,
20112 NULL_RTX, 0, OPTAB_DIRECT);
20113
20114 xops[0] = gen_lowpart (mode, operands[0]);
20115 xops[1] = gen_lowpart (mode, t2);
20116 xops[2] = gen_lowpart (mode, t1);
20117 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20118 xops[4] = mask;
20119 xops[5] = vt;
20120 ok = ix86_expand_int_vcond (xops);
20121 gcc_assert (ok);
20122 }
20123 }
20124
20125 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
20126 true if we should do zero extension, else sign extension. HIGH_P is
20127 true if we want the N/2 high elements, else the low elements. */
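/* An illustrative sketch (assumed, not from the original sources): with
   TARGET_SSE4_1, a V8HImode source with unsigned_p and !high_p expands to a
   single pmovzxwd; without SSE4.1 the same request is handled by
   interleaving the low halfwords with a zero register (punpcklwd), which
   the fallback path below emits via gen_vec_interleave_lowv8hi.  */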
20128
20129 void
20130 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
20131 {
20132 enum machine_mode imode = GET_MODE (operands[1]);
20133 rtx tmp, dest;
20134
20135 if (TARGET_SSE4_1)
20136 {
20137 rtx (*unpack)(rtx, rtx);
20138 rtx (*extract)(rtx, rtx) = NULL;
20139 enum machine_mode halfmode = BLKmode;
20140
20141 switch (imode)
20142 {
20143 case V32QImode:
20144 if (unsigned_p)
20145 unpack = gen_avx2_zero_extendv16qiv16hi2;
20146 else
20147 unpack = gen_avx2_sign_extendv16qiv16hi2;
20148 halfmode = V16QImode;
20149 extract
20150 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20151 break;
20152 case V16HImode:
20153 if (unsigned_p)
20154 unpack = gen_avx2_zero_extendv8hiv8si2;
20155 else
20156 unpack = gen_avx2_sign_extendv8hiv8si2;
20157 halfmode = V8HImode;
20158 extract
20159 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20160 break;
20161 case V8SImode:
20162 if (unsigned_p)
20163 unpack = gen_avx2_zero_extendv4siv4di2;
20164 else
20165 unpack = gen_avx2_sign_extendv4siv4di2;
20166 halfmode = V4SImode;
20167 extract
20168 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20169 break;
20170 case V16QImode:
20171 if (unsigned_p)
20172 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20173 else
20174 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20175 break;
20176 case V8HImode:
20177 if (unsigned_p)
20178 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20179 else
20180 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20181 break;
20182 case V4SImode:
20183 if (unsigned_p)
20184 unpack = gen_sse4_1_zero_extendv2siv2di2;
20185 else
20186 unpack = gen_sse4_1_sign_extendv2siv2di2;
20187 break;
20188 default:
20189 gcc_unreachable ();
20190 }
20191
20192 if (GET_MODE_SIZE (imode) == 32)
20193 {
20194 tmp = gen_reg_rtx (halfmode);
20195 emit_insn (extract (tmp, operands[1]));
20196 }
20197 else if (high_p)
20198 {
20199 /* Shift higher 8 bytes to lower 8 bytes. */
20200 tmp = gen_reg_rtx (imode);
20201 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20202 gen_lowpart (V1TImode, operands[1]),
20203 GEN_INT (64)));
20204 }
20205 else
20206 tmp = operands[1];
20207
20208 emit_insn (unpack (operands[0], tmp));
20209 }
20210 else
20211 {
20212 rtx (*unpack)(rtx, rtx, rtx);
20213
20214 switch (imode)
20215 {
20216 case V16QImode:
20217 if (high_p)
20218 unpack = gen_vec_interleave_highv16qi;
20219 else
20220 unpack = gen_vec_interleave_lowv16qi;
20221 break;
20222 case V8HImode:
20223 if (high_p)
20224 unpack = gen_vec_interleave_highv8hi;
20225 else
20226 unpack = gen_vec_interleave_lowv8hi;
20227 break;
20228 case V4SImode:
20229 if (high_p)
20230 unpack = gen_vec_interleave_highv4si;
20231 else
20232 unpack = gen_vec_interleave_lowv4si;
20233 break;
20234 default:
20235 gcc_unreachable ();
20236 }
20237
20238 dest = gen_lowpart (imode, operands[0]);
20239
20240 if (unsigned_p)
20241 tmp = force_reg (imode, CONST0_RTX (imode));
20242 else
20243 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20244 operands[1], pc_rtx, pc_rtx);
20245
20246 emit_insn (unpack (dest, operands[1], tmp));
20247 }
20248 }
20249
 20250 /* Expand conditional increment or decrement using adc/sbb instructions.
 20251 The default case using setcc followed by the conditional move can be
 20252 done by generic code. */
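/* An illustrative example (hedged, not from the original sources): for
   source like "if (a < b) x++;" with unsigned operands, the expansion
   below boils down to
	cmp	b, a
	adc	$0, x
   i.e. the carry produced by the compare is folded straight into the
   increment; the sbb form handles the decrement and inverted cases.  */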
20253 bool
20254 ix86_expand_int_addcc (rtx operands[])
20255 {
20256 enum rtx_code code = GET_CODE (operands[1]);
20257 rtx flags;
20258 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20259 rtx compare_op;
20260 rtx val = const0_rtx;
20261 bool fpcmp = false;
20262 enum machine_mode mode;
20263 rtx op0 = XEXP (operands[1], 0);
20264 rtx op1 = XEXP (operands[1], 1);
20265
20266 if (operands[3] != const1_rtx
20267 && operands[3] != constm1_rtx)
20268 return false;
20269 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20270 return false;
20271 code = GET_CODE (compare_op);
20272
20273 flags = XEXP (compare_op, 0);
20274
20275 if (GET_MODE (flags) == CCFPmode
20276 || GET_MODE (flags) == CCFPUmode)
20277 {
20278 fpcmp = true;
20279 code = ix86_fp_compare_code_to_integer (code);
20280 }
20281
20282 if (code != LTU)
20283 {
20284 val = constm1_rtx;
20285 if (fpcmp)
20286 PUT_CODE (compare_op,
20287 reverse_condition_maybe_unordered
20288 (GET_CODE (compare_op)));
20289 else
20290 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20291 }
20292
20293 mode = GET_MODE (operands[0]);
20294
20295 /* Construct either adc or sbb insn. */
20296 if ((code == LTU) == (operands[3] == constm1_rtx))
20297 {
20298 switch (mode)
20299 {
20300 case QImode:
20301 insn = gen_subqi3_carry;
20302 break;
20303 case HImode:
20304 insn = gen_subhi3_carry;
20305 break;
20306 case SImode:
20307 insn = gen_subsi3_carry;
20308 break;
20309 case DImode:
20310 insn = gen_subdi3_carry;
20311 break;
20312 default:
20313 gcc_unreachable ();
20314 }
20315 }
20316 else
20317 {
20318 switch (mode)
20319 {
20320 case QImode:
20321 insn = gen_addqi3_carry;
20322 break;
20323 case HImode:
20324 insn = gen_addhi3_carry;
20325 break;
20326 case SImode:
20327 insn = gen_addsi3_carry;
20328 break;
20329 case DImode:
20330 insn = gen_adddi3_carry;
20331 break;
20332 default:
20333 gcc_unreachable ();
20334 }
20335 }
20336 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20337
20338 return true;
20339 }
20340
20341
 20342 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
 20343 but works for floating point parameters and non-offsettable memories.
 20344 For pushes, it returns just stack offsets; the values will be saved
 20345 in the right order. At most four parts are generated. */
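/* For example (illustrative): on a 32-bit target an XFmode (12-byte) value
   is split into three SImode parts and a DImode value into two; on a
   64-bit target a TFmode value is split into two DImode parts.  */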
20346
20347 static int
20348 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20349 {
20350 int size;
20351
20352 if (!TARGET_64BIT)
20353 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20354 else
20355 size = (GET_MODE_SIZE (mode) + 4) / 8;
20356
20357 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20358 gcc_assert (size >= 2 && size <= 4);
20359
20360 /* Optimize constant pool reference to immediates. This is used by fp
20361 moves, that force all constants to memory to allow combining. */
20362 if (MEM_P (operand) && MEM_READONLY_P (operand))
20363 {
20364 rtx tmp = maybe_get_pool_constant (operand);
20365 if (tmp)
20366 operand = tmp;
20367 }
20368
20369 if (MEM_P (operand) && !offsettable_memref_p (operand))
20370 {
 20371 /* The only non-offsettable memories we handle are pushes. */
20372 int ok = push_operand (operand, VOIDmode);
20373
20374 gcc_assert (ok);
20375
20376 operand = copy_rtx (operand);
20377 PUT_MODE (operand, word_mode);
20378 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20379 return size;
20380 }
20381
20382 if (GET_CODE (operand) == CONST_VECTOR)
20383 {
20384 enum machine_mode imode = int_mode_for_mode (mode);
20385 /* Caution: if we looked through a constant pool memory above,
20386 the operand may actually have a different mode now. That's
20387 ok, since we want to pun this all the way back to an integer. */
20388 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20389 gcc_assert (operand != NULL);
20390 mode = imode;
20391 }
20392
20393 if (!TARGET_64BIT)
20394 {
20395 if (mode == DImode)
20396 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20397 else
20398 {
20399 int i;
20400
20401 if (REG_P (operand))
20402 {
20403 gcc_assert (reload_completed);
20404 for (i = 0; i < size; i++)
20405 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20406 }
20407 else if (offsettable_memref_p (operand))
20408 {
20409 operand = adjust_address (operand, SImode, 0);
20410 parts[0] = operand;
20411 for (i = 1; i < size; i++)
20412 parts[i] = adjust_address (operand, SImode, 4 * i);
20413 }
20414 else if (GET_CODE (operand) == CONST_DOUBLE)
20415 {
20416 REAL_VALUE_TYPE r;
20417 long l[4];
20418
20419 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20420 switch (mode)
20421 {
20422 case TFmode:
20423 real_to_target (l, &r, mode);
20424 parts[3] = gen_int_mode (l[3], SImode);
20425 parts[2] = gen_int_mode (l[2], SImode);
20426 break;
20427 case XFmode:
20428 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20429 parts[2] = gen_int_mode (l[2], SImode);
20430 break;
20431 case DFmode:
20432 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20433 break;
20434 default:
20435 gcc_unreachable ();
20436 }
20437 parts[1] = gen_int_mode (l[1], SImode);
20438 parts[0] = gen_int_mode (l[0], SImode);
20439 }
20440 else
20441 gcc_unreachable ();
20442 }
20443 }
20444 else
20445 {
20446 if (mode == TImode)
20447 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20448 if (mode == XFmode || mode == TFmode)
20449 {
20450 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20451 if (REG_P (operand))
20452 {
20453 gcc_assert (reload_completed);
20454 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20455 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20456 }
20457 else if (offsettable_memref_p (operand))
20458 {
20459 operand = adjust_address (operand, DImode, 0);
20460 parts[0] = operand;
20461 parts[1] = adjust_address (operand, upper_mode, 8);
20462 }
20463 else if (GET_CODE (operand) == CONST_DOUBLE)
20464 {
20465 REAL_VALUE_TYPE r;
20466 long l[4];
20467
20468 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20469 real_to_target (l, &r, mode);
20470
20471 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20472 if (HOST_BITS_PER_WIDE_INT >= 64)
20473 parts[0]
20474 = gen_int_mode
20475 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20476 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20477 DImode);
20478 else
20479 parts[0] = immed_double_const (l[0], l[1], DImode);
20480
20481 if (upper_mode == SImode)
20482 parts[1] = gen_int_mode (l[2], SImode);
20483 else if (HOST_BITS_PER_WIDE_INT >= 64)
20484 parts[1]
20485 = gen_int_mode
20486 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20487 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20488 DImode);
20489 else
20490 parts[1] = immed_double_const (l[2], l[3], DImode);
20491 }
20492 else
20493 gcc_unreachable ();
20494 }
20495 }
20496
20497 return size;
20498 }
20499
 20500 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
 20501 The value is split into half-sized parts; operands 2 and up hold the
 20502 destination parts and operands 6 and up the corresponding source
 20503 parts, in the correct emission order. */
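/* An illustrative case (hedged): a 32-bit DImode register-to-register move
   is emitted as two SImode moves, and the ordering code below flips the
   two when the first destination half would overwrite a source half that
   is still needed.  */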
20504
20505 void
20506 ix86_split_long_move (rtx operands[])
20507 {
20508 rtx part[2][4];
20509 int nparts, i, j;
20510 int push = 0;
20511 int collisions = 0;
20512 enum machine_mode mode = GET_MODE (operands[0]);
20513 bool collisionparts[4];
20514
 20515 /* The DFmode expanders may ask us to move a double.
 20516 For a 64-bit target this is a single move. By hiding the fact
 20517 here we simplify the i386.md splitters. */
20518 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20519 {
20520 /* Optimize constant pool reference to immediates. This is used by
20521 fp moves, that force all constants to memory to allow combining. */
20522
20523 if (MEM_P (operands[1])
20524 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20525 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20526 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20527 if (push_operand (operands[0], VOIDmode))
20528 {
20529 operands[0] = copy_rtx (operands[0]);
20530 PUT_MODE (operands[0], word_mode);
20531 }
20532 else
20533 operands[0] = gen_lowpart (DImode, operands[0]);
20534 operands[1] = gen_lowpart (DImode, operands[1]);
20535 emit_move_insn (operands[0], operands[1]);
20536 return;
20537 }
20538
20539 /* The only non-offsettable memory we handle is push. */
20540 if (push_operand (operands[0], VOIDmode))
20541 push = 1;
20542 else
20543 gcc_assert (!MEM_P (operands[0])
20544 || offsettable_memref_p (operands[0]));
20545
20546 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20547 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20548
20549 /* When emitting push, take care for source operands on the stack. */
20550 if (push && MEM_P (operands[1])
20551 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20552 {
20553 rtx src_base = XEXP (part[1][nparts - 1], 0);
20554
20555 /* Compensate for the stack decrement by 4. */
20556 if (!TARGET_64BIT && nparts == 3
20557 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20558 src_base = plus_constant (src_base, 4);
20559
20560 /* src_base refers to the stack pointer and is
20561 automatically decreased by emitted push. */
20562 for (i = 0; i < nparts; i++)
20563 part[1][i] = change_address (part[1][i],
20564 GET_MODE (part[1][i]), src_base);
20565 }
20566
20567 /* We need to do copy in the right order in case an address register
20568 of the source overlaps the destination. */
20569 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20570 {
20571 rtx tmp;
20572
20573 for (i = 0; i < nparts; i++)
20574 {
20575 collisionparts[i]
20576 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20577 if (collisionparts[i])
20578 collisions++;
20579 }
20580
20581 /* Collision in the middle part can be handled by reordering. */
20582 if (collisions == 1 && nparts == 3 && collisionparts [1])
20583 {
20584 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20585 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20586 }
20587 else if (collisions == 1
20588 && nparts == 4
20589 && (collisionparts [1] || collisionparts [2]))
20590 {
20591 if (collisionparts [1])
20592 {
20593 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20594 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20595 }
20596 else
20597 {
20598 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20599 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20600 }
20601 }
20602
20603 /* If there are more collisions, we can't handle it by reordering.
20604 Do an lea to the last part and use only one colliding move. */
20605 else if (collisions > 1)
20606 {
20607 rtx base;
20608
20609 collisions = 1;
20610
20611 base = part[0][nparts - 1];
20612
20613 /* Handle the case when the last part isn't valid for lea.
20614 Happens in 64-bit mode storing the 12-byte XFmode. */
20615 if (GET_MODE (base) != Pmode)
20616 base = gen_rtx_REG (Pmode, REGNO (base));
20617
20618 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20619 part[1][0] = replace_equiv_address (part[1][0], base);
20620 for (i = 1; i < nparts; i++)
20621 {
20622 tmp = plus_constant (base, UNITS_PER_WORD * i);
20623 part[1][i] = replace_equiv_address (part[1][i], tmp);
20624 }
20625 }
20626 }
20627
20628 if (push)
20629 {
20630 if (!TARGET_64BIT)
20631 {
20632 if (nparts == 3)
20633 {
20634 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20635 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
20636 stack_pointer_rtx, GEN_INT (-4)));
20637 emit_move_insn (part[0][2], part[1][2]);
20638 }
20639 else if (nparts == 4)
20640 {
20641 emit_move_insn (part[0][3], part[1][3]);
20642 emit_move_insn (part[0][2], part[1][2]);
20643 }
20644 }
20645 else
20646 {
 20647 /* In 64-bit mode we don't have a 32-bit push available. If this is a
 20648 register, that is OK - we will just use the larger counterpart. We also
 20649 retype memories - this comes from the attempt to avoid a REX prefix on
 20650 moving the second half of a TFmode value. */
20651 if (GET_MODE (part[1][1]) == SImode)
20652 {
20653 switch (GET_CODE (part[1][1]))
20654 {
20655 case MEM:
20656 part[1][1] = adjust_address (part[1][1], DImode, 0);
20657 break;
20658
20659 case REG:
20660 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20661 break;
20662
20663 default:
20664 gcc_unreachable ();
20665 }
20666
20667 if (GET_MODE (part[1][0]) == SImode)
20668 part[1][0] = part[1][1];
20669 }
20670 }
20671 emit_move_insn (part[0][1], part[1][1]);
20672 emit_move_insn (part[0][0], part[1][0]);
20673 return;
20674 }
20675
 20676 /* Choose the correct order so as not to overwrite the source before it is copied. */
20677 if ((REG_P (part[0][0])
20678 && REG_P (part[1][1])
20679 && (REGNO (part[0][0]) == REGNO (part[1][1])
20680 || (nparts == 3
20681 && REGNO (part[0][0]) == REGNO (part[1][2]))
20682 || (nparts == 4
20683 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20684 || (collisions > 0
20685 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20686 {
20687 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20688 {
20689 operands[2 + i] = part[0][j];
20690 operands[6 + i] = part[1][j];
20691 }
20692 }
20693 else
20694 {
20695 for (i = 0; i < nparts; i++)
20696 {
20697 operands[2 + i] = part[0][i];
20698 operands[6 + i] = part[1][i];
20699 }
20700 }
20701
20702 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20703 if (optimize_insn_for_size_p ())
20704 {
20705 for (j = 0; j < nparts - 1; j++)
20706 if (CONST_INT_P (operands[6 + j])
20707 && operands[6 + j] != const0_rtx
20708 && REG_P (operands[2 + j]))
20709 for (i = j; i < nparts - 1; i++)
20710 if (CONST_INT_P (operands[7 + i])
20711 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20712 operands[7 + i] = operands[2 + j];
20713 }
20714
20715 for (i = 0; i < nparts; i++)
20716 emit_move_insn (operands[2 + i], operands[6 + i]);
20717
20718 return;
20719 }
20720
20721 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20722 left shift by a constant, either using a single shift or
20723 a sequence of add instructions. */
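/* For instance (illustrative): a shift by 1 becomes a single "add reg, reg";
   larger counts use one "shl $count, reg" unless COUNT additions are no more
   costly than a constant shift and we are not optimizing for size.  */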
20724
20725 static void
20726 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20727 {
20728 rtx (*insn)(rtx, rtx, rtx);
20729
20730 if (count == 1
20731 || (count * ix86_cost->add <= ix86_cost->shift_const
20732 && !optimize_insn_for_size_p ()))
20733 {
20734 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20735 while (count-- > 0)
20736 emit_insn (insn (operand, operand, operand));
20737 }
20738 else
20739 {
20740 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20741 emit_insn (insn (operand, operand, GEN_INT (count)));
20742 }
20743 }
20744
20745 void
20746 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20747 {
20748 rtx (*gen_ashl3)(rtx, rtx, rtx);
20749 rtx (*gen_shld)(rtx, rtx, rtx);
20750 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20751
20752 rtx low[2], high[2];
20753 int count;
20754
20755 if (CONST_INT_P (operands[2]))
20756 {
20757 split_double_mode (mode, operands, 2, low, high);
20758 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20759
20760 if (count >= half_width)
20761 {
20762 emit_move_insn (high[0], low[1]);
20763 emit_move_insn (low[0], const0_rtx);
20764
20765 if (count > half_width)
20766 ix86_expand_ashl_const (high[0], count - half_width, mode);
20767 }
20768 else
20769 {
20770 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20771
20772 if (!rtx_equal_p (operands[0], operands[1]))
20773 emit_move_insn (operands[0], operands[1]);
20774
20775 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20776 ix86_expand_ashl_const (low[0], count, mode);
20777 }
20778 return;
20779 }
20780
20781 split_double_mode (mode, operands, 1, low, high);
20782
20783 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20784
20785 if (operands[1] == const1_rtx)
20786 {
 20787 /* Assuming we've chosen QImode-capable registers, 1 << N
 20788 can be done with two 32/64-bit shifts, no branches, no cmoves. */
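/* An illustrative sketch of the emitted sequence for DImode 1 << n on a
   32-bit target (register names are only examples):
	xorl	%eax, %eax
	xorl	%edx, %edx
	testb	$32, %cl
	sete	%al
	setne	%dl
	sall	%cl, %eax
	sall	%cl, %edx
   The hardware shifter masks the count to 5 bits, so whichever half
   received the 1 ends up holding 1 << (n & 31).  */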
20789 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20790 {
20791 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20792
20793 ix86_expand_clear (low[0]);
20794 ix86_expand_clear (high[0]);
20795 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20796
20797 d = gen_lowpart (QImode, low[0]);
20798 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20799 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20800 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20801
20802 d = gen_lowpart (QImode, high[0]);
20803 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20804 s = gen_rtx_NE (QImode, flags, const0_rtx);
20805 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20806 }
20807
20808 /* Otherwise, we can get the same results by manually performing
20809 a bit extract operation on bit 5/6, and then performing the two
20810 shifts. The two methods of getting 0/1 into low/high are exactly
20811 the same size. Avoiding the shift in the bit extract case helps
20812 pentium4 a bit; no one else seems to care much either way. */
20813 else
20814 {
20815 enum machine_mode half_mode;
20816 rtx (*gen_lshr3)(rtx, rtx, rtx);
20817 rtx (*gen_and3)(rtx, rtx, rtx);
20818 rtx (*gen_xor3)(rtx, rtx, rtx);
20819 HOST_WIDE_INT bits;
20820 rtx x;
20821
20822 if (mode == DImode)
20823 {
20824 half_mode = SImode;
20825 gen_lshr3 = gen_lshrsi3;
20826 gen_and3 = gen_andsi3;
20827 gen_xor3 = gen_xorsi3;
20828 bits = 5;
20829 }
20830 else
20831 {
20832 half_mode = DImode;
20833 gen_lshr3 = gen_lshrdi3;
20834 gen_and3 = gen_anddi3;
20835 gen_xor3 = gen_xordi3;
20836 bits = 6;
20837 }
20838
20839 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20840 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20841 else
20842 x = gen_lowpart (half_mode, operands[2]);
20843 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20844
20845 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20846 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20847 emit_move_insn (low[0], high[0]);
20848 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20849 }
20850
20851 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20852 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20853 return;
20854 }
20855
20856 if (operands[1] == constm1_rtx)
20857 {
20858 /* For -1 << N, we can avoid the shld instruction, because we
20859 know that we're shifting 0...31/63 ones into a -1. */
20860 emit_move_insn (low[0], constm1_rtx);
20861 if (optimize_insn_for_size_p ())
20862 emit_move_insn (high[0], low[0]);
20863 else
20864 emit_move_insn (high[0], constm1_rtx);
20865 }
20866 else
20867 {
20868 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20869
20870 if (!rtx_equal_p (operands[0], operands[1]))
20871 emit_move_insn (operands[0], operands[1]);
20872
20873 split_double_mode (mode, operands, 1, low, high);
20874 emit_insn (gen_shld (high[0], low[0], operands[2]));
20875 }
20876
20877 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20878
20879 if (TARGET_CMOVE && scratch)
20880 {
20881 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20882 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20883
20884 ix86_expand_clear (scratch);
20885 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20886 }
20887 else
20888 {
20889 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20890 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20891
20892 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20893 }
20894 }
20895
20896 void
20897 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20898 {
20899 rtx (*gen_ashr3)(rtx, rtx, rtx)
20900 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20901 rtx (*gen_shrd)(rtx, rtx, rtx);
20902 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20903
20904 rtx low[2], high[2];
20905 int count;
20906
20907 if (CONST_INT_P (operands[2]))
20908 {
20909 split_double_mode (mode, operands, 2, low, high);
20910 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20911
20912 if (count == GET_MODE_BITSIZE (mode) - 1)
20913 {
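/* Shifting by every bit but one just broadcasts the sign bit; e.g.
   (illustrative) a DImode ">> 63" on a 32-bit target becomes "sarl $31"
   on the high word followed by a copy into the low word.  */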
20914 emit_move_insn (high[0], high[1]);
20915 emit_insn (gen_ashr3 (high[0], high[0],
20916 GEN_INT (half_width - 1)));
20917 emit_move_insn (low[0], high[0]);
20918
20919 }
20920 else if (count >= half_width)
20921 {
20922 emit_move_insn (low[0], high[1]);
20923 emit_move_insn (high[0], low[0]);
20924 emit_insn (gen_ashr3 (high[0], high[0],
20925 GEN_INT (half_width - 1)));
20926
20927 if (count > half_width)
20928 emit_insn (gen_ashr3 (low[0], low[0],
20929 GEN_INT (count - half_width)));
20930 }
20931 else
20932 {
20933 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20934
20935 if (!rtx_equal_p (operands[0], operands[1]))
20936 emit_move_insn (operands[0], operands[1]);
20937
20938 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20939 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
20940 }
20941 }
20942 else
20943 {
20944 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20945
20946 if (!rtx_equal_p (operands[0], operands[1]))
20947 emit_move_insn (operands[0], operands[1]);
20948
20949 split_double_mode (mode, operands, 1, low, high);
20950
20951 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20952 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
20953
20954 if (TARGET_CMOVE && scratch)
20955 {
20956 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20957 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20958
20959 emit_move_insn (scratch, high[0]);
20960 emit_insn (gen_ashr3 (scratch, scratch,
20961 GEN_INT (half_width - 1)));
20962 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20963 scratch));
20964 }
20965 else
20966 {
20967 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
20968 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
20969
20970 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
20971 }
20972 }
20973 }
20974
20975 void
20976 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
20977 {
20978 rtx (*gen_lshr3)(rtx, rtx, rtx)
20979 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
20980 rtx (*gen_shrd)(rtx, rtx, rtx);
20981 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20982
20983 rtx low[2], high[2];
20984 int count;
20985
20986 if (CONST_INT_P (operands[2]))
20987 {
20988 split_double_mode (mode, operands, 2, low, high);
20989 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20990
20991 if (count >= half_width)
20992 {
20993 emit_move_insn (low[0], high[1]);
20994 ix86_expand_clear (high[0]);
20995
20996 if (count > half_width)
20997 emit_insn (gen_lshr3 (low[0], low[0],
20998 GEN_INT (count - half_width)));
20999 }
21000 else
21001 {
21002 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21003
21004 if (!rtx_equal_p (operands[0], operands[1]))
21005 emit_move_insn (operands[0], operands[1]);
21006
21007 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21008 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21009 }
21010 }
21011 else
21012 {
21013 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21014
21015 if (!rtx_equal_p (operands[0], operands[1]))
21016 emit_move_insn (operands[0], operands[1]);
21017
21018 split_double_mode (mode, operands, 1, low, high);
21019
21020 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21021 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21022
21023 if (TARGET_CMOVE && scratch)
21024 {
21025 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21026 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21027
21028 ix86_expand_clear (scratch);
21029 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21030 scratch));
21031 }
21032 else
21033 {
21034 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21035 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21036
21037 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21038 }
21039 }
21040 }
21041
 21042 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
21043 static void
21044 predict_jump (int prob)
21045 {
21046 rtx insn = get_last_insn ();
21047 gcc_assert (JUMP_P (insn));
21048 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21049 }
21050
 21051 /* Helper function for the string operations below. Test whether VARIABLE
 21052 is aligned to VALUE bytes (the VALUE bit is clear); if so, jump to the label. */
21053 static rtx
21054 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21055 {
21056 rtx label = gen_label_rtx ();
21057 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21058 if (GET_MODE (variable) == DImode)
21059 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21060 else
21061 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21062 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21063 1, label);
21064 if (epilogue)
21065 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21066 else
21067 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21068 return label;
21069 }
21070
 21071 /* Decrease COUNTREG by VALUE. */
21072 static void
21073 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21074 {
21075 rtx (*gen_add)(rtx, rtx, rtx)
21076 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21077
21078 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21079 }
21080
 21081 /* Zero-extend EXP, which may be in SImode, to a Pmode register. */
21082 rtx
21083 ix86_zero_extend_to_Pmode (rtx exp)
21084 {
21085 if (GET_MODE (exp) != Pmode)
21086 exp = convert_to_mode (Pmode, exp, 1);
21087 return force_reg (Pmode, exp);
21088 }
21089
21090 /* Divide COUNTREG by SCALE. */
21091 static rtx
21092 scale_counter (rtx countreg, int scale)
21093 {
21094 rtx sc;
21095
21096 if (scale == 1)
21097 return countreg;
21098 if (CONST_INT_P (countreg))
21099 return GEN_INT (INTVAL (countreg) / scale);
21100 gcc_assert (REG_P (countreg));
21101
21102 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21103 GEN_INT (exact_log2 (scale)),
21104 NULL, 1, OPTAB_DIRECT);
21105 return sc;
21106 }
21107
21108 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21109 DImode for constant loop counts. */
21110
21111 static enum machine_mode
21112 counter_mode (rtx count_exp)
21113 {
21114 if (GET_MODE (count_exp) != VOIDmode)
21115 return GET_MODE (count_exp);
21116 if (!CONST_INT_P (count_exp))
21117 return Pmode;
21118 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21119 return DImode;
21120 return SImode;
21121 }
21122
 21123 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed
 21124 to by SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times;
 21125 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output the
21126 equivalent loop to set memory by VALUE (supposed to be in MODE).
21127
21128 The size is rounded down to whole number of chunk size moved at once.
21129 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
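/* A rough sketch of the emitted loop (illustrative pseudo-code only):

     size = count & ~(GET_MODE_SIZE (mode) * unroll - 1);
     for (iter = 0; iter < size; iter += GET_MODE_SIZE (mode) * unroll)
       copy (or store VALUE into) UNROLL chunks of MODE at DESTPTR + iter
	 (and SRCPTR + iter);
     DESTPTR += iter;  SRCPTR += iter;  */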
21130
21131
21132 static void
21133 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21134 rtx destptr, rtx srcptr, rtx value,
21135 rtx count, enum machine_mode mode, int unroll,
21136 int expected_size)
21137 {
21138 rtx out_label, top_label, iter, tmp;
21139 enum machine_mode iter_mode = counter_mode (count);
21140 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21141 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21142 rtx size;
21143 rtx x_addr;
21144 rtx y_addr;
21145 int i;
21146
21147 top_label = gen_label_rtx ();
21148 out_label = gen_label_rtx ();
21149 iter = gen_reg_rtx (iter_mode);
21150
21151 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21152 NULL, 1, OPTAB_DIRECT);
21153 /* Those two should combine. */
21154 if (piece_size == const1_rtx)
21155 {
21156 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21157 true, out_label);
21158 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21159 }
21160 emit_move_insn (iter, const0_rtx);
21161
21162 emit_label (top_label);
21163
21164 tmp = convert_modes (Pmode, iter_mode, iter, true);
21165 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21166 destmem = change_address (destmem, mode, x_addr);
21167
21168 if (srcmem)
21169 {
21170 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21171 srcmem = change_address (srcmem, mode, y_addr);
21172
 21173 /* When unrolling for chips that reorder memory reads and writes,
 21174 we can save registers by using a single temporary.
 21175 Also, using 4 temporaries is overkill in 32-bit mode. */
21176 if (!TARGET_64BIT && 0)
21177 {
21178 for (i = 0; i < unroll; i++)
21179 {
21180 if (i)
21181 {
21182 destmem =
21183 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21184 srcmem =
21185 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21186 }
21187 emit_move_insn (destmem, srcmem);
21188 }
21189 }
21190 else
21191 {
21192 rtx tmpreg[4];
21193 gcc_assert (unroll <= 4);
21194 for (i = 0; i < unroll; i++)
21195 {
21196 tmpreg[i] = gen_reg_rtx (mode);
21197 if (i)
21198 {
21199 srcmem =
21200 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21201 }
21202 emit_move_insn (tmpreg[i], srcmem);
21203 }
21204 for (i = 0; i < unroll; i++)
21205 {
21206 if (i)
21207 {
21208 destmem =
21209 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21210 }
21211 emit_move_insn (destmem, tmpreg[i]);
21212 }
21213 }
21214 }
21215 else
21216 for (i = 0; i < unroll; i++)
21217 {
21218 if (i)
21219 destmem =
21220 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21221 emit_move_insn (destmem, value);
21222 }
21223
21224 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21225 true, OPTAB_LIB_WIDEN);
21226 if (tmp != iter)
21227 emit_move_insn (iter, tmp);
21228
21229 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21230 true, top_label);
21231 if (expected_size != -1)
21232 {
21233 expected_size /= GET_MODE_SIZE (mode) * unroll;
21234 if (expected_size == 0)
21235 predict_jump (0);
21236 else if (expected_size > REG_BR_PROB_BASE)
21237 predict_jump (REG_BR_PROB_BASE - 1);
21238 else
21239 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21240 }
21241 else
21242 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21243 iter = ix86_zero_extend_to_Pmode (iter);
21244 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21245 true, OPTAB_LIB_WIDEN);
21246 if (tmp != destptr)
21247 emit_move_insn (destptr, tmp);
21248 if (srcptr)
21249 {
21250 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21251 true, OPTAB_LIB_WIDEN);
21252 if (tmp != srcptr)
21253 emit_move_insn (srcptr, tmp);
21254 }
21255 emit_label (out_label);
21256 }
21257
 21258 /* Output a "rep; mov" instruction.
 21259 Arguments have the same meaning as for the previous function. */
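/* For example (illustrative only): with SImode chunks and a runtime count,
   the result amounts to
	shrl	$2, %ecx
	rep movsl
   where the shift comes from scale_counter below and DESTEXP/SRCEXP
   describe the final pointer values to the RTL.  */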
21260 static void
21261 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21262 rtx destptr, rtx srcptr,
21263 rtx count,
21264 enum machine_mode mode)
21265 {
21266 rtx destexp;
21267 rtx srcexp;
21268 rtx countreg;
21269 HOST_WIDE_INT rounded_count;
21270
 21271 /* If the size is known to be a multiple of 4, it is shorter to use SImode rep movs. */
21272 if (mode == QImode && CONST_INT_P (count)
21273 && !(INTVAL (count) & 3))
21274 mode = SImode;
21275
21276 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21277 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21278 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21279 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21280 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21281 if (mode != QImode)
21282 {
21283 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21284 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21285 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21286 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21287 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21288 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21289 }
21290 else
21291 {
21292 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21293 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21294 }
21295 if (CONST_INT_P (count))
21296 {
21297 rounded_count = (INTVAL (count)
21298 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21299 destmem = shallow_copy_rtx (destmem);
21300 srcmem = shallow_copy_rtx (srcmem);
21301 set_mem_size (destmem, rounded_count);
21302 set_mem_size (srcmem, rounded_count);
21303 }
21304 else
21305 {
21306 if (MEM_SIZE_KNOWN_P (destmem))
21307 clear_mem_size (destmem);
21308 if (MEM_SIZE_KNOWN_P (srcmem))
21309 clear_mem_size (srcmem);
21310 }
21311 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21312 destexp, srcexp));
21313 }
21314
 21315 /* Output a "rep; stos" instruction.
 21316 Arguments have the same meaning as for the previous function. */
21317 static void
21318 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21319 rtx count, enum machine_mode mode,
21320 rtx orig_value)
21321 {
21322 rtx destexp;
21323 rtx countreg;
21324 HOST_WIDE_INT rounded_count;
21325
21326 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21327 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21328 value = force_reg (mode, gen_lowpart (mode, value));
21329 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21330 if (mode != QImode)
21331 {
21332 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21333 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21334 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21335 }
21336 else
21337 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21338 if (orig_value == const0_rtx && CONST_INT_P (count))
21339 {
21340 rounded_count = (INTVAL (count)
21341 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21342 destmem = shallow_copy_rtx (destmem);
21343 set_mem_size (destmem, rounded_count);
21344 }
21345 else if (MEM_SIZE_KNOWN_P (destmem))
21346 clear_mem_size (destmem);
21347 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21348 }
21349
21350 static void
21351 emit_strmov (rtx destmem, rtx srcmem,
21352 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21353 {
21354 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21355 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21356 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21357 }
21358
21359 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
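/* For a known count only the low bits matter; e.g. (illustrative, assuming
   max_size is large enough) when the low bits of COUNT equal 7 the code
   below emits an SImode, an HImode and a QImode move at offsets 0, 4 and 6.  */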
21360 static void
21361 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21362 rtx destptr, rtx srcptr, rtx count, int max_size)
21363 {
21364 rtx src, dest;
21365 if (CONST_INT_P (count))
21366 {
21367 HOST_WIDE_INT countval = INTVAL (count);
21368 int offset = 0;
21369
21370 if ((countval & 0x10) && max_size > 16)
21371 {
21372 if (TARGET_64BIT)
21373 {
21374 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21375 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21376 }
21377 else
21378 gcc_unreachable ();
21379 offset += 16;
21380 }
21381 if ((countval & 0x08) && max_size > 8)
21382 {
21383 if (TARGET_64BIT)
21384 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21385 else
21386 {
21387 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21388 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21389 }
21390 offset += 8;
21391 }
21392 if ((countval & 0x04) && max_size > 4)
21393 {
21394 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21395 offset += 4;
21396 }
21397 if ((countval & 0x02) && max_size > 2)
21398 {
21399 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21400 offset += 2;
21401 }
21402 if ((countval & 0x01) && max_size > 1)
21403 {
21404 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21405 offset += 1;
21406 }
21407 return;
21408 }
21409 if (max_size > 8)
21410 {
21411 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21412 count, 1, OPTAB_DIRECT);
21413 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21414 count, QImode, 1, 4);
21415 return;
21416 }
21417
 21418 /* When single stringop instructions are available, we can cheaply increase
 21419 dest and src pointers. Otherwise we save code size by maintaining an
 21420 offset (zero is readily available from the preceding rep operation) and
 21421 using x86 addressing modes.
 21422 */
21422 if (TARGET_SINGLE_STRINGOP)
21423 {
21424 if (max_size > 4)
21425 {
21426 rtx label = ix86_expand_aligntest (count, 4, true);
21427 src = change_address (srcmem, SImode, srcptr);
21428 dest = change_address (destmem, SImode, destptr);
21429 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21430 emit_label (label);
21431 LABEL_NUSES (label) = 1;
21432 }
21433 if (max_size > 2)
21434 {
21435 rtx label = ix86_expand_aligntest (count, 2, true);
21436 src = change_address (srcmem, HImode, srcptr);
21437 dest = change_address (destmem, HImode, destptr);
21438 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21439 emit_label (label);
21440 LABEL_NUSES (label) = 1;
21441 }
21442 if (max_size > 1)
21443 {
21444 rtx label = ix86_expand_aligntest (count, 1, true);
21445 src = change_address (srcmem, QImode, srcptr);
21446 dest = change_address (destmem, QImode, destptr);
21447 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21448 emit_label (label);
21449 LABEL_NUSES (label) = 1;
21450 }
21451 }
21452 else
21453 {
21454 rtx offset = force_reg (Pmode, const0_rtx);
21455 rtx tmp;
21456
21457 if (max_size > 4)
21458 {
21459 rtx label = ix86_expand_aligntest (count, 4, true);
21460 src = change_address (srcmem, SImode, srcptr);
21461 dest = change_address (destmem, SImode, destptr);
21462 emit_move_insn (dest, src);
21463 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21464 true, OPTAB_LIB_WIDEN);
21465 if (tmp != offset)
21466 emit_move_insn (offset, tmp);
21467 emit_label (label);
21468 LABEL_NUSES (label) = 1;
21469 }
21470 if (max_size > 2)
21471 {
21472 rtx label = ix86_expand_aligntest (count, 2, true);
21473 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21474 src = change_address (srcmem, HImode, tmp);
21475 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21476 dest = change_address (destmem, HImode, tmp);
21477 emit_move_insn (dest, src);
21478 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21479 true, OPTAB_LIB_WIDEN);
21480 if (tmp != offset)
21481 emit_move_insn (offset, tmp);
21482 emit_label (label);
21483 LABEL_NUSES (label) = 1;
21484 }
21485 if (max_size > 1)
21486 {
21487 rtx label = ix86_expand_aligntest (count, 1, true);
21488 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21489 src = change_address (srcmem, QImode, tmp);
21490 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21491 dest = change_address (destmem, QImode, tmp);
21492 emit_move_insn (dest, src);
21493 emit_label (label);
21494 LABEL_NUSES (label) = 1;
21495 }
21496 }
21497 }
21498
 21499 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21500 static void
21501 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21502 rtx count, int max_size)
21503 {
21504 count =
21505 expand_simple_binop (counter_mode (count), AND, count,
21506 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21507 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21508 gen_lowpart (QImode, value), count, QImode,
21509 1, max_size / 2);
21510 }
21511
 21512 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21513 static void
21514 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21515 {
21516 rtx dest;
21517
21518 if (CONST_INT_P (count))
21519 {
21520 HOST_WIDE_INT countval = INTVAL (count);
21521 int offset = 0;
21522
21523 if ((countval & 0x10) && max_size > 16)
21524 {
21525 if (TARGET_64BIT)
21526 {
21527 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21528 emit_insn (gen_strset (destptr, dest, value));
21529 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21530 emit_insn (gen_strset (destptr, dest, value));
21531 }
21532 else
21533 gcc_unreachable ();
21534 offset += 16;
21535 }
21536 if ((countval & 0x08) && max_size > 8)
21537 {
21538 if (TARGET_64BIT)
21539 {
21540 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21541 emit_insn (gen_strset (destptr, dest, value));
21542 }
21543 else
21544 {
21545 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21546 emit_insn (gen_strset (destptr, dest, value));
21547 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21548 emit_insn (gen_strset (destptr, dest, value));
21549 }
21550 offset += 8;
21551 }
21552 if ((countval & 0x04) && max_size > 4)
21553 {
21554 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21555 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21556 offset += 4;
21557 }
21558 if ((countval & 0x02) && max_size > 2)
21559 {
21560 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21561 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21562 offset += 2;
21563 }
21564 if ((countval & 0x01) && max_size > 1)
21565 {
21566 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21567 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21568 offset += 1;
21569 }
21570 return;
21571 }
21572 if (max_size > 32)
21573 {
21574 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21575 return;
21576 }
21577 if (max_size > 16)
21578 {
21579 rtx label = ix86_expand_aligntest (count, 16, true);
21580 if (TARGET_64BIT)
21581 {
21582 dest = change_address (destmem, DImode, destptr);
21583 emit_insn (gen_strset (destptr, dest, value));
21584 emit_insn (gen_strset (destptr, dest, value));
21585 }
21586 else
21587 {
21588 dest = change_address (destmem, SImode, destptr);
21589 emit_insn (gen_strset (destptr, dest, value));
21590 emit_insn (gen_strset (destptr, dest, value));
21591 emit_insn (gen_strset (destptr, dest, value));
21592 emit_insn (gen_strset (destptr, dest, value));
21593 }
21594 emit_label (label);
21595 LABEL_NUSES (label) = 1;
21596 }
21597 if (max_size > 8)
21598 {
21599 rtx label = ix86_expand_aligntest (count, 8, true);
21600 if (TARGET_64BIT)
21601 {
21602 dest = change_address (destmem, DImode, destptr);
21603 emit_insn (gen_strset (destptr, dest, value));
21604 }
21605 else
21606 {
21607 dest = change_address (destmem, SImode, destptr);
21608 emit_insn (gen_strset (destptr, dest, value));
21609 emit_insn (gen_strset (destptr, dest, value));
21610 }
21611 emit_label (label);
21612 LABEL_NUSES (label) = 1;
21613 }
21614 if (max_size > 4)
21615 {
21616 rtx label = ix86_expand_aligntest (count, 4, true);
21617 dest = change_address (destmem, SImode, destptr);
21618 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21619 emit_label (label);
21620 LABEL_NUSES (label) = 1;
21621 }
21622 if (max_size > 2)
21623 {
21624 rtx label = ix86_expand_aligntest (count, 2, true);
21625 dest = change_address (destmem, HImode, destptr);
21626 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21627 emit_label (label);
21628 LABEL_NUSES (label) = 1;
21629 }
21630 if (max_size > 1)
21631 {
21632 rtx label = ix86_expand_aligntest (count, 1, true);
21633 dest = change_address (destmem, QImode, destptr);
21634 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21635 emit_label (label);
21636 LABEL_NUSES (label) = 1;
21637 }
21638 }
21639
 21640 /* Copy enough from SRC to DEST to align DEST, known to be aligned to ALIGN, to
 21641 DESIRED_ALIGNMENT. */
21642 static void
21643 expand_movmem_prologue (rtx destmem, rtx srcmem,
21644 rtx destptr, rtx srcptr, rtx count,
21645 int align, int desired_alignment)
21646 {
21647 if (align <= 1 && desired_alignment > 1)
21648 {
21649 rtx label = ix86_expand_aligntest (destptr, 1, false);
21650 srcmem = change_address (srcmem, QImode, srcptr);
21651 destmem = change_address (destmem, QImode, destptr);
21652 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21653 ix86_adjust_counter (count, 1);
21654 emit_label (label);
21655 LABEL_NUSES (label) = 1;
21656 }
21657 if (align <= 2 && desired_alignment > 2)
21658 {
21659 rtx label = ix86_expand_aligntest (destptr, 2, false);
21660 srcmem = change_address (srcmem, HImode, srcptr);
21661 destmem = change_address (destmem, HImode, destptr);
21662 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21663 ix86_adjust_counter (count, 2);
21664 emit_label (label);
21665 LABEL_NUSES (label) = 1;
21666 }
21667 if (align <= 4 && desired_alignment > 4)
21668 {
21669 rtx label = ix86_expand_aligntest (destptr, 4, false);
21670 srcmem = change_address (srcmem, SImode, srcptr);
21671 destmem = change_address (destmem, SImode, destptr);
21672 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21673 ix86_adjust_counter (count, 4);
21674 emit_label (label);
21675 LABEL_NUSES (label) = 1;
21676 }
21677 gcc_assert (desired_alignment <= 8);
21678 }
21679
 21680 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
 21681 ALIGN_BYTES is how many bytes need to be copied. */
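/* Worked example (illustrative): with desired_align == 8 and align_bytes == 3,
   one QImode and one HImode copy are emitted; the pointers advance by 3 bytes
   and DST is then 8-byte aligned for the caller's main copy loop.  */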
21682 static rtx
21683 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21684 int desired_align, int align_bytes)
21685 {
21686 rtx src = *srcp;
21687 rtx orig_dst = dst;
21688 rtx orig_src = src;
21689 int off = 0;
21690 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21691 if (src_align_bytes >= 0)
21692 src_align_bytes = desired_align - src_align_bytes;
21693 if (align_bytes & 1)
21694 {
21695 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21696 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21697 off = 1;
21698 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21699 }
21700 if (align_bytes & 2)
21701 {
21702 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21703 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21704 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21705 set_mem_align (dst, 2 * BITS_PER_UNIT);
21706 if (src_align_bytes >= 0
21707 && (src_align_bytes & 1) == (align_bytes & 1)
21708 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21709 set_mem_align (src, 2 * BITS_PER_UNIT);
21710 off = 2;
21711 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21712 }
21713 if (align_bytes & 4)
21714 {
21715 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21716 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21717 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21718 set_mem_align (dst, 4 * BITS_PER_UNIT);
21719 if (src_align_bytes >= 0)
21720 {
21721 unsigned int src_align = 0;
21722 if ((src_align_bytes & 3) == (align_bytes & 3))
21723 src_align = 4;
21724 else if ((src_align_bytes & 1) == (align_bytes & 1))
21725 src_align = 2;
21726 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21727 set_mem_align (src, src_align * BITS_PER_UNIT);
21728 }
21729 off = 4;
21730 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21731 }
21732 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21733 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21734 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21735 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21736 if (src_align_bytes >= 0)
21737 {
21738 unsigned int src_align = 0;
21739 if ((src_align_bytes & 7) == (align_bytes & 7))
21740 src_align = 8;
21741 else if ((src_align_bytes & 3) == (align_bytes & 3))
21742 src_align = 4;
21743 else if ((src_align_bytes & 1) == (align_bytes & 1))
21744 src_align = 2;
21745 if (src_align > (unsigned int) desired_align)
21746 src_align = desired_align;
21747 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21748 set_mem_align (src, src_align * BITS_PER_UNIT);
21749 }
21750 if (MEM_SIZE_KNOWN_P (orig_dst))
21751 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21752 if (MEM_SIZE_KNOWN_P (orig_src))
21753 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21754 *srcp = src;
21755 return dst;
21756 }
21757
 21758 /* Store enough into DEST to align DEST, known to be aligned to ALIGN, to
 21759 DESIRED_ALIGNMENT. */
21760 static void
21761 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21762 int align, int desired_alignment)
21763 {
21764 if (align <= 1 && desired_alignment > 1)
21765 {
21766 rtx label = ix86_expand_aligntest (destptr, 1, false);
21767 destmem = change_address (destmem, QImode, destptr);
21768 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21769 ix86_adjust_counter (count, 1);
21770 emit_label (label);
21771 LABEL_NUSES (label) = 1;
21772 }
21773 if (align <= 2 && desired_alignment > 2)
21774 {
21775 rtx label = ix86_expand_aligntest (destptr, 2, false);
21776 destmem = change_address (destmem, HImode, destptr);
21777 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21778 ix86_adjust_counter (count, 2);
21779 emit_label (label);
21780 LABEL_NUSES (label) = 1;
21781 }
21782 if (align <= 4 && desired_alignment > 4)
21783 {
21784 rtx label = ix86_expand_aligntest (destptr, 4, false);
21785 destmem = change_address (destmem, SImode, destptr);
21786 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21787 ix86_adjust_counter (count, 4);
21788 emit_label (label);
21789 LABEL_NUSES (label) = 1;
21790 }
21791 gcc_assert (desired_alignment <= 8);
21792 }
21793
 21794 /* Store enough into DST to align DST, known to be aligned to ALIGN, to
 21795 DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
21796 static rtx
21797 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21798 int desired_align, int align_bytes)
21799 {
21800 int off = 0;
21801 rtx orig_dst = dst;
21802 if (align_bytes & 1)
21803 {
21804 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21805 off = 1;
21806 emit_insn (gen_strset (destreg, dst,
21807 gen_lowpart (QImode, value)));
21808 }
21809 if (align_bytes & 2)
21810 {
21811 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21812 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21813 set_mem_align (dst, 2 * BITS_PER_UNIT);
21814 off = 2;
21815 emit_insn (gen_strset (destreg, dst,
21816 gen_lowpart (HImode, value)));
21817 }
21818 if (align_bytes & 4)
21819 {
21820 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21821 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21822 set_mem_align (dst, 4 * BITS_PER_UNIT);
21823 off = 4;
21824 emit_insn (gen_strset (destreg, dst,
21825 gen_lowpart (SImode, value)));
21826 }
21827 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21828 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21829 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21830 if (MEM_SIZE_KNOWN_P (orig_dst))
21831 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21832 return dst;
21833 }
21834
21835 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
21836 static enum stringop_alg
21837 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21838 int *dynamic_check)
21839 {
21840 const struct stringop_algs * algs;
21841 bool optimize_for_speed;
21842 /* Algorithms using the rep prefix want at least edi and ecx;
21843 additionally, memset wants eax and memcpy wants esi. Don't
21844 consider such algorithms if the user has appropriated those
21845 registers for their own purposes. */
21846 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21847 || (memset
21848 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21849
21850 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21851 || (alg != rep_prefix_1_byte \
21852 && alg != rep_prefix_4_byte \
21853 && alg != rep_prefix_8_byte))
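 /* As an illustration: if the user compiles with -ffixed-ecx or -ffixed-edi,
    rep_prefix_usable is false and ALG_USABLE_P rejects rep_prefix_1_byte,
    rep_prefix_4_byte and rep_prefix_8_byte, leaving only the loop variants
    and libcall as candidates.  */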
21854 const struct processor_costs *cost;
21855
21856 /* Even if the string operation call is cold, we still might spend a lot
21857 of time processing large blocks. */
21858 if (optimize_function_for_size_p (cfun)
21859 || (optimize_insn_for_size_p ()
21860 && expected_size != -1 && expected_size < 256))
21861 optimize_for_speed = false;
21862 else
21863 optimize_for_speed = true;
21864
21865 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21866
21867 *dynamic_check = -1;
21868 if (memset)
21869 algs = &cost->memset[TARGET_64BIT != 0];
21870 else
21871 algs = &cost->memcpy[TARGET_64BIT != 0];
21872 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21873 return ix86_stringop_alg;
21874 /* rep; movq or rep; movl is the smallest variant. */
21875 else if (!optimize_for_speed)
21876 {
21877 if (!count || (count & 3))
21878 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21879 else
21880 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21881 }
 21882 /* Very tiny blocks are best handled via the loop, since REP is expensive
 21883 to set up. */
21884 else if (expected_size != -1 && expected_size < 4)
21885 return loop_1_byte;
21886 else if (expected_size != -1)
21887 {
21888 unsigned int i;
21889 enum stringop_alg alg = libcall;
21890 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21891 {
21892 /* We get here if the algorithms that were not libcall-based
21893 were rep-prefix based and we are unable to use rep prefixes
21894 based on global register usage. Break out of the loop and
21895 use the heuristic below. */
21896 if (algs->size[i].max == 0)
21897 break;
21898 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
21899 {
21900 enum stringop_alg candidate = algs->size[i].alg;
21901
21902 if (candidate != libcall && ALG_USABLE_P (candidate))
21903 alg = candidate;
21904 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
21905 last non-libcall inline algorithm. */
21906 if (TARGET_INLINE_ALL_STRINGOPS)
21907 {
 21908 /* When the current size is best copied by a libcall but we are
 21909 still forced to inline, run the heuristic below that picks
 21910 code for medium-sized blocks. */
21911 if (alg != libcall)
21912 return alg;
21913 break;
21914 }
21915 else if (ALG_USABLE_P (candidate))
21916 return candidate;
21917 }
21918 }
21919 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
21920 }
 21921 /* When asked to inline the call anyway, try to pick a meaningful choice.
 21922 We look for the maximal block size that is faster to copy by hand and
 21923 take blocks of at most that size, guessing that the average size will
 21924 be roughly half of the block.
21925
21926 If this turns out to be bad, we might simply specify the preferred
21927 choice in ix86_costs. */
21928 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21929 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
21930 {
21931 int max = -1;
21932 enum stringop_alg alg;
21933 int i;
21934 bool any_alg_usable_p = true;
21935
21936 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21937 {
21938 enum stringop_alg candidate = algs->size[i].alg;
21939 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
21940
21941 if (candidate != libcall && candidate
21942 && ALG_USABLE_P (candidate))
21943 max = algs->size[i].max;
21944 }
21945 /* If there aren't any usable algorithms, then recursing on
21946 smaller sizes isn't going to find anything. Just return the
21947 simple byte-at-a-time copy loop. */
21948 if (!any_alg_usable_p)
21949 {
21950 /* Pick something reasonable. */
21951 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21952 *dynamic_check = 128;
21953 return loop_1_byte;
21954 }
21955 if (max == -1)
21956 max = 4096;
21957 alg = decide_alg (count, max / 2, memset, dynamic_check);
21958 gcc_assert (*dynamic_check == -1);
21959 gcc_assert (alg != libcall);
21960 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21961 *dynamic_check = max;
21962 return alg;
21963 }
21964 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
21965 #undef ALG_USABLE_P
21966 }
21967
21968 /* Decide on alignment. We know that the operand is already aligned to ALIGN
21969 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
21970 static int
21971 decide_alignment (int align,
21972 enum stringop_alg alg,
21973 int expected_size)
21974 {
21975 int desired_align = 0;
21976 switch (alg)
21977 {
21978 case no_stringop:
21979 gcc_unreachable ();
21980 case loop:
21981 case unrolled_loop:
21982 desired_align = GET_MODE_SIZE (Pmode);
21983 break;
21984 case rep_prefix_8_byte:
21985 desired_align = 8;
21986 break;
21987 case rep_prefix_4_byte:
 21988 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
 21989 copying a whole cacheline at once. */
21990 if (TARGET_PENTIUMPRO)
21991 desired_align = 8;
21992 else
21993 desired_align = 4;
21994 break;
21995 case rep_prefix_1_byte:
 21996 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
 21997 copying a whole cacheline at once. */
21998 if (TARGET_PENTIUMPRO)
21999 desired_align = 8;
22000 else
22001 desired_align = 1;
22002 break;
22003 case loop_1_byte:
22004 desired_align = 1;
22005 break;
22006 case libcall:
22007 return 0;
22008 }
22009
22010 if (optimize_size)
22011 desired_align = 1;
22012 if (desired_align < align)
22013 desired_align = align;
22014 if (expected_size != -1 && expected_size < 4)
22015 desired_align = align;
22016 return desired_align;
22017 }
22018
22019 /* Return the smallest power of 2 greater than VAL. */
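 /* E.g. smallest_pow2_greater_than (7) == 8 and smallest_pow2_greater_than (8)
    == 16; the result is always strictly greater than VAL.  */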
22020 static int
22021 smallest_pow2_greater_than (int val)
22022 {
22023 int ret = 1;
22024 while (ret <= val)
22025 ret <<= 1;
22026 return ret;
22027 }
22028
22029 /* Expand string move (memcpy) operation. Use i386 string operations
22030 when profitable. expand_setmem contains similar code. The code
22031 depends upon architecture, block size and alignment, but always has
22032 the same overall structure:
22033
22034 1) Prologue guard: Conditional that jumps up to epilogues for small
22035 blocks that can be handled by epilogue alone. This is faster
 22036 but also needed for correctness, since the prologue assumes the block
22037 is larger than the desired alignment.
22038
22039 Optional dynamic check for size and libcall for large
22040 blocks is emitted here too, with -minline-stringops-dynamically.
22041
22042 2) Prologue: copy first few bytes in order to get destination
22043 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22044 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22045 copied. We emit either a jump tree on power of two sized
22046 blocks, or a byte loop.
22047
22048 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22049 with specified algorithm.
22050
22051 4) Epilogue: code copying tail of the block that is too small to be
22052 handled by main body (or up to size guarded by prologue guard). */
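 /* As an example, for a copy with an unknown count, ALIGN == 1 and the
    rep_prefix_4_byte algorithm (SIZE_NEEDED == 4, DESIRED_ALIGN == 4 on most
    tunings), the prologue guard branches to the epilogue for blocks shorter
    than 4 bytes, the alignment prologue copies up to 3 bytes to align the
    destination, the main body copies aligned words with rep movsl, and the
    epilogue handles the remaining COUNT & 3 bytes.  */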
22053
22054 bool
22055 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22056 rtx expected_align_exp, rtx expected_size_exp)
22057 {
22058 rtx destreg;
22059 rtx srcreg;
22060 rtx label = NULL;
22061 rtx tmp;
22062 rtx jump_around_label = NULL;
22063 HOST_WIDE_INT align = 1;
22064 unsigned HOST_WIDE_INT count = 0;
22065 HOST_WIDE_INT expected_size = -1;
22066 int size_needed = 0, epilogue_size_needed;
22067 int desired_align = 0, align_bytes = 0;
22068 enum stringop_alg alg;
22069 int dynamic_check;
22070 bool need_zero_guard = false;
22071
22072 if (CONST_INT_P (align_exp))
22073 align = INTVAL (align_exp);
 22074 /* i386 can do misaligned access at a reasonably increased cost. */
22075 if (CONST_INT_P (expected_align_exp)
22076 && INTVAL (expected_align_exp) > align)
22077 align = INTVAL (expected_align_exp);
22078 /* ALIGN is the minimum of destination and source alignment, but we care here
22079 just about destination alignment. */
22080 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22081 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22082
22083 if (CONST_INT_P (count_exp))
22084 count = expected_size = INTVAL (count_exp);
22085 if (CONST_INT_P (expected_size_exp) && count == 0)
22086 expected_size = INTVAL (expected_size_exp);
22087
22088 /* Make sure we don't need to care about overflow later on. */
22089 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22090 return false;
22091
22092 /* Step 0: Decide on preferred algorithm, desired alignment and
22093 size of chunks to be copied by main loop. */
22094
22095 alg = decide_alg (count, expected_size, false, &dynamic_check);
22096 desired_align = decide_alignment (align, alg, expected_size);
22097
22098 if (!TARGET_ALIGN_STRINGOPS)
22099 align = desired_align;
22100
22101 if (alg == libcall)
22102 return false;
22103 gcc_assert (alg != no_stringop);
22104 if (!count)
22105 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22106 destreg = copy_addr_to_reg (XEXP (dst, 0));
22107 srcreg = copy_addr_to_reg (XEXP (src, 0));
22108 switch (alg)
22109 {
22110 case libcall:
22111 case no_stringop:
22112 gcc_unreachable ();
22113 case loop:
22114 need_zero_guard = true;
22115 size_needed = GET_MODE_SIZE (word_mode);
22116 break;
22117 case unrolled_loop:
22118 need_zero_guard = true;
22119 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22120 break;
22121 case rep_prefix_8_byte:
22122 size_needed = 8;
22123 break;
22124 case rep_prefix_4_byte:
22125 size_needed = 4;
22126 break;
22127 case rep_prefix_1_byte:
22128 size_needed = 1;
22129 break;
22130 case loop_1_byte:
22131 need_zero_guard = true;
22132 size_needed = 1;
22133 break;
22134 }
22135
22136 epilogue_size_needed = size_needed;
22137
22138 /* Step 1: Prologue guard. */
22139
22140 /* Alignment code needs count to be in register. */
22141 if (CONST_INT_P (count_exp) && desired_align > align)
22142 {
22143 if (INTVAL (count_exp) > desired_align
22144 && INTVAL (count_exp) > size_needed)
22145 {
22146 align_bytes
22147 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22148 if (align_bytes <= 0)
22149 align_bytes = 0;
22150 else
22151 align_bytes = desired_align - align_bytes;
22152 }
22153 if (align_bytes == 0)
22154 count_exp = force_reg (counter_mode (count_exp), count_exp);
22155 }
22156 gcc_assert (desired_align >= 1 && align >= 1);
22157
22158 /* Ensure that alignment prologue won't copy past end of block. */
22159 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22160 {
22161 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
 22162 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
 22163 Make sure it is a power of 2. */
22164 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22165
22166 if (count)
22167 {
22168 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22169 {
22170 /* If main algorithm works on QImode, no epilogue is needed.
22171 For small sizes just don't align anything. */
22172 if (size_needed == 1)
22173 desired_align = align;
22174 else
22175 goto epilogue;
22176 }
22177 }
22178 else
22179 {
22180 label = gen_label_rtx ();
22181 emit_cmp_and_jump_insns (count_exp,
22182 GEN_INT (epilogue_size_needed),
22183 LTU, 0, counter_mode (count_exp), 1, label);
22184 if (expected_size == -1 || expected_size < epilogue_size_needed)
22185 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22186 else
22187 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22188 }
22189 }
22190
 22191 /* Emit code to decide at runtime whether a library call or inline code
 22192 should be used. */
22193 if (dynamic_check != -1)
22194 {
22195 if (CONST_INT_P (count_exp))
22196 {
22197 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22198 {
22199 emit_block_move_via_libcall (dst, src, count_exp, false);
22200 count_exp = const0_rtx;
22201 goto epilogue;
22202 }
22203 }
22204 else
22205 {
22206 rtx hot_label = gen_label_rtx ();
22207 jump_around_label = gen_label_rtx ();
22208 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22209 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22210 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22211 emit_block_move_via_libcall (dst, src, count_exp, false);
22212 emit_jump (jump_around_label);
22213 emit_label (hot_label);
22214 }
22215 }
22216
22217 /* Step 2: Alignment prologue. */
22218
22219 if (desired_align > align)
22220 {
22221 if (align_bytes == 0)
22222 {
 22223 /* Except for the first move in the epilogue, we no longer know
 22224 the constant offset in the aliasing info. It doesn't seem worth
 22225 the pain to maintain it for the first move, so throw away
 22226 the info early. */
22227 src = change_address (src, BLKmode, srcreg);
22228 dst = change_address (dst, BLKmode, destreg);
22229 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22230 desired_align);
22231 }
22232 else
22233 {
22234 /* If we know how many bytes need to be stored before dst is
22235 sufficiently aligned, maintain aliasing info accurately. */
22236 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22237 desired_align, align_bytes);
22238 count_exp = plus_constant (count_exp, -align_bytes);
22239 count -= align_bytes;
22240 }
22241 if (need_zero_guard
22242 && (count < (unsigned HOST_WIDE_INT) size_needed
22243 || (align_bytes == 0
22244 && count < ((unsigned HOST_WIDE_INT) size_needed
22245 + desired_align - align))))
22246 {
22247 /* It is possible that we copied enough so the main loop will not
22248 execute. */
22249 gcc_assert (size_needed > 1);
22250 if (label == NULL_RTX)
22251 label = gen_label_rtx ();
22252 emit_cmp_and_jump_insns (count_exp,
22253 GEN_INT (size_needed),
22254 LTU, 0, counter_mode (count_exp), 1, label);
22255 if (expected_size == -1
22256 || expected_size < (desired_align - align) / 2 + size_needed)
22257 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22258 else
22259 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22260 }
22261 }
22262 if (label && size_needed == 1)
22263 {
22264 emit_label (label);
22265 LABEL_NUSES (label) = 1;
22266 label = NULL;
22267 epilogue_size_needed = 1;
22268 }
22269 else if (label == NULL_RTX)
22270 epilogue_size_needed = size_needed;
22271
22272 /* Step 3: Main loop. */
22273
22274 switch (alg)
22275 {
22276 case libcall:
22277 case no_stringop:
22278 gcc_unreachable ();
22279 case loop_1_byte:
22280 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22281 count_exp, QImode, 1, expected_size);
22282 break;
22283 case loop:
22284 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22285 count_exp, word_mode, 1, expected_size);
22286 break;
22287 case unrolled_loop:
22288 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22289 registers for 4 temporaries anyway. */
22290 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22291 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22292 expected_size);
22293 break;
22294 case rep_prefix_8_byte:
22295 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22296 DImode);
22297 break;
22298 case rep_prefix_4_byte:
22299 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22300 SImode);
22301 break;
22302 case rep_prefix_1_byte:
22303 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22304 QImode);
22305 break;
22306 }
22307 /* Adjust properly the offset of src and dest memory for aliasing. */
22308 if (CONST_INT_P (count_exp))
22309 {
22310 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22311 (count / size_needed) * size_needed);
22312 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22313 (count / size_needed) * size_needed);
22314 }
22315 else
22316 {
22317 src = change_address (src, BLKmode, srcreg);
22318 dst = change_address (dst, BLKmode, destreg);
22319 }
22320
22321 /* Step 4: Epilogue to copy the remaining bytes. */
22322 epilogue:
22323 if (label)
22324 {
22325 /* When the main loop is done, COUNT_EXP might hold original count,
 22326 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
 22327 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
 22328 bytes. Compensate if needed. */
22329
22330 if (size_needed < epilogue_size_needed)
22331 {
22332 tmp =
22333 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22334 GEN_INT (size_needed - 1), count_exp, 1,
22335 OPTAB_DIRECT);
22336 if (tmp != count_exp)
22337 emit_move_insn (count_exp, tmp);
22338 }
22339 emit_label (label);
22340 LABEL_NUSES (label) = 1;
22341 }
22342
22343 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22344 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22345 epilogue_size_needed);
22346 if (jump_around_label)
22347 emit_label (jump_around_label);
22348 return true;
22349 }
22350
 22351 /* Helper function for memset. For the QImode value 0xXY produce
 22352 0xXYXYXYXY of the width specified by MODE. This is essentially
 22353 a * 0x01010101, but we can do slightly better than
 22354 synth_mult by unwinding the sequence by hand on CPUs with
 22355 slow multiply. */
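 /* For instance, promoting the QImode constant 0xAB to SImode yields
    0xABABABAB: v |= v << 8 gives 0xABAB and v |= v << 16 gives 0xABABABAB;
    for DImode one more doubling produces 0xABABABABABABABAB.  */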
22356 static rtx
22357 promote_duplicated_reg (enum machine_mode mode, rtx val)
22358 {
22359 enum machine_mode valmode = GET_MODE (val);
22360 rtx tmp;
22361 int nops = mode == DImode ? 3 : 2;
22362
22363 gcc_assert (mode == SImode || mode == DImode);
22364 if (val == const0_rtx)
22365 return copy_to_mode_reg (mode, const0_rtx);
22366 if (CONST_INT_P (val))
22367 {
22368 HOST_WIDE_INT v = INTVAL (val) & 255;
22369
22370 v |= v << 8;
22371 v |= v << 16;
22372 if (mode == DImode)
22373 v |= (v << 16) << 16;
22374 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22375 }
22376
22377 if (valmode == VOIDmode)
22378 valmode = QImode;
22379 if (valmode != QImode)
22380 val = gen_lowpart (QImode, val);
22381 if (mode == QImode)
22382 return val;
22383 if (!TARGET_PARTIAL_REG_STALL)
22384 nops--;
22385 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22386 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22387 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22388 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22389 {
22390 rtx reg = convert_modes (mode, QImode, val, true);
22391 tmp = promote_duplicated_reg (mode, const1_rtx);
22392 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22393 OPTAB_DIRECT);
22394 }
22395 else
22396 {
22397 rtx reg = convert_modes (mode, QImode, val, true);
22398
22399 if (!TARGET_PARTIAL_REG_STALL)
22400 if (mode == SImode)
22401 emit_insn (gen_movsi_insv_1 (reg, reg));
22402 else
22403 emit_insn (gen_movdi_insv_1 (reg, reg));
22404 else
22405 {
22406 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22407 NULL, 1, OPTAB_DIRECT);
22408 reg =
22409 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22410 }
22411 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22412 NULL, 1, OPTAB_DIRECT);
22413 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22414 if (mode == SImode)
22415 return reg;
22416 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22417 NULL, 1, OPTAB_DIRECT);
22418 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22419 return reg;
22420 }
22421 }
22422
 22423 /* Duplicate VAL, using promote_duplicated_reg, into the maximal size that
 22424 will be needed by the main loop copying SIZE_NEEDED chunks and by the
 22425 prologue raising alignment from ALIGN to DESIRED_ALIGN. */
22426 static rtx
22427 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22428 {
22429 rtx promoted_val;
22430
22431 if (TARGET_64BIT
22432 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22433 promoted_val = promote_duplicated_reg (DImode, val);
22434 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22435 promoted_val = promote_duplicated_reg (SImode, val);
22436 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22437 promoted_val = promote_duplicated_reg (HImode, val);
22438 else
22439 promoted_val = val;
22440
22441 return promoted_val;
22442 }
22443
 22444 /* Expand string set operation (memset). Use i386 string operations when
 22445 profitable. See the expand_movmem comment for an explanation of the
 22446 individual steps performed. */
22447 bool
22448 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22449 rtx expected_align_exp, rtx expected_size_exp)
22450 {
22451 rtx destreg;
22452 rtx label = NULL;
22453 rtx tmp;
22454 rtx jump_around_label = NULL;
22455 HOST_WIDE_INT align = 1;
22456 unsigned HOST_WIDE_INT count = 0;
22457 HOST_WIDE_INT expected_size = -1;
22458 int size_needed = 0, epilogue_size_needed;
22459 int desired_align = 0, align_bytes = 0;
22460 enum stringop_alg alg;
22461 rtx promoted_val = NULL;
22462 bool force_loopy_epilogue = false;
22463 int dynamic_check;
22464 bool need_zero_guard = false;
22465
22466 if (CONST_INT_P (align_exp))
22467 align = INTVAL (align_exp);
 22468 /* i386 can do misaligned access at a reasonably increased cost. */
22469 if (CONST_INT_P (expected_align_exp)
22470 && INTVAL (expected_align_exp) > align)
22471 align = INTVAL (expected_align_exp);
22472 if (CONST_INT_P (count_exp))
22473 count = expected_size = INTVAL (count_exp);
22474 if (CONST_INT_P (expected_size_exp) && count == 0)
22475 expected_size = INTVAL (expected_size_exp);
22476
22477 /* Make sure we don't need to care about overflow later on. */
22478 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22479 return false;
22480
22481 /* Step 0: Decide on preferred algorithm, desired alignment and
22482 size of chunks to be copied by main loop. */
22483
22484 alg = decide_alg (count, expected_size, true, &dynamic_check);
22485 desired_align = decide_alignment (align, alg, expected_size);
22486
22487 if (!TARGET_ALIGN_STRINGOPS)
22488 align = desired_align;
22489
22490 if (alg == libcall)
22491 return false;
22492 gcc_assert (alg != no_stringop);
22493 if (!count)
22494 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22495 destreg = copy_addr_to_reg (XEXP (dst, 0));
22496 switch (alg)
22497 {
22498 case libcall:
22499 case no_stringop:
22500 gcc_unreachable ();
22501 case loop:
22502 need_zero_guard = true;
22503 size_needed = GET_MODE_SIZE (word_mode);
22504 break;
22505 case unrolled_loop:
22506 need_zero_guard = true;
22507 size_needed = GET_MODE_SIZE (word_mode) * 4;
22508 break;
22509 case rep_prefix_8_byte:
22510 size_needed = 8;
22511 break;
22512 case rep_prefix_4_byte:
22513 size_needed = 4;
22514 break;
22515 case rep_prefix_1_byte:
22516 size_needed = 1;
22517 break;
22518 case loop_1_byte:
22519 need_zero_guard = true;
22520 size_needed = 1;
22521 break;
22522 }
22523 epilogue_size_needed = size_needed;
22524
22525 /* Step 1: Prologue guard. */
22526
22527 /* Alignment code needs count to be in register. */
22528 if (CONST_INT_P (count_exp) && desired_align > align)
22529 {
22530 if (INTVAL (count_exp) > desired_align
22531 && INTVAL (count_exp) > size_needed)
22532 {
22533 align_bytes
22534 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22535 if (align_bytes <= 0)
22536 align_bytes = 0;
22537 else
22538 align_bytes = desired_align - align_bytes;
22539 }
22540 if (align_bytes == 0)
22541 {
22542 enum machine_mode mode = SImode;
22543 if (TARGET_64BIT && (count & ~0xffffffff))
22544 mode = DImode;
22545 count_exp = force_reg (mode, count_exp);
22546 }
22547 }
 22548 /* Do the cheap promotion to allow better CSE across the
 22549 main loop and epilogue (i.e. one load of the big constant in
 22550 front of all the code). */
22551 if (CONST_INT_P (val_exp))
22552 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22553 desired_align, align);
22554 /* Ensure that alignment prologue won't copy past end of block. */
22555 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22556 {
22557 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22558 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22559 Make sure it is power of 2. */
22560 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22561
 22562 /* To improve performance of small blocks, we jump around the VAL
 22563 promoting code. This means that if the promoted VAL is not constant,
 22564 we might not use it in the epilogue and have to use the byte
 22565 loop variant. */
22566 if (epilogue_size_needed > 2 && !promoted_val)
22567 force_loopy_epilogue = true;
22568 if (count)
22569 {
22570 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22571 {
22572 /* If main algorithm works on QImode, no epilogue is needed.
22573 For small sizes just don't align anything. */
22574 if (size_needed == 1)
22575 desired_align = align;
22576 else
22577 goto epilogue;
22578 }
22579 }
22580 else
22581 {
22582 label = gen_label_rtx ();
22583 emit_cmp_and_jump_insns (count_exp,
22584 GEN_INT (epilogue_size_needed),
22585 LTU, 0, counter_mode (count_exp), 1, label);
22586 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22587 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22588 else
22589 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22590 }
22591 }
22592 if (dynamic_check != -1)
22593 {
22594 rtx hot_label = gen_label_rtx ();
22595 jump_around_label = gen_label_rtx ();
22596 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22597 LEU, 0, counter_mode (count_exp), 1, hot_label);
22598 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22599 set_storage_via_libcall (dst, count_exp, val_exp, false);
22600 emit_jump (jump_around_label);
22601 emit_label (hot_label);
22602 }
22603
22604 /* Step 2: Alignment prologue. */
22605
22606 /* Do the expensive promotion once we branched off the small blocks. */
22607 if (!promoted_val)
22608 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22609 desired_align, align);
22610 gcc_assert (desired_align >= 1 && align >= 1);
22611
22612 if (desired_align > align)
22613 {
22614 if (align_bytes == 0)
22615 {
 22616 /* Except for the first move in the epilogue, we no longer know
 22617 the constant offset in the aliasing info. It doesn't seem worth
 22618 the pain to maintain it for the first move, so throw away
 22619 the info early. */
22620 dst = change_address (dst, BLKmode, destreg);
22621 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22622 desired_align);
22623 }
22624 else
22625 {
22626 /* If we know how many bytes need to be stored before dst is
22627 sufficiently aligned, maintain aliasing info accurately. */
22628 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22629 desired_align, align_bytes);
22630 count_exp = plus_constant (count_exp, -align_bytes);
22631 count -= align_bytes;
22632 }
22633 if (need_zero_guard
22634 && (count < (unsigned HOST_WIDE_INT) size_needed
22635 || (align_bytes == 0
22636 && count < ((unsigned HOST_WIDE_INT) size_needed
22637 + desired_align - align))))
22638 {
22639 /* It is possible that we copied enough so the main loop will not
22640 execute. */
22641 gcc_assert (size_needed > 1);
22642 if (label == NULL_RTX)
22643 label = gen_label_rtx ();
22644 emit_cmp_and_jump_insns (count_exp,
22645 GEN_INT (size_needed),
22646 LTU, 0, counter_mode (count_exp), 1, label);
22647 if (expected_size == -1
22648 || expected_size < (desired_align - align) / 2 + size_needed)
22649 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22650 else
22651 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22652 }
22653 }
22654 if (label && size_needed == 1)
22655 {
22656 emit_label (label);
22657 LABEL_NUSES (label) = 1;
22658 label = NULL;
22659 promoted_val = val_exp;
22660 epilogue_size_needed = 1;
22661 }
22662 else if (label == NULL_RTX)
22663 epilogue_size_needed = size_needed;
22664
22665 /* Step 3: Main loop. */
22666
22667 switch (alg)
22668 {
22669 case libcall:
22670 case no_stringop:
22671 gcc_unreachable ();
22672 case loop_1_byte:
22673 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22674 count_exp, QImode, 1, expected_size);
22675 break;
22676 case loop:
22677 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22678 count_exp, word_mode, 1, expected_size);
22679 break;
22680 case unrolled_loop:
22681 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22682 count_exp, word_mode, 4, expected_size);
22683 break;
22684 case rep_prefix_8_byte:
22685 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22686 DImode, val_exp);
22687 break;
22688 case rep_prefix_4_byte:
22689 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22690 SImode, val_exp);
22691 break;
22692 case rep_prefix_1_byte:
22693 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22694 QImode, val_exp);
22695 break;
22696 }
22697 /* Adjust properly the offset of src and dest memory for aliasing. */
22698 if (CONST_INT_P (count_exp))
22699 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22700 (count / size_needed) * size_needed);
22701 else
22702 dst = change_address (dst, BLKmode, destreg);
22703
22704 /* Step 4: Epilogue to copy the remaining bytes. */
22705
22706 if (label)
22707 {
22708 /* When the main loop is done, COUNT_EXP might hold original count,
 22709 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
 22710 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
 22711 bytes. Compensate if needed. */
22712
22713 if (size_needed < epilogue_size_needed)
22714 {
22715 tmp =
22716 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22717 GEN_INT (size_needed - 1), count_exp, 1,
22718 OPTAB_DIRECT);
22719 if (tmp != count_exp)
22720 emit_move_insn (count_exp, tmp);
22721 }
22722 emit_label (label);
22723 LABEL_NUSES (label) = 1;
22724 }
22725 epilogue:
22726 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22727 {
22728 if (force_loopy_epilogue)
22729 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22730 epilogue_size_needed);
22731 else
22732 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22733 epilogue_size_needed);
22734 }
22735 if (jump_around_label)
22736 emit_label (jump_around_label);
22737 return true;
22738 }
22739
22740 /* Expand the appropriate insns for doing strlen if not just doing
22741 repnz; scasb
22742
22743 out = result, initialized with the start address
22744 align_rtx = alignment of the address.
 22745 scratch = scratch register, initialized with the start address when
 22746 not aligned, otherwise undefined
22747
22748 This is just the body. It needs the initializations mentioned above and
22749 some address computing at the end. These things are done in i386.md. */
22750
22751 static void
22752 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22753 {
22754 int align;
22755 rtx tmp;
22756 rtx align_2_label = NULL_RTX;
22757 rtx align_3_label = NULL_RTX;
22758 rtx align_4_label = gen_label_rtx ();
22759 rtx end_0_label = gen_label_rtx ();
22760 rtx mem;
22761 rtx tmpreg = gen_reg_rtx (SImode);
22762 rtx scratch = gen_reg_rtx (SImode);
22763 rtx cmp;
22764
22765 align = 0;
22766 if (CONST_INT_P (align_rtx))
22767 align = INTVAL (align_rtx);
22768
22769 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22770
22771 /* Is there a known alignment and is it less than 4? */
22772 if (align < 4)
22773 {
22774 rtx scratch1 = gen_reg_rtx (Pmode);
22775 emit_move_insn (scratch1, out);
22776 /* Is there a known alignment and is it not 2? */
22777 if (align != 2)
22778 {
22779 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22780 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22781
22782 /* Leave just the 3 lower bits. */
22783 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22784 NULL_RTX, 0, OPTAB_WIDEN);
22785
22786 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22787 Pmode, 1, align_4_label);
22788 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22789 Pmode, 1, align_2_label);
22790 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22791 Pmode, 1, align_3_label);
22792 }
22793 else
22794 {
 22795 /* Since the alignment is 2, we have to check 2 or 0 bytes;
 22796 check whether it is aligned to 4 bytes. */
22797
22798 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22799 NULL_RTX, 0, OPTAB_WIDEN);
22800
22801 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22802 Pmode, 1, align_4_label);
22803 }
22804
22805 mem = change_address (src, QImode, out);
22806
22807 /* Now compare the bytes. */
22808
 22809 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
22810 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22811 QImode, 1, end_0_label);
22812
22813 /* Increment the address. */
22814 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22815
22816 /* Not needed with an alignment of 2 */
22817 if (align != 2)
22818 {
22819 emit_label (align_2_label);
22820
22821 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22822 end_0_label);
22823
22824 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22825
22826 emit_label (align_3_label);
22827 }
22828
22829 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22830 end_0_label);
22831
22832 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22833 }
22834
 22835 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
 22836 align this loop; it only makes programs larger and does not help to
 22837 speed them up. */
22838 emit_label (align_4_label);
22839
22840 mem = change_address (src, SImode, out);
22841 emit_move_insn (scratch, mem);
22842 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22843
22844 /* This formula yields a nonzero result iff one of the bytes is zero.
22845 This saves three branches inside loop and many cycles. */
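 /* For example, with SCRATCH == 0x40004142, whose second most significant
    byte is zero: 0x40004142 - 0x01010101 == 0x3eff4041, ~0x40004142 ==
    0xbfffbebd, and 0x3eff4041 & 0xbfffbebd & 0x80808080 == 0x00800000,
    i.e. exactly the high bit of the zero byte survives.  */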
22846
22847 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22848 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22849 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22850 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22851 gen_int_mode (0x80808080, SImode)));
22852 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22853 align_4_label);
22854
22855 if (TARGET_CMOVE)
22856 {
22857 rtx reg = gen_reg_rtx (SImode);
22858 rtx reg2 = gen_reg_rtx (Pmode);
22859 emit_move_insn (reg, tmpreg);
22860 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22861
22862 /* If zero is not in the first two bytes, move two bytes forward. */
22863 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22864 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22865 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22866 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22867 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22868 reg,
22869 tmpreg)));
22870 /* Emit lea manually to avoid clobbering of flags. */
22871 emit_insn (gen_rtx_SET (SImode, reg2,
22872 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22873
22874 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22875 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22876 emit_insn (gen_rtx_SET (VOIDmode, out,
22877 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22878 reg2,
22879 out)));
22880 }
22881 else
22882 {
22883 rtx end_2_label = gen_label_rtx ();
22884 /* Is zero in the first two bytes? */
22885
22886 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22887 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22888 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
22889 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22890 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
22891 pc_rtx);
22892 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
22893 JUMP_LABEL (tmp) = end_2_label;
22894
22895 /* Not in the first two. Move two bytes forward. */
22896 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
22897 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
22898
22899 emit_label (end_2_label);
22900
22901 }
22902
22903 /* Avoid branch in fixing the byte. */
22904 tmpreg = gen_lowpart (QImode, tmpreg);
22905 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
22906 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
22907 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
22908 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
22909
22910 emit_label (end_0_label);
22911 }
22912
22913 /* Expand strlen. */
22914
22915 bool
22916 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
22917 {
22918 rtx addr, scratch1, scratch2, scratch3, scratch4;
22919
 22920 /* The generic case of the strlen expander is long. Avoid expanding it
 22921 unless TARGET_INLINE_ALL_STRINGOPS. */
22922
22923 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22924 && !TARGET_INLINE_ALL_STRINGOPS
22925 && !optimize_insn_for_size_p ()
22926 && (!CONST_INT_P (align) || INTVAL (align) < 4))
22927 return false;
22928
22929 addr = force_reg (Pmode, XEXP (src, 0));
22930 scratch1 = gen_reg_rtx (Pmode);
22931
22932 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22933 && !optimize_insn_for_size_p ())
22934 {
22935 /* Well it seems that some optimizer does not combine a call like
22936 foo(strlen(bar), strlen(bar));
 22937 when the move and the subtraction are done here. It does calculate
22938 the length just once when these instructions are done inside of
22939 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
22940 often used and I use one fewer register for the lifetime of
22941 output_strlen_unroll() this is better. */
22942
22943 emit_move_insn (out, addr);
22944
22945 ix86_expand_strlensi_unroll_1 (out, src, align);
22946
22947 /* strlensi_unroll_1 returns the address of the zero at the end of
22948 the string, like memchr(), so compute the length by subtracting
22949 the start address. */
22950 emit_insn (ix86_gen_sub3 (out, out, addr));
22951 }
22952 else
22953 {
22954 rtx unspec;
22955
22956 /* Can't use this if the user has appropriated eax, ecx, or edi. */
22957 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
22958 return false;
22959
22960 scratch2 = gen_reg_rtx (Pmode);
22961 scratch3 = gen_reg_rtx (Pmode);
22962 scratch4 = force_reg (Pmode, constm1_rtx);
22963
22964 emit_move_insn (scratch3, addr);
22965 eoschar = force_reg (QImode, eoschar);
22966
22967 src = replace_equiv_address_nv (src, scratch3);
22968
22969 /* If .md starts supporting :P, this can be done in .md. */
22970 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
22971 scratch4), UNSPEC_SCAS);
22972 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
22973 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
22974 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
22975 }
22976 return true;
22977 }
22978
 22979 /* For a given symbol (function), construct code to compute the address of
 22980 its PLT entry in the large x86-64 PIC model. */
22981 rtx
22982 construct_plt_address (rtx symbol)
22983 {
22984 rtx tmp, unspec;
22985
22986 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
22987 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
22988 gcc_assert (Pmode == DImode);
22989
22990 tmp = gen_reg_rtx (Pmode);
22991 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
22992
22993 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
22994 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
22995 return tmp;
22996 }
22997
22998 rtx
22999 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23000 rtx callarg2,
23001 rtx pop, bool sibcall)
23002 {
23003 /* We need to represent that SI and DI registers are clobbered
23004 by SYSV calls. */
23005 static int clobbered_registers[] = {
23006 XMM6_REG, XMM7_REG, XMM8_REG,
23007 XMM9_REG, XMM10_REG, XMM11_REG,
23008 XMM12_REG, XMM13_REG, XMM14_REG,
23009 XMM15_REG, SI_REG, DI_REG
23010 };
23011 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23012 rtx use = NULL, call;
23013 unsigned int vec_len;
23014
23015 if (pop == const0_rtx)
23016 pop = NULL;
23017 gcc_assert (!TARGET_64BIT || !pop);
23018
23019 if (TARGET_MACHO && !TARGET_64BIT)
23020 {
23021 #if TARGET_MACHO
23022 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23023 fnaddr = machopic_indirect_call_target (fnaddr);
23024 #endif
23025 }
23026 else
23027 {
23028 /* Static functions and indirect calls don't need the pic register. */
23029 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23030 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23031 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23032 use_reg (&use, pic_offset_table_rtx);
23033 }
23034
23035 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23036 {
23037 rtx al = gen_rtx_REG (QImode, AX_REG);
23038 emit_move_insn (al, callarg2);
23039 use_reg (&use, al);
23040 }
23041
23042 if (ix86_cmodel == CM_LARGE_PIC
23043 && MEM_P (fnaddr)
23044 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23045 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23046 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23047 else if (sibcall
23048 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23049 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23050 {
23051 fnaddr = XEXP (fnaddr, 0);
23052 if (GET_MODE (fnaddr) != word_mode)
23053 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
23054 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23055 }
23056
23057 vec_len = 0;
23058 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23059 if (retval)
23060 call = gen_rtx_SET (VOIDmode, retval, call);
23061 vec[vec_len++] = call;
23062
23063 if (pop)
23064 {
23065 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23066 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23067 vec[vec_len++] = pop;
23068 }
23069
23070 if (TARGET_64BIT_MS_ABI
23071 && (!callarg2 || INTVAL (callarg2) != -2))
23072 {
23073 unsigned i;
23074
23075 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23076 UNSPEC_MS_TO_SYSV_CALL);
23077
23078 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23079 vec[vec_len++]
23080 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23081 ? TImode : DImode,
23082 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23083 ? TImode : DImode,
23084 clobbered_registers[i]));
23085 }
23086
23087 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
23088 if (TARGET_VZEROUPPER)
23089 {
23090 int avx256;
23091 if (cfun->machine->callee_pass_avx256_p)
23092 {
23093 if (cfun->machine->callee_return_avx256_p)
23094 avx256 = callee_return_pass_avx256;
23095 else
23096 avx256 = callee_pass_avx256;
23097 }
23098 else if (cfun->machine->callee_return_avx256_p)
23099 avx256 = callee_return_avx256;
23100 else
23101 avx256 = call_no_avx256;
23102
23103 if (reload_completed)
23104 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23105 else
23106 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23107 gen_rtvec (1, GEN_INT (avx256)),
23108 UNSPEC_CALL_NEEDS_VZEROUPPER);
23109 }
23110
23111 if (vec_len > 1)
23112 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23113 call = emit_call_insn (call);
23114 if (use)
23115 CALL_INSN_FUNCTION_USAGE (call) = use;
23116
23117 return call;
23118 }
23119
23120 void
23121 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23122 {
23123 rtx pat = PATTERN (insn);
23124 rtvec vec = XVEC (pat, 0);
23125 int len = GET_NUM_ELEM (vec) - 1;
23126
23127 /* Strip off the last entry of the parallel. */
23128 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23129 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23130 if (len == 1)
23131 pat = RTVEC_ELT (vec, 0);
23132 else
23133 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23134
23135 emit_insn (gen_avx_vzeroupper (vzeroupper));
23136 emit_call_insn (pat);
23137 }
23138
23139 /* Output the assembly for a call instruction. */
23140
23141 const char *
23142 ix86_output_call_insn (rtx insn, rtx call_op)
23143 {
23144 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23145 bool seh_nop_p = false;
23146 const char *xasm;
23147
23148 if (SIBLING_CALL_P (insn))
23149 {
23150 if (direct_p)
23151 xasm = "jmp\t%P0";
23152 /* SEH epilogue detection requires the indirect branch case
23153 to include REX.W. */
23154 else if (TARGET_SEH)
23155 xasm = "rex.W jmp %A0";
23156 else
23157 xasm = "jmp\t%A0";
23158
23159 output_asm_insn (xasm, &call_op);
23160 return "";
23161 }
23162
23163 /* SEH unwinding can require an extra nop to be emitted in several
23164 circumstances. Determine if we have one of those. */
23165 if (TARGET_SEH)
23166 {
23167 rtx i;
23168
23169 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23170 {
23171 /* If we get to another real insn, we don't need the nop. */
23172 if (INSN_P (i))
23173 break;
23174
23175 /* If we get to the epilogue note, prevent a catch region from
23176 being adjacent to the standard epilogue sequence. If non-
23177 call-exceptions, we'll have done this during epilogue emission. */
23178 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23179 && !flag_non_call_exceptions
23180 && !can_throw_internal (insn))
23181 {
23182 seh_nop_p = true;
23183 break;
23184 }
23185 }
23186
23187 /* If we didn't find a real insn following the call, prevent the
23188 unwinder from looking into the next function. */
23189 if (i == NULL)
23190 seh_nop_p = true;
23191 }
23192
23193 if (direct_p)
23194 xasm = "call\t%P0";
23195 else
23196 xasm = "call\t%A0";
23197
23198 output_asm_insn (xasm, &call_op);
23199
23200 if (seh_nop_p)
23201 return "nop";
23202
23203 return "";
23204 }
23205 \f
23206 /* Clear stack slot assignments remembered from previous functions.
23207 This is called from INIT_EXPANDERS once before RTL is emitted for each
23208 function. */
23209
23210 static struct machine_function *
23211 ix86_init_machine_status (void)
23212 {
23213 struct machine_function *f;
23214
23215 f = ggc_alloc_cleared_machine_function ();
23216 f->use_fast_prologue_epilogue_nregs = -1;
23217 f->tls_descriptor_call_expanded_p = 0;
23218 f->call_abi = ix86_abi;
23219
23220 return f;
23221 }
23222
23223 /* Return a MEM corresponding to a stack slot with mode MODE.
23224 Allocate a new slot if necessary.
23225
23226 The RTL for a function can have several slots available: N is
23227 which slot to use. */
23228
23229 rtx
23230 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23231 {
23232 struct stack_local_entry *s;
23233
23234 gcc_assert (n < MAX_386_STACK_LOCALS);
23235
23236 /* Virtual slot is valid only before vregs are instantiated. */
23237 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23238
23239 for (s = ix86_stack_locals; s; s = s->next)
23240 if (s->mode == mode && s->n == n)
23241 return validize_mem (copy_rtx (s->rtl));
23242
23243 s = ggc_alloc_stack_local_entry ();
23244 s->n = n;
23245 s->mode = mode;
23246 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23247
23248 s->next = ix86_stack_locals;
23249 ix86_stack_locals = s;
23250 return validize_mem (s->rtl);
23251 }
23252 \f
23253 /* Calculate the length of the memory address in the instruction encoding.
23254 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23255 or other prefixes. */
23256
23257 int
23258 memory_address_length (rtx addr)
23259 {
23260 struct ix86_address parts;
23261 rtx base, index, disp;
23262 int len;
23263 int ok;
23264
23265 if (GET_CODE (addr) == PRE_DEC
23266 || GET_CODE (addr) == POST_INC
23267 || GET_CODE (addr) == PRE_MODIFY
23268 || GET_CODE (addr) == POST_MODIFY)
23269 return 0;
23270
23271 ok = ix86_decompose_address (addr, &parts);
23272 gcc_assert (ok);
23273
23274 if (parts.base && GET_CODE (parts.base) == SUBREG)
23275 parts.base = SUBREG_REG (parts.base);
23276 if (parts.index && GET_CODE (parts.index) == SUBREG)
23277 parts.index = SUBREG_REG (parts.index);
23278
23279 base = parts.base;
23280 index = parts.index;
23281 disp = parts.disp;
23282
23283 /* Add length of addr32 prefix. */
23284 len = (GET_CODE (addr) == ZERO_EXTEND
23285 || GET_CODE (addr) == AND);
23286
23287 /* Rule of thumb:
23288 - esp as the base always wants an index,
23289 - ebp as the base always wants a displacement,
23290 - r12 as the base always wants an index,
23291 - r13 as the base always wants a displacement. */
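 /* For example, a plain (%esp) or (%r12) base cannot be encoded with the
    one-byte modrm form and needs a SIB byte, while (%ebp) or (%r13) must be
    encoded with a zero 8-bit displacement; both cases cost one extra byte,
    which is what the length computation below accounts for.  */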
23292
23293 /* Register Indirect. */
23294 if (base && !index && !disp)
23295 {
23296 /* esp (for its index) and ebp (for its displacement) need
23297 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23298 code. */
23299 if (REG_P (addr)
23300 && (addr == arg_pointer_rtx
23301 || addr == frame_pointer_rtx
23302 || REGNO (addr) == SP_REG
23303 || REGNO (addr) == BP_REG
23304 || REGNO (addr) == R12_REG
23305 || REGNO (addr) == R13_REG))
23306 len = 1;
23307 }
23308
23309 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23310 is not disp32, but disp32(%rip), so for disp32
23311 SIB byte is needed, unless print_operand_address
23312 optimizes it into disp32(%rip) or (%rip) is implied
23313 by UNSPEC. */
23314 else if (disp && !base && !index)
23315 {
23316 len = 4;
23317 if (TARGET_64BIT)
23318 {
23319 rtx symbol = disp;
23320
23321 if (GET_CODE (disp) == CONST)
23322 symbol = XEXP (disp, 0);
23323 if (GET_CODE (symbol) == PLUS
23324 && CONST_INT_P (XEXP (symbol, 1)))
23325 symbol = XEXP (symbol, 0);
23326
23327 if (GET_CODE (symbol) != LABEL_REF
23328 && (GET_CODE (symbol) != SYMBOL_REF
23329 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23330 && (GET_CODE (symbol) != UNSPEC
23331 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23332 && XINT (symbol, 1) != UNSPEC_PCREL
23333 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23334 len += 1;
23335 }
23336 }
23337
23338 else
23339 {
23340 /* Find the length of the displacement constant. */
23341 if (disp)
23342 {
23343 if (base && satisfies_constraint_K (disp))
23344 len = 1;
23345 else
23346 len = 4;
23347 }
23348 /* ebp always wants a displacement. Similarly r13. */
23349 else if (base && REG_P (base)
23350 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23351 len = 1;
23352
23353 /* An index requires the two-byte modrm form.... */
23354 if (index
23355 /* ...like esp (or r12), which always wants an index. */
23356 || base == arg_pointer_rtx
23357 || base == frame_pointer_rtx
23358 || (base && REG_P (base)
23359 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23360 len += 1;
23361 }
23362
23363 switch (parts.seg)
23364 {
23365 case SEG_FS:
23366 case SEG_GS:
23367 len += 1;
23368 break;
23369 default:
23370 break;
23371 }
23372
23373 return len;
23374 }
23375
 23376 /* Compute default value for "length_immediate" attribute. When SHORTFORM
 23377 is set, expect that the insn has an 8-bit immediate alternative. */
23378 int
23379 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23380 {
23381 int len = 0;
23382 int i;
23383 extract_insn_cached (insn);
23384 for (i = recog_data.n_operands - 1; i >= 0; --i)
23385 if (CONSTANT_P (recog_data.operand[i]))
23386 {
23387 enum attr_mode mode = get_attr_mode (insn);
23388
23389 gcc_assert (!len);
23390 if (shortform && CONST_INT_P (recog_data.operand[i]))
23391 {
23392 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23393 switch (mode)
23394 {
23395 case MODE_QI:
23396 len = 1;
23397 continue;
23398 case MODE_HI:
23399 ival = trunc_int_for_mode (ival, HImode);
23400 break;
23401 case MODE_SI:
23402 ival = trunc_int_for_mode (ival, SImode);
23403 break;
23404 default:
23405 break;
23406 }
23407 if (IN_RANGE (ival, -128, 127))
23408 {
23409 len = 1;
23410 continue;
23411 }
23412 }
23413 switch (mode)
23414 {
23415 case MODE_QI:
23416 len = 1;
23417 break;
23418 case MODE_HI:
23419 len = 2;
23420 break;
23421 case MODE_SI:
23422 len = 4;
23423 break;
23424 /* Immediates for DImode instructions are encoded as 32bit sign extended values. */
23425 case MODE_DI:
23426 len = 4;
23427 break;
23428 default:
23429 fatal_insn ("unknown insn mode", insn);
23430 }
23431 }
23432 return len;
23433 }
23434 /* Compute default value for "length_address" attribute. */
23435 int
23436 ix86_attr_length_address_default (rtx insn)
23437 {
23438 int i;
23439
23440 if (get_attr_type (insn) == TYPE_LEA)
23441 {
23442 rtx set = PATTERN (insn), addr;
23443
23444 if (GET_CODE (set) == PARALLEL)
23445 set = XVECEXP (set, 0, 0);
23446
23447 gcc_assert (GET_CODE (set) == SET);
23448
23449 addr = SET_SRC (set);
23450 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23451 {
23452 if (GET_CODE (addr) == ZERO_EXTEND)
23453 addr = XEXP (addr, 0);
23454 if (GET_CODE (addr) == SUBREG)
23455 addr = SUBREG_REG (addr);
23456 }
23457
23458 return memory_address_length (addr);
23459 }
23460
23461 extract_insn_cached (insn);
23462 for (i = recog_data.n_operands - 1; i >= 0; --i)
23463 if (MEM_P (recog_data.operand[i]))
23464 {
23465 constrain_operands_cached (reload_completed);
23466 if (which_alternative != -1)
23467 {
23468 const char *constraints = recog_data.constraints[i];
23469 int alt = which_alternative;
23470
23471 while (*constraints == '=' || *constraints == '+')
23472 constraints++;
23473 while (alt-- > 0)
23474 while (*constraints++ != ',')
23475 ;
23476 /* Skip ignored operands. */
23477 if (*constraints == 'X')
23478 continue;
23479 }
23480 return memory_address_length (XEXP (recog_data.operand[i], 0));
23481 }
23482 return 0;
23483 }
23484
23485 /* Compute default value for "length_vex" attribute. It includes
23486 2 or 3 byte VEX prefix and 1 opcode byte. */
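 /* In other words, the value returned below is either 2 + 1 == 3 bytes
    (2-byte VEX prefix plus opcode) or 3 + 1 == 4 bytes (3-byte VEX prefix
    plus opcode).  */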
23487
23488 int
23489 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23490 {
23491 int i;
23492
 23493 /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX W bit
 23494 requires the 3-byte VEX prefix. */
23495 if (!has_0f_opcode || has_vex_w)
23496 return 3 + 1;
23497
23498 /* We can always use 2 byte VEX prefix in 32bit. */
23499 if (!TARGET_64BIT)
23500 return 2 + 1;
23501
23502 extract_insn_cached (insn);
23503
23504 for (i = recog_data.n_operands - 1; i >= 0; --i)
23505 if (REG_P (recog_data.operand[i]))
23506 {
23507 /* REX.W bit uses 3 byte VEX prefix. */
23508 if (GET_MODE (recog_data.operand[i]) == DImode
23509 && GENERAL_REG_P (recog_data.operand[i]))
23510 return 3 + 1;
23511 }
23512 else
23513 {
23514 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23515 if (MEM_P (recog_data.operand[i])
23516 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23517 return 3 + 1;
23518 }
23519
23520 return 2 + 1;
23521 }
23522 \f
23523 /* Return the maximum number of instructions a cpu can issue. */
23524
23525 static int
23526 ix86_issue_rate (void)
23527 {
23528 switch (ix86_tune)
23529 {
23530 case PROCESSOR_PENTIUM:
23531 case PROCESSOR_ATOM:
23532 case PROCESSOR_K6:
23533 return 2;
23534
23535 case PROCESSOR_PENTIUMPRO:
23536 case PROCESSOR_PENTIUM4:
23537 case PROCESSOR_CORE2_32:
23538 case PROCESSOR_CORE2_64:
23539 case PROCESSOR_COREI7_32:
23540 case PROCESSOR_COREI7_64:
23541 case PROCESSOR_ATHLON:
23542 case PROCESSOR_K8:
23543 case PROCESSOR_AMDFAM10:
23544 case PROCESSOR_NOCONA:
23545 case PROCESSOR_GENERIC32:
23546 case PROCESSOR_GENERIC64:
23547 case PROCESSOR_BDVER1:
23548 case PROCESSOR_BDVER2:
23549 case PROCESSOR_BTVER1:
23550 return 3;
23551
23552 default:
23553 return 1;
23554 }
23555 }
23556
 23557 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
 23558 by DEP_INSN and nothing else set by DEP_INSN. */
23559
23560 static bool
23561 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23562 {
23563 rtx set, set2;
23564
23565 /* Simplify the test for uninteresting insns. */
23566 if (insn_type != TYPE_SETCC
23567 && insn_type != TYPE_ICMOV
23568 && insn_type != TYPE_FCMOV
23569 && insn_type != TYPE_IBR)
23570 return false;
23571
23572 if ((set = single_set (dep_insn)) != 0)
23573 {
23574 set = SET_DEST (set);
23575 set2 = NULL_RTX;
23576 }
23577 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23578 && XVECLEN (PATTERN (dep_insn), 0) == 2
23579 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23580 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23581 {
23582 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
 23583 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23584 }
23585 else
23586 return false;
23587
23588 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23589 return false;
23590
23591 /* This test is true if the dependent insn reads the flags but
23592 not any other potentially set register. */
23593 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23594 return false;
23595
23596 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23597 return false;
23598
23599 return true;
23600 }
23601
23602 /* Return true iff USE_INSN has a memory address with operands set by
23603 SET_INSN. */
23604
23605 bool
23606 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23607 {
23608 int i;
23609 extract_insn_cached (use_insn);
23610 for (i = recog_data.n_operands - 1; i >= 0; --i)
23611 if (MEM_P (recog_data.operand[i]))
23612 {
23613 rtx addr = XEXP (recog_data.operand[i], 0);
23614 return modified_in_p (addr, set_insn) != 0;
23615 }
23616 return false;
23617 }
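
/* A rough example of the dependency tested above (illustrative only):

	addl	$4, %eax		 SET_INSN: modifies %eax
	movl	(%eax,%ebx,4), %ecx	 USE_INSN: address uses %eax

   ix86_agi_dependent returns true here, because the address of the first
   memory operand of USE_INSN is modified by SET_INSN.  */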
23618
23619 static int
23620 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23621 {
23622 enum attr_type insn_type, dep_insn_type;
23623 enum attr_memory memory;
23624 rtx set, set2;
23625 int dep_insn_code_number;
23626
23627 /* Anti and output dependencies have zero cost on all CPUs. */
23628 if (REG_NOTE_KIND (link) != 0)
23629 return 0;
23630
23631 dep_insn_code_number = recog_memoized (dep_insn);
23632
23633 /* If we can't recognize the insns, we can't really do anything. */
23634 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23635 return cost;
23636
23637 insn_type = get_attr_type (insn);
23638 dep_insn_type = get_attr_type (dep_insn);
23639
23640 switch (ix86_tune)
23641 {
23642 case PROCESSOR_PENTIUM:
23643 /* Address Generation Interlock adds a cycle of latency. */
23644 if (insn_type == TYPE_LEA)
23645 {
23646 rtx addr = PATTERN (insn);
23647
23648 if (GET_CODE (addr) == PARALLEL)
23649 addr = XVECEXP (addr, 0, 0);
23650
23651 gcc_assert (GET_CODE (addr) == SET);
23652
23653 addr = SET_SRC (addr);
23654 if (modified_in_p (addr, dep_insn))
23655 cost += 1;
23656 }
23657 else if (ix86_agi_dependent (dep_insn, insn))
23658 cost += 1;
23659
23660 /* ??? Compares pair with jump/setcc. */
23661 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23662 cost = 0;
23663
23664 /* Floating point stores require the value to be ready one cycle earlier. */
23665 if (insn_type == TYPE_FMOV
23666 && get_attr_memory (insn) == MEMORY_STORE
23667 && !ix86_agi_dependent (dep_insn, insn))
23668 cost += 1;
23669 break;
23670
23671 case PROCESSOR_PENTIUMPRO:
23672 memory = get_attr_memory (insn);
23673
23674 /* INT->FP conversion is expensive. */
23675 if (get_attr_fp_int_src (dep_insn))
23676 cost += 5;
23677
23678 /* There is one cycle extra latency between an FP op and a store. */
23679 if (insn_type == TYPE_FMOV
23680 && (set = single_set (dep_insn)) != NULL_RTX
23681 && (set2 = single_set (insn)) != NULL_RTX
23682 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23683 && MEM_P (SET_DEST (set2)))
23684 cost += 1;
23685
23686 /* Show the ability of the reorder buffer to hide the latency of a load
23687 by executing it in parallel with the previous instruction when the
23688 previous instruction is not needed to compute the address. */
23689 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23690 && !ix86_agi_dependent (dep_insn, insn))
23691 {
23692 /* Claim moves to take one cycle, as the core can issue one load
23693 at a time and the next load can start a cycle later. */
23694 if (dep_insn_type == TYPE_IMOV
23695 || dep_insn_type == TYPE_FMOV)
23696 cost = 1;
23697 else if (cost > 1)
23698 cost--;
23699 }
23700 break;
23701
23702 case PROCESSOR_K6:
23703 memory = get_attr_memory (insn);
23704
23705 /* The esp dependency is resolved before the instruction is really
23706 finished. */
23707 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23708 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23709 return 1;
23710
23711 /* INT->FP conversion is expensive. */
23712 if (get_attr_fp_int_src (dep_insn))
23713 cost += 5;
23714
23715 /* Show the ability of the reorder buffer to hide the latency of a load
23716 by executing it in parallel with the previous instruction when the
23717 previous instruction is not needed to compute the address. */
23718 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23719 && !ix86_agi_dependent (dep_insn, insn))
23720 {
23721 /* Claim moves to take one cycle, as the core can issue one load
23722 at a time and the next load can start a cycle later. */
23723 if (dep_insn_type == TYPE_IMOV
23724 || dep_insn_type == TYPE_FMOV)
23725 cost = 1;
23726 else if (cost > 2)
23727 cost -= 2;
23728 else
23729 cost = 1;
23730 }
23731 break;
23732
23733 case PROCESSOR_ATHLON:
23734 case PROCESSOR_K8:
23735 case PROCESSOR_AMDFAM10:
23736 case PROCESSOR_BDVER1:
23737 case PROCESSOR_BDVER2:
23738 case PROCESSOR_BTVER1:
23739 case PROCESSOR_ATOM:
23740 case PROCESSOR_GENERIC32:
23741 case PROCESSOR_GENERIC64:
23742 memory = get_attr_memory (insn);
23743
23744 /* Show the ability of the reorder buffer to hide the latency of a load
23745 by executing it in parallel with the previous instruction when the
23746 previous instruction is not needed to compute the address. */
23747 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23748 && !ix86_agi_dependent (dep_insn, insn))
23749 {
23750 enum attr_unit unit = get_attr_unit (insn);
23751 int loadcost = 3;
23752
23753 /* Because of the difference between the length of integer and
23754 floating unit pipeline preparation stages, the memory operands
23755 for floating point are cheaper.
23756
23757 ??? For Athlon the difference is most probably 2. */
23758 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23759 loadcost = 3;
23760 else
23761 loadcost = TARGET_ATHLON ? 2 : 0;
23762
23763 if (cost >= loadcost)
23764 cost -= loadcost;
23765 else
23766 cost = 0;
23767 }
23768
23769 default:
23770 break;
23771 }
23772
23773 return cost;
23774 }
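
/* A rough worked example of the adjustment above, using the numbers from
   the Athlon/K8/generic case: when DEP_INSN is an integer load feeding
   INSN through a register that is not part of INSN's address, the reorder
   buffer can hide the load, so a recorded cost of 4 is reduced by
   loadcost = 3 to 1; when INSN instead uses the loaded value to form its
   address (an AGI dependency), the cost is left unchanged.  */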
23775
23776 /* How many alternative schedules to try. This should be as wide as the
23777 scheduling freedom in the DFA, but no wider. Making this value too
23778 large results in extra work for the scheduler. */
23779
23780 static int
23781 ia32_multipass_dfa_lookahead (void)
23782 {
23783 switch (ix86_tune)
23784 {
23785 case PROCESSOR_PENTIUM:
23786 return 2;
23787
23788 case PROCESSOR_PENTIUMPRO:
23789 case PROCESSOR_K6:
23790 return 1;
23791
23792 case PROCESSOR_CORE2_32:
23793 case PROCESSOR_CORE2_64:
23794 case PROCESSOR_COREI7_32:
23795 case PROCESSOR_COREI7_64:
23796 /* Generally, we want haifa-sched:max_issue() to look ahead as far
23797 as the number of instructions that can be executed in a cycle, i.e.,
23798 issue_rate. I wonder why tuning for many CPUs does not do this. */
23799 return ix86_issue_rate ();
23800
23801 default:
23802 return 0;
23803 }
23804 }
23805
23806 \f
23807
23808 /* Model the decoder of Core 2/i7.
23809 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
23810 track the instruction fetch block boundaries and make sure that long
23811 (9+ byte) instructions are assigned to decoder D0. */
23812
23813 /* Maximum length of an insn that can be handled by
23814 a secondary decoder unit. '8' for Core 2/i7. */
23815 static int core2i7_secondary_decoder_max_insn_size;
23816
23817 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
23818 '16' for Core 2/i7. */
23819 static int core2i7_ifetch_block_size;
23820
23821 /* Maximum number of instructions decoder can handle per cycle.
23822 '6' for Core 2/i7. */
23823 static int core2i7_ifetch_block_max_insns;
23824
23825 typedef struct ix86_first_cycle_multipass_data_ *
23826 ix86_first_cycle_multipass_data_t;
23827 typedef const struct ix86_first_cycle_multipass_data_ *
23828 const_ix86_first_cycle_multipass_data_t;
23829
23830 /* A variable to store target state across calls to max_issue within
23831 one cycle. */
23832 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
23833 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
23834
23835 /* Initialize DATA. */
23836 static void
23837 core2i7_first_cycle_multipass_init (void *_data)
23838 {
23839 ix86_first_cycle_multipass_data_t data
23840 = (ix86_first_cycle_multipass_data_t) _data;
23841
23842 data->ifetch_block_len = 0;
23843 data->ifetch_block_n_insns = 0;
23844 data->ready_try_change = NULL;
23845 data->ready_try_change_size = 0;
23846 }
23847
23848 /* Advancing the cycle; reset ifetch block counts. */
23849 static void
23850 core2i7_dfa_post_advance_cycle (void)
23851 {
23852 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
23853
23854 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23855
23856 data->ifetch_block_len = 0;
23857 data->ifetch_block_n_insns = 0;
23858 }
23859
23860 static int min_insn_size (rtx);
23861
23862 /* Filter out insns from ready_try that the core will not be able to issue
23863 on the current cycle due to decoder restrictions. */
23864 static void
23865 core2i7_first_cycle_multipass_filter_ready_try
23866 (const_ix86_first_cycle_multipass_data_t data,
23867 char *ready_try, int n_ready, bool first_cycle_insn_p)
23868 {
23869 while (n_ready--)
23870 {
23871 rtx insn;
23872 int insn_size;
23873
23874 if (ready_try[n_ready])
23875 continue;
23876
23877 insn = get_ready_element (n_ready);
23878 insn_size = min_insn_size (insn);
23879
23880 if (/* If this is too long an insn for a secondary decoder ... */
23881 (!first_cycle_insn_p
23882 && insn_size > core2i7_secondary_decoder_max_insn_size)
23883 /* ... or it would not fit into the ifetch block ... */
23884 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
23885 /* ... or the decoder is full already ... */
23886 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
23887 /* ... mask the insn out. */
23888 {
23889 ready_try[n_ready] = 1;
23890
23891 if (data->ready_try_change)
23892 SET_BIT (data->ready_try_change, n_ready);
23893 }
23894 }
23895 }
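
/* A small worked example of the filtering above, using the Core 2/i7
   parameters installed in ix86_sched_init_global (16-byte ifetch block,
   at most 6 insns per cycle, 8-byte limit for the secondary decoders):
   with 10 bytes of the current block already consumed, a ready insn of
   min_insn_size 7 is masked out because 10 + 7 > 16; a 9-byte insn is
   masked out unless it would be the first insn picked in the cycle,
   since only decoder D0 can take it; a 4-byte insn stays available until
   6 insns have been taken.  */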
23896
23897 /* Prepare for a new round of multipass lookahead scheduling. */
23898 static void
23899 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
23900 bool first_cycle_insn_p)
23901 {
23902 ix86_first_cycle_multipass_data_t data
23903 = (ix86_first_cycle_multipass_data_t) _data;
23904 const_ix86_first_cycle_multipass_data_t prev_data
23905 = ix86_first_cycle_multipass_data;
23906
23907 /* Restore the state from the end of the previous round. */
23908 data->ifetch_block_len = prev_data->ifetch_block_len;
23909 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
23910
23911 /* Filter out instructions that cannot be issued on the current cycle due to
23912 decoder restrictions. */
23913 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23914 first_cycle_insn_p);
23915 }
23916
23917 /* INSN is being issued in the current solution. Account for its impact on
23918 the decoder model. */
23919 static void
23920 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
23921 rtx insn, const void *_prev_data)
23922 {
23923 ix86_first_cycle_multipass_data_t data
23924 = (ix86_first_cycle_multipass_data_t) _data;
23925 const_ix86_first_cycle_multipass_data_t prev_data
23926 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
23927
23928 int insn_size = min_insn_size (insn);
23929
23930 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
23931 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
23932 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
23933 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23934
23935 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
23936 if (!data->ready_try_change)
23937 {
23938 data->ready_try_change = sbitmap_alloc (n_ready);
23939 data->ready_try_change_size = n_ready;
23940 }
23941 else if (data->ready_try_change_size < n_ready)
23942 {
23943 data->ready_try_change = sbitmap_resize (data->ready_try_change,
23944 n_ready, 0);
23945 data->ready_try_change_size = n_ready;
23946 }
23947 sbitmap_zero (data->ready_try_change);
23948
23949 /* Filter out insns from ready_try that the core will not be able to issue
23950 on the current cycle due to decoder restrictions. */
23951 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23952 false);
23953 }
23954
23955 /* Revert the effect on ready_try. */
23956 static void
23957 core2i7_first_cycle_multipass_backtrack (const void *_data,
23958 char *ready_try,
23959 int n_ready ATTRIBUTE_UNUSED)
23960 {
23961 const_ix86_first_cycle_multipass_data_t data
23962 = (const_ix86_first_cycle_multipass_data_t) _data;
23963 unsigned int i = 0;
23964 sbitmap_iterator sbi;
23965
23966 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
23967 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
23968 {
23969 ready_try[i] = 0;
23970 }
23971 }
23972
23973 /* Save the result of multipass lookahead scheduling for the next round. */
23974 static void
23975 core2i7_first_cycle_multipass_end (const void *_data)
23976 {
23977 const_ix86_first_cycle_multipass_data_t data
23978 = (const_ix86_first_cycle_multipass_data_t) _data;
23979 ix86_first_cycle_multipass_data_t next_data
23980 = ix86_first_cycle_multipass_data;
23981
23982 if (data != NULL)
23983 {
23984 next_data->ifetch_block_len = data->ifetch_block_len;
23985 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
23986 }
23987 }
23988
23989 /* Deallocate target data. */
23990 static void
23991 core2i7_first_cycle_multipass_fini (void *_data)
23992 {
23993 ix86_first_cycle_multipass_data_t data
23994 = (ix86_first_cycle_multipass_data_t) _data;
23995
23996 if (data->ready_try_change)
23997 {
23998 sbitmap_free (data->ready_try_change);
23999 data->ready_try_change = NULL;
24000 data->ready_try_change_size = 0;
24001 }
24002 }
24003
24004 /* Prepare for scheduling pass. */
24005 static void
24006 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24007 int verbose ATTRIBUTE_UNUSED,
24008 int max_uid ATTRIBUTE_UNUSED)
24009 {
24010 /* Install scheduling hooks for the current CPU. Some of these hooks are used
24011 in time-critical parts of the scheduler, so we only set them up when
24012 they are actually used. */
24013 switch (ix86_tune)
24014 {
24015 case PROCESSOR_CORE2_32:
24016 case PROCESSOR_CORE2_64:
24017 case PROCESSOR_COREI7_32:
24018 case PROCESSOR_COREI7_64:
24019 targetm.sched.dfa_post_advance_cycle
24020 = core2i7_dfa_post_advance_cycle;
24021 targetm.sched.first_cycle_multipass_init
24022 = core2i7_first_cycle_multipass_init;
24023 targetm.sched.first_cycle_multipass_begin
24024 = core2i7_first_cycle_multipass_begin;
24025 targetm.sched.first_cycle_multipass_issue
24026 = core2i7_first_cycle_multipass_issue;
24027 targetm.sched.first_cycle_multipass_backtrack
24028 = core2i7_first_cycle_multipass_backtrack;
24029 targetm.sched.first_cycle_multipass_end
24030 = core2i7_first_cycle_multipass_end;
24031 targetm.sched.first_cycle_multipass_fini
24032 = core2i7_first_cycle_multipass_fini;
24033
24034 /* Set decoder parameters. */
24035 core2i7_secondary_decoder_max_insn_size = 8;
24036 core2i7_ifetch_block_size = 16;
24037 core2i7_ifetch_block_max_insns = 6;
24038 break;
24039
24040 default:
24041 targetm.sched.dfa_post_advance_cycle = NULL;
24042 targetm.sched.first_cycle_multipass_init = NULL;
24043 targetm.sched.first_cycle_multipass_begin = NULL;
24044 targetm.sched.first_cycle_multipass_issue = NULL;
24045 targetm.sched.first_cycle_multipass_backtrack = NULL;
24046 targetm.sched.first_cycle_multipass_end = NULL;
24047 targetm.sched.first_cycle_multipass_fini = NULL;
24048 break;
24049 }
24050 }
24051
24052 \f
24053 /* Compute the alignment given to a constant that is being placed in memory.
24054 EXP is the constant and ALIGN is the alignment that the object would
24055 ordinarily have.
24056 The value of this function is used instead of that alignment to align
24057 the object. */
24058
24059 int
24060 ix86_constant_alignment (tree exp, int align)
24061 {
24062 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24063 || TREE_CODE (exp) == INTEGER_CST)
24064 {
24065 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24066 return 64;
24067 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24068 return 128;
24069 }
24070 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24071 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24072 return BITS_PER_WORD;
24073
24074 return align;
24075 }
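
/* For example (illustrative only): a DFmode constant such as 1.0 asked to
   be placed with 32-bit alignment is given 64-bit alignment here, and a
   string constant of length 31 or more is aligned to BITS_PER_WORD when
   not optimizing for size.  The alignment passed in is never reduced.  */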
24076
24077 /* Compute the alignment for a static variable.
24078 TYPE is the data type, and ALIGN is the alignment that
24079 the object would ordinarily have. The value of this function is used
24080 instead of that alignment to align the object. */
24081
24082 int
24083 ix86_data_alignment (tree type, int align)
24084 {
24085 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24086
24087 if (AGGREGATE_TYPE_P (type)
24088 && TYPE_SIZE (type)
24089 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24090 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24091 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24092 && align < max_align)
24093 align = max_align;
24094
24095 /* The x86-64 ABI requires arrays of at least 16 bytes to be aligned
24096 to a 16-byte boundary. */
24097 if (TARGET_64BIT)
24098 {
24099 if (AGGREGATE_TYPE_P (type)
24100 && TYPE_SIZE (type)
24101 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24102 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24103 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24104 return 128;
24105 }
24106
24107 if (TREE_CODE (type) == ARRAY_TYPE)
24108 {
24109 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24110 return 64;
24111 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24112 return 128;
24113 }
24114 else if (TREE_CODE (type) == COMPLEX_TYPE)
24115 {
24116
24117 if (TYPE_MODE (type) == DCmode && align < 64)
24118 return 64;
24119 if ((TYPE_MODE (type) == XCmode
24120 || TYPE_MODE (type) == TCmode) && align < 128)
24121 return 128;
24122 }
24123 else if ((TREE_CODE (type) == RECORD_TYPE
24124 || TREE_CODE (type) == UNION_TYPE
24125 || TREE_CODE (type) == QUAL_UNION_TYPE)
24126 && TYPE_FIELDS (type))
24127 {
24128 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24129 return 64;
24130 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24131 return 128;
24132 }
24133 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24134 || TREE_CODE (type) == INTEGER_TYPE)
24135 {
24136 if (TYPE_MODE (type) == DFmode && align < 64)
24137 return 64;
24138 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24139 return 128;
24140 }
24141
24142 return align;
24143 }
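
/* Two worked cases for the rules above (illustrative only): a plain
   'static double x;' presented with 32-bit alignment is bumped to 64
   bits, and on x86-64 a 16-byte aggregate such as 'static double d[2];'
   is bumped to 128 bits so aligned SSE accesses can be used on it.  As
   in ix86_constant_alignment, the alignment is only ever increased.  */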
24144
24145 /* Compute the alignment for a local variable or a stack slot. EXP is
24146 the data type or decl itself, MODE is the widest mode available and
24147 ALIGN is the alignment that the object would ordinarily have. The
24148 value of this macro is used instead of that alignment to align the
24149 object. */
24150
24151 unsigned int
24152 ix86_local_alignment (tree exp, enum machine_mode mode,
24153 unsigned int align)
24154 {
24155 tree type, decl;
24156
24157 if (exp && DECL_P (exp))
24158 {
24159 type = TREE_TYPE (exp);
24160 decl = exp;
24161 }
24162 else
24163 {
24164 type = exp;
24165 decl = NULL;
24166 }
24167
24168 /* Don't do dynamic stack realignment for long long objects with
24169 -mpreferred-stack-boundary=2. */
24170 if (!TARGET_64BIT
24171 && align == 64
24172 && ix86_preferred_stack_boundary < 64
24173 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24174 && (!type || !TYPE_USER_ALIGN (type))
24175 && (!decl || !DECL_USER_ALIGN (decl)))
24176 align = 32;
24177
24178 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
24179 register in MODE. We will return the largest alignment of XF
24180 and DF. */
24181 if (!type)
24182 {
24183 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24184 align = GET_MODE_ALIGNMENT (DFmode);
24185 return align;
24186 }
24187
24188 /* The x86-64 ABI requires arrays of at least 16 bytes to be aligned
24189 to a 16-byte boundary. The exact wording is:
24190
24191 An array uses the same alignment as its elements, except that a local or
24192 global array variable of length at least 16 bytes or
24193 a C99 variable-length array variable always has alignment of at least 16 bytes.
24194
24195 This was added to allow the use of aligned SSE instructions on arrays.
24196 The rule is meant for static storage (where the compiler cannot do the
24197 analysis by itself). We follow it for automatic variables only when it
24198 is convenient: we fully control everything in the function being
24199 compiled, and functions from other units cannot rely on the alignment.
24200
24201 Exclude the va_list type. It is the common case of a local array where
24202 we cannot benefit from the alignment. */
24203 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24204 && TARGET_SSE)
24205 {
24206 if (AGGREGATE_TYPE_P (type)
24207 && (va_list_type_node == NULL_TREE
24208 || (TYPE_MAIN_VARIANT (type)
24209 != TYPE_MAIN_VARIANT (va_list_type_node)))
24210 && TYPE_SIZE (type)
24211 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24212 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24213 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24214 return 128;
24215 }
24216 if (TREE_CODE (type) == ARRAY_TYPE)
24217 {
24218 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24219 return 64;
24220 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24221 return 128;
24222 }
24223 else if (TREE_CODE (type) == COMPLEX_TYPE)
24224 {
24225 if (TYPE_MODE (type) == DCmode && align < 64)
24226 return 64;
24227 if ((TYPE_MODE (type) == XCmode
24228 || TYPE_MODE (type) == TCmode) && align < 128)
24229 return 128;
24230 }
24231 else if ((TREE_CODE (type) == RECORD_TYPE
24232 || TREE_CODE (type) == UNION_TYPE
24233 || TREE_CODE (type) == QUAL_UNION_TYPE)
24234 && TYPE_FIELDS (type))
24235 {
24236 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24237 return 64;
24238 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24239 return 128;
24240 }
24241 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24242 || TREE_CODE (type) == INTEGER_TYPE)
24243 {
24244
24245 if (TYPE_MODE (type) == DFmode && align < 64)
24246 return 64;
24247 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24248 return 128;
24249 }
24250 return align;
24251 }
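
/* For illustration, with SSE enabled on x86-64 and optimizing for speed:
   a local array such as 'char buf[32];' is given 128-bit stack alignment
   by the rule above so vectorized code can use aligned SSE accesses on
   it, while a va_list object is explicitly excluded and keeps its
   ordinary alignment.  */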
24252
24253 /* Compute the minimum required alignment for dynamic stack realignment
24254 purposes for a local variable, parameter or a stack slot. EXP is
24255 the data type or decl itself, MODE is its mode and ALIGN is the
24256 alignment that the object would ordinarily have. */
24257
24258 unsigned int
24259 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24260 unsigned int align)
24261 {
24262 tree type, decl;
24263
24264 if (exp && DECL_P (exp))
24265 {
24266 type = TREE_TYPE (exp);
24267 decl = exp;
24268 }
24269 else
24270 {
24271 type = exp;
24272 decl = NULL;
24273 }
24274
24275 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24276 return align;
24277
24278 /* Don't do dynamic stack realignment for long long objects with
24279 -mpreferred-stack-boundary=2. */
24280 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24281 && (!type || !TYPE_USER_ALIGN (type))
24282 && (!decl || !DECL_USER_ALIGN (decl)))
24283 return 32;
24284
24285 return align;
24286 }
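
/* For example (illustrative only): with -m32 -mpreferred-stack-boundary=2
   a 'long long' local would ordinarily report a 64-bit minimum alignment;
   the function above lowers that requirement to 32 bits so that such an
   object alone does not force dynamic stack realignment.  */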
24287 \f
24288 /* Find a location for the static chain incoming to a nested function.
24289 This is a register, unless all free registers are used by arguments. */
24290
24291 static rtx
24292 ix86_static_chain (const_tree fndecl, bool incoming_p)
24293 {
24294 unsigned regno;
24295
24296 if (!DECL_STATIC_CHAIN (fndecl))
24297 return NULL;
24298
24299 if (TARGET_64BIT)
24300 {
24301 /* We always use R10 in 64-bit mode. */
24302 regno = R10_REG;
24303 }
24304 else
24305 {
24306 tree fntype;
24307 unsigned int ccvt;
24308
24309 /* By default in 32-bit mode we use ECX to pass the static chain. */
24310 regno = CX_REG;
24311
24312 fntype = TREE_TYPE (fndecl);
24313 ccvt = ix86_get_callcvt (fntype);
24314 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24315 {
24316 /* Fastcall functions use ecx/edx for arguments, which leaves
24317 us with EAX for the static chain.
24318 Thiscall functions use ecx for arguments, which also
24319 leaves us with EAX for the static chain. */
24320 regno = AX_REG;
24321 }
24322 else if (ix86_function_regparm (fntype, fndecl) == 3)
24323 {
24324 /* For regparm 3, we have no free call-clobbered registers in
24325 which to store the static chain. In order to implement this,
24326 we have the trampoline push the static chain to the stack.
24327 However, we can't push a value below the return address when
24328 we call the nested function directly, so we have to use an
24329 alternate entry point. For this we use ESI, and have the
24330 alternate entry point push ESI, so that things appear the
24331 same once we're executing the nested function. */
24332 if (incoming_p)
24333 {
24334 if (fndecl == current_function_decl)
24335 ix86_static_chain_on_stack = true;
24336 return gen_frame_mem (SImode,
24337 plus_constant (arg_pointer_rtx, -8));
24338 }
24339 regno = SI_REG;
24340 }
24341 }
24342
24343 return gen_rtx_REG (Pmode, regno);
24344 }
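
/* A summary of the choices made above (illustrative): 64-bit functions
   always receive the static chain in %r10; 32-bit functions normally use
   %ecx, fastcall/thiscall functions use %eax, and regparm(3) functions
   take the chain from a stack slot on entry, with %esi holding it at the
   alternate entry point used for direct calls.  */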
24345
24346 /* Emit RTL insns to initialize the variable parts of a trampoline.
24347 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24348 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24349 to be passed to the target function. */
24350
24351 static void
24352 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24353 {
24354 rtx mem, fnaddr;
24355 int opcode;
24356 int offset = 0;
24357
24358 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24359
24360 if (TARGET_64BIT)
24361 {
24362 int size;
24363
24364 /* Load the function address into r11. Try to load the address using
24365 the shorter movl instead of movabs. We may want to support
24366 movq for kernel mode, but the kernel does not use trampolines at
24367 the moment. FNADDR is a 32-bit address and may not be in
24368 DImode when ptr_mode == SImode. Always use movl in this
24369 case. */
24370 if (ptr_mode == SImode
24371 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24372 {
24373 fnaddr = copy_addr_to_reg (fnaddr);
24374
24375 mem = adjust_address (m_tramp, HImode, offset);
24376 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24377
24378 mem = adjust_address (m_tramp, SImode, offset + 2);
24379 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24380 offset += 6;
24381 }
24382 else
24383 {
24384 mem = adjust_address (m_tramp, HImode, offset);
24385 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24386
24387 mem = adjust_address (m_tramp, DImode, offset + 2);
24388 emit_move_insn (mem, fnaddr);
24389 offset += 10;
24390 }
24391
24392 /* Load the static chain into r10 using movabs. Use the shorter movl
24393 instead of movabs when ptr_mode == SImode. */
24394 if (ptr_mode == SImode)
24395 {
24396 opcode = 0xba41;
24397 size = 6;
24398 }
24399 else
24400 {
24401 opcode = 0xba49;
24402 size = 10;
24403 }
24404
24405 mem = adjust_address (m_tramp, HImode, offset);
24406 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24407
24408 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24409 emit_move_insn (mem, chain_value);
24410 offset += size;
24411
24412 /* Jump to r11; the last (unused) byte is a nop, only there to
24413 pad the write out to a single 32-bit store. */
24414 mem = adjust_address (m_tramp, SImode, offset);
24415 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24416 offset += 4;
24417 }
24418 else
24419 {
24420 rtx disp, chain;
24421
24422 /* Depending on the static chain location, either load a register
24423 with a constant, or push the constant to the stack. All of the
24424 instructions are the same size. */
24425 chain = ix86_static_chain (fndecl, true);
24426 if (REG_P (chain))
24427 {
24428 switch (REGNO (chain))
24429 {
24430 case AX_REG:
24431 opcode = 0xb8; break;
24432 case CX_REG:
24433 opcode = 0xb9; break;
24434 default:
24435 gcc_unreachable ();
24436 }
24437 }
24438 else
24439 opcode = 0x68;
24440
24441 mem = adjust_address (m_tramp, QImode, offset);
24442 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24443
24444 mem = adjust_address (m_tramp, SImode, offset + 1);
24445 emit_move_insn (mem, chain_value);
24446 offset += 5;
24447
24448 mem = adjust_address (m_tramp, QImode, offset);
24449 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24450
24451 mem = adjust_address (m_tramp, SImode, offset + 1);
24452
24453 /* Compute offset from the end of the jmp to the target function.
24454 In the case in which the trampoline stores the static chain on
24455 the stack, we need to skip the first insn which pushes the
24456 (call-saved) register static chain; this push is 1 byte. */
24457 offset += 5;
24458 disp = expand_binop (SImode, sub_optab, fnaddr,
24459 plus_constant (XEXP (m_tramp, 0),
24460 offset - (MEM_P (chain) ? 1 : 0)),
24461 NULL_RTX, 1, OPTAB_DIRECT);
24462 emit_move_insn (mem, disp);
24463 }
24464
24465 gcc_assert (offset <= TRAMPOLINE_SIZE);
24466
24467 #ifdef HAVE_ENABLE_EXECUTE_STACK
24468 #ifdef CHECK_EXECUTE_STACK_ENABLED
24469 if (CHECK_EXECUTE_STACK_ENABLED)
24470 #endif
24471 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24472 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24473 #endif
24474 }
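
/* For reference, the 64-bit trampoline emitted above by the movabs path
   is laid out roughly as follows (24 bytes in total):

	49 bb <8-byte fnaddr>	movabs $fnaddr, %r11
	49 ba <8-byte chain>	movabs $chain, %r10
	49 ff e3		jmp *%r11
	90			nop (padding for the 32-bit store)

   The 32-bit variant is a mov into the static-chain register (or a push
   when the chain lives on the stack) followed by a pc-relative jmp to
   the target.  */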
24475 \f
24476 /* The following file contains several enumerations and data structures
24477 built from the definitions in i386-builtin-types.def. */
24478
24479 #include "i386-builtin-types.inc"
24480
24481 /* Table for the ix86 builtin non-function types. */
24482 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24483
24484 /* Retrieve an element from the above table, building some of
24485 the types lazily. */
24486
24487 static tree
24488 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24489 {
24490 unsigned int index;
24491 tree type, itype;
24492
24493 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24494
24495 type = ix86_builtin_type_tab[(int) tcode];
24496 if (type != NULL)
24497 return type;
24498
24499 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24500 if (tcode <= IX86_BT_LAST_VECT)
24501 {
24502 enum machine_mode mode;
24503
24504 index = tcode - IX86_BT_LAST_PRIM - 1;
24505 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24506 mode = ix86_builtin_type_vect_mode[index];
24507
24508 type = build_vector_type_for_mode (itype, mode);
24509 }
24510 else
24511 {
24512 int quals;
24513
24514 index = tcode - IX86_BT_LAST_VECT - 1;
24515 if (tcode <= IX86_BT_LAST_PTR)
24516 quals = TYPE_UNQUALIFIED;
24517 else
24518 quals = TYPE_QUAL_CONST;
24519
24520 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24521 if (quals != TYPE_UNQUALIFIED)
24522 itype = build_qualified_type (itype, quals);
24523
24524 type = build_pointer_type (itype);
24525 }
24526
24527 ix86_builtin_type_tab[(int) tcode] = type;
24528 return type;
24529 }
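
/* A sketch of how the lazy construction above is typically used;
   IX86_BT_V4SF is assumed here to be the entry that the generated tables
   map to SFmode elements in V4SFmode:

     tree v4sf = ix86_get_builtin_type (IX86_BT_V4SF);

   The first such call builds the vector tree with
   build_vector_type_for_mode and caches it in ix86_builtin_type_tab;
   later calls simply return the cached tree.  */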
24530
24531 /* Table for the ix86 builtin function types. */
24532 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24533
24534 /* Retrieve an element from the above table, building some of
24535 the types lazily. */
24536
24537 static tree
24538 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24539 {
24540 tree type;
24541
24542 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24543
24544 type = ix86_builtin_func_type_tab[(int) tcode];
24545 if (type != NULL)
24546 return type;
24547
24548 if (tcode <= IX86_BT_LAST_FUNC)
24549 {
24550 unsigned start = ix86_builtin_func_start[(int) tcode];
24551 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24552 tree rtype, atype, args = void_list_node;
24553 unsigned i;
24554
24555 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24556 for (i = after - 1; i > start; --i)
24557 {
24558 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24559 args = tree_cons (NULL, atype, args);
24560 }
24561
24562 type = build_function_type (rtype, args);
24563 }
24564 else
24565 {
24566 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24567 enum ix86_builtin_func_type icode;
24568
24569 icode = ix86_builtin_func_alias_base[index];
24570 type = ix86_get_builtin_func_type (icode);
24571 }
24572
24573 ix86_builtin_func_type_tab[(int) tcode] = type;
24574 return type;
24575 }
24576
24577
24578 /* Codes for all the SSE/MMX builtins. */
24579 enum ix86_builtins
24580 {
24581 IX86_BUILTIN_ADDPS,
24582 IX86_BUILTIN_ADDSS,
24583 IX86_BUILTIN_DIVPS,
24584 IX86_BUILTIN_DIVSS,
24585 IX86_BUILTIN_MULPS,
24586 IX86_BUILTIN_MULSS,
24587 IX86_BUILTIN_SUBPS,
24588 IX86_BUILTIN_SUBSS,
24589
24590 IX86_BUILTIN_CMPEQPS,
24591 IX86_BUILTIN_CMPLTPS,
24592 IX86_BUILTIN_CMPLEPS,
24593 IX86_BUILTIN_CMPGTPS,
24594 IX86_BUILTIN_CMPGEPS,
24595 IX86_BUILTIN_CMPNEQPS,
24596 IX86_BUILTIN_CMPNLTPS,
24597 IX86_BUILTIN_CMPNLEPS,
24598 IX86_BUILTIN_CMPNGTPS,
24599 IX86_BUILTIN_CMPNGEPS,
24600 IX86_BUILTIN_CMPORDPS,
24601 IX86_BUILTIN_CMPUNORDPS,
24602 IX86_BUILTIN_CMPEQSS,
24603 IX86_BUILTIN_CMPLTSS,
24604 IX86_BUILTIN_CMPLESS,
24605 IX86_BUILTIN_CMPNEQSS,
24606 IX86_BUILTIN_CMPNLTSS,
24607 IX86_BUILTIN_CMPNLESS,
24608 IX86_BUILTIN_CMPNGTSS,
24609 IX86_BUILTIN_CMPNGESS,
24610 IX86_BUILTIN_CMPORDSS,
24611 IX86_BUILTIN_CMPUNORDSS,
24612
24613 IX86_BUILTIN_COMIEQSS,
24614 IX86_BUILTIN_COMILTSS,
24615 IX86_BUILTIN_COMILESS,
24616 IX86_BUILTIN_COMIGTSS,
24617 IX86_BUILTIN_COMIGESS,
24618 IX86_BUILTIN_COMINEQSS,
24619 IX86_BUILTIN_UCOMIEQSS,
24620 IX86_BUILTIN_UCOMILTSS,
24621 IX86_BUILTIN_UCOMILESS,
24622 IX86_BUILTIN_UCOMIGTSS,
24623 IX86_BUILTIN_UCOMIGESS,
24624 IX86_BUILTIN_UCOMINEQSS,
24625
24626 IX86_BUILTIN_CVTPI2PS,
24627 IX86_BUILTIN_CVTPS2PI,
24628 IX86_BUILTIN_CVTSI2SS,
24629 IX86_BUILTIN_CVTSI642SS,
24630 IX86_BUILTIN_CVTSS2SI,
24631 IX86_BUILTIN_CVTSS2SI64,
24632 IX86_BUILTIN_CVTTPS2PI,
24633 IX86_BUILTIN_CVTTSS2SI,
24634 IX86_BUILTIN_CVTTSS2SI64,
24635
24636 IX86_BUILTIN_MAXPS,
24637 IX86_BUILTIN_MAXSS,
24638 IX86_BUILTIN_MINPS,
24639 IX86_BUILTIN_MINSS,
24640
24641 IX86_BUILTIN_LOADUPS,
24642 IX86_BUILTIN_STOREUPS,
24643 IX86_BUILTIN_MOVSS,
24644
24645 IX86_BUILTIN_MOVHLPS,
24646 IX86_BUILTIN_MOVLHPS,
24647 IX86_BUILTIN_LOADHPS,
24648 IX86_BUILTIN_LOADLPS,
24649 IX86_BUILTIN_STOREHPS,
24650 IX86_BUILTIN_STORELPS,
24651
24652 IX86_BUILTIN_MASKMOVQ,
24653 IX86_BUILTIN_MOVMSKPS,
24654 IX86_BUILTIN_PMOVMSKB,
24655
24656 IX86_BUILTIN_MOVNTPS,
24657 IX86_BUILTIN_MOVNTQ,
24658
24659 IX86_BUILTIN_LOADDQU,
24660 IX86_BUILTIN_STOREDQU,
24661
24662 IX86_BUILTIN_PACKSSWB,
24663 IX86_BUILTIN_PACKSSDW,
24664 IX86_BUILTIN_PACKUSWB,
24665
24666 IX86_BUILTIN_PADDB,
24667 IX86_BUILTIN_PADDW,
24668 IX86_BUILTIN_PADDD,
24669 IX86_BUILTIN_PADDQ,
24670 IX86_BUILTIN_PADDSB,
24671 IX86_BUILTIN_PADDSW,
24672 IX86_BUILTIN_PADDUSB,
24673 IX86_BUILTIN_PADDUSW,
24674 IX86_BUILTIN_PSUBB,
24675 IX86_BUILTIN_PSUBW,
24676 IX86_BUILTIN_PSUBD,
24677 IX86_BUILTIN_PSUBQ,
24678 IX86_BUILTIN_PSUBSB,
24679 IX86_BUILTIN_PSUBSW,
24680 IX86_BUILTIN_PSUBUSB,
24681 IX86_BUILTIN_PSUBUSW,
24682
24683 IX86_BUILTIN_PAND,
24684 IX86_BUILTIN_PANDN,
24685 IX86_BUILTIN_POR,
24686 IX86_BUILTIN_PXOR,
24687
24688 IX86_BUILTIN_PAVGB,
24689 IX86_BUILTIN_PAVGW,
24690
24691 IX86_BUILTIN_PCMPEQB,
24692 IX86_BUILTIN_PCMPEQW,
24693 IX86_BUILTIN_PCMPEQD,
24694 IX86_BUILTIN_PCMPGTB,
24695 IX86_BUILTIN_PCMPGTW,
24696 IX86_BUILTIN_PCMPGTD,
24697
24698 IX86_BUILTIN_PMADDWD,
24699
24700 IX86_BUILTIN_PMAXSW,
24701 IX86_BUILTIN_PMAXUB,
24702 IX86_BUILTIN_PMINSW,
24703 IX86_BUILTIN_PMINUB,
24704
24705 IX86_BUILTIN_PMULHUW,
24706 IX86_BUILTIN_PMULHW,
24707 IX86_BUILTIN_PMULLW,
24708
24709 IX86_BUILTIN_PSADBW,
24710 IX86_BUILTIN_PSHUFW,
24711
24712 IX86_BUILTIN_PSLLW,
24713 IX86_BUILTIN_PSLLD,
24714 IX86_BUILTIN_PSLLQ,
24715 IX86_BUILTIN_PSRAW,
24716 IX86_BUILTIN_PSRAD,
24717 IX86_BUILTIN_PSRLW,
24718 IX86_BUILTIN_PSRLD,
24719 IX86_BUILTIN_PSRLQ,
24720 IX86_BUILTIN_PSLLWI,
24721 IX86_BUILTIN_PSLLDI,
24722 IX86_BUILTIN_PSLLQI,
24723 IX86_BUILTIN_PSRAWI,
24724 IX86_BUILTIN_PSRADI,
24725 IX86_BUILTIN_PSRLWI,
24726 IX86_BUILTIN_PSRLDI,
24727 IX86_BUILTIN_PSRLQI,
24728
24729 IX86_BUILTIN_PUNPCKHBW,
24730 IX86_BUILTIN_PUNPCKHWD,
24731 IX86_BUILTIN_PUNPCKHDQ,
24732 IX86_BUILTIN_PUNPCKLBW,
24733 IX86_BUILTIN_PUNPCKLWD,
24734 IX86_BUILTIN_PUNPCKLDQ,
24735
24736 IX86_BUILTIN_SHUFPS,
24737
24738 IX86_BUILTIN_RCPPS,
24739 IX86_BUILTIN_RCPSS,
24740 IX86_BUILTIN_RSQRTPS,
24741 IX86_BUILTIN_RSQRTPS_NR,
24742 IX86_BUILTIN_RSQRTSS,
24743 IX86_BUILTIN_RSQRTF,
24744 IX86_BUILTIN_SQRTPS,
24745 IX86_BUILTIN_SQRTPS_NR,
24746 IX86_BUILTIN_SQRTSS,
24747
24748 IX86_BUILTIN_UNPCKHPS,
24749 IX86_BUILTIN_UNPCKLPS,
24750
24751 IX86_BUILTIN_ANDPS,
24752 IX86_BUILTIN_ANDNPS,
24753 IX86_BUILTIN_ORPS,
24754 IX86_BUILTIN_XORPS,
24755
24756 IX86_BUILTIN_EMMS,
24757 IX86_BUILTIN_LDMXCSR,
24758 IX86_BUILTIN_STMXCSR,
24759 IX86_BUILTIN_SFENCE,
24760
24761 /* 3DNow! Original */
24762 IX86_BUILTIN_FEMMS,
24763 IX86_BUILTIN_PAVGUSB,
24764 IX86_BUILTIN_PF2ID,
24765 IX86_BUILTIN_PFACC,
24766 IX86_BUILTIN_PFADD,
24767 IX86_BUILTIN_PFCMPEQ,
24768 IX86_BUILTIN_PFCMPGE,
24769 IX86_BUILTIN_PFCMPGT,
24770 IX86_BUILTIN_PFMAX,
24771 IX86_BUILTIN_PFMIN,
24772 IX86_BUILTIN_PFMUL,
24773 IX86_BUILTIN_PFRCP,
24774 IX86_BUILTIN_PFRCPIT1,
24775 IX86_BUILTIN_PFRCPIT2,
24776 IX86_BUILTIN_PFRSQIT1,
24777 IX86_BUILTIN_PFRSQRT,
24778 IX86_BUILTIN_PFSUB,
24779 IX86_BUILTIN_PFSUBR,
24780 IX86_BUILTIN_PI2FD,
24781 IX86_BUILTIN_PMULHRW,
24782
24783 /* 3DNow! Athlon Extensions */
24784 IX86_BUILTIN_PF2IW,
24785 IX86_BUILTIN_PFNACC,
24786 IX86_BUILTIN_PFPNACC,
24787 IX86_BUILTIN_PI2FW,
24788 IX86_BUILTIN_PSWAPDSI,
24789 IX86_BUILTIN_PSWAPDSF,
24790
24791 /* SSE2 */
24792 IX86_BUILTIN_ADDPD,
24793 IX86_BUILTIN_ADDSD,
24794 IX86_BUILTIN_DIVPD,
24795 IX86_BUILTIN_DIVSD,
24796 IX86_BUILTIN_MULPD,
24797 IX86_BUILTIN_MULSD,
24798 IX86_BUILTIN_SUBPD,
24799 IX86_BUILTIN_SUBSD,
24800
24801 IX86_BUILTIN_CMPEQPD,
24802 IX86_BUILTIN_CMPLTPD,
24803 IX86_BUILTIN_CMPLEPD,
24804 IX86_BUILTIN_CMPGTPD,
24805 IX86_BUILTIN_CMPGEPD,
24806 IX86_BUILTIN_CMPNEQPD,
24807 IX86_BUILTIN_CMPNLTPD,
24808 IX86_BUILTIN_CMPNLEPD,
24809 IX86_BUILTIN_CMPNGTPD,
24810 IX86_BUILTIN_CMPNGEPD,
24811 IX86_BUILTIN_CMPORDPD,
24812 IX86_BUILTIN_CMPUNORDPD,
24813 IX86_BUILTIN_CMPEQSD,
24814 IX86_BUILTIN_CMPLTSD,
24815 IX86_BUILTIN_CMPLESD,
24816 IX86_BUILTIN_CMPNEQSD,
24817 IX86_BUILTIN_CMPNLTSD,
24818 IX86_BUILTIN_CMPNLESD,
24819 IX86_BUILTIN_CMPORDSD,
24820 IX86_BUILTIN_CMPUNORDSD,
24821
24822 IX86_BUILTIN_COMIEQSD,
24823 IX86_BUILTIN_COMILTSD,
24824 IX86_BUILTIN_COMILESD,
24825 IX86_BUILTIN_COMIGTSD,
24826 IX86_BUILTIN_COMIGESD,
24827 IX86_BUILTIN_COMINEQSD,
24828 IX86_BUILTIN_UCOMIEQSD,
24829 IX86_BUILTIN_UCOMILTSD,
24830 IX86_BUILTIN_UCOMILESD,
24831 IX86_BUILTIN_UCOMIGTSD,
24832 IX86_BUILTIN_UCOMIGESD,
24833 IX86_BUILTIN_UCOMINEQSD,
24834
24835 IX86_BUILTIN_MAXPD,
24836 IX86_BUILTIN_MAXSD,
24837 IX86_BUILTIN_MINPD,
24838 IX86_BUILTIN_MINSD,
24839
24840 IX86_BUILTIN_ANDPD,
24841 IX86_BUILTIN_ANDNPD,
24842 IX86_BUILTIN_ORPD,
24843 IX86_BUILTIN_XORPD,
24844
24845 IX86_BUILTIN_SQRTPD,
24846 IX86_BUILTIN_SQRTSD,
24847
24848 IX86_BUILTIN_UNPCKHPD,
24849 IX86_BUILTIN_UNPCKLPD,
24850
24851 IX86_BUILTIN_SHUFPD,
24852
24853 IX86_BUILTIN_LOADUPD,
24854 IX86_BUILTIN_STOREUPD,
24855 IX86_BUILTIN_MOVSD,
24856
24857 IX86_BUILTIN_LOADHPD,
24858 IX86_BUILTIN_LOADLPD,
24859
24860 IX86_BUILTIN_CVTDQ2PD,
24861 IX86_BUILTIN_CVTDQ2PS,
24862
24863 IX86_BUILTIN_CVTPD2DQ,
24864 IX86_BUILTIN_CVTPD2PI,
24865 IX86_BUILTIN_CVTPD2PS,
24866 IX86_BUILTIN_CVTTPD2DQ,
24867 IX86_BUILTIN_CVTTPD2PI,
24868
24869 IX86_BUILTIN_CVTPI2PD,
24870 IX86_BUILTIN_CVTSI2SD,
24871 IX86_BUILTIN_CVTSI642SD,
24872
24873 IX86_BUILTIN_CVTSD2SI,
24874 IX86_BUILTIN_CVTSD2SI64,
24875 IX86_BUILTIN_CVTSD2SS,
24876 IX86_BUILTIN_CVTSS2SD,
24877 IX86_BUILTIN_CVTTSD2SI,
24878 IX86_BUILTIN_CVTTSD2SI64,
24879
24880 IX86_BUILTIN_CVTPS2DQ,
24881 IX86_BUILTIN_CVTPS2PD,
24882 IX86_BUILTIN_CVTTPS2DQ,
24883
24884 IX86_BUILTIN_MOVNTI,
24885 IX86_BUILTIN_MOVNTI64,
24886 IX86_BUILTIN_MOVNTPD,
24887 IX86_BUILTIN_MOVNTDQ,
24888
24889 IX86_BUILTIN_MOVQ128,
24890
24891 /* SSE2 MMX */
24892 IX86_BUILTIN_MASKMOVDQU,
24893 IX86_BUILTIN_MOVMSKPD,
24894 IX86_BUILTIN_PMOVMSKB128,
24895
24896 IX86_BUILTIN_PACKSSWB128,
24897 IX86_BUILTIN_PACKSSDW128,
24898 IX86_BUILTIN_PACKUSWB128,
24899
24900 IX86_BUILTIN_PADDB128,
24901 IX86_BUILTIN_PADDW128,
24902 IX86_BUILTIN_PADDD128,
24903 IX86_BUILTIN_PADDQ128,
24904 IX86_BUILTIN_PADDSB128,
24905 IX86_BUILTIN_PADDSW128,
24906 IX86_BUILTIN_PADDUSB128,
24907 IX86_BUILTIN_PADDUSW128,
24908 IX86_BUILTIN_PSUBB128,
24909 IX86_BUILTIN_PSUBW128,
24910 IX86_BUILTIN_PSUBD128,
24911 IX86_BUILTIN_PSUBQ128,
24912 IX86_BUILTIN_PSUBSB128,
24913 IX86_BUILTIN_PSUBSW128,
24914 IX86_BUILTIN_PSUBUSB128,
24915 IX86_BUILTIN_PSUBUSW128,
24916
24917 IX86_BUILTIN_PAND128,
24918 IX86_BUILTIN_PANDN128,
24919 IX86_BUILTIN_POR128,
24920 IX86_BUILTIN_PXOR128,
24921
24922 IX86_BUILTIN_PAVGB128,
24923 IX86_BUILTIN_PAVGW128,
24924
24925 IX86_BUILTIN_PCMPEQB128,
24926 IX86_BUILTIN_PCMPEQW128,
24927 IX86_BUILTIN_PCMPEQD128,
24928 IX86_BUILTIN_PCMPGTB128,
24929 IX86_BUILTIN_PCMPGTW128,
24930 IX86_BUILTIN_PCMPGTD128,
24931
24932 IX86_BUILTIN_PMADDWD128,
24933
24934 IX86_BUILTIN_PMAXSW128,
24935 IX86_BUILTIN_PMAXUB128,
24936 IX86_BUILTIN_PMINSW128,
24937 IX86_BUILTIN_PMINUB128,
24938
24939 IX86_BUILTIN_PMULUDQ,
24940 IX86_BUILTIN_PMULUDQ128,
24941 IX86_BUILTIN_PMULHUW128,
24942 IX86_BUILTIN_PMULHW128,
24943 IX86_BUILTIN_PMULLW128,
24944
24945 IX86_BUILTIN_PSADBW128,
24946 IX86_BUILTIN_PSHUFHW,
24947 IX86_BUILTIN_PSHUFLW,
24948 IX86_BUILTIN_PSHUFD,
24949
24950 IX86_BUILTIN_PSLLDQI128,
24951 IX86_BUILTIN_PSLLWI128,
24952 IX86_BUILTIN_PSLLDI128,
24953 IX86_BUILTIN_PSLLQI128,
24954 IX86_BUILTIN_PSRAWI128,
24955 IX86_BUILTIN_PSRADI128,
24956 IX86_BUILTIN_PSRLDQI128,
24957 IX86_BUILTIN_PSRLWI128,
24958 IX86_BUILTIN_PSRLDI128,
24959 IX86_BUILTIN_PSRLQI128,
24960
24961 IX86_BUILTIN_PSLLDQ128,
24962 IX86_BUILTIN_PSLLW128,
24963 IX86_BUILTIN_PSLLD128,
24964 IX86_BUILTIN_PSLLQ128,
24965 IX86_BUILTIN_PSRAW128,
24966 IX86_BUILTIN_PSRAD128,
24967 IX86_BUILTIN_PSRLW128,
24968 IX86_BUILTIN_PSRLD128,
24969 IX86_BUILTIN_PSRLQ128,
24970
24971 IX86_BUILTIN_PUNPCKHBW128,
24972 IX86_BUILTIN_PUNPCKHWD128,
24973 IX86_BUILTIN_PUNPCKHDQ128,
24974 IX86_BUILTIN_PUNPCKHQDQ128,
24975 IX86_BUILTIN_PUNPCKLBW128,
24976 IX86_BUILTIN_PUNPCKLWD128,
24977 IX86_BUILTIN_PUNPCKLDQ128,
24978 IX86_BUILTIN_PUNPCKLQDQ128,
24979
24980 IX86_BUILTIN_CLFLUSH,
24981 IX86_BUILTIN_MFENCE,
24982 IX86_BUILTIN_LFENCE,
24983 IX86_BUILTIN_PAUSE,
24984
24985 IX86_BUILTIN_BSRSI,
24986 IX86_BUILTIN_BSRDI,
24987 IX86_BUILTIN_RDPMC,
24988 IX86_BUILTIN_RDTSC,
24989 IX86_BUILTIN_RDTSCP,
24990 IX86_BUILTIN_ROLQI,
24991 IX86_BUILTIN_ROLHI,
24992 IX86_BUILTIN_RORQI,
24993 IX86_BUILTIN_RORHI,
24994
24995 /* SSE3. */
24996 IX86_BUILTIN_ADDSUBPS,
24997 IX86_BUILTIN_HADDPS,
24998 IX86_BUILTIN_HSUBPS,
24999 IX86_BUILTIN_MOVSHDUP,
25000 IX86_BUILTIN_MOVSLDUP,
25001 IX86_BUILTIN_ADDSUBPD,
25002 IX86_BUILTIN_HADDPD,
25003 IX86_BUILTIN_HSUBPD,
25004 IX86_BUILTIN_LDDQU,
25005
25006 IX86_BUILTIN_MONITOR,
25007 IX86_BUILTIN_MWAIT,
25008
25009 /* SSSE3. */
25010 IX86_BUILTIN_PHADDW,
25011 IX86_BUILTIN_PHADDD,
25012 IX86_BUILTIN_PHADDSW,
25013 IX86_BUILTIN_PHSUBW,
25014 IX86_BUILTIN_PHSUBD,
25015 IX86_BUILTIN_PHSUBSW,
25016 IX86_BUILTIN_PMADDUBSW,
25017 IX86_BUILTIN_PMULHRSW,
25018 IX86_BUILTIN_PSHUFB,
25019 IX86_BUILTIN_PSIGNB,
25020 IX86_BUILTIN_PSIGNW,
25021 IX86_BUILTIN_PSIGND,
25022 IX86_BUILTIN_PALIGNR,
25023 IX86_BUILTIN_PABSB,
25024 IX86_BUILTIN_PABSW,
25025 IX86_BUILTIN_PABSD,
25026
25027 IX86_BUILTIN_PHADDW128,
25028 IX86_BUILTIN_PHADDD128,
25029 IX86_BUILTIN_PHADDSW128,
25030 IX86_BUILTIN_PHSUBW128,
25031 IX86_BUILTIN_PHSUBD128,
25032 IX86_BUILTIN_PHSUBSW128,
25033 IX86_BUILTIN_PMADDUBSW128,
25034 IX86_BUILTIN_PMULHRSW128,
25035 IX86_BUILTIN_PSHUFB128,
25036 IX86_BUILTIN_PSIGNB128,
25037 IX86_BUILTIN_PSIGNW128,
25038 IX86_BUILTIN_PSIGND128,
25039 IX86_BUILTIN_PALIGNR128,
25040 IX86_BUILTIN_PABSB128,
25041 IX86_BUILTIN_PABSW128,
25042 IX86_BUILTIN_PABSD128,
25043
25044 /* AMDFAM10 - SSE4A New Instructions. */
25045 IX86_BUILTIN_MOVNTSD,
25046 IX86_BUILTIN_MOVNTSS,
25047 IX86_BUILTIN_EXTRQI,
25048 IX86_BUILTIN_EXTRQ,
25049 IX86_BUILTIN_INSERTQI,
25050 IX86_BUILTIN_INSERTQ,
25051
25052 /* SSE4.1. */
25053 IX86_BUILTIN_BLENDPD,
25054 IX86_BUILTIN_BLENDPS,
25055 IX86_BUILTIN_BLENDVPD,
25056 IX86_BUILTIN_BLENDVPS,
25057 IX86_BUILTIN_PBLENDVB128,
25058 IX86_BUILTIN_PBLENDW128,
25059
25060 IX86_BUILTIN_DPPD,
25061 IX86_BUILTIN_DPPS,
25062
25063 IX86_BUILTIN_INSERTPS128,
25064
25065 IX86_BUILTIN_MOVNTDQA,
25066 IX86_BUILTIN_MPSADBW128,
25067 IX86_BUILTIN_PACKUSDW128,
25068 IX86_BUILTIN_PCMPEQQ,
25069 IX86_BUILTIN_PHMINPOSUW128,
25070
25071 IX86_BUILTIN_PMAXSB128,
25072 IX86_BUILTIN_PMAXSD128,
25073 IX86_BUILTIN_PMAXUD128,
25074 IX86_BUILTIN_PMAXUW128,
25075
25076 IX86_BUILTIN_PMINSB128,
25077 IX86_BUILTIN_PMINSD128,
25078 IX86_BUILTIN_PMINUD128,
25079 IX86_BUILTIN_PMINUW128,
25080
25081 IX86_BUILTIN_PMOVSXBW128,
25082 IX86_BUILTIN_PMOVSXBD128,
25083 IX86_BUILTIN_PMOVSXBQ128,
25084 IX86_BUILTIN_PMOVSXWD128,
25085 IX86_BUILTIN_PMOVSXWQ128,
25086 IX86_BUILTIN_PMOVSXDQ128,
25087
25088 IX86_BUILTIN_PMOVZXBW128,
25089 IX86_BUILTIN_PMOVZXBD128,
25090 IX86_BUILTIN_PMOVZXBQ128,
25091 IX86_BUILTIN_PMOVZXWD128,
25092 IX86_BUILTIN_PMOVZXWQ128,
25093 IX86_BUILTIN_PMOVZXDQ128,
25094
25095 IX86_BUILTIN_PMULDQ128,
25096 IX86_BUILTIN_PMULLD128,
25097
25098 IX86_BUILTIN_ROUNDSD,
25099 IX86_BUILTIN_ROUNDSS,
25100
25101 IX86_BUILTIN_ROUNDPD,
25102 IX86_BUILTIN_ROUNDPS,
25103
25104 IX86_BUILTIN_FLOORPD,
25105 IX86_BUILTIN_CEILPD,
25106 IX86_BUILTIN_TRUNCPD,
25107 IX86_BUILTIN_RINTPD,
25108 IX86_BUILTIN_ROUNDPD_AZ,
25109
25110 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25111 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25112 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25113
25114 IX86_BUILTIN_FLOORPS,
25115 IX86_BUILTIN_CEILPS,
25116 IX86_BUILTIN_TRUNCPS,
25117 IX86_BUILTIN_RINTPS,
25118 IX86_BUILTIN_ROUNDPS_AZ,
25119
25120 IX86_BUILTIN_FLOORPS_SFIX,
25121 IX86_BUILTIN_CEILPS_SFIX,
25122 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25123
25124 IX86_BUILTIN_PTESTZ,
25125 IX86_BUILTIN_PTESTC,
25126 IX86_BUILTIN_PTESTNZC,
25127
25128 IX86_BUILTIN_VEC_INIT_V2SI,
25129 IX86_BUILTIN_VEC_INIT_V4HI,
25130 IX86_BUILTIN_VEC_INIT_V8QI,
25131 IX86_BUILTIN_VEC_EXT_V2DF,
25132 IX86_BUILTIN_VEC_EXT_V2DI,
25133 IX86_BUILTIN_VEC_EXT_V4SF,
25134 IX86_BUILTIN_VEC_EXT_V4SI,
25135 IX86_BUILTIN_VEC_EXT_V8HI,
25136 IX86_BUILTIN_VEC_EXT_V2SI,
25137 IX86_BUILTIN_VEC_EXT_V4HI,
25138 IX86_BUILTIN_VEC_EXT_V16QI,
25139 IX86_BUILTIN_VEC_SET_V2DI,
25140 IX86_BUILTIN_VEC_SET_V4SF,
25141 IX86_BUILTIN_VEC_SET_V4SI,
25142 IX86_BUILTIN_VEC_SET_V8HI,
25143 IX86_BUILTIN_VEC_SET_V4HI,
25144 IX86_BUILTIN_VEC_SET_V16QI,
25145
25146 IX86_BUILTIN_VEC_PACK_SFIX,
25147 IX86_BUILTIN_VEC_PACK_SFIX256,
25148
25149 /* SSE4.2. */
25150 IX86_BUILTIN_CRC32QI,
25151 IX86_BUILTIN_CRC32HI,
25152 IX86_BUILTIN_CRC32SI,
25153 IX86_BUILTIN_CRC32DI,
25154
25155 IX86_BUILTIN_PCMPESTRI128,
25156 IX86_BUILTIN_PCMPESTRM128,
25157 IX86_BUILTIN_PCMPESTRA128,
25158 IX86_BUILTIN_PCMPESTRC128,
25159 IX86_BUILTIN_PCMPESTRO128,
25160 IX86_BUILTIN_PCMPESTRS128,
25161 IX86_BUILTIN_PCMPESTRZ128,
25162 IX86_BUILTIN_PCMPISTRI128,
25163 IX86_BUILTIN_PCMPISTRM128,
25164 IX86_BUILTIN_PCMPISTRA128,
25165 IX86_BUILTIN_PCMPISTRC128,
25166 IX86_BUILTIN_PCMPISTRO128,
25167 IX86_BUILTIN_PCMPISTRS128,
25168 IX86_BUILTIN_PCMPISTRZ128,
25169
25170 IX86_BUILTIN_PCMPGTQ,
25171
25172 /* AES instructions */
25173 IX86_BUILTIN_AESENC128,
25174 IX86_BUILTIN_AESENCLAST128,
25175 IX86_BUILTIN_AESDEC128,
25176 IX86_BUILTIN_AESDECLAST128,
25177 IX86_BUILTIN_AESIMC128,
25178 IX86_BUILTIN_AESKEYGENASSIST128,
25179
25180 /* PCLMUL instruction */
25181 IX86_BUILTIN_PCLMULQDQ128,
25182
25183 /* AVX */
25184 IX86_BUILTIN_ADDPD256,
25185 IX86_BUILTIN_ADDPS256,
25186 IX86_BUILTIN_ADDSUBPD256,
25187 IX86_BUILTIN_ADDSUBPS256,
25188 IX86_BUILTIN_ANDPD256,
25189 IX86_BUILTIN_ANDPS256,
25190 IX86_BUILTIN_ANDNPD256,
25191 IX86_BUILTIN_ANDNPS256,
25192 IX86_BUILTIN_BLENDPD256,
25193 IX86_BUILTIN_BLENDPS256,
25194 IX86_BUILTIN_BLENDVPD256,
25195 IX86_BUILTIN_BLENDVPS256,
25196 IX86_BUILTIN_DIVPD256,
25197 IX86_BUILTIN_DIVPS256,
25198 IX86_BUILTIN_DPPS256,
25199 IX86_BUILTIN_HADDPD256,
25200 IX86_BUILTIN_HADDPS256,
25201 IX86_BUILTIN_HSUBPD256,
25202 IX86_BUILTIN_HSUBPS256,
25203 IX86_BUILTIN_MAXPD256,
25204 IX86_BUILTIN_MAXPS256,
25205 IX86_BUILTIN_MINPD256,
25206 IX86_BUILTIN_MINPS256,
25207 IX86_BUILTIN_MULPD256,
25208 IX86_BUILTIN_MULPS256,
25209 IX86_BUILTIN_ORPD256,
25210 IX86_BUILTIN_ORPS256,
25211 IX86_BUILTIN_SHUFPD256,
25212 IX86_BUILTIN_SHUFPS256,
25213 IX86_BUILTIN_SUBPD256,
25214 IX86_BUILTIN_SUBPS256,
25215 IX86_BUILTIN_XORPD256,
25216 IX86_BUILTIN_XORPS256,
25217 IX86_BUILTIN_CMPSD,
25218 IX86_BUILTIN_CMPSS,
25219 IX86_BUILTIN_CMPPD,
25220 IX86_BUILTIN_CMPPS,
25221 IX86_BUILTIN_CMPPD256,
25222 IX86_BUILTIN_CMPPS256,
25223 IX86_BUILTIN_CVTDQ2PD256,
25224 IX86_BUILTIN_CVTDQ2PS256,
25225 IX86_BUILTIN_CVTPD2PS256,
25226 IX86_BUILTIN_CVTPS2DQ256,
25227 IX86_BUILTIN_CVTPS2PD256,
25228 IX86_BUILTIN_CVTTPD2DQ256,
25229 IX86_BUILTIN_CVTPD2DQ256,
25230 IX86_BUILTIN_CVTTPS2DQ256,
25231 IX86_BUILTIN_EXTRACTF128PD256,
25232 IX86_BUILTIN_EXTRACTF128PS256,
25233 IX86_BUILTIN_EXTRACTF128SI256,
25234 IX86_BUILTIN_VZEROALL,
25235 IX86_BUILTIN_VZEROUPPER,
25236 IX86_BUILTIN_VPERMILVARPD,
25237 IX86_BUILTIN_VPERMILVARPS,
25238 IX86_BUILTIN_VPERMILVARPD256,
25239 IX86_BUILTIN_VPERMILVARPS256,
25240 IX86_BUILTIN_VPERMILPD,
25241 IX86_BUILTIN_VPERMILPS,
25242 IX86_BUILTIN_VPERMILPD256,
25243 IX86_BUILTIN_VPERMILPS256,
25244 IX86_BUILTIN_VPERMIL2PD,
25245 IX86_BUILTIN_VPERMIL2PS,
25246 IX86_BUILTIN_VPERMIL2PD256,
25247 IX86_BUILTIN_VPERMIL2PS256,
25248 IX86_BUILTIN_VPERM2F128PD256,
25249 IX86_BUILTIN_VPERM2F128PS256,
25250 IX86_BUILTIN_VPERM2F128SI256,
25251 IX86_BUILTIN_VBROADCASTSS,
25252 IX86_BUILTIN_VBROADCASTSD256,
25253 IX86_BUILTIN_VBROADCASTSS256,
25254 IX86_BUILTIN_VBROADCASTPD256,
25255 IX86_BUILTIN_VBROADCASTPS256,
25256 IX86_BUILTIN_VINSERTF128PD256,
25257 IX86_BUILTIN_VINSERTF128PS256,
25258 IX86_BUILTIN_VINSERTF128SI256,
25259 IX86_BUILTIN_LOADUPD256,
25260 IX86_BUILTIN_LOADUPS256,
25261 IX86_BUILTIN_STOREUPD256,
25262 IX86_BUILTIN_STOREUPS256,
25263 IX86_BUILTIN_LDDQU256,
25264 IX86_BUILTIN_MOVNTDQ256,
25265 IX86_BUILTIN_MOVNTPD256,
25266 IX86_BUILTIN_MOVNTPS256,
25267 IX86_BUILTIN_LOADDQU256,
25268 IX86_BUILTIN_STOREDQU256,
25269 IX86_BUILTIN_MASKLOADPD,
25270 IX86_BUILTIN_MASKLOADPS,
25271 IX86_BUILTIN_MASKSTOREPD,
25272 IX86_BUILTIN_MASKSTOREPS,
25273 IX86_BUILTIN_MASKLOADPD256,
25274 IX86_BUILTIN_MASKLOADPS256,
25275 IX86_BUILTIN_MASKSTOREPD256,
25276 IX86_BUILTIN_MASKSTOREPS256,
25277 IX86_BUILTIN_MOVSHDUP256,
25278 IX86_BUILTIN_MOVSLDUP256,
25279 IX86_BUILTIN_MOVDDUP256,
25280
25281 IX86_BUILTIN_SQRTPD256,
25282 IX86_BUILTIN_SQRTPS256,
25283 IX86_BUILTIN_SQRTPS_NR256,
25284 IX86_BUILTIN_RSQRTPS256,
25285 IX86_BUILTIN_RSQRTPS_NR256,
25286
25287 IX86_BUILTIN_RCPPS256,
25288
25289 IX86_BUILTIN_ROUNDPD256,
25290 IX86_BUILTIN_ROUNDPS256,
25291
25292 IX86_BUILTIN_FLOORPD256,
25293 IX86_BUILTIN_CEILPD256,
25294 IX86_BUILTIN_TRUNCPD256,
25295 IX86_BUILTIN_RINTPD256,
25296 IX86_BUILTIN_ROUNDPD_AZ256,
25297
25298 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25299 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25300 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25301
25302 IX86_BUILTIN_FLOORPS256,
25303 IX86_BUILTIN_CEILPS256,
25304 IX86_BUILTIN_TRUNCPS256,
25305 IX86_BUILTIN_RINTPS256,
25306 IX86_BUILTIN_ROUNDPS_AZ256,
25307
25308 IX86_BUILTIN_FLOORPS_SFIX256,
25309 IX86_BUILTIN_CEILPS_SFIX256,
25310 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25311
25312 IX86_BUILTIN_UNPCKHPD256,
25313 IX86_BUILTIN_UNPCKLPD256,
25314 IX86_BUILTIN_UNPCKHPS256,
25315 IX86_BUILTIN_UNPCKLPS256,
25316
25317 IX86_BUILTIN_SI256_SI,
25318 IX86_BUILTIN_PS256_PS,
25319 IX86_BUILTIN_PD256_PD,
25320 IX86_BUILTIN_SI_SI256,
25321 IX86_BUILTIN_PS_PS256,
25322 IX86_BUILTIN_PD_PD256,
25323
25324 IX86_BUILTIN_VTESTZPD,
25325 IX86_BUILTIN_VTESTCPD,
25326 IX86_BUILTIN_VTESTNZCPD,
25327 IX86_BUILTIN_VTESTZPS,
25328 IX86_BUILTIN_VTESTCPS,
25329 IX86_BUILTIN_VTESTNZCPS,
25330 IX86_BUILTIN_VTESTZPD256,
25331 IX86_BUILTIN_VTESTCPD256,
25332 IX86_BUILTIN_VTESTNZCPD256,
25333 IX86_BUILTIN_VTESTZPS256,
25334 IX86_BUILTIN_VTESTCPS256,
25335 IX86_BUILTIN_VTESTNZCPS256,
25336 IX86_BUILTIN_PTESTZ256,
25337 IX86_BUILTIN_PTESTC256,
25338 IX86_BUILTIN_PTESTNZC256,
25339
25340 IX86_BUILTIN_MOVMSKPD256,
25341 IX86_BUILTIN_MOVMSKPS256,
25342
25343 /* AVX2 */
25344 IX86_BUILTIN_MPSADBW256,
25345 IX86_BUILTIN_PABSB256,
25346 IX86_BUILTIN_PABSW256,
25347 IX86_BUILTIN_PABSD256,
25348 IX86_BUILTIN_PACKSSDW256,
25349 IX86_BUILTIN_PACKSSWB256,
25350 IX86_BUILTIN_PACKUSDW256,
25351 IX86_BUILTIN_PACKUSWB256,
25352 IX86_BUILTIN_PADDB256,
25353 IX86_BUILTIN_PADDW256,
25354 IX86_BUILTIN_PADDD256,
25355 IX86_BUILTIN_PADDQ256,
25356 IX86_BUILTIN_PADDSB256,
25357 IX86_BUILTIN_PADDSW256,
25358 IX86_BUILTIN_PADDUSB256,
25359 IX86_BUILTIN_PADDUSW256,
25360 IX86_BUILTIN_PALIGNR256,
25361 IX86_BUILTIN_AND256I,
25362 IX86_BUILTIN_ANDNOT256I,
25363 IX86_BUILTIN_PAVGB256,
25364 IX86_BUILTIN_PAVGW256,
25365 IX86_BUILTIN_PBLENDVB256,
25366 IX86_BUILTIN_PBLENDVW256,
25367 IX86_BUILTIN_PCMPEQB256,
25368 IX86_BUILTIN_PCMPEQW256,
25369 IX86_BUILTIN_PCMPEQD256,
25370 IX86_BUILTIN_PCMPEQQ256,
25371 IX86_BUILTIN_PCMPGTB256,
25372 IX86_BUILTIN_PCMPGTW256,
25373 IX86_BUILTIN_PCMPGTD256,
25374 IX86_BUILTIN_PCMPGTQ256,
25375 IX86_BUILTIN_PHADDW256,
25376 IX86_BUILTIN_PHADDD256,
25377 IX86_BUILTIN_PHADDSW256,
25378 IX86_BUILTIN_PHSUBW256,
25379 IX86_BUILTIN_PHSUBD256,
25380 IX86_BUILTIN_PHSUBSW256,
25381 IX86_BUILTIN_PMADDUBSW256,
25382 IX86_BUILTIN_PMADDWD256,
25383 IX86_BUILTIN_PMAXSB256,
25384 IX86_BUILTIN_PMAXSW256,
25385 IX86_BUILTIN_PMAXSD256,
25386 IX86_BUILTIN_PMAXUB256,
25387 IX86_BUILTIN_PMAXUW256,
25388 IX86_BUILTIN_PMAXUD256,
25389 IX86_BUILTIN_PMINSB256,
25390 IX86_BUILTIN_PMINSW256,
25391 IX86_BUILTIN_PMINSD256,
25392 IX86_BUILTIN_PMINUB256,
25393 IX86_BUILTIN_PMINUW256,
25394 IX86_BUILTIN_PMINUD256,
25395 IX86_BUILTIN_PMOVMSKB256,
25396 IX86_BUILTIN_PMOVSXBW256,
25397 IX86_BUILTIN_PMOVSXBD256,
25398 IX86_BUILTIN_PMOVSXBQ256,
25399 IX86_BUILTIN_PMOVSXWD256,
25400 IX86_BUILTIN_PMOVSXWQ256,
25401 IX86_BUILTIN_PMOVSXDQ256,
25402 IX86_BUILTIN_PMOVZXBW256,
25403 IX86_BUILTIN_PMOVZXBD256,
25404 IX86_BUILTIN_PMOVZXBQ256,
25405 IX86_BUILTIN_PMOVZXWD256,
25406 IX86_BUILTIN_PMOVZXWQ256,
25407 IX86_BUILTIN_PMOVZXDQ256,
25408 IX86_BUILTIN_PMULDQ256,
25409 IX86_BUILTIN_PMULHRSW256,
25410 IX86_BUILTIN_PMULHUW256,
25411 IX86_BUILTIN_PMULHW256,
25412 IX86_BUILTIN_PMULLW256,
25413 IX86_BUILTIN_PMULLD256,
25414 IX86_BUILTIN_PMULUDQ256,
25415 IX86_BUILTIN_POR256,
25416 IX86_BUILTIN_PSADBW256,
25417 IX86_BUILTIN_PSHUFB256,
25418 IX86_BUILTIN_PSHUFD256,
25419 IX86_BUILTIN_PSHUFHW256,
25420 IX86_BUILTIN_PSHUFLW256,
25421 IX86_BUILTIN_PSIGNB256,
25422 IX86_BUILTIN_PSIGNW256,
25423 IX86_BUILTIN_PSIGND256,
25424 IX86_BUILTIN_PSLLDQI256,
25425 IX86_BUILTIN_PSLLWI256,
25426 IX86_BUILTIN_PSLLW256,
25427 IX86_BUILTIN_PSLLDI256,
25428 IX86_BUILTIN_PSLLD256,
25429 IX86_BUILTIN_PSLLQI256,
25430 IX86_BUILTIN_PSLLQ256,
25431 IX86_BUILTIN_PSRAWI256,
25432 IX86_BUILTIN_PSRAW256,
25433 IX86_BUILTIN_PSRADI256,
25434 IX86_BUILTIN_PSRAD256,
25435 IX86_BUILTIN_PSRLDQI256,
25436 IX86_BUILTIN_PSRLWI256,
25437 IX86_BUILTIN_PSRLW256,
25438 IX86_BUILTIN_PSRLDI256,
25439 IX86_BUILTIN_PSRLD256,
25440 IX86_BUILTIN_PSRLQI256,
25441 IX86_BUILTIN_PSRLQ256,
25442 IX86_BUILTIN_PSUBB256,
25443 IX86_BUILTIN_PSUBW256,
25444 IX86_BUILTIN_PSUBD256,
25445 IX86_BUILTIN_PSUBQ256,
25446 IX86_BUILTIN_PSUBSB256,
25447 IX86_BUILTIN_PSUBSW256,
25448 IX86_BUILTIN_PSUBUSB256,
25449 IX86_BUILTIN_PSUBUSW256,
25450 IX86_BUILTIN_PUNPCKHBW256,
25451 IX86_BUILTIN_PUNPCKHWD256,
25452 IX86_BUILTIN_PUNPCKHDQ256,
25453 IX86_BUILTIN_PUNPCKHQDQ256,
25454 IX86_BUILTIN_PUNPCKLBW256,
25455 IX86_BUILTIN_PUNPCKLWD256,
25456 IX86_BUILTIN_PUNPCKLDQ256,
25457 IX86_BUILTIN_PUNPCKLQDQ256,
25458 IX86_BUILTIN_PXOR256,
25459 IX86_BUILTIN_MOVNTDQA256,
25460 IX86_BUILTIN_VBROADCASTSS_PS,
25461 IX86_BUILTIN_VBROADCASTSS_PS256,
25462 IX86_BUILTIN_VBROADCASTSD_PD256,
25463 IX86_BUILTIN_VBROADCASTSI256,
25464 IX86_BUILTIN_PBLENDD256,
25465 IX86_BUILTIN_PBLENDD128,
25466 IX86_BUILTIN_PBROADCASTB256,
25467 IX86_BUILTIN_PBROADCASTW256,
25468 IX86_BUILTIN_PBROADCASTD256,
25469 IX86_BUILTIN_PBROADCASTQ256,
25470 IX86_BUILTIN_PBROADCASTB128,
25471 IX86_BUILTIN_PBROADCASTW128,
25472 IX86_BUILTIN_PBROADCASTD128,
25473 IX86_BUILTIN_PBROADCASTQ128,
25474 IX86_BUILTIN_VPERMVARSI256,
25475 IX86_BUILTIN_VPERMDF256,
25476 IX86_BUILTIN_VPERMVARSF256,
25477 IX86_BUILTIN_VPERMDI256,
25478 IX86_BUILTIN_VPERMTI256,
25479 IX86_BUILTIN_VEXTRACT128I256,
25480 IX86_BUILTIN_VINSERT128I256,
25481 IX86_BUILTIN_MASKLOADD,
25482 IX86_BUILTIN_MASKLOADQ,
25483 IX86_BUILTIN_MASKLOADD256,
25484 IX86_BUILTIN_MASKLOADQ256,
25485 IX86_BUILTIN_MASKSTORED,
25486 IX86_BUILTIN_MASKSTOREQ,
25487 IX86_BUILTIN_MASKSTORED256,
25488 IX86_BUILTIN_MASKSTOREQ256,
25489 IX86_BUILTIN_PSLLVV4DI,
25490 IX86_BUILTIN_PSLLVV2DI,
25491 IX86_BUILTIN_PSLLVV8SI,
25492 IX86_BUILTIN_PSLLVV4SI,
25493 IX86_BUILTIN_PSRAVV8SI,
25494 IX86_BUILTIN_PSRAVV4SI,
25495 IX86_BUILTIN_PSRLVV4DI,
25496 IX86_BUILTIN_PSRLVV2DI,
25497 IX86_BUILTIN_PSRLVV8SI,
25498 IX86_BUILTIN_PSRLVV4SI,
25499
25500 IX86_BUILTIN_GATHERSIV2DF,
25501 IX86_BUILTIN_GATHERSIV4DF,
25502 IX86_BUILTIN_GATHERDIV2DF,
25503 IX86_BUILTIN_GATHERDIV4DF,
25504 IX86_BUILTIN_GATHERSIV4SF,
25505 IX86_BUILTIN_GATHERSIV8SF,
25506 IX86_BUILTIN_GATHERDIV4SF,
25507 IX86_BUILTIN_GATHERDIV8SF,
25508 IX86_BUILTIN_GATHERSIV2DI,
25509 IX86_BUILTIN_GATHERSIV4DI,
25510 IX86_BUILTIN_GATHERDIV2DI,
25511 IX86_BUILTIN_GATHERDIV4DI,
25512 IX86_BUILTIN_GATHERSIV4SI,
25513 IX86_BUILTIN_GATHERSIV8SI,
25514 IX86_BUILTIN_GATHERDIV4SI,
25515 IX86_BUILTIN_GATHERDIV8SI,
25516
25517 /* Alternate 4-element gather for the vectorizer where
25518 all operands are 32-byte wide. */
25519 IX86_BUILTIN_GATHERALTSIV4DF,
25520 IX86_BUILTIN_GATHERALTDIV8SF,
25521 IX86_BUILTIN_GATHERALTSIV4DI,
25522 IX86_BUILTIN_GATHERALTDIV8SI,
25523
25524 /* TFmode support builtins. */
25525 IX86_BUILTIN_INFQ,
25526 IX86_BUILTIN_HUGE_VALQ,
25527 IX86_BUILTIN_FABSQ,
25528 IX86_BUILTIN_COPYSIGNQ,
25529
25530 /* Vectorizer support builtins. */
25531 IX86_BUILTIN_CPYSGNPS,
25532 IX86_BUILTIN_CPYSGNPD,
25533 IX86_BUILTIN_CPYSGNPS256,
25534 IX86_BUILTIN_CPYSGNPD256,
25535
25536 /* FMA4 instructions. */
25537 IX86_BUILTIN_VFMADDSS,
25538 IX86_BUILTIN_VFMADDSD,
25539 IX86_BUILTIN_VFMADDPS,
25540 IX86_BUILTIN_VFMADDPD,
25541 IX86_BUILTIN_VFMADDPS256,
25542 IX86_BUILTIN_VFMADDPD256,
25543 IX86_BUILTIN_VFMADDSUBPS,
25544 IX86_BUILTIN_VFMADDSUBPD,
25545 IX86_BUILTIN_VFMADDSUBPS256,
25546 IX86_BUILTIN_VFMADDSUBPD256,
25547
25548 /* FMA3 instructions. */
25549 IX86_BUILTIN_VFMADDSS3,
25550 IX86_BUILTIN_VFMADDSD3,
25551
25552 /* XOP instructions. */
25553 IX86_BUILTIN_VPCMOV,
25554 IX86_BUILTIN_VPCMOV_V2DI,
25555 IX86_BUILTIN_VPCMOV_V4SI,
25556 IX86_BUILTIN_VPCMOV_V8HI,
25557 IX86_BUILTIN_VPCMOV_V16QI,
25558 IX86_BUILTIN_VPCMOV_V4SF,
25559 IX86_BUILTIN_VPCMOV_V2DF,
25560 IX86_BUILTIN_VPCMOV256,
25561 IX86_BUILTIN_VPCMOV_V4DI256,
25562 IX86_BUILTIN_VPCMOV_V8SI256,
25563 IX86_BUILTIN_VPCMOV_V16HI256,
25564 IX86_BUILTIN_VPCMOV_V32QI256,
25565 IX86_BUILTIN_VPCMOV_V8SF256,
25566 IX86_BUILTIN_VPCMOV_V4DF256,
25567
25568 IX86_BUILTIN_VPPERM,
25569
25570 IX86_BUILTIN_VPMACSSWW,
25571 IX86_BUILTIN_VPMACSWW,
25572 IX86_BUILTIN_VPMACSSWD,
25573 IX86_BUILTIN_VPMACSWD,
25574 IX86_BUILTIN_VPMACSSDD,
25575 IX86_BUILTIN_VPMACSDD,
25576 IX86_BUILTIN_VPMACSSDQL,
25577 IX86_BUILTIN_VPMACSSDQH,
25578 IX86_BUILTIN_VPMACSDQL,
25579 IX86_BUILTIN_VPMACSDQH,
25580 IX86_BUILTIN_VPMADCSSWD,
25581 IX86_BUILTIN_VPMADCSWD,
25582
25583 IX86_BUILTIN_VPHADDBW,
25584 IX86_BUILTIN_VPHADDBD,
25585 IX86_BUILTIN_VPHADDBQ,
25586 IX86_BUILTIN_VPHADDWD,
25587 IX86_BUILTIN_VPHADDWQ,
25588 IX86_BUILTIN_VPHADDDQ,
25589 IX86_BUILTIN_VPHADDUBW,
25590 IX86_BUILTIN_VPHADDUBD,
25591 IX86_BUILTIN_VPHADDUBQ,
25592 IX86_BUILTIN_VPHADDUWD,
25593 IX86_BUILTIN_VPHADDUWQ,
25594 IX86_BUILTIN_VPHADDUDQ,
25595 IX86_BUILTIN_VPHSUBBW,
25596 IX86_BUILTIN_VPHSUBWD,
25597 IX86_BUILTIN_VPHSUBDQ,
25598
25599 IX86_BUILTIN_VPROTB,
25600 IX86_BUILTIN_VPROTW,
25601 IX86_BUILTIN_VPROTD,
25602 IX86_BUILTIN_VPROTQ,
25603 IX86_BUILTIN_VPROTB_IMM,
25604 IX86_BUILTIN_VPROTW_IMM,
25605 IX86_BUILTIN_VPROTD_IMM,
25606 IX86_BUILTIN_VPROTQ_IMM,
25607
25608 IX86_BUILTIN_VPSHLB,
25609 IX86_BUILTIN_VPSHLW,
25610 IX86_BUILTIN_VPSHLD,
25611 IX86_BUILTIN_VPSHLQ,
25612 IX86_BUILTIN_VPSHAB,
25613 IX86_BUILTIN_VPSHAW,
25614 IX86_BUILTIN_VPSHAD,
25615 IX86_BUILTIN_VPSHAQ,
25616
25617 IX86_BUILTIN_VFRCZSS,
25618 IX86_BUILTIN_VFRCZSD,
25619 IX86_BUILTIN_VFRCZPS,
25620 IX86_BUILTIN_VFRCZPD,
25621 IX86_BUILTIN_VFRCZPS256,
25622 IX86_BUILTIN_VFRCZPD256,
25623
25624 IX86_BUILTIN_VPCOMEQUB,
25625 IX86_BUILTIN_VPCOMNEUB,
25626 IX86_BUILTIN_VPCOMLTUB,
25627 IX86_BUILTIN_VPCOMLEUB,
25628 IX86_BUILTIN_VPCOMGTUB,
25629 IX86_BUILTIN_VPCOMGEUB,
25630 IX86_BUILTIN_VPCOMFALSEUB,
25631 IX86_BUILTIN_VPCOMTRUEUB,
25632
25633 IX86_BUILTIN_VPCOMEQUW,
25634 IX86_BUILTIN_VPCOMNEUW,
25635 IX86_BUILTIN_VPCOMLTUW,
25636 IX86_BUILTIN_VPCOMLEUW,
25637 IX86_BUILTIN_VPCOMGTUW,
25638 IX86_BUILTIN_VPCOMGEUW,
25639 IX86_BUILTIN_VPCOMFALSEUW,
25640 IX86_BUILTIN_VPCOMTRUEUW,
25641
25642 IX86_BUILTIN_VPCOMEQUD,
25643 IX86_BUILTIN_VPCOMNEUD,
25644 IX86_BUILTIN_VPCOMLTUD,
25645 IX86_BUILTIN_VPCOMLEUD,
25646 IX86_BUILTIN_VPCOMGTUD,
25647 IX86_BUILTIN_VPCOMGEUD,
25648 IX86_BUILTIN_VPCOMFALSEUD,
25649 IX86_BUILTIN_VPCOMTRUEUD,
25650
25651 IX86_BUILTIN_VPCOMEQUQ,
25652 IX86_BUILTIN_VPCOMNEUQ,
25653 IX86_BUILTIN_VPCOMLTUQ,
25654 IX86_BUILTIN_VPCOMLEUQ,
25655 IX86_BUILTIN_VPCOMGTUQ,
25656 IX86_BUILTIN_VPCOMGEUQ,
25657 IX86_BUILTIN_VPCOMFALSEUQ,
25658 IX86_BUILTIN_VPCOMTRUEUQ,
25659
25660 IX86_BUILTIN_VPCOMEQB,
25661 IX86_BUILTIN_VPCOMNEB,
25662 IX86_BUILTIN_VPCOMLTB,
25663 IX86_BUILTIN_VPCOMLEB,
25664 IX86_BUILTIN_VPCOMGTB,
25665 IX86_BUILTIN_VPCOMGEB,
25666 IX86_BUILTIN_VPCOMFALSEB,
25667 IX86_BUILTIN_VPCOMTRUEB,
25668
25669 IX86_BUILTIN_VPCOMEQW,
25670 IX86_BUILTIN_VPCOMNEW,
25671 IX86_BUILTIN_VPCOMLTW,
25672 IX86_BUILTIN_VPCOMLEW,
25673 IX86_BUILTIN_VPCOMGTW,
25674 IX86_BUILTIN_VPCOMGEW,
25675 IX86_BUILTIN_VPCOMFALSEW,
25676 IX86_BUILTIN_VPCOMTRUEW,
25677
25678 IX86_BUILTIN_VPCOMEQD,
25679 IX86_BUILTIN_VPCOMNED,
25680 IX86_BUILTIN_VPCOMLTD,
25681 IX86_BUILTIN_VPCOMLED,
25682 IX86_BUILTIN_VPCOMGTD,
25683 IX86_BUILTIN_VPCOMGED,
25684 IX86_BUILTIN_VPCOMFALSED,
25685 IX86_BUILTIN_VPCOMTRUED,
25686
25687 IX86_BUILTIN_VPCOMEQQ,
25688 IX86_BUILTIN_VPCOMNEQ,
25689 IX86_BUILTIN_VPCOMLTQ,
25690 IX86_BUILTIN_VPCOMLEQ,
25691 IX86_BUILTIN_VPCOMGTQ,
25692 IX86_BUILTIN_VPCOMGEQ,
25693 IX86_BUILTIN_VPCOMFALSEQ,
25694 IX86_BUILTIN_VPCOMTRUEQ,
25695
25696 /* LWP instructions. */
25697 IX86_BUILTIN_LLWPCB,
25698 IX86_BUILTIN_SLWPCB,
25699 IX86_BUILTIN_LWPVAL32,
25700 IX86_BUILTIN_LWPVAL64,
25701 IX86_BUILTIN_LWPINS32,
25702 IX86_BUILTIN_LWPINS64,
25703
25704 IX86_BUILTIN_CLZS,
25705
25706 /* RTM */
25707 IX86_BUILTIN_XBEGIN,
25708 IX86_BUILTIN_XEND,
25709 IX86_BUILTIN_XABORT,
25710 IX86_BUILTIN_XTEST,
25711
25712 /* BMI instructions. */
25713 IX86_BUILTIN_BEXTR32,
25714 IX86_BUILTIN_BEXTR64,
25715 IX86_BUILTIN_CTZS,
25716
25717 /* TBM instructions. */
25718 IX86_BUILTIN_BEXTRI32,
25719 IX86_BUILTIN_BEXTRI64,
25720
25721 /* BMI2 instructions. */
25722 IX86_BUILTIN_BZHI32,
25723 IX86_BUILTIN_BZHI64,
25724 IX86_BUILTIN_PDEP32,
25725 IX86_BUILTIN_PDEP64,
25726 IX86_BUILTIN_PEXT32,
25727 IX86_BUILTIN_PEXT64,
25728
25729 /* FSGSBASE instructions. */
25730 IX86_BUILTIN_RDFSBASE32,
25731 IX86_BUILTIN_RDFSBASE64,
25732 IX86_BUILTIN_RDGSBASE32,
25733 IX86_BUILTIN_RDGSBASE64,
25734 IX86_BUILTIN_WRFSBASE32,
25735 IX86_BUILTIN_WRFSBASE64,
25736 IX86_BUILTIN_WRGSBASE32,
25737 IX86_BUILTIN_WRGSBASE64,
25738
25739 /* RDRND instructions. */
25740 IX86_BUILTIN_RDRAND16_STEP,
25741 IX86_BUILTIN_RDRAND32_STEP,
25742 IX86_BUILTIN_RDRAND64_STEP,
25743
25744 /* F16C instructions. */
25745 IX86_BUILTIN_CVTPH2PS,
25746 IX86_BUILTIN_CVTPH2PS256,
25747 IX86_BUILTIN_CVTPS2PH,
25748 IX86_BUILTIN_CVTPS2PH256,
25749
25750 /* CFString built-in for Darwin. */
25751 IX86_BUILTIN_CFSTRING,
25752
25753 IX86_BUILTIN_MAX
25754 };
25755
25756 /* Table for the ix86 builtin decls. */
25757 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25758
25759 /* Table of all the builtin functions that are possible with different ISAs
25760 but are waiting to be built until a function is declared to use that
25761 ISA. */
25762 struct builtin_isa {
25763 const char *name; /* function name */
25764 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25765 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25766 bool const_p; /* true if the declaration is constant */
25767 bool set_and_not_built_p; /* true if deferred: recorded but not yet built */
25768 };
25769
25770 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
25771
25772
25773 /* Add an ix86 target builtin function with CODE, NAME and TCODE. Save
25774 MASK, the isa_flags the builtin requires, in the ix86_builtins_isa array,
25775 and store the function decl in the ix86_builtins array. Returns the
25776 function decl, or NULL_TREE if the builtin was not added.
25777
25778 If the front end has a special hook for builtin functions, delay adding
25779 builtin functions that aren't in the current ISA until the ISA is changed
25780 with function-specific optimization. Doing so can save about 300K for the
25781 default compiler. When the builtin is expanded, check at that time whether
25782 it is valid.
25783
25784 If the front end doesn't have a special hook, record all builtins, even
25785 those whose ISA isn't currently enabled, in case the user uses
25786 function-specific options for a different ISA, so that we don't get scope
25787 errors if a builtin is added in the middle of a function scope. */
25788
25789 static inline tree
25790 def_builtin (HOST_WIDE_INT mask, const char *name,
25791 enum ix86_builtin_func_type tcode,
25792 enum ix86_builtins code)
25793 {
25794 tree decl = NULL_TREE;
25795
25796 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25797 {
25798 ix86_builtins_isa[(int) code].isa = mask;
25799
25800 mask &= ~OPTION_MASK_ISA_64BIT;
25801 if (mask == 0
25802 || (mask & ix86_isa_flags) != 0
25803 || (lang_hooks.builtin_function
25804 == lang_hooks.builtin_function_ext_scope))
25805
25806 {
25807 tree type = ix86_get_builtin_func_type (tcode);
25808 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25809 NULL, NULL_TREE);
25810 ix86_builtins[(int) code] = decl;
25811 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25812 }
25813 else
25814 {
25815 ix86_builtins[(int) code] = NULL_TREE;
25816 ix86_builtins_isa[(int) code].tcode = tcode;
25817 ix86_builtins_isa[(int) code].name = name;
25818 ix86_builtins_isa[(int) code].const_p = false;
25819 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25820 }
25821 }
25822
25823 return decl;
25824 }
25825
25826 /* Like def_builtin, but also marks the function decl "const". */
25827
25828 static inline tree
25829 def_builtin_const (HOST_WIDE_INT mask, const char *name,
25830 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
25831 {
25832 tree decl = def_builtin (mask, name, tcode, code);
25833 if (decl)
25834 TREE_READONLY (decl) = 1;
25835 else
25836 ix86_builtins_isa[(int) code].const_p = true;
25837
25838 return decl;
25839 }
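
/* Illustrative sketch (not taken verbatim from the registration code that
   follows later in this file): a builtin is registered by passing its
   enabling ISA mask, its external name, an ix86_builtin_func_type code and
   its IX86_BUILTIN_* enumerator, e.g.

     def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_cvtsd2si",
                        INT_FTYPE_V2DF, IX86_BUILTIN_CVTSD2SI);

   If SSE2 is not enabled at that point, the call merely records the request
   in ix86_builtins_isa; ix86_add_new_builtins below turns such deferred
   entries into real decls once the ISA becomes available.  */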
25840
25841 /* Add any new builtin functions for a given ISA that may not have been
25842 declared. This saves a bit of space compared to adding all of the
25843 declarations to the tree, even if we didn't use them. */
25844
25845 static void
25846 ix86_add_new_builtins (HOST_WIDE_INT isa)
25847 {
25848 int i;
25849
25850 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
25851 {
25852 if ((ix86_builtins_isa[i].isa & isa) != 0
25853 && ix86_builtins_isa[i].set_and_not_built_p)
25854 {
25855 tree decl, type;
25856
25857 /* Don't define the builtin again. */
25858 ix86_builtins_isa[i].set_and_not_built_p = false;
25859
25860 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
25861 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
25862 type, i, BUILT_IN_MD, NULL,
25863 NULL_TREE);
25864
25865 ix86_builtins[i] = decl;
25866 if (ix86_builtins_isa[i].const_p)
25867 TREE_READONLY (decl) = 1;
25868 }
25869 }
25870 }
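
/* ix86_add_new_builtins is expected to be called once a larger set of
   isa_flags becomes available (typically from the target attribute and
   pragma handling elsewhere in this file), so that builtins deferred by
   def_builtin are finally declared.  */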
25871
25872 /* Bits for builtin_description.flag. */
25873
25874 /* Set when we don't support the comparison natively, and should swap the
25875 comparison operands in order to support it. */
25876 #define BUILTIN_DESC_SWAP_OPERANDS 1
25877
25878 struct builtin_description
25879 {
25880 const HOST_WIDE_INT mask;
25881 const enum insn_code icode;
25882 const char *const name;
25883 const enum ix86_builtins code;
25884 const enum rtx_code comparison;
25885 const int flag;
25886 };
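
/* The bdesc_* tables below describe builtins in a data-driven way: when the
   target builtins are initialized, each entry's MASK, NAME and CODE are
   handed to def_builtin or def_builtin_const (with the FLAG field supplying
   the function type for the argument tables), while ICODE and COMPARISON
   are consulted again when the builtin is expanded into RTL.  */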
25887
25888 static const struct builtin_description bdesc_comi[] =
25889 {
25890 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
25891 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
25892 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
25893 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
25894 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
25895 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
25896 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
25897 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
25898 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
25899 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
25900 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
25901 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
25902 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
25903 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
25904 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
25905 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
25906 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
25907 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
25908 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
25909 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
25910 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
25911 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
25912 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
25913 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
25914 };
25915
25916 static const struct builtin_description bdesc_pcmpestr[] =
25917 {
25918 /* SSE4.2 */
25919 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
25920 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
25921 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
25922 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
25923 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
25924 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
25925 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
25926 };
25927
25928 static const struct builtin_description bdesc_pcmpistr[] =
25929 {
25930 /* SSE4.2 */
25931 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
25932 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
25933 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
25934 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
25935 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
25936 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
25937 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
25938 };
25939
25940 /* Special builtins with variable number of arguments. */
25941 static const struct builtin_description bdesc_special_args[] =
25942 {
25943 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
25944 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
25945 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
25946
25947 /* MMX */
25948 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25949
25950 /* 3DNow! */
25951 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25952
25953 /* SSE */
25954 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25955 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25956 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25957
25958 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25959 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25960 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25961 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25962
25963 /* SSE or 3DNow!A */
25964 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25965 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
25966
25967 /* SSE2 */
25968 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25969 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25970 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25971 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
25972 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25973 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
25974 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
25975 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
25976 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
25977 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25978
25979 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25980 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25981
25982 /* SSE3 */
25983 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25984
25985 /* SSE4.1 */
25986 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
25987
25988 /* SSE4A */
25989 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25990 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25991
25992 /* AVX */
25993 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
25994 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
25995
25996 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25997 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25998 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25999 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26000 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26001
26002 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26003 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26004 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26005 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26006 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26007 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26008 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26009
26010 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26011 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26012 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26013
26014 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26015 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26016 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26017 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26018 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26019 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26020 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26021 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26022
26023 /* AVX2 */
26024 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26025 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26026 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26027 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26028 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26029 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26030 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26031 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26032 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26033
26034 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26035 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26036 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26037 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26038 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26039 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26040
26041 /* FSGSBASE */
26042 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26043 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26044 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26045 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26046 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26047 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26048 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26049 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26050
26051 /* RTM */
26052 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26053 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26054 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26055 };
26056
26057 /* Builtins with variable number of arguments. */
26058 static const struct builtin_description bdesc_args[] =
26059 {
26060 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26061 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26062 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26063 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26064 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26065 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26066 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26067
26068 /* MMX */
26069 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26070 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26071 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26072 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26073 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26074 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26075
26076 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26077 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26078 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26079 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26080 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26081 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26082 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26083 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26084
26085 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26086 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26087
26088 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26089 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26090 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26091 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26092
26093 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26094 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26095 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26096 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26097 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26098 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26099
26100 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26101 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26102 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26103 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26104 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
26105 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
26106
26107 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26108 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26109 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26110
26111 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26112
26113 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26114 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26115 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26116 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26117 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26118 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26119
26120 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26121 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26122 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26123 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26124 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26125 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26126
26127 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26128 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26129 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26130 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26131
26132 /* 3DNow! */
26133 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26134 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26135 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26136 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26137
26138 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26139 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26140 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26141 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26142 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26143 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26144 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26145 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26146 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26147 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26148 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26149 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26150 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26151 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26152 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26153
26154 /* 3DNow!A */
26155 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26156 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26157 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26158 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26159 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26160 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26161
26162 /* SSE */
26163 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26164 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26165 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26166 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26167 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26168 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26169 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26170 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26171 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26172 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26173 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26174 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26175
26176 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26177
26178 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26179 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26180 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26181 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26182 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26183 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26184 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26185 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26186
26187 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26188 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26189 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26190 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26191 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26192 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26193 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26194 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26195 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26196 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26197 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
26198 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26199 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26200 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26201 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26202 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26203 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26204 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26205 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26206 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26207 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26208 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26209
26210 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26211 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26212 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26213 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26214
26215 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26216 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26217 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26218 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26219
26220 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26221
26222 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26223 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26224 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26225 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26226 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26227
26228 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26229 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26230 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26231
26232 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26233
26234 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26235 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26236 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26237
26238 /* SSE MMX or 3DNow!A */
26239 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26240 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26241 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26242
26243 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26244 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26245 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26246 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26247
26248 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26249 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26250
26251 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26252
26253 /* SSE2 */
26254 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26255
26256 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26257 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26258 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26259 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26260 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26261
26262 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26263 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26264 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26265 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26266 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26267
26268 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26269
26270 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26271 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26272 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26273 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26274
26275 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26276 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26277 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26278
26279 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26280 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26281 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26282 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26283 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26284 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26285 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26286 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26287
26288 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26289 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26290 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26291 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26292 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
26293 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26294 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26295 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26296 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26297 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26298 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26299 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26300 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26301 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26302 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26303 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26304 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26305 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26306 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26307 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26308
26309 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26310 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26311 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26312 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26313
26314 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26315 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26316 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26317 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26318
26319 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26320
26321 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26322 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26323 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26324
26325 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26326
26327 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26328 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26329 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26330 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26331 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26332 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26333 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26334 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26335
26336 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26337 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26338 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26339 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26340 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26341 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26342 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26343 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26344
26345 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26346 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26347
26348 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26349 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26350 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26351 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26352
26353 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26354 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26355
26356 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26357 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26358 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26359 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26360 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26361 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26362
26363 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26364 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26365 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26366 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26367
26368 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26369 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26370 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26371 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26372 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26373 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26374 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26375 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26376
26377 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26378 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26379 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26380
26381 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26382 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26383
26384 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26385 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26386
26387 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26388
26389 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26390 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26391 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26392 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26393
26394 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26395 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26396 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26397 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26398 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26399 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26400 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26401
26402 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26403 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26404 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26405 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26406 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26407 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26408 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26409
26410 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26411 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26412 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26413 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26414
26415 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26416 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26417 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26418
26419 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26420
26421 { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26422 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
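/* Rows whose name field is 0 (the two __float128 entries above, and the
   AES and PCLMUL entries further down) are presumably skipped by the
   generic registration loop over this table; their user-visible names are
   defined elsewhere in this file (e.g. def_builtin_const calls in
   ix86_init_mmx_sse_builtins), and the row itself only supplies the insn
   code and signature that the expander looks up via the IX86_BUILTIN_*
   index.  */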
26423
26424 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26425
26426 /* SSE2 MMX */
26427 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26428 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26429
26430 /* SSE3 */
26431 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26432 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26433
26434 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26435 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26436 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26437 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26438 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26439 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26440
26441 /* SSSE3 */
26442 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26443 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26444 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26445 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26446 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26447 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26448
26449 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26450 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26451 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26452 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26453 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26454 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26455 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26456 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26457 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26458 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26459 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26460 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26461 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26462 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26463 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26464 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26465 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26466 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26467 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26468 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26469 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26470 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26471 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26472 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26473
26474 /* SSSE3 palignr */
26475 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26476 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26477
26478 /* SSE4.1 */
26479 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26480 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26481 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26482 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26483 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26484 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26485 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26486 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26487 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26488 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26489
26490 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26491 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26492 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26493 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26494 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26495 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26496 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26497 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26498 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26499 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26500 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26501 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26502 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26503
26504 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26505 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26506 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26507 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26508 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26509 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26510 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26511 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26512 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26513 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26514 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26515 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26516
26517 /* SSE4.1 rounding and ptest */
26518 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26519 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26520 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26521 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26522
26523 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26524 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26525 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26526 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26527
26528 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26529 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26530
26531 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26532 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26533
26534 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26535 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26536 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26537 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26538
26539 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26540 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26541
26542 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26543 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26544
26545 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26546 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26547 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26548
26549 /* SSE4.2 */
26550 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26551 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26552 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26553 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26554 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26555
26556 /* SSE4A */
26557 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26558 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26559 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26560 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26561
26562 /* AES */
26563 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26564 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26565
26566 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26567 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26568 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26569 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26570
26571 /* PCLMUL */
26572 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26573
26574 /* AVX */
26575 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26576 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26577 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26579 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26580 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26581 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26582 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26583 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26584 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26585 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26586 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26587 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26588 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26589 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26590 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26591 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26592 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26593 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26594 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26595 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26596 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26597 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26598 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26599 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26600 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26601
26602 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26603 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26604 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26605 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26606
26607 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26608 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26609 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26610 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26611 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26612 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26613 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26614 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26615 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26616 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26617 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26618 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26619 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26620 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26621 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26622 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26623 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26624 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26625 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26626 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26627 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26628 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26629 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26630 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26631 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26632 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26633 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26634 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26635 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26636 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26637 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26638 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26639 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26640 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26641
26642 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26643 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26644 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26645
26646 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26647 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26648 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26649 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26650 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26651
26652 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26653
26654 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26655 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26656
26657 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26658 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26659 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26660 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26661
26662 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26663 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26664
26665 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26666 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26667
26668 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26669 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26670 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26671 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26672
26673 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
26674 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
26675
26676 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26677 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26678
26679 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26680 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26681 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26682 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26683
26684 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26685 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26686 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26687 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26688 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26689 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26690
26691 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26692 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26693 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26694 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26695 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26696 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26697 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26698 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26699 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26700 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26701 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26702 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26703 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26704 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26705 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26706
26707 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26708 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26709
26710 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26711 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26712
26713 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26714
26715 /* AVX2 */
26716 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26717 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26718 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26719 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26720 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26721 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26722 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26723 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26724 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26725 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26726 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26727 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26728 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26729 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26730 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26731 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26732 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26733 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26734 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26735 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26736 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26737 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26738 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26739 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26740 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26741 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26742 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26743 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26744 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26745 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26746 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26747 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26748 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26749 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26750 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26751 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26752 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26753 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26754 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26755 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26756 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26757 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26758 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26759 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26760 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26761 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26762 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26763 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26764 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26765 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26766 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26767 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26768 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26769 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26770 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26771 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26772 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26773 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26774 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26775 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26776 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26777 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26778 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26779 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26780 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26781 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26782 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26783 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26784 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26785 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26786 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26787 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26788 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26789 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26790 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26791 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26792 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26793 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26794 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26795 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26796 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26797 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26798 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26799 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26800 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26801 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26802 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26803 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26804 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26805 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26806 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26807 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26808 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26809 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26810 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26811 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26812 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26813 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26814 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26815 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26816 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26817 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26818 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26819 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26820 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26821 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26822 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26823 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26824 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26825 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26826 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26827 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26828 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26829 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26830 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26831 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26832 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26833 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26834 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26835 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
26836 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26837 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
26838 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
26839 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26840 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26841 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26842 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26843 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26844 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26845 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26846 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26847 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26848 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
26849 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
26850 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
26851 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
26852 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26853 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26854 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26855 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26856 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26857 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26858 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26859 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26860 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26861 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26862
26863 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26864
26865 /* BMI */
26866 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26867 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26868 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26869
26870 /* TBM */
26871 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26872 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26873
26874 /* F16C */
26875 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
26876 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
26877 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
26878 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
26879
26880 /* BMI2 */
26881 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26882 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26883 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26884 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26885 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26886 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26887 };
26888
26889 /* FMA4 and XOP. */
26890 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
26891 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
26892 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
26893 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
26894 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
26895 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
26896 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
26897 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
26898 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
26899 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
26900 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
26901 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
26902 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
26903 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
26904 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
26905 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
26906 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
26907 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
26908 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
26909 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
26910 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
26911 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
26912 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
26913 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
26914 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
26915 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
26916 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
26917 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
26918 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
26919 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
26920 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
26921 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
26922 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
26923 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
26924 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
26925 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
26926 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
26927 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
26928 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
26929 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
26930 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
26931 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
26932 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
26933 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
26934 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
26935 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
26936 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
26937 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
26938 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
26939 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
26940 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
26941 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
26942
26943 static const struct builtin_description bdesc_multi_arg[] =
26944 {
26945 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
26946 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
26947 UNKNOWN, (int)MULTI_ARG_3_SF },
26948 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
26949 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
26950 UNKNOWN, (int)MULTI_ARG_3_DF },
26951
26952 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
26953 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
26954 UNKNOWN, (int)MULTI_ARG_3_SF },
26955 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
26956 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
26957 UNKNOWN, (int)MULTI_ARG_3_DF },
26958
26959 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
26960 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
26961 UNKNOWN, (int)MULTI_ARG_3_SF },
26962 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
26963 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
26964 UNKNOWN, (int)MULTI_ARG_3_DF },
26965 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
26966 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
26967 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26968 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
26969 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
26970 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26971
26972 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
26973 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
26974 UNKNOWN, (int)MULTI_ARG_3_SF },
26975 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
26976 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
26977 UNKNOWN, (int)MULTI_ARG_3_DF },
26978 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
26979 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
26980 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26981 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
26982 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
26983 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26984
26985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
26986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
26987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
26988 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
26989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
26990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
26991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
26992
26993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
26996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
26997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
26998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
26999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27000
27001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27002
27003 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27004 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27010 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27013 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27015
27016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27019 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27021 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27022 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27023 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27024 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27026 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27027 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27028 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27029 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27030 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27031 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27032
27033 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27034 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27035 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27036 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27037 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27038 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27039
27040 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27041 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27042 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27043 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27044 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27045 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27046 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27047 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27048 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27049 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27050 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27051 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27052 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27053 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27054 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27055
27056 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27057 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27058 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27059 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27060 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27061 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27062 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27063
27064 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27065 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27066 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27067 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27068 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27069 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27070 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27071
27072 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27073 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27074 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27075 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27076 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27077 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27078 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27079
27080 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27081 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27082 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27083 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27084 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27085 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27086 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27087
27088 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27089 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27090 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27091 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27092 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27093 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27094 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27095
27096 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27097 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27098 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27099 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27100 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27101 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27102 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27103
27104 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27105 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27106 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27107 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27108 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27109 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27110 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27111
27112 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27113 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27114 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27115 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
27116 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
27117 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
27118 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
27119
27120 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27121 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27122 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27123 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27124 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27125 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27126 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27127 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27128
27129 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27130 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27131 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27132 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27133 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27134 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27135 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27136 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27137
27138 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27139 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27140 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27141 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27142
27143 };
27144 \f
27145 /* TM vector builtins. */
27146
27147 /* Reuse the existing x86-specific `struct builtin_description' because
27148 we're lazy.  Add casts to make them fit. */
27149 static const struct builtin_description bdesc_tm[] =
27150 {
27151 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27152 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27153 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27154 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27155 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27156 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27157 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27158
27159 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27160 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27161 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27162 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27163 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27164 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27165 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27166
27167 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27168 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27169 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27170 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27171 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27172 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27173 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27174
27175 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27176 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27177 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27178 };
27179
27180 /* TM callbacks. */
27181
27182 /* Return the builtin decl needed to load a vector of TYPE. */
27183
27184 static tree
27185 ix86_builtin_tm_load (tree type)
27186 {
27187 if (TREE_CODE (type) == VECTOR_TYPE)
27188 {
27189 switch (tree_low_cst (TYPE_SIZE (type), 1))
27190 {
27191 case 64:
27192 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27193 case 128:
27194 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27195 case 256:
27196 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27197 }
27198 }
27199 return NULL_TREE;
27200 }
27201
27202 /* Return the builtin decl needed to store a vector of TYPE. */
27203
27204 static tree
27205 ix86_builtin_tm_store (tree type)
27206 {
27207 if (TREE_CODE (type) == VECTOR_TYPE)
27208 {
27209 switch (tree_low_cst (TYPE_SIZE (type), 1))
27210 {
27211 case 64:
27212 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27213 case 128:
27214 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27215 case 256:
27216 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27217 }
27218 }
27219 return NULL_TREE;
27220 }
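/* Illustrative sketch (not part of the original source): assuming the
   trans-mem lowering code queries these two hooks for a vectorized
   transactional access, the selection above reduces to the vector's
   bit size.  For example:

     tree v4sf  = build_vector_type (float_type_node, 4);  128 bits
     tree load  = ix86_builtin_tm_load (v4sf);   decl of BUILT_IN_TM_LOAD_M128
     tree store = ix86_builtin_tm_store (v4sf);  decl of BUILT_IN_TM_STORE_M128

   A 64-bit vector maps to the M64 variants and a 256-bit vector to the
   M256 variants; any other size returns NULL_TREE, leaving the caller
   to handle the access some other way.  */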
27221 \f
27222 /* Initialize the transactional memory vector load/store builtins. */
27223
27224 static void
27225 ix86_init_tm_builtins (void)
27226 {
27227 enum ix86_builtin_func_type ftype;
27228 const struct builtin_description *d;
27229 size_t i;
27230 tree decl;
27231 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27232 tree attrs_log, attrs_type_log;
27233
27234 if (!flag_tm)
27235 return;
27236
27237 /* If there are no builtins defined, we must be compiling in a
27238 language without trans-mem support. */
27239 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27240 return;
27241
27242 /* Use whatever attributes a normal TM load has. */
27243 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27244 attrs_load = DECL_ATTRIBUTES (decl);
27245 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27246 /* Use whatever attributes a normal TM store has. */
27247 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27248 attrs_store = DECL_ATTRIBUTES (decl);
27249 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27250 /* Use whatever attributes a normal TM log has. */
27251 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27252 attrs_log = DECL_ATTRIBUTES (decl);
27253 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27254
27255 for (i = 0, d = bdesc_tm;
27256 i < ARRAY_SIZE (bdesc_tm);
27257 i++, d++)
27258 {
27259 if ((d->mask & ix86_isa_flags) != 0
27260 || (lang_hooks.builtin_function
27261 == lang_hooks.builtin_function_ext_scope))
27262 {
27263 tree type, attrs, attrs_type;
27264 enum built_in_function code = (enum built_in_function) d->code;
27265
27266 ftype = (enum ix86_builtin_func_type) d->flag;
27267 type = ix86_get_builtin_func_type (ftype);
27268
27269 if (BUILTIN_TM_LOAD_P (code))
27270 {
27271 attrs = attrs_load;
27272 attrs_type = attrs_type_load;
27273 }
27274 else if (BUILTIN_TM_STORE_P (code))
27275 {
27276 attrs = attrs_store;
27277 attrs_type = attrs_type_store;
27278 }
27279 else
27280 {
27281 attrs = attrs_log;
27282 attrs_type = attrs_type_log;
27283 }
27284 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27285 /* The builtin without the prefix for
27286 calling it directly. */
27287 d->name + strlen ("__builtin_"),
27288 attrs);
27289 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
27290 set the TYPE_ATTRIBUTES. */
27291 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27292
27293 set_builtin_decl (code, decl, false);
27294 }
27295 }
27296 }
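/* Worked example (illustrative only), grounded in the loop above: for
   the bdesc_tm entry

     { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128",
       (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN,
       V4SF_FTYPE_PCV4SF }

   the loop effectively performs

     decl = add_builtin_function ("__builtin__ITM_RM128",
                                  <type built for V4SF_FTYPE_PCV4SF>,
                                  BUILT_IN_TM_LOAD_M128, BUILT_IN_NORMAL,
                                  "_ITM_RM128",   the name minus "__builtin_"
                                  attrs_load);
     decl_attributes (&TREE_TYPE (decl), attrs_type_load,
                      ATTR_FLAG_BUILT_IN);
     set_builtin_decl (BUILT_IN_TM_LOAD_M128, decl, false);

   i.e. the TM runtime entry point _ITM_RM128 becomes directly callable
   and the generic BUILT_IN_TM_LOAD_M128 code is bound to that decl.  */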
27297
27298 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
27299 not in the current target ISA, so that the user can compile particular
27300 modules with target-specific options that differ from the command-line
27301 options. */
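/* For instance (an illustrative sketch, not from the original source;
   __v4si and __v8si are the vector typedefs from <immintrin.h>): even
   when the command line enables only SSE2, a single function can still
   opt in to an AVX2 builtin from the tables above,

     __attribute__((target ("avx2")))
     static __v8si
     splat (__v4si x)
     {
       return __builtin_ia32_pbroadcastd256 (x);
     }

   which is why the builtin tables are processed here independently of
   the command-line ISA; the ISA requirement is enforced where the
   builtin is actually made visible and expanded.  */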
27302 static void
27303 ix86_init_mmx_sse_builtins (void)
27304 {
27305 const struct builtin_description * d;
27306 enum ix86_builtin_func_type ftype;
27307 size_t i;
27308
27309 /* Add all special builtins with variable number of operands. */
27310 for (i = 0, d = bdesc_special_args;
27311 i < ARRAY_SIZE (bdesc_special_args);
27312 i++, d++)
27313 {
27314 if (d->name == 0)
27315 continue;
27316
27317 ftype = (enum ix86_builtin_func_type) d->flag;
27318 def_builtin (d->mask, d->name, ftype, d->code);
27319 }
27320
27321 /* Add all builtins with variable number of operands. */
27322 for (i = 0, d = bdesc_args;
27323 i < ARRAY_SIZE (bdesc_args);
27324 i++, d++)
27325 {
27326 if (d->name == 0)
27327 continue;
27328
27329 ftype = (enum ix86_builtin_func_type) d->flag;
27330 def_builtin_const (d->mask, d->name, ftype, d->code);
27331 }
27332
27333 /* pcmpestr[im] insns. */
27334 for (i = 0, d = bdesc_pcmpestr;
27335 i < ARRAY_SIZE (bdesc_pcmpestr);
27336 i++, d++)
27337 {
27338 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27339 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27340 else
27341 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27342 def_builtin_const (d->mask, d->name, ftype, d->code);
27343 }
27344
27345 /* pcmpistr[im] insns. */
27346 for (i = 0, d = bdesc_pcmpistr;
27347 i < ARRAY_SIZE (bdesc_pcmpistr);
27348 i++, d++)
27349 {
27350 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27351 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27352 else
27353 ftype = INT_FTYPE_V16QI_V16QI_INT;
27354 def_builtin_const (d->mask, d->name, ftype, d->code);
27355 }
27356
27357 /* comi/ucomi insns. */
27358 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27359 {
27360 if (d->mask == OPTION_MASK_ISA_SSE2)
27361 ftype = INT_FTYPE_V2DF_V2DF;
27362 else
27363 ftype = INT_FTYPE_V4SF_V4SF;
27364 def_builtin_const (d->mask, d->name, ftype, d->code);
27365 }
27366
27367 /* SSE */
27368 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27369 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27370 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27371 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27372
27373 /* SSE or 3DNow!A */
27374 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27375 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27376 IX86_BUILTIN_MASKMOVQ);
27377
27378 /* SSE2 */
27379 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27380 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27381
27382 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27383 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27384 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27385 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27386
27387 /* SSE3. */
27388 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27389 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27390 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27391 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27392
27393 /* AES */
27394 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27395 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27396 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27397 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27398 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27399 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27400 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27401 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27402 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27403 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27404 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27405 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27406
27407 /* PCLMUL */
27408 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27409 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27410
27411 /* RDRND */
27412 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27413 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27414 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27415 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27416 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27417 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27418 IX86_BUILTIN_RDRAND64_STEP);
27419
27420 /* AVX2 */
27421 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27422 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27423 IX86_BUILTIN_GATHERSIV2DF);
27424
27425 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27426 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27427 IX86_BUILTIN_GATHERSIV4DF);
27428
27429 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27430 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27431 IX86_BUILTIN_GATHERDIV2DF);
27432
27433 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27434 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27435 IX86_BUILTIN_GATHERDIV4DF);
27436
27437 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27438 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27439 IX86_BUILTIN_GATHERSIV4SF);
27440
27441 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27442 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27443 IX86_BUILTIN_GATHERSIV8SF);
27444
27445 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27446 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27447 IX86_BUILTIN_GATHERDIV4SF);
27448
27449 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27450 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27451 IX86_BUILTIN_GATHERDIV8SF);
27452
27453 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27454 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27455 IX86_BUILTIN_GATHERSIV2DI);
27456
27457 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27458 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27459 IX86_BUILTIN_GATHERSIV4DI);
27460
27461 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27462 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27463 IX86_BUILTIN_GATHERDIV2DI);
27464
27465 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27466 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27467 IX86_BUILTIN_GATHERDIV4DI);
27468
27469 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27470 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27471 IX86_BUILTIN_GATHERSIV4SI);
27472
27473 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27474 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27475 IX86_BUILTIN_GATHERSIV8SI);
27476
27477 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27478 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27479 IX86_BUILTIN_GATHERDIV4SI);
27480
27481 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27482 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27483 IX86_BUILTIN_GATHERDIV8SI);
27484
27485 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
27486 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27487 IX86_BUILTIN_GATHERALTSIV4DF);
27488
27489 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
27490 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27491 IX86_BUILTIN_GATHERALTDIV8SF);
27492
27493 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
27494 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27495 IX86_BUILTIN_GATHERALTSIV4DI);
27496
27497 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
27498 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27499 IX86_BUILTIN_GATHERALTDIV8SI);
27500
27501 /* RTM. */
27502 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
27503 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
27504
27505 /* MMX access to the vec_init patterns. */
27506 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27507 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27508
27509 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27510 V4HI_FTYPE_HI_HI_HI_HI,
27511 IX86_BUILTIN_VEC_INIT_V4HI);
27512
27513 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27514 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27515 IX86_BUILTIN_VEC_INIT_V8QI);
27516
27517 /* Access to the vec_extract patterns. */
27518 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27519 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27520 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27521 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27522 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27523 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27524 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27525 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27526 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27527 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27528
27529 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27530 "__builtin_ia32_vec_ext_v4hi",
27531 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27532
27533 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27534 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27535
27536 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27537 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
27538
27539 /* Access to the vec_set patterns. */
27540 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27541 "__builtin_ia32_vec_set_v2di",
27542 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27543
27544 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27545 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27546
27547 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27548 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27549
27550 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27551 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27552
27553 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27554 "__builtin_ia32_vec_set_v4hi",
27555 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27556
27557 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27558 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
27559
27560 /* Add FMA4 multi-arg argument instructions */
27561 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27562 {
27563 if (d->name == 0)
27564 continue;
27565
27566 ftype = (enum ix86_builtin_func_type) d->flag;
27567 def_builtin_const (d->mask, d->name, ftype, d->code);
27568 }
27569 }
27570
27571 /* Internal method for ix86_init_builtins. */
27572
27573 static void
27574 ix86_init_builtins_va_builtins_abi (void)
27575 {
27576 tree ms_va_ref, sysv_va_ref;
27577 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
27578 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
27579 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
27580 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
27581
27582 if (!TARGET_64BIT)
27583 return;
27584 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
27585 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
27586 ms_va_ref = build_reference_type (ms_va_list_type_node);
27587 sysv_va_ref =
27588 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
27589
27590 fnvoid_va_end_ms =
27591 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27592 fnvoid_va_start_ms =
27593 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27594 fnvoid_va_end_sysv =
27595 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
27596 fnvoid_va_start_sysv =
27597 build_varargs_function_type_list (void_type_node, sysv_va_ref,
27598 NULL_TREE);
27599 fnvoid_va_copy_ms =
27600 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
27601 NULL_TREE);
27602 fnvoid_va_copy_sysv =
27603 build_function_type_list (void_type_node, sysv_va_ref,
27604 sysv_va_ref, NULL_TREE);
27605
27606 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
27607 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
27608 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
27609 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
27610 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
27611 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
27612 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
27613 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27614 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
27615 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27616 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
27617 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27618 }
27619
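/* Set up the scalar builtin types __float80 and __float128, plus the
   primitive types used by the builtin machinery.  */
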
27620 static void
27621 ix86_init_builtin_types (void)
27622 {
27623 tree float128_type_node, float80_type_node;
27624
27625 /* The __float80 type. */
27626 float80_type_node = long_double_type_node;
27627 if (TYPE_MODE (float80_type_node) != XFmode)
27628 {
27629 /* long double is not XFmode here; build a distinct 80-bit REAL_TYPE. */
27630 float80_type_node = make_node (REAL_TYPE);
27631
27632 TYPE_PRECISION (float80_type_node) = 80;
27633 layout_type (float80_type_node);
27634 }
27635 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
27636
27637 /* The __float128 type. */
27638 float128_type_node = make_node (REAL_TYPE);
27639 TYPE_PRECISION (float128_type_node) = 128;
27640 layout_type (float128_type_node);
27641 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
27642
27643 /* This macro is built by i386-builtin-types.awk. */
27644 DEFINE_BUILTIN_PRIMITIVE_TYPES;
27645 }
27646
27647 static void
27648 ix86_init_builtins (void)
27649 {
27650 tree t;
27651
27652 ix86_init_builtin_types ();
27653
27654 /* TFmode support builtins. */
27655 def_builtin_const (0, "__builtin_infq",
27656 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
27657 def_builtin_const (0, "__builtin_huge_valq",
27658 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
27659
27660 /* We will expand them to a normal call if SSE2 isn't available, since
27661 they are used by libgcc. */
27662 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
27663 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
27664 BUILT_IN_MD, "__fabstf2", NULL_TREE);
27665 TREE_READONLY (t) = 1;
27666 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
27667
27668 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
27669 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
27670 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
27671 TREE_READONLY (t) = 1;
27672 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
27673
27674 ix86_init_tm_builtins ();
27675 ix86_init_mmx_sse_builtins ();
27676
27677 if (TARGET_LP64)
27678 ix86_init_builtins_va_builtins_abi ();
27679
27680 #ifdef SUBTARGET_INIT_BUILTINS
27681 SUBTARGET_INIT_BUILTINS;
27682 #endif
27683 }
27684
27685 /* Return the ix86 builtin for CODE. */
27686
27687 static tree
27688 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
27689 {
27690 if (code >= IX86_BUILTIN_MAX)
27691 return error_mark_node;
27692
27693 return ix86_builtins[code];
27694 }
27695
27696 /* Errors in the source file can cause expand_expr to return const0_rtx
27697 where we expect a vector. To avoid crashing, use one of the vector
27698 clear instructions. */
27699 static rtx
27700 safe_vector_operand (rtx x, enum machine_mode mode)
27701 {
27702 if (x == const0_rtx)
27703 x = CONST0_RTX (mode);
27704 return x;
27705 }
27706
27707 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
27708
27709 static rtx
27710 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
27711 {
27712 rtx pat;
27713 tree arg0 = CALL_EXPR_ARG (exp, 0);
27714 tree arg1 = CALL_EXPR_ARG (exp, 1);
27715 rtx op0 = expand_normal (arg0);
27716 rtx op1 = expand_normal (arg1);
27717 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27718 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27719 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
27720
27721 if (VECTOR_MODE_P (mode0))
27722 op0 = safe_vector_operand (op0, mode0);
27723 if (VECTOR_MODE_P (mode1))
27724 op1 = safe_vector_operand (op1, mode1);
27725
27726 if (optimize || !target
27727 || GET_MODE (target) != tmode
27728 || !insn_data[icode].operand[0].predicate (target, tmode))
27729 target = gen_reg_rtx (tmode);
27730
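  /* If the insn wants a full TImode operand but the argument was only
     expanded to SImode, load it into the low element of a V4SI register
     and reinterpret that register as TImode.  */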
27731 if (GET_MODE (op1) == SImode && mode1 == TImode)
27732 {
27733 rtx x = gen_reg_rtx (V4SImode);
27734 emit_insn (gen_sse2_loadd (x, op1));
27735 op1 = gen_lowpart (TImode, x);
27736 }
27737
27738 if (!insn_data[icode].operand[1].predicate (op0, mode0))
27739 op0 = copy_to_mode_reg (mode0, op0);
27740 if (!insn_data[icode].operand[2].predicate (op1, mode1))
27741 op1 = copy_to_mode_reg (mode1, op1);
27742
27743 pat = GEN_FCN (icode) (target, op0, op1);
27744 if (! pat)
27745 return 0;
27746
27747 emit_insn (pat);
27748
27749 return target;
27750 }
27751
27752 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
27753
27754 static rtx
27755 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
27756 enum ix86_builtin_func_type m_type,
27757 enum rtx_code sub_code)
27758 {
27759 rtx pat;
27760 int i;
27761 int nargs;
27762 bool comparison_p = false;
27763 bool tf_p = false;
27764 bool last_arg_constant = false;
27765 int num_memory = 0;
27766 struct {
27767 rtx op;
27768 enum machine_mode mode;
27769 } args[4];
27770
27771 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27772
27773 switch (m_type)
27774 {
27775 case MULTI_ARG_4_DF2_DI_I:
27776 case MULTI_ARG_4_DF2_DI_I1:
27777 case MULTI_ARG_4_SF2_SI_I:
27778 case MULTI_ARG_4_SF2_SI_I1:
27779 nargs = 4;
27780 last_arg_constant = true;
27781 break;
27782
27783 case MULTI_ARG_3_SF:
27784 case MULTI_ARG_3_DF:
27785 case MULTI_ARG_3_SF2:
27786 case MULTI_ARG_3_DF2:
27787 case MULTI_ARG_3_DI:
27788 case MULTI_ARG_3_SI:
27789 case MULTI_ARG_3_SI_DI:
27790 case MULTI_ARG_3_HI:
27791 case MULTI_ARG_3_HI_SI:
27792 case MULTI_ARG_3_QI:
27793 case MULTI_ARG_3_DI2:
27794 case MULTI_ARG_3_SI2:
27795 case MULTI_ARG_3_HI2:
27796 case MULTI_ARG_3_QI2:
27797 nargs = 3;
27798 break;
27799
27800 case MULTI_ARG_2_SF:
27801 case MULTI_ARG_2_DF:
27802 case MULTI_ARG_2_DI:
27803 case MULTI_ARG_2_SI:
27804 case MULTI_ARG_2_HI:
27805 case MULTI_ARG_2_QI:
27806 nargs = 2;
27807 break;
27808
27809 case MULTI_ARG_2_DI_IMM:
27810 case MULTI_ARG_2_SI_IMM:
27811 case MULTI_ARG_2_HI_IMM:
27812 case MULTI_ARG_2_QI_IMM:
27813 nargs = 2;
27814 last_arg_constant = true;
27815 break;
27816
27817 case MULTI_ARG_1_SF:
27818 case MULTI_ARG_1_DF:
27819 case MULTI_ARG_1_SF2:
27820 case MULTI_ARG_1_DF2:
27821 case MULTI_ARG_1_DI:
27822 case MULTI_ARG_1_SI:
27823 case MULTI_ARG_1_HI:
27824 case MULTI_ARG_1_QI:
27825 case MULTI_ARG_1_SI_DI:
27826 case MULTI_ARG_1_HI_DI:
27827 case MULTI_ARG_1_HI_SI:
27828 case MULTI_ARG_1_QI_DI:
27829 case MULTI_ARG_1_QI_SI:
27830 case MULTI_ARG_1_QI_HI:
27831 nargs = 1;
27832 break;
27833
27834 case MULTI_ARG_2_DI_CMP:
27835 case MULTI_ARG_2_SI_CMP:
27836 case MULTI_ARG_2_HI_CMP:
27837 case MULTI_ARG_2_QI_CMP:
27838 nargs = 2;
27839 comparison_p = true;
27840 break;
27841
27842 case MULTI_ARG_2_SF_TF:
27843 case MULTI_ARG_2_DF_TF:
27844 case MULTI_ARG_2_DI_TF:
27845 case MULTI_ARG_2_SI_TF:
27846 case MULTI_ARG_2_HI_TF:
27847 case MULTI_ARG_2_QI_TF:
27848 nargs = 2;
27849 tf_p = true;
27850 break;
27851
27852 default:
27853 gcc_unreachable ();
27854 }
27855
27856 if (optimize || !target
27857 || GET_MODE (target) != tmode
27858 || !insn_data[icode].operand[0].predicate (target, tmode))
27859 target = gen_reg_rtx (tmode);
27860
27861 gcc_assert (nargs <= 4);
27862
27863 for (i = 0; i < nargs; i++)
27864 {
27865 tree arg = CALL_EXPR_ARG (exp, i);
27866 rtx op = expand_normal (arg);
27867 int adjust = (comparison_p) ? 1 : 0;
27868 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
27869
27870 if (last_arg_constant && i == nargs - 1)
27871 {
27872 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
27873 {
27874 enum insn_code new_icode = icode;
27875 switch (icode)
27876 {
27877 case CODE_FOR_xop_vpermil2v2df3:
27878 case CODE_FOR_xop_vpermil2v4sf3:
27879 case CODE_FOR_xop_vpermil2v4df3:
27880 case CODE_FOR_xop_vpermil2v8sf3:
27881 error ("the last argument must be a 2-bit immediate");
27882 return gen_reg_rtx (tmode);
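	  /* The XOP rotate patterns only accept a constant count here;
	     for a variable count fall back to the generic rotate
	     patterns, which take the count in a register.  */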
27883 case CODE_FOR_xop_rotlv2di3:
27884 new_icode = CODE_FOR_rotlv2di3;
27885 goto xop_rotl;
27886 case CODE_FOR_xop_rotlv4si3:
27887 new_icode = CODE_FOR_rotlv4si3;
27888 goto xop_rotl;
27889 case CODE_FOR_xop_rotlv8hi3:
27890 new_icode = CODE_FOR_rotlv8hi3;
27891 goto xop_rotl;
27892 case CODE_FOR_xop_rotlv16qi3:
27893 new_icode = CODE_FOR_rotlv16qi3;
27894 xop_rotl:
27895 if (CONST_INT_P (op))
27896 {
27897 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
27898 op = GEN_INT (INTVAL (op) & mask);
27899 gcc_checking_assert
27900 (insn_data[icode].operand[i + 1].predicate (op, mode));
27901 }
27902 else
27903 {
27904 gcc_checking_assert
27905 (nargs == 2
27906 && insn_data[new_icode].operand[0].mode == tmode
27907 && insn_data[new_icode].operand[1].mode == tmode
27908 && insn_data[new_icode].operand[2].mode == mode
27909 && insn_data[new_icode].operand[0].predicate
27910 == insn_data[icode].operand[0].predicate
27911 && insn_data[new_icode].operand[1].predicate
27912 == insn_data[icode].operand[1].predicate);
27913 icode = new_icode;
27914 goto non_constant;
27915 }
27916 break;
27917 default:
27918 gcc_unreachable ();
27919 }
27920 }
27921 }
27922 else
27923 {
27924 non_constant:
27925 if (VECTOR_MODE_P (mode))
27926 op = safe_vector_operand (op, mode);
27927
27928 /* If we aren't optimizing, only allow one memory operand to be
27929 generated. */
27930 if (memory_operand (op, mode))
27931 num_memory++;
27932
27933 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
27934
27935 if (optimize
27936 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
27937 || num_memory > 1)
27938 op = force_reg (mode, op);
27939 }
27940
27941 args[i].op = op;
27942 args[i].mode = mode;
27943 }
27944
27945 switch (nargs)
27946 {
27947 case 1:
27948 pat = GEN_FCN (icode) (target, args[0].op);
27949 break;
27950
27951 case 2:
27952 if (tf_p)
27953 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
27954 GEN_INT ((int)sub_code));
27955 else if (! comparison_p)
27956 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
27957 else
27958 {
27959 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
27960 args[0].op,
27961 args[1].op);
27962
27963 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
27964 }
27965 break;
27966
27967 case 3:
27968 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
27969 break;
27970
27971 case 4:
27972 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
27973 break;
27974
27975 default:
27976 gcc_unreachable ();
27977 }
27978
27979 if (! pat)
27980 return 0;
27981
27982 emit_insn (pat);
27983 return target;
27984 }
27985
27986 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
27987 insns with vec_merge. */
27988
27989 static rtx
27990 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
27991 rtx target)
27992 {
27993 rtx pat;
27994 tree arg0 = CALL_EXPR_ARG (exp, 0);
27995 rtx op1, op0 = expand_normal (arg0);
27996 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27997 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27998
27999 if (optimize || !target
28000 || GET_MODE (target) != tmode
28001 || !insn_data[icode].operand[0].predicate (target, tmode))
28002 target = gen_reg_rtx (tmode);
28003
28004 if (VECTOR_MODE_P (mode0))
28005 op0 = safe_vector_operand (op0, mode0);
28006
28007 if ((optimize && !register_operand (op0, mode0))
28008 || !insn_data[icode].operand[1].predicate (op0, mode0))
28009 op0 = copy_to_mode_reg (mode0, op0);
28010
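  /* The vec_merge patterns take the source operand twice: once as the
     input being operated on and once as the vector supplying the
     untouched elements.  */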
28011 op1 = op0;
28012 if (!insn_data[icode].operand[2].predicate (op1, mode0))
28013 op1 = copy_to_mode_reg (mode0, op1);
28014
28015 pat = GEN_FCN (icode) (target, op0, op1);
28016 if (! pat)
28017 return 0;
28018 emit_insn (pat);
28019 return target;
28020 }
28021
28022 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
28023
28024 static rtx
28025 ix86_expand_sse_compare (const struct builtin_description *d,
28026 tree exp, rtx target, bool swap)
28027 {
28028 rtx pat;
28029 tree arg0 = CALL_EXPR_ARG (exp, 0);
28030 tree arg1 = CALL_EXPR_ARG (exp, 1);
28031 rtx op0 = expand_normal (arg0);
28032 rtx op1 = expand_normal (arg1);
28033 rtx op2;
28034 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28035 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28036 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28037 enum rtx_code comparison = d->comparison;
28038
28039 if (VECTOR_MODE_P (mode0))
28040 op0 = safe_vector_operand (op0, mode0);
28041 if (VECTOR_MODE_P (mode1))
28042 op1 = safe_vector_operand (op1, mode1);
28043
28044 /* Swap operands if we have a comparison that isn't available in
28045 hardware. */
28046 if (swap)
28047 {
28048 rtx tmp = gen_reg_rtx (mode1);
28049 emit_move_insn (tmp, op1);
28050 op1 = op0;
28051 op0 = tmp;
28052 }
28053
28054 if (optimize || !target
28055 || GET_MODE (target) != tmode
28056 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28057 target = gen_reg_rtx (tmode);
28058
28059 if ((optimize && !register_operand (op0, mode0))
28060 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
28061 op0 = copy_to_mode_reg (mode0, op0);
28062 if ((optimize && !register_operand (op1, mode1))
28063 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
28064 op1 = copy_to_mode_reg (mode1, op1);
28065
28066 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
28067 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28068 if (! pat)
28069 return 0;
28070 emit_insn (pat);
28071 return target;
28072 }
28073
28074 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
28075
28076 static rtx
28077 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
28078 rtx target)
28079 {
28080 rtx pat;
28081 tree arg0 = CALL_EXPR_ARG (exp, 0);
28082 tree arg1 = CALL_EXPR_ARG (exp, 1);
28083 rtx op0 = expand_normal (arg0);
28084 rtx op1 = expand_normal (arg1);
28085 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28086 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28087 enum rtx_code comparison = d->comparison;
28088
28089 if (VECTOR_MODE_P (mode0))
28090 op0 = safe_vector_operand (op0, mode0);
28091 if (VECTOR_MODE_P (mode1))
28092 op1 = safe_vector_operand (op1, mode1);
28093
28094 /* Swap operands if we have a comparison that isn't available in
28095 hardware. */
28096 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
28097 {
28098 rtx tmp = op1;
28099 op1 = op0;
28100 op0 = tmp;
28101 }
28102
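  /* The comparison result is a single flag; build it in the low byte of
     a zeroed SImode pseudo via a strict_low_part set so the upper bits
     are well defined.  */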
28103 target = gen_reg_rtx (SImode);
28104 emit_move_insn (target, const0_rtx);
28105 target = gen_rtx_SUBREG (QImode, target, 0);
28106
28107 if ((optimize && !register_operand (op0, mode0))
28108 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28109 op0 = copy_to_mode_reg (mode0, op0);
28110 if ((optimize && !register_operand (op1, mode1))
28111 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28112 op1 = copy_to_mode_reg (mode1, op1);
28113
28114 pat = GEN_FCN (d->icode) (op0, op1);
28115 if (! pat)
28116 return 0;
28117 emit_insn (pat);
28118 emit_insn (gen_rtx_SET (VOIDmode,
28119 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28120 gen_rtx_fmt_ee (comparison, QImode,
28121 SET_DEST (pat),
28122 const0_rtx)));
28123
28124 return SUBREG_REG (target);
28125 }
28126
28127 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
28128
28129 static rtx
28130 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
28131 rtx target)
28132 {
28133 rtx pat;
28134 tree arg0 = CALL_EXPR_ARG (exp, 0);
28135 rtx op1, op0 = expand_normal (arg0);
28136 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28137 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28138
28139 if (optimize || target == 0
28140 || GET_MODE (target) != tmode
28141 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28142 target = gen_reg_rtx (tmode);
28143
28144 if (VECTOR_MODE_P (mode0))
28145 op0 = safe_vector_operand (op0, mode0);
28146
28147 if ((optimize && !register_operand (op0, mode0))
28148 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28149 op0 = copy_to_mode_reg (mode0, op0);
28150
28151 op1 = GEN_INT (d->comparison);
28152
28153 pat = GEN_FCN (d->icode) (target, op0, op1);
28154 if (! pat)
28155 return 0;
28156 emit_insn (pat);
28157 return target;
28158 }
28159
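/* Subroutine of ix86_expand_args_builtin for the round insns that also
   pack their two rounded inputs into a vector of ints; the immediate
   rounding operand is taken from D->comparison.  */
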
28160 static rtx
28161 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28162 tree exp, rtx target)
28163 {
28164 rtx pat;
28165 tree arg0 = CALL_EXPR_ARG (exp, 0);
28166 tree arg1 = CALL_EXPR_ARG (exp, 1);
28167 rtx op0 = expand_normal (arg0);
28168 rtx op1 = expand_normal (arg1);
28169 rtx op2;
28170 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28171 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28172 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28173
28174 if (optimize || target == 0
28175 || GET_MODE (target) != tmode
28176 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28177 target = gen_reg_rtx (tmode);
28178
28179 op0 = safe_vector_operand (op0, mode0);
28180 op1 = safe_vector_operand (op1, mode1);
28181
28182 if ((optimize && !register_operand (op0, mode0))
28183 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28184 op0 = copy_to_mode_reg (mode0, op0);
28185 if ((optimize && !register_operand (op1, mode1))
28186 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28187 op1 = copy_to_mode_reg (mode1, op1);
28188
28189 op2 = GEN_INT (d->comparison);
28190
28191 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28192 if (! pat)
28193 return 0;
28194 emit_insn (pat);
28195 return target;
28196 }
28197
28198 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
28199
28200 static rtx
28201 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28202 rtx target)
28203 {
28204 rtx pat;
28205 tree arg0 = CALL_EXPR_ARG (exp, 0);
28206 tree arg1 = CALL_EXPR_ARG (exp, 1);
28207 rtx op0 = expand_normal (arg0);
28208 rtx op1 = expand_normal (arg1);
28209 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28210 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28211 enum rtx_code comparison = d->comparison;
28212
28213 if (VECTOR_MODE_P (mode0))
28214 op0 = safe_vector_operand (op0, mode0);
28215 if (VECTOR_MODE_P (mode1))
28216 op1 = safe_vector_operand (op1, mode1);
28217
28218 target = gen_reg_rtx (SImode);
28219 emit_move_insn (target, const0_rtx);
28220 target = gen_rtx_SUBREG (QImode, target, 0);
28221
28222 if ((optimize && !register_operand (op0, mode0))
28223 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28224 op0 = copy_to_mode_reg (mode0, op0);
28225 if ((optimize && !register_operand (op1, mode1))
28226 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28227 op1 = copy_to_mode_reg (mode1, op1);
28228
28229 pat = GEN_FCN (d->icode) (op0, op1);
28230 if (! pat)
28231 return 0;
28232 emit_insn (pat);
28233 emit_insn (gen_rtx_SET (VOIDmode,
28234 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28235 gen_rtx_fmt_ee (comparison, QImode,
28236 SET_DEST (pat),
28237 const0_rtx)));
28238
28239 return SUBREG_REG (target);
28240 }
28241
28242 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
28243
28244 static rtx
28245 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28246 tree exp, rtx target)
28247 {
28248 rtx pat;
28249 tree arg0 = CALL_EXPR_ARG (exp, 0);
28250 tree arg1 = CALL_EXPR_ARG (exp, 1);
28251 tree arg2 = CALL_EXPR_ARG (exp, 2);
28252 tree arg3 = CALL_EXPR_ARG (exp, 3);
28253 tree arg4 = CALL_EXPR_ARG (exp, 4);
28254 rtx scratch0, scratch1;
28255 rtx op0 = expand_normal (arg0);
28256 rtx op1 = expand_normal (arg1);
28257 rtx op2 = expand_normal (arg2);
28258 rtx op3 = expand_normal (arg3);
28259 rtx op4 = expand_normal (arg4);
28260 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28261
28262 tmode0 = insn_data[d->icode].operand[0].mode;
28263 tmode1 = insn_data[d->icode].operand[1].mode;
28264 modev2 = insn_data[d->icode].operand[2].mode;
28265 modei3 = insn_data[d->icode].operand[3].mode;
28266 modev4 = insn_data[d->icode].operand[4].mode;
28267 modei5 = insn_data[d->icode].operand[5].mode;
28268 modeimm = insn_data[d->icode].operand[6].mode;
28269
28270 if (VECTOR_MODE_P (modev2))
28271 op0 = safe_vector_operand (op0, modev2);
28272 if (VECTOR_MODE_P (modev4))
28273 op2 = safe_vector_operand (op2, modev4);
28274
28275 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28276 op0 = copy_to_mode_reg (modev2, op0);
28277 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28278 op1 = copy_to_mode_reg (modei3, op1);
28279 if ((optimize && !register_operand (op2, modev4))
28280 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28281 op2 = copy_to_mode_reg (modev4, op2);
28282 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28283 op3 = copy_to_mode_reg (modei5, op3);
28284
28285 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28286 {
28287 error ("the fifth argument must be an 8-bit immediate");
28288 return const0_rtx;
28289 }
28290
28291 if (d->code == IX86_BUILTIN_PCMPESTRI128)
28292 {
28293 if (optimize || !target
28294 || GET_MODE (target) != tmode0
28295 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28296 target = gen_reg_rtx (tmode0);
28297
28298 scratch1 = gen_reg_rtx (tmode1);
28299
28300 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
28301 }
28302 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
28303 {
28304 if (optimize || !target
28305 || GET_MODE (target) != tmode1
28306 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28307 target = gen_reg_rtx (tmode1);
28308
28309 scratch0 = gen_reg_rtx (tmode0);
28310
28311 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28312 }
28313 else
28314 {
28315 gcc_assert (d->flag);
28316
28317 scratch0 = gen_reg_rtx (tmode0);
28318 scratch1 = gen_reg_rtx (tmode1);
28319
28320 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28321 }
28322
28323 if (! pat)
28324 return 0;
28325
28326 emit_insn (pat);
28327
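  /* For the flag-returning forms, D->flag holds the machine mode of the
     flags register to test; return the EQ-with-zero result as an int.  */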
28328 if (d->flag)
28329 {
28330 target = gen_reg_rtx (SImode);
28331 emit_move_insn (target, const0_rtx);
28332 target = gen_rtx_SUBREG (QImode, target, 0);
28333
28334 emit_insn
28335 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28336 gen_rtx_fmt_ee (EQ, QImode,
28337 gen_rtx_REG ((enum machine_mode) d->flag,
28338 FLAGS_REG),
28339 const0_rtx)));
28340 return SUBREG_REG (target);
28341 }
28342 else
28343 return target;
28344 }
28345
28346
28347 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
28348
28349 static rtx
28350 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28351 tree exp, rtx target)
28352 {
28353 rtx pat;
28354 tree arg0 = CALL_EXPR_ARG (exp, 0);
28355 tree arg1 = CALL_EXPR_ARG (exp, 1);
28356 tree arg2 = CALL_EXPR_ARG (exp, 2);
28357 rtx scratch0, scratch1;
28358 rtx op0 = expand_normal (arg0);
28359 rtx op1 = expand_normal (arg1);
28360 rtx op2 = expand_normal (arg2);
28361 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28362
28363 tmode0 = insn_data[d->icode].operand[0].mode;
28364 tmode1 = insn_data[d->icode].operand[1].mode;
28365 modev2 = insn_data[d->icode].operand[2].mode;
28366 modev3 = insn_data[d->icode].operand[3].mode;
28367 modeimm = insn_data[d->icode].operand[4].mode;
28368
28369 if (VECTOR_MODE_P (modev2))
28370 op0 = safe_vector_operand (op0, modev2);
28371 if (VECTOR_MODE_P (modev3))
28372 op1 = safe_vector_operand (op1, modev3);
28373
28374 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28375 op0 = copy_to_mode_reg (modev2, op0);
28376 if ((optimize && !register_operand (op1, modev3))
28377 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28378 op1 = copy_to_mode_reg (modev3, op1);
28379
28380 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28381 {
28382 error ("the third argument must be an 8-bit immediate");
28383 return const0_rtx;
28384 }
28385
28386 if (d->code == IX86_BUILTIN_PCMPISTRI128)
28387 {
28388 if (optimize || !target
28389 || GET_MODE (target) != tmode0
28390 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28391 target = gen_reg_rtx (tmode0);
28392
28393 scratch1 = gen_reg_rtx (tmode1);
28394
28395 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28396 }
28397 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28398 {
28399 if (optimize || !target
28400 || GET_MODE (target) != tmode1
28401 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28402 target = gen_reg_rtx (tmode1);
28403
28404 scratch0 = gen_reg_rtx (tmode0);
28405
28406 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28407 }
28408 else
28409 {
28410 gcc_assert (d->flag);
28411
28412 scratch0 = gen_reg_rtx (tmode0);
28413 scratch1 = gen_reg_rtx (tmode1);
28414
28415 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28416 }
28417
28418 if (! pat)
28419 return 0;
28420
28421 emit_insn (pat);
28422
28423 if (d->flag)
28424 {
28425 target = gen_reg_rtx (SImode);
28426 emit_move_insn (target, const0_rtx);
28427 target = gen_rtx_SUBREG (QImode, target, 0);
28428
28429 emit_insn
28430 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28431 gen_rtx_fmt_ee (EQ, QImode,
28432 gen_rtx_REG ((enum machine_mode) d->flag,
28433 FLAGS_REG),
28434 const0_rtx)));
28435 return SUBREG_REG (target);
28436 }
28437 else
28438 return target;
28439 }
28440
28441 /* Subroutine of ix86_expand_builtin to take care of insns with
28442 variable number of operands. */
28443
28444 static rtx
28445 ix86_expand_args_builtin (const struct builtin_description *d,
28446 tree exp, rtx target)
28447 {
28448 rtx pat, real_target;
28449 unsigned int i, nargs;
28450 unsigned int nargs_constant = 0;
28451 int num_memory = 0;
28452 struct
28453 {
28454 rtx op;
28455 enum machine_mode mode;
28456 } args[4];
28457 bool last_arg_count = false;
28458 enum insn_code icode = d->icode;
28459 const struct insn_data_d *insn_p = &insn_data[icode];
28460 enum machine_mode tmode = insn_p->operand[0].mode;
28461 enum machine_mode rmode = VOIDmode;
28462 bool swap = false;
28463 enum rtx_code comparison = d->comparison;
28464
28465 switch ((enum ix86_builtin_func_type) d->flag)
28466 {
28467 case V2DF_FTYPE_V2DF_ROUND:
28468 case V4DF_FTYPE_V4DF_ROUND:
28469 case V4SF_FTYPE_V4SF_ROUND:
28470 case V8SF_FTYPE_V8SF_ROUND:
28471 case V4SI_FTYPE_V4SF_ROUND:
28472 case V8SI_FTYPE_V8SF_ROUND:
28473 return ix86_expand_sse_round (d, exp, target);
28474 case V4SI_FTYPE_V2DF_V2DF_ROUND:
28475 case V8SI_FTYPE_V4DF_V4DF_ROUND:
28476 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
28477 case INT_FTYPE_V8SF_V8SF_PTEST:
28478 case INT_FTYPE_V4DI_V4DI_PTEST:
28479 case INT_FTYPE_V4DF_V4DF_PTEST:
28480 case INT_FTYPE_V4SF_V4SF_PTEST:
28481 case INT_FTYPE_V2DI_V2DI_PTEST:
28482 case INT_FTYPE_V2DF_V2DF_PTEST:
28483 return ix86_expand_sse_ptest (d, exp, target);
28484 case FLOAT128_FTYPE_FLOAT128:
28485 case FLOAT_FTYPE_FLOAT:
28486 case INT_FTYPE_INT:
28487 case UINT64_FTYPE_INT:
28488 case UINT16_FTYPE_UINT16:
28489 case INT64_FTYPE_INT64:
28490 case INT64_FTYPE_V4SF:
28491 case INT64_FTYPE_V2DF:
28492 case INT_FTYPE_V16QI:
28493 case INT_FTYPE_V8QI:
28494 case INT_FTYPE_V8SF:
28495 case INT_FTYPE_V4DF:
28496 case INT_FTYPE_V4SF:
28497 case INT_FTYPE_V2DF:
28498 case INT_FTYPE_V32QI:
28499 case V16QI_FTYPE_V16QI:
28500 case V8SI_FTYPE_V8SF:
28501 case V8SI_FTYPE_V4SI:
28502 case V8HI_FTYPE_V8HI:
28503 case V8HI_FTYPE_V16QI:
28504 case V8QI_FTYPE_V8QI:
28505 case V8SF_FTYPE_V8SF:
28506 case V8SF_FTYPE_V8SI:
28507 case V8SF_FTYPE_V4SF:
28508 case V8SF_FTYPE_V8HI:
28509 case V4SI_FTYPE_V4SI:
28510 case V4SI_FTYPE_V16QI:
28511 case V4SI_FTYPE_V4SF:
28512 case V4SI_FTYPE_V8SI:
28513 case V4SI_FTYPE_V8HI:
28514 case V4SI_FTYPE_V4DF:
28515 case V4SI_FTYPE_V2DF:
28516 case V4HI_FTYPE_V4HI:
28517 case V4DF_FTYPE_V4DF:
28518 case V4DF_FTYPE_V4SI:
28519 case V4DF_FTYPE_V4SF:
28520 case V4DF_FTYPE_V2DF:
28521 case V4SF_FTYPE_V4SF:
28522 case V4SF_FTYPE_V4SI:
28523 case V4SF_FTYPE_V8SF:
28524 case V4SF_FTYPE_V4DF:
28525 case V4SF_FTYPE_V8HI:
28526 case V4SF_FTYPE_V2DF:
28527 case V2DI_FTYPE_V2DI:
28528 case V2DI_FTYPE_V16QI:
28529 case V2DI_FTYPE_V8HI:
28530 case V2DI_FTYPE_V4SI:
28531 case V2DF_FTYPE_V2DF:
28532 case V2DF_FTYPE_V4SI:
28533 case V2DF_FTYPE_V4DF:
28534 case V2DF_FTYPE_V4SF:
28535 case V2DF_FTYPE_V2SI:
28536 case V2SI_FTYPE_V2SI:
28537 case V2SI_FTYPE_V4SF:
28538 case V2SI_FTYPE_V2SF:
28539 case V2SI_FTYPE_V2DF:
28540 case V2SF_FTYPE_V2SF:
28541 case V2SF_FTYPE_V2SI:
28542 case V32QI_FTYPE_V32QI:
28543 case V32QI_FTYPE_V16QI:
28544 case V16HI_FTYPE_V16HI:
28545 case V16HI_FTYPE_V8HI:
28546 case V8SI_FTYPE_V8SI:
28547 case V16HI_FTYPE_V16QI:
28548 case V8SI_FTYPE_V16QI:
28549 case V4DI_FTYPE_V16QI:
28550 case V8SI_FTYPE_V8HI:
28551 case V4DI_FTYPE_V8HI:
28552 case V4DI_FTYPE_V4SI:
28553 case V4DI_FTYPE_V2DI:
28554 nargs = 1;
28555 break;
28556 case V4SF_FTYPE_V4SF_VEC_MERGE:
28557 case V2DF_FTYPE_V2DF_VEC_MERGE:
28558 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
28559 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
28560 case V16QI_FTYPE_V16QI_V16QI:
28561 case V16QI_FTYPE_V8HI_V8HI:
28562 case V8QI_FTYPE_V8QI_V8QI:
28563 case V8QI_FTYPE_V4HI_V4HI:
28564 case V8HI_FTYPE_V8HI_V8HI:
28565 case V8HI_FTYPE_V16QI_V16QI:
28566 case V8HI_FTYPE_V4SI_V4SI:
28567 case V8SF_FTYPE_V8SF_V8SF:
28568 case V8SF_FTYPE_V8SF_V8SI:
28569 case V4SI_FTYPE_V4SI_V4SI:
28570 case V4SI_FTYPE_V8HI_V8HI:
28571 case V4SI_FTYPE_V4SF_V4SF:
28572 case V4SI_FTYPE_V2DF_V2DF:
28573 case V4HI_FTYPE_V4HI_V4HI:
28574 case V4HI_FTYPE_V8QI_V8QI:
28575 case V4HI_FTYPE_V2SI_V2SI:
28576 case V4DF_FTYPE_V4DF_V4DF:
28577 case V4DF_FTYPE_V4DF_V4DI:
28578 case V4SF_FTYPE_V4SF_V4SF:
28579 case V4SF_FTYPE_V4SF_V4SI:
28580 case V4SF_FTYPE_V4SF_V2SI:
28581 case V4SF_FTYPE_V4SF_V2DF:
28582 case V4SF_FTYPE_V4SF_DI:
28583 case V4SF_FTYPE_V4SF_SI:
28584 case V2DI_FTYPE_V2DI_V2DI:
28585 case V2DI_FTYPE_V16QI_V16QI:
28586 case V2DI_FTYPE_V4SI_V4SI:
28587 case V2DI_FTYPE_V2DI_V16QI:
28588 case V2DI_FTYPE_V2DF_V2DF:
28589 case V2SI_FTYPE_V2SI_V2SI:
28590 case V2SI_FTYPE_V4HI_V4HI:
28591 case V2SI_FTYPE_V2SF_V2SF:
28592 case V2DF_FTYPE_V2DF_V2DF:
28593 case V2DF_FTYPE_V2DF_V4SF:
28594 case V2DF_FTYPE_V2DF_V2DI:
28595 case V2DF_FTYPE_V2DF_DI:
28596 case V2DF_FTYPE_V2DF_SI:
28597 case V2SF_FTYPE_V2SF_V2SF:
28598 case V1DI_FTYPE_V1DI_V1DI:
28599 case V1DI_FTYPE_V8QI_V8QI:
28600 case V1DI_FTYPE_V2SI_V2SI:
28601 case V32QI_FTYPE_V16HI_V16HI:
28602 case V16HI_FTYPE_V8SI_V8SI:
28603 case V32QI_FTYPE_V32QI_V32QI:
28604 case V16HI_FTYPE_V32QI_V32QI:
28605 case V16HI_FTYPE_V16HI_V16HI:
28606 case V8SI_FTYPE_V4DF_V4DF:
28607 case V8SI_FTYPE_V8SI_V8SI:
28608 case V8SI_FTYPE_V16HI_V16HI:
28609 case V4DI_FTYPE_V4DI_V4DI:
28610 case V4DI_FTYPE_V8SI_V8SI:
28611 if (comparison == UNKNOWN)
28612 return ix86_expand_binop_builtin (icode, exp, target);
28613 nargs = 2;
28614 break;
28615 case V4SF_FTYPE_V4SF_V4SF_SWAP:
28616 case V2DF_FTYPE_V2DF_V2DF_SWAP:
28617 gcc_assert (comparison != UNKNOWN);
28618 nargs = 2;
28619 swap = true;
28620 break;
28621 case V16HI_FTYPE_V16HI_V8HI_COUNT:
28622 case V16HI_FTYPE_V16HI_SI_COUNT:
28623 case V8SI_FTYPE_V8SI_V4SI_COUNT:
28624 case V8SI_FTYPE_V8SI_SI_COUNT:
28625 case V4DI_FTYPE_V4DI_V2DI_COUNT:
28626 case V4DI_FTYPE_V4DI_INT_COUNT:
28627 case V8HI_FTYPE_V8HI_V8HI_COUNT:
28628 case V8HI_FTYPE_V8HI_SI_COUNT:
28629 case V4SI_FTYPE_V4SI_V4SI_COUNT:
28630 case V4SI_FTYPE_V4SI_SI_COUNT:
28631 case V4HI_FTYPE_V4HI_V4HI_COUNT:
28632 case V4HI_FTYPE_V4HI_SI_COUNT:
28633 case V2DI_FTYPE_V2DI_V2DI_COUNT:
28634 case V2DI_FTYPE_V2DI_SI_COUNT:
28635 case V2SI_FTYPE_V2SI_V2SI_COUNT:
28636 case V2SI_FTYPE_V2SI_SI_COUNT:
28637 case V1DI_FTYPE_V1DI_V1DI_COUNT:
28638 case V1DI_FTYPE_V1DI_SI_COUNT:
28639 nargs = 2;
28640 last_arg_count = true;
28641 break;
28642 case UINT64_FTYPE_UINT64_UINT64:
28643 case UINT_FTYPE_UINT_UINT:
28644 case UINT_FTYPE_UINT_USHORT:
28645 case UINT_FTYPE_UINT_UCHAR:
28646 case UINT16_FTYPE_UINT16_INT:
28647 case UINT8_FTYPE_UINT8_INT:
28648 nargs = 2;
28649 break;
28650 case V2DI_FTYPE_V2DI_INT_CONVERT:
28651 nargs = 2;
28652 rmode = V1TImode;
28653 nargs_constant = 1;
28654 break;
28655 case V4DI_FTYPE_V4DI_INT_CONVERT:
28656 nargs = 2;
28657 rmode = V2TImode;
28658 nargs_constant = 1;
28659 break;
28660 case V8HI_FTYPE_V8HI_INT:
28661 case V8HI_FTYPE_V8SF_INT:
28662 case V8HI_FTYPE_V4SF_INT:
28663 case V8SF_FTYPE_V8SF_INT:
28664 case V4SI_FTYPE_V4SI_INT:
28665 case V4SI_FTYPE_V8SI_INT:
28666 case V4HI_FTYPE_V4HI_INT:
28667 case V4DF_FTYPE_V4DF_INT:
28668 case V4SF_FTYPE_V4SF_INT:
28669 case V4SF_FTYPE_V8SF_INT:
28670 case V2DI_FTYPE_V2DI_INT:
28671 case V2DF_FTYPE_V2DF_INT:
28672 case V2DF_FTYPE_V4DF_INT:
28673 case V16HI_FTYPE_V16HI_INT:
28674 case V8SI_FTYPE_V8SI_INT:
28675 case V4DI_FTYPE_V4DI_INT:
28676 case V2DI_FTYPE_V4DI_INT:
28677 nargs = 2;
28678 nargs_constant = 1;
28679 break;
28680 case V16QI_FTYPE_V16QI_V16QI_V16QI:
28681 case V8SF_FTYPE_V8SF_V8SF_V8SF:
28682 case V4DF_FTYPE_V4DF_V4DF_V4DF:
28683 case V4SF_FTYPE_V4SF_V4SF_V4SF:
28684 case V2DF_FTYPE_V2DF_V2DF_V2DF:
28685 case V32QI_FTYPE_V32QI_V32QI_V32QI:
28686 nargs = 3;
28687 break;
28688 case V32QI_FTYPE_V32QI_V32QI_INT:
28689 case V16HI_FTYPE_V16HI_V16HI_INT:
28690 case V16QI_FTYPE_V16QI_V16QI_INT:
28691 case V4DI_FTYPE_V4DI_V4DI_INT:
28692 case V8HI_FTYPE_V8HI_V8HI_INT:
28693 case V8SI_FTYPE_V8SI_V8SI_INT:
28694 case V8SI_FTYPE_V8SI_V4SI_INT:
28695 case V8SF_FTYPE_V8SF_V8SF_INT:
28696 case V8SF_FTYPE_V8SF_V4SF_INT:
28697 case V4SI_FTYPE_V4SI_V4SI_INT:
28698 case V4DF_FTYPE_V4DF_V4DF_INT:
28699 case V4DF_FTYPE_V4DF_V2DF_INT:
28700 case V4SF_FTYPE_V4SF_V4SF_INT:
28701 case V2DI_FTYPE_V2DI_V2DI_INT:
28702 case V4DI_FTYPE_V4DI_V2DI_INT:
28703 case V2DF_FTYPE_V2DF_V2DF_INT:
28704 nargs = 3;
28705 nargs_constant = 1;
28706 break;
28707 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
28708 nargs = 3;
28709 rmode = V4DImode;
28710 nargs_constant = 1;
28711 break;
28712 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
28713 nargs = 3;
28714 rmode = V2DImode;
28715 nargs_constant = 1;
28716 break;
28717 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
28718 nargs = 3;
28719 rmode = DImode;
28720 nargs_constant = 1;
28721 break;
28722 case V2DI_FTYPE_V2DI_UINT_UINT:
28723 nargs = 3;
28724 nargs_constant = 2;
28725 break;
28726 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
28727 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
28728 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
28729 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
28730 nargs = 4;
28731 nargs_constant = 1;
28732 break;
28733 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
28734 nargs = 4;
28735 nargs_constant = 2;
28736 break;
28737 default:
28738 gcc_unreachable ();
28739 }
28740
28741 gcc_assert (nargs <= ARRAY_SIZE (args));
28742
28743 if (comparison != UNKNOWN)
28744 {
28745 gcc_assert (nargs == 2);
28746 return ix86_expand_sse_compare (d, exp, target, swap);
28747 }
28748
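  /* For the *_CONVERT cases the pattern computes its result in a mode
     (TMODE) other than the one we want to hand back; allocate the result
     register in RMODE and give the pattern a TMODE view of it.  */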
28749 if (rmode == VOIDmode || rmode == tmode)
28750 {
28751 if (optimize
28752 || target == 0
28753 || GET_MODE (target) != tmode
28754 || !insn_p->operand[0].predicate (target, tmode))
28755 target = gen_reg_rtx (tmode);
28756 real_target = target;
28757 }
28758 else
28759 {
28760 target = gen_reg_rtx (rmode);
28761 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
28762 }
28763
28764 for (i = 0; i < nargs; i++)
28765 {
28766 tree arg = CALL_EXPR_ARG (exp, i);
28767 rtx op = expand_normal (arg);
28768 enum machine_mode mode = insn_p->operand[i + 1].mode;
28769 bool match = insn_p->operand[i + 1].predicate (op, mode);
28770
28771 if (last_arg_count && (i + 1) == nargs)
28772 {
28773 /* SIMD shift insns take either an 8-bit immediate or a
28774 register as the count. But builtin functions take int as
28775 the count. If the count doesn't match, we put it in a register. */
28776 if (!match)
28777 {
28778 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
28779 if (!insn_p->operand[i + 1].predicate (op, mode))
28780 op = copy_to_reg (op);
28781 }
28782 }
28783 else if ((nargs - i) <= nargs_constant)
28784 {
28785 if (!match)
28786 switch (icode)
28787 {
28788 case CODE_FOR_avx2_inserti128:
28789 case CODE_FOR_avx2_extracti128:
28790 error ("the last argument must be an 1-bit immediate");
28791 return const0_rtx;
28792
28793 case CODE_FOR_sse4_1_roundsd:
28794 case CODE_FOR_sse4_1_roundss:
28795
28796 case CODE_FOR_sse4_1_roundpd:
28797 case CODE_FOR_sse4_1_roundps:
28798 case CODE_FOR_avx_roundpd256:
28799 case CODE_FOR_avx_roundps256:
28800
28801 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
28802 case CODE_FOR_sse4_1_roundps_sfix:
28803 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
28804 case CODE_FOR_avx_roundps_sfix256:
28805
28806 case CODE_FOR_sse4_1_blendps:
28807 case CODE_FOR_avx_blendpd256:
28808 case CODE_FOR_avx_vpermilv4df:
28809 error ("the last argument must be a 4-bit immediate");
28810 return const0_rtx;
28811
28812 case CODE_FOR_sse4_1_blendpd:
28813 case CODE_FOR_avx_vpermilv2df:
28814 case CODE_FOR_xop_vpermil2v2df3:
28815 case CODE_FOR_xop_vpermil2v4sf3:
28816 case CODE_FOR_xop_vpermil2v4df3:
28817 case CODE_FOR_xop_vpermil2v8sf3:
28818 error ("the last argument must be a 2-bit immediate");
28819 return const0_rtx;
28820
28821 case CODE_FOR_avx_vextractf128v4df:
28822 case CODE_FOR_avx_vextractf128v8sf:
28823 case CODE_FOR_avx_vextractf128v8si:
28824 case CODE_FOR_avx_vinsertf128v4df:
28825 case CODE_FOR_avx_vinsertf128v8sf:
28826 case CODE_FOR_avx_vinsertf128v8si:
28827 error ("the last argument must be a 1-bit immediate");
28828 return const0_rtx;
28829
28830 case CODE_FOR_avx_vmcmpv2df3:
28831 case CODE_FOR_avx_vmcmpv4sf3:
28832 case CODE_FOR_avx_cmpv2df3:
28833 case CODE_FOR_avx_cmpv4sf3:
28834 case CODE_FOR_avx_cmpv4df3:
28835 case CODE_FOR_avx_cmpv8sf3:
28836 error ("the last argument must be a 5-bit immediate");
28837 return const0_rtx;
28838
28839 default:
28840 switch (nargs_constant)
28841 {
28842 case 2:
28843 if ((nargs - i) == nargs_constant)
28844 {
28845 error ("the next to last argument must be an 8-bit immediate");
28846 break;
28847 }
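		  /* FALLTHRU */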
28848 case 1:
28849 error ("the last argument must be an 8-bit immediate");
28850 break;
28851 default:
28852 gcc_unreachable ();
28853 }
28854 return const0_rtx;
28855 }
28856 }
28857 else
28858 {
28859 if (VECTOR_MODE_P (mode))
28860 op = safe_vector_operand (op, mode);
28861
28862 /* If we aren't optimizing, only allow one memory operand to
28863 be generated. */
28864 if (memory_operand (op, mode))
28865 num_memory++;
28866
28867 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
28868 {
28869 if (optimize || !match || num_memory > 1)
28870 op = copy_to_mode_reg (mode, op);
28871 }
28872 else
28873 {
28874 op = copy_to_reg (op);
28875 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
28876 }
28877 }
28878
28879 args[i].op = op;
28880 args[i].mode = mode;
28881 }
28882
28883 switch (nargs)
28884 {
28885 case 1:
28886 pat = GEN_FCN (icode) (real_target, args[0].op);
28887 break;
28888 case 2:
28889 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
28890 break;
28891 case 3:
28892 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28893 args[2].op);
28894 break;
28895 case 4:
28896 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28897 args[2].op, args[3].op);
28898 break;
28899 default:
28900 gcc_unreachable ();
28901 }
28902
28903 if (! pat)
28904 return 0;
28905
28906 emit_insn (pat);
28907 return target;
28908 }
28909
28910 /* Subroutine of ix86_expand_builtin to take care of special insns
28911 with variable number of operands. */
28912
28913 static rtx
28914 ix86_expand_special_args_builtin (const struct builtin_description *d,
28915 tree exp, rtx target)
28916 {
28917 tree arg;
28918 rtx pat, op;
28919 unsigned int i, nargs, arg_adjust, memory;
28920 struct
28921 {
28922 rtx op;
28923 enum machine_mode mode;
28924 } args[3];
28925 enum insn_code icode = d->icode;
28926 bool last_arg_constant = false;
28927 const struct insn_data_d *insn_p = &insn_data[icode];
28928 enum machine_mode tmode = insn_p->operand[0].mode;
28929 enum { load, store } klass;
28930
28931 switch ((enum ix86_builtin_func_type) d->flag)
28932 {
28933 case VOID_FTYPE_VOID:
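      /* The explicit vzeroupper intrinsic is tagged with the
	 vzeroupper_intrinsic marker so later processing can tell it
	 apart from compiler-generated vzeroupper insns.  */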
28934 if (icode == CODE_FOR_avx_vzeroupper)
28935 target = GEN_INT (vzeroupper_intrinsic);
28936 emit_insn (GEN_FCN (icode) (target));
28937 return 0;
28938 case VOID_FTYPE_UINT64:
28939 case VOID_FTYPE_UNSIGNED:
28940 nargs = 0;
28941 klass = store;
28942 memory = 0;
28943 break;
28944
28945 case INT_FTYPE_VOID:
28946 case UINT64_FTYPE_VOID:
28947 case UNSIGNED_FTYPE_VOID:
28948 nargs = 0;
28949 klass = load;
28950 memory = 0;
28951 break;
28952 case UINT64_FTYPE_PUNSIGNED:
28953 case V2DI_FTYPE_PV2DI:
28954 case V4DI_FTYPE_PV4DI:
28955 case V32QI_FTYPE_PCCHAR:
28956 case V16QI_FTYPE_PCCHAR:
28957 case V8SF_FTYPE_PCV4SF:
28958 case V8SF_FTYPE_PCFLOAT:
28959 case V4SF_FTYPE_PCFLOAT:
28960 case V4DF_FTYPE_PCV2DF:
28961 case V4DF_FTYPE_PCDOUBLE:
28962 case V2DF_FTYPE_PCDOUBLE:
28963 case VOID_FTYPE_PVOID:
28964 nargs = 1;
28965 klass = load;
28966 memory = 0;
28967 break;
28968 case VOID_FTYPE_PV2SF_V4SF:
28969 case VOID_FTYPE_PV4DI_V4DI:
28970 case VOID_FTYPE_PV2DI_V2DI:
28971 case VOID_FTYPE_PCHAR_V32QI:
28972 case VOID_FTYPE_PCHAR_V16QI:
28973 case VOID_FTYPE_PFLOAT_V8SF:
28974 case VOID_FTYPE_PFLOAT_V4SF:
28975 case VOID_FTYPE_PDOUBLE_V4DF:
28976 case VOID_FTYPE_PDOUBLE_V2DF:
28977 case VOID_FTYPE_PLONGLONG_LONGLONG:
28978 case VOID_FTYPE_PULONGLONG_ULONGLONG:
28979 case VOID_FTYPE_PINT_INT:
28980 nargs = 1;
28981 klass = store;
28982 /* Reserve memory operand for target. */
28983 memory = ARRAY_SIZE (args);
28984 break;
28985 case V4SF_FTYPE_V4SF_PCV2SF:
28986 case V2DF_FTYPE_V2DF_PCDOUBLE:
28987 nargs = 2;
28988 klass = load;
28989 memory = 1;
28990 break;
28991 case V8SF_FTYPE_PCV8SF_V8SI:
28992 case V4DF_FTYPE_PCV4DF_V4DI:
28993 case V4SF_FTYPE_PCV4SF_V4SI:
28994 case V2DF_FTYPE_PCV2DF_V2DI:
28995 case V8SI_FTYPE_PCV8SI_V8SI:
28996 case V4DI_FTYPE_PCV4DI_V4DI:
28997 case V4SI_FTYPE_PCV4SI_V4SI:
28998 case V2DI_FTYPE_PCV2DI_V2DI:
28999 nargs = 2;
29000 klass = load;
29001 memory = 0;
29002 break;
29003 case VOID_FTYPE_PV8SF_V8SI_V8SF:
29004 case VOID_FTYPE_PV4DF_V4DI_V4DF:
29005 case VOID_FTYPE_PV4SF_V4SI_V4SF:
29006 case VOID_FTYPE_PV2DF_V2DI_V2DF:
29007 case VOID_FTYPE_PV8SI_V8SI_V8SI:
29008 case VOID_FTYPE_PV4DI_V4DI_V4DI:
29009 case VOID_FTYPE_PV4SI_V4SI_V4SI:
29010 case VOID_FTYPE_PV2DI_V2DI_V2DI:
29011 nargs = 2;
29012 klass = store;
29013 /* Reserve memory operand for target. */
29014 memory = ARRAY_SIZE (args);
29015 break;
29016 case VOID_FTYPE_UINT_UINT_UINT:
29017 case VOID_FTYPE_UINT64_UINT_UINT:
29018 case UCHAR_FTYPE_UINT_UINT_UINT:
29019 case UCHAR_FTYPE_UINT64_UINT_UINT:
29020 nargs = 3;
29021 klass = load;
29022 memory = ARRAY_SIZE (args);
29023 last_arg_constant = true;
29024 break;
29025 default:
29026 gcc_unreachable ();
29027 }
29028
29029 gcc_assert (nargs <= ARRAY_SIZE (args));
29030
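  /* For the store forms the first argument is the destination; it becomes
     operand 0 of the insn, as a MEM when the memory operand is operand 0
     and as a register otherwise.  The remaining arguments are shifted by
     ARG_ADJUST.  */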
29031 if (klass == store)
29032 {
29033 arg = CALL_EXPR_ARG (exp, 0);
29034 op = expand_normal (arg);
29035 gcc_assert (target == 0);
29036 if (memory)
29037 {
29038 if (GET_MODE (op) != Pmode)
29039 op = convert_to_mode (Pmode, op, 1);
29040 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
29041 }
29042 else
29043 target = force_reg (tmode, op);
29044 arg_adjust = 1;
29045 }
29046 else
29047 {
29048 arg_adjust = 0;
29049 if (optimize
29050 || target == 0
29051 || GET_MODE (target) != tmode
29052 || !insn_p->operand[0].predicate (target, tmode))
29053 target = gen_reg_rtx (tmode);
29054 }
29055
29056 for (i = 0; i < nargs; i++)
29057 {
29058 enum machine_mode mode = insn_p->operand[i + 1].mode;
29059 bool match;
29060
29061 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
29062 op = expand_normal (arg);
29063 match = insn_p->operand[i + 1].predicate (op, mode);
29064
29065 if (last_arg_constant && (i + 1) == nargs)
29066 {
29067 if (!match)
29068 {
29069 if (icode == CODE_FOR_lwp_lwpvalsi3
29070 || icode == CODE_FOR_lwp_lwpinssi3
29071 || icode == CODE_FOR_lwp_lwpvaldi3
29072 || icode == CODE_FOR_lwp_lwpinsdi3)
29073 error ("the last argument must be a 32-bit immediate");
29074 else
29075 error ("the last argument must be an 8-bit immediate");
29076 return const0_rtx;
29077 }
29078 }
29079 else
29080 {
29081 if (i == memory)
29082 {
29083 /* This must be the memory operand. */
29084 if (GET_MODE (op) != Pmode)
29085 op = convert_to_mode (Pmode, op, 1);
29086 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
29087 gcc_assert (GET_MODE (op) == mode
29088 || GET_MODE (op) == VOIDmode);
29089 }
29090 else
29091 {
29092 /* This must be a register operand. */
29093 if (VECTOR_MODE_P (mode))
29094 op = safe_vector_operand (op, mode);
29095
29096 gcc_assert (GET_MODE (op) == mode
29097 || GET_MODE (op) == VOIDmode);
29098 op = copy_to_mode_reg (mode, op);
29099 }
29100 }
29101
29102 args[i].op = op;
29103 args[i].mode = mode;
29104 }
29105
29106 switch (nargs)
29107 {
29108 case 0:
29109 pat = GEN_FCN (icode) (target);
29110 break;
29111 case 1:
29112 pat = GEN_FCN (icode) (target, args[0].op);
29113 break;
29114 case 2:
29115 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29116 break;
29117 case 3:
29118 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29119 break;
29120 default:
29121 gcc_unreachable ();
29122 }
29123
29124 if (! pat)
29125 return 0;
29126 emit_insn (pat);
29127 return klass == store ? 0 : target;
29128 }
29129
29130 /* Return the integer constant in ARG. Constrain it to be in the range
29131 of the subparts of VEC_TYPE; issue an error if not. */
29132
29133 static int
29134 get_element_number (tree vec_type, tree arg)
29135 {
29136 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
29137
29138 if (!host_integerp (arg, 1)
29139 || (elt = tree_low_cst (arg, 1), elt > max))
29140 {
29141 error ("selector must be an integer constant in the range 0..%wi", max);
29142 return 0;
29143 }
29144
29145 return elt;
29146 }
29147
29148 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29149 ix86_expand_vector_init. We DO have language-level syntax for this, in
29150 the form of (type){ init-list }. Except that since we can't place emms
29151 instructions from inside the compiler, we can't allow the use of MMX
29152 registers unless the user explicitly asks for it. So we do *not* define
29153 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
29154 we have builtins invoked by mmintrin.h that give us license to emit
29155 these sorts of instructions. */
29156
29157 static rtx
29158 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29159 {
29160 enum machine_mode tmode = TYPE_MODE (type);
29161 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29162 int i, n_elt = GET_MODE_NUNITS (tmode);
29163 rtvec v = rtvec_alloc (n_elt);
29164
29165 gcc_assert (VECTOR_MODE_P (tmode));
29166 gcc_assert (call_expr_nargs (exp) == n_elt);
29167
29168 for (i = 0; i < n_elt; ++i)
29169 {
29170 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29171 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29172 }
29173
29174 if (!target || !register_operand (target, tmode))
29175 target = gen_reg_rtx (tmode);
29176
29177 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29178 return target;
29179 }
29180
29181 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29182 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
29183 had a language-level syntax for referencing vector elements. */
29184
29185 static rtx
29186 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29187 {
29188 enum machine_mode tmode, mode0;
29189 tree arg0, arg1;
29190 int elt;
29191 rtx op0;
29192
29193 arg0 = CALL_EXPR_ARG (exp, 0);
29194 arg1 = CALL_EXPR_ARG (exp, 1);
29195
29196 op0 = expand_normal (arg0);
29197 elt = get_element_number (TREE_TYPE (arg0), arg1);
29198
29199 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29200 mode0 = TYPE_MODE (TREE_TYPE (arg0));
29201 gcc_assert (VECTOR_MODE_P (mode0));
29202
29203 op0 = force_reg (mode0, op0);
29204
29205 if (optimize || !target || !register_operand (target, tmode))
29206 target = gen_reg_rtx (tmode);
29207
29208 ix86_expand_vector_extract (true, target, op0, elt);
29209
29210 return target;
29211 }
29212
29213 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29214 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
29215 a language-level syntax for referencing vector elements. */
29216
29217 static rtx
29218 ix86_expand_vec_set_builtin (tree exp)
29219 {
29220 enum machine_mode tmode, mode1;
29221 tree arg0, arg1, arg2;
29222 int elt;
29223 rtx op0, op1, target;
29224
29225 arg0 = CALL_EXPR_ARG (exp, 0);
29226 arg1 = CALL_EXPR_ARG (exp, 1);
29227 arg2 = CALL_EXPR_ARG (exp, 2);
29228
29229 tmode = TYPE_MODE (TREE_TYPE (arg0));
29230 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29231 gcc_assert (VECTOR_MODE_P (tmode));
29232
29233 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29234 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29235 elt = get_element_number (TREE_TYPE (arg0), arg2);
29236
29237 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29238 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29239
29240 op0 = force_reg (tmode, op0);
29241 op1 = force_reg (mode1, op1);
29242
29243 /* OP0 is the source of these builtin functions and shouldn't be
29244 modified. Create a copy, use it and return it as target. */
29245 target = gen_reg_rtx (tmode);
29246 emit_move_insn (target, op0);
29247 ix86_expand_vector_set (true, target, op1, elt);
29248
29249 return target;
29250 }
29251
29252 /* Expand an expression EXP that calls a built-in function,
29253 with result going to TARGET if that's convenient
29254 (and in mode MODE if that's convenient).
29255 SUBTARGET may be used as the target for computing one of EXP's operands.
29256 IGNORE is nonzero if the value is to be ignored. */
29257
29258 static rtx
29259 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29260 enum machine_mode mode ATTRIBUTE_UNUSED,
29261 int ignore ATTRIBUTE_UNUSED)
29262 {
29263 const struct builtin_description *d;
29264 size_t i;
29265 enum insn_code icode;
29266 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29267 tree arg0, arg1, arg2, arg3, arg4;
29268 rtx op0, op1, op2, op3, op4, pat;
29269 enum machine_mode mode0, mode1, mode2, mode3, mode4;
29270 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29271
29272 /* Determine whether the builtin function is available under the current ISA.
29273 Originally the builtin was not created if it wasn't applicable to the
29274 current ISA based on the command line switches. With function specific
29275 options, we need to check in the context of the function making the call
29276 whether it is supported. */
29277 if (ix86_builtins_isa[fcode].isa
29278 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
29279 {
29280 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
29281 NULL, (enum fpmath_unit) 0, false);
29282
29283 if (!opts)
29284 error ("%qE needs unknown isa option", fndecl);
29285 else
29286 {
29287 gcc_assert (opts != NULL);
29288 error ("%qE needs isa option %s", fndecl, opts);
29289 free (opts);
29290 }
29291 return const0_rtx;
29292 }
29293
29294 switch (fcode)
29295 {
29296 case IX86_BUILTIN_MASKMOVQ:
29297 case IX86_BUILTIN_MASKMOVDQU:
29298 icode = (fcode == IX86_BUILTIN_MASKMOVQ
29299 ? CODE_FOR_mmx_maskmovq
29300 : CODE_FOR_sse2_maskmovdqu);
29301 /* Note the arg order is different from the operand order. */
29302 arg1 = CALL_EXPR_ARG (exp, 0);
29303 arg2 = CALL_EXPR_ARG (exp, 1);
29304 arg0 = CALL_EXPR_ARG (exp, 2);
29305 op0 = expand_normal (arg0);
29306 op1 = expand_normal (arg1);
29307 op2 = expand_normal (arg2);
29308 mode0 = insn_data[icode].operand[0].mode;
29309 mode1 = insn_data[icode].operand[1].mode;
29310 mode2 = insn_data[icode].operand[2].mode;
29311
29312 if (GET_MODE (op0) != Pmode)
29313 op0 = convert_to_mode (Pmode, op0, 1);
29314 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
29315
29316 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29317 op0 = copy_to_mode_reg (mode0, op0);
29318 if (!insn_data[icode].operand[1].predicate (op1, mode1))
29319 op1 = copy_to_mode_reg (mode1, op1);
29320 if (!insn_data[icode].operand[2].predicate (op2, mode2))
29321 op2 = copy_to_mode_reg (mode2, op2);
29322 pat = GEN_FCN (icode) (op0, op1, op2);
29323 if (! pat)
29324 return 0;
29325 emit_insn (pat);
29326 return 0;
29327
29328 case IX86_BUILTIN_LDMXCSR:
29329 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29330 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29331 emit_move_insn (target, op0);
29332 emit_insn (gen_sse_ldmxcsr (target));
29333 return 0;
29334
29335 case IX86_BUILTIN_STMXCSR:
29336 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29337 emit_insn (gen_sse_stmxcsr (target));
29338 return copy_to_mode_reg (SImode, target);
29339
29340 case IX86_BUILTIN_CLFLUSH:
29341 arg0 = CALL_EXPR_ARG (exp, 0);
29342 op0 = expand_normal (arg0);
29343 icode = CODE_FOR_sse2_clflush;
29344 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29345 {
29346 if (GET_MODE (op0) != Pmode)
29347 op0 = convert_to_mode (Pmode, op0, 1);
29348 op0 = force_reg (Pmode, op0);
29349 }
29350
29351 emit_insn (gen_sse2_clflush (op0));
29352 return 0;
29353
29354 case IX86_BUILTIN_MONITOR:
29355 arg0 = CALL_EXPR_ARG (exp, 0);
29356 arg1 = CALL_EXPR_ARG (exp, 1);
29357 arg2 = CALL_EXPR_ARG (exp, 2);
29358 op0 = expand_normal (arg0);
29359 op1 = expand_normal (arg1);
29360 op2 = expand_normal (arg2);
29361 if (!REG_P (op0))
29362 {
29363 if (GET_MODE (op0) != Pmode)
29364 op0 = convert_to_mode (Pmode, op0, 1);
29365 op0 = force_reg (Pmode, op0);
29366 }
29367 if (!REG_P (op1))
29368 op1 = copy_to_mode_reg (SImode, op1);
29369 if (!REG_P (op2))
29370 op2 = copy_to_mode_reg (SImode, op2);
29371 emit_insn (ix86_gen_monitor (op0, op1, op2));
29372 return 0;
29373
29374 case IX86_BUILTIN_MWAIT:
29375 arg0 = CALL_EXPR_ARG (exp, 0);
29376 arg1 = CALL_EXPR_ARG (exp, 1);
29377 op0 = expand_normal (arg0);
29378 op1 = expand_normal (arg1);
29379 if (!REG_P (op0))
29380 op0 = copy_to_mode_reg (SImode, op0);
29381 if (!REG_P (op1))
29382 op1 = copy_to_mode_reg (SImode, op1);
29383 emit_insn (gen_sse3_mwait (op0, op1));
29384 return 0;
29385
29386 case IX86_BUILTIN_VEC_INIT_V2SI:
29387 case IX86_BUILTIN_VEC_INIT_V4HI:
29388 case IX86_BUILTIN_VEC_INIT_V8QI:
29389 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29390
29391 case IX86_BUILTIN_VEC_EXT_V2DF:
29392 case IX86_BUILTIN_VEC_EXT_V2DI:
29393 case IX86_BUILTIN_VEC_EXT_V4SF:
29394 case IX86_BUILTIN_VEC_EXT_V4SI:
29395 case IX86_BUILTIN_VEC_EXT_V8HI:
29396 case IX86_BUILTIN_VEC_EXT_V2SI:
29397 case IX86_BUILTIN_VEC_EXT_V4HI:
29398 case IX86_BUILTIN_VEC_EXT_V16QI:
29399 return ix86_expand_vec_ext_builtin (exp, target);
29400
29401 case IX86_BUILTIN_VEC_SET_V2DI:
29402 case IX86_BUILTIN_VEC_SET_V4SF:
29403 case IX86_BUILTIN_VEC_SET_V4SI:
29404 case IX86_BUILTIN_VEC_SET_V8HI:
29405 case IX86_BUILTIN_VEC_SET_V4HI:
29406 case IX86_BUILTIN_VEC_SET_V16QI:
29407 return ix86_expand_vec_set_builtin (exp);
29408
29409 case IX86_BUILTIN_INFQ:
29410 case IX86_BUILTIN_HUGE_VALQ:
29411 {
29412 REAL_VALUE_TYPE inf;
29413 rtx tmp;
29414
29415 real_inf (&inf);
29416 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29417
29418 tmp = validize_mem (force_const_mem (mode, tmp));
29419
29420 if (target == 0)
29421 target = gen_reg_rtx (mode);
29422
29423 emit_move_insn (target, tmp);
29424 return target;
29425 }
29426
29427 case IX86_BUILTIN_LLWPCB:
29428 arg0 = CALL_EXPR_ARG (exp, 0);
29429 op0 = expand_normal (arg0);
29430 icode = CODE_FOR_lwp_llwpcb;
29431 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29432 {
29433 if (GET_MODE (op0) != Pmode)
29434 op0 = convert_to_mode (Pmode, op0, 1);
29435 op0 = force_reg (Pmode, op0);
29436 }
29437 emit_insn (gen_lwp_llwpcb (op0));
29438 return 0;
29439
29440 case IX86_BUILTIN_SLWPCB:
29441 icode = CODE_FOR_lwp_slwpcb;
29442 if (!target
29443 || !insn_data[icode].operand[0].predicate (target, Pmode))
29444 target = gen_reg_rtx (Pmode);
29445 emit_insn (gen_lwp_slwpcb (target));
29446 return target;
29447
29448 case IX86_BUILTIN_BEXTRI32:
29449 case IX86_BUILTIN_BEXTRI64:
29450 arg0 = CALL_EXPR_ARG (exp, 0);
29451 arg1 = CALL_EXPR_ARG (exp, 1);
29452 op0 = expand_normal (arg0);
29453 op1 = expand_normal (arg1);
29454 icode = (fcode == IX86_BUILTIN_BEXTRI32
29455 ? CODE_FOR_tbm_bextri_si
29456 : CODE_FOR_tbm_bextri_di);
29457 if (!CONST_INT_P (op1))
29458 {
29459 error ("last argument must be an immediate");
29460 return const0_rtx;
29461 }
29462 else
29463 {
29464 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29465 unsigned char lsb_index = INTVAL (op1) & 0xFF;
29466 op1 = GEN_INT (length);
29467 op2 = GEN_INT (lsb_index);
29468 pat = GEN_FCN (icode) (target, op0, op1, op2);
29469 if (pat)
29470 emit_insn (pat);
29471 return target;
29472 }
29473
29474 case IX86_BUILTIN_RDRAND16_STEP:
29475 icode = CODE_FOR_rdrandhi_1;
29476 mode0 = HImode;
29477 goto rdrand_step;
29478
29479 case IX86_BUILTIN_RDRAND32_STEP:
29480 icode = CODE_FOR_rdrandsi_1;
29481 mode0 = SImode;
29482 goto rdrand_step;
29483
29484 case IX86_BUILTIN_RDRAND64_STEP:
29485 icode = CODE_FOR_rdranddi_1;
29486 mode0 = DImode;
29487
29488 rdrand_step:
29489 op0 = gen_reg_rtx (mode0);
29490 emit_insn (GEN_FCN (icode) (op0));
29491
29492 arg0 = CALL_EXPR_ARG (exp, 0);
29493 op1 = expand_normal (arg0);
29494 if (!address_operand (op1, VOIDmode))
29495 {
29496 op1 = convert_memory_address (Pmode, op1);
29497 op1 = copy_addr_to_reg (op1);
29498 }
29499 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
29500
29501 op1 = gen_reg_rtx (SImode);
29502 emit_move_insn (op1, CONST1_RTX (SImode));
29503
29504 /* Emit SImode conditional move. */
29505 if (mode0 == HImode)
29506 {
29507 op2 = gen_reg_rtx (SImode);
29508 emit_insn (gen_zero_extendhisi2 (op2, op0));
29509 }
29510 else if (mode0 == SImode)
29511 op2 = op0;
29512 else
29513 op2 = gen_rtx_SUBREG (SImode, op0, 0);
29514
29515 if (target == 0)
29516 target = gen_reg_rtx (SImode);
29517
29518 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
29519 const0_rtx);
29520 emit_insn (gen_rtx_SET (VOIDmode, target,
29521 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
29522 return target;
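/* A sketch of what the rdrand_step sequence above implements at the
source level; the wrapper shown is modelled on immintrin.h and is
illustrative only:

int
_rdrand32_step (unsigned int *__P)
{
return __builtin_ia32_rdrand32_step (__P);
}

The builtin stores the random value through the pointer argument and
returns nonzero on success and zero on failure, which is what the
carry-flag based conditional move above materializes.  */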
29523
29524 case IX86_BUILTIN_GATHERSIV2DF:
29525 icode = CODE_FOR_avx2_gathersiv2df;
29526 goto gather_gen;
29527 case IX86_BUILTIN_GATHERSIV4DF:
29528 icode = CODE_FOR_avx2_gathersiv4df;
29529 goto gather_gen;
29530 case IX86_BUILTIN_GATHERDIV2DF:
29531 icode = CODE_FOR_avx2_gatherdiv2df;
29532 goto gather_gen;
29533 case IX86_BUILTIN_GATHERDIV4DF:
29534 icode = CODE_FOR_avx2_gatherdiv4df;
29535 goto gather_gen;
29536 case IX86_BUILTIN_GATHERSIV4SF:
29537 icode = CODE_FOR_avx2_gathersiv4sf;
29538 goto gather_gen;
29539 case IX86_BUILTIN_GATHERSIV8SF:
29540 icode = CODE_FOR_avx2_gathersiv8sf;
29541 goto gather_gen;
29542 case IX86_BUILTIN_GATHERDIV4SF:
29543 icode = CODE_FOR_avx2_gatherdiv4sf;
29544 goto gather_gen;
29545 case IX86_BUILTIN_GATHERDIV8SF:
29546 icode = CODE_FOR_avx2_gatherdiv8sf;
29547 goto gather_gen;
29548 case IX86_BUILTIN_GATHERSIV2DI:
29549 icode = CODE_FOR_avx2_gathersiv2di;
29550 goto gather_gen;
29551 case IX86_BUILTIN_GATHERSIV4DI:
29552 icode = CODE_FOR_avx2_gathersiv4di;
29553 goto gather_gen;
29554 case IX86_BUILTIN_GATHERDIV2DI:
29555 icode = CODE_FOR_avx2_gatherdiv2di;
29556 goto gather_gen;
29557 case IX86_BUILTIN_GATHERDIV4DI:
29558 icode = CODE_FOR_avx2_gatherdiv4di;
29559 goto gather_gen;
29560 case IX86_BUILTIN_GATHERSIV4SI:
29561 icode = CODE_FOR_avx2_gathersiv4si;
29562 goto gather_gen;
29563 case IX86_BUILTIN_GATHERSIV8SI:
29564 icode = CODE_FOR_avx2_gathersiv8si;
29565 goto gather_gen;
29566 case IX86_BUILTIN_GATHERDIV4SI:
29567 icode = CODE_FOR_avx2_gatherdiv4si;
29568 goto gather_gen;
29569 case IX86_BUILTIN_GATHERDIV8SI:
29570 icode = CODE_FOR_avx2_gatherdiv8si;
29571 goto gather_gen;
29572 case IX86_BUILTIN_GATHERALTSIV4DF:
29573 icode = CODE_FOR_avx2_gathersiv4df;
29574 goto gather_gen;
29575 case IX86_BUILTIN_GATHERALTDIV8SF:
29576 icode = CODE_FOR_avx2_gatherdiv8sf;
29577 goto gather_gen;
29578 case IX86_BUILTIN_GATHERALTSIV4DI:
29579 icode = CODE_FOR_avx2_gathersiv4di;
29580 goto gather_gen;
29581 case IX86_BUILTIN_GATHERALTDIV8SI:
29582 icode = CODE_FOR_avx2_gatherdiv8si;
29583 goto gather_gen;
29584
29585 gather_gen:
29586 arg0 = CALL_EXPR_ARG (exp, 0);
29587 arg1 = CALL_EXPR_ARG (exp, 1);
29588 arg2 = CALL_EXPR_ARG (exp, 2);
29589 arg3 = CALL_EXPR_ARG (exp, 3);
29590 arg4 = CALL_EXPR_ARG (exp, 4);
29591 op0 = expand_normal (arg0);
29592 op1 = expand_normal (arg1);
29593 op2 = expand_normal (arg2);
29594 op3 = expand_normal (arg3);
29595 op4 = expand_normal (arg4);
29596 /* Note the arg order is different from the operand order. */
29597 mode0 = insn_data[icode].operand[1].mode;
29598 mode2 = insn_data[icode].operand[3].mode;
29599 mode3 = insn_data[icode].operand[4].mode;
29600 mode4 = insn_data[icode].operand[5].mode;
29601
29602 if (target == NULL_RTX
29603 || GET_MODE (target) != insn_data[icode].operand[0].mode)
29604 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
29605 else
29606 subtarget = target;
29607
29608 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
29609 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
29610 {
29611 rtx half = gen_reg_rtx (V4SImode);
29612 if (!nonimmediate_operand (op2, V8SImode))
29613 op2 = copy_to_mode_reg (V8SImode, op2);
29614 emit_insn (gen_vec_extract_lo_v8si (half, op2));
29615 op2 = half;
29616 }
29617 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
29618 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
29619 {
29620 rtx (*gen) (rtx, rtx);
29621 rtx half = gen_reg_rtx (mode0);
29622 if (mode0 == V4SFmode)
29623 gen = gen_vec_extract_lo_v8sf;
29624 else
29625 gen = gen_vec_extract_lo_v8si;
29626 if (!nonimmediate_operand (op0, GET_MODE (op0)))
29627 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
29628 emit_insn (gen (half, op0));
29629 op0 = half;
29630 if (!nonimmediate_operand (op3, GET_MODE (op3)))
29631 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
29632 emit_insn (gen (half, op3));
29633 op3 = half;
29634 }
29635
29636 /* Force the memory operand to use only a base register here.  We
29637 don't want to do this to memory operands of other builtin
29638 functions. */
29639 if (GET_MODE (op1) != Pmode)
29640 op1 = convert_to_mode (Pmode, op1, 1);
29641 op1 = force_reg (Pmode, op1);
29642
29643 if (!insn_data[icode].operand[1].predicate (op0, mode0))
29644 op0 = copy_to_mode_reg (mode0, op0);
29645 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
29646 op1 = copy_to_mode_reg (Pmode, op1);
29647 if (!insn_data[icode].operand[3].predicate (op2, mode2))
29648 op2 = copy_to_mode_reg (mode2, op2);
29649 if (!insn_data[icode].operand[4].predicate (op3, mode3))
29650 op3 = copy_to_mode_reg (mode3, op3);
29651 if (!insn_data[icode].operand[5].predicate (op4, mode4))
29652 {
29653 error ("last argument must be scale 1, 2, 4, 8");
29654 return const0_rtx;
29655 }
29656
29657 /* Optimize. If mask is known to have all high bits set,
29658 replace op0 with pc_rtx to signal that the instruction
29659 overwrites the whole destination and doesn't use its
29660 previous contents. */
29661 if (optimize)
29662 {
29663 if (TREE_CODE (arg3) == VECTOR_CST)
29664 {
29665 unsigned int negative = 0;
29666 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
29667 {
29668 tree cst = VECTOR_CST_ELT (arg3, i);
29669 if (TREE_CODE (cst) == INTEGER_CST
29670 && tree_int_cst_sign_bit (cst))
29671 negative++;
29672 else if (TREE_CODE (cst) == REAL_CST
29673 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
29674 negative++;
29675 }
29676 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
29677 op0 = pc_rtx;
29678 }
29679 else if (TREE_CODE (arg3) == SSA_NAME)
29680 {
29681 /* Also recognize when the mask is like:
29682 __v2df src = _mm_setzero_pd ();
29683 __v2df mask = _mm_cmpeq_pd (src, src);
29684 or
29685 __v8sf src = _mm256_setzero_ps ();
29686 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
29687 as that is a cheaper way to load all ones into
29688 a register than having to load a constant from
29689 memory. */
29690 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
29691 if (is_gimple_call (def_stmt))
29692 {
29693 tree fndecl = gimple_call_fndecl (def_stmt);
29694 if (fndecl
29695 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29696 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
29697 {
29698 case IX86_BUILTIN_CMPPD:
29699 case IX86_BUILTIN_CMPPS:
29700 case IX86_BUILTIN_CMPPD256:
29701 case IX86_BUILTIN_CMPPS256:
29702 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
29703 break;
29704 /* FALLTHRU */
29705 case IX86_BUILTIN_CMPEQPD:
29706 case IX86_BUILTIN_CMPEQPS:
29707 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
29708 && initializer_zerop (gimple_call_arg (def_stmt,
29709 1)))
29710 op0 = pc_rtx;
29711 break;
29712 default:
29713 break;
29714 }
29715 }
29716 }
29717 }
29718
29719 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
29720 if (! pat)
29721 return const0_rtx;
29722 emit_insn (pat);
29723
29724 if (fcode == IX86_BUILTIN_GATHERDIV8SF
29725 || fcode == IX86_BUILTIN_GATHERDIV8SI)
29726 {
29727 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
29728 ? V4SFmode : V4SImode;
29729 if (target == NULL_RTX)
29730 target = gen_reg_rtx (tmode);
29731 if (tmode == V4SFmode)
29732 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
29733 else
29734 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
29735 }
29736 else
29737 target = subtarget;
29738
29739 return target;
29740
29741 case IX86_BUILTIN_XABORT:
29742 icode = CODE_FOR_xabort;
29743 arg0 = CALL_EXPR_ARG (exp, 0);
29744 op0 = expand_normal (arg0);
29745 mode0 = insn_data[icode].operand[0].mode;
29746 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29747 {
29748 error ("the xabort's argument must be an 8-bit immediate");
29749 return const0_rtx;
29750 }
29751 emit_insn (gen_xabort (op0));
29752 return 0;
29753
29754 default:
29755 break;
29756 }
29757
29758 for (i = 0, d = bdesc_special_args;
29759 i < ARRAY_SIZE (bdesc_special_args);
29760 i++, d++)
29761 if (d->code == fcode)
29762 return ix86_expand_special_args_builtin (d, exp, target);
29763
29764 for (i = 0, d = bdesc_args;
29765 i < ARRAY_SIZE (bdesc_args);
29766 i++, d++)
29767 if (d->code == fcode)
29768 switch (fcode)
29769 {
29770 case IX86_BUILTIN_FABSQ:
29771 case IX86_BUILTIN_COPYSIGNQ:
29772 if (!TARGET_SSE2)
29773 /* Emit a normal call if SSE2 isn't available. */
29774 return expand_call (exp, target, ignore);
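/* FALLTHRU */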
29775 default:
29776 return ix86_expand_args_builtin (d, exp, target);
29777 }
29778
29779 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29780 if (d->code == fcode)
29781 return ix86_expand_sse_comi (d, exp, target);
29782
29783 for (i = 0, d = bdesc_pcmpestr;
29784 i < ARRAY_SIZE (bdesc_pcmpestr);
29785 i++, d++)
29786 if (d->code == fcode)
29787 return ix86_expand_sse_pcmpestr (d, exp, target);
29788
29789 for (i = 0, d = bdesc_pcmpistr;
29790 i < ARRAY_SIZE (bdesc_pcmpistr);
29791 i++, d++)
29792 if (d->code == fcode)
29793 return ix86_expand_sse_pcmpistr (d, exp, target);
29794
29795 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29796 if (d->code == fcode)
29797 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
29798 (enum ix86_builtin_func_type)
29799 d->flag, d->comparison);
29800
29801 gcc_unreachable ();
29802 }
29803
29804 /* Returns a function decl for a vectorized version of the builtin function
29805 FNDECL, taking vectors of type TYPE_IN and producing vectors of type
29806 TYPE_OUT, or NULL_TREE if a vectorized variant is not available. */
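/* For example (an illustration of the mapping implemented below): when
the vectorizer asks for a vectorized sqrt () on doubles, i.e.
fn = BUILT_IN_SQRT with type_out = type_in = a 2 x double vector type,
this hook returns the decl of IX86_BUILTIN_SQRTPD, so the scalar call
is replaced by the sqrtpd builtin.  */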
29807
29808 static tree
29809 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
29810 tree type_in)
29811 {
29812 enum machine_mode in_mode, out_mode;
29813 int in_n, out_n;
29814 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
29815
29816 if (TREE_CODE (type_out) != VECTOR_TYPE
29817 || TREE_CODE (type_in) != VECTOR_TYPE
29818 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
29819 return NULL_TREE;
29820
29821 out_mode = TYPE_MODE (TREE_TYPE (type_out));
29822 out_n = TYPE_VECTOR_SUBPARTS (type_out);
29823 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29824 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29825
29826 switch (fn)
29827 {
29828 case BUILT_IN_SQRT:
29829 if (out_mode == DFmode && in_mode == DFmode)
29830 {
29831 if (out_n == 2 && in_n == 2)
29832 return ix86_builtins[IX86_BUILTIN_SQRTPD];
29833 else if (out_n == 4 && in_n == 4)
29834 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
29835 }
29836 break;
29837
29838 case BUILT_IN_SQRTF:
29839 if (out_mode == SFmode && in_mode == SFmode)
29840 {
29841 if (out_n == 4 && in_n == 4)
29842 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
29843 else if (out_n == 8 && in_n == 8)
29844 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
29845 }
29846 break;
29847
29848 case BUILT_IN_IFLOOR:
29849 case BUILT_IN_LFLOOR:
29850 case BUILT_IN_LLFLOOR:
29851 /* The round insn does not trap on denormals. */
29852 if (flag_trapping_math || !TARGET_ROUND)
29853 break;
29854
29855 if (out_mode == SImode && in_mode == DFmode)
29856 {
29857 if (out_n == 4 && in_n == 2)
29858 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
29859 else if (out_n == 8 && in_n == 4)
29860 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
29861 }
29862 break;
29863
29864 case BUILT_IN_IFLOORF:
29865 case BUILT_IN_LFLOORF:
29866 case BUILT_IN_LLFLOORF:
29867 /* The round insn does not trap on denormals. */
29868 if (flag_trapping_math || !TARGET_ROUND)
29869 break;
29870
29871 if (out_mode == SImode && in_mode == SFmode)
29872 {
29873 if (out_n == 4 && in_n == 4)
29874 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
29875 else if (out_n == 8 && in_n == 8)
29876 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
29877 }
29878 break;
29879
29880 case BUILT_IN_ICEIL:
29881 case BUILT_IN_LCEIL:
29882 case BUILT_IN_LLCEIL:
29883 /* The round insn does not trap on denormals. */
29884 if (flag_trapping_math || !TARGET_ROUND)
29885 break;
29886
29887 if (out_mode == SImode && in_mode == DFmode)
29888 {
29889 if (out_n == 4 && in_n == 2)
29890 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
29891 else if (out_n == 8 && in_n == 4)
29892 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
29893 }
29894 break;
29895
29896 case BUILT_IN_ICEILF:
29897 case BUILT_IN_LCEILF:
29898 case BUILT_IN_LLCEILF:
29899 /* The round insn does not trap on denormals. */
29900 if (flag_trapping_math || !TARGET_ROUND)
29901 break;
29902
29903 if (out_mode == SImode && in_mode == SFmode)
29904 {
29905 if (out_n == 4 && in_n == 4)
29906 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
29907 else if (out_n == 8 && in_n == 8)
29908 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
29909 }
29910 break;
29911
29912 case BUILT_IN_IRINT:
29913 case BUILT_IN_LRINT:
29914 case BUILT_IN_LLRINT:
29915 if (out_mode == SImode && in_mode == DFmode)
29916 {
29917 if (out_n == 4 && in_n == 2)
29918 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
29919 else if (out_n == 8 && in_n == 4)
29920 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
29921 }
29922 break;
29923
29924 case BUILT_IN_IRINTF:
29925 case BUILT_IN_LRINTF:
29926 case BUILT_IN_LLRINTF:
29927 if (out_mode == SImode && in_mode == SFmode)
29928 {
29929 if (out_n == 4 && in_n == 4)
29930 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
29931 else if (out_n == 8 && in_n == 8)
29932 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
29933 }
29934 break;
29935
29936 case BUILT_IN_IROUND:
29937 case BUILT_IN_LROUND:
29938 case BUILT_IN_LLROUND:
29939 /* The round insn does not trap on denormals. */
29940 if (flag_trapping_math || !TARGET_ROUND)
29941 break;
29942
29943 if (out_mode == SImode && in_mode == DFmode)
29944 {
29945 if (out_n == 4 && in_n == 2)
29946 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
29947 else if (out_n == 8 && in_n == 4)
29948 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
29949 }
29950 break;
29951
29952 case BUILT_IN_IROUNDF:
29953 case BUILT_IN_LROUNDF:
29954 case BUILT_IN_LLROUNDF:
29955 /* The round insn does not trap on denormals. */
29956 if (flag_trapping_math || !TARGET_ROUND)
29957 break;
29958
29959 if (out_mode == SImode && in_mode == SFmode)
29960 {
29961 if (out_n == 4 && in_n == 4)
29962 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
29963 else if (out_n == 8 && in_n == 8)
29964 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
29965 }
29966 break;
29967
29968 case BUILT_IN_COPYSIGN:
29969 if (out_mode == DFmode && in_mode == DFmode)
29970 {
29971 if (out_n == 2 && in_n == 2)
29972 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
29973 else if (out_n == 4 && in_n == 4)
29974 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
29975 }
29976 break;
29977
29978 case BUILT_IN_COPYSIGNF:
29979 if (out_mode == SFmode && in_mode == SFmode)
29980 {
29981 if (out_n == 4 && in_n == 4)
29982 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
29983 else if (out_n == 8 && in_n == 8)
29984 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
29985 }
29986 break;
29987
29988 case BUILT_IN_FLOOR:
29989 /* The round insn does not trap on denormals. */
29990 if (flag_trapping_math || !TARGET_ROUND)
29991 break;
29992
29993 if (out_mode == DFmode && in_mode == DFmode)
29994 {
29995 if (out_n == 2 && in_n == 2)
29996 return ix86_builtins[IX86_BUILTIN_FLOORPD];
29997 else if (out_n == 4 && in_n == 4)
29998 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
29999 }
30000 break;
30001
30002 case BUILT_IN_FLOORF:
30003 /* The round insn does not trap on denormals. */
30004 if (flag_trapping_math || !TARGET_ROUND)
30005 break;
30006
30007 if (out_mode == SFmode && in_mode == SFmode)
30008 {
30009 if (out_n == 4 && in_n == 4)
30010 return ix86_builtins[IX86_BUILTIN_FLOORPS];
30011 else if (out_n == 8 && in_n == 8)
30012 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
30013 }
30014 break;
30015
30016 case BUILT_IN_CEIL:
30017 /* The round insn does not trap on denormals. */
30018 if (flag_trapping_math || !TARGET_ROUND)
30019 break;
30020
30021 if (out_mode == DFmode && in_mode == DFmode)
30022 {
30023 if (out_n == 2 && in_n == 2)
30024 return ix86_builtins[IX86_BUILTIN_CEILPD];
30025 else if (out_n == 4 && in_n == 4)
30026 return ix86_builtins[IX86_BUILTIN_CEILPD256];
30027 }
30028 break;
30029
30030 case BUILT_IN_CEILF:
30031 /* The round insn does not trap on denormals. */
30032 if (flag_trapping_math || !TARGET_ROUND)
30033 break;
30034
30035 if (out_mode == SFmode && in_mode == SFmode)
30036 {
30037 if (out_n == 4 && in_n == 4)
30038 return ix86_builtins[IX86_BUILTIN_CEILPS];
30039 else if (out_n == 8 && in_n == 8)
30040 return ix86_builtins[IX86_BUILTIN_CEILPS256];
30041 }
30042 break;
30043
30044 case BUILT_IN_TRUNC:
30045 /* The round insn does not trap on denormals. */
30046 if (flag_trapping_math || !TARGET_ROUND)
30047 break;
30048
30049 if (out_mode == DFmode && in_mode == DFmode)
30050 {
30051 if (out_n == 2 && in_n == 2)
30052 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
30053 else if (out_n == 4 && in_n == 4)
30054 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
30055 }
30056 break;
30057
30058 case BUILT_IN_TRUNCF:
30059 /* The round insn does not trap on denormals. */
30060 if (flag_trapping_math || !TARGET_ROUND)
30061 break;
30062
30063 if (out_mode == SFmode && in_mode == SFmode)
30064 {
30065 if (out_n == 4 && in_n == 4)
30066 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
30067 else if (out_n == 8 && in_n == 8)
30068 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
30069 }
30070 break;
30071
30072 case BUILT_IN_RINT:
30073 /* The round insn does not trap on denormals. */
30074 if (flag_trapping_math || !TARGET_ROUND)
30075 break;
30076
30077 if (out_mode == DFmode && in_mode == DFmode)
30078 {
30079 if (out_n == 2 && in_n == 2)
30080 return ix86_builtins[IX86_BUILTIN_RINTPD];
30081 else if (out_n == 4 && in_n == 4)
30082 return ix86_builtins[IX86_BUILTIN_RINTPD256];
30083 }
30084 break;
30085
30086 case BUILT_IN_RINTF:
30087 /* The round insn does not trap on denormals. */
30088 if (flag_trapping_math || !TARGET_ROUND)
30089 break;
30090
30091 if (out_mode == SFmode && in_mode == SFmode)
30092 {
30093 if (out_n == 4 && in_n == 4)
30094 return ix86_builtins[IX86_BUILTIN_RINTPS];
30095 else if (out_n == 8 && in_n == 8)
30096 return ix86_builtins[IX86_BUILTIN_RINTPS256];
30097 }
30098 break;
30099
30100 case BUILT_IN_ROUND:
30101 /* The round insn does not trap on denormals. */
30102 if (flag_trapping_math || !TARGET_ROUND)
30103 break;
30104
30105 if (out_mode == DFmode && in_mode == DFmode)
30106 {
30107 if (out_n == 2 && in_n == 2)
30108 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
30109 else if (out_n == 4 && in_n == 4)
30110 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
30111 }
30112 break;
30113
30114 case BUILT_IN_ROUNDF:
30115 /* The round insn does not trap on denormals. */
30116 if (flag_trapping_math || !TARGET_ROUND)
30117 break;
30118
30119 if (out_mode == SFmode && in_mode == SFmode)
30120 {
30121 if (out_n == 4 && in_n == 4)
30122 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
30123 else if (out_n == 8 && in_n == 8)
30124 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
30125 }
30126 break;
30127
30128 case BUILT_IN_FMA:
30129 if (out_mode == DFmode && in_mode == DFmode)
30130 {
30131 if (out_n == 2 && in_n == 2)
30132 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
30133 if (out_n == 4 && in_n == 4)
30134 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
30135 }
30136 break;
30137
30138 case BUILT_IN_FMAF:
30139 if (out_mode == SFmode && in_mode == SFmode)
30140 {
30141 if (out_n == 4 && in_n == 4)
30142 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
30143 if (out_n == 8 && in_n == 8)
30144 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
30145 }
30146 break;
30147
30148 default:
30149 break;
30150 }
30151
30152 /* Dispatch to a handler for a vectorization library. */
30153 if (ix86_veclib_handler)
30154 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
30155 type_in);
30156
30157 return NULL_TREE;
30158 }
30159
30160 /* Handler for an SVML-style interface to
30161 a library with vectorized intrinsics. */
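/* An illustration of the naming scheme implemented below, assuming the
usual SVML entry points: a V4SF sinf () call is mapped to "vmlsSin4"
and a V2DF sin () call to "vmldSin2", while logf () and log () use the
special names "vmlsLn4" and "vmldLn2".  */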
30162
30163 static tree
30164 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
30165 {
30166 char name[20];
30167 tree fntype, new_fndecl, args;
30168 unsigned arity;
30169 const char *bname;
30170 enum machine_mode el_mode, in_mode;
30171 int n, in_n;
30172
30173 /* SVML is suitable for unsafe math only. */
30174 if (!flag_unsafe_math_optimizations)
30175 return NULL_TREE;
30176
30177 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30178 n = TYPE_VECTOR_SUBPARTS (type_out);
30179 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30180 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30181 if (el_mode != in_mode
30182 || n != in_n)
30183 return NULL_TREE;
30184
30185 switch (fn)
30186 {
30187 case BUILT_IN_EXP:
30188 case BUILT_IN_LOG:
30189 case BUILT_IN_LOG10:
30190 case BUILT_IN_POW:
30191 case BUILT_IN_TANH:
30192 case BUILT_IN_TAN:
30193 case BUILT_IN_ATAN:
30194 case BUILT_IN_ATAN2:
30195 case BUILT_IN_ATANH:
30196 case BUILT_IN_CBRT:
30197 case BUILT_IN_SINH:
30198 case BUILT_IN_SIN:
30199 case BUILT_IN_ASINH:
30200 case BUILT_IN_ASIN:
30201 case BUILT_IN_COSH:
30202 case BUILT_IN_COS:
30203 case BUILT_IN_ACOSH:
30204 case BUILT_IN_ACOS:
30205 if (el_mode != DFmode || n != 2)
30206 return NULL_TREE;
30207 break;
30208
30209 case BUILT_IN_EXPF:
30210 case BUILT_IN_LOGF:
30211 case BUILT_IN_LOG10F:
30212 case BUILT_IN_POWF:
30213 case BUILT_IN_TANHF:
30214 case BUILT_IN_TANF:
30215 case BUILT_IN_ATANF:
30216 case BUILT_IN_ATAN2F:
30217 case BUILT_IN_ATANHF:
30218 case BUILT_IN_CBRTF:
30219 case BUILT_IN_SINHF:
30220 case BUILT_IN_SINF:
30221 case BUILT_IN_ASINHF:
30222 case BUILT_IN_ASINF:
30223 case BUILT_IN_COSHF:
30224 case BUILT_IN_COSF:
30225 case BUILT_IN_ACOSHF:
30226 case BUILT_IN_ACOSF:
30227 if (el_mode != SFmode || n != 4)
30228 return NULL_TREE;
30229 break;
30230
30231 default:
30232 return NULL_TREE;
30233 }
30234
30235 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30236
30237 if (fn == BUILT_IN_LOGF)
30238 strcpy (name, "vmlsLn4");
30239 else if (fn == BUILT_IN_LOG)
30240 strcpy (name, "vmldLn2");
30241 else if (n == 4)
30242 {
30243 sprintf (name, "vmls%s", bname+10);
30244 name[strlen (name)-1] = '4';
30245 }
30246 else
30247 sprintf (name, "vmld%s2", bname+10);
30248
30249 /* Convert to uppercase. */
30250 name[4] &= ~0x20;
30251
30252 arity = 0;
30253 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30254 args;
30255 args = TREE_CHAIN (args))
30256 arity++;
30257
30258 if (arity == 1)
30259 fntype = build_function_type_list (type_out, type_in, NULL);
30260 else
30261 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30262
30263 /* Build a function declaration for the vectorized function. */
30264 new_fndecl = build_decl (BUILTINS_LOCATION,
30265 FUNCTION_DECL, get_identifier (name), fntype);
30266 TREE_PUBLIC (new_fndecl) = 1;
30267 DECL_EXTERNAL (new_fndecl) = 1;
30268 DECL_IS_NOVOPS (new_fndecl) = 1;
30269 TREE_READONLY (new_fndecl) = 1;
30270
30271 return new_fndecl;
30272 }
30273
30274 /* Handler for an ACML-style interface to
30275 a library with vectorized intrinsics. */
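/* An illustration of the naming scheme implemented below, assuming the
usual ACML_MV entry points: a V2DF sin () call is mapped to
"__vrd2_sin" and a V4SF sinf () call to "__vrs4_sinf".  */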
30276
30277 static tree
30278 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
30279 {
30280 char name[20] = "__vr.._";
30281 tree fntype, new_fndecl, args;
30282 unsigned arity;
30283 const char *bname;
30284 enum machine_mode el_mode, in_mode;
30285 int n, in_n;
30286
30287 /* ACML is 64-bit only and suitable for unsafe math only, as
30288 it does not correctly support parts of IEEE semantics with the
30289 required precision, such as denormals. */
30290 if (!TARGET_64BIT
30291 || !flag_unsafe_math_optimizations)
30292 return NULL_TREE;
30293
30294 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30295 n = TYPE_VECTOR_SUBPARTS (type_out);
30296 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30297 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30298 if (el_mode != in_mode
30299 || n != in_n)
30300 return NULL_TREE;
30301
30302 switch (fn)
30303 {
30304 case BUILT_IN_SIN:
30305 case BUILT_IN_COS:
30306 case BUILT_IN_EXP:
30307 case BUILT_IN_LOG:
30308 case BUILT_IN_LOG2:
30309 case BUILT_IN_LOG10:
30310 name[4] = 'd';
30311 name[5] = '2';
30312 if (el_mode != DFmode
30313 || n != 2)
30314 return NULL_TREE;
30315 break;
30316
30317 case BUILT_IN_SINF:
30318 case BUILT_IN_COSF:
30319 case BUILT_IN_EXPF:
30320 case BUILT_IN_POWF:
30321 case BUILT_IN_LOGF:
30322 case BUILT_IN_LOG2F:
30323 case BUILT_IN_LOG10F:
30324 name[4] = 's';
30325 name[5] = '4';
30326 if (el_mode != SFmode
30327 || n != 4)
30328 return NULL_TREE;
30329 break;
30330
30331 default:
30332 return NULL_TREE;
30333 }
30334
30335 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30336 sprintf (name + 7, "%s", bname+10);
30337
30338 arity = 0;
30339 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30340 args;
30341 args = TREE_CHAIN (args))
30342 arity++;
30343
30344 if (arity == 1)
30345 fntype = build_function_type_list (type_out, type_in, NULL);
30346 else
30347 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30348
30349 /* Build a function declaration for the vectorized function. */
30350 new_fndecl = build_decl (BUILTINS_LOCATION,
30351 FUNCTION_DECL, get_identifier (name), fntype);
30352 TREE_PUBLIC (new_fndecl) = 1;
30353 DECL_EXTERNAL (new_fndecl) = 1;
30354 DECL_IS_NOVOPS (new_fndecl) = 1;
30355 TREE_READONLY (new_fndecl) = 1;
30356
30357 return new_fndecl;
30358 }
30359
30360 /* Returns the decl of a function that implements a gather load with
30361 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
30362 Returns NULL_TREE if it is not available. */
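/* For example (an illustration of the mapping below): a gather of V2DF
elements indexed by SImode values with a scale of 1, 2, 4 or 8 returns
the decl of IX86_BUILTIN_GATHERSIV2DF, which ix86_expand_builtin then
expands to the corresponding vgatherdpd pattern.  */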
30363
30364 static tree
30365 ix86_vectorize_builtin_gather (const_tree mem_vectype,
30366 const_tree index_type, int scale)
30367 {
30368 bool si;
30369 enum ix86_builtins code;
30370
30371 if (! TARGET_AVX2)
30372 return NULL_TREE;
30373
30374 if ((TREE_CODE (index_type) != INTEGER_TYPE
30375 && !POINTER_TYPE_P (index_type))
30376 || (TYPE_MODE (index_type) != SImode
30377 && TYPE_MODE (index_type) != DImode))
30378 return NULL_TREE;
30379
30380 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
30381 return NULL_TREE;
30382
30383 /* v*gather* insn sign extends index to pointer mode. */
30384 if (TYPE_PRECISION (index_type) < POINTER_SIZE
30385 && TYPE_UNSIGNED (index_type))
30386 return NULL_TREE;
30387
30388 if (scale <= 0
30389 || scale > 8
30390 || (scale & (scale - 1)) != 0)
30391 return NULL_TREE;
30392
30393 si = TYPE_MODE (index_type) == SImode;
30394 switch (TYPE_MODE (mem_vectype))
30395 {
30396 case V2DFmode:
30397 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
30398 break;
30399 case V4DFmode:
30400 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
30401 break;
30402 case V2DImode:
30403 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
30404 break;
30405 case V4DImode:
30406 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
30407 break;
30408 case V4SFmode:
30409 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
30410 break;
30411 case V8SFmode:
30412 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
30413 break;
30414 case V4SImode:
30415 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
30416 break;
30417 case V8SImode:
30418 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
30419 break;
30420 default:
30421 return NULL_TREE;
30422 }
30423
30424 return ix86_builtins[code];
30425 }
30426
30427 /* Returns the decl of a target-specific builtin that implements the
30428 reciprocal of the function FN, or NULL_TREE if not available. */
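/* For example (illustrative): with -funsafe-math-optimizations,
-ffinite-math-only and -fno-trapping-math in effect, a scalar sqrtf ()
feeding a division can be replaced via IX86_BUILTIN_RSQRTF, i.e. an
rsqrtss based approximation, and IX86_BUILTIN_SQRTPS_NR maps to its
RSQRTPS_NR counterpart in the vectorized case.  */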
30429
30430 static tree
30431 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
30432 bool sqrt ATTRIBUTE_UNUSED)
30433 {
30434 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
30435 && flag_finite_math_only && !flag_trapping_math
30436 && flag_unsafe_math_optimizations))
30437 return NULL_TREE;
30438
30439 if (md_fn)
30440 /* Machine dependent builtins. */
30441 switch (fn)
30442 {
30443 /* Vectorized version of sqrt to rsqrt conversion. */
30444 case IX86_BUILTIN_SQRTPS_NR:
30445 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
30446
30447 case IX86_BUILTIN_SQRTPS_NR256:
30448 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
30449
30450 default:
30451 return NULL_TREE;
30452 }
30453 else
30454 /* Normal builtins. */
30455 switch (fn)
30456 {
30457 /* Sqrt to rsqrt conversion. */
30458 case BUILT_IN_SQRTF:
30459 return ix86_builtins[IX86_BUILTIN_RSQRTF];
30460
30461 default:
30462 return NULL_TREE;
30463 }
30464 }
30465 \f
30466 /* Helper for avx_vpermilps256_operand et al. This is also used by
30467 the expansion functions to turn the parallel back into a mask.
30468 The return value is 0 for no match and the imm8+1 for a match. */
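/* A worked example of the encoding below: in V4SFmode the permutation
(parallel [3 2 1 0]) produces mask = 3 | 2<<2 | 1<<4 | 0<<6 = 0x1b,
the usual "reverse elements" imm8, and the function returns 0x1c.  */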
30469
30470 int
30471 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
30472 {
30473 unsigned i, nelt = GET_MODE_NUNITS (mode);
30474 unsigned mask = 0;
30475 unsigned char ipar[8];
30476
30477 if (XVECLEN (par, 0) != (int) nelt)
30478 return 0;
30479
30480 /* Validate that all of the elements are constants, and not totally
30481 out of range. Copy the data into an integral array to make the
30482 subsequent checks easier. */
30483 for (i = 0; i < nelt; ++i)
30484 {
30485 rtx er = XVECEXP (par, 0, i);
30486 unsigned HOST_WIDE_INT ei;
30487
30488 if (!CONST_INT_P (er))
30489 return 0;
30490 ei = INTVAL (er);
30491 if (ei >= nelt)
30492 return 0;
30493 ipar[i] = ei;
30494 }
30495
30496 switch (mode)
30497 {
30498 case V4DFmode:
30499 /* In the 256-bit DFmode case, we can only move elements within
30500 a 128-bit lane. */
30501 for (i = 0; i < 2; ++i)
30502 {
30503 if (ipar[i] >= 2)
30504 return 0;
30505 mask |= ipar[i] << i;
30506 }
30507 for (i = 2; i < 4; ++i)
30508 {
30509 if (ipar[i] < 2)
30510 return 0;
30511 mask |= (ipar[i] - 2) << i;
30512 }
30513 break;
30514
30515 case V8SFmode:
30516 /* In the 256-bit SFmode case, we have full freedom of movement
30517 within the low 128-bit lane, but the high 128-bit lane must
30518 mirror the exact same pattern. */
30519 for (i = 0; i < 4; ++i)
30520 if (ipar[i] + 4 != ipar[i + 4])
30521 return 0;
30522 nelt = 4;
30523 /* FALLTHRU */
30524
30525 case V2DFmode:
30526 case V4SFmode:
30527 /* In the 128-bit case, we have full freedom in the placement of
30528 the elements from the source operand. */
30529 for (i = 0; i < nelt; ++i)
30530 mask |= ipar[i] << (i * (nelt / 2));
30531 break;
30532
30533 default:
30534 gcc_unreachable ();
30535 }
30536
30537 /* Make sure success has a non-zero value by adding one. */
30538 return mask + 1;
30539 }
30540
30541 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
30542 the expansion functions to turn the parallel back into a mask.
30543 The return value is 0 for no match and the imm8+1 for a match. */
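/* A worked example of the encoding below: in V8SFmode a parallel that
selects elements 4..7 (the high half of the first input) followed by
elements 8..11 (the low half of the second input) reconstructs
imm8 = 0x21, and the function returns 0x22.  */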
30544
30545 int
30546 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
30547 {
30548 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
30549 unsigned mask = 0;
30550 unsigned char ipar[8];
30551
30552 if (XVECLEN (par, 0) != (int) nelt)
30553 return 0;
30554
30555 /* Validate that all of the elements are constants, and not totally
30556 out of range. Copy the data into an integral array to make the
30557 subsequent checks easier. */
30558 for (i = 0; i < nelt; ++i)
30559 {
30560 rtx er = XVECEXP (par, 0, i);
30561 unsigned HOST_WIDE_INT ei;
30562
30563 if (!CONST_INT_P (er))
30564 return 0;
30565 ei = INTVAL (er);
30566 if (ei >= 2 * nelt)
30567 return 0;
30568 ipar[i] = ei;
30569 }
30570
30571 /* Validate that each half of the permute selects consecutive elements. */
30572 for (i = 0; i < nelt2 - 1; ++i)
30573 if (ipar[i] + 1 != ipar[i + 1])
30574 return 0;
30575 for (i = nelt2; i < nelt - 1; ++i)
30576 if (ipar[i] + 1 != ipar[i + 1])
30577 return 0;
30578
30579 /* Reconstruct the mask. */
30580 for (i = 0; i < 2; ++i)
30581 {
30582 unsigned e = ipar[i * nelt2];
30583 if (e % nelt2)
30584 return 0;
30585 e /= nelt2;
30586 mask |= e << (i * 4);
30587 }
30588
30589 /* Make sure success has a non-zero value by adding one. */
30590 return mask + 1;
30591 }
30592 \f
30593 /* Store OPERAND to memory after reload is completed. This means
30594 that we can't easily use assign_stack_local. */
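/* For instance, spilling a DImode register on a 64-bit target without a
red zone boils down to a push:

(set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI ...))

and the value returned to the caller is (mem:DI (reg:DI sp)).  */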
30595 rtx
30596 ix86_force_to_memory (enum machine_mode mode, rtx operand)
30597 {
30598 rtx result;
30599
30600 gcc_assert (reload_completed);
30601 if (ix86_using_red_zone ())
30602 {
30603 result = gen_rtx_MEM (mode,
30604 gen_rtx_PLUS (Pmode,
30605 stack_pointer_rtx,
30606 GEN_INT (-RED_ZONE_SIZE)));
30607 emit_move_insn (result, operand);
30608 }
30609 else if (TARGET_64BIT)
30610 {
30611 switch (mode)
30612 {
30613 case HImode:
30614 case SImode:
30615 operand = gen_lowpart (DImode, operand);
30616 /* FALLTHRU */
30617 case DImode:
30618 emit_insn (
30619 gen_rtx_SET (VOIDmode,
30620 gen_rtx_MEM (DImode,
30621 gen_rtx_PRE_DEC (DImode,
30622 stack_pointer_rtx)),
30623 operand));
30624 break;
30625 default:
30626 gcc_unreachable ();
30627 }
30628 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30629 }
30630 else
30631 {
30632 switch (mode)
30633 {
30634 case DImode:
30635 {
30636 rtx operands[2];
30637 split_double_mode (mode, &operand, 1, operands, operands + 1);
30638 emit_insn (
30639 gen_rtx_SET (VOIDmode,
30640 gen_rtx_MEM (SImode,
30641 gen_rtx_PRE_DEC (Pmode,
30642 stack_pointer_rtx)),
30643 operands[1]));
30644 emit_insn (
30645 gen_rtx_SET (VOIDmode,
30646 gen_rtx_MEM (SImode,
30647 gen_rtx_PRE_DEC (Pmode,
30648 stack_pointer_rtx)),
30649 operands[0]));
30650 }
30651 break;
30652 case HImode:
30653 /* Store HImodes as SImodes. */
30654 operand = gen_lowpart (SImode, operand);
30655 /* FALLTHRU */
30656 case SImode:
30657 emit_insn (
30658 gen_rtx_SET (VOIDmode,
30659 gen_rtx_MEM (GET_MODE (operand),
30660 gen_rtx_PRE_DEC (SImode,
30661 stack_pointer_rtx)),
30662 operand));
30663 break;
30664 default:
30665 gcc_unreachable ();
30666 }
30667 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30668 }
30669 return result;
30670 }
30671
30672 /* Free the operand from memory. */
30673 void
30674 ix86_free_from_memory (enum machine_mode mode)
30675 {
30676 if (!ix86_using_red_zone ())
30677 {
30678 int size;
30679
30680 if (mode == DImode || TARGET_64BIT)
30681 size = 8;
30682 else
30683 size = 4;
30684 /* Use LEA to deallocate stack space. In peephole2 it will be converted
30685 to a pop or add instruction if registers are available. */
30686 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
30687 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
30688 GEN_INT (size))));
30689 }
30690 }
30691
30692 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
30693
30694 Put float CONST_DOUBLE in the constant pool instead of fp regs.
30695 QImode must go into class Q_REGS.
30696 Narrow ALL_REGS to GENERAL_REGS. This allows movsf and
30697 movdf to do mem-to-mem moves through integer regs. */
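/* For example, a non-constant QImode value headed for GENERAL_REGS is
narrowed to Q_REGS below, since only the a/b/c/d registers have byte
subregisters on 32-bit targets, whereas CONST0_RTX of any mode leaves
the class unchanged because every class can load zeros.  */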
30698
30699 static reg_class_t
30700 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
30701 {
30702 enum machine_mode mode = GET_MODE (x);
30703
30704 /* We're only allowed to return a subclass of CLASS. Many of the
30705 following checks fail for NO_REGS, so eliminate that early. */
30706 if (regclass == NO_REGS)
30707 return NO_REGS;
30708
30709 /* All classes can load zeros. */
30710 if (x == CONST0_RTX (mode))
30711 return regclass;
30712
30713 /* Force constants into memory if we are loading a (nonzero) constant into
30714 an MMX or SSE register. This is because there are no MMX/SSE instructions
30715 to load from a constant. */
30716 if (CONSTANT_P (x)
30717 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
30718 return NO_REGS;
30719
30720 /* Prefer SSE regs only, if we can use them for math. */
30721 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
30722 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
30723
30724 /* Floating-point constants need more complex checks. */
30725 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
30726 {
30727 /* General regs can load everything. */
30728 if (reg_class_subset_p (regclass, GENERAL_REGS))
30729 return regclass;
30730
30731 /* Floats can load 0 and 1 plus some others. Note that we eliminated
30732 zero above. We only want to wind up preferring 80387 registers if
30733 we plan on doing computation with them. */
30734 if (TARGET_80387
30735 && standard_80387_constant_p (x) > 0)
30736 {
30737 /* Limit class to non-sse. */
30738 if (regclass == FLOAT_SSE_REGS)
30739 return FLOAT_REGS;
30740 if (regclass == FP_TOP_SSE_REGS)
30741 return FP_TOP_REG;
30742 if (regclass == FP_SECOND_SSE_REGS)
30743 return FP_SECOND_REG;
30744 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
30745 return regclass;
30746 }
30747
30748 return NO_REGS;
30749 }
30750
30751 /* Generally when we see PLUS here, it's the function invariant
30752 (plus soft-fp const_int), which can only be computed into general
30753 regs. */
30754 if (GET_CODE (x) == PLUS)
30755 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
30756
30757 /* QImode constants are easy to load, but non-constant QImode data
30758 must go into Q_REGS. */
30759 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
30760 {
30761 if (reg_class_subset_p (regclass, Q_REGS))
30762 return regclass;
30763 if (reg_class_subset_p (Q_REGS, regclass))
30764 return Q_REGS;
30765 return NO_REGS;
30766 }
30767
30768 return regclass;
30769 }
30770
30771 /* Discourage putting floating-point values in SSE registers unless
30772 SSE math is being used, and likewise for the 387 registers. */
30773 static reg_class_t
30774 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
30775 {
30776 enum machine_mode mode = GET_MODE (x);
30777
30778 /* Restrict the output reload class to the register bank that we are doing
30779 math on. If we would like not to return a subset of CLASS, reject this
30780 alternative: if reload cannot do this, it will still use its choice. */
30782 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
30783 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
30784
30785 if (X87_FLOAT_MODE_P (mode))
30786 {
30787 if (regclass == FP_TOP_SSE_REGS)
30788 return FP_TOP_REG;
30789 else if (regclass == FP_SECOND_SSE_REGS)
30790 return FP_SECOND_REG;
30791 else
30792 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
30793 }
30794
30795 return regclass;
30796 }
30797
30798 static reg_class_t
30799 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
30800 enum machine_mode mode, secondary_reload_info *sri)
30801 {
30802 /* Double-word spills from general registers to non-offsettable memory
30803 references (zero-extended addresses) require special handling. */
30804 if (TARGET_64BIT
30805 && MEM_P (x)
30806 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
30807 && rclass == GENERAL_REGS
30808 && !offsettable_memref_p (x))
30809 {
30810 sri->icode = (in_p
30811 ? CODE_FOR_reload_noff_load
30812 : CODE_FOR_reload_noff_store);
30813 /* Add the cost of moving address to a temporary. */
30814 sri->extra_cost = 1;
30815
30816 return NO_REGS;
30817 }
30818
30819 /* QImode spills from non-QI registers require
30820 intermediate register on 32bit targets. */
30821 if (!TARGET_64BIT
30822 && !in_p && mode == QImode
30823 && (rclass == GENERAL_REGS
30824 || rclass == LEGACY_REGS
30825 || rclass == INDEX_REGS))
30826 {
30827 int regno;
30828
30829 if (REG_P (x))
30830 regno = REGNO (x);
30831 else
30832 regno = -1;
30833
30834 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
30835 regno = true_regnum (x);
30836
30837 /* Return Q_REGS if the operand is in memory. */
30838 if (regno == -1)
30839 return Q_REGS;
30840 }
30841
30842 /* This condition handles corner case where an expression involving
30843 pointers gets vectorized. We're trying to use the address of a
30844 stack slot as a vector initializer.
30845
30846 (set (reg:V2DI 74 [ vect_cst_.2 ])
30847 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
30848
30849 Eventually frame gets turned into sp+offset like this:
30850
30851 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30852 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30853 (const_int 392 [0x188]))))
30854
30855 That later gets turned into:
30856
30857 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30858 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30859 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
30860
30861 We'll have the following reload recorded:
30862
30863 Reload 0: reload_in (DI) =
30864 (plus:DI (reg/f:DI 7 sp)
30865 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
30866 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30867 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
30868 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
30869 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30870 reload_reg_rtx: (reg:V2DI 22 xmm1)
30871
30872 Which isn't going to work since SSE instructions can't handle scalar
30873 additions. Returning GENERAL_REGS forces the addition into integer
30874 register and reload can handle subsequent reloads without problems. */
30875
30876 if (in_p && GET_CODE (x) == PLUS
30877 && SSE_CLASS_P (rclass)
30878 && SCALAR_INT_MODE_P (mode))
30879 return GENERAL_REGS;
30880
30881 return NO_REGS;
30882 }
30883
30884 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
30885
30886 static bool
30887 ix86_class_likely_spilled_p (reg_class_t rclass)
30888 {
30889 switch (rclass)
30890 {
30891 case AREG:
30892 case DREG:
30893 case CREG:
30894 case BREG:
30895 case AD_REGS:
30896 case SIREG:
30897 case DIREG:
30898 case SSE_FIRST_REG:
30899 case FP_TOP_REG:
30900 case FP_SECOND_REG:
30901 return true;
30902
30903 default:
30904 break;
30905 }
30906
30907 return false;
30908 }
30909
30910 /* If we are copying between general and FP registers, we need a memory
30911 location. The same is true for SSE and MMX registers.
30912
30913 To optimize register_move_cost performance, allow the inline variant.
30914
30915 The function can't work reliably when one of the CLASSES is a class
30916 containing registers from multiple units (SSE, MMX, integer). We avoid this
30917 by never combining those units in a single alternative in the machine
30918 description. Ensure that this constraint holds to avoid unexpected surprises.
30919
30920 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
30921 enforce these sanity checks. */
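/* For example, a DFmode copy between FLOAT_REGS and SSE_REGS always goes
through memory, and an SSE <-> general register copy does too when the
mode is wider than a word, when inter-unit moves are disabled, or when
only SSE1 is available.  */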
30922
30923 static inline bool
30924 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30925 enum machine_mode mode, int strict)
30926 {
30927 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
30928 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
30929 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
30930 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
30931 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
30932 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
30933 {
30934 gcc_assert (!strict);
30935 return true;
30936 }
30937
30938 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
30939 return true;
30940
30941 /* ??? This is a lie. We do have moves between mmx/general, and for
30942 mmx/sse2. But by saying we need secondary memory we discourage the
30943 register allocator from using the mmx registers unless needed. */
30944 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
30945 return true;
30946
30947 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30948 {
30949 /* SSE1 doesn't have any direct moves from other classes. */
30950 if (!TARGET_SSE2)
30951 return true;
30952
30953 /* If the target says that inter-unit moves are more expensive
30954 than moving through memory, then don't generate them. */
30955 if (!TARGET_INTER_UNIT_MOVES)
30956 return true;
30957
30958 /* Between SSE and general, we have moves no larger than word size. */
30959 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
30960 return true;
30961 }
30962
30963 return false;
30964 }
30965
30966 bool
30967 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30968 enum machine_mode mode, int strict)
30969 {
30970 return inline_secondary_memory_needed (class1, class2, mode, strict);
30971 }
30972
30973 /* Implement the TARGET_CLASS_MAX_NREGS hook.
30974
30975 On the 80386, this is the size of MODE in words,
30976 except in the FP regs, where a single reg is always enough. */
30977
30978 static unsigned char
30979 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
30980 {
30981 if (MAYBE_INTEGER_CLASS_P (rclass))
30982 {
30983 if (mode == XFmode)
30984 return (TARGET_64BIT ? 2 : 3);
30985 else if (mode == XCmode)
30986 return (TARGET_64BIT ? 4 : 6);
30987 else
30988 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
30989 }
30990 else
30991 {
30992 if (COMPLEX_MODE_P (mode))
30993 return 2;
30994 else
30995 return 1;
30996 }
30997 }
30998
30999 /* Return true if the registers in CLASS cannot represent the change from
31000 modes FROM to TO. */
31001
31002 bool
31003 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
31004 enum reg_class regclass)
31005 {
31006 if (from == to)
31007 return false;
31008
31009 /* x87 registers can't do subreg at all, as all values are reformatted
31010 to extended precision. */
31011 if (MAYBE_FLOAT_CLASS_P (regclass))
31012 return true;
31013
31014 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
31015 {
31016 /* Vector registers do not support QI or HImode loads. If we don't
31017 disallow a change to these modes, reload will assume it's ok to
31018 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
31019 the vec_dupv4hi pattern. */
31020 if (GET_MODE_SIZE (from) < 4)
31021 return true;
31022
31023 /* Vector registers do not support subreg with nonzero offsets, which
31024 are otherwise valid for integer registers. Since we can't see
31025 whether we have a nonzero offset from here, prohibit all
31026 nonparadoxical subregs changing size. */
31027 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
31028 return true;
31029 }
31030
31031 return false;
31032 }
31033
31034 /* Return the cost of moving data of mode M between a
31035 register and memory. A value of 2 is the default; this cost is
31036 relative to those in `REGISTER_MOVE_COST'.
31037
31038 This function is used extensively by register_move_cost, which is used to
31039 build tables at startup, so make it inline in this case.
31040 When IN is 2, return the maximum of the in and out move costs.
31041
31042 If moving between registers and memory is more expensive than
31043 between two registers, you should define this macro to express the
31044 relative cost.
31045
31046 Also model the increased cost of moving QImode registers in
31047 non-Q_REGS classes.
31048 */
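/* A few worked examples of the table lookups below: an SFmode load into
FLOAT_REGS costs fp_load[0], a 16-byte vector load into SSE_REGS costs
sse_load[2], and a DImode value in integer registers on a 32-bit target
costs two int_load[2] moves, since it is moved as two words.  */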
31049 static inline int
31050 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
31051 int in)
31052 {
31053 int cost;
31054 if (FLOAT_CLASS_P (regclass))
31055 {
31056 int index;
31057 switch (mode)
31058 {
31059 case SFmode:
31060 index = 0;
31061 break;
31062 case DFmode:
31063 index = 1;
31064 break;
31065 case XFmode:
31066 index = 2;
31067 break;
31068 default:
31069 return 100;
31070 }
31071 if (in == 2)
31072 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
31073 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
31074 }
31075 if (SSE_CLASS_P (regclass))
31076 {
31077 int index;
31078 switch (GET_MODE_SIZE (mode))
31079 {
31080 case 4:
31081 index = 0;
31082 break;
31083 case 8:
31084 index = 1;
31085 break;
31086 case 16:
31087 index = 2;
31088 break;
31089 default:
31090 return 100;
31091 }
31092 if (in == 2)
31093 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
31094 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
31095 }
31096 if (MMX_CLASS_P (regclass))
31097 {
31098 int index;
31099 switch (GET_MODE_SIZE (mode))
31100 {
31101 case 4:
31102 index = 0;
31103 break;
31104 case 8:
31105 index = 1;
31106 break;
31107 default:
31108 return 100;
31109 }
31110 if (in == 2)
31111 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
31112 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
31113 }
31114 switch (GET_MODE_SIZE (mode))
31115 {
31116 case 1:
31117 if (Q_CLASS_P (regclass) || TARGET_64BIT)
31118 {
31119 if (!in)
31120 return ix86_cost->int_store[0];
31121 if (TARGET_PARTIAL_REG_DEPENDENCY
31122 && optimize_function_for_speed_p (cfun))
31123 cost = ix86_cost->movzbl_load;
31124 else
31125 cost = ix86_cost->int_load[0];
31126 if (in == 2)
31127 return MAX (cost, ix86_cost->int_store[0]);
31128 return cost;
31129 }
31130 else
31131 {
31132 if (in == 2)
31133 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
31134 if (in)
31135 return ix86_cost->movzbl_load;
31136 else
31137 return ix86_cost->int_store[0] + 4;
31138 }
31139 break;
31140 case 2:
31141 if (in == 2)
31142 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
31143 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
31144 default:
31145 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
31146 if (mode == TFmode)
31147 mode = XFmode;
31148 if (in == 2)
31149 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
31150 else if (in)
31151 cost = ix86_cost->int_load[2];
31152 else
31153 cost = ix86_cost->int_store[2];
31154 return (cost * (((int) GET_MODE_SIZE (mode)
31155 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
31156 }
31157 }
31158
31159 static int
31160 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
31161 bool in)
31162 {
31163 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
31164 }
31165
31166
31167 /* Return the cost of moving data from a register in class CLASS1 to
31168 one in class CLASS2.
31169
31170 It is not required that the cost always equal 2 when FROM is the same as TO;
31171 on some machines it is expensive to move between registers if they are not
31172 general registers. */
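/* For example, a move between two general register classes that needs no
   secondary memory falls through to the final cost of 2, while a move
   between SSE and integer registers that avoids secondary memory is
   charged MAX (8, ix86_cost->mmxsse_to_integer).  */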
31173
31174 static int
31175 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
31176 reg_class_t class2_i)
31177 {
31178 enum reg_class class1 = (enum reg_class) class1_i;
31179 enum reg_class class2 = (enum reg_class) class2_i;
31180
31181 /* In case we require secondary memory, compute cost of the store followed
31182 by load. In order to avoid bad register allocation choices, we need
31183 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
31184
31185 if (inline_secondary_memory_needed (class1, class2, mode, 0))
31186 {
31187 int cost = 1;
31188
31189 cost += inline_memory_move_cost (mode, class1, 2);
31190 cost += inline_memory_move_cost (mode, class2, 2);
31191
31192 /* When copying from a general purpose register we may emit multiple
31193 stores followed by a single load, causing a memory size mismatch stall.
31194 Count this as an arbitrarily high cost of 20. */
31195 if (targetm.class_max_nregs (class1, mode)
31196 > targetm.class_max_nregs (class2, mode))
31197 cost += 20;
31198
31199 /* In the case of FP/MMX moves, the registers actually overlap, and we
31200 have to switch modes in order to treat them differently. */
31201 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31202 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31203 cost += 20;
31204
31205 return cost;
31206 }
31207
31208 /* Moves between SSE/MMX and integer unit are expensive. */
31209 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31210 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31211
31212 /* ??? By keeping the returned value relatively high, we limit the number
31213 of moves between integer and MMX/SSE registers for all targets.
31214 Additionally, a high value avoids a problem with x86_modes_tieable_p(),
31215 where integer modes in MMX/SSE registers are not tieable
31216 because QImode and HImode moves to, from or between MMX/SSE
31217 registers are missing. */
31218 return MAX (8, ix86_cost->mmxsse_to_integer);
31219
31220 if (MAYBE_FLOAT_CLASS_P (class1))
31221 return ix86_cost->fp_move;
31222 if (MAYBE_SSE_CLASS_P (class1))
31223 return ix86_cost->sse_move;
31224 if (MAYBE_MMX_CLASS_P (class1))
31225 return ix86_cost->mmx_move;
31226 return 2;
31227 }
31228
31229 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31230 MODE. */
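/* For example, CCmode values are accepted only in the flags register,
   256-bit modes such as V8SFmode only in the SSE registers, while DFmode
   is accepted in x87, SSE and general registers alike.  */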
31231
31232 bool
31233 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31234 {
31235 /* Flags, and only flags, can hold CCmode values. */
31236 if (CC_REGNO_P (regno))
31237 return GET_MODE_CLASS (mode) == MODE_CC;
31238 if (GET_MODE_CLASS (mode) == MODE_CC
31239 || GET_MODE_CLASS (mode) == MODE_RANDOM
31240 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31241 return false;
31242 if (FP_REGNO_P (regno))
31243 return VALID_FP_MODE_P (mode);
31244 if (SSE_REGNO_P (regno))
31245 {
31246 /* We implement the move patterns for all vector modes into and
31247 out of SSE registers, even when no operation instructions
31248 are available. OImode move is available only when AVX is
31249 enabled. */
31250 return ((TARGET_AVX && mode == OImode)
31251 || VALID_AVX256_REG_MODE (mode)
31252 || VALID_SSE_REG_MODE (mode)
31253 || VALID_SSE2_REG_MODE (mode)
31254 || VALID_MMX_REG_MODE (mode)
31255 || VALID_MMX_REG_MODE_3DNOW (mode));
31256 }
31257 if (MMX_REGNO_P (regno))
31258 {
31259 /* We implement the move patterns for 3DNOW modes even in MMX mode,
31260 so if the register is available at all, then we can move data of
31261 the given mode into or out of it. */
31262 return (VALID_MMX_REG_MODE (mode)
31263 || VALID_MMX_REG_MODE_3DNOW (mode));
31264 }
31265
31266 if (mode == QImode)
31267 {
31268 /* Take care with QImode values - they can be in non-QI regs,
31269 but then they cause partial register stalls. */
31270 if (regno <= BX_REG || TARGET_64BIT)
31271 return true;
31272 if (!TARGET_PARTIAL_REG_STALL)
31273 return true;
31274 return !can_create_pseudo_p ();
31275 }
31276 /* We handle both integer and floats in the general purpose registers. */
31277 else if (VALID_INT_MODE_P (mode))
31278 return true;
31279 else if (VALID_FP_MODE_P (mode))
31280 return true;
31281 else if (VALID_DFP_MODE_P (mode))
31282 return true;
31283 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
31284 on to use that value in smaller contexts, this can easily force a
31285 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
31286 supporting DImode, allow it. */
31287 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
31288 return true;
31289
31290 return false;
31291 }
31292
31293 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
31294 tieable integer mode. */
31295
31296 static bool
31297 ix86_tieable_integer_mode_p (enum machine_mode mode)
31298 {
31299 switch (mode)
31300 {
31301 case HImode:
31302 case SImode:
31303 return true;
31304
31305 case QImode:
31306 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
31307
31308 case DImode:
31309 return TARGET_64BIT;
31310
31311 default:
31312 return false;
31313 }
31314 }
31315
31316 /* Return true if MODE1 is accessible in a register that can hold MODE2
31317 without copying. That is, all register classes that can hold MODE2
31318 can also hold MODE1. */
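/* For example, SImode and HImode tie with each other on all targets,
   DImode ties with them only for TARGET_64BIT, and two 16-byte SSE modes
   such as V4SFmode and V2DImode tie because both fit the SSE registers.  */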
31319
31320 bool
31321 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
31322 {
31323 if (mode1 == mode2)
31324 return true;
31325
31326 if (ix86_tieable_integer_mode_p (mode1)
31327 && ix86_tieable_integer_mode_p (mode2))
31328 return true;
31329
31330 /* MODE2 being XFmode implies fp stack or general regs, which means we
31331 can tie any smaller floating point modes to it. Note that we do not
31332 tie this with TFmode. */
31333 if (mode2 == XFmode)
31334 return mode1 == SFmode || mode1 == DFmode;
31335
31336 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
31337 that we can tie it with SFmode. */
31338 if (mode2 == DFmode)
31339 return mode1 == SFmode;
31340
31341 /* If MODE2 is only appropriate for an SSE register, then tie with
31342 any other mode acceptable to SSE registers. */
31343 if (GET_MODE_SIZE (mode2) == 16
31344 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31345 return (GET_MODE_SIZE (mode1) == 16
31346 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31347
31348 /* If MODE2 is appropriate for an MMX register, then tie
31349 with any other mode acceptable to MMX registers. */
31350 if (GET_MODE_SIZE (mode2) == 8
31351 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
31352 return (GET_MODE_SIZE (mode1) == 8
31353 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
31354
31355 return false;
31356 }
31357
31358 /* Compute a (partial) cost for rtx X. Return true if the complete
31359 cost has been computed, and false if subexpressions should be
31360 scanned. In either case, *TOTAL contains the cost result. */
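/* For example, (ashift x 2) is costed as an lea when cost->lea is no more
   than cost->shift_const, and constant DImode shifts in 32-bit code are
   charged roughly twice the word-sized shift cost.  */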
31361
31362 static bool
31363 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
31364 bool speed)
31365 {
31366 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
31367 enum machine_mode mode = GET_MODE (x);
31368 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
31369
31370 switch (code)
31371 {
31372 case CONST_INT:
31373 case CONST:
31374 case LABEL_REF:
31375 case SYMBOL_REF:
31376 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
31377 *total = 3;
31378 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
31379 *total = 2;
31380 else if (flag_pic && SYMBOLIC_CONST (x)
31381 && (!TARGET_64BIT
31382 || (GET_CODE (x) != LABEL_REF
31383 && (GET_CODE (x) != SYMBOL_REF
31384 || !SYMBOL_REF_LOCAL_P (x)))))
31385 *total = 1;
31386 else
31387 *total = 0;
31388 return true;
31389
31390 case CONST_DOUBLE:
31391 if (mode == VOIDmode)
31392 *total = 0;
31393 else
31394 switch (standard_80387_constant_p (x))
31395 {
31396 case 1: /* 0.0 */
31397 *total = 1;
31398 break;
31399 default: /* Other constants */
31400 *total = 2;
31401 break;
31402 case 0:
31403 case -1:
31404 /* Start with (MEM (SYMBOL_REF)), since that's where
31405 it'll probably end up. Add a penalty for size. */
31406 *total = (COSTS_N_INSNS (1)
31407 + (flag_pic != 0 && !TARGET_64BIT)
31408 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
31409 break;
31410 }
31411 return true;
31412
31413 case ZERO_EXTEND:
31414 /* The zero extension is often completely free on x86_64, so make
31415 it as cheap as possible. */
31416 if (TARGET_64BIT && mode == DImode
31417 && GET_MODE (XEXP (x, 0)) == SImode)
31418 *total = 1;
31419 else if (TARGET_ZERO_EXTEND_WITH_AND)
31420 *total = cost->add;
31421 else
31422 *total = cost->movzx;
31423 return false;
31424
31425 case SIGN_EXTEND:
31426 *total = cost->movsx;
31427 return false;
31428
31429 case ASHIFT:
31430 if (CONST_INT_P (XEXP (x, 1))
31431 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
31432 {
31433 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31434 if (value == 1)
31435 {
31436 *total = cost->add;
31437 return false;
31438 }
31439 if ((value == 2 || value == 3)
31440 && cost->lea <= cost->shift_const)
31441 {
31442 *total = cost->lea;
31443 return false;
31444 }
31445 }
31446 /* FALLTHRU */
31447
31448 case ROTATE:
31449 case ASHIFTRT:
31450 case LSHIFTRT:
31451 case ROTATERT:
31452 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
31453 {
31454 if (CONST_INT_P (XEXP (x, 1)))
31455 {
31456 if (INTVAL (XEXP (x, 1)) > 32)
31457 *total = cost->shift_const + COSTS_N_INSNS (2);
31458 else
31459 *total = cost->shift_const * 2;
31460 }
31461 else
31462 {
31463 if (GET_CODE (XEXP (x, 1)) == AND)
31464 *total = cost->shift_var * 2;
31465 else
31466 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
31467 }
31468 }
31469 else
31470 {
31471 if (CONST_INT_P (XEXP (x, 1)))
31472 *total = cost->shift_const;
31473 else
31474 *total = cost->shift_var;
31475 }
31476 return false;
31477
31478 case FMA:
31479 {
31480 rtx sub;
31481
31482 gcc_assert (FLOAT_MODE_P (mode));
31483 gcc_assert (TARGET_FMA || TARGET_FMA4);
31484
31485 /* ??? SSE scalar/vector cost should be used here. */
31486 /* ??? Bald assumption that fma has the same cost as fmul. */
31487 *total = cost->fmul;
31488 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
31489
31490 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
31491 sub = XEXP (x, 0);
31492 if (GET_CODE (sub) == NEG)
31493 sub = XEXP (sub, 0);
31494 *total += rtx_cost (sub, FMA, 0, speed);
31495
31496 sub = XEXP (x, 2);
31497 if (GET_CODE (sub) == NEG)
31498 sub = XEXP (sub, 0);
31499 *total += rtx_cost (sub, FMA, 2, speed);
31500 return true;
31501 }
31502
31503 case MULT:
31504 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31505 {
31506 /* ??? SSE scalar cost should be used here. */
31507 *total = cost->fmul;
31508 return false;
31509 }
31510 else if (X87_FLOAT_MODE_P (mode))
31511 {
31512 *total = cost->fmul;
31513 return false;
31514 }
31515 else if (FLOAT_MODE_P (mode))
31516 {
31517 /* ??? SSE vector cost should be used here. */
31518 *total = cost->fmul;
31519 return false;
31520 }
31521 else
31522 {
31523 rtx op0 = XEXP (x, 0);
31524 rtx op1 = XEXP (x, 1);
31525 int nbits;
31526 if (CONST_INT_P (XEXP (x, 1)))
31527 {
31528 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31529 for (nbits = 0; value != 0; value &= value - 1)
31530 nbits++;
31531 }
31532 else
31533 /* This is arbitrary. */
31534 nbits = 7;
31535
31536 /* Compute costs correctly for widening multiplication. */
31537 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
31538 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
31539 == GET_MODE_SIZE (mode))
31540 {
31541 int is_mulwiden = 0;
31542 enum machine_mode inner_mode = GET_MODE (op0);
31543
31544 if (GET_CODE (op0) == GET_CODE (op1))
31545 is_mulwiden = 1, op1 = XEXP (op1, 0);
31546 else if (CONST_INT_P (op1))
31547 {
31548 if (GET_CODE (op0) == SIGN_EXTEND)
31549 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
31550 == INTVAL (op1);
31551 else
31552 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
31553 }
31554
31555 if (is_mulwiden)
31556 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
31557 }
31558
31559 *total = (cost->mult_init[MODE_INDEX (mode)]
31560 + nbits * cost->mult_bit
31561 + rtx_cost (op0, outer_code, opno, speed)
31562 + rtx_cost (op1, outer_code, opno, speed));
31563
31564 return true;
31565 }
31566
31567 case DIV:
31568 case UDIV:
31569 case MOD:
31570 case UMOD:
31571 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31572 /* ??? SSE cost should be used here. */
31573 *total = cost->fdiv;
31574 else if (X87_FLOAT_MODE_P (mode))
31575 *total = cost->fdiv;
31576 else if (FLOAT_MODE_P (mode))
31577 /* ??? SSE vector cost should be used here. */
31578 *total = cost->fdiv;
31579 else
31580 *total = cost->divide[MODE_INDEX (mode)];
31581 return false;
31582
31583 case PLUS:
31584 if (GET_MODE_CLASS (mode) == MODE_INT
31585 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
31586 {
31587 if (GET_CODE (XEXP (x, 0)) == PLUS
31588 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
31589 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
31590 && CONSTANT_P (XEXP (x, 1)))
31591 {
31592 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
31593 if (val == 2 || val == 4 || val == 8)
31594 {
31595 *total = cost->lea;
31596 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31597 outer_code, opno, speed);
31598 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
31599 outer_code, opno, speed);
31600 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31601 return true;
31602 }
31603 }
31604 else if (GET_CODE (XEXP (x, 0)) == MULT
31605 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
31606 {
31607 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
31608 if (val == 2 || val == 4 || val == 8)
31609 {
31610 *total = cost->lea;
31611 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31612 outer_code, opno, speed);
31613 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31614 return true;
31615 }
31616 }
31617 else if (GET_CODE (XEXP (x, 0)) == PLUS)
31618 {
31619 *total = cost->lea;
31620 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31621 outer_code, opno, speed);
31622 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31623 outer_code, opno, speed);
31624 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31625 return true;
31626 }
31627 }
31628 /* FALLTHRU */
31629
31630 case MINUS:
31631 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31632 {
31633 /* ??? SSE cost should be used here. */
31634 *total = cost->fadd;
31635 return false;
31636 }
31637 else if (X87_FLOAT_MODE_P (mode))
31638 {
31639 *total = cost->fadd;
31640 return false;
31641 }
31642 else if (FLOAT_MODE_P (mode))
31643 {
31644 /* ??? SSE vector cost should be used here. */
31645 *total = cost->fadd;
31646 return false;
31647 }
31648 /* FALLTHRU */
31649
31650 case AND:
31651 case IOR:
31652 case XOR:
31653 if (!TARGET_64BIT && mode == DImode)
31654 {
31655 *total = (cost->add * 2
31656 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
31657 << (GET_MODE (XEXP (x, 0)) != DImode))
31658 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
31659 << (GET_MODE (XEXP (x, 1)) != DImode)));
31660 return true;
31661 }
31662 /* FALLTHRU */
31663
31664 case NEG:
31665 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31666 {
31667 /* ??? SSE cost should be used here. */
31668 *total = cost->fchs;
31669 return false;
31670 }
31671 else if (X87_FLOAT_MODE_P (mode))
31672 {
31673 *total = cost->fchs;
31674 return false;
31675 }
31676 else if (FLOAT_MODE_P (mode))
31677 {
31678 /* ??? SSE vector cost should be used here. */
31679 *total = cost->fchs;
31680 return false;
31681 }
31682 /* FALLTHRU */
31683
31684 case NOT:
31685 if (!TARGET_64BIT && mode == DImode)
31686 *total = cost->add * 2;
31687 else
31688 *total = cost->add;
31689 return false;
31690
31691 case COMPARE:
31692 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
31693 && XEXP (XEXP (x, 0), 1) == const1_rtx
31694 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
31695 && XEXP (x, 1) == const0_rtx)
31696 {
31697 /* This kind of construct is implemented using test[bwl].
31698 Treat it as if we had an AND. */
31699 *total = (cost->add
31700 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
31701 + rtx_cost (const1_rtx, outer_code, opno, speed));
31702 return true;
31703 }
31704 return false;
31705
31706 case FLOAT_EXTEND:
31707 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
31708 *total = 0;
31709 return false;
31710
31711 case ABS:
31712 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31713 /* ??? SSE cost should be used here. */
31714 *total = cost->fabs;
31715 else if (X87_FLOAT_MODE_P (mode))
31716 *total = cost->fabs;
31717 else if (FLOAT_MODE_P (mode))
31718 /* ??? SSE vector cost should be used here. */
31719 *total = cost->fabs;
31720 return false;
31721
31722 case SQRT:
31723 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31724 /* ??? SSE cost should be used here. */
31725 *total = cost->fsqrt;
31726 else if (X87_FLOAT_MODE_P (mode))
31727 *total = cost->fsqrt;
31728 else if (FLOAT_MODE_P (mode))
31729 /* ??? SSE vector cost should be used here. */
31730 *total = cost->fsqrt;
31731 return false;
31732
31733 case UNSPEC:
31734 if (XINT (x, 1) == UNSPEC_TP)
31735 *total = 0;
31736 return false;
31737
31738 case VEC_SELECT:
31739 case VEC_CONCAT:
31740 case VEC_MERGE:
31741 case VEC_DUPLICATE:
31742 /* ??? Assume all of these vector manipulation patterns are
31743 recognizable, in which case they all pretty much have the
31744 same cost. */
31745 *total = COSTS_N_INSNS (1);
31746 return true;
31747
31748 default:
31749 return false;
31750 }
31751 }
31752
31753 #if TARGET_MACHO
31754
31755 static int current_machopic_label_num;
31756
31757 /* Given a symbol name and its associated stub, write out the
31758 definition of the stub. */
31759
31760 void
31761 machopic_output_stub (FILE *file, const char *symb, const char *stub)
31762 {
31763 unsigned int length;
31764 char *binder_name, *symbol_name, lazy_ptr_name[32];
31765 int label = ++current_machopic_label_num;
31766
31767 /* For 64-bit we shouldn't get here. */
31768 gcc_assert (!TARGET_64BIT);
31769
31770 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
31771 symb = targetm.strip_name_encoding (symb);
31772
31773 length = strlen (stub);
31774 binder_name = XALLOCAVEC (char, length + 32);
31775 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
31776
31777 length = strlen (symb);
31778 symbol_name = XALLOCAVEC (char, length + 32);
31779 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
31780
31781 sprintf (lazy_ptr_name, "L%d$lz", label);
31782
31783 if (MACHOPIC_ATT_STUB)
31784 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
31785 else if (MACHOPIC_PURE)
31786 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
31787 else
31788 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
31789
31790 fprintf (file, "%s:\n", stub);
31791 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31792
31793 if (MACHOPIC_ATT_STUB)
31794 {
31795 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
31796 }
31797 else if (MACHOPIC_PURE)
31798 {
31799 /* PIC stub. */
31800 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31801 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
31802 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
31803 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
31804 label, lazy_ptr_name, label);
31805 fprintf (file, "\tjmp\t*%%ecx\n");
31806 }
31807 else
31808 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
31809
31810 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
31811 it needs no stub-binding-helper. */
31812 if (MACHOPIC_ATT_STUB)
31813 return;
31814
31815 fprintf (file, "%s:\n", binder_name);
31816
31817 if (MACHOPIC_PURE)
31818 {
31819 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
31820 fprintf (file, "\tpushl\t%%ecx\n");
31821 }
31822 else
31823 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
31824
31825 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
31826
31827 /* N.B. Keep the correspondence of these
31828 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
31829 old-pic/new-pic/non-pic stubs; altering this will break
31830 compatibility with existing dylibs. */
31831 if (MACHOPIC_PURE)
31832 {
31833 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31834 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
31835 }
31836 else
31837 /* 16-byte -mdynamic-no-pic stub. */
31838 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
31839
31840 fprintf (file, "%s:\n", lazy_ptr_name);
31841 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31842 fprintf (file, ASM_LONG "%s\n", binder_name);
31843 }
31844 #endif /* TARGET_MACHO */
31845
31846 /* Order the registers for register allocator. */
31847
31848 void
31849 x86_order_regs_for_local_alloc (void)
31850 {
31851 int pos = 0;
31852 int i;
31853
31854 /* First allocate the local general purpose registers. */
31855 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31856 if (GENERAL_REGNO_P (i) && call_used_regs[i])
31857 reg_alloc_order [pos++] = i;
31858
31859 /* Global general purpose registers. */
31860 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31861 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
31862 reg_alloc_order [pos++] = i;
31863
31864 /* x87 registers come first in case we are doing FP math
31865 using them. */
31866 if (!TARGET_SSE_MATH)
31867 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31868 reg_alloc_order [pos++] = i;
31869
31870 /* SSE registers. */
31871 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
31872 reg_alloc_order [pos++] = i;
31873 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
31874 reg_alloc_order [pos++] = i;
31875
31876 /* x87 registers. */
31877 if (TARGET_SSE_MATH)
31878 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31879 reg_alloc_order [pos++] = i;
31880
31881 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
31882 reg_alloc_order [pos++] = i;
31883
31884 /* Initialize the rest of the array, as we do not allocate some registers
31885 at all. */
31886 while (pos < FIRST_PSEUDO_REGISTER)
31887 reg_alloc_order [pos++] = 0;
31888 }
31889
31890 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
31891 in struct attribute_spec handler. */
31892 static tree
31893 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
31894 tree args,
31895 int flags ATTRIBUTE_UNUSED,
31896 bool *no_add_attrs)
31897 {
31898 if (TREE_CODE (*node) != FUNCTION_TYPE
31899 && TREE_CODE (*node) != METHOD_TYPE
31900 && TREE_CODE (*node) != FIELD_DECL
31901 && TREE_CODE (*node) != TYPE_DECL)
31902 {
31903 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31904 name);
31905 *no_add_attrs = true;
31906 return NULL_TREE;
31907 }
31908 if (TARGET_64BIT)
31909 {
31910 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
31911 name);
31912 *no_add_attrs = true;
31913 return NULL_TREE;
31914 }
31915 if (is_attribute_p ("callee_pop_aggregate_return", name))
31916 {
31917 tree cst;
31918
31919 cst = TREE_VALUE (args);
31920 if (TREE_CODE (cst) != INTEGER_CST)
31921 {
31922 warning (OPT_Wattributes,
31923 "%qE attribute requires an integer constant argument",
31924 name);
31925 *no_add_attrs = true;
31926 }
31927 else if (compare_tree_int (cst, 0) != 0
31928 && compare_tree_int (cst, 1) != 0)
31929 {
31930 warning (OPT_Wattributes,
31931 "argument to %qE attribute is neither zero, nor one",
31932 name);
31933 *no_add_attrs = true;
31934 }
31935
31936 return NULL_TREE;
31937 }
31938
31939 return NULL_TREE;
31940 }
31941
31942 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
31943 struct attribute_spec.handler. */
31944 static tree
31945 ix86_handle_abi_attribute (tree *node, tree name,
31946 tree args ATTRIBUTE_UNUSED,
31947 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31948 {
31949 if (TREE_CODE (*node) != FUNCTION_TYPE
31950 && TREE_CODE (*node) != METHOD_TYPE
31951 && TREE_CODE (*node) != FIELD_DECL
31952 && TREE_CODE (*node) != TYPE_DECL)
31953 {
31954 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31955 name);
31956 *no_add_attrs = true;
31957 return NULL_TREE;
31958 }
31959
31960 /* Can combine regparm with all attributes but fastcall. */
31961 if (is_attribute_p ("ms_abi", name))
31962 {
31963 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
31964 {
31965 error ("ms_abi and sysv_abi attributes are not compatible");
31966 }
31967
31968 return NULL_TREE;
31969 }
31970 else if (is_attribute_p ("sysv_abi", name))
31971 {
31972 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
31973 {
31974 error ("ms_abi and sysv_abi attributes are not compatible");
31975 }
31976
31977 return NULL_TREE;
31978 }
31979
31980 return NULL_TREE;
31981 }
31982
31983 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
31984 struct attribute_spec.handler. */
31985 static tree
31986 ix86_handle_struct_attribute (tree *node, tree name,
31987 tree args ATTRIBUTE_UNUSED,
31988 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31989 {
31990 tree *type = NULL;
31991 if (DECL_P (*node))
31992 {
31993 if (TREE_CODE (*node) == TYPE_DECL)
31994 type = &TREE_TYPE (*node);
31995 }
31996 else
31997 type = node;
31998
31999 if (!(type && (TREE_CODE (*type) == RECORD_TYPE
32000 || TREE_CODE (*type) == UNION_TYPE)))
32001 {
32002 warning (OPT_Wattributes, "%qE attribute ignored",
32003 name);
32004 *no_add_attrs = true;
32005 }
32006
32007 else if ((is_attribute_p ("ms_struct", name)
32008 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
32009 || ((is_attribute_p ("gcc_struct", name)
32010 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
32011 {
32012 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
32013 name);
32014 *no_add_attrs = true;
32015 }
32016
32017 return NULL_TREE;
32018 }
32019
32020 static tree
32021 ix86_handle_fndecl_attribute (tree *node, tree name,
32022 tree args ATTRIBUTE_UNUSED,
32023 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32024 {
32025 if (TREE_CODE (*node) != FUNCTION_DECL)
32026 {
32027 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32028 name);
32029 *no_add_attrs = true;
32030 }
32031 return NULL_TREE;
32032 }
32033
32034 static bool
32035 ix86_ms_bitfield_layout_p (const_tree record_type)
32036 {
32037 return ((TARGET_MS_BITFIELD_LAYOUT
32038 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
32039 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
32040 }
32041
32042 /* Returns an expression indicating where the this parameter is
32043 located on entry to the FUNCTION. */
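/* For example, in 32-bit code a fastcall method receives `this' in %ecx
   (or %edx when a hidden aggregate-return pointer already occupies %ecx);
   without register parameters it is found at 4(%esp), or at 8(%esp) when
   the aggregate-return pointer is pushed first.  */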
32044
32045 static rtx
32046 x86_this_parameter (tree function)
32047 {
32048 tree type = TREE_TYPE (function);
32049 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
32050 int nregs;
32051
32052 if (TARGET_64BIT)
32053 {
32054 const int *parm_regs;
32055
32056 if (ix86_function_type_abi (type) == MS_ABI)
32057 parm_regs = x86_64_ms_abi_int_parameter_registers;
32058 else
32059 parm_regs = x86_64_int_parameter_registers;
32060 return gen_rtx_REG (Pmode, parm_regs[aggr]);
32061 }
32062
32063 nregs = ix86_function_regparm (type, function);
32064
32065 if (nregs > 0 && !stdarg_p (type))
32066 {
32067 int regno;
32068 unsigned int ccvt = ix86_get_callcvt (type);
32069
32070 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
32071 regno = aggr ? DX_REG : CX_REG;
32072 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
32073 {
32074 regno = CX_REG;
32075 if (aggr)
32076 return gen_rtx_MEM (SImode,
32077 plus_constant (stack_pointer_rtx, 4));
32078 }
32079 else
32080 {
32081 regno = AX_REG;
32082 if (aggr)
32083 {
32084 regno = DX_REG;
32085 if (nregs == 1)
32086 return gen_rtx_MEM (SImode,
32087 plus_constant (stack_pointer_rtx, 4));
32088 }
32089 }
32090 return gen_rtx_REG (SImode, regno);
32091 }
32092
32093 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
32094 }
32095
32096 /* Determine whether x86_output_mi_thunk can succeed. */
32097
32098 static bool
32099 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
32100 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
32101 HOST_WIDE_INT vcall_offset, const_tree function)
32102 {
32103 /* 64-bit can handle anything. */
32104 if (TARGET_64BIT)
32105 return true;
32106
32107 /* For 32-bit, everything's fine if we have one free register. */
32108 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
32109 return true;
32110
32111 /* Need a free register for vcall_offset. */
32112 if (vcall_offset)
32113 return false;
32114
32115 /* Need a free register for GOT references. */
32116 if (flag_pic && !targetm.binds_local_p (function))
32117 return false;
32118
32119 /* Otherwise ok. */
32120 return true;
32121 }
32122
32123 /* Output the assembler code for a thunk function. THUNK_DECL is the
32124 declaration for the thunk function itself, FUNCTION is the decl for
32125 the target function. DELTA is an immediate constant offset to be
32126 added to THIS. If VCALL_OFFSET is nonzero, the word at
32127 *(*this + vcall_offset) should be added to THIS. */
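/* Roughly, the emitted code computes this += DELTA, then, when VCALL_OFFSET
   is nonzero, this += *(*this + VCALL_OFFSET), and finally tail-calls
   FUNCTION.  */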
32128
32129 static void
32130 x86_output_mi_thunk (FILE *file,
32131 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
32132 HOST_WIDE_INT vcall_offset, tree function)
32133 {
32134 rtx this_param = x86_this_parameter (function);
32135 rtx this_reg, tmp, fnaddr;
32136
32137 emit_note (NOTE_INSN_PROLOGUE_END);
32138
32139 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
32140 pull it in now and let DELTA benefit. */
32141 if (REG_P (this_param))
32142 this_reg = this_param;
32143 else if (vcall_offset)
32144 {
32145 /* Put the this parameter into %eax. */
32146 this_reg = gen_rtx_REG (Pmode, AX_REG);
32147 emit_move_insn (this_reg, this_param);
32148 }
32149 else
32150 this_reg = NULL_RTX;
32151
32152 /* Adjust the this parameter by a fixed constant. */
32153 if (delta)
32154 {
32155 rtx delta_rtx = GEN_INT (delta);
32156 rtx delta_dst = this_reg ? this_reg : this_param;
32157
32158 if (TARGET_64BIT)
32159 {
32160 if (!x86_64_general_operand (delta_rtx, Pmode))
32161 {
32162 tmp = gen_rtx_REG (Pmode, R10_REG);
32163 emit_move_insn (tmp, delta_rtx);
32164 delta_rtx = tmp;
32165 }
32166 }
32167
32168 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
32169 }
32170
32171 /* Adjust the this parameter by a value stored in the vtable. */
32172 if (vcall_offset)
32173 {
32174 rtx vcall_addr, vcall_mem, this_mem;
32175 unsigned int tmp_regno;
32176
32177 if (TARGET_64BIT)
32178 tmp_regno = R10_REG;
32179 else
32180 {
32181 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
32182 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
32183 tmp_regno = AX_REG;
32184 else
32185 tmp_regno = CX_REG;
32186 }
32187 tmp = gen_rtx_REG (Pmode, tmp_regno);
32188
32189 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
32190 if (Pmode != ptr_mode)
32191 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
32192 emit_move_insn (tmp, this_mem);
32193
32194 /* Adjust the this parameter. */
32195 vcall_addr = plus_constant (tmp, vcall_offset);
32196 if (TARGET_64BIT
32197 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
32198 {
32199 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
32200 emit_move_insn (tmp2, GEN_INT (vcall_offset));
32201 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
32202 }
32203
32204 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
32205 if (Pmode != ptr_mode)
32206 emit_insn (gen_addsi_1_zext (this_reg,
32207 gen_rtx_REG (ptr_mode,
32208 REGNO (this_reg)),
32209 vcall_mem));
32210 else
32211 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
32212 }
32213
32214 /* If necessary, drop THIS back to its stack slot. */
32215 if (this_reg && this_reg != this_param)
32216 emit_move_insn (this_param, this_reg);
32217
32218 fnaddr = XEXP (DECL_RTL (function), 0);
32219 if (TARGET_64BIT)
32220 {
32221 if (!flag_pic || targetm.binds_local_p (function)
32222 || cfun->machine->call_abi == MS_ABI)
32223 ;
32224 else
32225 {
32226 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
32227 tmp = gen_rtx_CONST (Pmode, tmp);
32228 fnaddr = gen_rtx_MEM (Pmode, tmp);
32229 }
32230 }
32231 else
32232 {
32233 if (!flag_pic || targetm.binds_local_p (function))
32234 ;
32235 #if TARGET_MACHO
32236 else if (TARGET_MACHO)
32237 {
32238 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
32239 fnaddr = XEXP (fnaddr, 0);
32240 }
32241 #endif /* TARGET_MACHO */
32242 else
32243 {
32244 tmp = gen_rtx_REG (Pmode, CX_REG);
32245 output_set_got (tmp, NULL_RTX);
32246
32247 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
32248 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
32249 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
32250 }
32251 }
32252
32253 /* Our sibling call patterns do not allow memories, because we have no
32254 predicate that can distinguish between frame and non-frame memory.
32255 For our purposes here, we can get away with (ab)using a jump pattern,
32256 because we're going to do no optimization. */
32257 if (MEM_P (fnaddr))
32258 emit_jump_insn (gen_indirect_jump (fnaddr));
32259 else
32260 {
32261 tmp = gen_rtx_MEM (QImode, fnaddr);
32262 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
32263 tmp = emit_call_insn (tmp);
32264 SIBLING_CALL_P (tmp) = 1;
32265 }
32266 emit_barrier ();
32267
32268 /* Emit just enough of rest_of_compilation to get the insns emitted.
32269 Note that use_thunk calls assemble_start_function et al. */
32270 tmp = get_insns ();
32271 insn_locators_alloc ();
32272 shorten_branches (tmp);
32273 final_start_function (tmp, file, 1);
32274 final (tmp, file, 1);
32275 final_end_function ();
32276 }
32277
32278 static void
32279 x86_file_start (void)
32280 {
32281 default_file_start ();
32282 #if TARGET_MACHO
32283 darwin_file_start ();
32284 #endif
32285 if (X86_FILE_START_VERSION_DIRECTIVE)
32286 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
32287 if (X86_FILE_START_FLTUSED)
32288 fputs ("\t.global\t__fltused\n", asm_out_file);
32289 if (ix86_asm_dialect == ASM_INTEL)
32290 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
32291 }
32292
32293 int
32294 x86_field_alignment (tree field, int computed)
32295 {
32296 enum machine_mode mode;
32297 tree type = TREE_TYPE (field);
32298
32299 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
32300 return computed;
32301 mode = TYPE_MODE (strip_array_types (type));
32302 if (mode == DFmode || mode == DCmode
32303 || GET_MODE_CLASS (mode) == MODE_INT
32304 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
32305 return MIN (32, computed);
32306 return computed;
32307 }
32308
32309 /* Output assembler code to FILE to increment profiler label # LABELNO
32310 for profiling a function entry. */
32311 void
32312 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
32313 {
32314 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
32315 : MCOUNT_NAME);
32316
32317 if (TARGET_64BIT)
32318 {
32319 #ifndef NO_PROFILE_COUNTERS
32320 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
32321 #endif
32322
32323 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
32324 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
32325 else
32326 fprintf (file, "\tcall\t%s\n", mcount_name);
32327 }
32328 else if (flag_pic)
32329 {
32330 #ifndef NO_PROFILE_COUNTERS
32331 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
32332 LPREFIX, labelno);
32333 #endif
32334 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
32335 }
32336 else
32337 {
32338 #ifndef NO_PROFILE_COUNTERS
32339 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
32340 LPREFIX, labelno);
32341 #endif
32342 fprintf (file, "\tcall\t%s\n", mcount_name);
32343 }
32344 }
32345
32346 /* We don't have exact information about the insn sizes, but we may assume
32347 quite safely that we are informed about all 1 byte insns and memory
32348 address sizes. This is enough to eliminate unnecessary padding in
32349 99% of cases. */
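/* For example, a direct call to a symbol is counted as exactly 5 bytes,
   and any other instruction whose pattern mentions a symbol is assumed
   to need at least a 4-byte address.  */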
32350
32351 static int
32352 min_insn_size (rtx insn)
32353 {
32354 int l = 0, len;
32355
32356 if (!INSN_P (insn) || !active_insn_p (insn))
32357 return 0;
32358
32359 /* Discard alignments we've emitted and jump table data. */
32360 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
32361 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
32362 return 0;
32363 if (JUMP_TABLE_DATA_P (insn))
32364 return 0;
32365
32366 /* Important case - calls are always 5 bytes.
32367 It is common to have many calls in a row. */
32368 if (CALL_P (insn)
32369 && symbolic_reference_mentioned_p (PATTERN (insn))
32370 && !SIBLING_CALL_P (insn))
32371 return 5;
32372 len = get_attr_length (insn);
32373 if (len <= 1)
32374 return 1;
32375
32376 /* For normal instructions we rely on get_attr_length being exact,
32377 with a few exceptions. */
32378 if (!JUMP_P (insn))
32379 {
32380 enum attr_type type = get_attr_type (insn);
32381
32382 switch (type)
32383 {
32384 case TYPE_MULTI:
32385 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
32386 || asm_noperands (PATTERN (insn)) >= 0)
32387 return 0;
32388 break;
32389 case TYPE_OTHER:
32390 case TYPE_FCMP:
32391 break;
32392 default:
32393 /* Otherwise trust get_attr_length. */
32394 return len;
32395 }
32396
32397 l = get_attr_length_address (insn);
32398 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
32399 l = 4;
32400 }
32401 if (l)
32402 return 1+l;
32403 else
32404 return 2;
32405 }
32406
32407 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32408
32409 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
32410 window. */
32411
32412 static void
32413 ix86_avoid_jump_mispredicts (void)
32414 {
32415 rtx insn, start = get_insns ();
32416 int nbytes = 0, njumps = 0;
32417 int isjump = 0;
32418
32419 /* Look for all minimal intervals of instructions containing 4 jumps.
32420 The intervals are bounded by START and INSN. NBYTES is the total
32421 size of instructions in the interval including INSN and not including
32422 START. When the NBYTES is smaller than 16 bytes, it is possible
32423 that the end of START and INSN ends up in the same 16byte page.
32424
32425 The smallest offset in the page INSN can start is the case where START
32426 ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN).
32427 We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN).
32428 */
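/* For instance, if the interval holding the previous three jumps plus INSN
   adds up to NBYTES == 12 bytes and INSN itself is 2 bytes, a pad of
   15 - 12 + 2 = 5 bytes is emitted before INSN so that all four jumps
   cannot land in one 16 byte window.  */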
32429 for (insn = start; insn; insn = NEXT_INSN (insn))
32430 {
32431 int min_size;
32432
32433 if (LABEL_P (insn))
32434 {
32435 int align = label_to_alignment (insn);
32436 int max_skip = label_to_max_skip (insn);
32437
32438 if (max_skip > 15)
32439 max_skip = 15;
32440 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
32441 already in the current 16 byte page, because otherwise
32442 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
32443 bytes to reach 16 byte boundary. */
32444 if (align <= 0
32445 || (align <= 3 && max_skip != (1 << align) - 1))
32446 max_skip = 0;
32447 if (dump_file)
32448 fprintf (dump_file, "Label %i with max_skip %i\n",
32449 INSN_UID (insn), max_skip);
32450 if (max_skip)
32451 {
32452 while (nbytes + max_skip >= 16)
32453 {
32454 start = NEXT_INSN (start);
32455 if ((JUMP_P (start)
32456 && GET_CODE (PATTERN (start)) != ADDR_VEC
32457 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32458 || CALL_P (start))
32459 njumps--, isjump = 1;
32460 else
32461 isjump = 0;
32462 nbytes -= min_insn_size (start);
32463 }
32464 }
32465 continue;
32466 }
32467
32468 min_size = min_insn_size (insn);
32469 nbytes += min_size;
32470 if (dump_file)
32471 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
32472 INSN_UID (insn), min_size);
32473 if ((JUMP_P (insn)
32474 && GET_CODE (PATTERN (insn)) != ADDR_VEC
32475 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
32476 || CALL_P (insn))
32477 njumps++;
32478 else
32479 continue;
32480
32481 while (njumps > 3)
32482 {
32483 start = NEXT_INSN (start);
32484 if ((JUMP_P (start)
32485 && GET_CODE (PATTERN (start)) != ADDR_VEC
32486 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32487 || CALL_P (start))
32488 njumps--, isjump = 1;
32489 else
32490 isjump = 0;
32491 nbytes -= min_insn_size (start);
32492 }
32493 gcc_assert (njumps >= 0);
32494 if (dump_file)
32495 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
32496 INSN_UID (start), INSN_UID (insn), nbytes);
32497
32498 if (njumps == 3 && isjump && nbytes < 16)
32499 {
32500 int padsize = 15 - nbytes + min_insn_size (insn);
32501
32502 if (dump_file)
32503 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
32504 INSN_UID (insn), padsize);
32505 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
32506 }
32507 }
32508 }
32509 #endif
32510
32511 /* AMD Athlon works faster
32512 when RET is not the destination of a conditional jump or directly
32513 preceded by another jump instruction. We avoid the penalty by inserting
32514 a NOP just before the RET instruction in such cases. */
32515 static void
32516 ix86_pad_returns (void)
32517 {
32518 edge e;
32519 edge_iterator ei;
32520
32521 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32522 {
32523 basic_block bb = e->src;
32524 rtx ret = BB_END (bb);
32525 rtx prev;
32526 bool replace = false;
32527
32528 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
32529 || optimize_bb_for_size_p (bb))
32530 continue;
32531 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
32532 if (active_insn_p (prev) || LABEL_P (prev))
32533 break;
32534 if (prev && LABEL_P (prev))
32535 {
32536 edge e;
32537 edge_iterator ei;
32538
32539 FOR_EACH_EDGE (e, ei, bb->preds)
32540 if (EDGE_FREQUENCY (e) && e->src->index >= 0
32541 && !(e->flags & EDGE_FALLTHRU))
32542 replace = true;
32543 }
32544 if (!replace)
32545 {
32546 prev = prev_active_insn (ret);
32547 if (prev
32548 && ((JUMP_P (prev) && any_condjump_p (prev))
32549 || CALL_P (prev)))
32550 replace = true;
32551 /* Empty functions get a branch mispredict even when
32552 the jump destination is not visible to us. */
32553 if (!prev && !optimize_function_for_size_p (cfun))
32554 replace = true;
32555 }
32556 if (replace)
32557 {
32558 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
32559 delete_insn (ret);
32560 }
32561 }
32562 }
32563
32564 /* Count the minimum number of instructions in BB. Return 4 if the
32565 number of instructions >= 4. */
32566
32567 static int
32568 ix86_count_insn_bb (basic_block bb)
32569 {
32570 rtx insn;
32571 int insn_count = 0;
32572
32573 /* Count number of instructions in this block. Return 4 if the number
32574 of instructions >= 4. */
32575 FOR_BB_INSNS (bb, insn)
32576 {
32577 /* This only happens in exit blocks. */
32578 if (JUMP_P (insn)
32579 && ANY_RETURN_P (PATTERN (insn)))
32580 break;
32581
32582 if (NONDEBUG_INSN_P (insn)
32583 && GET_CODE (PATTERN (insn)) != USE
32584 && GET_CODE (PATTERN (insn)) != CLOBBER)
32585 {
32586 insn_count++;
32587 if (insn_count >= 4)
32588 return insn_count;
32589 }
32590 }
32591
32592 return insn_count;
32593 }
32594
32595
32596 /* Count the minimum number of instructions in the code path ending in BB.
32597 Return 4 if the number of instructions >= 4. */
32598
32599 static int
32600 ix86_count_insn (basic_block bb)
32601 {
32602 edge e;
32603 edge_iterator ei;
32604 int min_prev_count;
32605
32606 /* Only bother counting instructions along paths with no
32607 more than 2 basic blocks between entry and exit. Given
32608 that BB has an edge to exit, determine if a predecessor
32609 of BB has an edge from entry. If so, compute the number
32610 of instructions in the predecessor block. If there
32611 happen to be multiple such blocks, compute the minimum. */
32612 min_prev_count = 4;
32613 FOR_EACH_EDGE (e, ei, bb->preds)
32614 {
32615 edge prev_e;
32616 edge_iterator prev_ei;
32617
32618 if (e->src == ENTRY_BLOCK_PTR)
32619 {
32620 min_prev_count = 0;
32621 break;
32622 }
32623 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
32624 {
32625 if (prev_e->src == ENTRY_BLOCK_PTR)
32626 {
32627 int count = ix86_count_insn_bb (e->src);
32628 if (count < min_prev_count)
32629 min_prev_count = count;
32630 break;
32631 }
32632 }
32633 }
32634
32635 if (min_prev_count < 4)
32636 min_prev_count += ix86_count_insn_bb (bb);
32637
32638 return min_prev_count;
32639 }
32640
32641 /* Pad short function to 4 instructions. */
32642
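/* For example, a path containing only two real instructions gets
   gen_nops (GEN_INT (4)), i.e. four one-byte NOPs counted as two
   instructions, emitted just before the epilogue note.  */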
32643 static void
32644 ix86_pad_short_function (void)
32645 {
32646 edge e;
32647 edge_iterator ei;
32648
32649 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32650 {
32651 rtx ret = BB_END (e->src);
32652 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
32653 {
32654 int insn_count = ix86_count_insn (e->src);
32655
32656 /* Pad short function. */
32657 if (insn_count < 4)
32658 {
32659 rtx insn = ret;
32660
32661 /* Find epilogue. */
32662 while (insn
32663 && (!NOTE_P (insn)
32664 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
32665 insn = PREV_INSN (insn);
32666
32667 if (!insn)
32668 insn = ret;
32669
32670 /* Two NOPs count as one instruction. */
32671 insn_count = 2 * (4 - insn_count);
32672 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
32673 }
32674 }
32675 }
32676 }
32677
32678 /* Implement machine specific optimizations. We implement padding of returns
32679 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
32680 static void
32681 ix86_reorg (void)
32682 {
32683 /* We are freeing block_for_insn in the toplev to keep compatibility
32684 with old MDEP_REORGS that are not CFG based. Recompute it now. */
32685 compute_bb_for_insn ();
32686
32687 /* Run the vzeroupper optimization if needed. */
32688 if (TARGET_VZEROUPPER)
32689 move_or_delete_vzeroupper ();
32690
32691 if (optimize && optimize_function_for_speed_p (cfun))
32692 {
32693 if (TARGET_PAD_SHORT_FUNCTION)
32694 ix86_pad_short_function ();
32695 else if (TARGET_PAD_RETURNS)
32696 ix86_pad_returns ();
32697 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32698 if (TARGET_FOUR_JUMP_LIMIT)
32699 ix86_avoid_jump_mispredicts ();
32700 #endif
32701 }
32702 }
32703
32704 /* Return nonzero when a QImode register that must be represented via a REX
32705 prefix is used. */
32706 bool
32707 x86_extended_QIreg_mentioned_p (rtx insn)
32708 {
32709 int i;
32710 extract_insn_cached (insn);
32711 for (i = 0; i < recog_data.n_operands; i++)
32712 if (REG_P (recog_data.operand[i])
32713 && REGNO (recog_data.operand[i]) > BX_REG)
32714 return true;
32715 return false;
32716 }
32717
32718 /* Return nonzero when P points to register encoded via REX prefix.
32719 Called via for_each_rtx. */
32720 static int
32721 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
32722 {
32723 unsigned int regno;
32724 if (!REG_P (*p))
32725 return 0;
32726 regno = REGNO (*p);
32727 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
32728 }
32729
32730 /* Return true when INSN mentions register that must be encoded using REX
32731 prefix. */
32732 bool
32733 x86_extended_reg_mentioned_p (rtx insn)
32734 {
32735 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
32736 extended_reg_mentioned_1, NULL);
32737 }
32738
32739 /* If profitable, negate (without causing overflow) integer constant
32740 of mode MODE at location LOC. Return true in this case. */
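/* For example, (const_int 128) is negated to -128, and the caller swaps
   add for sub, because -128 fits in a sign-extended 8-bit immediate while
   128 does not.  */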
32741 bool
32742 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
32743 {
32744 HOST_WIDE_INT val;
32745
32746 if (!CONST_INT_P (*loc))
32747 return false;
32748
32749 switch (mode)
32750 {
32751 case DImode:
32752 /* DImode x86_64 constants must fit in 32 bits. */
32753 gcc_assert (x86_64_immediate_operand (*loc, mode));
32754
32755 mode = SImode;
32756 break;
32757
32758 case SImode:
32759 case HImode:
32760 case QImode:
32761 break;
32762
32763 default:
32764 gcc_unreachable ();
32765 }
32766
32767 /* Avoid overflows. */
32768 if (mode_signbit_p (mode, *loc))
32769 return false;
32770
32771 val = INTVAL (*loc);
32772
32773 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
32774 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
32775 if ((val < 0 && val != -128)
32776 || val == 128)
32777 {
32778 *loc = GEN_INT (-val);
32779 return true;
32780 }
32781
32782 return false;
32783 }
32784
32785 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
32786 optabs would emit if we didn't have TFmode patterns. */
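/* Roughly: when the input is non-negative as a signed value it is converted
   directly; otherwise we convert (IN >> 1) | (IN & 1), which keeps the
   discarded bit as a sticky bit, and double the result.  */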
32787
32788 void
32789 x86_emit_floatuns (rtx operands[2])
32790 {
32791 rtx neglab, donelab, i0, i1, f0, in, out;
32792 enum machine_mode mode, inmode;
32793
32794 inmode = GET_MODE (operands[1]);
32795 gcc_assert (inmode == SImode || inmode == DImode);
32796
32797 out = operands[0];
32798 in = force_reg (inmode, operands[1]);
32799 mode = GET_MODE (out);
32800 neglab = gen_label_rtx ();
32801 donelab = gen_label_rtx ();
32802 f0 = gen_reg_rtx (mode);
32803
32804 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
32805
32806 expand_float (out, in, 0);
32807
32808 emit_jump_insn (gen_jump (donelab));
32809 emit_barrier ();
32810
32811 emit_label (neglab);
32812
32813 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
32814 1, OPTAB_DIRECT);
32815 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
32816 1, OPTAB_DIRECT);
32817 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
32818
32819 expand_float (f0, i0, 0);
32820
32821 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
32822
32823 emit_label (donelab);
32824 }
32825 \f
32826 /* AVX2 does support 32-byte integer vector operations,
32827 thus the longest vector we are faced with is V32QImode. */
32828 #define MAX_VECT_LEN 32
32829
32830 struct expand_vec_perm_d
32831 {
32832 rtx target, op0, op1;
32833 unsigned char perm[MAX_VECT_LEN];
32834 enum machine_mode vmode;
32835 unsigned char nelt;
32836 bool testing_p;
32837 };
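/* For example, reversing a V4SImode vector is described by vmode == V4SImode,
   nelt == 4 and perm == { 3, 2, 1, 0 }; indices of nelt and above
   conventionally select elements from op1 rather than op0.  */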
32838
32839 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
32840 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
32841
32842 /* Get a vector mode of the same size as the original but with elements
32843 twice as wide. This is only guaranteed to apply to integral vectors. */
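/* E.g. V8HImode yields V4SImode: the same 16-byte size with half as many
   elements, each twice as wide.  */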
32844
32845 static inline enum machine_mode
32846 get_mode_wider_vector (enum machine_mode o)
32847 {
32848 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
32849 enum machine_mode n = GET_MODE_WIDER_MODE (o);
32850 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
32851 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
32852 return n;
32853 }
32854
32855 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32856 with all elements equal to VAR. Return true if successful. */
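/* E.g. for V4SImode this first tries (set target (vec_duplicate:V4SI val))
   as-is and only forces VAL into a register when recog does not accept
   the insn.  */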
32857
32858 static bool
32859 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
32860 rtx target, rtx val)
32861 {
32862 bool ok;
32863
32864 switch (mode)
32865 {
32866 case V2SImode:
32867 case V2SFmode:
32868 if (!mmx_ok)
32869 return false;
32870 /* FALLTHRU */
32871
32872 case V4DFmode:
32873 case V4DImode:
32874 case V8SFmode:
32875 case V8SImode:
32876 case V2DFmode:
32877 case V2DImode:
32878 case V4SFmode:
32879 case V4SImode:
32880 {
32881 rtx insn, dup;
32882
32883 /* First attempt to recognize VAL as-is. */
32884 dup = gen_rtx_VEC_DUPLICATE (mode, val);
32885 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
32886 if (recog_memoized (insn) < 0)
32887 {
32888 rtx seq;
32889 /* If that fails, force VAL into a register. */
32890
32891 start_sequence ();
32892 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
32893 seq = get_insns ();
32894 end_sequence ();
32895 if (seq)
32896 emit_insn_before (seq, insn);
32897
32898 ok = recog_memoized (insn) >= 0;
32899 gcc_assert (ok);
32900 }
32901 }
32902 return true;
32903
32904 case V4HImode:
32905 if (!mmx_ok)
32906 return false;
32907 if (TARGET_SSE || TARGET_3DNOW_A)
32908 {
32909 rtx x;
32910
32911 val = gen_lowpart (SImode, val);
32912 x = gen_rtx_TRUNCATE (HImode, val);
32913 x = gen_rtx_VEC_DUPLICATE (mode, x);
32914 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32915 return true;
32916 }
32917 goto widen;
32918
32919 case V8QImode:
32920 if (!mmx_ok)
32921 return false;
32922 goto widen;
32923
32924 case V8HImode:
32925 if (TARGET_SSE2)
32926 {
32927 struct expand_vec_perm_d dperm;
32928 rtx tmp1, tmp2;
32929
32930 permute:
32931 memset (&dperm, 0, sizeof (dperm));
32932 dperm.target = target;
32933 dperm.vmode = mode;
32934 dperm.nelt = GET_MODE_NUNITS (mode);
32935 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
32936
32937 /* Extend to SImode using a paradoxical SUBREG. */
32938 tmp1 = gen_reg_rtx (SImode);
32939 emit_move_insn (tmp1, gen_lowpart (SImode, val));
32940
32941 /* Insert the SImode value as low element of a V4SImode vector. */
32942 tmp2 = gen_lowpart (V4SImode, dperm.op0);
32943 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
32944
32945 ok = (expand_vec_perm_1 (&dperm)
32946 || expand_vec_perm_broadcast_1 (&dperm));
32947 gcc_assert (ok);
32948 return ok;
32949 }
32950 goto widen;
32951
32952 case V16QImode:
32953 if (TARGET_SSE2)
32954 goto permute;
32955 goto widen;
32956
32957 widen:
32958 /* Replicate the value once into the next wider mode and recurse. */
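/* E.g. a QImode value x becomes the HImode value (x << 8) | x and the
   V16QImode broadcast is then done as a V8HImode broadcast.  */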
32959 {
32960 enum machine_mode smode, wsmode, wvmode;
32961 rtx x;
32962
32963 smode = GET_MODE_INNER (mode);
32964 wvmode = get_mode_wider_vector (mode);
32965 wsmode = GET_MODE_INNER (wvmode);
32966
32967 val = convert_modes (wsmode, smode, val, true);
32968 x = expand_simple_binop (wsmode, ASHIFT, val,
32969 GEN_INT (GET_MODE_BITSIZE (smode)),
32970 NULL_RTX, 1, OPTAB_LIB_WIDEN);
32971 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
32972
32973 x = gen_lowpart (wvmode, target);
32974 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
32975 gcc_assert (ok);
32976 return ok;
32977 }
32978
32979 case V16HImode:
32980 case V32QImode:
32981 {
32982 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
32983 rtx x = gen_reg_rtx (hvmode);
32984
32985 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
32986 gcc_assert (ok);
32987
32988 x = gen_rtx_VEC_CONCAT (mode, x, x);
32989 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32990 }
32991 return true;
32992
32993 default:
32994 return false;
32995 }
32996 }
32997
32998 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32999 whose ONE_VAR element is VAR, and other elements are zero. Return true
33000 if successful. */
33001
33002 static bool
33003 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
33004 rtx target, rtx var, int one_var)
33005 {
33006 enum machine_mode vsimode;
33007 rtx new_target;
33008 rtx x, tmp;
33009 bool use_vector_set = false;
33010
33011 switch (mode)
33012 {
33013 case V2DImode:
33014 /* For SSE4.1, we normally use vector set. But if the second
33015 element is zero and inter-unit moves are OK, we use movq
33016 instead. */
33017 use_vector_set = (TARGET_64BIT
33018 && TARGET_SSE4_1
33019 && !(TARGET_INTER_UNIT_MOVES
33020 && one_var == 0));
33021 break;
33022 case V16QImode:
33023 case V4SImode:
33024 case V4SFmode:
33025 use_vector_set = TARGET_SSE4_1;
33026 break;
33027 case V8HImode:
33028 use_vector_set = TARGET_SSE2;
33029 break;
33030 case V4HImode:
33031 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
33032 break;
33033 case V32QImode:
33034 case V16HImode:
33035 case V8SImode:
33036 case V8SFmode:
33037 case V4DFmode:
33038 use_vector_set = TARGET_AVX;
33039 break;
33040 case V4DImode:
33041 /* Use ix86_expand_vector_set in 64bit mode only. */
33042 use_vector_set = TARGET_AVX && TARGET_64BIT;
33043 break;
33044 default:
33045 break;
33046 }
33047
33048 if (use_vector_set)
33049 {
33050 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
33051 var = force_reg (GET_MODE_INNER (mode), var);
33052 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33053 return true;
33054 }
33055
33056 switch (mode)
33057 {
33058 case V2SFmode:
33059 case V2SImode:
33060 if (!mmx_ok)
33061 return false;
33062 /* FALLTHRU */
33063
33064 case V2DFmode:
33065 case V2DImode:
33066 if (one_var != 0)
33067 return false;
33068 var = force_reg (GET_MODE_INNER (mode), var);
33069 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
33070 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33071 return true;
33072
33073 case V4SFmode:
33074 case V4SImode:
33075 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
33076 new_target = gen_reg_rtx (mode);
33077 else
33078 new_target = target;
33079 var = force_reg (GET_MODE_INNER (mode), var);
33080 x = gen_rtx_VEC_DUPLICATE (mode, var);
33081 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
33082 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
33083 if (one_var != 0)
33084 {
33085 /* We need to shuffle the value to the correct position, so
33086 create a new pseudo to store the intermediate result. */
33087
33088 /* With SSE2, we can use the integer shuffle insns. */
33089 if (mode != V4SFmode && TARGET_SSE2)
33090 {
33091 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
33092 const1_rtx,
33093 GEN_INT (one_var == 1 ? 0 : 1),
33094 GEN_INT (one_var == 2 ? 0 : 1),
33095 GEN_INT (one_var == 3 ? 0 : 1)));
33096 if (target != new_target)
33097 emit_move_insn (target, new_target);
33098 return true;
33099 }
33100
33101 /* Otherwise convert the intermediate result to V4SFmode and
33102 use the SSE1 shuffle instructions. */
33103 if (mode != V4SFmode)
33104 {
33105 tmp = gen_reg_rtx (V4SFmode);
33106 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
33107 }
33108 else
33109 tmp = new_target;
33110
33111 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
33112 const1_rtx,
33113 GEN_INT (one_var == 1 ? 0 : 1),
33114 GEN_INT (one_var == 2 ? 0+4 : 1+4),
33115 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
33116
33117 if (mode != V4SFmode)
33118 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
33119 else if (tmp != target)
33120 emit_move_insn (target, tmp);
33121 }
33122 else if (target != new_target)
33123 emit_move_insn (target, new_target);
33124 return true;
33125
33126 case V8HImode:
33127 case V16QImode:
33128 vsimode = V4SImode;
33129 goto widen;
33130 case V4HImode:
33131 case V8QImode:
33132 if (!mmx_ok)
33133 return false;
33134 vsimode = V2SImode;
33135 goto widen;
33136 widen:
33137 if (one_var != 0)
33138 return false;
33139
33140 /* Zero extend the variable element to SImode and recurse. */
33141 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
33142
33143 x = gen_reg_rtx (vsimode);
33144 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
33145 var, one_var))
33146 gcc_unreachable ();
33147
33148 emit_move_insn (target, gen_lowpart (mode, x));
33149 return true;
33150
33151 default:
33152 return false;
33153 }
33154 }
33155
33156 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33157 consisting of the values in VALS. It is known that all elements
33158 except ONE_VAR are constants. Return true if successful. */
33159
33160 static bool
33161 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
33162 rtx target, rtx vals, int one_var)
33163 {
33164 rtx var = XVECEXP (vals, 0, one_var);
33165 enum machine_mode wmode;
33166 rtx const_vec, x;
33167
33168 const_vec = copy_rtx (vals);
33169 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
33170 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
33171
33172 switch (mode)
33173 {
33174 case V2DFmode:
33175 case V2DImode:
33176 case V2SFmode:
33177 case V2SImode:
33178 /* For the two element vectors, it's just as easy to use
33179 the general case. */
33180 return false;
33181
33182 case V4DImode:
33183 /* Use ix86_expand_vector_set in 64bit mode only. */
33184 if (!TARGET_64BIT)
33185 return false;
33186 case V4DFmode:
33187 case V8SFmode:
33188 case V8SImode:
33189 case V16HImode:
33190 case V32QImode:
33191 case V4SFmode:
33192 case V4SImode:
33193 case V8HImode:
33194 case V4HImode:
33195 break;
33196
33197 case V16QImode:
33198 if (TARGET_SSE4_1)
33199 break;
33200 wmode = V8HImode;
33201 goto widen;
33202 case V8QImode:
33203 wmode = V4HImode;
33204 goto widen;
33205 widen:
33206 /* There's no way to set one QImode entry easily. Combine
33207 the variable value with its adjacent constant value, and
33208 promote to an HImode set. */
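/* For example, with one_var == 3 the variable byte lives in the high
half of HImode element 1, so it is zero-extended to HImode, shifted
left by 8 and IORed with the constant low byte before the HImode
vec_set below. */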
33209 x = XVECEXP (vals, 0, one_var ^ 1);
33210 if (one_var & 1)
33211 {
33212 var = convert_modes (HImode, QImode, var, true);
33213 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
33214 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33215 x = GEN_INT (INTVAL (x) & 0xff);
33216 }
33217 else
33218 {
33219 var = convert_modes (HImode, QImode, var, true);
33220 x = gen_int_mode (INTVAL (x) << 8, HImode);
33221 }
33222 if (x != const0_rtx)
33223 var = expand_simple_binop (HImode, IOR, var, x, var,
33224 1, OPTAB_LIB_WIDEN);
33225
33226 x = gen_reg_rtx (wmode);
33227 emit_move_insn (x, gen_lowpart (wmode, const_vec));
33228 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
33229
33230 emit_move_insn (target, gen_lowpart (mode, x));
33231 return true;
33232
33233 default:
33234 return false;
33235 }
33236
33237 emit_move_insn (target, const_vec);
33238 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33239 return true;
33240 }
33241
33242 /* A subroutine of ix86_expand_vector_init_general. Use vector
33243 concatenate to handle the most general case: all values variable,
33244 and none identical. */
33245
33246 static void
33247 ix86_expand_vector_init_concat (enum machine_mode mode,
33248 rtx target, rtx *ops, int n)
33249 {
33250 enum machine_mode cmode, hmode = VOIDmode;
33251 rtx first[8], second[4];
33252 rtvec v;
33253 int i, j;
33254
33255 switch (n)
33256 {
33257 case 2:
33258 switch (mode)
33259 {
33260 case V8SImode:
33261 cmode = V4SImode;
33262 break;
33263 case V8SFmode:
33264 cmode = V4SFmode;
33265 break;
33266 case V4DImode:
33267 cmode = V2DImode;
33268 break;
33269 case V4DFmode:
33270 cmode = V2DFmode;
33271 break;
33272 case V4SImode:
33273 cmode = V2SImode;
33274 break;
33275 case V4SFmode:
33276 cmode = V2SFmode;
33277 break;
33278 case V2DImode:
33279 cmode = DImode;
33280 break;
33281 case V2SImode:
33282 cmode = SImode;
33283 break;
33284 case V2DFmode:
33285 cmode = DFmode;
33286 break;
33287 case V2SFmode:
33288 cmode = SFmode;
33289 break;
33290 default:
33291 gcc_unreachable ();
33292 }
33293
33294 if (!register_operand (ops[1], cmode))
33295 ops[1] = force_reg (cmode, ops[1]);
33296 if (!register_operand (ops[0], cmode))
33297 ops[0] = force_reg (cmode, ops[0]);
33298 emit_insn (gen_rtx_SET (VOIDmode, target,
33299 gen_rtx_VEC_CONCAT (mode, ops[0],
33300 ops[1])));
33301 break;
33302
33303 case 4:
33304 switch (mode)
33305 {
33306 case V4DImode:
33307 cmode = V2DImode;
33308 break;
33309 case V4DFmode:
33310 cmode = V2DFmode;
33311 break;
33312 case V4SImode:
33313 cmode = V2SImode;
33314 break;
33315 case V4SFmode:
33316 cmode = V2SFmode;
33317 break;
33318 default:
33319 gcc_unreachable ();
33320 }
33321 goto half;
33322
33323 case 8:
33324 switch (mode)
33325 {
33326 case V8SImode:
33327 cmode = V2SImode;
33328 hmode = V4SImode;
33329 break;
33330 case V8SFmode:
33331 cmode = V2SFmode;
33332 hmode = V4SFmode;
33333 break;
33334 default:
33335 gcc_unreachable ();
33336 }
33337 goto half;
33338
33339 half:
33340 /* FIXME: We process inputs backward to help RA. PR 36222. */
33341 i = n - 1;
33342 j = (n >> 1) - 1;
33343 for (; i > 0; i -= 2, j--)
33344 {
33345 first[j] = gen_reg_rtx (cmode);
33346 v = gen_rtvec (2, ops[i - 1], ops[i]);
33347 ix86_expand_vector_init (false, first[j],
33348 gen_rtx_PARALLEL (cmode, v));
33349 }
33350
33351 n >>= 1;
33352 if (n > 2)
33353 {
33354 gcc_assert (hmode != VOIDmode);
33355 for (i = j = 0; i < n; i += 2, j++)
33356 {
33357 second[j] = gen_reg_rtx (hmode);
33358 ix86_expand_vector_init_concat (hmode, second [j],
33359 &first [i], 2);
33360 }
33361 n >>= 1;
33362 ix86_expand_vector_init_concat (mode, target, second, n);
33363 }
33364 else
33365 ix86_expand_vector_init_concat (mode, target, first, n);
33366 break;
33367
33368 default:
33369 gcc_unreachable ();
33370 }
33371 }
33372
33373 /* A subroutine of ix86_expand_vector_init_general. Use vector
33374 interleave to handle the most general case: all values variable,
33375 and none identical. */
33376
33377 static void
33378 ix86_expand_vector_init_interleave (enum machine_mode mode,
33379 rtx target, rtx *ops, int n)
33380 {
33381 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
33382 int i, j;
33383 rtx op0, op1;
33384 rtx (*gen_load_even) (rtx, rtx, rtx);
33385 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
33386 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
33387
33388 switch (mode)
33389 {
33390 case V8HImode:
33391 gen_load_even = gen_vec_setv8hi;
33392 gen_interleave_first_low = gen_vec_interleave_lowv4si;
33393 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33394 inner_mode = HImode;
33395 first_imode = V4SImode;
33396 second_imode = V2DImode;
33397 third_imode = VOIDmode;
33398 break;
33399 case V16QImode:
33400 gen_load_even = gen_vec_setv16qi;
33401 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
33402 gen_interleave_second_low = gen_vec_interleave_lowv4si;
33403 inner_mode = QImode;
33404 first_imode = V8HImode;
33405 second_imode = V4SImode;
33406 third_imode = V2DImode;
33407 break;
33408 default:
33409 gcc_unreachable ();
33410 }
33411
33412 for (i = 0; i < n; i++)
33413 {
33414 /* Extend the odd element to SImode using a paradoxical SUBREG. */
33415 op0 = gen_reg_rtx (SImode);
33416 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
33417
33418 /* Insert the SImode value as low element of V4SImode vector. */
33419 op1 = gen_reg_rtx (V4SImode);
33420 op0 = gen_rtx_VEC_MERGE (V4SImode,
33421 gen_rtx_VEC_DUPLICATE (V4SImode,
33422 op0),
33423 CONST0_RTX (V4SImode),
33424 const1_rtx);
33425 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
33426
33427 /* Cast the V4SImode vector back to a vector in the original mode. */
33428 op0 = gen_reg_rtx (mode);
33429 emit_move_insn (op0, gen_lowpart (mode, op1));
33430
33431 /* Load even elements into the second position. */
33432 emit_insn (gen_load_even (op0,
33433 force_reg (inner_mode,
33434 ops [i + i + 1]),
33435 const1_rtx));
33436
33437 /* Cast vector to FIRST_IMODE vector. */
33438 ops[i] = gen_reg_rtx (first_imode);
33439 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
33440 }
33441
33442 /* Interleave low FIRST_IMODE vectors. */
33443 for (i = j = 0; i < n; i += 2, j++)
33444 {
33445 op0 = gen_reg_rtx (first_imode);
33446 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
33447
33448 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
33449 ops[j] = gen_reg_rtx (second_imode);
33450 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
33451 }
33452
33453 /* Interleave low SECOND_IMODE vectors. */
33454 switch (second_imode)
33455 {
33456 case V4SImode:
33457 for (i = j = 0; i < n / 2; i += 2, j++)
33458 {
33459 op0 = gen_reg_rtx (second_imode);
33460 emit_insn (gen_interleave_second_low (op0, ops[i],
33461 ops[i + 1]));
33462
33463 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
33464 vector. */
33465 ops[j] = gen_reg_rtx (third_imode);
33466 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
33467 }
33468 second_imode = V2DImode;
33469 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33470 /* FALLTHRU */
33471
33472 case V2DImode:
33473 op0 = gen_reg_rtx (second_imode);
33474 emit_insn (gen_interleave_second_low (op0, ops[0],
33475 ops[1]));
33476
33477 /* Cast the SECOND_IMODE vector back to a vector in the original
33478 mode. */
33479 emit_insn (gen_rtx_SET (VOIDmode, target,
33480 gen_lowpart (mode, op0)));
33481 break;
33482
33483 default:
33484 gcc_unreachable ();
33485 }
33486 }
33487
33488 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
33489 all values variable, and none identical. */
33490
33491 static void
33492 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
33493 rtx target, rtx vals)
33494 {
33495 rtx ops[32], op0, op1;
33496 enum machine_mode half_mode = VOIDmode;
33497 int n, i;
33498
33499 switch (mode)
33500 {
33501 case V2SFmode:
33502 case V2SImode:
33503 if (!mmx_ok && !TARGET_SSE)
33504 break;
33505 /* FALLTHRU */
33506
33507 case V8SFmode:
33508 case V8SImode:
33509 case V4DFmode:
33510 case V4DImode:
33511 case V4SFmode:
33512 case V4SImode:
33513 case V2DFmode:
33514 case V2DImode:
33515 n = GET_MODE_NUNITS (mode);
33516 for (i = 0; i < n; i++)
33517 ops[i] = XVECEXP (vals, 0, i);
33518 ix86_expand_vector_init_concat (mode, target, ops, n);
33519 return;
33520
33521 case V32QImode:
33522 half_mode = V16QImode;
33523 goto half;
33524
33525 case V16HImode:
33526 half_mode = V8HImode;
33527 goto half;
33528
33529 half:
33530 n = GET_MODE_NUNITS (mode);
33531 for (i = 0; i < n; i++)
33532 ops[i] = XVECEXP (vals, 0, i);
33533 op0 = gen_reg_rtx (half_mode);
33534 op1 = gen_reg_rtx (half_mode);
33535 ix86_expand_vector_init_interleave (half_mode, op0, ops,
33536 n >> 2);
33537 ix86_expand_vector_init_interleave (half_mode, op1,
33538 &ops [n >> 1], n >> 2);
33539 emit_insn (gen_rtx_SET (VOIDmode, target,
33540 gen_rtx_VEC_CONCAT (mode, op0, op1)));
33541 return;
33542
33543 case V16QImode:
33544 if (!TARGET_SSE4_1)
33545 break;
33546 /* FALLTHRU */
33547
33548 case V8HImode:
33549 if (!TARGET_SSE2)
33550 break;
33551
33552 /* Don't use ix86_expand_vector_init_interleave if we can't
33553 move from GPR to SSE register directly. */
33554 if (!TARGET_INTER_UNIT_MOVES)
33555 break;
33556
33557 n = GET_MODE_NUNITS (mode);
33558 for (i = 0; i < n; i++)
33559 ops[i] = XVECEXP (vals, 0, i);
33560 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
33561 return;
33562
33563 case V4HImode:
33564 case V8QImode:
33565 break;
33566
33567 default:
33568 gcc_unreachable ();
33569 }
33570
33571 {
33572 int i, j, n_elts, n_words, n_elt_per_word;
33573 enum machine_mode inner_mode;
33574 rtx words[4], shift;
33575
33576 inner_mode = GET_MODE_INNER (mode);
33577 n_elts = GET_MODE_NUNITS (mode);
33578 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
33579 n_elt_per_word = n_elts / n_words;
33580 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
33581
33582 for (i = 0; i < n_words; ++i)
33583 {
33584 rtx word = NULL_RTX;
33585
33586 for (j = 0; j < n_elt_per_word; ++j)
33587 {
33588 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
33589 elt = convert_modes (word_mode, inner_mode, elt, true);
33590
33591 if (j == 0)
33592 word = elt;
33593 else
33594 {
33595 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
33596 word, 1, OPTAB_LIB_WIDEN);
33597 word = expand_simple_binop (word_mode, IOR, word, elt,
33598 word, 1, OPTAB_LIB_WIDEN);
33599 }
33600 }
33601
33602 words[i] = word;
33603 }
33604
33605 if (n_words == 1)
33606 emit_move_insn (target, gen_lowpart (mode, words[0]));
33607 else if (n_words == 2)
33608 {
33609 rtx tmp = gen_reg_rtx (mode);
33610 emit_clobber (tmp);
33611 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
33612 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
33613 emit_move_insn (target, tmp);
33614 }
33615 else if (n_words == 4)
33616 {
33617 rtx tmp = gen_reg_rtx (V4SImode);
33618 gcc_assert (word_mode == SImode);
33619 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
33620 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
33621 emit_move_insn (target, gen_lowpart (mode, tmp));
33622 }
33623 else
33624 gcc_unreachable ();
33625 }
33626 }
33627
33628 /* Initialize vector TARGET via VALS. Suppress the use of MMX
33629 instructions unless MMX_OK is true. */
33630
33631 void
33632 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
33633 {
33634 enum machine_mode mode = GET_MODE (target);
33635 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33636 int n_elts = GET_MODE_NUNITS (mode);
33637 int n_var = 0, one_var = -1;
33638 bool all_same = true, all_const_zero = true;
33639 int i;
33640 rtx x;
33641
33642 for (i = 0; i < n_elts; ++i)
33643 {
33644 x = XVECEXP (vals, 0, i);
33645 if (!(CONST_INT_P (x)
33646 || GET_CODE (x) == CONST_DOUBLE
33647 || GET_CODE (x) == CONST_FIXED))
33648 n_var++, one_var = i;
33649 else if (x != CONST0_RTX (inner_mode))
33650 all_const_zero = false;
33651 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
33652 all_same = false;
33653 }
33654
33655 /* Constants are best loaded from the constant pool. */
33656 if (n_var == 0)
33657 {
33658 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
33659 return;
33660 }
33661
33662 /* If all values are identical, broadcast the value. */
33663 if (all_same
33664 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
33665 XVECEXP (vals, 0, 0)))
33666 return;
33667
33668 /* Values where only one field is non-constant are best loaded from
33669 the pool and overwritten via move later. */
33670 if (n_var == 1)
33671 {
33672 if (all_const_zero
33673 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
33674 XVECEXP (vals, 0, one_var),
33675 one_var))
33676 return;
33677
33678 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
33679 return;
33680 }
33681
33682 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
33683 }
33684
33685 void
33686 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
33687 {
33688 enum machine_mode mode = GET_MODE (target);
33689 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33690 enum machine_mode half_mode;
33691 bool use_vec_merge = false;
33692 rtx tmp;
33693 static rtx (*gen_extract[6][2]) (rtx, rtx)
33694 = {
33695 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
33696 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
33697 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
33698 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
33699 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
33700 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
33701 };
33702 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
33703 = {
33704 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
33705 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
33706 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
33707 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
33708 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
33709 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
33710 };
33711 int i, j, n;
33712
33713 switch (mode)
33714 {
33715 case V2SFmode:
33716 case V2SImode:
33717 if (mmx_ok)
33718 {
33719 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33720 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
33721 if (elt == 0)
33722 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33723 else
33724 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33725 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33726 return;
33727 }
33728 break;
33729
33730 case V2DImode:
33731 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
33732 if (use_vec_merge)
33733 break;
33734
33735 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33736 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
33737 if (elt == 0)
33738 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33739 else
33740 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33741 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33742 return;
33743
33744 case V2DFmode:
33745 {
33746 rtx op0, op1;
33747
33748 /* For the two element vectors, we implement a VEC_CONCAT with
33749 the extraction of the other element. */
33750
33751 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
33752 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
33753
33754 if (elt == 0)
33755 op0 = val, op1 = tmp;
33756 else
33757 op0 = tmp, op1 = val;
33758
33759 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
33760 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33761 }
33762 return;
33763
33764 case V4SFmode:
33765 use_vec_merge = TARGET_SSE4_1;
33766 if (use_vec_merge)
33767 break;
33768
33769 switch (elt)
33770 {
33771 case 0:
33772 use_vec_merge = true;
33773 break;
33774
33775 case 1:
33776 /* tmp = target = A B C D */
33777 tmp = copy_to_reg (target);
33778 /* target = A A B B */
33779 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
33780 /* target = X A B B */
33781 ix86_expand_vector_set (false, target, val, 0);
33782 /* target = A X C D */
33783 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33784 const1_rtx, const0_rtx,
33785 GEN_INT (2+4), GEN_INT (3+4)));
33786 return;
33787
33788 case 2:
33789 /* tmp = target = A B C D */
33790 tmp = copy_to_reg (target);
33791 /* tmp = X B C D */
33792 ix86_expand_vector_set (false, tmp, val, 0);
33793 /* target = A B X D */
33794 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33795 const0_rtx, const1_rtx,
33796 GEN_INT (0+4), GEN_INT (3+4)));
33797 return;
33798
33799 case 3:
33800 /* tmp = target = A B C D */
33801 tmp = copy_to_reg (target);
33802 /* tmp = X B C D */
33803 ix86_expand_vector_set (false, tmp, val, 0);
33804 /* target = A B C X */
33805 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33806 const0_rtx, const1_rtx,
33807 GEN_INT (2+4), GEN_INT (0+4)));
33808 return;
33809
33810 default:
33811 gcc_unreachable ();
33812 }
33813 break;
33814
33815 case V4SImode:
33816 use_vec_merge = TARGET_SSE4_1;
33817 if (use_vec_merge)
33818 break;
33819
33820 /* Element 0 handled by vec_merge below. */
33821 if (elt == 0)
33822 {
33823 use_vec_merge = true;
33824 break;
33825 }
33826
33827 if (TARGET_SSE2)
33828 {
33829 /* With SSE2, use integer shuffles to swap element 0 and ELT,
33830 store into element 0, then shuffle them back. */
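/* For example, ELT == 2 builds the permutation { 2, 1, 0, 3 }, which
swaps elements 0 and 2; a transposition is its own inverse, so the
second pshufd with the same order restores the untouched elements. */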
33831
33832 rtx order[4];
33833
33834 order[0] = GEN_INT (elt);
33835 order[1] = const1_rtx;
33836 order[2] = const2_rtx;
33837 order[3] = GEN_INT (3);
33838 order[elt] = const0_rtx;
33839
33840 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33841 order[1], order[2], order[3]));
33842
33843 ix86_expand_vector_set (false, target, val, 0);
33844
33845 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33846 order[1], order[2], order[3]));
33847 }
33848 else
33849 {
33850 /* For SSE1, we have to reuse the V4SF code. */
33851 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
33852 gen_lowpart (SFmode, val), elt);
33853 }
33854 return;
33855
33856 case V8HImode:
33857 use_vec_merge = TARGET_SSE2;
33858 break;
33859 case V4HImode:
33860 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33861 break;
33862
33863 case V16QImode:
33864 use_vec_merge = TARGET_SSE4_1;
33865 break;
33866
33867 case V8QImode:
33868 break;
33869
33870 case V32QImode:
33871 half_mode = V16QImode;
33872 j = 0;
33873 n = 16;
33874 goto half;
33875
33876 case V16HImode:
33877 half_mode = V8HImode;
33878 j = 1;
33879 n = 8;
33880 goto half;
33881
33882 case V8SImode:
33883 half_mode = V4SImode;
33884 j = 2;
33885 n = 4;
33886 goto half;
33887
33888 case V4DImode:
33889 half_mode = V2DImode;
33890 j = 3;
33891 n = 2;
33892 goto half;
33893
33894 case V8SFmode:
33895 half_mode = V4SFmode;
33896 j = 4;
33897 n = 4;
33898 goto half;
33899
33900 case V4DFmode:
33901 half_mode = V2DFmode;
33902 j = 5;
33903 n = 2;
33904 goto half;
33905
33906 half:
33907 /* Compute offset. */
33908 i = elt / n;
33909 elt %= n;
33910
33911 gcc_assert (i <= 1);
33912
33913 /* Extract the half. */
33914 tmp = gen_reg_rtx (half_mode);
33915 emit_insn (gen_extract[j][i] (tmp, target));
33916
33917 /* Put val in tmp at elt. */
33918 ix86_expand_vector_set (false, tmp, val, elt);
33919
33920 /* Put it back. */
33921 emit_insn (gen_insert[j][i] (target, target, tmp));
33922 return;
33923
33924 default:
33925 break;
33926 }
33927
33928 if (use_vec_merge)
33929 {
33930 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
33931 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
33932 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33933 }
33934 else
33935 {
33936 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33937
33938 emit_move_insn (mem, target);
33939
33940 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33941 emit_move_insn (tmp, val);
33942
33943 emit_move_insn (target, mem);
33944 }
33945 }
33946
33947 void
33948 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
33949 {
33950 enum machine_mode mode = GET_MODE (vec);
33951 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33952 bool use_vec_extr = false;
33953 rtx tmp;
33954
33955 switch (mode)
33956 {
33957 case V2SImode:
33958 case V2SFmode:
33959 if (!mmx_ok)
33960 break;
33961 /* FALLTHRU */
33962
33963 case V2DFmode:
33964 case V2DImode:
33965 use_vec_extr = true;
33966 break;
33967
33968 case V4SFmode:
33969 use_vec_extr = TARGET_SSE4_1;
33970 if (use_vec_extr)
33971 break;
33972
33973 switch (elt)
33974 {
33975 case 0:
33976 tmp = vec;
33977 break;
33978
33979 case 1:
33980 case 3:
33981 tmp = gen_reg_rtx (mode);
33982 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
33983 GEN_INT (elt), GEN_INT (elt),
33984 GEN_INT (elt+4), GEN_INT (elt+4)));
33985 break;
33986
33987 case 2:
33988 tmp = gen_reg_rtx (mode);
33989 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
33990 break;
33991
33992 default:
33993 gcc_unreachable ();
33994 }
33995 vec = tmp;
33996 use_vec_extr = true;
33997 elt = 0;
33998 break;
33999
34000 case V4SImode:
34001 use_vec_extr = TARGET_SSE4_1;
34002 if (use_vec_extr)
34003 break;
34004
34005 if (TARGET_SSE2)
34006 {
34007 switch (elt)
34008 {
34009 case 0:
34010 tmp = vec;
34011 break;
34012
34013 case 1:
34014 case 3:
34015 tmp = gen_reg_rtx (mode);
34016 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
34017 GEN_INT (elt), GEN_INT (elt),
34018 GEN_INT (elt), GEN_INT (elt)));
34019 break;
34020
34021 case 2:
34022 tmp = gen_reg_rtx (mode);
34023 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
34024 break;
34025
34026 default:
34027 gcc_unreachable ();
34028 }
34029 vec = tmp;
34030 use_vec_extr = true;
34031 elt = 0;
34032 }
34033 else
34034 {
34035 /* For SSE1, we have to reuse the V4SF code. */
34036 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
34037 gen_lowpart (V4SFmode, vec), elt);
34038 return;
34039 }
34040 break;
34041
34042 case V8HImode:
34043 use_vec_extr = TARGET_SSE2;
34044 break;
34045 case V4HImode:
34046 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34047 break;
34048
34049 case V16QImode:
34050 use_vec_extr = TARGET_SSE4_1;
34051 break;
34052
34053 case V8SFmode:
34054 if (TARGET_AVX)
34055 {
34056 tmp = gen_reg_rtx (V4SFmode);
34057 if (elt < 4)
34058 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
34059 else
34060 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
34061 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34062 return;
34063 }
34064 break;
34065
34066 case V4DFmode:
34067 if (TARGET_AVX)
34068 {
34069 tmp = gen_reg_rtx (V2DFmode);
34070 if (elt < 2)
34071 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
34072 else
34073 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
34074 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34075 return;
34076 }
34077 break;
34078
34079 case V32QImode:
34080 if (TARGET_AVX)
34081 {
34082 tmp = gen_reg_rtx (V16QImode);
34083 if (elt < 16)
34084 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
34085 else
34086 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
34087 ix86_expand_vector_extract (false, target, tmp, elt & 15);
34088 return;
34089 }
34090 break;
34091
34092 case V16HImode:
34093 if (TARGET_AVX)
34094 {
34095 tmp = gen_reg_rtx (V8HImode);
34096 if (elt < 8)
34097 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
34098 else
34099 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
34100 ix86_expand_vector_extract (false, target, tmp, elt & 7);
34101 return;
34102 }
34103 break;
34104
34105 case V8SImode:
34106 if (TARGET_AVX)
34107 {
34108 tmp = gen_reg_rtx (V4SImode);
34109 if (elt < 4)
34110 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
34111 else
34112 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
34113 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34114 return;
34115 }
34116 break;
34117
34118 case V4DImode:
34119 if (TARGET_AVX)
34120 {
34121 tmp = gen_reg_rtx (V2DImode);
34122 if (elt < 2)
34123 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
34124 else
34125 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
34126 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34127 return;
34128 }
34129 break;
34130
34131 case V8QImode:
34132 /* ??? Could extract the appropriate HImode element and shift. */
34133 default:
34134 break;
34135 }
34136
34137 if (use_vec_extr)
34138 {
34139 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
34140 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
34141
34142 /* Let the rtl optimizers know about the zero extension performed. */
34143 if (inner_mode == QImode || inner_mode == HImode)
34144 {
34145 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
34146 target = gen_lowpart (SImode, target);
34147 }
34148
34149 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34150 }
34151 else
34152 {
34153 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
34154
34155 emit_move_insn (mem, vec);
34156
34157 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34158 emit_move_insn (target, tmp);
34159 }
34160 }
34161
34162 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
34163 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
34164 The upper bits of DEST are undefined, though they shouldn't cause
34165 exceptions (some bits from src or all zeros are ok). */
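/* For the 128-bit integer modes this is a logical right shift of the
whole register: e.g. for V4SImode and I == 128 the V1TImode shift by
64 bits moves elements 2 and 3 of SRC into elements 0 and 1 of DEST. */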
34166
34167 static void
34168 emit_reduc_half (rtx dest, rtx src, int i)
34169 {
34170 rtx tem;
34171 switch (GET_MODE (src))
34172 {
34173 case V4SFmode:
34174 if (i == 128)
34175 tem = gen_sse_movhlps (dest, src, src);
34176 else
34177 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
34178 GEN_INT (1 + 4), GEN_INT (1 + 4));
34179 break;
34180 case V2DFmode:
34181 tem = gen_vec_interleave_highv2df (dest, src, src);
34182 break;
34183 case V16QImode:
34184 case V8HImode:
34185 case V4SImode:
34186 case V2DImode:
34187 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
34188 gen_lowpart (V1TImode, src),
34189 GEN_INT (i / 2));
34190 break;
34191 case V8SFmode:
34192 if (i == 256)
34193 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
34194 else
34195 tem = gen_avx_shufps256 (dest, src, src,
34196 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
34197 break;
34198 case V4DFmode:
34199 if (i == 256)
34200 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
34201 else
34202 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
34203 break;
34204 case V32QImode:
34205 case V16HImode:
34206 case V8SImode:
34207 case V4DImode:
34208 if (i == 256)
34209 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
34210 gen_lowpart (V4DImode, src),
34211 gen_lowpart (V4DImode, src),
34212 const1_rtx);
34213 else
34214 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
34215 gen_lowpart (V2TImode, src),
34216 GEN_INT (i / 2));
34217 break;
34218 default:
34219 gcc_unreachable ();
34220 }
34221 emit_insn (tem);
34222 }
34223
34224 /* Expand a vector reduction. FN is the binary pattern to reduce;
34225 DEST is the destination; IN is the input vector. */
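/* The loop below folds the upper half onto the lower half at each
step: for a V4SImode maximum it first combines { a, b, c, d } with
{ c, d, x, x }, then combines that result with itself shifted down by
one element, leaving the reduced value in element 0 of DEST (the
remaining elements are don't-cares). */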
34226
34227 void
34228 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
34229 {
34230 rtx half, dst, vec = in;
34231 enum machine_mode mode = GET_MODE (in);
34232 int i;
34233
34234 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
34235 if (TARGET_SSE4_1
34236 && mode == V8HImode
34237 && fn == gen_uminv8hi3)
34238 {
34239 emit_insn (gen_sse4_1_phminposuw (dest, in));
34240 return;
34241 }
34242
34243 for (i = GET_MODE_BITSIZE (mode);
34244 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
34245 i >>= 1)
34246 {
34247 half = gen_reg_rtx (mode);
34248 emit_reduc_half (half, vec, i);
34249 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
34250 dst = dest;
34251 else
34252 dst = gen_reg_rtx (mode);
34253 emit_insn (fn (dst, half, vec));
34254 vec = dst;
34255 }
34256 }
34257 \f
34258 /* Target hook for scalar_mode_supported_p. */
34259 static bool
34260 ix86_scalar_mode_supported_p (enum machine_mode mode)
34261 {
34262 if (DECIMAL_FLOAT_MODE_P (mode))
34263 return default_decimal_float_supported_p ();
34264 else if (mode == TFmode)
34265 return true;
34266 else
34267 return default_scalar_mode_supported_p (mode);
34268 }
34269
34270 /* Implements target hook vector_mode_supported_p. */
34271 static bool
34272 ix86_vector_mode_supported_p (enum machine_mode mode)
34273 {
34274 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34275 return true;
34276 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34277 return true;
34278 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34279 return true;
34280 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
34281 return true;
34282 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
34283 return true;
34284 return false;
34285 }
34286
34287 /* Target hook for c_mode_for_suffix. */
34288 static enum machine_mode
34289 ix86_c_mode_for_suffix (char suffix)
34290 {
34291 if (suffix == 'q')
34292 return TFmode;
34293 if (suffix == 'w')
34294 return XFmode;
34295
34296 return VOIDmode;
34297 }
34298
34299 /* Worker function for TARGET_MD_ASM_CLOBBERS.
34300
34301 We do this in the new i386 backend to maintain source compatibility
34302 with the old cc0-based compiler. */
34303
34304 static tree
34305 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
34306 tree inputs ATTRIBUTE_UNUSED,
34307 tree clobbers)
34308 {
34309 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
34310 clobbers);
34311 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
34312 clobbers);
34313 return clobbers;
34314 }
34315
34316 /* Implements target vector targetm.asm.encode_section_info. */
34317
34318 static void ATTRIBUTE_UNUSED
34319 ix86_encode_section_info (tree decl, rtx rtl, int first)
34320 {
34321 default_encode_section_info (decl, rtl, first);
34322
34323 if (TREE_CODE (decl) == VAR_DECL
34324 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
34325 && ix86_in_large_data_p (decl))
34326 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
34327 }
34328
34329 /* Worker function for REVERSE_CONDITION. */
34330
34331 enum rtx_code
34332 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
34333 {
34334 return (mode != CCFPmode && mode != CCFPUmode
34335 ? reverse_condition (code)
34336 : reverse_condition_maybe_unordered (code));
34337 }
34338
34339 /* Output code to perform an x87 FP register move, from OPERANDS[1]
34340 to OPERANDS[0]. */
34341
34342 const char *
34343 output_387_reg_move (rtx insn, rtx *operands)
34344 {
34345 if (REG_P (operands[0]))
34346 {
34347 if (REG_P (operands[1])
34348 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34349 {
34350 if (REGNO (operands[0]) == FIRST_STACK_REG)
34351 return output_387_ffreep (operands, 0);
34352 return "fstp\t%y0";
34353 }
34354 if (STACK_TOP_P (operands[0]))
34355 return "fld%Z1\t%y1";
34356 return "fst\t%y0";
34357 }
34358 else if (MEM_P (operands[0]))
34359 {
34360 gcc_assert (REG_P (operands[1]));
34361 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34362 return "fstp%Z0\t%y0";
34363 else
34364 {
34365 /* There is no non-popping store to memory for XFmode.
34366 So if we need one, follow the store with a load. */
34367 if (GET_MODE (operands[0]) == XFmode)
34368 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
34369 else
34370 return "fst%Z0\t%y0";
34371 }
34372 }
34373 else
34374 gcc_unreachable();
34375 }
34376
34377 /* Output code to perform a conditional jump to LABEL, if C2 flag in
34378 FP status register is set. */
34379
34380 void
34381 ix86_emit_fp_unordered_jump (rtx label)
34382 {
34383 rtx reg = gen_reg_rtx (HImode);
34384 rtx temp;
34385
34386 emit_insn (gen_x86_fnstsw_1 (reg));
34387
34388 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
34389 {
34390 emit_insn (gen_x86_sahf_1 (reg));
34391
34392 temp = gen_rtx_REG (CCmode, FLAGS_REG);
34393 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
34394 }
34395 else
34396 {
34397 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
34398
34399 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
34400 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
34401 }
34402
34403 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
34404 gen_rtx_LABEL_REF (VOIDmode, label),
34405 pc_rtx);
34406 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
34407
34408 emit_jump_insn (temp);
34409 predict_jump (REG_BR_PROB_BASE * 10 / 100);
34410 }
34411
34412 /* Output code to perform a log1p XFmode calculation. */
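/* The comparison constant used below is 1 - sqrt(2)/2; fyl2xp1 is
only specified for arguments of smaller magnitude, so larger inputs
branch to label1 and compute log (1 + x) with fyl2x instead. */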
34413
34414 void ix86_emit_i387_log1p (rtx op0, rtx op1)
34415 {
34416 rtx label1 = gen_label_rtx ();
34417 rtx label2 = gen_label_rtx ();
34418
34419 rtx tmp = gen_reg_rtx (XFmode);
34420 rtx tmp2 = gen_reg_rtx (XFmode);
34421 rtx test;
34422
34423 emit_insn (gen_absxf2 (tmp, op1));
34424 test = gen_rtx_GE (VOIDmode, tmp,
34425 CONST_DOUBLE_FROM_REAL_VALUE (
34426 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
34427 XFmode));
34428 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
34429
34430 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34431 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
34432 emit_jump (label2);
34433
34434 emit_label (label1);
34435 emit_move_insn (tmp, CONST1_RTX (XFmode));
34436 emit_insn (gen_addxf3 (tmp, op1, tmp));
34437 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34438 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
34439
34440 emit_label (label2);
34441 }
34442
34443 /* Emit code for round calculation. */
34444 void ix86_emit_i387_round (rtx op0, rtx op1)
34445 {
34446 enum machine_mode inmode = GET_MODE (op1);
34447 enum machine_mode outmode = GET_MODE (op0);
34448 rtx e1, e2, res, tmp, tmp1, half;
34449 rtx scratch = gen_reg_rtx (HImode);
34450 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
34451 rtx jump_label = gen_label_rtx ();
34452 rtx insn;
34453 rtx (*gen_abs) (rtx, rtx);
34454 rtx (*gen_neg) (rtx, rtx);
34455
34456 switch (inmode)
34457 {
34458 case SFmode:
34459 gen_abs = gen_abssf2;
34460 break;
34461 case DFmode:
34462 gen_abs = gen_absdf2;
34463 break;
34464 case XFmode:
34465 gen_abs = gen_absxf2;
34466 break;
34467 default:
34468 gcc_unreachable ();
34469 }
34470
34471 switch (outmode)
34472 {
34473 case SFmode:
34474 gen_neg = gen_negsf2;
34475 break;
34476 case DFmode:
34477 gen_neg = gen_negdf2;
34478 break;
34479 case XFmode:
34480 gen_neg = gen_negxf2;
34481 break;
34482 case HImode:
34483 gen_neg = gen_neghi2;
34484 break;
34485 case SImode:
34486 gen_neg = gen_negsi2;
34487 break;
34488 case DImode:
34489 gen_neg = gen_negdi2;
34490 break;
34491 default:
34492 gcc_unreachable ();
34493 }
34494
34495 e1 = gen_reg_rtx (inmode);
34496 e2 = gen_reg_rtx (inmode);
34497 res = gen_reg_rtx (outmode);
34498
34499 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
34500
34501 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
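/* This rounds halfway cases away from zero: e.g. round (-2.5) is
computed as -floor (2.5 + 0.5) = -3, unlike the round-to-even rint. */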
34502
34503 /* scratch = fxam(op1) */
34504 emit_insn (gen_rtx_SET (VOIDmode, scratch,
34505 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
34506 UNSPEC_FXAM)));
34507 /* e1 = fabs(op1) */
34508 emit_insn (gen_abs (e1, op1));
34509
34510 /* e2 = e1 + 0.5 */
34511 half = force_reg (inmode, half);
34512 emit_insn (gen_rtx_SET (VOIDmode, e2,
34513 gen_rtx_PLUS (inmode, e1, half)));
34514
34515 /* res = floor(e2) */
34516 if (inmode != XFmode)
34517 {
34518 tmp1 = gen_reg_rtx (XFmode);
34519
34520 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
34521 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
34522 }
34523 else
34524 tmp1 = e2;
34525
34526 switch (outmode)
34527 {
34528 case SFmode:
34529 case DFmode:
34530 {
34531 rtx tmp0 = gen_reg_rtx (XFmode);
34532
34533 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
34534
34535 emit_insn (gen_rtx_SET (VOIDmode, res,
34536 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
34537 UNSPEC_TRUNC_NOOP)));
34538 }
34539 break;
34540 case XFmode:
34541 emit_insn (gen_frndintxf2_floor (res, tmp1));
34542 break;
34543 case HImode:
34544 emit_insn (gen_lfloorxfhi2 (res, tmp1));
34545 break;
34546 case SImode:
34547 emit_insn (gen_lfloorxfsi2 (res, tmp1));
34548 break;
34549 case DImode:
34550 emit_insn (gen_lfloorxfdi2 (res, tmp1));
34551 break;
34552 default:
34553 gcc_unreachable ();
34554 }
34555
34556 /* flags = signbit(a) */
34557 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
34558
34559 /* if (flags) then res = -res */
34560 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
34561 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
34562 gen_rtx_LABEL_REF (VOIDmode, jump_label),
34563 pc_rtx);
34564 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34565 predict_jump (REG_BR_PROB_BASE * 50 / 100);
34566 JUMP_LABEL (insn) = jump_label;
34567
34568 emit_insn (gen_neg (res, res));
34569
34570 emit_label (jump_label);
34571 LABEL_NUSES (jump_label) = 1;
34572
34573 emit_move_insn (op0, res);
34574 }
34575
34576 /* Output code to perform a Newton-Raphson approximation of a single precision
34577 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
34578
34579 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
34580 {
34581 rtx x0, x1, e0, e1;
34582
34583 x0 = gen_reg_rtx (mode);
34584 e0 = gen_reg_rtx (mode);
34585 e1 = gen_reg_rtx (mode);
34586 x1 = gen_reg_rtx (mode);
34587
34588 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
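/* This is one Newton-Raphson step for 1/b: with x0 = rcp (b) the
refined estimate is x1 = x0 * (2 - b * x0) = 2*x0 - b*x0*x0, which is
exactly what the e0/e1/x1 sequence below computes before the final
multiplication by a. */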
34589
34590 b = force_reg (mode, b);
34591
34592 /* x0 = rcp(b) estimate */
34593 emit_insn (gen_rtx_SET (VOIDmode, x0,
34594 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
34595 UNSPEC_RCP)));
34596 /* e0 = x0 * b */
34597 emit_insn (gen_rtx_SET (VOIDmode, e0,
34598 gen_rtx_MULT (mode, x0, b)));
34599
34600 /* e0 = x0 * e0 */
34601 emit_insn (gen_rtx_SET (VOIDmode, e0,
34602 gen_rtx_MULT (mode, x0, e0)));
34603
34604 /* e1 = x0 + x0 */
34605 emit_insn (gen_rtx_SET (VOIDmode, e1,
34606 gen_rtx_PLUS (mode, x0, x0)));
34607
34608 /* x1 = e1 - e0 */
34609 emit_insn (gen_rtx_SET (VOIDmode, x1,
34610 gen_rtx_MINUS (mode, e1, e0)));
34611
34612 /* res = a * x1 */
34613 emit_insn (gen_rtx_SET (VOIDmode, res,
34614 gen_rtx_MULT (mode, a, x1)));
34615 }
34616
34617 /* Output code to perform a Newton-Raphson approximation of a
34618 single precision floating point [reciprocal] square root. */
34619
34620 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
34621 bool recip)
34622 {
34623 rtx x0, e0, e1, e2, e3, mthree, mhalf;
34624 REAL_VALUE_TYPE r;
34625
34626 x0 = gen_reg_rtx (mode);
34627 e0 = gen_reg_rtx (mode);
34628 e1 = gen_reg_rtx (mode);
34629 e2 = gen_reg_rtx (mode);
34630 e3 = gen_reg_rtx (mode);
34631
34632 real_from_integer (&r, VOIDmode, -3, -1, 0);
34633 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34634
34635 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
34636 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34637
34638 if (VECTOR_MODE_P (mode))
34639 {
34640 mthree = ix86_build_const_vector (mode, true, mthree);
34641 mhalf = ix86_build_const_vector (mode, true, mhalf);
34642 }
34643
34644 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
34645 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
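/* Both formulas are the Newton-Raphson step for 1/sqrt(a) with the
constants folded: x1 = 0.5 * x0 * (3 - a*x0*x0) is written as
-0.5 * x0 * (a*x0*x0 - 3), and the sqrt variant multiplies by a once
more because sqrt(a) = a * rsqrt(a). */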
34646
34647 a = force_reg (mode, a);
34648
34649 /* x0 = rsqrt(a) estimate */
34650 emit_insn (gen_rtx_SET (VOIDmode, x0,
34651 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
34652 UNSPEC_RSQRT)));
34653
34654 /* If a == 0.0, zero the rsqrt estimate so the 0 * inf product below cannot turn sqrt (0.0) into a NaN. */
34655 if (!recip)
34656 {
34657 rtx zero, mask;
34658
34659 zero = gen_reg_rtx (mode);
34660 mask = gen_reg_rtx (mode);
34661
34662 zero = force_reg (mode, CONST0_RTX(mode));
34663 emit_insn (gen_rtx_SET (VOIDmode, mask,
34664 gen_rtx_NE (mode, zero, a)));
34665
34666 emit_insn (gen_rtx_SET (VOIDmode, x0,
34667 gen_rtx_AND (mode, x0, mask)));
34668 }
34669
34670 /* e0 = x0 * a */
34671 emit_insn (gen_rtx_SET (VOIDmode, e0,
34672 gen_rtx_MULT (mode, x0, a)));
34673 /* e1 = e0 * x0 */
34674 emit_insn (gen_rtx_SET (VOIDmode, e1,
34675 gen_rtx_MULT (mode, e0, x0)));
34676
34677 /* e2 = e1 - 3. */
34678 mthree = force_reg (mode, mthree);
34679 emit_insn (gen_rtx_SET (VOIDmode, e2,
34680 gen_rtx_PLUS (mode, e1, mthree)));
34681
34682 mhalf = force_reg (mode, mhalf);
34683 if (recip)
34684 /* e3 = -.5 * x0 */
34685 emit_insn (gen_rtx_SET (VOIDmode, e3,
34686 gen_rtx_MULT (mode, x0, mhalf)));
34687 else
34688 /* e3 = -.5 * e0 */
34689 emit_insn (gen_rtx_SET (VOIDmode, e3,
34690 gen_rtx_MULT (mode, e0, mhalf)));
34691 /* ret = e2 * e3 */
34692 emit_insn (gen_rtx_SET (VOIDmode, res,
34693 gen_rtx_MULT (mode, e2, e3)));
34694 }
34695
34696 #ifdef TARGET_SOLARIS
34697 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
34698
34699 static void
34700 i386_solaris_elf_named_section (const char *name, unsigned int flags,
34701 tree decl)
34702 {
34703 /* With Binutils 2.15, the "@unwind" marker must be specified on
34704 every occurrence of the ".eh_frame" section, not just the first
34705 one. */
34706 if (TARGET_64BIT
34707 && strcmp (name, ".eh_frame") == 0)
34708 {
34709 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
34710 flags & SECTION_WRITE ? "aw" : "a");
34711 return;
34712 }
34713
34714 #ifndef USE_GAS
34715 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
34716 {
34717 solaris_elf_asm_comdat_section (name, flags, decl);
34718 return;
34719 }
34720 #endif
34721
34722 default_elf_asm_named_section (name, flags, decl);
34723 }
34724 #endif /* TARGET_SOLARIS */
34725
34726 /* Return the mangling of TYPE if it is an extended fundamental type. */
34727
34728 static const char *
34729 ix86_mangle_type (const_tree type)
34730 {
34731 type = TYPE_MAIN_VARIANT (type);
34732
34733 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
34734 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
34735 return NULL;
34736
34737 switch (TYPE_MODE (type))
34738 {
34739 case TFmode:
34740 /* __float128 is "g". */
34741 return "g";
34742 case XFmode:
34743 /* "long double" or __float80 is "e". */
34744 return "e";
34745 default:
34746 return NULL;
34747 }
34748 }
34749
34750 /* For 32-bit code we can save PIC register setup by using
34751 the __stack_chk_fail_local hidden function instead of calling
34752 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
34753 register, so it is better to call __stack_chk_fail directly. */
34754
34755 static tree ATTRIBUTE_UNUSED
34756 ix86_stack_protect_fail (void)
34757 {
34758 return TARGET_64BIT
34759 ? default_external_stack_protect_fail ()
34760 : default_hidden_stack_protect_fail ();
34761 }
34762
34763 /* Select a format to encode pointers in exception handling data. CODE
34764 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
34765 true if the symbol may be affected by dynamic relocations.
34766
34767 ??? All x86 object file formats are capable of representing this.
34768 After all, the relocation needed is the same as for the call insn.
34769 Whether or not a particular assembler allows us to enter such, I
34770 guess we'll have to see. */
34771 int
34772 asm_preferred_eh_data_format (int code, int global)
34773 {
34774 if (flag_pic)
34775 {
34776 int type = DW_EH_PE_sdata8;
34777 if (!TARGET_64BIT
34778 || ix86_cmodel == CM_SMALL_PIC
34779 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
34780 type = DW_EH_PE_sdata4;
34781 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
34782 }
34783 if (ix86_cmodel == CM_SMALL
34784 || (ix86_cmodel == CM_MEDIUM && code))
34785 return DW_EH_PE_udata4;
34786 return DW_EH_PE_absptr;
34787 }
34788 \f
34789 /* Expand copysign from SIGN to the positive value ABS_VALUE
34790 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
34791 the sign-bit. */
34792 static void
34793 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
34794 {
34795 enum machine_mode mode = GET_MODE (sign);
34796 rtx sgn = gen_reg_rtx (mode);
34797 if (mask == NULL_RTX)
34798 {
34799 enum machine_mode vmode;
34800
34801 if (mode == SFmode)
34802 vmode = V4SFmode;
34803 else if (mode == DFmode)
34804 vmode = V2DFmode;
34805 else
34806 vmode = mode;
34807
34808 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
34809 if (!VECTOR_MODE_P (mode))
34810 {
34811 /* We need to generate a scalar mode mask in this case. */
34812 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34813 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34814 mask = gen_reg_rtx (mode);
34815 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34816 }
34817 }
34818 else
34819 mask = gen_rtx_NOT (mode, mask);
34820 emit_insn (gen_rtx_SET (VOIDmode, sgn,
34821 gen_rtx_AND (mode, mask, sign)));
34822 emit_insn (gen_rtx_SET (VOIDmode, result,
34823 gen_rtx_IOR (mode, abs_value, sgn)));
34824 }
34825
34826 /* Expand fabs (OP0) and return a new rtx that holds the result. The
34827 mask for masking out the sign-bit is stored in *SMASK, if that is
34828 non-null. */
34829 static rtx
34830 ix86_expand_sse_fabs (rtx op0, rtx *smask)
34831 {
34832 enum machine_mode vmode, mode = GET_MODE (op0);
34833 rtx xa, mask;
34834
34835 xa = gen_reg_rtx (mode);
34836 if (mode == SFmode)
34837 vmode = V4SFmode;
34838 else if (mode == DFmode)
34839 vmode = V2DFmode;
34840 else
34841 vmode = mode;
34842 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
34843 if (!VECTOR_MODE_P (mode))
34844 {
34845 /* We need to generate a scalar mode mask in this case. */
34846 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34847 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34848 mask = gen_reg_rtx (mode);
34849 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34850 }
34851 emit_insn (gen_rtx_SET (VOIDmode, xa,
34852 gen_rtx_AND (mode, op0, mask)));
34853
34854 if (smask)
34855 *smask = mask;
34856
34857 return xa;
34858 }
34859
34860 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
34861 swapping the operands if SWAP_OPERANDS is true. The expanded
34862 code is a forward jump to a newly created label in case the
34863 comparison is true. The generated label rtx is returned. */
34864 static rtx
34865 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
34866 bool swap_operands)
34867 {
34868 rtx label, tmp;
34869
34870 if (swap_operands)
34871 {
34872 tmp = op0;
34873 op0 = op1;
34874 op1 = tmp;
34875 }
34876
34877 label = gen_label_rtx ();
34878 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
34879 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34880 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
34881 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
34882 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
34883 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
34884 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34885 JUMP_LABEL (tmp) = label;
34886
34887 return label;
34888 }
34889
34890 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
34891 using comparison code CODE. Operands are swapped for the comparison if
34892 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
34893 static rtx
34894 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
34895 bool swap_operands)
34896 {
34897 rtx (*insn)(rtx, rtx, rtx, rtx);
34898 enum machine_mode mode = GET_MODE (op0);
34899 rtx mask = gen_reg_rtx (mode);
34900
34901 if (swap_operands)
34902 {
34903 rtx tmp = op0;
34904 op0 = op1;
34905 op1 = tmp;
34906 }
34907
34908 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
34909
34910 emit_insn (insn (mask, op0, op1,
34911 gen_rtx_fmt_ee (code, mode, op0, op1)));
34912 return mask;
34913 }
34914
34915 /* Generate and return a rtx of mode MODE for 2**n where n is the number
34916 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
34917 static rtx
34918 ix86_gen_TWO52 (enum machine_mode mode)
34919 {
34920 REAL_VALUE_TYPE TWO52r;
34921 rtx TWO52;
34922
34923 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
34924 TWO52 = const_double_from_real_value (TWO52r, mode);
34925 TWO52 = force_reg (mode, TWO52);
34926
34927 return TWO52;
34928 }
34929
34930 /* Expand SSE sequence for computing lround from OP1 storing
34931 into OP0. */
34932 void
34933 ix86_expand_lround (rtx op0, rtx op1)
34934 {
34935 /* C code for the stuff we're doing below:
34936 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
34937 return (long)tmp;
34938 */
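/* nextafter (0.5, 0.0) rather than 0.5 avoids an off-by-one near a
half: adding 0.5 to the largest double below 0.5 would round up to
1.0 and truncate to 1, while adding the slightly smaller constant
keeps the sum below 1.0 so the truncation yields 0. */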
34939 enum machine_mode mode = GET_MODE (op1);
34940 const struct real_format *fmt;
34941 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34942 rtx adj;
34943
34944 /* load nextafter (0.5, 0.0) */
34945 fmt = REAL_MODE_FORMAT (mode);
34946 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34947 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34948
34949 /* adj = copysign (0.5, op1) */
34950 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
34951 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
34952
34953 /* adj = op1 + adj */
34954 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
34955
34956 /* op0 = (imode)adj */
34957 expand_fix (op0, adj, 0);
34958 }
34959
34960 /* Expand SSE2 sequence for computing lround from OPERAND1 storing
34961 into OPERAND0. */
34962 void
34963 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
34964 {
34965 /* C code for the stuff we're doing below (for do_floor):
34966 xi = (long)op1;
34967 xi -= (double)xi > op1 ? 1 : 0;
34968 return xi;
34969 */
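/* For example, floor (-2.3): the conversion truncates toward zero to
-2, the comparison (double) -2 > -2.3 holds, so 1 is subtracted and
the result is -3; for positive inputs the truncation already equals
the floor and the adjustment is skipped. */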
34970 enum machine_mode fmode = GET_MODE (op1);
34971 enum machine_mode imode = GET_MODE (op0);
34972 rtx ireg, freg, label, tmp;
34973
34974 /* reg = (long)op1 */
34975 ireg = gen_reg_rtx (imode);
34976 expand_fix (ireg, op1, 0);
34977
34978 /* freg = (double)reg */
34979 freg = gen_reg_rtx (fmode);
34980 expand_float (freg, ireg, 0);
34981
34982 /* ireg = (freg > op1) ? ireg - 1 : ireg */
34983 label = ix86_expand_sse_compare_and_jump (UNLE,
34984 freg, op1, !do_floor);
34985 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
34986 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
34987 emit_move_insn (ireg, tmp);
34988
34989 emit_label (label);
34990 LABEL_NUSES (label) = 1;
34991
34992 emit_move_insn (op0, ireg);
34993 }
34994
34995 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
34996 result in OPERAND0. */
34997 void
34998 ix86_expand_rint (rtx operand0, rtx operand1)
34999 {
35000 /* C code for the stuff we're doing below:
35001 xa = fabs (operand1);
35002 if (!isless (xa, 2**52))
35003 return operand1;
35004 xa = xa + 2**52 - 2**52;
35005 return copysign (xa, operand1);
35006 */
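/* Adding and then subtracting 2**52 (2**23 for SFmode) rounds to an
integer because the mantissa has that many bits: once the sum reaches
2**52 the unit in the last place is 1.0, so the addition rounds xa in
the current rounding mode and the subtraction recovers the rounded
value. Inputs already >= 2**52 are integral and are returned
unchanged via the early exit above. */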
35007 enum machine_mode mode = GET_MODE (operand0);
35008 rtx res, xa, label, TWO52, mask;
35009
35010 res = gen_reg_rtx (mode);
35011 emit_move_insn (res, operand1);
35012
35013 /* xa = abs (operand1) */
35014 xa = ix86_expand_sse_fabs (res, &mask);
35015
35016 /* if (!isless (xa, TWO52)) goto label; */
35017 TWO52 = ix86_gen_TWO52 (mode);
35018 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35019
35020 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35021 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35022
35023 ix86_sse_copysign_to_positive (res, xa, res, mask);
35024
35025 emit_label (label);
35026 LABEL_NUSES (label) = 1;
35027
35028 emit_move_insn (operand0, res);
35029 }
35030
35031 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35032 into OPERAND0. */
35033 void
35034 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
35035 {
35036 /* C code for the stuff we expand below.
35037 double xa = fabs (x), x2;
35038 if (!isless (xa, TWO52))
35039 return x;
35040 xa = xa + TWO52 - TWO52;
35041 x2 = copysign (xa, x);
35042 Compensate. Floor:
35043 if (x2 > x)
35044 x2 -= 1;
35045 Compensate. Ceil:
35046 if (x2 < x)
35047 x2 -= -1;
35048 return x2;
35049 */
35050 enum machine_mode mode = GET_MODE (operand0);
35051 rtx xa, TWO52, tmp, label, one, res, mask;
35052
35053 TWO52 = ix86_gen_TWO52 (mode);
35054
35055 /* Temporary for holding the result, initialized to the input
35056 operand to ease control flow. */
35057 res = gen_reg_rtx (mode);
35058 emit_move_insn (res, operand1);
35059
35060 /* xa = abs (operand1) */
35061 xa = ix86_expand_sse_fabs (res, &mask);
35062
35063 /* if (!isless (xa, TWO52)) goto label; */
35064 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35065
35066 /* xa = xa + TWO52 - TWO52; */
35067 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35068 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35069
35070 /* xa = copysign (xa, operand1) */
35071 ix86_sse_copysign_to_positive (xa, xa, res, mask);
35072
35073 /* generate 1.0 or -1.0 */
35074 one = force_reg (mode,
35075 const_double_from_real_value (do_floor
35076 ? dconst1 : dconstm1, mode));
35077
35078 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35079 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35080 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35081 gen_rtx_AND (mode, one, tmp)));
35082 /* We always need to subtract here to preserve signed zero. */
35083 tmp = expand_simple_binop (mode, MINUS,
35084 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35085 emit_move_insn (res, tmp);
35086
35087 emit_label (label);
35088 LABEL_NUSES (label) = 1;
35089
35090 emit_move_insn (operand0, res);
35091 }
35092
35093 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35094 into OPERAND0. */
35095 void
35096 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
35097 {
35098 /* C code for the stuff we expand below.
35099 double xa = fabs (x), x2;
35100 if (!isless (xa, TWO52))
35101 return x;
35102 x2 = (double)(long)x;
35103 Compensate. Floor:
35104 if (x2 > x)
35105 x2 -= 1;
35106 Compensate. Ceil:
35107 if (x2 < x)
35108 x2 += 1;
35109 if (HONOR_SIGNED_ZEROS (mode))
35110 return copysign (x2, x);
35111 return x2;
35112 */
35113 enum machine_mode mode = GET_MODE (operand0);
35114 rtx xa, xi, TWO52, tmp, label, one, res, mask;
35115
35116 TWO52 = ix86_gen_TWO52 (mode);
35117
35118 /* Temporary for holding the result, initialized to the input
35119 operand to ease control flow. */
35120 res = gen_reg_rtx (mode);
35121 emit_move_insn (res, operand1);
35122
35123 /* xa = abs (operand1) */
35124 xa = ix86_expand_sse_fabs (res, &mask);
35125
35126 /* if (!isless (xa, TWO52)) goto label; */
35127 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35128
35129 /* xa = (double)(long)x */
35130 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35131 expand_fix (xi, res, 0);
35132 expand_float (xa, xi, 0);
35133
35134 /* generate 1.0 */
35135 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35136
35137 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35138 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35139 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35140 gen_rtx_AND (mode, one, tmp)));
35141 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
35142 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35143 emit_move_insn (res, tmp);
35144
35145 if (HONOR_SIGNED_ZEROS (mode))
35146 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35147
35148 emit_label (label);
35149 LABEL_NUSES (label) = 1;
35150
35151 emit_move_insn (operand0, res);
35152 }
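/* Worked example of the compensation above (floor, DFmode, default
   rounding): for op1 = -2.7 the truncation gives ireg = -2, and since
   (double) -2 > -2.7 one is subtracted, giving -3; for op1 = 2.7 the
   truncated value 2 already is the floor and no adjustment is made.  */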
35153
35154 /* Expand SSE sequence for computing round from OPERAND1 storing
35155 into OPERAND0. Sequence that works without relying on DImode truncation
35156 via cvttsd2siq that is only available on 64bit targets. */
35157 void
35158 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
35159 {
35160 /* C code for the stuff we expand below.
35161 double xa = fabs (x), xa2, x2;
35162 if (!isless (xa, TWO52))
35163 return x;
35164 Using the absolute value and copying back sign makes
35165 -0.0 -> -0.0 correct.
35166 xa2 = xa + TWO52 - TWO52;
35167 Compensate.
35168 dxa = xa2 - xa;
35169 if (dxa <= -0.5)
35170 xa2 += 1;
35171 else if (dxa > 0.5)
35172 xa2 -= 1;
35173 x2 = copysign (xa2, x);
35174 return x2;
35175 */
35176 enum machine_mode mode = GET_MODE (operand0);
35177 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
35178
35179 TWO52 = ix86_gen_TWO52 (mode);
35180
35181 /* Temporary for holding the result, initialized to the input
35182 operand to ease control flow. */
35183 res = gen_reg_rtx (mode);
35184 emit_move_insn (res, operand1);
35185
35186 /* xa = abs (operand1) */
35187 xa = ix86_expand_sse_fabs (res, &mask);
35188
35189 /* if (!isless (xa, TWO52)) goto label; */
35190 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35191
35192 /* xa2 = xa + TWO52 - TWO52; */
35193 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35194 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
35195
35196 /* dxa = xa2 - xa; */
35197 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
35198
35199 /* generate 0.5, 1.0 and -0.5 */
35200 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
35201 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
35202 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
35203 0, OPTAB_DIRECT);
35204
35205 /* Compensate. */
35206 tmp = gen_reg_rtx (mode);
35207 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
35208 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
35209 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35210 gen_rtx_AND (mode, one, tmp)));
35211 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35212 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
35213 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
35214 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35215 gen_rtx_AND (mode, one, tmp)));
35216 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35217
35218 /* res = copysign (xa2, operand1) */
35219 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
35220
35221 emit_label (label);
35222 LABEL_NUSES (label) = 1;
35223
35224 emit_move_insn (operand0, res);
35225 }
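/* Worked example of the compensation above (DFmode, round-to-nearest-even):
   for operand1 = 2.5, xa + TWO52 - TWO52 gives xa2 = 2.0 because the tie
   rounds to even, so dxa = -0.5 and the "dxa <= -0.5" adjustment yields
   3.0, which is round (2.5).  The "dxa > 0.5" branch can only trigger when
   the addition rounded up by more than half, e.g. under a round-upward
   rounding mode, where 3.2 would first become 4.0 and then be corrected
   back to 3.0.  */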
35226
35227 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35228 into OPERAND0. */
35229 void
35230 ix86_expand_trunc (rtx operand0, rtx operand1)
35231 {
35232 /* C code for SSE variant we expand below.
35233 double xa = fabs (x), x2;
35234 if (!isless (xa, TWO52))
35235 return x;
35236 x2 = (double)(long)x;
35237 if (HONOR_SIGNED_ZEROS (mode))
35238 return copysign (x2, x);
35239 return x2;
35240 */
35241 enum machine_mode mode = GET_MODE (operand0);
35242 rtx xa, xi, TWO52, label, res, mask;
35243
35244 TWO52 = ix86_gen_TWO52 (mode);
35245
35246 /* Temporary for holding the result, initialized to the input
35247 operand to ease control flow. */
35248 res = gen_reg_rtx (mode);
35249 emit_move_insn (res, operand1);
35250
35251 /* xa = abs (operand1) */
35252 xa = ix86_expand_sse_fabs (res, &mask);
35253
35254 /* if (!isless (xa, TWO52)) goto label; */
35255 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35256
35257 /* x = (double)(long)x */
35258 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35259 expand_fix (xi, res, 0);
35260 expand_float (res, xi, 0);
35261
35262 if (HONOR_SIGNED_ZEROS (mode))
35263 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35264
35265 emit_label (label);
35266 LABEL_NUSES (label) = 1;
35267
35268 emit_move_insn (operand0, res);
35269 }
35270
 35271 /* Expand SSE sequence for computing trunc from OPERAND1 storing into
 35272 OPERAND0, without relying on DImode truncation via cvttsd2siq that is only available on 64bit targets. */
35273 void
35274 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
35275 {
35276 enum machine_mode mode = GET_MODE (operand0);
35277 rtx xa, mask, TWO52, label, one, res, smask, tmp;
35278
35279 /* C code for SSE variant we expand below.
 35280 double xa = fabs (x), xa2, x2;
35281 if (!isless (xa, TWO52))
35282 return x;
35283 xa2 = xa + TWO52 - TWO52;
35284 Compensate:
35285 if (xa2 > xa)
35286 xa2 -= 1.0;
35287 x2 = copysign (xa2, x);
35288 return x2;
35289 */
35290
35291 TWO52 = ix86_gen_TWO52 (mode);
35292
35293 /* Temporary for holding the result, initialized to the input
35294 operand to ease control flow. */
35295 res = gen_reg_rtx (mode);
35296 emit_move_insn (res, operand1);
35297
35298 /* xa = abs (operand1) */
35299 xa = ix86_expand_sse_fabs (res, &smask);
35300
35301 /* if (!isless (xa, TWO52)) goto label; */
35302 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35303
35304 /* res = xa + TWO52 - TWO52; */
35305 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35306 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
35307 emit_move_insn (res, tmp);
35308
35309 /* generate 1.0 */
35310 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35311
35312 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
35313 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
35314 emit_insn (gen_rtx_SET (VOIDmode, mask,
35315 gen_rtx_AND (mode, mask, one)));
35316 tmp = expand_simple_binop (mode, MINUS,
35317 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
35318 emit_move_insn (res, tmp);
35319
35320 /* res = copysign (res, operand1) */
35321 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
35322
35323 emit_label (label);
35324 LABEL_NUSES (label) = 1;
35325
35326 emit_move_insn (operand0, res);
35327 }
35328
35329 /* Expand SSE sequence for computing round from OPERAND1 storing
35330 into OPERAND0. */
35331 void
35332 ix86_expand_round (rtx operand0, rtx operand1)
35333 {
35334 /* C code for the stuff we're doing below:
35335 double xa = fabs (x);
35336 if (!isless (xa, TWO52))
35337 return x;
35338 xa = (double)(long)(xa + nextafter (0.5, 0.0));
35339 return copysign (xa, x);
35340 */
35341 enum machine_mode mode = GET_MODE (operand0);
35342 rtx res, TWO52, xa, label, xi, half, mask;
35343 const struct real_format *fmt;
35344 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35345
35346 /* Temporary for holding the result, initialized to the input
35347 operand to ease control flow. */
35348 res = gen_reg_rtx (mode);
35349 emit_move_insn (res, operand1);
35350
35351 TWO52 = ix86_gen_TWO52 (mode);
35352 xa = ix86_expand_sse_fabs (res, &mask);
35353 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35354
35355 /* load nextafter (0.5, 0.0) */
35356 fmt = REAL_MODE_FORMAT (mode);
35357 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35358 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35359
35360 /* xa = xa + 0.5 */
35361 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
35362 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
35363
35364 /* xa = (double)(int64_t)xa */
35365 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35366 expand_fix (xi, xa, 0);
35367 expand_float (xa, xi, 0);
35368
35369 /* res = copysign (xa, operand1) */
35370 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
35371
35372 emit_label (label);
35373 LABEL_NUSES (label) = 1;
35374
35375 emit_move_insn (operand0, res);
35376 }
35377
35378 /* Expand SSE sequence for computing round
35379 from OP1 storing into OP0 using sse4 round insn. */
35380 void
35381 ix86_expand_round_sse4 (rtx op0, rtx op1)
35382 {
35383 enum machine_mode mode = GET_MODE (op0);
35384 rtx e1, e2, res, half;
35385 const struct real_format *fmt;
35386 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35387 rtx (*gen_copysign) (rtx, rtx, rtx);
35388 rtx (*gen_round) (rtx, rtx, rtx);
35389
35390 switch (mode)
35391 {
35392 case SFmode:
35393 gen_copysign = gen_copysignsf3;
35394 gen_round = gen_sse4_1_roundsf2;
35395 break;
35396 case DFmode:
35397 gen_copysign = gen_copysigndf3;
35398 gen_round = gen_sse4_1_rounddf2;
35399 break;
35400 default:
35401 gcc_unreachable ();
35402 }
35403
35404 /* round (a) = trunc (a + copysign (0.5, a)) */
35405
35406 /* load nextafter (0.5, 0.0) */
35407 fmt = REAL_MODE_FORMAT (mode);
35408 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35409 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35410 half = const_double_from_real_value (pred_half, mode);
35411
35412 /* e1 = copysign (0.5, op1) */
35413 e1 = gen_reg_rtx (mode);
35414 emit_insn (gen_copysign (e1, half, op1));
35415
35416 /* e2 = op1 + e1 */
35417 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
35418
35419 /* res = trunc (e2) */
35420 res = gen_reg_rtx (mode);
35421 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
35422
35423 emit_move_insn (op0, res);
35424 }
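/* Worked example (DFmode): for op1 = -2.5 the copysign step produces
   -nextafter (0.5, 0.0), e2 rounds to -3.0, and the ROUND_TRUNC step
   leaves -3.0, which is round (-2.5).  As in ix86_expand_lround above,
   using nextafter (0.5, 0.0) rather than 0.5 keeps values just below 0.5
   in magnitude from being rounded away from zero.  */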
35425 \f
35426
35427 /* Table of valid machine attributes. */
35428 static const struct attribute_spec ix86_attribute_table[] =
35429 {
35430 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
35431 affects_type_identity } */
35432 /* Stdcall attribute says callee is responsible for popping arguments
35433 if they are not variable. */
35434 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35435 true },
35436 /* Fastcall attribute says callee is responsible for popping arguments
35437 if they are not variable. */
35438 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35439 true },
35440 /* Thiscall attribute says callee is responsible for popping arguments
35441 if they are not variable. */
35442 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35443 true },
 35444 /* Cdecl attribute says the callee is a normal C declaration. */
35445 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35446 true },
35447 /* Regparm attribute specifies how many integer arguments are to be
35448 passed in registers. */
35449 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
35450 true },
35451 /* Sseregparm attribute says we are using x86_64 calling conventions
35452 for FP arguments. */
35453 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35454 true },
35455 /* The transactional memory builtins are implicitly regparm or fastcall
35456 depending on the ABI. Override the generic do-nothing attribute that
35457 these builtins were declared with. */
35458 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
35459 true },
35460 /* force_align_arg_pointer says this function realigns the stack at entry. */
35461 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
35462 false, true, true, ix86_handle_cconv_attribute, false },
35463 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
35464 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
35465 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
35466 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
35467 false },
35468 #endif
35469 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35470 false },
35471 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35472 false },
35473 #ifdef SUBTARGET_ATTRIBUTE_TABLE
35474 SUBTARGET_ATTRIBUTE_TABLE,
35475 #endif
35476 /* ms_abi and sysv_abi calling convention function attributes. */
35477 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35478 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35479 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
35480 false },
35481 { "callee_pop_aggregate_return", 1, 1, false, true, true,
35482 ix86_handle_callee_pop_aggregate_return, true },
35483 /* End element. */
35484 { NULL, 0, 0, false, false, false, NULL, false }
35485 };
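/* For reference, these attributes appear in user code like so
   (illustrative declarations only):

     int __attribute__ ((fastcall)) f (int a, int b);
     int __attribute__ ((regparm (3))) g (int a, int b, int c);
     void __attribute__ ((ms_abi)) h (void);
*/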
35486
35487 /* Implement targetm.vectorize.builtin_vectorization_cost. */
35488 static int
35489 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
35490 tree vectype ATTRIBUTE_UNUSED,
35491 int misalign ATTRIBUTE_UNUSED)
35492 {
35493 switch (type_of_cost)
35494 {
35495 case scalar_stmt:
35496 return ix86_cost->scalar_stmt_cost;
35497
35498 case scalar_load:
35499 return ix86_cost->scalar_load_cost;
35500
35501 case scalar_store:
35502 return ix86_cost->scalar_store_cost;
35503
35504 case vector_stmt:
35505 return ix86_cost->vec_stmt_cost;
35506
35507 case vector_load:
35508 return ix86_cost->vec_align_load_cost;
35509
35510 case vector_store:
35511 return ix86_cost->vec_store_cost;
35512
35513 case vec_to_scalar:
35514 return ix86_cost->vec_to_scalar_cost;
35515
35516 case scalar_to_vec:
35517 return ix86_cost->scalar_to_vec_cost;
35518
35519 case unaligned_load:
35520 case unaligned_store:
35521 return ix86_cost->vec_unalign_load_cost;
35522
35523 case cond_branch_taken:
35524 return ix86_cost->cond_taken_branch_cost;
35525
35526 case cond_branch_not_taken:
35527 return ix86_cost->cond_not_taken_branch_cost;
35528
35529 case vec_perm:
35530 case vec_promote_demote:
35531 return ix86_cost->vec_stmt_cost;
35532
35533 default:
35534 gcc_unreachable ();
35535 }
35536 }
35537
35538 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
35539 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
35540 insn every time. */
35541
35542 static GTY(()) rtx vselect_insn;
35543
35544 /* Initialize vselect_insn. */
35545
35546 static void
35547 init_vselect_insn (void)
35548 {
35549 unsigned i;
35550 rtx x;
35551
35552 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
35553 for (i = 0; i < MAX_VECT_LEN; ++i)
35554 XVECEXP (x, 0, i) = const0_rtx;
35555 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
35556 const0_rtx), x);
35557 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
35558 start_sequence ();
35559 vselect_insn = emit_insn (x);
35560 end_sequence ();
35561 }
35562
35563 /* Construct (set target (vec_select op0 (parallel perm))) and
35564 return true if that's a valid instruction in the active ISA. */
35565
35566 static bool
35567 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
35568 unsigned nelt, bool testing_p)
35569 {
35570 unsigned int i;
35571 rtx x, save_vconcat;
35572 int icode;
35573
35574 if (vselect_insn == NULL_RTX)
35575 init_vselect_insn ();
35576
35577 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
35578 PUT_NUM_ELEM (XVEC (x, 0), nelt);
35579 for (i = 0; i < nelt; ++i)
35580 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
35581 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
35582 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
35583 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
35584 SET_DEST (PATTERN (vselect_insn)) = target;
35585 icode = recog_memoized (vselect_insn);
35586
35587 if (icode >= 0 && !testing_p)
35588 emit_insn (copy_rtx (PATTERN (vselect_insn)));
35589
35590 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
35591 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
35592 INSN_CODE (vselect_insn) = -1;
35593
35594 return icode >= 0;
35595 }
35596
35597 /* Similar, but generate a vec_concat from op0 and op1 as well. */
35598
35599 static bool
35600 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
35601 const unsigned char *perm, unsigned nelt,
35602 bool testing_p)
35603 {
35604 enum machine_mode v2mode;
35605 rtx x;
35606 bool ok;
35607
35608 if (vselect_insn == NULL_RTX)
35609 init_vselect_insn ();
35610
35611 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
35612 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
35613 PUT_MODE (x, v2mode);
35614 XEXP (x, 0) = op0;
35615 XEXP (x, 1) = op1;
35616 ok = expand_vselect (target, x, perm, nelt, testing_p);
35617 XEXP (x, 0) = const0_rtx;
35618 XEXP (x, 1) = const0_rtx;
35619 return ok;
35620 }
35621
35622 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35623 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
35624
35625 static bool
35626 expand_vec_perm_blend (struct expand_vec_perm_d *d)
35627 {
35628 enum machine_mode vmode = d->vmode;
35629 unsigned i, mask, nelt = d->nelt;
35630 rtx target, op0, op1, x;
35631 rtx rperm[32], vperm;
35632
35633 if (d->op0 == d->op1)
35634 return false;
35635 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
35636 ;
35637 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
35638 ;
35639 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
35640 ;
35641 else
35642 return false;
35643
35644 /* This is a blend, not a permute. Elements must stay in their
35645 respective lanes. */
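/* For example, for V8SFmode the permutation { 0, 9, 2, 11, 4, 13, 6, 15 }
   qualifies: element i is taken from either op0[i] or op1[i], and the
   immediate mask computed below is 0xaa (bit i set when element i comes
   from op1).  */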
35646 for (i = 0; i < nelt; ++i)
35647 {
35648 unsigned e = d->perm[i];
35649 if (!(e == i || e == i + nelt))
35650 return false;
35651 }
35652
35653 if (d->testing_p)
35654 return true;
35655
35656 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
35657 decision should be extracted elsewhere, so that we only try that
35658 sequence once all budget==3 options have been tried. */
35659 target = d->target;
35660 op0 = d->op0;
35661 op1 = d->op1;
35662 mask = 0;
35663
35664 switch (vmode)
35665 {
35666 case V4DFmode:
35667 case V8SFmode:
35668 case V2DFmode:
35669 case V4SFmode:
35670 case V8HImode:
35671 case V8SImode:
35672 for (i = 0; i < nelt; ++i)
35673 mask |= (d->perm[i] >= nelt) << i;
35674 break;
35675
35676 case V2DImode:
35677 for (i = 0; i < 2; ++i)
35678 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
35679 vmode = V8HImode;
35680 goto do_subreg;
35681
35682 case V4SImode:
35683 for (i = 0; i < 4; ++i)
35684 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35685 vmode = V8HImode;
35686 goto do_subreg;
35687
35688 case V16QImode:
35689 /* See if bytes move in pairs so we can use pblendw with
35690 an immediate argument, rather than pblendvb with a vector
35691 argument. */
35692 for (i = 0; i < 16; i += 2)
35693 if (d->perm[i] + 1 != d->perm[i + 1])
35694 {
35695 use_pblendvb:
35696 for (i = 0; i < nelt; ++i)
35697 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
35698
35699 finish_pblendvb:
35700 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
35701 vperm = force_reg (vmode, vperm);
35702
35703 if (GET_MODE_SIZE (vmode) == 16)
35704 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
35705 else
35706 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
35707 return true;
35708 }
35709
35710 for (i = 0; i < 8; ++i)
35711 mask |= (d->perm[i * 2] >= 16) << i;
35712 vmode = V8HImode;
35713 /* FALLTHRU */
35714
35715 do_subreg:
35716 target = gen_lowpart (vmode, target);
35717 op0 = gen_lowpart (vmode, op0);
35718 op1 = gen_lowpart (vmode, op1);
35719 break;
35720
35721 case V32QImode:
35722 /* See if bytes move in pairs. If not, vpblendvb must be used. */
35723 for (i = 0; i < 32; i += 2)
35724 if (d->perm[i] + 1 != d->perm[i + 1])
35725 goto use_pblendvb;
35726 /* See if bytes move in quadruplets. If yes, vpblendd
35727 with immediate can be used. */
35728 for (i = 0; i < 32; i += 4)
35729 if (d->perm[i] + 2 != d->perm[i + 2])
35730 break;
35731 if (i < 32)
35732 {
35733 /* See if bytes move the same in both lanes. If yes,
35734 vpblendw with immediate can be used. */
35735 for (i = 0; i < 16; i += 2)
35736 if (d->perm[i] + 16 != d->perm[i + 16])
35737 goto use_pblendvb;
35738
35739 /* Use vpblendw. */
35740 for (i = 0; i < 16; ++i)
35741 mask |= (d->perm[i * 2] >= 32) << i;
35742 vmode = V16HImode;
35743 goto do_subreg;
35744 }
35745
35746 /* Use vpblendd. */
35747 for (i = 0; i < 8; ++i)
35748 mask |= (d->perm[i * 4] >= 32) << i;
35749 vmode = V8SImode;
35750 goto do_subreg;
35751
35752 case V16HImode:
35753 /* See if words move in pairs. If yes, vpblendd can be used. */
35754 for (i = 0; i < 16; i += 2)
35755 if (d->perm[i] + 1 != d->perm[i + 1])
35756 break;
35757 if (i < 16)
35758 {
35759 /* See if words move the same in both lanes. If not,
35760 vpblendvb must be used. */
35761 for (i = 0; i < 8; i++)
35762 if (d->perm[i] + 8 != d->perm[i + 8])
35763 {
35764 /* Use vpblendvb. */
35765 for (i = 0; i < 32; ++i)
35766 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
35767
35768 vmode = V32QImode;
35769 nelt = 32;
35770 target = gen_lowpart (vmode, target);
35771 op0 = gen_lowpart (vmode, op0);
35772 op1 = gen_lowpart (vmode, op1);
35773 goto finish_pblendvb;
35774 }
35775
35776 /* Use vpblendw. */
35777 for (i = 0; i < 16; ++i)
35778 mask |= (d->perm[i] >= 16) << i;
35779 break;
35780 }
35781
35782 /* Use vpblendd. */
35783 for (i = 0; i < 8; ++i)
35784 mask |= (d->perm[i * 2] >= 16) << i;
35785 vmode = V8SImode;
35786 goto do_subreg;
35787
35788 case V4DImode:
35789 /* Use vpblendd. */
35790 for (i = 0; i < 4; ++i)
35791 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35792 vmode = V8SImode;
35793 goto do_subreg;
35794
35795 default:
35796 gcc_unreachable ();
35797 }
35798
35799 /* This matches five different patterns with the different modes. */
35800 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
35801 x = gen_rtx_SET (VOIDmode, target, x);
35802 emit_insn (x);
35803
35804 return true;
35805 }
35806
35807 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35808 in terms of the variable form of vpermilps.
35809
35810 Note that we will have already failed the immediate input vpermilps,
35811 which requires that the high and low part shuffle be identical; the
35812 variable form doesn't require that. */
35813
35814 static bool
35815 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
35816 {
35817 rtx rperm[8], vperm;
35818 unsigned i;
35819
35820 if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
35821 return false;
35822
35823 /* We can only permute within the 128-bit lane. */
35824 for (i = 0; i < 8; ++i)
35825 {
35826 unsigned e = d->perm[i];
35827 if (i < 4 ? e >= 4 : e < 4)
35828 return false;
35829 }
35830
35831 if (d->testing_p)
35832 return true;
35833
35834 for (i = 0; i < 8; ++i)
35835 {
35836 unsigned e = d->perm[i];
35837
35838 /* Within each 128-bit lane, the elements of op0 are numbered
35839 from 0 and the elements of op1 are numbered from 4. */
35840 if (e >= 8 + 4)
35841 e -= 8;
35842 else if (e >= 4)
35843 e -= 4;
35844
35845 rperm[i] = GEN_INT (e);
35846 }
35847
35848 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
35849 vperm = force_reg (V8SImode, vperm);
35850 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
35851
35852 return true;
35853 }
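/* Example: the single-operand V8SFmode permutation { 1, 0, 3, 2, 6, 7, 4, 5 }
   is handled here with the control vector { 1, 0, 3, 2, 2, 3, 0, 1 }, since
   vpermilps indexes each 128-bit lane independently with values 0 to 3.  */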
35854
35855 /* Return true if permutation D can be performed as VMODE permutation
35856 instead. */
35857
35858 static bool
35859 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
35860 {
35861 unsigned int i, j, chunk;
35862
35863 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
35864 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
35865 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
35866 return false;
35867
35868 if (GET_MODE_NUNITS (vmode) >= d->nelt)
35869 return true;
35870
35871 chunk = d->nelt / GET_MODE_NUNITS (vmode);
35872 for (i = 0; i < d->nelt; i += chunk)
35873 if (d->perm[i] & (chunk - 1))
35874 return false;
35875 else
35876 for (j = 1; j < chunk; ++j)
35877 if (d->perm[i] + j != d->perm[i + j])
35878 return false;
35879
35880 return true;
35881 }
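/* Example: the V16QImode permutation { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14,
   15, 8, 9, 10, 11 } is valid as a V4SImode permutation (chunk = 4, every
   chunk starts at a multiple of 4 and is consecutive), namely { 1, 0, 3, 2 }.  */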
35882
35883 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35884 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
35885
35886 static bool
35887 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
35888 {
35889 unsigned i, nelt, eltsz, mask;
35890 unsigned char perm[32];
35891 enum machine_mode vmode = V16QImode;
35892 rtx rperm[32], vperm, target, op0, op1;
35893
35894 nelt = d->nelt;
35895
35896 if (d->op0 != d->op1)
35897 {
35898 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
35899 {
35900 if (TARGET_AVX2
35901 && valid_perm_using_mode_p (V2TImode, d))
35902 {
35903 if (d->testing_p)
35904 return true;
35905
35906 /* Use vperm2i128 insn. The pattern uses
35907 V4DImode instead of V2TImode. */
35908 target = gen_lowpart (V4DImode, d->target);
35909 op0 = gen_lowpart (V4DImode, d->op0);
35910 op1 = gen_lowpart (V4DImode, d->op1);
35911 rperm[0]
 35912 = GEN_INT ((d->perm[0] / (nelt / 2))
 35913 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
35914 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
35915 return true;
35916 }
35917 return false;
35918 }
35919 }
35920 else
35921 {
35922 if (GET_MODE_SIZE (d->vmode) == 16)
35923 {
35924 if (!TARGET_SSSE3)
35925 return false;
35926 }
35927 else if (GET_MODE_SIZE (d->vmode) == 32)
35928 {
35929 if (!TARGET_AVX2)
35930 return false;
35931
35932 /* V4DImode should be already handled through
35933 expand_vselect by vpermq instruction. */
35934 gcc_assert (d->vmode != V4DImode);
35935
35936 vmode = V32QImode;
35937 if (d->vmode == V8SImode
35938 || d->vmode == V16HImode
35939 || d->vmode == V32QImode)
35940 {
35941 /* First see if vpermq can be used for
35942 V8SImode/V16HImode/V32QImode. */
35943 if (valid_perm_using_mode_p (V4DImode, d))
35944 {
35945 for (i = 0; i < 4; i++)
35946 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
35947 if (d->testing_p)
35948 return true;
35949 return expand_vselect (gen_lowpart (V4DImode, d->target),
35950 gen_lowpart (V4DImode, d->op0),
35951 perm, 4, false);
35952 }
35953
35954 /* Next see if vpermd can be used. */
35955 if (valid_perm_using_mode_p (V8SImode, d))
35956 vmode = V8SImode;
35957 }
35958 /* Or if vpermps can be used. */
35959 else if (d->vmode == V8SFmode)
35960 vmode = V8SImode;
35961
35962 if (vmode == V32QImode)
35963 {
 35964 /* vpshufb only works within 128-bit lanes; it is not
 35965 possible to shuffle bytes between the lanes. */
35966 for (i = 0; i < nelt; ++i)
35967 if ((d->perm[i] ^ i) & (nelt / 2))
35968 return false;
35969 }
35970 }
35971 else
35972 return false;
35973 }
35974
35975 if (d->testing_p)
35976 return true;
35977
35978 if (vmode == V8SImode)
35979 for (i = 0; i < 8; ++i)
35980 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
35981 else
35982 {
35983 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35984 if (d->op0 != d->op1)
35985 mask = 2 * nelt - 1;
35986 else if (vmode == V16QImode)
35987 mask = nelt - 1;
35988 else
35989 mask = nelt / 2 - 1;
35990
35991 for (i = 0; i < nelt; ++i)
35992 {
35993 unsigned j, e = d->perm[i] & mask;
35994 for (j = 0; j < eltsz; ++j)
35995 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
35996 }
35997 }
35998
35999 vperm = gen_rtx_CONST_VECTOR (vmode,
36000 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
36001 vperm = force_reg (vmode, vperm);
36002
36003 if (vmode == V8SImode && d->vmode == V8SFmode)
36004 {
36005 vmode = V8SFmode;
36006 vperm = gen_lowpart (vmode, vperm);
36007 }
36008
36009 target = gen_lowpart (vmode, d->target);
36010 op0 = gen_lowpart (vmode, d->op0);
36011 if (d->op0 == d->op1)
36012 {
36013 if (vmode == V16QImode)
36014 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
36015 else if (vmode == V32QImode)
36016 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
36017 else if (vmode == V8SFmode)
36018 emit_insn (gen_avx2_permvarv8sf (target, vperm, op0));
36019 else
36020 emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
36021 }
36022 else
36023 {
36024 op1 = gen_lowpart (vmode, d->op1);
36025 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
36026 }
36027
36028 return true;
36029 }
36030
36031 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
36032 in a single instruction. */
36033
36034 static bool
36035 expand_vec_perm_1 (struct expand_vec_perm_d *d)
36036 {
36037 unsigned i, nelt = d->nelt;
36038 unsigned char perm2[MAX_VECT_LEN];
36039
36040 /* Check plain VEC_SELECT first, because AVX has instructions that could
36041 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
36042 input where SEL+CONCAT may not. */
36043 if (d->op0 == d->op1)
36044 {
36045 int mask = nelt - 1;
36046 bool identity_perm = true;
36047 bool broadcast_perm = true;
36048
36049 for (i = 0; i < nelt; i++)
36050 {
36051 perm2[i] = d->perm[i] & mask;
36052 if (perm2[i] != i)
36053 identity_perm = false;
36054 if (perm2[i])
36055 broadcast_perm = false;
36056 }
36057
36058 if (identity_perm)
36059 {
36060 if (!d->testing_p)
36061 emit_move_insn (d->target, d->op0);
36062 return true;
36063 }
36064 else if (broadcast_perm && TARGET_AVX2)
36065 {
36066 /* Use vpbroadcast{b,w,d}. */
36067 rtx (*gen) (rtx, rtx) = NULL;
36068 switch (d->vmode)
36069 {
36070 case V32QImode:
36071 gen = gen_avx2_pbroadcastv32qi_1;
36072 break;
36073 case V16HImode:
36074 gen = gen_avx2_pbroadcastv16hi_1;
36075 break;
36076 case V8SImode:
36077 gen = gen_avx2_pbroadcastv8si_1;
36078 break;
36079 case V16QImode:
36080 gen = gen_avx2_pbroadcastv16qi;
36081 break;
36082 case V8HImode:
36083 gen = gen_avx2_pbroadcastv8hi;
36084 break;
36085 case V8SFmode:
36086 gen = gen_avx2_vec_dupv8sf_1;
36087 break;
36088 /* For other modes prefer other shuffles this function creates. */
36089 default: break;
36090 }
36091 if (gen != NULL)
36092 {
36093 if (!d->testing_p)
36094 emit_insn (gen (d->target, d->op0));
36095 return true;
36096 }
36097 }
36098
36099 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
36100 return true;
36101
36102 /* There are plenty of patterns in sse.md that are written for
36103 SEL+CONCAT and are not replicated for a single op. Perhaps
36104 that should be changed, to avoid the nastiness here. */
36105
36106 /* Recognize interleave style patterns, which means incrementing
36107 every other permutation operand. */
36108 for (i = 0; i < nelt; i += 2)
36109 {
36110 perm2[i] = d->perm[i] & mask;
36111 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
36112 }
36113 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
36114 d->testing_p))
36115 return true;
36116
36117 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
36118 if (nelt >= 4)
36119 {
36120 for (i = 0; i < nelt; i += 4)
36121 {
36122 perm2[i + 0] = d->perm[i + 0] & mask;
36123 perm2[i + 1] = d->perm[i + 1] & mask;
36124 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
36125 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
36126 }
36127
36128 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
36129 d->testing_p))
36130 return true;
36131 }
36132 }
36133
36134 /* Finally, try the fully general two operand permute. */
36135 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
36136 d->testing_p))
36137 return true;
36138
36139 /* Recognize interleave style patterns with reversed operands. */
36140 if (d->op0 != d->op1)
36141 {
36142 for (i = 0; i < nelt; ++i)
36143 {
36144 unsigned e = d->perm[i];
36145 if (e >= nelt)
36146 e -= nelt;
36147 else
36148 e += nelt;
36149 perm2[i] = e;
36150 }
36151
36152 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
36153 d->testing_p))
36154 return true;
36155 }
36156
36157 /* Try the SSE4.1 blend variable merge instructions. */
36158 if (expand_vec_perm_blend (d))
36159 return true;
36160
36161 /* Try one of the AVX vpermil variable permutations. */
36162 if (expand_vec_perm_vpermil (d))
36163 return true;
36164
36165 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
36166 vpshufb, vpermd, vpermps or vpermq variable permutation. */
36167 if (expand_vec_perm_pshufb (d))
36168 return true;
36169
36170 return false;
36171 }
36172
36173 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36174 in terms of a pair of pshuflw + pshufhw instructions. */
36175
36176 static bool
36177 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
36178 {
36179 unsigned char perm2[MAX_VECT_LEN];
36180 unsigned i;
36181 bool ok;
36182
36183 if (d->vmode != V8HImode || d->op0 != d->op1)
36184 return false;
36185
36186 /* The two permutations only operate in 64-bit lanes. */
36187 for (i = 0; i < 4; ++i)
36188 if (d->perm[i] >= 4)
36189 return false;
36190 for (i = 4; i < 8; ++i)
36191 if (d->perm[i] < 4)
36192 return false;
36193
36194 if (d->testing_p)
36195 return true;
36196
36197 /* Emit the pshuflw. */
36198 memcpy (perm2, d->perm, 4);
36199 for (i = 4; i < 8; ++i)
36200 perm2[i] = i;
36201 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
36202 gcc_assert (ok);
36203
36204 /* Emit the pshufhw. */
36205 memcpy (perm2 + 4, d->perm + 4, 4);
36206 for (i = 0; i < 4; ++i)
36207 perm2[i] = i;
36208 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
36209 gcc_assert (ok);
36210
36211 return true;
36212 }
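/* Example: the single-operand V8HImode permutation { 2, 1, 3, 0, 5, 7, 6, 4 }
   is expanded as pshuflw with { 2, 1, 3, 0, 4, 5, 6, 7 } followed by pshufhw
   with { 0, 1, 2, 3, 5, 7, 6, 4 } applied to the intermediate result.  */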
36213
36214 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36215 the permutation using the SSSE3 palignr instruction. This succeeds
36216 when all of the elements in PERM fit within one vector and we merely
36217 need to shift them down so that a single vector permutation has a
36218 chance to succeed. */
36219
36220 static bool
36221 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
36222 {
36223 unsigned i, nelt = d->nelt;
36224 unsigned min, max;
36225 bool in_order, ok;
36226 rtx shift;
36227
36228 /* Even with AVX, palignr only operates on 128-bit vectors. */
36229 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36230 return false;
36231
36232 min = nelt, max = 0;
36233 for (i = 0; i < nelt; ++i)
36234 {
36235 unsigned e = d->perm[i];
36236 if (e < min)
36237 min = e;
36238 if (e > max)
36239 max = e;
36240 }
36241 if (min == 0 || max - min >= nelt)
36242 return false;
36243
36244 /* Given that we have SSSE3, we know we'll be able to implement the
36245 single operand permutation after the palignr with pshufb. */
36246 if (d->testing_p)
36247 return true;
36248
36249 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
36250 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
36251 gen_lowpart (TImode, d->op1),
36252 gen_lowpart (TImode, d->op0), shift));
36253
36254 d->op0 = d->op1 = d->target;
36255
36256 in_order = true;
36257 for (i = 0; i < nelt; ++i)
36258 {
36259 unsigned e = d->perm[i] - min;
36260 if (e != i)
36261 in_order = false;
36262 d->perm[i] = e;
36263 }
36264
36265 /* Test for the degenerate case where the alignment by itself
36266 produces the desired permutation. */
36267 if (in_order)
36268 return true;
36269
36270 ok = expand_vec_perm_1 (d);
36271 gcc_assert (ok);
36272
36273 return ok;
36274 }
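/* Example (V16QImode): a two-operand permutation whose indices all lie in
   the range [3, 18] is handled by a palignr that shifts the concatenated
   operands down by 3 bytes; each d->perm[i] is then rewritten as
   d->perm[i] - 3 and the remaining single-operand permutation is handed
   back to expand_vec_perm_1.  */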
36275
36276 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
36277
36278 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36279 a two vector permutation into a single vector permutation by using
36280 an interleave operation to merge the vectors. */
36281
36282 static bool
36283 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
36284 {
36285 struct expand_vec_perm_d dremap, dfinal;
36286 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
36287 unsigned HOST_WIDE_INT contents;
36288 unsigned char remap[2 * MAX_VECT_LEN];
36289 rtx seq;
36290 bool ok, same_halves = false;
36291
36292 if (GET_MODE_SIZE (d->vmode) == 16)
36293 {
36294 if (d->op0 == d->op1)
36295 return false;
36296 }
36297 else if (GET_MODE_SIZE (d->vmode) == 32)
36298 {
36299 if (!TARGET_AVX)
36300 return false;
36301 /* For 32-byte modes allow even d->op0 == d->op1.
36302 The lack of cross-lane shuffling in some instructions
36303 might prevent a single insn shuffle. */
36304 dfinal = *d;
36305 dfinal.testing_p = true;
 36306 /* If expand_vec_perm_interleave3 can expand this into
 36307 a 3 insn sequence, give up and let it be expanded that
 36308 way instead. While that is one insn longer,
 36309 it doesn't need a memory operand, and in the common
 36310 case where the interleave low and high permutations
 36311 with the same operands are adjacent, only 4 insns
 36312 are needed for both after CSE. */
36313 if (expand_vec_perm_interleave3 (&dfinal))
36314 return false;
36315 }
36316 else
36317 return false;
36318
36319 /* Examine from whence the elements come. */
36320 contents = 0;
36321 for (i = 0; i < nelt; ++i)
36322 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
36323
36324 memset (remap, 0xff, sizeof (remap));
36325 dremap = *d;
36326
36327 if (GET_MODE_SIZE (d->vmode) == 16)
36328 {
36329 unsigned HOST_WIDE_INT h1, h2, h3, h4;
36330
36331 /* Split the two input vectors into 4 halves. */
36332 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
36333 h2 = h1 << nelt2;
36334 h3 = h2 << nelt2;
36335 h4 = h3 << nelt2;
36336
 36337 /* If all of the elements come from the low halves, use interleave low;
 36338 similarly use interleave high for the high halves. If the elements are
 36339 from mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
36340 if ((contents & (h1 | h3)) == contents)
36341 {
36342 /* punpckl* */
36343 for (i = 0; i < nelt2; ++i)
36344 {
36345 remap[i] = i * 2;
36346 remap[i + nelt] = i * 2 + 1;
36347 dremap.perm[i * 2] = i;
36348 dremap.perm[i * 2 + 1] = i + nelt;
36349 }
36350 if (!TARGET_SSE2 && d->vmode == V4SImode)
36351 dremap.vmode = V4SFmode;
36352 }
36353 else if ((contents & (h2 | h4)) == contents)
36354 {
36355 /* punpckh* */
36356 for (i = 0; i < nelt2; ++i)
36357 {
36358 remap[i + nelt2] = i * 2;
36359 remap[i + nelt + nelt2] = i * 2 + 1;
36360 dremap.perm[i * 2] = i + nelt2;
36361 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
36362 }
36363 if (!TARGET_SSE2 && d->vmode == V4SImode)
36364 dremap.vmode = V4SFmode;
36365 }
36366 else if ((contents & (h1 | h4)) == contents)
36367 {
36368 /* shufps */
36369 for (i = 0; i < nelt2; ++i)
36370 {
36371 remap[i] = i;
36372 remap[i + nelt + nelt2] = i + nelt2;
36373 dremap.perm[i] = i;
36374 dremap.perm[i + nelt2] = i + nelt + nelt2;
36375 }
36376 if (nelt != 4)
36377 {
36378 /* shufpd */
36379 dremap.vmode = V2DImode;
36380 dremap.nelt = 2;
36381 dremap.perm[0] = 0;
36382 dremap.perm[1] = 3;
36383 }
36384 }
36385 else if ((contents & (h2 | h3)) == contents)
36386 {
36387 /* shufps */
36388 for (i = 0; i < nelt2; ++i)
36389 {
36390 remap[i + nelt2] = i;
36391 remap[i + nelt] = i + nelt2;
36392 dremap.perm[i] = i + nelt2;
36393 dremap.perm[i + nelt2] = i + nelt;
36394 }
36395 if (nelt != 4)
36396 {
36397 /* shufpd */
36398 dremap.vmode = V2DImode;
36399 dremap.nelt = 2;
36400 dremap.perm[0] = 1;
36401 dremap.perm[1] = 2;
36402 }
36403 }
36404 else
36405 return false;
36406 }
36407 else
36408 {
36409 unsigned int nelt4 = nelt / 4, nzcnt = 0;
36410 unsigned HOST_WIDE_INT q[8];
36411 unsigned int nonzero_halves[4];
36412
36413 /* Split the two input vectors into 8 quarters. */
36414 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
36415 for (i = 1; i < 8; ++i)
36416 q[i] = q[0] << (nelt4 * i);
36417 for (i = 0; i < 4; ++i)
36418 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
36419 {
36420 nonzero_halves[nzcnt] = i;
36421 ++nzcnt;
36422 }
36423
36424 if (nzcnt == 1)
36425 {
36426 gcc_assert (d->op0 == d->op1);
36427 nonzero_halves[1] = nonzero_halves[0];
36428 same_halves = true;
36429 }
36430 else if (d->op0 == d->op1)
36431 {
36432 gcc_assert (nonzero_halves[0] == 0);
36433 gcc_assert (nonzero_halves[1] == 1);
36434 }
36435
36436 if (nzcnt <= 2)
36437 {
36438 if (d->perm[0] / nelt2 == nonzero_halves[1])
36439 {
 36440 /* Attempt to increase the likelihood that dfinal
36441 shuffle will be intra-lane. */
36442 char tmph = nonzero_halves[0];
36443 nonzero_halves[0] = nonzero_halves[1];
36444 nonzero_halves[1] = tmph;
36445 }
36446
36447 /* vperm2f128 or vperm2i128. */
36448 for (i = 0; i < nelt2; ++i)
36449 {
36450 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
36451 remap[i + nonzero_halves[0] * nelt2] = i;
36452 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
36453 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
36454 }
36455
36456 if (d->vmode != V8SFmode
36457 && d->vmode != V4DFmode
36458 && d->vmode != V8SImode)
36459 {
36460 dremap.vmode = V8SImode;
36461 dremap.nelt = 8;
36462 for (i = 0; i < 4; ++i)
36463 {
36464 dremap.perm[i] = i + nonzero_halves[0] * 4;
36465 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
36466 }
36467 }
36468 }
36469 else if (d->op0 == d->op1)
36470 return false;
36471 else if (TARGET_AVX2
36472 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
36473 {
36474 /* vpunpckl* */
36475 for (i = 0; i < nelt4; ++i)
36476 {
36477 remap[i] = i * 2;
36478 remap[i + nelt] = i * 2 + 1;
36479 remap[i + nelt2] = i * 2 + nelt2;
36480 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
36481 dremap.perm[i * 2] = i;
36482 dremap.perm[i * 2 + 1] = i + nelt;
36483 dremap.perm[i * 2 + nelt2] = i + nelt2;
36484 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
36485 }
36486 }
36487 else if (TARGET_AVX2
36488 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
36489 {
36490 /* vpunpckh* */
36491 for (i = 0; i < nelt4; ++i)
36492 {
36493 remap[i + nelt4] = i * 2;
36494 remap[i + nelt + nelt4] = i * 2 + 1;
36495 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
36496 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
36497 dremap.perm[i * 2] = i + nelt4;
36498 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
36499 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
36500 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
36501 }
36502 }
36503 else
36504 return false;
36505 }
36506
36507 /* Use the remapping array set up above to move the elements from their
36508 swizzled locations into their final destinations. */
36509 dfinal = *d;
36510 for (i = 0; i < nelt; ++i)
36511 {
36512 unsigned e = remap[d->perm[i]];
36513 gcc_assert (e < nelt);
36514 /* If same_halves is true, both halves of the remapped vector are the
36515 same. Avoid cross-lane accesses if possible. */
36516 if (same_halves && i >= nelt2)
36517 {
36518 gcc_assert (e < nelt2);
36519 dfinal.perm[i] = e + nelt2;
36520 }
36521 else
36522 dfinal.perm[i] = e;
36523 }
36524 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
36525 dfinal.op1 = dfinal.op0;
36526 dremap.target = dfinal.op0;
36527
36528 /* Test if the final remap can be done with a single insn. For V4SFmode or
36529 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
36530 start_sequence ();
36531 ok = expand_vec_perm_1 (&dfinal);
36532 seq = get_insns ();
36533 end_sequence ();
36534
36535 if (!ok)
36536 return false;
36537
36538 if (d->testing_p)
36539 return true;
36540
36541 if (dremap.vmode != dfinal.vmode)
36542 {
36543 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
36544 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
36545 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
36546 }
36547
36548 ok = expand_vec_perm_1 (&dremap);
36549 gcc_assert (ok);
36550
36551 emit_insn (seq);
36552 return true;
36553 }
36554
36555 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36556 a single vector cross-lane permutation into vpermq followed
36557 by any of the single insn permutations. */
36558
36559 static bool
36560 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
36561 {
36562 struct expand_vec_perm_d dremap, dfinal;
36563 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
36564 unsigned contents[2];
36565 bool ok;
36566
36567 if (!(TARGET_AVX2
36568 && (d->vmode == V32QImode || d->vmode == V16HImode)
36569 && d->op0 == d->op1))
36570 return false;
36571
36572 contents[0] = 0;
36573 contents[1] = 0;
36574 for (i = 0; i < nelt2; ++i)
36575 {
36576 contents[0] |= 1u << (d->perm[i] / nelt4);
36577 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
36578 }
36579
36580 for (i = 0; i < 2; ++i)
36581 {
36582 unsigned int cnt = 0;
36583 for (j = 0; j < 4; ++j)
36584 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
36585 return false;
36586 }
36587
36588 if (d->testing_p)
36589 return true;
36590
36591 dremap = *d;
36592 dremap.vmode = V4DImode;
36593 dremap.nelt = 4;
36594 dremap.target = gen_reg_rtx (V4DImode);
36595 dremap.op0 = gen_lowpart (V4DImode, d->op0);
36596 dremap.op1 = dremap.op0;
36597 for (i = 0; i < 2; ++i)
36598 {
36599 unsigned int cnt = 0;
36600 for (j = 0; j < 4; ++j)
36601 if ((contents[i] & (1u << j)) != 0)
36602 dremap.perm[2 * i + cnt++] = j;
36603 for (; cnt < 2; ++cnt)
36604 dremap.perm[2 * i + cnt] = 0;
36605 }
36606
36607 dfinal = *d;
36608 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
36609 dfinal.op1 = dfinal.op0;
36610 for (i = 0, j = 0; i < nelt; ++i)
36611 {
36612 if (i == nelt2)
36613 j = 2;
36614 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
36615 if ((d->perm[i] / nelt4) == dremap.perm[j])
36616 ;
36617 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
36618 dfinal.perm[i] |= nelt4;
36619 else
36620 gcc_unreachable ();
36621 }
36622
36623 ok = expand_vec_perm_1 (&dremap);
36624 gcc_assert (ok);
36625
36626 ok = expand_vec_perm_1 (&dfinal);
36627 gcc_assert (ok);
36628
36629 return true;
36630 }
36631
36632 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36633 a two vector permutation using 2 intra-lane interleave insns
36634 and cross-lane shuffle for 32-byte vectors. */
36635
36636 static bool
36637 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
36638 {
36639 unsigned i, nelt;
36640 rtx (*gen) (rtx, rtx, rtx);
36641
36642 if (d->op0 == d->op1)
36643 return false;
36644 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
36645 ;
36646 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
36647 ;
36648 else
36649 return false;
36650
36651 nelt = d->nelt;
36652 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
36653 return false;
36654 for (i = 0; i < nelt; i += 2)
36655 if (d->perm[i] != d->perm[0] + i / 2
36656 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
36657 return false;
36658
36659 if (d->testing_p)
36660 return true;
36661
36662 switch (d->vmode)
36663 {
36664 case V32QImode:
36665 if (d->perm[0])
36666 gen = gen_vec_interleave_highv32qi;
36667 else
36668 gen = gen_vec_interleave_lowv32qi;
36669 break;
36670 case V16HImode:
36671 if (d->perm[0])
36672 gen = gen_vec_interleave_highv16hi;
36673 else
36674 gen = gen_vec_interleave_lowv16hi;
36675 break;
36676 case V8SImode:
36677 if (d->perm[0])
36678 gen = gen_vec_interleave_highv8si;
36679 else
36680 gen = gen_vec_interleave_lowv8si;
36681 break;
36682 case V4DImode:
36683 if (d->perm[0])
36684 gen = gen_vec_interleave_highv4di;
36685 else
36686 gen = gen_vec_interleave_lowv4di;
36687 break;
36688 case V8SFmode:
36689 if (d->perm[0])
36690 gen = gen_vec_interleave_highv8sf;
36691 else
36692 gen = gen_vec_interleave_lowv8sf;
36693 break;
36694 case V4DFmode:
36695 if (d->perm[0])
36696 gen = gen_vec_interleave_highv4df;
36697 else
36698 gen = gen_vec_interleave_lowv4df;
36699 break;
36700 default:
36701 gcc_unreachable ();
36702 }
36703
36704 emit_insn (gen (d->target, d->op0, d->op1));
36705 return true;
36706 }
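/* Example: for V8SImode the permutations recognized here are exactly
   { 0, 8, 1, 9, 2, 10, 3, 11 } (interleave low) and
   { 4, 12, 5, 13, 6, 14, 7, 15 } (interleave high).  */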
36707
36708 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
36709 a single vector permutation using a single intra-lane vector
36710 permutation, vperm2f128 swapping the lanes and vblend* insn blending
36711 the non-swapped and swapped vectors together. */
36712
36713 static bool
36714 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
36715 {
36716 struct expand_vec_perm_d dfirst, dsecond;
36717 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
36718 rtx seq;
36719 bool ok;
36720 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
36721
36722 if (!TARGET_AVX
36723 || TARGET_AVX2
36724 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
36725 || d->op0 != d->op1)
36726 return false;
36727
36728 dfirst = *d;
36729 for (i = 0; i < nelt; i++)
36730 dfirst.perm[i] = 0xff;
36731 for (i = 0, msk = 0; i < nelt; i++)
36732 {
36733 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
36734 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
36735 return false;
36736 dfirst.perm[j] = d->perm[i];
36737 if (j != i)
36738 msk |= (1 << i);
36739 }
36740 for (i = 0; i < nelt; i++)
36741 if (dfirst.perm[i] == 0xff)
36742 dfirst.perm[i] = i;
36743
36744 if (!d->testing_p)
36745 dfirst.target = gen_reg_rtx (dfirst.vmode);
36746
36747 start_sequence ();
36748 ok = expand_vec_perm_1 (&dfirst);
36749 seq = get_insns ();
36750 end_sequence ();
36751
36752 if (!ok)
36753 return false;
36754
36755 if (d->testing_p)
36756 return true;
36757
36758 emit_insn (seq);
36759
36760 dsecond = *d;
36761 dsecond.op0 = dfirst.target;
36762 dsecond.op1 = dfirst.target;
36763 dsecond.target = gen_reg_rtx (dsecond.vmode);
36764 for (i = 0; i < nelt; i++)
36765 dsecond.perm[i] = i ^ nelt2;
36766
36767 ok = expand_vec_perm_1 (&dsecond);
36768 gcc_assert (ok);
36769
36770 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
36771 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
36772 return true;
36773 }
36774
36775 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
36776 permutation with two pshufb insns and an ior. We should have already
36777 failed all two instruction sequences. */
36778
36779 static bool
36780 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
36781 {
36782 rtx rperm[2][16], vperm, l, h, op, m128;
36783 unsigned int i, nelt, eltsz;
36784
36785 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36786 return false;
36787 gcc_assert (d->op0 != d->op1);
36788
36789 nelt = d->nelt;
36790 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36791
36792 /* Generate two permutation masks. If the required element is within
36793 the given vector it is shuffled into the proper lane. If the required
36794 element is in the other vector, force a zero into the lane by setting
36795 bit 7 in the permutation mask. */
36796 m128 = GEN_INT (-128);
36797 for (i = 0; i < nelt; ++i)
36798 {
36799 unsigned j, e = d->perm[i];
36800 unsigned which = (e >= nelt);
36801 if (e >= nelt)
36802 e -= nelt;
36803
36804 for (j = 0; j < eltsz; ++j)
36805 {
36806 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
36807 rperm[1-which][i*eltsz + j] = m128;
36808 }
36809 }
36810
36811 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
36812 vperm = force_reg (V16QImode, vperm);
36813
36814 l = gen_reg_rtx (V16QImode);
36815 op = gen_lowpart (V16QImode, d->op0);
36816 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
36817
36818 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
36819 vperm = force_reg (V16QImode, vperm);
36820
36821 h = gen_reg_rtx (V16QImode);
36822 op = gen_lowpart (V16QImode, d->op1);
36823 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
36824
36825 op = gen_lowpart (V16QImode, d->target);
36826 emit_insn (gen_iorv16qi3 (op, l, h));
36827
36828 return true;
36829 }
36830
36831 /* Implement arbitrary permutation of one V32QImode and V16QImode operand
36832 with two vpshufb insns, vpermq and vpor. We should have already failed
36833 all two or three instruction sequences. */
36834
36835 static bool
36836 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
36837 {
36838 rtx rperm[2][32], vperm, l, h, hp, op, m128;
36839 unsigned int i, nelt, eltsz;
36840
36841 if (!TARGET_AVX2
36842 || d->op0 != d->op1
36843 || (d->vmode != V32QImode && d->vmode != V16HImode))
36844 return false;
36845
36846 if (d->testing_p)
36847 return true;
36848
36849 nelt = d->nelt;
36850 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36851
36852 /* Generate two permutation masks. If the required element is within
 36853 the same lane, it is shuffled in. If the required element is from the
 36854 other lane, force a zero by setting bit 7 in the permutation mask.
 36855 The other mask has non-negative elements only where the element is
 36856 requested from the other lane; those elements are also moved to the
 36857 other lane, so that the result of vpshufb has the two V2TImode halves
 36858 swapped. */
36859 m128 = GEN_INT (-128);
36860 for (i = 0; i < nelt; ++i)
36861 {
36862 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36863 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36864
36865 for (j = 0; j < eltsz; ++j)
36866 {
36867 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
36868 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
36869 }
36870 }
36871
36872 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36873 vperm = force_reg (V32QImode, vperm);
36874
36875 h = gen_reg_rtx (V32QImode);
36876 op = gen_lowpart (V32QImode, d->op0);
36877 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36878
36879 /* Swap the 128-byte lanes of h into hp. */
36880 hp = gen_reg_rtx (V4DImode);
36881 op = gen_lowpart (V4DImode, h);
36882 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
36883 const1_rtx));
36884
36885 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36886 vperm = force_reg (V32QImode, vperm);
36887
36888 l = gen_reg_rtx (V32QImode);
36889 op = gen_lowpart (V32QImode, d->op0);
36890 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36891
36892 op = gen_lowpart (V32QImode, d->target);
36893 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
36894
36895 return true;
36896 }
36897
36898 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
36899 and extract-odd permutations of two V32QImode or V16HImode operands
36900 with two vpshufb insns, vpor and vpermq.  We should have already
36901 failed all two or three instruction sequences.  */
36902
36903 static bool
36904 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
36905 {
36906 rtx rperm[2][32], vperm, l, h, ior, op, m128;
36907 unsigned int i, nelt, eltsz;
36908
36909 if (!TARGET_AVX2
36910 || d->op0 == d->op1
36911 || (d->vmode != V32QImode && d->vmode != V16HImode))
36912 return false;
36913
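/* Accept only permutations that agree with an even/odd extraction in the
   operand-select bit (nelt) and the 128-bit-lane bit (nelt / 2), i.e.
   d->perm[i] and i * 2 must match in those two bits.  */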
36914 for (i = 0; i < d->nelt; ++i)
36915 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
36916 return false;
36917
36918 if (d->testing_p)
36919 return true;
36920
36921 nelt = d->nelt;
36922 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36923
36924 /* Generate two permutation masks.  In the first permutation mask
36925 the first quarter will contain indexes for the first half
36926 of op0, the second quarter will contain bit 7 set, the third quarter
36927 will contain indexes for the second half of op0 and the
36928 last quarter bit 7 set.  In the second permutation mask
36929 the first quarter will contain bit 7 set, the second quarter
36930 indexes for the first half of op1, the third quarter bit 7 set
36931 and the last quarter indexes for the second half of op1.
36932 I.e. the first mask e.g. for V32QImode extract even will be:
36933 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
36934 (all values masked with 0xf except for -128) and second mask
36935 for extract even will be
36936 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
36937 m128 = GEN_INT (-128);
36938 for (i = 0; i < nelt; ++i)
36939 {
36940 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36941 unsigned which = d->perm[i] >= nelt;
36942 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
36943
36944 for (j = 0; j < eltsz; ++j)
36945 {
36946 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
36947 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
36948 }
36949 }
36950
36951 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36952 vperm = force_reg (V32QImode, vperm);
36953
36954 l = gen_reg_rtx (V32QImode);
36955 op = gen_lowpart (V32QImode, d->op0);
36956 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36957
36958 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36959 vperm = force_reg (V32QImode, vperm);
36960
36961 h = gen_reg_rtx (V32QImode);
36962 op = gen_lowpart (V32QImode, d->op1);
36963 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36964
36965 ior = gen_reg_rtx (V32QImode);
36966 emit_insn (gen_iorv32qi3 (ior, l, h));
36967
36968 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
36969 op = gen_lowpart (V4DImode, d->target);
36970 ior = gen_lowpart (V4DImode, ior);
36971 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
36972 const1_rtx, GEN_INT (3)));
36973
36974 return true;
36975 }
36976
36977 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
36978 and extract-odd permutations. */
36979
36980 static bool
36981 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
36982 {
36983 rtx t1, t2, t3;
36984
36985 switch (d->vmode)
36986 {
36987 case V4DFmode:
36988 t1 = gen_reg_rtx (V4DFmode);
36989 t2 = gen_reg_rtx (V4DFmode);
36990
36991 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36992 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
36993 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
36994
36995 /* Now an unpck[lh]pd will produce the result required. */
36996 if (odd)
36997 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
36998 else
36999 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
37000 emit_insn (t3);
37001 break;
37002
37003 case V8SFmode:
37004 {
37005 int mask = odd ? 0xdd : 0x88;
37006
37007 t1 = gen_reg_rtx (V8SFmode);
37008 t2 = gen_reg_rtx (V8SFmode);
37009 t3 = gen_reg_rtx (V8SFmode);
37010
37011 /* Shuffle within the 128-bit lanes to produce:
37012 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
37013 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
37014 GEN_INT (mask)));
37015
37016 /* Shuffle the lanes around to produce:
37017 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
37018 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
37019 GEN_INT (0x3)));
37020
37021 /* Shuffle within the 128-bit lanes to produce:
37022 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
37023 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
37024
37025 /* Shuffle within the 128-bit lanes to produce:
37026 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
37027 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
37028
37029 /* Shuffle the lanes around to produce:
37030 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
37031 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
37032 GEN_INT (0x20)));
37033 }
37034 break;
37035
37036 case V2DFmode:
37037 case V4SFmode:
37038 case V2DImode:
37039 case V4SImode:
37040 /* These are always directly implementable by expand_vec_perm_1. */
37041 gcc_unreachable ();
37042
37043 case V8HImode:
37044 if (TARGET_SSSE3)
37045 return expand_vec_perm_pshufb2 (d);
37046 else
37047 {
37048 /* We need 2*log2(N)-1 operations to achieve odd/even
37049 with interleave. */
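/* For V8HImode this is 2 * 3 - 1 = 5 interleaves, matching the five
   vec_interleave insns emitted below.  */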
37050 t1 = gen_reg_rtx (V8HImode);
37051 t2 = gen_reg_rtx (V8HImode);
37052 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
37053 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
37054 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
37055 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
37056 if (odd)
37057 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
37058 else
37059 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
37060 emit_insn (t3);
37061 }
37062 break;
37063
37064 case V16QImode:
37065 if (TARGET_SSSE3)
37066 return expand_vec_perm_pshufb2 (d);
37067 else
37068 {
37069 t1 = gen_reg_rtx (V16QImode);
37070 t2 = gen_reg_rtx (V16QImode);
37071 t3 = gen_reg_rtx (V16QImode);
37072 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
37073 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
37074 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
37075 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
37076 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
37077 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
37078 if (odd)
37079 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
37080 else
37081 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
37082 emit_insn (t3);
37083 }
37084 break;
37085
37086 case V16HImode:
37087 case V32QImode:
37088 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
37089
37090 case V4DImode:
37091 if (!TARGET_AVX2)
37092 {
37093 struct expand_vec_perm_d d_copy = *d;
37094 d_copy.vmode = V4DFmode;
37095 d_copy.target = gen_lowpart (V4DFmode, d->target);
37096 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
37097 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
37098 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37099 }
37100
37101 t1 = gen_reg_rtx (V4DImode);
37102 t2 = gen_reg_rtx (V4DImode);
37103
37104 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
37105 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
37106 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
37107
37108 /* Now a vpunpck[lh]qdq will produce the required result.  */
37109 if (odd)
37110 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
37111 else
37112 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
37113 emit_insn (t3);
37114 break;
37115
37116 case V8SImode:
37117 if (!TARGET_AVX2)
37118 {
37119 struct expand_vec_perm_d d_copy = *d;
37120 d_copy.vmode = V8SFmode;
37121 d_copy.target = gen_lowpart (V8SFmode, d->target);
37122 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
37123 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
37124 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37125 }
37126
37127 t1 = gen_reg_rtx (V8SImode);
37128 t2 = gen_reg_rtx (V8SImode);
37129
37130 /* Shuffle the lanes around into
37131 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
37132 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
37133 gen_lowpart (V4DImode, d->op0),
37134 gen_lowpart (V4DImode, d->op1),
37135 GEN_INT (0x20)));
37136 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
37137 gen_lowpart (V4DImode, d->op0),
37138 gen_lowpart (V4DImode, d->op1),
37139 GEN_INT (0x31)));
37140
37141 /* Swap the 2nd and 3rd position in each lane into
37142 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
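/* The vpshufd immediate 2*4 + 1*16 + 3*64 == 0xd8 encodes the per-lane
   element order { 0, 2, 1, 3 }.  */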
37143 emit_insn (gen_avx2_pshufdv3 (t1, t1,
37144 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37145 emit_insn (gen_avx2_pshufdv3 (t2, t2,
37146 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37147
37148 /* Now a vpunpck[lh]qdq will produce
37149 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
37150 if (odd)
37151 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
37152 gen_lowpart (V4DImode, t1),
37153 gen_lowpart (V4DImode, t2));
37154 else
37155 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
37156 gen_lowpart (V4DImode, t1),
37157 gen_lowpart (V4DImode, t2));
37158 emit_insn (t3);
37159 break;
37160
37161 default:
37162 gcc_unreachable ();
37163 }
37164
37165 return true;
37166 }
37167
37168 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
37169 extract-even and extract-odd permutations. */
37170
37171 static bool
37172 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
37173 {
37174 unsigned i, odd, nelt = d->nelt;
37175
37176 odd = d->perm[0];
37177 if (odd != 0 && odd != 1)
37178 return false;
37179
37180 for (i = 1; i < nelt; ++i)
37181 if (d->perm[i] != 2 * i + odd)
37182 return false;
37183
37184 return expand_vec_perm_even_odd_1 (d, odd);
37185 }
37186
37187 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
37188 permutations. We assume that expand_vec_perm_1 has already failed. */
37189
37190 static bool
37191 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
37192 {
37193 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
37194 enum machine_mode vmode = d->vmode;
37195 unsigned char perm2[4];
37196 rtx op0 = d->op0;
37197 bool ok;
37198
37199 switch (vmode)
37200 {
37201 case V4DFmode:
37202 case V8SFmode:
37203 /* These are special-cased in sse.md so that we can optionally
37204 use the vbroadcast instruction. They expand to two insns
37205 if the input happens to be in a register. */
37206 gcc_unreachable ();
37207
37208 case V2DFmode:
37209 case V2DImode:
37210 case V4SFmode:
37211 case V4SImode:
37212 /* These are always implementable using standard shuffle patterns. */
37213 gcc_unreachable ();
37214
37215 case V8HImode:
37216 case V16QImode:
37217 /* These can be implemented via interleave. We save one insn by
37218 stopping once we have promoted to V4SImode and then use pshufd. */
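/* For instance, broadcasting element 5 of a V16QImode vector uses an
   interleave-low (5 < 8), after which the byte sits in V8HImode word 5;
   an interleave-high (5 >= 4) then leaves it in V4SImode element 1,
   which the final pshufd replicates into all four elements.  */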
37219 do
37220 {
37221 rtx dest;
37222 rtx (*gen) (rtx, rtx, rtx)
37223 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
37224 : gen_vec_interleave_lowv8hi;
37225
37226 if (elt >= nelt2)
37227 {
37228 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
37229 : gen_vec_interleave_highv8hi;
37230 elt -= nelt2;
37231 }
37232 nelt2 /= 2;
37233
37234 dest = gen_reg_rtx (vmode);
37235 emit_insn (gen (dest, op0, op0));
37236 vmode = get_mode_wider_vector (vmode);
37237 op0 = gen_lowpart (vmode, dest);
37238 }
37239 while (vmode != V4SImode);
37240
37241 memset (perm2, elt, 4);
37242 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
37243 d->testing_p);
37244 gcc_assert (ok);
37245 return true;
37246
37247 case V32QImode:
37248 case V16HImode:
37249 case V8SImode:
37250 case V4DImode:
37251 /* For AVX2 broadcasts of the first element vpbroadcast* or
37252 vpermq should be used by expand_vec_perm_1. */
37253 gcc_assert (!TARGET_AVX2 || d->perm[0]);
37254 return false;
37255
37256 default:
37257 gcc_unreachable ();
37258 }
37259 }
37260
37261 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
37262 broadcast permutations. */
37263
37264 static bool
37265 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
37266 {
37267 unsigned i, elt, nelt = d->nelt;
37268
37269 if (d->op0 != d->op1)
37270 return false;
37271
37272 elt = d->perm[0];
37273 for (i = 1; i < nelt; ++i)
37274 if (d->perm[i] != elt)
37275 return false;
37276
37277 return expand_vec_perm_broadcast_1 (d);
37278 }
37279
37280 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
37281 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
37282 all the shorter instruction sequences. */
37283
37284 static bool
37285 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
37286 {
37287 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
37288 unsigned int i, nelt, eltsz;
37289 bool used[4];
37290
37291 if (!TARGET_AVX2
37292 || d->op0 == d->op1
37293 || (d->vmode != V32QImode && d->vmode != V16HImode))
37294 return false;
37295
37296 if (d->testing_p)
37297 return true;
37298
37299 nelt = d->nelt;
37300 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37301
37302 /* Generate 4 permutation masks.  If the required element is within
37303 the same lane, it is shuffled in.  If the required element is in the
37304 other lane, force a zero by setting bit 7 in the permutation mask.
37305 The cross-lane masks have a non-negative element whenever the required
37306 element comes from the other lane; that element is also placed in the
37307 other lane, so that the result of vpshufb can have the two V2TImode
37308 halves swapped.  */
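/* Mask 0 selects elements of op0 that stay in their own lane, mask 1
   elements of op0 coming from the other lane (these go through the extra
   vpermq lane swap below), and masks 2 and 3 play the same roles for
   op1.  */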
37309 m128 = GEN_INT (-128);
37310 for (i = 0; i < 32; ++i)
37311 {
37312 rperm[0][i] = m128;
37313 rperm[1][i] = m128;
37314 rperm[2][i] = m128;
37315 rperm[3][i] = m128;
37316 }
37317 used[0] = false;
37318 used[1] = false;
37319 used[2] = false;
37320 used[3] = false;
37321 for (i = 0; i < nelt; ++i)
37322 {
37323 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37324 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37325 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
37326
37327 for (j = 0; j < eltsz; ++j)
37328 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
37329 used[which] = true;
37330 }
37331
37332 for (i = 0; i < 2; ++i)
37333 {
37334 if (!used[2 * i + 1])
37335 {
37336 h[i] = NULL_RTX;
37337 continue;
37338 }
37339 vperm = gen_rtx_CONST_VECTOR (V32QImode,
37340 gen_rtvec_v (32, rperm[2 * i + 1]));
37341 vperm = force_reg (V32QImode, vperm);
37342 h[i] = gen_reg_rtx (V32QImode);
37343 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37344 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
37345 }
37346
37347 /* Swap the 128-bit lanes of h[X].  */
37348 for (i = 0; i < 2; ++i)
37349 {
37350 if (h[i] == NULL_RTX)
37351 continue;
37352 op = gen_reg_rtx (V4DImode);
37353 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
37354 const2_rtx, GEN_INT (3), const0_rtx,
37355 const1_rtx));
37356 h[i] = gen_lowpart (V32QImode, op);
37357 }
37358
37359 for (i = 0; i < 2; ++i)
37360 {
37361 if (!used[2 * i])
37362 {
37363 l[i] = NULL_RTX;
37364 continue;
37365 }
37366 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
37367 vperm = force_reg (V32QImode, vperm);
37368 l[i] = gen_reg_rtx (V32QImode);
37369 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37370 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
37371 }
37372
37373 for (i = 0; i < 2; ++i)
37374 {
37375 if (h[i] && l[i])
37376 {
37377 op = gen_reg_rtx (V32QImode);
37378 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
37379 l[i] = op;
37380 }
37381 else if (h[i])
37382 l[i] = h[i];
37383 }
37384
37385 gcc_assert (l[0] && l[1]);
37386 op = gen_lowpart (V32QImode, d->target);
37387 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
37388 return true;
37389 }
37390
37391 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
37392 With all of the interface bits taken care of, perform the expansion
37393 in D and return true on success. */
37394
37395 static bool
37396 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
37397 {
37398 /* Try a single instruction expansion. */
37399 if (expand_vec_perm_1 (d))
37400 return true;
37401
37402 /* Try sequences of two instructions. */
37403
37404 if (expand_vec_perm_pshuflw_pshufhw (d))
37405 return true;
37406
37407 if (expand_vec_perm_palignr (d))
37408 return true;
37409
37410 if (expand_vec_perm_interleave2 (d))
37411 return true;
37412
37413 if (expand_vec_perm_broadcast (d))
37414 return true;
37415
37416 if (expand_vec_perm_vpermq_perm_1 (d))
37417 return true;
37418
37419 /* Try sequences of three instructions. */
37420
37421 if (expand_vec_perm_pshufb2 (d))
37422 return true;
37423
37424 if (expand_vec_perm_interleave3 (d))
37425 return true;
37426
37427 if (expand_vec_perm_vperm2f128_vblend (d))
37428 return true;
37429
37430 /* Try sequences of four instructions. */
37431
37432 if (expand_vec_perm_vpshufb2_vpermq (d))
37433 return true;
37434
37435 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
37436 return true;
37437
37438 /* ??? Look for narrow permutations whose element orderings would
37439 allow the promotion to a wider mode. */
37440
37441 /* ??? Look for sequences of interleave or a wider permute that place
37442 the data into the correct lanes for a half-vector shuffle like
37443 pshuf[lh]w or vpermilps. */
37444
37445 /* ??? Look for sequences of interleave that produce the desired results.
37446 The combinatorics of punpck[lh] get pretty ugly... */
37447
37448 if (expand_vec_perm_even_odd (d))
37449 return true;
37450
37451 /* Even longer sequences. */
37452 if (expand_vec_perm_vpshufb4_vpermq2 (d))
37453 return true;
37454
37455 return false;
37456 }
37457
37458 bool
37459 ix86_expand_vec_perm_const (rtx operands[4])
37460 {
37461 struct expand_vec_perm_d d;
37462 unsigned char perm[MAX_VECT_LEN];
37463 int i, nelt, which;
37464 rtx sel;
37465
37466 d.target = operands[0];
37467 d.op0 = operands[1];
37468 d.op1 = operands[2];
37469 sel = operands[3];
37470
37471 d.vmode = GET_MODE (d.target);
37472 gcc_assert (VECTOR_MODE_P (d.vmode));
37473 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37474 d.testing_p = false;
37475
37476 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
37477 gcc_assert (XVECLEN (sel, 0) == nelt);
37478 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
37479
37480 for (i = which = 0; i < nelt; ++i)
37481 {
37482 rtx e = XVECEXP (sel, 0, i);
37483 int ei = INTVAL (e) & (2 * nelt - 1);
37484
37485 which |= (ei < nelt ? 1 : 2);
37486 d.perm[i] = ei;
37487 perm[i] = ei;
37488 }
37489
37490 switch (which)
37491 {
37492 default:
37493 gcc_unreachable();
37494
37495 case 3:
37496 if (!rtx_equal_p (d.op0, d.op1))
37497 break;
37498
37499 /* The elements of PERM do not suggest that only the first operand
37500 is used, but both operands are identical. Allow easier matching
37501 of the permutation by folding the permutation into the single
37502 input vector. */
37503 for (i = 0; i < nelt; ++i)
37504 if (d.perm[i] >= nelt)
37505 d.perm[i] -= nelt;
37506 /* FALLTHRU */
37507
37508 case 1:
37509 d.op1 = d.op0;
37510 break;
37511
37512 case 2:
37513 for (i = 0; i < nelt; ++i)
37514 d.perm[i] -= nelt;
37515 d.op0 = d.op1;
37516 break;
37517 }
37518
37519 if (ix86_expand_vec_perm_const_1 (&d))
37520 return true;
37521
37522 /* If the mask says both arguments are needed, but they are the same,
37523 the above tried to expand with d.op0 == d.op1. If that didn't work,
37524 retry with d.op0 != d.op1 as that is what testing has been done with. */
37525 if (which == 3 && d.op0 == d.op1)
37526 {
37527 rtx seq;
37528 bool ok;
37529
37530 memcpy (d.perm, perm, sizeof (perm));
37531 d.op1 = gen_reg_rtx (d.vmode);
37532 start_sequence ();
37533 ok = ix86_expand_vec_perm_const_1 (&d);
37534 seq = get_insns ();
37535 end_sequence ();
37536 if (ok)
37537 {
37538 emit_move_insn (d.op1, d.op0);
37539 emit_insn (seq);
37540 return true;
37541 }
37542 }
37543
37544 return false;
37545 }
37546
37547 /* Implement targetm.vectorize.vec_perm_const_ok. */
37548
37549 static bool
37550 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
37551 const unsigned char *sel)
37552 {
37553 struct expand_vec_perm_d d;
37554 unsigned int i, nelt, which;
37555 bool ret, one_vec;
37556
37557 d.vmode = vmode;
37558 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37559 d.testing_p = true;
37560
37561 /* Given sufficient ISA support we can just return true here
37562 for selected vector modes. */
37563 if (GET_MODE_SIZE (d.vmode) == 16)
37564 {
37565 /* All implementable with a single vpperm insn. */
37566 if (TARGET_XOP)
37567 return true;
37568 /* All implementable with 2 pshufb + 1 ior. */
37569 if (TARGET_SSSE3)
37570 return true;
37571 /* All implementable with shufpd or unpck[lh]pd. */
37572 if (d.nelt == 2)
37573 return true;
37574 }
37575
37576 /* Extract the values from the vector CST into the permutation
37577 array in D. */
37578 memcpy (d.perm, sel, nelt);
37579 for (i = which = 0; i < nelt; ++i)
37580 {
37581 unsigned char e = d.perm[i];
37582 gcc_assert (e < 2 * nelt);
37583 which |= (e < nelt ? 1 : 2);
37584 }
37585
37586 /* For all elements from second vector, fold the elements to first. */
37587 if (which == 2)
37588 for (i = 0; i < nelt; ++i)
37589 d.perm[i] -= nelt;
37590
37591 /* Check whether the mask can be applied to the vector type. */
37592 one_vec = (which != 3);
37593
37594 /* Implementable with shufps or pshufd. */
37595 if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
37596 return true;
37597
37598 /* Otherwise we have to go through the motions and see if we can
37599 figure out how to generate the requested permutation. */
37600 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
37601 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
37602 if (!one_vec)
37603 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
37604
37605 start_sequence ();
37606 ret = ix86_expand_vec_perm_const_1 (&d);
37607 end_sequence ();
37608
37609 return ret;
37610 }
37611
37612 void
37613 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
37614 {
37615 struct expand_vec_perm_d d;
37616 unsigned i, nelt;
37617
37618 d.target = targ;
37619 d.op0 = op0;
37620 d.op1 = op1;
37621 d.vmode = GET_MODE (targ);
37622 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37623 d.testing_p = false;
37624
37625 for (i = 0; i < nelt; ++i)
37626 d.perm[i] = i * 2 + odd;
37627
37628 /* We'll either be able to implement the permutation directly... */
37629 if (expand_vec_perm_1 (&d))
37630 return;
37631
37632 /* ... or we use the special-case patterns. */
37633 expand_vec_perm_even_odd_1 (&d, odd);
37634 }
37635
37636 /* Expand an insert into a vector register through pinsr insn.
37637 Return true if successful. */
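/* As an illustration of the mapping done below: inserting a HImode value at
   bit position 32 of a V8HImode destination (operands[1] == 16,
   operands[2] == 32) emits sse2_pinsrw with selector GEN_INT (1 << 2),
   i.e. word 2.  */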
37638
37639 bool
37640 ix86_expand_pinsr (rtx *operands)
37641 {
37642 rtx dst = operands[0];
37643 rtx src = operands[3];
37644
37645 unsigned int size = INTVAL (operands[1]);
37646 unsigned int pos = INTVAL (operands[2]);
37647
37648 if (GET_CODE (dst) == SUBREG)
37649 {
37650 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
37651 dst = SUBREG_REG (dst);
37652 }
37653
37654 if (GET_CODE (src) == SUBREG)
37655 src = SUBREG_REG (src);
37656
37657 switch (GET_MODE (dst))
37658 {
37659 case V16QImode:
37660 case V8HImode:
37661 case V4SImode:
37662 case V2DImode:
37663 {
37664 enum machine_mode srcmode, dstmode;
37665 rtx (*pinsr)(rtx, rtx, rtx, rtx);
37666
37667 srcmode = mode_for_size (size, MODE_INT, 0);
37668
37669 switch (srcmode)
37670 {
37671 case QImode:
37672 if (!TARGET_SSE4_1)
37673 return false;
37674 dstmode = V16QImode;
37675 pinsr = gen_sse4_1_pinsrb;
37676 break;
37677
37678 case HImode:
37679 if (!TARGET_SSE2)
37680 return false;
37681 dstmode = V8HImode;
37682 pinsr = gen_sse2_pinsrw;
37683 break;
37684
37685 case SImode:
37686 if (!TARGET_SSE4_1)
37687 return false;
37688 dstmode = V4SImode;
37689 pinsr = gen_sse4_1_pinsrd;
37690 break;
37691
37692 case DImode:
37693 gcc_assert (TARGET_64BIT);
37694 if (!TARGET_SSE4_1)
37695 return false;
37696 dstmode = V2DImode;
37697 pinsr = gen_sse4_1_pinsrq;
37698 break;
37699
37700 default:
37701 return false;
37702 }
37703
37704 dst = gen_lowpart (dstmode, dst);
37705 src = gen_lowpart (srcmode, src);
37706
37707 pos /= size;
37708
37709 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
37710 return true;
37711 }
37712
37713 default:
37714 return false;
37715 }
37716 }
37717 \f
37718 /* This function returns the calling abi specific va_list type node.
37719 It returns the FNDECL specific va_list type. */
37720
37721 static tree
37722 ix86_fn_abi_va_list (tree fndecl)
37723 {
37724 if (!TARGET_64BIT)
37725 return va_list_type_node;
37726 gcc_assert (fndecl != NULL_TREE);
37727
37728 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
37729 return ms_va_list_type_node;
37730 else
37731 return sysv_va_list_type_node;
37732 }
37733
37734 /* Returns the canonical va_list type specified by TYPE. If there
37735 is no valid TYPE provided, it returns NULL_TREE.  */
37736
37737 static tree
37738 ix86_canonical_va_list_type (tree type)
37739 {
37740 tree wtype, htype;
37741
37742 /* Resolve references and pointers to va_list type. */
37743 if (TREE_CODE (type) == MEM_REF)
37744 type = TREE_TYPE (type);
37745 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
37746 type = TREE_TYPE (type);
37747 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
37748 type = TREE_TYPE (type);
37749
37750 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
37751 {
37752 wtype = va_list_type_node;
37753 gcc_assert (wtype != NULL_TREE);
37754 htype = type;
37755 if (TREE_CODE (wtype) == ARRAY_TYPE)
37756 {
37757 /* If va_list is an array type, the argument may have decayed
37758 to a pointer type, e.g. by being passed to another function.
37759 In that case, unwrap both types so that we can compare the
37760 underlying records. */
37761 if (TREE_CODE (htype) == ARRAY_TYPE
37762 || POINTER_TYPE_P (htype))
37763 {
37764 wtype = TREE_TYPE (wtype);
37765 htype = TREE_TYPE (htype);
37766 }
37767 }
37768 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37769 return va_list_type_node;
37770 wtype = sysv_va_list_type_node;
37771 gcc_assert (wtype != NULL_TREE);
37772 htype = type;
37773 if (TREE_CODE (wtype) == ARRAY_TYPE)
37774 {
37775 /* If va_list is an array type, the argument may have decayed
37776 to a pointer type, e.g. by being passed to another function.
37777 In that case, unwrap both types so that we can compare the
37778 underlying records. */
37779 if (TREE_CODE (htype) == ARRAY_TYPE
37780 || POINTER_TYPE_P (htype))
37781 {
37782 wtype = TREE_TYPE (wtype);
37783 htype = TREE_TYPE (htype);
37784 }
37785 }
37786 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37787 return sysv_va_list_type_node;
37788 wtype = ms_va_list_type_node;
37789 gcc_assert (wtype != NULL_TREE);
37790 htype = type;
37791 if (TREE_CODE (wtype) == ARRAY_TYPE)
37792 {
37793 /* If va_list is an array type, the argument may have decayed
37794 to a pointer type, e.g. by being passed to another function.
37795 In that case, unwrap both types so that we can compare the
37796 underlying records. */
37797 if (TREE_CODE (htype) == ARRAY_TYPE
37798 || POINTER_TYPE_P (htype))
37799 {
37800 wtype = TREE_TYPE (wtype);
37801 htype = TREE_TYPE (htype);
37802 }
37803 }
37804 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37805 return ms_va_list_type_node;
37806 return NULL_TREE;
37807 }
37808 return std_canonical_va_list_type (type);
37809 }
37810
37811 /* Iterate through the target-specific builtin types for va_list.
37812 IDX denotes the iterator, *PTREE is set to the result type of
37813 the va_list builtin, and *PNAME to its internal type.
37814 Returns zero if there is no element for this index, otherwise
37815 IDX should be increased upon the next call.
37816 Note, do not iterate a base builtin's name like __builtin_va_list.
37817 Used from c_common_nodes_and_builtins. */
37818
37819 static int
37820 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
37821 {
37822 if (TARGET_64BIT)
37823 {
37824 switch (idx)
37825 {
37826 default:
37827 break;
37828
37829 case 0:
37830 *ptree = ms_va_list_type_node;
37831 *pname = "__builtin_ms_va_list";
37832 return 1;
37833
37834 case 1:
37835 *ptree = sysv_va_list_type_node;
37836 *pname = "__builtin_sysv_va_list";
37837 return 1;
37838 }
37839 }
37840
37841 return 0;
37842 }
37843
37844 #undef TARGET_SCHED_DISPATCH
37845 #define TARGET_SCHED_DISPATCH has_dispatch
37846 #undef TARGET_SCHED_DISPATCH_DO
37847 #define TARGET_SCHED_DISPATCH_DO do_dispatch
37848 #undef TARGET_SCHED_REASSOCIATION_WIDTH
37849 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
37850
37851 /* The size of the dispatch window is the total number of bytes of
37852 object code allowed in a window. */
37853 #define DISPATCH_WINDOW_SIZE 16
37854
37855 /* Number of dispatch windows considered for scheduling. */
37856 #define MAX_DISPATCH_WINDOWS 3
37857
37858 /* Maximum number of instructions in a window. */
37859 #define MAX_INSN 4
37860
37861 /* Maximum number of immediate operands in a window. */
37862 #define MAX_IMM 4
37863
37864 /* Maximum number of immediate bits allowed in a window. */
37865 #define MAX_IMM_SIZE 128
37866
37867 /* Maximum number of 32 bit immediates allowed in a window. */
37868 #define MAX_IMM_32 4
37869
37870 /* Maximum number of 64 bit immediates allowed in a window. */
37871 #define MAX_IMM_64 2
37872
37873 /* Maximum total of loads or prefetches allowed in a window. */
37874 #define MAX_LOAD 2
37875
37876 /* Maximum total of stores allowed in a window. */
37877 #define MAX_STORE 1
37878
37879 #undef BIG
37880 #define BIG 100
37881
37882
37883 /* Dispatch groups.  Instructions that affect the mix in a dispatch window.  */
37884 enum dispatch_group {
37885 disp_no_group = 0,
37886 disp_load,
37887 disp_store,
37888 disp_load_store,
37889 disp_prefetch,
37890 disp_imm,
37891 disp_imm_32,
37892 disp_imm_64,
37893 disp_branch,
37894 disp_cmp,
37895 disp_jcc,
37896 disp_last
37897 };
37898
37899 /* Number of allowable groups in a dispatch window.  It is an array
37900 indexed by the dispatch_group enum.  100 is used as a big number,
37901 because the number of these kinds of operations does not have any
37902 effect on the dispatch window, but we need them for other reasons in
37903 the table.  */
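/* The entries mirror the MAX_* limits above: 2 loads (MAX_LOAD), 1 store
   (MAX_STORE), 2 prefetches, 4 immediates (MAX_IMM / MAX_IMM_32), 2 64-bit
   immediates (MAX_IMM_64); disp_cmp and disp_jcc are effectively unlimited
   (BIG).  */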
37904 static unsigned int num_allowable_groups[disp_last] = {
37905 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
37906 };
37907
37908 char group_name[disp_last + 1][16] = {
37909 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
37910 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
37911 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
37912 };
37913
37914 /* Instruction path. */
37915 enum insn_path {
37916 no_path = 0,
37917 path_single, /* Single micro op. */
37918 path_double, /* Double micro op. */
37919 path_multi, /* Instructions with more than 2 micro ops.  */
37920 last_path
37921 };
37922
37923 /* sched_insn_info defines a window to the instructions scheduled in
37924 the basic block. It contains a pointer to the insn_info table and
37925 the instruction scheduled.
37926
37927 Windows are allocated for each basic block and are linked
37928 together. */
37929 typedef struct sched_insn_info_s {
37930 rtx insn;
37931 enum dispatch_group group;
37932 enum insn_path path;
37933 int byte_len;
37934 int imm_bytes;
37935 } sched_insn_info;
37936
37937 /* Linked list of dispatch windows. This is a two way list of
37938 dispatch windows of a basic block. It contains information about
37939 the number of uops in the window and the total number of
37940 instructions and of bytes in the object code for this dispatch
37941 window. */
37942 typedef struct dispatch_windows_s {
37943 int num_insn; /* Number of insn in the window. */
37944 int num_uops; /* Number of uops in the window. */
37945 int window_size; /* Number of bytes in the window. */
37946 int window_num; /* Window number, either 0 or 1.  */
37947 int num_imm; /* Number of immediates in an insn. */
37948 int num_imm_32; /* Number of 32 bit immediates in an insn. */
37949 int num_imm_64; /* Number of 64 bit immediates in an insn. */
37950 int imm_size; /* Total immediates in the window. */
37951 int num_loads; /* Total memory loads in the window. */
37952 int num_stores; /* Total memory stores in the window. */
37953 int violation; /* Violation exists in window. */
37954 sched_insn_info *window; /* Pointer to the window. */
37955 struct dispatch_windows_s *next;
37956 struct dispatch_windows_s *prev;
37957 } dispatch_windows;
37958
37959 /* Immediate values used in an insn.  */
37960 typedef struct imm_info_s
37961 {
37962 int imm;
37963 int imm32;
37964 int imm64;
37965 } imm_info;
37966
37967 static dispatch_windows *dispatch_window_list;
37968 static dispatch_windows *dispatch_window_list1;
37969
37970 /* Get dispatch group of insn. */
37971
37972 static enum dispatch_group
37973 get_mem_group (rtx insn)
37974 {
37975 enum attr_memory memory;
37976
37977 if (INSN_CODE (insn) < 0)
37978 return disp_no_group;
37979 memory = get_attr_memory (insn);
37980 if (memory == MEMORY_STORE)
37981 return disp_store;
37982
37983 if (memory == MEMORY_LOAD)
37984 return disp_load;
37985
37986 if (memory == MEMORY_BOTH)
37987 return disp_load_store;
37988
37989 return disp_no_group;
37990 }
37991
37992 /* Return true if insn is a compare instruction. */
37993
37994 static bool
37995 is_cmp (rtx insn)
37996 {
37997 enum attr_type type;
37998
37999 type = get_attr_type (insn);
38000 return (type == TYPE_TEST
38001 || type == TYPE_ICMP
38002 || type == TYPE_FCMP
38003 || GET_CODE (PATTERN (insn)) == COMPARE);
38004 }
38005
38006 /* Return true if a dispatch violation was encountered.  */
38007
38008 static bool
38009 dispatch_violation (void)
38010 {
38011 if (dispatch_window_list->next)
38012 return dispatch_window_list->next->violation;
38013 return dispatch_window_list->violation;
38014 }
38015
38016 /* Return true if insn is a branch instruction. */
38017
38018 static bool
38019 is_branch (rtx insn)
38020 {
38021 return (CALL_P (insn) || JUMP_P (insn));
38022 }
38023
38024 /* Return true if insn is a prefetch instruction. */
38025
38026 static bool
38027 is_prefetch (rtx insn)
38028 {
38029 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
38030 }
38031
38032 /* This function initializes a dispatch window and the list container holding a
38033 pointer to the window. */
38034
38035 static void
38036 init_window (int window_num)
38037 {
38038 int i;
38039 dispatch_windows *new_list;
38040
38041 if (window_num == 0)
38042 new_list = dispatch_window_list;
38043 else
38044 new_list = dispatch_window_list1;
38045
38046 new_list->num_insn = 0;
38047 new_list->num_uops = 0;
38048 new_list->window_size = 0;
38049 new_list->next = NULL;
38050 new_list->prev = NULL;
38051 new_list->window_num = window_num;
38052 new_list->num_imm = 0;
38053 new_list->num_imm_32 = 0;
38054 new_list->num_imm_64 = 0;
38055 new_list->imm_size = 0;
38056 new_list->num_loads = 0;
38057 new_list->num_stores = 0;
38058 new_list->violation = false;
38059
38060 for (i = 0; i < MAX_INSN; i++)
38061 {
38062 new_list->window[i].insn = NULL;
38063 new_list->window[i].group = disp_no_group;
38064 new_list->window[i].path = no_path;
38065 new_list->window[i].byte_len = 0;
38066 new_list->window[i].imm_bytes = 0;
38067 }
38068 return;
38069 }
38070
38071 /* This function allocates and initializes a dispatch window and the
38072 list container holding a pointer to the window. */
38073
38074 static dispatch_windows *
38075 allocate_window (void)
38076 {
38077 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
38078 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
38079
38080 return new_list;
38081 }
38082
38083 /* This routine initializes the dispatch scheduling information. It
38084 initiates building dispatch scheduler tables and constructs the
38085 first dispatch window. */
38086
38087 static void
38088 init_dispatch_sched (void)
38089 {
38090 /* Allocate a dispatch list and a window. */
38091 dispatch_window_list = allocate_window ();
38092 dispatch_window_list1 = allocate_window ();
38093 init_window (0);
38094 init_window (1);
38095 }
38096
38097 /* This function returns true if a branch is detected. End of a basic block
38098 does not have to be a branch, but here we assume only branches end a
38099 window. */
38100
38101 static bool
38102 is_end_basic_block (enum dispatch_group group)
38103 {
38104 return group == disp_branch;
38105 }
38106
38107 /* This function is called when the end of a window processing is reached. */
38108
38109 static void
38110 process_end_window (void)
38111 {
38112 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
38113 if (dispatch_window_list->next)
38114 {
38115 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
38116 gcc_assert (dispatch_window_list->window_size
38117 + dispatch_window_list1->window_size <= 48);
38118 init_window (1);
38119 }
38120 init_window (0);
38121 }
38122
38123 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
38124 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
38125 for 48 bytes of instructions.  Note that these windows are not dispatch
38126 windows of size DISPATCH_WINDOW_SIZE.  */
38127
38128 static dispatch_windows *
38129 allocate_next_window (int window_num)
38130 {
38131 if (window_num == 0)
38132 {
38133 if (dispatch_window_list->next)
38134 init_window (1);
38135 init_window (0);
38136 return dispatch_window_list;
38137 }
38138
38139 dispatch_window_list->next = dispatch_window_list1;
38140 dispatch_window_list1->prev = dispatch_window_list;
38141
38142 return dispatch_window_list1;
38143 }
38144
38145 /* Increment the number of immediate operands of an instruction. */
38146
38147 static int
38148 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
38149 {
38150 if (*in_rtx == 0)
38151 return 0;
38152
38153 switch ( GET_CODE (*in_rtx))
38154 {
38155 case CONST:
38156 case SYMBOL_REF:
38157 case CONST_INT:
38158 (imm_values->imm)++;
38159 if (x86_64_immediate_operand (*in_rtx, SImode))
38160 (imm_values->imm32)++;
38161 else
38162 (imm_values->imm64)++;
38163 break;
38164
38165 case CONST_DOUBLE:
38166 (imm_values->imm)++;
38167 (imm_values->imm64)++;
38168 break;
38169
38170 case CODE_LABEL:
38171 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
38172 {
38173 (imm_values->imm)++;
38174 (imm_values->imm32)++;
38175 }
38176 break;
38177
38178 default:
38179 break;
38180 }
38181
38182 return 0;
38183 }
38184
38185 /* Compute number of immediate operands of an instruction. */
38186
38187 static void
38188 find_constant (rtx in_rtx, imm_info *imm_values)
38189 {
38190 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
38191 (rtx_function) find_constant_1, (void *) imm_values);
38192 }
38193
38194 /* Return the total size in bytes of the immediate operands of an instruction
38195 (4 bytes per 32-bit and 8 per 64-bit immediate), along with the number of
38196 corresponding immediate operands.  It initializes its parameters to zero
38197 before calling FIND_CONSTANT.  INSN is the input instruction.  IMM is the
38198 total number of immediates.  IMM32 is the number of 32 bit immediates.
38199 IMM64 is the number of 64 bit immediates.  */
38200
38201 static int
38202 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
38203 {
38204 imm_info imm_values = {0, 0, 0};
38205
38206 find_constant (insn, &imm_values);
38207 *imm = imm_values.imm;
38208 *imm32 = imm_values.imm32;
38209 *imm64 = imm_values.imm64;
38210 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
38211 }
38212
38213 /* This function indicates whether an instruction has any immediate
38214 operands.  */
38215
38216 static bool
38217 has_immediate (rtx insn)
38218 {
38219 int num_imm_operand;
38220 int num_imm32_operand;
38221 int num_imm64_operand;
38222
38223 if (insn)
38224 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38225 &num_imm64_operand);
38226 return false;
38227 }
38228
38229 /* Return single or double path for instructions. */
38230
38231 static enum insn_path
38232 get_insn_path (rtx insn)
38233 {
38234 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
38235
38236 if ((int)path == 0)
38237 return path_single;
38238
38239 if ((int)path == 1)
38240 return path_double;
38241
38242 return path_multi;
38243 }
38244
38245 /* Return insn dispatch group. */
38246
38247 static enum dispatch_group
38248 get_insn_group (rtx insn)
38249 {
38250 enum dispatch_group group = get_mem_group (insn);
38251 if (group)
38252 return group;
38253
38254 if (is_branch (insn))
38255 return disp_branch;
38256
38257 if (is_cmp (insn))
38258 return disp_cmp;
38259
38260 if (has_immediate (insn))
38261 return disp_imm;
38262
38263 if (is_prefetch (insn))
38264 return disp_prefetch;
38265
38266 return disp_no_group;
38267 }
38268
38269 /* Count number of GROUP restricted instructions in a dispatch
38270 window WINDOW_LIST. */
38271
38272 static int
38273 count_num_restricted (rtx insn, dispatch_windows *window_list)
38274 {
38275 enum dispatch_group group = get_insn_group (insn);
38276 int imm_size;
38277 int num_imm_operand;
38278 int num_imm32_operand;
38279 int num_imm64_operand;
38280
38281 if (group == disp_no_group)
38282 return 0;
38283
38284 if (group == disp_imm)
38285 {
38286 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38287 &num_imm64_operand);
38288 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
38289 || num_imm_operand + window_list->num_imm > MAX_IMM
38290 || (num_imm32_operand > 0
38291 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
38292 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
38293 || (num_imm64_operand > 0
38294 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
38295 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
38296 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
38297 && num_imm64_operand > 0
38298 && ((window_list->num_imm_64 > 0
38299 && window_list->num_insn >= 2)
38300 || window_list->num_insn >= 3)))
38301 return BIG;
38302
38303 return 1;
38304 }
38305
38306 if ((group == disp_load_store
38307 && (window_list->num_loads >= MAX_LOAD
38308 || window_list->num_stores >= MAX_STORE))
38309 || ((group == disp_load
38310 || group == disp_prefetch)
38311 && window_list->num_loads >= MAX_LOAD)
38312 || (group == disp_store
38313 && window_list->num_stores >= MAX_STORE))
38314 return BIG;
38315
38316 return 1;
38317 }
38318
38319 /* This function returns true if insn satisfies dispatch rules on the
38320 last window scheduled. */
38321
38322 static bool
38323 fits_dispatch_window (rtx insn)
38324 {
38325 dispatch_windows *window_list = dispatch_window_list;
38326 dispatch_windows *window_list_next = dispatch_window_list->next;
38327 unsigned int num_restrict;
38328 enum dispatch_group group = get_insn_group (insn);
38329 enum insn_path path = get_insn_path (insn);
38330 int sum;
38331
38332 /* Make disp_cmp and disp_jcc get scheduled last.  These
38333 instructions should be given the lowest priority in the
38334 scheduling process in the Haifa scheduler to make sure they will be
38335 scheduled in the same dispatch window as the reference to them.  */
38336 if (group == disp_jcc || group == disp_cmp)
38337 return false;
38338
38339 /* Check nonrestricted. */
38340 if (group == disp_no_group || group == disp_branch)
38341 return true;
38342
38343 /* Get last dispatch window. */
38344 if (window_list_next)
38345 window_list = window_list_next;
38346
38347 if (window_list->window_num == 1)
38348 {
38349 sum = window_list->prev->window_size + window_list->window_size;
38350
38351 if (sum == 32
38352 || (min_insn_size (insn) + sum) >= 48)
38353 /* Window 1 is full. Go for next window. */
38354 return true;
38355 }
38356
38357 num_restrict = count_num_restricted (insn, window_list);
38358
38359 if (num_restrict > num_allowable_groups[group])
38360 return false;
38361
38362 /* See if it fits in the first window. */
38363 if (window_list->window_num == 0)
38364 {
38365 /* The first window should have only single and double path
38366 uops. */
38367 if (path == path_double
38368 && (window_list->num_uops + 2) > MAX_INSN)
38369 return false;
38370 else if (path != path_single)
38371 return false;
38372 }
38373 return true;
38374 }
38375
38376 /* Add an instruction INSN with NUM_UOPS micro-operations to the
38377 dispatch window WINDOW_LIST. */
38378
38379 static void
38380 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
38381 {
38382 int byte_len = min_insn_size (insn);
38383 int num_insn = window_list->num_insn;
38384 int imm_size;
38385 sched_insn_info *window = window_list->window;
38386 enum dispatch_group group = get_insn_group (insn);
38387 enum insn_path path = get_insn_path (insn);
38388 int num_imm_operand;
38389 int num_imm32_operand;
38390 int num_imm64_operand;
38391
38392 if (!window_list->violation && group != disp_cmp
38393 && !fits_dispatch_window (insn))
38394 window_list->violation = true;
38395
38396 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38397 &num_imm64_operand);
38398
38399 /* Initialize window with new instruction. */
38400 window[num_insn].insn = insn;
38401 window[num_insn].byte_len = byte_len;
38402 window[num_insn].group = group;
38403 window[num_insn].path = path;
38404 window[num_insn].imm_bytes = imm_size;
38405
38406 window_list->window_size += byte_len;
38407 window_list->num_insn = num_insn + 1;
38408 window_list->num_uops = window_list->num_uops + num_uops;
38409 window_list->imm_size += imm_size;
38410 window_list->num_imm += num_imm_operand;
38411 window_list->num_imm_32 += num_imm32_operand;
38412 window_list->num_imm_64 += num_imm64_operand;
38413
38414 if (group == disp_store)
38415 window_list->num_stores += 1;
38416 else if (group == disp_load
38417 || group == disp_prefetch)
38418 window_list->num_loads += 1;
38419 else if (group == disp_load_store)
38420 {
38421 window_list->num_stores += 1;
38422 window_list->num_loads += 1;
38423 }
38424 }
38425
38426 /* Adds a scheduled instruction, INSN, to the current dispatch window.
38427 If the total bytes of instructions or the number of instructions in
38428 the window exceeds the allowable limit, it allocates a new window.  */
38429
38430 static void
38431 add_to_dispatch_window (rtx insn)
38432 {
38433 int byte_len;
38434 dispatch_windows *window_list;
38435 dispatch_windows *next_list;
38436 dispatch_windows *window0_list;
38437 enum insn_path path;
38438 enum dispatch_group insn_group;
38439 bool insn_fits;
38440 int num_insn;
38441 int num_uops;
38442 int window_num;
38443 int insn_num_uops;
38444 int sum;
38445
38446 if (INSN_CODE (insn) < 0)
38447 return;
38448
38449 byte_len = min_insn_size (insn);
38450 window_list = dispatch_window_list;
38451 next_list = window_list->next;
38452 path = get_insn_path (insn);
38453 insn_group = get_insn_group (insn);
38454
38455 /* Get the last dispatch window. */
38456 if (next_list)
38457 window_list = dispatch_window_list->next;
38458
38459 if (path == path_single)
38460 insn_num_uops = 1;
38461 else if (path == path_double)
38462 insn_num_uops = 2;
38463 else
38464 insn_num_uops = (int) path;
38465
38466 /* If the current window is full, get a new window.
38467 Window number zero is full if MAX_INSN uops are scheduled in it.
38468 Window number one is full if window zero's bytes plus window
38469 one's bytes equal 32, if adding the bytes of the new instruction
38470 brings the total to 48 or more, or if it already has MAX_INSN
38471 instructions in it.  */
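/* As an illustration, with 24 bytes already in window zero and 10 bytes in
   window one, a 14-byte instruction would bring the total to 48, so
   process_end_window is called and the instruction starts over in a fresh
   window zero (see the window_num == 1 branch below).  */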
38472 num_insn = window_list->num_insn;
38473 num_uops = window_list->num_uops;
38474 window_num = window_list->window_num;
38475 insn_fits = fits_dispatch_window (insn);
38476
38477 if (num_insn >= MAX_INSN
38478 || num_uops + insn_num_uops > MAX_INSN
38479 || !(insn_fits))
38480 {
38481 window_num = ~window_num & 1;
38482 window_list = allocate_next_window (window_num);
38483 }
38484
38485 if (window_num == 0)
38486 {
38487 add_insn_window (insn, window_list, insn_num_uops);
38488 if (window_list->num_insn >= MAX_INSN
38489 && insn_group == disp_branch)
38490 {
38491 process_end_window ();
38492 return;
38493 }
38494 }
38495 else if (window_num == 1)
38496 {
38497 window0_list = window_list->prev;
38498 sum = window0_list->window_size + window_list->window_size;
38499 if (sum == 32
38500 || (byte_len + sum) >= 48)
38501 {
38502 process_end_window ();
38503 window_list = dispatch_window_list;
38504 }
38505
38506 add_insn_window (insn, window_list, insn_num_uops);
38507 }
38508 else
38509 gcc_unreachable ();
38510
38511 if (is_end_basic_block (insn_group))
38512 {
38513 /* The end of the basic block is reached; do end-basic-block processing.  */
38514 process_end_window ();
38515 return;
38516 }
38517 }
38518
38519 /* Print the dispatch window, WINDOW_NUM, to FILE. */
38520
38521 DEBUG_FUNCTION static void
38522 debug_dispatch_window_file (FILE *file, int window_num)
38523 {
38524 dispatch_windows *list;
38525 int i;
38526
38527 if (window_num == 0)
38528 list = dispatch_window_list;
38529 else
38530 list = dispatch_window_list1;
38531
38532 fprintf (file, "Window #%d:\n", list->window_num);
38533 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
38534 list->num_insn, list->num_uops, list->window_size);
38535 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38536 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
38537
38538 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
38539 list->num_stores);
38540 fprintf (file, " insn info:\n");
38541
38542 for (i = 0; i < MAX_INSN; i++)
38543 {
38544 if (!list->window[i].insn)
38545 break;
38546 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
38547 i, group_name[list->window[i].group],
38548 i, (void *)list->window[i].insn,
38549 i, list->window[i].path,
38550 i, list->window[i].byte_len,
38551 i, list->window[i].imm_bytes);
38552 }
38553 }
38554
38555 /* Print to stdout a dispatch window. */
38556
38557 DEBUG_FUNCTION void
38558 debug_dispatch_window (int window_num)
38559 {
38560 debug_dispatch_window_file (stdout, window_num);
38561 }
38562
38563 /* Print INSN dispatch information to FILE. */
38564
38565 DEBUG_FUNCTION static void
38566 debug_insn_dispatch_info_file (FILE *file, rtx insn)
38567 {
38568 int byte_len;
38569 enum insn_path path;
38570 enum dispatch_group group;
38571 int imm_size;
38572 int num_imm_operand;
38573 int num_imm32_operand;
38574 int num_imm64_operand;
38575
38576 if (INSN_CODE (insn) < 0)
38577 return;
38578
38579 byte_len = min_insn_size (insn);
38580 path = get_insn_path (insn);
38581 group = get_insn_group (insn);
38582 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38583 &num_imm64_operand);
38584
38585 fprintf (file, " insn info:\n");
38586 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
38587 group_name[group], path, byte_len);
38588 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38589 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
38590 }
38591
38592 /* Print to STDERR the status of the ready list with respect to
38593 dispatch windows. */
38594
38595 DEBUG_FUNCTION void
38596 debug_ready_dispatch (void)
38597 {
38598 int i;
38599 int no_ready = number_in_ready ();
38600
38601 fprintf (stdout, "Number of ready: %d\n", no_ready);
38602
38603 for (i = 0; i < no_ready; i++)
38604 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
38605 }
38606
38607 /* This routine is the driver of the dispatch scheduler. */
38608
38609 static void
38610 do_dispatch (rtx insn, int mode)
38611 {
38612 if (mode == DISPATCH_INIT)
38613 init_dispatch_sched ();
38614 else if (mode == ADD_TO_DISPATCH_WINDOW)
38615 add_to_dispatch_window (insn);
38616 }
38617
38618 /* Return TRUE if Dispatch Scheduling is supported. */
38619
38620 static bool
38621 has_dispatch (rtx insn, int action)
38622 {
38623 if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
38624 && flag_dispatch_scheduler)
38625 switch (action)
38626 {
38627 default:
38628 return false;
38629
38630 case IS_DISPATCH_ON:
38631 return true;
38632 break;
38633
38634 case IS_CMP:
38635 return is_cmp (insn);
38636
38637 case DISPATCH_VIOLATION:
38638 return dispatch_violation ();
38639
38640 case FITS_DISPATCH_WINDOW:
38641 return fits_dispatch_window (insn);
38642 }
38643
38644 return false;
38645 }
38646
38647 /* Implementation of the reassociation_width target hook used by the
38648 reassoc phase to identify the parallelism level in a reassociated
38649 tree.  The statement's tree_code is passed in OPC.  The arguments'
38650 type is passed in MODE.
38651
38652 Currently parallel reassociation is enabled for Atom
38653 processors only and we set reassociation width to be 2
38654 because Atom may issue up to 2 instructions per cycle.
38655
38656 The return value should be adjusted if parallel reassociation is
38657 enabled for other processors. */
38658
38659 static int
38660 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
38661 enum machine_mode mode)
38662 {
38663 int res = 1;
38664
38665 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
38666 res = 2;
38667 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
38668 res = 2;
38669
38670 return res;
38671 }
38672
38673 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
38674 place emms and femms instructions. */
38675
38676 static enum machine_mode
38677 ix86_preferred_simd_mode (enum machine_mode mode)
38678 {
38679 if (!TARGET_SSE)
38680 return word_mode;
38681
38682 switch (mode)
38683 {
38684 case QImode:
38685 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
38686 case HImode:
38687 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
38688 case SImode:
38689 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
38690 case DImode:
38691 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
38692
38693 case SFmode:
38694 if (TARGET_AVX && !TARGET_PREFER_AVX128)
38695 return V8SFmode;
38696 else
38697 return V4SFmode;
38698
38699 case DFmode:
38700 if (!TARGET_VECTORIZE_DOUBLE)
38701 return word_mode;
38702 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
38703 return V4DFmode;
38704 else if (TARGET_SSE2)
38705 return V2DFmode;
38706 /* FALLTHRU */
38707
38708 default:
38709 return word_mode;
38710 }
38711 }
38712
38713 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
38714 vectors. */
38715
38716 static unsigned int
38717 ix86_autovectorize_vector_sizes (void)
38718 {
38719 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
38720 }

/* Initialize the GCC target structure.  */
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY ix86_return_in_memory

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
# undef TARGET_MERGE_DECL_ATTRIBUTES
# define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
#endif

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS ix86_init_builtins
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL ix86_builtin_decl
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN ix86_expand_builtin

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  ix86_builtin_vectorized_function

#undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
#define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load

#undef TARGET_VECTORIZE_BUILTIN_TM_STORE
#define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store

#undef TARGET_VECTORIZE_BUILTIN_GATHER
#define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal

#undef TARGET_ASM_FUNCTION_EPILOGUE
#define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue

#undef TARGET_ENCODE_SECTION_INFO
#ifndef SUBTARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
#else
#define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
#endif

#undef TARGET_ASM_OPEN_PAREN
#define TARGET_ASM_OPEN_PAREN ""
#undef TARGET_ASM_CLOSE_PAREN
#define TARGET_ASM_CLOSE_PAREN ""

#undef TARGET_ASM_BYTE_OP
#define TARGET_ASM_BYTE_OP ASM_BYTE

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
#ifdef ASM_QUAD
#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
#endif

#undef TARGET_PROFILE_BEFORE_PROLOGUE
#define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue

#undef TARGET_ASM_UNALIGNED_HI_OP
#define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
#undef TARGET_ASM_UNALIGNED_SI_OP
#define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
#undef TARGET_ASM_UNALIGNED_DI_OP
#define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND ix86_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra

#undef TARGET_SCHED_INIT_GLOBAL
#define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  ia32_multipass_dfa_lookahead

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall

#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS true
#endif
#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true

#undef TARGET_DELEGITIMIZE_ADDRESS
#define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address

#undef TARGET_MS_BITFIELD_LAYOUT_P
#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p

#if TARGET_MACHO
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P darwin_binds_local_p
#endif
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
#endif

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START x86_file_start

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE ix86_option_override

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS ix86_rtx_costs
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
#undef TARGET_CC_MODES_COMPATIBLE
#define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg

#undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
#define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list

#undef TARGET_ENUM_VA_LIST_P
#define TARGET_ENUM_VA_LIST_P ix86_enum_va_list

#undef TARGET_FN_ABI_VA_LIST
#define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list

#undef TARGET_CANONICAL_VA_LIST_TYPE
#define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start

#undef TARGET_MD_ASM_CLOBBERS
#define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers

#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix

#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif

#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type

#if !TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  ix86_vectorize_vec_perm_const_ok
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_EXPAND_TO_RTL_HOOK
#define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#if TARGET_MACHO
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS darwin_rename_builtins
#endif

struct gcc_target targetm = TARGET_INITIALIZER;
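
/* Illustrative note (an assumption about the surrounding framework):
   each TARGET_* macro defined above overrides the corresponding
   default from target-def.h, and TARGET_INITIALIZER expands to an
   aggregate initializer built from those macros.  Middle-end code
   then reaches the implementations in this file through the hook
   vector, e.g. a call along the lines of

	issue_rate = targetm.sched.issue_rate ();

   ends up invoking ix86_issue_rate.  */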
\f
#include "gt-i386.h"