i386.c (ix86_modes_tieable_p): Handle 32bit AVX modes.
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "reload.h"
51 #include "cgraph.h"
52 #include "gimple.h"
53 #include "dwarf2.h"
54 #include "df.h"
55 #include "tm-constrs.h"
56 #include "params.h"
57 #include "cselib.h"
58 #include "debug.h"
59 #include "sched-int.h"
60 #include "sbitmap.h"
61 #include "fibheap.h"
62 #include "opts.h"
63 #include "diagnostic.h"
64
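/* Possible states of the upper 128 bits of the AVX registers, used as the
   dataflow value by the vzeroupper optimization below: they may be in use,
   known to be unused, or in an unknown state.  */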
65 enum upper_128bits_state
66 {
67 unknown = 0,
68 unused,
69 used
70 };
71
72 typedef struct block_info_def
73 {
74 /* State of the upper 128bits of AVX registers at exit. */
75 enum upper_128bits_state state;
76 /* TRUE if state of the upper 128bits of AVX registers is unchanged
77 in this block. */
78 bool unchanged;
79 /* TRUE if block has been processed. */
80 bool processed;
81 /* TRUE if block has been scanned. */
82 bool scanned;
83 /* Previous state of the upper 128bits of AVX registers at entry. */
84 enum upper_128bits_state prev;
85 } *block_info;
86
87 #define BLOCK_INFO(B) ((block_info) (B)->aux)
88
89 enum call_avx256_state
90 {
91 /* Callee returns 256bit AVX register. */
92 callee_return_avx256 = -1,
93 /* Callee returns and passes 256bit AVX register. */
94 callee_return_pass_avx256,
95 /* Callee passes 256bit AVX register. */
96 callee_pass_avx256,
 97 /* Callee doesn't return nor pass 256bit AVX register, or no
98 256bit AVX register in function return. */
99 call_no_avx256,
100 /* vzeroupper intrinsic. */
101 vzeroupper_intrinsic
102 };
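/* The call_avx256_state value is read back from operand 0 of the vzeroupper
   UNSPEC_VOLATILE pattern via INTVAL (XVECEXP (pat, 0, 0)) in
   move_or_delete_vzeroupper_2 below and compared against these values.  */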
103
104 /* Check if a 256bit AVX register is referenced in stores. */
105
106 static void
107 check_avx256_stores (rtx dest, const_rtx set, void *data)
108 {
109 if ((REG_P (dest)
110 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
111 || (GET_CODE (set) == SET
112 && REG_P (SET_SRC (set))
113 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
114 {
115 enum upper_128bits_state *state
116 = (enum upper_128bits_state *) data;
117 *state = used;
118 }
119 }
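/* check_avx256_stores is the callback handed to note_stores in
   move_or_delete_vzeroupper_2 below; DATA points at the current
   upper_128bits_state, which is flipped to `used' when such a reference
   is seen.  */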
120
121 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
122 in basic block BB. Delete it if upper 128bit AVX registers are
123 unused. If it isn't deleted, move it to just before a jump insn.
124
125 STATE is state of the upper 128bits of AVX registers at entry. */
126
127 static void
128 move_or_delete_vzeroupper_2 (basic_block bb,
129 enum upper_128bits_state state)
130 {
131 rtx insn, bb_end;
132 rtx vzeroupper_insn = NULL_RTX;
133 rtx pat;
134 int avx256;
135 bool unchanged;
136
137 if (BLOCK_INFO (bb)->unchanged)
138 {
139 if (dump_file)
140 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
141 bb->index, state);
142
143 BLOCK_INFO (bb)->state = state;
144 return;
145 }
146
147 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
148 {
149 if (dump_file)
150 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
151 bb->index, BLOCK_INFO (bb)->state);
152 return;
153 }
154
155 BLOCK_INFO (bb)->prev = state;
156
157 if (dump_file)
158 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
159 bb->index, state);
160
161 unchanged = true;
162
163 /* BB_END changes when it is deleted. */
164 bb_end = BB_END (bb);
165 insn = BB_HEAD (bb);
166 while (insn != bb_end)
167 {
168 insn = NEXT_INSN (insn);
169
170 if (!NONDEBUG_INSN_P (insn))
171 continue;
172
173 /* Move vzeroupper before jump/call. */
174 if (JUMP_P (insn) || CALL_P (insn))
175 {
176 if (!vzeroupper_insn)
177 continue;
178
179 if (PREV_INSN (insn) != vzeroupper_insn)
180 {
181 if (dump_file)
182 {
183 fprintf (dump_file, "Move vzeroupper after:\n");
184 print_rtl_single (dump_file, PREV_INSN (insn));
185 fprintf (dump_file, "before:\n");
186 print_rtl_single (dump_file, insn);
187 }
188 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
189 PREV_INSN (insn));
190 }
191 vzeroupper_insn = NULL_RTX;
192 continue;
193 }
194
195 pat = PATTERN (insn);
196
197 /* Check insn for vzeroupper intrinsic. */
198 if (GET_CODE (pat) == UNSPEC_VOLATILE
199 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
200 {
201 if (dump_file)
202 {
203 /* Found vzeroupper intrinsic. */
204 fprintf (dump_file, "Found vzeroupper:\n");
205 print_rtl_single (dump_file, insn);
206 }
207 }
208 else
209 {
210 /* Check insn for vzeroall intrinsic. */
211 if (GET_CODE (pat) == PARALLEL
212 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
213 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
214 {
215 state = unused;
216 unchanged = false;
217
218 /* Delete pending vzeroupper insertion. */
219 if (vzeroupper_insn)
220 {
221 delete_insn (vzeroupper_insn);
222 vzeroupper_insn = NULL_RTX;
223 }
224 }
225 else if (state != used)
226 {
227 note_stores (pat, check_avx256_stores, &state);
228 if (state == used)
229 unchanged = false;
230 }
231 continue;
232 }
233
234 /* Process vzeroupper intrinsic. */
235 avx256 = INTVAL (XVECEXP (pat, 0, 0));
236
237 if (state == unused)
238 {
239 /* Since the upper 128bits are cleared, callee must not pass
240 256bit AVX register. We only need to check if callee
241 returns 256bit AVX register. */
242 if (avx256 == callee_return_avx256)
243 {
244 state = used;
245 unchanged = false;
246 }
247
248 /* Remove unnecessary vzeroupper since upper 128bits are
249 cleared. */
250 if (dump_file)
251 {
252 fprintf (dump_file, "Delete redundant vzeroupper:\n");
253 print_rtl_single (dump_file, insn);
254 }
255 delete_insn (insn);
256 }
257 else
258 {
259 /* Set state to UNUSED if callee doesn't return 256bit AVX
260 register. */
261 if (avx256 != callee_return_pass_avx256)
262 state = unused;
263
264 if (avx256 == callee_return_pass_avx256
265 || avx256 == callee_pass_avx256)
266 {
267 /* Must remove vzeroupper since callee passes in 256bit
268 AVX register. */
269 if (dump_file)
270 {
271 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
272 print_rtl_single (dump_file, insn);
273 }
274 delete_insn (insn);
275 }
276 else
277 {
278 vzeroupper_insn = insn;
279 unchanged = false;
280 }
281 }
282 }
283
284 BLOCK_INFO (bb)->state = state;
285 BLOCK_INFO (bb)->unchanged = unchanged;
286 BLOCK_INFO (bb)->scanned = true;
287
288 if (dump_file)
289 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
290 bb->index, unchanged ? "unchanged" : "changed",
291 state);
292 }
293
294 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
295 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
 296 as UNUSED if UNKNOWN_IS_UNUSED is true. Return TRUE if the exit
297 state is changed. */
298
299 static bool
300 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
301 {
302 edge e;
303 edge_iterator ei;
304 enum upper_128bits_state state, old_state, new_state;
305 bool seen_unknown;
306
307 if (dump_file)
308 fprintf (dump_file, " Process [bb %i]: status: %d\n",
309 block->index, BLOCK_INFO (block)->processed);
310
311 if (BLOCK_INFO (block)->processed)
312 return false;
313
314 state = unused;
315
316 /* Check all predecessor edges of this block. */
317 seen_unknown = false;
318 FOR_EACH_EDGE (e, ei, block->preds)
319 {
320 if (e->src == block)
321 continue;
322 switch (BLOCK_INFO (e->src)->state)
323 {
324 case unknown:
325 if (!unknown_is_unused)
326 seen_unknown = true;
327 case unused:
328 break;
329 case used:
330 state = used;
331 goto done;
332 }
333 }
334
335 if (seen_unknown)
336 state = unknown;
337
338 done:
339 old_state = BLOCK_INFO (block)->state;
340 move_or_delete_vzeroupper_2 (block, state);
341 new_state = BLOCK_INFO (block)->state;
342
343 if (state != unknown || new_state == used)
344 BLOCK_INFO (block)->processed = true;
345
346 /* Need to rescan if the upper 128bits of AVX registers are changed
347 to USED at exit. */
348 if (new_state != old_state)
349 {
350 if (new_state == used)
351 cfun->machine->rescan_vzeroupper_p = 1;
352 return true;
353 }
354 else
355 return false;
356 }
357
358 /* Go through the instruction stream looking for vzeroupper. Delete
359 it if upper 128bit AVX registers are unused. If it isn't deleted,
360 move it to just before a jump insn. */
361
362 static void
363 move_or_delete_vzeroupper (void)
364 {
365 edge e;
366 edge_iterator ei;
367 basic_block bb;
368 fibheap_t worklist, pending, fibheap_swap;
369 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
370 int *bb_order;
371 int *rc_order;
372 int i;
373
374 /* Set up block info for each basic block. */
375 alloc_aux_for_blocks (sizeof (struct block_info_def));
376
377 /* Process outgoing edges of entry point. */
378 if (dump_file)
379 fprintf (dump_file, "Process outgoing edges of entry point\n");
380
381 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
382 {
383 move_or_delete_vzeroupper_2 (e->dest,
384 cfun->machine->caller_pass_avx256_p
385 ? used : unused);
386 BLOCK_INFO (e->dest)->processed = true;
387 }
388
389 /* Compute reverse completion order of depth first search of the CFG
390 so that the data-flow runs faster. */
391 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
392 bb_order = XNEWVEC (int, last_basic_block);
393 pre_and_rev_post_order_compute (NULL, rc_order, false);
394 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
395 bb_order[rc_order[i]] = i;
396 free (rc_order);
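/* bb_order[] now maps a basic block index to its position in the reverse
   completion order; it is used below as the fibonacci heap key so that
   blocks are extracted in approximately dataflow order.  */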
397
398 worklist = fibheap_new ();
399 pending = fibheap_new ();
400 visited = sbitmap_alloc (last_basic_block);
401 in_worklist = sbitmap_alloc (last_basic_block);
402 in_pending = sbitmap_alloc (last_basic_block);
403 sbitmap_zero (in_worklist);
404
405 /* Don't check outgoing edges of entry point. */
406 sbitmap_ones (in_pending);
407 FOR_EACH_BB (bb)
408 if (BLOCK_INFO (bb)->processed)
409 RESET_BIT (in_pending, bb->index);
410 else
411 {
412 move_or_delete_vzeroupper_1 (bb, false);
413 fibheap_insert (pending, bb_order[bb->index], bb);
414 }
415
416 if (dump_file)
417 fprintf (dump_file, "Check remaining basic blocks\n");
418
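/* Two-worklist iteration: drain WORKLIST for the current round.  When a
   block's exit state changes, its unprocessed successors are queued either
   on WORKLIST (if not yet visited this round) or on PENDING (for the next
   round).  Stop once a round completes without setting
   rescan_vzeroupper_p.  */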
419 while (!fibheap_empty (pending))
420 {
421 fibheap_swap = pending;
422 pending = worklist;
423 worklist = fibheap_swap;
424 sbitmap_swap = in_pending;
425 in_pending = in_worklist;
426 in_worklist = sbitmap_swap;
427
428 sbitmap_zero (visited);
429
430 cfun->machine->rescan_vzeroupper_p = 0;
431
432 while (!fibheap_empty (worklist))
433 {
434 bb = (basic_block) fibheap_extract_min (worklist);
435 RESET_BIT (in_worklist, bb->index);
436 gcc_assert (!TEST_BIT (visited, bb->index));
437 if (!TEST_BIT (visited, bb->index))
438 {
439 edge_iterator ei;
440
441 SET_BIT (visited, bb->index);
442
443 if (move_or_delete_vzeroupper_1 (bb, false))
444 FOR_EACH_EDGE (e, ei, bb->succs)
445 {
446 if (e->dest == EXIT_BLOCK_PTR
447 || BLOCK_INFO (e->dest)->processed)
448 continue;
449
450 if (TEST_BIT (visited, e->dest->index))
451 {
452 if (!TEST_BIT (in_pending, e->dest->index))
453 {
454 /* Send E->DEST to next round. */
455 SET_BIT (in_pending, e->dest->index);
456 fibheap_insert (pending,
457 bb_order[e->dest->index],
458 e->dest);
459 }
460 }
461 else if (!TEST_BIT (in_worklist, e->dest->index))
462 {
463 /* Add E->DEST to current round. */
464 SET_BIT (in_worklist, e->dest->index);
465 fibheap_insert (worklist, bb_order[e->dest->index],
466 e->dest);
467 }
468 }
469 }
470 }
471
472 if (!cfun->machine->rescan_vzeroupper_p)
473 break;
474 }
475
476 free (bb_order);
477 fibheap_delete (worklist);
478 fibheap_delete (pending);
479 sbitmap_free (visited);
480 sbitmap_free (in_worklist);
481 sbitmap_free (in_pending);
482
483 if (dump_file)
484 fprintf (dump_file, "Process remaining basic blocks\n");
485
486 FOR_EACH_BB (bb)
487 move_or_delete_vzeroupper_1 (bb, true);
488
489 free_aux_for_blocks ();
490 }
491
492 static rtx legitimize_dllimport_symbol (rtx, bool);
493
494 #ifndef CHECK_STACK_LIMIT
495 #define CHECK_STACK_LIMIT (-1)
496 #endif
497
498 /* Return index of given mode in mult and division cost tables. */
499 #define MODE_INDEX(mode) \
500 ((mode) == QImode ? 0 \
501 : (mode) == HImode ? 1 \
502 : (mode) == SImode ? 2 \
503 : (mode) == DImode ? 3 \
504 : 4)
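/* For example, MODE_INDEX (SImode) is 2, which selects the SI entry of the
   five-element multiply and divide cost arrays in the processor_costs
   tables below.  */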
505
506 /* Processor costs (relative to an add) */
507 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
508 #define COSTS_N_BYTES(N) ((N) * 2)
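/* Under those assumptions COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so in
   ix86_size_cost below a two-byte add costs the same as a single insn does
   in the speed-tuned tables.  */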
509
510 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
511
512 const
513 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
514 COSTS_N_BYTES (2), /* cost of an add instruction */
515 COSTS_N_BYTES (3), /* cost of a lea instruction */
516 COSTS_N_BYTES (2), /* variable shift costs */
517 COSTS_N_BYTES (3), /* constant shift costs */
518 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
519 COSTS_N_BYTES (3), /* HI */
520 COSTS_N_BYTES (3), /* SI */
521 COSTS_N_BYTES (3), /* DI */
522 COSTS_N_BYTES (5)}, /* other */
523 0, /* cost of multiply per each bit set */
524 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
525 COSTS_N_BYTES (3), /* HI */
526 COSTS_N_BYTES (3), /* SI */
527 COSTS_N_BYTES (3), /* DI */
528 COSTS_N_BYTES (5)}, /* other */
529 COSTS_N_BYTES (3), /* cost of movsx */
530 COSTS_N_BYTES (3), /* cost of movzx */
531 0, /* "large" insn */
532 2, /* MOVE_RATIO */
533 2, /* cost for loading QImode using movzbl */
534 {2, 2, 2}, /* cost of loading integer registers
535 in QImode, HImode and SImode.
536 Relative to reg-reg move (2). */
537 {2, 2, 2}, /* cost of storing integer registers */
538 2, /* cost of reg,reg fld/fst */
539 {2, 2, 2}, /* cost of loading fp registers
540 in SFmode, DFmode and XFmode */
541 {2, 2, 2}, /* cost of storing fp registers
542 in SFmode, DFmode and XFmode */
543 3, /* cost of moving MMX register */
544 {3, 3}, /* cost of loading MMX registers
545 in SImode and DImode */
546 {3, 3}, /* cost of storing MMX registers
547 in SImode and DImode */
548 3, /* cost of moving SSE register */
549 {3, 3, 3}, /* cost of loading SSE registers
550 in SImode, DImode and TImode */
551 {3, 3, 3}, /* cost of storing SSE registers
552 in SImode, DImode and TImode */
553 3, /* MMX or SSE register to integer */
554 0, /* size of l1 cache */
555 0, /* size of l2 cache */
556 0, /* size of prefetch block */
557 0, /* number of parallel prefetches */
558 2, /* Branch cost */
559 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
560 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
561 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
562 COSTS_N_BYTES (2), /* cost of FABS instruction. */
563 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
564 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
565 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
566 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
567 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
568 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
569 1, /* scalar_stmt_cost. */
570 1, /* scalar load_cost. */
571 1, /* scalar_store_cost. */
572 1, /* vec_stmt_cost. */
573 1, /* vec_to_scalar_cost. */
574 1, /* scalar_to_vec_cost. */
575 1, /* vec_align_load_cost. */
576 1, /* vec_unalign_load_cost. */
577 1, /* vec_store_cost. */
578 1, /* cond_taken_branch_cost. */
579 1, /* cond_not_taken_branch_cost. */
580 };
581
582 /* Processor costs (relative to an add) */
583 static const
584 struct processor_costs i386_cost = { /* 386 specific costs */
585 COSTS_N_INSNS (1), /* cost of an add instruction */
586 COSTS_N_INSNS (1), /* cost of a lea instruction */
587 COSTS_N_INSNS (3), /* variable shift costs */
588 COSTS_N_INSNS (2), /* constant shift costs */
589 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
590 COSTS_N_INSNS (6), /* HI */
591 COSTS_N_INSNS (6), /* SI */
592 COSTS_N_INSNS (6), /* DI */
593 COSTS_N_INSNS (6)}, /* other */
594 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
595 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
596 COSTS_N_INSNS (23), /* HI */
597 COSTS_N_INSNS (23), /* SI */
598 COSTS_N_INSNS (23), /* DI */
599 COSTS_N_INSNS (23)}, /* other */
600 COSTS_N_INSNS (3), /* cost of movsx */
601 COSTS_N_INSNS (2), /* cost of movzx */
602 15, /* "large" insn */
603 3, /* MOVE_RATIO */
604 4, /* cost for loading QImode using movzbl */
605 {2, 4, 2}, /* cost of loading integer registers
606 in QImode, HImode and SImode.
607 Relative to reg-reg move (2). */
608 {2, 4, 2}, /* cost of storing integer registers */
609 2, /* cost of reg,reg fld/fst */
610 {8, 8, 8}, /* cost of loading fp registers
611 in SFmode, DFmode and XFmode */
612 {8, 8, 8}, /* cost of storing fp registers
613 in SFmode, DFmode and XFmode */
614 2, /* cost of moving MMX register */
615 {4, 8}, /* cost of loading MMX registers
616 in SImode and DImode */
617 {4, 8}, /* cost of storing MMX registers
618 in SImode and DImode */
619 2, /* cost of moving SSE register */
620 {4, 8, 16}, /* cost of loading SSE registers
621 in SImode, DImode and TImode */
622 {4, 8, 16}, /* cost of storing SSE registers
623 in SImode, DImode and TImode */
624 3, /* MMX or SSE register to integer */
625 0, /* size of l1 cache */
626 0, /* size of l2 cache */
627 0, /* size of prefetch block */
628 0, /* number of parallel prefetches */
629 1, /* Branch cost */
630 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
631 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
632 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
633 COSTS_N_INSNS (22), /* cost of FABS instruction. */
634 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
635 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
636 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
637 DUMMY_STRINGOP_ALGS},
638 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
639 DUMMY_STRINGOP_ALGS},
640 1, /* scalar_stmt_cost. */
641 1, /* scalar load_cost. */
642 1, /* scalar_store_cost. */
643 1, /* vec_stmt_cost. */
644 1, /* vec_to_scalar_cost. */
645 1, /* scalar_to_vec_cost. */
646 1, /* vec_align_load_cost. */
647 2, /* vec_unalign_load_cost. */
648 1, /* vec_store_cost. */
649 3, /* cond_taken_branch_cost. */
650 1, /* cond_not_taken_branch_cost. */
651 };
652
653 static const
654 struct processor_costs i486_cost = { /* 486 specific costs */
655 COSTS_N_INSNS (1), /* cost of an add instruction */
656 COSTS_N_INSNS (1), /* cost of a lea instruction */
657 COSTS_N_INSNS (3), /* variable shift costs */
658 COSTS_N_INSNS (2), /* constant shift costs */
659 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
660 COSTS_N_INSNS (12), /* HI */
661 COSTS_N_INSNS (12), /* SI */
662 COSTS_N_INSNS (12), /* DI */
663 COSTS_N_INSNS (12)}, /* other */
664 1, /* cost of multiply per each bit set */
665 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
666 COSTS_N_INSNS (40), /* HI */
667 COSTS_N_INSNS (40), /* SI */
668 COSTS_N_INSNS (40), /* DI */
669 COSTS_N_INSNS (40)}, /* other */
670 COSTS_N_INSNS (3), /* cost of movsx */
671 COSTS_N_INSNS (2), /* cost of movzx */
672 15, /* "large" insn */
673 3, /* MOVE_RATIO */
674 4, /* cost for loading QImode using movzbl */
675 {2, 4, 2}, /* cost of loading integer registers
676 in QImode, HImode and SImode.
677 Relative to reg-reg move (2). */
678 {2, 4, 2}, /* cost of storing integer registers */
679 2, /* cost of reg,reg fld/fst */
680 {8, 8, 8}, /* cost of loading fp registers
681 in SFmode, DFmode and XFmode */
682 {8, 8, 8}, /* cost of storing fp registers
683 in SFmode, DFmode and XFmode */
684 2, /* cost of moving MMX register */
685 {4, 8}, /* cost of loading MMX registers
686 in SImode and DImode */
687 {4, 8}, /* cost of storing MMX registers
688 in SImode and DImode */
689 2, /* cost of moving SSE register */
690 {4, 8, 16}, /* cost of loading SSE registers
691 in SImode, DImode and TImode */
692 {4, 8, 16}, /* cost of storing SSE registers
693 in SImode, DImode and TImode */
694 3, /* MMX or SSE register to integer */
695 4, /* size of l1 cache. 486 has 8kB cache
696 shared for code and data, so 4kB is
697 not really precise. */
698 4, /* size of l2 cache */
699 0, /* size of prefetch block */
700 0, /* number of parallel prefetches */
701 1, /* Branch cost */
702 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
703 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
704 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
705 COSTS_N_INSNS (3), /* cost of FABS instruction. */
706 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
707 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
708 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
709 DUMMY_STRINGOP_ALGS},
710 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
711 DUMMY_STRINGOP_ALGS},
712 1, /* scalar_stmt_cost. */
713 1, /* scalar load_cost. */
714 1, /* scalar_store_cost. */
715 1, /* vec_stmt_cost. */
716 1, /* vec_to_scalar_cost. */
717 1, /* scalar_to_vec_cost. */
718 1, /* vec_align_load_cost. */
719 2, /* vec_unalign_load_cost. */
720 1, /* vec_store_cost. */
721 3, /* cond_taken_branch_cost. */
722 1, /* cond_not_taken_branch_cost. */
723 };
724
725 static const
726 struct processor_costs pentium_cost = {
727 COSTS_N_INSNS (1), /* cost of an add instruction */
728 COSTS_N_INSNS (1), /* cost of a lea instruction */
729 COSTS_N_INSNS (4), /* variable shift costs */
730 COSTS_N_INSNS (1), /* constant shift costs */
731 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
732 COSTS_N_INSNS (11), /* HI */
733 COSTS_N_INSNS (11), /* SI */
734 COSTS_N_INSNS (11), /* DI */
735 COSTS_N_INSNS (11)}, /* other */
736 0, /* cost of multiply per each bit set */
737 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
738 COSTS_N_INSNS (25), /* HI */
739 COSTS_N_INSNS (25), /* SI */
740 COSTS_N_INSNS (25), /* DI */
741 COSTS_N_INSNS (25)}, /* other */
742 COSTS_N_INSNS (3), /* cost of movsx */
743 COSTS_N_INSNS (2), /* cost of movzx */
744 8, /* "large" insn */
745 6, /* MOVE_RATIO */
746 6, /* cost for loading QImode using movzbl */
747 {2, 4, 2}, /* cost of loading integer registers
748 in QImode, HImode and SImode.
749 Relative to reg-reg move (2). */
750 {2, 4, 2}, /* cost of storing integer registers */
751 2, /* cost of reg,reg fld/fst */
752 {2, 2, 6}, /* cost of loading fp registers
753 in SFmode, DFmode and XFmode */
754 {4, 4, 6}, /* cost of storing fp registers
755 in SFmode, DFmode and XFmode */
756 8, /* cost of moving MMX register */
757 {8, 8}, /* cost of loading MMX registers
758 in SImode and DImode */
759 {8, 8}, /* cost of storing MMX registers
760 in SImode and DImode */
761 2, /* cost of moving SSE register */
762 {4, 8, 16}, /* cost of loading SSE registers
763 in SImode, DImode and TImode */
764 {4, 8, 16}, /* cost of storing SSE registers
765 in SImode, DImode and TImode */
766 3, /* MMX or SSE register to integer */
767 8, /* size of l1 cache. */
768 8, /* size of l2 cache */
769 0, /* size of prefetch block */
770 0, /* number of parallel prefetches */
771 2, /* Branch cost */
772 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
773 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
774 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
775 COSTS_N_INSNS (1), /* cost of FABS instruction. */
776 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
777 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
778 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
779 DUMMY_STRINGOP_ALGS},
780 {{libcall, {{-1, rep_prefix_4_byte}}},
781 DUMMY_STRINGOP_ALGS},
782 1, /* scalar_stmt_cost. */
783 1, /* scalar load_cost. */
784 1, /* scalar_store_cost. */
785 1, /* vec_stmt_cost. */
786 1, /* vec_to_scalar_cost. */
787 1, /* scalar_to_vec_cost. */
788 1, /* vec_align_load_cost. */
789 2, /* vec_unalign_load_cost. */
790 1, /* vec_store_cost. */
791 3, /* cond_taken_branch_cost. */
792 1, /* cond_not_taken_branch_cost. */
793 };
794
795 static const
796 struct processor_costs pentiumpro_cost = {
797 COSTS_N_INSNS (1), /* cost of an add instruction */
798 COSTS_N_INSNS (1), /* cost of a lea instruction */
799 COSTS_N_INSNS (1), /* variable shift costs */
800 COSTS_N_INSNS (1), /* constant shift costs */
801 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
802 COSTS_N_INSNS (4), /* HI */
803 COSTS_N_INSNS (4), /* SI */
804 COSTS_N_INSNS (4), /* DI */
805 COSTS_N_INSNS (4)}, /* other */
806 0, /* cost of multiply per each bit set */
807 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
808 COSTS_N_INSNS (17), /* HI */
809 COSTS_N_INSNS (17), /* SI */
810 COSTS_N_INSNS (17), /* DI */
811 COSTS_N_INSNS (17)}, /* other */
812 COSTS_N_INSNS (1), /* cost of movsx */
813 COSTS_N_INSNS (1), /* cost of movzx */
814 8, /* "large" insn */
815 6, /* MOVE_RATIO */
816 2, /* cost for loading QImode using movzbl */
817 {4, 4, 4}, /* cost of loading integer registers
818 in QImode, HImode and SImode.
819 Relative to reg-reg move (2). */
820 {2, 2, 2}, /* cost of storing integer registers */
821 2, /* cost of reg,reg fld/fst */
822 {2, 2, 6}, /* cost of loading fp registers
823 in SFmode, DFmode and XFmode */
824 {4, 4, 6}, /* cost of storing fp registers
825 in SFmode, DFmode and XFmode */
826 2, /* cost of moving MMX register */
827 {2, 2}, /* cost of loading MMX registers
828 in SImode and DImode */
829 {2, 2}, /* cost of storing MMX registers
830 in SImode and DImode */
831 2, /* cost of moving SSE register */
832 {2, 2, 8}, /* cost of loading SSE registers
833 in SImode, DImode and TImode */
834 {2, 2, 8}, /* cost of storing SSE registers
835 in SImode, DImode and TImode */
836 3, /* MMX or SSE register to integer */
837 8, /* size of l1 cache. */
838 256, /* size of l2 cache */
839 32, /* size of prefetch block */
840 6, /* number of parallel prefetches */
841 2, /* Branch cost */
842 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
843 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
844 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
845 COSTS_N_INSNS (2), /* cost of FABS instruction. */
846 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
847 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
848 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
849 (we ensure the alignment). For small blocks inline loop is still a
850 noticeable win, for bigger blocks either rep movsl or rep movsb is
 851 the way to go. Rep movsb has apparently more expensive startup time in CPU,
852 but after 4K the difference is down in the noise. */
853 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
854 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
855 DUMMY_STRINGOP_ALGS},
856 {{rep_prefix_4_byte, {{1024, unrolled_loop},
857 {8192, rep_prefix_4_byte}, {-1, libcall}}},
858 DUMMY_STRINGOP_ALGS},
859 1, /* scalar_stmt_cost. */
860 1, /* scalar load_cost. */
861 1, /* scalar_store_cost. */
862 1, /* vec_stmt_cost. */
863 1, /* vec_to_scalar_cost. */
864 1, /* scalar_to_vec_cost. */
865 1, /* vec_align_load_cost. */
866 2, /* vec_unalign_load_cost. */
867 1, /* vec_store_cost. */
868 3, /* cond_taken_branch_cost. */
869 1, /* cond_not_taken_branch_cost. */
870 };
871
872 static const
873 struct processor_costs geode_cost = {
874 COSTS_N_INSNS (1), /* cost of an add instruction */
875 COSTS_N_INSNS (1), /* cost of a lea instruction */
876 COSTS_N_INSNS (2), /* variable shift costs */
877 COSTS_N_INSNS (1), /* constant shift costs */
878 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
879 COSTS_N_INSNS (4), /* HI */
880 COSTS_N_INSNS (7), /* SI */
881 COSTS_N_INSNS (7), /* DI */
882 COSTS_N_INSNS (7)}, /* other */
883 0, /* cost of multiply per each bit set */
884 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
885 COSTS_N_INSNS (23), /* HI */
886 COSTS_N_INSNS (39), /* SI */
887 COSTS_N_INSNS (39), /* DI */
888 COSTS_N_INSNS (39)}, /* other */
889 COSTS_N_INSNS (1), /* cost of movsx */
890 COSTS_N_INSNS (1), /* cost of movzx */
891 8, /* "large" insn */
892 4, /* MOVE_RATIO */
893 1, /* cost for loading QImode using movzbl */
894 {1, 1, 1}, /* cost of loading integer registers
895 in QImode, HImode and SImode.
896 Relative to reg-reg move (2). */
897 {1, 1, 1}, /* cost of storing integer registers */
898 1, /* cost of reg,reg fld/fst */
899 {1, 1, 1}, /* cost of loading fp registers
900 in SFmode, DFmode and XFmode */
901 {4, 6, 6}, /* cost of storing fp registers
902 in SFmode, DFmode and XFmode */
903
904 1, /* cost of moving MMX register */
905 {1, 1}, /* cost of loading MMX registers
906 in SImode and DImode */
907 {1, 1}, /* cost of storing MMX registers
908 in SImode and DImode */
909 1, /* cost of moving SSE register */
910 {1, 1, 1}, /* cost of loading SSE registers
911 in SImode, DImode and TImode */
912 {1, 1, 1}, /* cost of storing SSE registers
913 in SImode, DImode and TImode */
914 1, /* MMX or SSE register to integer */
915 64, /* size of l1 cache. */
916 128, /* size of l2 cache. */
917 32, /* size of prefetch block */
918 1, /* number of parallel prefetches */
919 1, /* Branch cost */
920 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
921 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
922 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
923 COSTS_N_INSNS (1), /* cost of FABS instruction. */
924 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
925 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
926 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
927 DUMMY_STRINGOP_ALGS},
928 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
929 DUMMY_STRINGOP_ALGS},
930 1, /* scalar_stmt_cost. */
931 1, /* scalar load_cost. */
932 1, /* scalar_store_cost. */
933 1, /* vec_stmt_cost. */
934 1, /* vec_to_scalar_cost. */
935 1, /* scalar_to_vec_cost. */
936 1, /* vec_align_load_cost. */
937 2, /* vec_unalign_load_cost. */
938 1, /* vec_store_cost. */
939 3, /* cond_taken_branch_cost. */
940 1, /* cond_not_taken_branch_cost. */
941 };
942
943 static const
944 struct processor_costs k6_cost = {
945 COSTS_N_INSNS (1), /* cost of an add instruction */
946 COSTS_N_INSNS (2), /* cost of a lea instruction */
947 COSTS_N_INSNS (1), /* variable shift costs */
948 COSTS_N_INSNS (1), /* constant shift costs */
949 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
950 COSTS_N_INSNS (3), /* HI */
951 COSTS_N_INSNS (3), /* SI */
952 COSTS_N_INSNS (3), /* DI */
953 COSTS_N_INSNS (3)}, /* other */
954 0, /* cost of multiply per each bit set */
955 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
956 COSTS_N_INSNS (18), /* HI */
957 COSTS_N_INSNS (18), /* SI */
958 COSTS_N_INSNS (18), /* DI */
959 COSTS_N_INSNS (18)}, /* other */
960 COSTS_N_INSNS (2), /* cost of movsx */
961 COSTS_N_INSNS (2), /* cost of movzx */
962 8, /* "large" insn */
963 4, /* MOVE_RATIO */
964 3, /* cost for loading QImode using movzbl */
965 {4, 5, 4}, /* cost of loading integer registers
966 in QImode, HImode and SImode.
967 Relative to reg-reg move (2). */
968 {2, 3, 2}, /* cost of storing integer registers */
969 4, /* cost of reg,reg fld/fst */
970 {6, 6, 6}, /* cost of loading fp registers
971 in SFmode, DFmode and XFmode */
972 {4, 4, 4}, /* cost of storing fp registers
973 in SFmode, DFmode and XFmode */
974 2, /* cost of moving MMX register */
975 {2, 2}, /* cost of loading MMX registers
976 in SImode and DImode */
977 {2, 2}, /* cost of storing MMX registers
978 in SImode and DImode */
979 2, /* cost of moving SSE register */
980 {2, 2, 8}, /* cost of loading SSE registers
981 in SImode, DImode and TImode */
982 {2, 2, 8}, /* cost of storing SSE registers
983 in SImode, DImode and TImode */
984 6, /* MMX or SSE register to integer */
985 32, /* size of l1 cache. */
986 32, /* size of l2 cache. Some models
987 have integrated l2 cache, but
988 optimizing for k6 is not important
989 enough to worry about that. */
990 32, /* size of prefetch block */
991 1, /* number of parallel prefetches */
992 1, /* Branch cost */
993 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
994 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
995 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
996 COSTS_N_INSNS (2), /* cost of FABS instruction. */
997 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
998 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
999 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1000 DUMMY_STRINGOP_ALGS},
1001 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1002 DUMMY_STRINGOP_ALGS},
1003 1, /* scalar_stmt_cost. */
1004 1, /* scalar load_cost. */
1005 1, /* scalar_store_cost. */
1006 1, /* vec_stmt_cost. */
1007 1, /* vec_to_scalar_cost. */
1008 1, /* scalar_to_vec_cost. */
1009 1, /* vec_align_load_cost. */
1010 2, /* vec_unalign_load_cost. */
1011 1, /* vec_store_cost. */
1012 3, /* cond_taken_branch_cost. */
1013 1, /* cond_not_taken_branch_cost. */
1014 };
1015
1016 static const
1017 struct processor_costs athlon_cost = {
1018 COSTS_N_INSNS (1), /* cost of an add instruction */
1019 COSTS_N_INSNS (2), /* cost of a lea instruction */
1020 COSTS_N_INSNS (1), /* variable shift costs */
1021 COSTS_N_INSNS (1), /* constant shift costs */
1022 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1023 COSTS_N_INSNS (5), /* HI */
1024 COSTS_N_INSNS (5), /* SI */
1025 COSTS_N_INSNS (5), /* DI */
1026 COSTS_N_INSNS (5)}, /* other */
1027 0, /* cost of multiply per each bit set */
1028 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1029 COSTS_N_INSNS (26), /* HI */
1030 COSTS_N_INSNS (42), /* SI */
1031 COSTS_N_INSNS (74), /* DI */
1032 COSTS_N_INSNS (74)}, /* other */
1033 COSTS_N_INSNS (1), /* cost of movsx */
1034 COSTS_N_INSNS (1), /* cost of movzx */
1035 8, /* "large" insn */
1036 9, /* MOVE_RATIO */
1037 4, /* cost for loading QImode using movzbl */
1038 {3, 4, 3}, /* cost of loading integer registers
1039 in QImode, HImode and SImode.
1040 Relative to reg-reg move (2). */
1041 {3, 4, 3}, /* cost of storing integer registers */
1042 4, /* cost of reg,reg fld/fst */
1043 {4, 4, 12}, /* cost of loading fp registers
1044 in SFmode, DFmode and XFmode */
1045 {6, 6, 8}, /* cost of storing fp registers
1046 in SFmode, DFmode and XFmode */
1047 2, /* cost of moving MMX register */
1048 {4, 4}, /* cost of loading MMX registers
1049 in SImode and DImode */
1050 {4, 4}, /* cost of storing MMX registers
1051 in SImode and DImode */
1052 2, /* cost of moving SSE register */
1053 {4, 4, 6}, /* cost of loading SSE registers
1054 in SImode, DImode and TImode */
1055 {4, 4, 5}, /* cost of storing SSE registers
1056 in SImode, DImode and TImode */
1057 5, /* MMX or SSE register to integer */
1058 64, /* size of l1 cache. */
1059 256, /* size of l2 cache. */
1060 64, /* size of prefetch block */
1061 6, /* number of parallel prefetches */
1062 5, /* Branch cost */
1063 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1064 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1065 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1066 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1067 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1068 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1069 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1070 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1071 128 bytes for memset. */
1072 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1073 DUMMY_STRINGOP_ALGS},
1074 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1075 DUMMY_STRINGOP_ALGS},
1076 1, /* scalar_stmt_cost. */
1077 1, /* scalar load_cost. */
1078 1, /* scalar_store_cost. */
1079 1, /* vec_stmt_cost. */
1080 1, /* vec_to_scalar_cost. */
1081 1, /* scalar_to_vec_cost. */
1082 1, /* vec_align_load_cost. */
1083 2, /* vec_unalign_load_cost. */
1084 1, /* vec_store_cost. */
1085 3, /* cond_taken_branch_cost. */
1086 1, /* cond_not_taken_branch_cost. */
1087 };
1088
1089 static const
1090 struct processor_costs k8_cost = {
1091 COSTS_N_INSNS (1), /* cost of an add instruction */
1092 COSTS_N_INSNS (2), /* cost of a lea instruction */
1093 COSTS_N_INSNS (1), /* variable shift costs */
1094 COSTS_N_INSNS (1), /* constant shift costs */
1095 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1096 COSTS_N_INSNS (4), /* HI */
1097 COSTS_N_INSNS (3), /* SI */
1098 COSTS_N_INSNS (4), /* DI */
1099 COSTS_N_INSNS (5)}, /* other */
1100 0, /* cost of multiply per each bit set */
1101 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1102 COSTS_N_INSNS (26), /* HI */
1103 COSTS_N_INSNS (42), /* SI */
1104 COSTS_N_INSNS (74), /* DI */
1105 COSTS_N_INSNS (74)}, /* other */
1106 COSTS_N_INSNS (1), /* cost of movsx */
1107 COSTS_N_INSNS (1), /* cost of movzx */
1108 8, /* "large" insn */
1109 9, /* MOVE_RATIO */
1110 4, /* cost for loading QImode using movzbl */
1111 {3, 4, 3}, /* cost of loading integer registers
1112 in QImode, HImode and SImode.
1113 Relative to reg-reg move (2). */
1114 {3, 4, 3}, /* cost of storing integer registers */
1115 4, /* cost of reg,reg fld/fst */
1116 {4, 4, 12}, /* cost of loading fp registers
1117 in SFmode, DFmode and XFmode */
1118 {6, 6, 8}, /* cost of storing fp registers
1119 in SFmode, DFmode and XFmode */
1120 2, /* cost of moving MMX register */
1121 {3, 3}, /* cost of loading MMX registers
1122 in SImode and DImode */
1123 {4, 4}, /* cost of storing MMX registers
1124 in SImode and DImode */
1125 2, /* cost of moving SSE register */
1126 {4, 3, 6}, /* cost of loading SSE registers
1127 in SImode, DImode and TImode */
1128 {4, 4, 5}, /* cost of storing SSE registers
1129 in SImode, DImode and TImode */
1130 5, /* MMX or SSE register to integer */
1131 64, /* size of l1 cache. */
1132 512, /* size of l2 cache. */
1133 64, /* size of prefetch block */
1134 /* New AMD processors never drop prefetches; if they cannot be performed
1135 immediately, they are queued. We set number of simultaneous prefetches
1136 to a large constant to reflect this (it probably is not a good idea not
1137 to limit number of prefetches at all, as their execution also takes some
1138 time). */
1139 100, /* number of parallel prefetches */
1140 3, /* Branch cost */
1141 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1142 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1143 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1144 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1145 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1146 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1147 /* K8 has optimized REP instruction for medium sized blocks, but for very
1148 small blocks it is better to use loop. For large blocks, libcall can
 1149 do non-temporal accesses and beat inline considerably. */
1150 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1151 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1152 {{libcall, {{8, loop}, {24, unrolled_loop},
1153 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1154 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1155 4, /* scalar_stmt_cost. */
1156 2, /* scalar load_cost. */
1157 2, /* scalar_store_cost. */
1158 5, /* vec_stmt_cost. */
1159 0, /* vec_to_scalar_cost. */
1160 2, /* scalar_to_vec_cost. */
1161 2, /* vec_align_load_cost. */
1162 3, /* vec_unalign_load_cost. */
1163 3, /* vec_store_cost. */
1164 3, /* cond_taken_branch_cost. */
1165 2, /* cond_not_taken_branch_cost. */
1166 };
1167
1168 struct processor_costs amdfam10_cost = {
1169 COSTS_N_INSNS (1), /* cost of an add instruction */
1170 COSTS_N_INSNS (2), /* cost of a lea instruction */
1171 COSTS_N_INSNS (1), /* variable shift costs */
1172 COSTS_N_INSNS (1), /* constant shift costs */
1173 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1174 COSTS_N_INSNS (4), /* HI */
1175 COSTS_N_INSNS (3), /* SI */
1176 COSTS_N_INSNS (4), /* DI */
1177 COSTS_N_INSNS (5)}, /* other */
1178 0, /* cost of multiply per each bit set */
1179 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1180 COSTS_N_INSNS (35), /* HI */
1181 COSTS_N_INSNS (51), /* SI */
1182 COSTS_N_INSNS (83), /* DI */
1183 COSTS_N_INSNS (83)}, /* other */
1184 COSTS_N_INSNS (1), /* cost of movsx */
1185 COSTS_N_INSNS (1), /* cost of movzx */
1186 8, /* "large" insn */
1187 9, /* MOVE_RATIO */
1188 4, /* cost for loading QImode using movzbl */
1189 {3, 4, 3}, /* cost of loading integer registers
1190 in QImode, HImode and SImode.
1191 Relative to reg-reg move (2). */
1192 {3, 4, 3}, /* cost of storing integer registers */
1193 4, /* cost of reg,reg fld/fst */
1194 {4, 4, 12}, /* cost of loading fp registers
1195 in SFmode, DFmode and XFmode */
1196 {6, 6, 8}, /* cost of storing fp registers
1197 in SFmode, DFmode and XFmode */
1198 2, /* cost of moving MMX register */
1199 {3, 3}, /* cost of loading MMX registers
1200 in SImode and DImode */
1201 {4, 4}, /* cost of storing MMX registers
1202 in SImode and DImode */
1203 2, /* cost of moving SSE register */
1204 {4, 4, 3}, /* cost of loading SSE registers
1205 in SImode, DImode and TImode */
1206 {4, 4, 5}, /* cost of storing SSE registers
1207 in SImode, DImode and TImode */
1208 3, /* MMX or SSE register to integer */
1209 /* On K8:
1210 MOVD reg64, xmmreg Double FSTORE 4
1211 MOVD reg32, xmmreg Double FSTORE 4
1212 On AMDFAM10:
1213 MOVD reg64, xmmreg Double FADD 3
1214 1/1 1/1
1215 MOVD reg32, xmmreg Double FADD 3
1216 1/1 1/1 */
1217 64, /* size of l1 cache. */
1218 512, /* size of l2 cache. */
1219 64, /* size of prefetch block */
1220 /* New AMD processors never drop prefetches; if they cannot be performed
1221 immediately, they are queued. We set number of simultaneous prefetches
1222 to a large constant to reflect this (it probably is not a good idea not
1223 to limit number of prefetches at all, as their execution also takes some
1224 time). */
1225 100, /* number of parallel prefetches */
1226 2, /* Branch cost */
1227 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1228 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1229 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1230 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1231 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1232 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1233
1234 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
1235 very small blocks it is better to use loop. For large blocks, libcall can
 1236 do non-temporal accesses and beat inline considerably. */
1237 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1238 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1239 {{libcall, {{8, loop}, {24, unrolled_loop},
1240 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1241 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1242 4, /* scalar_stmt_cost. */
1243 2, /* scalar load_cost. */
1244 2, /* scalar_store_cost. */
1245 6, /* vec_stmt_cost. */
1246 0, /* vec_to_scalar_cost. */
1247 2, /* scalar_to_vec_cost. */
1248 2, /* vec_align_load_cost. */
1249 2, /* vec_unalign_load_cost. */
1250 2, /* vec_store_cost. */
1251 2, /* cond_taken_branch_cost. */
1252 1, /* cond_not_taken_branch_cost. */
1253 };
1254
1255 struct processor_costs bdver1_cost = {
1256 COSTS_N_INSNS (1), /* cost of an add instruction */
1257 COSTS_N_INSNS (1), /* cost of a lea instruction */
1258 COSTS_N_INSNS (1), /* variable shift costs */
1259 COSTS_N_INSNS (1), /* constant shift costs */
1260 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1261 COSTS_N_INSNS (4), /* HI */
1262 COSTS_N_INSNS (4), /* SI */
1263 COSTS_N_INSNS (6), /* DI */
1264 COSTS_N_INSNS (6)}, /* other */
1265 0, /* cost of multiply per each bit set */
1266 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1267 COSTS_N_INSNS (35), /* HI */
1268 COSTS_N_INSNS (51), /* SI */
1269 COSTS_N_INSNS (83), /* DI */
1270 COSTS_N_INSNS (83)}, /* other */
1271 COSTS_N_INSNS (1), /* cost of movsx */
1272 COSTS_N_INSNS (1), /* cost of movzx */
1273 8, /* "large" insn */
1274 9, /* MOVE_RATIO */
1275 4, /* cost for loading QImode using movzbl */
1276 {5, 5, 4}, /* cost of loading integer registers
1277 in QImode, HImode and SImode.
1278 Relative to reg-reg move (2). */
1279 {4, 4, 4}, /* cost of storing integer registers */
1280 2, /* cost of reg,reg fld/fst */
1281 {5, 5, 12}, /* cost of loading fp registers
1282 in SFmode, DFmode and XFmode */
1283 {4, 4, 8}, /* cost of storing fp registers
1284 in SFmode, DFmode and XFmode */
1285 2, /* cost of moving MMX register */
1286 {4, 4}, /* cost of loading MMX registers
1287 in SImode and DImode */
1288 {4, 4}, /* cost of storing MMX registers
1289 in SImode and DImode */
1290 2, /* cost of moving SSE register */
1291 {4, 4, 4}, /* cost of loading SSE registers
1292 in SImode, DImode and TImode */
1293 {4, 4, 4}, /* cost of storing SSE registers
1294 in SImode, DImode and TImode */
1295 2, /* MMX or SSE register to integer */
1296 /* On K8:
1297 MOVD reg64, xmmreg Double FSTORE 4
1298 MOVD reg32, xmmreg Double FSTORE 4
1299 On AMDFAM10:
1300 MOVD reg64, xmmreg Double FADD 3
1301 1/1 1/1
1302 MOVD reg32, xmmreg Double FADD 3
1303 1/1 1/1 */
1304 16, /* size of l1 cache. */
1305 2048, /* size of l2 cache. */
1306 64, /* size of prefetch block */
1307 /* New AMD processors never drop prefetches; if they cannot be performed
1308 immediately, they are queued. We set number of simultaneous prefetches
1309 to a large constant to reflect this (it probably is not a good idea not
1310 to limit number of prefetches at all, as their execution also takes some
1311 time). */
1312 100, /* number of parallel prefetches */
1313 2, /* Branch cost */
1314 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1315 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1316 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1317 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1318 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1319 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1320
1321 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
1322 very small blocks it is better to use loop. For large blocks, libcall
 1323 can do non-temporal accesses and beat inline considerably. */
1324 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1325 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1326 {{libcall, {{8, loop}, {24, unrolled_loop},
1327 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1328 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1329 6, /* scalar_stmt_cost. */
1330 4, /* scalar load_cost. */
1331 4, /* scalar_store_cost. */
1332 6, /* vec_stmt_cost. */
1333 0, /* vec_to_scalar_cost. */
1334 2, /* scalar_to_vec_cost. */
1335 4, /* vec_align_load_cost. */
1336 4, /* vec_unalign_load_cost. */
1337 4, /* vec_store_cost. */
1338 2, /* cond_taken_branch_cost. */
1339 1, /* cond_not_taken_branch_cost. */
1340 };
1341
1342 struct processor_costs bdver2_cost = {
1343 COSTS_N_INSNS (1), /* cost of an add instruction */
1344 COSTS_N_INSNS (1), /* cost of a lea instruction */
1345 COSTS_N_INSNS (1), /* variable shift costs */
1346 COSTS_N_INSNS (1), /* constant shift costs */
1347 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1348 COSTS_N_INSNS (4), /* HI */
1349 COSTS_N_INSNS (4), /* SI */
1350 COSTS_N_INSNS (6), /* DI */
1351 COSTS_N_INSNS (6)}, /* other */
1352 0, /* cost of multiply per each bit set */
1353 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1354 COSTS_N_INSNS (35), /* HI */
1355 COSTS_N_INSNS (51), /* SI */
1356 COSTS_N_INSNS (83), /* DI */
1357 COSTS_N_INSNS (83)}, /* other */
1358 COSTS_N_INSNS (1), /* cost of movsx */
1359 COSTS_N_INSNS (1), /* cost of movzx */
1360 8, /* "large" insn */
1361 9, /* MOVE_RATIO */
1362 4, /* cost for loading QImode using movzbl */
1363 {5, 5, 4}, /* cost of loading integer registers
1364 in QImode, HImode and SImode.
1365 Relative to reg-reg move (2). */
1366 {4, 4, 4}, /* cost of storing integer registers */
1367 2, /* cost of reg,reg fld/fst */
1368 {5, 5, 12}, /* cost of loading fp registers
1369 in SFmode, DFmode and XFmode */
1370 {4, 4, 8}, /* cost of storing fp registers
1371 in SFmode, DFmode and XFmode */
1372 2, /* cost of moving MMX register */
1373 {4, 4}, /* cost of loading MMX registers
1374 in SImode and DImode */
1375 {4, 4}, /* cost of storing MMX registers
1376 in SImode and DImode */
1377 2, /* cost of moving SSE register */
1378 {4, 4, 4}, /* cost of loading SSE registers
1379 in SImode, DImode and TImode */
1380 {4, 4, 4}, /* cost of storing SSE registers
1381 in SImode, DImode and TImode */
1382 2, /* MMX or SSE register to integer */
1383 /* On K8:
1384 MOVD reg64, xmmreg Double FSTORE 4
1385 MOVD reg32, xmmreg Double FSTORE 4
1386 On AMDFAM10:
1387 MOVD reg64, xmmreg Double FADD 3
1388 1/1 1/1
1389 MOVD reg32, xmmreg Double FADD 3
1390 1/1 1/1 */
1391 16, /* size of l1 cache. */
1392 2048, /* size of l2 cache. */
1393 64, /* size of prefetch block */
1394 /* New AMD processors never drop prefetches; if they cannot be performed
1395 immediately, they are queued. We set number of simultaneous prefetches
1396 to a large constant to reflect this (it probably is not a good idea not
1397 to limit number of prefetches at all, as their execution also takes some
1398 time). */
1399 100, /* number of parallel prefetches */
1400 2, /* Branch cost */
1401 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1402 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1403 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1404 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1405 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1406 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1407
1408 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
1409 very small blocks it is better to use loop. For large blocks, libcall
 1410 can do non-temporal accesses and beat inline considerably. */
1411 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1412 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1413 {{libcall, {{8, loop}, {24, unrolled_loop},
1414 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1415 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1416 6, /* scalar_stmt_cost. */
1417 4, /* scalar load_cost. */
1418 4, /* scalar_store_cost. */
1419 6, /* vec_stmt_cost. */
1420 0, /* vec_to_scalar_cost. */
1421 2, /* scalar_to_vec_cost. */
1422 4, /* vec_align_load_cost. */
1423 4, /* vec_unalign_load_cost. */
1424 4, /* vec_store_cost. */
1425 2, /* cond_taken_branch_cost. */
1426 1, /* cond_not_taken_branch_cost. */
1427 };
1428
1429 struct processor_costs btver1_cost = {
1430 COSTS_N_INSNS (1), /* cost of an add instruction */
1431 COSTS_N_INSNS (2), /* cost of a lea instruction */
1432 COSTS_N_INSNS (1), /* variable shift costs */
1433 COSTS_N_INSNS (1), /* constant shift costs */
1434 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1435 COSTS_N_INSNS (4), /* HI */
1436 COSTS_N_INSNS (3), /* SI */
1437 COSTS_N_INSNS (4), /* DI */
1438 COSTS_N_INSNS (5)}, /* other */
1439 0, /* cost of multiply per each bit set */
1440 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1441 COSTS_N_INSNS (35), /* HI */
1442 COSTS_N_INSNS (51), /* SI */
1443 COSTS_N_INSNS (83), /* DI */
1444 COSTS_N_INSNS (83)}, /* other */
1445 COSTS_N_INSNS (1), /* cost of movsx */
1446 COSTS_N_INSNS (1), /* cost of movzx */
1447 8, /* "large" insn */
1448 9, /* MOVE_RATIO */
1449 4, /* cost for loading QImode using movzbl */
1450 {3, 4, 3}, /* cost of loading integer registers
1451 in QImode, HImode and SImode.
1452 Relative to reg-reg move (2). */
1453 {3, 4, 3}, /* cost of storing integer registers */
1454 4, /* cost of reg,reg fld/fst */
1455 {4, 4, 12}, /* cost of loading fp registers
1456 in SFmode, DFmode and XFmode */
1457 {6, 6, 8}, /* cost of storing fp registers
1458 in SFmode, DFmode and XFmode */
1459 2, /* cost of moving MMX register */
1460 {3, 3}, /* cost of loading MMX registers
1461 in SImode and DImode */
1462 {4, 4}, /* cost of storing MMX registers
1463 in SImode and DImode */
1464 2, /* cost of moving SSE register */
1465 {4, 4, 3}, /* cost of loading SSE registers
1466 in SImode, DImode and TImode */
1467 {4, 4, 5}, /* cost of storing SSE registers
1468 in SImode, DImode and TImode */
1469 3, /* MMX or SSE register to integer */
1470 /* On K8:
1471 MOVD reg64, xmmreg Double FSTORE 4
1472 MOVD reg32, xmmreg Double FSTORE 4
1473 On AMDFAM10:
1474 MOVD reg64, xmmreg Double FADD 3
1475 1/1 1/1
1476 MOVD reg32, xmmreg Double FADD 3
1477 1/1 1/1 */
1478 32, /* size of l1 cache. */
1479 512, /* size of l2 cache. */
1480 64, /* size of prefetch block */
1481 100, /* number of parallel prefetches */
1482 2, /* Branch cost */
1483 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1484 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1485 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1486 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1487 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1488 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1489
1490 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1491 very small blocks it is better to use loop. For large blocks, libcall can
 1492 do non-temporal accesses and beat inline considerably. */
1493 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1494 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1495 {{libcall, {{8, loop}, {24, unrolled_loop},
1496 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1497 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1498 4, /* scalar_stmt_cost. */
1499 2, /* scalar load_cost. */
1500 2, /* scalar_store_cost. */
1501 6, /* vec_stmt_cost. */
1502 0, /* vec_to_scalar_cost. */
1503 2, /* scalar_to_vec_cost. */
1504 2, /* vec_align_load_cost. */
1505 2, /* vec_unalign_load_cost. */
1506 2, /* vec_store_cost. */
1507 2, /* cond_taken_branch_cost. */
1508 1, /* cond_not_taken_branch_cost. */
1509 };
1510
1511 static const
1512 struct processor_costs pentium4_cost = {
1513 COSTS_N_INSNS (1), /* cost of an add instruction */
1514 COSTS_N_INSNS (3), /* cost of a lea instruction */
1515 COSTS_N_INSNS (4), /* variable shift costs */
1516 COSTS_N_INSNS (4), /* constant shift costs */
1517 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1518 COSTS_N_INSNS (15), /* HI */
1519 COSTS_N_INSNS (15), /* SI */
1520 COSTS_N_INSNS (15), /* DI */
1521 COSTS_N_INSNS (15)}, /* other */
1522 0, /* cost of multiply per each bit set */
1523 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1524 COSTS_N_INSNS (56), /* HI */
1525 COSTS_N_INSNS (56), /* SI */
1526 COSTS_N_INSNS (56), /* DI */
1527 COSTS_N_INSNS (56)}, /* other */
1528 COSTS_N_INSNS (1), /* cost of movsx */
1529 COSTS_N_INSNS (1), /* cost of movzx */
1530 16, /* "large" insn */
1531 6, /* MOVE_RATIO */
1532 2, /* cost for loading QImode using movzbl */
1533 {4, 5, 4}, /* cost of loading integer registers
1534 in QImode, HImode and SImode.
1535 Relative to reg-reg move (2). */
1536 {2, 3, 2}, /* cost of storing integer registers */
1537 2, /* cost of reg,reg fld/fst */
1538 {2, 2, 6}, /* cost of loading fp registers
1539 in SFmode, DFmode and XFmode */
1540 {4, 4, 6}, /* cost of storing fp registers
1541 in SFmode, DFmode and XFmode */
1542 2, /* cost of moving MMX register */
1543 {2, 2}, /* cost of loading MMX registers
1544 in SImode and DImode */
1545 {2, 2}, /* cost of storing MMX registers
1546 in SImode and DImode */
1547 12, /* cost of moving SSE register */
1548 {12, 12, 12}, /* cost of loading SSE registers
1549 in SImode, DImode and TImode */
1550 {2, 2, 8}, /* cost of storing SSE registers
1551 in SImode, DImode and TImode */
1552 10, /* MMX or SSE register to integer */
1553 8, /* size of l1 cache. */
1554 256, /* size of l2 cache. */
1555 64, /* size of prefetch block */
1556 6, /* number of parallel prefetches */
1557 2, /* Branch cost */
1558 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1559 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1560 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1561 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1562 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1563 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1564 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1565 DUMMY_STRINGOP_ALGS},
1566 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1567 {-1, libcall}}},
1568 DUMMY_STRINGOP_ALGS},
1569 1, /* scalar_stmt_cost. */
1570 1, /* scalar load_cost. */
1571 1, /* scalar_store_cost. */
1572 1, /* vec_stmt_cost. */
1573 1, /* vec_to_scalar_cost. */
1574 1, /* scalar_to_vec_cost. */
1575 1, /* vec_align_load_cost. */
1576 2, /* vec_unalign_load_cost. */
1577 1, /* vec_store_cost. */
1578 3, /* cond_taken_branch_cost. */
1579 1, /* cond_not_taken_branch_cost. */
1580 };
1581
1582 static const
1583 struct processor_costs nocona_cost = {
1584 COSTS_N_INSNS (1), /* cost of an add instruction */
1585 COSTS_N_INSNS (1), /* cost of a lea instruction */
1586 COSTS_N_INSNS (1), /* variable shift costs */
1587 COSTS_N_INSNS (1), /* constant shift costs */
1588 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1589 COSTS_N_INSNS (10), /* HI */
1590 COSTS_N_INSNS (10), /* SI */
1591 COSTS_N_INSNS (10), /* DI */
1592 COSTS_N_INSNS (10)}, /* other */
1593 0, /* cost of multiply per each bit set */
1594 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1595 COSTS_N_INSNS (66), /* HI */
1596 COSTS_N_INSNS (66), /* SI */
1597 COSTS_N_INSNS (66), /* DI */
1598 COSTS_N_INSNS (66)}, /* other */
1599 COSTS_N_INSNS (1), /* cost of movsx */
1600 COSTS_N_INSNS (1), /* cost of movzx */
1601 16, /* "large" insn */
1602 17, /* MOVE_RATIO */
1603 4, /* cost for loading QImode using movzbl */
1604 {4, 4, 4}, /* cost of loading integer registers
1605 in QImode, HImode and SImode.
1606 Relative to reg-reg move (2). */
1607 {4, 4, 4}, /* cost of storing integer registers */
1608 3, /* cost of reg,reg fld/fst */
1609 {12, 12, 12}, /* cost of loading fp registers
1610 in SFmode, DFmode and XFmode */
1611 {4, 4, 4}, /* cost of storing fp registers
1612 in SFmode, DFmode and XFmode */
1613 6, /* cost of moving MMX register */
1614 {12, 12}, /* cost of loading MMX registers
1615 in SImode and DImode */
1616 {12, 12}, /* cost of storing MMX registers
1617 in SImode and DImode */
1618 6, /* cost of moving SSE register */
1619 {12, 12, 12}, /* cost of loading SSE registers
1620 in SImode, DImode and TImode */
1621 {12, 12, 12}, /* cost of storing SSE registers
1622 in SImode, DImode and TImode */
1623 8, /* MMX or SSE register to integer */
1624 8, /* size of l1 cache. */
1625 1024, /* size of l2 cache. */
1626 128, /* size of prefetch block */
1627 8, /* number of parallel prefetches */
1628 1, /* Branch cost */
1629 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1630 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1631 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1632 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1633 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1634 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1635 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1636 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1637 {100000, unrolled_loop}, {-1, libcall}}}},
1638 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1639 {-1, libcall}}},
1640 {libcall, {{24, loop}, {64, unrolled_loop},
1641 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1642 1, /* scalar_stmt_cost. */
1643 1, /* scalar load_cost. */
1644 1, /* scalar_store_cost. */
1645 1, /* vec_stmt_cost. */
1646 1, /* vec_to_scalar_cost. */
1647 1, /* scalar_to_vec_cost. */
1648 1, /* vec_align_load_cost. */
1649 2, /* vec_unalign_load_cost. */
1650 1, /* vec_store_cost. */
1651 3, /* cond_taken_branch_cost. */
1652 1, /* cond_not_taken_branch_cost. */
1653 };
1654
1655 static const
1656 struct processor_costs atom_cost = {
1657 COSTS_N_INSNS (1), /* cost of an add instruction */
1658 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1659 COSTS_N_INSNS (1), /* variable shift costs */
1660 COSTS_N_INSNS (1), /* constant shift costs */
1661 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1662 COSTS_N_INSNS (4), /* HI */
1663 COSTS_N_INSNS (3), /* SI */
1664 COSTS_N_INSNS (4), /* DI */
1665 COSTS_N_INSNS (2)}, /* other */
1666 0, /* cost of multiply per each bit set */
1667 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1668 COSTS_N_INSNS (26), /* HI */
1669 COSTS_N_INSNS (42), /* SI */
1670 COSTS_N_INSNS (74), /* DI */
1671 COSTS_N_INSNS (74)}, /* other */
1672 COSTS_N_INSNS (1), /* cost of movsx */
1673 COSTS_N_INSNS (1), /* cost of movzx */
1674 8, /* "large" insn */
1675 17, /* MOVE_RATIO */
1676 4, /* cost for loading QImode using movzbl */
1677 {4, 4, 4}, /* cost of loading integer registers
1678 in QImode, HImode and SImode.
1679 Relative to reg-reg move (2). */
1680 {4, 4, 4}, /* cost of storing integer registers */
1681 4, /* cost of reg,reg fld/fst */
1682 {12, 12, 12}, /* cost of loading fp registers
1683 in SFmode, DFmode and XFmode */
1684 {6, 6, 8}, /* cost of storing fp registers
1685 in SFmode, DFmode and XFmode */
1686 2, /* cost of moving MMX register */
1687 {8, 8}, /* cost of loading MMX registers
1688 in SImode and DImode */
1689 {8, 8}, /* cost of storing MMX registers
1690 in SImode and DImode */
1691 2, /* cost of moving SSE register */
1692 {8, 8, 8}, /* cost of loading SSE registers
1693 in SImode, DImode and TImode */
1694 {8, 8, 8}, /* cost of storing SSE registers
1695 in SImode, DImode and TImode */
1696 5, /* MMX or SSE register to integer */
1697 32, /* size of l1 cache. */
1698 256, /* size of l2 cache. */
1699 64, /* size of prefetch block */
1700 6, /* number of parallel prefetches */
1701 3, /* Branch cost */
1702 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1703 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1704 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1705 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1706 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1707 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1708 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1709 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1710 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1711 {{libcall, {{8, loop}, {15, unrolled_loop},
1712 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1713 {libcall, {{24, loop}, {32, unrolled_loop},
1714 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1715 1, /* scalar_stmt_cost. */
1716 1, /* scalar load_cost. */
1717 1, /* scalar_store_cost. */
1718 1, /* vec_stmt_cost. */
1719 1, /* vec_to_scalar_cost. */
1720 1, /* scalar_to_vec_cost. */
1721 1, /* vec_align_load_cost. */
1722 2, /* vec_unalign_load_cost. */
1723 1, /* vec_store_cost. */
1724 3, /* cond_taken_branch_cost. */
1725 1, /* cond_not_taken_branch_cost. */
1726 };
1727
1728 /* Generic64 should produce code tuned for Nocona and K8. */
1729 static const
1730 struct processor_costs generic64_cost = {
1731 COSTS_N_INSNS (1), /* cost of an add instruction */
1732 /* On all chips taken into consideration lea is 2 cycles and more. With
1733 this cost however our current implementation of synth_mult results in
1734 use of unnecessary temporary registers causing regression on several
1735 SPECfp benchmarks. */
1736 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1737 COSTS_N_INSNS (1), /* variable shift costs */
1738 COSTS_N_INSNS (1), /* constant shift costs */
1739 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1740 COSTS_N_INSNS (4), /* HI */
1741 COSTS_N_INSNS (3), /* SI */
1742 COSTS_N_INSNS (4), /* DI */
1743 COSTS_N_INSNS (2)}, /* other */
1744 0, /* cost of multiply per each bit set */
1745 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1746 COSTS_N_INSNS (26), /* HI */
1747 COSTS_N_INSNS (42), /* SI */
1748 COSTS_N_INSNS (74), /* DI */
1749 COSTS_N_INSNS (74)}, /* other */
1750 COSTS_N_INSNS (1), /* cost of movsx */
1751 COSTS_N_INSNS (1), /* cost of movzx */
1752 8, /* "large" insn */
1753 17, /* MOVE_RATIO */
1754 4, /* cost for loading QImode using movzbl */
1755 {4, 4, 4}, /* cost of loading integer registers
1756 in QImode, HImode and SImode.
1757 Relative to reg-reg move (2). */
1758 {4, 4, 4}, /* cost of storing integer registers */
1759 4, /* cost of reg,reg fld/fst */
1760 {12, 12, 12}, /* cost of loading fp registers
1761 in SFmode, DFmode and XFmode */
1762 {6, 6, 8}, /* cost of storing fp registers
1763 in SFmode, DFmode and XFmode */
1764 2, /* cost of moving MMX register */
1765 {8, 8}, /* cost of loading MMX registers
1766 in SImode and DImode */
1767 {8, 8}, /* cost of storing MMX registers
1768 in SImode and DImode */
1769 2, /* cost of moving SSE register */
1770 {8, 8, 8}, /* cost of loading SSE registers
1771 in SImode, DImode and TImode */
1772 {8, 8, 8}, /* cost of storing SSE registers
1773 in SImode, DImode and TImode */
1774 5, /* MMX or SSE register to integer */
1775 32, /* size of l1 cache. */
1776 512, /* size of l2 cache. */
1777 64, /* size of prefetch block */
1778 6, /* number of parallel prefetches */
1779 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1780 value is increased to the perhaps more appropriate value of 5. */
1781 3, /* Branch cost */
1782 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1783 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1784 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1785 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1786 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1787 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1788 {DUMMY_STRINGOP_ALGS,
1789 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1790 {DUMMY_STRINGOP_ALGS,
1791 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1792 1, /* scalar_stmt_cost. */
1793 1, /* scalar load_cost. */
1794 1, /* scalar_store_cost. */
1795 1, /* vec_stmt_cost. */
1796 1, /* vec_to_scalar_cost. */
1797 1, /* scalar_to_vec_cost. */
1798 1, /* vec_align_load_cost. */
1799 2, /* vec_unalign_load_cost. */
1800 1, /* vec_store_cost. */
1801 3, /* cond_taken_branch_cost. */
1802 1, /* cond_not_taken_branch_cost. */
1803 };
1804
1805 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1806 Athlon and K8. */
1807 static const
1808 struct processor_costs generic32_cost = {
1809 COSTS_N_INSNS (1), /* cost of an add instruction */
1810 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1811 COSTS_N_INSNS (1), /* variable shift costs */
1812 COSTS_N_INSNS (1), /* constant shift costs */
1813 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1814 COSTS_N_INSNS (4), /* HI */
1815 COSTS_N_INSNS (3), /* SI */
1816 COSTS_N_INSNS (4), /* DI */
1817 COSTS_N_INSNS (2)}, /* other */
1818 0, /* cost of multiply per each bit set */
1819 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1820 COSTS_N_INSNS (26), /* HI */
1821 COSTS_N_INSNS (42), /* SI */
1822 COSTS_N_INSNS (74), /* DI */
1823 COSTS_N_INSNS (74)}, /* other */
1824 COSTS_N_INSNS (1), /* cost of movsx */
1825 COSTS_N_INSNS (1), /* cost of movzx */
1826 8, /* "large" insn */
1827 17, /* MOVE_RATIO */
1828 4, /* cost for loading QImode using movzbl */
1829 {4, 4, 4}, /* cost of loading integer registers
1830 in QImode, HImode and SImode.
1831 Relative to reg-reg move (2). */
1832 {4, 4, 4}, /* cost of storing integer registers */
1833 4, /* cost of reg,reg fld/fst */
1834 {12, 12, 12}, /* cost of loading fp registers
1835 in SFmode, DFmode and XFmode */
1836 {6, 6, 8}, /* cost of storing fp registers
1837 in SFmode, DFmode and XFmode */
1838 2, /* cost of moving MMX register */
1839 {8, 8}, /* cost of loading MMX registers
1840 in SImode and DImode */
1841 {8, 8}, /* cost of storing MMX registers
1842 in SImode and DImode */
1843 2, /* cost of moving SSE register */
1844 {8, 8, 8}, /* cost of loading SSE registers
1845 in SImode, DImode and TImode */
1846 {8, 8, 8}, /* cost of storing SSE registers
1847 in SImode, DImode and TImode */
1848 5, /* MMX or SSE register to integer */
1849 32, /* size of l1 cache. */
1850 256, /* size of l2 cache. */
1851 64, /* size of prefetch block */
1852 6, /* number of parallel prefetches */
1853 3, /* Branch cost */
1854 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1855 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1856 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1857 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1858 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1859 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1860 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1861 DUMMY_STRINGOP_ALGS},
1862 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1863 DUMMY_STRINGOP_ALGS},
1864 1, /* scalar_stmt_cost. */
1865 1, /* scalar load_cost. */
1866 1, /* scalar_store_cost. */
1867 1, /* vec_stmt_cost. */
1868 1, /* vec_to_scalar_cost. */
1869 1, /* scalar_to_vec_cost. */
1870 1, /* vec_align_load_cost. */
1871 2, /* vec_unalign_load_cost. */
1872 1, /* vec_store_cost. */
1873 3, /* cond_taken_branch_cost. */
1874 1, /* cond_not_taken_branch_cost. */
1875 };
1876
1877 const struct processor_costs *ix86_cost = &pentium_cost;
1878
1879 /* Processor feature/optimization bitmasks. */
1880 #define m_386 (1<<PROCESSOR_I386)
1881 #define m_486 (1<<PROCESSOR_I486)
1882 #define m_PENT (1<<PROCESSOR_PENTIUM)
1883 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1884 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1885 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1886 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1887 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1888 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1889 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1890 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1891 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1892 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1893 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1894 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1895 #define m_ATOM (1<<PROCESSOR_ATOM)
1896
1897 #define m_GEODE (1<<PROCESSOR_GEODE)
1898 #define m_K6 (1<<PROCESSOR_K6)
1899 #define m_K6_GEODE (m_K6 | m_GEODE)
1900 #define m_K8 (1<<PROCESSOR_K8)
1901 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1902 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1903 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1904 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1905 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1906 #define m_BDVER (m_BDVER1 | m_BDVER2)
1907 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1908 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1909
1910 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1911 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1912
1913 /* Generic instruction choice should be common subset of supported CPUs
1914 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1915 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1916
1917 /* Feature tests against the various tunings. */
1918 unsigned char ix86_tune_features[X86_TUNE_LAST];
1919
1920 /* Feature tests against the various tunings used to create ix86_tune_features
1921 based on the processor mask. */
1922 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1923 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1924 negatively, so enabling it for Generic64 seems like a good code size
1925 tradeoff. We can't enable it for 32bit generic because it does not
1926 work well with PPro based chips. */
1927 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1928
1929 /* X86_TUNE_PUSH_MEMORY */
1930 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1931
1932 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1933 m_486 | m_PENT,
1934
1935 /* X86_TUNE_UNROLL_STRLEN */
1936 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1937
1938 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put into the P4 based
1939 on simulation results. But after the P4 was made, no performance benefit
1940 was observed with branch hints; they also increase the code size.
1941 As a result, icc never generates branch hints. */
1942 0,
1943
1944 /* X86_TUNE_DOUBLE_WITH_ADD */
1945 ~m_386,
1946
1947 /* X86_TUNE_USE_SAHF */
1948 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1949
1950 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1951 partial dependencies. */
1952 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1953
1954 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1955 register stalls on the Generic32 compilation setting as well. However,
1956 in the current implementation the partial register stalls are not eliminated
1957 very well - they can be introduced via subregs synthesized by combine
1958 and can happen in caller/callee saving sequences. Because this option
1959 pays back little on PPro based chips and is in conflict with the partial reg
1960 dependencies used by Athlon/P4 based chips, it is better to leave it off
1961 for generic32 for now. */
1962 m_PPRO,
1963
1964 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1965 m_CORE2I7 | m_GENERIC,
1966
1967 /* X86_TUNE_USE_HIMODE_FIOP */
1968 m_386 | m_486 | m_K6_GEODE,
1969
1970 /* X86_TUNE_USE_SIMODE_FIOP */
1971 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1972
1973 /* X86_TUNE_USE_MOV0 */
1974 m_K6,
1975
1976 /* X86_TUNE_USE_CLTD */
1977 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1978
1979 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1980 m_PENT4,
1981
1982 /* X86_TUNE_SPLIT_LONG_MOVES */
1983 m_PPRO,
1984
1985 /* X86_TUNE_READ_MODIFY_WRITE */
1986 ~m_PENT,
1987
1988 /* X86_TUNE_READ_MODIFY */
1989 ~(m_PENT | m_PPRO),
1990
1991 /* X86_TUNE_PROMOTE_QIMODE */
1992 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1993
1994 /* X86_TUNE_FAST_PREFIX */
1995 ~(m_386 | m_486 | m_PENT),
1996
1997 /* X86_TUNE_SINGLE_STRINGOP */
1998 m_386 | m_P4_NOCONA,
1999
2000 /* X86_TUNE_QIMODE_MATH */
2001 ~0,
2002
2003 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2004 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL, this option
2005 might be considered for Generic32 if our scheme for avoiding partial
2006 stalls were more effective. */
2007 ~m_PPRO,
2008
2009 /* X86_TUNE_PROMOTE_QI_REGS */
2010 0,
2011
2012 /* X86_TUNE_PROMOTE_HI_REGS */
2013 m_PPRO,
2014
2015 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2016 over esp addition. */
2017 m_386 | m_486 | m_PENT | m_PPRO,
2018
2019 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2020 over esp addition. */
2021 m_PENT,
2022
2023 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2024 over esp subtraction. */
2025 m_386 | m_486 | m_PENT | m_K6_GEODE,
2026
2027 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
2028 over esp subtraction. */
2029 m_PENT | m_K6_GEODE,
2030
2031 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2032 for DFmode copies */
2033 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2034
2035 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2036 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2037
2038 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2039 conflict here between PPro/Pentium4 based chips that treat 128bit
2040 SSE registers as single units and K8 based chips that divide SSE
2041 registers into two 64bit halves. This knob promotes all store destinations
2042 to be 128bit to allow register renaming on 128bit SSE units, but usually
2043 results in one extra micro-op on 64bit SSE units. Experimental results
2044 show that disabling this option on P4 brings over a 20% SPECfp regression,
2045 while enabling it on K8 brings roughly a 2.4% regression that can be partly
2046 masked by careful scheduling of moves. */
2047 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2048
2049 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2050 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2051
2052 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2053 m_COREI7 | m_BDVER,
2054
2055 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2056 m_BDVER,
2057
2058 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2059 are resolved on SSE register parts instead of whole registers, so we may
2060 maintain just lower part of scalar values in proper format leaving the
2061 upper part undefined. */
2062 m_ATHLON_K8,
2063
2064 /* X86_TUNE_SSE_TYPELESS_STORES */
2065 m_AMD_MULTIPLE,
2066
2067 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2068 m_PPRO | m_P4_NOCONA,
2069
2070 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2071 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2072
2073 /* X86_TUNE_PROLOGUE_USING_MOVE */
2074 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2075
2076 /* X86_TUNE_EPILOGUE_USING_MOVE */
2077 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2078
2079 /* X86_TUNE_SHIFT1 */
2080 ~m_486,
2081
2082 /* X86_TUNE_USE_FFREEP */
2083 m_AMD_MULTIPLE,
2084
2085 /* X86_TUNE_INTER_UNIT_MOVES */
2086 ~(m_AMD_MULTIPLE | m_GENERIC),
2087
2088 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2089 ~(m_AMDFAM10 | m_BDVER),
2090
2091 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2092 than 4 branch instructions in the 16 byte window. */
2093 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2094
2095 /* X86_TUNE_SCHEDULE */
2096 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2097
2098 /* X86_TUNE_USE_BT */
2099 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2100
2101 /* X86_TUNE_USE_INCDEC */
2102 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2103
2104 /* X86_TUNE_PAD_RETURNS */
2105 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2106
2107 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2108 m_ATOM,
2109
2110 /* X86_TUNE_EXT_80387_CONSTANTS */
2111 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2112
2113 /* X86_TUNE_SHORTEN_X87_SSE */
2114 ~m_K8,
2115
2116 /* X86_TUNE_AVOID_VECTOR_DECODE */
2117 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2118
2119 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2120 and SImode multiplies, but the 386 and 486 do HImode multiplies faster. */
2121 ~(m_386 | m_486),
2122
2123 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2124 vector path on AMD machines. */
2125 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2126
2127 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2128 machines. */
2129 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2130
2131 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2132 than a MOV. */
2133 m_PENT,
2134
2135 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2136 but one byte longer. */
2137 m_PENT,
2138
2139 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2140 operand that cannot be represented using a modRM byte. The XOR
2141 replacement is long decoded, so this split helps here as well. */
2142 m_K6,
2143
2144 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2145 from FP to FP. */
2146 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2147
2148 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2149 from integer to FP. */
2150 m_AMDFAM10,
2151
2152 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2153 with a subsequent conditional jump instruction into a single
2154 compare-and-branch uop. */
2155 m_BDVER,
2156
2157 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2158 will impact LEA instruction selection. */
2159 m_ATOM,
2160
2161 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2162 instructions. */
2163 ~m_ATOM,
2164
2165 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2166 at -O3. For the moment, the prefetching seems badly tuned for Intel
2167 chips. */
2168 m_K6_GEODE | m_AMD_MULTIPLE,
2169
2170 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2171 the auto-vectorizer. */
2172 m_BDVER,
2173
2174 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2175 during reassociation of integer computation. */
2176 m_ATOM,
2177
2178 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2179 during reassociation of fp computation. */
2180 m_ATOM
2181 };
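/* Illustrative sketch (not part of GCC, not compiled): every entry above is a
   bitmask over the m_* processor bits, so deciding whether a tuning knob is
   active for the selected CPU is a single AND against (1 << processor).  This
   mirrors, in simplified form, how ix86_tune_features[] is filled from this
   table during option overriding; the helper name is hypothetical.  */
#if 0
static void
fill_tune_features (unsigned char features[X86_TUNE_LAST],
		    const unsigned int masks[X86_TUNE_LAST],
		    enum processor_type tune)
{
  unsigned int tune_mask = 1u << (unsigned int) tune;
  unsigned int i;

  for (i = 0; i < X86_TUNE_LAST; i++)
    features[i] = (masks[i] & tune_mask) != 0;
}
#endif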
2182
2183 /* Feature tests against the various architecture variations. */
2184 unsigned char ix86_arch_features[X86_ARCH_LAST];
2185
2186 /* Feature tests against the various architecture variations, used to create
2187 ix86_arch_features based on the processor mask. */
2188 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2189 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
2190 ~(m_386 | m_486 | m_PENT | m_K6),
2191
2192 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2193 ~m_386,
2194
2195 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2196 ~(m_386 | m_486),
2197
2198 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2199 ~m_386,
2200
2201 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2202 ~m_386,
2203 };
2204
2205 static const unsigned int x86_accumulate_outgoing_args
2206 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2207
2208 static const unsigned int x86_arch_always_fancy_math_387
2209 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2210
2211 static const unsigned int x86_avx256_split_unaligned_load
2212 = m_COREI7 | m_GENERIC;
2213
2214 static const unsigned int x86_avx256_split_unaligned_store
2215 = m_COREI7 | m_BDVER | m_GENERIC;
2216
2217 /* If the average insn count for a single function invocation is
2218 lower than this constant, emit fast (but longer) prologue and
2219 epilogue code. */
2220 #define FAST_PROLOGUE_INSN_COUNT 20
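/* Illustrative sketch (not part of GCC, not compiled): the threshold above is
   meant to be compared against an estimated insn count when deciding how to
   lay out the prologue/epilogue; the helper name and its caller are
   hypothetical.  */
#if 0
static bool
prefer_fast_prologue_epilogue_p (int estimated_insn_count)
{
  /* Short functions amortize a long prologue/epilogue poorly, so use the
     fast (but larger) variant below the threshold.  */
  return estimated_insn_count < FAST_PROLOGUE_INSN_COUNT;
}
#endif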
2221
2222 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2223 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2224 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2225 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2226
2227 /* Array of the smallest class containing reg number REGNO, indexed by
2228 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2229
2230 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2231 {
2232 /* ax, dx, cx, bx */
2233 AREG, DREG, CREG, BREG,
2234 /* si, di, bp, sp */
2235 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2236 /* FP registers */
2237 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2238 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2239 /* arg pointer */
2240 NON_Q_REGS,
2241 /* flags, fpsr, fpcr, frame */
2242 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2243 /* SSE registers */
2244 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2245 SSE_REGS, SSE_REGS,
2246 /* MMX registers */
2247 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2248 MMX_REGS, MMX_REGS,
2249 /* REX registers */
2250 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2252 /* SSE REX registers */
2253 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2254 SSE_REGS, SSE_REGS,
2255 };
2256
2257 /* The "default" register map used in 32bit mode. */
2258
2259 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2260 {
2261 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2262 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2263 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2264 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2265 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2266 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2267 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2268 };
2269
2270 /* The "default" register map used in 64bit mode. */
2271
2272 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2273 {
2274 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2275 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2276 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2277 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2278 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2279 8,9,10,11,12,13,14,15, /* extended integer registers */
2280 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2281 };
2282
2283 /* Define the register numbers to be used in Dwarf debugging information.
2284 The SVR4 reference port C compiler uses the following register numbers
2285 in its Dwarf output code:
2286 0 for %eax (gcc regno = 0)
2287 1 for %ecx (gcc regno = 2)
2288 2 for %edx (gcc regno = 1)
2289 3 for %ebx (gcc regno = 3)
2290 4 for %esp (gcc regno = 7)
2291 5 for %ebp (gcc regno = 6)
2292 6 for %esi (gcc regno = 4)
2293 7 for %edi (gcc regno = 5)
2294 The following three DWARF register numbers are never generated by
2295 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2296 believes these numbers have these meanings.
2297 8 for %eip (no gcc equivalent)
2298 9 for %eflags (gcc regno = 17)
2299 10 for %trapno (no gcc equivalent)
2300 It is not at all clear how we should number the FP stack registers
2301 for the x86 architecture. If the version of SDB on x86/svr4 were
2302 a bit less brain dead with respect to floating-point then we would
2303 have a precedent to follow with respect to DWARF register numbers
2304 for x86 FP registers, but the SDB on x86/svr4 is so completely
2305 broken with respect to FP registers that it is hardly worth thinking
2306 of it as something to strive for compatibility with.
2307 The version of x86/svr4 SDB I have at the moment does (partially)
2308 seem to believe that DWARF register number 11 is associated with
2309 the x86 register %st(0), but that's about all. Higher DWARF
2310 register numbers don't seem to be associated with anything in
2311 particular, and even for DWARF regno 11, SDB only seems to under-
2312 stand that it should say that a variable lives in %st(0) (when
2313 asked via an `=' command) if we said it was in DWARF regno 11,
2314 but SDB still prints garbage when asked for the value of the
2315 variable in question (via a `/' command).
2316 (Also note that the labels SDB prints for various FP stack regs
2317 when doing an `x' command are all wrong.)
2318 Note that these problems generally don't affect the native SVR4
2319 C compiler because it doesn't allow the use of -O with -g and
2320 because when it is *not* optimizing, it allocates a memory
2321 location for each floating-point variable, and the memory
2322 location is what gets described in the DWARF AT_location
2323 attribute for the variable in question.
2324 Regardless of the severe mental illness of the x86/svr4 SDB, we
2325 do something sensible here and we use the following DWARF
2326 register numbers. Note that these are all stack-top-relative
2327 numbers.
2328 11 for %st(0) (gcc regno = 8)
2329 12 for %st(1) (gcc regno = 9)
2330 13 for %st(2) (gcc regno = 10)
2331 14 for %st(3) (gcc regno = 11)
2332 15 for %st(4) (gcc regno = 12)
2333 16 for %st(5) (gcc regno = 13)
2334 17 for %st(6) (gcc regno = 14)
2335 18 for %st(7) (gcc regno = 15)
2336 */
2337 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2338 {
2339 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2340 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2341 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2342 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2343 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2344 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2345 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2346 };
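/* Illustrative sketch (not part of GCC, not compiled): the register maps above
   are indexed by the gcc hard register number and yield the debugger/DWARF
   register number, with -1 meaning "no stable encoding".  For example, under
   the SVR4 numbering %edx (gcc regno 1) maps to DWARF regno 2 and %st(0)
   (gcc regno 8) maps to DWARF regno 11.  The helper name is hypothetical.  */
#if 0
static int
svr4_dwarf_regno (int gcc_regno)
{
  if (gcc_regno < 0 || gcc_regno >= FIRST_PSEUDO_REGISTER)
    return -1;			/* out of range: no DWARF number */
  return svr4_dbx_register_map[gcc_regno];
}
#endif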
2347
2348 /* Define parameter passing and return registers. */
2349
2350 static int const x86_64_int_parameter_registers[6] =
2351 {
2352 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2353 };
2354
2355 static int const x86_64_ms_abi_int_parameter_registers[4] =
2356 {
2357 CX_REG, DX_REG, R8_REG, R9_REG
2358 };
2359
2360 static int const x86_64_int_return_registers[4] =
2361 {
2362 AX_REG, DX_REG, DI_REG, SI_REG
2363 };
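/* Illustrative sketch (not part of GCC, not compiled): the tables above list
   the integer argument registers in positional order, so the register for the
   Nth (zero-based) integer argument is a plain array lookup.  For example, the
   third integer argument lands in %rdx under the SysV ABI but in %r8 under the
   MS ABI.  The helper name is hypothetical; -1 stands for "passed on the
   stack".  */
#if 0
static int
nth_int_arg_regno (enum calling_abi abi, int n)
{
  if (abi == MS_ABI)
    return n < 4 ? x86_64_ms_abi_int_parameter_registers[n] : -1;
  return n < 6 ? x86_64_int_parameter_registers[n] : -1;
}
#endif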
2364
2365 /* Define the structure for the machine field in struct function. */
2366
2367 struct GTY(()) stack_local_entry {
2368 unsigned short mode;
2369 unsigned short n;
2370 rtx rtl;
2371 struct stack_local_entry *next;
2372 };
2373
2374 /* Structure describing stack frame layout.
2375 Stack grows downward:
2376
2377 [arguments]
2378 <- ARG_POINTER
2379 saved pc
2380
2381 saved static chain if ix86_static_chain_on_stack
2382
2383 saved frame pointer if frame_pointer_needed
2384 <- HARD_FRAME_POINTER
2385 [saved regs]
2386 <- regs_save_offset
2387 [padding0]
2388
2389 [saved SSE regs]
2390 <- sse_regs_save_offset
2391 [padding1] |
2392 | <- FRAME_POINTER
2393 [va_arg registers] |
2394 |
2395 [frame] |
2396 |
2397 [padding2] | = to_allocate
2398 <- STACK_POINTER
2399 */
2400 struct ix86_frame
2401 {
2402 int nsseregs;
2403 int nregs;
2404 int va_arg_size;
2405 int red_zone_size;
2406 int outgoing_arguments_size;
2407 HOST_WIDE_INT frame;
2408
2409 /* The offsets relative to ARG_POINTER. */
2410 HOST_WIDE_INT frame_pointer_offset;
2411 HOST_WIDE_INT hard_frame_pointer_offset;
2412 HOST_WIDE_INT stack_pointer_offset;
2413 HOST_WIDE_INT hfp_save_offset;
2414 HOST_WIDE_INT reg_save_offset;
2415 HOST_WIDE_INT sse_reg_save_offset;
2416
2417 /* When save_regs_using_mov is set, emit prologue using
2418 move instead of push instructions. */
2419 bool save_regs_using_mov;
2420 };
2421
2422 /* Which cpu are we scheduling for. */
2423 enum attr_cpu ix86_schedule;
2424
2425 /* Which cpu are we optimizing for. */
2426 enum processor_type ix86_tune;
2427
2428 /* Which instruction set architecture to use. */
2429 enum processor_type ix86_arch;
2430
2431 /* True if the SSE prefetch instruction is not a NOP. */
2432 int x86_prefetch_sse;
2433
2434 /* -mstackrealign option */
2435 static const char ix86_force_align_arg_pointer_string[]
2436 = "force_align_arg_pointer";
2437
2438 static rtx (*ix86_gen_leave) (void);
2439 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2440 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2441 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2442 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2443 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2445 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2446 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2447 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2448 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2449 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2450
2451 /* Preferred alignment for stack boundary in bits. */
2452 unsigned int ix86_preferred_stack_boundary;
2453
2454 /* Alignment for incoming stack boundary in bits specified at
2455 command line. */
2456 static unsigned int ix86_user_incoming_stack_boundary;
2457
2458 /* Default alignment for incoming stack boundary in bits. */
2459 static unsigned int ix86_default_incoming_stack_boundary;
2460
2461 /* Alignment for incoming stack boundary in bits. */
2462 unsigned int ix86_incoming_stack_boundary;
2463
2464 /* Calling-ABI-specific va_list type nodes. */
2465 static GTY(()) tree sysv_va_list_type_node;
2466 static GTY(()) tree ms_va_list_type_node;
2467
2468 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2469 char internal_label_prefix[16];
2470 int internal_label_prefix_len;
2471
2472 /* Fence to use after loop using movnt. */
2473 tree x86_mfence;
2474
2475 /* Register classes used for passing a given 64bit part of the argument.
2476 These represent classes as documented by the psABI, with the exception of
2477 the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2478 uses an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2479
2480 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2481 whenever possible (the upper half then contains only padding). */
2482 enum x86_64_reg_class
2483 {
2484 X86_64_NO_CLASS,
2485 X86_64_INTEGER_CLASS,
2486 X86_64_INTEGERSI_CLASS,
2487 X86_64_SSE_CLASS,
2488 X86_64_SSESF_CLASS,
2489 X86_64_SSEDF_CLASS,
2490 X86_64_SSEUP_CLASS,
2491 X86_64_X87_CLASS,
2492 X86_64_X87UP_CLASS,
2493 X86_64_COMPLEX_X87_CLASS,
2494 X86_64_MEMORY_CLASS
2495 };
2496
2497 #define MAX_CLASSES 4
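/* Illustrative example (not part of GCC, not compiled): the psABI classifies
   an argument in 64bit chunks, so no value ever needs more than MAX_CLASSES
   class slots.  Under the SysV x86-64 ABI a `struct { double d; long l; }'
   would be classified roughly as below; gcc refines the plain SSE class of
   the first chunk to X86_64_SSEDF_CLASS so it can be moved in DFmode.  */
#if 0
static const enum x86_64_reg_class example_classes[MAX_CLASSES] =
{
  X86_64_SSEDF_CLASS,		/* bytes 0-7: the double, goes in an SSE reg */
  X86_64_INTEGER_CLASS,		/* bytes 8-15: the long, goes in a GPR */
  X86_64_NO_CLASS,
  X86_64_NO_CLASS
};
#endif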
2498
2499 /* Table of constants used by fldpi, fldln2, etc.... */
2500 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2501 static bool ext_80387_constants_init = 0;
2502
2503 \f
2504 static struct machine_function * ix86_init_machine_status (void);
2505 static rtx ix86_function_value (const_tree, const_tree, bool);
2506 static bool ix86_function_value_regno_p (const unsigned int);
2507 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2508 const_tree);
2509 static rtx ix86_static_chain (const_tree, bool);
2510 static int ix86_function_regparm (const_tree, const_tree);
2511 static void ix86_compute_frame_layout (struct ix86_frame *);
2512 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2513 rtx, rtx, int);
2514 static void ix86_add_new_builtins (HOST_WIDE_INT);
2515 static tree ix86_canonical_va_list_type (tree);
2516 static void predict_jump (int);
2517 static unsigned int split_stack_prologue_scratch_regno (void);
2518 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2519
2520 enum ix86_function_specific_strings
2521 {
2522 IX86_FUNCTION_SPECIFIC_ARCH,
2523 IX86_FUNCTION_SPECIFIC_TUNE,
2524 IX86_FUNCTION_SPECIFIC_MAX
2525 };
2526
2527 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2528 const char *, enum fpmath_unit, bool);
2529 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2530 static void ix86_function_specific_save (struct cl_target_option *);
2531 static void ix86_function_specific_restore (struct cl_target_option *);
2532 static void ix86_function_specific_print (FILE *, int,
2533 struct cl_target_option *);
2534 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2535 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2536 struct gcc_options *);
2537 static bool ix86_can_inline_p (tree, tree);
2538 static void ix86_set_current_function (tree);
2539 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2540
2541 static enum calling_abi ix86_function_abi (const_tree);
2542
2543 \f
2544 #ifndef SUBTARGET32_DEFAULT_CPU
2545 #define SUBTARGET32_DEFAULT_CPU "i386"
2546 #endif
2547
2548 /* The svr4 ABI for the i386 says that records and unions are returned
2549 in memory. */
2550 #ifndef DEFAULT_PCC_STRUCT_RETURN
2551 #define DEFAULT_PCC_STRUCT_RETURN 1
2552 #endif
2553
2554 /* Whether -mtune= or -march= were specified */
2555 static int ix86_tune_defaulted;
2556 static int ix86_arch_specified;
2557
2558 /* Vectorization library interface and handlers. */
2559 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2560
2561 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2562 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2563
2564 /* Processor target table, indexed by processor number */
2565 struct ptt
2566 {
2567 const struct processor_costs *cost; /* Processor costs */
2568 const int align_loop; /* Default alignments. */
2569 const int align_loop_max_skip;
2570 const int align_jump;
2571 const int align_jump_max_skip;
2572 const int align_func;
2573 };
2574
2575 static const struct ptt processor_target_table[PROCESSOR_max] =
2576 {
2577 {&i386_cost, 4, 3, 4, 3, 4},
2578 {&i486_cost, 16, 15, 16, 15, 16},
2579 {&pentium_cost, 16, 7, 16, 7, 16},
2580 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2581 {&geode_cost, 0, 0, 0, 0, 0},
2582 {&k6_cost, 32, 7, 32, 7, 32},
2583 {&athlon_cost, 16, 7, 16, 7, 16},
2584 {&pentium4_cost, 0, 0, 0, 0, 0},
2585 {&k8_cost, 16, 7, 16, 7, 16},
2586 {&nocona_cost, 0, 0, 0, 0, 0},
2587 /* Core 2 32-bit. */
2588 {&generic32_cost, 16, 10, 16, 10, 16},
2589 /* Core 2 64-bit. */
2590 {&generic64_cost, 16, 10, 16, 10, 16},
2591 /* Core i7 32-bit. */
2592 {&generic32_cost, 16, 10, 16, 10, 16},
2593 /* Core i7 64-bit. */
2594 {&generic64_cost, 16, 10, 16, 10, 16},
2595 {&generic32_cost, 16, 7, 16, 7, 16},
2596 {&generic64_cost, 16, 10, 16, 10, 16},
2597 {&amdfam10_cost, 32, 24, 32, 7, 32},
2598 {&bdver1_cost, 32, 24, 32, 7, 32},
2599 {&bdver2_cost, 32, 24, 32, 7, 32},
2600 {&btver1_cost, 32, 24, 32, 7, 32},
2601 {&atom_cost, 16, 15, 16, 7, 16}
2602 };
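/* Illustrative sketch (not part of GCC, not compiled): the table above is
   indexed by the PROCESSOR_* value chosen for tuning, so the per-CPU alignment
   defaults are plain field lookups, roughly as below; the helper name is
   hypothetical.  */
#if 0
static int
default_loop_alignment (enum processor_type tune)
{
  return processor_target_table[tune].align_loop;
}
#endif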
2603
2604 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2605 {
2606 "generic",
2607 "i386",
2608 "i486",
2609 "pentium",
2610 "pentium-mmx",
2611 "pentiumpro",
2612 "pentium2",
2613 "pentium3",
2614 "pentium4",
2615 "pentium-m",
2616 "prescott",
2617 "nocona",
2618 "core2",
2619 "corei7",
2620 "atom",
2621 "geode",
2622 "k6",
2623 "k6-2",
2624 "k6-3",
2625 "athlon",
2626 "athlon-4",
2627 "k8",
2628 "amdfam10",
2629 "bdver1",
2630 "bdver2",
2631 "btver1"
2632 };
2633 \f
2634 /* Return true if a red-zone is in use. */
2635
2636 static inline bool
2637 ix86_using_red_zone (void)
2638 {
2639 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2640 }
2641 \f
2642 /* Return a string that documents the current -m options. The caller is
2643 responsible for freeing the string. */
2644
2645 static char *
2646 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2647 const char *tune, enum fpmath_unit fpmath,
2648 bool add_nl_p)
2649 {
2650 struct ix86_target_opts
2651 {
2652 const char *option; /* option string */
2653 HOST_WIDE_INT mask; /* isa mask options */
2654 };
2655
2656 /* This table is ordered so that options like -msse4.2, which imply
2657 preceding options, will match those first. */
2658 static struct ix86_target_opts isa_opts[] =
2659 {
2660 { "-m64", OPTION_MASK_ISA_64BIT },
2661 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2662 { "-mfma", OPTION_MASK_ISA_FMA },
2663 { "-mxop", OPTION_MASK_ISA_XOP },
2664 { "-mlwp", OPTION_MASK_ISA_LWP },
2665 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2666 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2667 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2668 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2669 { "-msse3", OPTION_MASK_ISA_SSE3 },
2670 { "-msse2", OPTION_MASK_ISA_SSE2 },
2671 { "-msse", OPTION_MASK_ISA_SSE },
2672 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2673 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2674 { "-mmmx", OPTION_MASK_ISA_MMX },
2675 { "-mabm", OPTION_MASK_ISA_ABM },
2676 { "-mbmi", OPTION_MASK_ISA_BMI },
2677 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2678 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2679 { "-mtbm", OPTION_MASK_ISA_TBM },
2680 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2681 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2682 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2683 { "-maes", OPTION_MASK_ISA_AES },
2684 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2685 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2686 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2687 { "-mf16c", OPTION_MASK_ISA_F16C },
2688 { "-mrtm", OPTION_MASK_ISA_RTM },
2689 };
2690
2691 /* Flag options. */
2692 static struct ix86_target_opts flag_opts[] =
2693 {
2694 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2695 { "-m80387", MASK_80387 },
2696 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2697 { "-malign-double", MASK_ALIGN_DOUBLE },
2698 { "-mcld", MASK_CLD },
2699 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2700 { "-mieee-fp", MASK_IEEE_FP },
2701 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2702 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2703 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2704 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2705 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2706 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2707 { "-mno-red-zone", MASK_NO_RED_ZONE },
2708 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2709 { "-mrecip", MASK_RECIP },
2710 { "-mrtd", MASK_RTD },
2711 { "-msseregparm", MASK_SSEREGPARM },
2712 { "-mstack-arg-probe", MASK_STACK_PROBE },
2713 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2714 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2715 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2716 { "-mvzeroupper", MASK_VZEROUPPER },
2717 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2718 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2719 { "-mprefer-avx128", MASK_PREFER_AVX128},
2720 };
2721
2722 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2723
2724 char isa_other[40];
2725 char target_other[40];
2726 unsigned num = 0;
2727 unsigned i, j;
2728 char *ret;
2729 char *ptr;
2730 size_t len;
2731 size_t line_len;
2732 size_t sep_len;
2733
2734 memset (opts, '\0', sizeof (opts));
2735
2736 /* Add -march= option. */
2737 if (arch)
2738 {
2739 opts[num][0] = "-march=";
2740 opts[num++][1] = arch;
2741 }
2742
2743 /* Add -mtune= option. */
2744 if (tune)
2745 {
2746 opts[num][0] = "-mtune=";
2747 opts[num++][1] = tune;
2748 }
2749
2750 /* Pick out the ISA options that are set. */
2751 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2752 {
2753 if ((isa & isa_opts[i].mask) != 0)
2754 {
2755 opts[num++][0] = isa_opts[i].option;
2756 isa &= ~ isa_opts[i].mask;
2757 }
2758 }
2759
2760 if (isa && add_nl_p)
2761 {
2762 opts[num++][0] = isa_other;
2763 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2764 isa);
2765 }
2766
2767 /* Add flag options. */
2768 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2769 {
2770 if ((flags & flag_opts[i].mask) != 0)
2771 {
2772 opts[num++][0] = flag_opts[i].option;
2773 flags &= ~ flag_opts[i].mask;
2774 }
2775 }
2776
2777 if (flags && add_nl_p)
2778 {
2779 opts[num++][0] = target_other;
2780 sprintf (target_other, "(other flags: %#x)", flags);
2781 }
2782
2783 /* Add -fpmath= option. */
2784 if (fpmath)
2785 {
2786 opts[num][0] = "-mfpmath=";
2787 switch ((int) fpmath)
2788 {
2789 case FPMATH_387:
2790 opts[num++][1] = "387";
2791 break;
2792
2793 case FPMATH_SSE:
2794 opts[num++][1] = "sse";
2795 break;
2796
2797 case FPMATH_387 | FPMATH_SSE:
2798 opts[num++][1] = "sse+387";
2799 break;
2800
2801 default:
2802 gcc_unreachable ();
2803 }
2804 }
2805
2806 /* Any options? */
2807 if (num == 0)
2808 return NULL;
2809
2810 gcc_assert (num < ARRAY_SIZE (opts));
2811
2812 /* Size the string. */
2813 len = 0;
2814 sep_len = (add_nl_p) ? 3 : 1;
2815 for (i = 0; i < num; i++)
2816 {
2817 len += sep_len;
2818 for (j = 0; j < 2; j++)
2819 if (opts[i][j])
2820 len += strlen (opts[i][j]);
2821 }
2822
2823 /* Build the string. */
2824 ret = ptr = (char *) xmalloc (len);
2825 line_len = 0;
2826
2827 for (i = 0; i < num; i++)
2828 {
2829 size_t len2[2];
2830
2831 for (j = 0; j < 2; j++)
2832 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2833
2834 if (i != 0)
2835 {
2836 *ptr++ = ' ';
2837 line_len++;
2838
2839 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2840 {
2841 *ptr++ = '\\';
2842 *ptr++ = '\n';
2843 line_len = 0;
2844 }
2845 }
2846
2847 for (j = 0; j < 2; j++)
2848 if (opts[i][j])
2849 {
2850 memcpy (ptr, opts[i][j], len2[j]);
2851 ptr += len2[j];
2852 line_len += len2[j];
2853 }
2854 }
2855
2856 *ptr = '\0';
2857 gcc_assert (ret + len >= ptr);
2858
2859 return ret;
2860 }
2861
2862 /* Return true if profiling code should be emitted before the
2863 prologue; otherwise return false. For x86, this is the case when
2864 -mfentry is in effect (needed for "hotfix" style patching). */
2865 static bool
2866 ix86_profile_before_prologue (void)
2867 {
2868 return flag_fentry != 0;
2869 }
2870
2871 /* Function that is callable from the debugger to print the current
2872 options. */
2873 void
2874 ix86_debug_options (void)
2875 {
2876 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2877 ix86_arch_string, ix86_tune_string,
2878 ix86_fpmath, true);
2879
2880 if (opts)
2881 {
2882 fprintf (stderr, "%s\n\n", opts);
2883 free (opts);
2884 }
2885 else
2886 fputs ("<no options>\n\n", stderr);
2887
2888 return;
2889 }
2890 \f
2891 /* Override various settings based on options. If MAIN_ARGS_P, the
2892 options are from the command line, otherwise they are from
2893 attributes. */
2894
2895 static void
2896 ix86_option_override_internal (bool main_args_p)
2897 {
2898 int i;
2899 unsigned int ix86_arch_mask, ix86_tune_mask;
2900 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2901 const char *prefix;
2902 const char *suffix;
2903 const char *sw;
2904
2905 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2906 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2907 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2908 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2909 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2910 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2911 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2912 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2913 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2914 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2915 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2916 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2917 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2918 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2919 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2920 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2921 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2922 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2923 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2924 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2925 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2926 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2927 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2928 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2929 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2930 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2931 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2932 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2933 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2934 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2935 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2936 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2937 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2938 /* If this reaches 64, we need to widen the struct pta flags field below. */
2939
2940 static struct pta
2941 {
2942 const char *const name; /* processor name or nickname. */
2943 const enum processor_type processor;
2944 const enum attr_cpu schedule;
2945 const unsigned HOST_WIDE_INT flags;
2946 }
2947 const processor_alias_table[] =
2948 {
2949 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2950 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2951 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2952 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2953 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2954 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2955 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2956 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2957 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2958 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2959 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2960 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2961 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2962 PTA_MMX | PTA_SSE},
2963 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2964 PTA_MMX | PTA_SSE},
2965 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2966 PTA_MMX | PTA_SSE | PTA_SSE2},
2967 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2968 PTA_MMX |PTA_SSE | PTA_SSE2},
2969 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2970 PTA_MMX | PTA_SSE | PTA_SSE2},
2971 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2972 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2973 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2974 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2975 | PTA_CX16 | PTA_NO_SAHF},
2976 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2977 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2978 | PTA_SSSE3 | PTA_CX16},
2979 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2980 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2981 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
2982 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2983 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2984 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2985 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2986 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2987 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2988 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2989 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2990 | PTA_RDRND | PTA_F16C},
2991 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2992 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2993 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2994 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2995 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2996 | PTA_FMA | PTA_MOVBE | PTA_RTM},
2997 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2998 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2999 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
3000 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3001 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A |PTA_PREFETCH_SSE},
3002 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3003 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3004 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3005 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3006 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3007 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3008 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3009 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3010 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3011 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3012 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3013 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3014 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3015 {"x86-64", PROCESSOR_K8, CPU_K8,
3016 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3017 {"k8", PROCESSOR_K8, CPU_K8,
3018 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3019 | PTA_SSE2 | PTA_NO_SAHF},
3020 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3021 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3022 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3023 {"opteron", PROCESSOR_K8, CPU_K8,
3024 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3025 | PTA_SSE2 | PTA_NO_SAHF},
3026 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3027 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3028 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3029 {"athlon64", PROCESSOR_K8, CPU_K8,
3030 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3031 | PTA_SSE2 | PTA_NO_SAHF},
3032 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3033 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3034 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3035 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3036 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3037 | PTA_SSE2 | PTA_NO_SAHF},
3038 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3039 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3040 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3041 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3042 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3043 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3044 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3045 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3046 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3047 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3048 | PTA_XOP | PTA_LWP},
3049 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3050 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3051 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3052 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3053 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3054 | PTA_FMA},
3055 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3056 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3057 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16},
3058 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3059 0 /* flags are only used for -march switch. */ },
3060 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3061 PTA_64BIT /* flags are only used for -march switch. */ },
3062 };
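      /* Illustrative sketch (not compiled): each alias-table entry carries
	 PTA_* bits describing the ISA extensions implied by that -march=
	 value; turning them into ISA option masks is a series of bit tests,
	 roughly as below for a matching entry i.  This is a simplified
	 version of what happens later in this function (the real code also
	 respects options the user set explicitly); only two bits are shown.  */
#if 0
      if (processor_alias_table[i].flags & PTA_MMX)
	ix86_isa_flags |= OPTION_MASK_ISA_MMX;
      if (processor_alias_table[i].flags & PTA_SSE2)
	ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
#endif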
3063
3064 /* -mrecip options. */
3065 static struct
3066 {
3067 const char *string; /* option name */
3068 unsigned int mask; /* mask bits to set */
3069 }
3070 const recip_options[] =
3071 {
3072 { "all", RECIP_MASK_ALL },
3073 { "none", RECIP_MASK_NONE },
3074 { "div", RECIP_MASK_DIV },
3075 { "sqrt", RECIP_MASK_SQRT },
3076 { "vec-div", RECIP_MASK_VEC_DIV },
3077 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3078 };
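      /* Illustrative sketch (not compiled): a single -mrecip= keyword maps to
	 its mask bits through the table above; the real parsing later in this
	 function also handles comma-separated lists and negated keywords.  */
#if 0
      {
	unsigned int mask = 0;
	unsigned int k;

	for (k = 0; k < ARRAY_SIZE (recip_options); k++)
	  if (!strcmp ("vec-sqrt", recip_options[k].string))
	    mask |= recip_options[k].mask;
	/* MASK now holds RECIP_MASK_VEC_SQRT.  */
      }
#endif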
3079
3080 int const pta_size = ARRAY_SIZE (processor_alias_table);
3081
3082 /* Set up prefix/suffix so the error messages refer to either the command
3083 line argument, or the attribute(target). */
3084 if (main_args_p)
3085 {
3086 prefix = "-m";
3087 suffix = "";
3088 sw = "switch";
3089 }
3090 else
3091 {
3092 prefix = "option(\"";
3093 suffix = "\")";
3094 sw = "attribute";
3095 }
3096
3097 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3098 SUBTARGET_OVERRIDE_OPTIONS;
3099 #endif
3100
3101 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3102 SUBSUBTARGET_OVERRIDE_OPTIONS;
3103 #endif
3104
3105 if (TARGET_X32)
3106 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3107
3108 /* -fPIC is the default for x86_64. */
3109 if (TARGET_MACHO && TARGET_64BIT)
3110 flag_pic = 2;
3111
3112 /* Need to check -mtune=generic first. */
3113 if (ix86_tune_string)
3114 {
3115 if (!strcmp (ix86_tune_string, "generic")
3116 || !strcmp (ix86_tune_string, "i686")
3117 	  /* As special support for cross compilers we read -mtune=native
3118 	     as -mtune=generic.  With native compilers we won't see
3119 	     -mtune=native, as it was changed by the driver.  */
3120 || !strcmp (ix86_tune_string, "native"))
3121 {
3122 if (TARGET_64BIT)
3123 ix86_tune_string = "generic64";
3124 else
3125 ix86_tune_string = "generic32";
3126 }
3127 /* If this call is for setting the option attribute, allow the
3128 generic32/generic64 that was previously set. */
3129 else if (!main_args_p
3130 && (!strcmp (ix86_tune_string, "generic32")
3131 || !strcmp (ix86_tune_string, "generic64")))
3132 ;
3133 else if (!strncmp (ix86_tune_string, "generic", 7))
3134 error ("bad value (%s) for %stune=%s %s",
3135 ix86_tune_string, prefix, suffix, sw);
3136 else if (!strcmp (ix86_tune_string, "x86-64"))
3137 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3138 "%stune=k8%s or %stune=generic%s instead as appropriate",
3139 prefix, suffix, prefix, suffix, prefix, suffix);
3140 }
3141 else
3142 {
3143 if (ix86_arch_string)
3144 ix86_tune_string = ix86_arch_string;
3145 if (!ix86_tune_string)
3146 {
3147 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3148 ix86_tune_defaulted = 1;
3149 }
3150
3151 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3152 need to use a sensible tune option. */
3153 if (!strcmp (ix86_tune_string, "generic")
3154 || !strcmp (ix86_tune_string, "x86-64")
3155 || !strcmp (ix86_tune_string, "i686"))
3156 {
3157 if (TARGET_64BIT)
3158 ix86_tune_string = "generic64";
3159 else
3160 ix86_tune_string = "generic32";
3161 }
3162 }
3163
3164 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3165 {
3166 /* rep; movq isn't available in 32-bit code. */
3167 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3168 ix86_stringop_alg = no_stringop;
3169 }
3170
3171 if (!ix86_arch_string)
3172 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3173 else
3174 ix86_arch_specified = 1;
3175
3176 if (global_options_set.x_ix86_pmode)
3177 {
3178 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3179 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3180 error ("address mode %qs not supported in the %s bit mode",
3181 TARGET_64BIT ? "short" : "long",
3182 TARGET_64BIT ? "64" : "32");
3183 }
3184 else
3185 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3186
3187 if (!global_options_set.x_ix86_abi)
3188 ix86_abi = DEFAULT_ABI;
3189
3190 if (global_options_set.x_ix86_cmodel)
3191 {
3192 switch (ix86_cmodel)
3193 {
3194 case CM_SMALL:
3195 case CM_SMALL_PIC:
3196 if (flag_pic)
3197 ix86_cmodel = CM_SMALL_PIC;
3198 if (!TARGET_64BIT)
3199 error ("code model %qs not supported in the %s bit mode",
3200 "small", "32");
3201 break;
3202
3203 case CM_MEDIUM:
3204 case CM_MEDIUM_PIC:
3205 if (flag_pic)
3206 ix86_cmodel = CM_MEDIUM_PIC;
3207 if (!TARGET_64BIT)
3208 error ("code model %qs not supported in the %s bit mode",
3209 "medium", "32");
3210 else if (TARGET_X32)
3211 error ("code model %qs not supported in x32 mode",
3212 "medium");
3213 break;
3214
3215 case CM_LARGE:
3216 case CM_LARGE_PIC:
3217 if (flag_pic)
3218 ix86_cmodel = CM_LARGE_PIC;
3219 if (!TARGET_64BIT)
3220 error ("code model %qs not supported in the %s bit mode",
3221 "large", "32");
3222 else if (TARGET_X32)
3223 error ("code model %qs not supported in x32 mode",
3224 	    	     "large");
3225 break;
3226
3227 case CM_32:
3228 if (flag_pic)
3229 error ("code model %s does not support PIC mode", "32");
3230 if (TARGET_64BIT)
3231 error ("code model %qs not supported in the %s bit mode",
3232 "32", "64");
3233 break;
3234
3235 case CM_KERNEL:
3236 if (flag_pic)
3237 {
3238 error ("code model %s does not support PIC mode", "kernel");
3239 ix86_cmodel = CM_32;
3240 }
3241 if (!TARGET_64BIT)
3242 error ("code model %qs not supported in the %s bit mode",
3243 "kernel", "32");
3244 break;
3245
3246 default:
3247 gcc_unreachable ();
3248 }
3249 }
3250 else
3251 {
3252 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3253 use of rip-relative addressing. This eliminates fixups that
3254 would otherwise be needed if this object is to be placed in a
3255 DLL, and is essentially just as efficient as direct addressing. */
3256 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3257 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3258 else if (TARGET_64BIT)
3259 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3260 else
3261 ix86_cmodel = CM_32;
3262 }
3263 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3264 {
3265 error ("-masm=intel not supported in this configuration");
3266 ix86_asm_dialect = ASM_ATT;
3267 }
3268 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3269 sorry ("%i-bit mode not compiled in",
3270 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3271
3272 for (i = 0; i < pta_size; i++)
3273 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3274 {
3275 ix86_schedule = processor_alias_table[i].schedule;
3276 ix86_arch = processor_alias_table[i].processor;
3277 /* Default cpu tuning to the architecture. */
3278 ix86_tune = ix86_arch;
3279
3280 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3281 error ("CPU you selected does not support x86-64 "
3282 "instruction set");
3283
3284 if (processor_alias_table[i].flags & PTA_MMX
3285 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3286 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3287 if (processor_alias_table[i].flags & PTA_3DNOW
3288 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3289 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3290 if (processor_alias_table[i].flags & PTA_3DNOW_A
3291 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3292 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3293 if (processor_alias_table[i].flags & PTA_SSE
3294 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3295 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3296 if (processor_alias_table[i].flags & PTA_SSE2
3297 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3298 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3299 if (processor_alias_table[i].flags & PTA_SSE3
3300 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3301 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3302 if (processor_alias_table[i].flags & PTA_SSSE3
3303 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3304 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3305 if (processor_alias_table[i].flags & PTA_SSE4_1
3306 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3307 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3308 if (processor_alias_table[i].flags & PTA_SSE4_2
3309 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3310 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3311 if (processor_alias_table[i].flags & PTA_AVX
3312 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3313 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3314 if (processor_alias_table[i].flags & PTA_AVX2
3315 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3316 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3317 if (processor_alias_table[i].flags & PTA_FMA
3318 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3319 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3320 if (processor_alias_table[i].flags & PTA_SSE4A
3321 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3322 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3323 if (processor_alias_table[i].flags & PTA_FMA4
3324 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3325 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3326 if (processor_alias_table[i].flags & PTA_XOP
3327 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3328 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3329 if (processor_alias_table[i].flags & PTA_LWP
3330 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3331 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3332 if (processor_alias_table[i].flags & PTA_ABM
3333 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3334 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3335 if (processor_alias_table[i].flags & PTA_BMI
3336 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3337 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3338 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3339 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3340 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3341 if (processor_alias_table[i].flags & PTA_TBM
3342 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3343 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3344 if (processor_alias_table[i].flags & PTA_BMI2
3345 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3346 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3347 if (processor_alias_table[i].flags & PTA_CX16
3348 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3349 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3350 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3351 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3352 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3353 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3354 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3355 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3356 if (processor_alias_table[i].flags & PTA_MOVBE
3357 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3358 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3359 if (processor_alias_table[i].flags & PTA_AES
3360 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3361 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3362 if (processor_alias_table[i].flags & PTA_PCLMUL
3363 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3364 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3365 if (processor_alias_table[i].flags & PTA_FSGSBASE
3366 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3367 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3368 if (processor_alias_table[i].flags & PTA_RDRND
3369 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3370 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3371 if (processor_alias_table[i].flags & PTA_F16C
3372 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3373 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3374 if (processor_alias_table[i].flags & PTA_RTM
3375 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3376 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3377 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3378 x86_prefetch_sse = true;
3379
3380 break;
3381 }
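  /* Added illustration (a sketch, not part of the original sources): with
     the table walk above, an option such as -march=bdver1 turns on every
     ISA bit named in that entry's PTA_* mask, roughly

       ix86_isa_flags |= OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_XOP | ...;

     but only for bits the user did not set or clear explicitly, so
     -march=bdver1 -mno-avx leaves OPTION_MASK_ISA_AVX off because that bit
     is already recorded in ix86_isa_flags_explicit.  */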
3382
3383 if (!strcmp (ix86_arch_string, "generic"))
3384 error ("generic CPU can be used only for %stune=%s %s",
3385 prefix, suffix, sw);
3386 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3387 error ("bad value (%s) for %sarch=%s %s",
3388 ix86_arch_string, prefix, suffix, sw);
3389
3390 ix86_arch_mask = 1u << ix86_arch;
3391 for (i = 0; i < X86_ARCH_LAST; ++i)
3392 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3393
3394 for (i = 0; i < pta_size; i++)
3395 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3396 {
3397 ix86_schedule = processor_alias_table[i].schedule;
3398 ix86_tune = processor_alias_table[i].processor;
3399 if (TARGET_64BIT)
3400 {
3401 if (!(processor_alias_table[i].flags & PTA_64BIT))
3402 {
3403 if (ix86_tune_defaulted)
3404 {
3405 ix86_tune_string = "x86-64";
3406 for (i = 0; i < pta_size; i++)
3407 if (! strcmp (ix86_tune_string,
3408 processor_alias_table[i].name))
3409 break;
3410 ix86_schedule = processor_alias_table[i].schedule;
3411 ix86_tune = processor_alias_table[i].processor;
3412 }
3413 else
3414 error ("CPU you selected does not support x86-64 "
3415 "instruction set");
3416 }
3417 }
3418 else
3419 {
3420 /* Adjust tuning when compiling for 32-bit ABI. */
3421 switch (ix86_tune)
3422 {
3423 case PROCESSOR_GENERIC64:
3424 ix86_tune = PROCESSOR_GENERIC32;
3425 ix86_schedule = CPU_PENTIUMPRO;
3426 break;
3427
3428 case PROCESSOR_CORE2_64:
3429 ix86_tune = PROCESSOR_CORE2_32;
3430 break;
3431
3432 case PROCESSOR_COREI7_64:
3433 ix86_tune = PROCESSOR_COREI7_32;
3434 break;
3435
3436 default:
3437 break;
3438 }
3439 }
3440 /* Intel CPUs have always interpreted SSE prefetch instructions as
3441 NOPs; so, we can enable SSE prefetch instructions even when
3442 -mtune (rather than -march) points us to a processor that has them.
3443 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3444 higher processors. */
3445 if (TARGET_CMOVE
3446 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3447 x86_prefetch_sse = true;
3448 break;
3449 }
3450
3451 if (ix86_tune_specified && i == pta_size)
3452 error ("bad value (%s) for %stune=%s %s",
3453 ix86_tune_string, prefix, suffix, sw);
3454
3455 ix86_tune_mask = 1u << ix86_tune;
3456 for (i = 0; i < X86_TUNE_LAST; ++i)
3457 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3458
3459 #ifndef USE_IX86_FRAME_POINTER
3460 #define USE_IX86_FRAME_POINTER 0
3461 #endif
3462
3463 #ifndef USE_X86_64_FRAME_POINTER
3464 #define USE_X86_64_FRAME_POINTER 0
3465 #endif
3466
3467 /* Set the default values for switches whose default depends on TARGET_64BIT
3468 in case they weren't overwritten by command line options. */
3469 if (TARGET_64BIT)
3470 {
3471 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3472 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3473 if (flag_asynchronous_unwind_tables == 2)
3474 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3475 if (flag_pcc_struct_return == 2)
3476 flag_pcc_struct_return = 0;
3477 }
3478 else
3479 {
3480 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3481 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3482 if (flag_asynchronous_unwind_tables == 2)
3483 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3484 if (flag_pcc_struct_return == 2)
3485 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3486 }
3487
3488 if (optimize_size)
3489 ix86_cost = &ix86_size_cost;
3490 else
3491 ix86_cost = processor_target_table[ix86_tune].cost;
3492
3493 /* Arrange to set up i386_stack_locals for all functions. */
3494 init_machine_status = ix86_init_machine_status;
3495
3496 /* Validate -mregparm= value. */
3497 if (global_options_set.x_ix86_regparm)
3498 {
3499 if (TARGET_64BIT)
3500 warning (0, "-mregparm is ignored in 64-bit mode");
3501 if (ix86_regparm > REGPARM_MAX)
3502 {
3503 error ("-mregparm=%d is not between 0 and %d",
3504 ix86_regparm, REGPARM_MAX);
3505 ix86_regparm = 0;
3506 }
3507 }
3508 if (TARGET_64BIT)
3509 ix86_regparm = REGPARM_MAX;
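  /* Added usage note (hedged): in 32-bit code -mregparm=N passes the first
     N integral arguments in registers (conventionally EAX, EDX and ECX for
     the REGPARM_MAX value of 3 checked above); larger values were rejected
     above, and in 64-bit mode the option is only warned about and then
     overridden here.  */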
3510
3511 /* Default align_* from the processor table. */
3512 if (align_loops == 0)
3513 {
3514 align_loops = processor_target_table[ix86_tune].align_loop;
3515 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3516 }
3517 if (align_jumps == 0)
3518 {
3519 align_jumps = processor_target_table[ix86_tune].align_jump;
3520 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3521 }
3522 if (align_functions == 0)
3523 {
3524 align_functions = processor_target_table[ix86_tune].align_func;
3525 }
3526
3527 /* Provide default for -mbranch-cost= value. */
3528 if (!global_options_set.x_ix86_branch_cost)
3529 ix86_branch_cost = ix86_cost->branch_cost;
3530
3531 if (TARGET_64BIT)
3532 {
3533 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3534
3535 /* Enable by default the SSE and MMX builtins. Do allow the user to
3536 explicitly disable any of these. In particular, disabling SSE and
3537 MMX for kernel code is extremely useful. */
3538 if (!ix86_arch_specified)
3539 ix86_isa_flags
3540 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3541 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3542
3543 if (TARGET_RTD)
3544 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3545 }
3546 else
3547 {
3548 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3549
3550 if (!ix86_arch_specified)
3551 ix86_isa_flags
3552 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3553
3554       /* The i386 ABI does not specify a red zone.  It still makes sense to
3555 	 use it when the programmer takes care not to destroy the stack.  */
3556 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3557 target_flags |= MASK_NO_RED_ZONE;
3558 }
3559
3560 /* Keep nonleaf frame pointers. */
3561 if (flag_omit_frame_pointer)
3562 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3563 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3564 flag_omit_frame_pointer = 1;
3565
3566 /* If we're doing fast math, we don't care about comparison order
3567 wrt NaNs. This lets us use a shorter comparison sequence. */
3568 if (flag_finite_math_only)
3569 target_flags &= ~MASK_IEEE_FP;
3570
3571 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3572 since the insns won't need emulation. */
3573 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3574 target_flags &= ~MASK_NO_FANCY_MATH_387;
3575
3576 /* Likewise, if the target doesn't have a 387, or we've specified
3577 software floating point, don't use 387 inline intrinsics. */
3578 if (!TARGET_80387)
3579 target_flags |= MASK_NO_FANCY_MATH_387;
3580
3581 /* Turn on MMX builtins for -msse. */
3582 if (TARGET_SSE)
3583 {
3584 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3585 x86_prefetch_sse = true;
3586 }
3587
3588 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3589 if (TARGET_SSE4_2 || TARGET_ABM)
3590 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3591
3592 /* Turn on lzcnt instruction for -mabm. */
3593 if (TARGET_ABM)
3594 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3595
3596 /* Validate -mpreferred-stack-boundary= value or default it to
3597 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3598 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3599 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3600 {
3601 int min = (TARGET_64BIT ? 4 : 2);
3602 int max = (TARGET_SEH ? 4 : 12);
3603
3604 if (ix86_preferred_stack_boundary_arg < min
3605 || ix86_preferred_stack_boundary_arg > max)
3606 {
3607 if (min == max)
3608 error ("-mpreferred-stack-boundary is not supported "
3609 "for this target");
3610 else
3611 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3612 ix86_preferred_stack_boundary_arg, min, max);
3613 }
3614 else
3615 ix86_preferred_stack_boundary
3616 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3617 }
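  /* Worked example (illustrative): the argument is a power of two, so
     -mpreferred-stack-boundary=4 yields (1 << 4) * BITS_PER_UNIT = 128 bits,
     the usual 16-byte stack alignment; the 64-bit minimum of 4 therefore
     means 16 bytes, while the 32-bit minimum of 2 allows 4-byte alignment.  */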
3618
3619 /* Set the default value for -mstackrealign. */
3620 if (ix86_force_align_arg_pointer == -1)
3621 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3622
3623 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3624
3625 /* Validate -mincoming-stack-boundary= value or default it to
3626 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3627 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3628 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3629 {
3630 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3631 || ix86_incoming_stack_boundary_arg > 12)
3632 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3633 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3634 else
3635 {
3636 ix86_user_incoming_stack_boundary
3637 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3638 ix86_incoming_stack_boundary
3639 = ix86_user_incoming_stack_boundary;
3640 }
3641 }
3642
3643 /* Accept -msseregparm only if at least SSE support is enabled. */
3644 if (TARGET_SSEREGPARM
3645 && ! TARGET_SSE)
3646 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3647
3648 if (global_options_set.x_ix86_fpmath)
3649 {
3650 if (ix86_fpmath & FPMATH_SSE)
3651 {
3652 if (!TARGET_SSE)
3653 {
3654 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3655 ix86_fpmath = FPMATH_387;
3656 }
3657 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3658 {
3659 warning (0, "387 instruction set disabled, using SSE arithmetics");
3660 ix86_fpmath = FPMATH_SSE;
3661 }
3662 }
3663 }
3664 else
3665 ix86_fpmath = TARGET_FPMATH_DEFAULT;
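  /* Added example (hedged): asking for -mfpmath=sse while SSE is disabled,
     e.g. -mno-sse -mfpmath=sse, triggers the first warning above and falls
     back to FPMATH_387; likewise -mfpmath=sse,387 together with -mno-80387
     drops the 387 half and keeps plain FPMATH_SSE.  */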
3666
3667 /* If the i387 is disabled, then do not return values in it. */
3668 if (!TARGET_80387)
3669 target_flags &= ~MASK_FLOAT_RETURNS;
3670
3671   /* Use an external vectorized library when vectorizing intrinsic calls.  */
3672 if (global_options_set.x_ix86_veclibabi_type)
3673 switch (ix86_veclibabi_type)
3674 {
3675 case ix86_veclibabi_type_svml:
3676 ix86_veclib_handler = ix86_veclibabi_svml;
3677 break;
3678
3679 case ix86_veclibabi_type_acml:
3680 ix86_veclib_handler = ix86_veclibabi_acml;
3681 break;
3682
3683 default:
3684 gcc_unreachable ();
3685 }
3686
3687 if ((!USE_IX86_FRAME_POINTER
3688 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3689 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3690 && !optimize_size)
3691 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3692
3693 /* ??? Unwind info is not correct around the CFG unless either a frame
3694 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3695 unwind info generation to be aware of the CFG and propagating states
3696 around edges. */
3697 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3698 || flag_exceptions || flag_non_call_exceptions)
3699 && flag_omit_frame_pointer
3700 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3701 {
3702 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3703 warning (0, "unwind tables currently require either a frame pointer "
3704 "or %saccumulate-outgoing-args%s for correctness",
3705 prefix, suffix);
3706 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3707 }
3708
3709 /* If stack probes are required, the space used for large function
3710 arguments on the stack must also be probed, so enable
3711 -maccumulate-outgoing-args so this happens in the prologue. */
3712 if (TARGET_STACK_PROBE
3713 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3714 {
3715 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3716 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3717 "for correctness", prefix, suffix);
3718 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3719 }
3720
3721   /* For sane SSE instruction set generation we need the fcomi instruction.
3722      It is safe to enable all CMOVE instructions.  Also, the RDRAND intrinsic
3723      expands to a sequence that includes a conditional move.  */
3724 if (TARGET_SSE || TARGET_RDRND)
3725 TARGET_CMOVE = 1;
3726
3727 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3728 {
3729 char *p;
3730 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3731 p = strchr (internal_label_prefix, 'X');
3732 internal_label_prefix_len = p - internal_label_prefix;
3733 *p = '\0';
3734 }
3735
3736   /* When a scheduling description is not available, disable the scheduler
3737      pass so it won't slow down the compilation and make x87 code slower.  */
3738 if (!TARGET_SCHEDULE)
3739 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3740
3741 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3742 ix86_cost->simultaneous_prefetches,
3743 global_options.x_param_values,
3744 global_options_set.x_param_values);
3745 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3746 global_options.x_param_values,
3747 global_options_set.x_param_values);
3748 maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3749 global_options.x_param_values,
3750 global_options_set.x_param_values);
3751 maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3752 global_options.x_param_values,
3753 global_options_set.x_param_values);
3754
3755   /* Enable software prefetching at -O3 for CPUs where prefetching is helpful.  */
3756 if (flag_prefetch_loop_arrays < 0
3757 && HAVE_prefetch
3758 && optimize >= 3
3759 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3760 flag_prefetch_loop_arrays = 1;
3761
3762 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3763 can be optimized to ap = __builtin_next_arg (0). */
3764 if (!TARGET_64BIT && !flag_split_stack)
3765 targetm.expand_builtin_va_start = NULL;
3766
3767 if (TARGET_64BIT)
3768 {
3769 ix86_gen_leave = gen_leave_rex64;
3770 if (Pmode == DImode)
3771 {
3772 ix86_gen_monitor = gen_sse3_monitor64_di;
3773 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3774 ix86_gen_tls_local_dynamic_base_64
3775 = gen_tls_local_dynamic_base_64_di;
3776 }
3777 else
3778 {
3779 ix86_gen_monitor = gen_sse3_monitor64_si;
3780 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3781 ix86_gen_tls_local_dynamic_base_64
3782 = gen_tls_local_dynamic_base_64_si;
3783 }
3784 }
3785 else
3786 {
3787 ix86_gen_leave = gen_leave;
3788 ix86_gen_monitor = gen_sse3_monitor;
3789 }
3790
3791 if (Pmode == DImode)
3792 {
3793 ix86_gen_add3 = gen_adddi3;
3794 ix86_gen_sub3 = gen_subdi3;
3795 ix86_gen_sub3_carry = gen_subdi3_carry;
3796 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3797 ix86_gen_andsp = gen_anddi3;
3798 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3799 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3800 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3801 }
3802 else
3803 {
3804 ix86_gen_add3 = gen_addsi3;
3805 ix86_gen_sub3 = gen_subsi3;
3806 ix86_gen_sub3_carry = gen_subsi3_carry;
3807 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3808 ix86_gen_andsp = gen_andsi3;
3809 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3810 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3811 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3812 }
3813
3814 #ifdef USE_IX86_CLD
3815 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3816 if (!TARGET_64BIT)
3817 target_flags |= MASK_CLD & ~target_flags_explicit;
3818 #endif
3819
3820 if (!TARGET_64BIT && flag_pic)
3821 {
3822 if (flag_fentry > 0)
3823 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3824 "with -fpic");
3825 flag_fentry = 0;
3826 }
3827 else if (TARGET_SEH)
3828 {
3829 if (flag_fentry == 0)
3830 sorry ("-mno-fentry isn%'t compatible with SEH");
3831 flag_fentry = 1;
3832 }
3833 else if (flag_fentry < 0)
3834 {
3835 #if defined(PROFILE_BEFORE_PROLOGUE)
3836 flag_fentry = 1;
3837 #else
3838 flag_fentry = 0;
3839 #endif
3840 }
3841
3842 if (TARGET_AVX)
3843 {
3844       /* When not optimizing for size, enable the vzeroupper optimization for
3845 	 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3846 	 AVX unaligned loads/stores.  */
3847 if (!optimize_size)
3848 {
3849 if (flag_expensive_optimizations
3850 && !(target_flags_explicit & MASK_VZEROUPPER))
3851 target_flags |= MASK_VZEROUPPER;
3852 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3853 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3854 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3855 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3856 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3857 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3858 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3859 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3860 target_flags |= MASK_PREFER_AVX128;
3861 }
3862 }
3863 else
3864 {
3865 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3866 target_flags &= ~MASK_VZEROUPPER;
3867 }
3868
3869 if (ix86_recip_name)
3870 {
3871 char *p = ASTRDUP (ix86_recip_name);
3872 char *q;
3873 unsigned int mask, i;
3874 bool invert;
3875
3876 while ((q = strtok (p, ",")) != NULL)
3877 {
3878 p = NULL;
3879 if (*q == '!')
3880 {
3881 invert = true;
3882 q++;
3883 }
3884 else
3885 invert = false;
3886
3887 if (!strcmp (q, "default"))
3888 mask = RECIP_MASK_ALL;
3889 else
3890 {
3891 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3892 if (!strcmp (q, recip_options[i].string))
3893 {
3894 mask = recip_options[i].mask;
3895 break;
3896 }
3897
3898 if (i == ARRAY_SIZE (recip_options))
3899 {
3900 error ("unknown option for -mrecip=%s", q);
3901 invert = false;
3902 mask = RECIP_MASK_NONE;
3903 }
3904 }
3905
3906 recip_mask_explicit |= mask;
3907 if (invert)
3908 recip_mask &= ~mask;
3909 else
3910 recip_mask |= mask;
3911 }
3912 }
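  /* Added illustration (not in the original sources): given the comma
     parsing above, a string such as -mrecip=all,!sqrt,!vec-sqrt first sets
     every RECIP_MASK_* bit through the "all" entry and then clears the
     scalar and vector square-root bits, roughly

       recip_mask |= RECIP_MASK_ALL;
       recip_mask &= ~(RECIP_MASK_SQRT | RECIP_MASK_VEC_SQRT);

     while recip_mask_explicit records every bit that was named either way.  */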
3913
3914 if (TARGET_RECIP)
3915 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3916 else if (target_flags_explicit & MASK_RECIP)
3917 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3918
3919 /* Save the initial options in case the user does function specific
3920 options. */
3921 if (main_args_p)
3922 target_option_default_node = target_option_current_node
3923 = build_target_option_node ();
3924 }
3925
3926 /* Return TRUE if VAL is passed in register with 256bit AVX modes. */
3927
3928 static bool
3929 function_pass_avx256_p (const_rtx val)
3930 {
3931 if (!val)
3932 return false;
3933
3934 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3935 return true;
3936
3937 if (GET_CODE (val) == PARALLEL)
3938 {
3939 int i;
3940 rtx r;
3941
3942 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3943 {
3944 r = XVECEXP (val, 0, i);
3945 if (GET_CODE (r) == EXPR_LIST
3946 && XEXP (r, 0)
3947 && REG_P (XEXP (r, 0))
3948 && (GET_MODE (XEXP (r, 0)) == OImode
3949 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
3950 return true;
3951 }
3952 }
3953
3954 return false;
3955 }
3956
3957 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3958
3959 static void
3960 ix86_option_override (void)
3961 {
3962 ix86_option_override_internal (true);
3963 }
3964
3965 /* Update register usage after having seen the compiler flags. */
3966
3967 static void
3968 ix86_conditional_register_usage (void)
3969 {
3970 int i;
3971 unsigned int j;
3972
3973 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3974 {
3975 if (fixed_regs[i] > 1)
3976 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
3977 if (call_used_regs[i] > 1)
3978 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
3979 }
3980
3981 /* The PIC register, if it exists, is fixed. */
3982 j = PIC_OFFSET_TABLE_REGNUM;
3983 if (j != INVALID_REGNUM)
3984 fixed_regs[j] = call_used_regs[j] = 1;
3985
3986 /* The 64-bit MS_ABI changes the set of call-used registers. */
3987 if (TARGET_64BIT_MS_ABI)
3988 {
3989 call_used_regs[SI_REG] = 0;
3990 call_used_regs[DI_REG] = 0;
3991 call_used_regs[XMM6_REG] = 0;
3992 call_used_regs[XMM7_REG] = 0;
3993 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3994 call_used_regs[i] = 0;
3995 }
3996
3997 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
3998 other call-clobbered regs for 64-bit. */
3999 if (TARGET_64BIT)
4000 {
4001 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4002
4003 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4004 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4005 && call_used_regs[i])
4006 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4007 }
4008
4009 /* If MMX is disabled, squash the registers. */
4010 if (! TARGET_MMX)
4011 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4012 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4013 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4014
4015 /* If SSE is disabled, squash the registers. */
4016 if (! TARGET_SSE)
4017 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4018 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4019 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4020
4021 /* If the FPU is disabled, squash the registers. */
4022 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4023 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4024 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4025 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4026
4027 /* If 32-bit, squash the 64-bit registers. */
4028 if (! TARGET_64BIT)
4029 {
4030 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4031 reg_names[i] = "";
4032 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4033 reg_names[i] = "";
4034 }
4035 }
4036
4037 \f
4038 /* Save the current options */
4039
4040 static void
4041 ix86_function_specific_save (struct cl_target_option *ptr)
4042 {
4043 ptr->arch = ix86_arch;
4044 ptr->schedule = ix86_schedule;
4045 ptr->tune = ix86_tune;
4046 ptr->branch_cost = ix86_branch_cost;
4047 ptr->tune_defaulted = ix86_tune_defaulted;
4048 ptr->arch_specified = ix86_arch_specified;
4049 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4050 ptr->ix86_target_flags_explicit = target_flags_explicit;
4051 ptr->x_recip_mask_explicit = recip_mask_explicit;
4052
4053 /* The fields are char but the variables are not; make sure the
4054 values fit in the fields. */
4055 gcc_assert (ptr->arch == ix86_arch);
4056 gcc_assert (ptr->schedule == ix86_schedule);
4057 gcc_assert (ptr->tune == ix86_tune);
4058 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4059 }
4060
4061 /* Restore the current options */
4062
4063 static void
4064 ix86_function_specific_restore (struct cl_target_option *ptr)
4065 {
4066 enum processor_type old_tune = ix86_tune;
4067 enum processor_type old_arch = ix86_arch;
4068 unsigned int ix86_arch_mask, ix86_tune_mask;
4069 int i;
4070
4071 ix86_arch = (enum processor_type) ptr->arch;
4072 ix86_schedule = (enum attr_cpu) ptr->schedule;
4073 ix86_tune = (enum processor_type) ptr->tune;
4074 ix86_branch_cost = ptr->branch_cost;
4075 ix86_tune_defaulted = ptr->tune_defaulted;
4076 ix86_arch_specified = ptr->arch_specified;
4077 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4078 target_flags_explicit = ptr->ix86_target_flags_explicit;
4079 recip_mask_explicit = ptr->x_recip_mask_explicit;
4080
4081 /* Recreate the arch feature tests if the arch changed */
4082 if (old_arch != ix86_arch)
4083 {
4084 ix86_arch_mask = 1u << ix86_arch;
4085 for (i = 0; i < X86_ARCH_LAST; ++i)
4086 ix86_arch_features[i]
4087 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4088 }
4089
4090 /* Recreate the tune optimization tests */
4091 if (old_tune != ix86_tune)
4092 {
4093 ix86_tune_mask = 1u << ix86_tune;
4094 for (i = 0; i < X86_TUNE_LAST; ++i)
4095 ix86_tune_features[i]
4096 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4097 }
4098 }
4099
4100 /* Print the current options */
4101
4102 static void
4103 ix86_function_specific_print (FILE *file, int indent,
4104 struct cl_target_option *ptr)
4105 {
4106 char *target_string
4107 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4108 NULL, NULL, ptr->x_ix86_fpmath, false);
4109
4110 fprintf (file, "%*sarch = %d (%s)\n",
4111 indent, "",
4112 ptr->arch,
4113 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4114 ? cpu_names[ptr->arch]
4115 : "<unknown>"));
4116
4117 fprintf (file, "%*stune = %d (%s)\n",
4118 indent, "",
4119 ptr->tune,
4120 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4121 ? cpu_names[ptr->tune]
4122 : "<unknown>"));
4123
4124 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4125
4126 if (target_string)
4127 {
4128 fprintf (file, "%*s%s\n", indent, "", target_string);
4129 free (target_string);
4130 }
4131 }
4132
4133 \f
4134 /* Inner function to process the attribute((target(...))): take an argument and
4135    set the current options from the argument.  If we have a list, recursively go
4136    over the list.  */
4137
4138 static bool
4139 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4140 struct gcc_options *enum_opts_set)
4141 {
4142 char *next_optstr;
4143 bool ret = true;
4144
4145 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4146 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4147 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4148 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4149 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4150
4151 enum ix86_opt_type
4152 {
4153 ix86_opt_unknown,
4154 ix86_opt_yes,
4155 ix86_opt_no,
4156 ix86_opt_str,
4157 ix86_opt_enum,
4158 ix86_opt_isa
4159 };
4160
4161 static const struct
4162 {
4163 const char *string;
4164 size_t len;
4165 enum ix86_opt_type type;
4166 int opt;
4167 int mask;
4168 } attrs[] = {
4169 /* isa options */
4170 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4171 IX86_ATTR_ISA ("abm", OPT_mabm),
4172 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4173 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4174 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4175 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4176 IX86_ATTR_ISA ("aes", OPT_maes),
4177 IX86_ATTR_ISA ("avx", OPT_mavx),
4178 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4179 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4180 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4181 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4182 IX86_ATTR_ISA ("sse", OPT_msse),
4183 IX86_ATTR_ISA ("sse2", OPT_msse2),
4184 IX86_ATTR_ISA ("sse3", OPT_msse3),
4185 IX86_ATTR_ISA ("sse4", OPT_msse4),
4186 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4187 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4188 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4189 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4190 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4191 IX86_ATTR_ISA ("fma", OPT_mfma),
4192 IX86_ATTR_ISA ("xop", OPT_mxop),
4193 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4194 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4195 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4196 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4197 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4198
4199 /* enum options */
4200 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4201
4202 /* string options */
4203 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4204 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4205
4206 /* flag options */
4207 IX86_ATTR_YES ("cld",
4208 OPT_mcld,
4209 MASK_CLD),
4210
4211 IX86_ATTR_NO ("fancy-math-387",
4212 OPT_mfancy_math_387,
4213 MASK_NO_FANCY_MATH_387),
4214
4215 IX86_ATTR_YES ("ieee-fp",
4216 OPT_mieee_fp,
4217 MASK_IEEE_FP),
4218
4219 IX86_ATTR_YES ("inline-all-stringops",
4220 OPT_minline_all_stringops,
4221 MASK_INLINE_ALL_STRINGOPS),
4222
4223 IX86_ATTR_YES ("inline-stringops-dynamically",
4224 OPT_minline_stringops_dynamically,
4225 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4226
4227 IX86_ATTR_NO ("align-stringops",
4228 OPT_mno_align_stringops,
4229 MASK_NO_ALIGN_STRINGOPS),
4230
4231 IX86_ATTR_YES ("recip",
4232 OPT_mrecip,
4233 MASK_RECIP),
4234
4235 };
4236
4237 /* If this is a list, recurse to get the options. */
4238 if (TREE_CODE (args) == TREE_LIST)
4239 {
4240 bool ret = true;
4241
4242 for (; args; args = TREE_CHAIN (args))
4243 if (TREE_VALUE (args)
4244 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4245 p_strings, enum_opts_set))
4246 ret = false;
4247
4248 return ret;
4249 }
4250
4251 else if (TREE_CODE (args) != STRING_CST)
4252 gcc_unreachable ();
4253
4254 /* Handle multiple arguments separated by commas. */
4255 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4256
4257 while (next_optstr && *next_optstr != '\0')
4258 {
4259 char *p = next_optstr;
4260 char *orig_p = p;
4261 char *comma = strchr (next_optstr, ',');
4262 const char *opt_string;
4263 size_t len, opt_len;
4264 int opt;
4265 bool opt_set_p;
4266 char ch;
4267 unsigned i;
4268 enum ix86_opt_type type = ix86_opt_unknown;
4269 int mask = 0;
4270
4271 if (comma)
4272 {
4273 *comma = '\0';
4274 len = comma - next_optstr;
4275 next_optstr = comma + 1;
4276 }
4277 else
4278 {
4279 len = strlen (p);
4280 next_optstr = NULL;
4281 }
4282
4283 /* Recognize no-xxx. */
4284 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4285 {
4286 opt_set_p = false;
4287 p += 3;
4288 len -= 3;
4289 }
4290 else
4291 opt_set_p = true;
4292
4293 /* Find the option. */
4294 ch = *p;
4295 opt = N_OPTS;
4296 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4297 {
4298 type = attrs[i].type;
4299 opt_len = attrs[i].len;
4300 if (ch == attrs[i].string[0]
4301 && ((type != ix86_opt_str && type != ix86_opt_enum)
4302 ? len == opt_len
4303 : len > opt_len)
4304 && memcmp (p, attrs[i].string, opt_len) == 0)
4305 {
4306 opt = attrs[i].opt;
4307 mask = attrs[i].mask;
4308 opt_string = attrs[i].string;
4309 break;
4310 }
4311 }
4312
4313 /* Process the option. */
4314 if (opt == N_OPTS)
4315 {
4316 error ("attribute(target(\"%s\")) is unknown", orig_p);
4317 ret = false;
4318 }
4319
4320 else if (type == ix86_opt_isa)
4321 {
4322 struct cl_decoded_option decoded;
4323
4324 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4325 ix86_handle_option (&global_options, &global_options_set,
4326 &decoded, input_location);
4327 }
4328
4329 else if (type == ix86_opt_yes || type == ix86_opt_no)
4330 {
4331 if (type == ix86_opt_no)
4332 opt_set_p = !opt_set_p;
4333
4334 if (opt_set_p)
4335 target_flags |= mask;
4336 else
4337 target_flags &= ~mask;
4338 }
4339
4340 else if (type == ix86_opt_str)
4341 {
4342 if (p_strings[opt])
4343 {
4344 error ("option(\"%s\") was already specified", opt_string);
4345 ret = false;
4346 }
4347 else
4348 p_strings[opt] = xstrdup (p + opt_len);
4349 }
4350
4351 else if (type == ix86_opt_enum)
4352 {
4353 bool arg_ok;
4354 int value;
4355
4356 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4357 if (arg_ok)
4358 set_option (&global_options, enum_opts_set, opt, value,
4359 p + opt_len, DK_UNSPECIFIED, input_location,
4360 global_dc);
4361 else
4362 {
4363 error ("attribute(target(\"%s\")) is unknown", orig_p);
4364 ret = false;
4365 }
4366 }
4367
4368 else
4369 gcc_unreachable ();
4370 }
4371
4372 return ret;
4373 }
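/* Added usage sketch (hedged, not part of the original file): the strings
   recognized by the parser above are the same ones a user attaches to a
   function, for instance

     __attribute__((target("avx,fpmath=sse")))
     void scale (float *x, int n);

   where "avx" is matched as an ix86_opt_isa entry, "fpmath=" as an
   ix86_opt_enum entry, and a leading "no-" flips the sense of flag options
   such as "no-ieee-fp".  */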
4374
4375 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4376
4377 tree
4378 ix86_valid_target_attribute_tree (tree args)
4379 {
4380 const char *orig_arch_string = ix86_arch_string;
4381 const char *orig_tune_string = ix86_tune_string;
4382 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4383 int orig_tune_defaulted = ix86_tune_defaulted;
4384 int orig_arch_specified = ix86_arch_specified;
4385 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4386 tree t = NULL_TREE;
4387 int i;
4388 struct cl_target_option *def
4389 = TREE_TARGET_OPTION (target_option_default_node);
4390 struct gcc_options enum_opts_set;
4391
4392 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4393
4394 /* Process each of the options on the chain. */
4395 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4396 &enum_opts_set))
4397 return NULL_TREE;
4398
4399 /* If the changed options are different from the default, rerun
4400 ix86_option_override_internal, and then save the options away.
4401      The string options are attribute options, and will be undone
4402 when we copy the save structure. */
4403 if (ix86_isa_flags != def->x_ix86_isa_flags
4404 || target_flags != def->x_target_flags
4405 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4406 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4407 || enum_opts_set.x_ix86_fpmath)
4408 {
4409 /* If we are using the default tune= or arch=, undo the string assigned,
4410 and use the default. */
4411 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4412 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4413 else if (!orig_arch_specified)
4414 ix86_arch_string = NULL;
4415
4416 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4417 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4418 else if (orig_tune_defaulted)
4419 ix86_tune_string = NULL;
4420
4421 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4422 if (enum_opts_set.x_ix86_fpmath)
4423 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4424 else if (!TARGET_64BIT && TARGET_SSE)
4425 {
4426 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4427 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4428 }
4429
4430 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4431 ix86_option_override_internal (false);
4432
4433 /* Add any builtin functions with the new isa if any. */
4434 ix86_add_new_builtins (ix86_isa_flags);
4435
4436 /* Save the current options unless we are validating options for
4437 #pragma. */
4438 t = build_target_option_node ();
4439
4440 ix86_arch_string = orig_arch_string;
4441 ix86_tune_string = orig_tune_string;
4442 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4443
4444 /* Free up memory allocated to hold the strings */
4445 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4446 free (option_strings[i]);
4447 }
4448
4449 return t;
4450 }
4451
4452 /* Hook to validate attribute((target("string"))). */
4453
4454 static bool
4455 ix86_valid_target_attribute_p (tree fndecl,
4456 tree ARG_UNUSED (name),
4457 tree args,
4458 int ARG_UNUSED (flags))
4459 {
4460 struct cl_target_option cur_target;
4461 bool ret = true;
4462 tree old_optimize = build_optimization_node ();
4463 tree new_target, new_optimize;
4464 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4465
4466 /* If the function changed the optimization levels as well as setting target
4467 options, start with the optimizations specified. */
4468 if (func_optimize && func_optimize != old_optimize)
4469 cl_optimization_restore (&global_options,
4470 TREE_OPTIMIZATION (func_optimize));
4471
4472 /* The target attributes may also change some optimization flags, so update
4473 the optimization options if necessary. */
4474 cl_target_option_save (&cur_target, &global_options);
4475 new_target = ix86_valid_target_attribute_tree (args);
4476 new_optimize = build_optimization_node ();
4477
4478 if (!new_target)
4479 ret = false;
4480
4481 else if (fndecl)
4482 {
4483 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4484
4485 if (old_optimize != new_optimize)
4486 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4487 }
4488
4489 cl_target_option_restore (&global_options, &cur_target);
4490
4491 if (old_optimize != new_optimize)
4492 cl_optimization_restore (&global_options,
4493 TREE_OPTIMIZATION (old_optimize));
4494
4495 return ret;
4496 }
4497
4498 \f
4499 /* Hook to determine if one function can safely inline another. */
4500
4501 static bool
4502 ix86_can_inline_p (tree caller, tree callee)
4503 {
4504 bool ret = false;
4505 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4506 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4507
4508 /* If callee has no option attributes, then it is ok to inline. */
4509 if (!callee_tree)
4510 ret = true;
4511
4512 /* If caller has no option attributes, but callee does then it is not ok to
4513 inline. */
4514 else if (!caller_tree)
4515 ret = false;
4516
4517 else
4518 {
4519 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4520 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4521
4522       /* The callee's ISA options should be a subset of the caller's, i.e. an
4523 	 SSE4 function can inline an SSE2 function but an SSE2 function can't
4524 	 inline an SSE4 function.  */
4525 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4526 != callee_opts->x_ix86_isa_flags)
4527 ret = false;
4528
4529 /* See if we have the same non-isa options. */
4530 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4531 ret = false;
4532
4533 /* See if arch, tune, etc. are the same. */
4534 else if (caller_opts->arch != callee_opts->arch)
4535 ret = false;
4536
4537 else if (caller_opts->tune != callee_opts->tune)
4538 ret = false;
4539
4540 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4541 ret = false;
4542
4543 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4544 ret = false;
4545
4546 else
4547 ret = true;
4548 }
4549
4550 return ret;
4551 }
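/* Added concrete example (a hedged restatement of the checks above): with
   otherwise identical options, a caller carrying
   __attribute__((target("sse4.1"))) may inline a callee limited to "sse2",
   since the callee's ISA bits form a subset of the caller's, while inlining
   in the opposite direction is refused.  */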
4552
4553 \f
4554 /* Remember the last target of ix86_set_current_function. */
4555 static GTY(()) tree ix86_previous_fndecl;
4556
4557 /* Establish appropriate back-end context for processing the function
4558 FNDECL. The argument might be NULL to indicate processing at top
4559 level, outside of any function scope. */
4560 static void
4561 ix86_set_current_function (tree fndecl)
4562 {
4563 /* Only change the context if the function changes. This hook is called
4564 several times in the course of compiling a function, and we don't want to
4565 slow things down too much or call target_reinit when it isn't safe. */
4566 if (fndecl && fndecl != ix86_previous_fndecl)
4567 {
4568 tree old_tree = (ix86_previous_fndecl
4569 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4570 : NULL_TREE);
4571
4572 tree new_tree = (fndecl
4573 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4574 : NULL_TREE);
4575
4576 ix86_previous_fndecl = fndecl;
4577 if (old_tree == new_tree)
4578 ;
4579
4580 else if (new_tree)
4581 {
4582 cl_target_option_restore (&global_options,
4583 TREE_TARGET_OPTION (new_tree));
4584 target_reinit ();
4585 }
4586
4587 else if (old_tree)
4588 {
4589 struct cl_target_option *def
4590 = TREE_TARGET_OPTION (target_option_current_node);
4591
4592 cl_target_option_restore (&global_options, def);
4593 target_reinit ();
4594 }
4595 }
4596 }
4597
4598 \f
4599 /* Return true if this goes in large data/bss. */
4600
4601 static bool
4602 ix86_in_large_data_p (tree exp)
4603 {
4604 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4605 return false;
4606
4607 /* Functions are never large data. */
4608 if (TREE_CODE (exp) == FUNCTION_DECL)
4609 return false;
4610
4611 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4612 {
4613 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4614 if (strcmp (section, ".ldata") == 0
4615 || strcmp (section, ".lbss") == 0)
4616 return true;
4617 return false;
4618 }
4619 else
4620 {
4621 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4622
4623 /* If this is an incomplete type with size 0, then we can't put it
4624 in data because it might be too big when completed. */
4625 if (!size || size > ix86_section_threshold)
4626 return true;
4627 }
4628
4629 return false;
4630 }
4631
4632 /* Switch to the appropriate section for output of DECL.
4633 DECL is either a `VAR_DECL' node or a constant of some sort.
4634 RELOC indicates whether forming the initial value of DECL requires
4635 link-time relocations. */
4636
4637 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4638 ATTRIBUTE_UNUSED;
4639
4640 static section *
4641 x86_64_elf_select_section (tree decl, int reloc,
4642 unsigned HOST_WIDE_INT align)
4643 {
4644 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4645 && ix86_in_large_data_p (decl))
4646 {
4647 const char *sname = NULL;
4648 unsigned int flags = SECTION_WRITE;
4649 switch (categorize_decl_for_section (decl, reloc))
4650 {
4651 case SECCAT_DATA:
4652 sname = ".ldata";
4653 break;
4654 case SECCAT_DATA_REL:
4655 sname = ".ldata.rel";
4656 break;
4657 case SECCAT_DATA_REL_LOCAL:
4658 sname = ".ldata.rel.local";
4659 break;
4660 case SECCAT_DATA_REL_RO:
4661 sname = ".ldata.rel.ro";
4662 break;
4663 case SECCAT_DATA_REL_RO_LOCAL:
4664 sname = ".ldata.rel.ro.local";
4665 break;
4666 case SECCAT_BSS:
4667 sname = ".lbss";
4668 flags |= SECTION_BSS;
4669 break;
4670 case SECCAT_RODATA:
4671 case SECCAT_RODATA_MERGE_STR:
4672 case SECCAT_RODATA_MERGE_STR_INIT:
4673 case SECCAT_RODATA_MERGE_CONST:
4674 sname = ".lrodata";
4675 flags = 0;
4676 break;
4677 case SECCAT_SRODATA:
4678 case SECCAT_SDATA:
4679 case SECCAT_SBSS:
4680 gcc_unreachable ();
4681 case SECCAT_TEXT:
4682 case SECCAT_TDATA:
4683 case SECCAT_TBSS:
4684 	  /* We don't split these for the medium model.  Place them into the
4685 	     default sections and hope for the best.  */
4686 break;
4687 }
4688 if (sname)
4689 {
4690 /* We might get called with string constants, but get_named_section
4691 doesn't like them as they are not DECLs. Also, we need to set
4692 flags in that case. */
4693 if (!DECL_P (decl))
4694 return get_section (sname, flags, NULL);
4695 return get_named_section (decl, sname, reloc);
4696 }
4697 }
4698 return default_elf_select_section (decl, reloc, align);
4699 }
4700
4701 /* Build up a unique section name, expressed as a
4702 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4703 RELOC indicates whether the initial value of EXP requires
4704 link-time relocations. */
4705
4706 static void ATTRIBUTE_UNUSED
4707 x86_64_elf_unique_section (tree decl, int reloc)
4708 {
4709 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4710 && ix86_in_large_data_p (decl))
4711 {
4712 const char *prefix = NULL;
4713 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4714 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4715
4716 switch (categorize_decl_for_section (decl, reloc))
4717 {
4718 case SECCAT_DATA:
4719 case SECCAT_DATA_REL:
4720 case SECCAT_DATA_REL_LOCAL:
4721 case SECCAT_DATA_REL_RO:
4722 case SECCAT_DATA_REL_RO_LOCAL:
4723 prefix = one_only ? ".ld" : ".ldata";
4724 break;
4725 case SECCAT_BSS:
4726 prefix = one_only ? ".lb" : ".lbss";
4727 break;
4728 case SECCAT_RODATA:
4729 case SECCAT_RODATA_MERGE_STR:
4730 case SECCAT_RODATA_MERGE_STR_INIT:
4731 case SECCAT_RODATA_MERGE_CONST:
4732 prefix = one_only ? ".lr" : ".lrodata";
4733 break;
4734 case SECCAT_SRODATA:
4735 case SECCAT_SDATA:
4736 case SECCAT_SBSS:
4737 gcc_unreachable ();
4738 case SECCAT_TEXT:
4739 case SECCAT_TDATA:
4740 case SECCAT_TBSS:
4741 	  /* We don't split these for the medium model.  Place them into the
4742 	     default sections and hope for the best.  */
4743 break;
4744 }
4745 if (prefix)
4746 {
4747 const char *name, *linkonce;
4748 char *string;
4749
4750 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4751 name = targetm.strip_name_encoding (name);
4752
4753 /* If we're using one_only, then there needs to be a .gnu.linkonce
4754 prefix to the section name. */
4755 linkonce = one_only ? ".gnu.linkonce" : "";
4756
4757 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4758
4759 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4760 return;
4761 }
4762 }
4763 default_unique_section (decl, reloc);
4764 }
4765
4766 #ifdef COMMON_ASM_OP
4767 /* This says how to output assembler code to declare an
4768 uninitialized external linkage data object.
4769
4770    For the medium model on x86-64 we need to use the .largecomm directive
4771    for large objects.  */
4772 void
4773 x86_elf_aligned_common (FILE *file,
4774 const char *name, unsigned HOST_WIDE_INT size,
4775 int align)
4776 {
4777 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4778 && size > (unsigned int)ix86_section_threshold)
4779 fputs (".largecomm\t", file);
4780 else
4781 fputs (COMMON_ASM_OP, file);
4782 assemble_name (file, name);
4783 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4784 size, align / BITS_PER_UNIT);
4785 }
4786 #endif
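/* Added illustration (hedged; the symbol name is made up): under the medium
   code model an object larger than ix86_section_threshold, say a 2 MiB table
   with 32-byte alignment, would be announced roughly as

     .largecomm	big_table,2097152,32

   while smaller objects keep using the ordinary COMMON_ASM_OP form.  */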
4787
4788 /* Utility function for targets to use in implementing
4789 ASM_OUTPUT_ALIGNED_BSS. */
4790
4791 void
4792 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4793 const char *name, unsigned HOST_WIDE_INT size,
4794 int align)
4795 {
4796 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4797 && size > (unsigned int)ix86_section_threshold)
4798 switch_to_section (get_named_section (decl, ".lbss", 0));
4799 else
4800 switch_to_section (bss_section);
4801 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4802 #ifdef ASM_DECLARE_OBJECT_NAME
4803 last_assemble_variable_decl = decl;
4804 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4805 #else
4806   /* The standard thing is just to output a label for the object.  */
4807 ASM_OUTPUT_LABEL (file, name);
4808 #endif /* ASM_DECLARE_OBJECT_NAME */
4809 ASM_OUTPUT_SKIP (file, size ? size : 1);
4810 }
4811 \f
4812 /* Decide whether we must probe the stack before any space allocation
4813 on this target. It's essentially TARGET_STACK_PROBE except when
4814 -fstack-check causes the stack to be already probed differently. */
4815
4816 bool
4817 ix86_target_stack_probe (void)
4818 {
4819 /* Do not probe the stack twice if static stack checking is enabled. */
4820 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4821 return false;
4822
4823 return TARGET_STACK_PROBE;
4824 }
4825 \f
4826 /* Decide whether we can make a sibling call to a function. DECL is the
4827 declaration of the function being targeted by the call and EXP is the
4828 CALL_EXPR representing the call. */
4829
4830 static bool
4831 ix86_function_ok_for_sibcall (tree decl, tree exp)
4832 {
4833 tree type, decl_or_type;
4834 rtx a, b;
4835
4836 /* If we are generating position-independent code, we cannot sibcall
4837 optimize any indirect call, or a direct call to a global function,
4838 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4839 if (!TARGET_MACHO
4840 && !TARGET_64BIT
4841 && flag_pic
4842 && (!decl || !targetm.binds_local_p (decl)))
4843 return false;
4844
4845 /* If we need to align the outgoing stack, then sibcalling would
4846 unalign the stack, which may break the called function. */
4847 if (ix86_minimum_incoming_stack_boundary (true)
4848 < PREFERRED_STACK_BOUNDARY)
4849 return false;
4850
4851 if (decl)
4852 {
4853 decl_or_type = decl;
4854 type = TREE_TYPE (decl);
4855 }
4856 else
4857 {
4858 /* We're looking at the CALL_EXPR, we need the type of the function. */
4859 type = CALL_EXPR_FN (exp); /* pointer expression */
4860 type = TREE_TYPE (type); /* pointer type */
4861 type = TREE_TYPE (type); /* function type */
4862 decl_or_type = type;
4863 }
4864
4865 /* Check that the return value locations are the same. Like
4866 if we are returning floats on the 80387 register stack, we cannot
4867 make a sibcall from a function that doesn't return a float to a
4868 function that does or, conversely, from a function that does return
4869 a float to a function that doesn't; the necessary stack adjustment
4870 would not be executed. This is also the place we notice
4871 differences in the return value ABI. Note that it is ok for one
4872 of the functions to have void return type as long as the return
4873 value of the other is passed in a register. */
4874 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4875 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4876 cfun->decl, false);
4877 if (STACK_REG_P (a) || STACK_REG_P (b))
4878 {
4879 if (!rtx_equal_p (a, b))
4880 return false;
4881 }
4882 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4883 {
4884 /* Disable sibcall if we need to generate vzeroupper after
4885 callee returns. */
4886 if (TARGET_VZEROUPPER
4887 && cfun->machine->callee_return_avx256_p
4888 && !cfun->machine->caller_return_avx256_p)
4889 return false;
4890 }
4891 else if (!rtx_equal_p (a, b))
4892 return false;
4893
4894 if (TARGET_64BIT)
4895 {
4896 /* The SYSV ABI has more call-clobbered registers;
4897 disallow sibcalls from MS to SYSV. */
4898 if (cfun->machine->call_abi == MS_ABI
4899 && ix86_function_type_abi (type) == SYSV_ABI)
4900 return false;
4901 }
4902 else
4903 {
4904 /* If this call is indirect, we'll need to be able to use a
4905 call-clobbered register for the address of the target function.
4906 Make sure that all such registers are not used for passing
4907 parameters. Note that DLLIMPORT functions are indirect. */
4908 if (!decl
4909 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4910 {
4911 if (ix86_function_regparm (type, NULL) >= 3)
4912 {
4913 /* ??? Need to count the actual number of registers to be used,
4914 not the possible number of registers. Fix later. */
4915 return false;
4916 }
4917 }
4918 }
4919
4920 /* Otherwise okay. That also includes certain types of indirect calls. */
4921 return true;
4922 }
4923
4924 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4925 and "sseregparm" calling convention attributes;
4926 arguments as in struct attribute_spec.handler. */
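/* For illustration (declarations below are examples only), these
   attributes appear in user code roughly as
       int __attribute__((regparm (3))) f (int a, int b, int c);
       int __attribute__((fastcall)) g (int a, int b);
       int __attribute__((stdcall)) h (int a);
   and the handler below rejects invalid combinations such as fastcall
   together with regparm.  */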
4927
4928 static tree
4929 ix86_handle_cconv_attribute (tree *node, tree name,
4930 tree args,
4931 int flags ATTRIBUTE_UNUSED,
4932 bool *no_add_attrs)
4933 {
4934 if (TREE_CODE (*node) != FUNCTION_TYPE
4935 && TREE_CODE (*node) != METHOD_TYPE
4936 && TREE_CODE (*node) != FIELD_DECL
4937 && TREE_CODE (*node) != TYPE_DECL)
4938 {
4939 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4940 name);
4941 *no_add_attrs = true;
4942 return NULL_TREE;
4943 }
4944
4945 /* Can combine regparm with all attributes but fastcall and thiscall. */
4946 if (is_attribute_p ("regparm", name))
4947 {
4948 tree cst;
4949
4950 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4951 {
4952 error ("fastcall and regparm attributes are not compatible");
4953 }
4954
4955 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4956 {
4957 error ("regparam and thiscall attributes are not compatible");
4958 }
4959
4960 cst = TREE_VALUE (args);
4961 if (TREE_CODE (cst) != INTEGER_CST)
4962 {
4963 warning (OPT_Wattributes,
4964 "%qE attribute requires an integer constant argument",
4965 name);
4966 *no_add_attrs = true;
4967 }
4968 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4969 {
4970 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4971 name, REGPARM_MAX);
4972 *no_add_attrs = true;
4973 }
4974
4975 return NULL_TREE;
4976 }
4977
4978 if (TARGET_64BIT)
4979 {
4980 /* Do not warn when emulating the MS ABI. */
4981 if ((TREE_CODE (*node) != FUNCTION_TYPE
4982 && TREE_CODE (*node) != METHOD_TYPE)
4983 || ix86_function_type_abi (*node) != MS_ABI)
4984 warning (OPT_Wattributes, "%qE attribute ignored",
4985 name);
4986 *no_add_attrs = true;
4987 return NULL_TREE;
4988 }
4989
4990 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4991 if (is_attribute_p ("fastcall", name))
4992 {
4993 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4994 {
4995 error ("fastcall and cdecl attributes are not compatible");
4996 }
4997 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4998 {
4999 error ("fastcall and stdcall attributes are not compatible");
5000 }
5001 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5002 {
5003 error ("fastcall and regparm attributes are not compatible");
5004 }
5005 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5006 {
5007 error ("fastcall and thiscall attributes are not compatible");
5008 }
5009 }
5010
5011 /* Can combine stdcall with fastcall (redundant), regparm and
5012 sseregparm. */
5013 else if (is_attribute_p ("stdcall", name))
5014 {
5015 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5016 {
5017 error ("stdcall and cdecl attributes are not compatible");
5018 }
5019 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5020 {
5021 error ("stdcall and fastcall attributes are not compatible");
5022 }
5023 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5024 {
5025 error ("stdcall and thiscall attributes are not compatible");
5026 }
5027 }
5028
5029 /* Can combine cdecl with regparm and sseregparm. */
5030 else if (is_attribute_p ("cdecl", name))
5031 {
5032 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5033 {
5034 error ("stdcall and cdecl attributes are not compatible");
5035 }
5036 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5037 {
5038 error ("fastcall and cdecl attributes are not compatible");
5039 }
5040 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5041 {
5042 error ("cdecl and thiscall attributes are not compatible");
5043 }
5044 }
5045 else if (is_attribute_p ("thiscall", name))
5046 {
5047 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5048 warning (OPT_Wattributes, "%qE attribute is used for none class-method",
5049 name);
5050 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5051 {
5052 error ("stdcall and thiscall attributes are not compatible");
5053 }
5054 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5055 {
5056 error ("fastcall and thiscall attributes are not compatible");
5057 }
5058 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5059 {
5060 error ("cdecl and thiscall attributes are not compatible");
5061 }
5062 }
5063
5064 /* Can combine sseregparm with all attributes. */
5065
5066 return NULL_TREE;
5067 }
5068
5069 /* The transactional memory builtins are implicitly regparm or fastcall
5070 depending on the ABI. Override the generic do-nothing attribute that
5071 these builtins were declared with, and replace it with one of the two
5072 attributes that we expect elsewhere. */
5073
5074 static tree
5075 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5076 tree args ATTRIBUTE_UNUSED,
5077 int flags ATTRIBUTE_UNUSED,
5078 bool *no_add_attrs)
5079 {
5080 tree alt;
5081
5082 /* In no case do we want to add the placeholder attribute. */
5083 *no_add_attrs = true;
5084
5085 /* The 64-bit ABI is unchanged for transactional memory. */
5086 if (TARGET_64BIT)
5087 return NULL_TREE;
5088
5089 /* ??? Is there a better way to validate 32-bit windows? We have
5090 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5091 if (CHECK_STACK_LIMIT > 0)
5092 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5093 else
5094 {
5095 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5096 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5097 }
5098 decl_attributes (node, alt, flags);
5099
5100 return NULL_TREE;
5101 }
5102
5103 /* This function determines from TYPE the calling-convention. */
5104
5105 unsigned int
5106 ix86_get_callcvt (const_tree type)
5107 {
5108 unsigned int ret = 0;
5109 bool is_stdarg;
5110 tree attrs;
5111
5112 if (TARGET_64BIT)
5113 return IX86_CALLCVT_CDECL;
5114
5115 attrs = TYPE_ATTRIBUTES (type);
5116 if (attrs != NULL_TREE)
5117 {
5118 if (lookup_attribute ("cdecl", attrs))
5119 ret |= IX86_CALLCVT_CDECL;
5120 else if (lookup_attribute ("stdcall", attrs))
5121 ret |= IX86_CALLCVT_STDCALL;
5122 else if (lookup_attribute ("fastcall", attrs))
5123 ret |= IX86_CALLCVT_FASTCALL;
5124 else if (lookup_attribute ("thiscall", attrs))
5125 ret |= IX86_CALLCVT_THISCALL;
5126
5127 /* Regparm isn't allowed for thiscall and fastcall. */
5128 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5129 {
5130 if (lookup_attribute ("regparm", attrs))
5131 ret |= IX86_CALLCVT_REGPARM;
5132 if (lookup_attribute ("sseregparm", attrs))
5133 ret |= IX86_CALLCVT_SSEREGPARM;
5134 }
5135
5136 if (IX86_BASE_CALLCVT(ret) != 0)
5137 return ret;
5138 }
5139
5140 is_stdarg = stdarg_p (type);
5141 if (TARGET_RTD && !is_stdarg)
5142 return IX86_CALLCVT_STDCALL | ret;
5143
5144 if (ret != 0
5145 || is_stdarg
5146 || TREE_CODE (type) != METHOD_TYPE
5147 || ix86_function_type_abi (type) != MS_ABI)
5148 return IX86_CALLCVT_CDECL | ret;
5149
5150 return IX86_CALLCVT_THISCALL;
5151 }
5152
5153 /* Return 0 if the attributes for two types are incompatible, 1 if they
5154 are compatible, and 2 if they are nearly compatible (which causes a
5155 warning to be generated). */
5156
5157 static int
5158 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5159 {
5160 unsigned int ccvt1, ccvt2;
5161
5162 if (TREE_CODE (type1) != FUNCTION_TYPE
5163 && TREE_CODE (type1) != METHOD_TYPE)
5164 return 1;
5165
5166 ccvt1 = ix86_get_callcvt (type1);
5167 ccvt2 = ix86_get_callcvt (type2);
5168 if (ccvt1 != ccvt2)
5169 return 0;
5170 if (ix86_function_regparm (type1, NULL)
5171 != ix86_function_regparm (type2, NULL))
5172 return 0;
5173
5174 return 1;
5175 }
5176 \f
5177 /* Return the regparm value for a function with the indicated TYPE and DECL.
5178 DECL may be NULL when calling function indirectly
5179 or considering a libcall. */
5180
5181 static int
5182 ix86_function_regparm (const_tree type, const_tree decl)
5183 {
5184 tree attr;
5185 int regparm;
5186 unsigned int ccvt;
5187
5188 if (TARGET_64BIT)
5189 return (ix86_function_type_abi (type) == SYSV_ABI
5190 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5191 ccvt = ix86_get_callcvt (type);
5192 regparm = ix86_regparm;
5193
5194 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5195 {
5196 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5197 if (attr)
5198 {
5199 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5200 return regparm;
5201 }
5202 }
5203 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5204 return 2;
5205 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5206 return 1;
5207
5208 /* Use register calling convention for local functions when possible. */
5209 if (decl
5210 && TREE_CODE (decl) == FUNCTION_DECL
5211 && optimize
5212 && !(profile_flag && !flag_fentry))
5213 {
5214 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5215 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5216 if (i && i->local && i->can_change_signature)
5217 {
5218 int local_regparm, globals = 0, regno;
5219
5220 /* Make sure no regparm register is taken by a
5221 fixed register variable. */
5222 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5223 if (fixed_regs[local_regparm])
5224 break;
5225
5226 /* We don't want to use regparm(3) for nested functions as
5227 these use a static chain pointer in the third argument. */
5228 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5229 local_regparm = 2;
5230
5231 /* In 32-bit mode save a register for the split stack. */
5232 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5233 local_regparm = 2;
5234
5235 /* Each fixed register usage increases register pressure,
5236 so fewer registers should be used for argument passing.
5237 This functionality can be overridden by an explicit
5238 regparm value. */
5239 for (regno = 0; regno <= DI_REG; regno++)
5240 if (fixed_regs[regno])
5241 globals++;
5242
5243 local_regparm
5244 = globals < local_regparm ? local_regparm - globals : 0;
5245
5246 if (local_regparm > regparm)
5247 regparm = local_regparm;
5248 }
5249 }
5250
5251 return regparm;
5252 }
5253
5254 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5255 DFmode (2) arguments in SSE registers for a function with the
5256 indicated TYPE and DECL. DECL may be NULL when calling function
5257 indirectly or considering a libcall. Otherwise return 0. */
5258
5259 static int
5260 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5261 {
5262 gcc_assert (!TARGET_64BIT);
5263
5264 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5265 by the sseregparm attribute. */
5266 if (TARGET_SSEREGPARM
5267 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5268 {
5269 if (!TARGET_SSE)
5270 {
5271 if (warn)
5272 {
5273 if (decl)
5274 error ("calling %qD with attribute sseregparm without "
5275 "SSE/SSE2 enabled", decl);
5276 else
5277 error ("calling %qT with attribute sseregparm without "
5278 "SSE/SSE2 enabled", type);
5279 }
5280 return 0;
5281 }
5282
5283 return 2;
5284 }
5285
5286 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5287 (and DFmode for SSE2) arguments in SSE registers. */
5288 if (decl && TARGET_SSE_MATH && optimize
5289 && !(profile_flag && !flag_fentry))
5290 {
5291 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5292 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5293 if (i && i->local && i->can_change_signature)
5294 return TARGET_SSE2 ? 2 : 1;
5295 }
5296
5297 return 0;
5298 }
5299
5300 /* Return true if EAX is live at the start of the function. Used by
5301 ix86_expand_prologue to determine if we need special help before
5302 calling allocate_stack_worker. */
5303
5304 static bool
5305 ix86_eax_live_at_start_p (void)
5306 {
5307 /* Cheat. Don't bother working forward from ix86_function_regparm
5308 to the function type to whether an actual argument is located in
5309 eax. Instead just look at cfg info, which is still close enough
5310 to correct at this point. This gives false positives for broken
5311 functions that might use uninitialized data that happens to be
5312 allocated in eax, but who cares? */
5313 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5314 }
5315
5316 static bool
5317 ix86_keep_aggregate_return_pointer (tree fntype)
5318 {
5319 tree attr;
5320
5321 if (!TARGET_64BIT)
5322 {
5323 attr = lookup_attribute ("callee_pop_aggregate_return",
5324 TYPE_ATTRIBUTES (fntype));
5325 if (attr)
5326 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5327
5328 /* For 32-bit MS-ABI the default is to keep aggregate
5329 return pointer. */
5330 if (ix86_function_type_abi (fntype) == MS_ABI)
5331 return true;
5332 }
5333 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5334 }
5335
5336 /* Value is the number of bytes of arguments automatically
5337 popped when returning from a subroutine call.
5338 FUNDECL is the declaration node of the function (as a tree),
5339 FUNTYPE is the data type of the function (as a tree),
5340 or for a library call it is an identifier node for the subroutine name.
5341 SIZE is the number of bytes of arguments passed on the stack.
5342
5343 On the 80386, the RTD insn may be used to pop them if the number
5344 of args is fixed, but if the number is variable then the caller
5345 must pop them all. RTD can't be used for library calls now
5346 because the library is compiled with the Unix compiler.
5347 Use of RTD is a selectable option, since it is incompatible with
5348 standard Unix calling sequences. If the option is not selected,
5349 the caller must always pop the args.
5350
5351 The attribute stdcall is equivalent to RTD on a per module basis. */
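/* As a rough ia32 example (declaration is illustrative only), a function
       void __attribute__((stdcall)) f (int a, int b);
   is expected to return with "ret $8", popping its own 8 bytes of
   arguments, whereas the default cdecl version returns with a plain
   "ret" and leaves the cleanup to the caller.  */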
5352
5353 static int
5354 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5355 {
5356 unsigned int ccvt;
5357
5358 /* None of the 64-bit ABIs pop arguments. */
5359 if (TARGET_64BIT)
5360 return 0;
5361
5362 ccvt = ix86_get_callcvt (funtype);
5363
5364 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5365 | IX86_CALLCVT_THISCALL)) != 0
5366 && ! stdarg_p (funtype))
5367 return size;
5368
5369 /* Lose any fake structure return argument if it is passed on the stack. */
5370 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5371 && !ix86_keep_aggregate_return_pointer (funtype))
5372 {
5373 int nregs = ix86_function_regparm (funtype, fundecl);
5374 if (nregs == 0)
5375 return GET_MODE_SIZE (Pmode);
5376 }
5377
5378 return 0;
5379 }
5380 \f
5381 /* Argument support functions. */
5382
5383 /* Return true when register may be used to pass function parameters. */
5384 bool
5385 ix86_function_arg_regno_p (int regno)
5386 {
5387 int i;
5388 const int *parm_regs;
5389
5390 if (!TARGET_64BIT)
5391 {
5392 if (TARGET_MACHO)
5393 return (regno < REGPARM_MAX
5394 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5395 else
5396 return (regno < REGPARM_MAX
5397 || (TARGET_MMX && MMX_REGNO_P (regno)
5398 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5399 || (TARGET_SSE && SSE_REGNO_P (regno)
5400 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5401 }
5402
5403 if (TARGET_MACHO)
5404 {
5405 if (SSE_REGNO_P (regno) && TARGET_SSE)
5406 return true;
5407 }
5408 else
5409 {
5410 if (TARGET_SSE && SSE_REGNO_P (regno)
5411 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5412 return true;
5413 }
5414
5415 /* TODO: The function should depend on current function ABI but
5416 builtins.c would need updating then. Therefore we use the
5417 default ABI. */
5418
5419 /* RAX is used as hidden argument to va_arg functions. */
5420 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5421 return true;
5422
5423 if (ix86_abi == MS_ABI)
5424 parm_regs = x86_64_ms_abi_int_parameter_registers;
5425 else
5426 parm_regs = x86_64_int_parameter_registers;
5427 for (i = 0; i < (ix86_abi == MS_ABI
5428 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5429 if (regno == parm_regs[i])
5430 return true;
5431 return false;
5432 }
5433
5434 /* Return true if we do not know how to pass TYPE solely in registers. */
5435
5436 static bool
5437 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5438 {
5439 if (must_pass_in_stack_var_size_or_pad (mode, type))
5440 return true;
5441
5442 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5443 The layout_type routine is crafty and tries to trick us into passing
5444 currently unsupported vector types on the stack by using TImode. */
5445 return (!TARGET_64BIT && mode == TImode
5446 && type && TREE_CODE (type) != VECTOR_TYPE);
5447 }
5448
5449 /* Return the size, in bytes, of the area reserved for arguments passed
5450 in registers for the function represented by FNDECL, depending on the
5451 ABI format used. */
5452 int
5453 ix86_reg_parm_stack_space (const_tree fndecl)
5454 {
5455 enum calling_abi call_abi = SYSV_ABI;
5456 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5457 call_abi = ix86_function_abi (fndecl);
5458 else
5459 call_abi = ix86_function_type_abi (fndecl);
5460 if (TARGET_64BIT && call_abi == MS_ABI)
5461 return 32;
5462 return 0;
5463 }
5464
5465 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5466 call ABI used. */
5467 enum calling_abi
5468 ix86_function_type_abi (const_tree fntype)
5469 {
5470 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5471 {
5472 enum calling_abi abi = ix86_abi;
5473 if (abi == SYSV_ABI)
5474 {
5475 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5476 abi = MS_ABI;
5477 }
5478 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5479 abi = SYSV_ABI;
5480 return abi;
5481 }
5482 return ix86_abi;
5483 }
5484
5485 static bool
5486 ix86_function_ms_hook_prologue (const_tree fn)
5487 {
5488 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5489 {
5490 if (decl_function_context (fn) != NULL_TREE)
5491 error_at (DECL_SOURCE_LOCATION (fn),
5492 "ms_hook_prologue is not compatible with nested function");
5493 else
5494 return true;
5495 }
5496 return false;
5497 }
5498
5499 static enum calling_abi
5500 ix86_function_abi (const_tree fndecl)
5501 {
5502 if (! fndecl)
5503 return ix86_abi;
5504 return ix86_function_type_abi (TREE_TYPE (fndecl));
5505 }
5506
5507 /* Return SYSV_ABI or MS_ABI, depending on CFUN, specifying the
5508 call ABI used. */
5509 enum calling_abi
5510 ix86_cfun_abi (void)
5511 {
5512 if (! cfun)
5513 return ix86_abi;
5514 return cfun->machine->call_abi;
5515 }
5516
5517 /* Write the extra assembler code needed to declare a function properly. */
5518
5519 void
5520 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5521 tree decl)
5522 {
5523 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5524
5525 if (is_ms_hook)
5526 {
5527 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5528 unsigned int filler_cc = 0xcccccccc;
5529
5530 for (i = 0; i < filler_count; i += 4)
5531 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5532 }
5533
5534 #ifdef SUBTARGET_ASM_UNWIND_INIT
5535 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5536 #endif
5537
5538 ASM_OUTPUT_LABEL (asm_out_file, fname);
5539
5540 /* Output magic byte marker, if hot-patch attribute is set. */
5541 if (is_ms_hook)
5542 {
5543 if (TARGET_64BIT)
5544 {
5545 /* leaq [%rsp + 0], %rsp */
5546 asm_fprintf (asm_out_file, ASM_BYTE
5547 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5548 }
5549 else
5550 {
5551 /* movl.s %edi, %edi
5552 push %ebp
5553 movl.s %esp, %ebp */
5554 asm_fprintf (asm_out_file, ASM_BYTE
5555 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5556 }
5557 }
5558 }
5559
5560 /* regclass.c */
5561 extern void init_regs (void);
5562
5563 /* Implementation of the call ABI switching target hook. For the given
5564 FNDECL the corresponding call register sets are selected. See also
5565 ix86_conditional_register_usage for more details. */
5566 void
5567 ix86_call_abi_override (const_tree fndecl)
5568 {
5569 if (fndecl == NULL_TREE)
5570 cfun->machine->call_abi = ix86_abi;
5571 else
5572 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5573 }
5574
5575 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5576 Avoid expensive re-initialization of init_regs each time we switch function
5577 context since this is needed only during RTL expansion. */
5578 static void
5579 ix86_maybe_switch_abi (void)
5580 {
5581 if (TARGET_64BIT &&
5582 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5583 reinit_regs ();
5584 }
5585
5586 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5587 for a call to a function whose data type is FNTYPE.
5588 For a library call, FNTYPE is 0. */
5589
5590 void
5591 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5592 tree fntype, /* tree ptr for function decl */
5593 rtx libname, /* SYMBOL_REF of library name or 0 */
5594 tree fndecl,
5595 int caller)
5596 {
5597 struct cgraph_local_info *i;
5598 tree fnret_type;
5599
5600 memset (cum, 0, sizeof (*cum));
5601
5602 /* Initialize for the current callee. */
5603 if (caller)
5604 {
5605 cfun->machine->callee_pass_avx256_p = false;
5606 cfun->machine->callee_return_avx256_p = false;
5607 }
5608
5609 if (fndecl)
5610 {
5611 i = cgraph_local_info (fndecl);
5612 cum->call_abi = ix86_function_abi (fndecl);
5613 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5614 }
5615 else
5616 {
5617 i = NULL;
5618 cum->call_abi = ix86_function_type_abi (fntype);
5619 if (fntype)
5620 fnret_type = TREE_TYPE (fntype);
5621 else
5622 fnret_type = NULL;
5623 }
5624
5625 if (TARGET_VZEROUPPER && fnret_type)
5626 {
5627 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5628 false);
5629 if (function_pass_avx256_p (fnret_value))
5630 {
5631 /* The return value of this function uses 256bit AVX modes. */
5632 if (caller)
5633 cfun->machine->callee_return_avx256_p = true;
5634 else
5635 cfun->machine->caller_return_avx256_p = true;
5636 }
5637 }
5638
5639 cum->caller = caller;
5640
5641 /* Set up the number of registers to use for passing arguments. */
5642
5643 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5644 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5645 "or subtarget optimization implying it");
5646 cum->nregs = ix86_regparm;
5647 if (TARGET_64BIT)
5648 {
5649 cum->nregs = (cum->call_abi == SYSV_ABI
5650 ? X86_64_REGPARM_MAX
5651 : X86_64_MS_REGPARM_MAX);
5652 }
5653 if (TARGET_SSE)
5654 {
5655 cum->sse_nregs = SSE_REGPARM_MAX;
5656 if (TARGET_64BIT)
5657 {
5658 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5659 ? X86_64_SSE_REGPARM_MAX
5660 : X86_64_MS_SSE_REGPARM_MAX);
5661 }
5662 }
5663 if (TARGET_MMX)
5664 cum->mmx_nregs = MMX_REGPARM_MAX;
5665 cum->warn_avx = true;
5666 cum->warn_sse = true;
5667 cum->warn_mmx = true;
5668
5669 /* Because types might mismatch between caller and callee, we need to
5670 use the actual type of the function for local calls.
5671 FIXME: cgraph_analyze can be told to actually record if function uses
5672 va_start so for local functions maybe_vaarg can be made more aggressive,
5673 helping K&R code.
5674 FIXME: once the type system is fixed, we won't need this code anymore. */
5675 if (i && i->local && i->can_change_signature)
5676 fntype = TREE_TYPE (fndecl);
5677 cum->maybe_vaarg = (fntype
5678 ? (!prototype_p (fntype) || stdarg_p (fntype))
5679 : !libname);
5680
5681 if (!TARGET_64BIT)
5682 {
5683 /* If there are variable arguments, then we won't pass anything
5684 in registers in 32-bit mode. */
5685 if (stdarg_p (fntype))
5686 {
5687 cum->nregs = 0;
5688 cum->sse_nregs = 0;
5689 cum->mmx_nregs = 0;
5690 cum->warn_avx = 0;
5691 cum->warn_sse = 0;
5692 cum->warn_mmx = 0;
5693 return;
5694 }
5695
5696 /* Use ecx and edx registers if function has fastcall attribute,
5697 else look for regparm information. */
5698 if (fntype)
5699 {
5700 unsigned int ccvt = ix86_get_callcvt (fntype);
5701 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5702 {
5703 cum->nregs = 1;
5704 cum->fastcall = 1; /* Same first register as in fastcall. */
5705 }
5706 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5707 {
5708 cum->nregs = 2;
5709 cum->fastcall = 1;
5710 }
5711 else
5712 cum->nregs = ix86_function_regparm (fntype, fndecl);
5713 }
5714
5715 /* Set up the number of SSE registers used for passing SFmode
5716 and DFmode arguments. Warn for mismatching ABI. */
5717 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5718 }
5719 }
5720
5721 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5722 But in the case of vector types, it is some vector mode.
5723
5724 When we have only some of our vector isa extensions enabled, then there
5725 are some modes for which vector_mode_supported_p is false. For these
5726 modes, the generic vector support in gcc will choose some non-vector mode
5727 in order to implement the type. By computing the natural mode, we'll
5728 select the proper ABI location for the operand and not depend on whatever
5729 the middle-end decides to do with these vector types.
5730
5731 The middle-end can't deal with vector types wider than 16 bytes. In that
5732 case, we return the original mode and warn about the ABI change if CUM
5733 isn't NULL. */
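/* For instance (the typedef is illustrative only), given
       typedef int v8si __attribute__((vector_size (32)));
   the natural mode is V8SImode when AVX is enabled; without AVX the
   original non-vector mode is returned and, when CUM permits, the ABI
   change warning below is issued.  */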
5734
5735 static enum machine_mode
5736 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5737 {
5738 enum machine_mode mode = TYPE_MODE (type);
5739
5740 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5741 {
5742 HOST_WIDE_INT size = int_size_in_bytes (type);
5743 if ((size == 8 || size == 16 || size == 32)
5744 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5745 && TYPE_VECTOR_SUBPARTS (type) > 1)
5746 {
5747 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5748
5749 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5750 mode = MIN_MODE_VECTOR_FLOAT;
5751 else
5752 mode = MIN_MODE_VECTOR_INT;
5753
5754 /* Get the mode which has this inner mode and number of units. */
5755 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5756 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5757 && GET_MODE_INNER (mode) == innermode)
5758 {
5759 if (size == 32 && !TARGET_AVX)
5760 {
5761 static bool warnedavx;
5762
5763 if (cum
5764 && !warnedavx
5765 && cum->warn_avx)
5766 {
5767 warnedavx = true;
5768 warning (0, "AVX vector argument without AVX "
5769 "enabled changes the ABI");
5770 }
5771 return TYPE_MODE (type);
5772 }
5773 else
5774 return mode;
5775 }
5776
5777 gcc_unreachable ();
5778 }
5779 }
5780
5781 return mode;
5782 }
5783
5784 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5785 this may not agree with the mode that the type system has chosen for the
5786 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5787 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5788
5789 static rtx
5790 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5791 unsigned int regno)
5792 {
5793 rtx tmp;
5794
5795 if (orig_mode != BLKmode)
5796 tmp = gen_rtx_REG (orig_mode, regno);
5797 else
5798 {
5799 tmp = gen_rtx_REG (mode, regno);
5800 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5801 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5802 }
5803
5804 return tmp;
5805 }
5806
5807 /* x86-64 register passing implementation. See the x86-64 ABI for details.
5808 The goal of this code is to classify each 8-byte chunk of an incoming
5809 argument by register class and assign registers accordingly. */
5810
5811 /* Return the union class of CLASS1 and CLASS2.
5812 See the x86-64 PS ABI for details. */
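/* For example, merging X86_64_INTEGERSI_CLASS with X86_64_SSESF_CLASS
   yields X86_64_INTEGERSI_CLASS (rule #4), while merging anything with
   X86_64_MEMORY_CLASS yields X86_64_MEMORY_CLASS (rule #3).  */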
5813
5814 static enum x86_64_reg_class
5815 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5816 {
5817 /* Rule #1: If both classes are equal, this is the resulting class. */
5818 if (class1 == class2)
5819 return class1;
5820
5821 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5822 the other class. */
5823 if (class1 == X86_64_NO_CLASS)
5824 return class2;
5825 if (class2 == X86_64_NO_CLASS)
5826 return class1;
5827
5828 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5829 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5830 return X86_64_MEMORY_CLASS;
5831
5832 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5833 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5834 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5835 return X86_64_INTEGERSI_CLASS;
5836 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5837 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5838 return X86_64_INTEGER_CLASS;
5839
5840 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5841 MEMORY is used. */
5842 if (class1 == X86_64_X87_CLASS
5843 || class1 == X86_64_X87UP_CLASS
5844 || class1 == X86_64_COMPLEX_X87_CLASS
5845 || class2 == X86_64_X87_CLASS
5846 || class2 == X86_64_X87UP_CLASS
5847 || class2 == X86_64_COMPLEX_X87_CLASS)
5848 return X86_64_MEMORY_CLASS;
5849
5850 /* Rule #6: Otherwise class SSE is used. */
5851 return X86_64_SSE_CLASS;
5852 }
5853
5854 /* Classify the argument of type TYPE and mode MODE.
5855 CLASSES will be filled by the register class used to pass each word
5856 of the operand. The number of words is returned. In case the parameter
5857 should be passed in memory, 0 is returned. As a special case for zero
5858 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5859
5860 BIT_OFFSET is used internally for handling records; it specifies the
5861 offset in bits modulo 256 to avoid overflow cases.
5862
5863 See the x86-64 PS ABI for details.
5864 */
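/* As a worked example (the structure and its members are illustrative
   only), under the SysV x86-64 ABI the 16-byte structure
       struct s { double d; int i; };
   is expected to classify as two eightbytes,
       classes[0] = X86_64_SSEDF_CLASS, classes[1] = X86_64_INTEGERSI_CLASS,
   so D is passed in an SSE register and I in an integer register.  */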
5865
5866 static int
5867 classify_argument (enum machine_mode mode, const_tree type,
5868 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5869 {
5870 HOST_WIDE_INT bytes =
5871 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5872 int words
5873 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5874
5875 /* Variable sized entities are always passed/returned in memory. */
5876 if (bytes < 0)
5877 return 0;
5878
5879 if (mode != VOIDmode
5880 && targetm.calls.must_pass_in_stack (mode, type))
5881 return 0;
5882
5883 if (type && AGGREGATE_TYPE_P (type))
5884 {
5885 int i;
5886 tree field;
5887 enum x86_64_reg_class subclasses[MAX_CLASSES];
5888
5889 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5890 if (bytes > 32)
5891 return 0;
5892
5893 for (i = 0; i < words; i++)
5894 classes[i] = X86_64_NO_CLASS;
5895
5896 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
5897 signal the memory class, so handle it as a special case. */
5898 if (!words)
5899 {
5900 classes[0] = X86_64_NO_CLASS;
5901 return 1;
5902 }
5903
5904 /* Classify each field of record and merge classes. */
5905 switch (TREE_CODE (type))
5906 {
5907 case RECORD_TYPE:
5908 /* And now merge the fields of structure. */
5909 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5910 {
5911 if (TREE_CODE (field) == FIELD_DECL)
5912 {
5913 int num;
5914
5915 if (TREE_TYPE (field) == error_mark_node)
5916 continue;
5917
5918 /* Bitfields are always classified as integer. Handle them
5919 early, since later code would consider them to be
5920 misaligned integers. */
5921 if (DECL_BIT_FIELD (field))
5922 {
5923 for (i = (int_bit_position (field)
5924 + (bit_offset % 64)) / 8 / 8;
5925 i < ((int_bit_position (field) + (bit_offset % 64))
5926 + tree_low_cst (DECL_SIZE (field), 0)
5927 + 63) / 8 / 8; i++)
5928 classes[i] =
5929 merge_classes (X86_64_INTEGER_CLASS,
5930 classes[i]);
5931 }
5932 else
5933 {
5934 int pos;
5935
5936 type = TREE_TYPE (field);
5937
5938 /* Flexible array member is ignored. */
5939 if (TYPE_MODE (type) == BLKmode
5940 && TREE_CODE (type) == ARRAY_TYPE
5941 && TYPE_SIZE (type) == NULL_TREE
5942 && TYPE_DOMAIN (type) != NULL_TREE
5943 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5944 == NULL_TREE))
5945 {
5946 static bool warned;
5947
5948 if (!warned && warn_psabi)
5949 {
5950 warned = true;
5951 inform (input_location,
5952 "the ABI of passing struct with"
5953 " a flexible array member has"
5954 " changed in GCC 4.4");
5955 }
5956 continue;
5957 }
5958 num = classify_argument (TYPE_MODE (type), type,
5959 subclasses,
5960 (int_bit_position (field)
5961 + bit_offset) % 256);
5962 if (!num)
5963 return 0;
5964 pos = (int_bit_position (field)
5965 + (bit_offset % 64)) / 8 / 8;
5966 for (i = 0; i < num && (i + pos) < words; i++)
5967 classes[i + pos] =
5968 merge_classes (subclasses[i], classes[i + pos]);
5969 }
5970 }
5971 }
5972 break;
5973
5974 case ARRAY_TYPE:
5975 /* Arrays are handled as small records. */
5976 {
5977 int num;
5978 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5979 TREE_TYPE (type), subclasses, bit_offset);
5980 if (!num)
5981 return 0;
5982
5983 /* The partial classes are now full classes. */
5984 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5985 subclasses[0] = X86_64_SSE_CLASS;
5986 if (subclasses[0] == X86_64_INTEGERSI_CLASS
5987 && !((bit_offset % 64) == 0 && bytes == 4))
5988 subclasses[0] = X86_64_INTEGER_CLASS;
5989
5990 for (i = 0; i < words; i++)
5991 classes[i] = subclasses[i % num];
5992
5993 break;
5994 }
5995 case UNION_TYPE:
5996 case QUAL_UNION_TYPE:
5997 /* Unions are similar to RECORD_TYPE but offset is always 0.
5998 */
5999 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6000 {
6001 if (TREE_CODE (field) == FIELD_DECL)
6002 {
6003 int num;
6004
6005 if (TREE_TYPE (field) == error_mark_node)
6006 continue;
6007
6008 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6009 TREE_TYPE (field), subclasses,
6010 bit_offset);
6011 if (!num)
6012 return 0;
6013 for (i = 0; i < num; i++)
6014 classes[i] = merge_classes (subclasses[i], classes[i]);
6015 }
6016 }
6017 break;
6018
6019 default:
6020 gcc_unreachable ();
6021 }
6022
6023 if (words > 2)
6024 {
6025 /* When the size exceeds 16 bytes, if the first class isn't
6026 X86_64_SSE_CLASS or any of the others isn't
6027 X86_64_SSEUP_CLASS, everything should be passed in
6028 memory. */
6029 if (classes[0] != X86_64_SSE_CLASS)
6030 return 0;
6031
6032 for (i = 1; i < words; i++)
6033 if (classes[i] != X86_64_SSEUP_CLASS)
6034 return 0;
6035 }
6036
6037 /* Final merger cleanup. */
6038 for (i = 0; i < words; i++)
6039 {
6040 /* If one class is MEMORY, everything should be passed in
6041 memory. */
6042 if (classes[i] == X86_64_MEMORY_CLASS)
6043 return 0;
6044
6045 /* The X86_64_SSEUP_CLASS should always be preceded by
6046 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6047 if (classes[i] == X86_64_SSEUP_CLASS
6048 && classes[i - 1] != X86_64_SSE_CLASS
6049 && classes[i - 1] != X86_64_SSEUP_CLASS)
6050 {
6051 /* The first one should never be X86_64_SSEUP_CLASS. */
6052 gcc_assert (i != 0);
6053 classes[i] = X86_64_SSE_CLASS;
6054 }
6055
6056 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6057 everything should be passed in memory. */
6058 if (classes[i] == X86_64_X87UP_CLASS
6059 && (classes[i - 1] != X86_64_X87_CLASS))
6060 {
6061 static bool warned;
6062
6063 /* The first one should never be X86_64_X87UP_CLASS. */
6064 gcc_assert (i != 0);
6065 if (!warned && warn_psabi)
6066 {
6067 warned = true;
6068 inform (input_location,
6069 "the ABI of passing union with long double"
6070 " has changed in GCC 4.4");
6071 }
6072 return 0;
6073 }
6074 }
6075 return words;
6076 }
6077
6078 /* Compute the alignment needed. We align all types to natural boundaries,
6079 with the exception of XFmode, which is aligned to 64bits. */
6080 if (mode != VOIDmode && mode != BLKmode)
6081 {
6082 int mode_alignment = GET_MODE_BITSIZE (mode);
6083
6084 if (mode == XFmode)
6085 mode_alignment = 128;
6086 else if (mode == XCmode)
6087 mode_alignment = 256;
6088 if (COMPLEX_MODE_P (mode))
6089 mode_alignment /= 2;
6090 /* Misaligned fields are always returned in memory. */
6091 if (bit_offset % mode_alignment)
6092 return 0;
6093 }
6094
6095 /* For V1xx modes, just use the base mode. */
6096 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6097 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6098 mode = GET_MODE_INNER (mode);
6099
6100 /* Classification of atomic types. */
6101 switch (mode)
6102 {
6103 case SDmode:
6104 case DDmode:
6105 classes[0] = X86_64_SSE_CLASS;
6106 return 1;
6107 case TDmode:
6108 classes[0] = X86_64_SSE_CLASS;
6109 classes[1] = X86_64_SSEUP_CLASS;
6110 return 2;
6111 case DImode:
6112 case SImode:
6113 case HImode:
6114 case QImode:
6115 case CSImode:
6116 case CHImode:
6117 case CQImode:
6118 {
6119 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6120
6121 if (size <= 32)
6122 {
6123 classes[0] = X86_64_INTEGERSI_CLASS;
6124 return 1;
6125 }
6126 else if (size <= 64)
6127 {
6128 classes[0] = X86_64_INTEGER_CLASS;
6129 return 1;
6130 }
6131 else if (size <= 64+32)
6132 {
6133 classes[0] = X86_64_INTEGER_CLASS;
6134 classes[1] = X86_64_INTEGERSI_CLASS;
6135 return 2;
6136 }
6137 else if (size <= 64+64)
6138 {
6139 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6140 return 2;
6141 }
6142 else
6143 gcc_unreachable ();
6144 }
6145 case CDImode:
6146 case TImode:
6147 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6148 return 2;
6149 case COImode:
6150 case OImode:
6151 /* OImode shouldn't be used directly. */
6152 gcc_unreachable ();
6153 case CTImode:
6154 return 0;
6155 case SFmode:
6156 if (!(bit_offset % 64))
6157 classes[0] = X86_64_SSESF_CLASS;
6158 else
6159 classes[0] = X86_64_SSE_CLASS;
6160 return 1;
6161 case DFmode:
6162 classes[0] = X86_64_SSEDF_CLASS;
6163 return 1;
6164 case XFmode:
6165 classes[0] = X86_64_X87_CLASS;
6166 classes[1] = X86_64_X87UP_CLASS;
6167 return 2;
6168 case TFmode:
6169 classes[0] = X86_64_SSE_CLASS;
6170 classes[1] = X86_64_SSEUP_CLASS;
6171 return 2;
6172 case SCmode:
6173 classes[0] = X86_64_SSE_CLASS;
6174 if (!(bit_offset % 64))
6175 return 1;
6176 else
6177 {
6178 static bool warned;
6179
6180 if (!warned && warn_psabi)
6181 {
6182 warned = true;
6183 inform (input_location,
6184 "the ABI of passing structure with complex float"
6185 " member has changed in GCC 4.4");
6186 }
6187 classes[1] = X86_64_SSESF_CLASS;
6188 return 2;
6189 }
6190 case DCmode:
6191 classes[0] = X86_64_SSEDF_CLASS;
6192 classes[1] = X86_64_SSEDF_CLASS;
6193 return 2;
6194 case XCmode:
6195 classes[0] = X86_64_COMPLEX_X87_CLASS;
6196 return 1;
6197 case TCmode:
6198 /* This mode is larger than 16 bytes. */
6199 return 0;
6200 case V8SFmode:
6201 case V8SImode:
6202 case V32QImode:
6203 case V16HImode:
6204 case V4DFmode:
6205 case V4DImode:
6206 classes[0] = X86_64_SSE_CLASS;
6207 classes[1] = X86_64_SSEUP_CLASS;
6208 classes[2] = X86_64_SSEUP_CLASS;
6209 classes[3] = X86_64_SSEUP_CLASS;
6210 return 4;
6211 case V4SFmode:
6212 case V4SImode:
6213 case V16QImode:
6214 case V8HImode:
6215 case V2DFmode:
6216 case V2DImode:
6217 classes[0] = X86_64_SSE_CLASS;
6218 classes[1] = X86_64_SSEUP_CLASS;
6219 return 2;
6220 case V1TImode:
6221 case V1DImode:
6222 case V2SFmode:
6223 case V2SImode:
6224 case V4HImode:
6225 case V8QImode:
6226 classes[0] = X86_64_SSE_CLASS;
6227 return 1;
6228 case BLKmode:
6229 case VOIDmode:
6230 return 0;
6231 default:
6232 gcc_assert (VECTOR_MODE_P (mode));
6233
6234 if (bytes > 16)
6235 return 0;
6236
6237 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6238
6239 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6240 classes[0] = X86_64_INTEGERSI_CLASS;
6241 else
6242 classes[0] = X86_64_INTEGER_CLASS;
6243 classes[1] = X86_64_INTEGER_CLASS;
6244 return 1 + (bytes > 8);
6245 }
6246 }
6247
6248 /* Examine the argument and set the number of registers required in each
6249 class. Return 0 iff the parameter should be passed in memory. */
6250 static int
6251 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6252 int *int_nregs, int *sse_nregs)
6253 {
6254 enum x86_64_reg_class regclass[MAX_CLASSES];
6255 int n = classify_argument (mode, type, regclass, 0);
6256
6257 *int_nregs = 0;
6258 *sse_nregs = 0;
6259 if (!n)
6260 return 0;
6261 for (n--; n >= 0; n--)
6262 switch (regclass[n])
6263 {
6264 case X86_64_INTEGER_CLASS:
6265 case X86_64_INTEGERSI_CLASS:
6266 (*int_nregs)++;
6267 break;
6268 case X86_64_SSE_CLASS:
6269 case X86_64_SSESF_CLASS:
6270 case X86_64_SSEDF_CLASS:
6271 (*sse_nregs)++;
6272 break;
6273 case X86_64_NO_CLASS:
6274 case X86_64_SSEUP_CLASS:
6275 break;
6276 case X86_64_X87_CLASS:
6277 case X86_64_X87UP_CLASS:
6278 if (!in_return)
6279 return 0;
6280 break;
6281 case X86_64_COMPLEX_X87_CLASS:
6282 return in_return ? 2 : 0;
6283 case X86_64_MEMORY_CLASS:
6284 gcc_unreachable ();
6285 }
6286 return 1;
6287 }
6288
6289 /* Construct container for the argument used by GCC interface. See
6290 FUNCTION_ARG for the detailed description. */
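/* Continuing the illustrative struct { double d; int i; } example from
   classify_argument, the container built here is expected to be a
   PARALLEL roughly of the form
       (parallel [(expr_list (reg:DF xmm0) (const_int 0))
                  (expr_list (reg:SI di) (const_int 8))])
   saying that the first eightbyte lives in an SSE register and the
   second in an integer register at byte offset 8.  */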
6291
6292 static rtx
6293 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6294 const_tree type, int in_return, int nintregs, int nsseregs,
6295 const int *intreg, int sse_regno)
6296 {
6297 /* The following variables hold the static issued_error state. */
6298 static bool issued_sse_arg_error;
6299 static bool issued_sse_ret_error;
6300 static bool issued_x87_ret_error;
6301
6302 enum machine_mode tmpmode;
6303 int bytes =
6304 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6305 enum x86_64_reg_class regclass[MAX_CLASSES];
6306 int n;
6307 int i;
6308 int nexps = 0;
6309 int needed_sseregs, needed_intregs;
6310 rtx exp[MAX_CLASSES];
6311 rtx ret;
6312
6313 n = classify_argument (mode, type, regclass, 0);
6314 if (!n)
6315 return NULL;
6316 if (!examine_argument (mode, type, in_return, &needed_intregs,
6317 &needed_sseregs))
6318 return NULL;
6319 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6320 return NULL;
6321
6322 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6323 some less clueful developer tries to use floating-point anyway. */
6324 if (needed_sseregs && !TARGET_SSE)
6325 {
6326 if (in_return)
6327 {
6328 if (!issued_sse_ret_error)
6329 {
6330 error ("SSE register return with SSE disabled");
6331 issued_sse_ret_error = true;
6332 }
6333 }
6334 else if (!issued_sse_arg_error)
6335 {
6336 error ("SSE register argument with SSE disabled");
6337 issued_sse_arg_error = true;
6338 }
6339 return NULL;
6340 }
6341
6342 /* Likewise, error if the ABI requires us to return values in the
6343 x87 registers and the user specified -mno-80387. */
6344 if (!TARGET_80387 && in_return)
6345 for (i = 0; i < n; i++)
6346 if (regclass[i] == X86_64_X87_CLASS
6347 || regclass[i] == X86_64_X87UP_CLASS
6348 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6349 {
6350 if (!issued_x87_ret_error)
6351 {
6352 error ("x87 register return with x87 disabled");
6353 issued_x87_ret_error = true;
6354 }
6355 return NULL;
6356 }
6357
6358 /* First construct the simple cases. Avoid SCmode, since we want to use
6359 a single register to pass this type. */
6360 if (n == 1 && mode != SCmode)
6361 switch (regclass[0])
6362 {
6363 case X86_64_INTEGER_CLASS:
6364 case X86_64_INTEGERSI_CLASS:
6365 return gen_rtx_REG (mode, intreg[0]);
6366 case X86_64_SSE_CLASS:
6367 case X86_64_SSESF_CLASS:
6368 case X86_64_SSEDF_CLASS:
6369 if (mode != BLKmode)
6370 return gen_reg_or_parallel (mode, orig_mode,
6371 SSE_REGNO (sse_regno));
6372 break;
6373 case X86_64_X87_CLASS:
6374 case X86_64_COMPLEX_X87_CLASS:
6375 return gen_rtx_REG (mode, FIRST_STACK_REG);
6376 case X86_64_NO_CLASS:
6377 /* Zero sized array, struct or class. */
6378 return NULL;
6379 default:
6380 gcc_unreachable ();
6381 }
6382 if (n == 2
6383 && regclass[0] == X86_64_SSE_CLASS
6384 && regclass[1] == X86_64_SSEUP_CLASS
6385 && mode != BLKmode)
6386 return gen_reg_or_parallel (mode, orig_mode,
6387 SSE_REGNO (sse_regno));
6388 if (n == 4
6389 && regclass[0] == X86_64_SSE_CLASS
6390 && regclass[1] == X86_64_SSEUP_CLASS
6391 && regclass[2] == X86_64_SSEUP_CLASS
6392 && regclass[3] == X86_64_SSEUP_CLASS
6393 && mode != BLKmode)
6394 return gen_reg_or_parallel (mode, orig_mode,
6395 SSE_REGNO (sse_regno));
6396 if (n == 2
6397 && regclass[0] == X86_64_X87_CLASS
6398 && regclass[1] == X86_64_X87UP_CLASS)
6399 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6400
6401 if (n == 2
6402 && regclass[0] == X86_64_INTEGER_CLASS
6403 && regclass[1] == X86_64_INTEGER_CLASS
6404 && (mode == CDImode || mode == TImode || mode == TFmode)
6405 && intreg[0] + 1 == intreg[1])
6406 return gen_rtx_REG (mode, intreg[0]);
6407
6408 /* Otherwise figure out the entries of the PARALLEL. */
6409 for (i = 0; i < n; i++)
6410 {
6411 int pos;
6412
6413 switch (regclass[i])
6414 {
6415 case X86_64_NO_CLASS:
6416 break;
6417 case X86_64_INTEGER_CLASS:
6418 case X86_64_INTEGERSI_CLASS:
6419 /* Merge TImodes on aligned occasions here too. */
6420 if (i * 8 + 8 > bytes)
6421 tmpmode
6422 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6423 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6424 tmpmode = SImode;
6425 else
6426 tmpmode = DImode;
6427 /* We've requested 24 bytes, which we
6428 don't have a mode for. Use DImode. */
6429 if (tmpmode == BLKmode)
6430 tmpmode = DImode;
6431 exp [nexps++]
6432 = gen_rtx_EXPR_LIST (VOIDmode,
6433 gen_rtx_REG (tmpmode, *intreg),
6434 GEN_INT (i*8));
6435 intreg++;
6436 break;
6437 case X86_64_SSESF_CLASS:
6438 exp [nexps++]
6439 = gen_rtx_EXPR_LIST (VOIDmode,
6440 gen_rtx_REG (SFmode,
6441 SSE_REGNO (sse_regno)),
6442 GEN_INT (i*8));
6443 sse_regno++;
6444 break;
6445 case X86_64_SSEDF_CLASS:
6446 exp [nexps++]
6447 = gen_rtx_EXPR_LIST (VOIDmode,
6448 gen_rtx_REG (DFmode,
6449 SSE_REGNO (sse_regno)),
6450 GEN_INT (i*8));
6451 sse_regno++;
6452 break;
6453 case X86_64_SSE_CLASS:
6454 pos = i;
6455 switch (n)
6456 {
6457 case 1:
6458 tmpmode = DImode;
6459 break;
6460 case 2:
6461 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6462 {
6463 tmpmode = TImode;
6464 i++;
6465 }
6466 else
6467 tmpmode = DImode;
6468 break;
6469 case 4:
6470 gcc_assert (i == 0
6471 && regclass[1] == X86_64_SSEUP_CLASS
6472 && regclass[2] == X86_64_SSEUP_CLASS
6473 && regclass[3] == X86_64_SSEUP_CLASS);
6474 tmpmode = OImode;
6475 i += 3;
6476 break;
6477 default:
6478 gcc_unreachable ();
6479 }
6480 exp [nexps++]
6481 = gen_rtx_EXPR_LIST (VOIDmode,
6482 gen_rtx_REG (tmpmode,
6483 SSE_REGNO (sse_regno)),
6484 GEN_INT (pos*8));
6485 sse_regno++;
6486 break;
6487 default:
6488 gcc_unreachable ();
6489 }
6490 }
6491
6492 /* Empty aligned struct, union or class. */
6493 if (nexps == 0)
6494 return NULL;
6495
6496 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6497 for (i = 0; i < nexps; i++)
6498 XVECEXP (ret, 0, i) = exp [i];
6499 return ret;
6500 }
6501
6502 /* Update the data in CUM to advance over an argument of mode MODE
6503 and data type TYPE. (TYPE is null for libcalls where that information
6504 may not be available.) */
6505
6506 static void
6507 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6508 const_tree type, HOST_WIDE_INT bytes,
6509 HOST_WIDE_INT words)
6510 {
6511 switch (mode)
6512 {
6513 default:
6514 break;
6515
6516 case BLKmode:
6517 if (bytes < 0)
6518 break;
6519 /* FALLTHRU */
6520
6521 case DImode:
6522 case SImode:
6523 case HImode:
6524 case QImode:
6525 cum->words += words;
6526 cum->nregs -= words;
6527 cum->regno += words;
6528
6529 if (cum->nregs <= 0)
6530 {
6531 cum->nregs = 0;
6532 cum->regno = 0;
6533 }
6534 break;
6535
6536 case OImode:
6537 /* OImode shouldn't be used directly. */
6538 gcc_unreachable ();
6539
6540 case DFmode:
6541 if (cum->float_in_sse < 2)
6542 break;
6543 case SFmode:
6544 if (cum->float_in_sse < 1)
6545 break;
6546 /* FALLTHRU */
6547
6548 case V8SFmode:
6549 case V8SImode:
6550 case V32QImode:
6551 case V16HImode:
6552 case V4DFmode:
6553 case V4DImode:
6554 case TImode:
6555 case V16QImode:
6556 case V8HImode:
6557 case V4SImode:
6558 case V2DImode:
6559 case V4SFmode:
6560 case V2DFmode:
6561 if (!type || !AGGREGATE_TYPE_P (type))
6562 {
6563 cum->sse_words += words;
6564 cum->sse_nregs -= 1;
6565 cum->sse_regno += 1;
6566 if (cum->sse_nregs <= 0)
6567 {
6568 cum->sse_nregs = 0;
6569 cum->sse_regno = 0;
6570 }
6571 }
6572 break;
6573
6574 case V8QImode:
6575 case V4HImode:
6576 case V2SImode:
6577 case V2SFmode:
6578 case V1TImode:
6579 case V1DImode:
6580 if (!type || !AGGREGATE_TYPE_P (type))
6581 {
6582 cum->mmx_words += words;
6583 cum->mmx_nregs -= 1;
6584 cum->mmx_regno += 1;
6585 if (cum->mmx_nregs <= 0)
6586 {
6587 cum->mmx_nregs = 0;
6588 cum->mmx_regno = 0;
6589 }
6590 }
6591 break;
6592 }
6593 }
6594
6595 static void
6596 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6597 const_tree type, HOST_WIDE_INT words, bool named)
6598 {
6599 int int_nregs, sse_nregs;
6600
6601 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6602 if (!named && VALID_AVX256_REG_MODE (mode))
6603 return;
6604
6605 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6606 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6607 {
6608 cum->nregs -= int_nregs;
6609 cum->sse_nregs -= sse_nregs;
6610 cum->regno += int_nregs;
6611 cum->sse_regno += sse_nregs;
6612 }
6613 else
6614 {
6615 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6616 cum->words = (cum->words + align - 1) & ~(align - 1);
6617 cum->words += words;
6618 }
6619 }
6620
6621 static void
6622 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6623 HOST_WIDE_INT words)
6624 {
6625 /* Otherwise, this should be passed indirect. */
6626 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6627
6628 cum->words += words;
6629 if (cum->nregs > 0)
6630 {
6631 cum->nregs -= 1;
6632 cum->regno += 1;
6633 }
6634 }
6635
6636 /* Update the data in CUM to advance over an argument of mode MODE and
6637 data type TYPE. (TYPE is null for libcalls where that information
6638 may not be available.) */
6639
6640 static void
6641 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6642 const_tree type, bool named)
6643 {
6644 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6645 HOST_WIDE_INT bytes, words;
6646
6647 if (mode == BLKmode)
6648 bytes = int_size_in_bytes (type);
6649 else
6650 bytes = GET_MODE_SIZE (mode);
6651 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6652
6653 if (type)
6654 mode = type_natural_mode (type, NULL);
6655
6656 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6657 function_arg_advance_ms_64 (cum, bytes, words);
6658 else if (TARGET_64BIT)
6659 function_arg_advance_64 (cum, mode, type, words, named);
6660 else
6661 function_arg_advance_32 (cum, mode, type, bytes, words);
6662 }
6663
6664 /* Define where to put the arguments to a function.
6665 Value is zero to push the argument on the stack,
6666 or a hard register in which to store the argument.
6667
6668 MODE is the argument's machine mode.
6669 TYPE is the data type of the argument (as a tree).
6670 This is null for libcalls where that information may
6671 not be available.
6672 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6673 the preceding args and about the function being called.
6674 NAMED is nonzero if this argument is a named parameter
6675 (otherwise it is an extra parameter matching an ellipsis). */
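/* As a rough ia32 example (declarations are illustrative only), for
       int __attribute__((regparm (3))) f (int a, int b, int c);
   the three integer arguments are expected to be assigned to %eax, %edx
   and %ecx in turn, while a fastcall function receives its first two
   such arguments in %ecx and %edx.  */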
6676
6677 static rtx
6678 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6679 enum machine_mode orig_mode, const_tree type,
6680 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6681 {
6682 static bool warnedsse, warnedmmx;
6683
6684 /* Avoid the AL settings for the Unix64 ABI. */
6685 if (mode == VOIDmode)
6686 return constm1_rtx;
6687
6688 switch (mode)
6689 {
6690 default:
6691 break;
6692
6693 case BLKmode:
6694 if (bytes < 0)
6695 break;
6696 /* FALLTHRU */
6697 case DImode:
6698 case SImode:
6699 case HImode:
6700 case QImode:
6701 if (words <= cum->nregs)
6702 {
6703 int regno = cum->regno;
6704
6705 /* Fastcall allocates the first two DWORD (SImode) or
6706 smaller arguments to ECX and EDX if it isn't an
6707 aggregate type. */
6708 if (cum->fastcall)
6709 {
6710 if (mode == BLKmode
6711 || mode == DImode
6712 || (type && AGGREGATE_TYPE_P (type)))
6713 break;
6714
6715 /* ECX not EAX is the first allocated register. */
6716 if (regno == AX_REG)
6717 regno = CX_REG;
6718 }
6719 return gen_rtx_REG (mode, regno);
6720 }
6721 break;
6722
6723 case DFmode:
6724 if (cum->float_in_sse < 2)
6725 break;
6726 case SFmode:
6727 if (cum->float_in_sse < 1)
6728 break;
6729 /* FALLTHRU */
6730 case TImode:
6731 /* In 32bit, we pass TImode in xmm registers. */
6732 case V16QImode:
6733 case V8HImode:
6734 case V4SImode:
6735 case V2DImode:
6736 case V4SFmode:
6737 case V2DFmode:
6738 if (!type || !AGGREGATE_TYPE_P (type))
6739 {
6740 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6741 {
6742 warnedsse = true;
6743 warning (0, "SSE vector argument without SSE enabled "
6744 "changes the ABI");
6745 }
6746 if (cum->sse_nregs)
6747 return gen_reg_or_parallel (mode, orig_mode,
6748 cum->sse_regno + FIRST_SSE_REG);
6749 }
6750 break;
6751
6752 case OImode:
6753 /* OImode shouldn't be used directly. */
6754 gcc_unreachable ();
6755
6756 case V8SFmode:
6757 case V8SImode:
6758 case V32QImode:
6759 case V16HImode:
6760 case V4DFmode:
6761 case V4DImode:
6762 if (!type || !AGGREGATE_TYPE_P (type))
6763 {
6764 if (cum->sse_nregs)
6765 return gen_reg_or_parallel (mode, orig_mode,
6766 cum->sse_regno + FIRST_SSE_REG);
6767 }
6768 break;
6769
6770 case V8QImode:
6771 case V4HImode:
6772 case V2SImode:
6773 case V2SFmode:
6774 case V1TImode:
6775 case V1DImode:
6776 if (!type || !AGGREGATE_TYPE_P (type))
6777 {
6778 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6779 {
6780 warnedmmx = true;
6781 warning (0, "MMX vector argument without MMX enabled "
6782 "changes the ABI");
6783 }
6784 if (cum->mmx_nregs)
6785 return gen_reg_or_parallel (mode, orig_mode,
6786 cum->mmx_regno + FIRST_MMX_REG);
6787 }
6788 break;
6789 }
6790
6791 return NULL_RTX;
6792 }
6793
6794 static rtx
6795 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6796 enum machine_mode orig_mode, const_tree type, bool named)
6797 {
6798 /* Handle a hidden AL argument containing number of registers
6799 for varargs x86-64 functions. */
6800 if (mode == VOIDmode)
6801 return GEN_INT (cum->maybe_vaarg
6802 ? (cum->sse_nregs < 0
6803 ? X86_64_SSE_REGPARM_MAX
6804 : cum->sse_regno)
6805 : -1);
6806
6807 switch (mode)
6808 {
6809 default:
6810 break;
6811
6812 case V8SFmode:
6813 case V8SImode:
6814 case V32QImode:
6815 case V16HImode:
6816 case V4DFmode:
6817 case V4DImode:
6818 /* Unnamed 256bit vector mode parameters are passed on stack. */
6819 if (!named)
6820 return NULL;
6821 break;
6822 }
6823
6824 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6825 cum->sse_nregs,
6826 &x86_64_int_parameter_registers [cum->regno],
6827 cum->sse_regno);
6828 }
6829
6830 static rtx
6831 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6832 enum machine_mode orig_mode, bool named,
6833 HOST_WIDE_INT bytes)
6834 {
6835 unsigned int regno;
6836
6837 /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call.
6838 We use a value of -2 to specify that the current function call is MS ABI. */
6839 if (mode == VOIDmode)
6840 return GEN_INT (-2);
6841
6842 /* If we've run out of registers, it goes on the stack. */
6843 if (cum->nregs == 0)
6844 return NULL_RTX;
6845
6846 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6847
6848 /* Only floating point modes are passed in anything but integer regs. */
6849 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6850 {
6851 if (named)
6852 regno = cum->regno + FIRST_SSE_REG;
6853 else
6854 {
6855 rtx t1, t2;
6856
6857 /* Unnamed floating parameters are passed in both the
6858 SSE and integer registers. */
6859 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6860 t2 = gen_rtx_REG (mode, regno);
6861 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6862 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6863 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6864 }
6865 }
6866 /* Handle aggregated types passed in register. */
6867 if (orig_mode == BLKmode)
6868 {
6869 if (bytes > 0 && bytes <= 8)
6870 mode = (bytes > 4 ? DImode : SImode);
6871 if (mode == BLKmode)
6872 mode = DImode;
6873 }
6874
6875 return gen_reg_or_parallel (mode, orig_mode, regno);
6876 }
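/* Editorial illustration (not part of the original source): function_arg_ms_64
   above and ix86_pass_by_reference below together implement the familiar
   Windows x64 rules -- the first four argument slots map to RCX/RDX/R8/R9
   (XMM0-XMM3 for named floating-point arguments), and only 1/2/4/8-byte
   values travel in registers.  A hedged sketch with a hypothetical callee: */
#if 0
struct pair { int a, b; };                 /* 8 bytes: fits in one register.  */
extern void example_callee (int i,         /* RCX (ECX)      */
                            double d,      /* XMM1 (named)   */
                            struct pair p, /* R8             */
                            void *q);      /* R9             */
#endif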
6877
6878 /* Return where to put the arguments to a function.
6879 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6880
6881 MODE is the argument's machine mode. TYPE is the data type of the
6882 argument. It is null for libcalls where that information may not be
6883 available. CUM gives information about the preceding args and about
6884 the function being called. NAMED is nonzero if this argument is a
6885 named parameter (otherwise it is an extra parameter matching an
6886 ellipsis). */
6887
6888 static rtx
6889 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6890 const_tree type, bool named)
6891 {
6892 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6893 enum machine_mode mode = omode;
6894 HOST_WIDE_INT bytes, words;
6895 rtx arg;
6896
6897 if (mode == BLKmode)
6898 bytes = int_size_in_bytes (type);
6899 else
6900 bytes = GET_MODE_SIZE (mode);
6901 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6902
6903 /* To simplify the code below, represent vector types with a vector mode
6904 even if MMX/SSE are not active. */
6905 if (type && TREE_CODE (type) == VECTOR_TYPE)
6906 mode = type_natural_mode (type, cum);
6907
6908 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6909 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6910 else if (TARGET_64BIT)
6911 arg = function_arg_64 (cum, mode, omode, type, named);
6912 else
6913 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6914
6915 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6916 {
6917 /* This argument uses 256bit AVX modes. */
6918 if (cum->caller)
6919 cfun->machine->callee_pass_avx256_p = true;
6920 else
6921 cfun->machine->caller_pass_avx256_p = true;
6922 }
6923
6924 return arg;
6925 }
6926
6927 /* A C expression that indicates when an argument must be passed by
6928 reference. If nonzero for an argument, a copy of that argument is
6929 made in memory and a pointer to the argument is passed instead of
6930 the argument itself. The pointer is passed in whatever way is
6931 appropriate for passing a pointer to that type. */
6932
6933 static bool
6934 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6935 enum machine_mode mode ATTRIBUTE_UNUSED,
6936 const_tree type, bool named ATTRIBUTE_UNUSED)
6937 {
6938 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6939
6940 /* See Windows x64 Software Convention. */
6941 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6942 {
6943 int msize = (int) GET_MODE_SIZE (mode);
6944 if (type)
6945 {
6946 /* Arrays are passed by reference. */
6947 if (TREE_CODE (type) == ARRAY_TYPE)
6948 return true;
6949
6950 if (AGGREGATE_TYPE_P (type))
6951 {
6952 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6953 are passed by reference. */
6954 msize = int_size_in_bytes (type);
6955 }
6956 }
6957
6958 /* __m128 is passed by reference. */
6959 switch (msize) {
6960 case 1: case 2: case 4: case 8:
6961 break;
6962 default:
6963 return true;
6964 }
6965 }
6966 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6967 return true;
6968
6969 return false;
6970 }
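/* Editorial illustration: a sketch of the by-reference rule checked above,
   under the stated assumption that the Windows x64 ABI applies.  */
#if 0
struct triple { int a, b, c; };   /* 12 bytes: not 1/2/4/8, so the caller
                                     makes a copy and passes its address.   */
struct octet { long long v; };    /* 8 bytes: passed by value in a register. */
#endif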
6971
6972 /* Return true when TYPE should be 128bit aligned for 32bit argument
6973 passing ABI. XXX: This function is obsolete and is only used for
6974 checking psABI compatibility with previous versions of GCC. */
6975
6976 static bool
6977 ix86_compat_aligned_value_p (const_tree type)
6978 {
6979 enum machine_mode mode = TYPE_MODE (type);
6980 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6981 || mode == TDmode
6982 || mode == TFmode
6983 || mode == TCmode)
6984 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6985 return true;
6986 if (TYPE_ALIGN (type) < 128)
6987 return false;
6988
6989 if (AGGREGATE_TYPE_P (type))
6990 {
6991 /* Walk the aggregates recursively. */
6992 switch (TREE_CODE (type))
6993 {
6994 case RECORD_TYPE:
6995 case UNION_TYPE:
6996 case QUAL_UNION_TYPE:
6997 {
6998 tree field;
6999
7000 /* Walk all the structure fields. */
7001 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7002 {
7003 if (TREE_CODE (field) == FIELD_DECL
7004 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7005 return true;
7006 }
7007 break;
7008 }
7009
7010 case ARRAY_TYPE:
7011 /* Just in case some language passes arrays by value. */
7012 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7013 return true;
7014 break;
7015
7016 default:
7017 gcc_unreachable ();
7018 }
7019 }
7020 return false;
7021 }
7022
7023 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7024 XXX: This function is obsolete and is only used for checking psABI
7025 compatibility with previous versions of GCC. */
7026
7027 static unsigned int
7028 ix86_compat_function_arg_boundary (enum machine_mode mode,
7029 const_tree type, unsigned int align)
7030 {
7031 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7032 natural boundaries. */
7033 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7034 {
7035 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7036 make an exception for SSE modes since these require 128bit
7037 alignment.
7038
7039 The handling here differs from field_alignment. ICC aligns MMX
7040 arguments to 4 byte boundaries, while structure fields are aligned
7041 to 8 byte boundaries. */
7042 if (!type)
7043 {
7044 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7045 align = PARM_BOUNDARY;
7046 }
7047 else
7048 {
7049 if (!ix86_compat_aligned_value_p (type))
7050 align = PARM_BOUNDARY;
7051 }
7052 }
7053 if (align > BIGGEST_ALIGNMENT)
7054 align = BIGGEST_ALIGNMENT;
7055 return align;
7056 }
7057
7058 /* Return true when TYPE should be 128bit aligned for 32bit argument
7059 passing ABI. */
7060
7061 static bool
7062 ix86_contains_aligned_value_p (const_tree type)
7063 {
7064 enum machine_mode mode = TYPE_MODE (type);
7065
7066 if (mode == XFmode || mode == XCmode)
7067 return false;
7068
7069 if (TYPE_ALIGN (type) < 128)
7070 return false;
7071
7072 if (AGGREGATE_TYPE_P (type))
7073 {
7074 /* Walk the aggregates recursively. */
7075 switch (TREE_CODE (type))
7076 {
7077 case RECORD_TYPE:
7078 case UNION_TYPE:
7079 case QUAL_UNION_TYPE:
7080 {
7081 tree field;
7082
7083 /* Walk all the structure fields. */
7084 for (field = TYPE_FIELDS (type);
7085 field;
7086 field = DECL_CHAIN (field))
7087 {
7088 if (TREE_CODE (field) == FIELD_DECL
7089 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7090 return true;
7091 }
7092 break;
7093 }
7094
7095 case ARRAY_TYPE:
7096 /* Just in case some language passes arrays by value. */
7097 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7098 return true;
7099 break;
7100
7101 default:
7102 gcc_unreachable ();
7103 }
7104 }
7105 else
7106 return TYPE_ALIGN (type) >= 128;
7107
7108 return false;
7109 }
7110
7111 /* Gives the alignment boundary, in bits, of an argument with the
7112 specified mode and type. */
7113
7114 static unsigned int
7115 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7116 {
7117 unsigned int align;
7118 if (type)
7119 {
7120 /* Since the main variant type is used for the call, convert TYPE
7121 to its main variant. */
7122 type = TYPE_MAIN_VARIANT (type);
7123 align = TYPE_ALIGN (type);
7124 }
7125 else
7126 align = GET_MODE_ALIGNMENT (mode);
7127 if (align < PARM_BOUNDARY)
7128 align = PARM_BOUNDARY;
7129 else
7130 {
7131 static bool warned;
7132 unsigned int saved_align = align;
7133
7134 if (!TARGET_64BIT)
7135 {
7136 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7137 if (!type)
7138 {
7139 if (mode == XFmode || mode == XCmode)
7140 align = PARM_BOUNDARY;
7141 }
7142 else if (!ix86_contains_aligned_value_p (type))
7143 align = PARM_BOUNDARY;
7144
7145 if (align < 128)
7146 align = PARM_BOUNDARY;
7147 }
7148
7149 if (warn_psabi
7150 && !warned
7151 && align != ix86_compat_function_arg_boundary (mode, type,
7152 saved_align))
7153 {
7154 warned = true;
7155 inform (input_location,
7156 "The ABI for passing parameters with %d-byte"
7157 " alignment has changed in GCC 4.6",
7158 align / BITS_PER_UNIT);
7159 }
7160 }
7161
7162 return align;
7163 }
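/* Editorial illustration: rough 32-bit examples of the boundary computed
   above, in bits (PARM_BOUNDARY is 32 on ia32):
     double, long long            -> 32 (PARM_BOUNDARY)
     XFmode long double           -> 32 (the XFmode exception above)
     16-byte-aligned SSE types    -> 128
   Values above BIGGEST_ALIGNMENT are capped by the compat helper, and a
   psABI note is emitted once when the pre-4.6 computation would differ.  */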
7164
7165 /* Return true if N is a possible register number of function value. */
7166
7167 static bool
7168 ix86_function_value_regno_p (const unsigned int regno)
7169 {
7170 switch (regno)
7171 {
7172 case AX_REG:
7173 return true;
7174
7175 case FIRST_FLOAT_REG:
7176 /* TODO: The function should depend on the current function's ABI,
7177 but builtins.c would need updating then. Therefore we use the
7178 default ABI. */
7179 if (TARGET_64BIT && ix86_abi == MS_ABI)
7180 return false;
7181 return TARGET_FLOAT_RETURNS_IN_80387;
7182
7183 case FIRST_SSE_REG:
7184 return TARGET_SSE;
7185
7186 case FIRST_MMX_REG:
7187 if (TARGET_MACHO || TARGET_64BIT)
7188 return false;
7189 return TARGET_MMX;
7190 }
7191
7192 return false;
7193 }
7194
7195 /* Define how to find the value returned by a function.
7196 VALTYPE is the data type of the value (as a tree).
7197 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7198 otherwise, FUNC is 0. */
7199
7200 static rtx
7201 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7202 const_tree fntype, const_tree fn)
7203 {
7204 unsigned int regno;
7205
7206 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7207 we normally prevent this case when mmx is not available. However
7208 some ABIs may require the result to be returned like DImode. */
7209 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7210 regno = FIRST_MMX_REG;
7211
7212 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7213 we prevent this case when sse is not available. However some ABIs
7214 may require the result to be returned like integer TImode. */
7215 else if (mode == TImode
7216 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7217 regno = FIRST_SSE_REG;
7218
7219 /* 32-byte vector modes in %ymm0. */
7220 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7221 regno = FIRST_SSE_REG;
7222
7223 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7224 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7225 regno = FIRST_FLOAT_REG;
7226 else
7227 /* Most things go in %eax. */
7228 regno = AX_REG;
7229
7230 /* Override FP return register with %xmm0 for local functions when
7231 SSE math is enabled or for functions with sseregparm attribute. */
7232 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7233 {
7234 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7235 if ((sse_level >= 1 && mode == SFmode)
7236 || (sse_level == 2 && mode == DFmode))
7237 regno = FIRST_SSE_REG;
7238 }
7239
7240 /* OImode shouldn't be used directly. */
7241 gcc_assert (mode != OImode);
7242
7243 return gen_rtx_REG (orig_mode, regno);
7244 }
7245
7246 static rtx
7247 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7248 const_tree valtype)
7249 {
7250 rtx ret;
7251
7252 /* Handle libcalls, which don't provide a type node. */
7253 if (valtype == NULL)
7254 {
7255 unsigned int regno;
7256
7257 switch (mode)
7258 {
7259 case SFmode:
7260 case SCmode:
7261 case DFmode:
7262 case DCmode:
7263 case TFmode:
7264 case SDmode:
7265 case DDmode:
7266 case TDmode:
7267 regno = FIRST_SSE_REG;
7268 break;
7269 case XFmode:
7270 case XCmode:
7271 regno = FIRST_FLOAT_REG;
7272 break;
7273 case TCmode:
7274 return NULL;
7275 default:
7276 regno = AX_REG;
7277 }
7278
7279 return gen_rtx_REG (mode, regno);
7280 }
7281 else if (POINTER_TYPE_P (valtype))
7282 {
7283 /* Pointers are always returned in word_mode. */
7284 mode = word_mode;
7285 }
7286
7287 ret = construct_container (mode, orig_mode, valtype, 1,
7288 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7289 x86_64_int_return_registers, 0);
7290
7291 /* For zero-sized structures, construct_container returns NULL, but we
7292 need to keep the rest of the compiler happy by returning a meaningful value. */
7293 if (!ret)
7294 ret = gen_rtx_REG (orig_mode, AX_REG);
7295
7296 return ret;
7297 }
7298
7299 static rtx
7300 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7301 {
7302 unsigned int regno = AX_REG;
7303
7304 if (TARGET_SSE)
7305 {
7306 switch (GET_MODE_SIZE (mode))
7307 {
7308 case 16:
7309 if((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7310 && !COMPLEX_MODE_P (mode))
7311 regno = FIRST_SSE_REG;
7312 break;
7313 case 8:
7314 case 4:
7315 if (mode == SFmode || mode == DFmode)
7316 regno = FIRST_SSE_REG;
7317 break;
7318 default:
7319 break;
7320 }
7321 }
7322 return gen_rtx_REG (orig_mode, regno);
7323 }
7324
7325 static rtx
7326 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7327 enum machine_mode orig_mode, enum machine_mode mode)
7328 {
7329 const_tree fn, fntype;
7330
7331 fn = NULL_TREE;
7332 if (fntype_or_decl && DECL_P (fntype_or_decl))
7333 fn = fntype_or_decl;
7334 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7335
7336 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7337 return function_value_ms_64 (orig_mode, mode);
7338 else if (TARGET_64BIT)
7339 return function_value_64 (orig_mode, mode, valtype);
7340 else
7341 return function_value_32 (orig_mode, mode, fntype, fn);
7342 }
7343
7344 static rtx
7345 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7346 bool outgoing ATTRIBUTE_UNUSED)
7347 {
7348 enum machine_mode mode, orig_mode;
7349
7350 orig_mode = TYPE_MODE (valtype);
7351 mode = type_natural_mode (valtype, NULL);
7352 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7353 }
7354
7355 /* Pointer function arguments and return values are promoted to
7356 word_mode. */
7357
7358 static enum machine_mode
7359 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7360 int *punsignedp, const_tree fntype,
7361 int for_return)
7362 {
7363 if (type != NULL_TREE && POINTER_TYPE_P (type))
7364 {
7365 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7366 return word_mode;
7367 }
7368 return default_promote_function_mode (type, mode, punsignedp, fntype,
7369 for_return);
7370 }
7371
7372 rtx
7373 ix86_libcall_value (enum machine_mode mode)
7374 {
7375 return ix86_function_value_1 (NULL, NULL, mode, mode);
7376 }
7377
7378 /* Return true iff type is returned in memory. */
7379
7380 static bool ATTRIBUTE_UNUSED
7381 return_in_memory_32 (const_tree type, enum machine_mode mode)
7382 {
7383 HOST_WIDE_INT size;
7384
7385 if (mode == BLKmode)
7386 return true;
7387
7388 size = int_size_in_bytes (type);
7389
7390 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7391 return false;
7392
7393 if (VECTOR_MODE_P (mode) || mode == TImode)
7394 {
7395 /* User-created vectors small enough to fit in EAX. */
7396 if (size < 8)
7397 return false;
7398
7399 /* MMX/3dNow values are returned in MM0,
7400 except when MMX doesn't exist or the ABI prescribes otherwise. */
7401 if (size == 8)
7402 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7403
7404 /* SSE values are returned in XMM0, except when it doesn't exist. */
7405 if (size == 16)
7406 return !TARGET_SSE;
7407
7408 /* AVX values are returned in YMM0, except when it doesn't exist. */
7409 if (size == 32)
7410 return !TARGET_AVX;
7411 }
7412
7413 if (mode == XFmode)
7414 return false;
7415
7416 if (size > 12)
7417 return true;
7418
7419 /* OImode shouldn't be used directly. */
7420 gcc_assert (mode != OImode);
7421
7422 return false;
7423 }
7424
7425 static bool ATTRIBUTE_UNUSED
7426 return_in_memory_64 (const_tree type, enum machine_mode mode)
7427 {
7428 int needed_intregs, needed_sseregs;
7429 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7430 }
7431
7432 static bool ATTRIBUTE_UNUSED
7433 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7434 {
7435 HOST_WIDE_INT size = int_size_in_bytes (type);
7436
7437 /* __m128 is returned in xmm0. */
7438 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7439 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7440 return false;
7441
7442 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7443 return size != 1 && size != 2 && size != 4 && size != 8;
7444 }
7445
7446 static bool
7447 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7448 {
7449 #ifdef SUBTARGET_RETURN_IN_MEMORY
7450 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7451 #else
7452 const enum machine_mode mode = type_natural_mode (type, NULL);
7453
7454 if (TARGET_64BIT)
7455 {
7456 if (ix86_function_type_abi (fntype) == MS_ABI)
7457 return return_in_memory_ms_64 (type, mode);
7458 else
7459 return return_in_memory_64 (type, mode);
7460 }
7461 else
7462 return return_in_memory_32 (type, mode);
7463 #endif
7464 }
7465
7466 /* When returning SSE vector types, we have a choice of either
7467 (1) being abi incompatible with a -march switch, or
7468 (2) generating an error.
7469 Given no good solution, I think the safest thing is one warning.
7470 The user won't be able to use -Werror, but....
7471
7472 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7473 called in response to actually generating a caller or callee that
7474 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7475 via aggregate_value_p for general type probing from tree-ssa. */
7476
7477 static rtx
7478 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7479 {
7480 static bool warnedsse, warnedmmx;
7481
7482 if (!TARGET_64BIT && type)
7483 {
7484 /* Look at the return type of the function, not the function type. */
7485 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7486
7487 if (!TARGET_SSE && !warnedsse)
7488 {
7489 if (mode == TImode
7490 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7491 {
7492 warnedsse = true;
7493 warning (0, "SSE vector return without SSE enabled "
7494 "changes the ABI");
7495 }
7496 }
7497
7498 if (!TARGET_MMX && !warnedmmx)
7499 {
7500 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7501 {
7502 warnedmmx = true;
7503 warning (0, "MMX vector return without MMX enabled "
7504 "changes the ABI");
7505 }
7506 }
7507 }
7508
7509 return NULL;
7510 }
7511
7512 \f
7513 /* Create the va_list data type. */
7514
7515 /* Returns the calling convention specific va_list data type.
7516 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7517
7518 static tree
7519 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7520 {
7521 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7522
7523 /* For i386 we use plain pointer to argument area. */
7524 if (!TARGET_64BIT || abi == MS_ABI)
7525 return build_pointer_type (char_type_node);
7526
7527 record = lang_hooks.types.make_type (RECORD_TYPE);
7528 type_decl = build_decl (BUILTINS_LOCATION,
7529 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7530
7531 f_gpr = build_decl (BUILTINS_LOCATION,
7532 FIELD_DECL, get_identifier ("gp_offset"),
7533 unsigned_type_node);
7534 f_fpr = build_decl (BUILTINS_LOCATION,
7535 FIELD_DECL, get_identifier ("fp_offset"),
7536 unsigned_type_node);
7537 f_ovf = build_decl (BUILTINS_LOCATION,
7538 FIELD_DECL, get_identifier ("overflow_arg_area"),
7539 ptr_type_node);
7540 f_sav = build_decl (BUILTINS_LOCATION,
7541 FIELD_DECL, get_identifier ("reg_save_area"),
7542 ptr_type_node);
7543
7544 va_list_gpr_counter_field = f_gpr;
7545 va_list_fpr_counter_field = f_fpr;
7546
7547 DECL_FIELD_CONTEXT (f_gpr) = record;
7548 DECL_FIELD_CONTEXT (f_fpr) = record;
7549 DECL_FIELD_CONTEXT (f_ovf) = record;
7550 DECL_FIELD_CONTEXT (f_sav) = record;
7551
7552 TYPE_STUB_DECL (record) = type_decl;
7553 TYPE_NAME (record) = type_decl;
7554 TYPE_FIELDS (record) = f_gpr;
7555 DECL_CHAIN (f_gpr) = f_fpr;
7556 DECL_CHAIN (f_fpr) = f_ovf;
7557 DECL_CHAIN (f_ovf) = f_sav;
7558
7559 layout_type (record);
7560
7561 /* The correct type is an array type of one element. */
7562 return build_array_type (record, build_index_type (size_zero_node));
7563 }
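/* Editorial illustration: the record built above corresponds to the usual
   SysV x86-64 va_list layout.  A standalone sketch using the same field
   names as the FIELD_DECLs created above (hypothetical typedef names): */
#if 0
typedef struct __va_list_tag_sketch
{
  unsigned int gp_offset;      /* byte offset of next GPR in reg_save_area  */
  unsigned int fp_offset;      /* byte offset of next SSE slot              */
  void *overflow_arg_area;     /* next stack-passed argument                */
  void *reg_save_area;         /* base of the register save area            */
} __va_list_tag_sketch;
typedef __va_list_tag_sketch va_list_sketch[1];  /* array of one element     */
#endif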
7564
7565 /* Set up the builtin va_list data type and for 64-bit the additional
7566 calling convention specific va_list data types. */
7567
7568 static tree
7569 ix86_build_builtin_va_list (void)
7570 {
7571 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7572
7573 /* Initialize abi specific va_list builtin types. */
7574 if (TARGET_64BIT)
7575 {
7576 tree t;
7577 if (ix86_abi == MS_ABI)
7578 {
7579 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7580 if (TREE_CODE (t) != RECORD_TYPE)
7581 t = build_variant_type_copy (t);
7582 sysv_va_list_type_node = t;
7583 }
7584 else
7585 {
7586 t = ret;
7587 if (TREE_CODE (t) != RECORD_TYPE)
7588 t = build_variant_type_copy (t);
7589 sysv_va_list_type_node = t;
7590 }
7591 if (ix86_abi != MS_ABI)
7592 {
7593 t = ix86_build_builtin_va_list_abi (MS_ABI);
7594 if (TREE_CODE (t) != RECORD_TYPE)
7595 t = build_variant_type_copy (t);
7596 ms_va_list_type_node = t;
7597 }
7598 else
7599 {
7600 t = ret;
7601 if (TREE_CODE (t) != RECORD_TYPE)
7602 t = build_variant_type_copy (t);
7603 ms_va_list_type_node = t;
7604 }
7605 }
7606
7607 return ret;
7608 }
7609
7610 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7611
7612 static void
7613 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7614 {
7615 rtx save_area, mem;
7616 alias_set_type set;
7617 int i, max;
7618
7619 /* GPR size of varargs save area. */
7620 if (cfun->va_list_gpr_size)
7621 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7622 else
7623 ix86_varargs_gpr_size = 0;
7624
7625 /* FPR size of varargs save area. We don't need it if we don't pass
7626 anything in SSE registers. */
7627 if (TARGET_SSE && cfun->va_list_fpr_size)
7628 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7629 else
7630 ix86_varargs_fpr_size = 0;
7631
7632 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7633 return;
7634
7635 save_area = frame_pointer_rtx;
7636 set = get_varargs_alias_set ();
7637
7638 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7639 if (max > X86_64_REGPARM_MAX)
7640 max = X86_64_REGPARM_MAX;
7641
7642 for (i = cum->regno; i < max; i++)
7643 {
7644 mem = gen_rtx_MEM (word_mode,
7645 plus_constant (save_area, i * UNITS_PER_WORD));
7646 MEM_NOTRAP_P (mem) = 1;
7647 set_mem_alias_set (mem, set);
7648 emit_move_insn (mem,
7649 gen_rtx_REG (word_mode,
7650 x86_64_int_parameter_registers[i]));
7651 }
7652
7653 if (ix86_varargs_fpr_size)
7654 {
7655 enum machine_mode smode;
7656 rtx label, test;
7657
7658 /* Now emit code to save SSE registers. The AX parameter contains number
7659 of SSE parameter registers used to call this function, though all we
7660 actually check here is the zero/non-zero status. */
7661
7662 label = gen_label_rtx ();
7663 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7664 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7665 label));
7666
7667 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7668 we used movdqa (i.e. TImode) instead? Perhaps even better would
7669 be if we could determine the real mode of the data, via a hook
7670 into pass_stdarg. Ignore all that for now. */
7671 smode = V4SFmode;
7672 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7673 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7674
7675 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7676 if (max > X86_64_SSE_REGPARM_MAX)
7677 max = X86_64_SSE_REGPARM_MAX;
7678
7679 for (i = cum->sse_regno; i < max; ++i)
7680 {
7681 mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7682 mem = gen_rtx_MEM (smode, mem);
7683 MEM_NOTRAP_P (mem) = 1;
7684 set_mem_alias_set (mem, set);
7685 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7686
7687 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7688 }
7689
7690 emit_label (label);
7691 }
7692 }
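/* Editorial illustration: layout of the varargs register save area filled
   in above, as byte offsets from reg_save_area (X86_64_REGPARM_MAX is 6 and
   X86_64_SSE_REGPARM_MAX is 8 here):
     0 .. 47     six word-size slots for the integer argument registers
     48 .. 175   eight 16-byte slots for XMM0 .. XMM7, written only when
                 AL was non-zero at entry (the branch emitted above).  */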
7693
7694 static void
7695 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7696 {
7697 alias_set_type set = get_varargs_alias_set ();
7698 int i;
7699
7700 /* Reset to zero, as a SysV va_arg may have been used
7701 before. */
7702 ix86_varargs_gpr_size = 0;
7703 ix86_varargs_fpr_size = 0;
7704
7705 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7706 {
7707 rtx reg, mem;
7708
7709 mem = gen_rtx_MEM (Pmode,
7710 plus_constant (virtual_incoming_args_rtx,
7711 i * UNITS_PER_WORD));
7712 MEM_NOTRAP_P (mem) = 1;
7713 set_mem_alias_set (mem, set);
7714
7715 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7716 emit_move_insn (mem, reg);
7717 }
7718 }
7719
7720 static void
7721 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7722 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7723 int no_rtl)
7724 {
7725 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7726 CUMULATIVE_ARGS next_cum;
7727 tree fntype;
7728
7729 /* This argument doesn't appear to be used anymore, which is good,
7730 because the old code here didn't suppress rtl generation. */
7731 gcc_assert (!no_rtl);
7732
7733 if (!TARGET_64BIT)
7734 return;
7735
7736 fntype = TREE_TYPE (current_function_decl);
7737
7738 /* For varargs, we do not want to skip the dummy va_dcl argument.
7739 For stdargs, we do want to skip the last named argument. */
7740 next_cum = *cum;
7741 if (stdarg_p (fntype))
7742 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7743 true);
7744
7745 if (cum->call_abi == MS_ABI)
7746 setup_incoming_varargs_ms_64 (&next_cum);
7747 else
7748 setup_incoming_varargs_64 (&next_cum);
7749 }
7750
7751 /* Checks if TYPE is of kind va_list char *. */
7752
7753 static bool
7754 is_va_list_char_pointer (tree type)
7755 {
7756 tree canonic;
7757
7758 /* For 32-bit it is always true. */
7759 if (!TARGET_64BIT)
7760 return true;
7761 canonic = ix86_canonical_va_list_type (type);
7762 return (canonic == ms_va_list_type_node
7763 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7764 }
7765
7766 /* Implement va_start. */
7767
7768 static void
7769 ix86_va_start (tree valist, rtx nextarg)
7770 {
7771 HOST_WIDE_INT words, n_gpr, n_fpr;
7772 tree f_gpr, f_fpr, f_ovf, f_sav;
7773 tree gpr, fpr, ovf, sav, t;
7774 tree type;
7775 rtx ovf_rtx;
7776
7777 if (flag_split_stack
7778 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7779 {
7780 unsigned int scratch_regno;
7781
7782 /* When we are splitting the stack, we can't refer to the stack
7783 arguments using internal_arg_pointer, because they may be on
7784 the old stack. The split stack prologue will arrange to
7785 leave a pointer to the old stack arguments in a scratch
7786 register, which we here copy to a pseudo-register. The split
7787 stack prologue can't set the pseudo-register directly because
7788 it (the prologue) runs before any registers have been saved. */
7789
7790 scratch_regno = split_stack_prologue_scratch_regno ();
7791 if (scratch_regno != INVALID_REGNUM)
7792 {
7793 rtx reg, seq;
7794
7795 reg = gen_reg_rtx (Pmode);
7796 cfun->machine->split_stack_varargs_pointer = reg;
7797
7798 start_sequence ();
7799 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7800 seq = get_insns ();
7801 end_sequence ();
7802
7803 push_topmost_sequence ();
7804 emit_insn_after (seq, entry_of_function ());
7805 pop_topmost_sequence ();
7806 }
7807 }
7808
7809 /* Only 64bit target needs something special. */
7810 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7811 {
7812 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7813 std_expand_builtin_va_start (valist, nextarg);
7814 else
7815 {
7816 rtx va_r, next;
7817
7818 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7819 next = expand_binop (ptr_mode, add_optab,
7820 cfun->machine->split_stack_varargs_pointer,
7821 crtl->args.arg_offset_rtx,
7822 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7823 convert_move (va_r, next, 0);
7824 }
7825 return;
7826 }
7827
7828 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7829 f_fpr = DECL_CHAIN (f_gpr);
7830 f_ovf = DECL_CHAIN (f_fpr);
7831 f_sav = DECL_CHAIN (f_ovf);
7832
7833 valist = build_simple_mem_ref (valist);
7834 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7835 /* The following should be folded into the MEM_REF offset. */
7836 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7837 f_gpr, NULL_TREE);
7838 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7839 f_fpr, NULL_TREE);
7840 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7841 f_ovf, NULL_TREE);
7842 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7843 f_sav, NULL_TREE);
7844
7845 /* Count number of gp and fp argument registers used. */
7846 words = crtl->args.info.words;
7847 n_gpr = crtl->args.info.regno;
7848 n_fpr = crtl->args.info.sse_regno;
7849
7850 if (cfun->va_list_gpr_size)
7851 {
7852 type = TREE_TYPE (gpr);
7853 t = build2 (MODIFY_EXPR, type,
7854 gpr, build_int_cst (type, n_gpr * 8));
7855 TREE_SIDE_EFFECTS (t) = 1;
7856 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7857 }
7858
7859 if (TARGET_SSE && cfun->va_list_fpr_size)
7860 {
7861 type = TREE_TYPE (fpr);
7862 t = build2 (MODIFY_EXPR, type, fpr,
7863 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7864 TREE_SIDE_EFFECTS (t) = 1;
7865 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7866 }
7867
7868 /* Find the overflow area. */
7869 type = TREE_TYPE (ovf);
7870 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7871 ovf_rtx = crtl->args.internal_arg_pointer;
7872 else
7873 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7874 t = make_tree (type, ovf_rtx);
7875 if (words != 0)
7876 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7877 t = build2 (MODIFY_EXPR, type, ovf, t);
7878 TREE_SIDE_EFFECTS (t) = 1;
7879 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7880
7881 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7882 {
7883 /* Find the register save area.
7884 The function prologue saves it right above the stack frame. */
7885 type = TREE_TYPE (sav);
7886 t = make_tree (type, frame_pointer_rtx);
7887 if (!ix86_varargs_gpr_size)
7888 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7889 t = build2 (MODIFY_EXPR, type, sav, t);
7890 TREE_SIDE_EFFECTS (t) = 1;
7891 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7892 }
7893 }
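/* Editorial illustration: values established by the expansion above for a
   SysV x86-64 va_list, assuming the function has already consumed n_gpr
   integer and n_fpr SSE argument registers:
     gp_offset         = n_gpr * 8
     fp_offset         = X86_64_REGPARM_MAX * 8 + n_fpr * 16
     overflow_arg_area = incoming argument pointer + words * UNITS_PER_WORD
     reg_save_area     = frame pointer (biased by -8 * X86_64_REGPARM_MAX
                         when no GPR save area was allocated).  */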
7894
7895 /* Implement va_arg. */
7896
7897 static tree
7898 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7899 gimple_seq *post_p)
7900 {
7901 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7902 tree f_gpr, f_fpr, f_ovf, f_sav;
7903 tree gpr, fpr, ovf, sav, t;
7904 int size, rsize;
7905 tree lab_false, lab_over = NULL_TREE;
7906 tree addr, t2;
7907 rtx container;
7908 int indirect_p = 0;
7909 tree ptrtype;
7910 enum machine_mode nat_mode;
7911 unsigned int arg_boundary;
7912
7913 /* Only 64bit target needs something special. */
7914 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7915 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7916
7917 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7918 f_fpr = DECL_CHAIN (f_gpr);
7919 f_ovf = DECL_CHAIN (f_fpr);
7920 f_sav = DECL_CHAIN (f_ovf);
7921
7922 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7923 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7924 valist = build_va_arg_indirect_ref (valist);
7925 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7926 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7927 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7928
7929 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7930 if (indirect_p)
7931 type = build_pointer_type (type);
7932 size = int_size_in_bytes (type);
7933 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7934
7935 nat_mode = type_natural_mode (type, NULL);
7936 switch (nat_mode)
7937 {
7938 case V8SFmode:
7939 case V8SImode:
7940 case V32QImode:
7941 case V16HImode:
7942 case V4DFmode:
7943 case V4DImode:
7944 /* Unnamed 256bit vector mode parameters are passed on stack. */
7945 if (!TARGET_64BIT_MS_ABI)
7946 {
7947 container = NULL;
7948 break;
7949 }
7950
7951 default:
7952 container = construct_container (nat_mode, TYPE_MODE (type),
7953 type, 0, X86_64_REGPARM_MAX,
7954 X86_64_SSE_REGPARM_MAX, intreg,
7955 0);
7956 break;
7957 }
7958
7959 /* Pull the value out of the saved registers. */
7960
7961 addr = create_tmp_var (ptr_type_node, "addr");
7962
7963 if (container)
7964 {
7965 int needed_intregs, needed_sseregs;
7966 bool need_temp;
7967 tree int_addr, sse_addr;
7968
7969 lab_false = create_artificial_label (UNKNOWN_LOCATION);
7970 lab_over = create_artificial_label (UNKNOWN_LOCATION);
7971
7972 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7973
7974 need_temp = (!REG_P (container)
7975 && ((needed_intregs && TYPE_ALIGN (type) > 64)
7976 || TYPE_ALIGN (type) > 128));
7977
7978 /* In case we are passing a structure, verify that it is a consecutive
7979 block in the register save area. If not, we need to do moves. */
7980 if (!need_temp && !REG_P (container))
7981 {
7982 /* Verify that all registers are strictly consecutive */
7983 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7984 {
7985 int i;
7986
7987 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7988 {
7989 rtx slot = XVECEXP (container, 0, i);
7990 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7991 || INTVAL (XEXP (slot, 1)) != i * 16)
7992 need_temp = 1;
7993 }
7994 }
7995 else
7996 {
7997 int i;
7998
7999 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8000 {
8001 rtx slot = XVECEXP (container, 0, i);
8002 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8003 || INTVAL (XEXP (slot, 1)) != i * 8)
8004 need_temp = 1;
8005 }
8006 }
8007 }
8008 if (!need_temp)
8009 {
8010 int_addr = addr;
8011 sse_addr = addr;
8012 }
8013 else
8014 {
8015 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8016 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8017 }
8018
8019 /* First ensure that we fit completely in registers. */
8020 if (needed_intregs)
8021 {
8022 t = build_int_cst (TREE_TYPE (gpr),
8023 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8024 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8025 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8026 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8027 gimplify_and_add (t, pre_p);
8028 }
8029 if (needed_sseregs)
8030 {
8031 t = build_int_cst (TREE_TYPE (fpr),
8032 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8033 + X86_64_REGPARM_MAX * 8);
8034 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8035 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8036 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8037 gimplify_and_add (t, pre_p);
8038 }
8039
8040 /* Compute index to start of area used for integer regs. */
8041 if (needed_intregs)
8042 {
8043 /* int_addr = gpr + sav; */
8044 t = fold_build_pointer_plus (sav, gpr);
8045 gimplify_assign (int_addr, t, pre_p);
8046 }
8047 if (needed_sseregs)
8048 {
8049 /* sse_addr = fpr + sav; */
8050 t = fold_build_pointer_plus (sav, fpr);
8051 gimplify_assign (sse_addr, t, pre_p);
8052 }
8053 if (need_temp)
8054 {
8055 int i, prev_size = 0;
8056 tree temp = create_tmp_var (type, "va_arg_tmp");
8057
8058 /* addr = &temp; */
8059 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8060 gimplify_assign (addr, t, pre_p);
8061
8062 for (i = 0; i < XVECLEN (container, 0); i++)
8063 {
8064 rtx slot = XVECEXP (container, 0, i);
8065 rtx reg = XEXP (slot, 0);
8066 enum machine_mode mode = GET_MODE (reg);
8067 tree piece_type;
8068 tree addr_type;
8069 tree daddr_type;
8070 tree src_addr, src;
8071 int src_offset;
8072 tree dest_addr, dest;
8073 int cur_size = GET_MODE_SIZE (mode);
8074
8075 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8076 prev_size = INTVAL (XEXP (slot, 1));
8077 if (prev_size + cur_size > size)
8078 {
8079 cur_size = size - prev_size;
8080 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8081 if (mode == BLKmode)
8082 mode = QImode;
8083 }
8084 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8085 if (mode == GET_MODE (reg))
8086 addr_type = build_pointer_type (piece_type);
8087 else
8088 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8089 true);
8090 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8091 true);
8092
8093 if (SSE_REGNO_P (REGNO (reg)))
8094 {
8095 src_addr = sse_addr;
8096 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8097 }
8098 else
8099 {
8100 src_addr = int_addr;
8101 src_offset = REGNO (reg) * 8;
8102 }
8103 src_addr = fold_convert (addr_type, src_addr);
8104 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8105
8106 dest_addr = fold_convert (daddr_type, addr);
8107 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8108 if (cur_size == GET_MODE_SIZE (mode))
8109 {
8110 src = build_va_arg_indirect_ref (src_addr);
8111 dest = build_va_arg_indirect_ref (dest_addr);
8112
8113 gimplify_assign (dest, src, pre_p);
8114 }
8115 else
8116 {
8117 tree copy
8118 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8119 3, dest_addr, src_addr,
8120 size_int (cur_size));
8121 gimplify_and_add (copy, pre_p);
8122 }
8123 prev_size += cur_size;
8124 }
8125 }
8126
8127 if (needed_intregs)
8128 {
8129 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8130 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8131 gimplify_assign (gpr, t, pre_p);
8132 }
8133
8134 if (needed_sseregs)
8135 {
8136 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8137 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8138 gimplify_assign (fpr, t, pre_p);
8139 }
8140
8141 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8142
8143 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8144 }
8145
8146 /* ... otherwise out of the overflow area. */
8147
8148 /* When the caller aligns a parameter on the stack, any alignment
8149 beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
8150 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here with the
8151 caller. */
8152 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8153 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8154 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8155
8156 /* Care for on-stack alignment if needed. */
8157 if (arg_boundary <= 64 || size == 0)
8158 t = ovf;
8159 else
8160 {
8161 HOST_WIDE_INT align = arg_boundary / 8;
8162 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8163 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8164 build_int_cst (TREE_TYPE (t), -align));
8165 }
8166
8167 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8168 gimplify_assign (addr, t, pre_p);
8169
8170 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8171 gimplify_assign (unshare_expr (ovf), t, pre_p);
8172
8173 if (container)
8174 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8175
8176 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8177 addr = fold_convert (ptrtype, addr);
8178
8179 if (indirect_p)
8180 addr = build_va_arg_indirect_ref (addr);
8181 return build_va_arg_indirect_ref (addr);
8182 }
8183 \f
8184 /* Return true if OPNUM's MEM should be matched
8185 in movabs* patterns. */
8186
8187 bool
8188 ix86_check_movabs (rtx insn, int opnum)
8189 {
8190 rtx set, mem;
8191
8192 set = PATTERN (insn);
8193 if (GET_CODE (set) == PARALLEL)
8194 set = XVECEXP (set, 0, 0);
8195 gcc_assert (GET_CODE (set) == SET);
8196 mem = XEXP (set, opnum);
8197 while (GET_CODE (mem) == SUBREG)
8198 mem = SUBREG_REG (mem);
8199 gcc_assert (MEM_P (mem));
8200 return volatile_ok || !MEM_VOLATILE_P (mem);
8201 }
8202 \f
8203 /* Initialize the table of extra 80387 mathematical constants. */
8204
8205 static void
8206 init_ext_80387_constants (void)
8207 {
8208 static const char * cst[5] =
8209 {
8210 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8211 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8212 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8213 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8214 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8215 };
8216 int i;
8217
8218 for (i = 0; i < 5; i++)
8219 {
8220 real_from_string (&ext_80387_constants_table[i], cst[i]);
8221 /* Ensure each constant is rounded to XFmode precision. */
8222 real_convert (&ext_80387_constants_table[i],
8223 XFmode, &ext_80387_constants_table[i]);
8224 }
8225
8226 ext_80387_constants_init = 1;
8227 }
8228
8229 /* Return non-zero if the constant is something that
8230 can be loaded with a special instruction. */
8231
8232 int
8233 standard_80387_constant_p (rtx x)
8234 {
8235 enum machine_mode mode = GET_MODE (x);
8236
8237 REAL_VALUE_TYPE r;
8238
8239 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8240 return -1;
8241
8242 if (x == CONST0_RTX (mode))
8243 return 1;
8244 if (x == CONST1_RTX (mode))
8245 return 2;
8246
8247 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8248
8249 /* For XFmode constants, try to find a special 80387 instruction when
8250 optimizing for size or on those CPUs that benefit from them. */
8251 if (mode == XFmode
8252 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8253 {
8254 int i;
8255
8256 if (! ext_80387_constants_init)
8257 init_ext_80387_constants ();
8258
8259 for (i = 0; i < 5; i++)
8260 if (real_identical (&r, &ext_80387_constants_table[i]))
8261 return i + 3;
8262 }
8263
8264 /* Load of the constant -0.0 or -1.0 will be split as
8265 fldz;fchs or fld1;fchs sequence. */
8266 if (real_isnegzero (&r))
8267 return 8;
8268 if (real_identical (&r, &dconstm1))
8269 return 9;
8270
8271 return 0;
8272 }
8273
8274 /* Return the opcode of the special instruction to be used to load
8275 the constant X. */
8276
8277 const char *
8278 standard_80387_constant_opcode (rtx x)
8279 {
8280 switch (standard_80387_constant_p (x))
8281 {
8282 case 1:
8283 return "fldz";
8284 case 2:
8285 return "fld1";
8286 case 3:
8287 return "fldlg2";
8288 case 4:
8289 return "fldln2";
8290 case 5:
8291 return "fldl2e";
8292 case 6:
8293 return "fldl2t";
8294 case 7:
8295 return "fldpi";
8296 case 8:
8297 case 9:
8298 return "#";
8299 default:
8300 gcc_unreachable ();
8301 }
8302 }
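/* Editorial illustration: mapping between the return values of
   standard_80387_constant_p above and the instruction chosen here:
     1 -> fldz   (+0.0)        2 -> fld1   (1.0)
     3 -> fldlg2 (log10 2)     4 -> fldln2 (ln 2)
     5 -> fldl2e (log2 e)      6 -> fldl2t (log2 10)
     7 -> fldpi  (pi)          8, 9 -> split into fldz/fld1 followed by fchs. */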
8303
8304 /* Return the CONST_DOUBLE representing the 80387 constant that is
8305 loaded by the specified special instruction. The argument IDX
8306 matches the return value from standard_80387_constant_p. */
8307
8308 rtx
8309 standard_80387_constant_rtx (int idx)
8310 {
8311 int i;
8312
8313 if (! ext_80387_constants_init)
8314 init_ext_80387_constants ();
8315
8316 switch (idx)
8317 {
8318 case 3:
8319 case 4:
8320 case 5:
8321 case 6:
8322 case 7:
8323 i = idx - 3;
8324 break;
8325
8326 default:
8327 gcc_unreachable ();
8328 }
8329
8330 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8331 XFmode);
8332 }
8333
8334 /* Return 1 if X is all 0s and 2 if X is all 1s
8335 in a supported SSE/AVX vector mode. */
8336
8337 int
8338 standard_sse_constant_p (rtx x)
8339 {
8340 enum machine_mode mode = GET_MODE (x);
8341
8342 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8343 return 1;
8344 if (vector_all_ones_operand (x, mode))
8345 switch (mode)
8346 {
8347 case V16QImode:
8348 case V8HImode:
8349 case V4SImode:
8350 case V2DImode:
8351 if (TARGET_SSE2)
8352 return 2;
8353 case V32QImode:
8354 case V16HImode:
8355 case V8SImode:
8356 case V4DImode:
8357 if (TARGET_AVX2)
8358 return 2;
8359 default:
8360 break;
8361 }
8362
8363 return 0;
8364 }
8365
8366 /* Return the opcode of the special instruction to be used to load
8367 the constant X. */
8368
8369 const char *
8370 standard_sse_constant_opcode (rtx insn, rtx x)
8371 {
8372 switch (standard_sse_constant_p (x))
8373 {
8374 case 1:
8375 switch (get_attr_mode (insn))
8376 {
8377 case MODE_TI:
8378 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8379 return "%vpxor\t%0, %d0";
8380 case MODE_V2DF:
8381 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8382 return "%vxorpd\t%0, %d0";
8383 case MODE_V4SF:
8384 return "%vxorps\t%0, %d0";
8385
8386 case MODE_OI:
8387 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8388 return "vpxor\t%x0, %x0, %x0";
8389 case MODE_V4DF:
8390 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8391 return "vxorpd\t%x0, %x0, %x0";
8392 case MODE_V8SF:
8393 return "vxorps\t%x0, %x0, %x0";
8394
8395 default:
8396 break;
8397 }
8398
8399 case 2:
8400 if (TARGET_AVX)
8401 return "vpcmpeqd\t%0, %0, %0";
8402 else
8403 return "pcmpeqd\t%0, %0";
8404
8405 default:
8406 break;
8407 }
8408 gcc_unreachable ();
8409 }
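/* Editorial illustration: the two special constants recognized by
   standard_sse_constant_p load without a memory operand, roughly:
     all zeros -> pxor %xmm0, %xmm0     (or the xorps/xorpd/vxorps forms above)
     all ones  -> pcmpeqd %xmm0, %xmm0  (vpcmpeqd %xmm0, %xmm0, %xmm0 on AVX) */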
8410
8411 /* Return true if OP contains a symbol reference. */
8412
8413 bool
8414 symbolic_reference_mentioned_p (rtx op)
8415 {
8416 const char *fmt;
8417 int i;
8418
8419 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8420 return true;
8421
8422 fmt = GET_RTX_FORMAT (GET_CODE (op));
8423 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8424 {
8425 if (fmt[i] == 'E')
8426 {
8427 int j;
8428
8429 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8430 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8431 return true;
8432 }
8433
8434 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8435 return true;
8436 }
8437
8438 return false;
8439 }
8440
8441 /* Return true if it is appropriate to emit `ret' instructions in the
8442 body of a function. Do this only if the epilogue is simple, needing a
8443 couple of insns. Prior to reloading, we can't tell how many registers
8444 must be saved, so return false then. Return false if there is no frame
8445 marker to de-allocate. */
8446
8447 bool
8448 ix86_can_use_return_insn_p (void)
8449 {
8450 struct ix86_frame frame;
8451
8452 if (! reload_completed || frame_pointer_needed)
8453 return 0;
8454
8455 /* Don't allow more than 32k pop, since that's all we can do
8456 with one instruction. */
8457 if (crtl->args.pops_args && crtl->args.size >= 32768)
8458 return 0;
8459
8460 ix86_compute_frame_layout (&frame);
8461 return (frame.stack_pointer_offset == UNITS_PER_WORD
8462 && (frame.nregs + frame.nsseregs) == 0);
8463 }
8464 \f
8465 /* Value should be nonzero if functions must have frame pointers.
8466 Zero means the frame pointer need not be set up (and parms may
8467 be accessed via the stack pointer) in functions that seem suitable. */
8468
8469 static bool
8470 ix86_frame_pointer_required (void)
8471 {
8472 /* If we accessed previous frames, then the generated code expects
8473 to be able to access the saved ebp value in our frame. */
8474 if (cfun->machine->accesses_prev_frame)
8475 return true;
8476
8477 /* Several x86 OSes need a frame pointer for other reasons,
8478 usually pertaining to setjmp. */
8479 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8480 return true;
8481
8482 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8483 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8484 return true;
8485
8486 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8487 turns off the frame pointer by default. Turn it back on now if
8488 we've not got a leaf function. */
8489 if (TARGET_OMIT_LEAF_FRAME_POINTER
8490 && (!current_function_is_leaf
8491 || ix86_current_function_calls_tls_descriptor))
8492 return true;
8493
8494 if (crtl->profile && !flag_fentry)
8495 return true;
8496
8497 return false;
8498 }
8499
8500 /* Record that the current function accesses previous call frames. */
8501
8502 void
8503 ix86_setup_frame_addresses (void)
8504 {
8505 cfun->machine->accesses_prev_frame = 1;
8506 }
8507 \f
8508 #ifndef USE_HIDDEN_LINKONCE
8509 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8510 # define USE_HIDDEN_LINKONCE 1
8511 # else
8512 # define USE_HIDDEN_LINKONCE 0
8513 # endif
8514 #endif
8515
8516 static int pic_labels_used;
8517
8518 /* Fills in the label name that should be used for a pc thunk for
8519 the given register. */
8520
8521 static void
8522 get_pc_thunk_name (char name[32], unsigned int regno)
8523 {
8524 gcc_assert (!TARGET_64BIT);
8525
8526 if (USE_HIDDEN_LINKONCE)
8527 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8528 else
8529 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8530 }
8531
8532
8533 /* This function generates code for -fpic that loads %ebx with
8534 the return address of the caller and then returns. */
8535
8536 static void
8537 ix86_code_end (void)
8538 {
8539 rtx xops[2];
8540 int regno;
8541
8542 for (regno = AX_REG; regno <= SP_REG; regno++)
8543 {
8544 char name[32];
8545 tree decl;
8546
8547 if (!(pic_labels_used & (1 << regno)))
8548 continue;
8549
8550 get_pc_thunk_name (name, regno);
8551
8552 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8553 get_identifier (name),
8554 build_function_type_list (void_type_node, NULL_TREE));
8555 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8556 NULL_TREE, void_type_node);
8557 TREE_PUBLIC (decl) = 1;
8558 TREE_STATIC (decl) = 1;
8559
8560 #if TARGET_MACHO
8561 if (TARGET_MACHO)
8562 {
8563 switch_to_section (darwin_sections[text_coal_section]);
8564 fputs ("\t.weak_definition\t", asm_out_file);
8565 assemble_name (asm_out_file, name);
8566 fputs ("\n\t.private_extern\t", asm_out_file);
8567 assemble_name (asm_out_file, name);
8568 putc ('\n', asm_out_file);
8569 ASM_OUTPUT_LABEL (asm_out_file, name);
8570 DECL_WEAK (decl) = 1;
8571 }
8572 else
8573 #endif
8574 if (USE_HIDDEN_LINKONCE)
8575 {
8576 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8577
8578 targetm.asm_out.unique_section (decl, 0);
8579 switch_to_section (get_named_section (decl, NULL, 0));
8580
8581 targetm.asm_out.globalize_label (asm_out_file, name);
8582 fputs ("\t.hidden\t", asm_out_file);
8583 assemble_name (asm_out_file, name);
8584 putc ('\n', asm_out_file);
8585 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8586 }
8587 else
8588 {
8589 switch_to_section (text_section);
8590 ASM_OUTPUT_LABEL (asm_out_file, name);
8591 }
8592
8593 DECL_INITIAL (decl) = make_node (BLOCK);
8594 current_function_decl = decl;
8595 init_function_start (decl);
8596 first_function_block_is_cold = false;
8597 /* Make sure unwind info is emitted for the thunk if needed. */
8598 final_start_function (emit_barrier (), asm_out_file, 1);
8599
8600 /* Pad stack IP move with 4 instructions (two NOPs count
8601 as one instruction). */
8602 if (TARGET_PAD_SHORT_FUNCTION)
8603 {
8604 int i = 8;
8605
8606 while (i--)
8607 fputs ("\tnop\n", asm_out_file);
8608 }
8609
8610 xops[0] = gen_rtx_REG (Pmode, regno);
8611 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8612 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8613 fputs ("\tret\n", asm_out_file);
8614 final_end_function ();
8615 init_insn_lengths ();
8616 free_after_compilation (cfun);
8617 set_cfun (NULL);
8618 current_function_decl = NULL;
8619 }
8620
8621 if (flag_split_stack)
8622 file_end_indicate_split_stack ();
8623 }
8624
8625 /* Emit code for the SET_GOT patterns. */
8626
8627 const char *
8628 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8629 {
8630 rtx xops[3];
8631
8632 xops[0] = dest;
8633
8634 if (TARGET_VXWORKS_RTP && flag_pic)
8635 {
8636 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8637 xops[2] = gen_rtx_MEM (Pmode,
8638 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8639 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8640
8641 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8642 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8643 an unadorned address. */
8644 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8645 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8646 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8647 return "";
8648 }
8649
8650 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8651
8652 if (!flag_pic)
8653 {
8654 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8655
8656 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8657
8658 #if TARGET_MACHO
8659 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8660 is what will be referenced by the Mach-O PIC subsystem. */
8661 if (!label)
8662 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8663 #endif
8664
8665 targetm.asm_out.internal_label (asm_out_file, "L",
8666 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8667 }
8668 else
8669 {
8670 char name[32];
8671 get_pc_thunk_name (name, REGNO (dest));
8672 pic_labels_used |= 1 << REGNO (dest);
8673
8674 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8675 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8676 output_asm_insn ("call\t%X2", xops);
8677 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8678 is what will be referenced by the Mach-O PIC subsystem. */
8679 #if TARGET_MACHO
8680 if (!label)
8681 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8682 else
8683 targetm.asm_out.internal_label (asm_out_file, "L",
8684 CODE_LABEL_NUMBER (label));
8685 #endif
8686 }
8687
8688 if (!TARGET_MACHO)
8689 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8690
8691 return "";
8692 }
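/* Editorial illustration: with -fpic on 32-bit targets the SET_GOT pattern
   above expands to a thunk call plus the GOT adjustment, roughly (for %ebx):
     call  __x86.get_pc_thunk.bx
     addl  $_GLOBAL_OFFSET_TABLE_, %ebx
   where the thunk emitted by ix86_code_end is simply
     movl  (%esp), %ebx
     ret  */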
8693
8694 /* Generate a "push" pattern for input ARG. */
8695
8696 static rtx
8697 gen_push (rtx arg)
8698 {
8699 struct machine_function *m = cfun->machine;
8700
8701 if (m->fs.cfa_reg == stack_pointer_rtx)
8702 m->fs.cfa_offset += UNITS_PER_WORD;
8703 m->fs.sp_offset += UNITS_PER_WORD;
8704
8705 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8706 arg = gen_rtx_REG (word_mode, REGNO (arg));
8707
8708 return gen_rtx_SET (VOIDmode,
8709 gen_rtx_MEM (word_mode,
8710 gen_rtx_PRE_DEC (Pmode,
8711 stack_pointer_rtx)),
8712 arg);
8713 }
8714
8715 /* Generate a "pop" pattern for input ARG. */
8716
8717 static rtx
8718 gen_pop (rtx arg)
8719 {
8720 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8721 arg = gen_rtx_REG (word_mode, REGNO (arg));
8722
8723 return gen_rtx_SET (VOIDmode,
8724 arg,
8725 gen_rtx_MEM (word_mode,
8726 gen_rtx_POST_INC (Pmode,
8727 stack_pointer_rtx)));
8728 }
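/* Editorial illustration: for a word_mode register on x86-64 the two
   helpers above build RTL along the lines of
     (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI arg))   -- gen_push
     (set (reg:DI arg) (mem:DI (post_inc:DI (reg:DI sp))))  -- gen_pop
   gen_push also tracks the CFA and stack-pointer offsets in cfun->machine.  */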
8729
8730 /* Return >= 0 if there is an unused call-clobbered register available
8731 for the entire function. */
8732
8733 static unsigned int
8734 ix86_select_alt_pic_regnum (void)
8735 {
8736 if (current_function_is_leaf
8737 && !crtl->profile
8738 && !ix86_current_function_calls_tls_descriptor)
8739 {
8740 int i, drap;
8741 /* Can't use the same register for both PIC and DRAP. */
8742 if (crtl->drap_reg)
8743 drap = REGNO (crtl->drap_reg);
8744 else
8745 drap = -1;
8746 for (i = 2; i >= 0; --i)
8747 if (i != drap && !df_regs_ever_live_p (i))
8748 return i;
8749 }
8750
8751 return INVALID_REGNUM;
8752 }
8753
8754 /* Return TRUE if we need to save REGNO. */
8755
8756 static bool
8757 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8758 {
8759 if (pic_offset_table_rtx
8760 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8761 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8762 || crtl->profile
8763 || crtl->calls_eh_return
8764 || crtl->uses_const_pool))
8765 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8766
8767 if (crtl->calls_eh_return && maybe_eh_return)
8768 {
8769 unsigned i;
8770 for (i = 0; ; i++)
8771 {
8772 unsigned test = EH_RETURN_DATA_REGNO (i);
8773 if (test == INVALID_REGNUM)
8774 break;
8775 if (test == regno)
8776 return true;
8777 }
8778 }
8779
8780 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8781 return true;
8782
8783 return (df_regs_ever_live_p (regno)
8784 && !call_used_regs[regno]
8785 && !fixed_regs[regno]
8786 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8787 }
8788
8789 /* Return the number of saved general purpose registers. */
8790
8791 static int
8792 ix86_nsaved_regs (void)
8793 {
8794 int nregs = 0;
8795 int regno;
8796
8797 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8798 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8799 nregs ++;
8800 return nregs;
8801 }
8802
8803 /* Return the number of saved SSE registers. */
8804
8805 static int
8806 ix86_nsaved_sseregs (void)
8807 {
8808 int nregs = 0;
8809 int regno;
8810
8811 if (!TARGET_64BIT_MS_ABI)
8812 return 0;
8813 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8814 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8815 nregs ++;
8816 return nregs;
8817 }
8818
8819 /* Given FROM and TO register numbers, say whether this elimination is
8820 allowed. If stack alignment is needed, we can only replace argument
8821 pointer with hard frame pointer, or replace frame pointer with stack
8822 pointer. Otherwise, frame pointer elimination is automatically
8823 handled and all other eliminations are valid. */
8824
8825 static bool
8826 ix86_can_eliminate (const int from, const int to)
8827 {
8828 if (stack_realign_fp)
8829 return ((from == ARG_POINTER_REGNUM
8830 && to == HARD_FRAME_POINTER_REGNUM)
8831 || (from == FRAME_POINTER_REGNUM
8832 && to == STACK_POINTER_REGNUM));
8833 else
8834 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8835 }
8836
8837 /* Return the offset between two registers, one to be eliminated, and the other
8838 its replacement, at the start of a routine. */
8839
8840 HOST_WIDE_INT
8841 ix86_initial_elimination_offset (int from, int to)
8842 {
8843 struct ix86_frame frame;
8844 ix86_compute_frame_layout (&frame);
8845
8846 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8847 return frame.hard_frame_pointer_offset;
8848 else if (from == FRAME_POINTER_REGNUM
8849 && to == HARD_FRAME_POINTER_REGNUM)
8850 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8851 else
8852 {
8853 gcc_assert (to == STACK_POINTER_REGNUM);
8854
8855 if (from == ARG_POINTER_REGNUM)
8856 return frame.stack_pointer_offset;
8857
8858 gcc_assert (from == FRAME_POINTER_REGNUM);
8859 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8860 }
8861 }
8862
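/* A worked example with made-up frame numbers (purely illustrative): if
   ix86_compute_frame_layout were to fill in hard_frame_pointer_offset = 16,
   frame_pointer_offset = 48 and stack_pointer_offset = 80, the function
   above would return

     ARG_POINTER   -> HARD_FRAME_POINTER :  16
     FRAME_POINTER -> HARD_FRAME_POINTER :  16 - 48 = -32
     ARG_POINTER   -> STACK_POINTER      :  80
     FRAME_POINTER -> STACK_POINTER      :  80 - 48 =  32

   i.e. each elimination offset is simply the distance between the two
   chosen reference points at the start of the routine.  */
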
8863 /* In a dynamically-aligned function, we can't know the offset from
8864 stack pointer to frame pointer, so we must ensure that setjmp
8865 eliminates fp against the hard fp (%ebp) rather than trying to
8866 index from %esp up to the top of the frame across a gap that is
8867 of unknown (at compile-time) size. */
8868 static rtx
8869 ix86_builtin_setjmp_frame_value (void)
8870 {
8871 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8872 }
8873
8874 /* When using -fsplit-stack, the allocation routines set a field in
8875 the TCB to the bottom of the stack plus this much space, measured
8876 in bytes. */
8877
8878 #define SPLIT_STACK_AVAILABLE 256
8879
8880 /* Fill the structure ix86_frame describing the frame of the current function. */
8881
8882 static void
8883 ix86_compute_frame_layout (struct ix86_frame *frame)
8884 {
8885 unsigned int stack_alignment_needed;
8886 HOST_WIDE_INT offset;
8887 unsigned int preferred_alignment;
8888 HOST_WIDE_INT size = get_frame_size ();
8889 HOST_WIDE_INT to_allocate;
8890
8891 frame->nregs = ix86_nsaved_regs ();
8892 frame->nsseregs = ix86_nsaved_sseregs ();
8893
8894 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8895 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8896
8897 /* The 64-bit MS ABI seems to require the stack to always be 16-byte
8898 aligned, except in function prologues and in leaf functions. */
8899 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8900 && (!current_function_is_leaf || cfun->calls_alloca != 0
8901 || ix86_current_function_calls_tls_descriptor))
8902 {
8903 preferred_alignment = 16;
8904 stack_alignment_needed = 16;
8905 crtl->preferred_stack_boundary = 128;
8906 crtl->stack_alignment_needed = 128;
8907 }
8908
8909 gcc_assert (!size || stack_alignment_needed);
8910 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8911 gcc_assert (preferred_alignment <= stack_alignment_needed);
8912
8913 /* For SEH we have to limit the amount of code movement into the prologue.
8914 At present we do this via a BLOCKAGE, at which point there's very little
8915 scheduling that can be done, which means that there's very little point
8916 in doing anything except PUSHs. */
8917 if (TARGET_SEH)
8918 cfun->machine->use_fast_prologue_epilogue = false;
8919
8920 /* During reload iteration the number of saved registers can change.
8921 Recompute the value as needed. Do not recompute when the number of
8922 registers didn't change, as reload does multiple calls to the function
8923 and does not expect the decision to change within a single iteration. */
8924 else if (!optimize_function_for_size_p (cfun)
8925 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8926 {
8927 int count = frame->nregs;
8928 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8929
8930 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8931
8932 /* The fast prologue uses move instead of push to save registers. This
8933 is significantly longer, but also executes faster as modern hardware
8934 can execute the moves in parallel, but can't do that for push/pop.
8935
8936 Be careful about choosing what prologue to emit: when the function takes
8937 many instructions to execute, we may use the slow version, as well as
8938 when the function is known to be outside the hot spot (this is known
8939 from feedback only). Weight the size of the function by the number of
8940 registers to save, as it is cheap to use one or two push instructions
8941 but very slow to use many of them. */
8942 if (count)
8943 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8944 if (node->frequency < NODE_FREQUENCY_NORMAL
8945 || (flag_branch_probabilities
8946 && node->frequency < NODE_FREQUENCY_HOT))
8947 cfun->machine->use_fast_prologue_epilogue = false;
8948 else
8949 cfun->machine->use_fast_prologue_epilogue
8950 = !expensive_function_p (count);
8951 }
8952
8953 frame->save_regs_using_mov
8954 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8955 /* If static stack checking is enabled and done with probes,
8956 the registers need to be saved before allocating the frame. */
8957 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8958
8959 /* Skip return address. */
8960 offset = UNITS_PER_WORD;
8961
8962 /* Skip pushed static chain. */
8963 if (ix86_static_chain_on_stack)
8964 offset += UNITS_PER_WORD;
8965
8966 /* Skip saved base pointer. */
8967 if (frame_pointer_needed)
8968 offset += UNITS_PER_WORD;
8969 frame->hfp_save_offset = offset;
8970
8971 /* The traditional frame pointer location is at the top of the frame. */
8972 frame->hard_frame_pointer_offset = offset;
8973
8974 /* Register save area */
8975 offset += frame->nregs * UNITS_PER_WORD;
8976 frame->reg_save_offset = offset;
8977
8978 /* Align and set SSE register save area. */
8979 if (frame->nsseregs)
8980 {
8981 /* The only ABI that has saved SSE registers (Win64) also has a
8982 16-byte aligned default stack, and thus we don't need to be
8983 within the re-aligned local stack frame to save them. */
8984 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
8985 offset = (offset + 16 - 1) & -16;
8986 offset += frame->nsseregs * 16;
8987 }
8988 frame->sse_reg_save_offset = offset;
8989
8990 /* The re-aligned stack starts here. Values before this point are not
8991 directly comparable with values below this point. In order to make
8992 sure that no value happens to be the same before and after, force
8993 the alignment computation below to add a non-zero value. */
8994 if (stack_realign_fp)
8995 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8996
8997 /* Va-arg area */
8998 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8999 offset += frame->va_arg_size;
9000
9001 /* Align start of frame for local function. */
9002 if (stack_realign_fp
9003 || offset != frame->sse_reg_save_offset
9004 || size != 0
9005 || !current_function_is_leaf
9006 || cfun->calls_alloca
9007 || ix86_current_function_calls_tls_descriptor)
9008 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9009
9010 /* Frame pointer points here. */
9011 frame->frame_pointer_offset = offset;
9012
9013 offset += size;
9014
9015 /* Add outgoing arguments area. Can be skipped if we eliminated
9016 all the function calls as dead code.
9017 Skipping is however impossible when the function calls alloca. The alloca
9018 expander assumes that the last crtl->outgoing_args_size bytes
9019 of the stack frame are unused. */
9020 if (ACCUMULATE_OUTGOING_ARGS
9021 && (!current_function_is_leaf || cfun->calls_alloca
9022 || ix86_current_function_calls_tls_descriptor))
9023 {
9024 offset += crtl->outgoing_args_size;
9025 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9026 }
9027 else
9028 frame->outgoing_arguments_size = 0;
9029
9030 /* Align stack boundary. Only needed if we're calling another function
9031 or using alloca. */
9032 if (!current_function_is_leaf || cfun->calls_alloca
9033 || ix86_current_function_calls_tls_descriptor)
9034 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9035
9036 /* We've reached end of stack frame. */
9037 frame->stack_pointer_offset = offset;
9038
9039 /* Size prologue needs to allocate. */
9040 to_allocate = offset - frame->sse_reg_save_offset;
9041
9042 if ((!to_allocate && frame->nregs <= 1)
9043 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9044 frame->save_regs_using_mov = false;
9045
9046 if (ix86_using_red_zone ()
9047 && current_function_sp_is_unchanging
9048 && current_function_is_leaf
9049 && !ix86_current_function_calls_tls_descriptor)
9050 {
9051 frame->red_zone_size = to_allocate;
9052 if (frame->save_regs_using_mov)
9053 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9054 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9055 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9056 }
9057 else
9058 frame->red_zone_size = 0;
9059 frame->stack_pointer_offset -= frame->red_zone_size;
9060
9061 /* The SEH frame pointer location is near the bottom of the frame.
9062 This is enforced by the fact that the difference between the
9063 stack pointer and the frame pointer is limited to 240 bytes in
9064 the unwind data structure. */
9065 if (TARGET_SEH)
9066 {
9067 HOST_WIDE_INT diff;
9068
9069 /* If we can leave the frame pointer where it is, do so. */
9070 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9071 if (diff > 240 || (diff & 15) != 0)
9072 {
9073 /* Ideally we'd determine what portion of the local stack frame
9074 (within the constraint of the lowest 240) is most heavily used.
9075 But without that complication, simply bias the frame pointer
9076 by 128 bytes so as to maximize the amount of the local stack
9077 frame that is addressable with 8-bit offsets. */
9078 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9079 }
9080 }
9081 }
9082
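/* The layout code above repeatedly rounds an offset up to a power-of-two
   alignment with the idiom (offset + align - 1) & -align.  A minimal
   stand-alone sketch of that arithmetic (assuming ALIGN is a power of two,
   as it always is here):

     static HOST_WIDE_INT
     round_up (HOST_WIDE_INT offset, HOST_WIDE_INT align)
     {
       return (offset + align - 1) & -align;
     }

   e.g. round_up (20, 16) == 32 and round_up (32, 16) == 32, which is the
   rounding applied to the SSE save area, the local frame and the outgoing
   argument area.  */
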
9083 /* This is semi-inlined memory_address_length, but simplified
9084 since we know that we're always dealing with reg+offset, and
9085 to avoid having to create and discard all that rtl. */
9086
9087 static inline int
9088 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9089 {
9090 int len = 4;
9091
9092 if (offset == 0)
9093 {
9094 /* EBP and R13 cannot be encoded without an offset. */
9095 len = (regno == BP_REG || regno == R13_REG);
9096 }
9097 else if (IN_RANGE (offset, -128, 127))
9098 len = 1;
9099
9100 /* ESP and R12 must be encoded with a SIB byte. */
9101 if (regno == SP_REG || regno == R12_REG)
9102 len++;
9103
9104 return len;
9105 }
9106
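/* Some concrete values of the function above, as an informal check of its
   encoding rules (the result counts address bytes beyond the ModRM byte):

     choose_baseaddr_len (AX_REG,   0) == 0   no displacement, no SIB
     choose_baseaddr_len (BP_REG,   0) == 1   ebp always needs a disp8
     choose_baseaddr_len (SP_REG,   0) == 1   esp always needs a SIB byte
     choose_baseaddr_len (SP_REG,  64) == 2   disp8 + SIB
     choose_baseaddr_len (AX_REG, 300) == 4   disp32  */
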
9107 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9108 The valid base registers are taken from CFUN->MACHINE->FS. */
9109
9110 static rtx
9111 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9112 {
9113 const struct machine_function *m = cfun->machine;
9114 rtx base_reg = NULL;
9115 HOST_WIDE_INT base_offset = 0;
9116
9117 if (m->use_fast_prologue_epilogue)
9118 {
9119 /* Choose the base register most likely to allow the most scheduling
9120 opportunities. Generally FP is valid throughout the function,
9121 while DRAP must be reloaded within the epilogue. But choose either
9122 over the SP due to increased encoding size. */
9123
9124 if (m->fs.fp_valid)
9125 {
9126 base_reg = hard_frame_pointer_rtx;
9127 base_offset = m->fs.fp_offset - cfa_offset;
9128 }
9129 else if (m->fs.drap_valid)
9130 {
9131 base_reg = crtl->drap_reg;
9132 base_offset = 0 - cfa_offset;
9133 }
9134 else if (m->fs.sp_valid)
9135 {
9136 base_reg = stack_pointer_rtx;
9137 base_offset = m->fs.sp_offset - cfa_offset;
9138 }
9139 }
9140 else
9141 {
9142 HOST_WIDE_INT toffset;
9143 int len = 16, tlen;
9144
9145 /* Choose the base register with the smallest address encoding.
9146 With a tie, choose FP > DRAP > SP. */
9147 if (m->fs.sp_valid)
9148 {
9149 base_reg = stack_pointer_rtx;
9150 base_offset = m->fs.sp_offset - cfa_offset;
9151 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9152 }
9153 if (m->fs.drap_valid)
9154 {
9155 toffset = 0 - cfa_offset;
9156 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9157 if (tlen <= len)
9158 {
9159 base_reg = crtl->drap_reg;
9160 base_offset = toffset;
9161 len = tlen;
9162 }
9163 }
9164 if (m->fs.fp_valid)
9165 {
9166 toffset = m->fs.fp_offset - cfa_offset;
9167 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9168 if (tlen <= len)
9169 {
9170 base_reg = hard_frame_pointer_rtx;
9171 base_offset = toffset;
9172 len = tlen;
9173 }
9174 }
9175 }
9176 gcc_assert (base_reg != NULL);
9177
9178 return plus_constant (base_reg, base_offset);
9179 }
9180
9181 /* Emit code to save registers in the prologue. */
9182
9183 static void
9184 ix86_emit_save_regs (void)
9185 {
9186 unsigned int regno;
9187 rtx insn;
9188
9189 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9190 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9191 {
9192 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9193 RTX_FRAME_RELATED_P (insn) = 1;
9194 }
9195 }
9196
9197 /* Emit a single register save at CFA - CFA_OFFSET. */
9198
9199 static void
9200 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9201 HOST_WIDE_INT cfa_offset)
9202 {
9203 struct machine_function *m = cfun->machine;
9204 rtx reg = gen_rtx_REG (mode, regno);
9205 rtx mem, addr, base, insn;
9206
9207 addr = choose_baseaddr (cfa_offset);
9208 mem = gen_frame_mem (mode, addr);
9209
9210 /* For SSE saves, we need to indicate the 128-bit alignment. */
9211 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9212
9213 insn = emit_move_insn (mem, reg);
9214 RTX_FRAME_RELATED_P (insn) = 1;
9215
9216 base = addr;
9217 if (GET_CODE (base) == PLUS)
9218 base = XEXP (base, 0);
9219 gcc_checking_assert (REG_P (base));
9220
9221 /* When saving registers into a re-aligned local stack frame, avoid
9222 any tricky guessing by dwarf2out. */
9223 if (m->fs.realigned)
9224 {
9225 gcc_checking_assert (stack_realign_drap);
9226
9227 if (regno == REGNO (crtl->drap_reg))
9228 {
9229 /* A bit of a hack. We force the DRAP register to be saved in
9230 the re-aligned stack frame, which provides us with a copy
9231 of the CFA that will last past the prologue. Install it. */
9232 gcc_checking_assert (cfun->machine->fs.fp_valid);
9233 addr = plus_constant (hard_frame_pointer_rtx,
9234 cfun->machine->fs.fp_offset - cfa_offset);
9235 mem = gen_rtx_MEM (mode, addr);
9236 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9237 }
9238 else
9239 {
9240 /* The frame pointer is a stable reference within the
9241 aligned frame. Use it. */
9242 gcc_checking_assert (cfun->machine->fs.fp_valid);
9243 addr = plus_constant (hard_frame_pointer_rtx,
9244 cfun->machine->fs.fp_offset - cfa_offset);
9245 mem = gen_rtx_MEM (mode, addr);
9246 add_reg_note (insn, REG_CFA_EXPRESSION,
9247 gen_rtx_SET (VOIDmode, mem, reg));
9248 }
9249 }
9250
9251 /* The memory may not be relative to the current CFA register,
9252 which means that we may need to generate a new pattern for
9253 use by the unwind info. */
9254 else if (base != m->fs.cfa_reg)
9255 {
9256 addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9257 mem = gen_rtx_MEM (mode, addr);
9258 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9259 }
9260 }
9261
9262 /* Emit code to save registers using MOV insns.
9263 First register is stored at CFA - CFA_OFFSET. */
9264 static void
9265 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9266 {
9267 unsigned int regno;
9268
9269 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9270 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9271 {
9272 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9273 cfa_offset -= UNITS_PER_WORD;
9274 }
9275 }
9276
9277 /* Emit code to save SSE registers using MOV insns.
9278 First register is stored at CFA - CFA_OFFSET. */
9279 static void
9280 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9281 {
9282 unsigned int regno;
9283
9284 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9285 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9286 {
9287 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9288 cfa_offset -= 16;
9289 }
9290 }
9291
9292 static GTY(()) rtx queued_cfa_restores;
9293
9294 /* Add a REG_CFA_RESTORE REG note to INSN or queue it until the next stack
9295 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9296 Don't add the note if the previously saved value will be left untouched
9297 within stack red-zone till return, as unwinders can find the same value
9298 in the register and on the stack. */
9299
9300 static void
9301 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9302 {
9303 if (!crtl->shrink_wrapped
9304 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9305 return;
9306
9307 if (insn)
9308 {
9309 add_reg_note (insn, REG_CFA_RESTORE, reg);
9310 RTX_FRAME_RELATED_P (insn) = 1;
9311 }
9312 else
9313 queued_cfa_restores
9314 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9315 }
9316
9317 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9318
9319 static void
9320 ix86_add_queued_cfa_restore_notes (rtx insn)
9321 {
9322 rtx last;
9323 if (!queued_cfa_restores)
9324 return;
9325 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9326 ;
9327 XEXP (last, 1) = REG_NOTES (insn);
9328 REG_NOTES (insn) = queued_cfa_restores;
9329 queued_cfa_restores = NULL_RTX;
9330 RTX_FRAME_RELATED_P (insn) = 1;
9331 }
9332
9333 /* Expand prologue or epilogue stack adjustment.
9334 The pattern exists to put a dependency on all ebp-based memory accesses.
9335 STYLE should be negative if instructions should be marked as frame related,
9336 zero if %r11 register is live and cannot be freely used and positive
9337 otherwise. */
9338
9339 static void
9340 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9341 int style, bool set_cfa)
9342 {
9343 struct machine_function *m = cfun->machine;
9344 rtx insn;
9345 bool add_frame_related_expr = false;
9346
9347 if (Pmode == SImode)
9348 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9349 else if (x86_64_immediate_operand (offset, DImode))
9350 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9351 else
9352 {
9353 rtx tmp;
9354 /* r11 is used by indirect sibcall return as well, set before the
9355 epilogue and used after the epilogue. */
9356 if (style)
9357 tmp = gen_rtx_REG (DImode, R11_REG);
9358 else
9359 {
9360 gcc_assert (src != hard_frame_pointer_rtx
9361 && dest != hard_frame_pointer_rtx);
9362 tmp = hard_frame_pointer_rtx;
9363 }
9364 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9365 if (style < 0)
9366 add_frame_related_expr = true;
9367
9368 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9369 }
9370
9371 insn = emit_insn (insn);
9372 if (style >= 0)
9373 ix86_add_queued_cfa_restore_notes (insn);
9374
9375 if (set_cfa)
9376 {
9377 rtx r;
9378
9379 gcc_assert (m->fs.cfa_reg == src);
9380 m->fs.cfa_offset += INTVAL (offset);
9381 m->fs.cfa_reg = dest;
9382
9383 r = gen_rtx_PLUS (Pmode, src, offset);
9384 r = gen_rtx_SET (VOIDmode, dest, r);
9385 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9386 RTX_FRAME_RELATED_P (insn) = 1;
9387 }
9388 else if (style < 0)
9389 {
9390 RTX_FRAME_RELATED_P (insn) = 1;
9391 if (add_frame_related_expr)
9392 {
9393 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9394 r = gen_rtx_SET (VOIDmode, dest, r);
9395 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9396 }
9397 }
9398
9399 if (dest == stack_pointer_rtx)
9400 {
9401 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9402 bool valid = m->fs.sp_valid;
9403
9404 if (src == hard_frame_pointer_rtx)
9405 {
9406 valid = m->fs.fp_valid;
9407 ooffset = m->fs.fp_offset;
9408 }
9409 else if (src == crtl->drap_reg)
9410 {
9411 valid = m->fs.drap_valid;
9412 ooffset = 0;
9413 }
9414 else
9415 {
9416 /* Else there are two possibilities: SP itself, which we set
9417 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9418 taken care of by hand along the eh_return path. */
9419 gcc_checking_assert (src == stack_pointer_rtx
9420 || offset == const0_rtx);
9421 }
9422
9423 m->fs.sp_offset = ooffset - INTVAL (offset);
9424 m->fs.sp_valid = valid;
9425 }
9426 }
9427
9428 /* Find an available register to be used as dynamic realign argument
9429 pointer register. Such a register will be written in the prologue and
9430 used at the beginning of the body, so it must not be
9431 1. parameter passing register.
9432 2. GOT pointer.
9433 We reuse static-chain register if it is available. Otherwise, we
9434 use DI for i386 and R13 for x86-64. We chose R13 since it has
9435 shorter encoding.
9436
9437 Return: the regno of chosen register. */
9438
9439 static unsigned int
9440 find_drap_reg (void)
9441 {
9442 tree decl = cfun->decl;
9443
9444 if (TARGET_64BIT)
9445 {
9446 /* Use R13 for a nested function or a function that needs a static
9447 chain. Since a function with a tail call may use any caller-saved
9448 register in the epilogue, DRAP must not use a caller-saved
9449 register in that case. */
9450 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9451 return R13_REG;
9452
9453 return R10_REG;
9454 }
9455 else
9456 {
9457 /* Use DI for a nested function or a function that needs a static
9458 chain. Since a function with a tail call may use any caller-saved
9459 register in the epilogue, DRAP must not use a caller-saved
9460 register in that case. */
9461 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9462 return DI_REG;
9463
9464 /* Reuse static chain register if it isn't used for parameter
9465 passing. */
9466 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9467 {
9468 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9469 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9470 return CX_REG;
9471 }
9472 return DI_REG;
9473 }
9474 }
9475
9476 /* Return minimum incoming stack alignment. */
9477
9478 static unsigned int
9479 ix86_minimum_incoming_stack_boundary (bool sibcall)
9480 {
9481 unsigned int incoming_stack_boundary;
9482
9483 /* Prefer the one specified at command line. */
9484 if (ix86_user_incoming_stack_boundary)
9485 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9486 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9487 if -mstackrealign is used, this isn't a sibcall check, and the
9488 estimated stack alignment is 128 bits. */
9489 else if (!sibcall
9490 && !TARGET_64BIT
9491 && ix86_force_align_arg_pointer
9492 && crtl->stack_alignment_estimated == 128)
9493 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9494 else
9495 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9496
9497 /* Incoming stack alignment can be changed on individual functions
9498 via force_align_arg_pointer attribute. We use the smallest
9499 incoming stack boundary. */
9500 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9501 && lookup_attribute (ix86_force_align_arg_pointer_string,
9502 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9503 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9504
9505 /* The incoming stack frame has to be aligned at least at
9506 parm_stack_boundary. */
9507 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9508 incoming_stack_boundary = crtl->parm_stack_boundary;
9509
9510 /* The stack at the entry of main is aligned by the runtime. We use the
9511 smallest incoming stack boundary. */
9512 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9513 && DECL_NAME (current_function_decl)
9514 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9515 && DECL_FILE_SCOPE_P (current_function_decl))
9516 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9517
9518 return incoming_stack_boundary;
9519 }
9520
9521 /* Update incoming stack boundary and estimated stack alignment. */
9522
9523 static void
9524 ix86_update_stack_boundary (void)
9525 {
9526 ix86_incoming_stack_boundary
9527 = ix86_minimum_incoming_stack_boundary (false);
9528
9529 /* x86_64 vararg needs 16byte stack alignment for register save
9530 area. */
9531 if (TARGET_64BIT
9532 && cfun->stdarg
9533 && crtl->stack_alignment_estimated < 128)
9534 crtl->stack_alignment_estimated = 128;
9535 }
9536
9537 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9538 needed or an rtx for DRAP otherwise. */
9539
9540 static rtx
9541 ix86_get_drap_rtx (void)
9542 {
9543 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9544 crtl->need_drap = true;
9545
9546 if (stack_realign_drap)
9547 {
9548 /* Assign DRAP to vDRAP and return vDRAP. */
9549 unsigned int regno = find_drap_reg ();
9550 rtx drap_vreg;
9551 rtx arg_ptr;
9552 rtx seq, insn;
9553
9554 arg_ptr = gen_rtx_REG (Pmode, regno);
9555 crtl->drap_reg = arg_ptr;
9556
9557 start_sequence ();
9558 drap_vreg = copy_to_reg (arg_ptr);
9559 seq = get_insns ();
9560 end_sequence ();
9561
9562 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9563 if (!optimize)
9564 {
9565 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9566 RTX_FRAME_RELATED_P (insn) = 1;
9567 }
9568 return drap_vreg;
9569 }
9570 else
9571 return NULL;
9572 }
9573
9574 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9575
9576 static rtx
9577 ix86_internal_arg_pointer (void)
9578 {
9579 return virtual_incoming_args_rtx;
9580 }
9581
9582 struct scratch_reg {
9583 rtx reg;
9584 bool saved;
9585 };
9586
9587 /* Return a short-lived scratch register for use on function entry.
9588 In 32-bit mode, it is valid only after the registers are saved
9589 in the prologue. This register must be released by means of
9590 release_scratch_register_on_entry once it is dead. */
9591
9592 static void
9593 get_scratch_register_on_entry (struct scratch_reg *sr)
9594 {
9595 int regno;
9596
9597 sr->saved = false;
9598
9599 if (TARGET_64BIT)
9600 {
9601 /* We always use R11 in 64-bit mode. */
9602 regno = R11_REG;
9603 }
9604 else
9605 {
9606 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9607 bool fastcall_p
9608 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9609 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9610 int regparm = ix86_function_regparm (fntype, decl);
9611 int drap_regno
9612 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9613
9614 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9615 for the static chain register. */
9616 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9617 && drap_regno != AX_REG)
9618 regno = AX_REG;
9619 else if (regparm < 2 && drap_regno != DX_REG)
9620 regno = DX_REG;
9621 /* ecx is the static chain register. */
9622 else if (regparm < 3 && !fastcall_p && !static_chain_p
9623 && drap_regno != CX_REG)
9624 regno = CX_REG;
9625 else if (ix86_save_reg (BX_REG, true))
9626 regno = BX_REG;
9627 /* esi is the static chain register. */
9628 else if (!(regparm == 3 && static_chain_p)
9629 && ix86_save_reg (SI_REG, true))
9630 regno = SI_REG;
9631 else if (ix86_save_reg (DI_REG, true))
9632 regno = DI_REG;
9633 else
9634 {
9635 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9636 sr->saved = true;
9637 }
9638 }
9639
9640 sr->reg = gen_rtx_REG (Pmode, regno);
9641 if (sr->saved)
9642 {
9643 rtx insn = emit_insn (gen_push (sr->reg));
9644 RTX_FRAME_RELATED_P (insn) = 1;
9645 }
9646 }
9647
9648 /* Release a scratch register obtained from the preceding function. */
9649
9650 static void
9651 release_scratch_register_on_entry (struct scratch_reg *sr)
9652 {
9653 if (sr->saved)
9654 {
9655 rtx x, insn = emit_insn (gen_pop (sr->reg));
9656
9657 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9658 RTX_FRAME_RELATED_P (insn) = 1;
9659 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9660 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9661 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9662 }
9663 }
9664
9665 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9666
9667 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9668
9669 static void
9670 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9671 {
9672 /* We skip the probe for the first interval + a small dope of 4 words and
9673 probe that many bytes past the specified size to maintain a protection
9674 area at the bottom of the stack. */
9675 const int dope = 4 * UNITS_PER_WORD;
9676 rtx size_rtx = GEN_INT (size), last;
9677
9678 /* See if we have a constant small number of probes to generate. If so,
9679 that's the easy case. The run-time loop is made up of 11 insns in the
9680 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9681 for n # of intervals. */
9682 if (size <= 5 * PROBE_INTERVAL)
9683 {
9684 HOST_WIDE_INT i, adjust;
9685 bool first_probe = true;
9686
9687 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9688 values of N from 1 until it exceeds SIZE. If only one probe is
9689 needed, this will not generate any code. Then adjust and probe
9690 to PROBE_INTERVAL + SIZE. */
9691 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9692 {
9693 if (first_probe)
9694 {
9695 adjust = 2 * PROBE_INTERVAL + dope;
9696 first_probe = false;
9697 }
9698 else
9699 adjust = PROBE_INTERVAL;
9700
9701 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9702 plus_constant (stack_pointer_rtx, -adjust)));
9703 emit_stack_probe (stack_pointer_rtx);
9704 }
9705
9706 if (first_probe)
9707 adjust = size + PROBE_INTERVAL + dope;
9708 else
9709 adjust = size + PROBE_INTERVAL - i;
9710
9711 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9712 plus_constant (stack_pointer_rtx, -adjust)));
9713 emit_stack_probe (stack_pointer_rtx);
9714
9715 /* Adjust back to account for the additional first interval. */
9716 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9717 plus_constant (stack_pointer_rtx,
9718 PROBE_INTERVAL + dope)));
9719 }
9720
9721 /* Otherwise, do the same as above, but in a loop. Note that we must be
9722 extra careful with variables wrapping around because we might be at
9723 the very top (or the very bottom) of the address space and we have
9724 to be able to handle this case properly; in particular, we use an
9725 equality test for the loop condition. */
9726 else
9727 {
9728 HOST_WIDE_INT rounded_size;
9729 struct scratch_reg sr;
9730
9731 get_scratch_register_on_entry (&sr);
9732
9733
9734 /* Step 1: round SIZE to the previous multiple of the interval. */
9735
9736 rounded_size = size & -PROBE_INTERVAL;
9737
9738
9739 /* Step 2: compute initial and final value of the loop counter. */
9740
9741 /* SP = SP_0 + PROBE_INTERVAL. */
9742 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9743 plus_constant (stack_pointer_rtx,
9744 - (PROBE_INTERVAL + dope))));
9745
9746 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9747 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9748 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9749 gen_rtx_PLUS (Pmode, sr.reg,
9750 stack_pointer_rtx)));
9751
9752
9753 /* Step 3: the loop
9754
9755 while (SP != LAST_ADDR)
9756 {
9757 SP = SP + PROBE_INTERVAL
9758 probe at SP
9759 }
9760
9761 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9762 values of N from 1 until it is equal to ROUNDED_SIZE. */
9763
9764 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9765
9766
9767 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9768 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9769
9770 if (size != rounded_size)
9771 {
9772 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9773 plus_constant (stack_pointer_rtx,
9774 rounded_size - size)));
9775 emit_stack_probe (stack_pointer_rtx);
9776 }
9777
9778 /* Adjust back to account for the additional first interval. */
9779 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9780 plus_constant (stack_pointer_rtx,
9781 PROBE_INTERVAL + dope)));
9782
9783 release_scratch_register_on_entry (&sr);
9784 }
9785
9786 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9787
9788 /* Even if the stack pointer isn't the CFA register, we need to correctly
9789 describe the adjustments made to it, in particular differentiate the
9790 frame-related ones from the frame-unrelated ones. */
9791 if (size > 0)
9792 {
9793 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9794 XVECEXP (expr, 0, 0)
9795 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9796 plus_constant (stack_pointer_rtx, -size));
9797 XVECEXP (expr, 0, 1)
9798 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9799 plus_constant (stack_pointer_rtx,
9800 PROBE_INTERVAL + dope + size));
9801 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9802 RTX_FRAME_RELATED_P (last) = 1;
9803
9804 cfun->machine->fs.sp_offset += size;
9805 }
9806
9807 /* Make sure nothing is scheduled before we are done. */
9808 emit_insn (gen_blockage ());
9809 }
9810
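/* A numeric illustration of the two cases handled above, assuming
   PROBE_INTERVAL == 4096 and 64-bit words (so the dope is 32 bytes):

     size = 12288  -> unrolled case: SP is moved down by 8224 bytes
                      (2 * 4096 + 32) and probed, then twice more by 4096
                      bytes with a probe each time, and finally moved back
                      up by 4128 bytes (4096 + 32), leaving the requested
                      12288 bytes allocated.

     size = 100000 -> loop case: rounded_size = 100000 & -4096 == 98304,
                      the emitted loop probes every 4096 bytes, and a
                      trailing adjustment covers the remaining 1696 bytes.  */
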
9811 /* Adjust the stack pointer up to REG while probing it. */
9812
9813 const char *
9814 output_adjust_stack_and_probe (rtx reg)
9815 {
9816 static int labelno = 0;
9817 char loop_lab[32], end_lab[32];
9818 rtx xops[2];
9819
9820 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9821 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9822
9823 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9824
9825 /* Jump to END_LAB if SP == LAST_ADDR. */
9826 xops[0] = stack_pointer_rtx;
9827 xops[1] = reg;
9828 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9829 fputs ("\tje\t", asm_out_file);
9830 assemble_name_raw (asm_out_file, end_lab);
9831 fputc ('\n', asm_out_file);
9832
9833 /* SP = SP + PROBE_INTERVAL. */
9834 xops[1] = GEN_INT (PROBE_INTERVAL);
9835 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9836
9837 /* Probe at SP. */
9838 xops[1] = const0_rtx;
9839 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9840
9841 fprintf (asm_out_file, "\tjmp\t");
9842 assemble_name_raw (asm_out_file, loop_lab);
9843 fputc ('\n', asm_out_file);
9844
9845 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9846
9847 return "";
9848 }
9849
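/* For reference, a sketch of the AT&T-syntax loop that the routine above
   emits on a 64-bit target, assuming PROBE_INTERVAL == 4096 and the
   scratch/limit register in %r11 (label spellings depend on the target's
   internal-label conventions):

     .LPSRL0:
             cmpq    %r11, %rsp
             je      .LPSRE0
             subq    $4096, %rsp
             orq     $0, (%rsp)
             jmp     .LPSRL0
     .LPSRE0:
*/
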
9850 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9851 inclusive. These are offsets from the current stack pointer. */
9852
9853 static void
9854 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9855 {
9856 /* See if we have a constant small number of probes to generate. If so,
9857 that's the easy case. The run-time loop is made up of 7 insns in the
9858 generic case while the compile-time loop is made up of n insns for n #
9859 of intervals. */
9860 if (size <= 7 * PROBE_INTERVAL)
9861 {
9862 HOST_WIDE_INT i;
9863
9864 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9865 it exceeds SIZE. If only one probe is needed, this will not
9866 generate any code. Then probe at FIRST + SIZE. */
9867 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9868 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9869
9870 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9871 }
9872
9873 /* Otherwise, do the same as above, but in a loop. Note that we must be
9874 extra careful with variables wrapping around because we might be at
9875 the very top (or the very bottom) of the address space and we have
9876 to be able to handle this case properly; in particular, we use an
9877 equality test for the loop condition. */
9878 else
9879 {
9880 HOST_WIDE_INT rounded_size, last;
9881 struct scratch_reg sr;
9882
9883 get_scratch_register_on_entry (&sr);
9884
9885
9886 /* Step 1: round SIZE to the previous multiple of the interval. */
9887
9888 rounded_size = size & -PROBE_INTERVAL;
9889
9890
9891 /* Step 2: compute initial and final value of the loop counter. */
9892
9893 /* TEST_OFFSET = FIRST. */
9894 emit_move_insn (sr.reg, GEN_INT (-first));
9895
9896 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9897 last = first + rounded_size;
9898
9899
9900 /* Step 3: the loop
9901
9902 while (TEST_ADDR != LAST_ADDR)
9903 {
9904 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9905 probe at TEST_ADDR
9906 }
9907
9908 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9909 until it is equal to ROUNDED_SIZE. */
9910
9911 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9912
9913
9914 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9915 that SIZE is equal to ROUNDED_SIZE. */
9916
9917 if (size != rounded_size)
9918 emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
9919 stack_pointer_rtx,
9920 sr.reg),
9921 rounded_size - size));
9922
9923 release_scratch_register_on_entry (&sr);
9924 }
9925
9926 /* Make sure nothing is scheduled before we are done. */
9927 emit_insn (gen_blockage ());
9928 }
9929
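/* A numeric illustration of the unrolled case above, using hypothetical
   values FIRST = 8192, SIZE = 10000 and PROBE_INTERVAL == 4096 (within the
   7-interval limit): probes are emitted at

     sp - 12288   (FIRST + 1 * PROBE_INTERVAL)
     sp - 16384   (FIRST + 2 * PROBE_INTERVAL)
     sp - 18192   (FIRST + SIZE)

   so every page of the new area below the protection region is touched
   while the stack pointer itself is left in place.  */
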
9930 /* Probe a range of stack addresses from REG to END, inclusive. These are
9931 offsets from the current stack pointer. */
9932
9933 const char *
9934 output_probe_stack_range (rtx reg, rtx end)
9935 {
9936 static int labelno = 0;
9937 char loop_lab[32], end_lab[32];
9938 rtx xops[3];
9939
9940 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9941 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9942
9943 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9944
9945 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
9946 xops[0] = reg;
9947 xops[1] = end;
9948 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9949 fputs ("\tje\t", asm_out_file);
9950 assemble_name_raw (asm_out_file, end_lab);
9951 fputc ('\n', asm_out_file);
9952
9953 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
9954 xops[1] = GEN_INT (PROBE_INTERVAL);
9955 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9956
9957 /* Probe at TEST_ADDR. */
9958 xops[0] = stack_pointer_rtx;
9959 xops[1] = reg;
9960 xops[2] = const0_rtx;
9961 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9962
9963 fprintf (asm_out_file, "\tjmp\t");
9964 assemble_name_raw (asm_out_file, loop_lab);
9965 fputc ('\n', asm_out_file);
9966
9967 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9968
9969 return "";
9970 }
9971
9972 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
9973 to be generated in correct form. */
9974 static void
9975 ix86_finalize_stack_realign_flags (void)
9976 {
9977 /* Check whether stack realignment is really needed after reload, and
9978 store the result in cfun. */
9979 unsigned int incoming_stack_boundary
9980 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9981 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9982 unsigned int stack_realign = (incoming_stack_boundary
9983 < (current_function_is_leaf
9984 ? crtl->max_used_stack_slot_alignment
9985 : crtl->stack_alignment_needed));
9986
9987 if (crtl->stack_realign_finalized)
9988 {
9989 /* After stack_realign_needed is finalized, we can no longer
9990 change it. */
9991 gcc_assert (crtl->stack_realign_needed == stack_realign);
9992 return;
9993 }
9994
9995 /* If the only reason for frame_pointer_needed is that we conservatively
9996 assumed stack realignment might be needed, but in the end nothing that
9997 needed the stack alignment had been spilled, clear frame_pointer_needed
9998 and say we don't need stack realignment. */
9999 if (stack_realign
10000 && !crtl->need_drap
10001 && frame_pointer_needed
10002 && current_function_is_leaf
10003 && flag_omit_frame_pointer
10004 && current_function_sp_is_unchanging
10005 && !ix86_current_function_calls_tls_descriptor
10006 && !crtl->accesses_prior_frames
10007 && !cfun->calls_alloca
10008 && !crtl->calls_eh_return
10009 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10010 && !ix86_frame_pointer_required ()
10011 && get_frame_size () == 0
10012 && ix86_nsaved_sseregs () == 0
10013 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10014 {
10015 HARD_REG_SET set_up_by_prologue, prologue_used;
10016 basic_block bb;
10017
10018 CLEAR_HARD_REG_SET (prologue_used);
10019 CLEAR_HARD_REG_SET (set_up_by_prologue);
10020 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10021 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10022 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10023 HARD_FRAME_POINTER_REGNUM);
10024 FOR_EACH_BB (bb)
10025 {
10026 rtx insn;
10027 FOR_BB_INSNS (bb, insn)
10028 if (NONDEBUG_INSN_P (insn)
10029 && requires_stack_frame_p (insn, prologue_used,
10030 set_up_by_prologue))
10031 {
10032 crtl->stack_realign_needed = stack_realign;
10033 crtl->stack_realign_finalized = true;
10034 return;
10035 }
10036 }
10037
10038 frame_pointer_needed = false;
10039 stack_realign = false;
10040 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10041 crtl->stack_alignment_needed = incoming_stack_boundary;
10042 crtl->stack_alignment_estimated = incoming_stack_boundary;
10043 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10044 crtl->preferred_stack_boundary = incoming_stack_boundary;
10045 df_finish_pass (true);
10046 df_scan_alloc (NULL);
10047 df_scan_blocks ();
10048 df_compute_regs_ever_live (true);
10049 df_analyze ();
10050 }
10051
10052 crtl->stack_realign_needed = stack_realign;
10053 crtl->stack_realign_finalized = true;
10054 }
10055
10056 /* Expand the prologue into a bunch of separate insns. */
10057
10058 void
10059 ix86_expand_prologue (void)
10060 {
10061 struct machine_function *m = cfun->machine;
10062 rtx insn, t;
10063 bool pic_reg_used;
10064 struct ix86_frame frame;
10065 HOST_WIDE_INT allocate;
10066 bool int_registers_saved;
10067
10068 ix86_finalize_stack_realign_flags ();
10069
10070 /* DRAP should not coexist with stack_realign_fp */
10071 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10072
10073 memset (&m->fs, 0, sizeof (m->fs));
10074
10075 /* Initialize CFA state for before the prologue. */
10076 m->fs.cfa_reg = stack_pointer_rtx;
10077 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10078
10079 /* Track SP offset to the CFA. We continue tracking this after we've
10080 swapped the CFA register away from SP. In the case of re-alignment
10081 this is fudged; we're interested in offsets within the local frame. */
10082 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10083 m->fs.sp_valid = true;
10084
10085 ix86_compute_frame_layout (&frame);
10086
10087 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10088 {
10089 /* We should have already generated an error for any use of
10090 ms_hook on a nested function. */
10091 gcc_checking_assert (!ix86_static_chain_on_stack);
10092
10093 /* Check if profiling is active and we shall use the profiling-before-
10094 prologue variant. If so, issue a sorry. */
10095 if (crtl->profile && flag_fentry != 0)
10096 sorry ("ms_hook_prologue attribute isn%'t compatible "
10097 "with -mfentry for 32-bit");
10098
10099 /* In ix86_asm_output_function_label we emitted:
10100 8b ff movl.s %edi,%edi
10101 55 push %ebp
10102 8b ec movl.s %esp,%ebp
10103
10104 This matches the hookable function prologue in Win32 API
10105 functions in Microsoft Windows XP Service Pack 2 and newer.
10106 Wine uses this to enable Windows apps to hook the Win32 API
10107 functions provided by Wine.
10108
10109 What that means is that we've already set up the frame pointer. */
10110
10111 if (frame_pointer_needed
10112 && !(crtl->drap_reg && crtl->stack_realign_needed))
10113 {
10114 rtx push, mov;
10115
10116 /* We've decided to use the frame pointer already set up.
10117 Describe this to the unwinder by pretending that both
10118 push and mov insns happen right here.
10119
10120 Putting the unwind info here at the end of the ms_hook
10121 is done so that we can make absolutely certain we get
10122 the required byte sequence at the start of the function,
10123 rather than relying on an assembler that can produce
10124 the exact encoding required.
10125
10126 However it does mean (in the unpatched case) that we have
10127 a 1 insn window where the asynchronous unwind info is
10128 incorrect. However, if we placed the unwind info at
10129 its correct location we would have incorrect unwind info
10130 in the patched case. Which is probably all moot since
10131 I don't expect Wine generates dwarf2 unwind info for the
10132 system libraries that use this feature. */
10133
10134 insn = emit_insn (gen_blockage ());
10135
10136 push = gen_push (hard_frame_pointer_rtx);
10137 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10138 stack_pointer_rtx);
10139 RTX_FRAME_RELATED_P (push) = 1;
10140 RTX_FRAME_RELATED_P (mov) = 1;
10141
10142 RTX_FRAME_RELATED_P (insn) = 1;
10143 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10144 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10145
10146 /* Note that gen_push incremented m->fs.cfa_offset, even
10147 though we didn't emit the push insn here. */
10148 m->fs.cfa_reg = hard_frame_pointer_rtx;
10149 m->fs.fp_offset = m->fs.cfa_offset;
10150 m->fs.fp_valid = true;
10151 }
10152 else
10153 {
10154 /* The frame pointer is not needed so pop %ebp again.
10155 This leaves us with a pristine state. */
10156 emit_insn (gen_pop (hard_frame_pointer_rtx));
10157 }
10158 }
10159
10160 /* The first insn of a function that accepts its static chain on the
10161 stack is to push the register that would be filled in by a direct
10162 call. This insn will be skipped by the trampoline. */
10163 else if (ix86_static_chain_on_stack)
10164 {
10165 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10166 emit_insn (gen_blockage ());
10167
10168 /* We don't want to interpret this push insn as a register save,
10169 only as a stack adjustment. The real copy of the register as
10170 a save will be done later, if needed. */
10171 t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
10172 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10173 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10174 RTX_FRAME_RELATED_P (insn) = 1;
10175 }
10176
10177 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10178 DRAP is needed and stack realignment is really needed after reload. */
10179 if (stack_realign_drap)
10180 {
10181 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10182
10183 /* Only need to push parameter pointer reg if it is caller saved. */
10184 if (!call_used_regs[REGNO (crtl->drap_reg)])
10185 {
10186 /* Push arg pointer reg */
10187 insn = emit_insn (gen_push (crtl->drap_reg));
10188 RTX_FRAME_RELATED_P (insn) = 1;
10189 }
10190
10191 /* Grab the argument pointer. */
10192 t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
10193 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10194 RTX_FRAME_RELATED_P (insn) = 1;
10195 m->fs.cfa_reg = crtl->drap_reg;
10196 m->fs.cfa_offset = 0;
10197
10198 /* Align the stack. */
10199 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10200 stack_pointer_rtx,
10201 GEN_INT (-align_bytes)));
10202 RTX_FRAME_RELATED_P (insn) = 1;
10203
10204 /* Replicate the return address on the stack so that return
10205 address can be reached via (argp - 1) slot. This is needed
10206 to implement macro RETURN_ADDR_RTX and intrinsic function
10207 expand_builtin_return_addr etc. */
10208 t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
10209 t = gen_frame_mem (word_mode, t);
10210 insn = emit_insn (gen_push (t));
10211 RTX_FRAME_RELATED_P (insn) = 1;
10212
10213 /* For the purposes of frame and register save area addressing,
10214 we've started over with a new frame. */
10215 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10216 m->fs.realigned = true;
10217 }
10218
10219 if (frame_pointer_needed && !m->fs.fp_valid)
10220 {
10221 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10222 slower on all targets. Also sdb doesn't like it. */
10223 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10224 RTX_FRAME_RELATED_P (insn) = 1;
10225
10226 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10227 {
10228 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10229 RTX_FRAME_RELATED_P (insn) = 1;
10230
10231 if (m->fs.cfa_reg == stack_pointer_rtx)
10232 m->fs.cfa_reg = hard_frame_pointer_rtx;
10233 m->fs.fp_offset = m->fs.sp_offset;
10234 m->fs.fp_valid = true;
10235 }
10236 }
10237
10238 int_registers_saved = (frame.nregs == 0);
10239
10240 if (!int_registers_saved)
10241 {
10242 /* If saving registers via PUSH, do so now. */
10243 if (!frame.save_regs_using_mov)
10244 {
10245 ix86_emit_save_regs ();
10246 int_registers_saved = true;
10247 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10248 }
10249
10250 /* When using the red zone we may start saving registers before allocating
10251 the stack frame, saving one cycle of the prologue. However, avoid
10252 doing this if we have to probe the stack; at least on x86_64 the
10253 stack probe can turn into a call that clobbers a red zone location. */
10254 else if (ix86_using_red_zone ()
10255 && (! TARGET_STACK_PROBE
10256 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10257 {
10258 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10259 int_registers_saved = true;
10260 }
10261 }
10262
10263 if (stack_realign_fp)
10264 {
10265 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10266 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10267
10268 /* The computation of the size of the re-aligned stack frame means
10269 that we must allocate the size of the register save area before
10270 performing the actual alignment. Otherwise we cannot guarantee
10271 that there's enough storage above the realignment point. */
10272 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10273 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10274 GEN_INT (m->fs.sp_offset
10275 - frame.sse_reg_save_offset),
10276 -1, false);
10277
10278 /* Align the stack. */
10279 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10280 stack_pointer_rtx,
10281 GEN_INT (-align_bytes)));
10282
10283 /* For the purposes of register save area addressing, the stack
10284 pointer is no longer valid. As for the value of sp_offset,
10285 see ix86_compute_frame_layout, which we need to match in order
10286 to pass verification of stack_pointer_offset at the end. */
10287 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10288 m->fs.sp_valid = false;
10289 }
10290
10291 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10292
10293 if (flag_stack_usage_info)
10294 {
10295 /* We start to count from ARG_POINTER. */
10296 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10297
10298 /* If it was realigned, take into account the fake frame. */
10299 if (stack_realign_drap)
10300 {
10301 if (ix86_static_chain_on_stack)
10302 stack_size += UNITS_PER_WORD;
10303
10304 if (!call_used_regs[REGNO (crtl->drap_reg)])
10305 stack_size += UNITS_PER_WORD;
10306
10307 /* This over-estimates by 1 minimal-stack-alignment-unit but
10308 mitigates that by counting in the new return address slot. */
10309 current_function_dynamic_stack_size
10310 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10311 }
10312
10313 current_function_static_stack_size = stack_size;
10314 }
10315
10316 /* The stack has already been decremented by the instruction calling us
10317 so probe if the size is non-negative to preserve the protection area. */
10318 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10319 {
10320 /* We expect the registers to be saved when probes are used. */
10321 gcc_assert (int_registers_saved);
10322
10323 if (STACK_CHECK_MOVING_SP)
10324 {
10325 ix86_adjust_stack_and_probe (allocate);
10326 allocate = 0;
10327 }
10328 else
10329 {
10330 HOST_WIDE_INT size = allocate;
10331
10332 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10333 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10334
10335 if (TARGET_STACK_PROBE)
10336 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10337 else
10338 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10339 }
10340 }
10341
10342 if (allocate == 0)
10343 ;
10344 else if (!ix86_target_stack_probe ()
10345 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10346 {
10347 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10348 GEN_INT (-allocate), -1,
10349 m->fs.cfa_reg == stack_pointer_rtx);
10350 }
10351 else
10352 {
10353 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10354 rtx r10 = NULL;
10355 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10356
10357 bool eax_live = false;
10358 bool r10_live = false;
10359
10360 if (TARGET_64BIT)
10361 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10362 if (!TARGET_64BIT_MS_ABI)
10363 eax_live = ix86_eax_live_at_start_p ();
10364
10365 if (eax_live)
10366 {
10367 emit_insn (gen_push (eax));
10368 allocate -= UNITS_PER_WORD;
10369 }
10370 if (r10_live)
10371 {
10372 r10 = gen_rtx_REG (Pmode, R10_REG);
10373 emit_insn (gen_push (r10));
10374 allocate -= UNITS_PER_WORD;
10375 }
10376
10377 emit_move_insn (eax, GEN_INT (allocate));
10378 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10379
10380 /* Use the fact that AX still contains ALLOCATE. */
10381 adjust_stack_insn = (Pmode == DImode
10382 ? gen_pro_epilogue_adjust_stack_di_sub
10383 : gen_pro_epilogue_adjust_stack_si_sub);
10384
10385 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10386 stack_pointer_rtx, eax));
10387
10388 /* Note that SEH directives need to continue tracking the stack
10389 pointer even after the frame pointer has been set up. */
10390 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10391 {
10392 if (m->fs.cfa_reg == stack_pointer_rtx)
10393 m->fs.cfa_offset += allocate;
10394
10395 RTX_FRAME_RELATED_P (insn) = 1;
10396 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10397 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10398 plus_constant (stack_pointer_rtx,
10399 -allocate)));
10400 }
10401 m->fs.sp_offset += allocate;
10402
10403 if (r10_live && eax_live)
10404 {
10405 t = choose_baseaddr (m->fs.sp_offset - allocate);
10406 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10407 gen_frame_mem (word_mode, t));
10408 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10409 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10410 gen_frame_mem (word_mode, t));
10411 }
10412 else if (eax_live || r10_live)
10413 {
10414 t = choose_baseaddr (m->fs.sp_offset - allocate);
10415 emit_move_insn (gen_rtx_REG (word_mode,
10416 (eax_live ? AX_REG : R10_REG)),
10417 gen_frame_mem (word_mode, t));
10418 }
10419 }
10420 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10421
10422 /* If we haven't already set up the frame pointer, do so now. */
10423 if (frame_pointer_needed && !m->fs.fp_valid)
10424 {
10425 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10426 GEN_INT (frame.stack_pointer_offset
10427 - frame.hard_frame_pointer_offset));
10428 insn = emit_insn (insn);
10429 RTX_FRAME_RELATED_P (insn) = 1;
10430 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10431
10432 if (m->fs.cfa_reg == stack_pointer_rtx)
10433 m->fs.cfa_reg = hard_frame_pointer_rtx;
10434 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10435 m->fs.fp_valid = true;
10436 }
10437
10438 if (!int_registers_saved)
10439 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10440 if (frame.nsseregs)
10441 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10442
10443 pic_reg_used = false;
10444 if (pic_offset_table_rtx
10445 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10446 || crtl->profile))
10447 {
10448 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10449
10450 if (alt_pic_reg_used != INVALID_REGNUM)
10451 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10452
10453 pic_reg_used = true;
10454 }
10455
10456 if (pic_reg_used)
10457 {
10458 if (TARGET_64BIT)
10459 {
10460 if (ix86_cmodel == CM_LARGE_PIC)
10461 {
10462 rtx label, tmp_reg;
10463
10464 gcc_assert (Pmode == DImode);
10465 label = gen_label_rtx ();
10466 emit_label (label);
10467 LABEL_PRESERVE_P (label) = 1;
10468 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10469 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10470 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10471 label));
10472 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10473 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10474 pic_offset_table_rtx, tmp_reg));
10475 }
10476 else
10477 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10478 }
10479 else
10480 {
10481 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10482 RTX_FRAME_RELATED_P (insn) = 1;
10483 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10484 }
10485 }
10486
10487 /* In the pic_reg_used case, make sure that the got load isn't deleted
10488 when mcount needs it. The blockage to avoid call movement across the
10489 mcount call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10490 note. */
10491 if (crtl->profile && !flag_fentry && pic_reg_used)
10492 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10493
10494 if (crtl->drap_reg && !crtl->stack_realign_needed)
10495 {
10496 /* vDRAP is set up, but after reload it turns out that stack realignment
10497 isn't necessary. Here we emit the prologue to set up DRAP
10498 without the stack realignment adjustment. */
10499 t = choose_baseaddr (0);
10500 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10501 }
10502
10503 /* Prevent instructions from being scheduled into register save push
10504 sequence when access to the redzone area is done through frame pointer.
10505 The offset between the frame pointer and the stack pointer is calculated
10506 relative to the value of the stack pointer at the end of the function
10507 prologue, and moving instructions that access redzone area via frame
10508 pointer inside push sequence violates this assumption. */
10509 if (frame_pointer_needed && frame.red_zone_size)
10510 emit_insn (gen_memory_blockage ());
10511
10512 /* Emit cld instruction if stringops are used in the function. */
10513 if (TARGET_CLD && ix86_current_function_needs_cld)
10514 emit_insn (gen_cld ());
10515
10516 /* SEH requires that the prologue end within 256 bytes of the start of
10517 the function. Prevent instruction schedules that would extend that.
10518 Further, prevent alloca modifications to the stack pointer from being
10519 combined with prologue modifications. */
10520 if (TARGET_SEH)
10521 emit_insn (gen_prologue_use (stack_pointer_rtx));
10522 }
10523
10524 /* Emit code to restore REG using a POP insn. */
10525
10526 static void
10527 ix86_emit_restore_reg_using_pop (rtx reg)
10528 {
10529 struct machine_function *m = cfun->machine;
10530 rtx insn = emit_insn (gen_pop (reg));
10531
10532 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10533 m->fs.sp_offset -= UNITS_PER_WORD;
10534
10535 if (m->fs.cfa_reg == crtl->drap_reg
10536 && REGNO (reg) == REGNO (crtl->drap_reg))
10537 {
10538 /* Previously we'd represented the CFA as an expression
10539 like *(%ebp - 8). We've just popped that value from
10540 the stack, which means we need to reset the CFA to
10541 the drap register. This will remain until we restore
10542 the stack pointer. */
10543 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10544 RTX_FRAME_RELATED_P (insn) = 1;
10545
10546 /* This means that the DRAP register is valid for addressing too. */
10547 m->fs.drap_valid = true;
10548 return;
10549 }
10550
10551 if (m->fs.cfa_reg == stack_pointer_rtx)
10552 {
10553 rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10554 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10555 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10556 RTX_FRAME_RELATED_P (insn) = 1;
10557
10558 m->fs.cfa_offset -= UNITS_PER_WORD;
10559 }
10560
10561 /* When the frame pointer is the CFA, and we pop it, we are
10562 swapping back to the stack pointer as the CFA. This happens
10563 for stack frames that don't allocate other data, so we assume
10564 the stack pointer is now pointing at the return address, i.e.
10565 the function entry state, which makes the offset be 1 word. */
10566 if (reg == hard_frame_pointer_rtx)
10567 {
10568 m->fs.fp_valid = false;
10569 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10570 {
10571 m->fs.cfa_reg = stack_pointer_rtx;
10572 m->fs.cfa_offset -= UNITS_PER_WORD;
10573
10574 add_reg_note (insn, REG_CFA_DEF_CFA,
10575 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10576 GEN_INT (m->fs.cfa_offset)));
10577 RTX_FRAME_RELATED_P (insn) = 1;
10578 }
10579 }
10580 }
10581
10582 /* Emit code to restore saved registers using POP insns. */
10583
10584 static void
10585 ix86_emit_restore_regs_using_pop (void)
10586 {
10587 unsigned int regno;
10588
10589 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10590 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10591 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10592 }
10593
10594 /* Emit code and notes for the LEAVE instruction. */
10595
10596 static void
10597 ix86_emit_leave (void)
10598 {
10599 struct machine_function *m = cfun->machine;
10600 rtx insn = emit_insn (ix86_gen_leave ());
10601
10602 ix86_add_queued_cfa_restore_notes (insn);
10603
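/* "leave" copies the frame pointer into the stack pointer and then pops
   the saved frame pointer, so afterwards the stack pointer is valid again
   and sits one word above the frame pointer's save slot. */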
10604 gcc_assert (m->fs.fp_valid);
10605 m->fs.sp_valid = true;
10606 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10607 m->fs.fp_valid = false;
10608
10609 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10610 {
10611 m->fs.cfa_reg = stack_pointer_rtx;
10612 m->fs.cfa_offset = m->fs.sp_offset;
10613
10614 add_reg_note (insn, REG_CFA_DEF_CFA,
10615 plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10616 RTX_FRAME_RELATED_P (insn) = 1;
10617 }
10618 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10619 m->fs.fp_offset);
10620 }
10621
10622 /* Emit code to restore saved registers using MOV insns.
10623 First register is restored from CFA - CFA_OFFSET. */
10624 static void
10625 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10626 bool maybe_eh_return)
10627 {
10628 struct machine_function *m = cfun->machine;
10629 unsigned int regno;
10630
10631 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10632 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10633 {
10634 rtx reg = gen_rtx_REG (word_mode, regno);
10635 rtx insn, mem;
10636
10637 mem = choose_baseaddr (cfa_offset);
10638 mem = gen_frame_mem (word_mode, mem);
10639 insn = emit_move_insn (reg, mem);
10640
10641 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10642 {
10643 /* Previously we'd represented the CFA as an expression
10644 like *(%ebp - 8). We've just reloaded that value from
10645 the stack, which means we need to reset the CFA to
10646 the drap register. This will remain until we restore
10647 the stack pointer. */
10648 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10649 RTX_FRAME_RELATED_P (insn) = 1;
10650
10651 /* This means that the DRAP register is valid for addressing. */
10652 m->fs.drap_valid = true;
10653 }
10654 else
10655 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10656
10657 cfa_offset -= UNITS_PER_WORD;
10658 }
10659 }
10660
10661 /* Emit code to restore saved SSE registers using MOV insns.
10662 First register is restored from CFA - CFA_OFFSET. */
10663 static void
10664 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10665 bool maybe_eh_return)
10666 {
10667 unsigned int regno;
10668
10669 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10670 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10671 {
10672 rtx reg = gen_rtx_REG (V4SFmode, regno);
10673 rtx mem;
10674
10675 mem = choose_baseaddr (cfa_offset);
10676 mem = gen_rtx_MEM (V4SFmode, mem);
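/* The SSE register save area is 16-byte aligned, so the restore can use
   an aligned 128-bit load. */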
10677 set_mem_align (mem, 128);
10678 emit_move_insn (reg, mem);
10679
10680 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10681
10682 cfa_offset -= 16;
10683 }
10684 }
10685
10686 /* Emit vzeroupper if needed. */
10687
10688 void
10689 ix86_maybe_emit_epilogue_vzeroupper (void)
10690 {
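  /* Clearing the upper halves is pointless in a function that never
     returns, and it would clobber a 256-bit AVX value being returned
     to the caller. */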
10691 if (TARGET_VZEROUPPER
10692 && !TREE_THIS_VOLATILE (cfun->decl)
10693 && !cfun->machine->caller_return_avx256_p)
10694 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10695 }
10696
10697 /* Restore function stack, frame, and registers. */
10698
10699 void
10700 ix86_expand_epilogue (int style)
10701 {
10702 struct machine_function *m = cfun->machine;
10703 struct machine_frame_state frame_state_save = m->fs;
10704 struct ix86_frame frame;
10705 bool restore_regs_via_mov;
10706 bool using_drap;
10707
10708 ix86_finalize_stack_realign_flags ();
10709 ix86_compute_frame_layout (&frame);
10710
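/* The stack pointer's offset from the CFA is known here only if no frame
   pointer is used, or if the function never adjusts the stack pointer
   after the prologue and we did not realign via the frame pointer. */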
10711 m->fs.sp_valid = (!frame_pointer_needed
10712 || (current_function_sp_is_unchanging
10713 && !stack_realign_fp));
10714 gcc_assert (!m->fs.sp_valid
10715 || m->fs.sp_offset == frame.stack_pointer_offset);
10716
10717 /* The FP must be valid if the frame pointer is present. */
10718 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10719 gcc_assert (!m->fs.fp_valid
10720 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10721
10722 /* We must have *some* valid pointer to the stack frame. */
10723 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10724
10725 /* The DRAP is never valid at this point. */
10726 gcc_assert (!m->fs.drap_valid);
10727
10728 /* See the comment about red zone and frame
10729 pointer usage in ix86_expand_prologue. */
10730 if (frame_pointer_needed && frame.red_zone_size)
10731 emit_insn (gen_memory_blockage ());
10732
10733 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10734 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10735
10736 /* Determine the CFA offset of the end of the red-zone. */
10737 m->fs.red_zone_offset = 0;
10738 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10739 {
10740 /* The red-zone begins below the return address. */
10741 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10742
10743 /* When the register save area is in the aligned portion of
10744 the stack, determine the maximum runtime displacement that
10745 matches up with the aligned frame. */
10746 if (stack_realign_drap)
10747 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10748 + UNITS_PER_WORD);
10749 }
10750
10751 /* Special care must be taken for the normal return case of a function
10752 using eh_return: the eax and edx registers are marked as saved, but
10753 not restored along this path. Adjust the save location to match. */
10754 if (crtl->calls_eh_return && style != 2)
10755 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10756
10757 /* EH_RETURN requires the use of moves to function properly. */
10758 if (crtl->calls_eh_return)
10759 restore_regs_via_mov = true;
10760 /* SEH requires the use of pops to identify the epilogue. */
10761 else if (TARGET_SEH)
10762 restore_regs_via_mov = false;
10763 /* If we're only restoring one register and sp is not valid, then
10764 use a move instruction to restore the register, since it's
10765 less work than reloading sp and popping the register. */
10766 else if (!m->fs.sp_valid && frame.nregs <= 1)
10767 restore_regs_via_mov = true;
10768 else if (TARGET_EPILOGUE_USING_MOVE
10769 && cfun->machine->use_fast_prologue_epilogue
10770 && (frame.nregs > 1
10771 || m->fs.sp_offset != frame.reg_save_offset))
10772 restore_regs_via_mov = true;
10773 else if (frame_pointer_needed
10774 && !frame.nregs
10775 && m->fs.sp_offset != frame.reg_save_offset)
10776 restore_regs_via_mov = true;
10777 else if (frame_pointer_needed
10778 && TARGET_USE_LEAVE
10779 && cfun->machine->use_fast_prologue_epilogue
10780 && frame.nregs == 1)
10781 restore_regs_via_mov = true;
10782 else
10783 restore_regs_via_mov = false;
10784
10785 if (restore_regs_via_mov || frame.nsseregs)
10786 {
10787 /* Ensure that the entire register save area is addressable via
10788 the stack pointer, if we will restore via sp. */
10789 if (TARGET_64BIT
10790 && m->fs.sp_offset > 0x7fffffff
10791 && !(m->fs.fp_valid || m->fs.drap_valid)
10792 && (frame.nsseregs + frame.nregs) != 0)
10793 {
10794 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10795 GEN_INT (m->fs.sp_offset
10796 - frame.sse_reg_save_offset),
10797 style,
10798 m->fs.cfa_reg == stack_pointer_rtx);
10799 }
10800 }
10801
10802 /* If there are any SSE registers to restore, then we have to do it
10803 via moves, since there's obviously no pop for SSE regs. */
10804 if (frame.nsseregs)
10805 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10806 style == 2);
10807
10808 if (restore_regs_via_mov)
10809 {
10810 rtx t;
10811
10812 if (frame.nregs)
10813 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10814
10815 /* eh_return epilogues need %ecx added to the stack pointer. */
10816 if (style == 2)
10817 {
10818 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10819
10820 /* Stack align doesn't work with eh_return. */
10821 gcc_assert (!stack_realign_drap);
10822 /* Neither do regparm nested functions. */
10823 gcc_assert (!ix86_static_chain_on_stack);
10824
10825 if (frame_pointer_needed)
10826 {
10827 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10828 t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10829 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10830
10831 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10832 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10833
10834 /* Note that we use SA as a temporary CFA, as the return
10835 address is at the proper place relative to it. We
10836 pretend this happens at the FP restore insn because
10837 prior to this insn the FP would be stored at the wrong
10838 offset relative to SA, and after this insn we have no
10839 other reasonable register to use for the CFA. We don't
10840 bother resetting the CFA to the SP for the duration of
10841 the return insn. */
10842 add_reg_note (insn, REG_CFA_DEF_CFA,
10843 plus_constant (sa, UNITS_PER_WORD));
10844 ix86_add_queued_cfa_restore_notes (insn);
10845 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10846 RTX_FRAME_RELATED_P (insn) = 1;
10847
10848 m->fs.cfa_reg = sa;
10849 m->fs.cfa_offset = UNITS_PER_WORD;
10850 m->fs.fp_valid = false;
10851
10852 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10853 const0_rtx, style, false);
10854 }
10855 else
10856 {
10857 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10858 t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10859 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10860 ix86_add_queued_cfa_restore_notes (insn);
10861
10862 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10863 if (m->fs.cfa_offset != UNITS_PER_WORD)
10864 {
10865 m->fs.cfa_offset = UNITS_PER_WORD;
10866 add_reg_note (insn, REG_CFA_DEF_CFA,
10867 plus_constant (stack_pointer_rtx,
10868 UNITS_PER_WORD));
10869 RTX_FRAME_RELATED_P (insn) = 1;
10870 }
10871 }
10872 m->fs.sp_offset = UNITS_PER_WORD;
10873 m->fs.sp_valid = true;
10874 }
10875 }
10876 else
10877 {
10878 /* SEH requires that the function end with (1) a stack adjustment
10879 if necessary, (2) a sequence of pops, and (3) a return or
10880 jump instruction. Prevent insns from the function body from
10881 being scheduled into this sequence. */
10882 if (TARGET_SEH)
10883 {
10884 /* Prevent a catch region from being adjacent to the standard
10885 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
10886 several other flags that would be interesting to test are
10887 set up yet. */
10888 if (flag_non_call_exceptions)
10889 emit_insn (gen_nops (const1_rtx));
10890 else
10891 emit_insn (gen_blockage ());
10892 }
10893
10894 /* First step is to deallocate the stack frame so that we can
10895 pop the registers. */
10896 if (!m->fs.sp_valid)
10897 {
10898 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10899 GEN_INT (m->fs.fp_offset
10900 - frame.reg_save_offset),
10901 style, false);
10902 }
10903 else if (m->fs.sp_offset != frame.reg_save_offset)
10904 {
10905 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10906 GEN_INT (m->fs.sp_offset
10907 - frame.reg_save_offset),
10908 style,
10909 m->fs.cfa_reg == stack_pointer_rtx);
10910 }
10911
10912 ix86_emit_restore_regs_using_pop ();
10913 }
10914
10915 /* If we used a frame pointer and haven't already got rid of it,
10916 then do so now. */
10917 if (m->fs.fp_valid)
10918 {
10919 /* If the stack pointer is valid and pointing at the frame
10920 pointer store address, then we only need a pop. */
10921 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10922 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10923 /* Leave results in shorter dependency chains on CPUs that are
10924 able to grok it fast. */
10925 else if (TARGET_USE_LEAVE
10926 || optimize_function_for_size_p (cfun)
10927 || !cfun->machine->use_fast_prologue_epilogue)
10928 ix86_emit_leave ();
10929 else
10930 {
10931 pro_epilogue_adjust_stack (stack_pointer_rtx,
10932 hard_frame_pointer_rtx,
10933 const0_rtx, style, !using_drap);
10934 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10935 }
10936 }
10937
10938 if (using_drap)
10939 {
10940 int param_ptr_offset = UNITS_PER_WORD;
10941 rtx insn;
10942
10943 gcc_assert (stack_realign_drap);
10944
10945 if (ix86_static_chain_on_stack)
10946 param_ptr_offset += UNITS_PER_WORD;
10947 if (!call_used_regs[REGNO (crtl->drap_reg)])
10948 param_ptr_offset += UNITS_PER_WORD;
10949
10950 insn = emit_insn (gen_rtx_SET
10951 (VOIDmode, stack_pointer_rtx,
10952 gen_rtx_PLUS (Pmode,
10953 crtl->drap_reg,
10954 GEN_INT (-param_ptr_offset))));
10955 m->fs.cfa_reg = stack_pointer_rtx;
10956 m->fs.cfa_offset = param_ptr_offset;
10957 m->fs.sp_offset = param_ptr_offset;
10958 m->fs.realigned = false;
10959
10960 add_reg_note (insn, REG_CFA_DEF_CFA,
10961 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10962 GEN_INT (param_ptr_offset)));
10963 RTX_FRAME_RELATED_P (insn) = 1;
10964
10965 if (!call_used_regs[REGNO (crtl->drap_reg)])
10966 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10967 }
10968
10969 /* At this point the stack pointer must be valid, and we must have
10970 restored all of the registers. We may not have deallocated the
10971 entire stack frame. We've delayed this until now because it may
10972 be possible to merge the local stack deallocation with the
10973 deallocation forced by ix86_static_chain_on_stack. */
10974 gcc_assert (m->fs.sp_valid);
10975 gcc_assert (!m->fs.fp_valid);
10976 gcc_assert (!m->fs.realigned);
10977 if (m->fs.sp_offset != UNITS_PER_WORD)
10978 {
10979 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10980 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10981 style, true);
10982 }
10983 else
10984 ix86_add_queued_cfa_restore_notes (get_last_insn ());
10985
10986 /* Sibcall epilogues don't want a return instruction. */
10987 if (style == 0)
10988 {
10989 m->fs = frame_state_save;
10990 return;
10991 }
10992
10993 /* Emit vzeroupper if needed. */
10994 ix86_maybe_emit_epilogue_vzeroupper ();
10995
10996 if (crtl->args.pops_args && crtl->args.size)
10997 {
10998 rtx popc = GEN_INT (crtl->args.pops_args);
10999
11000 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11001 address, do an explicit add, and jump indirectly to the caller. */
11002
11003 if (crtl->args.pops_args >= 65536)
11004 {
11005 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11006 rtx insn;
11007
11008 /* There is no "pascal" calling convention in any 64bit ABI. */
11009 gcc_assert (!TARGET_64BIT);
11010
11011 insn = emit_insn (gen_pop (ecx));
11012 m->fs.cfa_offset -= UNITS_PER_WORD;
11013 m->fs.sp_offset -= UNITS_PER_WORD;
11014
11015 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11016 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11017 add_reg_note (insn, REG_CFA_REGISTER,
11018 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11019 RTX_FRAME_RELATED_P (insn) = 1;
11020
11021 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11022 popc, -1, true);
11023 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11024 }
11025 else
11026 emit_jump_insn (gen_simple_return_pop_internal (popc));
11027 }
11028 else
11029 emit_jump_insn (gen_simple_return_internal ());
11030
11031 /* Restore the state back to the state from the prologue,
11032 so that it's correct for the next epilogue. */
11033 m->fs = frame_state_save;
11034 }
11035
11036 /* Reset from the function's potential modifications. */
11037
11038 static void
11039 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11040 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11041 {
11042 if (pic_offset_table_rtx)
11043 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11044 #if TARGET_MACHO
11045 /* Mach-O doesn't support labels at the end of objects, so if
11046 it looks like we might want one, insert a NOP. */
11047 {
11048 rtx insn = get_last_insn ();
11049 rtx deleted_debug_label = NULL_RTX;
11050 while (insn
11051 && NOTE_P (insn)
11052 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11053 {
11054 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11055 notes only, instead set their CODE_LABEL_NUMBER to -1,
11056 otherwise there would be code generation differences
11057 in between -g and -g0. */
11058 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11059 deleted_debug_label = insn;
11060 insn = PREV_INSN (insn);
11061 }
11062 if (insn
11063 && (LABEL_P (insn)
11064 || (NOTE_P (insn)
11065 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11066 fputs ("\tnop\n", file);
11067 else if (deleted_debug_label)
11068 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11069 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11070 CODE_LABEL_NUMBER (insn) = -1;
11071 }
11072 #endif
11073
11074 }
11075
11076 /* Return a scratch register to use in the split stack prologue. The
11077 split stack prologue is used for -fsplit-stack. It is the first
11078 instructions in the function, even before the regular prologue.
11079 The scratch register can be any caller-saved register which is not
11080 used for parameters or for the static chain. */
11081
11082 static unsigned int
11083 split_stack_prologue_scratch_regno (void)
11084 {
11085 if (TARGET_64BIT)
11086 return R11_REG;
11087 else
11088 {
11089 bool is_fastcall;
11090 int regparm;
11091
11092 is_fastcall = (lookup_attribute ("fastcall",
11093 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11094 != NULL);
11095 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11096
11097 if (is_fastcall)
11098 {
11099 if (DECL_STATIC_CHAIN (cfun->decl))
11100 {
11101 sorry ("-fsplit-stack does not support fastcall with "
11102 "nested function");
11103 return INVALID_REGNUM;
11104 }
11105 return AX_REG;
11106 }
11107 else if (regparm < 3)
11108 {
11109 if (!DECL_STATIC_CHAIN (cfun->decl))
11110 return CX_REG;
11111 else
11112 {
11113 if (regparm >= 2)
11114 {
11115 sorry ("-fsplit-stack does not support 2 register "
11116 "parameters for a nested function");
11117 return INVALID_REGNUM;
11118 }
11119 return DX_REG;
11120 }
11121 }
11122 else
11123 {
11124 /* FIXME: We could make this work by pushing a register
11125 around the addition and comparison. */
11126 sorry ("-fsplit-stack does not support 3 register parameters");
11127 return INVALID_REGNUM;
11128 }
11129 }
11130 }
11131
11132 /* A SYMBOL_REF for the function which allocates new stack space for
11133 -fsplit-stack. */
11134
11135 static GTY(()) rtx split_stack_fn;
11136
11137 /* A SYMBOL_REF for the more stack function when using the large
11138 model. */
11139
11140 static GTY(()) rtx split_stack_fn_large;
11141
11142 /* Handle -fsplit-stack. These are the first instructions in the
11143 function, even before the regular prologue. */
11144
11145 void
11146 ix86_expand_split_stack_prologue (void)
11147 {
11148 struct ix86_frame frame;
11149 HOST_WIDE_INT allocate;
11150 unsigned HOST_WIDE_INT args_size;
11151 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11152 rtx scratch_reg = NULL_RTX;
11153 rtx varargs_label = NULL_RTX;
11154 rtx fn;
11155
11156 gcc_assert (flag_split_stack && reload_completed);
11157
11158 ix86_finalize_stack_realign_flags ();
11159 ix86_compute_frame_layout (&frame);
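/* ALLOCATE is the number of bytes of stack space this function needs
   below the incoming stack pointer. */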
11160 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11161
11162 /* This is the label we will branch to if we have enough stack
11163 space. We expect the basic block reordering pass to reverse this
11164 branch if optimizing, so that we branch in the unlikely case. */
11165 label = gen_label_rtx ();
11166
11167 /* We need to compare the stack pointer minus the frame size with
11168 the stack boundary in the TCB. The stack boundary always gives
11169 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11170 can compare directly. Otherwise we need to do an addition. */
11171
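/* The stack limit lives in the TCB; the UNSPEC_STACK_CHECK address built
   below is, as far as this sketch assumes, printed as a %fs- (64-bit) or
   %gs- (32-bit) relative reference to that field. */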
11172 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11173 UNSPEC_STACK_CHECK);
11174 limit = gen_rtx_CONST (Pmode, limit);
11175 limit = gen_rtx_MEM (Pmode, limit);
11176 if (allocate < SPLIT_STACK_AVAILABLE)
11177 current = stack_pointer_rtx;
11178 else
11179 {
11180 unsigned int scratch_regno;
11181 rtx offset;
11182
11183 /* We need a scratch register to hold the stack pointer minus
11184 the required frame size. Since this is the very start of the
11185 function, the scratch register can be any caller-saved
11186 register which is not used for parameters. */
11187 offset = GEN_INT (- allocate);
11188 scratch_regno = split_stack_prologue_scratch_regno ();
11189 if (scratch_regno == INVALID_REGNUM)
11190 return;
11191 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11192 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11193 {
11194 /* We don't use ix86_gen_add3 in this case because it will
11195 want to split to lea, but when not optimizing the insn
11196 will not be split after this point. */
11197 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11198 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11199 offset)));
11200 }
11201 else
11202 {
11203 emit_move_insn (scratch_reg, offset);
11204 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11205 stack_pointer_rtx));
11206 }
11207 current = scratch_reg;
11208 }
11209
11210 ix86_expand_branch (GEU, current, limit, label);
11211 jump_insn = get_last_insn ();
11212 JUMP_LABEL (jump_insn) = label;
11213
11214 /* Mark the jump as very likely to be taken. */
11215 add_reg_note (jump_insn, REG_BR_PROB,
11216 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11217
11218 if (split_stack_fn == NULL_RTX)
11219 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11220 fn = split_stack_fn;
11221
11222 /* Get more stack space. We pass in the desired stack space and the
11223 size of the arguments to copy to the new stack. In 32-bit mode
11224 we push the parameters; __morestack will return on a new stack
11225 anyhow. In 64-bit mode we pass the parameters in r10 and
11226 r11. */
11227 allocate_rtx = GEN_INT (allocate);
11228 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11229 call_fusage = NULL_RTX;
11230 if (TARGET_64BIT)
11231 {
11232 rtx reg10, reg11;
11233
11234 reg10 = gen_rtx_REG (Pmode, R10_REG);
11235 reg11 = gen_rtx_REG (Pmode, R11_REG);
11236
11237 /* If this function uses a static chain, it will be in %r10.
11238 Preserve it across the call to __morestack. */
11239 if (DECL_STATIC_CHAIN (cfun->decl))
11240 {
11241 rtx rax;
11242
11243 rax = gen_rtx_REG (word_mode, AX_REG);
11244 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11245 use_reg (&call_fusage, rax);
11246 }
11247
11248 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11249 {
11250 HOST_WIDE_INT argval;
11251
11252 gcc_assert (Pmode == DImode);
11253 /* When using the large model we need to load the address
11254 into a register, and we've run out of registers. So we
11255 switch to a different calling convention, and we call a
11256 different function: __morestack_large. We pass the
11257 argument size in the upper 32 bits of r10 and pass the
11258 frame size in the lower 32 bits. */
11259 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11260 gcc_assert ((args_size & 0xffffffff) == args_size);
11261
11262 if (split_stack_fn_large == NULL_RTX)
11263 split_stack_fn_large =
11264 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11265
11266 if (ix86_cmodel == CM_LARGE_PIC)
11267 {
11268 rtx label, x;
11269
11270 label = gen_label_rtx ();
11271 emit_label (label);
11272 LABEL_PRESERVE_P (label) = 1;
11273 emit_insn (gen_set_rip_rex64 (reg10, label));
11274 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11275 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11276 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11277 UNSPEC_GOT);
11278 x = gen_rtx_CONST (Pmode, x);
11279 emit_move_insn (reg11, x);
11280 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11281 x = gen_const_mem (Pmode, x);
11282 emit_move_insn (reg11, x);
11283 }
11284 else
11285 emit_move_insn (reg11, split_stack_fn_large);
11286
11287 fn = reg11;
11288
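/* Pack both 32-bit values into a single 64-bit immediate: the argument
   size goes in the upper half, the requested frame size in the lower
   half, as described above. */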
11289 argval = ((args_size << 16) << 16) + allocate;
11290 emit_move_insn (reg10, GEN_INT (argval));
11291 }
11292 else
11293 {
11294 emit_move_insn (reg10, allocate_rtx);
11295 emit_move_insn (reg11, GEN_INT (args_size));
11296 use_reg (&call_fusage, reg11);
11297 }
11298
11299 use_reg (&call_fusage, reg10);
11300 }
11301 else
11302 {
11303 emit_insn (gen_push (GEN_INT (args_size)));
11304 emit_insn (gen_push (allocate_rtx));
11305 }
11306 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11307 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11308 NULL_RTX, false);
11309 add_function_usage_to (call_insn, call_fusage);
11310
11311 /* In order to make call/return prediction work right, we now need
11312 to execute a return instruction. See
11313 libgcc/config/i386/morestack.S for the details on how this works.
11314
11315 For flow purposes gcc must not see this as a return
11316 instruction--we need control flow to continue at the subsequent
11317 label. Therefore, we use an unspec. */
11318 gcc_assert (crtl->args.pops_args < 65536);
11319 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11320
11321 /* If we are in 64-bit mode and this function uses a static chain,
11322 we saved %r10 in %rax before calling __morestack. */
11323 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11324 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11325 gen_rtx_REG (word_mode, AX_REG));
11326
11327 /* If this function calls va_start, we need to store a pointer to
11328 the arguments on the old stack, because they may not have been
11329 all copied to the new stack. At this point the old stack can be
11330 found at the frame pointer value used by __morestack, because
11331 __morestack has set that up before calling back to us. Here we
11332 store that pointer in a scratch register, and in
11333 ix86_expand_prologue we store the scratch register in a stack
11334 slot. */
11335 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11336 {
11337 unsigned int scratch_regno;
11338 rtx frame_reg;
11339 int words;
11340
11341 scratch_regno = split_stack_prologue_scratch_regno ();
11342 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11343 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11344
11345 /* 64-bit:
11346 fp -> old fp value
11347 return address within this function
11348 return address of caller of this function
11349 stack arguments
11350 So we add three words to get to the stack arguments.
11351
11352 32-bit:
11353 fp -> old fp value
11354 return address within this function
11355 first argument to __morestack
11356 second argument to __morestack
11357 return address of caller of this function
11358 stack arguments
11359 So we add five words to get to the stack arguments.
11360 */
11361 words = TARGET_64BIT ? 3 : 5;
11362 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11363 gen_rtx_PLUS (Pmode, frame_reg,
11364 GEN_INT (words * UNITS_PER_WORD))));
11365
11366 varargs_label = gen_label_rtx ();
11367 emit_jump_insn (gen_jump (varargs_label));
11368 JUMP_LABEL (get_last_insn ()) = varargs_label;
11369
11370 emit_barrier ();
11371 }
11372
11373 emit_label (label);
11374 LABEL_NUSES (label) = 1;
11375
11376 /* If this function calls va_start, we now have to set the scratch
11377 register for the case where we do not call __morestack. In this
11378 case we need to set it based on the stack pointer. */
11379 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11380 {
11381 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11382 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11383 GEN_INT (UNITS_PER_WORD))));
11384
11385 emit_label (varargs_label);
11386 LABEL_NUSES (varargs_label) = 1;
11387 }
11388 }
11389
11390 /* We may have to tell the dataflow pass that the split stack prologue
11391 is initializing a scratch register. */
11392
11393 static void
11394 ix86_live_on_entry (bitmap regs)
11395 {
11396 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11397 {
11398 gcc_assert (flag_split_stack);
11399 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11400 }
11401 }
11402 \f
11403 /* Determine if op is a suitable SUBREG RTX for an address. */
11404
11405 static bool
11406 ix86_address_subreg_operand (rtx op)
11407 {
11408 enum machine_mode mode;
11409
11410 if (!REG_P (op))
11411 return false;
11412
11413 mode = GET_MODE (op);
11414
11415 if (GET_MODE_CLASS (mode) != MODE_INT)
11416 return false;
11417
11418 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11419 failures when the register is one word out of a two word structure. */
11420 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11421 return false;
11422
11423 /* Allow only SUBREGs of non-eliminable hard registers. */
11424 return register_no_elim_operand (op, mode);
11425 }
11426
11427 /* Extract the parts of an RTL expression that is a valid memory address
11428 for an instruction. Return 0 if the structure of the address is
11429 grossly off. Return -1 if the address contains ASHIFT, so it is not
11430 strictly valid, but is still used for computing the length of the lea instruction. */
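/* For example, (plus (plus (mult (reg:SI bx) (const_int 4)) (reg:SI si))
   (const_int 12)) decomposes into base = %esi, index = %ebx, scale = 4
   and disp = 12. */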
11431
11432 int
11433 ix86_decompose_address (rtx addr, struct ix86_address *out)
11434 {
11435 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11436 rtx base_reg, index_reg;
11437 HOST_WIDE_INT scale = 1;
11438 rtx scale_rtx = NULL_RTX;
11439 rtx tmp;
11440 int retval = 1;
11441 enum ix86_address_seg seg = SEG_DEFAULT;
11442
11443 /* Allow zero-extended SImode addresses;
11444 they will be emitted with the addr32 prefix. */
11445 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11446 {
11447 if (GET_CODE (addr) == ZERO_EXTEND
11448 && GET_MODE (XEXP (addr, 0)) == SImode)
11449 addr = XEXP (addr, 0);
11450 else if (GET_CODE (addr) == AND
11451 && const_32bit_mask (XEXP (addr, 1), DImode))
11452 {
11453 addr = XEXP (addr, 0);
11454
11455 /* Adjust SUBREGs. */
11456 if (GET_CODE (addr) == SUBREG
11457 && GET_MODE (SUBREG_REG (addr)) == SImode)
11458 addr = SUBREG_REG (addr);
11459 else if (GET_MODE (addr) == DImode)
11460 addr = gen_rtx_SUBREG (SImode, addr, 0);
11461 else
11462 return 0;
11463 }
11464 }
11465
11466 if (REG_P (addr))
11467 base = addr;
11468 else if (GET_CODE (addr) == SUBREG)
11469 {
11470 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11471 base = addr;
11472 else
11473 return 0;
11474 }
11475 else if (GET_CODE (addr) == PLUS)
11476 {
11477 rtx addends[4], op;
11478 int n = 0, i;
11479
11480 op = addr;
11481 do
11482 {
11483 if (n >= 4)
11484 return 0;
11485 addends[n++] = XEXP (op, 1);
11486 op = XEXP (op, 0);
11487 }
11488 while (GET_CODE (op) == PLUS);
11489 if (n >= 4)
11490 return 0;
11491 addends[n] = op;
11492
11493 for (i = n; i >= 0; --i)
11494 {
11495 op = addends[i];
11496 switch (GET_CODE (op))
11497 {
11498 case MULT:
11499 if (index)
11500 return 0;
11501 index = XEXP (op, 0);
11502 scale_rtx = XEXP (op, 1);
11503 break;
11504
11505 case ASHIFT:
11506 if (index)
11507 return 0;
11508 index = XEXP (op, 0);
11509 tmp = XEXP (op, 1);
11510 if (!CONST_INT_P (tmp))
11511 return 0;
11512 scale = INTVAL (tmp);
11513 if ((unsigned HOST_WIDE_INT) scale > 3)
11514 return 0;
11515 scale = 1 << scale;
11516 break;
11517
11518 case ZERO_EXTEND:
11519 op = XEXP (op, 0);
11520 if (GET_CODE (op) != UNSPEC)
11521 return 0;
11522 /* FALLTHRU */
11523
11524 case UNSPEC:
11525 if (XINT (op, 1) == UNSPEC_TP
11526 && TARGET_TLS_DIRECT_SEG_REFS
11527 && seg == SEG_DEFAULT)
11528 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11529 else
11530 return 0;
11531 break;
11532
11533 case SUBREG:
11534 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11535 return 0;
11536 /* FALLTHRU */
11537
11538 case REG:
11539 if (!base)
11540 base = op;
11541 else if (!index)
11542 index = op;
11543 else
11544 return 0;
11545 break;
11546
11547 case CONST:
11548 case CONST_INT:
11549 case SYMBOL_REF:
11550 case LABEL_REF:
11551 if (disp)
11552 return 0;
11553 disp = op;
11554 break;
11555
11556 default:
11557 return 0;
11558 }
11559 }
11560 }
11561 else if (GET_CODE (addr) == MULT)
11562 {
11563 index = XEXP (addr, 0); /* index*scale */
11564 scale_rtx = XEXP (addr, 1);
11565 }
11566 else if (GET_CODE (addr) == ASHIFT)
11567 {
11568 /* We're called for lea too, which implements ashift on occasion. */
11569 index = XEXP (addr, 0);
11570 tmp = XEXP (addr, 1);
11571 if (!CONST_INT_P (tmp))
11572 return 0;
11573 scale = INTVAL (tmp);
11574 if ((unsigned HOST_WIDE_INT) scale > 3)
11575 return 0;
11576 scale = 1 << scale;
11577 retval = -1;
11578 }
11579 else
11580 disp = addr; /* displacement */
11581
11582 if (index)
11583 {
11584 if (REG_P (index))
11585 ;
11586 else if (GET_CODE (index) == SUBREG
11587 && ix86_address_subreg_operand (SUBREG_REG (index)))
11588 ;
11589 else
11590 return 0;
11591 }
11592
11593 /* Address override works only on the (%reg) part of %fs:(%reg). */
11594 if (seg != SEG_DEFAULT
11595 && ((base && GET_MODE (base) != word_mode)
11596 || (index && GET_MODE (index) != word_mode)))
11597 return 0;
11598
11599 /* Extract the integral value of scale. */
11600 if (scale_rtx)
11601 {
11602 if (!CONST_INT_P (scale_rtx))
11603 return 0;
11604 scale = INTVAL (scale_rtx);
11605 }
11606
11607 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11608 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11609
11610 /* Avoid useless 0 displacement. */
11611 if (disp == const0_rtx && (base || index))
11612 disp = NULL_RTX;
11613
11614 /* Allow the arg pointer and stack pointer as an index if there is no scaling. */
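/* The stack pointer cannot be encoded as an index register, and the arg
   and frame pointers may later be eliminated to it, so move such a
   register into the base position instead; this is only valid because
   the scale is 1. */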
11615 if (base_reg && index_reg && scale == 1
11616 && (index_reg == arg_pointer_rtx
11617 || index_reg == frame_pointer_rtx
11618 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11619 {
11620 rtx tmp;
11621 tmp = base, base = index, index = tmp;
11622 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11623 }
11624
11625 /* Special case: %ebp cannot be encoded as a base without a displacement.
11626 Similarly %r13. */
11627 if (!disp
11628 && base_reg
11629 && (base_reg == hard_frame_pointer_rtx
11630 || base_reg == frame_pointer_rtx
11631 || base_reg == arg_pointer_rtx
11632 || (REG_P (base_reg)
11633 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11634 || REGNO (base_reg) == R13_REG))))
11635 disp = const0_rtx;
11636
11637 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
11638 Avoid this by transforming to [%esi+0].
11639 Reload calls address legitimization without cfun defined, so we need
11640 to test cfun for being non-NULL. */
11641 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11642 && base_reg && !index_reg && !disp
11643 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11644 disp = const0_rtx;
11645
11646 /* Special case: encode reg+reg instead of reg*2. */
11647 if (!base && index && scale == 2)
11648 base = index, base_reg = index_reg, scale = 1;
11649
11650 /* Special case: scaling cannot be encoded without base or displacement. */
11651 if (!base && !disp && index && scale != 1)
11652 disp = const0_rtx;
11653
11654 out->base = base;
11655 out->index = index;
11656 out->disp = disp;
11657 out->scale = scale;
11658 out->seg = seg;
11659
11660 return retval;
11661 }
11662 \f
11663 /* Return cost of the memory address x.
11664 For i386, it is better to use a complex address than let gcc copy
11665 the address into a reg and make a new pseudo. But not if the address
11666 requires two regs - that would mean more pseudos with longer
11667 lifetimes. */
11668 static int
11669 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11670 {
11671 struct ix86_address parts;
11672 int cost = 1;
11673 int ok = ix86_decompose_address (x, &parts);
11674
11675 gcc_assert (ok);
11676
11677 if (parts.base && GET_CODE (parts.base) == SUBREG)
11678 parts.base = SUBREG_REG (parts.base);
11679 if (parts.index && GET_CODE (parts.index) == SUBREG)
11680 parts.index = SUBREG_REG (parts.index);
11681
11682 /* Attempt to minimize number of registers in the address. */
11683 if ((parts.base
11684 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11685 || (parts.index
11686 && (!REG_P (parts.index)
11687 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11688 cost++;
11689
11690 if (parts.base
11691 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11692 && parts.index
11693 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11694 && parts.base != parts.index)
11695 cost++;
11696
11697 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11698 since its predecode logic can't detect the length of instructions
11699 and decoding degenerates to the vector decoder. Increase the cost of such
11700 addresses here. The penalty is at least 2 cycles. It may be worthwhile
11701 to split such addresses or even refuse such addresses at all.
11702
11703 Following addressing modes are affected:
11704 [base+scale*index]
11705 [scale*index+disp]
11706 [base+index]
11707
11708 The first and last case may be avoidable by explicitly coding a zero
11709 displacement in the memory address, but I don't have an AMD-K6 machine handy to check this
11710 theory. */
11711
11712 if (TARGET_K6
11713 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11714 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11715 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11716 cost += 10;
11717
11718 return cost;
11719 }
11720 \f
11721 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11722 this is used to form addresses to local data when -fPIC is in
11723 use. */
11724
11725 static bool
11726 darwin_local_data_pic (rtx disp)
11727 {
11728 return (GET_CODE (disp) == UNSPEC
11729 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11730 }
11731
11732 /* Determine if a given RTX is a valid constant. We already know this
11733 satisfies CONSTANT_P. */
11734
11735 static bool
11736 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11737 {
11738 switch (GET_CODE (x))
11739 {
11740 case CONST:
11741 x = XEXP (x, 0);
11742
11743 if (GET_CODE (x) == PLUS)
11744 {
11745 if (!CONST_INT_P (XEXP (x, 1)))
11746 return false;
11747 x = XEXP (x, 0);
11748 }
11749
11750 if (TARGET_MACHO && darwin_local_data_pic (x))
11751 return true;
11752
11753 /* Only some unspecs are valid as "constants". */
11754 if (GET_CODE (x) == UNSPEC)
11755 switch (XINT (x, 1))
11756 {
11757 case UNSPEC_GOT:
11758 case UNSPEC_GOTOFF:
11759 case UNSPEC_PLTOFF:
11760 return TARGET_64BIT;
11761 case UNSPEC_TPOFF:
11762 case UNSPEC_NTPOFF:
11763 x = XVECEXP (x, 0, 0);
11764 return (GET_CODE (x) == SYMBOL_REF
11765 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11766 case UNSPEC_DTPOFF:
11767 x = XVECEXP (x, 0, 0);
11768 return (GET_CODE (x) == SYMBOL_REF
11769 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11770 default:
11771 return false;
11772 }
11773
11774 /* We must have drilled down to a symbol. */
11775 if (GET_CODE (x) == LABEL_REF)
11776 return true;
11777 if (GET_CODE (x) != SYMBOL_REF)
11778 return false;
11779 /* FALLTHRU */
11780
11781 case SYMBOL_REF:
11782 /* TLS symbols are never valid. */
11783 if (SYMBOL_REF_TLS_MODEL (x))
11784 return false;
11785
11786 /* DLLIMPORT symbols are never valid. */
11787 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11788 && SYMBOL_REF_DLLIMPORT_P (x))
11789 return false;
11790
11791 #if TARGET_MACHO
11792 /* mdynamic-no-pic */
11793 if (MACHO_DYNAMIC_NO_PIC_P)
11794 return machopic_symbol_defined_p (x);
11795 #endif
11796 break;
11797
11798 case CONST_DOUBLE:
11799 if (GET_MODE (x) == TImode
11800 && x != CONST0_RTX (TImode)
11801 && !TARGET_64BIT)
11802 return false;
11803 break;
11804
11805 case CONST_VECTOR:
11806 if (!standard_sse_constant_p (x))
11807 return false;
11808
11809 default:
11810 break;
11811 }
11812
11813 /* Otherwise we handle everything else in the move patterns. */
11814 return true;
11815 }
11816
11817 /* Determine if it's legal to put X into the constant pool. This
11818 is not possible for the address of thread-local symbols, which
11819 is checked above. */
11820
11821 static bool
11822 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11823 {
11824 /* We can always put integral constants and vectors in memory. */
11825 switch (GET_CODE (x))
11826 {
11827 case CONST_INT:
11828 case CONST_DOUBLE:
11829 case CONST_VECTOR:
11830 return false;
11831
11832 default:
11833 break;
11834 }
11835 return !ix86_legitimate_constant_p (mode, x);
11836 }
11837
11838
11839 /* Nonzero if the constant value X is a legitimate general operand
11840 when generating PIC code. It is given that flag_pic is on and
11841 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11842
11843 bool
11844 legitimate_pic_operand_p (rtx x)
11845 {
11846 rtx inner;
11847
11848 switch (GET_CODE (x))
11849 {
11850 case CONST:
11851 inner = XEXP (x, 0);
11852 if (GET_CODE (inner) == PLUS
11853 && CONST_INT_P (XEXP (inner, 1)))
11854 inner = XEXP (inner, 0);
11855
11856 /* Only some unspecs are valid as "constants". */
11857 if (GET_CODE (inner) == UNSPEC)
11858 switch (XINT (inner, 1))
11859 {
11860 case UNSPEC_GOT:
11861 case UNSPEC_GOTOFF:
11862 case UNSPEC_PLTOFF:
11863 return TARGET_64BIT;
11864 case UNSPEC_TPOFF:
11865 x = XVECEXP (inner, 0, 0);
11866 return (GET_CODE (x) == SYMBOL_REF
11867 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11868 case UNSPEC_MACHOPIC_OFFSET:
11869 return legitimate_pic_address_disp_p (x);
11870 default:
11871 return false;
11872 }
11873 /* FALLTHRU */
11874
11875 case SYMBOL_REF:
11876 case LABEL_REF:
11877 return legitimate_pic_address_disp_p (x);
11878
11879 default:
11880 return true;
11881 }
11882 }
11883
11884 /* Determine if a given CONST RTX is a valid memory displacement
11885 in PIC mode. */
11886
11887 bool
11888 legitimate_pic_address_disp_p (rtx disp)
11889 {
11890 bool saw_plus;
11891
11892 /* In 64bit mode we can allow direct addresses of symbols and labels
11893 when they are not dynamic symbols. */
11894 if (TARGET_64BIT)
11895 {
11896 rtx op0 = disp, op1;
11897
11898 switch (GET_CODE (disp))
11899 {
11900 case LABEL_REF:
11901 return true;
11902
11903 case CONST:
11904 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11905 break;
11906 op0 = XEXP (XEXP (disp, 0), 0);
11907 op1 = XEXP (XEXP (disp, 0), 1);
11908 if (!CONST_INT_P (op1)
11909 || INTVAL (op1) >= 16*1024*1024
11910 || INTVAL (op1) < -16*1024*1024)
11911 break;
11912 if (GET_CODE (op0) == LABEL_REF)
11913 return true;
11914 if (GET_CODE (op0) == CONST
11915 && GET_CODE (XEXP (op0, 0)) == UNSPEC
11916 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
11917 return true;
11918 if (GET_CODE (op0) == UNSPEC
11919 && XINT (op0, 1) == UNSPEC_PCREL)
11920 return true;
11921 if (GET_CODE (op0) != SYMBOL_REF)
11922 break;
11923 /* FALLTHRU */
11924
11925 case SYMBOL_REF:
11926 /* TLS references should always be enclosed in UNSPEC. */
11927 if (SYMBOL_REF_TLS_MODEL (op0))
11928 return false;
11929 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11930 && ix86_cmodel != CM_LARGE_PIC)
11931 return true;
11932 break;
11933
11934 default:
11935 break;
11936 }
11937 }
11938 if (GET_CODE (disp) != CONST)
11939 return false;
11940 disp = XEXP (disp, 0);
11941
11942 if (TARGET_64BIT)
11943 {
11944 /* It is not safe to allow PLUS expressions here, since the allowed
11945 distance of GOT references is limited. We should not need these anyway. */
11946 if (GET_CODE (disp) != UNSPEC
11947 || (XINT (disp, 1) != UNSPEC_GOTPCREL
11948 && XINT (disp, 1) != UNSPEC_GOTOFF
11949 && XINT (disp, 1) != UNSPEC_PCREL
11950 && XINT (disp, 1) != UNSPEC_PLTOFF))
11951 return false;
11952
11953 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11954 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11955 return false;
11956 return true;
11957 }
11958
11959 saw_plus = false;
11960 if (GET_CODE (disp) == PLUS)
11961 {
11962 if (!CONST_INT_P (XEXP (disp, 1)))
11963 return false;
11964 disp = XEXP (disp, 0);
11965 saw_plus = true;
11966 }
11967
11968 if (TARGET_MACHO && darwin_local_data_pic (disp))
11969 return true;
11970
11971 if (GET_CODE (disp) != UNSPEC)
11972 return false;
11973
11974 switch (XINT (disp, 1))
11975 {
11976 case UNSPEC_GOT:
11977 if (saw_plus)
11978 return false;
11979 /* We need to check for both symbols and labels because VxWorks loads
11980 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
11981 details. */
11982 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11983 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
11984 case UNSPEC_GOTOFF:
11985 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
11986 While the ABI also specifies a 32bit relocation, we don't produce it in the
11987 small PIC model at all. */
11988 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11989 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
11990 && !TARGET_64BIT)
11991 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
11992 return false;
11993 case UNSPEC_GOTTPOFF:
11994 case UNSPEC_GOTNTPOFF:
11995 case UNSPEC_INDNTPOFF:
11996 if (saw_plus)
11997 return false;
11998 disp = XVECEXP (disp, 0, 0);
11999 return (GET_CODE (disp) == SYMBOL_REF
12000 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12001 case UNSPEC_NTPOFF:
12002 disp = XVECEXP (disp, 0, 0);
12003 return (GET_CODE (disp) == SYMBOL_REF
12004 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12005 case UNSPEC_DTPOFF:
12006 disp = XVECEXP (disp, 0, 0);
12007 return (GET_CODE (disp) == SYMBOL_REF
12008 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12009 }
12010
12011 return false;
12012 }
12013
12014 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns true if the
12015 invalid part of X was pushed for reload, in which case the calling
12016 macro should goto WIN; returns false if no replacement is called for. */
12018
12019 bool
12020 ix86_legitimize_reload_address (rtx x,
12021 enum machine_mode mode ATTRIBUTE_UNUSED,
12022 int opnum, int type,
12023 int ind_levels ATTRIBUTE_UNUSED)
12024 {
12025 /* Reload can generate:
12026
12027 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12028 (reg:DI 97))
12029 (reg:DI 2 cx))
12030
12031 This RTX is rejected by ix86_legitimate_address_p due to
12032 non-strictness of base register 97. Following this rejection,
12033 reload pushes all three components into separate registers,
12034 creating an invalid memory address RTX.
12035
12036 The following code reloads only the invalid part of the
12037 memory address RTX. */
12038
12039 if (GET_CODE (x) == PLUS
12040 && REG_P (XEXP (x, 1))
12041 && GET_CODE (XEXP (x, 0)) == PLUS
12042 && REG_P (XEXP (XEXP (x, 0), 1)))
12043 {
12044 rtx base, index;
12045 bool something_reloaded = false;
12046
12047 base = XEXP (XEXP (x, 0), 1);
12048 if (!REG_OK_FOR_BASE_STRICT_P (base))
12049 {
12050 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12051 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12052 opnum, (enum reload_type) type);
12053 something_reloaded = true;
12054 }
12055
12056 index = XEXP (x, 1);
12057 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12058 {
12059 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12060 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12061 opnum, (enum reload_type) type);
12062 something_reloaded = true;
12063 }
12064
12065 gcc_assert (something_reloaded);
12066 return true;
12067 }
12068
12069 return false;
12070 }
12071
12072 /* Recognizes RTL expressions that are valid memory addresses for an
12073 instruction. The MODE argument is the machine mode for the MEM
12074 expression that wants to use this address.
12075
12076 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12077 convert common non-canonical forms to canonical form so that they will
12078 be recognized. */
12079
12080 static bool
12081 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12082 rtx addr, bool strict)
12083 {
12084 struct ix86_address parts;
12085 rtx base, index, disp;
12086 HOST_WIDE_INT scale;
12087
12088 /* Since a constant address in x32 is sign-extended to 64 bits,
12089 we have to prevent addresses from 0x80000000 to 0xffffffff. */
12090 if (TARGET_X32
12091 && CONST_INT_P (addr)
12092 && INTVAL (addr) < 0)
12093 return false;
12094
12095 if (ix86_decompose_address (addr, &parts) <= 0)
12096 /* Decomposition failed. */
12097 return false;
12098
12099 base = parts.base;
12100 index = parts.index;
12101 disp = parts.disp;
12102 scale = parts.scale;
12103
12104 /* Validate base register. */
12105 if (base)
12106 {
12107 rtx reg;
12108
12109 if (REG_P (base))
12110 reg = base;
12111 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12112 reg = SUBREG_REG (base);
12113 else
12114 /* Base is not a register. */
12115 return false;
12116
12117 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12118 return false;
12119
12120 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12121 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12122 /* Base is not valid. */
12123 return false;
12124 }
12125
12126 /* Validate index register. */
12127 if (index)
12128 {
12129 rtx reg;
12130
12131 if (REG_P (index))
12132 reg = index;
12133 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12134 reg = SUBREG_REG (index);
12135 else
12136 /* Index is not a register. */
12137 return false;
12138
12139 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12140 return false;
12141
12142 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12143 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12144 /* Index is not valid. */
12145 return false;
12146 }
12147
12148 /* Index and base should have the same mode. */
12149 if (base && index
12150 && GET_MODE (base) != GET_MODE (index))
12151 return false;
12152
12153 /* Validate scale factor. */
12154 if (scale != 1)
12155 {
12156 if (!index)
12157 /* Scale without index. */
12158 return false;
12159
12160 if (scale != 2 && scale != 4 && scale != 8)
12161 /* Scale is not a valid multiplier. */
12162 return false;
12163 }
12164
12165 /* Validate displacement. */
12166 if (disp)
12167 {
12168 if (GET_CODE (disp) == CONST
12169 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12170 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12171 switch (XINT (XEXP (disp, 0), 1))
12172 {
12173 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12174 used. While the ABI also specifies 32bit relocations, we don't produce
12175 them at all and use IP relative instead. */
12176 case UNSPEC_GOT:
12177 case UNSPEC_GOTOFF:
12178 gcc_assert (flag_pic);
12179 if (!TARGET_64BIT)
12180 goto is_legitimate_pic;
12181
12182 /* 64bit address unspec. */
12183 return false;
12184
12185 case UNSPEC_GOTPCREL:
12186 case UNSPEC_PCREL:
12187 gcc_assert (flag_pic);
12188 goto is_legitimate_pic;
12189
12190 case UNSPEC_GOTTPOFF:
12191 case UNSPEC_GOTNTPOFF:
12192 case UNSPEC_INDNTPOFF:
12193 case UNSPEC_NTPOFF:
12194 case UNSPEC_DTPOFF:
12195 break;
12196
12197 case UNSPEC_STACK_CHECK:
12198 gcc_assert (flag_split_stack);
12199 break;
12200
12201 default:
12202 /* Invalid address unspec. */
12203 return false;
12204 }
12205
12206 else if (SYMBOLIC_CONST (disp)
12207 && (flag_pic
12208 || (TARGET_MACHO
12209 #if TARGET_MACHO
12210 && MACHOPIC_INDIRECT
12211 && !machopic_operand_p (disp)
12212 #endif
12213 )))
12214 {
12215
12216 is_legitimate_pic:
12217 if (TARGET_64BIT && (index || base))
12218 {
12219 /* foo@dtpoff(%rX) is ok. */
12220 if (GET_CODE (disp) != CONST
12221 || GET_CODE (XEXP (disp, 0)) != PLUS
12222 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12223 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12224 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12225 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12226 /* Non-constant pic memory reference. */
12227 return false;
12228 }
12229 else if ((!TARGET_MACHO || flag_pic)
12230 && ! legitimate_pic_address_disp_p (disp))
12231 /* Displacement is an invalid pic construct. */
12232 return false;
12233 #if TARGET_MACHO
12234 else if (MACHO_DYNAMIC_NO_PIC_P
12235 && !ix86_legitimate_constant_p (Pmode, disp))
12236 /* Displacement must be referenced via non_lazy_pointer. */
12237 return false;
12238 #endif
12239
12240 /* This code used to verify that a symbolic pic displacement
12241 includes the pic_offset_table_rtx register.
12242
12243 While this is a good idea, unfortunately these constructs may
12244 be created by the "adds using lea" optimization for incorrect
12245 code like:
12246
12247 int a;
12248 int foo(int i)
12249 {
12250 return *(&a+i);
12251 }
12252
12253 This code is nonsensical, but results in addressing the
12254 GOT table with a pic_offset_table_rtx base. We can't
12255 just refuse it easily, since it gets matched by the
12256 "addsi3" pattern, which later gets split into an lea when
12257 the output register differs from the input. While this
12258 could be handled by a separate addsi pattern for this case
12259 that never results in an lea, disabling this test seems to be
12260 the easier and correct fix for the crash. */
12261 }
12262 else if (GET_CODE (disp) != LABEL_REF
12263 && !CONST_INT_P (disp)
12264 && (GET_CODE (disp) != CONST
12265 || !ix86_legitimate_constant_p (Pmode, disp))
12266 && (GET_CODE (disp) != SYMBOL_REF
12267 || !ix86_legitimate_constant_p (Pmode, disp)))
12268 /* Displacement is not constant. */
12269 return false;
12270 else if (TARGET_64BIT
12271 && !x86_64_immediate_operand (disp, VOIDmode))
12272 /* Displacement is out of range. */
12273 return false;
12274 }
12275
12276 /* Everything looks valid. */
12277 return true;
12278 }
12279
12280 /* Determine if a given RTX is a valid constant address. */
12281
12282 bool
12283 constant_address_p (rtx x)
12284 {
12285 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12286 }
12287 \f
12288 /* Return a unique alias set for the GOT. */
12289
12290 static alias_set_type
12291 ix86_GOT_alias_set (void)
12292 {
12293 static alias_set_type set = -1;
12294 if (set == -1)
12295 set = new_alias_set ();
12296 return set;
12297 }
12298
12299 /* Return a legitimate reference for ORIG (an address) using the
12300 register REG. If REG is 0, a new pseudo is generated.
12301
12302 There are two types of references that must be handled:
12303
12304 1. Global data references must load the address from the GOT, via
12305 the PIC reg. An insn is emitted to do this load, and the reg is
12306 returned.
12307
12308 2. Static data references, constant pool addresses, and code labels
12309 compute the address as an offset from the GOT, whose base is in
12310 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12311 differentiate them from global data objects. The returned
12312 address is the PIC reg + an unspec constant.
12313
12314 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12315 reg also appears in the address. */
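/* For example, on ELF ix86 a reference to a global symbol "foo" (a
sketch) typically becomes a load from its GOT slot,
(mem (plus pic_offset_table_rtx (const (unspec [foo] UNSPEC_GOT)))),
while a local static "bar" becomes the address
pic_offset_table_rtx + (const (unspec [bar] UNSPEC_GOTOFF)).  */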
12316
12317 static rtx
12318 legitimize_pic_address (rtx orig, rtx reg)
12319 {
12320 rtx addr = orig;
12321 rtx new_rtx = orig;
12322 rtx base;
12323
12324 #if TARGET_MACHO
12325 if (TARGET_MACHO && !TARGET_64BIT)
12326 {
12327 if (reg == 0)
12328 reg = gen_reg_rtx (Pmode);
12329 /* Use the generic Mach-O PIC machinery. */
12330 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12331 }
12332 #endif
12333
12334 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12335 new_rtx = addr;
12336 else if (TARGET_64BIT
12337 && ix86_cmodel != CM_SMALL_PIC
12338 && gotoff_operand (addr, Pmode))
12339 {
12340 rtx tmpreg;
12341 /* This symbol may be referenced via a displacement from the PIC
12342 base address (@GOTOFF). */
12343
12344 if (reload_in_progress)
12345 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12346 if (GET_CODE (addr) == CONST)
12347 addr = XEXP (addr, 0);
12348 if (GET_CODE (addr) == PLUS)
12349 {
12350 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12351 UNSPEC_GOTOFF);
12352 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12353 }
12354 else
12355 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12356 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12357 if (!reg)
12358 tmpreg = gen_reg_rtx (Pmode);
12359 else
12360 tmpreg = reg;
12361 emit_move_insn (tmpreg, new_rtx);
12362
12363 if (reg != 0)
12364 {
12365 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12366 tmpreg, 1, OPTAB_DIRECT);
12367 new_rtx = reg;
12368 }
12369 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12370 }
12371 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12372 {
12373 /* This symbol may be referenced via a displacement from the PIC
12374 base address (@GOTOFF). */
12375
12376 if (reload_in_progress)
12377 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12378 if (GET_CODE (addr) == CONST)
12379 addr = XEXP (addr, 0);
12380 if (GET_CODE (addr) == PLUS)
12381 {
12382 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12383 UNSPEC_GOTOFF);
12384 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12385 }
12386 else
12387 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12388 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12389 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12390
12391 if (reg != 0)
12392 {
12393 emit_move_insn (reg, new_rtx);
12394 new_rtx = reg;
12395 }
12396 }
12397 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12398 /* We can't use @GOTOFF for text labels on VxWorks;
12399 see gotoff_operand. */
12400 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12401 {
12402 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12403 {
12404 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12405 return legitimize_dllimport_symbol (addr, true);
12406 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12407 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12408 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12409 {
12410 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12411 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12412 }
12413 }
12414
12415 /* For x64 PE-COFF there is no GOT table, so we use the address
12416 directly. */
12417 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12418 {
12419 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12420 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12421
12422 if (reg == 0)
12423 reg = gen_reg_rtx (Pmode);
12424 emit_move_insn (reg, new_rtx);
12425 new_rtx = reg;
12426 }
12427 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12428 {
12429 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12430 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12431 new_rtx = gen_const_mem (Pmode, new_rtx);
12432 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12433
12434 if (reg == 0)
12435 reg = gen_reg_rtx (Pmode);
12436 /* Use gen_movsi directly, otherwise the address is loaded
12437 into a register for CSE. We don't want to CSE these addresses;
12438 instead we CSE addresses from the GOT table, so skip this. */
12439 emit_insn (gen_movsi (reg, new_rtx));
12440 new_rtx = reg;
12441 }
12442 else
12443 {
12444 /* This symbol must be referenced via a load from the
12445 Global Offset Table (@GOT). */
12446
12447 if (reload_in_progress)
12448 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12449 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12450 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12451 if (TARGET_64BIT)
12452 new_rtx = force_reg (Pmode, new_rtx);
12453 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12454 new_rtx = gen_const_mem (Pmode, new_rtx);
12455 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12456
12457 if (reg == 0)
12458 reg = gen_reg_rtx (Pmode);
12459 emit_move_insn (reg, new_rtx);
12460 new_rtx = reg;
12461 }
12462 }
12463 else
12464 {
12465 if (CONST_INT_P (addr)
12466 && !x86_64_immediate_operand (addr, VOIDmode))
12467 {
12468 if (reg)
12469 {
12470 emit_move_insn (reg, addr);
12471 new_rtx = reg;
12472 }
12473 else
12474 new_rtx = force_reg (Pmode, addr);
12475 }
12476 else if (GET_CODE (addr) == CONST)
12477 {
12478 addr = XEXP (addr, 0);
12479
12480 /* We must match stuff we generated before. Assume the only
12481 unspecs that can get here are ours. Not that we could do
12482 anything with them anyway.... */
12483 if (GET_CODE (addr) == UNSPEC
12484 || (GET_CODE (addr) == PLUS
12485 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12486 return orig;
12487 gcc_assert (GET_CODE (addr) == PLUS);
12488 }
12489 if (GET_CODE (addr) == PLUS)
12490 {
12491 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12492
12493 /* Check first to see if this is a constant offset from a @GOTOFF
12494 symbol reference. */
12495 if (gotoff_operand (op0, Pmode)
12496 && CONST_INT_P (op1))
12497 {
12498 if (!TARGET_64BIT)
12499 {
12500 if (reload_in_progress)
12501 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12502 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12503 UNSPEC_GOTOFF);
12504 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12505 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12506 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12507
12508 if (reg != 0)
12509 {
12510 emit_move_insn (reg, new_rtx);
12511 new_rtx = reg;
12512 }
12513 }
12514 else
12515 {
12516 if (INTVAL (op1) < -16*1024*1024
12517 || INTVAL (op1) >= 16*1024*1024)
12518 {
12519 if (!x86_64_immediate_operand (op1, Pmode))
12520 op1 = force_reg (Pmode, op1);
12521 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12522 }
12523 }
12524 }
12525 else
12526 {
12527 base = legitimize_pic_address (XEXP (addr, 0), reg);
12528 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12529 base == reg ? NULL_RTX : reg);
12530
12531 if (CONST_INT_P (new_rtx))
12532 new_rtx = plus_constant (base, INTVAL (new_rtx));
12533 else
12534 {
12535 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12536 {
12537 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12538 new_rtx = XEXP (new_rtx, 1);
12539 }
12540 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12541 }
12542 }
12543 }
12544 }
12545 return new_rtx;
12546 }
12547 \f
12548 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12549
12550 static rtx
12551 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12552 {
12553 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12554
12555 if (GET_MODE (tp) != tp_mode)
12556 {
12557 gcc_assert (GET_MODE (tp) == SImode);
12558 gcc_assert (tp_mode == DImode);
12559
12560 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12561 }
12562
12563 if (to_reg)
12564 tp = copy_to_mode_reg (tp_mode, tp);
12565
12566 return tp;
12567 }
12568
12569 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12570
12571 static GTY(()) rtx ix86_tls_symbol;
12572
12573 static rtx
12574 ix86_tls_get_addr (void)
12575 {
12576 if (!ix86_tls_symbol)
12577 {
12578 const char *sym
12579 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12580 ? "___tls_get_addr" : "__tls_get_addr");
12581
12582 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12583 }
12584
12585 return ix86_tls_symbol;
12586 }
12587
12588 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12589
12590 static GTY(()) rtx ix86_tls_module_base_symbol;
12591
12592 rtx
12593 ix86_tls_module_base (void)
12594 {
12595 if (!ix86_tls_module_base_symbol)
12596 {
12597 ix86_tls_module_base_symbol
12598 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12599
12600 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12601 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12602 }
12603
12604 return ix86_tls_module_base_symbol;
12605 }
12606
12607 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12608 false if we expect this to be used for a memory address and true if
12609 we expect to load the address into a register. */
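/* As a rough sketch: the local-exec model with GNU TLS yields
thread-pointer + x@tpoff, i.e.
(plus (unspec [const0] UNSPEC_TP) (const (unspec [x] UNSPEC_NTPOFF))),
which is later printed as a %fs-/%gs-relative access. The dynamic
models instead call the tls_get_addr helper (or use a GNU2 TLS
descriptor), and initial-exec loads the offset from the GOT.  */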
12610
12611 static rtx
12612 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12613 {
12614 rtx dest, base, off;
12615 rtx pic = NULL_RTX, tp = NULL_RTX;
12616 enum machine_mode tp_mode = Pmode;
12617 int type;
12618
12619 switch (model)
12620 {
12621 case TLS_MODEL_GLOBAL_DYNAMIC:
12622 dest = gen_reg_rtx (Pmode);
12623
12624 if (!TARGET_64BIT)
12625 {
12626 if (flag_pic)
12627 pic = pic_offset_table_rtx;
12628 else
12629 {
12630 pic = gen_reg_rtx (Pmode);
12631 emit_insn (gen_set_got (pic));
12632 }
12633 }
12634
12635 if (TARGET_GNU2_TLS)
12636 {
12637 if (TARGET_64BIT)
12638 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12639 else
12640 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12641
12642 tp = get_thread_pointer (Pmode, true);
12643 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12644
12645 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12646 }
12647 else
12648 {
12649 rtx caddr = ix86_tls_get_addr ();
12650
12651 if (TARGET_64BIT)
12652 {
12653 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12654
12655 start_sequence ();
12656 emit_call_insn (ix86_gen_tls_global_dynamic_64 (rax, x,
12657 caddr));
12658 insns = get_insns ();
12659 end_sequence ();
12660
12661 RTL_CONST_CALL_P (insns) = 1;
12662 emit_libcall_block (insns, dest, rax, x);
12663 }
12664 else
12665 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12666 }
12667 break;
12668
12669 case TLS_MODEL_LOCAL_DYNAMIC:
12670 base = gen_reg_rtx (Pmode);
12671
12672 if (!TARGET_64BIT)
12673 {
12674 if (flag_pic)
12675 pic = pic_offset_table_rtx;
12676 else
12677 {
12678 pic = gen_reg_rtx (Pmode);
12679 emit_insn (gen_set_got (pic));
12680 }
12681 }
12682
12683 if (TARGET_GNU2_TLS)
12684 {
12685 rtx tmp = ix86_tls_module_base ();
12686
12687 if (TARGET_64BIT)
12688 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12689 else
12690 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12691
12692 tp = get_thread_pointer (Pmode, true);
12693 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12694 gen_rtx_MINUS (Pmode, tmp, tp));
12695 }
12696 else
12697 {
12698 rtx caddr = ix86_tls_get_addr ();
12699
12700 if (TARGET_64BIT)
12701 {
12702 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12703
12704 start_sequence ();
12705 emit_call_insn (ix86_gen_tls_local_dynamic_base_64 (rax,
12706 caddr));
12707 insns = get_insns ();
12708 end_sequence ();
12709
12710 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12711 share the LD_BASE result with other LD model accesses. */
12712 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12713 UNSPEC_TLS_LD_BASE);
12714
12715 RTL_CONST_CALL_P (insns) = 1;
12716 emit_libcall_block (insns, base, rax, eqv);
12717 }
12718 else
12719 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12720 }
12721
12722 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12723 off = gen_rtx_CONST (Pmode, off);
12724
12725 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12726
12727 if (TARGET_GNU2_TLS)
12728 {
12729 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12730
12731 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12732 }
12733 break;
12734
12735 case TLS_MODEL_INITIAL_EXEC:
12736 if (TARGET_64BIT)
12737 {
12738 if (TARGET_SUN_TLS)
12739 {
12740 /* The Sun linker took the AMD64 TLS spec literally
12741 and can only handle %rax as the destination of the
12742 initial-exec code sequence. */
12743
12744 dest = gen_reg_rtx (Pmode);
12745 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12746 return dest;
12747 }
12748
12749 /* Generate DImode references to avoid %fs:(%reg32)
12750 problems and the linker IE->LE relaxation bug. */
12751 tp_mode = DImode;
12752 pic = NULL;
12753 type = UNSPEC_GOTNTPOFF;
12754 }
12755 else if (flag_pic)
12756 {
12757 if (reload_in_progress)
12758 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12759 pic = pic_offset_table_rtx;
12760 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12761 }
12762 else if (!TARGET_ANY_GNU_TLS)
12763 {
12764 pic = gen_reg_rtx (Pmode);
12765 emit_insn (gen_set_got (pic));
12766 type = UNSPEC_GOTTPOFF;
12767 }
12768 else
12769 {
12770 pic = NULL;
12771 type = UNSPEC_INDNTPOFF;
12772 }
12773
12774 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12775 off = gen_rtx_CONST (tp_mode, off);
12776 if (pic)
12777 off = gen_rtx_PLUS (tp_mode, pic, off);
12778 off = gen_const_mem (tp_mode, off);
12779 set_mem_alias_set (off, ix86_GOT_alias_set ());
12780
12781 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12782 {
12783 base = get_thread_pointer (tp_mode,
12784 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12785 off = force_reg (tp_mode, off);
12786 return gen_rtx_PLUS (tp_mode, base, off);
12787 }
12788 else
12789 {
12790 base = get_thread_pointer (Pmode, true);
12791 dest = gen_reg_rtx (Pmode);
12792 emit_insn (ix86_gen_sub3 (dest, base, off));
12793 }
12794 break;
12795
12796 case TLS_MODEL_LOCAL_EXEC:
12797 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12798 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12799 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12800 off = gen_rtx_CONST (Pmode, off);
12801
12802 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12803 {
12804 base = get_thread_pointer (Pmode,
12805 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12806 return gen_rtx_PLUS (Pmode, base, off);
12807 }
12808 else
12809 {
12810 base = get_thread_pointer (Pmode, true);
12811 dest = gen_reg_rtx (Pmode);
12812 emit_insn (ix86_gen_sub3 (dest, base, off));
12813 }
12814 break;
12815
12816 default:
12817 gcc_unreachable ();
12818 }
12819
12820 return dest;
12821 }
12822
12823 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12824 to symbol DECL. */
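/* For example, a dllimported symbol "foo" (a sketch) gets an
artificial read-only VAR_DECL whose DECL_RTL is
(mem (symbol_ref "__imp_foo")) -- or "__imp__foo", depending on the
target's user label prefix -- i.e. the import-table slot holding
foo's real address.  */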
12825
12826 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12827 htab_t dllimport_map;
12828
12829 static tree
12830 get_dllimport_decl (tree decl)
12831 {
12832 struct tree_map *h, in;
12833 void **loc;
12834 const char *name;
12835 const char *prefix;
12836 size_t namelen, prefixlen;
12837 char *imp_name;
12838 tree to;
12839 rtx rtl;
12840
12841 if (!dllimport_map)
12842 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12843
12844 in.hash = htab_hash_pointer (decl);
12845 in.base.from = decl;
12846 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12847 h = (struct tree_map *) *loc;
12848 if (h)
12849 return h->to;
12850
12851 *loc = h = ggc_alloc_tree_map ();
12852 h->hash = in.hash;
12853 h->base.from = decl;
12854 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12855 VAR_DECL, NULL, ptr_type_node);
12856 DECL_ARTIFICIAL (to) = 1;
12857 DECL_IGNORED_P (to) = 1;
12858 DECL_EXTERNAL (to) = 1;
12859 TREE_READONLY (to) = 1;
12860
12861 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12862 name = targetm.strip_name_encoding (name);
12863 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12864 ? "*__imp_" : "*__imp__";
12865 namelen = strlen (name);
12866 prefixlen = strlen (prefix);
12867 imp_name = (char *) alloca (namelen + prefixlen + 1);
12868 memcpy (imp_name, prefix, prefixlen);
12869 memcpy (imp_name + prefixlen, name, namelen + 1);
12870
12871 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12872 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12873 SET_SYMBOL_REF_DECL (rtl, to);
12874 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12875
12876 rtl = gen_const_mem (Pmode, rtl);
12877 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12878
12879 SET_DECL_RTL (to, rtl);
12880 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12881
12882 return to;
12883 }
12884
12885 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12886 true if we require the result be a register. */
12887
12888 static rtx
12889 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12890 {
12891 tree imp_decl;
12892 rtx x;
12893
12894 gcc_assert (SYMBOL_REF_DECL (symbol));
12895 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12896
12897 x = DECL_RTL (imp_decl);
12898 if (want_reg)
12899 x = force_reg (Pmode, x);
12900 return x;
12901 }
12902
12903 /* Try machine-dependent ways of modifying an illegitimate address
12904 to be legitimate. If we find one, return the new, valid address.
12905 This macro is used in only one place: `memory_address' in explow.c.
12906
12907 OLDX is the address as it was before break_out_memory_refs was called.
12908 In some cases it is useful to look at this to decide what needs to be done.
12909
12910 It is always safe for this macro to do nothing. It exists to recognize
12911 opportunities to optimize the output.
12912
12913 For the 80386, we handle X+REG by loading X into a register R and
12914 using R+REG. R will go in a general reg and indexing will be used.
12915 However, if REG is a broken-out memory address or multiplication,
12916 nothing needs to be done because REG can certainly go in a general reg.
12917
12918 When -fpic is used, special handling is needed for symbolic references.
12919 See comments by legitimize_pic_address in i386.c for details. */
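/* For example, (plus (ashift (reg) (const_int 2)) (reg)) is rewritten
below into (plus (mult (reg) (const_int 4)) (reg)) (with the shifted
register forced into a pseudo), which is the scaled-index form that
ix86_legitimate_address_p accepts.  */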
12920
12921 static rtx
12922 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12923 enum machine_mode mode)
12924 {
12925 int changed = 0;
12926 unsigned log;
12927
12928 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12929 if (log)
12930 return legitimize_tls_address (x, (enum tls_model) log, false);
12931 if (GET_CODE (x) == CONST
12932 && GET_CODE (XEXP (x, 0)) == PLUS
12933 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12934 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12935 {
12936 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12937 (enum tls_model) log, false);
12938 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12939 }
12940
12941 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12942 {
12943 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12944 return legitimize_dllimport_symbol (x, true);
12945 if (GET_CODE (x) == CONST
12946 && GET_CODE (XEXP (x, 0)) == PLUS
12947 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12948 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12949 {
12950 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12951 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12952 }
12953 }
12954
12955 if (flag_pic && SYMBOLIC_CONST (x))
12956 return legitimize_pic_address (x, 0);
12957
12958 #if TARGET_MACHO
12959 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12960 return machopic_indirect_data_reference (x, 0);
12961 #endif
12962
12963 /* Canonicalize shifts by 0, 1, 2, 3 into a multiply. */
12964 if (GET_CODE (x) == ASHIFT
12965 && CONST_INT_P (XEXP (x, 1))
12966 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
12967 {
12968 changed = 1;
12969 log = INTVAL (XEXP (x, 1));
12970 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
12971 GEN_INT (1 << log));
12972 }
12973
12974 if (GET_CODE (x) == PLUS)
12975 {
12976 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12977
12978 if (GET_CODE (XEXP (x, 0)) == ASHIFT
12979 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12980 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
12981 {
12982 changed = 1;
12983 log = INTVAL (XEXP (XEXP (x, 0), 1));
12984 XEXP (x, 0) = gen_rtx_MULT (Pmode,
12985 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
12986 GEN_INT (1 << log));
12987 }
12988
12989 if (GET_CODE (XEXP (x, 1)) == ASHIFT
12990 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
12991 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
12992 {
12993 changed = 1;
12994 log = INTVAL (XEXP (XEXP (x, 1), 1));
12995 XEXP (x, 1) = gen_rtx_MULT (Pmode,
12996 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
12997 GEN_INT (1 << log));
12998 }
12999
13000 /* Put multiply first if it isn't already. */
13001 if (GET_CODE (XEXP (x, 1)) == MULT)
13002 {
13003 rtx tmp = XEXP (x, 0);
13004 XEXP (x, 0) = XEXP (x, 1);
13005 XEXP (x, 1) = tmp;
13006 changed = 1;
13007 }
13008
13009 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13010 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13011 created by virtual register instantiation, register elimination, and
13012 similar optimizations. */
13013 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13014 {
13015 changed = 1;
13016 x = gen_rtx_PLUS (Pmode,
13017 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13018 XEXP (XEXP (x, 1), 0)),
13019 XEXP (XEXP (x, 1), 1));
13020 }
13021
13022 /* Canonicalize
13023 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13024 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13025 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13026 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13027 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13028 && CONSTANT_P (XEXP (x, 1)))
13029 {
13030 rtx constant;
13031 rtx other = NULL_RTX;
13032
13033 if (CONST_INT_P (XEXP (x, 1)))
13034 {
13035 constant = XEXP (x, 1);
13036 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13037 }
13038 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13039 {
13040 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13041 other = XEXP (x, 1);
13042 }
13043 else
13044 constant = 0;
13045
13046 if (constant)
13047 {
13048 changed = 1;
13049 x = gen_rtx_PLUS (Pmode,
13050 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13051 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13052 plus_constant (other, INTVAL (constant)));
13053 }
13054 }
13055
13056 if (changed && ix86_legitimate_address_p (mode, x, false))
13057 return x;
13058
13059 if (GET_CODE (XEXP (x, 0)) == MULT)
13060 {
13061 changed = 1;
13062 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13063 }
13064
13065 if (GET_CODE (XEXP (x, 1)) == MULT)
13066 {
13067 changed = 1;
13068 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13069 }
13070
13071 if (changed
13072 && REG_P (XEXP (x, 1))
13073 && REG_P (XEXP (x, 0)))
13074 return x;
13075
13076 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13077 {
13078 changed = 1;
13079 x = legitimize_pic_address (x, 0);
13080 }
13081
13082 if (changed && ix86_legitimate_address_p (mode, x, false))
13083 return x;
13084
13085 if (REG_P (XEXP (x, 0)))
13086 {
13087 rtx temp = gen_reg_rtx (Pmode);
13088 rtx val = force_operand (XEXP (x, 1), temp);
13089 if (val != temp)
13090 {
13091 if (GET_MODE (val) != Pmode)
13092 val = convert_to_mode (Pmode, val, 1);
13093 emit_move_insn (temp, val);
13094 }
13095
13096 XEXP (x, 1) = temp;
13097 return x;
13098 }
13099
13100 else if (REG_P (XEXP (x, 1)))
13101 {
13102 rtx temp = gen_reg_rtx (Pmode);
13103 rtx val = force_operand (XEXP (x, 0), temp);
13104 if (val != temp)
13105 {
13106 if (GET_MODE (val) != Pmode)
13107 val = convert_to_mode (Pmode, val, 1);
13108 emit_move_insn (temp, val);
13109 }
13110
13111 XEXP (x, 0) = temp;
13112 return x;
13113 }
13114 }
13115
13116 return x;
13117 }
13118 \f
13119 /* Print an integer constant expression in assembler syntax. Addition
13120 and subtraction are the only arithmetic that may appear in these
13121 expressions. FILE is the stdio stream to write to, X is the rtx, and
13122 CODE is the operand print code from the output string. */
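/* For example, (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF))
prints as "foo@GOTOFF", and an UNSPEC_GOTNTPOFF reference prints as
"x@gottpoff(%rip)" on 64bit targets; the UNSPEC cases below list the
full set of suffixes.  */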
13123
13124 static void
13125 output_pic_addr_const (FILE *file, rtx x, int code)
13126 {
13127 char buf[256];
13128
13129 switch (GET_CODE (x))
13130 {
13131 case PC:
13132 gcc_assert (flag_pic);
13133 putc ('.', file);
13134 break;
13135
13136 case SYMBOL_REF:
13137 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13138 output_addr_const (file, x);
13139 else
13140 {
13141 const char *name = XSTR (x, 0);
13142
13143 /* Mark the decl as referenced so that cgraph will
13144 output the function. */
13145 if (SYMBOL_REF_DECL (x))
13146 mark_decl_referenced (SYMBOL_REF_DECL (x));
13147
13148 #if TARGET_MACHO
13149 if (MACHOPIC_INDIRECT
13150 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13151 name = machopic_indirection_name (x, /*stub_p=*/true);
13152 #endif
13153 assemble_name (file, name);
13154 }
13155 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13156 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13157 fputs ("@PLT", file);
13158 break;
13159
13160 case LABEL_REF:
13161 x = XEXP (x, 0);
13162 /* FALLTHRU */
13163 case CODE_LABEL:
13164 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13165 assemble_name (asm_out_file, buf);
13166 break;
13167
13168 case CONST_INT:
13169 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13170 break;
13171
13172 case CONST:
13173 /* This used to output parentheses around the expression,
13174 but that does not work on the 386 (either ATT or BSD assembler). */
13175 output_pic_addr_const (file, XEXP (x, 0), code);
13176 break;
13177
13178 case CONST_DOUBLE:
13179 if (GET_MODE (x) == VOIDmode)
13180 {
13181 /* We can use %d if the number is <32 bits and positive. */
13182 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13183 fprintf (file, "0x%lx%08lx",
13184 (unsigned long) CONST_DOUBLE_HIGH (x),
13185 (unsigned long) CONST_DOUBLE_LOW (x));
13186 else
13187 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13188 }
13189 else
13190 /* We can't handle floating point constants;
13191 TARGET_PRINT_OPERAND must handle them. */
13192 output_operand_lossage ("floating constant misused");
13193 break;
13194
13195 case PLUS:
13196 /* Some assemblers need integer constants to appear first. */
13197 if (CONST_INT_P (XEXP (x, 0)))
13198 {
13199 output_pic_addr_const (file, XEXP (x, 0), code);
13200 putc ('+', file);
13201 output_pic_addr_const (file, XEXP (x, 1), code);
13202 }
13203 else
13204 {
13205 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13206 output_pic_addr_const (file, XEXP (x, 1), code);
13207 putc ('+', file);
13208 output_pic_addr_const (file, XEXP (x, 0), code);
13209 }
13210 break;
13211
13212 case MINUS:
13213 if (!TARGET_MACHO)
13214 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13215 output_pic_addr_const (file, XEXP (x, 0), code);
13216 putc ('-', file);
13217 output_pic_addr_const (file, XEXP (x, 1), code);
13218 if (!TARGET_MACHO)
13219 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13220 break;
13221
13222 case UNSPEC:
13223 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13224 {
13225 bool f = i386_asm_output_addr_const_extra (file, x);
13226 gcc_assert (f);
13227 break;
13228 }
13229
13230 gcc_assert (XVECLEN (x, 0) == 1);
13231 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13232 switch (XINT (x, 1))
13233 {
13234 case UNSPEC_GOT:
13235 fputs ("@GOT", file);
13236 break;
13237 case UNSPEC_GOTOFF:
13238 fputs ("@GOTOFF", file);
13239 break;
13240 case UNSPEC_PLTOFF:
13241 fputs ("@PLTOFF", file);
13242 break;
13243 case UNSPEC_PCREL:
13244 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13245 "(%rip)" : "[rip]", file);
13246 break;
13247 case UNSPEC_GOTPCREL:
13248 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13249 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13250 break;
13251 case UNSPEC_GOTTPOFF:
13252 /* FIXME: This might be @TPOFF in Sun ld too. */
13253 fputs ("@gottpoff", file);
13254 break;
13255 case UNSPEC_TPOFF:
13256 fputs ("@tpoff", file);
13257 break;
13258 case UNSPEC_NTPOFF:
13259 if (TARGET_64BIT)
13260 fputs ("@tpoff", file);
13261 else
13262 fputs ("@ntpoff", file);
13263 break;
13264 case UNSPEC_DTPOFF:
13265 fputs ("@dtpoff", file);
13266 break;
13267 case UNSPEC_GOTNTPOFF:
13268 if (TARGET_64BIT)
13269 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13270 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13271 else
13272 fputs ("@gotntpoff", file);
13273 break;
13274 case UNSPEC_INDNTPOFF:
13275 fputs ("@indntpoff", file);
13276 break;
13277 #if TARGET_MACHO
13278 case UNSPEC_MACHOPIC_OFFSET:
13279 putc ('-', file);
13280 machopic_output_function_base_name (file);
13281 break;
13282 #endif
13283 default:
13284 output_operand_lossage ("invalid UNSPEC as operand");
13285 break;
13286 }
13287 break;
13288
13289 default:
13290 output_operand_lossage ("invalid expression as operand");
13291 }
13292 }
13293
13294 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13295 We need to emit DTP-relative relocations. */
13296
13297 static void ATTRIBUTE_UNUSED
13298 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13299 {
13300 fputs (ASM_LONG, file);
13301 output_addr_const (file, x);
13302 fputs ("@dtpoff", file);
13303 switch (size)
13304 {
13305 case 4:
13306 break;
13307 case 8:
13308 fputs (", 0", file);
13309 break;
13310 default:
13311 gcc_unreachable ();
13312 }
13313 }
13314
13315 /* Return true if X is a representation of the PIC register. This copes
13316 with calls from ix86_find_base_term, where the register might have
13317 been replaced by a cselib value. */
13318
13319 static bool
13320 ix86_pic_register_p (rtx x)
13321 {
13322 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13323 return (pic_offset_table_rtx
13324 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13325 else
13326 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13327 }
13328
13329 /* Helper function for ix86_delegitimize_address.
13330 Attempt to delegitimize TLS local-exec accesses. */
13331
13332 static rtx
13333 ix86_delegitimize_tls_address (rtx orig_x)
13334 {
13335 rtx x = orig_x, unspec;
13336 struct ix86_address addr;
13337
13338 if (!TARGET_TLS_DIRECT_SEG_REFS)
13339 return orig_x;
13340 if (MEM_P (x))
13341 x = XEXP (x, 0);
13342 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13343 return orig_x;
13344 if (ix86_decompose_address (x, &addr) == 0
13345 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13346 || addr.disp == NULL_RTX
13347 || GET_CODE (addr.disp) != CONST)
13348 return orig_x;
13349 unspec = XEXP (addr.disp, 0);
13350 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13351 unspec = XEXP (unspec, 0);
13352 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13353 return orig_x;
13354 x = XVECEXP (unspec, 0, 0);
13355 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13356 if (unspec != XEXP (addr.disp, 0))
13357 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13358 if (addr.index)
13359 {
13360 rtx idx = addr.index;
13361 if (addr.scale != 1)
13362 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13363 x = gen_rtx_PLUS (Pmode, idx, x);
13364 }
13365 if (addr.base)
13366 x = gen_rtx_PLUS (Pmode, addr.base, x);
13367 if (MEM_P (orig_x))
13368 x = replace_equiv_address_nv (orig_x, x);
13369 return x;
13370 }
13371
13372 /* In the name of slightly smaller debug output, and to cater to
13373 general assembler lossage, recognize PIC+GOTOFF and turn it back
13374 into a direct symbol reference.
13375
13376 On Darwin, this is necessary to avoid a crash, because Darwin
13377 has a different PIC label for each routine but the DWARF debugging
13378 information is not associated with any particular routine, so it's
13379 necessary to remove references to the PIC label from RTL stored by
13380 the DWARF output code. */
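/* For example (32bit, non-MEM address, a sketch):
(plus (reg %ebx) (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)))
is turned back into plain (symbol_ref "foo").  */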
13381
13382 static rtx
13383 ix86_delegitimize_address (rtx x)
13384 {
13385 rtx orig_x = delegitimize_mem_from_attrs (x);
13386 /* addend is NULL or some rtx if x is something+GOTOFF where
13387 something doesn't include the PIC register. */
13388 rtx addend = NULL_RTX;
13389 /* reg_addend is NULL or a multiple of some register. */
13390 rtx reg_addend = NULL_RTX;
13391 /* const_addend is NULL or a const_int. */
13392 rtx const_addend = NULL_RTX;
13393 /* This is the result, or NULL. */
13394 rtx result = NULL_RTX;
13395
13396 x = orig_x;
13397
13398 if (MEM_P (x))
13399 x = XEXP (x, 0);
13400
13401 if (TARGET_64BIT)
13402 {
13403 if (GET_CODE (x) == CONST
13404 && GET_CODE (XEXP (x, 0)) == PLUS
13405 && GET_MODE (XEXP (x, 0)) == Pmode
13406 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13407 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13408 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13409 {
13410 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13411 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13412 if (MEM_P (orig_x))
13413 x = replace_equiv_address_nv (orig_x, x);
13414 return x;
13415 }
13416 if (GET_CODE (x) != CONST
13417 || GET_CODE (XEXP (x, 0)) != UNSPEC
13418 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13419 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13420 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13421 return ix86_delegitimize_tls_address (orig_x);
13422 x = XVECEXP (XEXP (x, 0), 0, 0);
13423 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13424 {
13425 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13426 GET_MODE (x), 0);
13427 if (x == NULL_RTX)
13428 return orig_x;
13429 }
13430 return x;
13431 }
13432
13433 if (GET_CODE (x) != PLUS
13434 || GET_CODE (XEXP (x, 1)) != CONST)
13435 return ix86_delegitimize_tls_address (orig_x);
13436
13437 if (ix86_pic_register_p (XEXP (x, 0)))
13438 /* %ebx + GOT/GOTOFF */
13439 ;
13440 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13441 {
13442 /* %ebx + %reg * scale + GOT/GOTOFF */
13443 reg_addend = XEXP (x, 0);
13444 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13445 reg_addend = XEXP (reg_addend, 1);
13446 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13447 reg_addend = XEXP (reg_addend, 0);
13448 else
13449 {
13450 reg_addend = NULL_RTX;
13451 addend = XEXP (x, 0);
13452 }
13453 }
13454 else
13455 addend = XEXP (x, 0);
13456
13457 x = XEXP (XEXP (x, 1), 0);
13458 if (GET_CODE (x) == PLUS
13459 && CONST_INT_P (XEXP (x, 1)))
13460 {
13461 const_addend = XEXP (x, 1);
13462 x = XEXP (x, 0);
13463 }
13464
13465 if (GET_CODE (x) == UNSPEC
13466 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13467 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13468 result = XVECEXP (x, 0, 0);
13469
13470 if (TARGET_MACHO && darwin_local_data_pic (x)
13471 && !MEM_P (orig_x))
13472 result = XVECEXP (x, 0, 0);
13473
13474 if (! result)
13475 return ix86_delegitimize_tls_address (orig_x);
13476
13477 if (const_addend)
13478 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13479 if (reg_addend)
13480 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13481 if (addend)
13482 {
13483 /* If the rest of original X doesn't involve the PIC register, add
13484 addend and subtract pic_offset_table_rtx. This can happen e.g.
13485 for code like:
13486 leal (%ebx, %ecx, 4), %ecx
13487 ...
13488 movl foo@GOTOFF(%ecx), %edx
13489 in which case we return (%ecx - %ebx) + foo. */
13490 if (pic_offset_table_rtx)
13491 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13492 pic_offset_table_rtx),
13493 result);
13494 else
13495 return orig_x;
13496 }
13497 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13498 {
13499 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13500 if (result == NULL_RTX)
13501 return orig_x;
13502 }
13503 return result;
13504 }
13505
13506 /* If X is a machine specific address (i.e. a symbol or label being
13507 referenced as a displacement from the GOT implemented using an
13508 UNSPEC), then return the base term. Otherwise return X. */
13509
13510 rtx
13511 ix86_find_base_term (rtx x)
13512 {
13513 rtx term;
13514
13515 if (TARGET_64BIT)
13516 {
13517 if (GET_CODE (x) != CONST)
13518 return x;
13519 term = XEXP (x, 0);
13520 if (GET_CODE (term) == PLUS
13521 && (CONST_INT_P (XEXP (term, 1))
13522 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13523 term = XEXP (term, 0);
13524 if (GET_CODE (term) != UNSPEC
13525 || (XINT (term, 1) != UNSPEC_GOTPCREL
13526 && XINT (term, 1) != UNSPEC_PCREL))
13527 return x;
13528
13529 return XVECEXP (term, 0, 0);
13530 }
13531
13532 return ix86_delegitimize_address (x);
13533 }
13534 \f
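/* Print to FILE the condition-code suffix (e.g. "e", "ne", "b") for
comparison CODE in CC mode MODE. REVERSE nonzero prints the reversed
condition; FP nonzero selects the spellings usable with fcmov, such
as "nbe" instead of "a".  */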
13535 static void
13536 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13537 int fp, FILE *file)
13538 {
13539 const char *suffix;
13540
13541 if (mode == CCFPmode || mode == CCFPUmode)
13542 {
13543 code = ix86_fp_compare_code_to_integer (code);
13544 mode = CCmode;
13545 }
13546 if (reverse)
13547 code = reverse_condition (code);
13548
13549 switch (code)
13550 {
13551 case EQ:
13552 switch (mode)
13553 {
13554 case CCAmode:
13555 suffix = "a";
13556 break;
13557
13558 case CCCmode:
13559 suffix = "c";
13560 break;
13561
13562 case CCOmode:
13563 suffix = "o";
13564 break;
13565
13566 case CCSmode:
13567 suffix = "s";
13568 break;
13569
13570 default:
13571 suffix = "e";
13572 }
13573 break;
13574 case NE:
13575 switch (mode)
13576 {
13577 case CCAmode:
13578 suffix = "na";
13579 break;
13580
13581 case CCCmode:
13582 suffix = "nc";
13583 break;
13584
13585 case CCOmode:
13586 suffix = "no";
13587 break;
13588
13589 case CCSmode:
13590 suffix = "ns";
13591 break;
13592
13593 default:
13594 suffix = "ne";
13595 }
13596 break;
13597 case GT:
13598 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13599 suffix = "g";
13600 break;
13601 case GTU:
13602 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13603 Those same assemblers have the same but opposite lossage on cmov. */
13604 if (mode == CCmode)
13605 suffix = fp ? "nbe" : "a";
13606 else if (mode == CCCmode)
13607 suffix = "b";
13608 else
13609 gcc_unreachable ();
13610 break;
13611 case LT:
13612 switch (mode)
13613 {
13614 case CCNOmode:
13615 case CCGOCmode:
13616 suffix = "s";
13617 break;
13618
13619 case CCmode:
13620 case CCGCmode:
13621 suffix = "l";
13622 break;
13623
13624 default:
13625 gcc_unreachable ();
13626 }
13627 break;
13628 case LTU:
13629 gcc_assert (mode == CCmode || mode == CCCmode);
13630 suffix = "b";
13631 break;
13632 case GE:
13633 switch (mode)
13634 {
13635 case CCNOmode:
13636 case CCGOCmode:
13637 suffix = "ns";
13638 break;
13639
13640 case CCmode:
13641 case CCGCmode:
13642 suffix = "ge";
13643 break;
13644
13645 default:
13646 gcc_unreachable ();
13647 }
13648 break;
13649 case GEU:
13650 /* ??? As above. */
13651 gcc_assert (mode == CCmode || mode == CCCmode);
13652 suffix = fp ? "nb" : "ae";
13653 break;
13654 case LE:
13655 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13656 suffix = "le";
13657 break;
13658 case LEU:
13659 /* ??? As above. */
13660 if (mode == CCmode)
13661 suffix = "be";
13662 else if (mode == CCCmode)
13663 suffix = fp ? "nb" : "ae";
13664 else
13665 gcc_unreachable ();
13666 break;
13667 case UNORDERED:
13668 suffix = fp ? "u" : "p";
13669 break;
13670 case ORDERED:
13671 suffix = fp ? "nu" : "np";
13672 break;
13673 default:
13674 gcc_unreachable ();
13675 }
13676 fputs (suffix, file);
13677 }
13678
13679 /* Print the name of register X to FILE based on its machine mode and number.
13680 If CODE is 'w', pretend the mode is HImode.
13681 If CODE is 'b', pretend the mode is QImode.
13682 If CODE is 'k', pretend the mode is SImode.
13683 If CODE is 'q', pretend the mode is DImode.
13684 If CODE is 'x', pretend the mode is V4SFmode.
13685 If CODE is 't', pretend the mode is V8SFmode.
13686 If CODE is 'h', pretend the reg is the 'high' byte register.
13687 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
13688 If CODE is 'd', duplicate the operand for an AVX instruction.
13689 */
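/* For example, in AT&T syntax with X being hard register xmm3:
code 'x' prints "%xmm3", code 't' prints "%ymm3", and code 'd' with
AVX enabled prints "%xmm3, %xmm3", duplicating the operand.  */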
13690
13691 void
13692 print_reg (rtx x, int code, FILE *file)
13693 {
13694 const char *reg;
13695 bool duplicated = code == 'd' && TARGET_AVX;
13696
13697 gcc_assert (x == pc_rtx
13698 || (REGNO (x) != ARG_POINTER_REGNUM
13699 && REGNO (x) != FRAME_POINTER_REGNUM
13700 && REGNO (x) != FLAGS_REG
13701 && REGNO (x) != FPSR_REG
13702 && REGNO (x) != FPCR_REG));
13703
13704 if (ASSEMBLER_DIALECT == ASM_ATT)
13705 putc ('%', file);
13706
13707 if (x == pc_rtx)
13708 {
13709 gcc_assert (TARGET_64BIT);
13710 fputs ("rip", file);
13711 return;
13712 }
13713
13714 if (code == 'w' || MMX_REG_P (x))
13715 code = 2;
13716 else if (code == 'b')
13717 code = 1;
13718 else if (code == 'k')
13719 code = 4;
13720 else if (code == 'q')
13721 code = 8;
13722 else if (code == 'y')
13723 code = 3;
13724 else if (code == 'h')
13725 code = 0;
13726 else if (code == 'x')
13727 code = 16;
13728 else if (code == 't')
13729 code = 32;
13730 else
13731 code = GET_MODE_SIZE (GET_MODE (x));
13732
13733 /* Irritatingly, AMD extended registers use a different naming convention
13734 from the normal registers: "r%d[bwd]". */
13735 if (REX_INT_REG_P (x))
13736 {
13737 gcc_assert (TARGET_64BIT);
13738 putc ('r', file);
13739 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
13740 switch (code)
13741 {
13742 case 0:
13743 error ("extended registers have no high halves");
13744 break;
13745 case 1:
13746 putc ('b', file);
13747 break;
13748 case 2:
13749 putc ('w', file);
13750 break;
13751 case 4:
13752 putc ('d', file);
13753 break;
13754 case 8:
13755 /* no suffix */
13756 break;
13757 default:
13758 error ("unsupported operand size for extended register");
13759 break;
13760 }
13761 return;
13762 }
13763
13764 reg = NULL;
13765 switch (code)
13766 {
13767 case 3:
13768 if (STACK_TOP_P (x))
13769 {
13770 reg = "st(0)";
13771 break;
13772 }
13773 /* FALLTHRU */
13774 case 8:
13775 case 4:
13776 case 12:
13777 if (! ANY_FP_REG_P (x))
13778 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13779 /* FALLTHRU */
13780 case 16:
13781 case 2:
13782 normal:
13783 reg = hi_reg_name[REGNO (x)];
13784 break;
13785 case 1:
13786 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13787 goto normal;
13788 reg = qi_reg_name[REGNO (x)];
13789 break;
13790 case 0:
13791 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13792 goto normal;
13793 reg = qi_high_reg_name[REGNO (x)];
13794 break;
13795 case 32:
13796 if (SSE_REG_P (x))
13797 {
13798 gcc_assert (!duplicated);
13799 putc ('y', file);
13800 fputs (hi_reg_name[REGNO (x)] + 1, file);
13801 return;
13802 }
13803 break;
13804 default:
13805 gcc_unreachable ();
13806 }
13807
13808 fputs (reg, file);
13809 if (duplicated)
13810 {
13811 if (ASSEMBLER_DIALECT == ASM_ATT)
13812 fprintf (file, ", %%%s", reg);
13813 else
13814 fprintf (file, ", %s", reg);
13815 }
13816 }
13817
13818 /* Locate some local-dynamic symbol still in use by this function
13819 so that we can print its name in some tls_local_dynamic_base
13820 pattern. */
13821
13822 static int
13823 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13824 {
13825 rtx x = *px;
13826
13827 if (GET_CODE (x) == SYMBOL_REF
13828 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13829 {
13830 cfun->machine->some_ld_name = XSTR (x, 0);
13831 return 1;
13832 }
13833
13834 return 0;
13835 }
13836
13837 static const char *
13838 get_some_local_dynamic_name (void)
13839 {
13840 rtx insn;
13841
13842 if (cfun->machine->some_ld_name)
13843 return cfun->machine->some_ld_name;
13844
13845 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13846 if (NONDEBUG_INSN_P (insn)
13847 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13848 return cfun->machine->some_ld_name;
13849
13850 return NULL;
13851 }
13852
13853 /* Meaning of CODE:
13854 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13855 C -- print opcode suffix for set/cmov insn.
13856 c -- like C, but print reversed condition
13857 F,f -- likewise, but for floating-point.
13858 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13859 otherwise nothing
13860 R -- print the prefix for register names.
13861 z -- print the opcode suffix for the size of the current operand.
13862 Z -- likewise, with special suffixes for x87 instructions.
13863 * -- print a star (in certain assembler syntax)
13864 A -- print an absolute memory reference.
13865 E -- print address with DImode register names if TARGET_64BIT.
13866 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13867 s -- print a shift double count, followed by the assembler's argument
13868 delimiter.
13869 b -- print the QImode name of the register for the indicated operand.
13870 %b0 would print %al if operands[0] is reg 0.
13871 w -- likewise, print the HImode name of the register.
13872 k -- likewise, print the SImode name of the register.
13873 q -- likewise, print the DImode name of the register.
13874 x -- likewise, print the V4SFmode name of the register.
13875 t -- likewise, print the V8SFmode name of the register.
13876 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13877 y -- print "st(0)" instead of "st" as a register.
13878 d -- print duplicated register operand for AVX instruction.
13879 D -- print condition for SSE cmp instruction.
13880 P -- if PIC, print an @PLT suffix.
13881 p -- print raw symbol name.
13882 X -- don't print any sort of PIC '@' suffix for a symbol.
13883 & -- print some in-use local-dynamic symbol name.
13884 H -- print a memory address offset by 8; used for sse high-parts
13885 Y -- print condition for XOP pcom* instruction.
13886 + -- print a branch hint as 'cs' or 'ds' prefix
13887 ; -- print a semicolon (after prefixes due to bug in older gas).
13888 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13889 @ -- print a segment register of thread base pointer load
13890 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
13891 */
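/* For example, a (hypothetical) output template such as
"mov%z0\t{%1, %0|%0, %1}" uses 'z' to emit the size suffix taken from
operand 0, while the {att|intel} brace construct selects between
AT&T and Intel operand order.  */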
13892
13893 void
13894 ix86_print_operand (FILE *file, rtx x, int code)
13895 {
13896 if (code)
13897 {
13898 switch (code)
13899 {
13900 case '*':
13901 if (ASSEMBLER_DIALECT == ASM_ATT)
13902 putc ('*', file);
13903 return;
13904
13905 case '&':
13906 {
13907 const char *name = get_some_local_dynamic_name ();
13908 if (name == NULL)
13909 output_operand_lossage ("'%%&' used without any "
13910 "local dynamic TLS references");
13911 else
13912 assemble_name (file, name);
13913 return;
13914 }
13915
13916 case 'A':
13917 switch (ASSEMBLER_DIALECT)
13918 {
13919 case ASM_ATT:
13920 putc ('*', file);
13921 break;
13922
13923 case ASM_INTEL:
13924 /* Intel syntax. For absolute addresses, registers should not
13925 be surrounded by brackets. */
13926 if (!REG_P (x))
13927 {
13928 putc ('[', file);
13929 ix86_print_operand (file, x, 0);
13930 putc (']', file);
13931 return;
13932 }
13933 break;
13934
13935 default:
13936 gcc_unreachable ();
13937 }
13938
13939 ix86_print_operand (file, x, 0);
13940 return;
13941
13942 case 'E':
13943 /* Wrap address in an UNSPEC to declare special handling. */
13944 if (TARGET_64BIT)
13945 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
13946
13947 output_address (x);
13948 return;
13949
13950 case 'L':
13951 if (ASSEMBLER_DIALECT == ASM_ATT)
13952 putc ('l', file);
13953 return;
13954
13955 case 'W':
13956 if (ASSEMBLER_DIALECT == ASM_ATT)
13957 putc ('w', file);
13958 return;
13959
13960 case 'B':
13961 if (ASSEMBLER_DIALECT == ASM_ATT)
13962 putc ('b', file);
13963 return;
13964
13965 case 'Q':
13966 if (ASSEMBLER_DIALECT == ASM_ATT)
13967 putc ('l', file);
13968 return;
13969
13970 case 'S':
13971 if (ASSEMBLER_DIALECT == ASM_ATT)
13972 putc ('s', file);
13973 return;
13974
13975 case 'T':
13976 if (ASSEMBLER_DIALECT == ASM_ATT)
13977 putc ('t', file);
13978 return;
13979
13980 case 'z':
13981 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13982 {
13983 /* Opcodes don't get size suffixes if using Intel opcodes. */
13984 if (ASSEMBLER_DIALECT == ASM_INTEL)
13985 return;
13986
13987 switch (GET_MODE_SIZE (GET_MODE (x)))
13988 {
13989 case 1:
13990 putc ('b', file);
13991 return;
13992
13993 case 2:
13994 putc ('w', file);
13995 return;
13996
13997 case 4:
13998 putc ('l', file);
13999 return;
14000
14001 case 8:
14002 putc ('q', file);
14003 return;
14004
14005 default:
14006 output_operand_lossage
14007 ("invalid operand size for operand code '%c'", code);
14008 return;
14009 }
14010 }
14011
14012 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14013 warning
14014 (0, "non-integer operand used with operand code '%c'", code);
14015 /* FALLTHRU */
14016
14017 case 'Z':
14018 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
14019 if (ASSEMBLER_DIALECT == ASM_INTEL)
14020 return;
14021
14022 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14023 {
14024 switch (GET_MODE_SIZE (GET_MODE (x)))
14025 {
14026 case 2:
14027 #ifdef HAVE_AS_IX86_FILDS
14028 putc ('s', file);
14029 #endif
14030 return;
14031
14032 case 4:
14033 putc ('l', file);
14034 return;
14035
14036 case 8:
14037 #ifdef HAVE_AS_IX86_FILDQ
14038 putc ('q', file);
14039 #else
14040 fputs ("ll", file);
14041 #endif
14042 return;
14043
14044 default:
14045 break;
14046 }
14047 }
14048 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14049 {
14050 /* 387 opcodes don't get size suffixes
14051 if the operands are registers. */
14052 if (STACK_REG_P (x))
14053 return;
14054
14055 switch (GET_MODE_SIZE (GET_MODE (x)))
14056 {
14057 case 4:
14058 putc ('s', file);
14059 return;
14060
14061 case 8:
14062 putc ('l', file);
14063 return;
14064
14065 case 12:
14066 case 16:
14067 putc ('t', file);
14068 return;
14069
14070 default:
14071 break;
14072 }
14073 }
14074 else
14075 {
14076 output_operand_lossage
14077 ("invalid operand type used with operand code '%c'", code);
14078 return;
14079 }
14080
14081 output_operand_lossage
14082 ("invalid operand size for operand code '%c'", code);
14083 return;
14084
14085 case 'd':
14086 case 'b':
14087 case 'w':
14088 case 'k':
14089 case 'q':
14090 case 'h':
14091 case 't':
14092 case 'y':
14093 case 'x':
14094 case 'X':
14095 case 'P':
14096 case 'p':
14097 break;
14098
14099 case 's':
14100 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14101 {
14102 ix86_print_operand (file, x, 0);
14103 fputs (", ", file);
14104 }
14105 return;
14106
14107 case 'D':
14108 /* Little bit of braindamage here. The SSE compare instructions
14109 use completely different names for the comparisons than the
14110 fp conditional moves do. */
14111 if (TARGET_AVX)
14112 {
14113 switch (GET_CODE (x))
14114 {
14115 case EQ:
14116 fputs ("eq", file);
14117 break;
14118 case UNEQ:
14119 fputs ("eq_us", file);
14120 break;
14121 case LT:
14122 fputs ("lt", file);
14123 break;
14124 case UNLT:
14125 fputs ("nge", file);
14126 break;
14127 case LE:
14128 fputs ("le", file);
14129 break;
14130 case UNLE:
14131 fputs ("ngt", file);
14132 break;
14133 case UNORDERED:
14134 fputs ("unord", file);
14135 break;
14136 case NE:
14137 fputs ("neq", file);
14138 break;
14139 case LTGT:
14140 fputs ("neq_oq", file);
14141 break;
14142 case GE:
14143 fputs ("ge", file);
14144 break;
14145 case UNGE:
14146 fputs ("nlt", file);
14147 break;
14148 case GT:
14149 fputs ("gt", file);
14150 break;
14151 case UNGT:
14152 fputs ("nle", file);
14153 break;
14154 case ORDERED:
14155 fputs ("ord", file);
14156 break;
14157 default:
14158 output_operand_lossage ("operand is not a condition code, "
14159 "invalid operand code 'D'");
14160 return;
14161 }
14162 }
14163 else
14164 {
14165 switch (GET_CODE (x))
14166 {
14167 case EQ:
14168 case UNEQ:
14169 fputs ("eq", file);
14170 break;
14171 case LT:
14172 case UNLT:
14173 fputs ("lt", file);
14174 break;
14175 case LE:
14176 case UNLE:
14177 fputs ("le", file);
14178 break;
14179 case UNORDERED:
14180 fputs ("unord", file);
14181 break;
14182 case NE:
14183 case LTGT:
14184 fputs ("neq", file);
14185 break;
14186 case UNGE:
14187 case GE:
14188 fputs ("nlt", file);
14189 break;
14190 case UNGT:
14191 case GT:
14192 fputs ("nle", file);
14193 break;
14194 case ORDERED:
14195 fputs ("ord", file);
14196 break;
14197 default:
14198 output_operand_lossage ("operand is not a condition code, "
14199 "invalid operand code 'D'");
14200 return;
14201 }
14202 }
14203 return;
14204 case 'O':
14205 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14206 if (ASSEMBLER_DIALECT == ASM_ATT)
14207 {
14208 switch (GET_MODE (x))
14209 {
14210 case HImode: putc ('w', file); break;
14211 case SImode:
14212 case SFmode: putc ('l', file); break;
14213 case DImode:
14214 case DFmode: putc ('q', file); break;
14215 default: gcc_unreachable ();
14216 }
14217 putc ('.', file);
14218 }
14219 #endif
14220 return;
14221 case 'C':
14222 if (!COMPARISON_P (x))
14223 {
14224 output_operand_lossage ("operand is neither a constant nor a "
14225 "condition code, invalid operand code "
14226 "'C'");
14227 return;
14228 }
14229 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
14230 return;
14231 case 'F':
14232 if (!COMPARISON_P (x))
14233 {
14234 output_operand_lossage ("operand is neither a constant nor a "
14235 "condition code, invalid operand code "
14236 "'F'");
14237 return;
14238 }
14239 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14240 if (ASSEMBLER_DIALECT == ASM_ATT)
14241 putc ('.', file);
14242 #endif
14243 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
14244 return;
14245
14246 /* Like above, but reverse condition */
14247 case 'c':
14248 /* Check to see if argument to %c is really a constant
14249 and not a condition code which needs to be reversed. */
14250 if (!COMPARISON_P (x))
14251 {
14252 output_operand_lossage ("operand is neither a constant nor a "
14253 "condition code, invalid operand "
14254 "code 'c'");
14255 return;
14256 }
14257 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
14258 return;
14259 case 'f':
14260 if (!COMPARISON_P (x))
14261 {
14262 output_operand_lossage ("operand is neither a constant nor a "
14263 "condition code, invalid operand "
14264 "code 'f'");
14265 return;
14266 }
14267 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14268 if (ASSEMBLER_DIALECT == ASM_ATT)
14269 putc ('.', file);
14270 #endif
14271 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
14272 return;
14273
14274 case 'H':
14275 if (!offsettable_memref_p (x))
14276 {
14277 output_operand_lossage ("operand is not an offsettable memory "
14278 "reference, invalid operand "
14279 "code 'H'");
14280 return;
14281 }
14282 /* It doesn't actually matter what mode we use here, as we're
14283 only going to use this for printing. */
14284 x = adjust_address_nv (x, DImode, 8);
14285 break;
14286
14287 case '+':
14288 {
14289 rtx x;
14290
14291 if (!optimize
14292 || optimize_function_for_size_p (cfun)
14293 || !TARGET_BRANCH_PREDICTION_HINTS)
14294 return;
14295
14296 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14297 if (x)
14298 {
14299 int pred_val = INTVAL (XEXP (x, 0));
14300
14301 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14302 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14303 {
14304 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14305 bool cputaken
14306 = final_forward_branch_p (current_output_insn) == 0;
14307
14308 /* Emit hints only in the case default branch prediction
14309 heuristics would fail. */
14310 if (taken != cputaken)
14311 {
14312 /* We use 3e (DS) prefix for taken branches and
14313 2e (CS) prefix for not taken branches. */
14314 if (taken)
14315 fputs ("ds ; ", file);
14316 else
14317 fputs ("cs ; ", file);
14318 }
14319 }
14320 }
14321 return;
14322 }
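      /* Hedged illustration of the '+' operand code above (the label name
	 is hypothetical; only the prefixes come from this code): when the
	 recorded REG_BR_PROB disagrees with the forward/backward heuristic,
	 the jump is printed roughly as

	     ds ; jne .L3      (0x3e prefix: predict taken)
	     cs ; jne .L3      (0x2e prefix: predict not taken)  */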
14323
14324 case 'Y':
14325 switch (GET_CODE (x))
14326 {
14327 case NE:
14328 fputs ("neq", file);
14329 break;
14330 case EQ:
14331 fputs ("eq", file);
14332 break;
14333 case GE:
14334 case GEU:
14335 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14336 break;
14337 case GT:
14338 case GTU:
14339 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14340 break;
14341 case LE:
14342 case LEU:
14343 fputs ("le", file);
14344 break;
14345 case LT:
14346 case LTU:
14347 fputs ("lt", file);
14348 break;
14349 case UNORDERED:
14350 fputs ("unord", file);
14351 break;
14352 case ORDERED:
14353 fputs ("ord", file);
14354 break;
14355 case UNEQ:
14356 fputs ("ueq", file);
14357 break;
14358 case UNGE:
14359 fputs ("nlt", file);
14360 break;
14361 case UNGT:
14362 fputs ("nle", file);
14363 break;
14364 case UNLE:
14365 fputs ("ule", file);
14366 break;
14367 case UNLT:
14368 fputs ("ult", file);
14369 break;
14370 case LTGT:
14371 fputs ("une", file);
14372 break;
14373 default:
14374 output_operand_lossage ("operand is not a condition code, "
14375 "invalid operand code 'Y'");
14376 return;
14377 }
14378 return;
14379
14380 case ';':
14381 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14382 putc (';', file);
14383 #endif
14384 return;
14385
14386 case '@':
14387 if (ASSEMBLER_DIALECT == ASM_ATT)
14388 putc ('%', file);
14389
14390 /* The kernel uses a different segment register for performance
14391 reasons; that way a system call does not have to trash the userspace
14392 segment register, which would be expensive. */
14393 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14394 fputs ("fs", file);
14395 else
14396 fputs ("gs", file);
14397 return;
14398
14399 case '~':
14400 putc (TARGET_AVX2 ? 'i' : 'f', file);
14401 return;
14402
14403 case '^':
14404 if (TARGET_64BIT && Pmode != word_mode)
14405 fputs ("addr32 ", file);
14406 return;
14407
14408 default:
14409 output_operand_lossage ("invalid operand code '%c'", code);
14410 }
14411 }
14412
14413 if (REG_P (x))
14414 print_reg (x, code, file);
14415
14416 else if (MEM_P (x))
14417 {
14418 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14419 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14420 && GET_MODE (x) != BLKmode)
14421 {
14422 const char * size;
14423 switch (GET_MODE_SIZE (GET_MODE (x)))
14424 {
14425 case 1: size = "BYTE"; break;
14426 case 2: size = "WORD"; break;
14427 case 4: size = "DWORD"; break;
14428 case 8: size = "QWORD"; break;
14429 case 12: size = "TBYTE"; break;
14430 case 16:
14431 if (GET_MODE (x) == XFmode)
14432 size = "TBYTE";
14433 else
14434 size = "XMMWORD";
14435 break;
14436 case 32: size = "YMMWORD"; break;
14437 default:
14438 gcc_unreachable ();
14439 }
14440
14441 /* Check for explicit size override (codes 'b', 'w', 'k',
14442 'q' and 'x') */
14443 if (code == 'b')
14444 size = "BYTE";
14445 else if (code == 'w')
14446 size = "WORD";
14447 else if (code == 'k')
14448 size = "DWORD";
14449 else if (code == 'q')
14450 size = "QWORD";
14451 else if (code == 'x')
14452 size = "XMMWORD";
14453
14454 fputs (size, file);
14455 fputs (" PTR ", file);
14456 }
14457
14458 x = XEXP (x, 0);
14459 /* Avoid (%rip) for call operands. */
14460 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14461 && !CONST_INT_P (x))
14462 output_addr_const (file, x);
14463 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14464 output_operand_lossage ("invalid constraints for operand");
14465 else
14466 output_address (x);
14467 }
14468
14469 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14470 {
14471 REAL_VALUE_TYPE r;
14472 long l;
14473
14474 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14475 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14476
14477 if (ASSEMBLER_DIALECT == ASM_ATT)
14478 putc ('$', file);
14479 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14480 if (code == 'q')
14481 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14482 else
14483 fprintf (file, "0x%08x", (unsigned int) l);
14484 }
14485
14486 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14487 {
14488 REAL_VALUE_TYPE r;
14489 long l[2];
14490
14491 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14492 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14493
14494 if (ASSEMBLER_DIALECT == ASM_ATT)
14495 putc ('$', file);
14496 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14497 }
14498
14499 /* These float cases don't actually occur as immediate operands. */
14500 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14501 {
14502 char dstr[30];
14503
14504 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14505 fputs (dstr, file);
14506 }
14507
14508 else
14509 {
14510 /* We have patterns that allow zero sets of memory, for instance.
14511 In 64-bit mode, we should probably support all 8-byte vectors,
14512 since we can in fact encode that into an immediate. */
14513 if (GET_CODE (x) == CONST_VECTOR)
14514 {
14515 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14516 x = const0_rtx;
14517 }
14518
14519 if (code != 'P' && code != 'p')
14520 {
14521 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14522 {
14523 if (ASSEMBLER_DIALECT == ASM_ATT)
14524 putc ('$', file);
14525 }
14526 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14527 || GET_CODE (x) == LABEL_REF)
14528 {
14529 if (ASSEMBLER_DIALECT == ASM_ATT)
14530 putc ('$', file);
14531 else
14532 fputs ("OFFSET FLAT:", file);
14533 }
14534 }
14535 if (CONST_INT_P (x))
14536 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14537 else if (flag_pic || MACHOPIC_INDIRECT)
14538 output_pic_addr_const (file, x, code);
14539 else
14540 output_addr_const (file, x);
14541 }
14542 }
14543
14544 static bool
14545 ix86_print_operand_punct_valid_p (unsigned char code)
14546 {
14547 return (code == '@' || code == '*' || code == '+' || code == '&'
14548 || code == ';' || code == '~' || code == '^');
14549 }
14550 \f
14551 /* Print a memory operand whose address is ADDR. */
14552
14553 static void
14554 ix86_print_operand_address (FILE *file, rtx addr)
14555 {
14556 struct ix86_address parts;
14557 rtx base, index, disp;
14558 int scale;
14559 int ok;
14560 bool vsib = false;
14561 int code = 0;
14562
14563 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14564 {
14565 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14566 gcc_assert (parts.index == NULL_RTX);
14567 parts.index = XVECEXP (addr, 0, 1);
14568 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14569 addr = XVECEXP (addr, 0, 0);
14570 vsib = true;
14571 }
14572 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14573 {
14574 gcc_assert (TARGET_64BIT);
14575 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14576 code = 'q';
14577 }
14578 else
14579 ok = ix86_decompose_address (addr, &parts);
14580
14581 gcc_assert (ok);
14582
14583 if (parts.base && GET_CODE (parts.base) == SUBREG)
14584 {
14585 rtx tmp = SUBREG_REG (parts.base);
14586 parts.base = simplify_subreg (GET_MODE (parts.base),
14587 tmp, GET_MODE (tmp), 0);
14588 }
14589
14590 if (parts.index && GET_CODE (parts.index) == SUBREG)
14591 {
14592 rtx tmp = SUBREG_REG (parts.index);
14593 parts.index = simplify_subreg (GET_MODE (parts.index),
14594 tmp, GET_MODE (tmp), 0);
14595 }
14596
14597 base = parts.base;
14598 index = parts.index;
14599 disp = parts.disp;
14600 scale = parts.scale;
14601
14602 switch (parts.seg)
14603 {
14604 case SEG_DEFAULT:
14605 break;
14606 case SEG_FS:
14607 case SEG_GS:
14608 if (ASSEMBLER_DIALECT == ASM_ATT)
14609 putc ('%', file);
14610 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14611 break;
14612 default:
14613 gcc_unreachable ();
14614 }
14615
14616 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14617 if (TARGET_64BIT && !base && !index)
14618 {
14619 rtx symbol = disp;
14620
14621 if (GET_CODE (disp) == CONST
14622 && GET_CODE (XEXP (disp, 0)) == PLUS
14623 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14624 symbol = XEXP (XEXP (disp, 0), 0);
14625
14626 if (GET_CODE (symbol) == LABEL_REF
14627 || (GET_CODE (symbol) == SYMBOL_REF
14628 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14629 base = pc_rtx;
14630 }
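  /* Sketch of the effect (the symbol name is made up): with no base, no
     index and a local symbol displacement, pc_rtx is substituted as the
     base above, so the operand prints as "foo(%rip)" rather than as an
     absolute 32-bit address, which needs one byte less to encode in
     64-bit mode.  */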
14631 if (!base && !index)
14632 {
14633 /* A displacement-only address requires special attention. */
14634
14635 if (CONST_INT_P (disp))
14636 {
14637 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14638 fputs ("ds:", file);
14639 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14640 }
14641 else if (flag_pic)
14642 output_pic_addr_const (file, disp, 0);
14643 else
14644 output_addr_const (file, disp);
14645 }
14646 else
14647 {
14648 /* Print SImode register names for zero-extended
14649 addresses to force addr32 prefix. */
14650 if (TARGET_64BIT
14651 && (GET_CODE (addr) == ZERO_EXTEND
14652 || GET_CODE (addr) == AND))
14653 {
14654 gcc_assert (!code);
14655 code = 'l';
14656 }
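      /* Sketch (register names assumed): for a zero-extended address such
	 as (zero_extend:DI (reg:SI ax)), code is forced to 'l' so print_reg
	 emits the 32-bit name, e.g. "(%eax)" instead of "(%rax)", and the
	 assembler then encodes the access with the 0x67 addr32 prefix.  */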
14657
14658 if (ASSEMBLER_DIALECT == ASM_ATT)
14659 {
14660 if (disp)
14661 {
14662 if (flag_pic)
14663 output_pic_addr_const (file, disp, 0);
14664 else if (GET_CODE (disp) == LABEL_REF)
14665 output_asm_label (disp);
14666 else
14667 output_addr_const (file, disp);
14668 }
14669
14670 putc ('(', file);
14671 if (base)
14672 print_reg (base, code, file);
14673 if (index)
14674 {
14675 putc (',', file);
14676 print_reg (index, vsib ? 0 : code, file);
14677 if (scale != 1 || vsib)
14678 fprintf (file, ",%d", scale);
14679 }
14680 putc (')', file);
14681 }
14682 else
14683 {
14684 rtx offset = NULL_RTX;
14685
14686 if (disp)
14687 {
14688 /* Pull out the offset of a symbol; print any symbol itself. */
14689 if (GET_CODE (disp) == CONST
14690 && GET_CODE (XEXP (disp, 0)) == PLUS
14691 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14692 {
14693 offset = XEXP (XEXP (disp, 0), 1);
14694 disp = gen_rtx_CONST (VOIDmode,
14695 XEXP (XEXP (disp, 0), 0));
14696 }
14697
14698 if (flag_pic)
14699 output_pic_addr_const (file, disp, 0);
14700 else if (GET_CODE (disp) == LABEL_REF)
14701 output_asm_label (disp);
14702 else if (CONST_INT_P (disp))
14703 offset = disp;
14704 else
14705 output_addr_const (file, disp);
14706 }
14707
14708 putc ('[', file);
14709 if (base)
14710 {
14711 print_reg (base, code, file);
14712 if (offset)
14713 {
14714 if (INTVAL (offset) >= 0)
14715 putc ('+', file);
14716 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14717 }
14718 }
14719 else if (offset)
14720 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14721 else
14722 putc ('0', file);
14723
14724 if (index)
14725 {
14726 putc ('+', file);
14727 print_reg (index, vsib ? 0 : code, file);
14728 if (scale != 1 || vsib)
14729 fprintf (file, "*%d", scale);
14730 }
14731 putc (']', file);
14732 }
14733 }
14734 }
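/* Illustration only (register and symbol names are made up): for the
   decomposed address base = %rbx, index = %rcx, scale = 4, disp = foo + 8,
   the two dialect branches above print

       AT&T:   foo+8(%rbx,%rcx,4)
       Intel:  foo[rbx+8+rcx*4]

   i.e. the Intel branch splits the symbol from its constant offset and
   prints the offset inside the brackets.  */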
14735
14736 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14737
14738 static bool
14739 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14740 {
14741 rtx op;
14742
14743 if (GET_CODE (x) != UNSPEC)
14744 return false;
14745
14746 op = XVECEXP (x, 0, 0);
14747 switch (XINT (x, 1))
14748 {
14749 case UNSPEC_GOTTPOFF:
14750 output_addr_const (file, op);
14751 /* FIXME: This might be @TPOFF in Sun ld. */
14752 fputs ("@gottpoff", file);
14753 break;
14754 case UNSPEC_TPOFF:
14755 output_addr_const (file, op);
14756 fputs ("@tpoff", file);
14757 break;
14758 case UNSPEC_NTPOFF:
14759 output_addr_const (file, op);
14760 if (TARGET_64BIT)
14761 fputs ("@tpoff", file);
14762 else
14763 fputs ("@ntpoff", file);
14764 break;
14765 case UNSPEC_DTPOFF:
14766 output_addr_const (file, op);
14767 fputs ("@dtpoff", file);
14768 break;
14769 case UNSPEC_GOTNTPOFF:
14770 output_addr_const (file, op);
14771 if (TARGET_64BIT)
14772 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14773 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14774 else
14775 fputs ("@gotntpoff", file);
14776 break;
14777 case UNSPEC_INDNTPOFF:
14778 output_addr_const (file, op);
14779 fputs ("@indntpoff", file);
14780 break;
14781 #if TARGET_MACHO
14782 case UNSPEC_MACHOPIC_OFFSET:
14783 output_addr_const (file, op);
14784 putc ('-', file);
14785 machopic_output_function_base_name (file);
14786 break;
14787 #endif
14788
14789 case UNSPEC_STACK_CHECK:
14790 {
14791 int offset;
14792
14793 gcc_assert (flag_split_stack);
14794
14795 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14796 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14797 #else
14798 gcc_unreachable ();
14799 #endif
14800
14801 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14802 }
14803 break;
14804
14805 default:
14806 return false;
14807 }
14808
14809 return true;
14810 }
14811 \f
14812 /* Split one or more double-mode RTL references into pairs of half-mode
14813 references. The RTL can be REG, offsettable MEM, integer constant, or
14814 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14815 split and "num" is its length. lo_half and hi_half are output arrays
14816 that parallel "operands". */
14817
14818 void
14819 split_double_mode (enum machine_mode mode, rtx operands[],
14820 int num, rtx lo_half[], rtx hi_half[])
14821 {
14822 enum machine_mode half_mode;
14823 unsigned int byte;
14824
14825 switch (mode)
14826 {
14827 case TImode:
14828 half_mode = DImode;
14829 break;
14830 case DImode:
14831 half_mode = SImode;
14832 break;
14833 default:
14834 gcc_unreachable ();
14835 }
14836
14837 byte = GET_MODE_SIZE (half_mode);
14838
14839 while (num--)
14840 {
14841 rtx op = operands[num];
14842
14843 /* simplify_subreg refuses to split volatile memory references,
14844 but we still have to handle them. */
14845 if (MEM_P (op))
14846 {
14847 lo_half[num] = adjust_address (op, half_mode, 0);
14848 hi_half[num] = adjust_address (op, half_mode, byte);
14849 }
14850 else
14851 {
14852 lo_half[num] = simplify_gen_subreg (half_mode, op,
14853 GET_MODE (op) == VOIDmode
14854 ? mode : GET_MODE (op), 0);
14855 hi_half[num] = simplify_gen_subreg (half_mode, op,
14856 GET_MODE (op) == VOIDmode
14857 ? mode : GET_MODE (op), byte);
14858 }
14859 }
14860 }
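/* A minimal usage sketch (the variable names are hypothetical):

     rtx lo[1], hi[1];
     split_double_mode (DImode, &operands[0], 1, lo, hi);

   On a 32-bit target this leaves the SImode low and high subwords of
   operands[0] in lo[0] and hi[0]; MEM operands are split with
   adjust_address so that volatile references survive, as noted above.  */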
14861 \f
14862 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14863 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14864 is the expression of the binary operation. The output may either be
14865 emitted here, or returned to the caller, like all output_* functions.
14866
14867 There is no guarantee that the operands are the same mode, as they
14868 might be within FLOAT or FLOAT_EXTEND expressions. */
14869
14870 #ifndef SYSV386_COMPAT
14871 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14872 wants to fix the assemblers because that causes incompatibility
14873 with gcc. No-one wants to fix gcc because that causes
14874 incompatibility with assemblers... You can use the option of
14875 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14876 #define SYSV386_COMPAT 1
14877 #endif
14878
14879 const char *
14880 output_387_binary_op (rtx insn, rtx *operands)
14881 {
14882 static char buf[40];
14883 const char *p;
14884 const char *ssep;
14885 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14886
14887 #ifdef ENABLE_CHECKING
14888 /* Even if we do not want to check the inputs, this documents the input
14889 constraints, which helps in understanding the following code. */
14890 if (STACK_REG_P (operands[0])
14891 && ((REG_P (operands[1])
14892 && REGNO (operands[0]) == REGNO (operands[1])
14893 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14894 || (REG_P (operands[2])
14895 && REGNO (operands[0]) == REGNO (operands[2])
14896 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14897 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14898 ; /* ok */
14899 else
14900 gcc_assert (is_sse);
14901 #endif
14902
14903 switch (GET_CODE (operands[3]))
14904 {
14905 case PLUS:
14906 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14907 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14908 p = "fiadd";
14909 else
14910 p = "fadd";
14911 ssep = "vadd";
14912 break;
14913
14914 case MINUS:
14915 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14916 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14917 p = "fisub";
14918 else
14919 p = "fsub";
14920 ssep = "vsub";
14921 break;
14922
14923 case MULT:
14924 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14925 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14926 p = "fimul";
14927 else
14928 p = "fmul";
14929 ssep = "vmul";
14930 break;
14931
14932 case DIV:
14933 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14934 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14935 p = "fidiv";
14936 else
14937 p = "fdiv";
14938 ssep = "vdiv";
14939 break;
14940
14941 default:
14942 gcc_unreachable ();
14943 }
14944
14945 if (is_sse)
14946 {
14947 if (TARGET_AVX)
14948 {
14949 strcpy (buf, ssep);
14950 if (GET_MODE (operands[0]) == SFmode)
14951 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14952 else
14953 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14954 }
14955 else
14956 {
14957 strcpy (buf, ssep + 1);
14958 if (GET_MODE (operands[0]) == SFmode)
14959 strcat (buf, "ss\t{%2, %0|%0, %2}");
14960 else
14961 strcat (buf, "sd\t{%2, %0|%0, %2}");
14962 }
14963 return buf;
14964 }
14965 strcpy (buf, p);
14966
14967 switch (GET_CODE (operands[3]))
14968 {
14969 case MULT:
14970 case PLUS:
14971 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
14972 {
14973 rtx temp = operands[2];
14974 operands[2] = operands[1];
14975 operands[1] = temp;
14976 }
14977
14978 /* We now know that operands[0] == operands[1]. */
14979
14980 if (MEM_P (operands[2]))
14981 {
14982 p = "%Z2\t%2";
14983 break;
14984 }
14985
14986 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14987 {
14988 if (STACK_TOP_P (operands[0]))
14989 /* How is it that we are storing to a dead operand[2]?
14990 Well, presumably operands[1] is dead too. We can't
14991 store the result to st(0) as st(0) gets popped on this
14992 instruction. Instead store to operands[2] (which I
14993 think has to be st(1)). st(1) will be popped later.
14994 gcc <= 2.8.1 didn't have this check and generated
14995 assembly code that the Unixware assembler rejected. */
14996 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14997 else
14998 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14999 break;
15000 }
15001
15002 if (STACK_TOP_P (operands[0]))
15003 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15004 else
15005 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15006 break;
15007
15008 case MINUS:
15009 case DIV:
15010 if (MEM_P (operands[1]))
15011 {
15012 p = "r%Z1\t%1";
15013 break;
15014 }
15015
15016 if (MEM_P (operands[2]))
15017 {
15018 p = "%Z2\t%2";
15019 break;
15020 }
15021
15022 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15023 {
15024 #if SYSV386_COMPAT
15025 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15026 derived assemblers, confusingly reverse the direction of
15027 the operation for fsub{r} and fdiv{r} when the
15028 destination register is not st(0). The Intel assembler
15029 doesn't have this brain damage. Read !SYSV386_COMPAT to
15030 figure out what the hardware really does. */
15031 if (STACK_TOP_P (operands[0]))
15032 p = "{p\t%0, %2|rp\t%2, %0}";
15033 else
15034 p = "{rp\t%2, %0|p\t%0, %2}";
15035 #else
15036 if (STACK_TOP_P (operands[0]))
15037 /* As above for fmul/fadd, we can't store to st(0). */
15038 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15039 else
15040 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15041 #endif
15042 break;
15043 }
15044
15045 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15046 {
15047 #if SYSV386_COMPAT
15048 if (STACK_TOP_P (operands[0]))
15049 p = "{rp\t%0, %1|p\t%1, %0}";
15050 else
15051 p = "{p\t%1, %0|rp\t%0, %1}";
15052 #else
15053 if (STACK_TOP_P (operands[0]))
15054 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15055 else
15056 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15057 #endif
15058 break;
15059 }
15060
15061 if (STACK_TOP_P (operands[0]))
15062 {
15063 if (STACK_TOP_P (operands[1]))
15064 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15065 else
15066 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15067 break;
15068 }
15069 else if (STACK_TOP_P (operands[1]))
15070 {
15071 #if SYSV386_COMPAT
15072 p = "{\t%1, %0|r\t%0, %1}";
15073 #else
15074 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15075 #endif
15076 }
15077 else
15078 {
15079 #if SYSV386_COMPAT
15080 p = "{r\t%2, %0|\t%0, %2}";
15081 #else
15082 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15083 #endif
15084 }
15085 break;
15086
15087 default:
15088 gcc_unreachable ();
15089 }
15090
15091 strcat (buf, p);
15092 return buf;
15093 }
15094
15095 /* Return needed mode for entity in optimize_mode_switching pass. */
15096
15097 int
15098 ix86_mode_needed (int entity, rtx insn)
15099 {
15100 enum attr_i387_cw mode;
15101
15102 /* The mode UNINITIALIZED is used to store the control word after a
15103 function call or ASM pattern. The mode ANY specifies that the function
15104 has no requirements on the control word and makes no changes in the
15105 bits we are interested in. */
15106
15107 if (CALL_P (insn)
15108 || (NONJUMP_INSN_P (insn)
15109 && (asm_noperands (PATTERN (insn)) >= 0
15110 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15111 return I387_CW_UNINITIALIZED;
15112
15113 if (recog_memoized (insn) < 0)
15114 return I387_CW_ANY;
15115
15116 mode = get_attr_i387_cw (insn);
15117
15118 switch (entity)
15119 {
15120 case I387_TRUNC:
15121 if (mode == I387_CW_TRUNC)
15122 return mode;
15123 break;
15124
15125 case I387_FLOOR:
15126 if (mode == I387_CW_FLOOR)
15127 return mode;
15128 break;
15129
15130 case I387_CEIL:
15131 if (mode == I387_CW_CEIL)
15132 return mode;
15133 break;
15134
15135 case I387_MASK_PM:
15136 if (mode == I387_CW_MASK_PM)
15137 return mode;
15138 break;
15139
15140 default:
15141 gcc_unreachable ();
15142 }
15143
15144 return I387_CW_ANY;
15145 }
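/* Rough picture of how this hook is used (an assumed example, not code
   taken from this file): the optimize_mode_switching pass calls
   ix86_mode_needed for every insn; a fix_trunc pattern whose i387_cw
   attribute is "trunc" reports I387_CW_TRUNC for the I387_TRUNC entity,
   and the pass then places the control word switch emitted by
   emit_i387_cw_initialization below in front of it.  */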
15146
15147 /* Output code to initialize control word copies used by trunc?f?i and
15148 rounding patterns. MODE selects the rounding mode; the current control
15149 word is saved in SLOT_CW_STORED and a modified copy goes to MODE's slot. */
15150
15151 void
15152 emit_i387_cw_initialization (int mode)
15153 {
15154 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15155 rtx new_mode;
15156
15157 enum ix86_stack_slot slot;
15158
15159 rtx reg = gen_reg_rtx (HImode);
15160
15161 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15162 emit_move_insn (reg, copy_rtx (stored_mode));
15163
15164 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15165 || optimize_function_for_size_p (cfun))
15166 {
15167 switch (mode)
15168 {
15169 case I387_CW_TRUNC:
15170 /* round toward zero (truncate) */
15171 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15172 slot = SLOT_CW_TRUNC;
15173 break;
15174
15175 case I387_CW_FLOOR:
15176 /* round down toward -oo */
15177 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15178 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15179 slot = SLOT_CW_FLOOR;
15180 break;
15181
15182 case I387_CW_CEIL:
15183 /* round up toward +oo */
15184 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15185 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15186 slot = SLOT_CW_CEIL;
15187 break;
15188
15189 case I387_CW_MASK_PM:
15190 /* mask precision exception for nearbyint() */
15191 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15192 slot = SLOT_CW_MASK_PM;
15193 break;
15194
15195 default:
15196 gcc_unreachable ();
15197 }
15198 }
15199 else
15200 {
15201 switch (mode)
15202 {
15203 case I387_CW_TRUNC:
15204 /* round toward zero (truncate) */
15205 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15206 slot = SLOT_CW_TRUNC;
15207 break;
15208
15209 case I387_CW_FLOOR:
15210 /* round down toward -oo */
15211 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15212 slot = SLOT_CW_FLOOR;
15213 break;
15214
15215 case I387_CW_CEIL:
15216 /* round up toward +oo */
15217 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15218 slot = SLOT_CW_CEIL;
15219 break;
15220
15221 case I387_CW_MASK_PM:
15222 /* mask precision exception for nearbyint() */
15223 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15224 slot = SLOT_CW_MASK_PM;
15225 break;
15226
15227 default:
15228 gcc_unreachable ();
15229 }
15230 }
15231
15232 gcc_assert (slot < MAX_386_STACK_LOCALS);
15233
15234 new_mode = assign_386_stack_local (HImode, slot);
15235 emit_move_insn (new_mode, reg);
15236 }
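/* Background for the magic constants above (x87 architectural facts, not
   taken from this file): bits 10-11 of the FPU control word form the
   rounding-control field, 00 = to nearest, 01 = toward -inf, 10 = toward
   +inf, 11 = toward zero, and bit 5 masks the precision exception.
   Hence 0x0c00 for truncation, 0x0400 for floor, 0x0800 for ceil and
   0x0020 for nearbyint.  */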
15237
15238 /* Output code for INSN to convert a float to a signed int. OPERANDS
15239 are the insn operands. The output may be [HSD]Imode and the input
15240 operand may be [SDX]Fmode. */
15241
15242 const char *
15243 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15244 {
15245 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15246 int dimode_p = GET_MODE (operands[0]) == DImode;
15247 int round_mode = get_attr_i387_cw (insn);
15248
15249 /* Jump through a hoop or two for DImode, since the hardware has no
15250 non-popping instruction. We used to do this a different way, but
15251 that was somewhat fragile and broke with post-reload splitters. */
15252 if ((dimode_p || fisttp) && !stack_top_dies)
15253 output_asm_insn ("fld\t%y1", operands);
15254
15255 gcc_assert (STACK_TOP_P (operands[1]));
15256 gcc_assert (MEM_P (operands[0]));
15257 gcc_assert (GET_MODE (operands[1]) != TFmode);
15258
15259 if (fisttp)
15260 output_asm_insn ("fisttp%Z0\t%0", operands);
15261 else
15262 {
15263 if (round_mode != I387_CW_ANY)
15264 output_asm_insn ("fldcw\t%3", operands);
15265 if (stack_top_dies || dimode_p)
15266 output_asm_insn ("fistp%Z0\t%0", operands);
15267 else
15268 output_asm_insn ("fist%Z0\t%0", operands);
15269 if (round_mode != I387_CW_ANY)
15270 output_asm_insn ("fldcw\t%2", operands);
15271 }
15272
15273 return "";
15274 }
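/* A hedged example of the output for a DImode truncation without fisttp
   (operand numbers as in the insn above):

       fldcw   %3        load the truncating control word
       fistpll %0        convert, store and pop
       fldcw   %2        restore the previous control word

   preceded by "fld %y1" when the stack top does not die.  */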
15275
15276 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15277 have the values zero or one, indicates the ffreep insn's operand
15278 from the OPERANDS array. */
15279
15280 static const char *
15281 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15282 {
15283 if (TARGET_USE_FFREEP)
15284 #ifdef HAVE_AS_IX86_FFREEP
15285 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15286 #else
15287 {
15288 static char retval[32];
15289 int regno = REGNO (operands[opno]);
15290
15291 gcc_assert (FP_REGNO_P (regno));
15292
15293 regno -= FIRST_STACK_REG;
15294
15295 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15296 return retval;
15297 }
15298 #endif
15299
15300 return opno ? "fstp\t%y1" : "fstp\t%y0";
15301 }
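/* Encoding note (an architectural aside, not new behaviour): ffreep
   %st(i) is the two-byte opcode 0xdf 0xc0+i, so the fallback above emits
   it as a 16-bit literal, e.g. ASM_SHORT "0xc1df" for %st(1); stored
   little-endian this places the bytes df c1 in the text section.  */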
15302
15303
15304 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15305 should be used. UNORDERED_P is true when fucom should be used. */
15306
15307 const char *
15308 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15309 {
15310 int stack_top_dies;
15311 rtx cmp_op0, cmp_op1;
15312 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15313
15314 if (eflags_p)
15315 {
15316 cmp_op0 = operands[0];
15317 cmp_op1 = operands[1];
15318 }
15319 else
15320 {
15321 cmp_op0 = operands[1];
15322 cmp_op1 = operands[2];
15323 }
15324
15325 if (is_sse)
15326 {
15327 if (GET_MODE (operands[0]) == SFmode)
15328 if (unordered_p)
15329 return "%vucomiss\t{%1, %0|%0, %1}";
15330 else
15331 return "%vcomiss\t{%1, %0|%0, %1}";
15332 else
15333 if (unordered_p)
15334 return "%vucomisd\t{%1, %0|%0, %1}";
15335 else
15336 return "%vcomisd\t{%1, %0|%0, %1}";
15337 }
15338
15339 gcc_assert (STACK_TOP_P (cmp_op0));
15340
15341 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15342
15343 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15344 {
15345 if (stack_top_dies)
15346 {
15347 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15348 return output_387_ffreep (operands, 1);
15349 }
15350 else
15351 return "ftst\n\tfnstsw\t%0";
15352 }
15353
15354 if (STACK_REG_P (cmp_op1)
15355 && stack_top_dies
15356 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15357 && REGNO (cmp_op1) != FIRST_STACK_REG)
15358 {
15359 /* If the top of the 387 stack dies, and the other operand is
15360 also a stack register that dies, then this must be a
15361 `fcompp' float compare. */
15362
15363 if (eflags_p)
15364 {
15365 /* There is no double popping fcomi variant. Fortunately,
15366 eflags is immune from the fstp's cc clobbering. */
15367 if (unordered_p)
15368 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15369 else
15370 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15371 return output_387_ffreep (operands, 0);
15372 }
15373 else
15374 {
15375 if (unordered_p)
15376 return "fucompp\n\tfnstsw\t%0";
15377 else
15378 return "fcompp\n\tfnstsw\t%0";
15379 }
15380 }
15381 else
15382 {
15383 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15384
15385 static const char * const alt[16] =
15386 {
15387 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15388 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15389 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15390 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15391
15392 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15393 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15394 NULL,
15395 NULL,
15396
15397 "fcomi\t{%y1, %0|%0, %y1}",
15398 "fcomip\t{%y1, %0|%0, %y1}",
15399 "fucomi\t{%y1, %0|%0, %y1}",
15400 "fucomip\t{%y1, %0|%0, %y1}",
15401
15402 NULL,
15403 NULL,
15404 NULL,
15405 NULL
15406 };
15407
15408 int mask;
15409 const char *ret;
15410
15411 mask = eflags_p << 3;
15412 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15413 mask |= unordered_p << 1;
15414 mask |= stack_top_dies;
15415
15416 gcc_assert (mask < 16);
15417 ret = alt[mask];
15418 gcc_assert (ret);
15419
15420 return ret;
15421 }
15422 }
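/* Worked example of the mask encoding above (values chosen for
   illustration): eflags_p = 1, a floating-point cmp_op1, unordered_p = 1
   and a dying stack top give mask = (1 << 3) | (0 << 2) | (1 << 1) | 1
   = 11, so alt[11] selects "fucomip\t{%y1, %0|%0, %y1}".  */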
15423
15424 void
15425 ix86_output_addr_vec_elt (FILE *file, int value)
15426 {
15427 const char *directive = ASM_LONG;
15428
15429 #ifdef ASM_QUAD
15430 if (TARGET_LP64)
15431 directive = ASM_QUAD;
15432 #else
15433 gcc_assert (!TARGET_64BIT);
15434 #endif
15435
15436 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15437 }
15438
15439 void
15440 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15441 {
15442 const char *directive = ASM_LONG;
15443
15444 #ifdef ASM_QUAD
15445 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15446 directive = ASM_QUAD;
15447 #else
15448 gcc_assert (!TARGET_64BIT);
15449 #endif
15450 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15451 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15452 fprintf (file, "%s%s%d-%s%d\n",
15453 directive, LPREFIX, value, LPREFIX, rel);
15454 else if (HAVE_AS_GOTOFF_IN_DATA)
15455 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15456 #if TARGET_MACHO
15457 else if (TARGET_MACHO)
15458 {
15459 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15460 machopic_output_function_base_name (file);
15461 putc ('\n', file);
15462 }
15463 #endif
15464 else
15465 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15466 GOT_SYMBOL_NAME, LPREFIX, value);
15467 }
15468 \f
15469 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15470 for the target. */
15471
15472 void
15473 ix86_expand_clear (rtx dest)
15474 {
15475 rtx tmp;
15476
15477 /* We play register width games, which are only valid after reload. */
15478 gcc_assert (reload_completed);
15479
15480 /* Avoid HImode and its attendant prefix byte. */
15481 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15482 dest = gen_rtx_REG (SImode, REGNO (dest));
15483 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15484
15485 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15486 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15487 {
15488 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15489 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15490 }
15491
15492 emit_insn (tmp);
15493 }
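/* Illustration (the register is just an example): when TARGET_USE_MOV0
   is clear or we are optimizing for speed, the PARALLEL with the flags
   clobber built above matches movsi_xor / movdi_xor_rex64 and assembles
   to "xorl %eax, %eax"; otherwise the plain SET is emitted as
   "movl $0, %eax".  */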
15494
15495 /* X is an unchanging MEM. If it is a constant pool reference, return
15496 the constant pool rtx, else NULL. */
15497
15498 rtx
15499 maybe_get_pool_constant (rtx x)
15500 {
15501 x = ix86_delegitimize_address (XEXP (x, 0));
15502
15503 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15504 return get_pool_constant (x);
15505
15506 return NULL_RTX;
15507 }
15508
15509 void
15510 ix86_expand_move (enum machine_mode mode, rtx operands[])
15511 {
15512 rtx op0, op1;
15513 enum tls_model model;
15514
15515 op0 = operands[0];
15516 op1 = operands[1];
15517
15518 if (GET_CODE (op1) == SYMBOL_REF)
15519 {
15520 model = SYMBOL_REF_TLS_MODEL (op1);
15521 if (model)
15522 {
15523 op1 = legitimize_tls_address (op1, model, true);
15524 op1 = force_operand (op1, op0);
15525 if (op1 == op0)
15526 return;
15527 if (GET_MODE (op1) != mode)
15528 op1 = convert_to_mode (mode, op1, 1);
15529 }
15530 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15531 && SYMBOL_REF_DLLIMPORT_P (op1))
15532 op1 = legitimize_dllimport_symbol (op1, false);
15533 }
15534 else if (GET_CODE (op1) == CONST
15535 && GET_CODE (XEXP (op1, 0)) == PLUS
15536 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15537 {
15538 rtx addend = XEXP (XEXP (op1, 0), 1);
15539 rtx symbol = XEXP (XEXP (op1, 0), 0);
15540 rtx tmp = NULL;
15541
15542 model = SYMBOL_REF_TLS_MODEL (symbol);
15543 if (model)
15544 tmp = legitimize_tls_address (symbol, model, true);
15545 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15546 && SYMBOL_REF_DLLIMPORT_P (symbol))
15547 tmp = legitimize_dllimport_symbol (symbol, true);
15548
15549 if (tmp)
15550 {
15551 tmp = force_operand (tmp, NULL);
15552 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15553 op0, 1, OPTAB_DIRECT);
15554 if (tmp == op0)
15555 return;
15556 if (GET_MODE (tmp) != mode)
15557 op1 = convert_to_mode (mode, tmp, 1);
15558 }
15559 }
15560
15561 if ((flag_pic || MACHOPIC_INDIRECT)
15562 && symbolic_operand (op1, mode))
15563 {
15564 if (TARGET_MACHO && !TARGET_64BIT)
15565 {
15566 #if TARGET_MACHO
15567 /* dynamic-no-pic */
15568 if (MACHOPIC_INDIRECT)
15569 {
15570 rtx temp = ((reload_in_progress
15571 || ((op0 && REG_P (op0))
15572 && mode == Pmode))
15573 ? op0 : gen_reg_rtx (Pmode));
15574 op1 = machopic_indirect_data_reference (op1, temp);
15575 if (MACHOPIC_PURE)
15576 op1 = machopic_legitimize_pic_address (op1, mode,
15577 temp == op1 ? 0 : temp);
15578 }
15579 if (op0 != op1 && GET_CODE (op0) != MEM)
15580 {
15581 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15582 emit_insn (insn);
15583 return;
15584 }
15585 if (GET_CODE (op0) == MEM)
15586 op1 = force_reg (Pmode, op1);
15587 else
15588 {
15589 rtx temp = op0;
15590 if (GET_CODE (temp) != REG)
15591 temp = gen_reg_rtx (Pmode);
15592 temp = legitimize_pic_address (op1, temp);
15593 if (temp == op0)
15594 return;
15595 op1 = temp;
15596 }
15597 /* dynamic-no-pic */
15598 #endif
15599 }
15600 else
15601 {
15602 if (MEM_P (op0))
15603 op1 = force_reg (mode, op1);
15604 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15605 {
15606 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15607 op1 = legitimize_pic_address (op1, reg);
15608 if (op0 == op1)
15609 return;
15610 if (GET_MODE (op1) != mode)
15611 op1 = convert_to_mode (mode, op1, 1);
15612 }
15613 }
15614 }
15615 else
15616 {
15617 if (MEM_P (op0)
15618 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15619 || !push_operand (op0, mode))
15620 && MEM_P (op1))
15621 op1 = force_reg (mode, op1);
15622
15623 if (push_operand (op0, mode)
15624 && ! general_no_elim_operand (op1, mode))
15625 op1 = copy_to_mode_reg (mode, op1);
15626
15627 /* Force large constants in 64bit compilation into a register
15628 so that they can be CSEd. */
15629 if (can_create_pseudo_p ()
15630 && (mode == DImode) && TARGET_64BIT
15631 && immediate_operand (op1, mode)
15632 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15633 && !register_operand (op0, mode)
15634 && optimize)
15635 op1 = copy_to_mode_reg (mode, op1);
15636
15637 if (can_create_pseudo_p ()
15638 && FLOAT_MODE_P (mode)
15639 && GET_CODE (op1) == CONST_DOUBLE)
15640 {
15641 /* If we are loading a floating point constant to a register,
15642 force the value to memory now, since we'll get better code
15643 out of the back end. */
15644
15645 op1 = validize_mem (force_const_mem (mode, op1));
15646 if (!register_operand (op0, mode))
15647 {
15648 rtx temp = gen_reg_rtx (mode);
15649 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15650 emit_move_insn (op0, temp);
15651 return;
15652 }
15653 }
15654 }
15655
15656 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15657 }
15658
15659 void
15660 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15661 {
15662 rtx op0 = operands[0], op1 = operands[1];
15663 unsigned int align = GET_MODE_ALIGNMENT (mode);
15664
15665 /* Force constants other than zero into memory. We do not know how
15666 the instructions used to build constants modify the upper 64 bits
15667 of the register; once we have that information we may be able
15668 to handle some of them more efficiently. */
15669 if (can_create_pseudo_p ()
15670 && register_operand (op0, mode)
15671 && (CONSTANT_P (op1)
15672 || (GET_CODE (op1) == SUBREG
15673 && CONSTANT_P (SUBREG_REG (op1))))
15674 && !standard_sse_constant_p (op1))
15675 op1 = validize_mem (force_const_mem (mode, op1));
15676
15677 /* We need to check memory alignment for SSE mode since attributes
15678 can make operands unaligned. */
15679 if (can_create_pseudo_p ()
15680 && SSE_REG_MODE_P (mode)
15681 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15682 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15683 {
15684 rtx tmp[2];
15685
15686 /* ix86_expand_vector_move_misalign() does not like constants ... */
15687 if (CONSTANT_P (op1)
15688 || (GET_CODE (op1) == SUBREG
15689 && CONSTANT_P (SUBREG_REG (op1))))
15690 op1 = validize_mem (force_const_mem (mode, op1));
15691
15692 /* ... nor both arguments in memory. */
15693 if (!register_operand (op0, mode)
15694 && !register_operand (op1, mode))
15695 op1 = force_reg (mode, op1);
15696
15697 tmp[0] = op0; tmp[1] = op1;
15698 ix86_expand_vector_move_misalign (mode, tmp);
15699 return;
15700 }
15701
15702 /* Make operand1 a register if it isn't already. */
15703 if (can_create_pseudo_p ()
15704 && !register_operand (op0, mode)
15705 && !register_operand (op1, mode))
15706 {
15707 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15708 return;
15709 }
15710
15711 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15712 }
15713
15714 /* Split 32-byte AVX unaligned load and store if needed. */
15715
15716 static void
15717 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15718 {
15719 rtx m;
15720 rtx (*extract) (rtx, rtx, rtx);
15721 rtx (*move_unaligned) (rtx, rtx);
15722 enum machine_mode mode;
15723
15724 switch (GET_MODE (op0))
15725 {
15726 default:
15727 gcc_unreachable ();
15728 case V32QImode:
15729 extract = gen_avx_vextractf128v32qi;
15730 move_unaligned = gen_avx_movdqu256;
15731 mode = V16QImode;
15732 break;
15733 case V8SFmode:
15734 extract = gen_avx_vextractf128v8sf;
15735 move_unaligned = gen_avx_movups256;
15736 mode = V4SFmode;
15737 break;
15738 case V4DFmode:
15739 extract = gen_avx_vextractf128v4df;
15740 move_unaligned = gen_avx_movupd256;
15741 mode = V2DFmode;
15742 break;
15743 }
15744
15745 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15746 {
15747 rtx r = gen_reg_rtx (mode);
15748 m = adjust_address (op1, mode, 0);
15749 emit_move_insn (r, m);
15750 m = adjust_address (op1, mode, 16);
15751 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15752 emit_move_insn (op0, r);
15753 }
15754 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15755 {
15756 m = adjust_address (op0, mode, 0);
15757 emit_insn (extract (m, op1, const0_rtx));
15758 m = adjust_address (op0, mode, 16);
15759 emit_insn (extract (m, op1, const1_rtx));
15760 }
15761 else
15762 emit_insn (move_unaligned (op0, op1));
15763 }
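/* Approximate assembly for a split unaligned V8SFmode access (shown only
   to illustrate the load/store branches above; exact instruction
   selection depends on the patterns):

       load:   vmovups      (%rax), %xmm0
               vinsertf128  $1, 16(%rax), %ymm0, %ymm0
       store:  vextractf128 $0, %ymm0, (%rax)
               vextractf128 $1, %ymm0, 16(%rax)

   When neither split flag is set a single 256-bit vmovups is used.  */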
15764
15765 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15766 straight to ix86_expand_vector_move. */
15767 /* Code generation for scalar reg-reg moves of single and double precision data:
15768 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15769 movaps reg, reg
15770 else
15771 movss reg, reg
15772 if (x86_sse_partial_reg_dependency == true)
15773 movapd reg, reg
15774 else
15775 movsd reg, reg
15776
15777 Code generation for scalar loads of double precision data:
15778 if (x86_sse_split_regs == true)
15779 movlpd mem, reg (gas syntax)
15780 else
15781 movsd mem, reg
15782
15783 Code generation for unaligned packed loads of single precision data
15784 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15785 if (x86_sse_unaligned_move_optimal)
15786 movups mem, reg
15787
15788 if (x86_sse_partial_reg_dependency == true)
15789 {
15790 xorps reg, reg
15791 movlps mem, reg
15792 movhps mem+8, reg
15793 }
15794 else
15795 {
15796 movlps mem, reg
15797 movhps mem+8, reg
15798 }
15799
15800 Code generation for unaligned packed loads of double precision data
15801 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15802 if (x86_sse_unaligned_move_optimal)
15803 movupd mem, reg
15804
15805 if (x86_sse_split_regs == true)
15806 {
15807 movlpd mem, reg
15808 movhpd mem+8, reg
15809 }
15810 else
15811 {
15812 movsd mem, reg
15813 movhpd mem+8, reg
15814 }
15815 */
15816
15817 void
15818 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15819 {
15820 rtx op0, op1, m;
15821
15822 op0 = operands[0];
15823 op1 = operands[1];
15824
15825 if (TARGET_AVX)
15826 {
15827 switch (GET_MODE_CLASS (mode))
15828 {
15829 case MODE_VECTOR_INT:
15830 case MODE_INT:
15831 switch (GET_MODE_SIZE (mode))
15832 {
15833 case 16:
15834 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15835 {
15836 op0 = gen_lowpart (V4SFmode, op0);
15837 op1 = gen_lowpart (V4SFmode, op1);
15838 emit_insn (gen_sse_movups (op0, op1));
15839 }
15840 else
15841 {
15842 op0 = gen_lowpart (V16QImode, op0);
15843 op1 = gen_lowpart (V16QImode, op1);
15844 emit_insn (gen_sse2_movdqu (op0, op1));
15845 }
15846 break;
15847 case 32:
15848 op0 = gen_lowpart (V32QImode, op0);
15849 op1 = gen_lowpart (V32QImode, op1);
15850 ix86_avx256_split_vector_move_misalign (op0, op1);
15851 break;
15852 default:
15853 gcc_unreachable ();
15854 }
15855 break;
15856 case MODE_VECTOR_FLOAT:
15857 switch (mode)
15858 {
15859 case V4SFmode:
15860 emit_insn (gen_sse_movups (op0, op1));
15861 break;
15862 case V2DFmode:
15863 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15864 {
15865 op0 = gen_lowpart (V4SFmode, op0);
15866 op1 = gen_lowpart (V4SFmode, op1);
15867 emit_insn (gen_sse_movups (op0, op1));
15868 }
15869 else
15870 emit_insn (gen_sse2_movupd (op0, op1));
15871 break;
15872 case V8SFmode:
15873 case V4DFmode:
15874 ix86_avx256_split_vector_move_misalign (op0, op1);
15875 break;
15876 default:
15877 gcc_unreachable ();
15878 }
15879 break;
15880
15881 default:
15882 gcc_unreachable ();
15883 }
15884
15885 return;
15886 }
15887
15888 if (MEM_P (op1))
15889 {
15890 /* If we're optimizing for size, movups is the smallest. */
15891 if (optimize_insn_for_size_p ()
15892 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15893 {
15894 op0 = gen_lowpart (V4SFmode, op0);
15895 op1 = gen_lowpart (V4SFmode, op1);
15896 emit_insn (gen_sse_movups (op0, op1));
15897 return;
15898 }
15899
15900 /* ??? If we have typed data, then it would appear that using
15901 movdqu is the only way to get unaligned data loaded with
15902 integer type. */
15903 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15904 {
15905 op0 = gen_lowpart (V16QImode, op0);
15906 op1 = gen_lowpart (V16QImode, op1);
15907 emit_insn (gen_sse2_movdqu (op0, op1));
15908 return;
15909 }
15910
15911 if (TARGET_SSE2 && mode == V2DFmode)
15912 {
15913 rtx zero;
15914
15915 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15916 {
15917 emit_insn (gen_sse2_movupd (op0, op1));
15918 return;
15919 }
15920
15921 /* When SSE registers are split into halves, we can avoid
15922 writing to the top half twice. */
15923 if (TARGET_SSE_SPLIT_REGS)
15924 {
15925 emit_clobber (op0);
15926 zero = op0;
15927 }
15928 else
15929 {
15930 /* ??? Not sure about the best option for the Intel chips.
15931 The following would seem to satisfy; the register is
15932 entirely cleared, breaking the dependency chain. We
15933 then store to the upper half, with a dependency depth
15934 of one. A rumor has it that Intel recommends two movsd
15935 followed by an unpacklpd, but this is unconfirmed. And
15936 given that the dependency depth of the unpacklpd would
15937 still be one, I'm not sure why this would be better. */
15938 zero = CONST0_RTX (V2DFmode);
15939 }
15940
15941 m = adjust_address (op1, DFmode, 0);
15942 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15943 m = adjust_address (op1, DFmode, 8);
15944 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15945 }
15946 else
15947 {
15948 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15949 {
15950 op0 = gen_lowpart (V4SFmode, op0);
15951 op1 = gen_lowpart (V4SFmode, op1);
15952 emit_insn (gen_sse_movups (op0, op1));
15953 return;
15954 }
15955
15956 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15957 emit_move_insn (op0, CONST0_RTX (mode));
15958 else
15959 emit_clobber (op0);
15960
15961 if (mode != V4SFmode)
15962 op0 = gen_lowpart (V4SFmode, op0);
15963 m = adjust_address (op1, V2SFmode, 0);
15964 emit_insn (gen_sse_loadlps (op0, op0, m));
15965 m = adjust_address (op1, V2SFmode, 8);
15966 emit_insn (gen_sse_loadhps (op0, op0, m));
15967 }
15968 }
15969 else if (MEM_P (op0))
15970 {
15971 /* If we're optimizing for size, movups is the smallest. */
15972 if (optimize_insn_for_size_p ()
15973 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15974 {
15975 op0 = gen_lowpart (V4SFmode, op0);
15976 op1 = gen_lowpart (V4SFmode, op1);
15977 emit_insn (gen_sse_movups (op0, op1));
15978 return;
15979 }
15980
15981 /* ??? Similar to above, only less clear
15982 because of typeless stores. */
15983 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
15984 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15985 {
15986 op0 = gen_lowpart (V16QImode, op0);
15987 op1 = gen_lowpart (V16QImode, op1);
15988 emit_insn (gen_sse2_movdqu (op0, op1));
15989 return;
15990 }
15991
15992 if (TARGET_SSE2 && mode == V2DFmode)
15993 {
15994 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15995 emit_insn (gen_sse2_movupd (op0, op1));
15996 else
15997 {
15998 m = adjust_address (op0, DFmode, 0);
15999 emit_insn (gen_sse2_storelpd (m, op1));
16000 m = adjust_address (op0, DFmode, 8);
16001 emit_insn (gen_sse2_storehpd (m, op1));
16002 }
16003 }
16004 else
16005 {
16006 if (mode != V4SFmode)
16007 op1 = gen_lowpart (V4SFmode, op1);
16008
16009 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
16010 {
16011 op0 = gen_lowpart (V4SFmode, op0);
16012 emit_insn (gen_sse_movups (op0, op1));
16013 }
16014 else
16015 {
16016 m = adjust_address (op0, V2SFmode, 0);
16017 emit_insn (gen_sse_storelps (m, op1));
16018 m = adjust_address (op0, V2SFmode, 8);
16019 emit_insn (gen_sse_storehps (m, op1));
16020 }
16021 }
16022 }
16023 else
16024 gcc_unreachable ();
16025 }
16026
16027 /* Expand a push in MODE. This is some mode for which we do not support
16028 proper push instructions, at least from the registers that we expect
16029 the value to live in. */
16030
16031 void
16032 ix86_expand_push (enum machine_mode mode, rtx x)
16033 {
16034 rtx tmp;
16035
16036 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16037 GEN_INT (-GET_MODE_SIZE (mode)),
16038 stack_pointer_rtx, 1, OPTAB_DIRECT);
16039 if (tmp != stack_pointer_rtx)
16040 emit_move_insn (stack_pointer_rtx, tmp);
16041
16042 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16043
16044 /* When we push an operand onto the stack, it has to be aligned at least
16045 at the function argument boundary. However, since we don't have
16046 the argument type, we can't determine the actual argument
16047 boundary. */
16048 emit_move_insn (tmp, x);
16049 }
16050
16051 /* Helper function of ix86_fixup_binary_operands to canonicalize
16052 operand order. Returns true if the operands should be swapped. */
16053
16054 static bool
16055 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16056 rtx operands[])
16057 {
16058 rtx dst = operands[0];
16059 rtx src1 = operands[1];
16060 rtx src2 = operands[2];
16061
16062 /* If the operation is not commutative, we can't do anything. */
16063 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16064 return false;
16065
16066 /* Highest priority is that src1 should match dst. */
16067 if (rtx_equal_p (dst, src1))
16068 return false;
16069 if (rtx_equal_p (dst, src2))
16070 return true;
16071
16072 /* Next highest priority is that immediate constants come second. */
16073 if (immediate_operand (src2, mode))
16074 return false;
16075 if (immediate_operand (src1, mode))
16076 return true;
16077
16078 /* Lowest priority is that memory references should come second. */
16079 if (MEM_P (src2))
16080 return false;
16081 if (MEM_P (src1))
16082 return true;
16083
16084 return false;
16085 }
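/* Illustrative case (operand values assumed): for a commutative PLUS
   with dst = r1, src1 = a MEM and src2 = r1, dst matches src2, so this
   returns true and the caller swaps the sources; the insn then has the
   form r1 = r1 + mem, satisfying the "dst matches src1" requirement of
   the two-address patterns.  */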
16086
16087
16088 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16089 destination to use for the operation. If different from the true
16090 destination in operands[0], a copy operation will be required. */
16091
16092 rtx
16093 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16094 rtx operands[])
16095 {
16096 rtx dst = operands[0];
16097 rtx src1 = operands[1];
16098 rtx src2 = operands[2];
16099
16100 /* Canonicalize operand order. */
16101 if (ix86_swap_binary_operands_p (code, mode, operands))
16102 {
16103 rtx temp;
16104
16105 /* It is invalid to swap operands of different modes. */
16106 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16107
16108 temp = src1;
16109 src1 = src2;
16110 src2 = temp;
16111 }
16112
16113 /* Both source operands cannot be in memory. */
16114 if (MEM_P (src1) && MEM_P (src2))
16115 {
16116 /* Optimization: Only read from memory once. */
16117 if (rtx_equal_p (src1, src2))
16118 {
16119 src2 = force_reg (mode, src2);
16120 src1 = src2;
16121 }
16122 else
16123 src2 = force_reg (mode, src2);
16124 }
16125
16126 /* If the destination is memory, and we do not have matching source
16127 operands, do things in registers. */
16128 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16129 dst = gen_reg_rtx (mode);
16130
16131 /* Source 1 cannot be a constant. */
16132 if (CONSTANT_P (src1))
16133 src1 = force_reg (mode, src1);
16134
16135 /* Source 1 cannot be a non-matching memory. */
16136 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16137 src1 = force_reg (mode, src1);
16138
16139 /* Improve address combine. */
16140 if (code == PLUS
16141 && GET_MODE_CLASS (mode) == MODE_INT
16142 && MEM_P (src2))
16143 src2 = force_reg (mode, src2);
16144
16145 operands[1] = src1;
16146 operands[2] = src2;
16147 return dst;
16148 }
16149
16150 /* Similarly, but assume that the destination has already been
16151 set up properly. */
16152
16153 void
16154 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16155 enum machine_mode mode, rtx operands[])
16156 {
16157 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16158 gcc_assert (dst == operands[0]);
16159 }
16160
16161 /* Attempt to expand a binary operator. Make the expansion closer to the
16162 actual machine than just general_operand, which would allow 3 separate
16163 memory references (one output, two input) in a single insn. */
16164
16165 void
16166 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16167 rtx operands[])
16168 {
16169 rtx src1, src2, dst, op, clob;
16170
16171 dst = ix86_fixup_binary_operands (code, mode, operands);
16172 src1 = operands[1];
16173 src2 = operands[2];
16174
16175 /* Emit the instruction. */
16176
16177 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16178 if (reload_in_progress)
16179 {
16180 /* Reload doesn't know about the flags register, and doesn't know that
16181 it doesn't want to clobber it. We can only do this with PLUS. */
16182 gcc_assert (code == PLUS);
16183 emit_insn (op);
16184 }
16185 else if (reload_completed
16186 && code == PLUS
16187 && !rtx_equal_p (dst, src1))
16188 {
16189 /* This is going to be an LEA; avoid splitting it later. */
16190 emit_insn (op);
16191 }
16192 else
16193 {
16194 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16195 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16196 }
16197
16198 /* Fix up the destination if needed. */
16199 if (dst != operands[0])
16200 emit_move_insn (operands[0], dst);
16201 }
16202
16203 /* Return TRUE or FALSE depending on whether the binary operator meets the
16204 appropriate constraints. */
16205
16206 bool
16207 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16208 rtx operands[3])
16209 {
16210 rtx dst = operands[0];
16211 rtx src1 = operands[1];
16212 rtx src2 = operands[2];
16213
16214 /* Both source operands cannot be in memory. */
16215 if (MEM_P (src1) && MEM_P (src2))
16216 return false;
16217
16218 /* Canonicalize operand order for commutative operators. */
16219 if (ix86_swap_binary_operands_p (code, mode, operands))
16220 {
16221 rtx temp = src1;
16222 src1 = src2;
16223 src2 = temp;
16224 }
16225
16226 /* If the destination is memory, we must have a matching source operand. */
16227 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16228 return false;
16229
16230 /* Source 1 cannot be a constant. */
16231 if (CONSTANT_P (src1))
16232 return false;
16233
16234 /* Source 1 cannot be a non-matching memory. */
16235 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16236 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16237 return (code == AND
16238 && (mode == HImode
16239 || mode == SImode
16240 || (TARGET_64BIT && mode == DImode))
16241 && satisfies_constraint_L (src2));
16242
16243 return true;
16244 }
16245
16246 /* Attempt to expand a unary operator.  Make the expansion closer to the
16247    actual machine, rather than just general_operand, which will allow
16248    2 separate memory references (one output, one input) in a single insn.  */
16249
16250 void
16251 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16252 rtx operands[])
16253 {
16254 int matching_memory;
16255 rtx src, dst, op, clob;
16256
16257 dst = operands[0];
16258 src = operands[1];
16259
16260 /* If the destination is memory, and we do not have matching source
16261 operands, do things in registers. */
16262 matching_memory = 0;
16263 if (MEM_P (dst))
16264 {
16265 if (rtx_equal_p (dst, src))
16266 matching_memory = 1;
16267 else
16268 dst = gen_reg_rtx (mode);
16269 }
16270
16271 /* When source operand is memory, destination must match. */
16272 if (MEM_P (src) && !matching_memory)
16273 src = force_reg (mode, src);
16274
16275 /* Emit the instruction. */
16276
16277 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16278 if (reload_in_progress || code == NOT)
16279 {
16280 /* Reload doesn't know about the flags register, and doesn't know that
16281 it doesn't want to clobber it. */
16282 gcc_assert (code == NOT);
16283 emit_insn (op);
16284 }
16285 else
16286 {
16287 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16288 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16289 }
16290
16291 /* Fix up the destination if needed. */
16292 if (dst != operands[0])
16293 emit_move_insn (operands[0], dst);
16294 }
16295
16296 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16297 divisor are within the range [0-255]. */
16298
16299 void
16300 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16301 bool signed_p)
16302 {
16303 rtx end_label, qimode_label;
16304 rtx insn, div, mod;
16305 rtx scratch, tmp0, tmp1, tmp2;
16306 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16307 rtx (*gen_zero_extend) (rtx, rtx);
16308 rtx (*gen_test_ccno_1) (rtx, rtx);
16309
16310 switch (mode)
16311 {
16312 case SImode:
16313 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16314 gen_test_ccno_1 = gen_testsi_ccno_1;
16315 gen_zero_extend = gen_zero_extendqisi2;
16316 break;
16317 case DImode:
16318 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16319 gen_test_ccno_1 = gen_testdi_ccno_1;
16320 gen_zero_extend = gen_zero_extendqidi2;
16321 break;
16322 default:
16323 gcc_unreachable ();
16324 }
16325
16326 end_label = gen_label_rtx ();
16327 qimode_label = gen_label_rtx ();
16328
16329 scratch = gen_reg_rtx (mode);
16330
16331   /* Use 8bit unsigned divmod if dividend and divisor are within
16332      the range [0-255].  */
16333 emit_move_insn (scratch, operands[2]);
16334 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16335 scratch, 1, OPTAB_DIRECT);
16336 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16337 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16338 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16339 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16340 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16341 pc_rtx);
16342 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16343 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16344 JUMP_LABEL (insn) = qimode_label;
16345
16346   /* Generate original signed/unsigned divmod.  */
16347 div = gen_divmod4_1 (operands[0], operands[1],
16348 operands[2], operands[3]);
16349 emit_insn (div);
16350
16351 /* Branch to the end. */
16352 emit_jump_insn (gen_jump (end_label));
16353 emit_barrier ();
16354
16355 /* Generate 8bit unsigned divide. */
16356 emit_label (qimode_label);
16357 /* Don't use operands[0] for result of 8bit divide since not all
16358 registers support QImode ZERO_EXTRACT. */
16359 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16360 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16361 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16362 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16363
16364 if (signed_p)
16365 {
16366 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16367 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16368 }
16369 else
16370 {
16371 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16372 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16373 }
16374
16375 /* Extract remainder from AH. */
16376 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16377 if (REG_P (operands[1]))
16378 insn = emit_move_insn (operands[1], tmp1);
16379 else
16380 {
16381 /* Need a new scratch register since the old one has result
16382 of 8bit divide. */
16383 scratch = gen_reg_rtx (mode);
16384 emit_move_insn (scratch, tmp1);
16385 insn = emit_move_insn (operands[1], scratch);
16386 }
16387 set_unique_reg_note (insn, REG_EQUAL, mod);
16388
16389 /* Zero extend quotient from AL. */
16390 tmp1 = gen_lowpart (QImode, tmp0);
16391 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16392 set_unique_reg_note (insn, REG_EQUAL, div);
16393
16394 emit_label (end_label);
16395 }
16396
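/* For illustration, the split above generates control flow roughly like
   the following pseudo assembly (register names are placeholders; the
   allocator picks the real ones):

       mov     dividend, scratch
       or      divisor, scratch
       test    $-0x100, scratch
       je      .Lqimode                ; both values fit in 8 bits
       <full-width signed/unsigned divmod>
       jmp     .Lend
   .Lqimode:
       divb    divisor                 ; AL = quotient, AH = remainder
       <zero-extend quotient from AL, copy remainder from AH>
   .Lend:                                                              */
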
16397 #define LEA_MAX_STALL (3)
16398 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16399
16400 /* Increase given DISTANCE in half-cycles according to
16401    dependencies between PREV and NEXT instructions.
16402    Add 1 half-cycle if there is no dependency and
16403    go to the next cycle if there is some dependency.  */
16404
16405 static unsigned int
16406 increase_distance (rtx prev, rtx next, unsigned int distance)
16407 {
16408 df_ref *use_rec;
16409 df_ref *def_rec;
16410
16411 if (!prev || !next)
16412 return distance + (distance & 1) + 2;
16413
16414 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16415 return distance + 1;
16416
16417 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16418 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16419 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16420 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16421 return distance + (distance & 1) + 2;
16422
16423 return distance + 1;
16424 }
16425
16426 /* Function checks if instruction INSN defines register number
16427 REGNO1 or REGNO2. */
16428
16429 static bool
16430 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16431 rtx insn)
16432 {
16433 df_ref *def_rec;
16434
16435 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16436 if (DF_REF_REG_DEF_P (*def_rec)
16437 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16438 && (regno1 == DF_REF_REGNO (*def_rec)
16439 || regno2 == DF_REF_REGNO (*def_rec)))
16440 {
16441 return true;
16442 }
16443
16444 return false;
16445 }
16446
16447 /* Function checks if instruction INSN uses register number
16448 REGNO as a part of address expression. */
16449
16450 static bool
16451 insn_uses_reg_mem (unsigned int regno, rtx insn)
16452 {
16453 df_ref *use_rec;
16454
16455 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16456 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16457 return true;
16458
16459 return false;
16460 }
16461
16462 /* Search backward for a non-agu definition of register number REGNO1
16463    or register number REGNO2 in the basic block, starting from
16464    instruction START up to the head of the basic block or instruction INSN.
16465
16466    Set *FOUND to true if a definition was found and to false
16467    otherwise.
16468
16469    The distance in half-cycles between START and the found instruction,
16470    or the head of the BB, is added to DISTANCE and returned.  */
16471
16472 static int
16473 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16474 rtx insn, int distance,
16475 rtx start, bool *found)
16476 {
16477 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16478 rtx prev = start;
16479 rtx next = NULL;
16480
16481 *found = false;
16482
16483 while (prev
16484 && prev != insn
16485 && distance < LEA_SEARCH_THRESHOLD)
16486 {
16487 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16488 {
16489 distance = increase_distance (prev, next, distance);
16490 if (insn_defines_reg (regno1, regno2, prev))
16491 {
16492 if (recog_memoized (prev) < 0
16493 || get_attr_type (prev) != TYPE_LEA)
16494 {
16495 *found = true;
16496 return distance;
16497 }
16498 }
16499
16500 next = prev;
16501 }
16502 if (prev == BB_HEAD (bb))
16503 break;
16504
16505 prev = PREV_INSN (prev);
16506 }
16507
16508 return distance;
16509 }
16510
16511 /* Search backward for a non-agu definition of register number REGNO1
16512    or register number REGNO2 in INSN's basic block until we
16513    1. Pass LEA_SEARCH_THRESHOLD instructions, or
16514    2. Reach a neighbouring BB boundary, or
16515    3. Reach an agu definition.
16516    Return the distance between the non-agu definition point and INSN.
16517    If there is no definition point, return -1.  */
16518
16519 static int
16520 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16521 rtx insn)
16522 {
16523 basic_block bb = BLOCK_FOR_INSN (insn);
16524 int distance = 0;
16525 bool found = false;
16526
16527 if (insn != BB_HEAD (bb))
16528 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16529 distance, PREV_INSN (insn),
16530 &found);
16531
16532 if (!found && distance < LEA_SEARCH_THRESHOLD)
16533 {
16534 edge e;
16535 edge_iterator ei;
16536 bool simple_loop = false;
16537
16538 FOR_EACH_EDGE (e, ei, bb->preds)
16539 if (e->src == bb)
16540 {
16541 simple_loop = true;
16542 break;
16543 }
16544
16545 if (simple_loop)
16546 distance = distance_non_agu_define_in_bb (regno1, regno2,
16547 insn, distance,
16548 BB_END (bb), &found);
16549 else
16550 {
16551 int shortest_dist = -1;
16552 bool found_in_bb = false;
16553
16554 FOR_EACH_EDGE (e, ei, bb->preds)
16555 {
16556 int bb_dist
16557 = distance_non_agu_define_in_bb (regno1, regno2,
16558 insn, distance,
16559 BB_END (e->src),
16560 &found_in_bb);
16561 if (found_in_bb)
16562 {
16563 if (shortest_dist < 0)
16564 shortest_dist = bb_dist;
16565 else if (bb_dist > 0)
16566 shortest_dist = MIN (bb_dist, shortest_dist);
16567
16568 found = true;
16569 }
16570 }
16571
16572 distance = shortest_dist;
16573 }
16574 }
16575
16576 /* get_attr_type may modify recog data. We want to make sure
16577 that recog data is valid for instruction INSN, on which
16578 distance_non_agu_define is called. INSN is unchanged here. */
16579 extract_insn_cached (insn);
16580
16581 if (!found)
16582 return -1;
16583
16584 return distance >> 1;
16585 }
16586
16587 /* Return the distance in half-cycles between INSN and the next
16588    insn that uses register number REGNO in a memory address, added
16589    to DISTANCE.  Return -1 if REGNO is set before such a use.
16590
16591    Set *FOUND to true if a register usage was found and to false
16592    otherwise.
16593    Set *REDEFINED to true if a register redefinition was found and
16594    to false otherwise.  */
16595
16596 static int
16597 distance_agu_use_in_bb (unsigned int regno,
16598 rtx insn, int distance, rtx start,
16599 bool *found, bool *redefined)
16600 {
16601 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16602 rtx next = start;
16603 rtx prev = NULL;
16604
16605 *found = false;
16606 *redefined = false;
16607
16608 while (next
16609 && next != insn
16610 && distance < LEA_SEARCH_THRESHOLD)
16611 {
16612 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16613 {
16614 distance = increase_distance(prev, next, distance);
16615 if (insn_uses_reg_mem (regno, next))
16616 {
16617 /* Return DISTANCE if OP0 is used in memory
16618 address in NEXT. */
16619 *found = true;
16620 return distance;
16621 }
16622
16623 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16624 {
16625 /* Return -1 if OP0 is set in NEXT. */
16626 *redefined = true;
16627 return -1;
16628 }
16629
16630 prev = next;
16631 }
16632
16633 if (next == BB_END (bb))
16634 break;
16635
16636 next = NEXT_INSN (next);
16637 }
16638
16639 return distance;
16640 }
16641
16642 /* Return the distance between INSN and the next insn that uses
16643    register number REGNO0 in a memory address.  Return -1 if no such
16644    use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set.  */
16645
16646 static int
16647 distance_agu_use (unsigned int regno0, rtx insn)
16648 {
16649 basic_block bb = BLOCK_FOR_INSN (insn);
16650 int distance = 0;
16651 bool found = false;
16652 bool redefined = false;
16653
16654 if (insn != BB_END (bb))
16655 distance = distance_agu_use_in_bb (regno0, insn, distance,
16656 NEXT_INSN (insn),
16657 &found, &redefined);
16658
16659 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16660 {
16661 edge e;
16662 edge_iterator ei;
16663 bool simple_loop = false;
16664
16665 FOR_EACH_EDGE (e, ei, bb->succs)
16666 if (e->dest == bb)
16667 {
16668 simple_loop = true;
16669 break;
16670 }
16671
16672 if (simple_loop)
16673 distance = distance_agu_use_in_bb (regno0, insn,
16674 distance, BB_HEAD (bb),
16675 &found, &redefined);
16676 else
16677 {
16678 int shortest_dist = -1;
16679 bool found_in_bb = false;
16680 bool redefined_in_bb = false;
16681
16682 FOR_EACH_EDGE (e, ei, bb->succs)
16683 {
16684 int bb_dist
16685 = distance_agu_use_in_bb (regno0, insn,
16686 distance, BB_HEAD (e->dest),
16687 &found_in_bb, &redefined_in_bb);
16688 if (found_in_bb)
16689 {
16690 if (shortest_dist < 0)
16691 shortest_dist = bb_dist;
16692 else if (bb_dist > 0)
16693 shortest_dist = MIN (bb_dist, shortest_dist);
16694
16695 found = true;
16696 }
16697 }
16698
16699 distance = shortest_dist;
16700 }
16701 }
16702
16703 if (!found || redefined)
16704 return -1;
16705
16706 return distance >> 1;
16707 }
16708
16709 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16710    there is a choice between LEA and ADD.
16711    Negative value: ADD is preferred over LEA
16712    Zero: Neutral
16713    Positive value: LEA is preferred over ADD  */
16714 #define IX86_LEA_PRIORITY 0
16715
16716 /* Return true if using the lea INSN has a performance advantage
16717    over a sequence of instructions.  The instruction sequence has
16718    SPLIT_COST cycles higher latency than the lea latency.  */
16719
16720 bool
16721 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16722 unsigned int regno2, unsigned int split_cost)
16723 {
16724 int dist_define, dist_use;
16725
16726 dist_define = distance_non_agu_define (regno1, regno2, insn);
16727 dist_use = distance_agu_use (regno0, insn);
16728
16729 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16730 {
16731       /* If there is no non-AGU operand definition, no AGU
16732	 operand usage and the split cost is 0, then both the lea
16733	 and non-lea variants have the same priority.  Currently
16734	 we prefer lea for 64-bit code and the non-lea variant
16735	 for 32-bit code.  */
16736 if (dist_use < 0 && split_cost == 0)
16737 return TARGET_64BIT || IX86_LEA_PRIORITY;
16738 else
16739 return true;
16740 }
16741
16742   /* The longer the definition distance, the more preferable lea is.
16743      Adjust it here to take into account the splitting cost and
16744      lea priority.  */
16745 dist_define += split_cost + IX86_LEA_PRIORITY;
16746
16747   /* If there is no use in a memory address then we just check
16748      that the split cost does not exceed the AGU stall.  */
16749 if (dist_use < 0)
16750 return dist_define >= LEA_MAX_STALL;
16751
16752   /* If this insn has both a backward non-agu dependence and a forward
16753      agu dependence, the one with the shorter distance wins.  */
16754 return dist_define >= dist_use;
16755 }
16756
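/* A worked example of the final comparison above: with dist_define == 1
   (the lea input was produced one cycle earlier), split_cost == 1,
   IX86_LEA_PRIORITY == 0 and dist_use == 3, the adjusted definition
   distance is 1 + 1 + 0 == 2, 2 >= 3 is false, and the function reports
   that the lea does not win, so callers fall back to the split
   sequence.  */
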
16757 /* Return true if it is legal to clobber flags by INSN and
16758 false otherwise. */
16759
16760 static bool
16761 ix86_ok_to_clobber_flags (rtx insn)
16762 {
16763 basic_block bb = BLOCK_FOR_INSN (insn);
16764 df_ref *use;
16765 bitmap live;
16766
16767 while (insn)
16768 {
16769 if (NONDEBUG_INSN_P (insn))
16770 {
16771 for (use = DF_INSN_USES (insn); *use; use++)
16772 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16773 return false;
16774
16775 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16776 return true;
16777 }
16778
16779 if (insn == BB_END (bb))
16780 break;
16781
16782 insn = NEXT_INSN (insn);
16783 }
16784
16785 live = df_get_live_out(bb);
16786 return !REGNO_REG_SET_P (live, FLAGS_REG);
16787 }
16788
16789 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16790 move and add to avoid AGU stalls. */
16791
16792 bool
16793 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16794 {
16795 unsigned int regno0 = true_regnum (operands[0]);
16796 unsigned int regno1 = true_regnum (operands[1]);
16797 unsigned int regno2 = true_regnum (operands[2]);
16798
16799 /* Check if we need to optimize. */
16800 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16801 return false;
16802
16803 /* Check it is correct to split here. */
16804 if (!ix86_ok_to_clobber_flags(insn))
16805 return false;
16806
16807   /* We need to split only adds with a non-destructive
16808      destination operand.  */
16809 if (regno0 == regno1 || regno0 == regno2)
16810 return false;
16811 else
16812 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16813 }
16814
16815 /* Return true if we should emit lea instruction instead of mov
16816 instruction. */
16817
16818 bool
16819 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16820 {
16821 unsigned int regno0;
16822 unsigned int regno1;
16823
16824 /* Check if we need to optimize. */
16825 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16826 return false;
16827
16828 /* Use lea for reg to reg moves only. */
16829 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16830 return false;
16831
16832 regno0 = true_regnum (operands[0]);
16833 regno1 = true_regnum (operands[1]);
16834
16835 return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
16836 }
16837
16838 /* Return true if we need to split lea into a sequence of
16839 instructions to avoid AGU stalls. */
16840
16841 bool
16842 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16843 {
16844 unsigned int regno0 = true_regnum (operands[0]) ;
16845 unsigned int regno1 = -1;
16846 unsigned int regno2 = -1;
16847 unsigned int split_cost = 0;
16848 struct ix86_address parts;
16849 int ok;
16850
16851 /* Check we need to optimize. */
16852 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16853 return false;
16854
16855 /* Check it is correct to split here. */
16856 if (!ix86_ok_to_clobber_flags(insn))
16857 return false;
16858
16859 ok = ix86_decompose_address (operands[1], &parts);
16860 gcc_assert (ok);
16861
16862   /* We should not split into an add if a non-legitimate PIC
16863      operand is used as the displacement.  */
16864 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16865 return false;
16866
16867 if (parts.base)
16868 regno1 = true_regnum (parts.base);
16869 if (parts.index)
16870 regno2 = true_regnum (parts.index);
16871
16872   /* Compute how many cycles we will add to execution time
16873      if we split the lea into a sequence of instructions.  */
16874 if (parts.base || parts.index)
16875 {
16876       /* Have to use a mov instruction if the non-destructive
16877	 destination form is used.  */
16878 if (regno1 != regno0 && regno2 != regno0)
16879 split_cost += 1;
16880
16881 /* Have to add index to base if both exist. */
16882 if (parts.base && parts.index)
16883 split_cost += 1;
16884
16885 /* Have to use shift and adds if scale is 2 or greater. */
16886 if (parts.scale > 1)
16887 {
16888 if (regno0 != regno1)
16889 split_cost += 1;
16890 else if (regno2 == regno0)
16891 split_cost += 4;
16892 else
16893 split_cost += parts.scale;
16894 }
16895
16896       /* Have to use an add instruction with an immediate if
16897	 disp is non-zero.  */
16898 if (parts.disp && parts.disp != const0_rtx)
16899 split_cost += 1;
16900
16901 /* Subtract the price of lea. */
16902 split_cost -= 1;
16903 }
16904
16905 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16906 }
16907
16908 /* Emit the x86 binary operator CODE in mode MODE, where the first operand
16909    matches the destination.  The emitted RTX includes a clobber of FLAGS_REG.  */
16910
16911 static void
16912 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16913 rtx dst, rtx src)
16914 {
16915 rtx op, clob;
16916
16917 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16918 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16919
16920 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16921 }
16922
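/* For example, ix86_emit_binop (PLUS, SImode, dst, src) emits

     (parallel [(set dst (plus:SI dst src))
                (clobber (reg:CC flags))])

   i.e. the destructive two-operand form used by the lea splitting
   code below.  */
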
16923 /* Split a lea instruction into a sequence of instructions
16924    which are executed on the ALU to avoid AGU stalls.
16925    It is assumed that the flags register may be clobbered
16926    at the lea position.  */
16927
16928 extern void
16929 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16930 {
16931 unsigned int regno0 = true_regnum (operands[0]) ;
16932 unsigned int regno1 = INVALID_REGNUM;
16933 unsigned int regno2 = INVALID_REGNUM;
16934 struct ix86_address parts;
16935 rtx tmp;
16936 int ok, adds;
16937
16938 ok = ix86_decompose_address (operands[1], &parts);
16939 gcc_assert (ok);
16940
16941 if (parts.base)
16942 {
16943 if (GET_MODE (parts.base) != mode)
16944 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16945 regno1 = true_regnum (parts.base);
16946 }
16947
16948 if (parts.index)
16949 {
16950 if (GET_MODE (parts.index) != mode)
16951 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16952 regno2 = true_regnum (parts.index);
16953 }
16954
16955 if (parts.scale > 1)
16956 {
16957 /* Case r1 = r1 + ... */
16958 if (regno1 == regno0)
16959 {
16960 	  /* In the case r1 = r1 + C * r1 we would have to
16961	     use multiplication, which is very expensive.
16962	     Assume the cost model is wrong if we get such
16963	     a case here.  */
16964 gcc_assert (regno2 != regno0);
16965
16966 for (adds = parts.scale; adds > 0; adds--)
16967 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16968 }
16969 else
16970 {
16971 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
16972 if (regno0 != regno2)
16973 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16974
16975 /* Use shift for scaling. */
16976 ix86_emit_binop (ASHIFT, mode, operands[0],
16977 GEN_INT (exact_log2 (parts.scale)));
16978
16979 if (parts.base)
16980 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
16981
16982 if (parts.disp && parts.disp != const0_rtx)
16983 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16984 }
16985 }
16986 else if (!parts.base && !parts.index)
16987 {
16988 gcc_assert(parts.disp);
16989 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
16990 }
16991 else
16992 {
16993 if (!parts.base)
16994 {
16995 if (regno0 != regno2)
16996 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16997 }
16998 else if (!parts.index)
16999 {
17000 if (regno0 != regno1)
17001 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
17002 }
17003 else
17004 {
17005 if (regno0 == regno1)
17006 tmp = parts.index;
17007 else if (regno0 == regno2)
17008 tmp = parts.base;
17009 else
17010 {
17011 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
17012 tmp = parts.index;
17013 }
17014
17015 ix86_emit_binop (PLUS, mode, operands[0], tmp);
17016 }
17017
17018 if (parts.disp && parts.disp != const0_rtx)
17019 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
17020 }
17021 }
17022
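/* For illustration, splitting an SImode lea whose address decomposes as
   base + index*4 + 16, with a destination register distinct from both
   base and index, emits roughly (a pseudo-assembly sketch):

       mov     index, dest
       shl     $2, dest
       add     base, dest
       add     $16, dest

   mirroring the mov / ASHIFT / PLUS / PLUS sequence above.  */
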
17023 /* Return true if it is ok to optimize an ADD operation into an LEA
17024    operation to avoid flag register consumption.  For most processors,
17025    ADD is faster than LEA.  For processors like ATOM, if the
17026    destination register of the LEA holds an actual address which will
17027    be used soon, LEA is better, and otherwise ADD is better.  */
17028
17029 bool
17030 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17031 {
17032 unsigned int regno0 = true_regnum (operands[0]);
17033 unsigned int regno1 = true_regnum (operands[1]);
17034 unsigned int regno2 = true_regnum (operands[2]);
17035
17036   /* If a = b + c with a != b and a != c, we must use the lea form.  */
17037 if (regno0 != regno1 && regno0 != regno2)
17038 return true;
17039
17040 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17041 return false;
17042
17043 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17044 }
17045
17046 /* Return true if destination reg of SET_BODY is shift count of
17047 USE_BODY. */
17048
17049 static bool
17050 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17051 {
17052 rtx set_dest;
17053 rtx shift_rtx;
17054 int i;
17055
17056 /* Retrieve destination of SET_BODY. */
17057 switch (GET_CODE (set_body))
17058 {
17059 case SET:
17060 set_dest = SET_DEST (set_body);
17061 if (!set_dest || !REG_P (set_dest))
17062 return false;
17063 break;
17064 case PARALLEL:
17065 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17066 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17067 use_body))
17068 return true;
17069 default:
17070 return false;
17071 break;
17072 }
17073
17074 /* Retrieve shift count of USE_BODY. */
17075 switch (GET_CODE (use_body))
17076 {
17077 case SET:
17078 shift_rtx = XEXP (use_body, 1);
17079 break;
17080 case PARALLEL:
17081 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17082 if (ix86_dep_by_shift_count_body (set_body,
17083 XVECEXP (use_body, 0, i)))
17084 return true;
17085 default:
17086 return false;
17087 break;
17088 }
17089
17090 if (shift_rtx
17091 && (GET_CODE (shift_rtx) == ASHIFT
17092 || GET_CODE (shift_rtx) == LSHIFTRT
17093 || GET_CODE (shift_rtx) == ASHIFTRT
17094 || GET_CODE (shift_rtx) == ROTATE
17095 || GET_CODE (shift_rtx) == ROTATERT))
17096 {
17097 rtx shift_count = XEXP (shift_rtx, 1);
17098
17099 /* Return true if shift count is dest of SET_BODY. */
17100 if (REG_P (shift_count)
17101 && true_regnum (set_dest) == true_regnum (shift_count))
17102 return true;
17103 }
17104
17105 return false;
17106 }
17107
17108 /* Return true if destination reg of SET_INSN is shift count of
17109 USE_INSN. */
17110
17111 bool
17112 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17113 {
17114 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17115 PATTERN (use_insn));
17116 }
17117
17118 /* Return TRUE or FALSE depending on whether the unary operator meets the
17119 appropriate constraints. */
17120
17121 bool
17122 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17123 enum machine_mode mode ATTRIBUTE_UNUSED,
17124 rtx operands[2] ATTRIBUTE_UNUSED)
17125 {
17126 /* If one of operands is memory, source and destination must match. */
17127 if ((MEM_P (operands[0])
17128 || MEM_P (operands[1]))
17129 && ! rtx_equal_p (operands[0], operands[1]))
17130 return false;
17131 return true;
17132 }
17133
17134 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17135 are ok, keeping in mind the possible movddup alternative. */
17136
17137 bool
17138 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17139 {
17140 if (MEM_P (operands[0]))
17141 return rtx_equal_p (operands[0], operands[1 + high]);
17142 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17143 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17144 return true;
17145 }
17146
17147 /* Post-reload splitter for converting an SF or DFmode value in an
17148 SSE register into an unsigned SImode. */
17149
17150 void
17151 ix86_split_convert_uns_si_sse (rtx operands[])
17152 {
17153 enum machine_mode vecmode;
17154 rtx value, large, zero_or_two31, input, two31, x;
17155
17156 large = operands[1];
17157 zero_or_two31 = operands[2];
17158 input = operands[3];
17159 two31 = operands[4];
17160 vecmode = GET_MODE (large);
17161 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17162
17163 /* Load up the value into the low element. We must ensure that the other
17164 elements are valid floats -- zero is the easiest such value. */
17165 if (MEM_P (input))
17166 {
17167 if (vecmode == V4SFmode)
17168 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17169 else
17170 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17171 }
17172 else
17173 {
17174 input = gen_rtx_REG (vecmode, REGNO (input));
17175 emit_move_insn (value, CONST0_RTX (vecmode));
17176 if (vecmode == V4SFmode)
17177 emit_insn (gen_sse_movss (value, value, input));
17178 else
17179 emit_insn (gen_sse2_movsd (value, value, input));
17180 }
17181
17182 emit_move_insn (large, two31);
17183 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17184
17185 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17186 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17187
17188 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17189 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17190
17191 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17192 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17193
17194 large = gen_rtx_REG (V4SImode, REGNO (large));
17195 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17196
17197 x = gen_rtx_REG (V4SImode, REGNO (value));
17198 if (vecmode == V4SFmode)
17199 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17200 else
17201 emit_insn (gen_sse2_cvttpd2dq (x, value));
17202 value = x;
17203
17204 emit_insn (gen_xorv4si3 (value, value, large));
17205 }
17206
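/* In effect the split above computes, per element,

     (unsigned int) x  =  x < 0x1p31 ? (int) x
                                     : ((int) (x - 0x1p31)) ^ 0x80000000

   entirely in SSE registers: the LE comparison builds an all-ones mask
   for the large case, that mask selects 0 or 2**31 for the subtraction,
   and the same mask shifted left by 31 provides the bit for the final
   XOR.  */
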
17207 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17208 Expects the 64-bit DImode to be supplied in a pair of integral
17209 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17210 -mfpmath=sse, !optimize_size only. */
17211
17212 void
17213 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17214 {
17215 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17216 rtx int_xmm, fp_xmm;
17217 rtx biases, exponents;
17218 rtx x;
17219
17220 int_xmm = gen_reg_rtx (V4SImode);
17221 if (TARGET_INTER_UNIT_MOVES)
17222 emit_insn (gen_movdi_to_sse (int_xmm, input));
17223 else if (TARGET_SSE_SPLIT_REGS)
17224 {
17225 emit_clobber (int_xmm);
17226 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17227 }
17228 else
17229 {
17230 x = gen_reg_rtx (V2DImode);
17231 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17232 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17233 }
17234
17235 x = gen_rtx_CONST_VECTOR (V4SImode,
17236 gen_rtvec (4, GEN_INT (0x43300000UL),
17237 GEN_INT (0x45300000UL),
17238 const0_rtx, const0_rtx));
17239 exponents = validize_mem (force_const_mem (V4SImode, x));
17240
17241 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17242 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17243
17244 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17245 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17246 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17247 (0x1.0p84 + double(fp_value_hi_xmm)).
17248 Note these exponents differ by 32. */
17249
17250 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17251
17252 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17253 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17254 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17255 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17256 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17257 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17258 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17259 biases = validize_mem (force_const_mem (V2DFmode, biases));
17260 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17261
17262 /* Add the upper and lower DFmode values together. */
17263 if (TARGET_SSE3)
17264 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17265 else
17266 {
17267 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17268 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17269 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17270 }
17271
17272 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17273 }
17274
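/* A worked example of the exponent-bias trick above: for the input
   0x100000005 (2**32 + 5) the interleave builds the doubles
   0x1.0p52 + 5 (low half) and 0x1.0p84 + 1 * 2**32 (high half);
   subtracting the 0x1.0p52 / 0x1.0p84 biases leaves exactly 5.0 and
   4294967296.0, and the final add yields 4294967301.0.  */
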
17275 /* Not used, but eases macroization of patterns. */
17276 void
17277 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17278 rtx input ATTRIBUTE_UNUSED)
17279 {
17280 gcc_unreachable ();
17281 }
17282
17283 /* Convert an unsigned SImode value into a DFmode. Only currently used
17284 for SSE, but applicable anywhere. */
17285
17286 void
17287 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17288 {
17289 REAL_VALUE_TYPE TWO31r;
17290 rtx x, fp;
17291
17292 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17293 NULL, 1, OPTAB_DIRECT);
17294
17295 fp = gen_reg_rtx (DFmode);
17296 emit_insn (gen_floatsidf2 (fp, x));
17297
17298 real_ldexp (&TWO31r, &dconst1, 31);
17299 x = const_double_from_real_value (TWO31r, DFmode);
17300
17301 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17302 if (x != target)
17303 emit_move_insn (target, x);
17304 }
17305
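/* The PLUS with -2**31 recentres the unsigned input into the signed
   SImode range so that the signed int-to-double conversion can be used;
   the bias is then added back in DFmode.  E.g. for input 3000000000 the
   intermediate signed value is 852516352, and 852516352.0 +
   2147483648.0 restores 3000000000.0 exactly, since every 32-bit
   integer is representable in a double.  */
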
17306 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17307 32-bit mode; otherwise we have a direct convert instruction. */
17308
17309 void
17310 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17311 {
17312 REAL_VALUE_TYPE TWO32r;
17313 rtx fp_lo, fp_hi, x;
17314
17315 fp_lo = gen_reg_rtx (DFmode);
17316 fp_hi = gen_reg_rtx (DFmode);
17317
17318 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17319
17320 real_ldexp (&TWO32r, &dconst1, 32);
17321 x = const_double_from_real_value (TWO32r, DFmode);
17322 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17323
17324 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17325
17326 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17327 0, OPTAB_DIRECT);
17328 if (x != target)
17329 emit_move_insn (target, x);
17330 }
17331
17332 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17333 For x86_32, -mfpmath=sse, !optimize_size only. */
17334 void
17335 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17336 {
17337 REAL_VALUE_TYPE ONE16r;
17338 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17339
17340 real_ldexp (&ONE16r, &dconst1, 16);
17341 x = const_double_from_real_value (ONE16r, SFmode);
17342 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17343 NULL, 0, OPTAB_DIRECT);
17344 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17345 NULL, 0, OPTAB_DIRECT);
17346 fp_hi = gen_reg_rtx (SFmode);
17347 fp_lo = gen_reg_rtx (SFmode);
17348 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17349 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17350 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17351 0, OPTAB_DIRECT);
17352 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17353 0, OPTAB_DIRECT);
17354 if (!rtx_equal_p (target, fp_hi))
17355 emit_move_insn (target, fp_hi);
17356 }
17357
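/* The input is split into 16-bit halves so that each half converts to
   SFmode exactly.  E.g. for 0x12345678 the halves are 0x1234 and 0x5678,
   and the result is formed as 4660.0f * 0x1p16 + 22136.0f, with only the
   final addition subject to single-precision rounding.  */
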
17358 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17359 a vector of unsigned ints VAL to vector of floats TARGET. */
17360
17361 void
17362 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17363 {
17364 rtx tmp[8];
17365 REAL_VALUE_TYPE TWO16r;
17366 enum machine_mode intmode = GET_MODE (val);
17367 enum machine_mode fltmode = GET_MODE (target);
17368 rtx (*cvt) (rtx, rtx);
17369
17370 if (intmode == V4SImode)
17371 cvt = gen_floatv4siv4sf2;
17372 else
17373 cvt = gen_floatv8siv8sf2;
17374 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17375 tmp[0] = force_reg (intmode, tmp[0]);
17376 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17377 OPTAB_DIRECT);
17378 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17379 NULL_RTX, 1, OPTAB_DIRECT);
17380 tmp[3] = gen_reg_rtx (fltmode);
17381 emit_insn (cvt (tmp[3], tmp[1]));
17382 tmp[4] = gen_reg_rtx (fltmode);
17383 emit_insn (cvt (tmp[4], tmp[2]));
17384 real_ldexp (&TWO16r, &dconst1, 16);
17385 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17386 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17387 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17388 OPTAB_DIRECT);
17389 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17390 OPTAB_DIRECT);
17391 if (tmp[7] != target)
17392 emit_move_insn (target, tmp[7]);
17393 }
17394
17395 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17396 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17397 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17398 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17399
17400 rtx
17401 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17402 {
17403 REAL_VALUE_TYPE TWO31r;
17404 rtx two31r, tmp[4];
17405 enum machine_mode mode = GET_MODE (val);
17406 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17407 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17408 rtx (*cmp) (rtx, rtx, rtx, rtx);
17409 int i;
17410
17411 for (i = 0; i < 3; i++)
17412 tmp[i] = gen_reg_rtx (mode);
17413 real_ldexp (&TWO31r, &dconst1, 31);
17414 two31r = const_double_from_real_value (TWO31r, scalarmode);
17415 two31r = ix86_build_const_vector (mode, 1, two31r);
17416 two31r = force_reg (mode, two31r);
17417 switch (mode)
17418 {
17419 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17420 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17421 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17422 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17423 default: gcc_unreachable ();
17424 }
17425 tmp[3] = gen_rtx_LE (mode, two31r, val);
17426 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17427 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17428 0, OPTAB_DIRECT);
17429 if (intmode == V4SImode || TARGET_AVX2)
17430 *xorp = expand_simple_binop (intmode, ASHIFT,
17431 gen_lowpart (intmode, tmp[0]),
17432 GEN_INT (31), NULL_RTX, 0,
17433 OPTAB_DIRECT);
17434 else
17435 {
17436 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17437 two31 = ix86_build_const_vector (intmode, 1, two31);
17438 *xorp = expand_simple_binop (intmode, AND,
17439 gen_lowpart (intmode, tmp[0]),
17440 two31, NULL_RTX, 0,
17441 OPTAB_DIRECT);
17442 }
17443 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17444 0, OPTAB_DIRECT);
17445 }
17446
17447 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17448 then replicate the value for all elements of the vector
17449 register. */
17450
17451 rtx
17452 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17453 {
17454 int i, n_elt;
17455 rtvec v;
17456 enum machine_mode scalar_mode;
17457
17458 switch (mode)
17459 {
17460 case V32QImode:
17461 case V16QImode:
17462 case V16HImode:
17463 case V8HImode:
17464 case V8SImode:
17465 case V4SImode:
17466 case V4DImode:
17467 case V2DImode:
17468 gcc_assert (vect);
17469 case V8SFmode:
17470 case V4SFmode:
17471 case V4DFmode:
17472 case V2DFmode:
17473 n_elt = GET_MODE_NUNITS (mode);
17474 v = rtvec_alloc (n_elt);
17475 scalar_mode = GET_MODE_INNER (mode);
17476
17477 RTVEC_ELT (v, 0) = value;
17478
17479 for (i = 1; i < n_elt; ++i)
17480 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17481
17482 return gen_rtx_CONST_VECTOR (mode, v);
17483
17484 default:
17485 gcc_unreachable ();
17486 }
17487 }
17488
17489 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17490 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17491 for an SSE register. If VECT is true, then replicate the mask for
17492 all elements of the vector register. If INVERT is true, then create
17493 a mask excluding the sign bit. */
17494
17495 rtx
17496 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17497 {
17498 enum machine_mode vec_mode, imode;
17499 HOST_WIDE_INT hi, lo;
17500 int shift = 63;
17501 rtx v;
17502 rtx mask;
17503
17504 /* Find the sign bit, sign extended to 2*HWI. */
17505 switch (mode)
17506 {
17507 case V8SImode:
17508 case V4SImode:
17509 case V8SFmode:
17510 case V4SFmode:
17511 vec_mode = mode;
17512 mode = GET_MODE_INNER (mode);
17513 imode = SImode;
17514 lo = 0x80000000, hi = lo < 0;
17515 break;
17516
17517 case V4DImode:
17518 case V2DImode:
17519 case V4DFmode:
17520 case V2DFmode:
17521 vec_mode = mode;
17522 mode = GET_MODE_INNER (mode);
17523 imode = DImode;
17524 if (HOST_BITS_PER_WIDE_INT >= 64)
17525 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17526 else
17527 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17528 break;
17529
17530 case TImode:
17531 case TFmode:
17532 vec_mode = VOIDmode;
17533 if (HOST_BITS_PER_WIDE_INT >= 64)
17534 {
17535 imode = TImode;
17536 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17537 }
17538 else
17539 {
17540 rtvec vec;
17541
17542 imode = DImode;
17543 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17544
17545 if (invert)
17546 {
17547 lo = ~lo, hi = ~hi;
17548 v = constm1_rtx;
17549 }
17550 else
17551 v = const0_rtx;
17552
17553 mask = immed_double_const (lo, hi, imode);
17554
17555 vec = gen_rtvec (2, v, mask);
17556 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17557 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17558
17559 return v;
17560 }
17561 break;
17562
17563 default:
17564 gcc_unreachable ();
17565 }
17566
17567 if (invert)
17568 lo = ~lo, hi = ~hi;
17569
17570 /* Force this value into the low part of a fp vector constant. */
17571 mask = immed_double_const (lo, hi, imode);
17572 mask = gen_lowpart (mode, mask);
17573
17574 if (vec_mode == VOIDmode)
17575 return force_reg (mode, mask);
17576
17577 v = ix86_build_const_vector (vec_mode, vect, mask);
17578 return force_reg (vec_mode, v);
17579 }
17580
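/* For example, ix86_build_signbit_mask (V4SFmode, true, false) yields
   the constant { -0.0, -0.0, -0.0, -0.0 } (bit pattern 0x80000000 in
   every lane), while the inverted variant holds 0x7fffffff per lane;
   the absneg expander below builds the inverted mask for ABS (the sign
   bit is cleared with an AND) and the plain mask for NEG (the sign bit
   is flipped with an XOR).  */
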
17581 /* Generate code for floating point ABS or NEG. */
17582
17583 void
17584 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17585 rtx operands[])
17586 {
17587 rtx mask, set, dst, src;
17588 bool use_sse = false;
17589 bool vector_mode = VECTOR_MODE_P (mode);
17590 enum machine_mode vmode = mode;
17591
17592 if (vector_mode)
17593 use_sse = true;
17594 else if (mode == TFmode)
17595 use_sse = true;
17596 else if (TARGET_SSE_MATH)
17597 {
17598 use_sse = SSE_FLOAT_MODE_P (mode);
17599 if (mode == SFmode)
17600 vmode = V4SFmode;
17601 else if (mode == DFmode)
17602 vmode = V2DFmode;
17603 }
17604
17605 /* NEG and ABS performed with SSE use bitwise mask operations.
17606 Create the appropriate mask now. */
17607 if (use_sse)
17608 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17609 else
17610 mask = NULL_RTX;
17611
17612 dst = operands[0];
17613 src = operands[1];
17614
17615 set = gen_rtx_fmt_e (code, mode, src);
17616 set = gen_rtx_SET (VOIDmode, dst, set);
17617
17618 if (mask)
17619 {
17620 rtx use, clob;
17621 rtvec par;
17622
17623 use = gen_rtx_USE (VOIDmode, mask);
17624 if (vector_mode)
17625 par = gen_rtvec (2, set, use);
17626 else
17627 {
17628 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17629 par = gen_rtvec (3, set, use, clob);
17630 }
17631 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17632 }
17633 else
17634 emit_insn (set);
17635 }
17636
17637 /* Expand a copysign operation. Special case operand 0 being a constant. */
17638
17639 void
17640 ix86_expand_copysign (rtx operands[])
17641 {
17642 enum machine_mode mode, vmode;
17643 rtx dest, op0, op1, mask, nmask;
17644
17645 dest = operands[0];
17646 op0 = operands[1];
17647 op1 = operands[2];
17648
17649 mode = GET_MODE (dest);
17650
17651 if (mode == SFmode)
17652 vmode = V4SFmode;
17653 else if (mode == DFmode)
17654 vmode = V2DFmode;
17655 else
17656 vmode = mode;
17657
17658 if (GET_CODE (op0) == CONST_DOUBLE)
17659 {
17660 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17661
17662 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17663 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17664
17665 if (mode == SFmode || mode == DFmode)
17666 {
17667 if (op0 == CONST0_RTX (mode))
17668 op0 = CONST0_RTX (vmode);
17669 else
17670 {
17671 rtx v = ix86_build_const_vector (vmode, false, op0);
17672
17673 op0 = force_reg (vmode, v);
17674 }
17675 }
17676 else if (op0 != CONST0_RTX (mode))
17677 op0 = force_reg (mode, op0);
17678
17679 mask = ix86_build_signbit_mask (vmode, 0, 0);
17680
17681 if (mode == SFmode)
17682 copysign_insn = gen_copysignsf3_const;
17683 else if (mode == DFmode)
17684 copysign_insn = gen_copysigndf3_const;
17685 else
17686 copysign_insn = gen_copysigntf3_const;
17687
17688 emit_insn (copysign_insn (dest, op0, op1, mask));
17689 }
17690 else
17691 {
17692 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17693
17694 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17695 mask = ix86_build_signbit_mask (vmode, 0, 0);
17696
17697 if (mode == SFmode)
17698 copysign_insn = gen_copysignsf3_var;
17699 else if (mode == DFmode)
17700 copysign_insn = gen_copysigndf3_var;
17701 else
17702 copysign_insn = gen_copysigntf3_var;
17703
17704 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17705 }
17706 }
17707
17708 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17709 be a constant, and so has already been expanded into a vector constant. */
17710
17711 void
17712 ix86_split_copysign_const (rtx operands[])
17713 {
17714 enum machine_mode mode, vmode;
17715 rtx dest, op0, mask, x;
17716
17717 dest = operands[0];
17718 op0 = operands[1];
17719 mask = operands[3];
17720
17721 mode = GET_MODE (dest);
17722 vmode = GET_MODE (mask);
17723
17724 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17725 x = gen_rtx_AND (vmode, dest, mask);
17726 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17727
17728 if (op0 != CONST0_RTX (vmode))
17729 {
17730 x = gen_rtx_IOR (vmode, dest, op0);
17731 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17732 }
17733 }
17734
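/* E.g. for copysign (2.0, y) in DFmode, the constant magnitude operand
   (operands[1] here) is the vector constant { 2.0, 0.0 } and MASK holds
   the sign bits, so the two SETs above compute
   dest = (y & signbit) | 2.0 -- the insn pattern constrains dest to
   start out holding Y, giving the magnitude of the constant with the
   sign of Y.  */
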
17735 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17736 so we have to do two masks. */
17737
17738 void
17739 ix86_split_copysign_var (rtx operands[])
17740 {
17741 enum machine_mode mode, vmode;
17742 rtx dest, scratch, op0, op1, mask, nmask, x;
17743
17744 dest = operands[0];
17745 scratch = operands[1];
17746 op0 = operands[2];
17747 op1 = operands[3];
17748 nmask = operands[4];
17749 mask = operands[5];
17750
17751 mode = GET_MODE (dest);
17752 vmode = GET_MODE (mask);
17753
17754 if (rtx_equal_p (op0, op1))
17755 {
17756 /* Shouldn't happen often (it's useless, obviously), but when it does
17757 we'd generate incorrect code if we continue below. */
17758 emit_move_insn (dest, op0);
17759 return;
17760 }
17761
17762 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17763 {
17764 gcc_assert (REGNO (op1) == REGNO (scratch));
17765
17766 x = gen_rtx_AND (vmode, scratch, mask);
17767 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17768
17769 dest = mask;
17770 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17771 x = gen_rtx_NOT (vmode, dest);
17772 x = gen_rtx_AND (vmode, x, op0);
17773 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17774 }
17775 else
17776 {
17777 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17778 {
17779 x = gen_rtx_AND (vmode, scratch, mask);
17780 }
17781 else /* alternative 2,4 */
17782 {
17783 gcc_assert (REGNO (mask) == REGNO (scratch));
17784 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17785 x = gen_rtx_AND (vmode, scratch, op1);
17786 }
17787 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17788
17789 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17790 {
17791 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17792 x = gen_rtx_AND (vmode, dest, nmask);
17793 }
17794 else /* alternative 3,4 */
17795 {
17796 gcc_assert (REGNO (nmask) == REGNO (dest));
17797 dest = nmask;
17798 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17799 x = gen_rtx_AND (vmode, dest, op0);
17800 }
17801 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17802 }
17803
17804 x = gen_rtx_IOR (vmode, dest, scratch);
17805 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17806 }
17807
17808 /* Return TRUE or FALSE depending on whether the first SET in INSN
17809 has source and destination with matching CC modes, and that the
17810 CC mode is at least as constrained as REQ_MODE. */
17811
17812 bool
17813 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17814 {
17815 rtx set;
17816 enum machine_mode set_mode;
17817
17818 set = PATTERN (insn);
17819 if (GET_CODE (set) == PARALLEL)
17820 set = XVECEXP (set, 0, 0);
17821 gcc_assert (GET_CODE (set) == SET);
17822 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17823
17824 set_mode = GET_MODE (SET_DEST (set));
17825 switch (set_mode)
17826 {
17827 case CCNOmode:
17828 if (req_mode != CCNOmode
17829 && (req_mode != CCmode
17830 || XEXP (SET_SRC (set), 1) != const0_rtx))
17831 return false;
17832 break;
17833 case CCmode:
17834 if (req_mode == CCGCmode)
17835 return false;
17836 /* FALLTHRU */
17837 case CCGCmode:
17838 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17839 return false;
17840 /* FALLTHRU */
17841 case CCGOCmode:
17842 if (req_mode == CCZmode)
17843 return false;
17844 /* FALLTHRU */
17845 case CCZmode:
17846 break;
17847
17848 case CCAmode:
17849 case CCCmode:
17850 case CCOmode:
17851 case CCSmode:
17852 if (set_mode != req_mode)
17853 return false;
17854 break;
17855
17856 default:
17857 gcc_unreachable ();
17858 }
17859
17860 return GET_MODE (SET_SRC (set)) == set_mode;
17861 }
17862
17863 /* Generate insn patterns to do an integer compare of OPERANDS. */
17864
17865 static rtx
17866 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17867 {
17868 enum machine_mode cmpmode;
17869 rtx tmp, flags;
17870
17871 cmpmode = SELECT_CC_MODE (code, op0, op1);
17872 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17873
17874 /* This is very simple, but making the interface the same as in the
17875 FP case makes the rest of the code easier. */
17876 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17877 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17878
17879 /* Return the test that should be put into the flags user, i.e.
17880 the bcc, scc, or cmov instruction. */
17881 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17882 }
17883
17884 /* Figure out whether to use ordered or unordered fp comparisons.
17885 Return the appropriate mode to use. */
17886
17887 enum machine_mode
17888 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17889 {
17890   /* ??? In order to make all comparisons reversible, we do all comparisons
17891      non-trapping when compiling for IEEE.  Once gcc is able to distinguish
17892      all forms of trapping and nontrapping comparisons, we can make inequality
17893      comparisons trapping again, since it results in better code when using
17894      FCOM based compares.  */
17895 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17896 }
17897
17898 enum machine_mode
17899 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17900 {
17901 enum machine_mode mode = GET_MODE (op0);
17902
17903 if (SCALAR_FLOAT_MODE_P (mode))
17904 {
17905 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17906 return ix86_fp_compare_mode (code);
17907 }
17908
17909 switch (code)
17910 {
17911 /* Only zero flag is needed. */
17912 case EQ: /* ZF=0 */
17913 case NE: /* ZF!=0 */
17914 return CCZmode;
17915 /* Codes needing carry flag. */
17916 case GEU: /* CF=0 */
17917 case LTU: /* CF=1 */
17918 /* Detect overflow checks. They need just the carry flag. */
17919 if (GET_CODE (op0) == PLUS
17920 && rtx_equal_p (op1, XEXP (op0, 0)))
17921 return CCCmode;
17922 else
17923 return CCmode;
17924 case GTU: /* CF=0 & ZF=0 */
17925 case LEU: /* CF=1 | ZF=1 */
17926 /* Detect overflow checks. They need just the carry flag. */
17927 if (GET_CODE (op0) == MINUS
17928 && rtx_equal_p (op1, XEXP (op0, 0)))
17929 return CCCmode;
17930 else
17931 return CCmode;
17932 /* Codes possibly doable only with sign flag when
17933 comparing against zero. */
17934 case GE: /* SF=OF or SF=0 */
17935 case LT: /* SF<>OF or SF=1 */
17936 if (op1 == const0_rtx)
17937 return CCGOCmode;
17938 else
17939 /* For other cases Carry flag is not required. */
17940 return CCGCmode;
17941     /* Codes doable only with the sign flag when comparing
17942        against zero, but we lack a jump instruction for them,
17943        so we need to use relational tests against overflow,
17944        which thus needs to be zero.  */
17945 case GT: /* ZF=0 & SF=OF */
17946 case LE: /* ZF=1 | SF<>OF */
17947 if (op1 == const0_rtx)
17948 return CCNOmode;
17949 else
17950 return CCGCmode;
17951     /* The strcmp pattern does (use flags), and combine may ask us for
17952        the proper mode.  */
17953 case USE:
17954 return CCmode;
17955 default:
17956 gcc_unreachable ();
17957 }
17958 }
17959
17960 /* Return the fixed registers used for condition codes. */
17961
17962 static bool
17963 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17964 {
17965 *p1 = FLAGS_REG;
17966 *p2 = FPSR_REG;
17967 return true;
17968 }
17969
17970 /* If two condition code modes are compatible, return a condition code
17971 mode which is compatible with both. Otherwise, return
17972 VOIDmode. */
17973
17974 static enum machine_mode
17975 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
17976 {
17977 if (m1 == m2)
17978 return m1;
17979
17980 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
17981 return VOIDmode;
17982
17983 if ((m1 == CCGCmode && m2 == CCGOCmode)
17984 || (m1 == CCGOCmode && m2 == CCGCmode))
17985 return CCGCmode;
17986
17987 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
17988 return m2;
17989 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
17990 return m1;
17991
17992 switch (m1)
17993 {
17994 default:
17995 gcc_unreachable ();
17996
17997 case CCmode:
17998 case CCGCmode:
17999 case CCGOCmode:
18000 case CCNOmode:
18001 case CCAmode:
18002 case CCCmode:
18003 case CCOmode:
18004 case CCSmode:
18005 case CCZmode:
18006 switch (m2)
18007 {
18008 default:
18009 return VOIDmode;
18010
18011 case CCmode:
18012 case CCGCmode:
18013 case CCGOCmode:
18014 case CCNOmode:
18015 case CCAmode:
18016 case CCCmode:
18017 case CCOmode:
18018 case CCSmode:
18019 case CCZmode:
18020 return CCmode;
18021 }
18022
18023 case CCFPmode:
18024 case CCFPUmode:
18025 /* These are only compatible with themselves, which we already
18026 checked above. */
18027 return VOIDmode;
18028 }
18029 }
18030
18031
18032 /* Return a comparison we can do that is equivalent to
18033    swap_condition (code), apart possibly from orderedness.
18034    But never change orderedness if TARGET_IEEE_FP, returning
18035    UNKNOWN in that case if necessary.  */
18036
18037 static enum rtx_code
18038 ix86_fp_swap_condition (enum rtx_code code)
18039 {
18040 switch (code)
18041 {
18042 case GT: /* GTU - CF=0 & ZF=0 */
18043 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18044 case GE: /* GEU - CF=0 */
18045 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18046 case UNLT: /* LTU - CF=1 */
18047 return TARGET_IEEE_FP ? UNKNOWN : GT;
18048 case UNLE: /* LEU - CF=1 | ZF=1 */
18049 return TARGET_IEEE_FP ? UNKNOWN : GE;
18050 default:
18051 return swap_condition (code);
18052 }
18053 }
18054
18055 /* Return the cost of comparison CODE using the best strategy for performance.
18056    All of the following functions use the number of instructions as a cost metric.
18057    In the future this should be tweaked to compute bytes for optimize_size and
18058    take into account the performance of various instructions on various CPUs.  */
18059
18060 static int
18061 ix86_fp_comparison_cost (enum rtx_code code)
18062 {
18063 int arith_cost;
18064
18065 /* The cost of code using bit-twiddling on %ah. */
18066 switch (code)
18067 {
18068 case UNLE:
18069 case UNLT:
18070 case LTGT:
18071 case GT:
18072 case GE:
18073 case UNORDERED:
18074 case ORDERED:
18075 case UNEQ:
18076 arith_cost = 4;
18077 break;
18078 case LT:
18079 case NE:
18080 case EQ:
18081 case UNGE:
18082 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18083 break;
18084 case LE:
18085 case UNGT:
18086 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18087 break;
18088 default:
18089 gcc_unreachable ();
18090 }
18091
18092 switch (ix86_fp_comparison_strategy (code))
18093 {
18094 case IX86_FPCMP_COMI:
18095 return arith_cost > 4 ? 3 : 2;
18096 case IX86_FPCMP_SAHF:
18097 return arith_cost > 4 ? 4 : 3;
18098 default:
18099 return arith_cost;
18100 }
18101 }
18102
18103 /* Return the strategy to use for floating-point comparisons.  We assume that
18104    fcomi is always preferable where available, since that is also true when
18105    looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test).  */
18106
18107 enum ix86_fpcmp_strategy
18108 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18109 {
18110 /* Do fcomi/sahf based test when profitable. */
18111
18112 if (TARGET_CMOVE)
18113 return IX86_FPCMP_COMI;
18114
18115 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18116 return IX86_FPCMP_SAHF;
18117
18118 return IX86_FPCMP_ARITH;
18119 }
18120
18121 /* Swap, force into registers, or otherwise massage the two operands
18122 to a fp comparison. The operands are updated in place; the new
18123 comparison code is returned. */
18124
18125 static enum rtx_code
18126 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18127 {
18128 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18129 rtx op0 = *pop0, op1 = *pop1;
18130 enum machine_mode op_mode = GET_MODE (op0);
18131 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18132
18133 /* All of the unordered compare instructions only work on registers.
18134 The same is true of the fcomi compare instructions. The XFmode
18135 compare instructions require registers except when comparing
18136 against zero or when converting operand 1 from fixed point to
18137 floating point. */
18138
18139 if (!is_sse
18140 && (fpcmp_mode == CCFPUmode
18141 || (op_mode == XFmode
18142 && ! (standard_80387_constant_p (op0) == 1
18143 || standard_80387_constant_p (op1) == 1)
18144 && GET_CODE (op1) != FLOAT)
18145 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18146 {
18147 op0 = force_reg (op_mode, op0);
18148 op1 = force_reg (op_mode, op1);
18149 }
18150 else
18151 {
18152 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18153 things around if they appear profitable, otherwise force op0
18154 into a register. */
18155
18156 if (standard_80387_constant_p (op0) == 0
18157 || (MEM_P (op0)
18158 && ! (standard_80387_constant_p (op1) == 0
18159 || MEM_P (op1))))
18160 {
18161 enum rtx_code new_code = ix86_fp_swap_condition (code);
18162 if (new_code != UNKNOWN)
18163 {
18164 rtx tmp;
18165 tmp = op0, op0 = op1, op1 = tmp;
18166 code = new_code;
18167 }
18168 }
18169
18170 if (!REG_P (op0))
18171 op0 = force_reg (op_mode, op0);
18172
18173 if (CONSTANT_P (op1))
18174 {
18175 int tmp = standard_80387_constant_p (op1);
18176 if (tmp == 0)
18177 op1 = validize_mem (force_const_mem (op_mode, op1));
18178 else if (tmp == 1)
18179 {
18180 if (TARGET_CMOVE)
18181 op1 = force_reg (op_mode, op1);
18182 }
18183 else
18184 op1 = force_reg (op_mode, op1);
18185 }
18186 }
18187
18188 /* Try to rearrange the comparison to make it cheaper. */
18189 if (ix86_fp_comparison_cost (code)
18190 > ix86_fp_comparison_cost (swap_condition (code))
18191 && (REG_P (op1) || can_create_pseudo_p ()))
18192 {
18193 rtx tmp;
18194 tmp = op0, op0 = op1, op1 = tmp;
18195 code = swap_condition (code);
18196 if (!REG_P (op0))
18197 op0 = force_reg (op_mode, op0);
18198 }
18199
18200 *pop0 = op0;
18201 *pop1 = op1;
18202 return code;
18203 }
18204
18205 /* Convert a comparison code we use to represent an FP comparison to the
18206 integer code that will result in a proper branch. Return UNKNOWN if no
18207 such code is available. */
18208
18209 enum rtx_code
18210 ix86_fp_compare_code_to_integer (enum rtx_code code)
18211 {
18212 switch (code)
18213 {
18214 case GT:
18215 return GTU;
18216 case GE:
18217 return GEU;
18218 case ORDERED:
18219 case UNORDERED:
18220 return code;
18221 break;
18222 case UNEQ:
18223 return EQ;
18224 break;
18225 case UNLT:
18226 return LTU;
18227 break;
18228 case UNLE:
18229 return LEU;
18230 break;
18231 case LTGT:
18232 return NE;
18233 break;
18234 default:
18235 return UNKNOWN;
18236 }
18237 }
18238
18239 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18240
18241 static rtx
18242 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18243 {
18244 enum machine_mode fpcmp_mode, intcmp_mode;
18245 rtx tmp, tmp2;
18246
18247 fpcmp_mode = ix86_fp_compare_mode (code);
18248 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18249
18250 /* Do fcomi/sahf based test when profitable. */
18251 switch (ix86_fp_comparison_strategy (code))
18252 {
18253 case IX86_FPCMP_COMI:
18254 intcmp_mode = fpcmp_mode;
18255 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18256 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18257 tmp);
18258 emit_insn (tmp);
18259 break;
18260
18261 case IX86_FPCMP_SAHF:
18262 intcmp_mode = fpcmp_mode;
18263 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18264 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18265 tmp);
18266
18267 if (!scratch)
18268 scratch = gen_reg_rtx (HImode);
18269 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18270 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18271 break;
18272
18273 case IX86_FPCMP_ARITH:
18274 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18275 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18276 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18277 if (!scratch)
18278 scratch = gen_reg_rtx (HImode);
18279 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18280
18281 /* In the unordered case, we have to check C2 for NaN's, which
18282 doesn't happen to work out to anything nice combination-wise.
18283 So do some bit twiddling on the value we've got in AH to come
18284 up with an appropriate set of condition codes. */
18285
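/* Note: in AH (the high byte of the FPU status word stored by fnstsw),
   C0 is bit 0 (0x01), C2 is bit 2 (0x04) and C3 is bit 6 (0x40); a mask
   such as 0x45 therefore tests C3|C2|C0.  */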
18286 intcmp_mode = CCNOmode;
18287 switch (code)
18288 {
18289 case GT:
18290 case UNGT:
18291 if (code == GT || !TARGET_IEEE_FP)
18292 {
18293 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18294 code = EQ;
18295 }
18296 else
18297 {
18298 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18299 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18300 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18301 intcmp_mode = CCmode;
18302 code = GEU;
18303 }
18304 break;
18305 case LT:
18306 case UNLT:
18307 if (code == LT && TARGET_IEEE_FP)
18308 {
18309 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18310 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18311 intcmp_mode = CCmode;
18312 code = EQ;
18313 }
18314 else
18315 {
18316 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18317 code = NE;
18318 }
18319 break;
18320 case GE:
18321 case UNGE:
18322 if (code == GE || !TARGET_IEEE_FP)
18323 {
18324 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18325 code = EQ;
18326 }
18327 else
18328 {
18329 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18330 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18331 code = NE;
18332 }
18333 break;
18334 case LE:
18335 case UNLE:
18336 if (code == LE && TARGET_IEEE_FP)
18337 {
18338 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18339 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18340 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18341 intcmp_mode = CCmode;
18342 code = LTU;
18343 }
18344 else
18345 {
18346 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18347 code = NE;
18348 }
18349 break;
18350 case EQ:
18351 case UNEQ:
18352 if (code == EQ && TARGET_IEEE_FP)
18353 {
18354 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18355 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18356 intcmp_mode = CCmode;
18357 code = EQ;
18358 }
18359 else
18360 {
18361 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18362 code = NE;
18363 }
18364 break;
18365 case NE:
18366 case LTGT:
18367 if (code == NE && TARGET_IEEE_FP)
18368 {
18369 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18370 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18371 GEN_INT (0x40)));
18372 code = NE;
18373 }
18374 else
18375 {
18376 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18377 code = EQ;
18378 }
18379 break;
18380
18381 case UNORDERED:
18382 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18383 code = NE;
18384 break;
18385 case ORDERED:
18386 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18387 code = EQ;
18388 break;
18389
18390 default:
18391 gcc_unreachable ();
18392 }
18393 break;
18394
18395 default:
18396 gcc_unreachable ();
18397 }
18398
18399 /* Return the test that should be put into the flags user, i.e.
18400 the bcc, scc, or cmov instruction. */
18401 return gen_rtx_fmt_ee (code, VOIDmode,
18402 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18403 const0_rtx);
18404 }
18405
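/* Expand the comparison OP0 CODE OP1.  If OP0 already is a flags register
   (MODE_CC), simply wrap it in the comparison; otherwise emit a floating
   point or integer compare first.  The result is an rtx of the form
   (CODE flags 0) suitable for use in a conditional branch, setcc or cmov.  */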
18406 static rtx
18407 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18408 {
18409 rtx ret;
18410
18411 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18412 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18413
18414 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18415 {
18416 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18417 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18418 }
18419 else
18420 ret = ix86_expand_int_compare (code, op0, op1);
18421
18422 return ret;
18423 }
18424
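/* Emit a compare and conditional jump to LABEL, taken when OP0 CODE OP1
   holds.  Double-word comparisons (DImode on 32-bit targets, TImode on
   64-bit targets) are split into word-sized compare-and-branch sequences
   below.  */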
18425 void
18426 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18427 {
18428 enum machine_mode mode = GET_MODE (op0);
18429 rtx tmp;
18430
18431 switch (mode)
18432 {
18433 case SFmode:
18434 case DFmode:
18435 case XFmode:
18436 case QImode:
18437 case HImode:
18438 case SImode:
18439 simple:
18440 tmp = ix86_expand_compare (code, op0, op1);
18441 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18442 gen_rtx_LABEL_REF (VOIDmode, label),
18443 pc_rtx);
18444 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18445 return;
18446
18447 case DImode:
18448 if (TARGET_64BIT)
18449 goto simple;
18450 case TImode:
18451 /* Expand a double-word (DImode/TImode) branch into multiple compare+branch. */
18452 {
18453 rtx lo[2], hi[2], label2;
18454 enum rtx_code code1, code2, code3;
18455 enum machine_mode submode;
18456
18457 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18458 {
18459 tmp = op0, op0 = op1, op1 = tmp;
18460 code = swap_condition (code);
18461 }
18462
18463 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18464 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18465
18466 submode = mode == DImode ? SImode : DImode;
18467
18468 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18469 avoid two branches. This costs one extra insn, so disable when
18470 optimizing for size. */
18471
18472 if ((code == EQ || code == NE)
18473 && (!optimize_insn_for_size_p ()
18474 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18475 {
18476 rtx xor0, xor1;
18477
18478 xor1 = hi[0];
18479 if (hi[1] != const0_rtx)
18480 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18481 NULL_RTX, 0, OPTAB_WIDEN);
18482
18483 xor0 = lo[0];
18484 if (lo[1] != const0_rtx)
18485 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18486 NULL_RTX, 0, OPTAB_WIDEN);
18487
18488 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18489 NULL_RTX, 0, OPTAB_WIDEN);
18490
18491 ix86_expand_branch (code, tmp, const0_rtx, label);
18492 return;
18493 }
18494
18495 /* Otherwise, if we are doing a less-than or greater-or-equal-than
18496 comparison, op1 is a constant and its low word is zero, then we can
18497 just examine the high word. Similarly for a low word of -1 and
18498 less-or-equal-than or greater-than. */
18499
18500 if (CONST_INT_P (hi[1]))
18501 switch (code)
18502 {
18503 case LT: case LTU: case GE: case GEU:
18504 if (lo[1] == const0_rtx)
18505 {
18506 ix86_expand_branch (code, hi[0], hi[1], label);
18507 return;
18508 }
18509 break;
18510 case LE: case LEU: case GT: case GTU:
18511 if (lo[1] == constm1_rtx)
18512 {
18513 ix86_expand_branch (code, hi[0], hi[1], label);
18514 return;
18515 }
18516 break;
18517 default:
18518 break;
18519 }
18520
18521 /* Otherwise, we need two or three jumps. */
18522
18523 label2 = gen_label_rtx ();
18524
18525 code1 = code;
18526 code2 = swap_condition (code);
18527 code3 = unsigned_condition (code);
18528
18529 switch (code)
18530 {
18531 case LT: case GT: case LTU: case GTU:
18532 break;
18533
18534 case LE: code1 = LT; code2 = GT; break;
18535 case GE: code1 = GT; code2 = LT; break;
18536 case LEU: code1 = LTU; code2 = GTU; break;
18537 case GEU: code1 = GTU; code2 = LTU; break;
18538
18539 case EQ: code1 = UNKNOWN; code2 = NE; break;
18540 case NE: code2 = UNKNOWN; break;
18541
18542 default:
18543 gcc_unreachable ();
18544 }
18545
18546 /*
18547 * a < b =>
18548 * if (hi(a) < hi(b)) goto true;
18549 * if (hi(a) > hi(b)) goto false;
18550 * if (lo(a) < lo(b)) goto true;
18551 * false:
18552 */
18553
18554 if (code1 != UNKNOWN)
18555 ix86_expand_branch (code1, hi[0], hi[1], label);
18556 if (code2 != UNKNOWN)
18557 ix86_expand_branch (code2, hi[0], hi[1], label2);
18558
18559 ix86_expand_branch (code3, lo[0], lo[1], label);
18560
18561 if (code2 != UNKNOWN)
18562 emit_label (label2);
18563 return;
18564 }
18565
18566 default:
18567 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18568 goto simple;
18569 }
18570 }
18571
18572 /* Split branch based on floating point condition. */
18573 void
18574 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18575 rtx target1, rtx target2, rtx tmp, rtx pushed)
18576 {
18577 rtx condition;
18578 rtx i;
18579
18580 if (target2 != pc_rtx)
18581 {
18582 rtx tmp = target2;
18583 code = reverse_condition_maybe_unordered (code);
18584 target2 = target1;
18585 target1 = tmp;
18586 }
18587
18588 condition = ix86_expand_fp_compare (code, op1, op2,
18589 tmp);
18590
18591 /* Remove pushed operand from stack. */
18592 if (pushed)
18593 ix86_free_from_memory (GET_MODE (pushed));
18594
18595 i = emit_jump_insn (gen_rtx_SET
18596 (VOIDmode, pc_rtx,
18597 gen_rtx_IF_THEN_ELSE (VOIDmode,
18598 condition, target1, target2)));
18599 if (split_branch_probability >= 0)
18600 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18601 }
18602
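/* Expand a setcc pattern: store into the QImode register DEST the value
   of the comparison OP0 CODE OP1 (1 if it holds, 0 otherwise).  */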
18603 void
18604 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18605 {
18606 rtx ret;
18607
18608 gcc_assert (GET_MODE (dest) == QImode);
18609
18610 ret = ix86_expand_compare (code, op0, op1);
18611 PUT_MODE (ret, QImode);
18612 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18613 }
18614
18615 /* Expand a comparison setting or clearing the carry flag. Return true when
18616 successful and set *POP to the comparison rtx. */
18617 static bool
18618 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18619 {
18620 enum machine_mode mode =
18621 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18622
18623 /* Do not handle double-mode compares that go through special path. */
18624 if (mode == (TARGET_64BIT ? TImode : DImode))
18625 return false;
18626
18627 if (SCALAR_FLOAT_MODE_P (mode))
18628 {
18629 rtx compare_op, compare_seq;
18630
18631 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18632
18633 /* Shortcut: the following common codes never translate
18634 into carry-flag compares. */
18635 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18636 || code == ORDERED || code == UNORDERED)
18637 return false;
18638
18639 /* These comparisons require the zero flag; swap the operands so they no longer do. */
18640 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18641 && !TARGET_IEEE_FP)
18642 {
18643 rtx tmp = op0;
18644 op0 = op1;
18645 op1 = tmp;
18646 code = swap_condition (code);
18647 }
18648
18649 /* Try to expand the comparison and verify that we end up with
18650 a carry-flag-based comparison. This fails to be true only when
18651 we decide to expand the comparison using arithmetic, which is
18652 not a common scenario. */
18653 start_sequence ();
18654 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18655 compare_seq = get_insns ();
18656 end_sequence ();
18657
18658 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18659 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18660 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18661 else
18662 code = GET_CODE (compare_op);
18663
18664 if (code != LTU && code != GEU)
18665 return false;
18666
18667 emit_insn (compare_seq);
18668 *pop = compare_op;
18669 return true;
18670 }
18671
18672 if (!INTEGRAL_MODE_P (mode))
18673 return false;
18674
18675 switch (code)
18676 {
18677 case LTU:
18678 case GEU:
18679 break;
18680
18681 /* Convert a==0 into (unsigned)a<1. */
18682 case EQ:
18683 case NE:
18684 if (op1 != const0_rtx)
18685 return false;
18686 op1 = const1_rtx;
18687 code = (code == EQ ? LTU : GEU);
18688 break;
18689
18690 /* Convert a>b into b<a or a>=b+1. */
18691 case GTU:
18692 case LEU:
18693 if (CONST_INT_P (op1))
18694 {
18695 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18696 /* Bail out on overflow. We could still swap the operands, but that
18697 would force loading of the constant into a register. */
18698 if (op1 == const0_rtx
18699 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18700 return false;
18701 code = (code == GTU ? GEU : LTU);
18702 }
18703 else
18704 {
18705 rtx tmp = op1;
18706 op1 = op0;
18707 op0 = tmp;
18708 code = (code == GTU ? LTU : GEU);
18709 }
18710 break;
18711
18712 /* Convert a>=0 into (unsigned)a<0x80000000. */
18713 case LT:
18714 case GE:
18715 if (mode == DImode || op1 != const0_rtx)
18716 return false;
18717 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18718 code = (code == LT ? GEU : LTU);
18719 break;
18720 case LE:
18721 case GT:
18722 if (mode == DImode || op1 != constm1_rtx)
18723 return false;
18724 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18725 code = (code == LE ? GEU : LTU);
18726 break;
18727
18728 default:
18729 return false;
18730 }
18731 /* Swapping operands may cause a constant to appear as the first operand. */
18732 if (!nonimmediate_operand (op0, VOIDmode))
18733 {
18734 if (!can_create_pseudo_p ())
18735 return false;
18736 op0 = force_reg (mode, op0);
18737 }
18738 *pop = ix86_expand_compare (code, op0, op1);
18739 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18740 return true;
18741 }
18742
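/* Expand an integer conditional move, operands[0] = operands[1]
   ? operands[2] : operands[3], where operands[1] is a comparison.
   Branch-free sequences (sbb, setcc + lea, and masking) are tried
   before falling back to a plain cmov.  Return true on success.  */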
18743 bool
18744 ix86_expand_int_movcc (rtx operands[])
18745 {
18746 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18747 rtx compare_seq, compare_op;
18748 enum machine_mode mode = GET_MODE (operands[0]);
18749 bool sign_bit_compare_p = false;
18750 rtx op0 = XEXP (operands[1], 0);
18751 rtx op1 = XEXP (operands[1], 1);
18752
18753 start_sequence ();
18754 compare_op = ix86_expand_compare (code, op0, op1);
18755 compare_seq = get_insns ();
18756 end_sequence ();
18757
18758 compare_code = GET_CODE (compare_op);
18759
18760 if ((op1 == const0_rtx && (code == GE || code == LT))
18761 || (op1 == constm1_rtx && (code == GT || code == LE)))
18762 sign_bit_compare_p = true;
18763
18764 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18765 HImode insns, we'd be swallowed in word prefix ops. */
18766
18767 if ((mode != HImode || TARGET_FAST_PREFIX)
18768 && (mode != (TARGET_64BIT ? TImode : DImode))
18769 && CONST_INT_P (operands[2])
18770 && CONST_INT_P (operands[3]))
18771 {
18772 rtx out = operands[0];
18773 HOST_WIDE_INT ct = INTVAL (operands[2]);
18774 HOST_WIDE_INT cf = INTVAL (operands[3]);
18775 HOST_WIDE_INT diff;
18776
18777 diff = ct - cf;
18778 /* Sign bit compares are better done using shifts than by using
18779 sbb. */
18780 if (sign_bit_compare_p
18781 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18782 {
18783 /* Detect overlap between destination and compare sources. */
18784 rtx tmp = out;
18785
18786 if (!sign_bit_compare_p)
18787 {
18788 rtx flags;
18789 bool fpcmp = false;
18790
18791 compare_code = GET_CODE (compare_op);
18792
18793 flags = XEXP (compare_op, 0);
18794
18795 if (GET_MODE (flags) == CCFPmode
18796 || GET_MODE (flags) == CCFPUmode)
18797 {
18798 fpcmp = true;
18799 compare_code
18800 = ix86_fp_compare_code_to_integer (compare_code);
18801 }
18802
18803 /* To simplify the rest of the code, restrict to the GEU case. */
18804 if (compare_code == LTU)
18805 {
18806 HOST_WIDE_INT tmp = ct;
18807 ct = cf;
18808 cf = tmp;
18809 compare_code = reverse_condition (compare_code);
18810 code = reverse_condition (code);
18811 }
18812 else
18813 {
18814 if (fpcmp)
18815 PUT_CODE (compare_op,
18816 reverse_condition_maybe_unordered
18817 (GET_CODE (compare_op)));
18818 else
18819 PUT_CODE (compare_op,
18820 reverse_condition (GET_CODE (compare_op)));
18821 }
18822 diff = ct - cf;
18823
18824 if (reg_overlap_mentioned_p (out, op0)
18825 || reg_overlap_mentioned_p (out, op1))
18826 tmp = gen_reg_rtx (mode);
18827
18828 if (mode == DImode)
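/* Emit sbb reg,reg, which leaves 0 or -1 in TMP depending on the
   carry flag produced by the comparison above.  */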
18829 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18830 else
18831 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18832 flags, compare_op));
18833 }
18834 else
18835 {
18836 if (code == GT || code == GE)
18837 code = reverse_condition (code);
18838 else
18839 {
18840 HOST_WIDE_INT tmp = ct;
18841 ct = cf;
18842 cf = tmp;
18843 diff = ct - cf;
18844 }
18845 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18846 }
18847
18848 if (diff == 1)
18849 {
18850 /*
18851 * cmpl op0,op1
18852 * sbbl dest,dest
18853 * [addl dest, ct]
18854 *
18855 * Size 5 - 8.
18856 */
18857 if (ct)
18858 tmp = expand_simple_binop (mode, PLUS,
18859 tmp, GEN_INT (ct),
18860 copy_rtx (tmp), 1, OPTAB_DIRECT);
18861 }
18862 else if (cf == -1)
18863 {
18864 /*
18865 * cmpl op0,op1
18866 * sbbl dest,dest
18867 * orl $ct, dest
18868 *
18869 * Size 8.
18870 */
18871 tmp = expand_simple_binop (mode, IOR,
18872 tmp, GEN_INT (ct),
18873 copy_rtx (tmp), 1, OPTAB_DIRECT);
18874 }
18875 else if (diff == -1 && ct)
18876 {
18877 /*
18878 * cmpl op0,op1
18879 * sbbl dest,dest
18880 * notl dest
18881 * [addl dest, cf]
18882 *
18883 * Size 8 - 11.
18884 */
18885 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18886 if (cf)
18887 tmp = expand_simple_binop (mode, PLUS,
18888 copy_rtx (tmp), GEN_INT (cf),
18889 copy_rtx (tmp), 1, OPTAB_DIRECT);
18890 }
18891 else
18892 {
18893 /*
18894 * cmpl op0,op1
18895 * sbbl dest,dest
18896 * [notl dest]
18897 * andl cf - ct, dest
18898 * [addl dest, ct]
18899 *
18900 * Size 8 - 11.
18901 */
18902
18903 if (cf == 0)
18904 {
18905 cf = ct;
18906 ct = 0;
18907 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18908 }
18909
18910 tmp = expand_simple_binop (mode, AND,
18911 copy_rtx (tmp),
18912 gen_int_mode (cf - ct, mode),
18913 copy_rtx (tmp), 1, OPTAB_DIRECT);
18914 if (ct)
18915 tmp = expand_simple_binop (mode, PLUS,
18916 copy_rtx (tmp), GEN_INT (ct),
18917 copy_rtx (tmp), 1, OPTAB_DIRECT);
18918 }
18919
18920 if (!rtx_equal_p (tmp, out))
18921 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18922
18923 return true;
18924 }
18925
18926 if (diff < 0)
18927 {
18928 enum machine_mode cmp_mode = GET_MODE (op0);
18929
18930 HOST_WIDE_INT tmp;
18931 tmp = ct, ct = cf, cf = tmp;
18932 diff = -diff;
18933
18934 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18935 {
18936 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18937
18938 /* We may be reversing unordered compare to normal compare, that
18939 is not valid in general (we may convert non-trapping condition
18940 to trapping one), however on i386 we currently emit all
18941 comparisons unordered. */
18942 compare_code = reverse_condition_maybe_unordered (compare_code);
18943 code = reverse_condition_maybe_unordered (code);
18944 }
18945 else
18946 {
18947 compare_code = reverse_condition (compare_code);
18948 code = reverse_condition (code);
18949 }
18950 }
18951
18952 compare_code = UNKNOWN;
18953 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18954 && CONST_INT_P (op1))
18955 {
18956 if (op1 == const0_rtx
18957 && (code == LT || code == GE))
18958 compare_code = code;
18959 else if (op1 == constm1_rtx)
18960 {
18961 if (code == LE)
18962 compare_code = LT;
18963 else if (code == GT)
18964 compare_code = GE;
18965 }
18966 }
18967
18968 /* Optimize dest = (op0 < 0) ? -1 : cf. */
18969 if (compare_code != UNKNOWN
18970 && GET_MODE (op0) == GET_MODE (out)
18971 && (cf == -1 || ct == -1))
18972 {
18973 /* If the lea code below could be used, only optimize
18974 if it results in a 2-insn sequence. */
18975
18976 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
18977 || diff == 3 || diff == 5 || diff == 9)
18978 || (compare_code == LT && ct == -1)
18979 || (compare_code == GE && cf == -1))
18980 {
18981 /*
18982 * notl op1 (if necessary)
18983 * sarl $31, op1
18984 * orl cf, op1
18985 */
18986 if (ct != -1)
18987 {
18988 cf = ct;
18989 ct = -1;
18990 code = reverse_condition (code);
18991 }
18992
18993 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18994
18995 out = expand_simple_binop (mode, IOR,
18996 out, GEN_INT (cf),
18997 out, 1, OPTAB_DIRECT);
18998 if (out != operands[0])
18999 emit_move_insn (operands[0], out);
19000
19001 return true;
19002 }
19003 }
19004
19005
19006 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19007 || diff == 3 || diff == 5 || diff == 9)
19008 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19009 && (mode != DImode
19010 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19011 {
19012 /*
19013 * xorl dest,dest
19014 * cmpl op1,op2
19015 * setcc dest
19016 * lea cf(dest*(ct-cf)),dest
19017 *
19018 * Size 14.
19019 *
19020 * This also catches the degenerate setcc-only case.
19021 */
19022
19023 rtx tmp;
19024 int nops;
19025
19026 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19027
19028 nops = 0;
19029 /* On x86_64 the lea instruction operates on Pmode, so we need
19030 to get the arithmetic done in the proper mode to match. */
19031 if (diff == 1)
19032 tmp = copy_rtx (out);
19033 else
19034 {
19035 rtx out1;
19036 out1 = copy_rtx (out);
19037 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19038 nops++;
19039 if (diff & 1)
19040 {
19041 tmp = gen_rtx_PLUS (mode, tmp, out1);
19042 nops++;
19043 }
19044 }
19045 if (cf != 0)
19046 {
19047 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19048 nops++;
19049 }
19050 if (!rtx_equal_p (tmp, out))
19051 {
19052 if (nops == 1)
19053 out = force_operand (tmp, copy_rtx (out));
19054 else
19055 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19056 }
19057 if (!rtx_equal_p (out, operands[0]))
19058 emit_move_insn (operands[0], copy_rtx (out));
19059
19060 return true;
19061 }
19062
19063 /*
19064 * General case: Jumpful:
19065 * xorl dest,dest cmpl op1, op2
19066 * cmpl op1, op2 movl ct, dest
19067 * setcc dest jcc 1f
19068 * decl dest movl cf, dest
19069 * andl (cf-ct),dest 1:
19070 * addl ct,dest
19071 *
19072 * Size 20. Size 14.
19073 *
19074 * This is reasonably steep, but branch mispredict costs are
19075 * high on modern cpus, so consider failing only if optimizing
19076 * for space.
19077 */
19078
19079 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19080 && BRANCH_COST (optimize_insn_for_speed_p (),
19081 false) >= 2)
19082 {
19083 if (cf == 0)
19084 {
19085 enum machine_mode cmp_mode = GET_MODE (op0);
19086
19087 cf = ct;
19088 ct = 0;
19089
19090 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19091 {
19092 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19093
19094 /* We may be reversing unordered compare to normal compare,
19095 that is not valid in general (we may convert non-trapping
19096 condition to trapping one), however on i386 we currently
19097 emit all comparisons unordered. */
19098 code = reverse_condition_maybe_unordered (code);
19099 }
19100 else
19101 {
19102 code = reverse_condition (code);
19103 if (compare_code != UNKNOWN)
19104 compare_code = reverse_condition (compare_code);
19105 }
19106 }
19107
19108 if (compare_code != UNKNOWN)
19109 {
19110 /* notl op1 (if needed)
19111 sarl $31, op1
19112 andl (cf-ct), op1
19113 addl ct, op1
19114
19115 For x < 0 (resp. x <= -1) there will be no notl,
19116 so if possible swap the constants to get rid of the
19117 complement.
19118 True/false will be -1/0 while code below (store flag
19119 followed by decrement) is 0/-1, so the constants need
19120 to be exchanged once more. */
19121
19122 if (compare_code == GE || !cf)
19123 {
19124 code = reverse_condition (code);
19125 compare_code = LT;
19126 }
19127 else
19128 {
19129 HOST_WIDE_INT tmp = cf;
19130 cf = ct;
19131 ct = tmp;
19132 }
19133
19134 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19135 }
19136 else
19137 {
19138 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19139
19140 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19141 constm1_rtx,
19142 copy_rtx (out), 1, OPTAB_DIRECT);
19143 }
19144
19145 out = expand_simple_binop (mode, AND, copy_rtx (out),
19146 gen_int_mode (cf - ct, mode),
19147 copy_rtx (out), 1, OPTAB_DIRECT);
19148 if (ct)
19149 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19150 copy_rtx (out), 1, OPTAB_DIRECT);
19151 if (!rtx_equal_p (out, operands[0]))
19152 emit_move_insn (operands[0], copy_rtx (out));
19153
19154 return true;
19155 }
19156 }
19157
19158 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19159 {
19160 /* Try a few more things with specific constants and a variable. */
19161
19162 optab op;
19163 rtx var, orig_out, out, tmp;
19164
19165 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19166 return false;
19167
19168 /* If one of the two operands is an interesting constant, load a 0/-1
19169 mask via the recursion below and mask the variable in with a logical operation. */
19170
19171 if (CONST_INT_P (operands[2]))
19172 {
19173 var = operands[3];
19174 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19175 operands[3] = constm1_rtx, op = and_optab;
19176 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19177 operands[3] = const0_rtx, op = ior_optab;
19178 else
19179 return false;
19180 }
19181 else if (CONST_INT_P (operands[3]))
19182 {
19183 var = operands[2];
19184 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19185 operands[2] = constm1_rtx, op = and_optab;
19186 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19187 operands[2] = const0_rtx, op = ior_optab;
19188 else
19189 return false;
19190 }
19191 else
19192 return false;
19193
19194 orig_out = operands[0];
19195 tmp = gen_reg_rtx (mode);
19196 operands[0] = tmp;
19197
19198 /* Recurse to get the constant loaded. */
19199 if (ix86_expand_int_movcc (operands) == 0)
19200 return false;
19201
19202 /* Mask in the interesting variable. */
19203 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19204 OPTAB_WIDEN);
19205 if (!rtx_equal_p (out, orig_out))
19206 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19207
19208 return true;
19209 }
19210
19211 /*
19212 * For comparison with above,
19213 *
19214 * movl cf,dest
19215 * movl ct,tmp
19216 * cmpl op1,op2
19217 * cmovcc tmp,dest
19218 *
19219 * Size 15.
19220 */
19221
19222 if (! nonimmediate_operand (operands[2], mode))
19223 operands[2] = force_reg (mode, operands[2]);
19224 if (! nonimmediate_operand (operands[3], mode))
19225 operands[3] = force_reg (mode, operands[3]);
19226
19227 if (! register_operand (operands[2], VOIDmode)
19228 && (mode == QImode
19229 || ! register_operand (operands[3], VOIDmode)))
19230 operands[2] = force_reg (mode, operands[2]);
19231
19232 if (mode == QImode
19233 && ! register_operand (operands[3], VOIDmode))
19234 operands[3] = force_reg (mode, operands[3]);
19235
19236 emit_insn (compare_seq);
19237 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19238 gen_rtx_IF_THEN_ELSE (mode,
19239 compare_op, operands[2],
19240 operands[3])));
19241 return true;
19242 }
19243
19244 /* Swap, force into registers, or otherwise massage the two operands
19245 to an sse comparison with a mask result. Thus we differ a bit from
19246 ix86_prepare_fp_compare_args which expects to produce a flags result.
19247
19248 The DEST operand exists to help determine whether to commute commutative
19249 operators. The POP0/POP1 operands are updated in place. The new
19250 comparison code is returned, or UNKNOWN if not implementable. */
19251
19252 static enum rtx_code
19253 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19254 rtx *pop0, rtx *pop1)
19255 {
19256 rtx tmp;
19257
19258 switch (code)
19259 {
19260 case LTGT:
19261 case UNEQ:
19262 /* AVX supports all the needed comparisons. */
19263 if (TARGET_AVX)
19264 break;
19265 /* We have no LTGT as an operator. We could implement it with
19266 NE & ORDERED, but this requires an extra temporary. It's
19267 not clear that it's worth it. */
19268 return UNKNOWN;
19269
19270 case LT:
19271 case LE:
19272 case UNGT:
19273 case UNGE:
19274 /* These are supported directly. */
19275 break;
19276
19277 case EQ:
19278 case NE:
19279 case UNORDERED:
19280 case ORDERED:
19281 /* AVX has 3 operand comparisons, no need to swap anything. */
19282 if (TARGET_AVX)
19283 break;
19284 /* For commutative operators, try to canonicalize the destination
19285 operand to be first in the comparison - this helps reload to
19286 avoid extra moves. */
19287 if (!dest || !rtx_equal_p (dest, *pop1))
19288 break;
19289 /* FALLTHRU */
19290
19291 case GE:
19292 case GT:
19293 case UNLE:
19294 case UNLT:
19295 /* These are not supported directly before AVX, and furthermore
19296 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19297 comparison operands to transform into something that is
19298 supported. */
19299 tmp = *pop0;
19300 *pop0 = *pop1;
19301 *pop1 = tmp;
19302 code = swap_condition (code);
19303 break;
19304
19305 default:
19306 gcc_unreachable ();
19307 }
19308
19309 return code;
19310 }
19311
19312 /* Detect conditional moves that exactly match min/max operational
19313 semantics. Note that this is IEEE safe, as long as we don't
19314 interchange the operands.
19315
19316 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19317 and TRUE if the operation is successful and instructions are emitted. */
19318
19319 static bool
19320 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19321 rtx cmp_op1, rtx if_true, rtx if_false)
19322 {
19323 enum machine_mode mode;
19324 bool is_min;
19325 rtx tmp;
19326
19327 if (code == LT)
19328 ;
19329 else if (code == UNGE)
19330 {
19331 tmp = if_true;
19332 if_true = if_false;
19333 if_false = tmp;
19334 }
19335 else
19336 return false;
19337
19338 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19339 is_min = true;
19340 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19341 is_min = false;
19342 else
19343 return false;
19344
19345 mode = GET_MODE (dest);
19346
19347 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19348 but MODE may be a vector mode and thus not appropriate. */
19349 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19350 {
19351 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19352 rtvec v;
19353
19354 if_true = force_reg (mode, if_true);
19355 v = gen_rtvec (2, if_true, if_false);
19356 tmp = gen_rtx_UNSPEC (mode, v, u);
19357 }
19358 else
19359 {
19360 code = is_min ? SMIN : SMAX;
19361 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19362 }
19363
19364 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19365 return true;
19366 }
19367
19368 /* Expand an SSE vector comparison. Return the register with the result. */
19369
19370 static rtx
19371 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19372 rtx op_true, rtx op_false)
19373 {
19374 enum machine_mode mode = GET_MODE (dest);
19375 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19376 rtx x;
19377
19378 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19379 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19380 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19381
19382 if (optimize
19383 || reg_overlap_mentioned_p (dest, op_true)
19384 || reg_overlap_mentioned_p (dest, op_false))
19385 dest = gen_reg_rtx (mode);
19386
19387 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19388 if (cmp_mode != mode)
19389 {
19390 x = force_reg (cmp_mode, x);
19391 convert_move (dest, x, false);
19392 }
19393 else
19394 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19395
19396 return dest;
19397 }
19398
19399 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19400 operations. This is used for both scalar and vector conditional moves. */
19401
19402 static void
19403 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19404 {
19405 enum machine_mode mode = GET_MODE (dest);
19406 rtx t2, t3, x;
19407
19408 if (vector_all_ones_operand (op_true, mode)
19409 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19410 {
19411 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19412 }
19413 else if (op_false == CONST0_RTX (mode))
19414 {
19415 op_true = force_reg (mode, op_true);
19416 x = gen_rtx_AND (mode, cmp, op_true);
19417 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19418 }
19419 else if (op_true == CONST0_RTX (mode))
19420 {
19421 op_false = force_reg (mode, op_false);
19422 x = gen_rtx_NOT (mode, cmp);
19423 x = gen_rtx_AND (mode, x, op_false);
19424 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19425 }
19426 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19427 {
19428 op_false = force_reg (mode, op_false);
19429 x = gen_rtx_IOR (mode, cmp, op_false);
19430 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19431 }
19432 else if (TARGET_XOP)
19433 {
19434 op_true = force_reg (mode, op_true);
19435
19436 if (!nonimmediate_operand (op_false, mode))
19437 op_false = force_reg (mode, op_false);
19438
19439 emit_insn (gen_rtx_SET (mode, dest,
19440 gen_rtx_IF_THEN_ELSE (mode, cmp,
19441 op_true,
19442 op_false)));
19443 }
19444 else
19445 {
19446 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19447
19448 if (!nonimmediate_operand (op_true, mode))
19449 op_true = force_reg (mode, op_true);
19450
19451 op_false = force_reg (mode, op_false);
19452
19453 switch (mode)
19454 {
19455 case V4SFmode:
19456 if (TARGET_SSE4_1)
19457 gen = gen_sse4_1_blendvps;
19458 break;
19459 case V2DFmode:
19460 if (TARGET_SSE4_1)
19461 gen = gen_sse4_1_blendvpd;
19462 break;
19463 case V16QImode:
19464 case V8HImode:
19465 case V4SImode:
19466 case V2DImode:
19467 if (TARGET_SSE4_1)
19468 {
19469 gen = gen_sse4_1_pblendvb;
19470 dest = gen_lowpart (V16QImode, dest);
19471 op_false = gen_lowpart (V16QImode, op_false);
19472 op_true = gen_lowpart (V16QImode, op_true);
19473 cmp = gen_lowpart (V16QImode, cmp);
19474 }
19475 break;
19476 case V8SFmode:
19477 if (TARGET_AVX)
19478 gen = gen_avx_blendvps256;
19479 break;
19480 case V4DFmode:
19481 if (TARGET_AVX)
19482 gen = gen_avx_blendvpd256;
19483 break;
19484 case V32QImode:
19485 case V16HImode:
19486 case V8SImode:
19487 case V4DImode:
19488 if (TARGET_AVX2)
19489 {
19490 gen = gen_avx2_pblendvb;
19491 dest = gen_lowpart (V32QImode, dest);
19492 op_false = gen_lowpart (V32QImode, op_false);
19493 op_true = gen_lowpart (V32QImode, op_true);
19494 cmp = gen_lowpart (V32QImode, cmp);
19495 }
19496 break;
19497 default:
19498 break;
19499 }
19500
19501 if (gen != NULL)
19502 emit_insn (gen (dest, op_false, op_true, cmp));
19503 else
19504 {
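/* No blend instruction is available for this mode; emulate the
   conditional move as dest = (op_true & cmp) | (op_false & ~cmp).  */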
19505 op_true = force_reg (mode, op_true);
19506
19507 t2 = gen_reg_rtx (mode);
19508 if (optimize)
19509 t3 = gen_reg_rtx (mode);
19510 else
19511 t3 = dest;
19512
19513 x = gen_rtx_AND (mode, op_true, cmp);
19514 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19515
19516 x = gen_rtx_NOT (mode, cmp);
19517 x = gen_rtx_AND (mode, x, op_false);
19518 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19519
19520 x = gen_rtx_IOR (mode, t3, t2);
19521 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19522 }
19523 }
19524 }
19525
19526 /* Expand a floating-point conditional move. Return true if successful. */
19527
19528 bool
19529 ix86_expand_fp_movcc (rtx operands[])
19530 {
19531 enum machine_mode mode = GET_MODE (operands[0]);
19532 enum rtx_code code = GET_CODE (operands[1]);
19533 rtx tmp, compare_op;
19534 rtx op0 = XEXP (operands[1], 0);
19535 rtx op1 = XEXP (operands[1], 1);
19536
19537 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19538 {
19539 enum machine_mode cmode;
19540
19541 /* Since we have no cmove for SSE registers, don't force bad register
19542 allocation just to gain access to it. Deny movcc when the
19543 comparison mode doesn't match the move mode. */
19544 cmode = GET_MODE (op0);
19545 if (cmode == VOIDmode)
19546 cmode = GET_MODE (op1);
19547 if (cmode != mode)
19548 return false;
19549
19550 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19551 if (code == UNKNOWN)
19552 return false;
19553
19554 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19555 operands[2], operands[3]))
19556 return true;
19557
19558 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19559 operands[2], operands[3]);
19560 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19561 return true;
19562 }
19563
19564 /* The floating point conditional move instructions don't directly
19565 support conditions resulting from a signed integer comparison. */
19566
19567 compare_op = ix86_expand_compare (code, op0, op1);
19568 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19569 {
19570 tmp = gen_reg_rtx (QImode);
19571 ix86_expand_setcc (tmp, code, op0, op1);
19572
19573 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19574 }
19575
19576 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19577 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19578 operands[2], operands[3])));
19579
19580 return true;
19581 }
19582
19583 /* Expand a floating-point vector conditional move; a vcond operation
19584 rather than a movcc operation. */
19585
19586 bool
19587 ix86_expand_fp_vcond (rtx operands[])
19588 {
19589 enum rtx_code code = GET_CODE (operands[3]);
19590 rtx cmp;
19591
19592 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19593 &operands[4], &operands[5]);
19594 if (code == UNKNOWN)
19595 {
19596 rtx temp;
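/* LTGT decomposes as ORDERED & NE, and UNEQ as UNORDERED | EQ.  */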
19597 switch (GET_CODE (operands[3]))
19598 {
19599 case LTGT:
19600 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19601 operands[5], operands[0], operands[0]);
19602 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19603 operands[5], operands[1], operands[2]);
19604 code = AND;
19605 break;
19606 case UNEQ:
19607 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19608 operands[5], operands[0], operands[0]);
19609 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19610 operands[5], operands[1], operands[2]);
19611 code = IOR;
19612 break;
19613 default:
19614 gcc_unreachable ();
19615 }
19616 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19617 OPTAB_DIRECT);
19618 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19619 return true;
19620 }
19621
19622 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19623 operands[5], operands[1], operands[2]))
19624 return true;
19625
19626 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19627 operands[1], operands[2]);
19628 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19629 return true;
19630 }
19631
19632 /* Expand a signed/unsigned integral vector conditional move. */
19633
19634 bool
19635 ix86_expand_int_vcond (rtx operands[])
19636 {
19637 enum machine_mode data_mode = GET_MODE (operands[0]);
19638 enum machine_mode mode = GET_MODE (operands[4]);
19639 enum rtx_code code = GET_CODE (operands[3]);
19640 bool negate = false;
19641 rtx x, cop0, cop1;
19642
19643 cop0 = operands[4];
19644 cop1 = operands[5];
19645
19646 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19647 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
19648 if ((code == LT || code == GE)
19649 && data_mode == mode
19650 && cop1 == CONST0_RTX (mode)
19651 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19652 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19653 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19654 && (GET_MODE_SIZE (data_mode) == 16
19655 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19656 {
19657 rtx negop = operands[2 - (code == LT)];
19658 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19659 if (negop == CONST1_RTX (data_mode))
19660 {
19661 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19662 operands[0], 1, OPTAB_DIRECT);
19663 if (res != operands[0])
19664 emit_move_insn (operands[0], res);
19665 return true;
19666 }
19667 else if (GET_MODE_INNER (data_mode) != DImode
19668 && vector_all_ones_operand (negop, data_mode))
19669 {
19670 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19671 operands[0], 0, OPTAB_DIRECT);
19672 if (res != operands[0])
19673 emit_move_insn (operands[0], res);
19674 return true;
19675 }
19676 }
19677
19678 if (!nonimmediate_operand (cop1, mode))
19679 cop1 = force_reg (mode, cop1);
19680 if (!general_operand (operands[1], data_mode))
19681 operands[1] = force_reg (data_mode, operands[1]);
19682 if (!general_operand (operands[2], data_mode))
19683 operands[2] = force_reg (data_mode, operands[2]);
19684
19685 /* XOP supports all of the comparisons on all 128-bit vector int types. */
19686 if (TARGET_XOP
19687 && (mode == V16QImode || mode == V8HImode
19688 || mode == V4SImode || mode == V2DImode))
19689 ;
19690 else
19691 {
19692 /* Canonicalize the comparison to EQ, GT, GTU. */
19693 switch (code)
19694 {
19695 case EQ:
19696 case GT:
19697 case GTU:
19698 break;
19699
19700 case NE:
19701 case LE:
19702 case LEU:
19703 code = reverse_condition (code);
19704 negate = true;
19705 break;
19706
19707 case GE:
19708 case GEU:
19709 code = reverse_condition (code);
19710 negate = true;
19711 /* FALLTHRU */
19712
19713 case LT:
19714 case LTU:
19715 code = swap_condition (code);
19716 x = cop0, cop0 = cop1, cop1 = x;
19717 break;
19718
19719 default:
19720 gcc_unreachable ();
19721 }
19722
19723 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19724 if (mode == V2DImode)
19725 {
19726 switch (code)
19727 {
19728 case EQ:
19729 /* SSE4.1 supports EQ. */
19730 if (!TARGET_SSE4_1)
19731 return false;
19732 break;
19733
19734 case GT:
19735 case GTU:
19736 /* SSE4.2 supports GT/GTU. */
19737 if (!TARGET_SSE4_2)
19738 return false;
19739 break;
19740
19741 default:
19742 gcc_unreachable ();
19743 }
19744 }
19745
19746 /* Unsigned parallel compare is not supported by the hardware.
19747 Play some tricks to turn this into a signed comparison
19748 against 0. */
19749 if (code == GTU)
19750 {
19751 cop0 = force_reg (mode, cop0);
19752
19753 switch (mode)
19754 {
19755 case V8SImode:
19756 case V4DImode:
19757 case V4SImode:
19758 case V2DImode:
19759 {
19760 rtx t1, t2, mask;
19761 rtx (*gen_sub3) (rtx, rtx, rtx);
19762
19763 switch (mode)
19764 {
19765 case V8SImode: gen_sub3 = gen_subv8si3; break;
19766 case V4DImode: gen_sub3 = gen_subv4di3; break;
19767 case V4SImode: gen_sub3 = gen_subv4si3; break;
19768 case V2DImode: gen_sub3 = gen_subv2di3; break;
19769 default:
19770 gcc_unreachable ();
19771 }
19772 /* Subtract (-(INT MAX) - 1) from both operands to make
19773 them signed. */
19774 mask = ix86_build_signbit_mask (mode, true, false);
19775 t1 = gen_reg_rtx (mode);
19776 emit_insn (gen_sub3 (t1, cop0, mask));
19777
19778 t2 = gen_reg_rtx (mode);
19779 emit_insn (gen_sub3 (t2, cop1, mask));
19780
19781 cop0 = t1;
19782 cop1 = t2;
19783 code = GT;
19784 }
19785 break;
19786
19787 case V32QImode:
19788 case V16HImode:
19789 case V16QImode:
19790 case V8HImode:
19791 /* Perform a parallel unsigned saturating subtraction. */
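/* a >u b iff the saturating difference a - b is nonzero, so the
   result only needs to be tested for (in)equality against zero.  */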
19792 x = gen_reg_rtx (mode);
19793 emit_insn (gen_rtx_SET (VOIDmode, x,
19794 gen_rtx_US_MINUS (mode, cop0, cop1)));
19795
19796 cop0 = x;
19797 cop1 = CONST0_RTX (mode);
19798 code = EQ;
19799 negate = !negate;
19800 break;
19801
19802 default:
19803 gcc_unreachable ();
19804 }
19805 }
19806 }
19807
19808 /* Allow the comparison to be done in one mode, but the movcc to
19809 happen in another mode. */
19810 if (data_mode == mode)
19811 {
19812 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19813 operands[1+negate], operands[2-negate]);
19814 }
19815 else
19816 {
19817 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19818 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19819 code, cop0, cop1,
19820 operands[1+negate], operands[2-negate]);
19821 x = gen_lowpart (data_mode, x);
19822 }
19823
19824 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19825 operands[2-negate]);
19826 return true;
19827 }
19828
19829 /* Expand a variable vector permutation. */
19830
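/* operands[0] is the destination, operands[1] and operands[2] the two
   input vectors and operands[3] the variable selector; each selector
   element indexes into the concatenation of the two input vectors.  */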
19831 void
19832 ix86_expand_vec_perm (rtx operands[])
19833 {
19834 rtx target = operands[0];
19835 rtx op0 = operands[1];
19836 rtx op1 = operands[2];
19837 rtx mask = operands[3];
19838 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19839 enum machine_mode mode = GET_MODE (op0);
19840 enum machine_mode maskmode = GET_MODE (mask);
19841 int w, e, i;
19842 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19843
19844 /* Number of elements in the vector. */
19845 w = GET_MODE_NUNITS (mode);
19846 e = GET_MODE_UNIT_SIZE (mode);
19847 gcc_assert (w <= 32);
19848
19849 if (TARGET_AVX2)
19850 {
19851 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19852 {
19853 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19854 a constant shuffle operand. With a tiny bit of effort we can
19855 use VPERMD instead. A re-interpretation stall for V4DFmode is
19856 unfortunate but there's no avoiding it.
19857 Similarly, for V16HImode we don't have instructions for variable
19858 shuffling, while for V32QImode we can, after preparing suitable
19859 masks, use vpshufb; vpshufb; vpermq; vpor. */
19860
19861 if (mode == V16HImode)
19862 {
19863 maskmode = mode = V32QImode;
19864 w = 32;
19865 e = 1;
19866 }
19867 else
19868 {
19869 maskmode = mode = V8SImode;
19870 w = 8;
19871 e = 4;
19872 }
19873 t1 = gen_reg_rtx (maskmode);
19874
19875 /* Replicate the low bits of the V4DImode mask into V8SImode:
19876 mask = { A B C D }
19877 t1 = { A A B B C C D D }. */
19878 for (i = 0; i < w / 2; ++i)
19879 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19880 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19881 vt = force_reg (maskmode, vt);
19882 mask = gen_lowpart (maskmode, mask);
19883 if (maskmode == V8SImode)
19884 emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
19885 else
19886 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19887
19888 /* Multiply the shuffle indices by two. */
19889 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19890 OPTAB_DIRECT);
19891
19892 /* Add one to the odd shuffle indices:
19893 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19894 for (i = 0; i < w / 2; ++i)
19895 {
19896 vec[i * 2] = const0_rtx;
19897 vec[i * 2 + 1] = const1_rtx;
19898 }
19899 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19900 vt = force_const_mem (maskmode, vt);
19901 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19902 OPTAB_DIRECT);
19903
19904 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19905 operands[3] = mask = t1;
19906 target = gen_lowpart (mode, target);
19907 op0 = gen_lowpart (mode, op0);
19908 op1 = gen_lowpart (mode, op1);
19909 }
19910
19911 switch (mode)
19912 {
19913 case V8SImode:
19914 /* The VPERMD and VPERMPS instructions already properly ignore
19915 the high bits of the shuffle elements. No need for us to
19916 perform an AND ourselves. */
19917 if (one_operand_shuffle)
19918 emit_insn (gen_avx2_permvarv8si (target, mask, op0));
19919 else
19920 {
19921 t1 = gen_reg_rtx (V8SImode);
19922 t2 = gen_reg_rtx (V8SImode);
19923 emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
19924 emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
19925 goto merge_two;
19926 }
19927 return;
19928
19929 case V8SFmode:
19930 mask = gen_lowpart (V8SFmode, mask);
19931 if (one_operand_shuffle)
19932 emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
19933 else
19934 {
19935 t1 = gen_reg_rtx (V8SFmode);
19936 t2 = gen_reg_rtx (V8SFmode);
19937 emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
19938 emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
19939 goto merge_two;
19940 }
19941 return;
19942
19943 case V4SImode:
19944 /* By combining the two 128-bit input vectors into one 256-bit
19945 input vector, we can use VPERMD and VPERMPS for the full
19946 two-operand shuffle. */
19947 t1 = gen_reg_rtx (V8SImode);
19948 t2 = gen_reg_rtx (V8SImode);
19949 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19950 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19951 emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
19952 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19953 return;
19954
19955 case V4SFmode:
19956 t1 = gen_reg_rtx (V8SFmode);
19957 t2 = gen_reg_rtx (V8SFmode);
19958 mask = gen_lowpart (V4SFmode, mask);
19959 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19960 emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
19961 emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
19962 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19963 return;
19964
19965 case V32QImode:
19966 t1 = gen_reg_rtx (V32QImode);
19967 t2 = gen_reg_rtx (V32QImode);
19968 t3 = gen_reg_rtx (V32QImode);
19969 vt2 = GEN_INT (128);
19970 for (i = 0; i < 32; i++)
19971 vec[i] = vt2;
19972 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19973 vt = force_reg (V32QImode, vt);
19974 for (i = 0; i < 32; i++)
19975 vec[i] = i < 16 ? vt2 : const0_rtx;
19976 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19977 vt2 = force_reg (V32QImode, vt2);
19978 /* From mask create two adjusted masks, which contain the same
19979 bits as mask in the low 7 bits of each vector element.
19980 The first mask will have the most significant bit clear
19981 if it requests element from the same 128-bit lane
19982 and MSB set if it requests element from the other 128-bit lane.
19983 The second mask will have the opposite values of the MSB,
19984 and additionally will have its 128-bit lanes swapped.
19985 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
19986 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
19987 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
19988 stands for other 12 bytes. */
19989 /* The bit that says whether an element is from the same lane or the
19990 other lane is bit 4, so shift it up by 3 to the MSB position. */
19991 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
19992 gen_lowpart (V4DImode, mask),
19993 GEN_INT (3)));
19994 /* Clear MSB bits from the mask just in case it had them set. */
19995 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
19996 /* After this t1 will have MSB set for elements from other lane. */
19997 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
19998 /* Clear bits other than MSB. */
19999 emit_insn (gen_andv32qi3 (t1, t1, vt));
20000 /* Or in the lower bits from mask into t3. */
20001 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20002 /* And invert MSB bits in t1, so MSB is set for elements from the same
20003 lane. */
20004 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20005 /* Swap 128-bit lanes in t3. */
20006 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20007 gen_lowpart (V4DImode, t3),
20008 const2_rtx, GEN_INT (3),
20009 const0_rtx, const1_rtx));
20010 /* And or in the lower bits from mask into t1. */
20011 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20012 if (one_operand_shuffle)
20013 {
20014 /* Each of these shuffles will put 0s in places where an
20015 element from the other 128-bit lane is needed; otherwise
20016 it will shuffle in the requested value. */
20017 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20018 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20019 /* For t3 the 128-bit lanes are swapped again. */
20020 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20021 gen_lowpart (V4DImode, t3),
20022 const2_rtx, GEN_INT (3),
20023 const0_rtx, const1_rtx));
20024 /* And ORing both together leads to the result. */
20025 emit_insn (gen_iorv32qi3 (target, t1, t3));
20026 return;
20027 }
20028
20029 t4 = gen_reg_rtx (V32QImode);
20030 /* Similar to the above one_operand_shuffle code,
20031 just repeated twice, once for each operand. The merge_two:
20032 code will merge the two results together. */
20033 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20034 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20035 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20036 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20037 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20038 gen_lowpart (V4DImode, t4),
20039 const2_rtx, GEN_INT (3),
20040 const0_rtx, const1_rtx));
20041 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20042 gen_lowpart (V4DImode, t3),
20043 const2_rtx, GEN_INT (3),
20044 const0_rtx, const1_rtx));
20045 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20046 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20047 t1 = t4;
20048 t2 = t3;
20049 goto merge_two;
20050
20051 default:
20052 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20053 break;
20054 }
20055 }
20056
20057 if (TARGET_XOP)
20058 {
20059 /* The XOP VPPERM insn supports three inputs. By ignoring the
20060 one_operand_shuffle special case, we avoid creating another
20061 set of constant vectors in memory. */
20062 one_operand_shuffle = false;
20063
20064 /* mask = mask & {2*w-1, ...} */
20065 vt = GEN_INT (2*w - 1);
20066 }
20067 else
20068 {
20069 /* mask = mask & {w-1, ...} */
20070 vt = GEN_INT (w - 1);
20071 }
20072
20073 for (i = 0; i < w; i++)
20074 vec[i] = vt;
20075 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20076 mask = expand_simple_binop (maskmode, AND, mask, vt,
20077 NULL_RTX, 0, OPTAB_DIRECT);
20078
20079 /* For non-QImode operations, convert the word permutation control
20080 into a byte permutation control. */
20081 if (mode != V16QImode)
20082 {
20083 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20084 GEN_INT (exact_log2 (e)),
20085 NULL_RTX, 0, OPTAB_DIRECT);
20086
20087 /* Convert mask to vector of chars. */
20088 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20089
20090 /* Replicate each of the input bytes into byte positions:
20091 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20092 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20093 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20094 for (i = 0; i < 16; ++i)
20095 vec[i] = GEN_INT (i/e * e);
20096 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20097 vt = force_const_mem (V16QImode, vt);
20098 if (TARGET_XOP)
20099 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20100 else
20101 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20102
20103 /* Convert it into the byte positions by doing
20104 mask = mask + {0,1,..,e-1, 0,1,..,e-1, ...} */
20105 for (i = 0; i < 16; ++i)
20106 vec[i] = GEN_INT (i % e);
20107 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20108 vt = force_const_mem (V16QImode, vt);
20109 emit_insn (gen_addv16qi3 (mask, mask, vt));
20110 }
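/* A worked example of the conversion above (for illustration only):
   with V4SImode elements (e == 4), a word index of 2 in the mask is
   first shifted to 2 << 2 == 8; the pshufb/pperm replicates that low
   byte across its element giving { 8, 8, 8, 8 }, and adding
   { 0, 1, 2, 3 } produces { 8, 9, 10, 11 } -- exactly the byte
   indexes that select word 2.  */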
20111
20112 /* The actual shuffle operations all operate on V16QImode. */
20113 op0 = gen_lowpart (V16QImode, op0);
20114 op1 = gen_lowpart (V16QImode, op1);
20115 target = gen_lowpart (V16QImode, target);
20116
20117 if (TARGET_XOP)
20118 {
20119 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20120 }
20121 else if (one_operand_shuffle)
20122 {
20123 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20124 }
20125 else
20126 {
20127 rtx xops[6];
20128 bool ok;
20129
20130 /* Shuffle the two input vectors independently. */
20131 t1 = gen_reg_rtx (V16QImode);
20132 t2 = gen_reg_rtx (V16QImode);
20133 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20134 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20135
20136 merge_two:
20137 /* Then merge them together. The key is whether any given control
20138 element contained a bit set that indicates the second word. */
20139 mask = operands[3];
20140 vt = GEN_INT (w);
20141 if (maskmode == V2DImode && !TARGET_SSE4_1)
20142 {
20143 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20144 more shuffle to convert the V2DI input mask into a V4SI
20145 input mask. At that point the masking done by
20146 expand_int_vcond will work as desired. */
20147 rtx t3 = gen_reg_rtx (V4SImode);
20148 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20149 const0_rtx, const0_rtx,
20150 const2_rtx, const2_rtx));
20151 mask = t3;
20152 maskmode = V4SImode;
20153 e = w = 4;
20154 }
20155
20156 for (i = 0; i < w; i++)
20157 vec[i] = vt;
20158 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20159 vt = force_reg (maskmode, vt);
20160 mask = expand_simple_binop (maskmode, AND, mask, vt,
20161 NULL_RTX, 0, OPTAB_DIRECT);
20162
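/* For each element, the vcond built below selects the value shuffled
   from the second input vector (t2) when the control bit W was set in
   the original mask, and the value shuffled from the first input (t1)
   otherwise.  */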
20163 xops[0] = gen_lowpart (mode, operands[0]);
20164 xops[1] = gen_lowpart (mode, t2);
20165 xops[2] = gen_lowpart (mode, t1);
20166 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20167 xops[4] = mask;
20168 xops[5] = vt;
20169 ok = ix86_expand_int_vcond (xops);
20170 gcc_assert (ok);
20171 }
20172 }
20173
20174 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
20175 true if we should do zero extension, else sign extension. HIGH_P is
20176 true if we want the N/2 high elements, else the low elements. */
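/* For example, with operands[1] in V8HImode and HIGH_P false, operands[0]
   is expected to be V4SImode and receives elements 0..3 widened to 32 bits;
   with HIGH_P true it receives elements 4..7 instead.  */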
20177
20178 void
20179 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
20180 {
20181 enum machine_mode imode = GET_MODE (operands[1]);
20182 rtx tmp, dest;
20183
20184 if (TARGET_SSE4_1)
20185 {
20186 rtx (*unpack)(rtx, rtx);
20187 rtx (*extract)(rtx, rtx) = NULL;
20188 enum machine_mode halfmode = BLKmode;
20189
20190 switch (imode)
20191 {
20192 case V32QImode:
20193 if (unsigned_p)
20194 unpack = gen_avx2_zero_extendv16qiv16hi2;
20195 else
20196 unpack = gen_avx2_sign_extendv16qiv16hi2;
20197 halfmode = V16QImode;
20198 extract
20199 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20200 break;
20201 case V16HImode:
20202 if (unsigned_p)
20203 unpack = gen_avx2_zero_extendv8hiv8si2;
20204 else
20205 unpack = gen_avx2_sign_extendv8hiv8si2;
20206 halfmode = V8HImode;
20207 extract
20208 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20209 break;
20210 case V8SImode:
20211 if (unsigned_p)
20212 unpack = gen_avx2_zero_extendv4siv4di2;
20213 else
20214 unpack = gen_avx2_sign_extendv4siv4di2;
20215 halfmode = V4SImode;
20216 extract
20217 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20218 break;
20219 case V16QImode:
20220 if (unsigned_p)
20221 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20222 else
20223 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20224 break;
20225 case V8HImode:
20226 if (unsigned_p)
20227 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20228 else
20229 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20230 break;
20231 case V4SImode:
20232 if (unsigned_p)
20233 unpack = gen_sse4_1_zero_extendv2siv2di2;
20234 else
20235 unpack = gen_sse4_1_sign_extendv2siv2di2;
20236 break;
20237 default:
20238 gcc_unreachable ();
20239 }
20240
20241 if (GET_MODE_SIZE (imode) == 32)
20242 {
20243 tmp = gen_reg_rtx (halfmode);
20244 emit_insn (extract (tmp, operands[1]));
20245 }
20246 else if (high_p)
20247 {
20248 /* Shift higher 8 bytes to lower 8 bytes. */
20249 tmp = gen_reg_rtx (imode);
20250 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20251 gen_lowpart (V1TImode, operands[1]),
20252 GEN_INT (64)));
20253 }
20254 else
20255 tmp = operands[1];
20256
20257 emit_insn (unpack (operands[0], tmp));
20258 }
20259 else
20260 {
20261 rtx (*unpack)(rtx, rtx, rtx);
20262
20263 switch (imode)
20264 {
20265 case V16QImode:
20266 if (high_p)
20267 unpack = gen_vec_interleave_highv16qi;
20268 else
20269 unpack = gen_vec_interleave_lowv16qi;
20270 break;
20271 case V8HImode:
20272 if (high_p)
20273 unpack = gen_vec_interleave_highv8hi;
20274 else
20275 unpack = gen_vec_interleave_lowv8hi;
20276 break;
20277 case V4SImode:
20278 if (high_p)
20279 unpack = gen_vec_interleave_highv4si;
20280 else
20281 unpack = gen_vec_interleave_lowv4si;
20282 break;
20283 default:
20284 gcc_unreachable ();
20285 }
20286
20287 dest = gen_lowpart (imode, operands[0]);
20288
20289 if (unsigned_p)
20290 tmp = force_reg (imode, CONST0_RTX (imode));
20291 else
20292 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20293 operands[1], pc_rtx, pc_rtx);
20294
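/* TMP supplies the value interleaved into the high half of each widened
   element: all zeros for a zero extension, or a full mask of the sign bit
   (0 > operands[1], i.e. all-ones for negative elements) for a sign
   extension.  */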
20295 emit_insn (unpack (dest, operands[1], tmp));
20296 }
20297 }
20298
20299 /* Expand conditional increment or decrement using adc/sbb instructions.
20300 The default case using setcc followed by the conditional move can be
20301 done by generic code. */
20302 bool
20303 ix86_expand_int_addcc (rtx operands[])
20304 {
20305 enum rtx_code code = GET_CODE (operands[1]);
20306 rtx flags;
20307 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20308 rtx compare_op;
20309 rtx val = const0_rtx;
20310 bool fpcmp = false;
20311 enum machine_mode mode;
20312 rtx op0 = XEXP (operands[1], 0);
20313 rtx op1 = XEXP (operands[1], 1);
20314
20315 if (operands[3] != const1_rtx
20316 && operands[3] != constm1_rtx)
20317 return false;
20318 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20319 return false;
20320 code = GET_CODE (compare_op);
20321
20322 flags = XEXP (compare_op, 0);
20323
20324 if (GET_MODE (flags) == CCFPmode
20325 || GET_MODE (flags) == CCFPUmode)
20326 {
20327 fpcmp = true;
20328 code = ix86_fp_compare_code_to_integer (code);
20329 }
20330
20331 if (code != LTU)
20332 {
20333 val = constm1_rtx;
20334 if (fpcmp)
20335 PUT_CODE (compare_op,
20336 reverse_condition_maybe_unordered
20337 (GET_CODE (compare_op)));
20338 else
20339 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20340 }
20341
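/* Roughly, after the adjustments above the condition is carried in the
   carry flag and the conditional increment/decrement folds into a single
   carry-using add or subtract: for example, when the compare sets the
   carry exactly when the condition holds, "adc $0, dest" implements the
   conditional increment and "sbb $0, dest" the conditional decrement;
   for the opposite carry sense the code adds or subtracts -1 instead and
   lets the borrow cancel it.  */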
20342 mode = GET_MODE (operands[0]);
20343
20344 /* Construct either adc or sbb insn. */
20345 if ((code == LTU) == (operands[3] == constm1_rtx))
20346 {
20347 switch (mode)
20348 {
20349 case QImode:
20350 insn = gen_subqi3_carry;
20351 break;
20352 case HImode:
20353 insn = gen_subhi3_carry;
20354 break;
20355 case SImode:
20356 insn = gen_subsi3_carry;
20357 break;
20358 case DImode:
20359 insn = gen_subdi3_carry;
20360 break;
20361 default:
20362 gcc_unreachable ();
20363 }
20364 }
20365 else
20366 {
20367 switch (mode)
20368 {
20369 case QImode:
20370 insn = gen_addqi3_carry;
20371 break;
20372 case HImode:
20373 insn = gen_addhi3_carry;
20374 break;
20375 case SImode:
20376 insn = gen_addsi3_carry;
20377 break;
20378 case DImode:
20379 insn = gen_adddi3_carry;
20380 break;
20381 default:
20382 gcc_unreachable ();
20383 }
20384 }
20385 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20386
20387 return true;
20388 }
20389
20390
20391 /* Split OPERAND into word-sized parts stored in PARTS. Similar to
20392 split_double_mode, but works for floating point parameters and
20393 non-offsettable memories. For pushes, it returns just stack offsets;
20394 the values will be saved in the right order. At most four parts are generated. */
20395
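/* For instance, on a 32-bit target a DFmode operand yields two SImode
   parts, an XFmode operand three, and a TFmode operand four; on a 64-bit
   target an XFmode operand is split into a DImode part plus an SImode
   part.  */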
20396 static int
20397 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20398 {
20399 int size;
20400
20401 if (!TARGET_64BIT)
20402 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20403 else
20404 size = (GET_MODE_SIZE (mode) + 4) / 8;
20405
20406 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20407 gcc_assert (size >= 2 && size <= 4);
20408
20409 /* Optimize constant pool reference to immediates. This is used by fp
20410 moves, which force all constants to memory to allow combining. */
20411 if (MEM_P (operand) && MEM_READONLY_P (operand))
20412 {
20413 rtx tmp = maybe_get_pool_constant (operand);
20414 if (tmp)
20415 operand = tmp;
20416 }
20417
20418 if (MEM_P (operand) && !offsettable_memref_p (operand))
20419 {
20420 /* The only non-offsettable memories we handle are pushes. */
20421 int ok = push_operand (operand, VOIDmode);
20422
20423 gcc_assert (ok);
20424
20425 operand = copy_rtx (operand);
20426 PUT_MODE (operand, word_mode);
20427 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20428 return size;
20429 }
20430
20431 if (GET_CODE (operand) == CONST_VECTOR)
20432 {
20433 enum machine_mode imode = int_mode_for_mode (mode);
20434 /* Caution: if we looked through a constant pool memory above,
20435 the operand may actually have a different mode now. That's
20436 ok, since we want to pun this all the way back to an integer. */
20437 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20438 gcc_assert (operand != NULL);
20439 mode = imode;
20440 }
20441
20442 if (!TARGET_64BIT)
20443 {
20444 if (mode == DImode)
20445 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20446 else
20447 {
20448 int i;
20449
20450 if (REG_P (operand))
20451 {
20452 gcc_assert (reload_completed);
20453 for (i = 0; i < size; i++)
20454 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20455 }
20456 else if (offsettable_memref_p (operand))
20457 {
20458 operand = adjust_address (operand, SImode, 0);
20459 parts[0] = operand;
20460 for (i = 1; i < size; i++)
20461 parts[i] = adjust_address (operand, SImode, 4 * i);
20462 }
20463 else if (GET_CODE (operand) == CONST_DOUBLE)
20464 {
20465 REAL_VALUE_TYPE r;
20466 long l[4];
20467
20468 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20469 switch (mode)
20470 {
20471 case TFmode:
20472 real_to_target (l, &r, mode);
20473 parts[3] = gen_int_mode (l[3], SImode);
20474 parts[2] = gen_int_mode (l[2], SImode);
20475 break;
20476 case XFmode:
20477 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20478 parts[2] = gen_int_mode (l[2], SImode);
20479 break;
20480 case DFmode:
20481 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20482 break;
20483 default:
20484 gcc_unreachable ();
20485 }
20486 parts[1] = gen_int_mode (l[1], SImode);
20487 parts[0] = gen_int_mode (l[0], SImode);
20488 }
20489 else
20490 gcc_unreachable ();
20491 }
20492 }
20493 else
20494 {
20495 if (mode == TImode)
20496 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20497 if (mode == XFmode || mode == TFmode)
20498 {
20499 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20500 if (REG_P (operand))
20501 {
20502 gcc_assert (reload_completed);
20503 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20504 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20505 }
20506 else if (offsettable_memref_p (operand))
20507 {
20508 operand = adjust_address (operand, DImode, 0);
20509 parts[0] = operand;
20510 parts[1] = adjust_address (operand, upper_mode, 8);
20511 }
20512 else if (GET_CODE (operand) == CONST_DOUBLE)
20513 {
20514 REAL_VALUE_TYPE r;
20515 long l[4];
20516
20517 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20518 real_to_target (l, &r, mode);
20519
20520 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20521 if (HOST_BITS_PER_WIDE_INT >= 64)
20522 parts[0]
20523 = gen_int_mode
20524 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20525 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20526 DImode);
20527 else
20528 parts[0] = immed_double_const (l[0], l[1], DImode);
20529
20530 if (upper_mode == SImode)
20531 parts[1] = gen_int_mode (l[2], SImode);
20532 else if (HOST_BITS_PER_WIDE_INT >= 64)
20533 parts[1]
20534 = gen_int_mode
20535 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20536 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20537 DImode);
20538 else
20539 parts[1] = immed_double_const (l[2], l[3], DImode);
20540 }
20541 else
20542 gcc_unreachable ();
20543 }
20544 }
20545
20546 return size;
20547 }
20548
20549 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20550 All required insns are emitted here. Operands 2-5 receive the
20551 destination parts and operands 6-9 receive the source parts,
20552 in the order in which the moves should be emitted. */
20553
20554 void
20555 ix86_split_long_move (rtx operands[])
20556 {
20557 rtx part[2][4];
20558 int nparts, i, j;
20559 int push = 0;
20560 int collisions = 0;
20561 enum machine_mode mode = GET_MODE (operands[0]);
20562 bool collisionparts[4];
20563
20564 /* The DFmode expanders may ask us to move a double.
20565 For a 64-bit target this is a single move. By hiding that fact
20566 here we simplify the i386.md splitters. */
20567 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20568 {
20569 /* Optimize constant pool reference to immediates. This is used by
20570 fp moves, which force all constants to memory to allow combining. */
20571
20572 if (MEM_P (operands[1])
20573 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20574 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20575 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20576 if (push_operand (operands[0], VOIDmode))
20577 {
20578 operands[0] = copy_rtx (operands[0]);
20579 PUT_MODE (operands[0], word_mode);
20580 }
20581 else
20582 operands[0] = gen_lowpart (DImode, operands[0]);
20583 operands[1] = gen_lowpart (DImode, operands[1]);
20584 emit_move_insn (operands[0], operands[1]);
20585 return;
20586 }
20587
20588 /* The only non-offsettable memory we handle is push. */
20589 if (push_operand (operands[0], VOIDmode))
20590 push = 1;
20591 else
20592 gcc_assert (!MEM_P (operands[0])
20593 || offsettable_memref_p (operands[0]));
20594
20595 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20596 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20597
20598 /* When emitting a push, take care of source operands that live on the stack. */
20599 if (push && MEM_P (operands[1])
20600 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20601 {
20602 rtx src_base = XEXP (part[1][nparts - 1], 0);
20603
20604 /* Compensate for the stack decrement by 4. */
20605 if (!TARGET_64BIT && nparts == 3
20606 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20607 src_base = plus_constant (src_base, 4);
20608
20609 /* src_base refers to the stack pointer and is
20610 automatically decreased by emitted push. */
20611 for (i = 0; i < nparts; i++)
20612 part[1][i] = change_address (part[1][i],
20613 GET_MODE (part[1][i]), src_base);
20614 }
20615
20616 /* We need to do the copy in the right order in case an address register
20617 of the source overlaps the destination. */
20618 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20619 {
20620 rtx tmp;
20621
20622 for (i = 0; i < nparts; i++)
20623 {
20624 collisionparts[i]
20625 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20626 if (collisionparts[i])
20627 collisions++;
20628 }
20629
20630 /* Collision in the middle part can be handled by reordering. */
20631 if (collisions == 1 && nparts == 3 && collisionparts [1])
20632 {
20633 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20634 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20635 }
20636 else if (collisions == 1
20637 && nparts == 4
20638 && (collisionparts [1] || collisionparts [2]))
20639 {
20640 if (collisionparts [1])
20641 {
20642 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20643 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20644 }
20645 else
20646 {
20647 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20648 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20649 }
20650 }
20651
20652 /* If there are more collisions, we can't handle it by reordering.
20653 Do an lea to the last part and use only one colliding move. */
20654 else if (collisions > 1)
20655 {
20656 rtx base;
20657
20658 collisions = 1;
20659
20660 base = part[0][nparts - 1];
20661
20662 /* Handle the case when the last part isn't valid for lea.
20663 Happens in 64-bit mode storing the 12-byte XFmode. */
20664 if (GET_MODE (base) != Pmode)
20665 base = gen_rtx_REG (Pmode, REGNO (base));
20666
20667 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20668 part[1][0] = replace_equiv_address (part[1][0], base);
20669 for (i = 1; i < nparts; i++)
20670 {
20671 tmp = plus_constant (base, UNITS_PER_WORD * i);
20672 part[1][i] = replace_equiv_address (part[1][i], tmp);
20673 }
20674 }
20675 }
20676
20677 if (push)
20678 {
20679 if (!TARGET_64BIT)
20680 {
20681 if (nparts == 3)
20682 {
20683 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20684 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
20685 stack_pointer_rtx, GEN_INT (-4)));
20686 emit_move_insn (part[0][2], part[1][2]);
20687 }
20688 else if (nparts == 4)
20689 {
20690 emit_move_insn (part[0][3], part[1][3]);
20691 emit_move_insn (part[0][2], part[1][2]);
20692 }
20693 }
20694 else
20695 {
20696 /* In 64bit mode we don't have a 32bit push available. If this is a
20697 register, it is OK - we will just use the larger counterpart. We also
20698 retype memory - this comes from an attempt to avoid a REX prefix when
20699 moving the second half of a TFmode value. */
20700 if (GET_MODE (part[1][1]) == SImode)
20701 {
20702 switch (GET_CODE (part[1][1]))
20703 {
20704 case MEM:
20705 part[1][1] = adjust_address (part[1][1], DImode, 0);
20706 break;
20707
20708 case REG:
20709 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20710 break;
20711
20712 default:
20713 gcc_unreachable ();
20714 }
20715
20716 if (GET_MODE (part[1][0]) == SImode)
20717 part[1][0] = part[1][1];
20718 }
20719 }
20720 emit_move_insn (part[0][1], part[1][1]);
20721 emit_move_insn (part[0][0], part[1][0]);
20722 return;
20723 }
20724
20725 /* Choose correct order to not overwrite the source before it is copied. */
20726 if ((REG_P (part[0][0])
20727 && REG_P (part[1][1])
20728 && (REGNO (part[0][0]) == REGNO (part[1][1])
20729 || (nparts == 3
20730 && REGNO (part[0][0]) == REGNO (part[1][2]))
20731 || (nparts == 4
20732 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20733 || (collisions > 0
20734 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20735 {
20736 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20737 {
20738 operands[2 + i] = part[0][j];
20739 operands[6 + i] = part[1][j];
20740 }
20741 }
20742 else
20743 {
20744 for (i = 0; i < nparts; i++)
20745 {
20746 operands[2 + i] = part[0][i];
20747 operands[6 + i] = part[1][i];
20748 }
20749 }
20750
20751 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20752 if (optimize_insn_for_size_p ())
20753 {
20754 for (j = 0; j < nparts - 1; j++)
20755 if (CONST_INT_P (operands[6 + j])
20756 && operands[6 + j] != const0_rtx
20757 && REG_P (operands[2 + j]))
20758 for (i = j; i < nparts - 1; i++)
20759 if (CONST_INT_P (operands[7 + i])
20760 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20761 operands[7 + i] = operands[2 + j];
20762 }
20763
20764 for (i = 0; i < nparts; i++)
20765 emit_move_insn (operands[2 + i], operands[6 + i]);
20766
20767 return;
20768 }
20769
20770 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20771 left shift by a constant, either using a single shift or
20772 a sequence of add instructions. */
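/* A shift by 1 is always emitted as a single add (the operand added to
   itself). Larger constant shifts become a chain of adds only when COUNT
   additions are no more expensive than one constant shift in the active
   cost table and we are not optimizing for size; e.g. if, hypothetically,
   add costs 1 and shift_const costs 3, shifts by 2 or 3 become adds while
   a shift by 4 uses a single shift.  */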
20773
20774 static void
20775 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20776 {
20777 rtx (*insn)(rtx, rtx, rtx);
20778
20779 if (count == 1
20780 || (count * ix86_cost->add <= ix86_cost->shift_const
20781 && !optimize_insn_for_size_p ()))
20782 {
20783 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20784 while (count-- > 0)
20785 emit_insn (insn (operand, operand, operand));
20786 }
20787 else
20788 {
20789 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20790 emit_insn (insn (operand, operand, GEN_INT (count)));
20791 }
20792 }
20793
20794 void
20795 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20796 {
20797 rtx (*gen_ashl3)(rtx, rtx, rtx);
20798 rtx (*gen_shld)(rtx, rtx, rtx);
20799 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20800
20801 rtx low[2], high[2];
20802 int count;
20803
20804 if (CONST_INT_P (operands[2]))
20805 {
20806 split_double_mode (mode, operands, 2, low, high);
20807 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20808
20809 if (count >= half_width)
20810 {
20811 emit_move_insn (high[0], low[1]);
20812 emit_move_insn (low[0], const0_rtx);
20813
20814 if (count > half_width)
20815 ix86_expand_ashl_const (high[0], count - half_width, mode);
20816 }
20817 else
20818 {
20819 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20820
20821 if (!rtx_equal_p (operands[0], operands[1]))
20822 emit_move_insn (operands[0], operands[1]);
20823
20824 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20825 ix86_expand_ashl_const (low[0], count, mode);
20826 }
20827 return;
20828 }
20829
20830 split_double_mode (mode, operands, 1, low, high);
20831
20832 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20833
20834 if (operands[1] == const1_rtx)
20835 {
20836 /* Assuming we've chosen QImode-capable registers, 1 << N
20837 can be done with two 32/64-bit shifts, no branches, no cmoves. */
20838 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20839 {
20840 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20841
20842 ix86_expand_clear (low[0]);
20843 ix86_expand_clear (high[0]);
20844 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20845
20846 d = gen_lowpart (QImode, low[0]);
20847 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20848 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20849 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20850
20851 d = gen_lowpart (QImode, high[0]);
20852 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20853 s = gen_rtx_NE (QImode, flags, const0_rtx);
20854 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20855 }
20856
20857 /* Otherwise, we can get the same results by manually performing
20858 a bit extract operation on bit 5/6, and then performing the two
20859 shifts. The two methods of getting 0/1 into low/high are exactly
20860 the same size. Avoiding the shift in the bit extract case helps
20861 pentium4 a bit; no one else seems to care much either way. */
20862 else
20863 {
20864 enum machine_mode half_mode;
20865 rtx (*gen_lshr3)(rtx, rtx, rtx);
20866 rtx (*gen_and3)(rtx, rtx, rtx);
20867 rtx (*gen_xor3)(rtx, rtx, rtx);
20868 HOST_WIDE_INT bits;
20869 rtx x;
20870
20871 if (mode == DImode)
20872 {
20873 half_mode = SImode;
20874 gen_lshr3 = gen_lshrsi3;
20875 gen_and3 = gen_andsi3;
20876 gen_xor3 = gen_xorsi3;
20877 bits = 5;
20878 }
20879 else
20880 {
20881 half_mode = DImode;
20882 gen_lshr3 = gen_lshrdi3;
20883 gen_and3 = gen_anddi3;
20884 gen_xor3 = gen_xordi3;
20885 bits = 6;
20886 }
20887
20888 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20889 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20890 else
20891 x = gen_lowpart (half_mode, operands[2]);
20892 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20893
20894 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20895 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20896 emit_move_insn (low[0], high[0]);
20897 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20898 }
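/* At this point exactly one of low[0]/high[0] holds 1 (low when the
   count is below half_width, high otherwise) and the other holds 0.
   The two shifts below rely on the hardware shift using only the low
   5/6 bits of the count, so whichever half holds the 1 ends up shifted
   by count mod half_width, producing the full double-word 1 << count.  */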
20899
20900 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20901 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20902 return;
20903 }
20904
20905 if (operands[1] == constm1_rtx)
20906 {
20907 /* For -1 << N, we can avoid the shld instruction, because we
20908 know that we're shifting 0...31/63 ones into a -1. */
20909 emit_move_insn (low[0], constm1_rtx);
20910 if (optimize_insn_for_size_p ())
20911 emit_move_insn (high[0], low[0]);
20912 else
20913 emit_move_insn (high[0], constm1_rtx);
20914 }
20915 else
20916 {
20917 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20918
20919 if (!rtx_equal_p (operands[0], operands[1]))
20920 emit_move_insn (operands[0], operands[1]);
20921
20922 split_double_mode (mode, operands, 1, low, high);
20923 emit_insn (gen_shld (high[0], low[0], operands[2]));
20924 }
20925
20926 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20927
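/* The shld/shl pair above is only correct for counts below half_width;
   the adjustment patterns below in effect handle count >= half_width by
   moving the shifted low word into the high word and clearing the low
   word, using either a conditional move (when CMOV and a SCRATCH
   register are available) or a conditional jump.  */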
20928 if (TARGET_CMOVE && scratch)
20929 {
20930 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20931 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20932
20933 ix86_expand_clear (scratch);
20934 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20935 }
20936 else
20937 {
20938 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20939 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20940
20941 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20942 }
20943 }
20944
20945 void
20946 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20947 {
20948 rtx (*gen_ashr3)(rtx, rtx, rtx)
20949 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20950 rtx (*gen_shrd)(rtx, rtx, rtx);
20951 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20952
20953 rtx low[2], high[2];
20954 int count;
20955
20956 if (CONST_INT_P (operands[2]))
20957 {
20958 split_double_mode (mode, operands, 2, low, high);
20959 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20960
20961 if (count == GET_MODE_BITSIZE (mode) - 1)
20962 {
20963 emit_move_insn (high[0], high[1]);
20964 emit_insn (gen_ashr3 (high[0], high[0],
20965 GEN_INT (half_width - 1)));
20966 emit_move_insn (low[0], high[0]);
20967
20968 }
20969 else if (count >= half_width)
20970 {
20971 emit_move_insn (low[0], high[1]);
20972 emit_move_insn (high[0], low[0]);
20973 emit_insn (gen_ashr3 (high[0], high[0],
20974 GEN_INT (half_width - 1)));
20975
20976 if (count > half_width)
20977 emit_insn (gen_ashr3 (low[0], low[0],
20978 GEN_INT (count - half_width)));
20979 }
20980 else
20981 {
20982 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20983
20984 if (!rtx_equal_p (operands[0], operands[1]))
20985 emit_move_insn (operands[0], operands[1]);
20986
20987 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20988 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
20989 }
20990 }
20991 else
20992 {
20993 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20994
20995 if (!rtx_equal_p (operands[0], operands[1]))
20996 emit_move_insn (operands[0], operands[1]);
20997
20998 split_double_mode (mode, operands, 1, low, high);
20999
21000 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21001 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21002
21003 if (TARGET_CMOVE && scratch)
21004 {
21005 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21006 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21007
21008 emit_move_insn (scratch, high[0]);
21009 emit_insn (gen_ashr3 (scratch, scratch,
21010 GEN_INT (half_width - 1)));
21011 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21012 scratch));
21013 }
21014 else
21015 {
21016 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21017 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21018
21019 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21020 }
21021 }
21022 }
21023
21024 void
21025 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21026 {
21027 rtx (*gen_lshr3)(rtx, rtx, rtx)
21028 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21029 rtx (*gen_shrd)(rtx, rtx, rtx);
21030 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21031
21032 rtx low[2], high[2];
21033 int count;
21034
21035 if (CONST_INT_P (operands[2]))
21036 {
21037 split_double_mode (mode, operands, 2, low, high);
21038 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21039
21040 if (count >= half_width)
21041 {
21042 emit_move_insn (low[0], high[1]);
21043 ix86_expand_clear (high[0]);
21044
21045 if (count > half_width)
21046 emit_insn (gen_lshr3 (low[0], low[0],
21047 GEN_INT (count - half_width)));
21048 }
21049 else
21050 {
21051 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21052
21053 if (!rtx_equal_p (operands[0], operands[1]))
21054 emit_move_insn (operands[0], operands[1]);
21055
21056 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21057 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21058 }
21059 }
21060 else
21061 {
21062 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21063
21064 if (!rtx_equal_p (operands[0], operands[1]))
21065 emit_move_insn (operands[0], operands[1]);
21066
21067 split_double_mode (mode, operands, 1, low, high);
21068
21069 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21070 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21071
21072 if (TARGET_CMOVE && scratch)
21073 {
21074 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21075 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21076
21077 ix86_expand_clear (scratch);
21078 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21079 scratch));
21080 }
21081 else
21082 {
21083 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21084 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21085
21086 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21087 }
21088 }
21089 }
21090
21091 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
21092 static void
21093 predict_jump (int prob)
21094 {
21095 rtx insn = get_last_insn ();
21096 gcc_assert (JUMP_P (insn));
21097 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21098 }
21099
21100 /* Helper function for the string operations below. Test whether VARIABLE
21101 is aligned to VALUE bytes; if so, jump to the returned label. */
21102 static rtx
21103 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21104 {
21105 rtx label = gen_label_rtx ();
21106 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21107 if (GET_MODE (variable) == DImode)
21108 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21109 else
21110 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21111 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21112 1, label);
21113 if (epilogue)
21114 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21115 else
21116 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21117 return label;
21118 }
21119
21120 /* Decrease COUNTREG by VALUE. */
21121 static void
21122 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21123 {
21124 rtx (*gen_add)(rtx, rtx, rtx)
21125 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21126
21127 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21128 }
21129
21130 /* Zero extend EXP, which is possibly in SImode, to a Pmode register. */
21131 rtx
21132 ix86_zero_extend_to_Pmode (rtx exp)
21133 {
21134 if (GET_MODE (exp) != Pmode)
21135 exp = convert_to_mode (Pmode, exp, 1);
21136 return force_reg (Pmode, exp);
21137 }
21138
21139 /* Divide COUNTREG by SCALE. */
21140 static rtx
21141 scale_counter (rtx countreg, int scale)
21142 {
21143 rtx sc;
21144
21145 if (scale == 1)
21146 return countreg;
21147 if (CONST_INT_P (countreg))
21148 return GEN_INT (INTVAL (countreg) / scale);
21149 gcc_assert (REG_P (countreg));
21150
21151 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21152 GEN_INT (exact_log2 (scale)),
21153 NULL, 1, OPTAB_DIRECT);
21154 return sc;
21155 }
21156
21157 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21158 DImode for constant loop counts. */
21159
21160 static enum machine_mode
21161 counter_mode (rtx count_exp)
21162 {
21163 if (GET_MODE (count_exp) != VOIDmode)
21164 return GET_MODE (count_exp);
21165 if (!CONST_INT_P (count_exp))
21166 return Pmode;
21167 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21168 return DImode;
21169 return SImode;
21170 }
21171
21172 /* When SRCPTR is non-NULL, output a simple loop that moves memory
21173 pointed to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21174 the overall size is COUNT, specified in bytes. When SRCPTR is NULL,
21175 output the equivalent loop that sets memory to VALUE (expected to be in MODE).
21176
21177 The size is rounded down to a whole number of chunks moved at once.
21178 SRCMEM and DESTMEM provide the MEM rtx used to feed proper aliasing info. */
21179
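/* As a rough sketch (not literal RTL), the loop emitted below for a copy
   with MODE == SImode and UNROLL == 4 corresponds to:

       size = count & ~15;            size in whole 16-byte chunks
       iter = 0;
       do
         {
           load four SImode temporaries from srcptr + iter;
           store them to destptr + iter;
           iter += 16;
         }
       while (iter < size);
       destptr += iter;  srcptr += iter;

   The pointers themselves are advanced only after the loop; addressing
   inside the loop uses pointer + iter.  */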
21180
21181 static void
21182 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21183 rtx destptr, rtx srcptr, rtx value,
21184 rtx count, enum machine_mode mode, int unroll,
21185 int expected_size)
21186 {
21187 rtx out_label, top_label, iter, tmp;
21188 enum machine_mode iter_mode = counter_mode (count);
21189 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21190 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21191 rtx size;
21192 rtx x_addr;
21193 rtx y_addr;
21194 int i;
21195
21196 top_label = gen_label_rtx ();
21197 out_label = gen_label_rtx ();
21198 iter = gen_reg_rtx (iter_mode);
21199
21200 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21201 NULL, 1, OPTAB_DIRECT);
21202 /* Those two should combine. */
21203 if (piece_size == const1_rtx)
21204 {
21205 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21206 true, out_label);
21207 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21208 }
21209 emit_move_insn (iter, const0_rtx);
21210
21211 emit_label (top_label);
21212
21213 tmp = convert_modes (Pmode, iter_mode, iter, true);
21214 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21215 destmem = change_address (destmem, mode, x_addr);
21216
21217 if (srcmem)
21218 {
21219 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21220 srcmem = change_address (srcmem, mode, y_addr);
21221
21222 /* When unrolling for chips that reorder memory reads and writes,
21223 we can save registers by using a single temporary.
21224 Also, using 4 temporaries is overkill in 32-bit mode. */
21225 if (!TARGET_64BIT && 0)
21226 {
21227 for (i = 0; i < unroll; i++)
21228 {
21229 if (i)
21230 {
21231 destmem =
21232 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21233 srcmem =
21234 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21235 }
21236 emit_move_insn (destmem, srcmem);
21237 }
21238 }
21239 else
21240 {
21241 rtx tmpreg[4];
21242 gcc_assert (unroll <= 4);
21243 for (i = 0; i < unroll; i++)
21244 {
21245 tmpreg[i] = gen_reg_rtx (mode);
21246 if (i)
21247 {
21248 srcmem =
21249 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21250 }
21251 emit_move_insn (tmpreg[i], srcmem);
21252 }
21253 for (i = 0; i < unroll; i++)
21254 {
21255 if (i)
21256 {
21257 destmem =
21258 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21259 }
21260 emit_move_insn (destmem, tmpreg[i]);
21261 }
21262 }
21263 }
21264 else
21265 for (i = 0; i < unroll; i++)
21266 {
21267 if (i)
21268 destmem =
21269 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21270 emit_move_insn (destmem, value);
21271 }
21272
21273 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21274 true, OPTAB_LIB_WIDEN);
21275 if (tmp != iter)
21276 emit_move_insn (iter, tmp);
21277
21278 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21279 true, top_label);
21280 if (expected_size != -1)
21281 {
21282 expected_size /= GET_MODE_SIZE (mode) * unroll;
21283 if (expected_size == 0)
21284 predict_jump (0);
21285 else if (expected_size > REG_BR_PROB_BASE)
21286 predict_jump (REG_BR_PROB_BASE - 1);
21287 else
21288 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21289 }
21290 else
21291 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21292 iter = ix86_zero_extend_to_Pmode (iter);
21293 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21294 true, OPTAB_LIB_WIDEN);
21295 if (tmp != destptr)
21296 emit_move_insn (destptr, tmp);
21297 if (srcptr)
21298 {
21299 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21300 true, OPTAB_LIB_WIDEN);
21301 if (tmp != srcptr)
21302 emit_move_insn (srcptr, tmp);
21303 }
21304 emit_label (out_label);
21305 }
21306
21307 /* Output a "rep; mov" instruction.
21308 Arguments have the same meaning as for the previous function. */
21309 static void
21310 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21311 rtx destptr, rtx srcptr,
21312 rtx count,
21313 enum machine_mode mode)
21314 {
21315 rtx destexp;
21316 rtx srcexp;
21317 rtx countreg;
21318 HOST_WIDE_INT rounded_count;
21319
21320 /* If the size is known, it is shorter to use rep movs. */
21321 if (mode == QImode && CONST_INT_P (count)
21322 && !(INTVAL (count) & 3))
21323 mode = SImode;
21324
21325 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21326 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21327 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21328 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21329 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21330 if (mode != QImode)
21331 {
21332 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21333 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21334 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21335 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21336 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21337 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21338 }
21339 else
21340 {
21341 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21342 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21343 }
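/* DESTEXP and SRCEXP give, in effect, the final values of the destination
   and source pointers (pointer plus number of bytes copied); the rep_mov
   pattern uses them to record the pointer updates performed by the string
   instruction.  */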
21344 if (CONST_INT_P (count))
21345 {
21346 rounded_count = (INTVAL (count)
21347 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21348 destmem = shallow_copy_rtx (destmem);
21349 srcmem = shallow_copy_rtx (srcmem);
21350 set_mem_size (destmem, rounded_count);
21351 set_mem_size (srcmem, rounded_count);
21352 }
21353 else
21354 {
21355 if (MEM_SIZE_KNOWN_P (destmem))
21356 clear_mem_size (destmem);
21357 if (MEM_SIZE_KNOWN_P (srcmem))
21358 clear_mem_size (srcmem);
21359 }
21360 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21361 destexp, srcexp));
21362 }
21363
21364 /* Output a "rep; stos" instruction.
21365 Arguments have the same meaning as for the previous function. */
21366 static void
21367 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21368 rtx count, enum machine_mode mode,
21369 rtx orig_value)
21370 {
21371 rtx destexp;
21372 rtx countreg;
21373 HOST_WIDE_INT rounded_count;
21374
21375 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21376 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21377 value = force_reg (mode, gen_lowpart (mode, value));
21378 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21379 if (mode != QImode)
21380 {
21381 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21382 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21383 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21384 }
21385 else
21386 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21387 if (orig_value == const0_rtx && CONST_INT_P (count))
21388 {
21389 rounded_count = (INTVAL (count)
21390 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21391 destmem = shallow_copy_rtx (destmem);
21392 set_mem_size (destmem, rounded_count);
21393 }
21394 else if (MEM_SIZE_KNOWN_P (destmem))
21395 clear_mem_size (destmem);
21396 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21397 }
21398
21399 static void
21400 emit_strmov (rtx destmem, rtx srcmem,
21401 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21402 {
21403 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21404 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21405 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21406 }
21407
21408 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21409 static void
21410 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21411 rtx destptr, rtx srcptr, rtx count, int max_size)
21412 {
21413 rtx src, dest;
21414 if (CONST_INT_P (count))
21415 {
21416 HOST_WIDE_INT countval = INTVAL (count);
21417 int offset = 0;
21418
21419 if ((countval & 0x10) && max_size > 16)
21420 {
21421 if (TARGET_64BIT)
21422 {
21423 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21424 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21425 }
21426 else
21427 gcc_unreachable ();
21428 offset += 16;
21429 }
21430 if ((countval & 0x08) && max_size > 8)
21431 {
21432 if (TARGET_64BIT)
21433 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21434 else
21435 {
21436 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21437 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21438 }
21439 offset += 8;
21440 }
21441 if ((countval & 0x04) && max_size > 4)
21442 {
21443 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21444 offset += 4;
21445 }
21446 if ((countval & 0x02) && max_size > 2)
21447 {
21448 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21449 offset += 2;
21450 }
21451 if ((countval & 0x01) && max_size > 1)
21452 {
21453 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21454 offset += 1;
21455 }
21456 return;
21457 }
21458 if (max_size > 8)
21459 {
21460 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21461 count, 1, OPTAB_DIRECT);
21462 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21463 count, QImode, 1, 4);
21464 return;
21465 }
21466
21467 /* When single stringop instructions are available, we can cheaply
21468 advance the dest and src pointers. Otherwise we save code size by
21469 maintaining an offset (zero is readily available from the preceding
21470 rep operation) and using x86 addressing modes. */
21471 if (TARGET_SINGLE_STRINGOP)
21472 {
21473 if (max_size > 4)
21474 {
21475 rtx label = ix86_expand_aligntest (count, 4, true);
21476 src = change_address (srcmem, SImode, srcptr);
21477 dest = change_address (destmem, SImode, destptr);
21478 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21479 emit_label (label);
21480 LABEL_NUSES (label) = 1;
21481 }
21482 if (max_size > 2)
21483 {
21484 rtx label = ix86_expand_aligntest (count, 2, true);
21485 src = change_address (srcmem, HImode, srcptr);
21486 dest = change_address (destmem, HImode, destptr);
21487 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21488 emit_label (label);
21489 LABEL_NUSES (label) = 1;
21490 }
21491 if (max_size > 1)
21492 {
21493 rtx label = ix86_expand_aligntest (count, 1, true);
21494 src = change_address (srcmem, QImode, srcptr);
21495 dest = change_address (destmem, QImode, destptr);
21496 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21497 emit_label (label);
21498 LABEL_NUSES (label) = 1;
21499 }
21500 }
21501 else
21502 {
21503 rtx offset = force_reg (Pmode, const0_rtx);
21504 rtx tmp;
21505
21506 if (max_size > 4)
21507 {
21508 rtx label = ix86_expand_aligntest (count, 4, true);
21509 src = change_address (srcmem, SImode, srcptr);
21510 dest = change_address (destmem, SImode, destptr);
21511 emit_move_insn (dest, src);
21512 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21513 true, OPTAB_LIB_WIDEN);
21514 if (tmp != offset)
21515 emit_move_insn (offset, tmp);
21516 emit_label (label);
21517 LABEL_NUSES (label) = 1;
21518 }
21519 if (max_size > 2)
21520 {
21521 rtx label = ix86_expand_aligntest (count, 2, true);
21522 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21523 src = change_address (srcmem, HImode, tmp);
21524 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21525 dest = change_address (destmem, HImode, tmp);
21526 emit_move_insn (dest, src);
21527 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21528 true, OPTAB_LIB_WIDEN);
21529 if (tmp != offset)
21530 emit_move_insn (offset, tmp);
21531 emit_label (label);
21532 LABEL_NUSES (label) = 1;
21533 }
21534 if (max_size > 1)
21535 {
21536 rtx label = ix86_expand_aligntest (count, 1, true);
21537 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21538 src = change_address (srcmem, QImode, tmp);
21539 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21540 dest = change_address (destmem, QImode, tmp);
21541 emit_move_insn (dest, src);
21542 emit_label (label);
21543 LABEL_NUSES (label) = 1;
21544 }
21545 }
21546 }
21547
21548 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21549 static void
21550 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21551 rtx count, int max_size)
21552 {
21553 count =
21554 expand_simple_binop (counter_mode (count), AND, count,
21555 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21556 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21557 gen_lowpart (QImode, value), count, QImode,
21558 1, max_size / 2);
21559 }
21560
21561 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21562 static void
21563 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21564 {
21565 rtx dest;
21566
21567 if (CONST_INT_P (count))
21568 {
21569 HOST_WIDE_INT countval = INTVAL (count);
21570 int offset = 0;
21571
21572 if ((countval & 0x10) && max_size > 16)
21573 {
21574 if (TARGET_64BIT)
21575 {
21576 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21577 emit_insn (gen_strset (destptr, dest, value));
21578 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21579 emit_insn (gen_strset (destptr, dest, value));
21580 }
21581 else
21582 gcc_unreachable ();
21583 offset += 16;
21584 }
21585 if ((countval & 0x08) && max_size > 8)
21586 {
21587 if (TARGET_64BIT)
21588 {
21589 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21590 emit_insn (gen_strset (destptr, dest, value));
21591 }
21592 else
21593 {
21594 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21595 emit_insn (gen_strset (destptr, dest, value));
21596 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21597 emit_insn (gen_strset (destptr, dest, value));
21598 }
21599 offset += 8;
21600 }
21601 if ((countval & 0x04) && max_size > 4)
21602 {
21603 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21604 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21605 offset += 4;
21606 }
21607 if ((countval & 0x02) && max_size > 2)
21608 {
21609 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21610 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21611 offset += 2;
21612 }
21613 if ((countval & 0x01) && max_size > 1)
21614 {
21615 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21616 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21617 offset += 1;
21618 }
21619 return;
21620 }
21621 if (max_size > 32)
21622 {
21623 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21624 return;
21625 }
21626 if (max_size > 16)
21627 {
21628 rtx label = ix86_expand_aligntest (count, 16, true);
21629 if (TARGET_64BIT)
21630 {
21631 dest = change_address (destmem, DImode, destptr);
21632 emit_insn (gen_strset (destptr, dest, value));
21633 emit_insn (gen_strset (destptr, dest, value));
21634 }
21635 else
21636 {
21637 dest = change_address (destmem, SImode, destptr);
21638 emit_insn (gen_strset (destptr, dest, value));
21639 emit_insn (gen_strset (destptr, dest, value));
21640 emit_insn (gen_strset (destptr, dest, value));
21641 emit_insn (gen_strset (destptr, dest, value));
21642 }
21643 emit_label (label);
21644 LABEL_NUSES (label) = 1;
21645 }
21646 if (max_size > 8)
21647 {
21648 rtx label = ix86_expand_aligntest (count, 8, true);
21649 if (TARGET_64BIT)
21650 {
21651 dest = change_address (destmem, DImode, destptr);
21652 emit_insn (gen_strset (destptr, dest, value));
21653 }
21654 else
21655 {
21656 dest = change_address (destmem, SImode, destptr);
21657 emit_insn (gen_strset (destptr, dest, value));
21658 emit_insn (gen_strset (destptr, dest, value));
21659 }
21660 emit_label (label);
21661 LABEL_NUSES (label) = 1;
21662 }
21663 if (max_size > 4)
21664 {
21665 rtx label = ix86_expand_aligntest (count, 4, true);
21666 dest = change_address (destmem, SImode, destptr);
21667 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21668 emit_label (label);
21669 LABEL_NUSES (label) = 1;
21670 }
21671 if (max_size > 2)
21672 {
21673 rtx label = ix86_expand_aligntest (count, 2, true);
21674 dest = change_address (destmem, HImode, destptr);
21675 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21676 emit_label (label);
21677 LABEL_NUSES (label) = 1;
21678 }
21679 if (max_size > 1)
21680 {
21681 rtx label = ix86_expand_aligntest (count, 1, true);
21682 dest = change_address (destmem, QImode, destptr);
21683 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21684 emit_label (label);
21685 LABEL_NUSES (label) = 1;
21686 }
21687 }
21688
21689 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
21690 to DESIRED_ALIGNMENT. */
21691 static void
21692 expand_movmem_prologue (rtx destmem, rtx srcmem,
21693 rtx destptr, rtx srcptr, rtx count,
21694 int align, int desired_alignment)
21695 {
21696 if (align <= 1 && desired_alignment > 1)
21697 {
21698 rtx label = ix86_expand_aligntest (destptr, 1, false);
21699 srcmem = change_address (srcmem, QImode, srcptr);
21700 destmem = change_address (destmem, QImode, destptr);
21701 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21702 ix86_adjust_counter (count, 1);
21703 emit_label (label);
21704 LABEL_NUSES (label) = 1;
21705 }
21706 if (align <= 2 && desired_alignment > 2)
21707 {
21708 rtx label = ix86_expand_aligntest (destptr, 2, false);
21709 srcmem = change_address (srcmem, HImode, srcptr);
21710 destmem = change_address (destmem, HImode, destptr);
21711 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21712 ix86_adjust_counter (count, 2);
21713 emit_label (label);
21714 LABEL_NUSES (label) = 1;
21715 }
21716 if (align <= 4 && desired_alignment > 4)
21717 {
21718 rtx label = ix86_expand_aligntest (destptr, 4, false);
21719 srcmem = change_address (srcmem, SImode, srcptr);
21720 destmem = change_address (destmem, SImode, destptr);
21721 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21722 ix86_adjust_counter (count, 4);
21723 emit_label (label);
21724 LABEL_NUSES (label) = 1;
21725 }
21726 gcc_assert (desired_alignment <= 8);
21727 }
21728
21729 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
21730 ALIGN_BYTES is how many bytes need to be copied. */
21731 static rtx
21732 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21733 int desired_align, int align_bytes)
21734 {
21735 rtx src = *srcp;
21736 rtx orig_dst = dst;
21737 rtx orig_src = src;
21738 int off = 0;
21739 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21740 if (src_align_bytes >= 0)
21741 src_align_bytes = desired_align - src_align_bytes;
21742 if (align_bytes & 1)
21743 {
21744 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21745 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21746 off = 1;
21747 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21748 }
21749 if (align_bytes & 2)
21750 {
21751 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21752 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21753 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21754 set_mem_align (dst, 2 * BITS_PER_UNIT);
21755 if (src_align_bytes >= 0
21756 && (src_align_bytes & 1) == (align_bytes & 1)
21757 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21758 set_mem_align (src, 2 * BITS_PER_UNIT);
21759 off = 2;
21760 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21761 }
21762 if (align_bytes & 4)
21763 {
21764 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21765 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21766 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21767 set_mem_align (dst, 4 * BITS_PER_UNIT);
21768 if (src_align_bytes >= 0)
21769 {
21770 unsigned int src_align = 0;
21771 if ((src_align_bytes & 3) == (align_bytes & 3))
21772 src_align = 4;
21773 else if ((src_align_bytes & 1) == (align_bytes & 1))
21774 src_align = 2;
21775 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21776 set_mem_align (src, src_align * BITS_PER_UNIT);
21777 }
21778 off = 4;
21779 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21780 }
21781 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21782 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21783 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21784 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21785 if (src_align_bytes >= 0)
21786 {
21787 unsigned int src_align = 0;
21788 if ((src_align_bytes & 7) == (align_bytes & 7))
21789 src_align = 8;
21790 else if ((src_align_bytes & 3) == (align_bytes & 3))
21791 src_align = 4;
21792 else if ((src_align_bytes & 1) == (align_bytes & 1))
21793 src_align = 2;
21794 if (src_align > (unsigned int) desired_align)
21795 src_align = desired_align;
21796 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21797 set_mem_align (src, src_align * BITS_PER_UNIT);
21798 }
21799 if (MEM_SIZE_KNOWN_P (orig_dst))
21800 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21801 if (MEM_SIZE_KNOWN_P (orig_src))
21802 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21803 *srcp = src;
21804 return dst;
21805 }
21806
21807 /* Set enough of DEST to align DEST, known to be aligned by ALIGN,
21808 to DESIRED_ALIGNMENT. */
21809 static void
21810 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21811 int align, int desired_alignment)
21812 {
21813 if (align <= 1 && desired_alignment > 1)
21814 {
21815 rtx label = ix86_expand_aligntest (destptr, 1, false);
21816 destmem = change_address (destmem, QImode, destptr);
21817 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21818 ix86_adjust_counter (count, 1);
21819 emit_label (label);
21820 LABEL_NUSES (label) = 1;
21821 }
21822 if (align <= 2 && desired_alignment > 2)
21823 {
21824 rtx label = ix86_expand_aligntest (destptr, 2, false);
21825 destmem = change_address (destmem, HImode, destptr);
21826 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21827 ix86_adjust_counter (count, 2);
21828 emit_label (label);
21829 LABEL_NUSES (label) = 1;
21830 }
21831 if (align <= 4 && desired_alignment > 4)
21832 {
21833 rtx label = ix86_expand_aligntest (destptr, 4, false);
21834 destmem = change_address (destmem, SImode, destptr);
21835 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21836 ix86_adjust_counter (count, 4);
21837 emit_label (label);
21838 LABEL_NUSES (label) = 1;
21839 }
21840 gcc_assert (desired_alignment <= 8);
21841 }
21842
21843 /* Set enough of DST to align DST, known to be aligned by ALIGN, to
21844 DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
21845 static rtx
21846 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21847 int desired_align, int align_bytes)
21848 {
21849 int off = 0;
21850 rtx orig_dst = dst;
21851 if (align_bytes & 1)
21852 {
21853 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21854 off = 1;
21855 emit_insn (gen_strset (destreg, dst,
21856 gen_lowpart (QImode, value)));
21857 }
21858 if (align_bytes & 2)
21859 {
21860 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21861 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21862 set_mem_align (dst, 2 * BITS_PER_UNIT);
21863 off = 2;
21864 emit_insn (gen_strset (destreg, dst,
21865 gen_lowpart (HImode, value)));
21866 }
21867 if (align_bytes & 4)
21868 {
21869 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21870 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21871 set_mem_align (dst, 4 * BITS_PER_UNIT);
21872 off = 4;
21873 emit_insn (gen_strset (destreg, dst,
21874 gen_lowpart (SImode, value)));
21875 }
21876 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21877 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21878 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21879 if (MEM_SIZE_KNOWN_P (orig_dst))
21880 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21881 return dst;
21882 }
21883
21884 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
21885 static enum stringop_alg
21886 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21887 int *dynamic_check)
21888 {
21889 const struct stringop_algs * algs;
21890 bool optimize_for_speed;
21891 /* Algorithms using the rep prefix want at least edi and ecx;
21892 additionally, memset wants eax and memcpy wants esi. Don't
21893 consider such algorithms if the user has appropriated those
21894 registers for their own purposes. */
21895 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21896 || (memset
21897 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21898
21899 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21900 || (alg != rep_prefix_1_byte \
21901 && alg != rep_prefix_4_byte \
21902 && alg != rep_prefix_8_byte))
21903 const struct processor_costs *cost;
21904
21905 /* Even if the string operation call is cold, we still might spend a lot
21906 of time processing large blocks. */
21907 if (optimize_function_for_size_p (cfun)
21908 || (optimize_insn_for_size_p ()
21909 && expected_size != -1 && expected_size < 256))
21910 optimize_for_speed = false;
21911 else
21912 optimize_for_speed = true;
21913
21914 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21915
21916 *dynamic_check = -1;
21917 if (memset)
21918 algs = &cost->memset[TARGET_64BIT != 0];
21919 else
21920 algs = &cost->memcpy[TARGET_64BIT != 0];
21921 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21922 return ix86_stringop_alg;
21923 /* rep; movq or rep; movl is the smallest variant. */
21924 else if (!optimize_for_speed)
21925 {
21926 if (!count || (count & 3))
21927 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21928 else
21929 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21930 }
21931 /* Very tiny blocks are best handled via the loop; REP is expensive
21932 to set up. */
21933 else if (expected_size != -1 && expected_size < 4)
21934 return loop_1_byte;
21935 else if (expected_size != -1)
21936 {
21937 unsigned int i;
21938 enum stringop_alg alg = libcall;
21939 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21940 {
21941 /* We get here if the algorithms that were not libcall-based
21942 were rep-prefix based and we are unable to use rep prefixes
21943 based on global register usage. Break out of the loop and
21944 use the heuristic below. */
21945 if (algs->size[i].max == 0)
21946 break;
21947 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
21948 {
21949 enum stringop_alg candidate = algs->size[i].alg;
21950
21951 if (candidate != libcall && ALG_USABLE_P (candidate))
21952 alg = candidate;
21953 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
21954 last non-libcall inline algorithm. */
21955 if (TARGET_INLINE_ALL_STRINGOPS)
21956 {
21957 /* When the current size is best copied by a libcall,
21958 but we are still forced to inline, run the heuristic below
21959 that will pick code for medium-sized blocks. */
21960 if (alg != libcall)
21961 return alg;
21962 break;
21963 }
21964 else if (ALG_USABLE_P (candidate))
21965 return candidate;
21966 }
21967 }
21968 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
21969 }
21970 /* When asked to inline the call anyway, try to pick a meaningful choice.
21971 We look for the maximal size of block that is faster to copy by hand
21972 and take blocks of at most that size, guessing that the average size
21973 will be roughly half of the block.
21974
21975 If this turns out to be bad, we might simply specify the preferred
21976 choice in ix86_costs. */
21977 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21978 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
21979 {
21980 int max = -1;
21981 enum stringop_alg alg;
21982 int i;
21983 bool any_alg_usable_p = true;
21984
21985 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21986 {
21987 enum stringop_alg candidate = algs->size[i].alg;
21988 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
21989
21990 if (candidate != libcall && candidate
21991 && ALG_USABLE_P (candidate))
21992 max = algs->size[i].max;
21993 }
21994 /* If there aren't any usable algorithms, then recursing on
21995 smaller sizes isn't going to find anything. Just return the
21996 simple byte-at-a-time copy loop. */
21997 if (!any_alg_usable_p)
21998 {
21999 /* Pick something reasonable. */
22000 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22001 *dynamic_check = 128;
22002 return loop_1_byte;
22003 }
22004 if (max == -1)
22005 max = 4096;
22006 alg = decide_alg (count, max / 2, memset, dynamic_check);
22007 gcc_assert (*dynamic_check == -1);
22008 gcc_assert (alg != libcall);
22009 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22010 *dynamic_check = max;
22011 return alg;
22012 }
22013 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22014 #undef ALG_USABLE_P
22015 }
22016
22017 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22018 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22019 static int
22020 decide_alignment (int align,
22021 enum stringop_alg alg,
22022 int expected_size)
22023 {
22024 int desired_align = 0;
22025 switch (alg)
22026 {
22027 case no_stringop:
22028 gcc_unreachable ();
22029 case loop:
22030 case unrolled_loop:
22031 desired_align = GET_MODE_SIZE (Pmode);
22032 break;
22033 case rep_prefix_8_byte:
22034 desired_align = 8;
22035 break;
22036 case rep_prefix_4_byte:
22037 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
22038 copying a whole cache line at once. */
22039 if (TARGET_PENTIUMPRO)
22040 desired_align = 8;
22041 else
22042 desired_align = 4;
22043 break;
22044 case rep_prefix_1_byte:
22045 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
22046 copying a whole cache line at once. */
22047 if (TARGET_PENTIUMPRO)
22048 desired_align = 8;
22049 else
22050 desired_align = 1;
22051 break;
22052 case loop_1_byte:
22053 desired_align = 1;
22054 break;
22055 case libcall:
22056 return 0;
22057 }
22058
22059 if (optimize_size)
22060 desired_align = 1;
22061 if (desired_align < align)
22062 desired_align = align;
22063 if (expected_size != -1 && expected_size < 4)
22064 desired_align = align;
22065 return desired_align;
22066 }
22067
22068 /* Return the smallest power of 2 greater than VAL. */
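/* For instance, 3 -> 4, 4 -> 8 and 0 -> 1; the result is strictly greater
   than VAL, never equal to it.  */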
22069 static int
22070 smallest_pow2_greater_than (int val)
22071 {
22072 int ret = 1;
22073 while (ret <= val)
22074 ret <<= 1;
22075 return ret;
22076 }
22077
22078 /* Expand string move (memcpy) operation. Use i386 string operations
22079 when profitable. expand_setmem contains similar code. The code
22080 depends upon architecture, block size and alignment, but always has
22081 the same overall structure:
22082
22083 1) Prologue guard: Conditional that jumps up to epilogues for small
22084 blocks that can be handled by epilogue alone. This is faster
22085 but also needed for correctness, since the prologue assumes the block
22086 is larger than the desired alignment.
22087
22088 Optional dynamic check for size and libcall for large
22089 blocks is emitted here too, with -minline-stringops-dynamically.
22090
22091 2) Prologue: copy first few bytes in order to get destination
22092 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22093 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22094 copied. We emit either a jump tree on power of two sized
22095 blocks, or a byte loop.
22096
22097 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22098 with specified algorithm.
22099
22100 4) Epilogue: code copying tail of the block that is too small to be
22101 handled by main body (or up to size guarded by prologue guard). */
22102
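/* A rough C-level sketch of the structure described above (illustrative
   only; the names are hypothetical and the expander emits RTL, not C):

     void
     movmem_shape (char *dst, const char *src, unsigned long n)
     {
       unsigned long i = 0;

       if (n >= epilogue_size_needed)                 (1: prologue guard)
         {
           while (i < n
                  && ((unsigned long) (dst + i) % desired_align) != 0)
             dst[i] = src[i], i++;                    (2: alignment prologue)
           for (; n - i >= size_needed; i += size_needed)
             copy_chunk (dst + i, src + i);           (3: main body)
         }
       for (; i < n; i++)                             (4: epilogue)
         dst[i] = src[i];
     }
*/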
22103 bool
22104 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22105 rtx expected_align_exp, rtx expected_size_exp)
22106 {
22107 rtx destreg;
22108 rtx srcreg;
22109 rtx label = NULL;
22110 rtx tmp;
22111 rtx jump_around_label = NULL;
22112 HOST_WIDE_INT align = 1;
22113 unsigned HOST_WIDE_INT count = 0;
22114 HOST_WIDE_INT expected_size = -1;
22115 int size_needed = 0, epilogue_size_needed;
22116 int desired_align = 0, align_bytes = 0;
22117 enum stringop_alg alg;
22118 int dynamic_check;
22119 bool need_zero_guard = false;
22120
22121 if (CONST_INT_P (align_exp))
22122 align = INTVAL (align_exp);
22123 /* i386 can do misaligned access at a reasonably increased cost. */
22124 if (CONST_INT_P (expected_align_exp)
22125 && INTVAL (expected_align_exp) > align)
22126 align = INTVAL (expected_align_exp);
22127 /* ALIGN is the minimum of destination and source alignment, but we care here
22128 just about destination alignment. */
22129 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22130 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22131
22132 if (CONST_INT_P (count_exp))
22133 count = expected_size = INTVAL (count_exp);
22134 if (CONST_INT_P (expected_size_exp) && count == 0)
22135 expected_size = INTVAL (expected_size_exp);
22136
22137 /* Make sure we don't need to care about overflow later on. */
22138 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22139 return false;
22140
22141 /* Step 0: Decide on preferred algorithm, desired alignment and
22142 size of chunks to be copied by main loop. */
22143
22144 alg = decide_alg (count, expected_size, false, &dynamic_check);
22145 desired_align = decide_alignment (align, alg, expected_size);
22146
22147 if (!TARGET_ALIGN_STRINGOPS)
22148 align = desired_align;
22149
22150 if (alg == libcall)
22151 return false;
22152 gcc_assert (alg != no_stringop);
22153 if (!count)
22154 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22155 destreg = copy_addr_to_reg (XEXP (dst, 0));
22156 srcreg = copy_addr_to_reg (XEXP (src, 0));
22157 switch (alg)
22158 {
22159 case libcall:
22160 case no_stringop:
22161 gcc_unreachable ();
22162 case loop:
22163 need_zero_guard = true;
22164 size_needed = GET_MODE_SIZE (word_mode);
22165 break;
22166 case unrolled_loop:
22167 need_zero_guard = true;
22168 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22169 break;
22170 case rep_prefix_8_byte:
22171 size_needed = 8;
22172 break;
22173 case rep_prefix_4_byte:
22174 size_needed = 4;
22175 break;
22176 case rep_prefix_1_byte:
22177 size_needed = 1;
22178 break;
22179 case loop_1_byte:
22180 need_zero_guard = true;
22181 size_needed = 1;
22182 break;
22183 }
22184
22185 epilogue_size_needed = size_needed;
22186
22187 /* Step 1: Prologue guard. */
22188
22189 /* Alignment code needs count to be in register. */
22190 if (CONST_INT_P (count_exp) && desired_align > align)
22191 {
22192 if (INTVAL (count_exp) > desired_align
22193 && INTVAL (count_exp) > size_needed)
22194 {
22195 align_bytes
22196 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22197 if (align_bytes <= 0)
22198 align_bytes = 0;
22199 else
22200 align_bytes = desired_align - align_bytes;
22201 }
22202 if (align_bytes == 0)
22203 count_exp = force_reg (counter_mode (count_exp), count_exp);
22204 }
22205 gcc_assert (desired_align >= 1 && align >= 1);
22206
22207 /* Ensure that alignment prologue won't copy past end of block. */
22208 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22209 {
22210 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22211 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22212 Make sure it is a power of 2. */
22213 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22214
22215 if (count)
22216 {
22217 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22218 {
22219 /* If main algorithm works on QImode, no epilogue is needed.
22220 For small sizes just don't align anything. */
22221 if (size_needed == 1)
22222 desired_align = align;
22223 else
22224 goto epilogue;
22225 }
22226 }
22227 else
22228 {
22229 label = gen_label_rtx ();
22230 emit_cmp_and_jump_insns (count_exp,
22231 GEN_INT (epilogue_size_needed),
22232 LTU, 0, counter_mode (count_exp), 1, label);
22233 if (expected_size == -1 || expected_size < epilogue_size_needed)
22234 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22235 else
22236 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22237 }
22238 }
22239
22240 /* Emit code to decide at runtime whether a library call or inline code should
22241 be used. */
22242 if (dynamic_check != -1)
22243 {
22244 if (CONST_INT_P (count_exp))
22245 {
22246 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22247 {
22248 emit_block_move_via_libcall (dst, src, count_exp, false);
22249 count_exp = const0_rtx;
22250 goto epilogue;
22251 }
22252 }
22253 else
22254 {
22255 rtx hot_label = gen_label_rtx ();
22256 jump_around_label = gen_label_rtx ();
22257 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22258 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22259 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22260 emit_block_move_via_libcall (dst, src, count_exp, false);
22261 emit_jump (jump_around_label);
22262 emit_label (hot_label);
22263 }
22264 }
22265
22266 /* Step 2: Alignment prologue. */
22267
22268 if (desired_align > align)
22269 {
22270 if (align_bytes == 0)
22271 {
22272 /* Except for the first move in the epilogue, we no longer know
22273 the constant offset in the aliasing info. It does not seem worth
22274 the pain to maintain it for the first move, so throw away
22275 the info early. */
22276 src = change_address (src, BLKmode, srcreg);
22277 dst = change_address (dst, BLKmode, destreg);
22278 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22279 desired_align);
22280 }
22281 else
22282 {
22283 /* If we know how many bytes need to be stored before dst is
22284 sufficiently aligned, maintain aliasing info accurately. */
22285 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22286 desired_align, align_bytes);
22287 count_exp = plus_constant (count_exp, -align_bytes);
22288 count -= align_bytes;
22289 }
22290 if (need_zero_guard
22291 && (count < (unsigned HOST_WIDE_INT) size_needed
22292 || (align_bytes == 0
22293 && count < ((unsigned HOST_WIDE_INT) size_needed
22294 + desired_align - align))))
22295 {
22296 /* It is possible that we copied enough so the main loop will not
22297 execute. */
22298 gcc_assert (size_needed > 1);
22299 if (label == NULL_RTX)
22300 label = gen_label_rtx ();
22301 emit_cmp_and_jump_insns (count_exp,
22302 GEN_INT (size_needed),
22303 LTU, 0, counter_mode (count_exp), 1, label);
22304 if (expected_size == -1
22305 || expected_size < (desired_align - align) / 2 + size_needed)
22306 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22307 else
22308 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22309 }
22310 }
22311 if (label && size_needed == 1)
22312 {
22313 emit_label (label);
22314 LABEL_NUSES (label) = 1;
22315 label = NULL;
22316 epilogue_size_needed = 1;
22317 }
22318 else if (label == NULL_RTX)
22319 epilogue_size_needed = size_needed;
22320
22321 /* Step 3: Main loop. */
22322
22323 switch (alg)
22324 {
22325 case libcall:
22326 case no_stringop:
22327 gcc_unreachable ();
22328 case loop_1_byte:
22329 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22330 count_exp, QImode, 1, expected_size);
22331 break;
22332 case loop:
22333 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22334 count_exp, word_mode, 1, expected_size);
22335 break;
22336 case unrolled_loop:
22337 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22338 registers for 4 temporaries anyway. */
22339 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22340 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22341 expected_size);
22342 break;
22343 case rep_prefix_8_byte:
22344 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22345 DImode);
22346 break;
22347 case rep_prefix_4_byte:
22348 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22349 SImode);
22350 break;
22351 case rep_prefix_1_byte:
22352 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22353 QImode);
22354 break;
22355 }
22356 /* Properly adjust the offsets of the src and dest memory for aliasing. */
22357 if (CONST_INT_P (count_exp))
22358 {
22359 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22360 (count / size_needed) * size_needed);
22361 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22362 (count / size_needed) * size_needed);
22363 }
22364 else
22365 {
22366 src = change_address (src, BLKmode, srcreg);
22367 dst = change_address (dst, BLKmode, destreg);
22368 }
22369
22370 /* Step 4: Epilogue to copy the remaining bytes. */
22371 epilogue:
22372 if (label)
22373 {
22374 /* When the main loop is done, COUNT_EXP might hold the original count,
22375 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22376 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22377 bytes. Compensate if needed. */
22378
22379 if (size_needed < epilogue_size_needed)
22380 {
22381 tmp =
22382 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22383 GEN_INT (size_needed - 1), count_exp, 1,
22384 OPTAB_DIRECT);
22385 if (tmp != count_exp)
22386 emit_move_insn (count_exp, tmp);
22387 }
22388 emit_label (label);
22389 LABEL_NUSES (label) = 1;
22390 }
22391
22392 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22393 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22394 epilogue_size_needed);
22395 if (jump_around_label)
22396 emit_label (jump_around_label);
22397 return true;
22398 }
22399
22400 /* Helper function for memset. For the QImode value 0xXY produce
22401 0xXYXYXYXY of the width specified by MODE. This is essentially
22402 a * 0x01010101, but we can do slightly better than
22403 synth_mult by unwinding the sequence by hand on CPUs with
22404 slow multiply. */
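/* A worked example (illustrative): for MODE == SImode and VAL == 0x5a the
   shift-and-IOR fallback below computes

     v  = 0x0000005a
     v |= v << 8;    now 0x00005a5a
     v |= v << 16;   now 0x5a5a5a5a

   which equals 0x5a * 0x01010101, just without the multiply; the DImode case
   adds one more step with a shift by 32.  */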
22405 static rtx
22406 promote_duplicated_reg (enum machine_mode mode, rtx val)
22407 {
22408 enum machine_mode valmode = GET_MODE (val);
22409 rtx tmp;
22410 int nops = mode == DImode ? 3 : 2;
22411
22412 gcc_assert (mode == SImode || mode == DImode);
22413 if (val == const0_rtx)
22414 return copy_to_mode_reg (mode, const0_rtx);
22415 if (CONST_INT_P (val))
22416 {
22417 HOST_WIDE_INT v = INTVAL (val) & 255;
22418
22419 v |= v << 8;
22420 v |= v << 16;
22421 if (mode == DImode)
22422 v |= (v << 16) << 16;
22423 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22424 }
22425
22426 if (valmode == VOIDmode)
22427 valmode = QImode;
22428 if (valmode != QImode)
22429 val = gen_lowpart (QImode, val);
22430 if (mode == QImode)
22431 return val;
22432 if (!TARGET_PARTIAL_REG_STALL)
22433 nops--;
22434 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22435 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22436 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22437 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22438 {
22439 rtx reg = convert_modes (mode, QImode, val, true);
22440 tmp = promote_duplicated_reg (mode, const1_rtx);
22441 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22442 OPTAB_DIRECT);
22443 }
22444 else
22445 {
22446 rtx reg = convert_modes (mode, QImode, val, true);
22447
22448 if (!TARGET_PARTIAL_REG_STALL)
22449 if (mode == SImode)
22450 emit_insn (gen_movsi_insv_1 (reg, reg));
22451 else
22452 emit_insn (gen_movdi_insv_1 (reg, reg));
22453 else
22454 {
22455 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22456 NULL, 1, OPTAB_DIRECT);
22457 reg =
22458 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22459 }
22460 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22461 NULL, 1, OPTAB_DIRECT);
22462 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22463 if (mode == SImode)
22464 return reg;
22465 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22466 NULL, 1, OPTAB_DIRECT);
22467 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22468 return reg;
22469 }
22470 }
22471
22472 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size that
22473 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
22474 raising the alignment from ALIGN to DESIRED_ALIGN. */
22475 static rtx
22476 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22477 {
22478 rtx promoted_val;
22479
22480 if (TARGET_64BIT
22481 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22482 promoted_val = promote_duplicated_reg (DImode, val);
22483 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22484 promoted_val = promote_duplicated_reg (SImode, val);
22485 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22486 promoted_val = promote_duplicated_reg (HImode, val);
22487 else
22488 promoted_val = val;
22489
22490 return promoted_val;
22491 }
22492
22493 /* Expand string set operation (memset). Use i386 string operations when
22494 profitable. See the expand_movmem comment for an explanation of the
22495 individual steps performed. */
22496 bool
22497 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22498 rtx expected_align_exp, rtx expected_size_exp)
22499 {
22500 rtx destreg;
22501 rtx label = NULL;
22502 rtx tmp;
22503 rtx jump_around_label = NULL;
22504 HOST_WIDE_INT align = 1;
22505 unsigned HOST_WIDE_INT count = 0;
22506 HOST_WIDE_INT expected_size = -1;
22507 int size_needed = 0, epilogue_size_needed;
22508 int desired_align = 0, align_bytes = 0;
22509 enum stringop_alg alg;
22510 rtx promoted_val = NULL;
22511 bool force_loopy_epilogue = false;
22512 int dynamic_check;
22513 bool need_zero_guard = false;
22514
22515 if (CONST_INT_P (align_exp))
22516 align = INTVAL (align_exp);
22517 /* i386 can do misaligned access at a reasonably increased cost. */
22518 if (CONST_INT_P (expected_align_exp)
22519 && INTVAL (expected_align_exp) > align)
22520 align = INTVAL (expected_align_exp);
22521 if (CONST_INT_P (count_exp))
22522 count = expected_size = INTVAL (count_exp);
22523 if (CONST_INT_P (expected_size_exp) && count == 0)
22524 expected_size = INTVAL (expected_size_exp);
22525
22526 /* Make sure we don't need to care about overflow later on. */
22527 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22528 return false;
22529
22530 /* Step 0: Decide on preferred algorithm, desired alignment and
22531 size of chunks to be copied by main loop. */
22532
22533 alg = decide_alg (count, expected_size, true, &dynamic_check);
22534 desired_align = decide_alignment (align, alg, expected_size);
22535
22536 if (!TARGET_ALIGN_STRINGOPS)
22537 align = desired_align;
22538
22539 if (alg == libcall)
22540 return false;
22541 gcc_assert (alg != no_stringop);
22542 if (!count)
22543 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22544 destreg = copy_addr_to_reg (XEXP (dst, 0));
22545 switch (alg)
22546 {
22547 case libcall:
22548 case no_stringop:
22549 gcc_unreachable ();
22550 case loop:
22551 need_zero_guard = true;
22552 size_needed = GET_MODE_SIZE (word_mode);
22553 break;
22554 case unrolled_loop:
22555 need_zero_guard = true;
22556 size_needed = GET_MODE_SIZE (word_mode) * 4;
22557 break;
22558 case rep_prefix_8_byte:
22559 size_needed = 8;
22560 break;
22561 case rep_prefix_4_byte:
22562 size_needed = 4;
22563 break;
22564 case rep_prefix_1_byte:
22565 size_needed = 1;
22566 break;
22567 case loop_1_byte:
22568 need_zero_guard = true;
22569 size_needed = 1;
22570 break;
22571 }
22572 epilogue_size_needed = size_needed;
22573
22574 /* Step 1: Prologue guard. */
22575
22576 /* Alignment code needs count to be in register. */
22577 if (CONST_INT_P (count_exp) && desired_align > align)
22578 {
22579 if (INTVAL (count_exp) > desired_align
22580 && INTVAL (count_exp) > size_needed)
22581 {
22582 align_bytes
22583 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22584 if (align_bytes <= 0)
22585 align_bytes = 0;
22586 else
22587 align_bytes = desired_align - align_bytes;
22588 }
22589 if (align_bytes == 0)
22590 {
22591 enum machine_mode mode = SImode;
22592 if (TARGET_64BIT && (count & ~0xffffffff))
22593 mode = DImode;
22594 count_exp = force_reg (mode, count_exp);
22595 }
22596 }
22597 /* Do the cheap promotion to allow better CSE across the
22598 main loop and epilogue (i.e. one load of the big constant in
22599 front of all the code). */
22600 if (CONST_INT_P (val_exp))
22601 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22602 desired_align, align);
22603 /* Ensure that alignment prologue won't copy past end of block. */
22604 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22605 {
22606 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22607 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22608 Make sure it is a power of 2. */
22609 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22610
22611 /* To improve performance of small blocks, we jump around the VAL
22612 promotion. This means that if the promoted VAL is not a constant,
22613 we might not use it in the epilogue and have to fall back to the byte
22614 loop variant. */
22615 if (epilogue_size_needed > 2 && !promoted_val)
22616 force_loopy_epilogue = true;
22617 if (count)
22618 {
22619 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22620 {
22621 /* If main algorithm works on QImode, no epilogue is needed.
22622 For small sizes just don't align anything. */
22623 if (size_needed == 1)
22624 desired_align = align;
22625 else
22626 goto epilogue;
22627 }
22628 }
22629 else
22630 {
22631 label = gen_label_rtx ();
22632 emit_cmp_and_jump_insns (count_exp,
22633 GEN_INT (epilogue_size_needed),
22634 LTU, 0, counter_mode (count_exp), 1, label);
22635 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22636 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22637 else
22638 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22639 }
22640 }
22641 if (dynamic_check != -1)
22642 {
22643 rtx hot_label = gen_label_rtx ();
22644 jump_around_label = gen_label_rtx ();
22645 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22646 LEU, 0, counter_mode (count_exp), 1, hot_label);
22647 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22648 set_storage_via_libcall (dst, count_exp, val_exp, false);
22649 emit_jump (jump_around_label);
22650 emit_label (hot_label);
22651 }
22652
22653 /* Step 2: Alignment prologue. */
22654
22655 /* Do the expensive promotion once we branched off the small blocks. */
22656 if (!promoted_val)
22657 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22658 desired_align, align);
22659 gcc_assert (desired_align >= 1 && align >= 1);
22660
22661 if (desired_align > align)
22662 {
22663 if (align_bytes == 0)
22664 {
22665 /* Except for the first move in the epilogue, we no longer know
22666 the constant offset in the aliasing info. It does not seem worth
22667 the pain to maintain it for the first move, so throw away
22668 the info early. */
22669 dst = change_address (dst, BLKmode, destreg);
22670 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22671 desired_align);
22672 }
22673 else
22674 {
22675 /* If we know how many bytes need to be stored before dst is
22676 sufficiently aligned, maintain aliasing info accurately. */
22677 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22678 desired_align, align_bytes);
22679 count_exp = plus_constant (count_exp, -align_bytes);
22680 count -= align_bytes;
22681 }
22682 if (need_zero_guard
22683 && (count < (unsigned HOST_WIDE_INT) size_needed
22684 || (align_bytes == 0
22685 && count < ((unsigned HOST_WIDE_INT) size_needed
22686 + desired_align - align))))
22687 {
22688 /* It is possible that we copied enough so the main loop will not
22689 execute. */
22690 gcc_assert (size_needed > 1);
22691 if (label == NULL_RTX)
22692 label = gen_label_rtx ();
22693 emit_cmp_and_jump_insns (count_exp,
22694 GEN_INT (size_needed),
22695 LTU, 0, counter_mode (count_exp), 1, label);
22696 if (expected_size == -1
22697 || expected_size < (desired_align - align) / 2 + size_needed)
22698 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22699 else
22700 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22701 }
22702 }
22703 if (label && size_needed == 1)
22704 {
22705 emit_label (label);
22706 LABEL_NUSES (label) = 1;
22707 label = NULL;
22708 promoted_val = val_exp;
22709 epilogue_size_needed = 1;
22710 }
22711 else if (label == NULL_RTX)
22712 epilogue_size_needed = size_needed;
22713
22714 /* Step 3: Main loop. */
22715
22716 switch (alg)
22717 {
22718 case libcall:
22719 case no_stringop:
22720 gcc_unreachable ();
22721 case loop_1_byte:
22722 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22723 count_exp, QImode, 1, expected_size);
22724 break;
22725 case loop:
22726 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22727 count_exp, word_mode, 1, expected_size);
22728 break;
22729 case unrolled_loop:
22730 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22731 count_exp, word_mode, 4, expected_size);
22732 break;
22733 case rep_prefix_8_byte:
22734 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22735 DImode, val_exp);
22736 break;
22737 case rep_prefix_4_byte:
22738 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22739 SImode, val_exp);
22740 break;
22741 case rep_prefix_1_byte:
22742 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22743 QImode, val_exp);
22744 break;
22745 }
22746 /* Properly adjust the offset of the dest memory for aliasing. */
22747 if (CONST_INT_P (count_exp))
22748 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22749 (count / size_needed) * size_needed);
22750 else
22751 dst = change_address (dst, BLKmode, destreg);
22752
22753 /* Step 4: Epilogue to copy the remaining bytes. */
22754
22755 if (label)
22756 {
22757 /* When the main loop is done, COUNT_EXP might hold the original count,
22758 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22759 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22760 bytes. Compensate if needed. */
22761
22762 if (size_needed < epilogue_size_needed)
22763 {
22764 tmp =
22765 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22766 GEN_INT (size_needed - 1), count_exp, 1,
22767 OPTAB_DIRECT);
22768 if (tmp != count_exp)
22769 emit_move_insn (count_exp, tmp);
22770 }
22771 emit_label (label);
22772 LABEL_NUSES (label) = 1;
22773 }
22774 epilogue:
22775 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22776 {
22777 if (force_loopy_epilogue)
22778 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22779 epilogue_size_needed);
22780 else
22781 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22782 epilogue_size_needed);
22783 }
22784 if (jump_around_label)
22785 emit_label (jump_around_label);
22786 return true;
22787 }
22788
22789 /* Expand the appropriate insns for doing strlen if not just doing
22790 repnz; scasb
22791
22792 out = result, initialized with the start address
22793 align_rtx = alignment of the address.
22794 scratch = scratch register, initialized with the start address when
22795 not aligned, otherwise undefined
22796
22797 This is just the body. It needs the initializations mentioned above and
22798 some address computing at the end. These things are done in i386.md. */
22799
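/* Roughly, the emitted code has this shape (an illustrative C rendering, not
   literal output; OUT is the pointer being advanced):

     while (((unsigned long) out & 3) != 0)    (check up to 3 unaligned bytes)
       {
         if (*out == 0)
           goto done;
         out++;
       }
     for (;;)                                  (then 4 bytes per iteration)
       {
         unsigned int w = *(unsigned int *) out;
         out += 4;
         if (((w - 0x01010101) & ~w & 0x80808080) != 0)
           break;
       }
     ... step OUT back to point at the zero byte found in W ...
   done:
*/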
22800 static void
22801 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22802 {
22803 int align;
22804 rtx tmp;
22805 rtx align_2_label = NULL_RTX;
22806 rtx align_3_label = NULL_RTX;
22807 rtx align_4_label = gen_label_rtx ();
22808 rtx end_0_label = gen_label_rtx ();
22809 rtx mem;
22810 rtx tmpreg = gen_reg_rtx (SImode);
22811 rtx scratch = gen_reg_rtx (SImode);
22812 rtx cmp;
22813
22814 align = 0;
22815 if (CONST_INT_P (align_rtx))
22816 align = INTVAL (align_rtx);
22817
22818 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22819
22820 /* Is there a known alignment and is it less than 4? */
22821 if (align < 4)
22822 {
22823 rtx scratch1 = gen_reg_rtx (Pmode);
22824 emit_move_insn (scratch1, out);
22825 /* Is there a known alignment and is it not 2? */
22826 if (align != 2)
22827 {
22828 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22829 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22830
22831 /* Leave just the 3 lower bits. */
22832 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22833 NULL_RTX, 0, OPTAB_WIDEN);
22834
22835 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22836 Pmode, 1, align_4_label);
22837 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22838 Pmode, 1, align_2_label);
22839 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22840 Pmode, 1, align_3_label);
22841 }
22842 else
22843 {
22844 /* Since the alignment is 2, we have to check 2 or 0 bytes;
22845 check whether it is aligned to 4 bytes. */
22846
22847 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22848 NULL_RTX, 0, OPTAB_WIDEN);
22849
22850 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22851 Pmode, 1, align_4_label);
22852 }
22853
22854 mem = change_address (src, QImode, out);
22855
22856 /* Now compare the bytes. */
22857
22858 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
22859 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22860 QImode, 1, end_0_label);
22861
22862 /* Increment the address. */
22863 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22864
22865 /* Not needed with an alignment of 2 */
22866 if (align != 2)
22867 {
22868 emit_label (align_2_label);
22869
22870 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22871 end_0_label);
22872
22873 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22874
22875 emit_label (align_3_label);
22876 }
22877
22878 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22879 end_0_label);
22880
22881 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22882 }
22883
22884 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
22885 align this loop; that only makes the program bigger and does not help
22886 to speed it up. */
22887 emit_label (align_4_label);
22888
22889 mem = change_address (src, SImode, out);
22890 emit_move_insn (scratch, mem);
22891 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22892
22893 /* This formula yields a nonzero result iff one of the bytes is zero.
22894 This saves three branches inside loop and many cycles. */
22895
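/* Worked example (illustrative): for SCRATCH == 0x41004242, whose
   second-highest byte is zero,

     0x41004242 - 0x01010101 = 0x3fff4141
     ~0x41004242             = 0xbeffbdbd
     0x3fff4141 & 0xbeffbdbd = 0x3eff0101
     0x3eff0101 & 0x80808080 = 0x00800000   (nonzero, so a zero byte exists)

   whereas for 0x41424344, which has no zero byte, the final AND yields 0.  */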
22896 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22897 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22898 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22899 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22900 gen_int_mode (0x80808080, SImode)));
22901 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22902 align_4_label);
22903
22904 if (TARGET_CMOVE)
22905 {
22906 rtx reg = gen_reg_rtx (SImode);
22907 rtx reg2 = gen_reg_rtx (Pmode);
22908 emit_move_insn (reg, tmpreg);
22909 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22910
22911 /* If zero is not in the first two bytes, move two bytes forward. */
22912 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22913 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22914 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22915 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22916 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22917 reg,
22918 tmpreg)));
22919 /* Emit lea manually to avoid clobbering of flags. */
22920 emit_insn (gen_rtx_SET (SImode, reg2,
22921 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22922
22923 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22924 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22925 emit_insn (gen_rtx_SET (VOIDmode, out,
22926 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22927 reg2,
22928 out)));
22929 }
22930 else
22931 {
22932 rtx end_2_label = gen_label_rtx ();
22933 /* Is zero in the first two bytes? */
22934
22935 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22936 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22937 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
22938 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22939 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
22940 pc_rtx);
22941 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
22942 JUMP_LABEL (tmp) = end_2_label;
22943
22944 /* Not in the first two. Move two bytes forward. */
22945 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
22946 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
22947
22948 emit_label (end_2_label);
22949
22950 }
22951
22952 /* Avoid a branch when fixing up the byte. */
22953 tmpreg = gen_lowpart (QImode, tmpreg);
22954 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
22955 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
22956 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
22957 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
22958
22959 emit_label (end_0_label);
22960 }
22961
22962 /* Expand strlen. */
22963
22964 bool
22965 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
22966 {
22967 rtx addr, scratch1, scratch2, scratch3, scratch4;
22968
22969 /* The generic case of the strlen expander is long. Avoid expanding
22970 it unless TARGET_INLINE_ALL_STRINGOPS. */
22971
22972 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22973 && !TARGET_INLINE_ALL_STRINGOPS
22974 && !optimize_insn_for_size_p ()
22975 && (!CONST_INT_P (align) || INTVAL (align) < 4))
22976 return false;
22977
22978 addr = force_reg (Pmode, XEXP (src, 0));
22979 scratch1 = gen_reg_rtx (Pmode);
22980
22981 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22982 && !optimize_insn_for_size_p ())
22983 {
22984 /* Well, it seems that some optimizer does not combine a call like
22985 foo (strlen (bar), strlen (bar));
22986 when the move and the subtraction are done here. It does calculate
22987 the length just once when these instructions are done inside of
22988 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
22989 often used and I use one fewer register for the lifetime of
22990 output_strlen_unroll(), this is better. */
22991
22992 emit_move_insn (out, addr);
22993
22994 ix86_expand_strlensi_unroll_1 (out, src, align);
22995
22996 /* strlensi_unroll_1 returns the address of the zero at the end of
22997 the string, like memchr(), so compute the length by subtracting
22998 the start address. */
22999 emit_insn (ix86_gen_sub3 (out, out, addr));
23000 }
23001 else
23002 {
23003 rtx unspec;
23004
23005 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23006 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23007 return false;
23008
23009 scratch2 = gen_reg_rtx (Pmode);
23010 scratch3 = gen_reg_rtx (Pmode);
23011 scratch4 = force_reg (Pmode, constm1_rtx);
23012
23013 emit_move_insn (scratch3, addr);
23014 eoschar = force_reg (QImode, eoschar);
23015
23016 src = replace_equiv_address_nv (src, scratch3);
23017
23018 /* If .md starts supporting :P, this can be done in .md. */
23019 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23020 scratch4), UNSPEC_SCAS);
23021 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23022 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23023 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23024 }
23025 return true;
23026 }
23027
23028 /* For a given symbol (function), construct code to compute the address of its
23029 PLT entry in the large x86-64 PIC model. */
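/* Illustratively, the emitted sequence amounts to (register names are only
   examples, and the PIC base is whatever pic_offset_table_rtx designates):

     movabs $symbol@PLTOFF, %reg
     add    %pic_base, %reg

   after which an indirect call through %reg reaches SYMBOL's PLT entry.  */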
23030 rtx
23031 construct_plt_address (rtx symbol)
23032 {
23033 rtx tmp, unspec;
23034
23035 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23036 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23037 gcc_assert (Pmode == DImode);
23038
23039 tmp = gen_reg_rtx (Pmode);
23040 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23041
23042 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23043 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23044 return tmp;
23045 }
23046
23047 rtx
23048 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23049 rtx callarg2,
23050 rtx pop, bool sibcall)
23051 {
23052 /* We need to represent that the XMM6-XMM15, SI and DI registers
23053 are clobbered by SYSV calls. */
23054 static int clobbered_registers[] = {
23055 XMM6_REG, XMM7_REG, XMM8_REG,
23056 XMM9_REG, XMM10_REG, XMM11_REG,
23057 XMM12_REG, XMM13_REG, XMM14_REG,
23058 XMM15_REG, SI_REG, DI_REG
23059 };
23060 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23061 rtx use = NULL, call;
23062 unsigned int vec_len;
23063
23064 if (pop == const0_rtx)
23065 pop = NULL;
23066 gcc_assert (!TARGET_64BIT || !pop);
23067
23068 if (TARGET_MACHO && !TARGET_64BIT)
23069 {
23070 #if TARGET_MACHO
23071 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23072 fnaddr = machopic_indirect_call_target (fnaddr);
23073 #endif
23074 }
23075 else
23076 {
23077 /* Static functions and indirect calls don't need the pic register. */
23078 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23079 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23080 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23081 use_reg (&use, pic_offset_table_rtx);
23082 }
23083
23084 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23085 {
23086 rtx al = gen_rtx_REG (QImode, AX_REG);
23087 emit_move_insn (al, callarg2);
23088 use_reg (&use, al);
23089 }
23090
23091 if (ix86_cmodel == CM_LARGE_PIC
23092 && MEM_P (fnaddr)
23093 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23094 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23095 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23096 else if (sibcall
23097 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23098 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23099 {
23100 fnaddr = XEXP (fnaddr, 0);
23101 if (GET_MODE (fnaddr) != word_mode)
23102 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
23103 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23104 }
23105
23106 vec_len = 0;
23107 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23108 if (retval)
23109 call = gen_rtx_SET (VOIDmode, retval, call);
23110 vec[vec_len++] = call;
23111
23112 if (pop)
23113 {
23114 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23115 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23116 vec[vec_len++] = pop;
23117 }
23118
23119 if (TARGET_64BIT_MS_ABI
23120 && (!callarg2 || INTVAL (callarg2) != -2))
23121 {
23122 unsigned i;
23123
23124 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23125 UNSPEC_MS_TO_SYSV_CALL);
23126
23127 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23128 vec[vec_len++]
23129 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23130 ? TImode : DImode,
23131 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23132 ? TImode : DImode,
23133 clobbered_registers[i]));
23134 }
23135
23136 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
23137 if (TARGET_VZEROUPPER)
23138 {
23139 int avx256;
23140 if (cfun->machine->callee_pass_avx256_p)
23141 {
23142 if (cfun->machine->callee_return_avx256_p)
23143 avx256 = callee_return_pass_avx256;
23144 else
23145 avx256 = callee_pass_avx256;
23146 }
23147 else if (cfun->machine->callee_return_avx256_p)
23148 avx256 = callee_return_avx256;
23149 else
23150 avx256 = call_no_avx256;
23151
23152 if (reload_completed)
23153 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23154 else
23155 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23156 gen_rtvec (1, GEN_INT (avx256)),
23157 UNSPEC_CALL_NEEDS_VZEROUPPER);
23158 }
23159
23160 if (vec_len > 1)
23161 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23162 call = emit_call_insn (call);
23163 if (use)
23164 CALL_INSN_FUNCTION_USAGE (call) = use;
23165
23166 return call;
23167 }
23168
23169 void
23170 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23171 {
23172 rtx pat = PATTERN (insn);
23173 rtvec vec = XVEC (pat, 0);
23174 int len = GET_NUM_ELEM (vec) - 1;
23175
23176 /* Strip off the last entry of the parallel. */
23177 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23178 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23179 if (len == 1)
23180 pat = RTVEC_ELT (vec, 0);
23181 else
23182 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23183
23184 emit_insn (gen_avx_vzeroupper (vzeroupper));
23185 emit_call_insn (pat);
23186 }
23187
23188 /* Output the assembly for a call instruction. */
23189
23190 const char *
23191 ix86_output_call_insn (rtx insn, rtx call_op)
23192 {
23193 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23194 bool seh_nop_p = false;
23195 const char *xasm;
23196
23197 if (SIBLING_CALL_P (insn))
23198 {
23199 if (direct_p)
23200 xasm = "jmp\t%P0";
23201 /* SEH epilogue detection requires the indirect branch case
23202 to include REX.W. */
23203 else if (TARGET_SEH)
23204 xasm = "rex.W jmp %A0";
23205 else
23206 xasm = "jmp\t%A0";
23207
23208 output_asm_insn (xasm, &call_op);
23209 return "";
23210 }
23211
23212 /* SEH unwinding can require an extra nop to be emitted in several
23213 circumstances. Determine if we have one of those. */
23214 if (TARGET_SEH)
23215 {
23216 rtx i;
23217
23218 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23219 {
23220 /* If we get to another real insn, we don't need the nop. */
23221 if (INSN_P (i))
23222 break;
23223
23224 /* If we get to the epilogue note, prevent a catch region from
23225 being adjacent to the standard epilogue sequence. With non-call
23226 exceptions, we'll have done this during epilogue emission. */
23227 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23228 && !flag_non_call_exceptions
23229 && !can_throw_internal (insn))
23230 {
23231 seh_nop_p = true;
23232 break;
23233 }
23234 }
23235
23236 /* If we didn't find a real insn following the call, prevent the
23237 unwinder from looking into the next function. */
23238 if (i == NULL)
23239 seh_nop_p = true;
23240 }
23241
23242 if (direct_p)
23243 xasm = "call\t%P0";
23244 else
23245 xasm = "call\t%A0";
23246
23247 output_asm_insn (xasm, &call_op);
23248
23249 if (seh_nop_p)
23250 return "nop";
23251
23252 return "";
23253 }
23254 \f
23255 /* Clear stack slot assignments remembered from previous functions.
23256 This is called from INIT_EXPANDERS once before RTL is emitted for each
23257 function. */
23258
23259 static struct machine_function *
23260 ix86_init_machine_status (void)
23261 {
23262 struct machine_function *f;
23263
23264 f = ggc_alloc_cleared_machine_function ();
23265 f->use_fast_prologue_epilogue_nregs = -1;
23266 f->tls_descriptor_call_expanded_p = 0;
23267 f->call_abi = ix86_abi;
23268
23269 return f;
23270 }
23271
23272 /* Return a MEM corresponding to a stack slot with mode MODE.
23273 Allocate a new slot if necessary.
23274
23275 The RTL for a function can have several slots available: N is
23276 which slot to use. */
23277
23278 rtx
23279 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23280 {
23281 struct stack_local_entry *s;
23282
23283 gcc_assert (n < MAX_386_STACK_LOCALS);
23284
23285 /* Virtual slot is valid only before vregs are instantiated. */
23286 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23287
23288 for (s = ix86_stack_locals; s; s = s->next)
23289 if (s->mode == mode && s->n == n)
23290 return validize_mem (copy_rtx (s->rtl));
23291
23292 s = ggc_alloc_stack_local_entry ();
23293 s->n = n;
23294 s->mode = mode;
23295 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23296
23297 s->next = ix86_stack_locals;
23298 ix86_stack_locals = s;
23299 return validize_mem (s->rtl);
23300 }
23301 \f
23302 /* Calculate the length of the memory address in the instruction encoding.
23303 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23304 or other prefixes. */
23305
23306 int
23307 memory_address_length (rtx addr)
23308 {
23309 struct ix86_address parts;
23310 rtx base, index, disp;
23311 int len;
23312 int ok;
23313
23314 if (GET_CODE (addr) == PRE_DEC
23315 || GET_CODE (addr) == POST_INC
23316 || GET_CODE (addr) == PRE_MODIFY
23317 || GET_CODE (addr) == POST_MODIFY)
23318 return 0;
23319
23320 ok = ix86_decompose_address (addr, &parts);
23321 gcc_assert (ok);
23322
23323 if (parts.base && GET_CODE (parts.base) == SUBREG)
23324 parts.base = SUBREG_REG (parts.base);
23325 if (parts.index && GET_CODE (parts.index) == SUBREG)
23326 parts.index = SUBREG_REG (parts.index);
23327
23328 base = parts.base;
23329 index = parts.index;
23330 disp = parts.disp;
23331
23332 /* Add length of addr32 prefix. */
23333 len = (GET_CODE (addr) == ZERO_EXTEND
23334 || GET_CODE (addr) == AND);
23335
23336 /* Rule of thumb:
23337 - esp as the base always wants an index,
23338 - ebp as the base always wants a displacement,
23339 - r12 as the base always wants an index,
23340 - r13 as the base always wants a displacement. */
23341
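/* A few illustrative data points (32-bit mode, lengths as returned by this
   function, i.e. excluding the modrm byte itself):

     (%eax)         -> 0   plain register indirect
     (%esp)         -> 1   esp as base forces a SIB byte
     8(%ebp)        -> 1   ebp as base forces at least a disp8
     foo(,%eax,4)   -> 5   disp32 plus a SIB byte for the index
     0x12345678     -> 4   bare disp32
*/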
23342 /* Register Indirect. */
23343 if (base && !index && !disp)
23344 {
23345 /* esp (for its index) and ebp (for its displacement) need
23346 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23347 code. */
23348 if (REG_P (addr)
23349 && (addr == arg_pointer_rtx
23350 || addr == frame_pointer_rtx
23351 || REGNO (addr) == SP_REG
23352 || REGNO (addr) == BP_REG
23353 || REGNO (addr) == R12_REG
23354 || REGNO (addr) == R13_REG))
23355 len = 1;
23356 }
23357
23358 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23359 is not disp32, but disp32(%rip), so for disp32
23360 SIB byte is needed, unless print_operand_address
23361 optimizes it into disp32(%rip) or (%rip) is implied
23362 by UNSPEC. */
23363 else if (disp && !base && !index)
23364 {
23365 len = 4;
23366 if (TARGET_64BIT)
23367 {
23368 rtx symbol = disp;
23369
23370 if (GET_CODE (disp) == CONST)
23371 symbol = XEXP (disp, 0);
23372 if (GET_CODE (symbol) == PLUS
23373 && CONST_INT_P (XEXP (symbol, 1)))
23374 symbol = XEXP (symbol, 0);
23375
23376 if (GET_CODE (symbol) != LABEL_REF
23377 && (GET_CODE (symbol) != SYMBOL_REF
23378 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23379 && (GET_CODE (symbol) != UNSPEC
23380 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23381 && XINT (symbol, 1) != UNSPEC_PCREL
23382 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23383 len += 1;
23384 }
23385 }
23386
23387 else
23388 {
23389 /* Find the length of the displacement constant. */
23390 if (disp)
23391 {
23392 if (base && satisfies_constraint_K (disp))
23393 len = 1;
23394 else
23395 len = 4;
23396 }
23397 /* ebp always wants a displacement. Similarly r13. */
23398 else if (base && REG_P (base)
23399 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23400 len = 1;
23401
23402 /* An index requires the two-byte modrm form.... */
23403 if (index
23404 /* ...like esp (or r12), which always wants an index. */
23405 || base == arg_pointer_rtx
23406 || base == frame_pointer_rtx
23407 || (base && REG_P (base)
23408 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23409 len += 1;
23410 }
23411
23412 switch (parts.seg)
23413 {
23414 case SEG_FS:
23415 case SEG_GS:
23416 len += 1;
23417 break;
23418 default:
23419 break;
23420 }
23421
23422 return len;
23423 }
23424
23425 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23426 is set, expect that the insn has an 8-bit immediate alternative. */
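/* For example (illustrative): with SHORTFORM set, "addl $100, %eax" only
   needs a 1-byte immediate since 100 fits in a signed byte, while
   "addl $1000, %eax" needs the full 4-byte SImode immediate.  DImode
   immediates also count as 4 bytes because they are 32-bit sign-extended.  */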
23427 int
23428 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23429 {
23430 int len = 0;
23431 int i;
23432 extract_insn_cached (insn);
23433 for (i = recog_data.n_operands - 1; i >= 0; --i)
23434 if (CONSTANT_P (recog_data.operand[i]))
23435 {
23436 enum attr_mode mode = get_attr_mode (insn);
23437
23438 gcc_assert (!len);
23439 if (shortform && CONST_INT_P (recog_data.operand[i]))
23440 {
23441 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23442 switch (mode)
23443 {
23444 case MODE_QI:
23445 len = 1;
23446 continue;
23447 case MODE_HI:
23448 ival = trunc_int_for_mode (ival, HImode);
23449 break;
23450 case MODE_SI:
23451 ival = trunc_int_for_mode (ival, SImode);
23452 break;
23453 default:
23454 break;
23455 }
23456 if (IN_RANGE (ival, -128, 127))
23457 {
23458 len = 1;
23459 continue;
23460 }
23461 }
23462 switch (mode)
23463 {
23464 case MODE_QI:
23465 len = 1;
23466 break;
23467 case MODE_HI:
23468 len = 2;
23469 break;
23470 case MODE_SI:
23471 len = 4;
23472 break;
23473 /* Immediates for DImode instructions are encoded as 32-bit sign-extended values. */
23474 case MODE_DI:
23475 len = 4;
23476 break;
23477 default:
23478 fatal_insn ("unknown insn mode", insn);
23479 }
23480 }
23481 return len;
23482 }
23483 /* Compute default value for "length_address" attribute. */
23484 int
23485 ix86_attr_length_address_default (rtx insn)
23486 {
23487 int i;
23488
23489 if (get_attr_type (insn) == TYPE_LEA)
23490 {
23491 rtx set = PATTERN (insn), addr;
23492
23493 if (GET_CODE (set) == PARALLEL)
23494 set = XVECEXP (set, 0, 0);
23495
23496 gcc_assert (GET_CODE (set) == SET);
23497
23498 addr = SET_SRC (set);
23499 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23500 {
23501 if (GET_CODE (addr) == ZERO_EXTEND)
23502 addr = XEXP (addr, 0);
23503 if (GET_CODE (addr) == SUBREG)
23504 addr = SUBREG_REG (addr);
23505 }
23506
23507 return memory_address_length (addr);
23508 }
23509
23510 extract_insn_cached (insn);
23511 for (i = recog_data.n_operands - 1; i >= 0; --i)
23512 if (MEM_P (recog_data.operand[i]))
23513 {
23514 constrain_operands_cached (reload_completed);
23515 if (which_alternative != -1)
23516 {
23517 const char *constraints = recog_data.constraints[i];
23518 int alt = which_alternative;
23519
23520 while (*constraints == '=' || *constraints == '+')
23521 constraints++;
23522 while (alt-- > 0)
23523 while (*constraints++ != ',')
23524 ;
23525 /* Skip ignored operands. */
23526 if (*constraints == 'X')
23527 continue;
23528 }
23529 return memory_address_length (XEXP (recog_data.operand[i], 0));
23530 }
23531 return 0;
23532 }
23533
23534 /* Compute default value for "length_vex" attribute. It includes
23535 2 or 3 byte VEX prefix and 1 opcode byte. */
23536
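/* Illustrative cases: a plain "vaddps %xmm1, %xmm2, %xmm3" fits the 2-byte
   VEX prefix, so this returns 2 + 1; an insn that needs VEX.W (e.g. a DImode
   general register operand) or that mentions an extended register in a memory
   operand requires the 3-byte prefix and returns 3 + 1.  */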
23537 int
23538 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23539 {
23540 int i;
23541
23542 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX W bit
23543 requires the 3-byte VEX prefix. */
23544 if (!has_0f_opcode || has_vex_w)
23545 return 3 + 1;
23546
23547 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
23548 if (!TARGET_64BIT)
23549 return 2 + 1;
23550
23551 extract_insn_cached (insn);
23552
23553 for (i = recog_data.n_operands - 1; i >= 0; --i)
23554 if (REG_P (recog_data.operand[i]))
23555 {
23556 /* REX.W bit uses 3 byte VEX prefix. */
23557 if (GET_MODE (recog_data.operand[i]) == DImode
23558 && GENERAL_REG_P (recog_data.operand[i]))
23559 return 3 + 1;
23560 }
23561 else
23562 {
23563 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23564 if (MEM_P (recog_data.operand[i])
23565 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23566 return 3 + 1;
23567 }
23568
23569 return 2 + 1;
23570 }
23571 \f
23572 /* Return the maximum number of instructions a cpu can issue. */
23573
23574 static int
23575 ix86_issue_rate (void)
23576 {
23577 switch (ix86_tune)
23578 {
23579 case PROCESSOR_PENTIUM:
23580 case PROCESSOR_ATOM:
23581 case PROCESSOR_K6:
23582 return 2;
23583
23584 case PROCESSOR_PENTIUMPRO:
23585 case PROCESSOR_PENTIUM4:
23586 case PROCESSOR_CORE2_32:
23587 case PROCESSOR_CORE2_64:
23588 case PROCESSOR_COREI7_32:
23589 case PROCESSOR_COREI7_64:
23590 case PROCESSOR_ATHLON:
23591 case PROCESSOR_K8:
23592 case PROCESSOR_AMDFAM10:
23593 case PROCESSOR_NOCONA:
23594 case PROCESSOR_GENERIC32:
23595 case PROCESSOR_GENERIC64:
23596 case PROCESSOR_BDVER1:
23597 case PROCESSOR_BDVER2:
23598 case PROCESSOR_BTVER1:
23599 return 3;
23600
23601 default:
23602 return 1;
23603 }
23604 }
23605
23606 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
23607    by DEP_INSN and nothing else set by DEP_INSN.  */
23608
23609 static bool
23610 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23611 {
23612 rtx set, set2;
23613
23614 /* Simplify the test for uninteresting insns. */
23615 if (insn_type != TYPE_SETCC
23616 && insn_type != TYPE_ICMOV
23617 && insn_type != TYPE_FCMOV
23618 && insn_type != TYPE_IBR)
23619 return false;
23620
23621 if ((set = single_set (dep_insn)) != 0)
23622 {
23623 set = SET_DEST (set);
23624 set2 = NULL_RTX;
23625 }
23626 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23627 && XVECLEN (PATTERN (dep_insn), 0) == 2
23628 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23629 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23630 {
23631 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23632       set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23633 }
23634 else
23635 return false;
23636
23637 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23638 return false;
23639
23640 /* This test is true if the dependent insn reads the flags but
23641 not any other potentially set register. */
23642 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23643 return false;
23644
23645 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23646 return false;
23647
23648 return true;
23649 }
23650
23651 /* Return true iff USE_INSN has a memory address with operands set by
23652 SET_INSN. */
23653
23654 bool
23655 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23656 {
23657 int i;
23658 extract_insn_cached (use_insn);
23659 for (i = recog_data.n_operands - 1; i >= 0; --i)
23660 if (MEM_P (recog_data.operand[i]))
23661 {
23662 rtx addr = XEXP (recog_data.operand[i], 0);
23663 return modified_in_p (addr, set_insn) != 0;
23664 }
23665 return false;
23666 }
23667
23668 static int
23669 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23670 {
23671 enum attr_type insn_type, dep_insn_type;
23672 enum attr_memory memory;
23673 rtx set, set2;
23674 int dep_insn_code_number;
23675
23676 /* Anti and output dependencies have zero cost on all CPUs. */
23677 if (REG_NOTE_KIND (link) != 0)
23678 return 0;
23679
23680 dep_insn_code_number = recog_memoized (dep_insn);
23681
23682 /* If we can't recognize the insns, we can't really do anything. */
23683 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23684 return cost;
23685
23686 insn_type = get_attr_type (insn);
23687 dep_insn_type = get_attr_type (dep_insn);
23688
23689 switch (ix86_tune)
23690 {
23691 case PROCESSOR_PENTIUM:
23692 /* Address Generation Interlock adds a cycle of latency. */
23693 if (insn_type == TYPE_LEA)
23694 {
23695 rtx addr = PATTERN (insn);
23696
23697 if (GET_CODE (addr) == PARALLEL)
23698 addr = XVECEXP (addr, 0, 0);
23699
23700 gcc_assert (GET_CODE (addr) == SET);
23701
23702 addr = SET_SRC (addr);
23703 if (modified_in_p (addr, dep_insn))
23704 cost += 1;
23705 }
23706 else if (ix86_agi_dependent (dep_insn, insn))
23707 cost += 1;
23708
23709 /* ??? Compares pair with jump/setcc. */
23710 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23711 cost = 0;
23712
23713       /* Floating point stores require the value to be ready one cycle earlier.  */
23714 if (insn_type == TYPE_FMOV
23715 && get_attr_memory (insn) == MEMORY_STORE
23716 && !ix86_agi_dependent (dep_insn, insn))
23717 cost += 1;
23718 break;
23719
23720 case PROCESSOR_PENTIUMPRO:
23721 memory = get_attr_memory (insn);
23722
23723 /* INT->FP conversion is expensive. */
23724 if (get_attr_fp_int_src (dep_insn))
23725 cost += 5;
23726
23727 /* There is one cycle extra latency between an FP op and a store. */
23728 if (insn_type == TYPE_FMOV
23729 && (set = single_set (dep_insn)) != NULL_RTX
23730 && (set2 = single_set (insn)) != NULL_RTX
23731 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23732 && MEM_P (SET_DEST (set2)))
23733 cost += 1;
23734
23735       /* Show the ability of the reorder buffer to hide the latency of a load
23736 	 by executing it in parallel with the previous instruction when the
23737 	 previous instruction is not needed to compute the address.  */
23738 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23739 && !ix86_agi_dependent (dep_insn, insn))
23740 {
23741 	  /* Claim moves to take one cycle, as the core can issue one load
23742 	     at a time and the next load can start a cycle later.  */
23743 if (dep_insn_type == TYPE_IMOV
23744 || dep_insn_type == TYPE_FMOV)
23745 cost = 1;
23746 else if (cost > 1)
23747 cost--;
23748 }
23749 break;
23750
23751 case PROCESSOR_K6:
23752 memory = get_attr_memory (insn);
23753
23754 /* The esp dependency is resolved before the instruction is really
23755 finished. */
23756 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23757 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23758 return 1;
23759
23760 /* INT->FP conversion is expensive. */
23761 if (get_attr_fp_int_src (dep_insn))
23762 cost += 5;
23763
23764       /* Show the ability of the reorder buffer to hide the latency of a load
23765 	 by executing it in parallel with the previous instruction when the
23766 	 previous instruction is not needed to compute the address.  */
23767 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23768 && !ix86_agi_dependent (dep_insn, insn))
23769 {
23770 	  /* Claim moves to take one cycle, as the core can issue one load
23771 	     at a time and the next load can start a cycle later.  */
23772 if (dep_insn_type == TYPE_IMOV
23773 || dep_insn_type == TYPE_FMOV)
23774 cost = 1;
23775 else if (cost > 2)
23776 cost -= 2;
23777 else
23778 cost = 1;
23779 }
23780 break;
23781
23782 case PROCESSOR_ATHLON:
23783 case PROCESSOR_K8:
23784 case PROCESSOR_AMDFAM10:
23785 case PROCESSOR_BDVER1:
23786 case PROCESSOR_BDVER2:
23787 case PROCESSOR_BTVER1:
23788 case PROCESSOR_ATOM:
23789 case PROCESSOR_GENERIC32:
23790 case PROCESSOR_GENERIC64:
23791 memory = get_attr_memory (insn);
23792
23793       /* Show the ability of the reorder buffer to hide the latency of a load
23794 	 by executing it in parallel with the previous instruction when the
23795 	 previous instruction is not needed to compute the address.  */
23796 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23797 && !ix86_agi_dependent (dep_insn, insn))
23798 {
23799 enum attr_unit unit = get_attr_unit (insn);
23800 int loadcost = 3;
23801
23802 /* Because of the difference between the length of integer and
23803 floating unit pipeline preparation stages, the memory operands
23804 for floating point are cheaper.
23805
23806 	     ??? For Athlon the difference is most probably 2.  */
23807 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23808 loadcost = 3;
23809 else
23810 loadcost = TARGET_ATHLON ? 2 : 0;
23811
23812 if (cost >= loadcost)
23813 cost -= loadcost;
23814 else
23815 cost = 0;
23816 }
23817
23818 default:
23819 break;
23820 }
23821
23822 return cost;
23823 }
23824
23825 /* How many alternative schedules to try. This should be as wide as the
23826 scheduling freedom in the DFA, but no wider. Making this value too
23827    large results in extra work for the scheduler.  */
23828
23829 static int
23830 ia32_multipass_dfa_lookahead (void)
23831 {
23832 switch (ix86_tune)
23833 {
23834 case PROCESSOR_PENTIUM:
23835 return 2;
23836
23837 case PROCESSOR_PENTIUMPRO:
23838 case PROCESSOR_K6:
23839 return 1;
23840
23841 case PROCESSOR_CORE2_32:
23842 case PROCESSOR_CORE2_64:
23843 case PROCESSOR_COREI7_32:
23844 case PROCESSOR_COREI7_64:
23845       /* Generally, we want haifa-sched:max_issue() to look ahead as far as
23846 	 the number of instructions that can be executed in a cycle, i.e.,
23847 	 issue_rate.  I wonder why tuning for many CPUs does not do this.  */
23848 return ix86_issue_rate ();
23849
23850 default:
23851 return 0;
23852 }
23853 }
23854
23855 \f
23856
23857 /* Model decoder of Core 2/i7.
23858 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
23859 track the instruction fetch block boundaries and make sure that long
23860 (9+ bytes) instructions are assigned to D0. */
23861
23862 /* Maximum length of an insn that can be handled by
23863 a secondary decoder unit. '8' for Core 2/i7. */
23864 static int core2i7_secondary_decoder_max_insn_size;
23865
23866 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
23867 '16' for Core 2/i7. */
23868 static int core2i7_ifetch_block_size;
23869
23870 /* Maximum number of instructions decoder can handle per cycle.
23871 '6' for Core 2/i7. */
23872 static int core2i7_ifetch_block_max_insns;
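/* Illustrative consequence of the three parameters above (a sketch, not
   part of the original comments): with a 16-byte ifetch block, once 12
   bytes of the block are consumed only an insn of at most 4 bytes still
   fits this cycle; an insn longer than 8 bytes can only be taken as the
   first insn of the cycle (decoder D0); and no more than 6 insns are
   decoded per cycle regardless of their total size.  */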
23873
23874 typedef struct ix86_first_cycle_multipass_data_ *
23875 ix86_first_cycle_multipass_data_t;
23876 typedef const struct ix86_first_cycle_multipass_data_ *
23877 const_ix86_first_cycle_multipass_data_t;
23878
23879 /* A variable to store target state across calls to max_issue within
23880 one cycle. */
23881 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
23882 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
23883
23884 /* Initialize DATA. */
23885 static void
23886 core2i7_first_cycle_multipass_init (void *_data)
23887 {
23888 ix86_first_cycle_multipass_data_t data
23889 = (ix86_first_cycle_multipass_data_t) _data;
23890
23891 data->ifetch_block_len = 0;
23892 data->ifetch_block_n_insns = 0;
23893 data->ready_try_change = NULL;
23894 data->ready_try_change_size = 0;
23895 }
23896
23897 /* Advancing the cycle; reset ifetch block counts. */
23898 static void
23899 core2i7_dfa_post_advance_cycle (void)
23900 {
23901 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
23902
23903 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23904
23905 data->ifetch_block_len = 0;
23906 data->ifetch_block_n_insns = 0;
23907 }
23908
23909 static int min_insn_size (rtx);
23910
23911 /* Filter out insns from ready_try that the core will not be able to issue
23912 on current cycle due to decoder. */
23913 static void
23914 core2i7_first_cycle_multipass_filter_ready_try
23915 (const_ix86_first_cycle_multipass_data_t data,
23916 char *ready_try, int n_ready, bool first_cycle_insn_p)
23917 {
23918 while (n_ready--)
23919 {
23920 rtx insn;
23921 int insn_size;
23922
23923 if (ready_try[n_ready])
23924 continue;
23925
23926 insn = get_ready_element (n_ready);
23927 insn_size = min_insn_size (insn);
23928
23929       if (/* If this insn is too long for a secondary decoder ...  */
23930 (!first_cycle_insn_p
23931 && insn_size > core2i7_secondary_decoder_max_insn_size)
23932 /* ... or it would not fit into the ifetch block ... */
23933 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
23934 /* ... or the decoder is full already ... */
23935 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
23936 /* ... mask the insn out. */
23937 {
23938 ready_try[n_ready] = 1;
23939
23940 if (data->ready_try_change)
23941 SET_BIT (data->ready_try_change, n_ready);
23942 }
23943 }
23944 }
23945
23946 /* Prepare for a new round of multipass lookahead scheduling. */
23947 static void
23948 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
23949 bool first_cycle_insn_p)
23950 {
23951 ix86_first_cycle_multipass_data_t data
23952 = (ix86_first_cycle_multipass_data_t) _data;
23953 const_ix86_first_cycle_multipass_data_t prev_data
23954 = ix86_first_cycle_multipass_data;
23955
23956 /* Restore the state from the end of the previous round. */
23957 data->ifetch_block_len = prev_data->ifetch_block_len;
23958 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
23959
23960 /* Filter instructions that cannot be issued on current cycle due to
23961 decoder restrictions. */
23962 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23963 first_cycle_insn_p);
23964 }
23965
23966 /* INSN is being issued in current solution. Account for its impact on
23967 the decoder model. */
23968 static void
23969 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
23970 rtx insn, const void *_prev_data)
23971 {
23972 ix86_first_cycle_multipass_data_t data
23973 = (ix86_first_cycle_multipass_data_t) _data;
23974 const_ix86_first_cycle_multipass_data_t prev_data
23975 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
23976
23977 int insn_size = min_insn_size (insn);
23978
23979 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
23980 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
23981 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
23982 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23983
23984 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
23985 if (!data->ready_try_change)
23986 {
23987 data->ready_try_change = sbitmap_alloc (n_ready);
23988 data->ready_try_change_size = n_ready;
23989 }
23990 else if (data->ready_try_change_size < n_ready)
23991 {
23992 data->ready_try_change = sbitmap_resize (data->ready_try_change,
23993 n_ready, 0);
23994 data->ready_try_change_size = n_ready;
23995 }
23996 sbitmap_zero (data->ready_try_change);
23997
23998 /* Filter out insns from ready_try that the core will not be able to issue
23999 on current cycle due to decoder. */
24000 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24001 false);
24002 }
24003
24004 /* Revert the effect on ready_try. */
24005 static void
24006 core2i7_first_cycle_multipass_backtrack (const void *_data,
24007 char *ready_try,
24008 int n_ready ATTRIBUTE_UNUSED)
24009 {
24010 const_ix86_first_cycle_multipass_data_t data
24011 = (const_ix86_first_cycle_multipass_data_t) _data;
24012 unsigned int i = 0;
24013 sbitmap_iterator sbi;
24014
24015 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
24016 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
24017 {
24018 ready_try[i] = 0;
24019 }
24020 }
24021
24022 /* Save the result of multipass lookahead scheduling for the next round. */
24023 static void
24024 core2i7_first_cycle_multipass_end (const void *_data)
24025 {
24026 const_ix86_first_cycle_multipass_data_t data
24027 = (const_ix86_first_cycle_multipass_data_t) _data;
24028 ix86_first_cycle_multipass_data_t next_data
24029 = ix86_first_cycle_multipass_data;
24030
24031 if (data != NULL)
24032 {
24033 next_data->ifetch_block_len = data->ifetch_block_len;
24034 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24035 }
24036 }
24037
24038 /* Deallocate target data. */
24039 static void
24040 core2i7_first_cycle_multipass_fini (void *_data)
24041 {
24042 ix86_first_cycle_multipass_data_t data
24043 = (ix86_first_cycle_multipass_data_t) _data;
24044
24045 if (data->ready_try_change)
24046 {
24047 sbitmap_free (data->ready_try_change);
24048 data->ready_try_change = NULL;
24049 data->ready_try_change_size = 0;
24050 }
24051 }
24052
24053 /* Prepare for scheduling pass. */
24054 static void
24055 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24056 int verbose ATTRIBUTE_UNUSED,
24057 int max_uid ATTRIBUTE_UNUSED)
24058 {
24059 /* Install scheduling hooks for current CPU. Some of these hooks are used
24060 in time-critical parts of the scheduler, so we only set them up when
24061 they are actually used. */
24062 switch (ix86_tune)
24063 {
24064 case PROCESSOR_CORE2_32:
24065 case PROCESSOR_CORE2_64:
24066 case PROCESSOR_COREI7_32:
24067 case PROCESSOR_COREI7_64:
24068 targetm.sched.dfa_post_advance_cycle
24069 = core2i7_dfa_post_advance_cycle;
24070 targetm.sched.first_cycle_multipass_init
24071 = core2i7_first_cycle_multipass_init;
24072 targetm.sched.first_cycle_multipass_begin
24073 = core2i7_first_cycle_multipass_begin;
24074 targetm.sched.first_cycle_multipass_issue
24075 = core2i7_first_cycle_multipass_issue;
24076 targetm.sched.first_cycle_multipass_backtrack
24077 = core2i7_first_cycle_multipass_backtrack;
24078 targetm.sched.first_cycle_multipass_end
24079 = core2i7_first_cycle_multipass_end;
24080 targetm.sched.first_cycle_multipass_fini
24081 = core2i7_first_cycle_multipass_fini;
24082
24083 /* Set decoder parameters. */
24084 core2i7_secondary_decoder_max_insn_size = 8;
24085 core2i7_ifetch_block_size = 16;
24086 core2i7_ifetch_block_max_insns = 6;
24087 break;
24088
24089 default:
24090 targetm.sched.dfa_post_advance_cycle = NULL;
24091 targetm.sched.first_cycle_multipass_init = NULL;
24092 targetm.sched.first_cycle_multipass_begin = NULL;
24093 targetm.sched.first_cycle_multipass_issue = NULL;
24094 targetm.sched.first_cycle_multipass_backtrack = NULL;
24095 targetm.sched.first_cycle_multipass_end = NULL;
24096 targetm.sched.first_cycle_multipass_fini = NULL;
24097 break;
24098 }
24099 }
24100
24101 \f
24102 /* Compute the alignment given to a constant that is being placed in memory.
24103 EXP is the constant and ALIGN is the alignment that the object would
24104 ordinarily have.
24105 The value of this function is used instead of that alignment to align
24106 the object. */
24107
24108 int
24109 ix86_constant_alignment (tree exp, int align)
24110 {
24111 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24112 || TREE_CODE (exp) == INTEGER_CST)
24113 {
24114 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24115 return 64;
24116 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24117 return 128;
24118 }
24119 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24120 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24121 return BITS_PER_WORD;
24122
24123 return align;
24124 }
24125
24126 /* Compute the alignment for a static variable.
24127 TYPE is the data type, and ALIGN is the alignment that
24128 the object would ordinarily have. The value of this function is used
24129 instead of that alignment to align the object. */
24130
24131 int
24132 ix86_data_alignment (tree type, int align)
24133 {
24134 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24135
24136 if (AGGREGATE_TYPE_P (type)
24137 && TYPE_SIZE (type)
24138 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24139 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24140 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24141 && align < max_align)
24142 align = max_align;
24143
24144   /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24145      to a 16-byte boundary.  */
24146 if (TARGET_64BIT)
24147 {
24148 if (AGGREGATE_TYPE_P (type)
24149 && TYPE_SIZE (type)
24150 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24151 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24152 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24153 return 128;
24154 }
24155
24156 if (TREE_CODE (type) == ARRAY_TYPE)
24157 {
24158 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24159 return 64;
24160 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24161 return 128;
24162 }
24163 else if (TREE_CODE (type) == COMPLEX_TYPE)
24164 {
24165
24166 if (TYPE_MODE (type) == DCmode && align < 64)
24167 return 64;
24168 if ((TYPE_MODE (type) == XCmode
24169 || TYPE_MODE (type) == TCmode) && align < 128)
24170 return 128;
24171 }
24172 else if ((TREE_CODE (type) == RECORD_TYPE
24173 || TREE_CODE (type) == UNION_TYPE
24174 || TREE_CODE (type) == QUAL_UNION_TYPE)
24175 && TYPE_FIELDS (type))
24176 {
24177 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24178 return 64;
24179 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24180 return 128;
24181 }
24182 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24183 || TREE_CODE (type) == INTEGER_TYPE)
24184 {
24185 if (TYPE_MODE (type) == DFmode && align < 64)
24186 return 64;
24187 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24188 return 128;
24189 }
24190
24191 return align;
24192 }
24193
24194 /* Compute the alignment for a local variable or a stack slot. EXP is
24195 the data type or decl itself, MODE is the widest mode available and
24196 ALIGN is the alignment that the object would ordinarily have. The
24197 value of this macro is used instead of that alignment to align the
24198 object. */
24199
24200 unsigned int
24201 ix86_local_alignment (tree exp, enum machine_mode mode,
24202 unsigned int align)
24203 {
24204 tree type, decl;
24205
24206 if (exp && DECL_P (exp))
24207 {
24208 type = TREE_TYPE (exp);
24209 decl = exp;
24210 }
24211 else
24212 {
24213 type = exp;
24214 decl = NULL;
24215 }
24216
24217 /* Don't do dynamic stack realignment for long long objects with
24218 -mpreferred-stack-boundary=2. */
24219 if (!TARGET_64BIT
24220 && align == 64
24221 && ix86_preferred_stack_boundary < 64
24222 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24223 && (!type || !TYPE_USER_ALIGN (type))
24224 && (!decl || !DECL_USER_ALIGN (decl)))
24225 align = 32;
24226
24227 /* If TYPE is NULL, we are allocating a stack slot for caller-save
24228 register in MODE. We will return the largest alignment of XF
24229 and DF. */
24230 if (!type)
24231 {
24232 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24233 align = GET_MODE_ALIGNMENT (DFmode);
24234 return align;
24235 }
24236
24237   /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24238      to a 16-byte boundary.  Exact wording is:
24239
24240 An array uses the same alignment as its elements, except that a local or
24241 global array variable of length at least 16 bytes or
24242 a C99 variable-length array variable always has alignment of at least 16 bytes.
24243
24244      This was added to allow use of aligned SSE instructions on arrays.  The
24245      rule is meant for static storage (where the compiler cannot do the
24246      analysis by itself).  We follow it for automatic variables only when
24247      convenient.  We fully control everything in the function being compiled,
24248      and functions from other units cannot rely on the alignment.
24249
24250 Exclude va_list type. It is the common case of local array where
24251 we can not benefit from the alignment. */
24252 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24253 && TARGET_SSE)
24254 {
24255 if (AGGREGATE_TYPE_P (type)
24256 && (va_list_type_node == NULL_TREE
24257 || (TYPE_MAIN_VARIANT (type)
24258 != TYPE_MAIN_VARIANT (va_list_type_node)))
24259 && TYPE_SIZE (type)
24260 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24261 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24262 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24263 return 128;
24264 }
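  /* Illustrative example of the rule above (an assumption, not from the
     original comments): a local "double buf[4]" is a 32-byte aggregate, so
     in a 64-bit function optimized for speed with SSE enabled it is given
     128-bit alignment even though double itself only requires 64 bits,
     allowing the vectorizer to use aligned loads and stores on it.  */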
24265 if (TREE_CODE (type) == ARRAY_TYPE)
24266 {
24267 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24268 return 64;
24269 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24270 return 128;
24271 }
24272 else if (TREE_CODE (type) == COMPLEX_TYPE)
24273 {
24274 if (TYPE_MODE (type) == DCmode && align < 64)
24275 return 64;
24276 if ((TYPE_MODE (type) == XCmode
24277 || TYPE_MODE (type) == TCmode) && align < 128)
24278 return 128;
24279 }
24280 else if ((TREE_CODE (type) == RECORD_TYPE
24281 || TREE_CODE (type) == UNION_TYPE
24282 || TREE_CODE (type) == QUAL_UNION_TYPE)
24283 && TYPE_FIELDS (type))
24284 {
24285 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24286 return 64;
24287 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24288 return 128;
24289 }
24290 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24291 || TREE_CODE (type) == INTEGER_TYPE)
24292 {
24293
24294 if (TYPE_MODE (type) == DFmode && align < 64)
24295 return 64;
24296 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24297 return 128;
24298 }
24299 return align;
24300 }
24301
24302 /* Compute the minimum required alignment for dynamic stack realignment
24303 purposes for a local variable, parameter or a stack slot. EXP is
24304 the data type or decl itself, MODE is its mode and ALIGN is the
24305 alignment that the object would ordinarily have. */
24306
24307 unsigned int
24308 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24309 unsigned int align)
24310 {
24311 tree type, decl;
24312
24313 if (exp && DECL_P (exp))
24314 {
24315 type = TREE_TYPE (exp);
24316 decl = exp;
24317 }
24318 else
24319 {
24320 type = exp;
24321 decl = NULL;
24322 }
24323
24324 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24325 return align;
24326
24327 /* Don't do dynamic stack realignment for long long objects with
24328 -mpreferred-stack-boundary=2. */
24329 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24330 && (!type || !TYPE_USER_ALIGN (type))
24331 && (!decl || !DECL_USER_ALIGN (decl)))
24332 return 32;
24333
24334 return align;
24335 }
24336 \f
24337 /* Find a location for the static chain incoming to a nested function.
24338 This is a register, unless all free registers are used by arguments. */
24339
24340 static rtx
24341 ix86_static_chain (const_tree fndecl, bool incoming_p)
24342 {
24343 unsigned regno;
24344
24345 if (!DECL_STATIC_CHAIN (fndecl))
24346 return NULL;
24347
24348 if (TARGET_64BIT)
24349 {
24350 /* We always use R10 in 64-bit mode. */
24351 regno = R10_REG;
24352 }
24353 else
24354 {
24355 tree fntype;
24356 unsigned int ccvt;
24357
24358 /* By default in 32-bit mode we use ECX to pass the static chain. */
24359 regno = CX_REG;
24360
24361 fntype = TREE_TYPE (fndecl);
24362 ccvt = ix86_get_callcvt (fntype);
24363 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24364 {
24365 /* Fastcall functions use ecx/edx for arguments, which leaves
24366 us with EAX for the static chain.
24367 Thiscall functions use ecx for arguments, which also
24368 leaves us with EAX for the static chain. */
24369 regno = AX_REG;
24370 }
24371 else if (ix86_function_regparm (fntype, fndecl) == 3)
24372 {
24373 /* For regparm 3, we have no free call-clobbered registers in
24374 which to store the static chain. In order to implement this,
24375 we have the trampoline push the static chain to the stack.
24376 However, we can't push a value below the return address when
24377 we call the nested function directly, so we have to use an
24378 alternate entry point. For this we use ESI, and have the
24379 alternate entry point push ESI, so that things appear the
24380 same once we're executing the nested function. */
24381 if (incoming_p)
24382 {
24383 if (fndecl == current_function_decl)
24384 ix86_static_chain_on_stack = true;
24385 return gen_frame_mem (SImode,
24386 plus_constant (arg_pointer_rtx, -8));
24387 }
24388 regno = SI_REG;
24389 }
24390 }
24391
24392 return gen_rtx_REG (Pmode, regno);
24393 }
24394
24395 /* Emit RTL insns to initialize the variable parts of a trampoline.
24396 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24397 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24398 to be passed to the target function. */
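/* For reference, a sketch of the 64-bit trampoline emitted below when
   ptr_mode == DImode (the byte values come from the constants used in the
   code; the mnemonics are an assumption added here):

     49 bb <imm64>   movabs $fnaddr, %r11
     49 ba <imm64>   movabs $chain_value, %r10
     49 ff e3        rex.WB jmp *%r11
     90              nop, pads the final write to a full 32-bit store

   for a total of 10 + 10 + 4 = 24 bytes.  */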
24399
24400 static void
24401 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24402 {
24403 rtx mem, fnaddr;
24404 int opcode;
24405 int offset = 0;
24406
24407 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24408
24409 if (TARGET_64BIT)
24410 {
24411 int size;
24412
24413 /* Load the function address to r11. Try to load address using
24414 the shorter movl instead of movabs. We may want to support
24415 movq for kernel mode, but kernel does not use trampolines at
24416 the moment. FNADDR is a 32bit address and may not be in
24417 DImode when ptr_mode == SImode. Always use movl in this
24418 case. */
24419 if (ptr_mode == SImode
24420 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24421 {
24422 fnaddr = copy_addr_to_reg (fnaddr);
24423
24424 mem = adjust_address (m_tramp, HImode, offset);
24425 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24426
24427 mem = adjust_address (m_tramp, SImode, offset + 2);
24428 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24429 offset += 6;
24430 }
24431 else
24432 {
24433 mem = adjust_address (m_tramp, HImode, offset);
24434 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24435
24436 mem = adjust_address (m_tramp, DImode, offset + 2);
24437 emit_move_insn (mem, fnaddr);
24438 offset += 10;
24439 }
24440
24441 /* Load static chain using movabs to r10. Use the shorter movl
24442 instead of movabs when ptr_mode == SImode. */
24443 if (ptr_mode == SImode)
24444 {
24445 opcode = 0xba41;
24446 size = 6;
24447 }
24448 else
24449 {
24450 opcode = 0xba49;
24451 size = 10;
24452 }
24453
24454 mem = adjust_address (m_tramp, HImode, offset);
24455 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24456
24457 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24458 emit_move_insn (mem, chain_value);
24459 offset += size;
24460
24461 /* Jump to r11; the last (unused) byte is a nop, only there to
24462 pad the write out to a single 32-bit store. */
24463 mem = adjust_address (m_tramp, SImode, offset);
24464 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24465 offset += 4;
24466 }
24467 else
24468 {
24469 rtx disp, chain;
24470
24471 /* Depending on the static chain location, either load a register
24472 with a constant, or push the constant to the stack. All of the
24473 instructions are the same size. */
24474 chain = ix86_static_chain (fndecl, true);
24475 if (REG_P (chain))
24476 {
24477 switch (REGNO (chain))
24478 {
24479 case AX_REG:
24480 opcode = 0xb8; break;
24481 case CX_REG:
24482 opcode = 0xb9; break;
24483 default:
24484 gcc_unreachable ();
24485 }
24486 }
24487 else
24488 opcode = 0x68;
24489
24490 mem = adjust_address (m_tramp, QImode, offset);
24491 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24492
24493 mem = adjust_address (m_tramp, SImode, offset + 1);
24494 emit_move_insn (mem, chain_value);
24495 offset += 5;
24496
24497 mem = adjust_address (m_tramp, QImode, offset);
24498 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24499
24500 mem = adjust_address (m_tramp, SImode, offset + 1);
24501
24502 /* Compute offset from the end of the jmp to the target function.
24503 In the case in which the trampoline stores the static chain on
24504 the stack, we need to skip the first insn which pushes the
24505 (call-saved) register static chain; this push is 1 byte. */
24506 offset += 5;
24507 disp = expand_binop (SImode, sub_optab, fnaddr,
24508 plus_constant (XEXP (m_tramp, 0),
24509 offset - (MEM_P (chain) ? 1 : 0)),
24510 NULL_RTX, 1, OPTAB_DIRECT);
24511 emit_move_insn (mem, disp);
24512 }
24513
24514 gcc_assert (offset <= TRAMPOLINE_SIZE);
24515
24516 #ifdef HAVE_ENABLE_EXECUTE_STACK
24517 #ifdef CHECK_EXECUTE_STACK_ENABLED
24518 if (CHECK_EXECUTE_STACK_ENABLED)
24519 #endif
24520 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24521 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24522 #endif
24523 }
24524 \f
24525 /* The following file contains several enumerations and data structures
24526 built from the definitions in i386-builtin-types.def. */
24527
24528 #include "i386-builtin-types.inc"
24529
24530 /* Table for the ix86 builtin non-function types. */
24531 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24532
24533 /* Retrieve an element from the above table, building some of
24534 the types lazily. */
24535
24536 static tree
24537 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24538 {
24539 unsigned int index;
24540 tree type, itype;
24541
24542 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24543
24544 type = ix86_builtin_type_tab[(int) tcode];
24545 if (type != NULL)
24546 return type;
24547
24548 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24549 if (tcode <= IX86_BT_LAST_VECT)
24550 {
24551 enum machine_mode mode;
24552
24553 index = tcode - IX86_BT_LAST_PRIM - 1;
24554 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24555 mode = ix86_builtin_type_vect_mode[index];
24556
24557 type = build_vector_type_for_mode (itype, mode);
24558 }
24559 else
24560 {
24561 int quals;
24562
24563 index = tcode - IX86_BT_LAST_VECT - 1;
24564 if (tcode <= IX86_BT_LAST_PTR)
24565 quals = TYPE_UNQUALIFIED;
24566 else
24567 quals = TYPE_QUAL_CONST;
24568
24569 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24570 if (quals != TYPE_UNQUALIFIED)
24571 itype = build_qualified_type (itype, quals);
24572
24573 type = build_pointer_type (itype);
24574 }
24575
24576 ix86_builtin_type_tab[(int) tcode] = type;
24577 return type;
24578 }
24579
24580 /* Table for the ix86 builtin function types. */
24581 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24582
24583 /* Retrieve an element from the above table, building some of
24584 the types lazily. */
24585
24586 static tree
24587 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24588 {
24589 tree type;
24590
24591 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24592
24593 type = ix86_builtin_func_type_tab[(int) tcode];
24594 if (type != NULL)
24595 return type;
24596
24597 if (tcode <= IX86_BT_LAST_FUNC)
24598 {
24599 unsigned start = ix86_builtin_func_start[(int) tcode];
24600 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24601 tree rtype, atype, args = void_list_node;
24602 unsigned i;
24603
24604 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24605 for (i = after - 1; i > start; --i)
24606 {
24607 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24608 args = tree_cons (NULL, atype, args);
24609 }
24610
24611 type = build_function_type (rtype, args);
24612 }
24613 else
24614 {
24615 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24616 enum ix86_builtin_func_type icode;
24617
24618 icode = ix86_builtin_func_alias_base[index];
24619 type = ix86_get_builtin_func_type (icode);
24620 }
24621
24622 ix86_builtin_func_type_tab[(int) tcode] = type;
24623 return type;
24624 }
24625
24626
24627 /* Codes for all the SSE/MMX builtins. */
24628 enum ix86_builtins
24629 {
24630 IX86_BUILTIN_ADDPS,
24631 IX86_BUILTIN_ADDSS,
24632 IX86_BUILTIN_DIVPS,
24633 IX86_BUILTIN_DIVSS,
24634 IX86_BUILTIN_MULPS,
24635 IX86_BUILTIN_MULSS,
24636 IX86_BUILTIN_SUBPS,
24637 IX86_BUILTIN_SUBSS,
24638
24639 IX86_BUILTIN_CMPEQPS,
24640 IX86_BUILTIN_CMPLTPS,
24641 IX86_BUILTIN_CMPLEPS,
24642 IX86_BUILTIN_CMPGTPS,
24643 IX86_BUILTIN_CMPGEPS,
24644 IX86_BUILTIN_CMPNEQPS,
24645 IX86_BUILTIN_CMPNLTPS,
24646 IX86_BUILTIN_CMPNLEPS,
24647 IX86_BUILTIN_CMPNGTPS,
24648 IX86_BUILTIN_CMPNGEPS,
24649 IX86_BUILTIN_CMPORDPS,
24650 IX86_BUILTIN_CMPUNORDPS,
24651 IX86_BUILTIN_CMPEQSS,
24652 IX86_BUILTIN_CMPLTSS,
24653 IX86_BUILTIN_CMPLESS,
24654 IX86_BUILTIN_CMPNEQSS,
24655 IX86_BUILTIN_CMPNLTSS,
24656 IX86_BUILTIN_CMPNLESS,
24657 IX86_BUILTIN_CMPNGTSS,
24658 IX86_BUILTIN_CMPNGESS,
24659 IX86_BUILTIN_CMPORDSS,
24660 IX86_BUILTIN_CMPUNORDSS,
24661
24662 IX86_BUILTIN_COMIEQSS,
24663 IX86_BUILTIN_COMILTSS,
24664 IX86_BUILTIN_COMILESS,
24665 IX86_BUILTIN_COMIGTSS,
24666 IX86_BUILTIN_COMIGESS,
24667 IX86_BUILTIN_COMINEQSS,
24668 IX86_BUILTIN_UCOMIEQSS,
24669 IX86_BUILTIN_UCOMILTSS,
24670 IX86_BUILTIN_UCOMILESS,
24671 IX86_BUILTIN_UCOMIGTSS,
24672 IX86_BUILTIN_UCOMIGESS,
24673 IX86_BUILTIN_UCOMINEQSS,
24674
24675 IX86_BUILTIN_CVTPI2PS,
24676 IX86_BUILTIN_CVTPS2PI,
24677 IX86_BUILTIN_CVTSI2SS,
24678 IX86_BUILTIN_CVTSI642SS,
24679 IX86_BUILTIN_CVTSS2SI,
24680 IX86_BUILTIN_CVTSS2SI64,
24681 IX86_BUILTIN_CVTTPS2PI,
24682 IX86_BUILTIN_CVTTSS2SI,
24683 IX86_BUILTIN_CVTTSS2SI64,
24684
24685 IX86_BUILTIN_MAXPS,
24686 IX86_BUILTIN_MAXSS,
24687 IX86_BUILTIN_MINPS,
24688 IX86_BUILTIN_MINSS,
24689
24690 IX86_BUILTIN_LOADUPS,
24691 IX86_BUILTIN_STOREUPS,
24692 IX86_BUILTIN_MOVSS,
24693
24694 IX86_BUILTIN_MOVHLPS,
24695 IX86_BUILTIN_MOVLHPS,
24696 IX86_BUILTIN_LOADHPS,
24697 IX86_BUILTIN_LOADLPS,
24698 IX86_BUILTIN_STOREHPS,
24699 IX86_BUILTIN_STORELPS,
24700
24701 IX86_BUILTIN_MASKMOVQ,
24702 IX86_BUILTIN_MOVMSKPS,
24703 IX86_BUILTIN_PMOVMSKB,
24704
24705 IX86_BUILTIN_MOVNTPS,
24706 IX86_BUILTIN_MOVNTQ,
24707
24708 IX86_BUILTIN_LOADDQU,
24709 IX86_BUILTIN_STOREDQU,
24710
24711 IX86_BUILTIN_PACKSSWB,
24712 IX86_BUILTIN_PACKSSDW,
24713 IX86_BUILTIN_PACKUSWB,
24714
24715 IX86_BUILTIN_PADDB,
24716 IX86_BUILTIN_PADDW,
24717 IX86_BUILTIN_PADDD,
24718 IX86_BUILTIN_PADDQ,
24719 IX86_BUILTIN_PADDSB,
24720 IX86_BUILTIN_PADDSW,
24721 IX86_BUILTIN_PADDUSB,
24722 IX86_BUILTIN_PADDUSW,
24723 IX86_BUILTIN_PSUBB,
24724 IX86_BUILTIN_PSUBW,
24725 IX86_BUILTIN_PSUBD,
24726 IX86_BUILTIN_PSUBQ,
24727 IX86_BUILTIN_PSUBSB,
24728 IX86_BUILTIN_PSUBSW,
24729 IX86_BUILTIN_PSUBUSB,
24730 IX86_BUILTIN_PSUBUSW,
24731
24732 IX86_BUILTIN_PAND,
24733 IX86_BUILTIN_PANDN,
24734 IX86_BUILTIN_POR,
24735 IX86_BUILTIN_PXOR,
24736
24737 IX86_BUILTIN_PAVGB,
24738 IX86_BUILTIN_PAVGW,
24739
24740 IX86_BUILTIN_PCMPEQB,
24741 IX86_BUILTIN_PCMPEQW,
24742 IX86_BUILTIN_PCMPEQD,
24743 IX86_BUILTIN_PCMPGTB,
24744 IX86_BUILTIN_PCMPGTW,
24745 IX86_BUILTIN_PCMPGTD,
24746
24747 IX86_BUILTIN_PMADDWD,
24748
24749 IX86_BUILTIN_PMAXSW,
24750 IX86_BUILTIN_PMAXUB,
24751 IX86_BUILTIN_PMINSW,
24752 IX86_BUILTIN_PMINUB,
24753
24754 IX86_BUILTIN_PMULHUW,
24755 IX86_BUILTIN_PMULHW,
24756 IX86_BUILTIN_PMULLW,
24757
24758 IX86_BUILTIN_PSADBW,
24759 IX86_BUILTIN_PSHUFW,
24760
24761 IX86_BUILTIN_PSLLW,
24762 IX86_BUILTIN_PSLLD,
24763 IX86_BUILTIN_PSLLQ,
24764 IX86_BUILTIN_PSRAW,
24765 IX86_BUILTIN_PSRAD,
24766 IX86_BUILTIN_PSRLW,
24767 IX86_BUILTIN_PSRLD,
24768 IX86_BUILTIN_PSRLQ,
24769 IX86_BUILTIN_PSLLWI,
24770 IX86_BUILTIN_PSLLDI,
24771 IX86_BUILTIN_PSLLQI,
24772 IX86_BUILTIN_PSRAWI,
24773 IX86_BUILTIN_PSRADI,
24774 IX86_BUILTIN_PSRLWI,
24775 IX86_BUILTIN_PSRLDI,
24776 IX86_BUILTIN_PSRLQI,
24777
24778 IX86_BUILTIN_PUNPCKHBW,
24779 IX86_BUILTIN_PUNPCKHWD,
24780 IX86_BUILTIN_PUNPCKHDQ,
24781 IX86_BUILTIN_PUNPCKLBW,
24782 IX86_BUILTIN_PUNPCKLWD,
24783 IX86_BUILTIN_PUNPCKLDQ,
24784
24785 IX86_BUILTIN_SHUFPS,
24786
24787 IX86_BUILTIN_RCPPS,
24788 IX86_BUILTIN_RCPSS,
24789 IX86_BUILTIN_RSQRTPS,
24790 IX86_BUILTIN_RSQRTPS_NR,
24791 IX86_BUILTIN_RSQRTSS,
24792 IX86_BUILTIN_RSQRTF,
24793 IX86_BUILTIN_SQRTPS,
24794 IX86_BUILTIN_SQRTPS_NR,
24795 IX86_BUILTIN_SQRTSS,
24796
24797 IX86_BUILTIN_UNPCKHPS,
24798 IX86_BUILTIN_UNPCKLPS,
24799
24800 IX86_BUILTIN_ANDPS,
24801 IX86_BUILTIN_ANDNPS,
24802 IX86_BUILTIN_ORPS,
24803 IX86_BUILTIN_XORPS,
24804
24805 IX86_BUILTIN_EMMS,
24806 IX86_BUILTIN_LDMXCSR,
24807 IX86_BUILTIN_STMXCSR,
24808 IX86_BUILTIN_SFENCE,
24809
24810 /* 3DNow! Original */
24811 IX86_BUILTIN_FEMMS,
24812 IX86_BUILTIN_PAVGUSB,
24813 IX86_BUILTIN_PF2ID,
24814 IX86_BUILTIN_PFACC,
24815 IX86_BUILTIN_PFADD,
24816 IX86_BUILTIN_PFCMPEQ,
24817 IX86_BUILTIN_PFCMPGE,
24818 IX86_BUILTIN_PFCMPGT,
24819 IX86_BUILTIN_PFMAX,
24820 IX86_BUILTIN_PFMIN,
24821 IX86_BUILTIN_PFMUL,
24822 IX86_BUILTIN_PFRCP,
24823 IX86_BUILTIN_PFRCPIT1,
24824 IX86_BUILTIN_PFRCPIT2,
24825 IX86_BUILTIN_PFRSQIT1,
24826 IX86_BUILTIN_PFRSQRT,
24827 IX86_BUILTIN_PFSUB,
24828 IX86_BUILTIN_PFSUBR,
24829 IX86_BUILTIN_PI2FD,
24830 IX86_BUILTIN_PMULHRW,
24831
24832 /* 3DNow! Athlon Extensions */
24833 IX86_BUILTIN_PF2IW,
24834 IX86_BUILTIN_PFNACC,
24835 IX86_BUILTIN_PFPNACC,
24836 IX86_BUILTIN_PI2FW,
24837 IX86_BUILTIN_PSWAPDSI,
24838 IX86_BUILTIN_PSWAPDSF,
24839
24840 /* SSE2 */
24841 IX86_BUILTIN_ADDPD,
24842 IX86_BUILTIN_ADDSD,
24843 IX86_BUILTIN_DIVPD,
24844 IX86_BUILTIN_DIVSD,
24845 IX86_BUILTIN_MULPD,
24846 IX86_BUILTIN_MULSD,
24847 IX86_BUILTIN_SUBPD,
24848 IX86_BUILTIN_SUBSD,
24849
24850 IX86_BUILTIN_CMPEQPD,
24851 IX86_BUILTIN_CMPLTPD,
24852 IX86_BUILTIN_CMPLEPD,
24853 IX86_BUILTIN_CMPGTPD,
24854 IX86_BUILTIN_CMPGEPD,
24855 IX86_BUILTIN_CMPNEQPD,
24856 IX86_BUILTIN_CMPNLTPD,
24857 IX86_BUILTIN_CMPNLEPD,
24858 IX86_BUILTIN_CMPNGTPD,
24859 IX86_BUILTIN_CMPNGEPD,
24860 IX86_BUILTIN_CMPORDPD,
24861 IX86_BUILTIN_CMPUNORDPD,
24862 IX86_BUILTIN_CMPEQSD,
24863 IX86_BUILTIN_CMPLTSD,
24864 IX86_BUILTIN_CMPLESD,
24865 IX86_BUILTIN_CMPNEQSD,
24866 IX86_BUILTIN_CMPNLTSD,
24867 IX86_BUILTIN_CMPNLESD,
24868 IX86_BUILTIN_CMPORDSD,
24869 IX86_BUILTIN_CMPUNORDSD,
24870
24871 IX86_BUILTIN_COMIEQSD,
24872 IX86_BUILTIN_COMILTSD,
24873 IX86_BUILTIN_COMILESD,
24874 IX86_BUILTIN_COMIGTSD,
24875 IX86_BUILTIN_COMIGESD,
24876 IX86_BUILTIN_COMINEQSD,
24877 IX86_BUILTIN_UCOMIEQSD,
24878 IX86_BUILTIN_UCOMILTSD,
24879 IX86_BUILTIN_UCOMILESD,
24880 IX86_BUILTIN_UCOMIGTSD,
24881 IX86_BUILTIN_UCOMIGESD,
24882 IX86_BUILTIN_UCOMINEQSD,
24883
24884 IX86_BUILTIN_MAXPD,
24885 IX86_BUILTIN_MAXSD,
24886 IX86_BUILTIN_MINPD,
24887 IX86_BUILTIN_MINSD,
24888
24889 IX86_BUILTIN_ANDPD,
24890 IX86_BUILTIN_ANDNPD,
24891 IX86_BUILTIN_ORPD,
24892 IX86_BUILTIN_XORPD,
24893
24894 IX86_BUILTIN_SQRTPD,
24895 IX86_BUILTIN_SQRTSD,
24896
24897 IX86_BUILTIN_UNPCKHPD,
24898 IX86_BUILTIN_UNPCKLPD,
24899
24900 IX86_BUILTIN_SHUFPD,
24901
24902 IX86_BUILTIN_LOADUPD,
24903 IX86_BUILTIN_STOREUPD,
24904 IX86_BUILTIN_MOVSD,
24905
24906 IX86_BUILTIN_LOADHPD,
24907 IX86_BUILTIN_LOADLPD,
24908
24909 IX86_BUILTIN_CVTDQ2PD,
24910 IX86_BUILTIN_CVTDQ2PS,
24911
24912 IX86_BUILTIN_CVTPD2DQ,
24913 IX86_BUILTIN_CVTPD2PI,
24914 IX86_BUILTIN_CVTPD2PS,
24915 IX86_BUILTIN_CVTTPD2DQ,
24916 IX86_BUILTIN_CVTTPD2PI,
24917
24918 IX86_BUILTIN_CVTPI2PD,
24919 IX86_BUILTIN_CVTSI2SD,
24920 IX86_BUILTIN_CVTSI642SD,
24921
24922 IX86_BUILTIN_CVTSD2SI,
24923 IX86_BUILTIN_CVTSD2SI64,
24924 IX86_BUILTIN_CVTSD2SS,
24925 IX86_BUILTIN_CVTSS2SD,
24926 IX86_BUILTIN_CVTTSD2SI,
24927 IX86_BUILTIN_CVTTSD2SI64,
24928
24929 IX86_BUILTIN_CVTPS2DQ,
24930 IX86_BUILTIN_CVTPS2PD,
24931 IX86_BUILTIN_CVTTPS2DQ,
24932
24933 IX86_BUILTIN_MOVNTI,
24934 IX86_BUILTIN_MOVNTI64,
24935 IX86_BUILTIN_MOVNTPD,
24936 IX86_BUILTIN_MOVNTDQ,
24937
24938 IX86_BUILTIN_MOVQ128,
24939
24940 /* SSE2 MMX */
24941 IX86_BUILTIN_MASKMOVDQU,
24942 IX86_BUILTIN_MOVMSKPD,
24943 IX86_BUILTIN_PMOVMSKB128,
24944
24945 IX86_BUILTIN_PACKSSWB128,
24946 IX86_BUILTIN_PACKSSDW128,
24947 IX86_BUILTIN_PACKUSWB128,
24948
24949 IX86_BUILTIN_PADDB128,
24950 IX86_BUILTIN_PADDW128,
24951 IX86_BUILTIN_PADDD128,
24952 IX86_BUILTIN_PADDQ128,
24953 IX86_BUILTIN_PADDSB128,
24954 IX86_BUILTIN_PADDSW128,
24955 IX86_BUILTIN_PADDUSB128,
24956 IX86_BUILTIN_PADDUSW128,
24957 IX86_BUILTIN_PSUBB128,
24958 IX86_BUILTIN_PSUBW128,
24959 IX86_BUILTIN_PSUBD128,
24960 IX86_BUILTIN_PSUBQ128,
24961 IX86_BUILTIN_PSUBSB128,
24962 IX86_BUILTIN_PSUBSW128,
24963 IX86_BUILTIN_PSUBUSB128,
24964 IX86_BUILTIN_PSUBUSW128,
24965
24966 IX86_BUILTIN_PAND128,
24967 IX86_BUILTIN_PANDN128,
24968 IX86_BUILTIN_POR128,
24969 IX86_BUILTIN_PXOR128,
24970
24971 IX86_BUILTIN_PAVGB128,
24972 IX86_BUILTIN_PAVGW128,
24973
24974 IX86_BUILTIN_PCMPEQB128,
24975 IX86_BUILTIN_PCMPEQW128,
24976 IX86_BUILTIN_PCMPEQD128,
24977 IX86_BUILTIN_PCMPGTB128,
24978 IX86_BUILTIN_PCMPGTW128,
24979 IX86_BUILTIN_PCMPGTD128,
24980
24981 IX86_BUILTIN_PMADDWD128,
24982
24983 IX86_BUILTIN_PMAXSW128,
24984 IX86_BUILTIN_PMAXUB128,
24985 IX86_BUILTIN_PMINSW128,
24986 IX86_BUILTIN_PMINUB128,
24987
24988 IX86_BUILTIN_PMULUDQ,
24989 IX86_BUILTIN_PMULUDQ128,
24990 IX86_BUILTIN_PMULHUW128,
24991 IX86_BUILTIN_PMULHW128,
24992 IX86_BUILTIN_PMULLW128,
24993
24994 IX86_BUILTIN_PSADBW128,
24995 IX86_BUILTIN_PSHUFHW,
24996 IX86_BUILTIN_PSHUFLW,
24997 IX86_BUILTIN_PSHUFD,
24998
24999 IX86_BUILTIN_PSLLDQI128,
25000 IX86_BUILTIN_PSLLWI128,
25001 IX86_BUILTIN_PSLLDI128,
25002 IX86_BUILTIN_PSLLQI128,
25003 IX86_BUILTIN_PSRAWI128,
25004 IX86_BUILTIN_PSRADI128,
25005 IX86_BUILTIN_PSRLDQI128,
25006 IX86_BUILTIN_PSRLWI128,
25007 IX86_BUILTIN_PSRLDI128,
25008 IX86_BUILTIN_PSRLQI128,
25009
25010 IX86_BUILTIN_PSLLDQ128,
25011 IX86_BUILTIN_PSLLW128,
25012 IX86_BUILTIN_PSLLD128,
25013 IX86_BUILTIN_PSLLQ128,
25014 IX86_BUILTIN_PSRAW128,
25015 IX86_BUILTIN_PSRAD128,
25016 IX86_BUILTIN_PSRLW128,
25017 IX86_BUILTIN_PSRLD128,
25018 IX86_BUILTIN_PSRLQ128,
25019
25020 IX86_BUILTIN_PUNPCKHBW128,
25021 IX86_BUILTIN_PUNPCKHWD128,
25022 IX86_BUILTIN_PUNPCKHDQ128,
25023 IX86_BUILTIN_PUNPCKHQDQ128,
25024 IX86_BUILTIN_PUNPCKLBW128,
25025 IX86_BUILTIN_PUNPCKLWD128,
25026 IX86_BUILTIN_PUNPCKLDQ128,
25027 IX86_BUILTIN_PUNPCKLQDQ128,
25028
25029 IX86_BUILTIN_CLFLUSH,
25030 IX86_BUILTIN_MFENCE,
25031 IX86_BUILTIN_LFENCE,
25032 IX86_BUILTIN_PAUSE,
25033
25034 IX86_BUILTIN_BSRSI,
25035 IX86_BUILTIN_BSRDI,
25036 IX86_BUILTIN_RDPMC,
25037 IX86_BUILTIN_RDTSC,
25038 IX86_BUILTIN_RDTSCP,
25039 IX86_BUILTIN_ROLQI,
25040 IX86_BUILTIN_ROLHI,
25041 IX86_BUILTIN_RORQI,
25042 IX86_BUILTIN_RORHI,
25043
25044 /* SSE3. */
25045 IX86_BUILTIN_ADDSUBPS,
25046 IX86_BUILTIN_HADDPS,
25047 IX86_BUILTIN_HSUBPS,
25048 IX86_BUILTIN_MOVSHDUP,
25049 IX86_BUILTIN_MOVSLDUP,
25050 IX86_BUILTIN_ADDSUBPD,
25051 IX86_BUILTIN_HADDPD,
25052 IX86_BUILTIN_HSUBPD,
25053 IX86_BUILTIN_LDDQU,
25054
25055 IX86_BUILTIN_MONITOR,
25056 IX86_BUILTIN_MWAIT,
25057
25058 /* SSSE3. */
25059 IX86_BUILTIN_PHADDW,
25060 IX86_BUILTIN_PHADDD,
25061 IX86_BUILTIN_PHADDSW,
25062 IX86_BUILTIN_PHSUBW,
25063 IX86_BUILTIN_PHSUBD,
25064 IX86_BUILTIN_PHSUBSW,
25065 IX86_BUILTIN_PMADDUBSW,
25066 IX86_BUILTIN_PMULHRSW,
25067 IX86_BUILTIN_PSHUFB,
25068 IX86_BUILTIN_PSIGNB,
25069 IX86_BUILTIN_PSIGNW,
25070 IX86_BUILTIN_PSIGND,
25071 IX86_BUILTIN_PALIGNR,
25072 IX86_BUILTIN_PABSB,
25073 IX86_BUILTIN_PABSW,
25074 IX86_BUILTIN_PABSD,
25075
25076 IX86_BUILTIN_PHADDW128,
25077 IX86_BUILTIN_PHADDD128,
25078 IX86_BUILTIN_PHADDSW128,
25079 IX86_BUILTIN_PHSUBW128,
25080 IX86_BUILTIN_PHSUBD128,
25081 IX86_BUILTIN_PHSUBSW128,
25082 IX86_BUILTIN_PMADDUBSW128,
25083 IX86_BUILTIN_PMULHRSW128,
25084 IX86_BUILTIN_PSHUFB128,
25085 IX86_BUILTIN_PSIGNB128,
25086 IX86_BUILTIN_PSIGNW128,
25087 IX86_BUILTIN_PSIGND128,
25088 IX86_BUILTIN_PALIGNR128,
25089 IX86_BUILTIN_PABSB128,
25090 IX86_BUILTIN_PABSW128,
25091 IX86_BUILTIN_PABSD128,
25092
25093 /* AMDFAM10 - SSE4A New Instructions. */
25094 IX86_BUILTIN_MOVNTSD,
25095 IX86_BUILTIN_MOVNTSS,
25096 IX86_BUILTIN_EXTRQI,
25097 IX86_BUILTIN_EXTRQ,
25098 IX86_BUILTIN_INSERTQI,
25099 IX86_BUILTIN_INSERTQ,
25100
25101 /* SSE4.1. */
25102 IX86_BUILTIN_BLENDPD,
25103 IX86_BUILTIN_BLENDPS,
25104 IX86_BUILTIN_BLENDVPD,
25105 IX86_BUILTIN_BLENDVPS,
25106 IX86_BUILTIN_PBLENDVB128,
25107 IX86_BUILTIN_PBLENDW128,
25108
25109 IX86_BUILTIN_DPPD,
25110 IX86_BUILTIN_DPPS,
25111
25112 IX86_BUILTIN_INSERTPS128,
25113
25114 IX86_BUILTIN_MOVNTDQA,
25115 IX86_BUILTIN_MPSADBW128,
25116 IX86_BUILTIN_PACKUSDW128,
25117 IX86_BUILTIN_PCMPEQQ,
25118 IX86_BUILTIN_PHMINPOSUW128,
25119
25120 IX86_BUILTIN_PMAXSB128,
25121 IX86_BUILTIN_PMAXSD128,
25122 IX86_BUILTIN_PMAXUD128,
25123 IX86_BUILTIN_PMAXUW128,
25124
25125 IX86_BUILTIN_PMINSB128,
25126 IX86_BUILTIN_PMINSD128,
25127 IX86_BUILTIN_PMINUD128,
25128 IX86_BUILTIN_PMINUW128,
25129
25130 IX86_BUILTIN_PMOVSXBW128,
25131 IX86_BUILTIN_PMOVSXBD128,
25132 IX86_BUILTIN_PMOVSXBQ128,
25133 IX86_BUILTIN_PMOVSXWD128,
25134 IX86_BUILTIN_PMOVSXWQ128,
25135 IX86_BUILTIN_PMOVSXDQ128,
25136
25137 IX86_BUILTIN_PMOVZXBW128,
25138 IX86_BUILTIN_PMOVZXBD128,
25139 IX86_BUILTIN_PMOVZXBQ128,
25140 IX86_BUILTIN_PMOVZXWD128,
25141 IX86_BUILTIN_PMOVZXWQ128,
25142 IX86_BUILTIN_PMOVZXDQ128,
25143
25144 IX86_BUILTIN_PMULDQ128,
25145 IX86_BUILTIN_PMULLD128,
25146
25147 IX86_BUILTIN_ROUNDSD,
25148 IX86_BUILTIN_ROUNDSS,
25149
25150 IX86_BUILTIN_ROUNDPD,
25151 IX86_BUILTIN_ROUNDPS,
25152
25153 IX86_BUILTIN_FLOORPD,
25154 IX86_BUILTIN_CEILPD,
25155 IX86_BUILTIN_TRUNCPD,
25156 IX86_BUILTIN_RINTPD,
25157 IX86_BUILTIN_ROUNDPD_AZ,
25158
25159 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25160 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25161 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25162
25163 IX86_BUILTIN_FLOORPS,
25164 IX86_BUILTIN_CEILPS,
25165 IX86_BUILTIN_TRUNCPS,
25166 IX86_BUILTIN_RINTPS,
25167 IX86_BUILTIN_ROUNDPS_AZ,
25168
25169 IX86_BUILTIN_FLOORPS_SFIX,
25170 IX86_BUILTIN_CEILPS_SFIX,
25171 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25172
25173 IX86_BUILTIN_PTESTZ,
25174 IX86_BUILTIN_PTESTC,
25175 IX86_BUILTIN_PTESTNZC,
25176
25177 IX86_BUILTIN_VEC_INIT_V2SI,
25178 IX86_BUILTIN_VEC_INIT_V4HI,
25179 IX86_BUILTIN_VEC_INIT_V8QI,
25180 IX86_BUILTIN_VEC_EXT_V2DF,
25181 IX86_BUILTIN_VEC_EXT_V2DI,
25182 IX86_BUILTIN_VEC_EXT_V4SF,
25183 IX86_BUILTIN_VEC_EXT_V4SI,
25184 IX86_BUILTIN_VEC_EXT_V8HI,
25185 IX86_BUILTIN_VEC_EXT_V2SI,
25186 IX86_BUILTIN_VEC_EXT_V4HI,
25187 IX86_BUILTIN_VEC_EXT_V16QI,
25188 IX86_BUILTIN_VEC_SET_V2DI,
25189 IX86_BUILTIN_VEC_SET_V4SF,
25190 IX86_BUILTIN_VEC_SET_V4SI,
25191 IX86_BUILTIN_VEC_SET_V8HI,
25192 IX86_BUILTIN_VEC_SET_V4HI,
25193 IX86_BUILTIN_VEC_SET_V16QI,
25194
25195 IX86_BUILTIN_VEC_PACK_SFIX,
25196 IX86_BUILTIN_VEC_PACK_SFIX256,
25197
25198 /* SSE4.2. */
25199 IX86_BUILTIN_CRC32QI,
25200 IX86_BUILTIN_CRC32HI,
25201 IX86_BUILTIN_CRC32SI,
25202 IX86_BUILTIN_CRC32DI,
25203
25204 IX86_BUILTIN_PCMPESTRI128,
25205 IX86_BUILTIN_PCMPESTRM128,
25206 IX86_BUILTIN_PCMPESTRA128,
25207 IX86_BUILTIN_PCMPESTRC128,
25208 IX86_BUILTIN_PCMPESTRO128,
25209 IX86_BUILTIN_PCMPESTRS128,
25210 IX86_BUILTIN_PCMPESTRZ128,
25211 IX86_BUILTIN_PCMPISTRI128,
25212 IX86_BUILTIN_PCMPISTRM128,
25213 IX86_BUILTIN_PCMPISTRA128,
25214 IX86_BUILTIN_PCMPISTRC128,
25215 IX86_BUILTIN_PCMPISTRO128,
25216 IX86_BUILTIN_PCMPISTRS128,
25217 IX86_BUILTIN_PCMPISTRZ128,
25218
25219 IX86_BUILTIN_PCMPGTQ,
25220
25221 /* AES instructions */
25222 IX86_BUILTIN_AESENC128,
25223 IX86_BUILTIN_AESENCLAST128,
25224 IX86_BUILTIN_AESDEC128,
25225 IX86_BUILTIN_AESDECLAST128,
25226 IX86_BUILTIN_AESIMC128,
25227 IX86_BUILTIN_AESKEYGENASSIST128,
25228
25229 /* PCLMUL instruction */
25230 IX86_BUILTIN_PCLMULQDQ128,
25231
25232 /* AVX */
25233 IX86_BUILTIN_ADDPD256,
25234 IX86_BUILTIN_ADDPS256,
25235 IX86_BUILTIN_ADDSUBPD256,
25236 IX86_BUILTIN_ADDSUBPS256,
25237 IX86_BUILTIN_ANDPD256,
25238 IX86_BUILTIN_ANDPS256,
25239 IX86_BUILTIN_ANDNPD256,
25240 IX86_BUILTIN_ANDNPS256,
25241 IX86_BUILTIN_BLENDPD256,
25242 IX86_BUILTIN_BLENDPS256,
25243 IX86_BUILTIN_BLENDVPD256,
25244 IX86_BUILTIN_BLENDVPS256,
25245 IX86_BUILTIN_DIVPD256,
25246 IX86_BUILTIN_DIVPS256,
25247 IX86_BUILTIN_DPPS256,
25248 IX86_BUILTIN_HADDPD256,
25249 IX86_BUILTIN_HADDPS256,
25250 IX86_BUILTIN_HSUBPD256,
25251 IX86_BUILTIN_HSUBPS256,
25252 IX86_BUILTIN_MAXPD256,
25253 IX86_BUILTIN_MAXPS256,
25254 IX86_BUILTIN_MINPD256,
25255 IX86_BUILTIN_MINPS256,
25256 IX86_BUILTIN_MULPD256,
25257 IX86_BUILTIN_MULPS256,
25258 IX86_BUILTIN_ORPD256,
25259 IX86_BUILTIN_ORPS256,
25260 IX86_BUILTIN_SHUFPD256,
25261 IX86_BUILTIN_SHUFPS256,
25262 IX86_BUILTIN_SUBPD256,
25263 IX86_BUILTIN_SUBPS256,
25264 IX86_BUILTIN_XORPD256,
25265 IX86_BUILTIN_XORPS256,
25266 IX86_BUILTIN_CMPSD,
25267 IX86_BUILTIN_CMPSS,
25268 IX86_BUILTIN_CMPPD,
25269 IX86_BUILTIN_CMPPS,
25270 IX86_BUILTIN_CMPPD256,
25271 IX86_BUILTIN_CMPPS256,
25272 IX86_BUILTIN_CVTDQ2PD256,
25273 IX86_BUILTIN_CVTDQ2PS256,
25274 IX86_BUILTIN_CVTPD2PS256,
25275 IX86_BUILTIN_CVTPS2DQ256,
25276 IX86_BUILTIN_CVTPS2PD256,
25277 IX86_BUILTIN_CVTTPD2DQ256,
25278 IX86_BUILTIN_CVTPD2DQ256,
25279 IX86_BUILTIN_CVTTPS2DQ256,
25280 IX86_BUILTIN_EXTRACTF128PD256,
25281 IX86_BUILTIN_EXTRACTF128PS256,
25282 IX86_BUILTIN_EXTRACTF128SI256,
25283 IX86_BUILTIN_VZEROALL,
25284 IX86_BUILTIN_VZEROUPPER,
25285 IX86_BUILTIN_VPERMILVARPD,
25286 IX86_BUILTIN_VPERMILVARPS,
25287 IX86_BUILTIN_VPERMILVARPD256,
25288 IX86_BUILTIN_VPERMILVARPS256,
25289 IX86_BUILTIN_VPERMILPD,
25290 IX86_BUILTIN_VPERMILPS,
25291 IX86_BUILTIN_VPERMILPD256,
25292 IX86_BUILTIN_VPERMILPS256,
25293 IX86_BUILTIN_VPERMIL2PD,
25294 IX86_BUILTIN_VPERMIL2PS,
25295 IX86_BUILTIN_VPERMIL2PD256,
25296 IX86_BUILTIN_VPERMIL2PS256,
25297 IX86_BUILTIN_VPERM2F128PD256,
25298 IX86_BUILTIN_VPERM2F128PS256,
25299 IX86_BUILTIN_VPERM2F128SI256,
25300 IX86_BUILTIN_VBROADCASTSS,
25301 IX86_BUILTIN_VBROADCASTSD256,
25302 IX86_BUILTIN_VBROADCASTSS256,
25303 IX86_BUILTIN_VBROADCASTPD256,
25304 IX86_BUILTIN_VBROADCASTPS256,
25305 IX86_BUILTIN_VINSERTF128PD256,
25306 IX86_BUILTIN_VINSERTF128PS256,
25307 IX86_BUILTIN_VINSERTF128SI256,
25308 IX86_BUILTIN_LOADUPD256,
25309 IX86_BUILTIN_LOADUPS256,
25310 IX86_BUILTIN_STOREUPD256,
25311 IX86_BUILTIN_STOREUPS256,
25312 IX86_BUILTIN_LDDQU256,
25313 IX86_BUILTIN_MOVNTDQ256,
25314 IX86_BUILTIN_MOVNTPD256,
25315 IX86_BUILTIN_MOVNTPS256,
25316 IX86_BUILTIN_LOADDQU256,
25317 IX86_BUILTIN_STOREDQU256,
25318 IX86_BUILTIN_MASKLOADPD,
25319 IX86_BUILTIN_MASKLOADPS,
25320 IX86_BUILTIN_MASKSTOREPD,
25321 IX86_BUILTIN_MASKSTOREPS,
25322 IX86_BUILTIN_MASKLOADPD256,
25323 IX86_BUILTIN_MASKLOADPS256,
25324 IX86_BUILTIN_MASKSTOREPD256,
25325 IX86_BUILTIN_MASKSTOREPS256,
25326 IX86_BUILTIN_MOVSHDUP256,
25327 IX86_BUILTIN_MOVSLDUP256,
25328 IX86_BUILTIN_MOVDDUP256,
25329
25330 IX86_BUILTIN_SQRTPD256,
25331 IX86_BUILTIN_SQRTPS256,
25332 IX86_BUILTIN_SQRTPS_NR256,
25333 IX86_BUILTIN_RSQRTPS256,
25334 IX86_BUILTIN_RSQRTPS_NR256,
25335
25336 IX86_BUILTIN_RCPPS256,
25337
25338 IX86_BUILTIN_ROUNDPD256,
25339 IX86_BUILTIN_ROUNDPS256,
25340
25341 IX86_BUILTIN_FLOORPD256,
25342 IX86_BUILTIN_CEILPD256,
25343 IX86_BUILTIN_TRUNCPD256,
25344 IX86_BUILTIN_RINTPD256,
25345 IX86_BUILTIN_ROUNDPD_AZ256,
25346
25347 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25348 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25349 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25350
25351 IX86_BUILTIN_FLOORPS256,
25352 IX86_BUILTIN_CEILPS256,
25353 IX86_BUILTIN_TRUNCPS256,
25354 IX86_BUILTIN_RINTPS256,
25355 IX86_BUILTIN_ROUNDPS_AZ256,
25356
25357 IX86_BUILTIN_FLOORPS_SFIX256,
25358 IX86_BUILTIN_CEILPS_SFIX256,
25359 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25360
25361 IX86_BUILTIN_UNPCKHPD256,
25362 IX86_BUILTIN_UNPCKLPD256,
25363 IX86_BUILTIN_UNPCKHPS256,
25364 IX86_BUILTIN_UNPCKLPS256,
25365
25366 IX86_BUILTIN_SI256_SI,
25367 IX86_BUILTIN_PS256_PS,
25368 IX86_BUILTIN_PD256_PD,
25369 IX86_BUILTIN_SI_SI256,
25370 IX86_BUILTIN_PS_PS256,
25371 IX86_BUILTIN_PD_PD256,
25372
25373 IX86_BUILTIN_VTESTZPD,
25374 IX86_BUILTIN_VTESTCPD,
25375 IX86_BUILTIN_VTESTNZCPD,
25376 IX86_BUILTIN_VTESTZPS,
25377 IX86_BUILTIN_VTESTCPS,
25378 IX86_BUILTIN_VTESTNZCPS,
25379 IX86_BUILTIN_VTESTZPD256,
25380 IX86_BUILTIN_VTESTCPD256,
25381 IX86_BUILTIN_VTESTNZCPD256,
25382 IX86_BUILTIN_VTESTZPS256,
25383 IX86_BUILTIN_VTESTCPS256,
25384 IX86_BUILTIN_VTESTNZCPS256,
25385 IX86_BUILTIN_PTESTZ256,
25386 IX86_BUILTIN_PTESTC256,
25387 IX86_BUILTIN_PTESTNZC256,
25388
25389 IX86_BUILTIN_MOVMSKPD256,
25390 IX86_BUILTIN_MOVMSKPS256,
25391
25392 /* AVX2 */
25393 IX86_BUILTIN_MPSADBW256,
25394 IX86_BUILTIN_PABSB256,
25395 IX86_BUILTIN_PABSW256,
25396 IX86_BUILTIN_PABSD256,
25397 IX86_BUILTIN_PACKSSDW256,
25398 IX86_BUILTIN_PACKSSWB256,
25399 IX86_BUILTIN_PACKUSDW256,
25400 IX86_BUILTIN_PACKUSWB256,
25401 IX86_BUILTIN_PADDB256,
25402 IX86_BUILTIN_PADDW256,
25403 IX86_BUILTIN_PADDD256,
25404 IX86_BUILTIN_PADDQ256,
25405 IX86_BUILTIN_PADDSB256,
25406 IX86_BUILTIN_PADDSW256,
25407 IX86_BUILTIN_PADDUSB256,
25408 IX86_BUILTIN_PADDUSW256,
25409 IX86_BUILTIN_PALIGNR256,
25410 IX86_BUILTIN_AND256I,
25411 IX86_BUILTIN_ANDNOT256I,
25412 IX86_BUILTIN_PAVGB256,
25413 IX86_BUILTIN_PAVGW256,
25414 IX86_BUILTIN_PBLENDVB256,
25415 IX86_BUILTIN_PBLENDVW256,
25416 IX86_BUILTIN_PCMPEQB256,
25417 IX86_BUILTIN_PCMPEQW256,
25418 IX86_BUILTIN_PCMPEQD256,
25419 IX86_BUILTIN_PCMPEQQ256,
25420 IX86_BUILTIN_PCMPGTB256,
25421 IX86_BUILTIN_PCMPGTW256,
25422 IX86_BUILTIN_PCMPGTD256,
25423 IX86_BUILTIN_PCMPGTQ256,
25424 IX86_BUILTIN_PHADDW256,
25425 IX86_BUILTIN_PHADDD256,
25426 IX86_BUILTIN_PHADDSW256,
25427 IX86_BUILTIN_PHSUBW256,
25428 IX86_BUILTIN_PHSUBD256,
25429 IX86_BUILTIN_PHSUBSW256,
25430 IX86_BUILTIN_PMADDUBSW256,
25431 IX86_BUILTIN_PMADDWD256,
25432 IX86_BUILTIN_PMAXSB256,
25433 IX86_BUILTIN_PMAXSW256,
25434 IX86_BUILTIN_PMAXSD256,
25435 IX86_BUILTIN_PMAXUB256,
25436 IX86_BUILTIN_PMAXUW256,
25437 IX86_BUILTIN_PMAXUD256,
25438 IX86_BUILTIN_PMINSB256,
25439 IX86_BUILTIN_PMINSW256,
25440 IX86_BUILTIN_PMINSD256,
25441 IX86_BUILTIN_PMINUB256,
25442 IX86_BUILTIN_PMINUW256,
25443 IX86_BUILTIN_PMINUD256,
25444 IX86_BUILTIN_PMOVMSKB256,
25445 IX86_BUILTIN_PMOVSXBW256,
25446 IX86_BUILTIN_PMOVSXBD256,
25447 IX86_BUILTIN_PMOVSXBQ256,
25448 IX86_BUILTIN_PMOVSXWD256,
25449 IX86_BUILTIN_PMOVSXWQ256,
25450 IX86_BUILTIN_PMOVSXDQ256,
25451 IX86_BUILTIN_PMOVZXBW256,
25452 IX86_BUILTIN_PMOVZXBD256,
25453 IX86_BUILTIN_PMOVZXBQ256,
25454 IX86_BUILTIN_PMOVZXWD256,
25455 IX86_BUILTIN_PMOVZXWQ256,
25456 IX86_BUILTIN_PMOVZXDQ256,
25457 IX86_BUILTIN_PMULDQ256,
25458 IX86_BUILTIN_PMULHRSW256,
25459 IX86_BUILTIN_PMULHUW256,
25460 IX86_BUILTIN_PMULHW256,
25461 IX86_BUILTIN_PMULLW256,
25462 IX86_BUILTIN_PMULLD256,
25463 IX86_BUILTIN_PMULUDQ256,
25464 IX86_BUILTIN_POR256,
25465 IX86_BUILTIN_PSADBW256,
25466 IX86_BUILTIN_PSHUFB256,
25467 IX86_BUILTIN_PSHUFD256,
25468 IX86_BUILTIN_PSHUFHW256,
25469 IX86_BUILTIN_PSHUFLW256,
25470 IX86_BUILTIN_PSIGNB256,
25471 IX86_BUILTIN_PSIGNW256,
25472 IX86_BUILTIN_PSIGND256,
25473 IX86_BUILTIN_PSLLDQI256,
25474 IX86_BUILTIN_PSLLWI256,
25475 IX86_BUILTIN_PSLLW256,
25476 IX86_BUILTIN_PSLLDI256,
25477 IX86_BUILTIN_PSLLD256,
25478 IX86_BUILTIN_PSLLQI256,
25479 IX86_BUILTIN_PSLLQ256,
25480 IX86_BUILTIN_PSRAWI256,
25481 IX86_BUILTIN_PSRAW256,
25482 IX86_BUILTIN_PSRADI256,
25483 IX86_BUILTIN_PSRAD256,
25484 IX86_BUILTIN_PSRLDQI256,
25485 IX86_BUILTIN_PSRLWI256,
25486 IX86_BUILTIN_PSRLW256,
25487 IX86_BUILTIN_PSRLDI256,
25488 IX86_BUILTIN_PSRLD256,
25489 IX86_BUILTIN_PSRLQI256,
25490 IX86_BUILTIN_PSRLQ256,
25491 IX86_BUILTIN_PSUBB256,
25492 IX86_BUILTIN_PSUBW256,
25493 IX86_BUILTIN_PSUBD256,
25494 IX86_BUILTIN_PSUBQ256,
25495 IX86_BUILTIN_PSUBSB256,
25496 IX86_BUILTIN_PSUBSW256,
25497 IX86_BUILTIN_PSUBUSB256,
25498 IX86_BUILTIN_PSUBUSW256,
25499 IX86_BUILTIN_PUNPCKHBW256,
25500 IX86_BUILTIN_PUNPCKHWD256,
25501 IX86_BUILTIN_PUNPCKHDQ256,
25502 IX86_BUILTIN_PUNPCKHQDQ256,
25503 IX86_BUILTIN_PUNPCKLBW256,
25504 IX86_BUILTIN_PUNPCKLWD256,
25505 IX86_BUILTIN_PUNPCKLDQ256,
25506 IX86_BUILTIN_PUNPCKLQDQ256,
25507 IX86_BUILTIN_PXOR256,
25508 IX86_BUILTIN_MOVNTDQA256,
25509 IX86_BUILTIN_VBROADCASTSS_PS,
25510 IX86_BUILTIN_VBROADCASTSS_PS256,
25511 IX86_BUILTIN_VBROADCASTSD_PD256,
25512 IX86_BUILTIN_VBROADCASTSI256,
25513 IX86_BUILTIN_PBLENDD256,
25514 IX86_BUILTIN_PBLENDD128,
25515 IX86_BUILTIN_PBROADCASTB256,
25516 IX86_BUILTIN_PBROADCASTW256,
25517 IX86_BUILTIN_PBROADCASTD256,
25518 IX86_BUILTIN_PBROADCASTQ256,
25519 IX86_BUILTIN_PBROADCASTB128,
25520 IX86_BUILTIN_PBROADCASTW128,
25521 IX86_BUILTIN_PBROADCASTD128,
25522 IX86_BUILTIN_PBROADCASTQ128,
25523 IX86_BUILTIN_VPERMVARSI256,
25524 IX86_BUILTIN_VPERMDF256,
25525 IX86_BUILTIN_VPERMVARSF256,
25526 IX86_BUILTIN_VPERMDI256,
25527 IX86_BUILTIN_VPERMTI256,
25528 IX86_BUILTIN_VEXTRACT128I256,
25529 IX86_BUILTIN_VINSERT128I256,
25530 IX86_BUILTIN_MASKLOADD,
25531 IX86_BUILTIN_MASKLOADQ,
25532 IX86_BUILTIN_MASKLOADD256,
25533 IX86_BUILTIN_MASKLOADQ256,
25534 IX86_BUILTIN_MASKSTORED,
25535 IX86_BUILTIN_MASKSTOREQ,
25536 IX86_BUILTIN_MASKSTORED256,
25537 IX86_BUILTIN_MASKSTOREQ256,
25538 IX86_BUILTIN_PSLLVV4DI,
25539 IX86_BUILTIN_PSLLVV2DI,
25540 IX86_BUILTIN_PSLLVV8SI,
25541 IX86_BUILTIN_PSLLVV4SI,
25542 IX86_BUILTIN_PSRAVV8SI,
25543 IX86_BUILTIN_PSRAVV4SI,
25544 IX86_BUILTIN_PSRLVV4DI,
25545 IX86_BUILTIN_PSRLVV2DI,
25546 IX86_BUILTIN_PSRLVV8SI,
25547 IX86_BUILTIN_PSRLVV4SI,
25548
25549 IX86_BUILTIN_GATHERSIV2DF,
25550 IX86_BUILTIN_GATHERSIV4DF,
25551 IX86_BUILTIN_GATHERDIV2DF,
25552 IX86_BUILTIN_GATHERDIV4DF,
25553 IX86_BUILTIN_GATHERSIV4SF,
25554 IX86_BUILTIN_GATHERSIV8SF,
25555 IX86_BUILTIN_GATHERDIV4SF,
25556 IX86_BUILTIN_GATHERDIV8SF,
25557 IX86_BUILTIN_GATHERSIV2DI,
25558 IX86_BUILTIN_GATHERSIV4DI,
25559 IX86_BUILTIN_GATHERDIV2DI,
25560 IX86_BUILTIN_GATHERDIV4DI,
25561 IX86_BUILTIN_GATHERSIV4SI,
25562 IX86_BUILTIN_GATHERSIV8SI,
25563 IX86_BUILTIN_GATHERDIV4SI,
25564 IX86_BUILTIN_GATHERDIV8SI,
25565
25566 /* Alternate 4 element gather for the vectorizer where
25567 all operands are 32-byte wide. */
25568 IX86_BUILTIN_GATHERALTSIV4DF,
25569 IX86_BUILTIN_GATHERALTDIV8SF,
25570 IX86_BUILTIN_GATHERALTSIV4DI,
25571 IX86_BUILTIN_GATHERALTDIV8SI,
25572
25573 /* TFmode support builtins. */
25574 IX86_BUILTIN_INFQ,
25575 IX86_BUILTIN_HUGE_VALQ,
25576 IX86_BUILTIN_FABSQ,
25577 IX86_BUILTIN_COPYSIGNQ,
25578
25579 /* Vectorizer support builtins. */
25580 IX86_BUILTIN_CPYSGNPS,
25581 IX86_BUILTIN_CPYSGNPD,
25582 IX86_BUILTIN_CPYSGNPS256,
25583 IX86_BUILTIN_CPYSGNPD256,
25584
25585 /* FMA4 instructions. */
25586 IX86_BUILTIN_VFMADDSS,
25587 IX86_BUILTIN_VFMADDSD,
25588 IX86_BUILTIN_VFMADDPS,
25589 IX86_BUILTIN_VFMADDPD,
25590 IX86_BUILTIN_VFMADDPS256,
25591 IX86_BUILTIN_VFMADDPD256,
25592 IX86_BUILTIN_VFMADDSUBPS,
25593 IX86_BUILTIN_VFMADDSUBPD,
25594 IX86_BUILTIN_VFMADDSUBPS256,
25595 IX86_BUILTIN_VFMADDSUBPD256,
25596
25597 /* FMA3 instructions. */
25598 IX86_BUILTIN_VFMADDSS3,
25599 IX86_BUILTIN_VFMADDSD3,
25600
25601 /* XOP instructions. */
25602 IX86_BUILTIN_VPCMOV,
25603 IX86_BUILTIN_VPCMOV_V2DI,
25604 IX86_BUILTIN_VPCMOV_V4SI,
25605 IX86_BUILTIN_VPCMOV_V8HI,
25606 IX86_BUILTIN_VPCMOV_V16QI,
25607 IX86_BUILTIN_VPCMOV_V4SF,
25608 IX86_BUILTIN_VPCMOV_V2DF,
25609 IX86_BUILTIN_VPCMOV256,
25610 IX86_BUILTIN_VPCMOV_V4DI256,
25611 IX86_BUILTIN_VPCMOV_V8SI256,
25612 IX86_BUILTIN_VPCMOV_V16HI256,
25613 IX86_BUILTIN_VPCMOV_V32QI256,
25614 IX86_BUILTIN_VPCMOV_V8SF256,
25615 IX86_BUILTIN_VPCMOV_V4DF256,
25616
25617 IX86_BUILTIN_VPPERM,
25618
25619 IX86_BUILTIN_VPMACSSWW,
25620 IX86_BUILTIN_VPMACSWW,
25621 IX86_BUILTIN_VPMACSSWD,
25622 IX86_BUILTIN_VPMACSWD,
25623 IX86_BUILTIN_VPMACSSDD,
25624 IX86_BUILTIN_VPMACSDD,
25625 IX86_BUILTIN_VPMACSSDQL,
25626 IX86_BUILTIN_VPMACSSDQH,
25627 IX86_BUILTIN_VPMACSDQL,
25628 IX86_BUILTIN_VPMACSDQH,
25629 IX86_BUILTIN_VPMADCSSWD,
25630 IX86_BUILTIN_VPMADCSWD,
25631
25632 IX86_BUILTIN_VPHADDBW,
25633 IX86_BUILTIN_VPHADDBD,
25634 IX86_BUILTIN_VPHADDBQ,
25635 IX86_BUILTIN_VPHADDWD,
25636 IX86_BUILTIN_VPHADDWQ,
25637 IX86_BUILTIN_VPHADDDQ,
25638 IX86_BUILTIN_VPHADDUBW,
25639 IX86_BUILTIN_VPHADDUBD,
25640 IX86_BUILTIN_VPHADDUBQ,
25641 IX86_BUILTIN_VPHADDUWD,
25642 IX86_BUILTIN_VPHADDUWQ,
25643 IX86_BUILTIN_VPHADDUDQ,
25644 IX86_BUILTIN_VPHSUBBW,
25645 IX86_BUILTIN_VPHSUBWD,
25646 IX86_BUILTIN_VPHSUBDQ,
25647
25648 IX86_BUILTIN_VPROTB,
25649 IX86_BUILTIN_VPROTW,
25650 IX86_BUILTIN_VPROTD,
25651 IX86_BUILTIN_VPROTQ,
25652 IX86_BUILTIN_VPROTB_IMM,
25653 IX86_BUILTIN_VPROTW_IMM,
25654 IX86_BUILTIN_VPROTD_IMM,
25655 IX86_BUILTIN_VPROTQ_IMM,
25656
25657 IX86_BUILTIN_VPSHLB,
25658 IX86_BUILTIN_VPSHLW,
25659 IX86_BUILTIN_VPSHLD,
25660 IX86_BUILTIN_VPSHLQ,
25661 IX86_BUILTIN_VPSHAB,
25662 IX86_BUILTIN_VPSHAW,
25663 IX86_BUILTIN_VPSHAD,
25664 IX86_BUILTIN_VPSHAQ,
25665
25666 IX86_BUILTIN_VFRCZSS,
25667 IX86_BUILTIN_VFRCZSD,
25668 IX86_BUILTIN_VFRCZPS,
25669 IX86_BUILTIN_VFRCZPD,
25670 IX86_BUILTIN_VFRCZPS256,
25671 IX86_BUILTIN_VFRCZPD256,
25672
25673 IX86_BUILTIN_VPCOMEQUB,
25674 IX86_BUILTIN_VPCOMNEUB,
25675 IX86_BUILTIN_VPCOMLTUB,
25676 IX86_BUILTIN_VPCOMLEUB,
25677 IX86_BUILTIN_VPCOMGTUB,
25678 IX86_BUILTIN_VPCOMGEUB,
25679 IX86_BUILTIN_VPCOMFALSEUB,
25680 IX86_BUILTIN_VPCOMTRUEUB,
25681
25682 IX86_BUILTIN_VPCOMEQUW,
25683 IX86_BUILTIN_VPCOMNEUW,
25684 IX86_BUILTIN_VPCOMLTUW,
25685 IX86_BUILTIN_VPCOMLEUW,
25686 IX86_BUILTIN_VPCOMGTUW,
25687 IX86_BUILTIN_VPCOMGEUW,
25688 IX86_BUILTIN_VPCOMFALSEUW,
25689 IX86_BUILTIN_VPCOMTRUEUW,
25690
25691 IX86_BUILTIN_VPCOMEQUD,
25692 IX86_BUILTIN_VPCOMNEUD,
25693 IX86_BUILTIN_VPCOMLTUD,
25694 IX86_BUILTIN_VPCOMLEUD,
25695 IX86_BUILTIN_VPCOMGTUD,
25696 IX86_BUILTIN_VPCOMGEUD,
25697 IX86_BUILTIN_VPCOMFALSEUD,
25698 IX86_BUILTIN_VPCOMTRUEUD,
25699
25700 IX86_BUILTIN_VPCOMEQUQ,
25701 IX86_BUILTIN_VPCOMNEUQ,
25702 IX86_BUILTIN_VPCOMLTUQ,
25703 IX86_BUILTIN_VPCOMLEUQ,
25704 IX86_BUILTIN_VPCOMGTUQ,
25705 IX86_BUILTIN_VPCOMGEUQ,
25706 IX86_BUILTIN_VPCOMFALSEUQ,
25707 IX86_BUILTIN_VPCOMTRUEUQ,
25708
25709 IX86_BUILTIN_VPCOMEQB,
25710 IX86_BUILTIN_VPCOMNEB,
25711 IX86_BUILTIN_VPCOMLTB,
25712 IX86_BUILTIN_VPCOMLEB,
25713 IX86_BUILTIN_VPCOMGTB,
25714 IX86_BUILTIN_VPCOMGEB,
25715 IX86_BUILTIN_VPCOMFALSEB,
25716 IX86_BUILTIN_VPCOMTRUEB,
25717
25718 IX86_BUILTIN_VPCOMEQW,
25719 IX86_BUILTIN_VPCOMNEW,
25720 IX86_BUILTIN_VPCOMLTW,
25721 IX86_BUILTIN_VPCOMLEW,
25722 IX86_BUILTIN_VPCOMGTW,
25723 IX86_BUILTIN_VPCOMGEW,
25724 IX86_BUILTIN_VPCOMFALSEW,
25725 IX86_BUILTIN_VPCOMTRUEW,
25726
25727 IX86_BUILTIN_VPCOMEQD,
25728 IX86_BUILTIN_VPCOMNED,
25729 IX86_BUILTIN_VPCOMLTD,
25730 IX86_BUILTIN_VPCOMLED,
25731 IX86_BUILTIN_VPCOMGTD,
25732 IX86_BUILTIN_VPCOMGED,
25733 IX86_BUILTIN_VPCOMFALSED,
25734 IX86_BUILTIN_VPCOMTRUED,
25735
25736 IX86_BUILTIN_VPCOMEQQ,
25737 IX86_BUILTIN_VPCOMNEQ,
25738 IX86_BUILTIN_VPCOMLTQ,
25739 IX86_BUILTIN_VPCOMLEQ,
25740 IX86_BUILTIN_VPCOMGTQ,
25741 IX86_BUILTIN_VPCOMGEQ,
25742 IX86_BUILTIN_VPCOMFALSEQ,
25743 IX86_BUILTIN_VPCOMTRUEQ,
25744
25745 /* LWP instructions. */
25746 IX86_BUILTIN_LLWPCB,
25747 IX86_BUILTIN_SLWPCB,
25748 IX86_BUILTIN_LWPVAL32,
25749 IX86_BUILTIN_LWPVAL64,
25750 IX86_BUILTIN_LWPINS32,
25751 IX86_BUILTIN_LWPINS64,
25752
25753 IX86_BUILTIN_CLZS,
25754
25755 /* RTM */
25756 IX86_BUILTIN_XBEGIN,
25757 IX86_BUILTIN_XEND,
25758 IX86_BUILTIN_XABORT,
25759 IX86_BUILTIN_XTEST,
25760
25761 /* BMI instructions. */
25762 IX86_BUILTIN_BEXTR32,
25763 IX86_BUILTIN_BEXTR64,
25764 IX86_BUILTIN_CTZS,
25765
25766 /* TBM instructions. */
25767 IX86_BUILTIN_BEXTRI32,
25768 IX86_BUILTIN_BEXTRI64,
25769
25770 /* BMI2 instructions. */
25771 IX86_BUILTIN_BZHI32,
25772 IX86_BUILTIN_BZHI64,
25773 IX86_BUILTIN_PDEP32,
25774 IX86_BUILTIN_PDEP64,
25775 IX86_BUILTIN_PEXT32,
25776 IX86_BUILTIN_PEXT64,
25777
25778 /* FSGSBASE instructions. */
25779 IX86_BUILTIN_RDFSBASE32,
25780 IX86_BUILTIN_RDFSBASE64,
25781 IX86_BUILTIN_RDGSBASE32,
25782 IX86_BUILTIN_RDGSBASE64,
25783 IX86_BUILTIN_WRFSBASE32,
25784 IX86_BUILTIN_WRFSBASE64,
25785 IX86_BUILTIN_WRGSBASE32,
25786 IX86_BUILTIN_WRGSBASE64,
25787
25788 /* RDRND instructions. */
25789 IX86_BUILTIN_RDRAND16_STEP,
25790 IX86_BUILTIN_RDRAND32_STEP,
25791 IX86_BUILTIN_RDRAND64_STEP,
25792
25793 /* F16C instructions. */
25794 IX86_BUILTIN_CVTPH2PS,
25795 IX86_BUILTIN_CVTPH2PS256,
25796 IX86_BUILTIN_CVTPS2PH,
25797 IX86_BUILTIN_CVTPS2PH256,
25798
25799 /* CFString built-in for darwin */
25800   /* CFString built-in for Darwin.  */
25801
25802 IX86_BUILTIN_MAX
25803 };
25804
25805 /* Table for the ix86 builtin decls. */
25806 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25807
25808 /* Table of all of the builtin functions that are possible with different ISAs
25809 but are waiting to be built until a function is declared to use that
25810 ISA. */
25811 struct builtin_isa {
25812 const char *name; /* function name */
25813 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25814 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25815 bool const_p; /* true if the declaration is constant */
25816   bool set_and_not_built_p;		/* true if recorded here but the decl is not built yet */
25817 };
25818
25819 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
25820
25821
25822 /* Add an ix86 target builtin function with CODE, NAME and TCODE.  Save MASK,
25823    the set of isa_flags the builtin requires, in the ix86_builtins_isa array.
25824    Stores the function decl in the ix86_builtins array.  Returns the function
25825    decl, or NULL_TREE if the builtin was not added.
25826
25827 If the front end has a special hook for builtin functions, delay adding
25828 builtin functions that aren't in the current ISA until the ISA is changed
25829    with function specific optimization.  Doing so can save about 300K for the
25830 default compiler. When the builtin is expanded, check at that time whether
25831 it is valid.
25832
25833    If the front end doesn't have a special hook, record all builtins, even
25834    those whose instruction set isn't in the current ISA, in case the user uses
25835    function specific options for a different ISA, so that we don't get scope
25836    errors if a builtin is added in the middle of a function scope.  */
25837
25838 static inline tree
25839 def_builtin (HOST_WIDE_INT mask, const char *name,
25840 enum ix86_builtin_func_type tcode,
25841 enum ix86_builtins code)
25842 {
25843 tree decl = NULL_TREE;
25844
25845 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25846 {
25847 ix86_builtins_isa[(int) code].isa = mask;
25848
25849 mask &= ~OPTION_MASK_ISA_64BIT;
25850 if (mask == 0
25851 || (mask & ix86_isa_flags) != 0
25852 || (lang_hooks.builtin_function
25853 == lang_hooks.builtin_function_ext_scope))
25854
25855 {
25856 tree type = ix86_get_builtin_func_type (tcode);
25857 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25858 NULL, NULL_TREE);
25859 ix86_builtins[(int) code] = decl;
25860 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25861 }
25862 else
25863 {
25864 ix86_builtins[(int) code] = NULL_TREE;
25865 ix86_builtins_isa[(int) code].tcode = tcode;
25866 ix86_builtins_isa[(int) code].name = name;
25867 ix86_builtins_isa[(int) code].const_p = false;
25868 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25869 }
25870 }
25871
25872 return decl;
25873 }
25874
25875 /* Like def_builtin, but also marks the function decl "const". */
25876
25877 static inline tree
25878 def_builtin_const (HOST_WIDE_INT mask, const char *name,
25879 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
25880 {
25881 tree decl = def_builtin (mask, name, tcode, code);
25882 if (decl)
25883 TREE_READONLY (decl) = 1;
25884 else
25885 ix86_builtins_isa[(int) code].const_p = true;
25886
25887 return decl;
25888 }
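
/* A minimal sketch of how a single builtin might be registered through the
   helpers above; the mask, name, type code and enum value are borrowed from
   the IX86_BUILTIN_ADDPS entry in the bdesc_args table below, so this is
   illustrative only:

     def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_addps",
			V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_ADDPS);

   If SSE is already enabled, or the front end uses the extended-scope
   builtin hook, the decl is built immediately; otherwise only the name,
   type code and ISA mask are recorded in ix86_builtins_isa and the decl
   is built later by ix86_add_new_builtins.  */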
25889
25890 /* Add any new builtin functions for a given ISA that may not have been
25891 declared. This saves a bit of space compared to adding all of the
25892 declarations to the tree, even if we didn't use them. */
25893
25894 static void
25895 ix86_add_new_builtins (HOST_WIDE_INT isa)
25896 {
25897 int i;
25898
25899 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
25900 {
25901 if ((ix86_builtins_isa[i].isa & isa) != 0
25902 && ix86_builtins_isa[i].set_and_not_built_p)
25903 {
25904 tree decl, type;
25905
25906 /* Don't define the builtin again. */
25907 ix86_builtins_isa[i].set_and_not_built_p = false;
25908
25909 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
25910 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
25911 type, i, BUILT_IN_MD, NULL,
25912 NULL_TREE);
25913
25914 ix86_builtins[i] = decl;
25915 if (ix86_builtins_isa[i].const_p)
25916 TREE_READONLY (decl) = 1;
25917 }
25918 }
25919 }
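
/* A minimal illustration: assuming the set of enabled ISAs has just grown
   (for example through function-specific options), a call such as

     ix86_add_new_builtins (ix86_isa_flags);

   walks ix86_builtins_isa and builds the decl of every builtin whose ISA
   mask overlaps the new flags and that is still marked set_and_not_built_p,
   marking it TREE_READONLY when const_p was recorded by def_builtin_const.  */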
25920
25921 /* Bits for builtin_description.flag. */
25922
25923 /* Set when we don't support the comparison natively, and should
25924    swap the comparison operands in order to support it.  */
25925 #define BUILTIN_DESC_SWAP_OPERANDS 1
25926
25927 struct builtin_description
25928 {
25929 const HOST_WIDE_INT mask;
25930 const enum insn_code icode;
25931 const char *const name;
25932 const enum ix86_builtins code;
25933 const enum rtx_code comparison;
25934 const int flag;
25935 };
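
/* The FLAG member is put to different uses by the tables that follow:
   bdesc_comi leaves it 0 (BUILTIN_DESC_SWAP_OPERANDS above being the other
   documented bit), bdesc_pcmpestr and bdesc_pcmpistr store the CC mode the
   builtin tests, and bdesc_special_args/bdesc_args store the
   ix86_builtin_func_type of the builtin, cast to int.  */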
25936
25937 static const struct builtin_description bdesc_comi[] =
25938 {
25939 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
25940 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
25941 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
25942 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
25943 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
25944 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
25945 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
25946 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
25947 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
25948 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
25949 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
25950 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
25951 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
25952 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
25953 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
25954 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
25955 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
25956 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
25957 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
25958 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
25959 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
25960 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
25961 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
25962 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
25963 };
25964
25965 static const struct builtin_description bdesc_pcmpestr[] =
25966 {
25967 /* SSE4.2 */
25968 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
25969 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
25970 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
25971 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
25972 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
25973 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
25974 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
25975 };
25976
25977 static const struct builtin_description bdesc_pcmpistr[] =
25978 {
25979 /* SSE4.2 */
25980 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
25981 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
25982 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
25983 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
25984 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
25985 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
25986 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
25987 };
25988
25989 /* Special builtins with variable number of arguments. */
25990 static const struct builtin_description bdesc_special_args[] =
25991 {
25992 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
25993 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
25994 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
25995
25996 /* MMX */
25997 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25998
25999 /* 3DNow! */
26000 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26001
26002 /* SSE */
26003 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26004 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26005 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26006
26007 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26008 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26009 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26010 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26011
26012 /* SSE or 3DNow!A */
26013 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26014 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26015
26016 /* SSE2 */
26017 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26018 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26019 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26020 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26021 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26022 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26023 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26024 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26025 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26026 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26027
26028 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26029 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26030
26031 /* SSE3 */
26032 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26033
26034 /* SSE4.1 */
26035 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26036
26037 /* SSE4A */
26038 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26039 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26040
26041 /* AVX */
26042 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26043 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26044
26045 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26046 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26047 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26048 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26049 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26050
26051 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26052 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26053 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26054 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26055 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26056 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26057 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26058
26059 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26060 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26061 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26062
26063 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26064 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26065 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26066 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26067 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26068 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26069 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26070 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26071
26072 /* AVX2 */
26073 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26074 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26075 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26076 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26077 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26078 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26079 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26080 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26081 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26082
26083 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26084 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26085 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26086 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26087 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26088 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26089
26090 /* FSGSBASE */
26091 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26092 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26093 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26094 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26095 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26096 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26097 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26098 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26099
26100 /* RTM */
26101 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26102 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26103 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26104 };
26105
26106 /* Builtins with variable number of arguments. */
26107 static const struct builtin_description bdesc_args[] =
26108 {
26109 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26110 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26111 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26112 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26113 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26114 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26115 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26116
26117 /* MMX */
26118 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26119 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26120 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26121 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26122 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26123 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26124
26125 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26126 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26127 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26128 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26129 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26130 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26131 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26132 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26133
26134 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26135 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26136
26137 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26138 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26139 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26140 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26141
26142 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26143 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26144 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26145 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26146 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26147 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26148
26149 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26150 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26151 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26152 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26153 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
26154 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
26155
26156 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26157 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26158 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26159
26160 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26161
26162 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26163 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26164 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26165 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26166 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26167 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26168
26169 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26170 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26171 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26172 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26173 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26174 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26175
26176 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26177 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26178 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26179 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26180
26181 /* 3DNow! */
26182 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26183 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26184 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26185 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26186
26187 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26188 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26189 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26190 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26191 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26192 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26193 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26194 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26195 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26196 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26197 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26198 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26199 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26200 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26201 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26202
26203 /* 3DNow!A */
26204 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26205 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26206 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26207 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26208 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26209 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26210
26211 /* SSE */
26212 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26213 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26214 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26215 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26216 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26217 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26218 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26219 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26220 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26221 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26222 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26223 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26224
26225 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26226
26227 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26228 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26229 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26230 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26231 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26232 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26233 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26234 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26235
26236 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26237 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26238 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26239 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26240 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26241 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26242 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26243 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26244 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26245 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26246 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
26247 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26248 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26249 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26250 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26251 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26252 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26253 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26254 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26255 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26256 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26257 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26258
26259 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26260 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26261 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26262 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26263
26264 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26265 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26266 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26267 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26268
26269 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26270
26271 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26272 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26273 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26274 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26275 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26276
26277 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26278 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26279   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26280
26281 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26282
26283 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26284 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26285 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26286
26287   /* SSE MMX or 3DNow!A */
26288 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26289 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26290 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26291
26292 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26293 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26294 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26295 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26296
26297 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26298 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26299
26300 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26301
26302 /* SSE2 */
26303 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26304
26305 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26306 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26307 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26308 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26309 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26310
26311 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26312 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26313 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26314 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26315 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26316
26317 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26318
26319 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26320 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26321 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26322 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26323
26324 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26325 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26326 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26327
26328 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26329 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26330 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26331 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26332 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26333 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26334 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26335 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26336
26337 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26338 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26339 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26340 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26341 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26342 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26343 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26344 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26345 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26346 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26347 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26348 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26349 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26350 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26351 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26352 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26353 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26354 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26355 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26356 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26357
26358 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26359 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26360 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26361 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26362
26363 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26364 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26365 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26366 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26367
26368 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26369
26370 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26371 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26372 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26373
26374 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26375
26376 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26377 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26378 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26379 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26380 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26381 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26382 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26383 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26384
26385 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26386 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26387 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26388 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26389 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26390 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26391 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26392 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26393
26394 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26395 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26396
26397 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26398 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26399 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26400 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26401
26402 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26403 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26404
26405 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26406 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26407 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26408 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26409 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26410 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26411
26412 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26413 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26414 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26415 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26416
26417 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26418 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26419 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26420 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26421 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26422 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26423 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26424 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26425
26426 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26427 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26428 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26429
26430 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26431 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26432
26433 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26434 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26435
26436 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26437
26438 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26439 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26440 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26441 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26442
26443 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26444 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26445 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26446 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26447 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26448 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26449 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26450
26451 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26452 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26453 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26454 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26455 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26456 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26457 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26458
26459 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26460 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26461 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26462 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26463
26464 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26465 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26466 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26467
26468 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26469
26470 { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26471 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26472
26473 { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26474
26475 /* SSE2 MMX */
26476 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26477 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26478
26479 /* SSE3 */
26480 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26481 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26482
26483 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26484 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26485 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26486 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26487 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26488 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26489
26490 /* SSSE3 */
26491 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26492 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26493 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26494 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26495 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26496 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26497
26498 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26499 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26500 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26501 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26502 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26503 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26504 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26505 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26506 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26507 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26508 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26509 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26510 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26511 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26512 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26513 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26514 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26515 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26516 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26517 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26518 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26519 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26520 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26521 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26522
26523 /* SSSE3. */
26524 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26525 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26526
26527 /* SSE4.1 */
26528 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26529 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26530 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26531 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26532 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26533 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26534 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26535 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26536 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26537 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26538
26539 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26540 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26541 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26542 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26543 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26544 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26545 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26546 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26547 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26548 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26549 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26550 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26551 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26552
26553 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26554 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26555 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26556 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26557 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26558 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26559 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26560 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26561 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26562 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26563 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26564 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26565
26566 /* SSE4.1 rounding and ptest */
26567 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26568 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26569 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26570 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26571
26572 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26573 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26574 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26575 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26576
26577 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26578 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26579
26580 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26581 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26582
26583 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26584 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26585 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26586 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26587
26588 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26589 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26590
26591 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26592 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26593
26594 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26595 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26596 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26597
26598 /* SSE4.2 */
26599 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26600 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26601 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26602 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26603 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26604
26605 /* SSE4A */
26606 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26607 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26608 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26609 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26610
26611 /* AES */
26612 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26613 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26614
26615 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26616 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26617 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26618 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26619
26620 /* PCLMUL */
26621 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26622
26623 /* AVX */
26624 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26625 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26626 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26627 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26628 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26629 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26630 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26631 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26632 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26633 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26634 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26635 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26636 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26637 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26638 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26639 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26640 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26641 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26642 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26643 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26644 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26645 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26646 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26647 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26648 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26649 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26650
26651 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26652 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26653 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26654 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26655
26656 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26657 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26658 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26659 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26660 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26661 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26662 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26663 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26664 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26665 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26666 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26667 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26668 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26669 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26670 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26671 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26672 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26673 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26674 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26675 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26676 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26677 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26678 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26679 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26680 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26681 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26682 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26683 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26684 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26685 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26686 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26687 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26688 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26689 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26690
26691 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26692 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26693 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26694
26695 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26696 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26697 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26698 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26699 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26700
26701 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26702
26703 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26704 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26705
26706 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26707 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26708 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26709 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26710
26711 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26712 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26713
26714 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26715 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26716
26717 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26718 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26719 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26720 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26721
26722 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
26723 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
26724
26725 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26726 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26727
26728 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26729 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26730 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26731 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26732
26733 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26734 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26735 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26736 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26737 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26738 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26739
26740 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26741 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26742 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26743 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26744 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26745 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26746 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26747 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26748 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26749 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26750 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26751 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26752 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26753 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26754 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26755
26756 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26757 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26758
26759 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26760 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26761
26762 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26763
26764 /* AVX2 */
26765 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26766 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26767 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26768 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26769 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26770 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26771 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26772 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26773 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26774 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26775 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26776 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26777 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26778 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26779 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26780 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26781 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26782 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26783 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26784 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26785 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26786 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26787 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26788 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26789 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26790 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26791 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26792 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26793 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26794 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26795 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26796 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26797 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26798 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26799 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26800 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26801 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26802 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26803 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26804 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26805 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26806 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26807 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26808 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26809 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26810 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26811 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26812 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26813 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26814 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26815 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26816 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26817 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26818 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26819 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26820 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26821 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26822 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26823 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26824 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26825 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26826 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26827 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26828 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26829 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26830 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26831 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26832 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26833 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26834 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26835 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26836 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26837 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26838 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26839 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26840 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26841 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26842 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26843 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26844 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26845 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26846 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26847 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26848 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26849 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26850 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26851 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26852 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26853 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26854 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26855 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26856 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26857 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26858 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26859 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26860 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26861 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26862 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26863 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26864 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26865 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26866 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26867 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26868 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26869 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26870 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26871 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26872 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26873 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26874 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26875 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26876 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26877 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26878 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26879 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26880 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26881 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26882 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26883 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26884 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
26885 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26886 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
26887 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
26888 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26889 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26890 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26891 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26892 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26893 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26894 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26895 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26896 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26897 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
26898 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
26899 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
26900 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
26901 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26902 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26903 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26904 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26905 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26906 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26907 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26908 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26909 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26910 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26911
26912 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26913
26914 /* BMI */
26915 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26916 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26917 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26918
26919 /* TBM */
26920 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26921 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26922
26923 /* F16C */
26924 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
26925 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
26926 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
26927 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
26928
26929 /* BMI2 */
26930 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26931 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26932 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26933 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26934 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26935 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26936 };
26937
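/* Illustrative sketch (an editor's note, not part of the tables): each row
   above binds an insn pattern to a builtin that the intrinsic headers are
   assumed to wrap.  For example, the CODE_FOR_addv8si3 row is what backs
   _mm256_add_epi32 in avx2intrin.h, roughly:

     extern __inline __m256i
     __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     _mm256_add_epi32 (__m256i __A, __m256i __B)
     {
       return (__m256i) __builtin_ia32_paddd256 ((__v8si) __A, (__v8si) __B);
     }

   The V8SI_FTYPE_V8SI_V8SI flag is what later gives that builtin its
   prototype when ix86_init_mmx_sse_builtins registers the table.  */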
26938 /* FMA4 and XOP. */
26939 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
26940 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
26941 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
26942 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
26943 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
26944 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
26945 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
26946 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
26947 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
26948 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
26949 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
26950 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
26951 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
26952 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
26953 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
26954 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
26955 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
26956 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
26957 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
26958 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
26959 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
26960 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
26961 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
26962 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
26963 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
26964 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
26965 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
26966 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
26967 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
26968 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
26969 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
26970 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
26971 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
26972 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
26973 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
26974 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
26975 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
26976 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
26977 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
26978 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
26979 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
26980 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
26981 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
26982 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
26983 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
26984 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
26985 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
26986 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
26987 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
26988 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
26989 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
26990 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
26991
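/* A brief orientation note for the table below (illustrative; the header
   mapping is an assumption): each MULTI_ARG_* alias above simply names an
   ix86_builtin_func_type, e.g.

     MULTI_ARG_3_SF   expands to   V4SF_FTYPE_V4SF_V4SF_V4SF

   i.e. a builtin taking three V4SFmode operands and returning V4SF.  That
   is the shape of the FMA4 __builtin_ia32_vfmaddss/__builtin_ia32_vfmaddps
   entries, which fma4intrin.h is assumed to wrap as _mm_macc_ss and
   _mm_macc_ps.  */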
26992 static const struct builtin_description bdesc_multi_arg[] =
26993 {
26994 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
26995 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
26996 UNKNOWN, (int)MULTI_ARG_3_SF },
26997 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
26998 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
26999 UNKNOWN, (int)MULTI_ARG_3_DF },
27000
27001 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27002 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27003 UNKNOWN, (int)MULTI_ARG_3_SF },
27004 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27005 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27006 UNKNOWN, (int)MULTI_ARG_3_DF },
27007
27008 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27009 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27010 UNKNOWN, (int)MULTI_ARG_3_SF },
27011 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27012 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27013 UNKNOWN, (int)MULTI_ARG_3_DF },
27014 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27015 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27016 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27017 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27018 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27019 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27020
27021 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27022 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27023 UNKNOWN, (int)MULTI_ARG_3_SF },
27024 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27025 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27026 UNKNOWN, (int)MULTI_ARG_3_DF },
27027 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27028 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27029 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27030 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27031 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27032 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27033
27034 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27035 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27036 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27037 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27038 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
27039 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27040 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27041
27042 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27043 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27044 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27045 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27046 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27047 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27048 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27049
27050 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27051
27052 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27053 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27054 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27055 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27056 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27057 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27058 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27059 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27060 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27061 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27062 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27063 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27064
27065 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27066 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27067 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27068 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27069 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27070 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27071 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27072 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27073 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27074 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27075 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27076 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27077 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27078 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27079 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27080 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27081
27082 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27083 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27084 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27085 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27086 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27087 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27088
27089 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27090 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27091 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27092 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27093 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27094 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27095 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27096 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27097 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27098 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27099 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27100 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27101 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27102 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27103 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27104
27105 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27106 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27107 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27108 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27109 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27110 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27111 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27112
27113 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27114 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27115 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27116 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27117 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27118 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27119 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27120
27121 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27122 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27123 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27124 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27125 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27126 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27127 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27128
27129 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27130 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27131 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27132 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27133 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27134 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27135 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27136
27137 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27138 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27139 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27140 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27141 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27142 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27143 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27144
27145 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27146 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27147 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27148 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27149 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27150 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27151 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27152
27153 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27154 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27155 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27156 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27157 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27158 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27159 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27160
27161 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27162 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27163 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27164 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
27165 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
27166 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
27167 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
27168
27169 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27170 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27171 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27172 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27173 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27174 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27175 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27176 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27177
27178 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27179 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27180 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27181 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27182 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27183 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27184 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27185 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27186
27187 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27188 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27189 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27190 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27191
27192 };
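/* An illustrative note on the vpcom* rows above (the field name and the
   expander behaviour are assumptions drawn from the table layout, not a
   specification): several rows share one insn pattern and differ only in
   the rtx_code column (EQ, NE, LT, ... or the PCOM_FALSE/PCOM_TRUE
   pseudo-codes).  The multi-arg expander is expected to fold that code back
   into the emitted comparison, conceptually

     rtx cmp = gen_rtx_fmt_ee (d->comparison, mode, op0, op1);

   which is why __builtin_ia32_vpcomeqb and __builtin_ia32_vpcomneb can use
   the same CODE_FOR_xop_maskcmpv16qi3 pattern yet produce different
   predicates.  */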
27193 \f
27194 /* TM vector builtins. */
27195
27196 /* Reuse the existing x86-specific `struct builtin_description' because
27197 we're lazy.  Add casts to make the entries fit.  */
27198 static const struct builtin_description bdesc_tm[] =
27199 {
27200 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27201 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27202 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27203 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27204 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27205 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27206 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27207
27208 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27209 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27210 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27211 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27212 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27213 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27214 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27215
27216 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27217 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27218 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27219 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27220 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27221 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27222 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27223
27224 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27225 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27226 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27227 };
27228
27229 /* TM callbacks. */
27230
27231 /* Return the builtin decl needed to load a vector of TYPE. */
27232
27233 static tree
27234 ix86_builtin_tm_load (tree type)
27235 {
27236 if (TREE_CODE (type) == VECTOR_TYPE)
27237 {
27238 switch (tree_low_cst (TYPE_SIZE (type), 1))
27239 {
27240 case 64:
27241 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27242 case 128:
27243 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27244 case 256:
27245 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27246 }
27247 }
27248 return NULL_TREE;
27249 }
27250
27251 /* Return the builtin decl needed to store a vector of TYPE. */
27252
27253 static tree
27254 ix86_builtin_tm_store (tree type)
27255 {
27256 if (TREE_CODE (type) == VECTOR_TYPE)
27257 {
27258 switch (tree_low_cst (TYPE_SIZE (type), 1))
27259 {
27260 case 64:
27261 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27262 case 128:
27263 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27264 case 256:
27265 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27266 }
27267 }
27268 return NULL_TREE;
27269 }
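/* Illustrative usage of the two helpers above (the hook wiring is an
   assumption made for the example): they are expected to be installed as
   the vectorizer's TM load/store hooks, so a caller asking for the
   transactional load of a 128-bit vector type gets the M128 decl:

     tree v4sf_type = build_vector_type (float_type_node, 4);
     tree decl = ix86_builtin_tm_load (v4sf_type);
     gcc_assert (decl == builtin_decl_explicit (BUILT_IN_TM_LOAD_M128));
*/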
27270 \f
27271 /* Initialize the transactional memory vector load/store builtins. */
27272
27273 static void
27274 ix86_init_tm_builtins (void)
27275 {
27276 enum ix86_builtin_func_type ftype;
27277 const struct builtin_description *d;
27278 size_t i;
27279 tree decl;
27280 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27281 tree attrs_log, attrs_type_log;
27282
27283 if (!flag_tm)
27284 return;
27285
27286 /* If there are no builtins defined, we must be compiling in a
27287 language without trans-mem support. */
27288 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27289 return;
27290
27291 /* Use whatever attributes a normal TM load has. */
27292 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27293 attrs_load = DECL_ATTRIBUTES (decl);
27294 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27295 /* Use whatever attributes a normal TM store has. */
27296 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27297 attrs_store = DECL_ATTRIBUTES (decl);
27298 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27299 /* Use whatever attributes a normal TM log has. */
27300 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27301 attrs_log = DECL_ATTRIBUTES (decl);
27302 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27303
27304 for (i = 0, d = bdesc_tm;
27305 i < ARRAY_SIZE (bdesc_tm);
27306 i++, d++)
27307 {
27308 if ((d->mask & ix86_isa_flags) != 0
27309 || (lang_hooks.builtin_function
27310 == lang_hooks.builtin_function_ext_scope))
27311 {
27312 tree type, attrs, attrs_type;
27313 enum built_in_function code = (enum built_in_function) d->code;
27314
27315 ftype = (enum ix86_builtin_func_type) d->flag;
27316 type = ix86_get_builtin_func_type (ftype);
27317
27318 if (BUILTIN_TM_LOAD_P (code))
27319 {
27320 attrs = attrs_load;
27321 attrs_type = attrs_type_load;
27322 }
27323 else if (BUILTIN_TM_STORE_P (code))
27324 {
27325 attrs = attrs_store;
27326 attrs_type = attrs_type_store;
27327 }
27328 else
27329 {
27330 attrs = attrs_log;
27331 attrs_type = attrs_type_log;
27332 }
27333 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27334 /* The builtin without the prefix for
27335 calling it directly. */
27336 d->name + strlen ("__builtin_"),
27337 attrs);
27338 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
27339 set the TYPE_ATTRIBUTES. */
27340 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27341
27342 set_builtin_decl (code, decl, false);
27343 }
27344 }
27345 }
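/* Illustrative note on the net effect of ix86_init_tm_builtins (the user
   snippet is a sketch assuming a TM-enabled front end): each selected
   bdesc_tm entry is registered under its "__builtin_" spelling, and the
   prefix-stripped string passed as the library name points the call at the
   matching libitm entry point, e.g.

     __m128 v = __builtin__ITM_RM128 (p);

   which is expected to be emitted as a call to _ITM_RM128.  */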
27346
27347 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
27348 not in the current target ISA, so that the user can compile particular
27349 modules with target-specific options that differ from the command-line
27350 options. */
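/* For instance (an illustrative sketch of that intent, not a guarantee of
   this compiler version's behaviour), even under plain -msse2 a translation
   unit should be able to carry an AVX2 code path such as

     __attribute__ ((target ("avx2")))
     __m256i
     add256 (__m256i a, __m256i b)
     {
       return (__m256i) __builtin_ia32_paddd256 ((__v8si) a, (__v8si) b);
     }

   because every builtin is defined up front; using one in a function whose
   effective ISA lacks the feature is diagnosed at expansion time instead.  */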
27351 static void
27352 ix86_init_mmx_sse_builtins (void)
27353 {
27354 const struct builtin_description * d;
27355 enum ix86_builtin_func_type ftype;
27356 size_t i;
27357
27358 /* Add all special builtins with variable number of operands. */
27359 for (i = 0, d = bdesc_special_args;
27360 i < ARRAY_SIZE (bdesc_special_args);
27361 i++, d++)
27362 {
27363 if (d->name == 0)
27364 continue;
27365
27366 ftype = (enum ix86_builtin_func_type) d->flag;
27367 def_builtin (d->mask, d->name, ftype, d->code);
27368 }
27369
27370 /* Add all builtins with variable number of operands. */
27371 for (i = 0, d = bdesc_args;
27372 i < ARRAY_SIZE (bdesc_args);
27373 i++, d++)
27374 {
27375 if (d->name == 0)
27376 continue;
27377
27378 ftype = (enum ix86_builtin_func_type) d->flag;
27379 def_builtin_const (d->mask, d->name, ftype, d->code);
27380 }
27381
27382 /* pcmpestr[im] insns. */
27383 for (i = 0, d = bdesc_pcmpestr;
27384 i < ARRAY_SIZE (bdesc_pcmpestr);
27385 i++, d++)
27386 {
27387 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27388 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27389 else
27390 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27391 def_builtin_const (d->mask, d->name, ftype, d->code);
27392 }
27393
27394 /* pcmpistr[im] insns. */
27395 for (i = 0, d = bdesc_pcmpistr;
27396 i < ARRAY_SIZE (bdesc_pcmpistr);
27397 i++, d++)
27398 {
27399 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27400 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27401 else
27402 ftype = INT_FTYPE_V16QI_V16QI_INT;
27403 def_builtin_const (d->mask, d->name, ftype, d->code);
27404 }
27405
27406 /* comi/ucomi insns. */
27407 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27408 {
27409 if (d->mask == OPTION_MASK_ISA_SSE2)
27410 ftype = INT_FTYPE_V2DF_V2DF;
27411 else
27412 ftype = INT_FTYPE_V4SF_V4SF;
27413 def_builtin_const (d->mask, d->name, ftype, d->code);
27414 }
27415
27416 /* SSE */
27417 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27418 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27419 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27420 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27421
27422 /* SSE or 3DNow!A */
27423 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27424 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27425 IX86_BUILTIN_MASKMOVQ);
27426
27427 /* SSE2 */
27428 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27429 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27430
27431 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27432 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27433 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27434 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27435
27436 /* SSE3. */
27437 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27438 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27439 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27440 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27441
27442 /* AES */
27443 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27444 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27445 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27446 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27447 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27448 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27449 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27450 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27451 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27452 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27453 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27454 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27455
27456 /* PCLMUL */
27457 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27458 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27459
27460 /* RDRND */
27461 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27462 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27463 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27464 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27465 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27466 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27467 IX86_BUILTIN_RDRAND64_STEP);
27468
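  /* Illustrative only (not part of the original source): <immintrin.h>
     wraps the RDRND step builtins above roughly like this; the builtin
     returns nonzero when the hardware delivered a random value and
     stores it through the pointer argument.

	extern __inline int
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
	_rdrand32_step (unsigned int *__P)
	{
	  return __builtin_ia32_rdrand32_step (__P);
	}
  */
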
27469 /* AVX2 */
27470 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27471 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27472 IX86_BUILTIN_GATHERSIV2DF);
27473
27474 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27475 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27476 IX86_BUILTIN_GATHERSIV4DF);
27477
27478 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27479 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27480 IX86_BUILTIN_GATHERDIV2DF);
27481
27482 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27483 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27484 IX86_BUILTIN_GATHERDIV4DF);
27485
27486 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27487 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27488 IX86_BUILTIN_GATHERSIV4SF);
27489
27490 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27491 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27492 IX86_BUILTIN_GATHERSIV8SF);
27493
27494 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27495 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27496 IX86_BUILTIN_GATHERDIV4SF);
27497
27498 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27499 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27500 IX86_BUILTIN_GATHERDIV8SF);
27501
27502 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27503 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27504 IX86_BUILTIN_GATHERSIV2DI);
27505
27506 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27507 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27508 IX86_BUILTIN_GATHERSIV4DI);
27509
27510 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27511 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27512 IX86_BUILTIN_GATHERDIV2DI);
27513
27514 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27515 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27516 IX86_BUILTIN_GATHERDIV4DI);
27517
27518 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27519 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27520 IX86_BUILTIN_GATHERSIV4SI);
27521
27522 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27523 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27524 IX86_BUILTIN_GATHERSIV8SI);
27525
27526 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27527 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27528 IX86_BUILTIN_GATHERDIV4SI);
27529
27530 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27531 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27532 IX86_BUILTIN_GATHERDIV8SI);
27533
27534 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
27535 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27536 IX86_BUILTIN_GATHERALTSIV4DF);
27537
27538 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
27539 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27540 IX86_BUILTIN_GATHERALTDIV8SF);
27541
27542 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
27543 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27544 IX86_BUILTIN_GATHERALTSIV4DI);
27545
27546 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
27547 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27548 IX86_BUILTIN_GATHERALTDIV8SI);
27549
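  /* Illustrative only: these gather builtins are what the AVX2 gather
     intrinsics in avx2intrin.h expand to.  A sketch of one wrapper,
     with the casts simplified; the scale argument must be a
     compile-time constant of 1, 2, 4 or 8.

	extern __inline __m256d
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
	_mm256_i32gather_pd (double const *__base, __m128i __index,
			     const int __scale)
	{
	  __m256d __zero = _mm256_setzero_pd ();
	  __m256d __mask = _mm256_cmp_pd (__zero, __zero, _CMP_EQ_OQ);

	  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df) __zero,
							__base,
							(__v4si) __index,
							(__v4df) __mask,
							__scale);
	}
  */
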
27550 /* RTM. */
27551 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
27552 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
27553
27554 /* MMX access to the vec_init patterns. */
27555 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27556 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27557
27558 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27559 V4HI_FTYPE_HI_HI_HI_HI,
27560 IX86_BUILTIN_VEC_INIT_V4HI);
27561
27562 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27563 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27564 IX86_BUILTIN_VEC_INIT_V8QI);
27565
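  /* Illustrative only: mmintrin.h builds the MMX "set" intrinsics on
     top of the vec_init builtins above, roughly:

	extern __inline __m64
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
	_mm_set_pi32 (int __i1, int __i0)
	{
	  return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
	}
  */
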
27566 /* Access to the vec_extract patterns. */
27567 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27568 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27569 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27570 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27571 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27572 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27573 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27574 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27575 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27576 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27577
27578 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27579 "__builtin_ia32_vec_ext_v4hi",
27580 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27581
27582 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27583 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27584
27585 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27586 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
27587
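  /* Illustrative only: the element-extraction intrinsics wrap the
     vec_ext builtins above, e.g. in emmintrin.h roughly:

	extern __inline int
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
	_mm_extract_epi16 (__m128i const __A, int const __N)
	{
	  return (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi) __A,
							       __N);
	}
  */
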
27588 /* Access to the vec_set patterns. */
27589 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27590 "__builtin_ia32_vec_set_v2di",
27591 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27592
27593 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27594 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27595
27596 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27597 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27598
27599 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27600 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27601
27602 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27603 "__builtin_ia32_vec_set_v4hi",
27604 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27605
27606 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27607 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
27608
27609 /* Add FMA4 multi-arg instructions. */
27610 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27611 {
27612 if (d->name == 0)
27613 continue;
27614
27615 ftype = (enum ix86_builtin_func_type) d->flag;
27616 def_builtin_const (d->mask, d->name, ftype, d->code);
27617 }
27618 }
27619
27620 /* Internal method for ix86_init_builtins. */
27621
27622 static void
27623 ix86_init_builtins_va_builtins_abi (void)
27624 {
27625 tree ms_va_ref, sysv_va_ref;
27626 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
27627 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
27628 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
27629 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
27630
27631 if (!TARGET_64BIT)
27632 return;
27633 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
27634 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
27635 ms_va_ref = build_reference_type (ms_va_list_type_node);
27636 sysv_va_ref =
27637 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
27638
27639 fnvoid_va_end_ms =
27640 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27641 fnvoid_va_start_ms =
27642 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27643 fnvoid_va_end_sysv =
27644 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
27645 fnvoid_va_start_sysv =
27646 build_varargs_function_type_list (void_type_node, sysv_va_ref,
27647 NULL_TREE);
27648 fnvoid_va_copy_ms =
27649 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
27650 NULL_TREE);
27651 fnvoid_va_copy_sysv =
27652 build_function_type_list (void_type_node, sysv_va_ref,
27653 sysv_va_ref, NULL_TREE);
27654
27655 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
27656 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
27657 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
27658 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
27659 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
27660 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
27661 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
27662 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27663 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
27664 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27665 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
27666 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27667 }
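
/* Illustrative only: with the builtins registered above, 64-bit code can
   use the "foreign" ABI's varargs explicitly, e.g. (a sketch):

	__attribute__((ms_abi)) void
	f (int last, ...)
	{
	  __builtin_ms_va_list ap;
	  __builtin_ms_va_start (ap, last);
	  int i = __builtin_va_arg (ap, int);
	  __builtin_ms_va_end (ap);
	  (void) i;
	}
*/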
27668
27669 static void
27670 ix86_init_builtin_types (void)
27671 {
27672 tree float128_type_node, float80_type_node;
27673
27674 /* The __float80 type. */
27675 float80_type_node = long_double_type_node;
27676 if (TYPE_MODE (float80_type_node) != XFmode)
27677 {
27678 /* The __float80 type. */
27679 float80_type_node = make_node (REAL_TYPE);
27680
27681 TYPE_PRECISION (float80_type_node) = 80;
27682 layout_type (float80_type_node);
27683 }
27684 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
27685
27686 /* The __float128 type. */
27687 float128_type_node = make_node (REAL_TYPE);
27688 TYPE_PRECISION (float128_type_node) = 128;
27689 layout_type (float128_type_node);
27690 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
27691
27692 /* This macro is built by i386-builtin-types.awk. */
27693 DEFINE_BUILTIN_PRIMITIVE_TYPES;
27694 }
27695
27696 static void
27697 ix86_init_builtins (void)
27698 {
27699 tree t;
27700
27701 ix86_init_builtin_types ();
27702
27703 /* TFmode support builtins. */
27704 def_builtin_const (0, "__builtin_infq",
27705 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
27706 def_builtin_const (0, "__builtin_huge_valq",
27707 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
27708
27709 /* We will expand them to a normal call if SSE2 isn't available, since
27710 they are used by libgcc. */
27711 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
27712 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
27713 BUILT_IN_MD, "__fabstf2", NULL_TREE);
27714 TREE_READONLY (t) = 1;
27715 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
27716
27717 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
27718 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
27719 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
27720 TREE_READONLY (t) = 1;
27721 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
27722
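  /* Illustrative only: at the C level these builtins appear as, e.g.:

	__float128 inf = __builtin_infq ();
	__float128 mag = __builtin_fabsq (x);
	__float128 r   = __builtin_copysignq (mag, sign);

     Without SSE2, __builtin_fabsq and __builtin_copysignq fall back to
     ordinary calls to the library functions named above (__fabstf2 and
     __copysigntf3).  */
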
27723 ix86_init_tm_builtins ();
27724 ix86_init_mmx_sse_builtins ();
27725
27726 if (TARGET_LP64)
27727 ix86_init_builtins_va_builtins_abi ();
27728
27729 #ifdef SUBTARGET_INIT_BUILTINS
27730 SUBTARGET_INIT_BUILTINS;
27731 #endif
27732 }
27733
27734 /* Return the ix86 builtin for CODE. */
27735
27736 static tree
27737 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
27738 {
27739 if (code >= IX86_BUILTIN_MAX)
27740 return error_mark_node;
27741
27742 return ix86_builtins[code];
27743 }
27744
27745 /* Errors in the source file can cause expand_expr to return const0_rtx
27746 where we expect a vector. To avoid crashing, use one of the vector
27747 clear instructions. */
27748 static rtx
27749 safe_vector_operand (rtx x, enum machine_mode mode)
27750 {
27751 if (x == const0_rtx)
27752 x = CONST0_RTX (mode);
27753 return x;
27754 }
27755
27756 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
27757
27758 static rtx
27759 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
27760 {
27761 rtx pat;
27762 tree arg0 = CALL_EXPR_ARG (exp, 0);
27763 tree arg1 = CALL_EXPR_ARG (exp, 1);
27764 rtx op0 = expand_normal (arg0);
27765 rtx op1 = expand_normal (arg1);
27766 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27767 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27768 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
27769
27770 if (VECTOR_MODE_P (mode0))
27771 op0 = safe_vector_operand (op0, mode0);
27772 if (VECTOR_MODE_P (mode1))
27773 op1 = safe_vector_operand (op1, mode1);
27774
27775 if (optimize || !target
27776 || GET_MODE (target) != tmode
27777 || !insn_data[icode].operand[0].predicate (target, tmode))
27778 target = gen_reg_rtx (tmode);
27779
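  /* Some builtins pass a 32-bit int where the insn expects a TImode
     operand.  Load the value into the low element of a zeroed V4SI
     register and reinterpret it as TImode.  */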
27780 if (GET_MODE (op1) == SImode && mode1 == TImode)
27781 {
27782 rtx x = gen_reg_rtx (V4SImode);
27783 emit_insn (gen_sse2_loadd (x, op1));
27784 op1 = gen_lowpart (TImode, x);
27785 }
27786
27787 if (!insn_data[icode].operand[1].predicate (op0, mode0))
27788 op0 = copy_to_mode_reg (mode0, op0);
27789 if (!insn_data[icode].operand[2].predicate (op1, mode1))
27790 op1 = copy_to_mode_reg (mode1, op1);
27791
27792 pat = GEN_FCN (icode) (target, op0, op1);
27793 if (! pat)
27794 return 0;
27795
27796 emit_insn (pat);
27797
27798 return target;
27799 }
27800
27801 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
27802
27803 static rtx
27804 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
27805 enum ix86_builtin_func_type m_type,
27806 enum rtx_code sub_code)
27807 {
27808 rtx pat;
27809 int i;
27810 int nargs;
27811 bool comparison_p = false;
27812 bool tf_p = false;
27813 bool last_arg_constant = false;
27814 int num_memory = 0;
27815 struct {
27816 rtx op;
27817 enum machine_mode mode;
27818 } args[4];
27819
27820 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27821
27822 switch (m_type)
27823 {
27824 case MULTI_ARG_4_DF2_DI_I:
27825 case MULTI_ARG_4_DF2_DI_I1:
27826 case MULTI_ARG_4_SF2_SI_I:
27827 case MULTI_ARG_4_SF2_SI_I1:
27828 nargs = 4;
27829 last_arg_constant = true;
27830 break;
27831
27832 case MULTI_ARG_3_SF:
27833 case MULTI_ARG_3_DF:
27834 case MULTI_ARG_3_SF2:
27835 case MULTI_ARG_3_DF2:
27836 case MULTI_ARG_3_DI:
27837 case MULTI_ARG_3_SI:
27838 case MULTI_ARG_3_SI_DI:
27839 case MULTI_ARG_3_HI:
27840 case MULTI_ARG_3_HI_SI:
27841 case MULTI_ARG_3_QI:
27842 case MULTI_ARG_3_DI2:
27843 case MULTI_ARG_3_SI2:
27844 case MULTI_ARG_3_HI2:
27845 case MULTI_ARG_3_QI2:
27846 nargs = 3;
27847 break;
27848
27849 case MULTI_ARG_2_SF:
27850 case MULTI_ARG_2_DF:
27851 case MULTI_ARG_2_DI:
27852 case MULTI_ARG_2_SI:
27853 case MULTI_ARG_2_HI:
27854 case MULTI_ARG_2_QI:
27855 nargs = 2;
27856 break;
27857
27858 case MULTI_ARG_2_DI_IMM:
27859 case MULTI_ARG_2_SI_IMM:
27860 case MULTI_ARG_2_HI_IMM:
27861 case MULTI_ARG_2_QI_IMM:
27862 nargs = 2;
27863 last_arg_constant = true;
27864 break;
27865
27866 case MULTI_ARG_1_SF:
27867 case MULTI_ARG_1_DF:
27868 case MULTI_ARG_1_SF2:
27869 case MULTI_ARG_1_DF2:
27870 case MULTI_ARG_1_DI:
27871 case MULTI_ARG_1_SI:
27872 case MULTI_ARG_1_HI:
27873 case MULTI_ARG_1_QI:
27874 case MULTI_ARG_1_SI_DI:
27875 case MULTI_ARG_1_HI_DI:
27876 case MULTI_ARG_1_HI_SI:
27877 case MULTI_ARG_1_QI_DI:
27878 case MULTI_ARG_1_QI_SI:
27879 case MULTI_ARG_1_QI_HI:
27880 nargs = 1;
27881 break;
27882
27883 case MULTI_ARG_2_DI_CMP:
27884 case MULTI_ARG_2_SI_CMP:
27885 case MULTI_ARG_2_HI_CMP:
27886 case MULTI_ARG_2_QI_CMP:
27887 nargs = 2;
27888 comparison_p = true;
27889 break;
27890
27891 case MULTI_ARG_2_SF_TF:
27892 case MULTI_ARG_2_DF_TF:
27893 case MULTI_ARG_2_DI_TF:
27894 case MULTI_ARG_2_SI_TF:
27895 case MULTI_ARG_2_HI_TF:
27896 case MULTI_ARG_2_QI_TF:
27897 nargs = 2;
27898 tf_p = true;
27899 break;
27900
27901 default:
27902 gcc_unreachable ();
27903 }
27904
27905 if (optimize || !target
27906 || GET_MODE (target) != tmode
27907 || !insn_data[icode].operand[0].predicate (target, tmode))
27908 target = gen_reg_rtx (tmode);
27909
27910 gcc_assert (nargs <= 4);
27911
27912 for (i = 0; i < nargs; i++)
27913 {
27914 tree arg = CALL_EXPR_ARG (exp, i);
27915 rtx op = expand_normal (arg);
27916 int adjust = (comparison_p) ? 1 : 0;
27917 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
27918
27919 if (last_arg_constant && i == nargs - 1)
27920 {
27921 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
27922 {
27923 enum insn_code new_icode = icode;
27924 switch (icode)
27925 {
27926 case CODE_FOR_xop_vpermil2v2df3:
27927 case CODE_FOR_xop_vpermil2v4sf3:
27928 case CODE_FOR_xop_vpermil2v4df3:
27929 case CODE_FOR_xop_vpermil2v8sf3:
27930 error ("the last argument must be a 2-bit immediate");
27931 return gen_reg_rtx (tmode);
27932 case CODE_FOR_xop_rotlv2di3:
27933 new_icode = CODE_FOR_rotlv2di3;
27934 goto xop_rotl;
27935 case CODE_FOR_xop_rotlv4si3:
27936 new_icode = CODE_FOR_rotlv4si3;
27937 goto xop_rotl;
27938 case CODE_FOR_xop_rotlv8hi3:
27939 new_icode = CODE_FOR_rotlv8hi3;
27940 goto xop_rotl;
27941 case CODE_FOR_xop_rotlv16qi3:
27942 new_icode = CODE_FOR_rotlv16qi3;
27943 xop_rotl:
27944 if (CONST_INT_P (op))
27945 {
27946 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
27947 op = GEN_INT (INTVAL (op) & mask);
27948 gcc_checking_assert
27949 (insn_data[icode].operand[i + 1].predicate (op, mode));
27950 }
27951 else
27952 {
27953 gcc_checking_assert
27954 (nargs == 2
27955 && insn_data[new_icode].operand[0].mode == tmode
27956 && insn_data[new_icode].operand[1].mode == tmode
27957 && insn_data[new_icode].operand[2].mode == mode
27958 && insn_data[new_icode].operand[0].predicate
27959 == insn_data[icode].operand[0].predicate
27960 && insn_data[new_icode].operand[1].predicate
27961 == insn_data[icode].operand[1].predicate);
27962 icode = new_icode;
27963 goto non_constant;
27964 }
27965 break;
27966 default:
27967 gcc_unreachable ();
27968 }
27969 }
27970 }
27971 else
27972 {
27973 non_constant:
27974 if (VECTOR_MODE_P (mode))
27975 op = safe_vector_operand (op, mode);
27976
27977 /* If we aren't optimizing, only allow one memory operand to be
27978 generated. */
27979 if (memory_operand (op, mode))
27980 num_memory++;
27981
27982 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
27983
27984 if (optimize
27985 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
27986 || num_memory > 1)
27987 op = force_reg (mode, op);
27988 }
27989
27990 args[i].op = op;
27991 args[i].mode = mode;
27992 }
27993
27994 switch (nargs)
27995 {
27996 case 1:
27997 pat = GEN_FCN (icode) (target, args[0].op);
27998 break;
27999
28000 case 2:
28001 if (tf_p)
28002 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
28003 GEN_INT ((int)sub_code));
28004 else if (! comparison_p)
28005 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28006 else
28007 {
28008 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
28009 args[0].op,
28010 args[1].op);
28011
28012 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
28013 }
28014 break;
28015
28016 case 3:
28017 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28018 break;
28019
28020 case 4:
28021 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
28022 break;
28023
28024 default:
28025 gcc_unreachable ();
28026 }
28027
28028 if (! pat)
28029 return 0;
28030
28031 emit_insn (pat);
28032 return target;
28033 }
28034
28035 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
28036 insns with vec_merge. */
28037
28038 static rtx
28039 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
28040 rtx target)
28041 {
28042 rtx pat;
28043 tree arg0 = CALL_EXPR_ARG (exp, 0);
28044 rtx op1, op0 = expand_normal (arg0);
28045 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28046 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28047
28048 if (optimize || !target
28049 || GET_MODE (target) != tmode
28050 || !insn_data[icode].operand[0].predicate (target, tmode))
28051 target = gen_reg_rtx (tmode);
28052
28053 if (VECTOR_MODE_P (mode0))
28054 op0 = safe_vector_operand (op0, mode0);
28055
28056 if ((optimize && !register_operand (op0, mode0))
28057 || !insn_data[icode].operand[1].predicate (op0, mode0))
28058 op0 = copy_to_mode_reg (mode0, op0);
28059
28060 op1 = op0;
28061 if (!insn_data[icode].operand[2].predicate (op1, mode0))
28062 op1 = copy_to_mode_reg (mode0, op1);
28063
28064 pat = GEN_FCN (icode) (target, op0, op1);
28065 if (! pat)
28066 return 0;
28067 emit_insn (pat);
28068 return target;
28069 }
28070
28071 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
28072
28073 static rtx
28074 ix86_expand_sse_compare (const struct builtin_description *d,
28075 tree exp, rtx target, bool swap)
28076 {
28077 rtx pat;
28078 tree arg0 = CALL_EXPR_ARG (exp, 0);
28079 tree arg1 = CALL_EXPR_ARG (exp, 1);
28080 rtx op0 = expand_normal (arg0);
28081 rtx op1 = expand_normal (arg1);
28082 rtx op2;
28083 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28084 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28085 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28086 enum rtx_code comparison = d->comparison;
28087
28088 if (VECTOR_MODE_P (mode0))
28089 op0 = safe_vector_operand (op0, mode0);
28090 if (VECTOR_MODE_P (mode1))
28091 op1 = safe_vector_operand (op1, mode1);
28092
28093 /* Swap operands if we have a comparison that isn't available in
28094 hardware. */
28095 if (swap)
28096 {
28097 rtx tmp = gen_reg_rtx (mode1);
28098 emit_move_insn (tmp, op1);
28099 op1 = op0;
28100 op0 = tmp;
28101 }
28102
28103 if (optimize || !target
28104 || GET_MODE (target) != tmode
28105 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28106 target = gen_reg_rtx (tmode);
28107
28108 if ((optimize && !register_operand (op0, mode0))
28109 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
28110 op0 = copy_to_mode_reg (mode0, op0);
28111 if ((optimize && !register_operand (op1, mode1))
28112 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
28113 op1 = copy_to_mode_reg (mode1, op1);
28114
28115 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
28116 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28117 if (! pat)
28118 return 0;
28119 emit_insn (pat);
28120 return target;
28121 }
28122
28123 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
28124
28125 static rtx
28126 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
28127 rtx target)
28128 {
28129 rtx pat;
28130 tree arg0 = CALL_EXPR_ARG (exp, 0);
28131 tree arg1 = CALL_EXPR_ARG (exp, 1);
28132 rtx op0 = expand_normal (arg0);
28133 rtx op1 = expand_normal (arg1);
28134 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28135 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28136 enum rtx_code comparison = d->comparison;
28137
28138 if (VECTOR_MODE_P (mode0))
28139 op0 = safe_vector_operand (op0, mode0);
28140 if (VECTOR_MODE_P (mode1))
28141 op1 = safe_vector_operand (op1, mode1);
28142
28143 /* Swap operands if we have a comparison that isn't available in
28144 hardware. */
28145 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
28146 {
28147 rtx tmp = op1;
28148 op1 = op0;
28149 op0 = tmp;
28150 }
28151
28152 target = gen_reg_rtx (SImode);
28153 emit_move_insn (target, const0_rtx);
28154 target = gen_rtx_SUBREG (QImode, target, 0);
28155
28156 if ((optimize && !register_operand (op0, mode0))
28157 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28158 op0 = copy_to_mode_reg (mode0, op0);
28159 if ((optimize && !register_operand (op1, mode1))
28160 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28161 op1 = copy_to_mode_reg (mode1, op1);
28162
28163 pat = GEN_FCN (d->icode) (op0, op1);
28164 if (! pat)
28165 return 0;
28166 emit_insn (pat);
28167 emit_insn (gen_rtx_SET (VOIDmode,
28168 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28169 gen_rtx_fmt_ee (comparison, QImode,
28170 SET_DEST (pat),
28171 const0_rtx)));
28172
28173 return SUBREG_REG (target);
28174 }
28175
28176 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
28177
28178 static rtx
28179 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
28180 rtx target)
28181 {
28182 rtx pat;
28183 tree arg0 = CALL_EXPR_ARG (exp, 0);
28184 rtx op1, op0 = expand_normal (arg0);
28185 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28186 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28187
28188 if (optimize || target == 0
28189 || GET_MODE (target) != tmode
28190 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28191 target = gen_reg_rtx (tmode);
28192
28193 if (VECTOR_MODE_P (mode0))
28194 op0 = safe_vector_operand (op0, mode0);
28195
28196 if ((optimize && !register_operand (op0, mode0))
28197 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28198 op0 = copy_to_mode_reg (mode0, op0);
28199
28200 op1 = GEN_INT (d->comparison);
28201
28202 pat = GEN_FCN (d->icode) (target, op0, op1);
28203 if (! pat)
28204 return 0;
28205 emit_insn (pat);
28206 return target;
28207 }
28208
28209 static rtx
28210 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28211 tree exp, rtx target)
28212 {
28213 rtx pat;
28214 tree arg0 = CALL_EXPR_ARG (exp, 0);
28215 tree arg1 = CALL_EXPR_ARG (exp, 1);
28216 rtx op0 = expand_normal (arg0);
28217 rtx op1 = expand_normal (arg1);
28218 rtx op2;
28219 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28220 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28221 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28222
28223 if (optimize || target == 0
28224 || GET_MODE (target) != tmode
28225 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28226 target = gen_reg_rtx (tmode);
28227
28228 op0 = safe_vector_operand (op0, mode0);
28229 op1 = safe_vector_operand (op1, mode1);
28230
28231 if ((optimize && !register_operand (op0, mode0))
28232 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28233 op0 = copy_to_mode_reg (mode0, op0);
28234 if ((optimize && !register_operand (op1, mode1))
28235 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28236 op1 = copy_to_mode_reg (mode1, op1);
28237
28238 op2 = GEN_INT (d->comparison);
28239
28240 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28241 if (! pat)
28242 return 0;
28243 emit_insn (pat);
28244 return target;
28245 }
28246
28247 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
28248
28249 static rtx
28250 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28251 rtx target)
28252 {
28253 rtx pat;
28254 tree arg0 = CALL_EXPR_ARG (exp, 0);
28255 tree arg1 = CALL_EXPR_ARG (exp, 1);
28256 rtx op0 = expand_normal (arg0);
28257 rtx op1 = expand_normal (arg1);
28258 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28259 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28260 enum rtx_code comparison = d->comparison;
28261
28262 if (VECTOR_MODE_P (mode0))
28263 op0 = safe_vector_operand (op0, mode0);
28264 if (VECTOR_MODE_P (mode1))
28265 op1 = safe_vector_operand (op1, mode1);
28266
28267 target = gen_reg_rtx (SImode);
28268 emit_move_insn (target, const0_rtx);
28269 target = gen_rtx_SUBREG (QImode, target, 0);
28270
28271 if ((optimize && !register_operand (op0, mode0))
28272 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28273 op0 = copy_to_mode_reg (mode0, op0);
28274 if ((optimize && !register_operand (op1, mode1))
28275 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28276 op1 = copy_to_mode_reg (mode1, op1);
28277
28278 pat = GEN_FCN (d->icode) (op0, op1);
28279 if (! pat)
28280 return 0;
28281 emit_insn (pat);
28282 emit_insn (gen_rtx_SET (VOIDmode,
28283 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28284 gen_rtx_fmt_ee (comparison, QImode,
28285 SET_DEST (pat),
28286 const0_rtx)));
28287
28288 return SUBREG_REG (target);
28289 }
28290
28291 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
28292
28293 static rtx
28294 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28295 tree exp, rtx target)
28296 {
28297 rtx pat;
28298 tree arg0 = CALL_EXPR_ARG (exp, 0);
28299 tree arg1 = CALL_EXPR_ARG (exp, 1);
28300 tree arg2 = CALL_EXPR_ARG (exp, 2);
28301 tree arg3 = CALL_EXPR_ARG (exp, 3);
28302 tree arg4 = CALL_EXPR_ARG (exp, 4);
28303 rtx scratch0, scratch1;
28304 rtx op0 = expand_normal (arg0);
28305 rtx op1 = expand_normal (arg1);
28306 rtx op2 = expand_normal (arg2);
28307 rtx op3 = expand_normal (arg3);
28308 rtx op4 = expand_normal (arg4);
28309 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28310
28311 tmode0 = insn_data[d->icode].operand[0].mode;
28312 tmode1 = insn_data[d->icode].operand[1].mode;
28313 modev2 = insn_data[d->icode].operand[2].mode;
28314 modei3 = insn_data[d->icode].operand[3].mode;
28315 modev4 = insn_data[d->icode].operand[4].mode;
28316 modei5 = insn_data[d->icode].operand[5].mode;
28317 modeimm = insn_data[d->icode].operand[6].mode;
28318
28319 if (VECTOR_MODE_P (modev2))
28320 op0 = safe_vector_operand (op0, modev2);
28321 if (VECTOR_MODE_P (modev4))
28322 op2 = safe_vector_operand (op2, modev4);
28323
28324 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28325 op0 = copy_to_mode_reg (modev2, op0);
28326 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28327 op1 = copy_to_mode_reg (modei3, op1);
28328 if ((optimize && !register_operand (op2, modev4))
28329 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28330 op2 = copy_to_mode_reg (modev4, op2);
28331 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28332 op3 = copy_to_mode_reg (modei5, op3);
28333
28334 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28335 {
28336 error ("the fifth argument must be an 8-bit immediate");
28337 return const0_rtx;
28338 }
28339
28340 if (d->code == IX86_BUILTIN_PCMPESTRI128)
28341 {
28342 if (optimize || !target
28343 || GET_MODE (target) != tmode0
28344 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28345 target = gen_reg_rtx (tmode0);
28346
28347 scratch1 = gen_reg_rtx (tmode1);
28348
28349 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
28350 }
28351 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
28352 {
28353 if (optimize || !target
28354 || GET_MODE (target) != tmode1
28355 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28356 target = gen_reg_rtx (tmode1);
28357
28358 scratch0 = gen_reg_rtx (tmode0);
28359
28360 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28361 }
28362 else
28363 {
28364 gcc_assert (d->flag);
28365
28366 scratch0 = gen_reg_rtx (tmode0);
28367 scratch1 = gen_reg_rtx (tmode1);
28368
28369 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28370 }
28371
28372 if (! pat)
28373 return 0;
28374
28375 emit_insn (pat);
28376
28377 if (d->flag)
28378 {
28379 target = gen_reg_rtx (SImode);
28380 emit_move_insn (target, const0_rtx);
28381 target = gen_rtx_SUBREG (QImode, target, 0);
28382
28383 emit_insn
28384 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28385 gen_rtx_fmt_ee (EQ, QImode,
28386 gen_rtx_REG ((enum machine_mode) d->flag,
28387 FLAGS_REG),
28388 const0_rtx)));
28389 return SUBREG_REG (target);
28390 }
28391 else
28392 return target;
28393 }
28394
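/* Illustrative only: smmintrin.h wraps a pcmpestr builtin roughly as
   below; the final argument must be an 8-bit compile-time constant,
   which is why the expander above validates operand 6 and reports an
   error otherwise.

	extern __inline int
	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
	_mm_cmpestri (__m128i __X, int __LX, __m128i __Y, int __LY,
		      const int __M)
	{
	  return __builtin_ia32_pcmpestri128 ((__v16qi) __X, __LX,
					      (__v16qi) __Y, __LY, __M);
	}
*/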
28395
28396 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
28397
28398 static rtx
28399 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28400 tree exp, rtx target)
28401 {
28402 rtx pat;
28403 tree arg0 = CALL_EXPR_ARG (exp, 0);
28404 tree arg1 = CALL_EXPR_ARG (exp, 1);
28405 tree arg2 = CALL_EXPR_ARG (exp, 2);
28406 rtx scratch0, scratch1;
28407 rtx op0 = expand_normal (arg0);
28408 rtx op1 = expand_normal (arg1);
28409 rtx op2 = expand_normal (arg2);
28410 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28411
28412 tmode0 = insn_data[d->icode].operand[0].mode;
28413 tmode1 = insn_data[d->icode].operand[1].mode;
28414 modev2 = insn_data[d->icode].operand[2].mode;
28415 modev3 = insn_data[d->icode].operand[3].mode;
28416 modeimm = insn_data[d->icode].operand[4].mode;
28417
28418 if (VECTOR_MODE_P (modev2))
28419 op0 = safe_vector_operand (op0, modev2);
28420 if (VECTOR_MODE_P (modev3))
28421 op1 = safe_vector_operand (op1, modev3);
28422
28423 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28424 op0 = copy_to_mode_reg (modev2, op0);
28425 if ((optimize && !register_operand (op1, modev3))
28426 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28427 op1 = copy_to_mode_reg (modev3, op1);
28428
28429 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28430 {
28431 error ("the third argument must be an 8-bit immediate");
28432 return const0_rtx;
28433 }
28434
28435 if (d->code == IX86_BUILTIN_PCMPISTRI128)
28436 {
28437 if (optimize || !target
28438 || GET_MODE (target) != tmode0
28439 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28440 target = gen_reg_rtx (tmode0);
28441
28442 scratch1 = gen_reg_rtx (tmode1);
28443
28444 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28445 }
28446 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28447 {
28448 if (optimize || !target
28449 || GET_MODE (target) != tmode1
28450 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28451 target = gen_reg_rtx (tmode1);
28452
28453 scratch0 = gen_reg_rtx (tmode0);
28454
28455 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28456 }
28457 else
28458 {
28459 gcc_assert (d->flag);
28460
28461 scratch0 = gen_reg_rtx (tmode0);
28462 scratch1 = gen_reg_rtx (tmode1);
28463
28464 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28465 }
28466
28467 if (! pat)
28468 return 0;
28469
28470 emit_insn (pat);
28471
28472 if (d->flag)
28473 {
28474 target = gen_reg_rtx (SImode);
28475 emit_move_insn (target, const0_rtx);
28476 target = gen_rtx_SUBREG (QImode, target, 0);
28477
28478 emit_insn
28479 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28480 gen_rtx_fmt_ee (EQ, QImode,
28481 gen_rtx_REG ((enum machine_mode) d->flag,
28482 FLAGS_REG),
28483 const0_rtx)));
28484 return SUBREG_REG (target);
28485 }
28486 else
28487 return target;
28488 }
28489
28490 /* Subroutine of ix86_expand_builtin to take care of insns with
28491 a variable number of operands. */
28492
28493 static rtx
28494 ix86_expand_args_builtin (const struct builtin_description *d,
28495 tree exp, rtx target)
28496 {
28497 rtx pat, real_target;
28498 unsigned int i, nargs;
28499 unsigned int nargs_constant = 0;
28500 int num_memory = 0;
28501 struct
28502 {
28503 rtx op;
28504 enum machine_mode mode;
28505 } args[4];
28506 bool last_arg_count = false;
28507 enum insn_code icode = d->icode;
28508 const struct insn_data_d *insn_p = &insn_data[icode];
28509 enum machine_mode tmode = insn_p->operand[0].mode;
28510 enum machine_mode rmode = VOIDmode;
28511 bool swap = false;
28512 enum rtx_code comparison = d->comparison;
28513
28514 switch ((enum ix86_builtin_func_type) d->flag)
28515 {
28516 case V2DF_FTYPE_V2DF_ROUND:
28517 case V4DF_FTYPE_V4DF_ROUND:
28518 case V4SF_FTYPE_V4SF_ROUND:
28519 case V8SF_FTYPE_V8SF_ROUND:
28520 case V4SI_FTYPE_V4SF_ROUND:
28521 case V8SI_FTYPE_V8SF_ROUND:
28522 return ix86_expand_sse_round (d, exp, target);
28523 case V4SI_FTYPE_V2DF_V2DF_ROUND:
28524 case V8SI_FTYPE_V4DF_V4DF_ROUND:
28525 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
28526 case INT_FTYPE_V8SF_V8SF_PTEST:
28527 case INT_FTYPE_V4DI_V4DI_PTEST:
28528 case INT_FTYPE_V4DF_V4DF_PTEST:
28529 case INT_FTYPE_V4SF_V4SF_PTEST:
28530 case INT_FTYPE_V2DI_V2DI_PTEST:
28531 case INT_FTYPE_V2DF_V2DF_PTEST:
28532 return ix86_expand_sse_ptest (d, exp, target);
28533 case FLOAT128_FTYPE_FLOAT128:
28534 case FLOAT_FTYPE_FLOAT:
28535 case INT_FTYPE_INT:
28536 case UINT64_FTYPE_INT:
28537 case UINT16_FTYPE_UINT16:
28538 case INT64_FTYPE_INT64:
28539 case INT64_FTYPE_V4SF:
28540 case INT64_FTYPE_V2DF:
28541 case INT_FTYPE_V16QI:
28542 case INT_FTYPE_V8QI:
28543 case INT_FTYPE_V8SF:
28544 case INT_FTYPE_V4DF:
28545 case INT_FTYPE_V4SF:
28546 case INT_FTYPE_V2DF:
28547 case INT_FTYPE_V32QI:
28548 case V16QI_FTYPE_V16QI:
28549 case V8SI_FTYPE_V8SF:
28550 case V8SI_FTYPE_V4SI:
28551 case V8HI_FTYPE_V8HI:
28552 case V8HI_FTYPE_V16QI:
28553 case V8QI_FTYPE_V8QI:
28554 case V8SF_FTYPE_V8SF:
28555 case V8SF_FTYPE_V8SI:
28556 case V8SF_FTYPE_V4SF:
28557 case V8SF_FTYPE_V8HI:
28558 case V4SI_FTYPE_V4SI:
28559 case V4SI_FTYPE_V16QI:
28560 case V4SI_FTYPE_V4SF:
28561 case V4SI_FTYPE_V8SI:
28562 case V4SI_FTYPE_V8HI:
28563 case V4SI_FTYPE_V4DF:
28564 case V4SI_FTYPE_V2DF:
28565 case V4HI_FTYPE_V4HI:
28566 case V4DF_FTYPE_V4DF:
28567 case V4DF_FTYPE_V4SI:
28568 case V4DF_FTYPE_V4SF:
28569 case V4DF_FTYPE_V2DF:
28570 case V4SF_FTYPE_V4SF:
28571 case V4SF_FTYPE_V4SI:
28572 case V4SF_FTYPE_V8SF:
28573 case V4SF_FTYPE_V4DF:
28574 case V4SF_FTYPE_V8HI:
28575 case V4SF_FTYPE_V2DF:
28576 case V2DI_FTYPE_V2DI:
28577 case V2DI_FTYPE_V16QI:
28578 case V2DI_FTYPE_V8HI:
28579 case V2DI_FTYPE_V4SI:
28580 case V2DF_FTYPE_V2DF:
28581 case V2DF_FTYPE_V4SI:
28582 case V2DF_FTYPE_V4DF:
28583 case V2DF_FTYPE_V4SF:
28584 case V2DF_FTYPE_V2SI:
28585 case V2SI_FTYPE_V2SI:
28586 case V2SI_FTYPE_V4SF:
28587 case V2SI_FTYPE_V2SF:
28588 case V2SI_FTYPE_V2DF:
28589 case V2SF_FTYPE_V2SF:
28590 case V2SF_FTYPE_V2SI:
28591 case V32QI_FTYPE_V32QI:
28592 case V32QI_FTYPE_V16QI:
28593 case V16HI_FTYPE_V16HI:
28594 case V16HI_FTYPE_V8HI:
28595 case V8SI_FTYPE_V8SI:
28596 case V16HI_FTYPE_V16QI:
28597 case V8SI_FTYPE_V16QI:
28598 case V4DI_FTYPE_V16QI:
28599 case V8SI_FTYPE_V8HI:
28600 case V4DI_FTYPE_V8HI:
28601 case V4DI_FTYPE_V4SI:
28602 case V4DI_FTYPE_V2DI:
28603 nargs = 1;
28604 break;
28605 case V4SF_FTYPE_V4SF_VEC_MERGE:
28606 case V2DF_FTYPE_V2DF_VEC_MERGE:
28607 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
28608 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
28609 case V16QI_FTYPE_V16QI_V16QI:
28610 case V16QI_FTYPE_V8HI_V8HI:
28611 case V8QI_FTYPE_V8QI_V8QI:
28612 case V8QI_FTYPE_V4HI_V4HI:
28613 case V8HI_FTYPE_V8HI_V8HI:
28614 case V8HI_FTYPE_V16QI_V16QI:
28615 case V8HI_FTYPE_V4SI_V4SI:
28616 case V8SF_FTYPE_V8SF_V8SF:
28617 case V8SF_FTYPE_V8SF_V8SI:
28618 case V4SI_FTYPE_V4SI_V4SI:
28619 case V4SI_FTYPE_V8HI_V8HI:
28620 case V4SI_FTYPE_V4SF_V4SF:
28621 case V4SI_FTYPE_V2DF_V2DF:
28622 case V4HI_FTYPE_V4HI_V4HI:
28623 case V4HI_FTYPE_V8QI_V8QI:
28624 case V4HI_FTYPE_V2SI_V2SI:
28625 case V4DF_FTYPE_V4DF_V4DF:
28626 case V4DF_FTYPE_V4DF_V4DI:
28627 case V4SF_FTYPE_V4SF_V4SF:
28628 case V4SF_FTYPE_V4SF_V4SI:
28629 case V4SF_FTYPE_V4SF_V2SI:
28630 case V4SF_FTYPE_V4SF_V2DF:
28631 case V4SF_FTYPE_V4SF_DI:
28632 case V4SF_FTYPE_V4SF_SI:
28633 case V2DI_FTYPE_V2DI_V2DI:
28634 case V2DI_FTYPE_V16QI_V16QI:
28635 case V2DI_FTYPE_V4SI_V4SI:
28636 case V2DI_FTYPE_V2DI_V16QI:
28637 case V2DI_FTYPE_V2DF_V2DF:
28638 case V2SI_FTYPE_V2SI_V2SI:
28639 case V2SI_FTYPE_V4HI_V4HI:
28640 case V2SI_FTYPE_V2SF_V2SF:
28641 case V2DF_FTYPE_V2DF_V2DF:
28642 case V2DF_FTYPE_V2DF_V4SF:
28643 case V2DF_FTYPE_V2DF_V2DI:
28644 case V2DF_FTYPE_V2DF_DI:
28645 case V2DF_FTYPE_V2DF_SI:
28646 case V2SF_FTYPE_V2SF_V2SF:
28647 case V1DI_FTYPE_V1DI_V1DI:
28648 case V1DI_FTYPE_V8QI_V8QI:
28649 case V1DI_FTYPE_V2SI_V2SI:
28650 case V32QI_FTYPE_V16HI_V16HI:
28651 case V16HI_FTYPE_V8SI_V8SI:
28652 case V32QI_FTYPE_V32QI_V32QI:
28653 case V16HI_FTYPE_V32QI_V32QI:
28654 case V16HI_FTYPE_V16HI_V16HI:
28655 case V8SI_FTYPE_V4DF_V4DF:
28656 case V8SI_FTYPE_V8SI_V8SI:
28657 case V8SI_FTYPE_V16HI_V16HI:
28658 case V4DI_FTYPE_V4DI_V4DI:
28659 case V4DI_FTYPE_V8SI_V8SI:
28660 if (comparison == UNKNOWN)
28661 return ix86_expand_binop_builtin (icode, exp, target);
28662 nargs = 2;
28663 break;
28664 case V4SF_FTYPE_V4SF_V4SF_SWAP:
28665 case V2DF_FTYPE_V2DF_V2DF_SWAP:
28666 gcc_assert (comparison != UNKNOWN);
28667 nargs = 2;
28668 swap = true;
28669 break;
28670 case V16HI_FTYPE_V16HI_V8HI_COUNT:
28671 case V16HI_FTYPE_V16HI_SI_COUNT:
28672 case V8SI_FTYPE_V8SI_V4SI_COUNT:
28673 case V8SI_FTYPE_V8SI_SI_COUNT:
28674 case V4DI_FTYPE_V4DI_V2DI_COUNT:
28675 case V4DI_FTYPE_V4DI_INT_COUNT:
28676 case V8HI_FTYPE_V8HI_V8HI_COUNT:
28677 case V8HI_FTYPE_V8HI_SI_COUNT:
28678 case V4SI_FTYPE_V4SI_V4SI_COUNT:
28679 case V4SI_FTYPE_V4SI_SI_COUNT:
28680 case V4HI_FTYPE_V4HI_V4HI_COUNT:
28681 case V4HI_FTYPE_V4HI_SI_COUNT:
28682 case V2DI_FTYPE_V2DI_V2DI_COUNT:
28683 case V2DI_FTYPE_V2DI_SI_COUNT:
28684 case V2SI_FTYPE_V2SI_V2SI_COUNT:
28685 case V2SI_FTYPE_V2SI_SI_COUNT:
28686 case V1DI_FTYPE_V1DI_V1DI_COUNT:
28687 case V1DI_FTYPE_V1DI_SI_COUNT:
28688 nargs = 2;
28689 last_arg_count = true;
28690 break;
28691 case UINT64_FTYPE_UINT64_UINT64:
28692 case UINT_FTYPE_UINT_UINT:
28693 case UINT_FTYPE_UINT_USHORT:
28694 case UINT_FTYPE_UINT_UCHAR:
28695 case UINT16_FTYPE_UINT16_INT:
28696 case UINT8_FTYPE_UINT8_INT:
28697 nargs = 2;
28698 break;
28699 case V2DI_FTYPE_V2DI_INT_CONVERT:
28700 nargs = 2;
28701 rmode = V1TImode;
28702 nargs_constant = 1;
28703 break;
28704 case V4DI_FTYPE_V4DI_INT_CONVERT:
28705 nargs = 2;
28706 rmode = V2TImode;
28707 nargs_constant = 1;
28708 break;
28709 case V8HI_FTYPE_V8HI_INT:
28710 case V8HI_FTYPE_V8SF_INT:
28711 case V8HI_FTYPE_V4SF_INT:
28712 case V8SF_FTYPE_V8SF_INT:
28713 case V4SI_FTYPE_V4SI_INT:
28714 case V4SI_FTYPE_V8SI_INT:
28715 case V4HI_FTYPE_V4HI_INT:
28716 case V4DF_FTYPE_V4DF_INT:
28717 case V4SF_FTYPE_V4SF_INT:
28718 case V4SF_FTYPE_V8SF_INT:
28719 case V2DI_FTYPE_V2DI_INT:
28720 case V2DF_FTYPE_V2DF_INT:
28721 case V2DF_FTYPE_V4DF_INT:
28722 case V16HI_FTYPE_V16HI_INT:
28723 case V8SI_FTYPE_V8SI_INT:
28724 case V4DI_FTYPE_V4DI_INT:
28725 case V2DI_FTYPE_V4DI_INT:
28726 nargs = 2;
28727 nargs_constant = 1;
28728 break;
28729 case V16QI_FTYPE_V16QI_V16QI_V16QI:
28730 case V8SF_FTYPE_V8SF_V8SF_V8SF:
28731 case V4DF_FTYPE_V4DF_V4DF_V4DF:
28732 case V4SF_FTYPE_V4SF_V4SF_V4SF:
28733 case V2DF_FTYPE_V2DF_V2DF_V2DF:
28734 case V32QI_FTYPE_V32QI_V32QI_V32QI:
28735 nargs = 3;
28736 break;
28737 case V32QI_FTYPE_V32QI_V32QI_INT:
28738 case V16HI_FTYPE_V16HI_V16HI_INT:
28739 case V16QI_FTYPE_V16QI_V16QI_INT:
28740 case V4DI_FTYPE_V4DI_V4DI_INT:
28741 case V8HI_FTYPE_V8HI_V8HI_INT:
28742 case V8SI_FTYPE_V8SI_V8SI_INT:
28743 case V8SI_FTYPE_V8SI_V4SI_INT:
28744 case V8SF_FTYPE_V8SF_V8SF_INT:
28745 case V8SF_FTYPE_V8SF_V4SF_INT:
28746 case V4SI_FTYPE_V4SI_V4SI_INT:
28747 case V4DF_FTYPE_V4DF_V4DF_INT:
28748 case V4DF_FTYPE_V4DF_V2DF_INT:
28749 case V4SF_FTYPE_V4SF_V4SF_INT:
28750 case V2DI_FTYPE_V2DI_V2DI_INT:
28751 case V4DI_FTYPE_V4DI_V2DI_INT:
28752 case V2DF_FTYPE_V2DF_V2DF_INT:
28753 nargs = 3;
28754 nargs_constant = 1;
28755 break;
28756 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
28757 nargs = 3;
28758 rmode = V4DImode;
28759 nargs_constant = 1;
28760 break;
28761 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
28762 nargs = 3;
28763 rmode = V2DImode;
28764 nargs_constant = 1;
28765 break;
28766 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
28767 nargs = 3;
28768 rmode = DImode;
28769 nargs_constant = 1;
28770 break;
28771 case V2DI_FTYPE_V2DI_UINT_UINT:
28772 nargs = 3;
28773 nargs_constant = 2;
28774 break;
28775 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
28776 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
28777 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
28778 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
28779 nargs = 4;
28780 nargs_constant = 1;
28781 break;
28782 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
28783 nargs = 4;
28784 nargs_constant = 2;
28785 break;
28786 default:
28787 gcc_unreachable ();
28788 }
28789
28790 gcc_assert (nargs <= ARRAY_SIZE (args));
28791
28792 if (comparison != UNKNOWN)
28793 {
28794 gcc_assert (nargs == 2);
28795 return ix86_expand_sse_compare (d, exp, target, swap);
28796 }
28797
28798 if (rmode == VOIDmode || rmode == tmode)
28799 {
28800 if (optimize
28801 || target == 0
28802 || GET_MODE (target) != tmode
28803 || !insn_p->operand[0].predicate (target, tmode))
28804 target = gen_reg_rtx (tmode);
28805 real_target = target;
28806 }
28807 else
28808 {
28809 target = gen_reg_rtx (rmode);
28810 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
28811 }
28812
28813 for (i = 0; i < nargs; i++)
28814 {
28815 tree arg = CALL_EXPR_ARG (exp, i);
28816 rtx op = expand_normal (arg);
28817 enum machine_mode mode = insn_p->operand[i + 1].mode;
28818 bool match = insn_p->operand[i + 1].predicate (op, mode);
28819
28820 if (last_arg_count && (i + 1) == nargs)
28821 {
28822 /* SIMD shift insns take either an 8-bit immediate or a
28823 register as the count, but the builtin functions take an int.
28824 If the count doesn't match, we put it in a register. */
28825 if (!match)
28826 {
28827 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
28828 if (!insn_p->operand[i + 1].predicate (op, mode))
28829 op = copy_to_reg (op);
28830 }
28831 }
28832 else if ((nargs - i) <= nargs_constant)
28833 {
28834 if (!match)
28835 switch (icode)
28836 {
28837 case CODE_FOR_avx2_inserti128:
28838 case CODE_FOR_avx2_extracti128:
28839 error ("the last argument must be a 1-bit immediate");
28840 return const0_rtx;
28841
28842 case CODE_FOR_sse4_1_roundsd:
28843 case CODE_FOR_sse4_1_roundss:
28844
28845 case CODE_FOR_sse4_1_roundpd:
28846 case CODE_FOR_sse4_1_roundps:
28847 case CODE_FOR_avx_roundpd256:
28848 case CODE_FOR_avx_roundps256:
28849
28850 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
28851 case CODE_FOR_sse4_1_roundps_sfix:
28852 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
28853 case CODE_FOR_avx_roundps_sfix256:
28854
28855 case CODE_FOR_sse4_1_blendps:
28856 case CODE_FOR_avx_blendpd256:
28857 case CODE_FOR_avx_vpermilv4df:
28858 error ("the last argument must be a 4-bit immediate");
28859 return const0_rtx;
28860
28861 case CODE_FOR_sse4_1_blendpd:
28862 case CODE_FOR_avx_vpermilv2df:
28863 case CODE_FOR_xop_vpermil2v2df3:
28864 case CODE_FOR_xop_vpermil2v4sf3:
28865 case CODE_FOR_xop_vpermil2v4df3:
28866 case CODE_FOR_xop_vpermil2v8sf3:
28867 error ("the last argument must be a 2-bit immediate");
28868 return const0_rtx;
28869
28870 case CODE_FOR_avx_vextractf128v4df:
28871 case CODE_FOR_avx_vextractf128v8sf:
28872 case CODE_FOR_avx_vextractf128v8si:
28873 case CODE_FOR_avx_vinsertf128v4df:
28874 case CODE_FOR_avx_vinsertf128v8sf:
28875 case CODE_FOR_avx_vinsertf128v8si:
28876 error ("the last argument must be a 1-bit immediate");
28877 return const0_rtx;
28878
28879 case CODE_FOR_avx_vmcmpv2df3:
28880 case CODE_FOR_avx_vmcmpv4sf3:
28881 case CODE_FOR_avx_cmpv2df3:
28882 case CODE_FOR_avx_cmpv4sf3:
28883 case CODE_FOR_avx_cmpv4df3:
28884 case CODE_FOR_avx_cmpv8sf3:
28885 error ("the last argument must be a 5-bit immediate");
28886 return const0_rtx;
28887
28888 default:
28889 switch (nargs_constant)
28890 {
28891 case 2:
28892 if ((nargs - i) == nargs_constant)
28893 {
28894 error ("the next to last argument must be an 8-bit immediate");
28895 break;
28896 }
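	      /* FALLTHRU */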
28897 case 1:
28898 error ("the last argument must be an 8-bit immediate");
28899 break;
28900 default:
28901 gcc_unreachable ();
28902 }
28903 return const0_rtx;
28904 }
28905 }
28906 else
28907 {
28908 if (VECTOR_MODE_P (mode))
28909 op = safe_vector_operand (op, mode);
28910
28911 /* If we aren't optimizing, only allow one memory operand to
28912 be generated. */
28913 if (memory_operand (op, mode))
28914 num_memory++;
28915
28916 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
28917 {
28918 if (optimize || !match || num_memory > 1)
28919 op = copy_to_mode_reg (mode, op);
28920 }
28921 else
28922 {
28923 op = copy_to_reg (op);
28924 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
28925 }
28926 }
28927
28928 args[i].op = op;
28929 args[i].mode = mode;
28930 }
28931
28932 switch (nargs)
28933 {
28934 case 1:
28935 pat = GEN_FCN (icode) (real_target, args[0].op);
28936 break;
28937 case 2:
28938 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
28939 break;
28940 case 3:
28941 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28942 args[2].op);
28943 break;
28944 case 4:
28945 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28946 args[2].op, args[3].op);
28947 break;
28948 default:
28949 gcc_unreachable ();
28950 }
28951
28952 if (! pat)
28953 return 0;
28954
28955 emit_insn (pat);
28956 return target;
28957 }
28958
28959 /* Subroutine of ix86_expand_builtin to take care of special insns
28960 with a variable number of operands. */
28961
28962 static rtx
28963 ix86_expand_special_args_builtin (const struct builtin_description *d,
28964 tree exp, rtx target)
28965 {
28966 tree arg;
28967 rtx pat, op;
28968 unsigned int i, nargs, arg_adjust, memory;
28969 struct
28970 {
28971 rtx op;
28972 enum machine_mode mode;
28973 } args[3];
28974 enum insn_code icode = d->icode;
28975 bool last_arg_constant = false;
28976 const struct insn_data_d *insn_p = &insn_data[icode];
28977 enum machine_mode tmode = insn_p->operand[0].mode;
28978 enum { load, store } klass;
28979
28980 switch ((enum ix86_builtin_func_type) d->flag)
28981 {
28982 case VOID_FTYPE_VOID:
28983 if (icode == CODE_FOR_avx_vzeroupper)
28984 target = GEN_INT (vzeroupper_intrinsic);
28985 emit_insn (GEN_FCN (icode) (target));
28986 return 0;
28987 case VOID_FTYPE_UINT64:
28988 case VOID_FTYPE_UNSIGNED:
28989 nargs = 0;
28990 klass = store;
28991 memory = 0;
28992 break;
28993
28994 case INT_FTYPE_VOID:
28995 case UINT64_FTYPE_VOID:
28996 case UNSIGNED_FTYPE_VOID:
28997 nargs = 0;
28998 klass = load;
28999 memory = 0;
29000 break;
29001 case UINT64_FTYPE_PUNSIGNED:
29002 case V2DI_FTYPE_PV2DI:
29003 case V4DI_FTYPE_PV4DI:
29004 case V32QI_FTYPE_PCCHAR:
29005 case V16QI_FTYPE_PCCHAR:
29006 case V8SF_FTYPE_PCV4SF:
29007 case V8SF_FTYPE_PCFLOAT:
29008 case V4SF_FTYPE_PCFLOAT:
29009 case V4DF_FTYPE_PCV2DF:
29010 case V4DF_FTYPE_PCDOUBLE:
29011 case V2DF_FTYPE_PCDOUBLE:
29012 case VOID_FTYPE_PVOID:
29013 nargs = 1;
29014 klass = load;
29015 memory = 0;
29016 break;
29017 case VOID_FTYPE_PV2SF_V4SF:
29018 case VOID_FTYPE_PV4DI_V4DI:
29019 case VOID_FTYPE_PV2DI_V2DI:
29020 case VOID_FTYPE_PCHAR_V32QI:
29021 case VOID_FTYPE_PCHAR_V16QI:
29022 case VOID_FTYPE_PFLOAT_V8SF:
29023 case VOID_FTYPE_PFLOAT_V4SF:
29024 case VOID_FTYPE_PDOUBLE_V4DF:
29025 case VOID_FTYPE_PDOUBLE_V2DF:
29026 case VOID_FTYPE_PLONGLONG_LONGLONG:
29027 case VOID_FTYPE_PULONGLONG_ULONGLONG:
29028 case VOID_FTYPE_PINT_INT:
29029 nargs = 1;
29030 klass = store;
29031 /* Reserve memory operand for target. */
29032 memory = ARRAY_SIZE (args);
29033 break;
29034 case V4SF_FTYPE_V4SF_PCV2SF:
29035 case V2DF_FTYPE_V2DF_PCDOUBLE:
29036 nargs = 2;
29037 klass = load;
29038 memory = 1;
29039 break;
29040 case V8SF_FTYPE_PCV8SF_V8SI:
29041 case V4DF_FTYPE_PCV4DF_V4DI:
29042 case V4SF_FTYPE_PCV4SF_V4SI:
29043 case V2DF_FTYPE_PCV2DF_V2DI:
29044 case V8SI_FTYPE_PCV8SI_V8SI:
29045 case V4DI_FTYPE_PCV4DI_V4DI:
29046 case V4SI_FTYPE_PCV4SI_V4SI:
29047 case V2DI_FTYPE_PCV2DI_V2DI:
29048 nargs = 2;
29049 klass = load;
29050 memory = 0;
29051 break;
29052 case VOID_FTYPE_PV8SF_V8SI_V8SF:
29053 case VOID_FTYPE_PV4DF_V4DI_V4DF:
29054 case VOID_FTYPE_PV4SF_V4SI_V4SF:
29055 case VOID_FTYPE_PV2DF_V2DI_V2DF:
29056 case VOID_FTYPE_PV8SI_V8SI_V8SI:
29057 case VOID_FTYPE_PV4DI_V4DI_V4DI:
29058 case VOID_FTYPE_PV4SI_V4SI_V4SI:
29059 case VOID_FTYPE_PV2DI_V2DI_V2DI:
29060 nargs = 2;
29061 klass = store;
29062 /* Reserve memory operand for target. */
29063 memory = ARRAY_SIZE (args);
29064 break;
29065 case VOID_FTYPE_UINT_UINT_UINT:
29066 case VOID_FTYPE_UINT64_UINT_UINT:
29067 case UCHAR_FTYPE_UINT_UINT_UINT:
29068 case UCHAR_FTYPE_UINT64_UINT_UINT:
29069 nargs = 3;
29070 klass = load;
29071 memory = ARRAY_SIZE (args);
29072 last_arg_constant = true;
29073 break;
29074 default:
29075 gcc_unreachable ();
29076 }
29077
29078 gcc_assert (nargs <= ARRAY_SIZE (args));
29079
29080 if (klass == store)
29081 {
29082 arg = CALL_EXPR_ARG (exp, 0);
29083 op = expand_normal (arg);
29084 gcc_assert (target == 0);
29085 if (memory)
29086 {
29087 if (GET_MODE (op) != Pmode)
29088 op = convert_to_mode (Pmode, op, 1);
29089 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
29090 }
29091 else
29092 target = force_reg (tmode, op);
29093 arg_adjust = 1;
29094 }
29095 else
29096 {
29097 arg_adjust = 0;
29098 if (optimize
29099 || target == 0
29100 || GET_MODE (target) != tmode
29101 || !insn_p->operand[0].predicate (target, tmode))
29102 target = gen_reg_rtx (tmode);
29103 }
29104
29105 for (i = 0; i < nargs; i++)
29106 {
29107 enum machine_mode mode = insn_p->operand[i + 1].mode;
29108 bool match;
29109
29110 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
29111 op = expand_normal (arg);
29112 match = insn_p->operand[i + 1].predicate (op, mode);
29113
29114 if (last_arg_constant && (i + 1) == nargs)
29115 {
29116 if (!match)
29117 {
29118 if (icode == CODE_FOR_lwp_lwpvalsi3
29119 || icode == CODE_FOR_lwp_lwpinssi3
29120 || icode == CODE_FOR_lwp_lwpvaldi3
29121 || icode == CODE_FOR_lwp_lwpinsdi3)
29122 error ("the last argument must be a 32-bit immediate");
29123 else
29124 error ("the last argument must be an 8-bit immediate");
29125 return const0_rtx;
29126 }
29127 }
29128 else
29129 {
29130 if (i == memory)
29131 {
29132 /* This must be the memory operand. */
29133 if (GET_MODE (op) != Pmode)
29134 op = convert_to_mode (Pmode, op, 1);
29135 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
29136 gcc_assert (GET_MODE (op) == mode
29137 || GET_MODE (op) == VOIDmode);
29138 }
29139 else
29140 {
29141 /* This must be a register. */
29142 if (VECTOR_MODE_P (mode))
29143 op = safe_vector_operand (op, mode);
29144
29145 gcc_assert (GET_MODE (op) == mode
29146 || GET_MODE (op) == VOIDmode);
29147 op = copy_to_mode_reg (mode, op);
29148 }
29149 }
29150
29151 args[i].op = op;
29152 args[i].mode = mode;
29153 }
29154
29155 switch (nargs)
29156 {
29157 case 0:
29158 pat = GEN_FCN (icode) (target);
29159 break;
29160 case 1:
29161 pat = GEN_FCN (icode) (target, args[0].op);
29162 break;
29163 case 2:
29164 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29165 break;
29166 case 3:
29167 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29168 break;
29169 default:
29170 gcc_unreachable ();
29171 }
29172
29173 if (! pat)
29174 return 0;
29175 emit_insn (pat);
29176 return klass == store ? 0 : target;
29177 }
29178
29179 /* Return the integer constant in ARG. Constrain it to be in the range
29180 of the subparts of VEC_TYPE; issue an error if not. */
29181
29182 static int
29183 get_element_number (tree vec_type, tree arg)
29184 {
29185 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
29186
29187 if (!host_integerp (arg, 1)
29188 || (elt = tree_low_cst (arg, 1), elt > max))
29189 {
29190 error ("selector must be an integer constant in the range 0..%wi", max);
29191 return 0;
29192 }
29193
29194 return elt;
29195 }
29196
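/* For example, __builtin_ia32_vec_ext_v4sf operates on a four-element
   vector, so get_element_number accepts selectors 0..3 and reports the
   error above for anything larger or for a non-constant selector.  */
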
29197 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29198 ix86_expand_vector_init. We DO have language-level syntax for this, in
29199 the form of (type){ init-list }. Except that since we can't place emms
29200 instructions from inside the compiler, we can't allow the use of MMX
29201 registers unless the user explicitly asks for it. So we do *not* define
29202 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
29203 we have builtins invoked by mmintrin.h that give us license to emit
29204 these sorts of instructions. */
29205
29206 static rtx
29207 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29208 {
29209 enum machine_mode tmode = TYPE_MODE (type);
29210 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29211 int i, n_elt = GET_MODE_NUNITS (tmode);
29212 rtvec v = rtvec_alloc (n_elt);
29213
29214 gcc_assert (VECTOR_MODE_P (tmode));
29215 gcc_assert (call_expr_nargs (exp) == n_elt);
29216
29217 for (i = 0; i < n_elt; ++i)
29218 {
29219 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29220 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29221 }
29222
29223 if (!target || !register_operand (target, tmode))
29224 target = gen_reg_rtx (tmode);
29225
29226 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29227 return target;
29228 }
29229
29230 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29231 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
29232 had a language-level syntax for referencing vector elements. */
29233
29234 static rtx
29235 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29236 {
29237 enum machine_mode tmode, mode0;
29238 tree arg0, arg1;
29239 int elt;
29240 rtx op0;
29241
29242 arg0 = CALL_EXPR_ARG (exp, 0);
29243 arg1 = CALL_EXPR_ARG (exp, 1);
29244
29245 op0 = expand_normal (arg0);
29246 elt = get_element_number (TREE_TYPE (arg0), arg1);
29247
29248 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29249 mode0 = TYPE_MODE (TREE_TYPE (arg0));
29250 gcc_assert (VECTOR_MODE_P (mode0));
29251
29252 op0 = force_reg (mode0, op0);
29253
29254 if (optimize || !target || !register_operand (target, tmode))
29255 target = gen_reg_rtx (tmode);
29256
29257 ix86_expand_vector_extract (true, target, op0, elt);
29258
29259 return target;
29260 }
29261
29262 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29263 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
29264 a language-level syntax for referencing vector elements. */
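/* Illustrative sketch (not part of the original source): a vec_set builtin
   is expected to be reached from a header wrapper roughly like

     extern __inline __m128i
     _mm_insert_epi16 (__m128i __A, int __D, int __N)
     {
       return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi) __A, __D, __N);
     }

   which ends up in ix86_expand_vec_set_builtin below; the exact
   emmintrin.h wording may differ.  */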
29265
29266 static rtx
29267 ix86_expand_vec_set_builtin (tree exp)
29268 {
29269 enum machine_mode tmode, mode1;
29270 tree arg0, arg1, arg2;
29271 int elt;
29272 rtx op0, op1, target;
29273
29274 arg0 = CALL_EXPR_ARG (exp, 0);
29275 arg1 = CALL_EXPR_ARG (exp, 1);
29276 arg2 = CALL_EXPR_ARG (exp, 2);
29277
29278 tmode = TYPE_MODE (TREE_TYPE (arg0));
29279 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29280 gcc_assert (VECTOR_MODE_P (tmode));
29281
29282 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29283 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29284 elt = get_element_number (TREE_TYPE (arg0), arg2);
29285
29286 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29287 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29288
29289 op0 = force_reg (tmode, op0);
29290 op1 = force_reg (mode1, op1);
29291
29292 /* OP0 is the source of these builtin functions and shouldn't be
29293 modified. Create a copy, use it and return it as the target. */
29294 target = gen_reg_rtx (tmode);
29295 emit_move_insn (target, op0);
29296 ix86_expand_vector_set (true, target, op1, elt);
29297
29298 return target;
29299 }
29300
29301 /* Expand an expression EXP that calls a built-in function,
29302 with result going to TARGET if that's convenient
29303 (and in mode MODE if that's convenient).
29304 SUBTARGET may be used as the target for computing one of EXP's operands.
29305 IGNORE is nonzero if the value is to be ignored. */
29306
29307 static rtx
29308 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29309 enum machine_mode mode ATTRIBUTE_UNUSED,
29310 int ignore ATTRIBUTE_UNUSED)
29311 {
29312 const struct builtin_description *d;
29313 size_t i;
29314 enum insn_code icode;
29315 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29316 tree arg0, arg1, arg2, arg3, arg4;
29317 rtx op0, op1, op2, op3, op4, pat;
29318 enum machine_mode mode0, mode1, mode2, mode3, mode4;
29319 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29320
29321 /* Determine whether the builtin function is available under the current ISA.
29322 Originally the builtin was not created if it wasn't applicable to the
29323 current ISA based on the command line switches. With function specific
29324 options, we need to check in the context of the function making the call
29325 whether it is supported. */
29326 if (ix86_builtins_isa[fcode].isa
29327 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
29328 {
29329 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
29330 NULL, (enum fpmath_unit) 0, false);
29331
29332 if (!opts)
29333 error ("%qE needs unknown isa option", fndecl);
29334 else
29335 {
29336 gcc_assert (opts != NULL);
29337 error ("%qE needs isa option %s", fndecl, opts);
29338 free (opts);
29339 }
29340 return const0_rtx;
29341 }
29342
29343 switch (fcode)
29344 {
29345 case IX86_BUILTIN_MASKMOVQ:
29346 case IX86_BUILTIN_MASKMOVDQU:
29347 icode = (fcode == IX86_BUILTIN_MASKMOVQ
29348 ? CODE_FOR_mmx_maskmovq
29349 : CODE_FOR_sse2_maskmovdqu);
29350 /* Note the arg order is different from the operand order. */
29351 arg1 = CALL_EXPR_ARG (exp, 0);
29352 arg2 = CALL_EXPR_ARG (exp, 1);
29353 arg0 = CALL_EXPR_ARG (exp, 2);
29354 op0 = expand_normal (arg0);
29355 op1 = expand_normal (arg1);
29356 op2 = expand_normal (arg2);
29357 mode0 = insn_data[icode].operand[0].mode;
29358 mode1 = insn_data[icode].operand[1].mode;
29359 mode2 = insn_data[icode].operand[2].mode;
29360
29361 if (GET_MODE (op0) != Pmode)
29362 op0 = convert_to_mode (Pmode, op0, 1);
29363 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
29364
29365 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29366 op0 = copy_to_mode_reg (mode0, op0);
29367 if (!insn_data[icode].operand[1].predicate (op1, mode1))
29368 op1 = copy_to_mode_reg (mode1, op1);
29369 if (!insn_data[icode].operand[2].predicate (op2, mode2))
29370 op2 = copy_to_mode_reg (mode2, op2);
29371 pat = GEN_FCN (icode) (op0, op1, op2);
29372 if (! pat)
29373 return 0;
29374 emit_insn (pat);
29375 return 0;
29376
29377 case IX86_BUILTIN_LDMXCSR:
29378 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29379 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29380 emit_move_insn (target, op0);
29381 emit_insn (gen_sse_ldmxcsr (target));
29382 return 0;
29383
29384 case IX86_BUILTIN_STMXCSR:
29385 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29386 emit_insn (gen_sse_stmxcsr (target));
29387 return copy_to_mode_reg (SImode, target);
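/* Illustrative sketch (not part of the original source): xmmintrin.h is
   expected to expose these two builtins roughly as

     unsigned int _mm_getcsr (void)      ->  __builtin_ia32_stmxcsr ()
     void _mm_setcsr (unsigned int __I)  ->  __builtin_ia32_ldmxcsr (__I)

   so LDMXCSR above stores the user value to a stack slot and loads MXCSR
   from it, while STMXCSR stores MXCSR to a stack slot and returns it.  */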
29388
29389 case IX86_BUILTIN_CLFLUSH:
29390 arg0 = CALL_EXPR_ARG (exp, 0);
29391 op0 = expand_normal (arg0);
29392 icode = CODE_FOR_sse2_clflush;
29393 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29394 {
29395 if (GET_MODE (op0) != Pmode)
29396 op0 = convert_to_mode (Pmode, op0, 1);
29397 op0 = force_reg (Pmode, op0);
29398 }
29399
29400 emit_insn (gen_sse2_clflush (op0));
29401 return 0;
29402
29403 case IX86_BUILTIN_MONITOR:
29404 arg0 = CALL_EXPR_ARG (exp, 0);
29405 arg1 = CALL_EXPR_ARG (exp, 1);
29406 arg2 = CALL_EXPR_ARG (exp, 2);
29407 op0 = expand_normal (arg0);
29408 op1 = expand_normal (arg1);
29409 op2 = expand_normal (arg2);
29410 if (!REG_P (op0))
29411 {
29412 if (GET_MODE (op0) != Pmode)
29413 op0 = convert_to_mode (Pmode, op0, 1);
29414 op0 = force_reg (Pmode, op0);
29415 }
29416 if (!REG_P (op1))
29417 op1 = copy_to_mode_reg (SImode, op1);
29418 if (!REG_P (op2))
29419 op2 = copy_to_mode_reg (SImode, op2);
29420 emit_insn (ix86_gen_monitor (op0, op1, op2));
29421 return 0;
29422
29423 case IX86_BUILTIN_MWAIT:
29424 arg0 = CALL_EXPR_ARG (exp, 0);
29425 arg1 = CALL_EXPR_ARG (exp, 1);
29426 op0 = expand_normal (arg0);
29427 op1 = expand_normal (arg1);
29428 if (!REG_P (op0))
29429 op0 = copy_to_mode_reg (SImode, op0);
29430 if (!REG_P (op1))
29431 op1 = copy_to_mode_reg (SImode, op1);
29432 emit_insn (gen_sse3_mwait (op0, op1));
29433 return 0;
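/* Illustrative sketch (not part of the original source): pmmintrin.h is
   expected to map the monitor/mwait intrinsics roughly as

     _mm_monitor (p, ext, hints)  ->  __builtin_ia32_monitor (p, ext, hints)
     _mm_mwait (ext, hints)       ->  __builtin_ia32_mwait (ext, hints)

   which are expanded by the two cases above.  */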
29434
29435 case IX86_BUILTIN_VEC_INIT_V2SI:
29436 case IX86_BUILTIN_VEC_INIT_V4HI:
29437 case IX86_BUILTIN_VEC_INIT_V8QI:
29438 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29439
29440 case IX86_BUILTIN_VEC_EXT_V2DF:
29441 case IX86_BUILTIN_VEC_EXT_V2DI:
29442 case IX86_BUILTIN_VEC_EXT_V4SF:
29443 case IX86_BUILTIN_VEC_EXT_V4SI:
29444 case IX86_BUILTIN_VEC_EXT_V8HI:
29445 case IX86_BUILTIN_VEC_EXT_V2SI:
29446 case IX86_BUILTIN_VEC_EXT_V4HI:
29447 case IX86_BUILTIN_VEC_EXT_V16QI:
29448 return ix86_expand_vec_ext_builtin (exp, target);
29449
29450 case IX86_BUILTIN_VEC_SET_V2DI:
29451 case IX86_BUILTIN_VEC_SET_V4SF:
29452 case IX86_BUILTIN_VEC_SET_V4SI:
29453 case IX86_BUILTIN_VEC_SET_V8HI:
29454 case IX86_BUILTIN_VEC_SET_V4HI:
29455 case IX86_BUILTIN_VEC_SET_V16QI:
29456 return ix86_expand_vec_set_builtin (exp);
29457
29458 case IX86_BUILTIN_INFQ:
29459 case IX86_BUILTIN_HUGE_VALQ:
29460 {
29461 REAL_VALUE_TYPE inf;
29462 rtx tmp;
29463
29464 real_inf (&inf);
29465 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29466
29467 tmp = validize_mem (force_const_mem (mode, tmp));
29468
29469 if (target == 0)
29470 target = gen_reg_rtx (mode);
29471
29472 emit_move_insn (target, tmp);
29473 return target;
29474 }
29475
29476 case IX86_BUILTIN_LLWPCB:
29477 arg0 = CALL_EXPR_ARG (exp, 0);
29478 op0 = expand_normal (arg0);
29479 icode = CODE_FOR_lwp_llwpcb;
29480 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29481 {
29482 if (GET_MODE (op0) != Pmode)
29483 op0 = convert_to_mode (Pmode, op0, 1);
29484 op0 = force_reg (Pmode, op0);
29485 }
29486 emit_insn (gen_lwp_llwpcb (op0));
29487 return 0;
29488
29489 case IX86_BUILTIN_SLWPCB:
29490 icode = CODE_FOR_lwp_slwpcb;
29491 if (!target
29492 || !insn_data[icode].operand[0].predicate (target, Pmode))
29493 target = gen_reg_rtx (Pmode);
29494 emit_insn (gen_lwp_slwpcb (target));
29495 return target;
29496
29497 case IX86_BUILTIN_BEXTRI32:
29498 case IX86_BUILTIN_BEXTRI64:
29499 arg0 = CALL_EXPR_ARG (exp, 0);
29500 arg1 = CALL_EXPR_ARG (exp, 1);
29501 op0 = expand_normal (arg0);
29502 op1 = expand_normal (arg1);
29503 icode = (fcode == IX86_BUILTIN_BEXTRI32
29504 ? CODE_FOR_tbm_bextri_si
29505 : CODE_FOR_tbm_bextri_di);
29506 if (!CONST_INT_P (op1))
29507 {
29508 error ("last argument must be an immediate");
29509 return const0_rtx;
29510 }
29511 else
29512 {
29513 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29514 unsigned char lsb_index = INTVAL (op1) & 0xFF;
29515 op1 = GEN_INT (length);
29516 op2 = GEN_INT (lsb_index);
29517 pat = GEN_FCN (icode) (target, op0, op1, op2);
29518 if (pat)
29519 emit_insn (pat);
29520 return target;
29521 }
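/* Worked example (illustrative, not part of the original source): for
   __builtin_ia32_bextri_u32 (x, 0x0408) the immediate above decodes to
   lsb_index = 0x0408 & 0xFF = 8 and length = (0x0408 >> 8) & 0xFF = 4,
   i.e. extract 4 bits of X starting at bit 8.  */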
29522
29523 case IX86_BUILTIN_RDRAND16_STEP:
29524 icode = CODE_FOR_rdrandhi_1;
29525 mode0 = HImode;
29526 goto rdrand_step;
29527
29528 case IX86_BUILTIN_RDRAND32_STEP:
29529 icode = CODE_FOR_rdrandsi_1;
29530 mode0 = SImode;
29531 goto rdrand_step;
29532
29533 case IX86_BUILTIN_RDRAND64_STEP:
29534 icode = CODE_FOR_rdranddi_1;
29535 mode0 = DImode;
29536
29537 rdrand_step:
29538 op0 = gen_reg_rtx (mode0);
29539 emit_insn (GEN_FCN (icode) (op0));
29540
29541 arg0 = CALL_EXPR_ARG (exp, 0);
29542 op1 = expand_normal (arg0);
29543 if (!address_operand (op1, VOIDmode))
29544 {
29545 op1 = convert_memory_address (Pmode, op1);
29546 op1 = copy_addr_to_reg (op1);
29547 }
29548 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
29549
29550 op1 = gen_reg_rtx (SImode);
29551 emit_move_insn (op1, CONST1_RTX (SImode));
29552
29553 /* Emit SImode conditional move. */
29554 if (mode0 == HImode)
29555 {
29556 op2 = gen_reg_rtx (SImode);
29557 emit_insn (gen_zero_extendhisi2 (op2, op0));
29558 }
29559 else if (mode0 == SImode)
29560 op2 = op0;
29561 else
29562 op2 = gen_rtx_SUBREG (SImode, op0, 0);
29563
29564 if (target == 0)
29565 target = gen_reg_rtx (SImode);
29566
29567 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
29568 const0_rtx);
29569 emit_insn (gen_rtx_SET (VOIDmode, target,
29570 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
29571 return target;
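/* Illustrative sketch (not part of the original source): the *_step
   builtins are expected to be used roughly as

     unsigned int val;
     int ok = __builtin_ia32_rdrand32_step (&val);

   VAL receives the random number and OK is 1 on success, 0 when the
   hardware reports failure (carry flag clear), matching the conditional
   move emitted above.  */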
29572
29573 case IX86_BUILTIN_GATHERSIV2DF:
29574 icode = CODE_FOR_avx2_gathersiv2df;
29575 goto gather_gen;
29576 case IX86_BUILTIN_GATHERSIV4DF:
29577 icode = CODE_FOR_avx2_gathersiv4df;
29578 goto gather_gen;
29579 case IX86_BUILTIN_GATHERDIV2DF:
29580 icode = CODE_FOR_avx2_gatherdiv2df;
29581 goto gather_gen;
29582 case IX86_BUILTIN_GATHERDIV4DF:
29583 icode = CODE_FOR_avx2_gatherdiv4df;
29584 goto gather_gen;
29585 case IX86_BUILTIN_GATHERSIV4SF:
29586 icode = CODE_FOR_avx2_gathersiv4sf;
29587 goto gather_gen;
29588 case IX86_BUILTIN_GATHERSIV8SF:
29589 icode = CODE_FOR_avx2_gathersiv8sf;
29590 goto gather_gen;
29591 case IX86_BUILTIN_GATHERDIV4SF:
29592 icode = CODE_FOR_avx2_gatherdiv4sf;
29593 goto gather_gen;
29594 case IX86_BUILTIN_GATHERDIV8SF:
29595 icode = CODE_FOR_avx2_gatherdiv8sf;
29596 goto gather_gen;
29597 case IX86_BUILTIN_GATHERSIV2DI:
29598 icode = CODE_FOR_avx2_gathersiv2di;
29599 goto gather_gen;
29600 case IX86_BUILTIN_GATHERSIV4DI:
29601 icode = CODE_FOR_avx2_gathersiv4di;
29602 goto gather_gen;
29603 case IX86_BUILTIN_GATHERDIV2DI:
29604 icode = CODE_FOR_avx2_gatherdiv2di;
29605 goto gather_gen;
29606 case IX86_BUILTIN_GATHERDIV4DI:
29607 icode = CODE_FOR_avx2_gatherdiv4di;
29608 goto gather_gen;
29609 case IX86_BUILTIN_GATHERSIV4SI:
29610 icode = CODE_FOR_avx2_gathersiv4si;
29611 goto gather_gen;
29612 case IX86_BUILTIN_GATHERSIV8SI:
29613 icode = CODE_FOR_avx2_gathersiv8si;
29614 goto gather_gen;
29615 case IX86_BUILTIN_GATHERDIV4SI:
29616 icode = CODE_FOR_avx2_gatherdiv4si;
29617 goto gather_gen;
29618 case IX86_BUILTIN_GATHERDIV8SI:
29619 icode = CODE_FOR_avx2_gatherdiv8si;
29620 goto gather_gen;
29621 case IX86_BUILTIN_GATHERALTSIV4DF:
29622 icode = CODE_FOR_avx2_gathersiv4df;
29623 goto gather_gen;
29624 case IX86_BUILTIN_GATHERALTDIV8SF:
29625 icode = CODE_FOR_avx2_gatherdiv8sf;
29626 goto gather_gen;
29627 case IX86_BUILTIN_GATHERALTSIV4DI:
29628 icode = CODE_FOR_avx2_gathersiv4di;
29629 goto gather_gen;
29630 case IX86_BUILTIN_GATHERALTDIV8SI:
29631 icode = CODE_FOR_avx2_gatherdiv8si;
29632 goto gather_gen;
29633
29634 gather_gen:
29635 arg0 = CALL_EXPR_ARG (exp, 0);
29636 arg1 = CALL_EXPR_ARG (exp, 1);
29637 arg2 = CALL_EXPR_ARG (exp, 2);
29638 arg3 = CALL_EXPR_ARG (exp, 3);
29639 arg4 = CALL_EXPR_ARG (exp, 4);
29640 op0 = expand_normal (arg0);
29641 op1 = expand_normal (arg1);
29642 op2 = expand_normal (arg2);
29643 op3 = expand_normal (arg3);
29644 op4 = expand_normal (arg4);
29645 /* Note the arg order is different from the operand order. */
29646 mode0 = insn_data[icode].operand[1].mode;
29647 mode2 = insn_data[icode].operand[3].mode;
29648 mode3 = insn_data[icode].operand[4].mode;
29649 mode4 = insn_data[icode].operand[5].mode;
29650
29651 if (target == NULL_RTX
29652 || GET_MODE (target) != insn_data[icode].operand[0].mode)
29653 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
29654 else
29655 subtarget = target;
29656
29657 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
29658 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
29659 {
29660 rtx half = gen_reg_rtx (V4SImode);
29661 if (!nonimmediate_operand (op2, V8SImode))
29662 op2 = copy_to_mode_reg (V8SImode, op2);
29663 emit_insn (gen_vec_extract_lo_v8si (half, op2));
29664 op2 = half;
29665 }
29666 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
29667 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
29668 {
29669 rtx (*gen) (rtx, rtx);
29670 rtx half = gen_reg_rtx (mode0);
29671 if (mode0 == V4SFmode)
29672 gen = gen_vec_extract_lo_v8sf;
29673 else
29674 gen = gen_vec_extract_lo_v8si;
29675 if (!nonimmediate_operand (op0, GET_MODE (op0)))
29676 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
29677 emit_insn (gen (half, op0));
29678 op0 = half;
29679 if (!nonimmediate_operand (op3, GET_MODE (op3)))
29680 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
29681 emit_insn (gen (half, op3));
29682 op3 = half;
29683 }
29684
29685 /* Force the memory operand to use only a base register here. We
29686 don't want to do this for the memory operands of other builtin
29687 functions. */
29688 if (GET_MODE (op1) != Pmode)
29689 op1 = convert_to_mode (Pmode, op1, 1);
29690 op1 = force_reg (Pmode, op1);
29691
29692 if (!insn_data[icode].operand[1].predicate (op0, mode0))
29693 op0 = copy_to_mode_reg (mode0, op0);
29694 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
29695 op1 = copy_to_mode_reg (Pmode, op1);
29696 if (!insn_data[icode].operand[3].predicate (op2, mode2))
29697 op2 = copy_to_mode_reg (mode2, op2);
29698 if (!insn_data[icode].operand[4].predicate (op3, mode3))
29699 op3 = copy_to_mode_reg (mode3, op3);
29700 if (!insn_data[icode].operand[5].predicate (op4, mode4))
29701 {
29702 error ("last argument must be scale 1, 2, 4, 8");
29703 return const0_rtx;
29704 }
29705
29706 /* Optimize. If mask is known to have all high bits set,
29707 replace op0 with pc_rtx to signal that the instruction
29708 overwrites the whole destination and doesn't use its
29709 previous contents. */
29710 if (optimize)
29711 {
29712 if (TREE_CODE (arg3) == VECTOR_CST)
29713 {
29714 unsigned int negative = 0;
29715 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
29716 {
29717 tree cst = VECTOR_CST_ELT (arg3, i);
29718 if (TREE_CODE (cst) == INTEGER_CST
29719 && tree_int_cst_sign_bit (cst))
29720 negative++;
29721 else if (TREE_CODE (cst) == REAL_CST
29722 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
29723 negative++;
29724 }
29725 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
29726 op0 = pc_rtx;
29727 }
29728 else if (TREE_CODE (arg3) == SSA_NAME)
29729 {
29730 /* Recognize also when mask is like:
29731 __v2df src = _mm_setzero_pd ();
29732 __v2df mask = _mm_cmpeq_pd (src, src);
29733 or
29734 __v8sf src = _mm256_setzero_ps ();
29735 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
29736 as that is a cheaper way to load all ones into
29737 a register than having to load a constant from
29738 memory. */
29739 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
29740 if (is_gimple_call (def_stmt))
29741 {
29742 tree fndecl = gimple_call_fndecl (def_stmt);
29743 if (fndecl
29744 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29745 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
29746 {
29747 case IX86_BUILTIN_CMPPD:
29748 case IX86_BUILTIN_CMPPS:
29749 case IX86_BUILTIN_CMPPD256:
29750 case IX86_BUILTIN_CMPPS256:
29751 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
29752 break;
29753 /* FALLTHRU */
29754 case IX86_BUILTIN_CMPEQPD:
29755 case IX86_BUILTIN_CMPEQPS:
29756 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
29757 && initializer_zerop (gimple_call_arg (def_stmt,
29758 1)))
29759 op0 = pc_rtx;
29760 break;
29761 default:
29762 break;
29763 }
29764 }
29765 }
29766 }
29767
29768 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
29769 if (! pat)
29770 return const0_rtx;
29771 emit_insn (pat);
29772
29773 if (fcode == IX86_BUILTIN_GATHERDIV8SF
29774 || fcode == IX86_BUILTIN_GATHERDIV8SI)
29775 {
29776 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
29777 ? V4SFmode : V4SImode;
29778 if (target == NULL_RTX)
29779 target = gen_reg_rtx (tmode);
29780 if (tmode == V4SFmode)
29781 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
29782 else
29783 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
29784 }
29785 else
29786 target = subtarget;
29787
29788 return target;
29789
29790 case IX86_BUILTIN_XABORT:
29791 icode = CODE_FOR_xabort;
29792 arg0 = CALL_EXPR_ARG (exp, 0);
29793 op0 = expand_normal (arg0);
29794 mode0 = insn_data[icode].operand[0].mode;
29795 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29796 {
29797 error ("the xabort's argument must be an 8-bit immediate");
29798 return const0_rtx;
29799 }
29800 emit_insn (gen_xabort (op0));
29801 return 0;
29802
29803 default:
29804 break;
29805 }
29806
29807 for (i = 0, d = bdesc_special_args;
29808 i < ARRAY_SIZE (bdesc_special_args);
29809 i++, d++)
29810 if (d->code == fcode)
29811 return ix86_expand_special_args_builtin (d, exp, target);
29812
29813 for (i = 0, d = bdesc_args;
29814 i < ARRAY_SIZE (bdesc_args);
29815 i++, d++)
29816 if (d->code == fcode)
29817 switch (fcode)
29818 {
29819 case IX86_BUILTIN_FABSQ:
29820 case IX86_BUILTIN_COPYSIGNQ:
29821 if (!TARGET_SSE2)
29822 /* Emit a normal call if SSE2 isn't available. */
29823 return expand_call (exp, target, ignore);
29824 default:
29825 return ix86_expand_args_builtin (d, exp, target);
29826 }
29827
29828 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29829 if (d->code == fcode)
29830 return ix86_expand_sse_comi (d, exp, target);
29831
29832 for (i = 0, d = bdesc_pcmpestr;
29833 i < ARRAY_SIZE (bdesc_pcmpestr);
29834 i++, d++)
29835 if (d->code == fcode)
29836 return ix86_expand_sse_pcmpestr (d, exp, target);
29837
29838 for (i = 0, d = bdesc_pcmpistr;
29839 i < ARRAY_SIZE (bdesc_pcmpistr);
29840 i++, d++)
29841 if (d->code == fcode)
29842 return ix86_expand_sse_pcmpistr (d, exp, target);
29843
29844 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29845 if (d->code == fcode)
29846 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
29847 (enum ix86_builtin_func_type)
29848 d->flag, d->comparison);
29849
29850 gcc_unreachable ();
29851 }
29852
29853 /* Returns a function decl for a vectorized version of the builtin function
29854 FNDECL, taking vectors of type TYPE_IN and returning vectors of type
29855 TYPE_OUT, or NULL_TREE if no vectorized version is available. */
29856
29857 static tree
29858 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
29859 tree type_in)
29860 {
29861 enum machine_mode in_mode, out_mode;
29862 int in_n, out_n;
29863 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
29864
29865 if (TREE_CODE (type_out) != VECTOR_TYPE
29866 || TREE_CODE (type_in) != VECTOR_TYPE
29867 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
29868 return NULL_TREE;
29869
29870 out_mode = TYPE_MODE (TREE_TYPE (type_out));
29871 out_n = TYPE_VECTOR_SUBPARTS (type_out);
29872 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29873 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29874
29875 switch (fn)
29876 {
29877 case BUILT_IN_SQRT:
29878 if (out_mode == DFmode && in_mode == DFmode)
29879 {
29880 if (out_n == 2 && in_n == 2)
29881 return ix86_builtins[IX86_BUILTIN_SQRTPD];
29882 else if (out_n == 4 && in_n == 4)
29883 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
29884 }
29885 break;
29886
29887 case BUILT_IN_SQRTF:
29888 if (out_mode == SFmode && in_mode == SFmode)
29889 {
29890 if (out_n == 4 && in_n == 4)
29891 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
29892 else if (out_n == 8 && in_n == 8)
29893 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
29894 }
29895 break;
29896
29897 case BUILT_IN_IFLOOR:
29898 case BUILT_IN_LFLOOR:
29899 case BUILT_IN_LLFLOOR:
29900 /* The round insn does not trap on denormals. */
29901 if (flag_trapping_math || !TARGET_ROUND)
29902 break;
29903
29904 if (out_mode == SImode && in_mode == DFmode)
29905 {
29906 if (out_n == 4 && in_n == 2)
29907 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
29908 else if (out_n == 8 && in_n == 4)
29909 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
29910 }
29911 break;
29912
29913 case BUILT_IN_IFLOORF:
29914 case BUILT_IN_LFLOORF:
29915 case BUILT_IN_LLFLOORF:
29916 /* The round insn does not trap on denormals. */
29917 if (flag_trapping_math || !TARGET_ROUND)
29918 break;
29919
29920 if (out_mode == SImode && in_mode == SFmode)
29921 {
29922 if (out_n == 4 && in_n == 4)
29923 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
29924 else if (out_n == 8 && in_n == 8)
29925 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
29926 }
29927 break;
29928
29929 case BUILT_IN_ICEIL:
29930 case BUILT_IN_LCEIL:
29931 case BUILT_IN_LLCEIL:
29932 /* The round insn does not trap on denormals. */
29933 if (flag_trapping_math || !TARGET_ROUND)
29934 break;
29935
29936 if (out_mode == SImode && in_mode == DFmode)
29937 {
29938 if (out_n == 4 && in_n == 2)
29939 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
29940 else if (out_n == 8 && in_n == 4)
29941 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
29942 }
29943 break;
29944
29945 case BUILT_IN_ICEILF:
29946 case BUILT_IN_LCEILF:
29947 case BUILT_IN_LLCEILF:
29948 /* The round insn does not trap on denormals. */
29949 if (flag_trapping_math || !TARGET_ROUND)
29950 break;
29951
29952 if (out_mode == SImode && in_mode == SFmode)
29953 {
29954 if (out_n == 4 && in_n == 4)
29955 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
29956 else if (out_n == 8 && in_n == 8)
29957 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
29958 }
29959 break;
29960
29961 case BUILT_IN_IRINT:
29962 case BUILT_IN_LRINT:
29963 case BUILT_IN_LLRINT:
29964 if (out_mode == SImode && in_mode == DFmode)
29965 {
29966 if (out_n == 4 && in_n == 2)
29967 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
29968 else if (out_n == 8 && in_n == 4)
29969 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
29970 }
29971 break;
29972
29973 case BUILT_IN_IRINTF:
29974 case BUILT_IN_LRINTF:
29975 case BUILT_IN_LLRINTF:
29976 if (out_mode == SImode && in_mode == SFmode)
29977 {
29978 if (out_n == 4 && in_n == 4)
29979 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
29980 else if (out_n == 8 && in_n == 8)
29981 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
29982 }
29983 break;
29984
29985 case BUILT_IN_IROUND:
29986 case BUILT_IN_LROUND:
29987 case BUILT_IN_LLROUND:
29988 /* The round insn does not trap on denormals. */
29989 if (flag_trapping_math || !TARGET_ROUND)
29990 break;
29991
29992 if (out_mode == SImode && in_mode == DFmode)
29993 {
29994 if (out_n == 4 && in_n == 2)
29995 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
29996 else if (out_n == 8 && in_n == 4)
29997 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
29998 }
29999 break;
30000
30001 case BUILT_IN_IROUNDF:
30002 case BUILT_IN_LROUNDF:
30003 case BUILT_IN_LLROUNDF:
30004 /* The round insn does not trap on denormals. */
30005 if (flag_trapping_math || !TARGET_ROUND)
30006 break;
30007
30008 if (out_mode == SImode && in_mode == SFmode)
30009 {
30010 if (out_n == 4 && in_n == 4)
30011 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
30012 else if (out_n == 8 && in_n == 8)
30013 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
30014 }
30015 break;
30016
30017 case BUILT_IN_COPYSIGN:
30018 if (out_mode == DFmode && in_mode == DFmode)
30019 {
30020 if (out_n == 2 && in_n == 2)
30021 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
30022 else if (out_n == 4 && in_n == 4)
30023 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
30024 }
30025 break;
30026
30027 case BUILT_IN_COPYSIGNF:
30028 if (out_mode == SFmode && in_mode == SFmode)
30029 {
30030 if (out_n == 4 && in_n == 4)
30031 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
30032 else if (out_n == 8 && in_n == 8)
30033 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
30034 }
30035 break;
30036
30037 case BUILT_IN_FLOOR:
30038 /* The round insn does not trap on denormals. */
30039 if (flag_trapping_math || !TARGET_ROUND)
30040 break;
30041
30042 if (out_mode == DFmode && in_mode == DFmode)
30043 {
30044 if (out_n == 2 && in_n == 2)
30045 return ix86_builtins[IX86_BUILTIN_FLOORPD];
30046 else if (out_n == 4 && in_n == 4)
30047 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
30048 }
30049 break;
30050
30051 case BUILT_IN_FLOORF:
30052 /* The round insn does not trap on denormals. */
30053 if (flag_trapping_math || !TARGET_ROUND)
30054 break;
30055
30056 if (out_mode == SFmode && in_mode == SFmode)
30057 {
30058 if (out_n == 4 && in_n == 4)
30059 return ix86_builtins[IX86_BUILTIN_FLOORPS];
30060 else if (out_n == 8 && in_n == 8)
30061 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
30062 }
30063 break;
30064
30065 case BUILT_IN_CEIL:
30066 /* The round insn does not trap on denormals. */
30067 if (flag_trapping_math || !TARGET_ROUND)
30068 break;
30069
30070 if (out_mode == DFmode && in_mode == DFmode)
30071 {
30072 if (out_n == 2 && in_n == 2)
30073 return ix86_builtins[IX86_BUILTIN_CEILPD];
30074 else if (out_n == 4 && in_n == 4)
30075 return ix86_builtins[IX86_BUILTIN_CEILPD256];
30076 }
30077 break;
30078
30079 case BUILT_IN_CEILF:
30080 /* The round insn does not trap on denormals. */
30081 if (flag_trapping_math || !TARGET_ROUND)
30082 break;
30083
30084 if (out_mode == SFmode && in_mode == SFmode)
30085 {
30086 if (out_n == 4 && in_n == 4)
30087 return ix86_builtins[IX86_BUILTIN_CEILPS];
30088 else if (out_n == 8 && in_n == 8)
30089 return ix86_builtins[IX86_BUILTIN_CEILPS256];
30090 }
30091 break;
30092
30093 case BUILT_IN_TRUNC:
30094 /* The round insn does not trap on denormals. */
30095 if (flag_trapping_math || !TARGET_ROUND)
30096 break;
30097
30098 if (out_mode == DFmode && in_mode == DFmode)
30099 {
30100 if (out_n == 2 && in_n == 2)
30101 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
30102 else if (out_n == 4 && in_n == 4)
30103 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
30104 }
30105 break;
30106
30107 case BUILT_IN_TRUNCF:
30108 /* The round insn does not trap on denormals. */
30109 if (flag_trapping_math || !TARGET_ROUND)
30110 break;
30111
30112 if (out_mode == SFmode && in_mode == SFmode)
30113 {
30114 if (out_n == 4 && in_n == 4)
30115 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
30116 else if (out_n == 8 && in_n == 8)
30117 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
30118 }
30119 break;
30120
30121 case BUILT_IN_RINT:
30122 /* The round insn does not trap on denormals. */
30123 if (flag_trapping_math || !TARGET_ROUND)
30124 break;
30125
30126 if (out_mode == DFmode && in_mode == DFmode)
30127 {
30128 if (out_n == 2 && in_n == 2)
30129 return ix86_builtins[IX86_BUILTIN_RINTPD];
30130 else if (out_n == 4 && in_n == 4)
30131 return ix86_builtins[IX86_BUILTIN_RINTPD256];
30132 }
30133 break;
30134
30135 case BUILT_IN_RINTF:
30136 /* The round insn does not trap on denormals. */
30137 if (flag_trapping_math || !TARGET_ROUND)
30138 break;
30139
30140 if (out_mode == SFmode && in_mode == SFmode)
30141 {
30142 if (out_n == 4 && in_n == 4)
30143 return ix86_builtins[IX86_BUILTIN_RINTPS];
30144 else if (out_n == 8 && in_n == 8)
30145 return ix86_builtins[IX86_BUILTIN_RINTPS256];
30146 }
30147 break;
30148
30149 case BUILT_IN_ROUND:
30150 /* The round insn does not trap on denormals. */
30151 if (flag_trapping_math || !TARGET_ROUND)
30152 break;
30153
30154 if (out_mode == DFmode && in_mode == DFmode)
30155 {
30156 if (out_n == 2 && in_n == 2)
30157 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
30158 else if (out_n == 4 && in_n == 4)
30159 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
30160 }
30161 break;
30162
30163 case BUILT_IN_ROUNDF:
30164 /* The round insn does not trap on denormals. */
30165 if (flag_trapping_math || !TARGET_ROUND)
30166 break;
30167
30168 if (out_mode == SFmode && in_mode == SFmode)
30169 {
30170 if (out_n == 4 && in_n == 4)
30171 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
30172 else if (out_n == 8 && in_n == 8)
30173 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
30174 }
30175 break;
30176
30177 case BUILT_IN_FMA:
30178 if (out_mode == DFmode && in_mode == DFmode)
30179 {
30180 if (out_n == 2 && in_n == 2)
30181 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
30182 if (out_n == 4 && in_n == 4)
30183 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
30184 }
30185 break;
30186
30187 case BUILT_IN_FMAF:
30188 if (out_mode == SFmode && in_mode == SFmode)
30189 {
30190 if (out_n == 4 && in_n == 4)
30191 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
30192 if (out_n == 8 && in_n == 8)
30193 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
30194 }
30195 break;
30196
30197 default:
30198 break;
30199 }
30200
30201 /* Dispatch to a handler for a vectorization library. */
30202 if (ix86_veclib_handler)
30203 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
30204 type_in);
30205
30206 return NULL_TREE;
30207 }
30208
30209 /* Handler for an SVML-style interface to
30210 a library with vectorized intrinsics. */
30211
30212 static tree
30213 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
30214 {
30215 char name[20];
30216 tree fntype, new_fndecl, args;
30217 unsigned arity;
30218 const char *bname;
30219 enum machine_mode el_mode, in_mode;
30220 int n, in_n;
30221
30222 /* The SVML library is suitable for unsafe math only. */
30223 if (!flag_unsafe_math_optimizations)
30224 return NULL_TREE;
30225
30226 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30227 n = TYPE_VECTOR_SUBPARTS (type_out);
30228 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30229 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30230 if (el_mode != in_mode
30231 || n != in_n)
30232 return NULL_TREE;
30233
30234 switch (fn)
30235 {
30236 case BUILT_IN_EXP:
30237 case BUILT_IN_LOG:
30238 case BUILT_IN_LOG10:
30239 case BUILT_IN_POW:
30240 case BUILT_IN_TANH:
30241 case BUILT_IN_TAN:
30242 case BUILT_IN_ATAN:
30243 case BUILT_IN_ATAN2:
30244 case BUILT_IN_ATANH:
30245 case BUILT_IN_CBRT:
30246 case BUILT_IN_SINH:
30247 case BUILT_IN_SIN:
30248 case BUILT_IN_ASINH:
30249 case BUILT_IN_ASIN:
30250 case BUILT_IN_COSH:
30251 case BUILT_IN_COS:
30252 case BUILT_IN_ACOSH:
30253 case BUILT_IN_ACOS:
30254 if (el_mode != DFmode || n != 2)
30255 return NULL_TREE;
30256 break;
30257
30258 case BUILT_IN_EXPF:
30259 case BUILT_IN_LOGF:
30260 case BUILT_IN_LOG10F:
30261 case BUILT_IN_POWF:
30262 case BUILT_IN_TANHF:
30263 case BUILT_IN_TANF:
30264 case BUILT_IN_ATANF:
30265 case BUILT_IN_ATAN2F:
30266 case BUILT_IN_ATANHF:
30267 case BUILT_IN_CBRTF:
30268 case BUILT_IN_SINHF:
30269 case BUILT_IN_SINF:
30270 case BUILT_IN_ASINHF:
30271 case BUILT_IN_ASINF:
30272 case BUILT_IN_COSHF:
30273 case BUILT_IN_COSF:
30274 case BUILT_IN_ACOSHF:
30275 case BUILT_IN_ACOSF:
30276 if (el_mode != SFmode || n != 4)
30277 return NULL_TREE;
30278 break;
30279
30280 default:
30281 return NULL_TREE;
30282 }
30283
30284 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30285
30286 if (fn == BUILT_IN_LOGF)
30287 strcpy (name, "vmlsLn4");
30288 else if (fn == BUILT_IN_LOG)
30289 strcpy (name, "vmldLn2");
30290 else if (n == 4)
30291 {
30292 sprintf (name, "vmls%s", bname+10);
30293 name[strlen (name)-1] = '4';
30294 }
30295 else
30296 sprintf (name, "vmld%s2", bname+10);
30297
30298 /* Convert to uppercase. */
30299 name[4] &= ~0x20;
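/* Worked example (illustrative, not part of the original source): following
   the naming code above, BUILT_IN_SINF with 4-element SFmode vectors yields
   "vmlsSin4" and BUILT_IN_SIN with 2-element DFmode vectors yields
   "vmldSin2"; the actual symbols exported by a given SVML library may
   differ.  */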
30300
30301 arity = 0;
30302 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30303 args;
30304 args = TREE_CHAIN (args))
30305 arity++;
30306
30307 if (arity == 1)
30308 fntype = build_function_type_list (type_out, type_in, NULL);
30309 else
30310 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30311
30312 /* Build a function declaration for the vectorized function. */
30313 new_fndecl = build_decl (BUILTINS_LOCATION,
30314 FUNCTION_DECL, get_identifier (name), fntype);
30315 TREE_PUBLIC (new_fndecl) = 1;
30316 DECL_EXTERNAL (new_fndecl) = 1;
30317 DECL_IS_NOVOPS (new_fndecl) = 1;
30318 TREE_READONLY (new_fndecl) = 1;
30319
30320 return new_fndecl;
30321 }
30322
30323 /* Handler for an ACML-style interface to
30324 a library with vectorized intrinsics. */
30325
30326 static tree
30327 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
30328 {
30329 char name[20] = "__vr.._";
30330 tree fntype, new_fndecl, args;
30331 unsigned arity;
30332 const char *bname;
30333 enum machine_mode el_mode, in_mode;
30334 int n, in_n;
30335
30336 /* The ACML library is 64-bit only and suitable for unsafe math only,
30337 as it does not correctly support parts of IEEE arithmetic with the
30338 required precision, such as denormals. */
30339 if (!TARGET_64BIT
30340 || !flag_unsafe_math_optimizations)
30341 return NULL_TREE;
30342
30343 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30344 n = TYPE_VECTOR_SUBPARTS (type_out);
30345 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30346 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30347 if (el_mode != in_mode
30348 || n != in_n)
30349 return NULL_TREE;
30350
30351 switch (fn)
30352 {
30353 case BUILT_IN_SIN:
30354 case BUILT_IN_COS:
30355 case BUILT_IN_EXP:
30356 case BUILT_IN_LOG:
30357 case BUILT_IN_LOG2:
30358 case BUILT_IN_LOG10:
30359 name[4] = 'd';
30360 name[5] = '2';
30361 if (el_mode != DFmode
30362 || n != 2)
30363 return NULL_TREE;
30364 break;
30365
30366 case BUILT_IN_SINF:
30367 case BUILT_IN_COSF:
30368 case BUILT_IN_EXPF:
30369 case BUILT_IN_POWF:
30370 case BUILT_IN_LOGF:
30371 case BUILT_IN_LOG2F:
30372 case BUILT_IN_LOG10F:
30373 name[4] = 's';
30374 name[5] = '4';
30375 if (el_mode != SFmode
30376 || n != 4)
30377 return NULL_TREE;
30378 break;
30379
30380 default:
30381 return NULL_TREE;
30382 }
30383
30384 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30385 sprintf (name + 7, "%s", bname+10);
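/* Worked example (illustrative, not part of the original source): following
   the naming code above, BUILT_IN_SIN maps to "__vrd2_sin" and
   BUILT_IN_SINF to "__vrs4_sinf"; the actual symbols exported by a given
   ACML build may differ.  */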
30386
30387 arity = 0;
30388 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30389 args;
30390 args = TREE_CHAIN (args))
30391 arity++;
30392
30393 if (arity == 1)
30394 fntype = build_function_type_list (type_out, type_in, NULL);
30395 else
30396 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30397
30398 /* Build a function declaration for the vectorized function. */
30399 new_fndecl = build_decl (BUILTINS_LOCATION,
30400 FUNCTION_DECL, get_identifier (name), fntype);
30401 TREE_PUBLIC (new_fndecl) = 1;
30402 DECL_EXTERNAL (new_fndecl) = 1;
30403 DECL_IS_NOVOPS (new_fndecl) = 1;
30404 TREE_READONLY (new_fndecl) = 1;
30405
30406 return new_fndecl;
30407 }
30408
30409 /* Returns a decl of a function that implements gather load with
30410 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
30411 Return NULL_TREE if it is not available. */
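/* Illustrative sketch (not part of the original source): with -mavx2 the
   vectorizer can use the decl returned below to turn a loop such as

     for (i = 0; i < n; i++)
       a[i] = b[idx[i]];

   into a masked vgatherd*/vgatherq* sequence, with SCALE typically derived
   from the element size of B.  */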
30412
30413 static tree
30414 ix86_vectorize_builtin_gather (const_tree mem_vectype,
30415 const_tree index_type, int scale)
30416 {
30417 bool si;
30418 enum ix86_builtins code;
30419
30420 if (! TARGET_AVX2)
30421 return NULL_TREE;
30422
30423 if ((TREE_CODE (index_type) != INTEGER_TYPE
30424 && !POINTER_TYPE_P (index_type))
30425 || (TYPE_MODE (index_type) != SImode
30426 && TYPE_MODE (index_type) != DImode))
30427 return NULL_TREE;
30428
30429 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
30430 return NULL_TREE;
30431
30432 /* The v*gather* insns sign-extend the index to pointer mode. */
30433 if (TYPE_PRECISION (index_type) < POINTER_SIZE
30434 && TYPE_UNSIGNED (index_type))
30435 return NULL_TREE;
30436
30437 if (scale <= 0
30438 || scale > 8
30439 || (scale & (scale - 1)) != 0)
30440 return NULL_TREE;
30441
30442 si = TYPE_MODE (index_type) == SImode;
30443 switch (TYPE_MODE (mem_vectype))
30444 {
30445 case V2DFmode:
30446 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
30447 break;
30448 case V4DFmode:
30449 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
30450 break;
30451 case V2DImode:
30452 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
30453 break;
30454 case V4DImode:
30455 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
30456 break;
30457 case V4SFmode:
30458 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
30459 break;
30460 case V8SFmode:
30461 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
30462 break;
30463 case V4SImode:
30464 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
30465 break;
30466 case V8SImode:
30467 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
30468 break;
30469 default:
30470 return NULL_TREE;
30471 }
30472
30473 return ix86_builtins[code];
30474 }
30475
30476 /* Returns a code for a target-specific builtin that implements
30477 reciprocal of the function, or NULL_TREE if not available. */
30478
30479 static tree
30480 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
30481 bool sqrt ATTRIBUTE_UNUSED)
30482 {
30483 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
30484 && flag_finite_math_only && !flag_trapping_math
30485 && flag_unsafe_math_optimizations))
30486 return NULL_TREE;
30487
30488 if (md_fn)
30489 /* Machine dependent builtins. */
30490 switch (fn)
30491 {
30492 /* Vectorized version of sqrt to rsqrt conversion. */
30493 case IX86_BUILTIN_SQRTPS_NR:
30494 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
30495
30496 case IX86_BUILTIN_SQRTPS_NR256:
30497 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
30498
30499 default:
30500 return NULL_TREE;
30501 }
30502 else
30503 /* Normal builtins. */
30504 switch (fn)
30505 {
30506 /* Sqrt to rsqrt conversion. */
30507 case BUILT_IN_SQRTF:
30508 return ix86_builtins[IX86_BUILTIN_RSQRTF];
30509
30510 default:
30511 return NULL_TREE;
30512 }
30513 }
30514 \f
30515 /* Helper for avx_vpermilps256_operand et al. This is also used by
30516 the expansion functions to turn the parallel back into a mask.
30517 The return value is 0 for no match and the imm8+1 for a match. */
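/* Worked example (illustrative, not part of the original source): for a
   V4SFmode parallel selecting elements (1 0 3 2), the loop below builds
   mask = 1 | (0 << 2) | (3 << 4) | (2 << 6) = 0xb1 and returns 0xb2,
   i.e. the vpermilps immediate 0xb1 plus one.  */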
30518
30519 int
30520 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
30521 {
30522 unsigned i, nelt = GET_MODE_NUNITS (mode);
30523 unsigned mask = 0;
30524 unsigned char ipar[8];
30525
30526 if (XVECLEN (par, 0) != (int) nelt)
30527 return 0;
30528
30529 /* Validate that all of the elements are constants, and not totally
30530 out of range. Copy the data into an integral array to make the
30531 subsequent checks easier. */
30532 for (i = 0; i < nelt; ++i)
30533 {
30534 rtx er = XVECEXP (par, 0, i);
30535 unsigned HOST_WIDE_INT ei;
30536
30537 if (!CONST_INT_P (er))
30538 return 0;
30539 ei = INTVAL (er);
30540 if (ei >= nelt)
30541 return 0;
30542 ipar[i] = ei;
30543 }
30544
30545 switch (mode)
30546 {
30547 case V4DFmode:
30548 /* In the 256-bit DFmode case, we can only move elements within
30549 a 128-bit lane. */
30550 for (i = 0; i < 2; ++i)
30551 {
30552 if (ipar[i] >= 2)
30553 return 0;
30554 mask |= ipar[i] << i;
30555 }
30556 for (i = 2; i < 4; ++i)
30557 {
30558 if (ipar[i] < 2)
30559 return 0;
30560 mask |= (ipar[i] - 2) << i;
30561 }
30562 break;
30563
30564 case V8SFmode:
30565 /* In the 256-bit SFmode case, we have full freedom of movement
30566 within the low 128-bit lane, but the high 128-bit lane must
30567 mirror the exact same pattern. */
30568 for (i = 0; i < 4; ++i)
30569 if (ipar[i] + 4 != ipar[i + 4])
30570 return 0;
30571 nelt = 4;
30572 /* FALLTHRU */
30573
30574 case V2DFmode:
30575 case V4SFmode:
30576 /* In the 128-bit case, we have full freedom in the placement of
30577 the elements from the source operand. */
30578 for (i = 0; i < nelt; ++i)
30579 mask |= ipar[i] << (i * (nelt / 2));
30580 break;
30581
30582 default:
30583 gcc_unreachable ();
30584 }
30585
30586 /* Make sure success has a non-zero value by adding one. */
30587 return mask + 1;
30588 }
30589
30590 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
30591 the expansion functions to turn the parallel back into a mask.
30592 The return value is 0 for no match and the imm8+1 for a match. */
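/* Worked example (illustrative, not part of the original source): for a
   V8SFmode parallel selecting elements (4 5 6 7 8 9 10 11), the code below
   computes mask = (4/4) | ((8/4) << 4) = 0x21 and returns 0x22, i.e. the
   vperm2f128 immediate 0x21 (high lane of operand 1 into the low half,
   low lane of operand 2 into the high half) plus one.  */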
30593
30594 int
30595 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
30596 {
30597 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
30598 unsigned mask = 0;
30599 unsigned char ipar[8];
30600
30601 if (XVECLEN (par, 0) != (int) nelt)
30602 return 0;
30603
30604 /* Validate that all of the elements are constants, and not totally
30605 out of range. Copy the data into an integral array to make the
30606 subsequent checks easier. */
30607 for (i = 0; i < nelt; ++i)
30608 {
30609 rtx er = XVECEXP (par, 0, i);
30610 unsigned HOST_WIDE_INT ei;
30611
30612 if (!CONST_INT_P (er))
30613 return 0;
30614 ei = INTVAL (er);
30615 if (ei >= 2 * nelt)
30616 return 0;
30617 ipar[i] = ei;
30618 }
30619
30620 /* Validate that each half of the permute selects consecutive elements. */
30621 for (i = 0; i < nelt2 - 1; ++i)
30622 if (ipar[i] + 1 != ipar[i + 1])
30623 return 0;
30624 for (i = nelt2; i < nelt - 1; ++i)
30625 if (ipar[i] + 1 != ipar[i + 1])
30626 return 0;
30627
30628 /* Reconstruct the mask. */
30629 for (i = 0; i < 2; ++i)
30630 {
30631 unsigned e = ipar[i * nelt2];
30632 if (e % nelt2)
30633 return 0;
30634 e /= nelt2;
30635 mask |= e << (i * 4);
30636 }
30637
30638 /* Make sure success has a non-zero value by adding one. */
30639 return mask + 1;
30640 }
30641 \f
30642 /* Store OPERAND to memory after reload has completed. This means
30643 that we can't easily use assign_stack_local. */
30644 rtx
30645 ix86_force_to_memory (enum machine_mode mode, rtx operand)
30646 {
30647 rtx result;
30648
30649 gcc_assert (reload_completed);
30650 if (ix86_using_red_zone ())
30651 {
30652 result = gen_rtx_MEM (mode,
30653 gen_rtx_PLUS (Pmode,
30654 stack_pointer_rtx,
30655 GEN_INT (-RED_ZONE_SIZE)));
30656 emit_move_insn (result, operand);
30657 }
30658 else if (TARGET_64BIT)
30659 {
30660 switch (mode)
30661 {
30662 case HImode:
30663 case SImode:
30664 operand = gen_lowpart (DImode, operand);
30665 /* FALLTHRU */
30666 case DImode:
30667 emit_insn (
30668 gen_rtx_SET (VOIDmode,
30669 gen_rtx_MEM (DImode,
30670 gen_rtx_PRE_DEC (DImode,
30671 stack_pointer_rtx)),
30672 operand));
30673 break;
30674 default:
30675 gcc_unreachable ();
30676 }
30677 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30678 }
30679 else
30680 {
30681 switch (mode)
30682 {
30683 case DImode:
30684 {
30685 rtx operands[2];
30686 split_double_mode (mode, &operand, 1, operands, operands + 1);
30687 emit_insn (
30688 gen_rtx_SET (VOIDmode,
30689 gen_rtx_MEM (SImode,
30690 gen_rtx_PRE_DEC (Pmode,
30691 stack_pointer_rtx)),
30692 operands[1]));
30693 emit_insn (
30694 gen_rtx_SET (VOIDmode,
30695 gen_rtx_MEM (SImode,
30696 gen_rtx_PRE_DEC (Pmode,
30697 stack_pointer_rtx)),
30698 operands[0]));
30699 }
30700 break;
30701 case HImode:
30702 /* Store HImodes as SImodes. */
30703 operand = gen_lowpart (SImode, operand);
30704 /* FALLTHRU */
30705 case SImode:
30706 emit_insn (
30707 gen_rtx_SET (VOIDmode,
30708 gen_rtx_MEM (GET_MODE (operand),
30709 gen_rtx_PRE_DEC (SImode,
30710 stack_pointer_rtx)),
30711 operand));
30712 break;
30713 default:
30714 gcc_unreachable ();
30715 }
30716 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30717 }
30718 return result;
30719 }
30720
30721 /* Free the operand from memory. */
30722 void
30723 ix86_free_from_memory (enum machine_mode mode)
30724 {
30725 if (!ix86_using_red_zone ())
30726 {
30727 int size;
30728
30729 if (mode == DImode || TARGET_64BIT)
30730 size = 8;
30731 else
30732 size = 4;
30733 /* Use LEA to deallocate stack space. In peephole2 it will be converted
30734 to a pop or add instruction if registers are available. */
30735 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
30736 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
30737 GEN_INT (size))));
30738 }
30739 }
30740
30741 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
30742
30743 Put float CONST_DOUBLE in the constant pool instead of fp regs.
30744 QImode must go into class Q_REGS.
30745 Narrow ALL_REGS to GENERAL_REGS. This allows movsf and
30746 movdf to do mem-to-mem moves through integer regs. */
30747
30748 static reg_class_t
30749 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
30750 {
30751 enum machine_mode mode = GET_MODE (x);
30752
30753 /* We're only allowed to return a subclass of CLASS. Many of the
30754 following checks fail for NO_REGS, so eliminate that early. */
30755 if (regclass == NO_REGS)
30756 return NO_REGS;
30757
30758 /* All classes can load zeros. */
30759 if (x == CONST0_RTX (mode))
30760 return regclass;
30761
30762 /* Force constants into memory if we are loading a (nonzero) constant into
30763 an MMX or SSE register. This is because there are no MMX/SSE instructions
30764 to load from a constant. */
30765 if (CONSTANT_P (x)
30766 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
30767 return NO_REGS;
30768
30769 /* Prefer SSE regs only, if we can use them for math. */
30770 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
30771 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
30772
30773 /* Floating-point constants need more complex checks. */
30774 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
30775 {
30776 /* General regs can load everything. */
30777 if (reg_class_subset_p (regclass, GENERAL_REGS))
30778 return regclass;
30779
30780 /* Floats can load 0 and 1 plus some others. Note that we eliminated
30781 zero above. We only want to wind up preferring 80387 registers if
30782 we plan on doing computation with them. */
30783 if (TARGET_80387
30784 && standard_80387_constant_p (x) > 0)
30785 {
30786 /* Limit class to non-sse. */
30787 if (regclass == FLOAT_SSE_REGS)
30788 return FLOAT_REGS;
30789 if (regclass == FP_TOP_SSE_REGS)
30790 return FP_TOP_REG;
30791 if (regclass == FP_SECOND_SSE_REGS)
30792 return FP_SECOND_REG;
30793 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
30794 return regclass;
30795 }
30796
30797 return NO_REGS;
30798 }
30799
30800 /* Generally when we see PLUS here, it's the function invariant
30801 (plus soft-fp const_int), which can only be computed into general
30802 regs. */
30803 if (GET_CODE (x) == PLUS)
30804 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
30805
30806 /* QImode constants are easy to load, but non-constant QImode data
30807 must go into Q_REGS. */
30808 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
30809 {
30810 if (reg_class_subset_p (regclass, Q_REGS))
30811 return regclass;
30812 if (reg_class_subset_p (Q_REGS, regclass))
30813 return Q_REGS;
30814 return NO_REGS;
30815 }
30816
30817 return regclass;
30818 }
30819
30820 /* Discourage putting floating-point values in SSE registers unless
30821 SSE math is being used, and likewise for the 387 registers. */
30822 static reg_class_t
30823 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
30824 {
30825 enum machine_mode mode = GET_MODE (x);
30826
30827 /* Restrict the output reload class to the register bank that we are doing
30828 math on. If we would like not to return a subset of CLASS, reject this
30829 alternative: if reload cannot do this, it will still use its choice. */
30830 mode = GET_MODE (x);
30831 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
30832 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
30833
30834 if (X87_FLOAT_MODE_P (mode))
30835 {
30836 if (regclass == FP_TOP_SSE_REGS)
30837 return FP_TOP_REG;
30838 else if (regclass == FP_SECOND_SSE_REGS)
30839 return FP_SECOND_REG;
30840 else
30841 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
30842 }
30843
30844 return regclass;
30845 }
30846
30847 static reg_class_t
30848 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
30849 enum machine_mode mode, secondary_reload_info *sri)
30850 {
30851 /* Double-word spills from general registers to non-offsettable memory
30852 references (zero-extended addresses) require special handling. */
30853 if (TARGET_64BIT
30854 && MEM_P (x)
30855 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
30856 && rclass == GENERAL_REGS
30857 && !offsettable_memref_p (x))
30858 {
30859 sri->icode = (in_p
30860 ? CODE_FOR_reload_noff_load
30861 : CODE_FOR_reload_noff_store);
30862 /* Add the cost of moving address to a temporary. */
30863 sri->extra_cost = 1;
30864
30865 return NO_REGS;
30866 }
30867
30868 /* QImode spills from non-QI registers require an
30869 intermediate register on 32-bit targets. */
30870 if (!TARGET_64BIT
30871 && !in_p && mode == QImode
30872 && (rclass == GENERAL_REGS
30873 || rclass == LEGACY_REGS
30874 || rclass == INDEX_REGS))
30875 {
30876 int regno;
30877
30878 if (REG_P (x))
30879 regno = REGNO (x);
30880 else
30881 regno = -1;
30882
30883 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
30884 regno = true_regnum (x);
30885
30886 /* Return Q_REGS if the operand is in memory. */
30887 if (regno == -1)
30888 return Q_REGS;
30889 }
30890
30891 /* This condition handles the corner case where an expression involving
30892 pointers gets vectorized. We're trying to use the address of a
30893 stack slot as a vector initializer.
30894
30895 (set (reg:V2DI 74 [ vect_cst_.2 ])
30896 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
30897
30898 Eventually frame gets turned into sp+offset like this:
30899
30900 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30901 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30902 (const_int 392 [0x188]))))
30903
30904 That later gets turned into:
30905
30906 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30907 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30908 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
30909
30910 We'll have the following reload recorded:
30911
30912 Reload 0: reload_in (DI) =
30913 (plus:DI (reg/f:DI 7 sp)
30914 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
30915 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30916 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
30917 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
30918 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30919 reload_reg_rtx: (reg:V2DI 22 xmm1)
30920
30921 Which isn't going to work since SSE instructions can't handle scalar
30922 additions. Returning GENERAL_REGS forces the addition into integer
30923 register and reload can handle subsequent reloads without problems. */
30924
30925 if (in_p && GET_CODE (x) == PLUS
30926 && SSE_CLASS_P (rclass)
30927 && SCALAR_INT_MODE_P (mode))
30928 return GENERAL_REGS;
30929
30930 return NO_REGS;
30931 }
30932
30933 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
30934
30935 static bool
30936 ix86_class_likely_spilled_p (reg_class_t rclass)
30937 {
30938 switch (rclass)
30939 {
30940 case AREG:
30941 case DREG:
30942 case CREG:
30943 case BREG:
30944 case AD_REGS:
30945 case SIREG:
30946 case DIREG:
30947 case SSE_FIRST_REG:
30948 case FP_TOP_REG:
30949 case FP_SECOND_REG:
30950 return true;
30951
30952 default:
30953 break;
30954 }
30955
30956 return false;
30957 }
30958
30959 /* If we are copying between general and FP registers, we need a memory
30960 location. The same is true for SSE and MMX registers.
30961
30962 To optimize register_move_cost performance, allow inline variant.
30963
30964 The macro can't work reliably when one of the CLASSES is a class containing
30965 registers from multiple units (SSE, MMX, integer). We avoid this by never
30966 combining those units in a single alternative in the machine description.
30967 Ensure that this constraint holds to avoid unexpected surprises.
30968
30969 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
30970 enforce these sanity checks. */
30971
30972 static inline bool
30973 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30974 enum machine_mode mode, int strict)
30975 {
30976 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
30977 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
30978 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
30979 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
30980 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
30981 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
30982 {
30983 gcc_assert (!strict);
30984 return true;
30985 }
30986
30987 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
30988 return true;
30989
30990 /* ??? This is a lie. We do have moves between mmx/general, and between
30991 mmx/sse2. But by saying we need secondary memory we discourage the
30992 register allocator from using the mmx registers unless needed. */
30993 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
30994 return true;
30995
30996 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30997 {
30998 /* SSE1 doesn't have any direct moves from other classes. */
30999 if (!TARGET_SSE2)
31000 return true;
31001
31002 /* If the target says that inter-unit moves are more expensive
31003 than moving through memory, then don't generate them. */
31004 if (!TARGET_INTER_UNIT_MOVES)
31005 return true;
31006
31007 /* Between SSE and general, we have moves no larger than word size. */
31008 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
31009 return true;
31010 }
31011
31012 return false;
31013 }
31014
31015 bool
31016 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31017 enum machine_mode mode, int strict)
31018 {
31019 return inline_secondary_memory_needed (class1, class2, mode, strict);
31020 }
31021
31022 /* Implement the TARGET_CLASS_MAX_NREGS hook.
31023
31024 On the 80386, this is the size of MODE in words,
31025 except in the FP regs, where a single reg is always enough. */
31026
31027 static unsigned char
31028 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
31029 {
31030 if (MAYBE_INTEGER_CLASS_P (rclass))
31031 {
31032 if (mode == XFmode)
31033 return (TARGET_64BIT ? 2 : 3);
31034 else if (mode == XCmode)
31035 return (TARGET_64BIT ? 4 : 6);
31036 else
31037 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
31038 }
31039 else
31040 {
31041 if (COMPLEX_MODE_P (mode))
31042 return 2;
31043 else
31044 return 1;
31045 }
31046 }
31047
31048 /* Return true if the registers in CLASS cannot represent the change from
31049 modes FROM to TO. */
31050
31051 bool
31052 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
31053 enum reg_class regclass)
31054 {
31055 if (from == to)
31056 return false;
31057
31058 /* x87 registers can't do subreg at all, as all values are reformatted
31059 to extended precision. */
31060 if (MAYBE_FLOAT_CLASS_P (regclass))
31061 return true;
31062
31063 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
31064 {
31065 /* Vector registers do not support QI or HImode loads. If we don't
31066 disallow a change to these modes, reload will assume it's ok to
31067 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
31068 the vec_dupv4hi pattern. */
31069 if (GET_MODE_SIZE (from) < 4)
31070 return true;
31071
31072 /* Vector registers do not support subreg with nonzero offsets, which
31073 are otherwise valid for integer registers. Since we can't see
31074 whether we have a nonzero offset from here, prohibit all
31075 nonparadoxical subregs changing size. */
31076 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
31077 return true;
31078 }
31079
31080 return false;
31081 }
31082
31083 /* Return the cost of moving data of mode M between a
31084 register and memory. A value of 2 is the default; this cost is
31085 relative to those in `REGISTER_MOVE_COST'.
31086
31087 This function is used extensively by register_move_cost that is used to
31088 build tables at startup. Make it inline in this case.
31089 When IN is 2, return maximum of in and out move cost.
31090
31091 If moving between registers and memory is more expensive than
31092 between two registers, you should define this macro to express the
31093 relative cost.
31094
31095 Also model the increased cost of moving QImode registers in non
31096 Q_REGS classes.
31097 */
31098 static inline int
31099 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
31100 int in)
31101 {
31102 int cost;
31103 if (FLOAT_CLASS_P (regclass))
31104 {
31105 int index;
31106 switch (mode)
31107 {
31108 case SFmode:
31109 index = 0;
31110 break;
31111 case DFmode:
31112 index = 1;
31113 break;
31114 case XFmode:
31115 index = 2;
31116 break;
31117 default:
31118 return 100;
31119 }
31120 if (in == 2)
31121 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
31122 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
31123 }
31124 if (SSE_CLASS_P (regclass))
31125 {
31126 int index;
31127 switch (GET_MODE_SIZE (mode))
31128 {
31129 case 4:
31130 index = 0;
31131 break;
31132 case 8:
31133 index = 1;
31134 break;
31135 case 16:
31136 index = 2;
31137 break;
31138 default:
31139 return 100;
31140 }
31141 if (in == 2)
31142 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
31143 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
31144 }
31145 if (MMX_CLASS_P (regclass))
31146 {
31147 int index;
31148 switch (GET_MODE_SIZE (mode))
31149 {
31150 case 4:
31151 index = 0;
31152 break;
31153 case 8:
31154 index = 1;
31155 break;
31156 default:
31157 return 100;
31158 }
31159 if (in == 2)
31160 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
31161 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
31162 }
31163 switch (GET_MODE_SIZE (mode))
31164 {
31165 case 1:
31166 if (Q_CLASS_P (regclass) || TARGET_64BIT)
31167 {
31168 if (!in)
31169 return ix86_cost->int_store[0];
31170 if (TARGET_PARTIAL_REG_DEPENDENCY
31171 && optimize_function_for_speed_p (cfun))
31172 cost = ix86_cost->movzbl_load;
31173 else
31174 cost = ix86_cost->int_load[0];
31175 if (in == 2)
31176 return MAX (cost, ix86_cost->int_store[0]);
31177 return cost;
31178 }
31179 else
31180 {
31181 if (in == 2)
31182 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
31183 if (in)
31184 return ix86_cost->movzbl_load;
31185 else
31186 return ix86_cost->int_store[0] + 4;
31187 }
31188 break;
31189 case 2:
31190 if (in == 2)
31191 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
31192 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
31193 default:
31194 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
31195 if (mode == TFmode)
31196 mode = XFmode;
31197 if (in == 2)
31198 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
31199 else if (in)
31200 cost = ix86_cost->int_load[2];
31201 else
31202 cost = ix86_cost->int_store[2];
31203 return (cost * (((int) GET_MODE_SIZE (mode)
31204 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
31205 }
31206 }
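/* Worked example of the cost table lookup above: DFmode in an SSE class
   has size 8, so index 1 is used; IN == 1 gives ix86_cost->sse_load[1],
   IN == 0 gives ix86_cost->sse_store[1], and IN == 2 gives the maximum of
   the two.  Sizes with no table entry (32-byte AVX modes, for instance)
   fall back to the flat cost of 100.  */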
31207
31208 static int
31209 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
31210 bool in)
31211 {
31212 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
31213 }
31214
31215
31216 /* Return the cost of moving data from a register in class CLASS1 to
31217 one in class CLASS2.
31218
31219 It is not required that the cost always equal 2 when FROM is the same as TO;
31220 on some machines it is expensive to move between registers if they are not
31221 general registers. */
31222
31223 static int
31224 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
31225 reg_class_t class2_i)
31226 {
31227 enum reg_class class1 = (enum reg_class) class1_i;
31228 enum reg_class class2 = (enum reg_class) class2_i;
31229
31230 /* In case we require secondary memory, compute the cost of the store followed
31231 by the load. In order to avoid bad register allocation choices, this needs
31232 to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
31233
31234 if (inline_secondary_memory_needed (class1, class2, mode, 0))
31235 {
31236 int cost = 1;
31237
31238 cost += inline_memory_move_cost (mode, class1, 2);
31239 cost += inline_memory_move_cost (mode, class2, 2);
31240
31241 /* When copying from a general-purpose register we may emit multiple
31242 stores followed by a single load, causing a memory size mismatch stall.
31243 Count this as an arbitrarily high cost of 20. */
31244 if (targetm.class_max_nregs (class1, mode)
31245 > targetm.class_max_nregs (class2, mode))
31246 cost += 20;
31247
31248 /* In the case of FP/MMX moves, the registers actually overlap, and we
31249 have to switch modes in order to treat them differently. */
31250 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31251 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31252 cost += 20;
31253
31254 return cost;
31255 }
31256
31257 /* Moves between SSE/MMX and integer unit are expensive. */
31258 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31259 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31260
31261 /* ??? By keeping the returned value relatively high, we limit the number
31262 of moves between integer and MMX/SSE registers for all targets.
31263 Additionally, the high value prevents a problem with x86_modes_tieable_p(),
31264 where integer modes in MMX/SSE registers are not tieable
31265 because of missing QImode and HImode moves to, from or between
31266 MMX/SSE registers. */
31267 return MAX (8, ix86_cost->mmxsse_to_integer);
31268
31269 if (MAYBE_FLOAT_CLASS_P (class1))
31270 return ix86_cost->fp_move;
31271 if (MAYBE_SSE_CLASS_P (class1))
31272 return ix86_cost->sse_move;
31273 if (MAYBE_MMX_CLASS_P (class1))
31274 return ix86_cost->mmx_move;
31275 return 2;
31276 }
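/* Worked example: an SImode move between SSE_REGS and GENERAL_REGS on an
   SSE2 target with inter-unit moves enabled needs no secondary memory, so
   it costs MAX (8, ix86_cost->mmxsse_to_integer).  The same move with
   inter-unit moves disabled goes through memory and costs 1 plus both
   memory move costs, plus any of the penalties above that apply.  */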
31277
31278 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31279 MODE. */
31280
31281 bool
31282 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31283 {
31284 /* Flags, and only flags, can hold CCmode values. */
31285 if (CC_REGNO_P (regno))
31286 return GET_MODE_CLASS (mode) == MODE_CC;
31287 if (GET_MODE_CLASS (mode) == MODE_CC
31288 || GET_MODE_CLASS (mode) == MODE_RANDOM
31289 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31290 return false;
31291 if (FP_REGNO_P (regno))
31292 return VALID_FP_MODE_P (mode);
31293 if (SSE_REGNO_P (regno))
31294 {
31295 /* We implement the move patterns for all vector modes into and
31296 out of SSE registers, even when no operation instructions
31297 are available. OImode move is available only when AVX is
31298 enabled. */
31299 return ((TARGET_AVX && mode == OImode)
31300 || VALID_AVX256_REG_MODE (mode)
31301 || VALID_SSE_REG_MODE (mode)
31302 || VALID_SSE2_REG_MODE (mode)
31303 || VALID_MMX_REG_MODE (mode)
31304 || VALID_MMX_REG_MODE_3DNOW (mode));
31305 }
31306 if (MMX_REGNO_P (regno))
31307 {
31308 /* We implement the move patterns for 3DNOW modes even in MMX mode,
31309 so if the register is available at all, then we can move data of
31310 the given mode into or out of it. */
31311 return (VALID_MMX_REG_MODE (mode)
31312 || VALID_MMX_REG_MODE_3DNOW (mode));
31313 }
31314
31315 if (mode == QImode)
31316 {
31317 /* Take care with QImode values - they can live in non-QI regs,
31318 but then they can cause partial register stalls. */
31319 if (regno <= BX_REG || TARGET_64BIT)
31320 return true;
31321 if (!TARGET_PARTIAL_REG_STALL)
31322 return true;
31323 return !can_create_pseudo_p ();
31324 }
31325 /* We handle both integer and floats in the general purpose registers. */
31326 else if (VALID_INT_MODE_P (mode))
31327 return true;
31328 else if (VALID_FP_MODE_P (mode))
31329 return true;
31330 else if (VALID_DFP_MODE_P (mode))
31331 return true;
31332 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
31333 on to use that value in smaller contexts, this can easily force a
31334 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
31335 supporting DImode, allow it. */
31336 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
31337 return true;
31338
31339 return false;
31340 }
31341
31342 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
31343 tieable integer mode. */
31344
31345 static bool
31346 ix86_tieable_integer_mode_p (enum machine_mode mode)
31347 {
31348 switch (mode)
31349 {
31350 case HImode:
31351 case SImode:
31352 return true;
31353
31354 case QImode:
31355 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
31356
31357 case DImode:
31358 return TARGET_64BIT;
31359
31360 default:
31361 return false;
31362 }
31363 }
31364
31365 /* Return true if MODE1 is accessible in a register that can hold MODE2
31366 without copying. That is, all register classes that can hold MODE2
31367 can also hold MODE1. */
31368
31369 bool
31370 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
31371 {
31372 if (mode1 == mode2)
31373 return true;
31374
31375 if (ix86_tieable_integer_mode_p (mode1)
31376 && ix86_tieable_integer_mode_p (mode2))
31377 return true;
31378
31379 /* MODE2 being XFmode implies fp stack or general regs, which means we
31380 can tie any smaller floating point modes to it. Note that we do not
31381 tie this with TFmode. */
31382 if (mode2 == XFmode)
31383 return mode1 == SFmode || mode1 == DFmode;
31384
31385 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
31386 that we can tie it with SFmode. */
31387 if (mode2 == DFmode)
31388 return mode1 == SFmode;
31389
31390 /* If MODE2 is only appropriate for an SSE register, then tie with
31391 any other mode acceptable to SSE registers. */
31392 if (GET_MODE_SIZE (mode2) == 32
31393 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31394 return (GET_MODE_SIZE (mode1) == 32
31395 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31396 if (GET_MODE_SIZE (mode2) == 16
31397 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31398 return (GET_MODE_SIZE (mode1) == 16
31399 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31400
31401 /* If MODE2 is appropriate for an MMX register, then tie
31402 with any other mode acceptable to MMX registers. */
31403 if (GET_MODE_SIZE (mode2) == 8
31404 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
31405 return (GET_MODE_SIZE (mode1) == 8
31406 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
31407
31408 return false;
31409 }
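/* For example, ix86_modes_tieable_p (SFmode, XFmode), (DFmode, XFmode)
   and (SFmode, DFmode) are all true; any two 16-byte vector modes valid
   in SSE registers tie with each other, and with AVX enabled the same
   holds for 32-byte modes.  */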
31410
31411 /* Compute a (partial) cost for rtx X. Return true if the complete
31412 cost has been computed, and false if subexpressions should be
31413 scanned. In either case, *TOTAL contains the cost result. */
31414
31415 static bool
31416 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
31417 bool speed)
31418 {
31419 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
31420 enum machine_mode mode = GET_MODE (x);
31421 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
31422
31423 switch (code)
31424 {
31425 case CONST_INT:
31426 case CONST:
31427 case LABEL_REF:
31428 case SYMBOL_REF:
31429 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
31430 *total = 3;
31431 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
31432 *total = 2;
31433 else if (flag_pic && SYMBOLIC_CONST (x)
31434 && (!TARGET_64BIT
31435 || (GET_CODE (x) != LABEL_REF
31436 && (GET_CODE (x) != SYMBOL_REF
31437 || !SYMBOL_REF_LOCAL_P (x)))))
31438 *total = 1;
31439 else
31440 *total = 0;
31441 return true;
31442
31443 case CONST_DOUBLE:
31444 if (mode == VOIDmode)
31445 *total = 0;
31446 else
31447 switch (standard_80387_constant_p (x))
31448 {
31449 case 1: /* 0.0 */
31450 *total = 1;
31451 break;
31452 default: /* Other constants */
31453 *total = 2;
31454 break;
31455 case 0:
31456 case -1:
31457 /* Start with (MEM (SYMBOL_REF)), since that's where
31458 it'll probably end up. Add a penalty for size. */
31459 *total = (COSTS_N_INSNS (1)
31460 + (flag_pic != 0 && !TARGET_64BIT)
31461 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
31462 break;
31463 }
31464 return true;
31465
31466 case ZERO_EXTEND:
31467 /* Zero extension is often completely free on x86_64, so make
31468 it as cheap as possible. */
31469 if (TARGET_64BIT && mode == DImode
31470 && GET_MODE (XEXP (x, 0)) == SImode)
31471 *total = 1;
31472 else if (TARGET_ZERO_EXTEND_WITH_AND)
31473 *total = cost->add;
31474 else
31475 *total = cost->movzx;
31476 return false;
31477
31478 case SIGN_EXTEND:
31479 *total = cost->movsx;
31480 return false;
31481
31482 case ASHIFT:
31483 if (CONST_INT_P (XEXP (x, 1))
31484 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
31485 {
31486 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31487 if (value == 1)
31488 {
31489 *total = cost->add;
31490 return false;
31491 }
31492 if ((value == 2 || value == 3)
31493 && cost->lea <= cost->shift_const)
31494 {
31495 *total = cost->lea;
31496 return false;
31497 }
31498 }
31499 /* FALLTHRU */
31500
31501 case ROTATE:
31502 case ASHIFTRT:
31503 case LSHIFTRT:
31504 case ROTATERT:
31505 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
31506 {
31507 if (CONST_INT_P (XEXP (x, 1)))
31508 {
31509 if (INTVAL (XEXP (x, 1)) > 32)
31510 *total = cost->shift_const + COSTS_N_INSNS (2);
31511 else
31512 *total = cost->shift_const * 2;
31513 }
31514 else
31515 {
31516 if (GET_CODE (XEXP (x, 1)) == AND)
31517 *total = cost->shift_var * 2;
31518 else
31519 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
31520 }
31521 }
31522 else
31523 {
31524 if (CONST_INT_P (XEXP (x, 1)))
31525 *total = cost->shift_const;
31526 else
31527 *total = cost->shift_var;
31528 }
31529 return false;
31530
31531 case FMA:
31532 {
31533 rtx sub;
31534
31535 gcc_assert (FLOAT_MODE_P (mode));
31536 gcc_assert (TARGET_FMA || TARGET_FMA4);
31537
31538 /* ??? SSE scalar/vector cost should be used here. */
31539 /* ??? Bald assumption that fma has the same cost as fmul. */
31540 *total = cost->fmul;
31541 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
31542
31543 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
31544 sub = XEXP (x, 0);
31545 if (GET_CODE (sub) == NEG)
31546 sub = XEXP (sub, 0);
31547 *total += rtx_cost (sub, FMA, 0, speed);
31548
31549 sub = XEXP (x, 2);
31550 if (GET_CODE (sub) == NEG)
31551 sub = XEXP (sub, 0);
31552 *total += rtx_cost (sub, FMA, 2, speed);
31553 return true;
31554 }
31555
31556 case MULT:
31557 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31558 {
31559 /* ??? SSE scalar cost should be used here. */
31560 *total = cost->fmul;
31561 return false;
31562 }
31563 else if (X87_FLOAT_MODE_P (mode))
31564 {
31565 *total = cost->fmul;
31566 return false;
31567 }
31568 else if (FLOAT_MODE_P (mode))
31569 {
31570 /* ??? SSE vector cost should be used here. */
31571 *total = cost->fmul;
31572 return false;
31573 }
31574 else
31575 {
31576 rtx op0 = XEXP (x, 0);
31577 rtx op1 = XEXP (x, 1);
31578 int nbits;
31579 if (CONST_INT_P (XEXP (x, 1)))
31580 {
31581 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31582 for (nbits = 0; value != 0; value &= value - 1)
31583 nbits++;
31584 }
31585 else
31586 /* This is arbitrary. */
31587 nbits = 7;
31588
31589 /* Compute costs correctly for widening multiplication. */
31590 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
31591 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
31592 == GET_MODE_SIZE (mode))
31593 {
31594 int is_mulwiden = 0;
31595 enum machine_mode inner_mode = GET_MODE (op0);
31596
31597 if (GET_CODE (op0) == GET_CODE (op1))
31598 is_mulwiden = 1, op1 = XEXP (op1, 0);
31599 else if (CONST_INT_P (op1))
31600 {
31601 if (GET_CODE (op0) == SIGN_EXTEND)
31602 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
31603 == INTVAL (op1);
31604 else
31605 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
31606 }
31607
31608 if (is_mulwiden)
31609 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
31610 }
31611
31612 *total = (cost->mult_init[MODE_INDEX (mode)]
31613 + nbits * cost->mult_bit
31614 + rtx_cost (op0, outer_code, opno, speed)
31615 + rtx_cost (op1, outer_code, opno, speed));
31616
31617 return true;
31618 }
31619
31620 case DIV:
31621 case UDIV:
31622 case MOD:
31623 case UMOD:
31624 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31625 /* ??? SSE cost should be used here. */
31626 *total = cost->fdiv;
31627 else if (X87_FLOAT_MODE_P (mode))
31628 *total = cost->fdiv;
31629 else if (FLOAT_MODE_P (mode))
31630 /* ??? SSE vector cost should be used here. */
31631 *total = cost->fdiv;
31632 else
31633 *total = cost->divide[MODE_INDEX (mode)];
31634 return false;
31635
31636 case PLUS:
31637 if (GET_MODE_CLASS (mode) == MODE_INT
31638 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
31639 {
31640 if (GET_CODE (XEXP (x, 0)) == PLUS
31641 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
31642 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
31643 && CONSTANT_P (XEXP (x, 1)))
31644 {
31645 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
31646 if (val == 2 || val == 4 || val == 8)
31647 {
31648 *total = cost->lea;
31649 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31650 outer_code, opno, speed);
31651 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
31652 outer_code, opno, speed);
31653 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31654 return true;
31655 }
31656 }
31657 else if (GET_CODE (XEXP (x, 0)) == MULT
31658 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
31659 {
31660 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
31661 if (val == 2 || val == 4 || val == 8)
31662 {
31663 *total = cost->lea;
31664 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31665 outer_code, opno, speed);
31666 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31667 return true;
31668 }
31669 }
31670 else if (GET_CODE (XEXP (x, 0)) == PLUS)
31671 {
31672 *total = cost->lea;
31673 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31674 outer_code, opno, speed);
31675 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31676 outer_code, opno, speed);
31677 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31678 return true;
31679 }
31680 }
31681 /* FALLTHRU */
31682
31683 case MINUS:
31684 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31685 {
31686 /* ??? SSE cost should be used here. */
31687 *total = cost->fadd;
31688 return false;
31689 }
31690 else if (X87_FLOAT_MODE_P (mode))
31691 {
31692 *total = cost->fadd;
31693 return false;
31694 }
31695 else if (FLOAT_MODE_P (mode))
31696 {
31697 /* ??? SSE vector cost should be used here. */
31698 *total = cost->fadd;
31699 return false;
31700 }
31701 /* FALLTHRU */
31702
31703 case AND:
31704 case IOR:
31705 case XOR:
31706 if (!TARGET_64BIT && mode == DImode)
31707 {
31708 *total = (cost->add * 2
31709 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
31710 << (GET_MODE (XEXP (x, 0)) != DImode))
31711 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
31712 << (GET_MODE (XEXP (x, 1)) != DImode)));
31713 return true;
31714 }
31715 /* FALLTHRU */
31716
31717 case NEG:
31718 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31719 {
31720 /* ??? SSE cost should be used here. */
31721 *total = cost->fchs;
31722 return false;
31723 }
31724 else if (X87_FLOAT_MODE_P (mode))
31725 {
31726 *total = cost->fchs;
31727 return false;
31728 }
31729 else if (FLOAT_MODE_P (mode))
31730 {
31731 /* ??? SSE vector cost should be used here. */
31732 *total = cost->fchs;
31733 return false;
31734 }
31735 /* FALLTHRU */
31736
31737 case NOT:
31738 if (!TARGET_64BIT && mode == DImode)
31739 *total = cost->add * 2;
31740 else
31741 *total = cost->add;
31742 return false;
31743
31744 case COMPARE:
31745 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
31746 && XEXP (XEXP (x, 0), 1) == const1_rtx
31747 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
31748 && XEXP (x, 1) == const0_rtx)
31749 {
31750 /* This kind of construct is implemented using test[bwl].
31751 Treat it as if we had an AND. */
31752 *total = (cost->add
31753 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
31754 + rtx_cost (const1_rtx, outer_code, opno, speed));
31755 return true;
31756 }
31757 return false;
31758
31759 case FLOAT_EXTEND:
31760 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
31761 *total = 0;
31762 return false;
31763
31764 case ABS:
31765 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31766 /* ??? SSE cost should be used here. */
31767 *total = cost->fabs;
31768 else if (X87_FLOAT_MODE_P (mode))
31769 *total = cost->fabs;
31770 else if (FLOAT_MODE_P (mode))
31771 /* ??? SSE vector cost should be used here. */
31772 *total = cost->fabs;
31773 return false;
31774
31775 case SQRT:
31776 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31777 /* ??? SSE cost should be used here. */
31778 *total = cost->fsqrt;
31779 else if (X87_FLOAT_MODE_P (mode))
31780 *total = cost->fsqrt;
31781 else if (FLOAT_MODE_P (mode))
31782 /* ??? SSE vector cost should be used here. */
31783 *total = cost->fsqrt;
31784 return false;
31785
31786 case UNSPEC:
31787 if (XINT (x, 1) == UNSPEC_TP)
31788 *total = 0;
31789 return false;
31790
31791 case VEC_SELECT:
31792 case VEC_CONCAT:
31793 case VEC_MERGE:
31794 case VEC_DUPLICATE:
31795 /* ??? Assume all of these vector manipulation patterns are
31796 recognizable, in which case they all have pretty much the
31797 same cost. */
31798 *total = COSTS_N_INSNS (1);
31799 return true;
31800
31801 default:
31802 return false;
31803 }
31804 }
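/* A few examples of the costing above: in the common case
   (ashift X (const_int 1)) is costed as an add, and
   (ashift X (const_int 3)) as an lea when cost->lea <= cost->shift_const;
   a MULT by a CONST_INT with N bits set costs mult_init plus
   N * mult_bit plus the operand costs; and under -fPIC a symbolic
   constant that is not known to be local costs 1 rather than 0.  */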
31805
31806 #if TARGET_MACHO
31807
31808 static int current_machopic_label_num;
31809
31810 /* Given a symbol name and its associated stub, write out the
31811 definition of the stub. */
31812
31813 void
31814 machopic_output_stub (FILE *file, const char *symb, const char *stub)
31815 {
31816 unsigned int length;
31817 char *binder_name, *symbol_name, lazy_ptr_name[32];
31818 int label = ++current_machopic_label_num;
31819
31820 /* For 64-bit we shouldn't get here. */
31821 gcc_assert (!TARGET_64BIT);
31822
31823 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
31824 symb = targetm.strip_name_encoding (symb);
31825
31826 length = strlen (stub);
31827 binder_name = XALLOCAVEC (char, length + 32);
31828 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
31829
31830 length = strlen (symb);
31831 symbol_name = XALLOCAVEC (char, length + 32);
31832 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
31833
31834 sprintf (lazy_ptr_name, "L%d$lz", label);
31835
31836 if (MACHOPIC_ATT_STUB)
31837 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
31838 else if (MACHOPIC_PURE)
31839 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
31840 else
31841 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
31842
31843 fprintf (file, "%s:\n", stub);
31844 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31845
31846 if (MACHOPIC_ATT_STUB)
31847 {
31848 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
31849 }
31850 else if (MACHOPIC_PURE)
31851 {
31852 /* PIC stub. */
31853 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31854 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
31855 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
31856 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
31857 label, lazy_ptr_name, label);
31858 fprintf (file, "\tjmp\t*%%ecx\n");
31859 }
31860 else
31861 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
31862
31863 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
31864 it needs no stub-binding-helper. */
31865 if (MACHOPIC_ATT_STUB)
31866 return;
31867
31868 fprintf (file, "%s:\n", binder_name);
31869
31870 if (MACHOPIC_PURE)
31871 {
31872 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
31873 fprintf (file, "\tpushl\t%%ecx\n");
31874 }
31875 else
31876 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
31877
31878 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
31879
31880 /* N.B. Keep the correspondence of these
31881 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
31882 old-pic/new-pic/non-pic stubs; altering this will break
31883 compatibility with existing dylibs. */
31884 if (MACHOPIC_PURE)
31885 {
31886 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31887 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
31888 }
31889 else
31890 /* 16-byte -mdynamic-no-pic stub. */
31891 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
31892
31893 fprintf (file, "%s:\n", lazy_ptr_name);
31894 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31895 fprintf (file, ASM_LONG "%s\n", binder_name);
31896 }
31897 #endif /* TARGET_MACHO */
31898
31899 /* Order the registers for register allocator. */
31900
31901 void
31902 x86_order_regs_for_local_alloc (void)
31903 {
31904 int pos = 0;
31905 int i;
31906
31907 /* First allocate the local general purpose registers. */
31908 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31909 if (GENERAL_REGNO_P (i) && call_used_regs[i])
31910 reg_alloc_order [pos++] = i;
31911
31912 /* Global general purpose registers. */
31913 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31914 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
31915 reg_alloc_order [pos++] = i;
31916
31917 /* x87 registers come first in case we are doing FP math
31918 using them. */
31919 if (!TARGET_SSE_MATH)
31920 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31921 reg_alloc_order [pos++] = i;
31922
31923 /* SSE registers. */
31924 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
31925 reg_alloc_order [pos++] = i;
31926 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
31927 reg_alloc_order [pos++] = i;
31928
31929 /* x87 registers. */
31930 if (TARGET_SSE_MATH)
31931 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31932 reg_alloc_order [pos++] = i;
31933
31934 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
31935 reg_alloc_order [pos++] = i;
31936
31937 /* Initialize the rest of the array as we do not allocate some registers
31938 at all. */
31939 while (pos < FIRST_PSEUDO_REGISTER)
31940 reg_alloc_order [pos++] = 0;
31941 }
31942
31943 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
31944 in struct attribute_spec.handler. */
31945 static tree
31946 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
31947 tree args,
31948 int flags ATTRIBUTE_UNUSED,
31949 bool *no_add_attrs)
31950 {
31951 if (TREE_CODE (*node) != FUNCTION_TYPE
31952 && TREE_CODE (*node) != METHOD_TYPE
31953 && TREE_CODE (*node) != FIELD_DECL
31954 && TREE_CODE (*node) != TYPE_DECL)
31955 {
31956 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31957 name);
31958 *no_add_attrs = true;
31959 return NULL_TREE;
31960 }
31961 if (TARGET_64BIT)
31962 {
31963 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
31964 name);
31965 *no_add_attrs = true;
31966 return NULL_TREE;
31967 }
31968 if (is_attribute_p ("callee_pop_aggregate_return", name))
31969 {
31970 tree cst;
31971
31972 cst = TREE_VALUE (args);
31973 if (TREE_CODE (cst) != INTEGER_CST)
31974 {
31975 warning (OPT_Wattributes,
31976 "%qE attribute requires an integer constant argument",
31977 name);
31978 *no_add_attrs = true;
31979 }
31980 else if (compare_tree_int (cst, 0) != 0
31981 && compare_tree_int (cst, 1) != 0)
31982 {
31983 warning (OPT_Wattributes,
31984 "argument to %qE attribute is neither zero, nor one",
31985 name);
31986 *no_add_attrs = true;
31987 }
31988
31989 return NULL_TREE;
31990 }
31991
31992 return NULL_TREE;
31993 }
31994
31995 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
31996 struct attribute_spec.handler. */
31997 static tree
31998 ix86_handle_abi_attribute (tree *node, tree name,
31999 tree args ATTRIBUTE_UNUSED,
32000 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32001 {
32002 if (TREE_CODE (*node) != FUNCTION_TYPE
32003 && TREE_CODE (*node) != METHOD_TYPE
32004 && TREE_CODE (*node) != FIELD_DECL
32005 && TREE_CODE (*node) != TYPE_DECL)
32006 {
32007 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32008 name);
32009 *no_add_attrs = true;
32010 return NULL_TREE;
32011 }
32012
32013 /* Reject combining the ms_abi and sysv_abi attributes on the same type. */
32014 if (is_attribute_p ("ms_abi", name))
32015 {
32016 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
32017 {
32018 error ("ms_abi and sysv_abi attributes are not compatible");
32019 }
32020
32021 return NULL_TREE;
32022 }
32023 else if (is_attribute_p ("sysv_abi", name))
32024 {
32025 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
32026 {
32027 error ("ms_abi and sysv_abi attributes are not compatible");
32028 }
32029
32030 return NULL_TREE;
32031 }
32032
32033 return NULL_TREE;
32034 }
32035
32036 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
32037 struct attribute_spec.handler. */
32038 static tree
32039 ix86_handle_struct_attribute (tree *node, tree name,
32040 tree args ATTRIBUTE_UNUSED,
32041 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32042 {
32043 tree *type = NULL;
32044 if (DECL_P (*node))
32045 {
32046 if (TREE_CODE (*node) == TYPE_DECL)
32047 type = &TREE_TYPE (*node);
32048 }
32049 else
32050 type = node;
32051
32052 if (!(type && (TREE_CODE (*type) == RECORD_TYPE
32053 || TREE_CODE (*type) == UNION_TYPE)))
32054 {
32055 warning (OPT_Wattributes, "%qE attribute ignored",
32056 name);
32057 *no_add_attrs = true;
32058 }
32059
32060 else if ((is_attribute_p ("ms_struct", name)
32061 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
32062 || ((is_attribute_p ("gcc_struct", name)
32063 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
32064 {
32065 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
32066 name);
32067 *no_add_attrs = true;
32068 }
32069
32070 return NULL_TREE;
32071 }
32072
32073 static tree
32074 ix86_handle_fndecl_attribute (tree *node, tree name,
32075 tree args ATTRIBUTE_UNUSED,
32076 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32077 {
32078 if (TREE_CODE (*node) != FUNCTION_DECL)
32079 {
32080 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32081 name);
32082 *no_add_attrs = true;
32083 }
32084 return NULL_TREE;
32085 }
32086
32087 static bool
32088 ix86_ms_bitfield_layout_p (const_tree record_type)
32089 {
32090 return ((TARGET_MS_BITFIELD_LAYOUT
32091 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
32092 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
32093 }
32094
32095 /* Returns an expression indicating where the this parameter is
32096 located on entry to the FUNCTION. */
32097
32098 static rtx
32099 x86_this_parameter (tree function)
32100 {
32101 tree type = TREE_TYPE (function);
32102 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
32103 int nregs;
32104
32105 if (TARGET_64BIT)
32106 {
32107 const int *parm_regs;
32108
32109 if (ix86_function_type_abi (type) == MS_ABI)
32110 parm_regs = x86_64_ms_abi_int_parameter_registers;
32111 else
32112 parm_regs = x86_64_int_parameter_registers;
32113 return gen_rtx_REG (Pmode, parm_regs[aggr]);
32114 }
32115
32116 nregs = ix86_function_regparm (type, function);
32117
32118 if (nregs > 0 && !stdarg_p (type))
32119 {
32120 int regno;
32121 unsigned int ccvt = ix86_get_callcvt (type);
32122
32123 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
32124 regno = aggr ? DX_REG : CX_REG;
32125 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
32126 {
32127 regno = CX_REG;
32128 if (aggr)
32129 return gen_rtx_MEM (SImode,
32130 plus_constant (stack_pointer_rtx, 4));
32131 }
32132 else
32133 {
32134 regno = AX_REG;
32135 if (aggr)
32136 {
32137 regno = DX_REG;
32138 if (nregs == 1)
32139 return gen_rtx_MEM (SImode,
32140 plus_constant (stack_pointer_rtx, 4));
32141 }
32142 }
32143 return gen_rtx_REG (SImode, regno);
32144 }
32145
32146 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
32147 }
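/* Examples: for a 32-bit fastcall method THIS is in %ecx, or %edx when
   the return value is an aggregate returned via hidden pointer; for
   thiscall it is in %ecx, or at 4(%esp) for an aggregate return; with no
   register parameters it lives at 4(%esp), or 8(%esp) when an aggregate
   return pointer is pushed first.  In 64-bit mode it is simply the first
   (or, for aggregate returns, second) integer parameter register of the
   active ABI.  */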
32148
32149 /* Determine whether x86_output_mi_thunk can succeed. */
32150
32151 static bool
32152 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
32153 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
32154 HOST_WIDE_INT vcall_offset, const_tree function)
32155 {
32156 /* 64-bit can handle anything. */
32157 if (TARGET_64BIT)
32158 return true;
32159
32160 /* For 32-bit, everything's fine if we have one free register. */
32161 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
32162 return true;
32163
32164 /* Need a free register for vcall_offset. */
32165 if (vcall_offset)
32166 return false;
32167
32168 /* Need a free register for GOT references. */
32169 if (flag_pic && !targetm.binds_local_p (function))
32170 return false;
32171
32172 /* Otherwise ok. */
32173 return true;
32174 }
32175
32176 /* Output the assembler code for a thunk function. THUNK_DECL is the
32177 declaration for the thunk function itself, FUNCTION is the decl for
32178 the target function. DELTA is an immediate constant offset to be
32179 added to THIS. If VCALL_OFFSET is nonzero, the word at
32180 *(*this + vcall_offset) should be added to THIS. */
32181
32182 static void
32183 x86_output_mi_thunk (FILE *file,
32184 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
32185 HOST_WIDE_INT vcall_offset, tree function)
32186 {
32187 rtx this_param = x86_this_parameter (function);
32188 rtx this_reg, tmp, fnaddr;
32189
32190 emit_note (NOTE_INSN_PROLOGUE_END);
32191
32192 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
32193 pull it in now and let DELTA benefit. */
32194 if (REG_P (this_param))
32195 this_reg = this_param;
32196 else if (vcall_offset)
32197 {
32198 /* Put the this parameter into %eax. */
32199 this_reg = gen_rtx_REG (Pmode, AX_REG);
32200 emit_move_insn (this_reg, this_param);
32201 }
32202 else
32203 this_reg = NULL_RTX;
32204
32205 /* Adjust the this parameter by a fixed constant. */
32206 if (delta)
32207 {
32208 rtx delta_rtx = GEN_INT (delta);
32209 rtx delta_dst = this_reg ? this_reg : this_param;
32210
32211 if (TARGET_64BIT)
32212 {
32213 if (!x86_64_general_operand (delta_rtx, Pmode))
32214 {
32215 tmp = gen_rtx_REG (Pmode, R10_REG);
32216 emit_move_insn (tmp, delta_rtx);
32217 delta_rtx = tmp;
32218 }
32219 }
32220
32221 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
32222 }
32223
32224 /* Adjust the this parameter by a value stored in the vtable. */
32225 if (vcall_offset)
32226 {
32227 rtx vcall_addr, vcall_mem, this_mem;
32228 unsigned int tmp_regno;
32229
32230 if (TARGET_64BIT)
32231 tmp_regno = R10_REG;
32232 else
32233 {
32234 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
32235 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
32236 tmp_regno = AX_REG;
32237 else
32238 tmp_regno = CX_REG;
32239 }
32240 tmp = gen_rtx_REG (Pmode, tmp_regno);
32241
32242 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
32243 if (Pmode != ptr_mode)
32244 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
32245 emit_move_insn (tmp, this_mem);
32246
32247 /* Adjust the this parameter. */
32248 vcall_addr = plus_constant (tmp, vcall_offset);
32249 if (TARGET_64BIT
32250 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
32251 {
32252 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
32253 emit_move_insn (tmp2, GEN_INT (vcall_offset));
32254 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
32255 }
32256
32257 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
32258 if (Pmode != ptr_mode)
32259 emit_insn (gen_addsi_1_zext (this_reg,
32260 gen_rtx_REG (ptr_mode,
32261 REGNO (this_reg)),
32262 vcall_mem));
32263 else
32264 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
32265 }
32266
32267 /* If necessary, drop THIS back to its stack slot. */
32268 if (this_reg && this_reg != this_param)
32269 emit_move_insn (this_param, this_reg);
32270
32271 fnaddr = XEXP (DECL_RTL (function), 0);
32272 if (TARGET_64BIT)
32273 {
32274 if (!flag_pic || targetm.binds_local_p (function)
32275 || cfun->machine->call_abi == MS_ABI)
32276 ;
32277 else
32278 {
32279 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
32280 tmp = gen_rtx_CONST (Pmode, tmp);
32281 fnaddr = gen_rtx_MEM (Pmode, tmp);
32282 }
32283 }
32284 else
32285 {
32286 if (!flag_pic || targetm.binds_local_p (function))
32287 ;
32288 #if TARGET_MACHO
32289 else if (TARGET_MACHO)
32290 {
32291 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
32292 fnaddr = XEXP (fnaddr, 0);
32293 }
32294 #endif /* TARGET_MACHO */
32295 else
32296 {
32297 tmp = gen_rtx_REG (Pmode, CX_REG);
32298 output_set_got (tmp, NULL_RTX);
32299
32300 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
32301 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
32302 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
32303 }
32304 }
32305
32306 /* Our sibling call patterns do not allow memories, because we have no
32307 predicate that can distinguish between frame and non-frame memory.
32308 For our purposes here, we can get away with (ab)using a jump pattern,
32309 because we're going to do no optimization. */
32310 if (MEM_P (fnaddr))
32311 emit_jump_insn (gen_indirect_jump (fnaddr));
32312 else
32313 {
32314 tmp = gen_rtx_MEM (QImode, fnaddr);
32315 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
32316 tmp = emit_call_insn (tmp);
32317 SIBLING_CALL_P (tmp) = 1;
32318 }
32319 emit_barrier ();
32320
32321 /* Emit just enough of rest_of_compilation to get the insns emitted.
32322 Note that use_thunk calls assemble_start_function et al. */
32323 tmp = get_insns ();
32324 insn_locators_alloc ();
32325 shorten_branches (tmp);
32326 final_start_function (tmp, file, 1);
32327 final (tmp, file, 1);
32328 final_end_function ();
32329 }
32330
32331 static void
32332 x86_file_start (void)
32333 {
32334 default_file_start ();
32335 #if TARGET_MACHO
32336 darwin_file_start ();
32337 #endif
32338 if (X86_FILE_START_VERSION_DIRECTIVE)
32339 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
32340 if (X86_FILE_START_FLTUSED)
32341 fputs ("\t.global\t__fltused\n", asm_out_file);
32342 if (ix86_asm_dialect == ASM_INTEL)
32343 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
32344 }
32345
32346 int
32347 x86_field_alignment (tree field, int computed)
32348 {
32349 enum machine_mode mode;
32350 tree type = TREE_TYPE (field);
32351
32352 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
32353 return computed;
32354 mode = TYPE_MODE (strip_array_types (type));
32355 if (mode == DFmode || mode == DCmode
32356 || GET_MODE_CLASS (mode) == MODE_INT
32357 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
32358 return MIN (32, computed);
32359 return computed;
32360 }
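/* For example, on a 32-bit target without -malign-double a 'double' or
   'long long' field is capped at 32-bit alignment, matching the
   traditional ia32 ABI, while 64-bit targets keep the natural computed
   alignment.  */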
32361
32362 /* Output assembler code to FILE to increment profiler label # LABELNO
32363 for profiling a function entry. */
32364 void
32365 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
32366 {
32367 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
32368 : MCOUNT_NAME);
32369
32370 if (TARGET_64BIT)
32371 {
32372 #ifndef NO_PROFILE_COUNTERS
32373 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
32374 #endif
32375
32376 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
32377 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
32378 else
32379 fprintf (file, "\tcall\t%s\n", mcount_name);
32380 }
32381 else if (flag_pic)
32382 {
32383 #ifndef NO_PROFILE_COUNTERS
32384 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
32385 LPREFIX, labelno);
32386 #endif
32387 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
32388 }
32389 else
32390 {
32391 #ifndef NO_PROFILE_COUNTERS
32392 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
32393 LPREFIX, labelno);
32394 #endif
32395 fprintf (file, "\tcall\t%s\n", mcount_name);
32396 }
32397 }
32398
32399 /* We don't have exact information about insn sizes, but we may quite
32400 safely assume that we know about all 1-byte insns and memory
32401 address sizes. This is enough to eliminate unnecessary padding in
32402 99% of cases. */
32403
32404 static int
32405 min_insn_size (rtx insn)
32406 {
32407 int l = 0, len;
32408
32409 if (!INSN_P (insn) || !active_insn_p (insn))
32410 return 0;
32411
32412 /* Discard alignments we've emitted, and jump table data. */
32413 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
32414 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
32415 return 0;
32416 if (JUMP_TABLE_DATA_P (insn))
32417 return 0;
32418
32419 /* Important case - calls are always 5 bytes.
32420 It is common to have many calls in a row. */
32421 if (CALL_P (insn)
32422 && symbolic_reference_mentioned_p (PATTERN (insn))
32423 && !SIBLING_CALL_P (insn))
32424 return 5;
32425 len = get_attr_length (insn);
32426 if (len <= 1)
32427 return 1;
32428
32429 /* For normal instructions we rely on get_attr_length being exact,
32430 with a few exceptions. */
32431 if (!JUMP_P (insn))
32432 {
32433 enum attr_type type = get_attr_type (insn);
32434
32435 switch (type)
32436 {
32437 case TYPE_MULTI:
32438 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
32439 || asm_noperands (PATTERN (insn)) >= 0)
32440 return 0;
32441 break;
32442 case TYPE_OTHER:
32443 case TYPE_FCMP:
32444 break;
32445 default:
32446 /* Otherwise trust get_attr_length. */
32447 return len;
32448 }
32449
32450 l = get_attr_length_address (insn);
32451 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
32452 l = 4;
32453 }
32454 if (l)
32455 return 1+l;
32456 else
32457 return 2;
32458 }
32459
32460 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32461
32462 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
32463 window. */
32464
32465 static void
32466 ix86_avoid_jump_mispredicts (void)
32467 {
32468 rtx insn, start = get_insns ();
32469 int nbytes = 0, njumps = 0;
32470 int isjump = 0;
32471
32472 /* Look for all minimal intervals of instructions containing 4 jumps.
32473 The intervals are bounded by START and INSN. NBYTES is the total
32474 size of instructions in the interval including INSN and not including
32475 START. When NBYTES is smaller than 16 bytes, it is possible
32476 that START and INSN end up in the same 16-byte window.
32477
32478 The smallest offset in the window at which INSN can start is the case
32479 where START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
32480 We emit a p2align for the 16-byte window with maxskip 15 - NBYTES + sizeof (INSN).
32481 */
32482 for (insn = start; insn; insn = NEXT_INSN (insn))
32483 {
32484 int min_size;
32485
32486 if (LABEL_P (insn))
32487 {
32488 int align = label_to_alignment (insn);
32489 int max_skip = label_to_max_skip (insn);
32490
32491 if (max_skip > 15)
32492 max_skip = 15;
32493 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
32494 already in the current 16 byte page, because otherwise
32495 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
32496 bytes to reach 16 byte boundary. */
32497 if (align <= 0
32498 || (align <= 3 && max_skip != (1 << align) - 1))
32499 max_skip = 0;
32500 if (dump_file)
32501 fprintf (dump_file, "Label %i with max_skip %i\n",
32502 INSN_UID (insn), max_skip);
32503 if (max_skip)
32504 {
32505 while (nbytes + max_skip >= 16)
32506 {
32507 start = NEXT_INSN (start);
32508 if ((JUMP_P (start)
32509 && GET_CODE (PATTERN (start)) != ADDR_VEC
32510 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32511 || CALL_P (start))
32512 njumps--, isjump = 1;
32513 else
32514 isjump = 0;
32515 nbytes -= min_insn_size (start);
32516 }
32517 }
32518 continue;
32519 }
32520
32521 min_size = min_insn_size (insn);
32522 nbytes += min_size;
32523 if (dump_file)
32524 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
32525 INSN_UID (insn), min_size);
32526 if ((JUMP_P (insn)
32527 && GET_CODE (PATTERN (insn)) != ADDR_VEC
32528 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
32529 || CALL_P (insn))
32530 njumps++;
32531 else
32532 continue;
32533
32534 while (njumps > 3)
32535 {
32536 start = NEXT_INSN (start);
32537 if ((JUMP_P (start)
32538 && GET_CODE (PATTERN (start)) != ADDR_VEC
32539 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32540 || CALL_P (start))
32541 njumps--, isjump = 1;
32542 else
32543 isjump = 0;
32544 nbytes -= min_insn_size (start);
32545 }
32546 gcc_assert (njumps >= 0);
32547 if (dump_file)
32548 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
32549 INSN_UID (start), INSN_UID (insn), nbytes);
32550
32551 if (njumps == 3 && isjump && nbytes < 16)
32552 {
32553 int padsize = 15 - nbytes + min_insn_size (insn);
32554
32555 if (dump_file)
32556 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
32557 INSN_UID (insn), padsize);
32558 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
32559 }
32560 }
32561 }
32562 #endif
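/* Worked example for the padding above: if four jumps would otherwise fit
   within NBYTES == 14 bytes and the current insn is estimated at 2 bytes,
   a pad of up to 15 - 14 + 2 == 3 bytes is emitted before it, pushing it
   out of the 16-byte window shared with the previous three jumps.  */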
32563
32564 /* AMD Athlon works faster
32565 when RET is not the destination of a conditional jump or directly preceded
32566 by another jump instruction. We avoid the penalty by emitting a longer
32567 encoding of the return instruction in such cases. */
32568 static void
32569 ix86_pad_returns (void)
32570 {
32571 edge e;
32572 edge_iterator ei;
32573
32574 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32575 {
32576 basic_block bb = e->src;
32577 rtx ret = BB_END (bb);
32578 rtx prev;
32579 bool replace = false;
32580
32581 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
32582 || optimize_bb_for_size_p (bb))
32583 continue;
32584 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
32585 if (active_insn_p (prev) || LABEL_P (prev))
32586 break;
32587 if (prev && LABEL_P (prev))
32588 {
32589 edge e;
32590 edge_iterator ei;
32591
32592 FOR_EACH_EDGE (e, ei, bb->preds)
32593 if (EDGE_FREQUENCY (e) && e->src->index >= 0
32594 && !(e->flags & EDGE_FALLTHRU))
32595 replace = true;
32596 }
32597 if (!replace)
32598 {
32599 prev = prev_active_insn (ret);
32600 if (prev
32601 && ((JUMP_P (prev) && any_condjump_p (prev))
32602 || CALL_P (prev)))
32603 replace = true;
32604 /* Empty functions get branch mispredict even when
32605 the jump destination is not visible to us. */
32606 if (!prev && !optimize_function_for_size_p (cfun))
32607 replace = true;
32608 }
32609 if (replace)
32610 {
32611 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
32612 delete_insn (ret);
32613 }
32614 }
32615 }
32616
32617 /* Count the minimum number of instructions in BB. Return 4 if the
32618 number of instructions >= 4. */
32619
32620 static int
32621 ix86_count_insn_bb (basic_block bb)
32622 {
32623 rtx insn;
32624 int insn_count = 0;
32625
32626 /* Count number of instructions in this block. Return 4 if the number
32627 of instructions >= 4. */
32628 FOR_BB_INSNS (bb, insn)
32629 {
32630 /* This only happens in exit blocks. */
32631 if (JUMP_P (insn)
32632 && ANY_RETURN_P (PATTERN (insn)))
32633 break;
32634
32635 if (NONDEBUG_INSN_P (insn)
32636 && GET_CODE (PATTERN (insn)) != USE
32637 && GET_CODE (PATTERN (insn)) != CLOBBER)
32638 {
32639 insn_count++;
32640 if (insn_count >= 4)
32641 return insn_count;
32642 }
32643 }
32644
32645 return insn_count;
32646 }
32647
32648
32649 /* Count the minimum number of instructions in code path in BB.
32650 Return 4 if the number of instructions >= 4. */
32651
32652 static int
32653 ix86_count_insn (basic_block bb)
32654 {
32655 edge e;
32656 edge_iterator ei;
32657 int min_prev_count;
32658
32659 /* Only bother counting instructions along paths with no
32660 more than 2 basic blocks between entry and exit. Given
32661 that BB has an edge to exit, determine if a predecessor
32662 of BB has an edge from entry. If so, compute the number
32663 of instructions in the predecessor block. If there
32664 happen to be multiple such blocks, compute the minimum. */
32665 min_prev_count = 4;
32666 FOR_EACH_EDGE (e, ei, bb->preds)
32667 {
32668 edge prev_e;
32669 edge_iterator prev_ei;
32670
32671 if (e->src == ENTRY_BLOCK_PTR)
32672 {
32673 min_prev_count = 0;
32674 break;
32675 }
32676 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
32677 {
32678 if (prev_e->src == ENTRY_BLOCK_PTR)
32679 {
32680 int count = ix86_count_insn_bb (e->src);
32681 if (count < min_prev_count)
32682 min_prev_count = count;
32683 break;
32684 }
32685 }
32686 }
32687
32688 if (min_prev_count < 4)
32689 min_prev_count += ix86_count_insn_bb (bb);
32690
32691 return min_prev_count;
32692 }
32693
32694 /* Pad a short function to 4 instructions. */
32695
32696 static void
32697 ix86_pad_short_function (void)
32698 {
32699 edge e;
32700 edge_iterator ei;
32701
32702 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32703 {
32704 rtx ret = BB_END (e->src);
32705 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
32706 {
32707 int insn_count = ix86_count_insn (e->src);
32708
32709 /* Pad short function. */
32710 if (insn_count < 4)
32711 {
32712 rtx insn = ret;
32713
32714 /* Find epilogue. */
32715 while (insn
32716 && (!NOTE_P (insn)
32717 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
32718 insn = PREV_INSN (insn);
32719
32720 if (!insn)
32721 insn = ret;
32722
32723 /* Two NOPs count as one instruction. */
32724 insn_count = 2 * (4 - insn_count);
32725 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
32726 }
32727 }
32728 }
32729 }
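/* For example, a function whose only path to exit contains 3 real
   instructions gets gen_nops (GEN_INT (2)) emitted just before the
   epilogue, i.e. two NOPs, which count as one extra instruction and bring
   the path up to the 4-instruction minimum.  */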
32730
32731 /* Implement machine specific optimizations. We implement padding of returns
32732 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
32733 static void
32734 ix86_reorg (void)
32735 {
32736 /* We are freeing block_for_insn in the toplev to keep compatibility
32737 with old MDEP_REORGS that are not CFG based. Recompute it now. */
32738 compute_bb_for_insn ();
32739
32740 /* Run the vzeroupper optimization if needed. */
32741 if (TARGET_VZEROUPPER)
32742 move_or_delete_vzeroupper ();
32743
32744 if (optimize && optimize_function_for_speed_p (cfun))
32745 {
32746 if (TARGET_PAD_SHORT_FUNCTION)
32747 ix86_pad_short_function ();
32748 else if (TARGET_PAD_RETURNS)
32749 ix86_pad_returns ();
32750 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32751 if (TARGET_FOUR_JUMP_LIMIT)
32752 ix86_avoid_jump_mispredicts ();
32753 #endif
32754 }
32755 }
32756
32757 /* Return nonzero when a QImode register that must be represented via a REX
32758 prefix is used. */
32759 bool
32760 x86_extended_QIreg_mentioned_p (rtx insn)
32761 {
32762 int i;
32763 extract_insn_cached (insn);
32764 for (i = 0; i < recog_data.n_operands; i++)
32765 if (REG_P (recog_data.operand[i])
32766 && REGNO (recog_data.operand[i]) > BX_REG)
32767 return true;
32768 return false;
32769 }
32770
32771 /* Return nonzero when P points to a register encoded via a REX prefix.
32772 Called via for_each_rtx. */
32773 static int
32774 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
32775 {
32776 unsigned int regno;
32777 if (!REG_P (*p))
32778 return 0;
32779 regno = REGNO (*p);
32780 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
32781 }
32782
32783 /* Return true when INSN mentions register that must be encoded using REX
32784 prefix. */
32785 bool
32786 x86_extended_reg_mentioned_p (rtx insn)
32787 {
32788 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
32789 extended_reg_mentioned_1, NULL);
32790 }
32791
32792 /* If profitable, negate (without causing overflow) integer constant
32793 of mode MODE at location LOC. Return true in this case. */
32794 bool
32795 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
32796 {
32797 HOST_WIDE_INT val;
32798
32799 if (!CONST_INT_P (*loc))
32800 return false;
32801
32802 switch (mode)
32803 {
32804 case DImode:
32805 /* DImode x86_64 constants must fit in 32 bits. */
32806 gcc_assert (x86_64_immediate_operand (*loc, mode));
32807
32808 mode = SImode;
32809 break;
32810
32811 case SImode:
32812 case HImode:
32813 case QImode:
32814 break;
32815
32816 default:
32817 gcc_unreachable ();
32818 }
32819
32820 /* Avoid overflows. */
32821 if (mode_signbit_p (mode, *loc))
32822 return false;
32823
32824 val = INTVAL (*loc);
32825
32826 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
32827 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
32828 if ((val < 0 && val != -128)
32829 || val == 128)
32830 {
32831 *loc = GEN_INT (-val);
32832 return true;
32833 }
32834
32835 return false;
32836 }
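/* For example, with *LOC == -4 in SImode the constant becomes 4 and the
   caller can emit `subl $4,%eax' instead of `addl $-4,%eax'; with
   *LOC == 128 it becomes -128, which still fits in a sign-extended 8-bit
   immediate while 128 does not.  The sign-bit values themselves
   (0x80000000 for SImode) are never negated since that would overflow.  */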
32837
32838 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
32839 optabs would emit if we didn't have TFmode patterns. */
32840
32841 void
32842 x86_emit_floatuns (rtx operands[2])
32843 {
32844 rtx neglab, donelab, i0, i1, f0, in, out;
32845 enum machine_mode mode, inmode;
32846
32847 inmode = GET_MODE (operands[1]);
32848 gcc_assert (inmode == SImode || inmode == DImode);
32849
32850 out = operands[0];
32851 in = force_reg (inmode, operands[1]);
32852 mode = GET_MODE (out);
32853 neglab = gen_label_rtx ();
32854 donelab = gen_label_rtx ();
32855 f0 = gen_reg_rtx (mode);
32856
32857 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
32858
32859 expand_float (out, in, 0);
32860
32861 emit_jump_insn (gen_jump (donelab));
32862 emit_barrier ();
32863
32864 emit_label (neglab);
32865
32866 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
32867 1, OPTAB_DIRECT);
32868 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
32869 1, OPTAB_DIRECT);
32870 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
32871
32872 expand_float (f0, i0, 0);
32873
32874 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
32875
32876 emit_label (donelab);
32877 }
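/* In outline, the expansion above is (sketch):

     if (in >= 0)
       out = (FP) in;                    -- plain signed conversion
     else
       {
         i0 = (in >> 1) | (in & 1);      -- halve, folding the lost bit in
         f0 = (FP) i0;
         out = f0 + f0;                  -- double to restore the magnitude
       }

   Folding the discarded low bit into I0 is effectively a round-to-odd
   halving, which prevents the conversion-then-doubling from introducing
   double rounding.  */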
32878 \f
32879 /* AVX2 does support 32-byte integer vector operations,
32880 thus the longest vector we are faced with is V32QImode. */
32881 #define MAX_VECT_LEN 32
32882
32883 struct expand_vec_perm_d
32884 {
32885 rtx target, op0, op1;
32886 unsigned char perm[MAX_VECT_LEN];
32887 enum machine_mode vmode;
32888 unsigned char nelt;
32889 bool testing_p;
32890 };
32891
32892 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
32893 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
32894
32895 /* Get a vector mode of the same size as the original but with elements
32896 twice as wide. This is only guaranteed to apply to integral vectors. */
32897
32898 static inline enum machine_mode
32899 get_mode_wider_vector (enum machine_mode o)
32900 {
32901 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
32902 enum machine_mode n = GET_MODE_WIDER_MODE (o);
32903 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
32904 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
32905 return n;
32906 }
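/* For example, V16QImode yields V8HImode and V8HImode yields V4SImode:
   the vector size in bytes stays the same while the element count halves,
   as the assertions above verify.  */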
32907
32908 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32909 with all elements equal to VAR. Return true if successful. */
32910
32911 static bool
32912 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
32913 rtx target, rtx val)
32914 {
32915 bool ok;
32916
32917 switch (mode)
32918 {
32919 case V2SImode:
32920 case V2SFmode:
32921 if (!mmx_ok)
32922 return false;
32923 /* FALLTHRU */
32924
32925 case V4DFmode:
32926 case V4DImode:
32927 case V8SFmode:
32928 case V8SImode:
32929 case V2DFmode:
32930 case V2DImode:
32931 case V4SFmode:
32932 case V4SImode:
32933 {
32934 rtx insn, dup;
32935
32936 /* First attempt to recognize VAL as-is. */
32937 dup = gen_rtx_VEC_DUPLICATE (mode, val);
32938 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
32939 if (recog_memoized (insn) < 0)
32940 {
32941 rtx seq;
32942 /* If that fails, force VAL into a register. */
32943
32944 start_sequence ();
32945 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
32946 seq = get_insns ();
32947 end_sequence ();
32948 if (seq)
32949 emit_insn_before (seq, insn);
32950
32951 ok = recog_memoized (insn) >= 0;
32952 gcc_assert (ok);
32953 }
32954 }
32955 return true;
32956
32957 case V4HImode:
32958 if (!mmx_ok)
32959 return false;
32960 if (TARGET_SSE || TARGET_3DNOW_A)
32961 {
32962 rtx x;
32963
32964 val = gen_lowpart (SImode, val);
32965 x = gen_rtx_TRUNCATE (HImode, val);
32966 x = gen_rtx_VEC_DUPLICATE (mode, x);
32967 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32968 return true;
32969 }
32970 goto widen;
32971
32972 case V8QImode:
32973 if (!mmx_ok)
32974 return false;
32975 goto widen;
32976
32977 case V8HImode:
32978 if (TARGET_SSE2)
32979 {
32980 struct expand_vec_perm_d dperm;
32981 rtx tmp1, tmp2;
32982
32983 permute:
32984 memset (&dperm, 0, sizeof (dperm));
32985 dperm.target = target;
32986 dperm.vmode = mode;
32987 dperm.nelt = GET_MODE_NUNITS (mode);
32988 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
32989
32990 /* Extend to SImode using a paradoxical SUBREG. */
32991 tmp1 = gen_reg_rtx (SImode);
32992 emit_move_insn (tmp1, gen_lowpart (SImode, val));
32993
32994 /* Insert the SImode value as low element of a V4SImode vector. */
32995 tmp2 = gen_lowpart (V4SImode, dperm.op0);
32996 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
32997
32998 ok = (expand_vec_perm_1 (&dperm)
32999 || expand_vec_perm_broadcast_1 (&dperm));
33000 gcc_assert (ok);
33001 return ok;
33002 }
33003 goto widen;
33004
33005 case V16QImode:
33006 if (TARGET_SSE2)
33007 goto permute;
33008 goto widen;
33009
33010 widen:
33011 /* Replicate the value once into the next wider mode and recurse. */
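	  /* For example, broadcasting the QImode value 0xAB into V8QImode
	     first forms the HImode value 0xABAB (val | (val << 8)) and then
	     recurses with V4HImode, possibly widening again until one of
	     the cases above matches.  */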
33012 {
33013 enum machine_mode smode, wsmode, wvmode;
33014 rtx x;
33015
33016 smode = GET_MODE_INNER (mode);
33017 wvmode = get_mode_wider_vector (mode);
33018 wsmode = GET_MODE_INNER (wvmode);
33019
33020 val = convert_modes (wsmode, smode, val, true);
33021 x = expand_simple_binop (wsmode, ASHIFT, val,
33022 GEN_INT (GET_MODE_BITSIZE (smode)),
33023 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33024 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
33025
33026 x = gen_lowpart (wvmode, target);
33027 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
33028 gcc_assert (ok);
33029 return ok;
33030 }
33031
33032 case V16HImode:
33033 case V32QImode:
33034 {
33035 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
33036 rtx x = gen_reg_rtx (hvmode);
33037
33038 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
33039 gcc_assert (ok);
33040
33041 x = gen_rtx_VEC_CONCAT (mode, x, x);
33042 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33043 }
33044 return true;
33045
33046 default:
33047 return false;
33048 }
33049 }
33050
33051 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33052 whose ONE_VAR element is VAR, and other elements are zero. Return true
33053 if successful. */
33054
33055 static bool
33056 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
33057 rtx target, rtx var, int one_var)
33058 {
33059 enum machine_mode vsimode;
33060 rtx new_target;
33061 rtx x, tmp;
33062 bool use_vector_set = false;
33063
33064 switch (mode)
33065 {
33066 case V2DImode:
33067 /* For SSE4.1, we normally use vector set. But if the second
33068 element is zero and inter-unit moves are OK, we use movq
33069 instead. */
33070 use_vector_set = (TARGET_64BIT
33071 && TARGET_SSE4_1
33072 && !(TARGET_INTER_UNIT_MOVES
33073 && one_var == 0));
33074 break;
33075 case V16QImode:
33076 case V4SImode:
33077 case V4SFmode:
33078 use_vector_set = TARGET_SSE4_1;
33079 break;
33080 case V8HImode:
33081 use_vector_set = TARGET_SSE2;
33082 break;
33083 case V4HImode:
33084 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
33085 break;
33086 case V32QImode:
33087 case V16HImode:
33088 case V8SImode:
33089 case V8SFmode:
33090 case V4DFmode:
33091 use_vector_set = TARGET_AVX;
33092 break;
33093 case V4DImode:
33094 /* Use ix86_expand_vector_set in 64bit mode only. */
33095 use_vector_set = TARGET_AVX && TARGET_64BIT;
33096 break;
33097 default:
33098 break;
33099 }
33100
33101 if (use_vector_set)
33102 {
33103 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
33104 var = force_reg (GET_MODE_INNER (mode), var);
33105 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33106 return true;
33107 }
33108
33109 switch (mode)
33110 {
33111 case V2SFmode:
33112 case V2SImode:
33113 if (!mmx_ok)
33114 return false;
33115 /* FALLTHRU */
33116
33117 case V2DFmode:
33118 case V2DImode:
33119 if (one_var != 0)
33120 return false;
33121 var = force_reg (GET_MODE_INNER (mode), var);
33122 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
33123 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33124 return true;
33125
33126 case V4SFmode:
33127 case V4SImode:
33128 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
33129 new_target = gen_reg_rtx (mode);
33130 else
33131 new_target = target;
33132 var = force_reg (GET_MODE_INNER (mode), var);
33133 x = gen_rtx_VEC_DUPLICATE (mode, var);
33134 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
33135 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
33136 if (one_var != 0)
33137 {
33138 /* We need to shuffle the value to the correct position, so
33139 create a new pseudo to store the intermediate result. */
33140
33141 /* With SSE2, we can use the integer shuffle insns. */
33142 if (mode != V4SFmode && TARGET_SSE2)
33143 {
33144 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
33145 const1_rtx,
33146 GEN_INT (one_var == 1 ? 0 : 1),
33147 GEN_INT (one_var == 2 ? 0 : 1),
33148 GEN_INT (one_var == 3 ? 0 : 1)));
33149 if (target != new_target)
33150 emit_move_insn (target, new_target);
33151 return true;
33152 }
33153
33154 /* Otherwise convert the intermediate result to V4SFmode and
33155 use the SSE1 shuffle instructions. */
33156 if (mode != V4SFmode)
33157 {
33158 tmp = gen_reg_rtx (V4SFmode);
33159 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
33160 }
33161 else
33162 tmp = new_target;
33163
33164 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
33165 const1_rtx,
33166 GEN_INT (one_var == 1 ? 0 : 1),
33167 GEN_INT (one_var == 2 ? 0+4 : 1+4),
33168 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
33169
33170 if (mode != V4SFmode)
33171 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
33172 else if (tmp != target)
33173 emit_move_insn (target, tmp);
33174 }
33175 else if (target != new_target)
33176 emit_move_insn (target, new_target);
33177 return true;
33178
33179 case V8HImode:
33180 case V16QImode:
33181 vsimode = V4SImode;
33182 goto widen;
33183 case V4HImode:
33184 case V8QImode:
33185 if (!mmx_ok)
33186 return false;
33187 vsimode = V2SImode;
33188 goto widen;
33189 widen:
33190 if (one_var != 0)
33191 return false;
33192
33193 /* Zero extend the variable element to SImode and recurse. */
33194 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
33195
33196 x = gen_reg_rtx (vsimode);
33197 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
33198 var, one_var))
33199 gcc_unreachable ();
33200
33201 emit_move_insn (target, gen_lowpart (mode, x));
33202 return true;
33203
33204 default:
33205 return false;
33206 }
33207 }
33208
33209 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33210 consisting of the values in VALS. It is known that all elements
33211 except ONE_VAR are constants. Return true if successful. */
33212
33213 static bool
33214 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
33215 rtx target, rtx vals, int one_var)
33216 {
33217 rtx var = XVECEXP (vals, 0, one_var);
33218 enum machine_mode wmode;
33219 rtx const_vec, x;
33220
33221 const_vec = copy_rtx (vals);
33222 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
33223 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
33224
33225 switch (mode)
33226 {
33227 case V2DFmode:
33228 case V2DImode:
33229 case V2SFmode:
33230 case V2SImode:
33231 /* For the two element vectors, it's just as easy to use
33232 the general case. */
33233 return false;
33234
33235 case V4DImode:
33236 /* Use ix86_expand_vector_set in 64bit mode only. */
33237 if (!TARGET_64BIT)
33238 return false;
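	/* FALLTHRU */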
33239 case V4DFmode:
33240 case V8SFmode:
33241 case V8SImode:
33242 case V16HImode:
33243 case V32QImode:
33244 case V4SFmode:
33245 case V4SImode:
33246 case V8HImode:
33247 case V4HImode:
33248 break;
33249
33250 case V16QImode:
33251 if (TARGET_SSE4_1)
33252 break;
33253 wmode = V8HImode;
33254 goto widen;
33255 case V8QImode:
33256 wmode = V4HImode;
33257 goto widen;
33258 widen:
33259 /* There's no way to set one QImode entry easily. Combine
33260 the variable value with its adjacent constant value, and
33261 promote to an HImode set. */
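      /* For example, if ONE_VAR is odd the variable byte becomes the high
	 half of its HImode pair: it is zero extended to HImode, shifted
	 left by 8 and IORed with the adjacent constant low byte before
	 the HImode vector set at element ONE_VAR >> 1 below.  */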
33262 x = XVECEXP (vals, 0, one_var ^ 1);
33263 if (one_var & 1)
33264 {
33265 var = convert_modes (HImode, QImode, var, true);
33266 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
33267 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33268 x = GEN_INT (INTVAL (x) & 0xff);
33269 }
33270 else
33271 {
33272 var = convert_modes (HImode, QImode, var, true);
33273 x = gen_int_mode (INTVAL (x) << 8, HImode);
33274 }
33275 if (x != const0_rtx)
33276 var = expand_simple_binop (HImode, IOR, var, x, var,
33277 1, OPTAB_LIB_WIDEN);
33278
33279 x = gen_reg_rtx (wmode);
33280 emit_move_insn (x, gen_lowpart (wmode, const_vec));
33281 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
33282
33283 emit_move_insn (target, gen_lowpart (mode, x));
33284 return true;
33285
33286 default:
33287 return false;
33288 }
33289
33290 emit_move_insn (target, const_vec);
33291 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33292 return true;
33293 }
33294
33295 /* A subroutine of ix86_expand_vector_init_general. Use vector
33296 concatenate to handle the most general case: all values variable,
33297 and none identical. */
33298
33299 static void
33300 ix86_expand_vector_init_concat (enum machine_mode mode,
33301 rtx target, rtx *ops, int n)
33302 {
33303 enum machine_mode cmode, hmode = VOIDmode;
33304 rtx first[8], second[4];
33305 rtvec v;
33306 int i, j;
33307
33308 switch (n)
33309 {
33310 case 2:
33311 switch (mode)
33312 {
33313 case V8SImode:
33314 cmode = V4SImode;
33315 break;
33316 case V8SFmode:
33317 cmode = V4SFmode;
33318 break;
33319 case V4DImode:
33320 cmode = V2DImode;
33321 break;
33322 case V4DFmode:
33323 cmode = V2DFmode;
33324 break;
33325 case V4SImode:
33326 cmode = V2SImode;
33327 break;
33328 case V4SFmode:
33329 cmode = V2SFmode;
33330 break;
33331 case V2DImode:
33332 cmode = DImode;
33333 break;
33334 case V2SImode:
33335 cmode = SImode;
33336 break;
33337 case V2DFmode:
33338 cmode = DFmode;
33339 break;
33340 case V2SFmode:
33341 cmode = SFmode;
33342 break;
33343 default:
33344 gcc_unreachable ();
33345 }
33346
33347 if (!register_operand (ops[1], cmode))
33348 ops[1] = force_reg (cmode, ops[1]);
33349 if (!register_operand (ops[0], cmode))
33350 ops[0] = force_reg (cmode, ops[0]);
33351 emit_insn (gen_rtx_SET (VOIDmode, target,
33352 gen_rtx_VEC_CONCAT (mode, ops[0],
33353 ops[1])));
33354 break;
33355
33356 case 4:
33357 switch (mode)
33358 {
33359 case V4DImode:
33360 cmode = V2DImode;
33361 break;
33362 case V4DFmode:
33363 cmode = V2DFmode;
33364 break;
33365 case V4SImode:
33366 cmode = V2SImode;
33367 break;
33368 case V4SFmode:
33369 cmode = V2SFmode;
33370 break;
33371 default:
33372 gcc_unreachable ();
33373 }
33374 goto half;
33375
33376 case 8:
33377 switch (mode)
33378 {
33379 case V8SImode:
33380 cmode = V2SImode;
33381 hmode = V4SImode;
33382 break;
33383 case V8SFmode:
33384 cmode = V2SFmode;
33385 hmode = V4SFmode;
33386 break;
33387 default:
33388 gcc_unreachable ();
33389 }
33390 goto half;
33391
33392 half:
33393 /* FIXME: We process inputs backward to help RA. PR 36222. */
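      /* For example, with n == 8 and V8SImode the eight scalar inputs are
	 first paired into four V2SImode registers, those are concatenated
	 into two V4SImode registers, and a final VEC_CONCAT produces the
	 V8SImode result.  */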
33394 i = n - 1;
33395 j = (n >> 1) - 1;
33396 for (; i > 0; i -= 2, j--)
33397 {
33398 first[j] = gen_reg_rtx (cmode);
33399 v = gen_rtvec (2, ops[i - 1], ops[i]);
33400 ix86_expand_vector_init (false, first[j],
33401 gen_rtx_PARALLEL (cmode, v));
33402 }
33403
33404 n >>= 1;
33405 if (n > 2)
33406 {
33407 gcc_assert (hmode != VOIDmode);
33408 for (i = j = 0; i < n; i += 2, j++)
33409 {
33410 second[j] = gen_reg_rtx (hmode);
33411 ix86_expand_vector_init_concat (hmode, second [j],
33412 &first [i], 2);
33413 }
33414 n >>= 1;
33415 ix86_expand_vector_init_concat (mode, target, second, n);
33416 }
33417 else
33418 ix86_expand_vector_init_concat (mode, target, first, n);
33419 break;
33420
33421 default:
33422 gcc_unreachable ();
33423 }
33424 }
33425
33426 /* A subroutine of ix86_expand_vector_init_general. Use vector
33427 interleave to handle the most general case: all values variable,
33428 and none identical. */
33429
33430 static void
33431 ix86_expand_vector_init_interleave (enum machine_mode mode,
33432 rtx target, rtx *ops, int n)
33433 {
33434 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
33435 int i, j;
33436 rtx op0, op1;
33437 rtx (*gen_load_even) (rtx, rtx, rtx);
33438 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
33439 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
33440
33441 switch (mode)
33442 {
33443 case V8HImode:
33444 gen_load_even = gen_vec_setv8hi;
33445 gen_interleave_first_low = gen_vec_interleave_lowv4si;
33446 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33447 inner_mode = HImode;
33448 first_imode = V4SImode;
33449 second_imode = V2DImode;
33450 third_imode = VOIDmode;
33451 break;
33452 case V16QImode:
33453 gen_load_even = gen_vec_setv16qi;
33454 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
33455 gen_interleave_second_low = gen_vec_interleave_lowv4si;
33456 inner_mode = QImode;
33457 first_imode = V8HImode;
33458 second_imode = V4SImode;
33459 third_imode = V2DImode;
33460 break;
33461 default:
33462 gcc_unreachable ();
33463 }
33464
33465 for (i = 0; i < n; i++)
33466 {
33467 /* Extend the odd element to SImode using a paradoxical SUBREG. */
33468 op0 = gen_reg_rtx (SImode);
33469 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
33470
33471 /* Insert the SImode value as low element of V4SImode vector. */
33472 op1 = gen_reg_rtx (V4SImode);
33473 op0 = gen_rtx_VEC_MERGE (V4SImode,
33474 gen_rtx_VEC_DUPLICATE (V4SImode,
33475 op0),
33476 CONST0_RTX (V4SImode),
33477 const1_rtx);
33478 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
33479
33480 /* Cast the V4SImode vector back to a vector in original mode. */
33481 op0 = gen_reg_rtx (mode);
33482 emit_move_insn (op0, gen_lowpart (mode, op1));
33483
33484 /* Load even elements into the second position. */
33485 emit_insn (gen_load_even (op0,
33486 force_reg (inner_mode,
33487 ops [i + i + 1]),
33488 const1_rtx));
33489
33490 /* Cast vector to FIRST_IMODE vector. */
33491 ops[i] = gen_reg_rtx (first_imode);
33492 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
33493 }
33494
33495 /* Interleave low FIRST_IMODE vectors. */
33496 for (i = j = 0; i < n; i += 2, j++)
33497 {
33498 op0 = gen_reg_rtx (first_imode);
33499 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
33500
33501 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
33502 ops[j] = gen_reg_rtx (second_imode);
33503 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
33504 }
33505
33506 /* Interleave low SECOND_IMODE vectors. */
33507 switch (second_imode)
33508 {
33509 case V4SImode:
33510 for (i = j = 0; i < n / 2; i += 2, j++)
33511 {
33512 op0 = gen_reg_rtx (second_imode);
33513 emit_insn (gen_interleave_second_low (op0, ops[i],
33514 ops[i + 1]));
33515
33516 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
33517 vector. */
33518 ops[j] = gen_reg_rtx (third_imode);
33519 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
33520 }
33521 second_imode = V2DImode;
33522 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33523 /* FALLTHRU */
33524
33525 case V2DImode:
33526 op0 = gen_reg_rtx (second_imode);
33527 emit_insn (gen_interleave_second_low (op0, ops[0],
33528 ops[1]));
33529
33530 /* Cast the SECOND_IMODE vector back to a vector in the original
33531 mode. */
33532 emit_insn (gen_rtx_SET (VOIDmode, target,
33533 gen_lowpart (mode, op0)));
33534 break;
33535
33536 default:
33537 gcc_unreachable ();
33538 }
33539 }
33540
33541 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
33542 all values variable, and none identical. */
33543
33544 static void
33545 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
33546 rtx target, rtx vals)
33547 {
33548 rtx ops[32], op0, op1;
33549 enum machine_mode half_mode = VOIDmode;
33550 int n, i;
33551
33552 switch (mode)
33553 {
33554 case V2SFmode:
33555 case V2SImode:
33556 if (!mmx_ok && !TARGET_SSE)
33557 break;
33558 /* FALLTHRU */
33559
33560 case V8SFmode:
33561 case V8SImode:
33562 case V4DFmode:
33563 case V4DImode:
33564 case V4SFmode:
33565 case V4SImode:
33566 case V2DFmode:
33567 case V2DImode:
33568 n = GET_MODE_NUNITS (mode);
33569 for (i = 0; i < n; i++)
33570 ops[i] = XVECEXP (vals, 0, i);
33571 ix86_expand_vector_init_concat (mode, target, ops, n);
33572 return;
33573
33574 case V32QImode:
33575 half_mode = V16QImode;
33576 goto half;
33577
33578 case V16HImode:
33579 half_mode = V8HImode;
33580 goto half;
33581
33582 half:
33583 n = GET_MODE_NUNITS (mode);
33584 for (i = 0; i < n; i++)
33585 ops[i] = XVECEXP (vals, 0, i);
33586 op0 = gen_reg_rtx (half_mode);
33587 op1 = gen_reg_rtx (half_mode);
33588 ix86_expand_vector_init_interleave (half_mode, op0, ops,
33589 n >> 2);
33590 ix86_expand_vector_init_interleave (half_mode, op1,
33591 &ops [n >> 1], n >> 2);
33592 emit_insn (gen_rtx_SET (VOIDmode, target,
33593 gen_rtx_VEC_CONCAT (mode, op0, op1)));
33594 return;
33595
33596 case V16QImode:
33597 if (!TARGET_SSE4_1)
33598 break;
33599 /* FALLTHRU */
33600
33601 case V8HImode:
33602 if (!TARGET_SSE2)
33603 break;
33604
33605 /* Don't use ix86_expand_vector_init_interleave if we can't
33606 move from GPR to SSE register directly. */
33607 if (!TARGET_INTER_UNIT_MOVES)
33608 break;
33609
33610 n = GET_MODE_NUNITS (mode);
33611 for (i = 0; i < n; i++)
33612 ops[i] = XVECEXP (vals, 0, i);
33613 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
33614 return;
33615
33616 case V4HImode:
33617 case V8QImode:
33618 break;
33619
33620 default:
33621 gcc_unreachable ();
33622 }
33623
33624 {
33625 int i, j, n_elts, n_words, n_elt_per_word;
33626 enum machine_mode inner_mode;
33627 rtx words[4], shift;
33628
33629 inner_mode = GET_MODE_INNER (mode);
33630 n_elts = GET_MODE_NUNITS (mode);
33631 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
33632 n_elt_per_word = n_elts / n_words;
33633 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
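      /* Each word is assembled from its elements highest index first, so
	 e.g. for V8QImode with a 32-bit word the first word becomes
	 elt3 << 24 | elt2 << 16 | elt1 << 8 | elt0, leaving element 0 in
	 the least significant byte as the little-endian layout requires.  */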
33634
33635 for (i = 0; i < n_words; ++i)
33636 {
33637 rtx word = NULL_RTX;
33638
33639 for (j = 0; j < n_elt_per_word; ++j)
33640 {
33641 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
33642 elt = convert_modes (word_mode, inner_mode, elt, true);
33643
33644 if (j == 0)
33645 word = elt;
33646 else
33647 {
33648 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
33649 word, 1, OPTAB_LIB_WIDEN);
33650 word = expand_simple_binop (word_mode, IOR, word, elt,
33651 word, 1, OPTAB_LIB_WIDEN);
33652 }
33653 }
33654
33655 words[i] = word;
33656 }
33657
33658 if (n_words == 1)
33659 emit_move_insn (target, gen_lowpart (mode, words[0]));
33660 else if (n_words == 2)
33661 {
33662 rtx tmp = gen_reg_rtx (mode);
33663 emit_clobber (tmp);
33664 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
33665 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
33666 emit_move_insn (target, tmp);
33667 }
33668 else if (n_words == 4)
33669 {
33670 rtx tmp = gen_reg_rtx (V4SImode);
33671 gcc_assert (word_mode == SImode);
33672 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
33673 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
33674 emit_move_insn (target, gen_lowpart (mode, tmp));
33675 }
33676 else
33677 gcc_unreachable ();
33678 }
33679 }
33680
33681 /* Initialize vector TARGET via VALS. Suppress the use of MMX
33682 instructions unless MMX_OK is true. */
33683
33684 void
33685 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
33686 {
33687 enum machine_mode mode = GET_MODE (target);
33688 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33689 int n_elts = GET_MODE_NUNITS (mode);
33690 int n_var = 0, one_var = -1;
33691 bool all_same = true, all_const_zero = true;
33692 int i;
33693 rtx x;
33694
33695 for (i = 0; i < n_elts; ++i)
33696 {
33697 x = XVECEXP (vals, 0, i);
33698 if (!(CONST_INT_P (x)
33699 || GET_CODE (x) == CONST_DOUBLE
33700 || GET_CODE (x) == CONST_FIXED))
33701 n_var++, one_var = i;
33702 else if (x != CONST0_RTX (inner_mode))
33703 all_const_zero = false;
33704 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
33705 all_same = false;
33706 }
33707
33708 /* Constants are best loaded from the constant pool. */
33709 if (n_var == 0)
33710 {
33711 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
33712 return;
33713 }
33714
33715 /* If all values are identical, broadcast the value. */
33716 if (all_same
33717 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
33718 XVECEXP (vals, 0, 0)))
33719 return;
33720
33721 /* Values where only one field is non-constant are best loaded from
33722 the pool and overwritten via move later. */
33723 if (n_var == 1)
33724 {
33725 if (all_const_zero
33726 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
33727 XVECEXP (vals, 0, one_var),
33728 one_var))
33729 return;
33730
33731 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
33732 return;
33733 }
33734
33735 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
33736 }
33737
33738 void
33739 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
33740 {
33741 enum machine_mode mode = GET_MODE (target);
33742 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33743 enum machine_mode half_mode;
33744 bool use_vec_merge = false;
33745 rtx tmp;
33746 static rtx (*gen_extract[6][2]) (rtx, rtx)
33747 = {
33748 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
33749 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
33750 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
33751 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
33752 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
33753 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
33754 };
33755 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
33756 = {
33757 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
33758 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
33759 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
33760 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
33761 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
33762 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
33763 };
33764 int i, j, n;
33765
33766 switch (mode)
33767 {
33768 case V2SFmode:
33769 case V2SImode:
33770 if (mmx_ok)
33771 {
33772 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33773 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
33774 if (elt == 0)
33775 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33776 else
33777 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33778 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33779 return;
33780 }
33781 break;
33782
33783 case V2DImode:
33784 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
33785 if (use_vec_merge)
33786 break;
33787
33788 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33789 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
33790 if (elt == 0)
33791 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33792 else
33793 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33794 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33795 return;
33796
33797 case V2DFmode:
33798 {
33799 rtx op0, op1;
33800
33801 /* For the two element vectors, we implement a VEC_CONCAT with
33802 the extraction of the other element. */
33803
33804 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
33805 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
33806
33807 if (elt == 0)
33808 op0 = val, op1 = tmp;
33809 else
33810 op0 = tmp, op1 = val;
33811
33812 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
33813 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33814 }
33815 return;
33816
33817 case V4SFmode:
33818 use_vec_merge = TARGET_SSE4_1;
33819 if (use_vec_merge)
33820 break;
33821
33822 switch (elt)
33823 {
33824 case 0:
33825 use_vec_merge = true;
33826 break;
33827
33828 case 1:
33829 /* tmp = target = A B C D */
33830 tmp = copy_to_reg (target);
33831 /* target = A A B B */
33832 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
33833 /* target = X A B B */
33834 ix86_expand_vector_set (false, target, val, 0);
33835 /* target = A X C D */
33836 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33837 const1_rtx, const0_rtx,
33838 GEN_INT (2+4), GEN_INT (3+4)));
33839 return;
33840
33841 case 2:
33842 /* tmp = target = A B C D */
33843 tmp = copy_to_reg (target);
33844 /* tmp = X B C D */
33845 ix86_expand_vector_set (false, tmp, val, 0);
33846 /* target = A B X D */
33847 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33848 const0_rtx, const1_rtx,
33849 GEN_INT (0+4), GEN_INT (3+4)));
33850 return;
33851
33852 case 3:
33853 /* tmp = target = A B C D */
33854 tmp = copy_to_reg (target);
33855 /* tmp = X B C D */
33856 ix86_expand_vector_set (false, tmp, val, 0);
33857 /* target = A B C X */
33858 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33859 const0_rtx, const1_rtx,
33860 GEN_INT (2+4), GEN_INT (0+4)));
33861 return;
33862
33863 default:
33864 gcc_unreachable ();
33865 }
33866 break;
33867
33868 case V4SImode:
33869 use_vec_merge = TARGET_SSE4_1;
33870 if (use_vec_merge)
33871 break;
33872
33873 /* Element 0 handled by vec_merge below. */
33874 if (elt == 0)
33875 {
33876 use_vec_merge = true;
33877 break;
33878 }
33879
33880 if (TARGET_SSE2)
33881 {
33882 /* With SSE2, use integer shuffles to swap element 0 and ELT,
33883 store into element 0, then shuffle them back. */
33884
33885 rtx order[4];
33886
33887 order[0] = GEN_INT (elt);
33888 order[1] = const1_rtx;
33889 order[2] = const2_rtx;
33890 order[3] = GEN_INT (3);
33891 order[elt] = const0_rtx;
33892
33893 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33894 order[1], order[2], order[3]));
33895
33896 ix86_expand_vector_set (false, target, val, 0);
33897
33898 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33899 order[1], order[2], order[3]));
33900 }
33901 else
33902 {
33903 /* For SSE1, we have to reuse the V4SF code. */
33904 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
33905 gen_lowpart (SFmode, val), elt);
33906 }
33907 return;
33908
33909 case V8HImode:
33910 use_vec_merge = TARGET_SSE2;
33911 break;
33912 case V4HImode:
33913 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33914 break;
33915
33916 case V16QImode:
33917 use_vec_merge = TARGET_SSE4_1;
33918 break;
33919
33920 case V8QImode:
33921 break;
33922
33923 case V32QImode:
33924 half_mode = V16QImode;
33925 j = 0;
33926 n = 16;
33927 goto half;
33928
33929 case V16HImode:
33930 half_mode = V8HImode;
33931 j = 1;
33932 n = 8;
33933 goto half;
33934
33935 case V8SImode:
33936 half_mode = V4SImode;
33937 j = 2;
33938 n = 4;
33939 goto half;
33940
33941 case V4DImode:
33942 half_mode = V2DImode;
33943 j = 3;
33944 n = 2;
33945 goto half;
33946
33947 case V8SFmode:
33948 half_mode = V4SFmode;
33949 j = 4;
33950 n = 4;
33951 goto half;
33952
33953 case V4DFmode:
33954 half_mode = V2DFmode;
33955 j = 5;
33956 n = 2;
33957 goto half;
33958
33959 half:
33960 /* Compute offset. */
33961 i = elt / n;
33962 elt %= n;
33963
33964 gcc_assert (i <= 1);
33965
33966 /* Extract the half. */
33967 tmp = gen_reg_rtx (half_mode);
33968 emit_insn (gen_extract[j][i] (tmp, target));
33969
33970 /* Put val in tmp at elt. */
33971 ix86_expand_vector_set (false, tmp, val, elt);
33972
33973 /* Put it back. */
33974 emit_insn (gen_insert[j][i] (target, target, tmp));
33975 return;
33976
33977 default:
33978 break;
33979 }
33980
33981 if (use_vec_merge)
33982 {
33983 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
33984 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
33985 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33986 }
33987 else
33988 {
33989 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33990
33991 emit_move_insn (mem, target);
33992
33993 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33994 emit_move_insn (tmp, val);
33995
33996 emit_move_insn (target, mem);
33997 }
33998 }
33999
34000 void
34001 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
34002 {
34003 enum machine_mode mode = GET_MODE (vec);
34004 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34005 bool use_vec_extr = false;
34006 rtx tmp;
34007
34008 switch (mode)
34009 {
34010 case V2SImode:
34011 case V2SFmode:
34012 if (!mmx_ok)
34013 break;
34014 /* FALLTHRU */
34015
34016 case V2DFmode:
34017 case V2DImode:
34018 use_vec_extr = true;
34019 break;
34020
34021 case V4SFmode:
34022 use_vec_extr = TARGET_SSE4_1;
34023 if (use_vec_extr)
34024 break;
34025
34026 switch (elt)
34027 {
34028 case 0:
34029 tmp = vec;
34030 break;
34031
34032 case 1:
34033 case 3:
34034 tmp = gen_reg_rtx (mode);
34035 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
34036 GEN_INT (elt), GEN_INT (elt),
34037 GEN_INT (elt+4), GEN_INT (elt+4)));
34038 break;
34039
34040 case 2:
34041 tmp = gen_reg_rtx (mode);
34042 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
34043 break;
34044
34045 default:
34046 gcc_unreachable ();
34047 }
34048 vec = tmp;
34049 use_vec_extr = true;
34050 elt = 0;
34051 break;
34052
34053 case V4SImode:
34054 use_vec_extr = TARGET_SSE4_1;
34055 if (use_vec_extr)
34056 break;
34057
34058 if (TARGET_SSE2)
34059 {
34060 switch (elt)
34061 {
34062 case 0:
34063 tmp = vec;
34064 break;
34065
34066 case 1:
34067 case 3:
34068 tmp = gen_reg_rtx (mode);
34069 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
34070 GEN_INT (elt), GEN_INT (elt),
34071 GEN_INT (elt), GEN_INT (elt)));
34072 break;
34073
34074 case 2:
34075 tmp = gen_reg_rtx (mode);
34076 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
34077 break;
34078
34079 default:
34080 gcc_unreachable ();
34081 }
34082 vec = tmp;
34083 use_vec_extr = true;
34084 elt = 0;
34085 }
34086 else
34087 {
34088 /* For SSE1, we have to reuse the V4SF code. */
34089 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
34090 gen_lowpart (V4SFmode, vec), elt);
34091 return;
34092 }
34093 break;
34094
34095 case V8HImode:
34096 use_vec_extr = TARGET_SSE2;
34097 break;
34098 case V4HImode:
34099 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34100 break;
34101
34102 case V16QImode:
34103 use_vec_extr = TARGET_SSE4_1;
34104 break;
34105
34106 case V8SFmode:
34107 if (TARGET_AVX)
34108 {
34109 tmp = gen_reg_rtx (V4SFmode);
34110 if (elt < 4)
34111 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
34112 else
34113 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
34114 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34115 return;
34116 }
34117 break;
34118
34119 case V4DFmode:
34120 if (TARGET_AVX)
34121 {
34122 tmp = gen_reg_rtx (V2DFmode);
34123 if (elt < 2)
34124 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
34125 else
34126 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
34127 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34128 return;
34129 }
34130 break;
34131
34132 case V32QImode:
34133 if (TARGET_AVX)
34134 {
34135 tmp = gen_reg_rtx (V16QImode);
34136 if (elt < 16)
34137 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
34138 else
34139 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
34140 ix86_expand_vector_extract (false, target, tmp, elt & 15);
34141 return;
34142 }
34143 break;
34144
34145 case V16HImode:
34146 if (TARGET_AVX)
34147 {
34148 tmp = gen_reg_rtx (V8HImode);
34149 if (elt < 8)
34150 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
34151 else
34152 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
34153 ix86_expand_vector_extract (false, target, tmp, elt & 7);
34154 return;
34155 }
34156 break;
34157
34158 case V8SImode:
34159 if (TARGET_AVX)
34160 {
34161 tmp = gen_reg_rtx (V4SImode);
34162 if (elt < 4)
34163 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
34164 else
34165 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
34166 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34167 return;
34168 }
34169 break;
34170
34171 case V4DImode:
34172 if (TARGET_AVX)
34173 {
34174 tmp = gen_reg_rtx (V2DImode);
34175 if (elt < 2)
34176 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
34177 else
34178 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
34179 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34180 return;
34181 }
34182 break;
34183
34184 case V8QImode:
34185 /* ??? Could extract the appropriate HImode element and shift. */
34186 default:
34187 break;
34188 }
34189
34190 if (use_vec_extr)
34191 {
34192 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
34193 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
34194
34195 /* Let the rtl optimizers know about the zero extension performed. */
34196 if (inner_mode == QImode || inner_mode == HImode)
34197 {
34198 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
34199 target = gen_lowpart (SImode, target);
34200 }
34201
34202 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34203 }
34204 else
34205 {
34206 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
34207
34208 emit_move_insn (mem, vec);
34209
34210 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34211 emit_move_insn (target, tmp);
34212 }
34213 }
34214
34215 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
34216 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
34217 The upper bits of DEST are undefined, though they shouldn't cause
34218 exceptions (some bits from src or all zeros are ok). */
34219
34220 static void
34221 emit_reduc_half (rtx dest, rtx src, int i)
34222 {
34223 rtx tem;
34224 switch (GET_MODE (src))
34225 {
34226 case V4SFmode:
34227 if (i == 128)
34228 tem = gen_sse_movhlps (dest, src, src);
34229 else
34230 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
34231 GEN_INT (1 + 4), GEN_INT (1 + 4));
34232 break;
34233 case V2DFmode:
34234 tem = gen_vec_interleave_highv2df (dest, src, src);
34235 break;
34236 case V16QImode:
34237 case V8HImode:
34238 case V4SImode:
34239 case V2DImode:
34240 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
34241 gen_lowpart (V1TImode, src),
34242 GEN_INT (i / 2));
34243 break;
34244 case V8SFmode:
34245 if (i == 256)
34246 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
34247 else
34248 tem = gen_avx_shufps256 (dest, src, src,
34249 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
34250 break;
34251 case V4DFmode:
34252 if (i == 256)
34253 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
34254 else
34255 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
34256 break;
34257 case V32QImode:
34258 case V16HImode:
34259 case V8SImode:
34260 case V4DImode:
34261 if (i == 256)
34262 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
34263 gen_lowpart (V4DImode, src),
34264 gen_lowpart (V4DImode, src),
34265 const1_rtx);
34266 else
34267 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
34268 gen_lowpart (V2TImode, src),
34269 GEN_INT (i / 2));
34270 break;
34271 default:
34272 gcc_unreachable ();
34273 }
34274 emit_insn (tem);
34275 }
34276
34277 /* Expand a vector reduction. FN is the binary pattern to reduce;
34278 DEST is the destination; IN is the input vector. */
34279
34280 void
34281 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
34282 {
34283 rtx half, dst, vec = in;
34284 enum machine_mode mode = GET_MODE (in);
34285 int i;
34286
34287 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
34288 if (TARGET_SSE4_1
34289 && mode == V8HImode
34290 && fn == gen_uminv8hi3)
34291 {
34292 emit_insn (gen_sse4_1_phminposuw (dest, in));
34293 return;
34294 }
34295
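      /* Repeatedly fold the upper half of the still-live bits onto the
	 lower half and combine the two halves with FN; the reduction value
	 ends up in the lowest element of DEST, the remaining elements are
	 undefined.  */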
34296 for (i = GET_MODE_BITSIZE (mode);
34297 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
34298 i >>= 1)
34299 {
34300 half = gen_reg_rtx (mode);
34301 emit_reduc_half (half, vec, i);
34302 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
34303 dst = dest;
34304 else
34305 dst = gen_reg_rtx (mode);
34306 emit_insn (fn (dst, half, vec));
34307 vec = dst;
34308 }
34309 }
34310 \f
34311 /* Target hook for scalar_mode_supported_p. */
34312 static bool
34313 ix86_scalar_mode_supported_p (enum machine_mode mode)
34314 {
34315 if (DECIMAL_FLOAT_MODE_P (mode))
34316 return default_decimal_float_supported_p ();
34317 else if (mode == TFmode)
34318 return true;
34319 else
34320 return default_scalar_mode_supported_p (mode);
34321 }
34322
34323 /* Implements target hook vector_mode_supported_p. */
34324 static bool
34325 ix86_vector_mode_supported_p (enum machine_mode mode)
34326 {
34327 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34328 return true;
34329 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34330 return true;
34331 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34332 return true;
34333 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
34334 return true;
34335 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
34336 return true;
34337 return false;
34338 }
34339
34340 /* Target hook for c_mode_for_suffix. */
34341 static enum machine_mode
34342 ix86_c_mode_for_suffix (char suffix)
34343 {
34344 if (suffix == 'q')
34345 return TFmode;
34346 if (suffix == 'w')
34347 return XFmode;
34348
34349 return VOIDmode;
34350 }
34351
34352 /* Worker function for TARGET_MD_ASM_CLOBBERS.
34353
34354 We do this in the new i386 backend to maintain source compatibility
34355 with the old cc0-based compiler. */
34356
34357 static tree
34358 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
34359 tree inputs ATTRIBUTE_UNUSED,
34360 tree clobbers)
34361 {
34362 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
34363 clobbers);
34364 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
34365 clobbers);
34366 return clobbers;
34367 }
34368
34369 /* Implements target vector targetm.asm.encode_section_info. */
34370
34371 static void ATTRIBUTE_UNUSED
34372 ix86_encode_section_info (tree decl, rtx rtl, int first)
34373 {
34374 default_encode_section_info (decl, rtl, first);
34375
34376 if (TREE_CODE (decl) == VAR_DECL
34377 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
34378 && ix86_in_large_data_p (decl))
34379 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
34380 }
34381
34382 /* Worker function for REVERSE_CONDITION. */
34383
34384 enum rtx_code
34385 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
34386 {
34387 return (mode != CCFPmode && mode != CCFPUmode
34388 ? reverse_condition (code)
34389 : reverse_condition_maybe_unordered (code));
34390 }
34391
34392 /* Output code to perform an x87 FP register move, from OPERANDS[1]
34393 to OPERANDS[0]. */
34394
34395 const char *
34396 output_387_reg_move (rtx insn, rtx *operands)
34397 {
34398 if (REG_P (operands[0]))
34399 {
34400 if (REG_P (operands[1])
34401 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34402 {
34403 if (REGNO (operands[0]) == FIRST_STACK_REG)
34404 return output_387_ffreep (operands, 0);
34405 return "fstp\t%y0";
34406 }
34407 if (STACK_TOP_P (operands[0]))
34408 return "fld%Z1\t%y1";
34409 return "fst\t%y0";
34410 }
34411 else if (MEM_P (operands[0]))
34412 {
34413 gcc_assert (REG_P (operands[1]));
34414 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34415 return "fstp%Z0\t%y0";
34416 else
34417 {
34418 /* There is no non-popping store to memory for XFmode.
34419 So if we need one, follow the store with a load. */
34420 if (GET_MODE (operands[0]) == XFmode)
34421 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
34422 else
34423 return "fst%Z0\t%y0";
34424 }
34425 }
34426 else
34427 gcc_unreachable();
34428 }
34429
34430 /* Output code to perform a conditional jump to LABEL, if C2 flag in
34431 FP status register is set. */
34432
34433 void
34434 ix86_emit_fp_unordered_jump (rtx label)
34435 {
34436 rtx reg = gen_reg_rtx (HImode);
34437 rtx temp;
34438
34439 emit_insn (gen_x86_fnstsw_1 (reg));
34440
34441 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
34442 {
34443 emit_insn (gen_x86_sahf_1 (reg));
34444
34445 temp = gen_rtx_REG (CCmode, FLAGS_REG);
34446 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
34447 }
34448 else
34449 {
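      /* Without SAHF, test C2 directly: bit 0x04 of the high byte of the
	 FNSTSW result is bit 10 of the FP status word, i.e. the C2 flag.  */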
34450 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
34451
34452 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
34453 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
34454 }
34455
34456 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
34457 gen_rtx_LABEL_REF (VOIDmode, label),
34458 pc_rtx);
34459 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
34460
34461 emit_jump_insn (temp);
34462 predict_jump (REG_BR_PROB_BASE * 10 / 100);
34463 }
34464
34465 /* Output code to perform a log1p XFmode calculation. */
34466
34467 void ix86_emit_i387_log1p (rtx op0, rtx op1)
34468 {
34469 rtx label1 = gen_label_rtx ();
34470 rtx label2 = gen_label_rtx ();
34471
34472 rtx tmp = gen_reg_rtx (XFmode);
34473 rtx tmp2 = gen_reg_rtx (XFmode);
34474 rtx test;
34475
34476 emit_insn (gen_absxf2 (tmp, op1));
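  /* fyl2xp1 is only defined for |op1| < 1 - sqrt(2)/2; the constant below
     is that bound (~0.29289), so larger arguments take the fallback path
     that computes ln (1 + op1) with fyl2x instead.  */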
34477 test = gen_rtx_GE (VOIDmode, tmp,
34478 CONST_DOUBLE_FROM_REAL_VALUE (
34479 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
34480 XFmode));
34481 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
34482
34483 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34484 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
34485 emit_jump (label2);
34486
34487 emit_label (label1);
34488 emit_move_insn (tmp, CONST1_RTX (XFmode));
34489 emit_insn (gen_addxf3 (tmp, op1, tmp));
34490 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34491 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
34492
34493 emit_label (label2);
34494 }
34495
34496 /* Emit code for round calculation. */
34497 void ix86_emit_i387_round (rtx op0, rtx op1)
34498 {
34499 enum machine_mode inmode = GET_MODE (op1);
34500 enum machine_mode outmode = GET_MODE (op0);
34501 rtx e1, e2, res, tmp, tmp1, half;
34502 rtx scratch = gen_reg_rtx (HImode);
34503 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
34504 rtx jump_label = gen_label_rtx ();
34505 rtx insn;
34506 rtx (*gen_abs) (rtx, rtx);
34507 rtx (*gen_neg) (rtx, rtx);
34508
34509 switch (inmode)
34510 {
34511 case SFmode:
34512 gen_abs = gen_abssf2;
34513 break;
34514 case DFmode:
34515 gen_abs = gen_absdf2;
34516 break;
34517 case XFmode:
34518 gen_abs = gen_absxf2;
34519 break;
34520 default:
34521 gcc_unreachable ();
34522 }
34523
34524 switch (outmode)
34525 {
34526 case SFmode:
34527 gen_neg = gen_negsf2;
34528 break;
34529 case DFmode:
34530 gen_neg = gen_negdf2;
34531 break;
34532 case XFmode:
34533 gen_neg = gen_negxf2;
34534 break;
34535 case HImode:
34536 gen_neg = gen_neghi2;
34537 break;
34538 case SImode:
34539 gen_neg = gen_negsi2;
34540 break;
34541 case DImode:
34542 gen_neg = gen_negdi2;
34543 break;
34544 default:
34545 gcc_unreachable ();
34546 }
34547
34548 e1 = gen_reg_rtx (inmode);
34549 e2 = gen_reg_rtx (inmode);
34550 res = gen_reg_rtx (outmode);
34551
34552 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
34553
34554 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
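  /* For example, for op1 = -2.5 this computes floor (2.5 + 0.5) = 3 and
     then negates the result because FXAM reported the sign bit (C1, tested
     via bit 0x02 of the high FNSTSW byte below), giving -3; halfway cases
     therefore round away from zero.  */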
34555
34556 /* scratch = fxam(op1) */
34557 emit_insn (gen_rtx_SET (VOIDmode, scratch,
34558 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
34559 UNSPEC_FXAM)));
34560 /* e1 = fabs(op1) */
34561 emit_insn (gen_abs (e1, op1));
34562
34563 /* e2 = e1 + 0.5 */
34564 half = force_reg (inmode, half);
34565 emit_insn (gen_rtx_SET (VOIDmode, e2,
34566 gen_rtx_PLUS (inmode, e1, half)));
34567
34568 /* res = floor(e2) */
34569 if (inmode != XFmode)
34570 {
34571 tmp1 = gen_reg_rtx (XFmode);
34572
34573 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
34574 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
34575 }
34576 else
34577 tmp1 = e2;
34578
34579 switch (outmode)
34580 {
34581 case SFmode:
34582 case DFmode:
34583 {
34584 rtx tmp0 = gen_reg_rtx (XFmode);
34585
34586 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
34587
34588 emit_insn (gen_rtx_SET (VOIDmode, res,
34589 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
34590 UNSPEC_TRUNC_NOOP)));
34591 }
34592 break;
34593 case XFmode:
34594 emit_insn (gen_frndintxf2_floor (res, tmp1));
34595 break;
34596 case HImode:
34597 emit_insn (gen_lfloorxfhi2 (res, tmp1));
34598 break;
34599 case SImode:
34600 emit_insn (gen_lfloorxfsi2 (res, tmp1));
34601 break;
34602 case DImode:
34603 emit_insn (gen_lfloorxfdi2 (res, tmp1));
34604 break;
34605 default:
34606 gcc_unreachable ();
34607 }
34608
34609 /* flags = signbit(a) */
34610 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
34611
34612 /* if (flags) then res = -res */
34613 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
34614 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
34615 gen_rtx_LABEL_REF (VOIDmode, jump_label),
34616 pc_rtx);
34617 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34618 predict_jump (REG_BR_PROB_BASE * 50 / 100);
34619 JUMP_LABEL (insn) = jump_label;
34620
34621 emit_insn (gen_neg (res, res));
34622
34623 emit_label (jump_label);
34624 LABEL_NUSES (jump_label) = 1;
34625
34626 emit_move_insn (op0, res);
34627 }
34628
34629 /* Output code to perform a Newton-Raphson approximation of a single precision
34630 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
34631
34632 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
34633 {
34634 rtx x0, x1, e0, e1;
34635
34636 x0 = gen_reg_rtx (mode);
34637 e0 = gen_reg_rtx (mode);
34638 e1 = gen_reg_rtx (mode);
34639 x1 = gen_reg_rtx (mode);
34640
34641 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
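  /* This is one Newton-Raphson step for the reciprocal: with
     x0 = rcp(b) ~= 1/b, the refinement x1 = x0 * (2 - b * x0)
     = 2*x0 - b*x0*x0 roughly doubles the number of correct bits of
     the hardware rcp estimate (about 12 bits) before the final
     multiply by a.  */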
34642
34643 b = force_reg (mode, b);
34644
34645 /* x0 = rcp(b) estimate */
34646 emit_insn (gen_rtx_SET (VOIDmode, x0,
34647 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
34648 UNSPEC_RCP)));
34649 /* e0 = x0 * b */
34650 emit_insn (gen_rtx_SET (VOIDmode, e0,
34651 gen_rtx_MULT (mode, x0, b)));
34652
34653 /* e0 = x0 * e0 */
34654 emit_insn (gen_rtx_SET (VOIDmode, e0,
34655 gen_rtx_MULT (mode, x0, e0)));
34656
34657 /* e1 = x0 + x0 */
34658 emit_insn (gen_rtx_SET (VOIDmode, e1,
34659 gen_rtx_PLUS (mode, x0, x0)));
34660
34661 /* x1 = e1 - e0 */
34662 emit_insn (gen_rtx_SET (VOIDmode, x1,
34663 gen_rtx_MINUS (mode, e1, e0)));
34664
34665 /* res = a * x1 */
34666 emit_insn (gen_rtx_SET (VOIDmode, res,
34667 gen_rtx_MULT (mode, a, x1)));
34668 }
34669
34670 /* Output code to perform a Newton-Raphson approximation of a
34671 single precision floating point [reciprocal] square root. */
34672
34673 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
34674 bool recip)
34675 {
34676 rtx x0, e0, e1, e2, e3, mthree, mhalf;
34677 REAL_VALUE_TYPE r;
34678
34679 x0 = gen_reg_rtx (mode);
34680 e0 = gen_reg_rtx (mode);
34681 e1 = gen_reg_rtx (mode);
34682 e2 = gen_reg_rtx (mode);
34683 e3 = gen_reg_rtx (mode);
34684
34685 real_from_integer (&r, VOIDmode, -3, -1, 0);
34686 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34687
34688 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
34689 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34690
34691 if (VECTOR_MODE_P (mode))
34692 {
34693 mthree = ix86_build_const_vector (mode, true, mthree);
34694 mhalf = ix86_build_const_vector (mode, true, mhalf);
34695 }
34696
34697 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
34698 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
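  /* These are one Newton-Raphson step for 1/sqrt(a): with x0 = rsqrtss(a),
     x1 = 0.5 * x0 * (3 - a*x0*x0) is rewritten as -0.5 * x0 * (a*x0*x0 - 3)
     above; multiplying by a (i.e. using e0 = a*x0 in place of x0) gives
     the sqrt(a) form.  */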
34699
34700 a = force_reg (mode, a);
34701
34702 /* x0 = rsqrt(a) estimate */
34703 emit_insn (gen_rtx_SET (VOIDmode, x0,
34704 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
34705 UNSPEC_RSQRT)));
34706
34707 /* If a == 0.0, filter out the infinite rsqrt result to prevent NaN for sqrt (0.0). */
34708 if (!recip)
34709 {
34710 rtx zero, mask;
34711
34712 zero = gen_reg_rtx (mode);
34713 mask = gen_reg_rtx (mode);
34714
34715 zero = force_reg (mode, CONST0_RTX(mode));
34716 emit_insn (gen_rtx_SET (VOIDmode, mask,
34717 gen_rtx_NE (mode, zero, a)));
34718
34719 emit_insn (gen_rtx_SET (VOIDmode, x0,
34720 gen_rtx_AND (mode, x0, mask)));
34721 }
34722
34723 /* e0 = x0 * a */
34724 emit_insn (gen_rtx_SET (VOIDmode, e0,
34725 gen_rtx_MULT (mode, x0, a)));
34726 /* e1 = e0 * x0 */
34727 emit_insn (gen_rtx_SET (VOIDmode, e1,
34728 gen_rtx_MULT (mode, e0, x0)));
34729
34730 /* e2 = e1 - 3. */
34731 mthree = force_reg (mode, mthree);
34732 emit_insn (gen_rtx_SET (VOIDmode, e2,
34733 gen_rtx_PLUS (mode, e1, mthree)));
34734
34735 mhalf = force_reg (mode, mhalf);
34736 if (recip)
34737 /* e3 = -.5 * x0 */
34738 emit_insn (gen_rtx_SET (VOIDmode, e3,
34739 gen_rtx_MULT (mode, x0, mhalf)));
34740 else
34741 /* e3 = -.5 * e0 */
34742 emit_insn (gen_rtx_SET (VOIDmode, e3,
34743 gen_rtx_MULT (mode, e0, mhalf)));
34744 /* ret = e2 * e3 */
34745 emit_insn (gen_rtx_SET (VOIDmode, res,
34746 gen_rtx_MULT (mode, e2, e3)));
34747 }
34748
34749 #ifdef TARGET_SOLARIS
34750 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
34751
34752 static void
34753 i386_solaris_elf_named_section (const char *name, unsigned int flags,
34754 tree decl)
34755 {
34756 /* With Binutils 2.15, the "@unwind" marker must be specified on
34757 every occurrence of the ".eh_frame" section, not just the first
34758 one. */
34759 if (TARGET_64BIT
34760 && strcmp (name, ".eh_frame") == 0)
34761 {
34762 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
34763 flags & SECTION_WRITE ? "aw" : "a");
34764 return;
34765 }
34766
34767 #ifndef USE_GAS
34768 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
34769 {
34770 solaris_elf_asm_comdat_section (name, flags, decl);
34771 return;
34772 }
34773 #endif
34774
34775 default_elf_asm_named_section (name, flags, decl);
34776 }
34777 #endif /* TARGET_SOLARIS */
34778
34779 /* Return the mangling of TYPE if it is an extended fundamental type. */
34780
34781 static const char *
34782 ix86_mangle_type (const_tree type)
34783 {
34784 type = TYPE_MAIN_VARIANT (type);
34785
34786 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
34787 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
34788 return NULL;
34789
34790 switch (TYPE_MODE (type))
34791 {
34792 case TFmode:
34793 /* __float128 is "g". */
34794 return "g";
34795 case XFmode:
34796 /* "long double" or __float80 is "e". */
34797 return "e";
34798 default:
34799 return NULL;
34800 }
34801 }
34802
34803 /* For 32-bit code we can save PIC register setup by using
34804 __stack_chk_fail_local hidden function instead of calling
34805 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
34806 register, so it is better to call __stack_chk_fail directly. */
34807
34808 static tree ATTRIBUTE_UNUSED
34809 ix86_stack_protect_fail (void)
34810 {
34811 return TARGET_64BIT
34812 ? default_external_stack_protect_fail ()
34813 : default_hidden_stack_protect_fail ();
34814 }
34815
34816 /* Select a format to encode pointers in exception handling data. CODE
34817 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
34818 true if the symbol may be affected by dynamic relocations.
34819
34820 ??? All x86 object file formats are capable of representing this.
34821 After all, the relocation needed is the same as for the call insn.
34822 Whether or not a particular assembler allows us to enter such, I
34823 guess we'll have to see. */
34824 int
34825 asm_preferred_eh_data_format (int code, int global)
34826 {
34827 if (flag_pic)
34828 {
34829 int type = DW_EH_PE_sdata8;
34830 if (!TARGET_64BIT
34831 || ix86_cmodel == CM_SMALL_PIC
34832 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
34833 type = DW_EH_PE_sdata4;
34834 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
34835 }
34836 if (ix86_cmodel == CM_SMALL
34837 || (ix86_cmodel == CM_MEDIUM && code))
34838 return DW_EH_PE_udata4;
34839 return DW_EH_PE_absptr;
34840 }
34841 \f
34842 /* Expand copysign from SIGN to the positive value ABS_VALUE
34843 storing in RESULT. If MASK is non-null, it must be a mask that clears
34844 the sign bit. */
34845 static void
34846 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
34847 {
34848 enum machine_mode mode = GET_MODE (sign);
34849 rtx sgn = gen_reg_rtx (mode);
34850 if (mask == NULL_RTX)
34851 {
34852 enum machine_mode vmode;
34853
34854 if (mode == SFmode)
34855 vmode = V4SFmode;
34856 else if (mode == DFmode)
34857 vmode = V2DFmode;
34858 else
34859 vmode = mode;
34860
34861 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
34862 if (!VECTOR_MODE_P (mode))
34863 {
34864 /* We need to generate a scalar mode mask in this case. */
34865 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34866 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34867 mask = gen_reg_rtx (mode);
34868 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34869 }
34870 }
34871 else
34872 mask = gen_rtx_NOT (mode, mask);
34873 emit_insn (gen_rtx_SET (VOIDmode, sgn,
34874 gen_rtx_AND (mode, mask, sign)));
34875 emit_insn (gen_rtx_SET (VOIDmode, result,
34876 gen_rtx_IOR (mode, abs_value, sgn)));
34877 }
34878
34879 /* Expand fabs (OP0) and return a new rtx that holds the result. The
34880 mask for masking out the sign-bit is stored in *SMASK, if that is
34881 non-null. */
34882 static rtx
34883 ix86_expand_sse_fabs (rtx op0, rtx *smask)
34884 {
34885 enum machine_mode vmode, mode = GET_MODE (op0);
34886 rtx xa, mask;
34887
34888 xa = gen_reg_rtx (mode);
34889 if (mode == SFmode)
34890 vmode = V4SFmode;
34891 else if (mode == DFmode)
34892 vmode = V2DFmode;
34893 else
34894 vmode = mode;
34895 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
34896 if (!VECTOR_MODE_P (mode))
34897 {
34898 /* We need to generate a scalar mode mask in this case. */
34899 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34900 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34901 mask = gen_reg_rtx (mode);
34902 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34903 }
34904 emit_insn (gen_rtx_SET (VOIDmode, xa,
34905 gen_rtx_AND (mode, op0, mask)));
34906
34907 if (smask)
34908 *smask = mask;
34909
34910 return xa;
34911 }
34912
34913 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
34914 swapping the operands if SWAP_OPERANDS is true. The expanded
34915 code is a forward jump to a newly created label in case the
34916 comparison is true. The generated label rtx is returned. */
34917 static rtx
34918 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
34919 bool swap_operands)
34920 {
34921 rtx label, tmp;
34922
34923 if (swap_operands)
34924 {
34925 tmp = op0;
34926 op0 = op1;
34927 op1 = tmp;
34928 }
34929
34930 label = gen_label_rtx ();
34931 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
34932 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34933 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
34934 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
34935 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
34936 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
34937 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34938 JUMP_LABEL (tmp) = label;
34939
34940 return label;
34941 }
34942
34943 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
34944 using comparison code CODE. Operands are swapped for the comparison if
34945 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
34946 static rtx
34947 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
34948 bool swap_operands)
34949 {
34950 rtx (*insn)(rtx, rtx, rtx, rtx);
34951 enum machine_mode mode = GET_MODE (op0);
34952 rtx mask = gen_reg_rtx (mode);
34953
34954 if (swap_operands)
34955 {
34956 rtx tmp = op0;
34957 op0 = op1;
34958 op1 = tmp;
34959 }
34960
34961 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
34962
34963 emit_insn (insn (mask, op0, op1,
34964 gen_rtx_fmt_ee (code, mode, op0, op1)));
34965 return mask;
34966 }
34967
34968 /* Generate and return a rtx of mode MODE for 2**n where n is the number
34969 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
34970 static rtx
34971 ix86_gen_TWO52 (enum machine_mode mode)
34972 {
34973 REAL_VALUE_TYPE TWO52r;
34974 rtx TWO52;
34975
34976 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
34977 TWO52 = const_double_from_real_value (TWO52r, mode);
34978 TWO52 = force_reg (mode, TWO52);
34979
34980 return TWO52;
34981 }
34982
34983 /* Expand SSE sequence for computing lround from OP1 storing
34984 into OP0. */
34985 void
34986 ix86_expand_lround (rtx op0, rtx op1)
34987 {
34988 /* C code for the stuff we're doing below:
34989 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
34990 return (long)tmp;
34991 */
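  /* Using nextafter (0.5, 0.0) instead of 0.5 avoids op1 + 0.5 rounding
     up to the next integer when op1 is just below a halfway point (e.g.
     the largest value smaller than 0.5), which would make the final
     truncation come out off by one.  */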
34992 enum machine_mode mode = GET_MODE (op1);
34993 const struct real_format *fmt;
34994 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34995 rtx adj;
34996
34997 /* load nextafter (0.5, 0.0) */
34998 fmt = REAL_MODE_FORMAT (mode);
34999 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35000 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35001
35002 /* adj = copysign (0.5, op1) */
35003 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
35004 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
35005
35006 /* adj = op1 + adj */
35007 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
35008
35009 /* op0 = (imode)adj */
35010 expand_fix (op0, adj, 0);
35011 }
35012
35013 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
35014 into OPERAND0. */
35015 void
35016 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
35017 {
35018 /* C code for the stuff we're doing below (for do_floor):
35019 xi = (long)op1;
35020 xi -= (double)xi > op1 ? 1 : 0;
35021 return xi;
35022 */
35023 enum machine_mode fmode = GET_MODE (op1);
35024 enum machine_mode imode = GET_MODE (op0);
35025 rtx ireg, freg, label, tmp;
35026
35027 /* reg = (long)op1 */
35028 ireg = gen_reg_rtx (imode);
35029 expand_fix (ireg, op1, 0);
35030
35031 /* freg = (double)reg */
35032 freg = gen_reg_rtx (fmode);
35033 expand_float (freg, ireg, 0);
35034
35035 /* ireg = (freg > op1) ? ireg - 1 : ireg */
35036 label = ix86_expand_sse_compare_and_jump (UNLE,
35037 freg, op1, !do_floor);
35038 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
35039 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
35040 emit_move_insn (ireg, tmp);
35041
35042 emit_label (label);
35043 LABEL_NUSES (label) = 1;
35044
35045 emit_move_insn (op0, ireg);
35046 }
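/* For the !do_floor (lceil) case the same skeleton is used with the
   comparison operands swapped and PLUS instead of MINUS, roughly:

     xi = (long)op1;
     xi += (double)xi < op1 ? 1 : 0;
     return xi;

   Illustrative only.  */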
35047
35048 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
35049 result in OPERAND0. */
35050 void
35051 ix86_expand_rint (rtx operand0, rtx operand1)
35052 {
35053 /* C code for the stuff we're doing below:
35054 xa = fabs (operand1);
35055 if (!isless (xa, 2**52))
35056 return operand1;
35057 xa = xa + 2**52 - 2**52;
35058 return copysign (xa, operand1);
35059 */
35060 enum machine_mode mode = GET_MODE (operand0);
35061 rtx res, xa, label, TWO52, mask;
35062
35063 res = gen_reg_rtx (mode);
35064 emit_move_insn (res, operand1);
35065
35066 /* xa = abs (operand1) */
35067 xa = ix86_expand_sse_fabs (res, &mask);
35068
35069 /* if (!isless (xa, TWO52)) goto label; */
35070 TWO52 = ix86_gen_TWO52 (mode);
35071 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35072
35073 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35074 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35075
35076 ix86_sse_copysign_to_positive (res, xa, res, mask);
35077
35078 emit_label (label);
35079 LABEL_NUSES (label) = 1;
35080
35081 emit_move_insn (operand0, res);
35082 }
35083
35084 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35085 into OPERAND0. */
35086 void
35087 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
35088 {
35089 /* C code for the stuff we expand below.
35090 double xa = fabs (x), x2;
35091 if (!isless (xa, TWO52))
35092 return x;
35093 xa = xa + TWO52 - TWO52;
35094 x2 = copysign (xa, x);
35095 Compensate. Floor:
35096 if (x2 > x)
35097 x2 -= 1;
35098 Compensate. Ceil:
35099 if (x2 < x)
35100 x2 -= -1;
35101 return x2;
35102 */
35103 enum machine_mode mode = GET_MODE (operand0);
35104 rtx xa, TWO52, tmp, label, one, res, mask;
35105
35106 TWO52 = ix86_gen_TWO52 (mode);
35107
35108 /* Temporary for holding the result, initialized to the input
35109 operand to ease control flow. */
35110 res = gen_reg_rtx (mode);
35111 emit_move_insn (res, operand1);
35112
35113 /* xa = abs (operand1) */
35114 xa = ix86_expand_sse_fabs (res, &mask);
35115
35116 /* if (!isless (xa, TWO52)) goto label; */
35117 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35118
35119 /* xa = xa + TWO52 - TWO52; */
35120 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35121 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35122
35123 /* xa = copysign (xa, operand1) */
35124 ix86_sse_copysign_to_positive (xa, xa, res, mask);
35125
35126 /* generate 1.0 or -1.0 */
35127 one = force_reg (mode,
35128 const_double_from_real_value (do_floor
35129 ? dconst1 : dconstm1, mode));
35130
35131 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35132 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35133 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35134 gen_rtx_AND (mode, one, tmp)));
35135 /* We always need to subtract here to preserve signed zero. */
35136 tmp = expand_simple_binop (mode, MINUS,
35137 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35138 emit_move_insn (res, tmp);
35139
35140 emit_label (label);
35141 LABEL_NUSES (label) = 1;
35142
35143 emit_move_insn (operand0, res);
35144 }
35145
35146 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35147 into OPERAND0. */
35148 void
35149 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
35150 {
35151 /* C code for the stuff we expand below.
35152 double xa = fabs (x), x2;
35153 if (!isless (xa, TWO52))
35154 return x;
35155 x2 = (double)(long)x;
35156 Compensate. Floor:
35157 if (x2 > x)
35158 x2 -= 1;
35159 Compensate. Ceil:
35160 if (x2 < x)
35161 x2 += 1;
35162 if (HONOR_SIGNED_ZEROS (mode))
35163 return copysign (x2, x);
35164 return x2;
35165 */
35166 enum machine_mode mode = GET_MODE (operand0);
35167 rtx xa, xi, TWO52, tmp, label, one, res, mask;
35168
35169 TWO52 = ix86_gen_TWO52 (mode);
35170
35171 /* Temporary for holding the result, initialized to the input
35172 operand to ease control flow. */
35173 res = gen_reg_rtx (mode);
35174 emit_move_insn (res, operand1);
35175
35176 /* xa = abs (operand1) */
35177 xa = ix86_expand_sse_fabs (res, &mask);
35178
35179 /* if (!isless (xa, TWO52)) goto label; */
35180 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35181
35182 /* xa = (double)(long)x */
35183 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35184 expand_fix (xi, res, 0);
35185 expand_float (xa, xi, 0);
35186
35187 /* generate 1.0 */
35188 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35189
35190 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35191 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35192 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35193 gen_rtx_AND (mode, one, tmp)));
35194 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
35195 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35196 emit_move_insn (res, tmp);
35197
35198 if (HONOR_SIGNED_ZEROS (mode))
35199 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35200
35201 emit_label (label);
35202 LABEL_NUSES (label) = 1;
35203
35204 emit_move_insn (operand0, res);
35205 }
35206
35207 /* Expand SSE sequence for computing round from OPERAND1 storing
35208 into OPERAND0. Sequence that works without relying on DImode truncation
35209 via cvttsd2siq, which is only available on 64bit targets. */
35210 void
35211 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
35212 {
35213 /* C code for the stuff we expand below.
35214 double xa = fabs (x), xa2, x2;
35215 if (!isless (xa, TWO52))
35216 return x;
35217 Using the absolute value and copying back sign makes
35218 -0.0 -> -0.0 correct.
35219 xa2 = xa + TWO52 - TWO52;
35220 Compensate.
35221 dxa = xa2 - xa;
35222 if (dxa <= -0.5)
35223 xa2 += 1;
35224 else if (dxa > 0.5)
35225 xa2 -= 1;
35226 x2 = copysign (xa2, x);
35227 return x2;
35228 */
35229 enum machine_mode mode = GET_MODE (operand0);
35230 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
35231
35232 TWO52 = ix86_gen_TWO52 (mode);
35233
35234 /* Temporary for holding the result, initialized to the input
35235 operand to ease control flow. */
35236 res = gen_reg_rtx (mode);
35237 emit_move_insn (res, operand1);
35238
35239 /* xa = abs (operand1) */
35240 xa = ix86_expand_sse_fabs (res, &mask);
35241
35242 /* if (!isless (xa, TWO52)) goto label; */
35243 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35244
35245 /* xa2 = xa + TWO52 - TWO52; */
35246 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35247 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
35248
35249 /* dxa = xa2 - xa; */
35250 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
35251
35252 /* generate 0.5, 1.0 and -0.5 */
35253 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
35254 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
35255 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
35256 0, OPTAB_DIRECT);
35257
35258 /* Compensate. */
35259 tmp = gen_reg_rtx (mode);
35260 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
35261 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
35262 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35263 gen_rtx_AND (mode, one, tmp)));
35264 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35265 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
35266 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
35267 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35268 gen_rtx_AND (mode, one, tmp)));
35269 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35270
35271 /* res = copysign (xa2, operand1) */
35272 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
35273
35274 emit_label (label);
35275 LABEL_NUSES (label) = 1;
35276
35277 emit_move_insn (operand0, res);
35278 }
35279
35280 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35281 into OPERAND0. */
35282 void
35283 ix86_expand_trunc (rtx operand0, rtx operand1)
35284 {
35285 /* C code for SSE variant we expand below.
35286 double xa = fabs (x), x2;
35287 if (!isless (xa, TWO52))
35288 return x;
35289 x2 = (double)(long)x;
35290 if (HONOR_SIGNED_ZEROS (mode))
35291 return copysign (x2, x);
35292 return x2;
35293 */
35294 enum machine_mode mode = GET_MODE (operand0);
35295 rtx xa, xi, TWO52, label, res, mask;
35296
35297 TWO52 = ix86_gen_TWO52 (mode);
35298
35299 /* Temporary for holding the result, initialized to the input
35300 operand to ease control flow. */
35301 res = gen_reg_rtx (mode);
35302 emit_move_insn (res, operand1);
35303
35304 /* xa = abs (operand1) */
35305 xa = ix86_expand_sse_fabs (res, &mask);
35306
35307 /* if (!isless (xa, TWO52)) goto label; */
35308 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35309
35310 /* x = (double)(long)x */
35311 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35312 expand_fix (xi, res, 0);
35313 expand_float (res, xi, 0);
35314
35315 if (HONOR_SIGNED_ZEROS (mode))
35316 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35317
35318 emit_label (label);
35319 LABEL_NUSES (label) = 1;
35320
35321 emit_move_insn (operand0, res);
35322 }
35323
35324 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35325 into OPERAND0, without relying on DImode truncation (32bit targets). */
35326 void
35327 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
35328 {
35329 enum machine_mode mode = GET_MODE (operand0);
35330 rtx xa, mask, TWO52, label, one, res, smask, tmp;
35331
35332 /* C code for SSE variant we expand below.
35333 double xa = fabs (x), xa2, x2;
35334 if (!isless (xa, TWO52))
35335 return x;
35336 xa2 = xa + TWO52 - TWO52;
35337 Compensate:
35338 if (xa2 > xa)
35339 xa2 -= 1.0;
35340 x2 = copysign (xa2, x);
35341 return x2;
35342 */
35343
35344 TWO52 = ix86_gen_TWO52 (mode);
35345
35346 /* Temporary for holding the result, initialized to the input
35347 operand to ease control flow. */
35348 res = gen_reg_rtx (mode);
35349 emit_move_insn (res, operand1);
35350
35351 /* xa = abs (operand1) */
35352 xa = ix86_expand_sse_fabs (res, &smask);
35353
35354 /* if (!isless (xa, TWO52)) goto label; */
35355 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35356
35357 /* res = xa + TWO52 - TWO52; */
35358 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35359 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
35360 emit_move_insn (res, tmp);
35361
35362 /* generate 1.0 */
35363 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35364
35365 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
35366 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
35367 emit_insn (gen_rtx_SET (VOIDmode, mask,
35368 gen_rtx_AND (mode, mask, one)));
35369 tmp = expand_simple_binop (mode, MINUS,
35370 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
35371 emit_move_insn (res, tmp);
35372
35373 /* res = copysign (res, operand1) */
35374 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
35375
35376 emit_label (label);
35377 LABEL_NUSES (label) = 1;
35378
35379 emit_move_insn (operand0, res);
35380 }
35381
35382 /* Expand SSE sequence for computing round from OPERAND1 storing
35383 into OPERAND0. */
35384 void
35385 ix86_expand_round (rtx operand0, rtx operand1)
35386 {
35387 /* C code for the stuff we're doing below:
35388 double xa = fabs (x);
35389 if (!isless (xa, TWO52))
35390 return x;
35391 xa = (double)(long)(xa + nextafter (0.5, 0.0));
35392 return copysign (xa, x);
35393 */
35394 enum machine_mode mode = GET_MODE (operand0);
35395 rtx res, TWO52, xa, label, xi, half, mask;
35396 const struct real_format *fmt;
35397 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35398
35399 /* Temporary for holding the result, initialized to the input
35400 operand to ease control flow. */
35401 res = gen_reg_rtx (mode);
35402 emit_move_insn (res, operand1);
35403
35404 TWO52 = ix86_gen_TWO52 (mode);
35405 xa = ix86_expand_sse_fabs (res, &mask);
35406 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35407
35408 /* load nextafter (0.5, 0.0) */
35409 fmt = REAL_MODE_FORMAT (mode);
35410 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35411 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35412
35413 /* xa = xa + 0.5 */
35414 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
35415 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
35416
35417 /* xa = (double)(int64_t)xa */
35418 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35419 expand_fix (xi, xa, 0);
35420 expand_float (xa, xi, 0);
35421
35422 /* res = copysign (xa, operand1) */
35423 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
35424
35425 emit_label (label);
35426 LABEL_NUSES (label) = 1;
35427
35428 emit_move_insn (operand0, res);
35429 }
35430
35431 /* Expand SSE sequence for computing round
35432 from OP1 storing into OP0 using sse4 round insn. */
35433 void
35434 ix86_expand_round_sse4 (rtx op0, rtx op1)
35435 {
35436 enum machine_mode mode = GET_MODE (op0);
35437 rtx e1, e2, res, half;
35438 const struct real_format *fmt;
35439 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35440 rtx (*gen_copysign) (rtx, rtx, rtx);
35441 rtx (*gen_round) (rtx, rtx, rtx);
35442
35443 switch (mode)
35444 {
35445 case SFmode:
35446 gen_copysign = gen_copysignsf3;
35447 gen_round = gen_sse4_1_roundsf2;
35448 break;
35449 case DFmode:
35450 gen_copysign = gen_copysigndf3;
35451 gen_round = gen_sse4_1_rounddf2;
35452 break;
35453 default:
35454 gcc_unreachable ();
35455 }
35456
35457 /* round (a) = trunc (a + copysign (0.5, a)) */
35458
35459 /* load nextafter (0.5, 0.0) */
35460 fmt = REAL_MODE_FORMAT (mode);
35461 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35462 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35463 half = const_double_from_real_value (pred_half, mode);
35464
35465 /* e1 = copysign (0.5, op1) */
35466 e1 = gen_reg_rtx (mode);
35467 emit_insn (gen_copysign (e1, half, op1));
35468
35469 /* e2 = op1 + e1 */
35470 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
35471
35472 /* res = trunc (e2) */
35473 res = gen_reg_rtx (mode);
35474 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
35475
35476 emit_move_insn (op0, res);
35477 }
35478 \f
35479
35480 /* Table of valid machine attributes. */
35481 static const struct attribute_spec ix86_attribute_table[] =
35482 {
35483 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
35484 affects_type_identity } */
35485 /* Stdcall attribute says callee is responsible for popping arguments
35486 if they are not variable. */
35487 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35488 true },
35489 /* Fastcall attribute says callee is responsible for popping arguments
35490 if they are not variable. */
35491 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35492 true },
35493 /* Thiscall attribute says callee is responsible for popping arguments
35494 if they are not variable. */
35495 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35496 true },
35497 /* Cdecl attribute says the callee is a normal C declaration */
35498 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35499 true },
35500 /* Regparm attribute specifies how many integer arguments are to be
35501 passed in registers. */
35502 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
35503 true },
35504 /* Sseregparm attribute says we are using x86_64 calling conventions
35505 for FP arguments. */
35506 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35507 true },
35508 /* The transactional memory builtins are implicitly regparm or fastcall
35509 depending on the ABI. Override the generic do-nothing attribute that
35510 these builtins were declared with. */
35511 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
35512 true },
35513 /* force_align_arg_pointer says this function realigns the stack at entry. */
35514 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
35515 false, true, true, ix86_handle_cconv_attribute, false },
35516 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
35517 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
35518 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
35519 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
35520 false },
35521 #endif
35522 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35523 false },
35524 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35525 false },
35526 #ifdef SUBTARGET_ATTRIBUTE_TABLE
35527 SUBTARGET_ATTRIBUTE_TABLE,
35528 #endif
35529 /* ms_abi and sysv_abi calling convention function attributes. */
35530 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35531 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35532 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
35533 false },
35534 { "callee_pop_aggregate_return", 1, 1, false, true, true,
35535 ix86_handle_callee_pop_aggregate_return, true },
35536 /* End element. */
35537 { NULL, 0, 0, false, false, false, NULL, false }
35538 };
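/* For reference, user code requests these calling-convention and layout
   attributes with the usual __attribute__ syntax, e.g.:

     int __attribute__((fastcall)) f (int a, int b);
     int __attribute__((regparm (3))) g (int a, int b, int c);
     struct __attribute__((ms_struct)) s { char c; int i; };

   Illustrative declarations only; see the GCC manual for the precise
   semantics of each attribute.  */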
35539
35540 /* Implement targetm.vectorize.builtin_vectorization_cost. */
35541 static int
35542 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
35543 tree vectype ATTRIBUTE_UNUSED,
35544 int misalign ATTRIBUTE_UNUSED)
35545 {
35546 switch (type_of_cost)
35547 {
35548 case scalar_stmt:
35549 return ix86_cost->scalar_stmt_cost;
35550
35551 case scalar_load:
35552 return ix86_cost->scalar_load_cost;
35553
35554 case scalar_store:
35555 return ix86_cost->scalar_store_cost;
35556
35557 case vector_stmt:
35558 return ix86_cost->vec_stmt_cost;
35559
35560 case vector_load:
35561 return ix86_cost->vec_align_load_cost;
35562
35563 case vector_store:
35564 return ix86_cost->vec_store_cost;
35565
35566 case vec_to_scalar:
35567 return ix86_cost->vec_to_scalar_cost;
35568
35569 case scalar_to_vec:
35570 return ix86_cost->scalar_to_vec_cost;
35571
35572 case unaligned_load:
35573 case unaligned_store:
35574 return ix86_cost->vec_unalign_load_cost;
35575
35576 case cond_branch_taken:
35577 return ix86_cost->cond_taken_branch_cost;
35578
35579 case cond_branch_not_taken:
35580 return ix86_cost->cond_not_taken_branch_cost;
35581
35582 case vec_perm:
35583 case vec_promote_demote:
35584 return ix86_cost->vec_stmt_cost;
35585
35586 default:
35587 gcc_unreachable ();
35588 }
35589 }
35590
35591 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
35592 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
35593 insn every time. */
35594
35595 static GTY(()) rtx vselect_insn;
35596
35597 /* Initialize vselect_insn. */
35598
35599 static void
35600 init_vselect_insn (void)
35601 {
35602 unsigned i;
35603 rtx x;
35604
35605 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
35606 for (i = 0; i < MAX_VECT_LEN; ++i)
35607 XVECEXP (x, 0, i) = const0_rtx;
35608 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
35609 const0_rtx), x);
35610 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
35611 start_sequence ();
35612 vselect_insn = emit_insn (x);
35613 end_sequence ();
35614 }
35615
35616 /* Construct (set target (vec_select op0 (parallel perm))) and
35617 return true if that's a valid instruction in the active ISA. */
35618
35619 static bool
35620 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
35621 unsigned nelt, bool testing_p)
35622 {
35623 unsigned int i;
35624 rtx x, save_vconcat;
35625 int icode;
35626
35627 if (vselect_insn == NULL_RTX)
35628 init_vselect_insn ();
35629
35630 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
35631 PUT_NUM_ELEM (XVEC (x, 0), nelt);
35632 for (i = 0; i < nelt; ++i)
35633 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
35634 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
35635 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
35636 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
35637 SET_DEST (PATTERN (vselect_insn)) = target;
35638 icode = recog_memoized (vselect_insn);
35639
35640 if (icode >= 0 && !testing_p)
35641 emit_insn (copy_rtx (PATTERN (vselect_insn)));
35642
35643 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
35644 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
35645 INSN_CODE (vselect_insn) = -1;
35646
35647 return icode >= 0;
35648 }
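/* For example, on V4SImode a PERM of { 2, 3, 0, 1 } constructs
   (vec_select op0 (parallel [2 3 0 1])), which recog is expected to
   accept as a pshufd when SSE2 is enabled.  Illustrative only.  */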
35649
35650 /* Similar, but generate a vec_concat from op0 and op1 as well. */
35651
35652 static bool
35653 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
35654 const unsigned char *perm, unsigned nelt,
35655 bool testing_p)
35656 {
35657 enum machine_mode v2mode;
35658 rtx x;
35659 bool ok;
35660
35661 if (vselect_insn == NULL_RTX)
35662 init_vselect_insn ();
35663
35664 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
35665 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
35666 PUT_MODE (x, v2mode);
35667 XEXP (x, 0) = op0;
35668 XEXP (x, 1) = op1;
35669 ok = expand_vselect (target, x, perm, nelt, testing_p);
35670 XEXP (x, 0) = const0_rtx;
35671 XEXP (x, 1) = const0_rtx;
35672 return ok;
35673 }
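/* For example, an interleave-low of two V4SImode operands is requested
   with PERM = { 0, 4, 1, 5 }: indices 0..3 pick elements of op0 and
   4..7 pick elements of op1 within the concatenated V8SImode value.
   Illustrative only.  */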
35674
35675 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35676 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
35677
35678 static bool
35679 expand_vec_perm_blend (struct expand_vec_perm_d *d)
35680 {
35681 enum machine_mode vmode = d->vmode;
35682 unsigned i, mask, nelt = d->nelt;
35683 rtx target, op0, op1, x;
35684 rtx rperm[32], vperm;
35685
35686 if (d->op0 == d->op1)
35687 return false;
35688 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
35689 ;
35690 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
35691 ;
35692 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
35693 ;
35694 else
35695 return false;
35696
35697 /* This is a blend, not a permute. Elements must stay in their
35698 respective lanes. */
35699 for (i = 0; i < nelt; ++i)
35700 {
35701 unsigned e = d->perm[i];
35702 if (!(e == i || e == i + nelt))
35703 return false;
35704 }
35705
35706 if (d->testing_p)
35707 return true;
35708
35709 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
35710 decision should be extracted elsewhere, so that we only try that
35711 sequence once all budget==3 options have been tried. */
35712 target = d->target;
35713 op0 = d->op0;
35714 op1 = d->op1;
35715 mask = 0;
35716
35717 switch (vmode)
35718 {
35719 case V4DFmode:
35720 case V8SFmode:
35721 case V2DFmode:
35722 case V4SFmode:
35723 case V8HImode:
35724 case V8SImode:
35725 for (i = 0; i < nelt; ++i)
35726 mask |= (d->perm[i] >= nelt) << i;
35727 break;
35728
35729 case V2DImode:
35730 for (i = 0; i < 2; ++i)
35731 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
35732 vmode = V8HImode;
35733 goto do_subreg;
35734
35735 case V4SImode:
35736 for (i = 0; i < 4; ++i)
35737 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35738 vmode = V8HImode;
35739 goto do_subreg;
35740
35741 case V16QImode:
35742 /* See if bytes move in pairs so we can use pblendw with
35743 an immediate argument, rather than pblendvb with a vector
35744 argument. */
35745 for (i = 0; i < 16; i += 2)
35746 if (d->perm[i] + 1 != d->perm[i + 1])
35747 {
35748 use_pblendvb:
35749 for (i = 0; i < nelt; ++i)
35750 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
35751
35752 finish_pblendvb:
35753 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
35754 vperm = force_reg (vmode, vperm);
35755
35756 if (GET_MODE_SIZE (vmode) == 16)
35757 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
35758 else
35759 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
35760 return true;
35761 }
35762
35763 for (i = 0; i < 8; ++i)
35764 mask |= (d->perm[i * 2] >= 16) << i;
35765 vmode = V8HImode;
35766 /* FALLTHRU */
35767
35768 do_subreg:
35769 target = gen_lowpart (vmode, target);
35770 op0 = gen_lowpart (vmode, op0);
35771 op1 = gen_lowpart (vmode, op1);
35772 break;
35773
35774 case V32QImode:
35775 /* See if bytes move in pairs. If not, vpblendvb must be used. */
35776 for (i = 0; i < 32; i += 2)
35777 if (d->perm[i] + 1 != d->perm[i + 1])
35778 goto use_pblendvb;
35779 /* See if bytes move in quadruplets. If yes, vpblendd
35780 with immediate can be used. */
35781 for (i = 0; i < 32; i += 4)
35782 if (d->perm[i] + 2 != d->perm[i + 2])
35783 break;
35784 if (i < 32)
35785 {
35786 /* See if bytes move the same in both lanes. If yes,
35787 vpblendw with immediate can be used. */
35788 for (i = 0; i < 16; i += 2)
35789 if (d->perm[i] + 16 != d->perm[i + 16])
35790 goto use_pblendvb;
35791
35792 /* Use vpblendw. */
35793 for (i = 0; i < 16; ++i)
35794 mask |= (d->perm[i * 2] >= 32) << i;
35795 vmode = V16HImode;
35796 goto do_subreg;
35797 }
35798
35799 /* Use vpblendd. */
35800 for (i = 0; i < 8; ++i)
35801 mask |= (d->perm[i * 4] >= 32) << i;
35802 vmode = V8SImode;
35803 goto do_subreg;
35804
35805 case V16HImode:
35806 /* See if words move in pairs. If yes, vpblendd can be used. */
35807 for (i = 0; i < 16; i += 2)
35808 if (d->perm[i] + 1 != d->perm[i + 1])
35809 break;
35810 if (i < 16)
35811 {
35812 /* See if words move the same in both lanes. If not,
35813 vpblendvb must be used. */
35814 for (i = 0; i < 8; i++)
35815 if (d->perm[i] + 8 != d->perm[i + 8])
35816 {
35817 /* Use vpblendvb. */
35818 for (i = 0; i < 32; ++i)
35819 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
35820
35821 vmode = V32QImode;
35822 nelt = 32;
35823 target = gen_lowpart (vmode, target);
35824 op0 = gen_lowpart (vmode, op0);
35825 op1 = gen_lowpart (vmode, op1);
35826 goto finish_pblendvb;
35827 }
35828
35829 /* Use vpblendw. */
35830 for (i = 0; i < 16; ++i)
35831 mask |= (d->perm[i] >= 16) << i;
35832 break;
35833 }
35834
35835 /* Use vpblendd. */
35836 for (i = 0; i < 8; ++i)
35837 mask |= (d->perm[i * 2] >= 16) << i;
35838 vmode = V8SImode;
35839 goto do_subreg;
35840
35841 case V4DImode:
35842 /* Use vpblendd. */
35843 for (i = 0; i < 4; ++i)
35844 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35845 vmode = V8SImode;
35846 goto do_subreg;
35847
35848 default:
35849 gcc_unreachable ();
35850 }
35851
35852 /* This matches five different patterns with the different modes. */
35853 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
35854 x = gen_rtx_SET (VOIDmode, target, x);
35855 emit_insn (x);
35856
35857 return true;
35858 }
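/* As an example of the immediate path above: for V4SFmode with
   d->perm = { 0, 5, 2, 7 }, elements 1 and 3 come from op1, so the loop
   computes mask = 0b1010, which is used as the blend immediate.
   Illustrative only.  */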
35859
35860 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35861 in terms of the variable form of vpermilps.
35862
35863 Note that we will have already failed the immediate input vpermilps,
35864 which requires that the high and low part shuffle be identical; the
35865 variable form doesn't require that. */
35866
35867 static bool
35868 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
35869 {
35870 rtx rperm[8], vperm;
35871 unsigned i;
35872
35873 if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
35874 return false;
35875
35876 /* We can only permute within the 128-bit lane. */
35877 for (i = 0; i < 8; ++i)
35878 {
35879 unsigned e = d->perm[i];
35880 if (i < 4 ? e >= 4 : e < 4)
35881 return false;
35882 }
35883
35884 if (d->testing_p)
35885 return true;
35886
35887 for (i = 0; i < 8; ++i)
35888 {
35889 unsigned e = d->perm[i];
35890
35891 /* Within each 128-bit lane, the elements of op0 are numbered
35892 from 0 and the elements of op1 are numbered from 4. */
35893 if (e >= 8 + 4)
35894 e -= 8;
35895 else if (e >= 4)
35896 e -= 4;
35897
35898 rperm[i] = GEN_INT (e);
35899 }
35900
35901 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
35902 vperm = force_reg (V8SImode, vperm);
35903 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
35904
35905 return true;
35906 }
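/* For example, { 3, 2, 1, 0, 7, 6, 5, 4 } (reversal within each 128-bit
   lane) is accepted here, while { 0, 4, 1, 5, 2, 6, 3, 7 } is rejected
   because it would move elements across the lane boundary.
   Illustrative only.  */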
35907
35908 /* Return true if permutation D can be performed as a VMODE permutation
35909 instead. */
35910
35911 static bool
35912 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
35913 {
35914 unsigned int i, j, chunk;
35915
35916 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
35917 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
35918 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
35919 return false;
35920
35921 if (GET_MODE_NUNITS (vmode) >= d->nelt)
35922 return true;
35923
35924 chunk = d->nelt / GET_MODE_NUNITS (vmode);
35925 for (i = 0; i < d->nelt; i += chunk)
35926 if (d->perm[i] & (chunk - 1))
35927 return false;
35928 else
35929 for (j = 1; j < chunk; ++j)
35930 if (d->perm[i] + j != d->perm[i + j])
35931 return false;
35932
35933 return true;
35934 }
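/* For example, the V16QImode permutation
   { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 } moves whole 4-byte chunks
   and is therefore also expressible as the V4SImode permutation
   { 1, 0, 3, 2 }.  Illustrative only.  */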
35935
35936 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35937 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
35938
35939 static bool
35940 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
35941 {
35942 unsigned i, nelt, eltsz, mask;
35943 unsigned char perm[32];
35944 enum machine_mode vmode = V16QImode;
35945 rtx rperm[32], vperm, target, op0, op1;
35946
35947 nelt = d->nelt;
35948
35949 if (d->op0 != d->op1)
35950 {
35951 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
35952 {
35953 if (TARGET_AVX2
35954 && valid_perm_using_mode_p (V2TImode, d))
35955 {
35956 if (d->testing_p)
35957 return true;
35958
35959 /* Use vperm2i128 insn. The pattern uses
35960 V4DImode instead of V2TImode. */
35961 target = gen_lowpart (V4DImode, d->target);
35962 op0 = gen_lowpart (V4DImode, d->op0);
35963 op1 = gen_lowpart (V4DImode, d->op1);
35964 rperm[0]
35965 = GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
35966 | ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
35967 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
35968 return true;
35969 }
35970 return false;
35971 }
35972 }
35973 else
35974 {
35975 if (GET_MODE_SIZE (d->vmode) == 16)
35976 {
35977 if (!TARGET_SSSE3)
35978 return false;
35979 }
35980 else if (GET_MODE_SIZE (d->vmode) == 32)
35981 {
35982 if (!TARGET_AVX2)
35983 return false;
35984
35985 /* V4DImode should be already handled through
35986 expand_vselect by vpermq instruction. */
35987 gcc_assert (d->vmode != V4DImode);
35988
35989 vmode = V32QImode;
35990 if (d->vmode == V8SImode
35991 || d->vmode == V16HImode
35992 || d->vmode == V32QImode)
35993 {
35994 /* First see if vpermq can be used for
35995 V8SImode/V16HImode/V32QImode. */
35996 if (valid_perm_using_mode_p (V4DImode, d))
35997 {
35998 for (i = 0; i < 4; i++)
35999 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
36000 if (d->testing_p)
36001 return true;
36002 return expand_vselect (gen_lowpart (V4DImode, d->target),
36003 gen_lowpart (V4DImode, d->op0),
36004 perm, 4, false);
36005 }
36006
36007 /* Next see if vpermd can be used. */
36008 if (valid_perm_using_mode_p (V8SImode, d))
36009 vmode = V8SImode;
36010 }
36011 /* Or if vpermps can be used. */
36012 else if (d->vmode == V8SFmode)
36013 vmode = V8SImode;
36014
36015 if (vmode == V32QImode)
36016 {
36017 /* vpshufb only works intra-lane; it is not
36018 possible to shuffle bytes between the lanes. */
36019 for (i = 0; i < nelt; ++i)
36020 if ((d->perm[i] ^ i) & (nelt / 2))
36021 return false;
36022 }
36023 }
36024 else
36025 return false;
36026 }
36027
36028 if (d->testing_p)
36029 return true;
36030
36031 if (vmode == V8SImode)
36032 for (i = 0; i < 8; ++i)
36033 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
36034 else
36035 {
36036 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36037 if (d->op0 != d->op1)
36038 mask = 2 * nelt - 1;
36039 else if (vmode == V16QImode)
36040 mask = nelt - 1;
36041 else
36042 mask = nelt / 2 - 1;
36043
36044 for (i = 0; i < nelt; ++i)
36045 {
36046 unsigned j, e = d->perm[i] & mask;
36047 for (j = 0; j < eltsz; ++j)
36048 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
36049 }
36050 }
36051
36052 vperm = gen_rtx_CONST_VECTOR (vmode,
36053 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
36054 vperm = force_reg (vmode, vperm);
36055
36056 if (vmode == V8SImode && d->vmode == V8SFmode)
36057 {
36058 vmode = V8SFmode;
36059 vperm = gen_lowpart (vmode, vperm);
36060 }
36061
36062 target = gen_lowpart (vmode, d->target);
36063 op0 = gen_lowpart (vmode, d->op0);
36064 if (d->op0 == d->op1)
36065 {
36066 if (vmode == V16QImode)
36067 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
36068 else if (vmode == V32QImode)
36069 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
36070 else if (vmode == V8SFmode)
36071 emit_insn (gen_avx2_permvarv8sf (target, vperm, op0));
36072 else
36073 emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
36074 }
36075 else
36076 {
36077 op1 = gen_lowpart (vmode, d->op1);
36078 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
36079 }
36080
36081 return true;
36082 }
36083
36084 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
36085 in a single instruction. */
36086
36087 static bool
36088 expand_vec_perm_1 (struct expand_vec_perm_d *d)
36089 {
36090 unsigned i, nelt = d->nelt;
36091 unsigned char perm2[MAX_VECT_LEN];
36092
36093 /* Check plain VEC_SELECT first, because AVX has instructions that could
36094 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
36095 input where SEL+CONCAT may not. */
36096 if (d->op0 == d->op1)
36097 {
36098 int mask = nelt - 1;
36099 bool identity_perm = true;
36100 bool broadcast_perm = true;
36101
36102 for (i = 0; i < nelt; i++)
36103 {
36104 perm2[i] = d->perm[i] & mask;
36105 if (perm2[i] != i)
36106 identity_perm = false;
36107 if (perm2[i])
36108 broadcast_perm = false;
36109 }
36110
36111 if (identity_perm)
36112 {
36113 if (!d->testing_p)
36114 emit_move_insn (d->target, d->op0);
36115 return true;
36116 }
36117 else if (broadcast_perm && TARGET_AVX2)
36118 {
36119 /* Use vpbroadcast{b,w,d}. */
36120 rtx (*gen) (rtx, rtx) = NULL;
36121 switch (d->vmode)
36122 {
36123 case V32QImode:
36124 gen = gen_avx2_pbroadcastv32qi_1;
36125 break;
36126 case V16HImode:
36127 gen = gen_avx2_pbroadcastv16hi_1;
36128 break;
36129 case V8SImode:
36130 gen = gen_avx2_pbroadcastv8si_1;
36131 break;
36132 case V16QImode:
36133 gen = gen_avx2_pbroadcastv16qi;
36134 break;
36135 case V8HImode:
36136 gen = gen_avx2_pbroadcastv8hi;
36137 break;
36138 case V8SFmode:
36139 gen = gen_avx2_vec_dupv8sf_1;
36140 break;
36141 /* For other modes prefer other shuffles this function creates. */
36142 default: break;
36143 }
36144 if (gen != NULL)
36145 {
36146 if (!d->testing_p)
36147 emit_insn (gen (d->target, d->op0));
36148 return true;
36149 }
36150 }
36151
36152 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
36153 return true;
36154
36155 /* There are plenty of patterns in sse.md that are written for
36156 SEL+CONCAT and are not replicated for a single op. Perhaps
36157 that should be changed, to avoid the nastiness here. */
36158
36159 /* Recognize interleave style patterns, which means incrementing
36160 every other permutation operand. */
36161 for (i = 0; i < nelt; i += 2)
36162 {
36163 perm2[i] = d->perm[i] & mask;
36164 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
36165 }
36166 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
36167 d->testing_p))
36168 return true;
36169
36170 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
36171 if (nelt >= 4)
36172 {
36173 for (i = 0; i < nelt; i += 4)
36174 {
36175 perm2[i + 0] = d->perm[i + 0] & mask;
36176 perm2[i + 1] = d->perm[i + 1] & mask;
36177 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
36178 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
36179 }
36180
36181 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
36182 d->testing_p))
36183 return true;
36184 }
36185 }
36186
36187 /* Finally, try the fully general two operand permute. */
36188 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
36189 d->testing_p))
36190 return true;
36191
36192 /* Recognize interleave style patterns with reversed operands. */
36193 if (d->op0 != d->op1)
36194 {
36195 for (i = 0; i < nelt; ++i)
36196 {
36197 unsigned e = d->perm[i];
36198 if (e >= nelt)
36199 e -= nelt;
36200 else
36201 e += nelt;
36202 perm2[i] = e;
36203 }
36204
36205 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
36206 d->testing_p))
36207 return true;
36208 }
36209
36210 /* Try the SSE4.1 blend variable merge instructions. */
36211 if (expand_vec_perm_blend (d))
36212 return true;
36213
36214 /* Try one of the AVX vpermil variable permutations. */
36215 if (expand_vec_perm_vpermil (d))
36216 return true;
36217
36218 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
36219 vpshufb, vpermd, vpermps or vpermq variable permutation. */
36220 if (expand_vec_perm_pshufb (d))
36221 return true;
36222
36223 return false;
36224 }
36225
36226 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36227 in terms of a pair of pshuflw + pshufhw instructions. */
36228
36229 static bool
36230 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
36231 {
36232 unsigned char perm2[MAX_VECT_LEN];
36233 unsigned i;
36234 bool ok;
36235
36236 if (d->vmode != V8HImode || d->op0 != d->op1)
36237 return false;
36238
36239 /* The two permutations only operate in 64-bit lanes. */
36240 for (i = 0; i < 4; ++i)
36241 if (d->perm[i] >= 4)
36242 return false;
36243 for (i = 4; i < 8; ++i)
36244 if (d->perm[i] < 4)
36245 return false;
36246
36247 if (d->testing_p)
36248 return true;
36249
36250 /* Emit the pshuflw. */
36251 memcpy (perm2, d->perm, 4);
36252 for (i = 4; i < 8; ++i)
36253 perm2[i] = i;
36254 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
36255 gcc_assert (ok);
36256
36257 /* Emit the pshufhw. */
36258 memcpy (perm2 + 4, d->perm + 4, 4);
36259 for (i = 0; i < 4; ++i)
36260 perm2[i] = i;
36261 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
36262 gcc_assert (ok);
36263
36264 return true;
36265 }
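/* For example, d->perm = { 1, 0, 3, 2, 5, 4, 7, 6 } is handled as a
   pshuflw with { 1, 0, 3, 2, 4, 5, 6, 7 } followed by a pshufhw with
   { 0, 1, 2, 3, 5, 4, 7, 6 }.  Illustrative only.  */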
36266
36267 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36268 the permutation using the SSSE3 palignr instruction. This succeeds
36269 when all of the elements in PERM fit within one vector and we merely
36270 need to shift them down so that a single vector permutation has a
36271 chance to succeed. */
36272
36273 static bool
36274 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
36275 {
36276 unsigned i, nelt = d->nelt;
36277 unsigned min, max;
36278 bool in_order, ok;
36279 rtx shift;
36280
36281 /* Even with AVX, palignr only operates on 128-bit vectors. */
36282 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36283 return false;
36284
36285 min = nelt, max = 0;
36286 for (i = 0; i < nelt; ++i)
36287 {
36288 unsigned e = d->perm[i];
36289 if (e < min)
36290 min = e;
36291 if (e > max)
36292 max = e;
36293 }
36294 if (min == 0 || max - min >= nelt)
36295 return false;
36296
36297 /* Given that we have SSSE3, we know we'll be able to implement the
36298 single operand permutation after the palignr with pshufb. */
36299 if (d->testing_p)
36300 return true;
36301
36302 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
36303 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
36304 gen_lowpart (TImode, d->op1),
36305 gen_lowpart (TImode, d->op0), shift));
36306
36307 d->op0 = d->op1 = d->target;
36308
36309 in_order = true;
36310 for (i = 0; i < nelt; ++i)
36311 {
36312 unsigned e = d->perm[i] - min;
36313 if (e != i)
36314 in_order = false;
36315 d->perm[i] = e;
36316 }
36317
36318 /* Test for the degenerate case where the alignment by itself
36319 produces the desired permutation. */
36320 if (in_order)
36321 return true;
36322
36323 ok = expand_vec_perm_1 (d);
36324 gcc_assert (ok);
36325
36326 return ok;
36327 }
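/* For example, a V16QImode permutation of { 3, 4, ..., 18 } has min == 3;
   a palignr by 3 bytes then brings all required bytes into one vector,
   and since the shifted indices are already in order the degenerate
   in_order case above returns without a further single-operand
   permutation.  Illustrative only.  */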
36328
36329 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
36330
36331 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36332 a two vector permutation into a single vector permutation by using
36333 an interleave operation to merge the vectors. */
36334
36335 static bool
36336 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
36337 {
36338 struct expand_vec_perm_d dremap, dfinal;
36339 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
36340 unsigned HOST_WIDE_INT contents;
36341 unsigned char remap[2 * MAX_VECT_LEN];
36342 rtx seq;
36343 bool ok, same_halves = false;
36344
36345 if (GET_MODE_SIZE (d->vmode) == 16)
36346 {
36347 if (d->op0 == d->op1)
36348 return false;
36349 }
36350 else if (GET_MODE_SIZE (d->vmode) == 32)
36351 {
36352 if (!TARGET_AVX)
36353 return false;
36354 /* For 32-byte modes allow even d->op0 == d->op1.
36355 The lack of cross-lane shuffling in some instructions
36356 might prevent a single insn shuffle. */
36357 dfinal = *d;
36358 dfinal.testing_p = true;
36359 /* If expand_vec_perm_interleave3 can expand this into
36360 a 3 insn sequence, give up and let it be expanded as
36361 a 3 insn sequence. While that is one insn longer,
36362 it doesn't need a memory operand, and in the common
36363 case where the interleave low and interleave high
36364 permutations with the same operands are adjacent it
36365 needs only 4 insns for both after CSE. */
36366 if (expand_vec_perm_interleave3 (&dfinal))
36367 return false;
36368 }
36369 else
36370 return false;
36371
36372 /* Examine from whence the elements come. */
36373 contents = 0;
36374 for (i = 0; i < nelt; ++i)
36375 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
36376
36377 memset (remap, 0xff, sizeof (remap));
36378 dremap = *d;
36379
36380 if (GET_MODE_SIZE (d->vmode) == 16)
36381 {
36382 unsigned HOST_WIDE_INT h1, h2, h3, h4;
36383
36384 /* Split the two input vectors into 4 halves. */
36385 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
36386 h2 = h1 << nelt2;
36387 h3 = h2 << nelt2;
36388 h4 = h3 << nelt2;
36389
36390 /* If the elements are all from the low halves, use interleave low;
36391 similarly for interleave high. If the elements are from mis-matched
36392 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
36393 if ((contents & (h1 | h3)) == contents)
36394 {
36395 /* punpckl* */
36396 for (i = 0; i < nelt2; ++i)
36397 {
36398 remap[i] = i * 2;
36399 remap[i + nelt] = i * 2 + 1;
36400 dremap.perm[i * 2] = i;
36401 dremap.perm[i * 2 + 1] = i + nelt;
36402 }
36403 if (!TARGET_SSE2 && d->vmode == V4SImode)
36404 dremap.vmode = V4SFmode;
36405 }
36406 else if ((contents & (h2 | h4)) == contents)
36407 {
36408 /* punpckh* */
36409 for (i = 0; i < nelt2; ++i)
36410 {
36411 remap[i + nelt2] = i * 2;
36412 remap[i + nelt + nelt2] = i * 2 + 1;
36413 dremap.perm[i * 2] = i + nelt2;
36414 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
36415 }
36416 if (!TARGET_SSE2 && d->vmode == V4SImode)
36417 dremap.vmode = V4SFmode;
36418 }
36419 else if ((contents & (h1 | h4)) == contents)
36420 {
36421 /* shufps */
36422 for (i = 0; i < nelt2; ++i)
36423 {
36424 remap[i] = i;
36425 remap[i + nelt + nelt2] = i + nelt2;
36426 dremap.perm[i] = i;
36427 dremap.perm[i + nelt2] = i + nelt + nelt2;
36428 }
36429 if (nelt != 4)
36430 {
36431 /* shufpd */
36432 dremap.vmode = V2DImode;
36433 dremap.nelt = 2;
36434 dremap.perm[0] = 0;
36435 dremap.perm[1] = 3;
36436 }
36437 }
36438 else if ((contents & (h2 | h3)) == contents)
36439 {
36440 /* shufps */
36441 for (i = 0; i < nelt2; ++i)
36442 {
36443 remap[i + nelt2] = i;
36444 remap[i + nelt] = i + nelt2;
36445 dremap.perm[i] = i + nelt2;
36446 dremap.perm[i + nelt2] = i + nelt;
36447 }
36448 if (nelt != 4)
36449 {
36450 /* shufpd */
36451 dremap.vmode = V2DImode;
36452 dremap.nelt = 2;
36453 dremap.perm[0] = 1;
36454 dremap.perm[1] = 2;
36455 }
36456 }
36457 else
36458 return false;
36459 }
36460 else
36461 {
36462 unsigned int nelt4 = nelt / 4, nzcnt = 0;
36463 unsigned HOST_WIDE_INT q[8];
36464 unsigned int nonzero_halves[4];
36465
36466 /* Split the two input vectors into 8 quarters. */
36467 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
36468 for (i = 1; i < 8; ++i)
36469 q[i] = q[0] << (nelt4 * i);
36470 for (i = 0; i < 4; ++i)
36471 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
36472 {
36473 nonzero_halves[nzcnt] = i;
36474 ++nzcnt;
36475 }
36476
36477 if (nzcnt == 1)
36478 {
36479 gcc_assert (d->op0 == d->op1);
36480 nonzero_halves[1] = nonzero_halves[0];
36481 same_halves = true;
36482 }
36483 else if (d->op0 == d->op1)
36484 {
36485 gcc_assert (nonzero_halves[0] == 0);
36486 gcc_assert (nonzero_halves[1] == 1);
36487 }
36488
36489 if (nzcnt <= 2)
36490 {
36491 if (d->perm[0] / nelt2 == nonzero_halves[1])
36492 {
36493 /* Attempt to increase the likelihood that dfinal
36494 shuffle will be intra-lane. */
36495 char tmph = nonzero_halves[0];
36496 nonzero_halves[0] = nonzero_halves[1];
36497 nonzero_halves[1] = tmph;
36498 }
36499
36500 /* vperm2f128 or vperm2i128. */
36501 for (i = 0; i < nelt2; ++i)
36502 {
36503 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
36504 remap[i + nonzero_halves[0] * nelt2] = i;
36505 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
36506 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
36507 }
36508
36509 if (d->vmode != V8SFmode
36510 && d->vmode != V4DFmode
36511 && d->vmode != V8SImode)
36512 {
36513 dremap.vmode = V8SImode;
36514 dremap.nelt = 8;
36515 for (i = 0; i < 4; ++i)
36516 {
36517 dremap.perm[i] = i + nonzero_halves[0] * 4;
36518 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
36519 }
36520 }
36521 }
36522 else if (d->op0 == d->op1)
36523 return false;
36524 else if (TARGET_AVX2
36525 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
36526 {
36527 /* vpunpckl* */
36528 for (i = 0; i < nelt4; ++i)
36529 {
36530 remap[i] = i * 2;
36531 remap[i + nelt] = i * 2 + 1;
36532 remap[i + nelt2] = i * 2 + nelt2;
36533 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
36534 dremap.perm[i * 2] = i;
36535 dremap.perm[i * 2 + 1] = i + nelt;
36536 dremap.perm[i * 2 + nelt2] = i + nelt2;
36537 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
36538 }
36539 }
36540 else if (TARGET_AVX2
36541 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
36542 {
36543 /* vpunpckh* */
36544 for (i = 0; i < nelt4; ++i)
36545 {
36546 remap[i + nelt4] = i * 2;
36547 remap[i + nelt + nelt4] = i * 2 + 1;
36548 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
36549 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
36550 dremap.perm[i * 2] = i + nelt4;
36551 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
36552 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
36553 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
36554 }
36555 }
36556 else
36557 return false;
36558 }
36559
36560 /* Use the remapping array set up above to move the elements from their
36561 swizzled locations into their final destinations. */
36562 dfinal = *d;
36563 for (i = 0; i < nelt; ++i)
36564 {
36565 unsigned e = remap[d->perm[i]];
36566 gcc_assert (e < nelt);
36567 /* If same_halves is true, both halves of the remapped vector are the
36568 same. Avoid cross-lane accesses if possible. */
36569 if (same_halves && i >= nelt2)
36570 {
36571 gcc_assert (e < nelt2);
36572 dfinal.perm[i] = e + nelt2;
36573 }
36574 else
36575 dfinal.perm[i] = e;
36576 }
36577 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
36578 dfinal.op1 = dfinal.op0;
36579 dremap.target = dfinal.op0;
36580
36581 /* Test if the final remap can be done with a single insn. For V4SFmode or
36582 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
36583 start_sequence ();
36584 ok = expand_vec_perm_1 (&dfinal);
36585 seq = get_insns ();
36586 end_sequence ();
36587
36588 if (!ok)
36589 return false;
36590
36591 if (d->testing_p)
36592 return true;
36593
36594 if (dremap.vmode != dfinal.vmode)
36595 {
36596 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
36597 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
36598 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
36599 }
36600
36601 ok = expand_vec_perm_1 (&dremap);
36602 gcc_assert (ok);
36603
36604 emit_insn (seq);
36605 return true;
36606 }
36607
36608 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36609 a single vector cross-lane permutation into vpermq followed
36610 by any of the single insn permutations. */
36611
36612 static bool
36613 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
36614 {
36615 struct expand_vec_perm_d dremap, dfinal;
36616 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
36617 unsigned contents[2];
36618 bool ok;
36619
36620 if (!(TARGET_AVX2
36621 && (d->vmode == V32QImode || d->vmode == V16HImode)
36622 && d->op0 == d->op1))
36623 return false;
36624
36625 contents[0] = 0;
36626 contents[1] = 0;
36627 for (i = 0; i < nelt2; ++i)
36628 {
36629 contents[0] |= 1u << (d->perm[i] / nelt4);
36630 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
36631 }
36632
36633 for (i = 0; i < 2; ++i)
36634 {
36635 unsigned int cnt = 0;
36636 for (j = 0; j < 4; ++j)
36637 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
36638 return false;
36639 }
36640
36641 if (d->testing_p)
36642 return true;
36643
36644 dremap = *d;
36645 dremap.vmode = V4DImode;
36646 dremap.nelt = 4;
36647 dremap.target = gen_reg_rtx (V4DImode);
36648 dremap.op0 = gen_lowpart (V4DImode, d->op0);
36649 dremap.op1 = dremap.op0;
36650 for (i = 0; i < 2; ++i)
36651 {
36652 unsigned int cnt = 0;
36653 for (j = 0; j < 4; ++j)
36654 if ((contents[i] & (1u << j)) != 0)
36655 dremap.perm[2 * i + cnt++] = j;
36656 for (; cnt < 2; ++cnt)
36657 dremap.perm[2 * i + cnt] = 0;
36658 }
36659
36660 dfinal = *d;
36661 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
36662 dfinal.op1 = dfinal.op0;
36663 for (i = 0, j = 0; i < nelt; ++i)
36664 {
36665 if (i == nelt2)
36666 j = 2;
36667 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
36668 if ((d->perm[i] / nelt4) == dremap.perm[j])
36669 ;
36670 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
36671 dfinal.perm[i] |= nelt4;
36672 else
36673 gcc_unreachable ();
36674 }
36675
36676 ok = expand_vec_perm_1 (&dremap);
36677 gcc_assert (ok);
36678
36679 ok = expand_vec_perm_1 (&dfinal);
36680 gcc_assert (ok);
36681
36682 return true;
36683 }
36684
36685 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
36686 a vector permutation using two instructions, vperm2f128 resp.
36687 vperm2i128 followed by any single in-lane permutation. */
36688
36689 static bool
36690 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
36691 {
36692 struct expand_vec_perm_d dfirst, dsecond;
36693 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
36694 bool ok;
36695
36696 if (!TARGET_AVX
36697 || GET_MODE_SIZE (d->vmode) != 32
36698 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
36699 return false;
36700
36701 dsecond = *d;
36702 if (d->op0 == d->op1)
36703 dsecond.op1 = gen_reg_rtx (d->vmode);
36704 dsecond.testing_p = true;
36705
36706 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
36707 immediate. For perm < 16 the second permutation uses
36708 d->op0 as first operand, for perm >= 16 it uses d->op1
36709 as first operand. The second operand is the result of
36710 vperm2[fi]128. */
36711 for (perm = 0; perm < 32; perm++)
36712 {
36713 /* Ignore permutations which do not move anything cross-lane. */
36714 if (perm < 16)
36715 {
36716 /* The second shuffle for e.g. V4DFmode has
36717 0123 and ABCD operands.
36718 Ignore AB23, as 23 is already in the second lane
36719 of the first operand. */
36720 if ((perm & 0xc) == (1 << 2)) continue;
36721 /* And 01CD, as 01 is in the first lane of the first
36722 operand. */
36723 if ((perm & 3) == 0) continue;
36724 /* And 4567, as then the vperm2[fi]128 doesn't change
36725 anything on the original 4567 second operand. */
36726 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
36727 }
36728 else
36729 {
36730 /* The second shuffle for e.g. V4DFmode has
36731 4567 and ABCD operands.
36732 Ignore AB67, as 67 is already in the second lane
36733 of the first operand. */
36734 if ((perm & 0xc) == (3 << 2)) continue;
36735 /* And 45CD, as 45 is in the first lane of the first
36736 operand. */
36737 if ((perm & 3) == 2) continue;
36738 /* And 0123, as then the vperm2[fi]128 doesn't change
36739 anything on the original 0123 first operand. */
36740 if ((perm & 0xf) == (1 << 2)) continue;
36741 }
36742
36743 for (i = 0; i < nelt; i++)
36744 {
36745 j = d->perm[i] / nelt2;
36746 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
36747 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
36748 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
36749 dsecond.perm[i] = d->perm[i] & (nelt - 1);
36750 else
36751 break;
36752 }
36753
36754 if (i == nelt)
36755 {
36756 start_sequence ();
36757 ok = expand_vec_perm_1 (&dsecond);
36758 end_sequence ();
36759 }
36760 else
36761 ok = false;
36762
36763 if (ok)
36764 {
36765 if (d->testing_p)
36766 return true;
36767
36768 /* Found a usable second shuffle. dfirst will be
36769 vperm2f128 on d->op0 and d->op1. */
36770 dsecond.testing_p = false;
36771 dfirst = *d;
36772 if (d->op0 == d->op1)
36773 dfirst.target = dsecond.op1;
36774 else
36775 dfirst.target = gen_reg_rtx (d->vmode);
36776 for (i = 0; i < nelt; i++)
36777 dfirst.perm[i] = (i & (nelt2 - 1))
36778 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
36779
36780 ok = expand_vec_perm_1 (&dfirst);
36781 gcc_assert (ok);
36782
36783 /* And dsecond is some single insn shuffle, taking
36784 d->op0 and result of vperm2f128 (if perm < 16) or
36785 d->op1 and result of vperm2f128 (otherwise). */
36786 dsecond.op1 = dfirst.target;
36787 if (perm >= 16)
36788 dsecond.op0 = dfirst.op1;
36789
36790 ok = expand_vec_perm_1 (&dsecond);
36791 gcc_assert (ok);
36792
36793 return true;
36794 }
36795
36796 /* For d->op0 == d->op1 the only useful vperm2f128 permutation
36797 is 0x10. */
36798 if (d->op0 == d->op1)
36799 return false;
36800 }
36801
36802 return false;
36803 }
36804
36805 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36806 a two vector permutation using 2 intra-lane interleave insns
36807 and cross-lane shuffle for 32-byte vectors. */
36808
36809 static bool
36810 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
36811 {
36812 unsigned i, nelt;
36813 rtx (*gen) (rtx, rtx, rtx);
36814
36815 if (d->op0 == d->op1)
36816 return false;
36817 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
36818 ;
36819 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
36820 ;
36821 else
36822 return false;
36823
36824 nelt = d->nelt;
36825 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
36826 return false;
36827 for (i = 0; i < nelt; i += 2)
36828 if (d->perm[i] != d->perm[0] + i / 2
36829 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
36830 return false;
36831
36832 if (d->testing_p)
36833 return true;
36834
36835 switch (d->vmode)
36836 {
36837 case V32QImode:
36838 if (d->perm[0])
36839 gen = gen_vec_interleave_highv32qi;
36840 else
36841 gen = gen_vec_interleave_lowv32qi;
36842 break;
36843 case V16HImode:
36844 if (d->perm[0])
36845 gen = gen_vec_interleave_highv16hi;
36846 else
36847 gen = gen_vec_interleave_lowv16hi;
36848 break;
36849 case V8SImode:
36850 if (d->perm[0])
36851 gen = gen_vec_interleave_highv8si;
36852 else
36853 gen = gen_vec_interleave_lowv8si;
36854 break;
36855 case V4DImode:
36856 if (d->perm[0])
36857 gen = gen_vec_interleave_highv4di;
36858 else
36859 gen = gen_vec_interleave_lowv4di;
36860 break;
36861 case V8SFmode:
36862 if (d->perm[0])
36863 gen = gen_vec_interleave_highv8sf;
36864 else
36865 gen = gen_vec_interleave_lowv8sf;
36866 break;
36867 case V4DFmode:
36868 if (d->perm[0])
36869 gen = gen_vec_interleave_highv4df;
36870 else
36871 gen = gen_vec_interleave_lowv4df;
36872 break;
36873 default:
36874 gcc_unreachable ();
36875 }
36876
36877 emit_insn (gen (d->target, d->op0, d->op1));
36878 return true;
36879 }
36880
36881 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
36882 a single vector permutation using a single intra-lane vector
36883 permutation, vperm2f128 swapping the lanes and vblend* insn blending
36884 the non-swapped and swapped vectors together. */
36885
36886 static bool
36887 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
36888 {
36889 struct expand_vec_perm_d dfirst, dsecond;
36890 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
36891 rtx seq;
36892 bool ok;
36893 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
36894
36895 if (!TARGET_AVX
36896 || TARGET_AVX2
36897 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
36898 || d->op0 != d->op1)
36899 return false;
36900
36901 dfirst = *d;
36902 for (i = 0; i < nelt; i++)
36903 dfirst.perm[i] = 0xff;
36904 for (i = 0, msk = 0; i < nelt; i++)
36905 {
36906 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
36907 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
36908 return false;
36909 dfirst.perm[j] = d->perm[i];
36910 if (j != i)
36911 msk |= (1 << i);
36912 }
36913 for (i = 0; i < nelt; i++)
36914 if (dfirst.perm[i] == 0xff)
36915 dfirst.perm[i] = i;
36916
36917 if (!d->testing_p)
36918 dfirst.target = gen_reg_rtx (dfirst.vmode);
36919
36920 start_sequence ();
36921 ok = expand_vec_perm_1 (&dfirst);
36922 seq = get_insns ();
36923 end_sequence ();
36924
36925 if (!ok)
36926 return false;
36927
36928 if (d->testing_p)
36929 return true;
36930
36931 emit_insn (seq);
36932
36933 dsecond = *d;
36934 dsecond.op0 = dfirst.target;
36935 dsecond.op1 = dfirst.target;
36936 dsecond.target = gen_reg_rtx (dsecond.vmode);
36937 for (i = 0; i < nelt; i++)
36938 dsecond.perm[i] = i ^ nelt2;
36939
36940 ok = expand_vec_perm_1 (&dsecond);
36941 gcc_assert (ok);
36942
36943 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
36944 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
36945 return true;
36946 }
36947
36948 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
36949 permutation with two pshufb insns and an ior. We should have already
36950 failed all two instruction sequences. */
36951
36952 static bool
36953 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
36954 {
36955 rtx rperm[2][16], vperm, l, h, op, m128;
36956 unsigned int i, nelt, eltsz;
36957
36958 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36959 return false;
36960 gcc_assert (d->op0 != d->op1);
36961
36962 nelt = d->nelt;
36963 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36964
36965 /* Generate two permutation masks. If the required element is within
36966 the given vector it is shuffled into the proper lane. If the required
36967 element is in the other vector, force a zero into the lane by setting
36968 bit 7 in the permutation mask. */
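/* E.g. if d->perm[0] selects element 3 of op1 in a V16QImode permutation,
   then rperm[1][0] = 3 and rperm[0][0] = -128, so byte 0 of the result
   comes from the op1 pshufb while the op0 pshufb contributes a zero
   there; the final ior merges the two halves.  */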
36969 m128 = GEN_INT (-128);
36970 for (i = 0; i < nelt; ++i)
36971 {
36972 unsigned j, e = d->perm[i];
36973 unsigned which = (e >= nelt);
36974 if (e >= nelt)
36975 e -= nelt;
36976
36977 for (j = 0; j < eltsz; ++j)
36978 {
36979 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
36980 rperm[1-which][i*eltsz + j] = m128;
36981 }
36982 }
36983
36984 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
36985 vperm = force_reg (V16QImode, vperm);
36986
36987 l = gen_reg_rtx (V16QImode);
36988 op = gen_lowpart (V16QImode, d->op0);
36989 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
36990
36991 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
36992 vperm = force_reg (V16QImode, vperm);
36993
36994 h = gen_reg_rtx (V16QImode);
36995 op = gen_lowpart (V16QImode, d->op1);
36996 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
36997
36998 op = gen_lowpart (V16QImode, d->target);
36999 emit_insn (gen_iorv16qi3 (op, l, h));
37000
37001 return true;
37002 }
37003
37004 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
37005 with two vpshufb insns, vpermq and vpor. We should have already failed
37006 all two or three instruction sequences. */
37007
37008 static bool
37009 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
37010 {
37011 rtx rperm[2][32], vperm, l, h, hp, op, m128;
37012 unsigned int i, nelt, eltsz;
37013
37014 if (!TARGET_AVX2
37015 || d->op0 != d->op1
37016 || (d->vmode != V32QImode && d->vmode != V16HImode))
37017 return false;
37018
37019 if (d->testing_p)
37020 return true;
37021
37022 nelt = d->nelt;
37023 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37024
37025 /* Generate two permutation masks. If the required element is within
37026 the same lane, it is shuffled in.  If the required element is in the
37027 other lane, force a zero by setting bit 7 in the permutation mask.
37028 The other mask has non-negative elements only where the element
37029 is requested from the other lane; those are also moved to the other lane,
37030 so that the result of vpshufb can have the two V2TImode halves
37031 swapped. */
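/* Worked example (V32QImode): if d->perm[3] selects byte 20, that byte
   lives in the high lane while position 3 is in the low lane, so
   rperm[1][3 ^ 16] = 20 & 15 = 4 and rperm[0][3] = -128; the vpshufb on
   the second mask places the byte at position 19, the vpermq lane swap
   moves it down to position 3, and the ior fills in the zeroed slot.  */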
37032 m128 = GEN_INT (-128);
37033 for (i = 0; i < nelt; ++i)
37034 {
37035 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37036 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37037
37038 for (j = 0; j < eltsz; ++j)
37039 {
37040 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
37041 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
37042 }
37043 }
37044
37045 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
37046 vperm = force_reg (V32QImode, vperm);
37047
37048 h = gen_reg_rtx (V32QImode);
37049 op = gen_lowpart (V32QImode, d->op0);
37050 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
37051
37052 /* Swap the 128-bit lanes of h into hp.  */
37053 hp = gen_reg_rtx (V4DImode);
37054 op = gen_lowpart (V4DImode, h);
37055 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
37056 const1_rtx));
37057
37058 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
37059 vperm = force_reg (V32QImode, vperm);
37060
37061 l = gen_reg_rtx (V32QImode);
37062 op = gen_lowpart (V32QImode, d->op0);
37063 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
37064
37065 op = gen_lowpart (V32QImode, d->target);
37066 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
37067
37068 return true;
37069 }
37070
37071 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
37072 and extract-odd permutations of two V32QImode or V16HImode operands
37073 with two vpshufb insns, vpor and vpermq. We should have already
37074 failed all two or three instruction sequences. */
37075
37076 static bool
37077 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
37078 {
37079 rtx rperm[2][32], vperm, l, h, ior, op, m128;
37080 unsigned int i, nelt, eltsz;
37081
37082 if (!TARGET_AVX2
37083 || d->op0 == d->op1
37084 || (d->vmode != V32QImode && d->vmode != V16HImode))
37085 return false;
37086
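/* The check below only requires that each result element come from the
   operand and 16-byte lane that the corresponding extract-even/odd
   position would use; the index within that lane is unconstrained, as it
   is masked with nelt / 2 - 1 when the masks are built.  */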
37087 for (i = 0; i < d->nelt; ++i)
37088 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
37089 return false;
37090
37091 if (d->testing_p)
37092 return true;
37093
37094 nelt = d->nelt;
37095 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37096
37097 /* Generate two permutation masks. In the first permutation mask
37098 the first quarter will contain indexes for the first half
37099 of the op0, the second quarter will contain bit 7 set, third quarter
37100 will contain indexes for the second half of the op0 and the
37101 last quarter bit 7 set. In the second permutation mask
37102 the first quarter will contain bit 7 set, the second quarter
37103 indexes for the first half of the op1, the third quarter bit 7 set
37104 and last quarter indexes for the second half of the op1.
37105 I.e. the first mask e.g. for V32QImode extract even will be:
37106 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
37107 (all values masked with 0xf except for -128) and second mask
37108 for extract even will be
37109 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
37110 m128 = GEN_INT (-128);
37111 for (i = 0; i < nelt; ++i)
37112 {
37113 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37114 unsigned which = d->perm[i] >= nelt;
37115 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
37116
37117 for (j = 0; j < eltsz; ++j)
37118 {
37119 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
37120 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
37121 }
37122 }
37123
37124 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
37125 vperm = force_reg (V32QImode, vperm);
37126
37127 l = gen_reg_rtx (V32QImode);
37128 op = gen_lowpart (V32QImode, d->op0);
37129 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
37130
37131 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
37132 vperm = force_reg (V32QImode, vperm);
37133
37134 h = gen_reg_rtx (V32QImode);
37135 op = gen_lowpart (V32QImode, d->op1);
37136 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
37137
37138 ior = gen_reg_rtx (V32QImode);
37139 emit_insn (gen_iorv32qi3 (ior, l, h));
37140
37141 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
37142 op = gen_lowpart (V4DImode, d->target);
37143 ior = gen_lowpart (V4DImode, ior);
37144 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
37145 const1_rtx, GEN_INT (3)));
37146
37147 return true;
37148 }
37149
37150 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
37151 and extract-odd permutations. */
37152
37153 static bool
37154 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
37155 {
37156 rtx t1, t2, t3;
37157
37158 switch (d->vmode)
37159 {
37160 case V4DFmode:
37161 t1 = gen_reg_rtx (V4DFmode);
37162 t2 = gen_reg_rtx (V4DFmode);
37163
37164 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
37165 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
37166 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
37167
37168 /* Now an unpck[lh]pd will produce the result required. */
37169 if (odd)
37170 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
37171 else
37172 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
37173 emit_insn (t3);
37174 break;
37175
37176 case V8SFmode:
37177 {
37178 int mask = odd ? 0xdd : 0x88;
37179
37180 t1 = gen_reg_rtx (V8SFmode);
37181 t2 = gen_reg_rtx (V8SFmode);
37182 t3 = gen_reg_rtx (V8SFmode);
37183
37184 /* Shuffle within the 128-bit lanes to produce:
37185 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
37186 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
37187 GEN_INT (mask)));
37188
37189 /* Shuffle the lanes around to produce:
37190 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
37191 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
37192 GEN_INT (0x3)));
37193
37194 /* Shuffle within the 128-bit lanes to produce:
37195 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
37196 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
37197
37198 /* Shuffle within the 128-bit lanes to produce:
37199 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
37200 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
37201
37202 /* Shuffle the lanes around to produce:
37203 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
37204 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
37205 GEN_INT (0x20)));
37206 }
37207 break;
37208
37209 case V2DFmode:
37210 case V4SFmode:
37211 case V2DImode:
37212 case V4SImode:
37213 /* These are always directly implementable by expand_vec_perm_1. */
37214 gcc_unreachable ();
37215
37216 case V8HImode:
37217 if (TARGET_SSSE3)
37218 return expand_vec_perm_pshufb2 (d);
37219 else
37220 {
37221 /* We need 2*log2(N)-1 operations to achieve odd/even
37222 with interleave. */
37223 t1 = gen_reg_rtx (V8HImode);
37224 t2 = gen_reg_rtx (V8HImode);
37225 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
37226 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
37227 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
37228 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
37229 if (odd)
37230 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
37231 else
37232 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
37233 emit_insn (t3);
37234 }
37235 break;
37236
37237 case V16QImode:
37238 if (TARGET_SSSE3)
37239 return expand_vec_perm_pshufb2 (d);
37240 else
37241 {
37242 t1 = gen_reg_rtx (V16QImode);
37243 t2 = gen_reg_rtx (V16QImode);
37244 t3 = gen_reg_rtx (V16QImode);
37245 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
37246 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
37247 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
37248 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
37249 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
37250 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
37251 if (odd)
37252 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
37253 else
37254 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
37255 emit_insn (t3);
37256 }
37257 break;
37258
37259 case V16HImode:
37260 case V32QImode:
37261 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
37262
37263 case V4DImode:
37264 if (!TARGET_AVX2)
37265 {
37266 struct expand_vec_perm_d d_copy = *d;
37267 d_copy.vmode = V4DFmode;
37268 d_copy.target = gen_lowpart (V4DFmode, d->target);
37269 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
37270 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
37271 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37272 }
37273
37274 t1 = gen_reg_rtx (V4DImode);
37275 t2 = gen_reg_rtx (V4DImode);
37276
37277 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
37278 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
37279 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
37280
37281 /* Now a vpunpck[lh]qdq will produce the result required.  */
37282 if (odd)
37283 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
37284 else
37285 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
37286 emit_insn (t3);
37287 break;
37288
37289 case V8SImode:
37290 if (!TARGET_AVX2)
37291 {
37292 struct expand_vec_perm_d d_copy = *d;
37293 d_copy.vmode = V8SFmode;
37294 d_copy.target = gen_lowpart (V8SFmode, d->target);
37295 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
37296 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
37297 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37298 }
37299
37300 t1 = gen_reg_rtx (V8SImode);
37301 t2 = gen_reg_rtx (V8SImode);
37302
37303 /* Shuffle the lanes around into
37304 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
37305 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
37306 gen_lowpart (V4DImode, d->op0),
37307 gen_lowpart (V4DImode, d->op1),
37308 GEN_INT (0x20)));
37309 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
37310 gen_lowpart (V4DImode, d->op0),
37311 gen_lowpart (V4DImode, d->op1),
37312 GEN_INT (0x31)));
37313
37314 /* Swap the 2nd and 3rd position in each lane into
37315 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
37316 emit_insn (gen_avx2_pshufdv3 (t1, t1,
37317 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37318 emit_insn (gen_avx2_pshufdv3 (t2, t2,
37319 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37320
37321 /* Now a vpunpck[lh]qdq will produce
37322 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
37323 if (odd)
37324 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
37325 gen_lowpart (V4DImode, t1),
37326 gen_lowpart (V4DImode, t2));
37327 else
37328 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
37329 gen_lowpart (V4DImode, t1),
37330 gen_lowpart (V4DImode, t2));
37331 emit_insn (t3);
37332 break;
37333
37334 default:
37335 gcc_unreachable ();
37336 }
37337
37338 return true;
37339 }
37340
37341 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
37342 extract-even and extract-odd permutations. */
37343
37344 static bool
37345 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
37346 {
37347 unsigned i, odd, nelt = d->nelt;
37348
37349 odd = d->perm[0];
37350 if (odd != 0 && odd != 1)
37351 return false;
37352
37353 for (i = 1; i < nelt; ++i)
37354 if (d->perm[i] != 2 * i + odd)
37355 return false;
37356
37357 return expand_vec_perm_even_odd_1 (d, odd);
37358 }
37359
37360 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
37361 permutations. We assume that expand_vec_perm_1 has already failed. */
37362
37363 static bool
37364 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
37365 {
37366 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
37367 enum machine_mode vmode = d->vmode;
37368 unsigned char perm2[4];
37369 rtx op0 = d->op0;
37370 bool ok;
37371
37372 switch (vmode)
37373 {
37374 case V4DFmode:
37375 case V8SFmode:
37376 /* These are special-cased in sse.md so that we can optionally
37377 use the vbroadcast instruction. They expand to two insns
37378 if the input happens to be in a register. */
37379 gcc_unreachable ();
37380
37381 case V2DFmode:
37382 case V2DImode:
37383 case V4SFmode:
37384 case V4SImode:
37385 /* These are always implementable using standard shuffle patterns. */
37386 gcc_unreachable ();
37387
37388 case V8HImode:
37389 case V16QImode:
37390 /* These can be implemented via interleave. We save one insn by
37391 stopping once we have promoted to V4SImode and then use pshufd. */
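/* E.g. broadcasting byte 5 of a V16QImode vector: the first (low)
   interleave duplicates it into halfword 5, the second (high) interleave
   duplicates that pair into word 1, and the final pshufd replicates
   word 1 across the whole vector.  */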
37392 do
37393 {
37394 rtx dest;
37395 rtx (*gen) (rtx, rtx, rtx)
37396 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
37397 : gen_vec_interleave_lowv8hi;
37398
37399 if (elt >= nelt2)
37400 {
37401 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
37402 : gen_vec_interleave_highv8hi;
37403 elt -= nelt2;
37404 }
37405 nelt2 /= 2;
37406
37407 dest = gen_reg_rtx (vmode);
37408 emit_insn (gen (dest, op0, op0));
37409 vmode = get_mode_wider_vector (vmode);
37410 op0 = gen_lowpart (vmode, dest);
37411 }
37412 while (vmode != V4SImode);
37413
37414 memset (perm2, elt, 4);
37415 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
37416 d->testing_p);
37417 gcc_assert (ok);
37418 return true;
37419
37420 case V32QImode:
37421 case V16HImode:
37422 case V8SImode:
37423 case V4DImode:
37424 /* For AVX2 broadcasts of the first element vpbroadcast* or
37425 vpermq should be used by expand_vec_perm_1. */
37426 gcc_assert (!TARGET_AVX2 || d->perm[0]);
37427 return false;
37428
37429 default:
37430 gcc_unreachable ();
37431 }
37432 }
37433
37434 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
37435 broadcast permutations. */
37436
37437 static bool
37438 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
37439 {
37440 unsigned i, elt, nelt = d->nelt;
37441
37442 if (d->op0 != d->op1)
37443 return false;
37444
37445 elt = d->perm[0];
37446 for (i = 1; i < nelt; ++i)
37447 if (d->perm[i] != elt)
37448 return false;
37449
37450 return expand_vec_perm_broadcast_1 (d);
37451 }
37452
37453 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
37454 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
37455 all the shorter instruction sequences. */
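/* Roughly: masks 0 and 1 select the bytes of op0 that stay in,
   respectively must cross, their 128-bit lane; masks 2 and 3 do the same
   for op1.  The two cross-lane vpshufb results are lane-swapped with
   vpermq, and up to three vpor insns merge everything into the target.  */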
37456
37457 static bool
37458 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
37459 {
37460 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
37461 unsigned int i, nelt, eltsz;
37462 bool used[4];
37463
37464 if (!TARGET_AVX2
37465 || d->op0 == d->op1
37466 || (d->vmode != V32QImode && d->vmode != V16HImode))
37467 return false;
37468
37469 if (d->testing_p)
37470 return true;
37471
37472 nelt = d->nelt;
37473 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37474
37475 /* Generate 4 permutation masks. If the required element is within
37476 the same lane, it is shuffled in.  If the required element is in the
37477 other lane, force a zero by setting bit 7 in the permutation mask.
37478 The other mask has non-negative elements only where the element
37479 is requested from the other lane; those are also moved to the other lane,
37480 so that the result of vpshufb can have the two V2TImode halves
37481 swapped. */
37482 m128 = GEN_INT (-128);
37483 for (i = 0; i < 32; ++i)
37484 {
37485 rperm[0][i] = m128;
37486 rperm[1][i] = m128;
37487 rperm[2][i] = m128;
37488 rperm[3][i] = m128;
37489 }
37490 used[0] = false;
37491 used[1] = false;
37492 used[2] = false;
37493 used[3] = false;
37494 for (i = 0; i < nelt; ++i)
37495 {
37496 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37497 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37498 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
37499
37500 for (j = 0; j < eltsz; ++j)
37501 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
37502 used[which] = true;
37503 }
37504
37505 for (i = 0; i < 2; ++i)
37506 {
37507 if (!used[2 * i + 1])
37508 {
37509 h[i] = NULL_RTX;
37510 continue;
37511 }
37512 vperm = gen_rtx_CONST_VECTOR (V32QImode,
37513 gen_rtvec_v (32, rperm[2 * i + 1]));
37514 vperm = force_reg (V32QImode, vperm);
37515 h[i] = gen_reg_rtx (V32QImode);
37516 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37517 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
37518 }
37519
37520 /* Swap the 128-bit lanes of h[X].  */
37521 for (i = 0; i < 2; ++i)
37522 {
37523 if (h[i] == NULL_RTX)
37524 continue;
37525 op = gen_reg_rtx (V4DImode);
37526 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
37527 const2_rtx, GEN_INT (3), const0_rtx,
37528 const1_rtx));
37529 h[i] = gen_lowpart (V32QImode, op);
37530 }
37531
37532 for (i = 0; i < 2; ++i)
37533 {
37534 if (!used[2 * i])
37535 {
37536 l[i] = NULL_RTX;
37537 continue;
37538 }
37539 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
37540 vperm = force_reg (V32QImode, vperm);
37541 l[i] = gen_reg_rtx (V32QImode);
37542 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37543 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
37544 }
37545
37546 for (i = 0; i < 2; ++i)
37547 {
37548 if (h[i] && l[i])
37549 {
37550 op = gen_reg_rtx (V32QImode);
37551 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
37552 l[i] = op;
37553 }
37554 else if (h[i])
37555 l[i] = h[i];
37556 }
37557
37558 gcc_assert (l[0] && l[1]);
37559 op = gen_lowpart (V32QImode, d->target);
37560 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
37561 return true;
37562 }
37563
37564 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
37565 With all of the interface bits taken care of, perform the expansion
37566 in D and return true on success. */
37567
37568 static bool
37569 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
37570 {
37571 /* Try a single instruction expansion. */
37572 if (expand_vec_perm_1 (d))
37573 return true;
37574
37575 /* Try sequences of two instructions. */
37576
37577 if (expand_vec_perm_pshuflw_pshufhw (d))
37578 return true;
37579
37580 if (expand_vec_perm_palignr (d))
37581 return true;
37582
37583 if (expand_vec_perm_interleave2 (d))
37584 return true;
37585
37586 if (expand_vec_perm_broadcast (d))
37587 return true;
37588
37589 if (expand_vec_perm_vpermq_perm_1 (d))
37590 return true;
37591
37592 if (expand_vec_perm_vperm2f128 (d))
37593 return true;
37594
37595 /* Try sequences of three instructions. */
37596
37597 if (expand_vec_perm_pshufb2 (d))
37598 return true;
37599
37600 if (expand_vec_perm_interleave3 (d))
37601 return true;
37602
37603 if (expand_vec_perm_vperm2f128_vblend (d))
37604 return true;
37605
37606 /* Try sequences of four instructions. */
37607
37608 if (expand_vec_perm_vpshufb2_vpermq (d))
37609 return true;
37610
37611 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
37612 return true;
37613
37614 /* ??? Look for narrow permutations whose element orderings would
37615 allow the promotion to a wider mode. */
37616
37617 /* ??? Look for sequences of interleave or a wider permute that place
37618 the data into the correct lanes for a half-vector shuffle like
37619 pshuf[lh]w or vpermilps. */
37620
37621 /* ??? Look for sequences of interleave that produce the desired results.
37622 The combinatorics of punpck[lh] get pretty ugly... */
37623
37624 if (expand_vec_perm_even_odd (d))
37625 return true;
37626
37627 /* Even longer sequences. */
37628 if (expand_vec_perm_vpshufb4_vpermq2 (d))
37629 return true;
37630
37631 return false;
37632 }
37633
37634 bool
37635 ix86_expand_vec_perm_const (rtx operands[4])
37636 {
37637 struct expand_vec_perm_d d;
37638 unsigned char perm[MAX_VECT_LEN];
37639 int i, nelt, which;
37640 rtx sel;
37641
37642 d.target = operands[0];
37643 d.op0 = operands[1];
37644 d.op1 = operands[2];
37645 sel = operands[3];
37646
37647 d.vmode = GET_MODE (d.target);
37648 gcc_assert (VECTOR_MODE_P (d.vmode));
37649 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37650 d.testing_p = false;
37651
37652 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
37653 gcc_assert (XVECLEN (sel, 0) == nelt);
37654 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
37655
37656 for (i = which = 0; i < nelt; ++i)
37657 {
37658 rtx e = XVECEXP (sel, 0, i);
37659 int ei = INTVAL (e) & (2 * nelt - 1);
37660
37661 which |= (ei < nelt ? 1 : 2);
37662 d.perm[i] = ei;
37663 perm[i] = ei;
37664 }
37665
37666 switch (which)
37667 {
37668 default:
37669 gcc_unreachable ();
37670
37671 case 3:
37672 if (!rtx_equal_p (d.op0, d.op1))
37673 break;
37674
37675 /* The elements of PERM do not suggest that only the first operand
37676 is used, but both operands are identical. Allow easier matching
37677 of the permutation by folding the permutation into the single
37678 input vector. */
37679 for (i = 0; i < nelt; ++i)
37680 if (d.perm[i] >= nelt)
37681 d.perm[i] -= nelt;
37682 /* FALLTHRU */
37683
37684 case 1:
37685 d.op1 = d.op0;
37686 break;
37687
37688 case 2:
37689 for (i = 0; i < nelt; ++i)
37690 d.perm[i] -= nelt;
37691 d.op0 = d.op1;
37692 break;
37693 }
37694
37695 if (ix86_expand_vec_perm_const_1 (&d))
37696 return true;
37697
37698 /* If the mask says both arguments are needed, but they are the same,
37699 the above tried to expand with d.op0 == d.op1. If that didn't work,
37700 retry with d.op0 != d.op1 as that is what testing has been done with. */
37701 if (which == 3 && d.op0 == d.op1)
37702 {
37703 rtx seq;
37704 bool ok;
37705
37706 memcpy (d.perm, perm, sizeof (perm));
37707 d.op1 = gen_reg_rtx (d.vmode);
37708 start_sequence ();
37709 ok = ix86_expand_vec_perm_const_1 (&d);
37710 seq = get_insns ();
37711 end_sequence ();
37712 if (ok)
37713 {
37714 emit_move_insn (d.op1, d.op0);
37715 emit_insn (seq);
37716 return true;
37717 }
37718 }
37719
37720 return false;
37721 }
37722
37723 /* Implement targetm.vectorize.vec_perm_const_ok. */
37724
37725 static bool
37726 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
37727 const unsigned char *sel)
37728 {
37729 struct expand_vec_perm_d d;
37730 unsigned int i, nelt, which;
37731 bool ret, one_vec;
37732
37733 d.vmode = vmode;
37734 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37735 d.testing_p = true;
37736
37737 /* Given sufficient ISA support we can just return true here
37738 for selected vector modes. */
37739 if (GET_MODE_SIZE (d.vmode) == 16)
37740 {
37741 /* All implementable with a single vpperm insn. */
37742 if (TARGET_XOP)
37743 return true;
37744 /* All implementable with 2 pshufb + 1 ior. */
37745 if (TARGET_SSSE3)
37746 return true;
37747 /* All implementable with shufpd or unpck[lh]pd. */
37748 if (d.nelt == 2)
37749 return true;
37750 }
37751
37752 /* Extract the values from the vector CST into the permutation
37753 array in D. */
37754 memcpy (d.perm, sel, nelt);
37755 for (i = which = 0; i < nelt; ++i)
37756 {
37757 unsigned char e = d.perm[i];
37758 gcc_assert (e < 2 * nelt);
37759 which |= (e < nelt ? 1 : 2);
37760 }
37761
37762 /* For all elements from second vector, fold the elements to first. */
37763 if (which == 2)
37764 for (i = 0; i < nelt; ++i)
37765 d.perm[i] -= nelt;
37766
37767 /* Check whether the mask can be applied to the vector type. */
37768 one_vec = (which != 3);
37769
37770 /* Implementable with shufps or pshufd. */
37771 if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
37772 return true;
37773
37774 /* Otherwise we have to go through the motions and see if we can
37775 figure out how to generate the requested permutation. */
37776 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
37777 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
37778 if (!one_vec)
37779 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
37780
37781 start_sequence ();
37782 ret = ix86_expand_vec_perm_const_1 (&d);
37783 end_sequence ();
37784
37785 return ret;
37786 }
37787
37788 void
37789 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
37790 {
37791 struct expand_vec_perm_d d;
37792 unsigned i, nelt;
37793
37794 d.target = targ;
37795 d.op0 = op0;
37796 d.op1 = op1;
37797 d.vmode = GET_MODE (targ);
37798 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37799 d.testing_p = false;
37800
37801 for (i = 0; i < nelt; ++i)
37802 d.perm[i] = i * 2 + odd;
37803
37804 /* We'll either be able to implement the permutation directly... */
37805 if (expand_vec_perm_1 (&d))
37806 return;
37807
37808 /* ... or we use the special-case patterns. */
37809 expand_vec_perm_even_odd_1 (&d, odd);
37810 }
37811
37812 /* Expand an insert into a vector register through pinsr insn.
37813 Return true if successful. */
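/* For example, inserting a HImode value at bit offset 32 of a V8HImode
   destination: size == 16, so pos /= size selects element 2 and pinsrw
   is emitted with the one-hot constant (1 << 2).  */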
37814
37815 bool
37816 ix86_expand_pinsr (rtx *operands)
37817 {
37818 rtx dst = operands[0];
37819 rtx src = operands[3];
37820
37821 unsigned int size = INTVAL (operands[1]);
37822 unsigned int pos = INTVAL (operands[2]);
37823
37824 if (GET_CODE (dst) == SUBREG)
37825 {
37826 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
37827 dst = SUBREG_REG (dst);
37828 }
37829
37830 if (GET_CODE (src) == SUBREG)
37831 src = SUBREG_REG (src);
37832
37833 switch (GET_MODE (dst))
37834 {
37835 case V16QImode:
37836 case V8HImode:
37837 case V4SImode:
37838 case V2DImode:
37839 {
37840 enum machine_mode srcmode, dstmode;
37841 rtx (*pinsr)(rtx, rtx, rtx, rtx);
37842
37843 srcmode = mode_for_size (size, MODE_INT, 0);
37844
37845 switch (srcmode)
37846 {
37847 case QImode:
37848 if (!TARGET_SSE4_1)
37849 return false;
37850 dstmode = V16QImode;
37851 pinsr = gen_sse4_1_pinsrb;
37852 break;
37853
37854 case HImode:
37855 if (!TARGET_SSE2)
37856 return false;
37857 dstmode = V8HImode;
37858 pinsr = gen_sse2_pinsrw;
37859 break;
37860
37861 case SImode:
37862 if (!TARGET_SSE4_1)
37863 return false;
37864 dstmode = V4SImode;
37865 pinsr = gen_sse4_1_pinsrd;
37866 break;
37867
37868 case DImode:
37869 gcc_assert (TARGET_64BIT);
37870 if (!TARGET_SSE4_1)
37871 return false;
37872 dstmode = V2DImode;
37873 pinsr = gen_sse4_1_pinsrq;
37874 break;
37875
37876 default:
37877 return false;
37878 }
37879
37880 dst = gen_lowpart (dstmode, dst);
37881 src = gen_lowpart (srcmode, src);
37882
37883 pos /= size;
37884
37885 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
37886 return true;
37887 }
37888
37889 default:
37890 return false;
37891 }
37892 }
37893 \f
37894 /* This function returns the calling abi specific va_list type node.
37895 It returns the FNDECL specific va_list type. */
37896
37897 static tree
37898 ix86_fn_abi_va_list (tree fndecl)
37899 {
37900 if (!TARGET_64BIT)
37901 return va_list_type_node;
37902 gcc_assert (fndecl != NULL_TREE);
37903
37904 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
37905 return ms_va_list_type_node;
37906 else
37907 return sysv_va_list_type_node;
37908 }
37909
37910 /* Returns the canonical va_list type specified by TYPE. If there
37911 is no valid TYPE provided, it returns NULL_TREE.  */
37912
37913 static tree
37914 ix86_canonical_va_list_type (tree type)
37915 {
37916 tree wtype, htype;
37917
37918 /* Resolve references and pointers to va_list type. */
37919 if (TREE_CODE (type) == MEM_REF)
37920 type = TREE_TYPE (type);
37921 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
37922 type = TREE_TYPE (type);
37923 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
37924 type = TREE_TYPE (type);
37925
37926 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
37927 {
37928 wtype = va_list_type_node;
37929 gcc_assert (wtype != NULL_TREE);
37930 htype = type;
37931 if (TREE_CODE (wtype) == ARRAY_TYPE)
37932 {
37933 /* If va_list is an array type, the argument may have decayed
37934 to a pointer type, e.g. by being passed to another function.
37935 In that case, unwrap both types so that we can compare the
37936 underlying records. */
37937 if (TREE_CODE (htype) == ARRAY_TYPE
37938 || POINTER_TYPE_P (htype))
37939 {
37940 wtype = TREE_TYPE (wtype);
37941 htype = TREE_TYPE (htype);
37942 }
37943 }
37944 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37945 return va_list_type_node;
37946 wtype = sysv_va_list_type_node;
37947 gcc_assert (wtype != NULL_TREE);
37948 htype = type;
37949 if (TREE_CODE (wtype) == ARRAY_TYPE)
37950 {
37951 /* If va_list is an array type, the argument may have decayed
37952 to a pointer type, e.g. by being passed to another function.
37953 In that case, unwrap both types so that we can compare the
37954 underlying records. */
37955 if (TREE_CODE (htype) == ARRAY_TYPE
37956 || POINTER_TYPE_P (htype))
37957 {
37958 wtype = TREE_TYPE (wtype);
37959 htype = TREE_TYPE (htype);
37960 }
37961 }
37962 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37963 return sysv_va_list_type_node;
37964 wtype = ms_va_list_type_node;
37965 gcc_assert (wtype != NULL_TREE);
37966 htype = type;
37967 if (TREE_CODE (wtype) == ARRAY_TYPE)
37968 {
37969 /* If va_list is an array type, the argument may have decayed
37970 to a pointer type, e.g. by being passed to another function.
37971 In that case, unwrap both types so that we can compare the
37972 underlying records. */
37973 if (TREE_CODE (htype) == ARRAY_TYPE
37974 || POINTER_TYPE_P (htype))
37975 {
37976 wtype = TREE_TYPE (wtype);
37977 htype = TREE_TYPE (htype);
37978 }
37979 }
37980 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37981 return ms_va_list_type_node;
37982 return NULL_TREE;
37983 }
37984 return std_canonical_va_list_type (type);
37985 }
37986
37987 /* Iterate through the target-specific builtin types for va_list.
37988 IDX denotes the iterator, *PTREE is set to the result type of
37989 the va_list builtin, and *PNAME to its internal type.
37990 Returns zero if there is no element for this index, otherwise
37991 IDX should be increased upon the next call.
37992 Note, do not iterate a base builtin's name like __builtin_va_list.
37993 Used from c_common_nodes_and_builtins. */
37994
37995 static int
37996 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
37997 {
37998 if (TARGET_64BIT)
37999 {
38000 switch (idx)
38001 {
38002 default:
38003 break;
38004
38005 case 0:
38006 *ptree = ms_va_list_type_node;
38007 *pname = "__builtin_ms_va_list";
38008 return 1;
38009
38010 case 1:
38011 *ptree = sysv_va_list_type_node;
38012 *pname = "__builtin_sysv_va_list";
38013 return 1;
38014 }
38015 }
38016
38017 return 0;
38018 }
38019
38020 #undef TARGET_SCHED_DISPATCH
38021 #define TARGET_SCHED_DISPATCH has_dispatch
38022 #undef TARGET_SCHED_DISPATCH_DO
38023 #define TARGET_SCHED_DISPATCH_DO do_dispatch
38024 #undef TARGET_SCHED_REASSOCIATION_WIDTH
38025 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
38026
38027 /* The size of the dispatch window is the total number of bytes of
38028 object code allowed in a window. */
38029 #define DISPATCH_WINDOW_SIZE 16
38030
38031 /* Number of dispatch windows considered for scheduling. */
38032 #define MAX_DISPATCH_WINDOWS 3
38033
38034 /* Maximum number of instructions in a window. */
38035 #define MAX_INSN 4
38036
38037 /* Maximum number of immediate operands in a window. */
38038 #define MAX_IMM 4
38039
38040 /* Maximum number of immediate bits allowed in a window. */
38041 #define MAX_IMM_SIZE 128
38042
38043 /* Maximum number of 32 bit immediates allowed in a window. */
38044 #define MAX_IMM_32 4
38045
38046 /* Maximum number of 64 bit immediates allowed in a window. */
38047 #define MAX_IMM_64 2
38048
38049 /* Maximum total of loads or prefetches allowed in a window. */
38050 #define MAX_LOAD 2
38051
38052 /* Maximum total of stores allowed in a window. */
38053 #define MAX_STORE 1
38054
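/* Taken together, the limits above allow a window to hold at most 4 insns
   in 16 bytes of code, with at most 4 immediate operands (no more than
   4 32-bit or 2 64-bit ones), 2 loads or prefetches, and 1 store.  */
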
38055 #undef BIG
38056 #define BIG 100
38057
38058
38059 /* Dispatch groups.  Instructions that affect the mix in a dispatch window.  */
38060 enum dispatch_group {
38061 disp_no_group = 0,
38062 disp_load,
38063 disp_store,
38064 disp_load_store,
38065 disp_prefetch,
38066 disp_imm,
38067 disp_imm_32,
38068 disp_imm_64,
38069 disp_branch,
38070 disp_cmp,
38071 disp_jcc,
38072 disp_last
38073 };
38074
38075 /* Number of allowable groups in a dispatch window. It is an array
38076 indexed by the dispatch_group enum.  100 is used as a big number
38077 because the number of these kinds of operations does not have any
38078 effect in a dispatch window, but we need them for other reasons in
38079 the table. */
38080 static unsigned int num_allowable_groups[disp_last] = {
38081 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
38082 };
38083
38084 char group_name[disp_last + 1][16] = {
38085 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
38086 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
38087 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
38088 };
38089
38090 /* Instruction path. */
38091 enum insn_path {
38092 no_path = 0,
38093 path_single, /* Single micro op. */
38094 path_double, /* Double micro op. */
38095 path_multi, /* Instructions with more than 2 micro ops.  */
38096 last_path
38097 };
38098
38099 /* sched_insn_info defines a window to the instructions scheduled in
38100 the basic block. It contains a pointer to the insn_info table and
38101 the instruction scheduled.
38102
38103 Windows are allocated for each basic block and are linked
38104 together. */
38105 typedef struct sched_insn_info_s {
38106 rtx insn;
38107 enum dispatch_group group;
38108 enum insn_path path;
38109 int byte_len;
38110 int imm_bytes;
38111 } sched_insn_info;
38112
38113 /* Linked list of dispatch windows. This is a two way list of
38114 dispatch windows of a basic block. It contains information about
38115 the number of uops in the window and the total number of
38116 instructions and of bytes in the object code for this dispatch
38117 window. */
38118 typedef struct dispatch_windows_s {
38119 int num_insn; /* Number of insn in the window. */
38120 int num_uops; /* Number of uops in the window. */
38121 int window_size; /* Number of bytes in the window. */
38122 int window_num; /* Window number, either 0 or 1.  */
38123 int num_imm; /* Number of immediates in an insn. */
38124 int num_imm_32; /* Number of 32 bit immediates in an insn. */
38125 int num_imm_64; /* Number of 64 bit immediates in an insn. */
38126 int imm_size; /* Total immediates in the window. */
38127 int num_loads; /* Total memory loads in the window. */
38128 int num_stores; /* Total memory stores in the window. */
38129 int violation; /* Violation exists in window. */
38130 sched_insn_info *window; /* Pointer to the window. */
38131 struct dispatch_windows_s *next;
38132 struct dispatch_windows_s *prev;
38133 } dispatch_windows;
38134
38135 /* Immediate values used in an insn.  */
38136 typedef struct imm_info_s
38137 {
38138 int imm;
38139 int imm32;
38140 int imm64;
38141 } imm_info;
38142
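/* Window 0 and window 1 of the current 48-byte scheduling region.
   Window 1 is linked in behind window 0 once window 0 fills up, and
   process_end_window checks that the pair never exceeds 48 bytes.  */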
38143 static dispatch_windows *dispatch_window_list;
38144 static dispatch_windows *dispatch_window_list1;
38145
38146 /* Get the memory dispatch group of insn.  */
38147
38148 static enum dispatch_group
38149 get_mem_group (rtx insn)
38150 {
38151 enum attr_memory memory;
38152
38153 if (INSN_CODE (insn) < 0)
38154 return disp_no_group;
38155 memory = get_attr_memory (insn);
38156 if (memory == MEMORY_STORE)
38157 return disp_store;
38158
38159 if (memory == MEMORY_LOAD)
38160 return disp_load;
38161
38162 if (memory == MEMORY_BOTH)
38163 return disp_load_store;
38164
38165 return disp_no_group;
38166 }
38167
38168 /* Return true if insn is a compare instruction. */
38169
38170 static bool
38171 is_cmp (rtx insn)
38172 {
38173 enum attr_type type;
38174
38175 type = get_attr_type (insn);
38176 return (type == TYPE_TEST
38177 || type == TYPE_ICMP
38178 || type == TYPE_FCMP
38179 || GET_CODE (PATTERN (insn)) == COMPARE);
38180 }
38181
38182 /* Return true if a dispatch violation was encountered.  */
38183
38184 static bool
38185 dispatch_violation (void)
38186 {
38187 if (dispatch_window_list->next)
38188 return dispatch_window_list->next->violation;
38189 return dispatch_window_list->violation;
38190 }
38191
38192 /* Return true if insn is a branch instruction. */
38193
38194 static bool
38195 is_branch (rtx insn)
38196 {
38197 return (CALL_P (insn) || JUMP_P (insn));
38198 }
38199
38200 /* Return true if insn is a prefetch instruction. */
38201
38202 static bool
38203 is_prefetch (rtx insn)
38204 {
38205 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
38206 }
38207
38208 /* This function initializes a dispatch window and the list container holding a
38209 pointer to the window. */
38210
38211 static void
38212 init_window (int window_num)
38213 {
38214 int i;
38215 dispatch_windows *new_list;
38216
38217 if (window_num == 0)
38218 new_list = dispatch_window_list;
38219 else
38220 new_list = dispatch_window_list1;
38221
38222 new_list->num_insn = 0;
38223 new_list->num_uops = 0;
38224 new_list->window_size = 0;
38225 new_list->next = NULL;
38226 new_list->prev = NULL;
38227 new_list->window_num = window_num;
38228 new_list->num_imm = 0;
38229 new_list->num_imm_32 = 0;
38230 new_list->num_imm_64 = 0;
38231 new_list->imm_size = 0;
38232 new_list->num_loads = 0;
38233 new_list->num_stores = 0;
38234 new_list->violation = false;
38235
38236 for (i = 0; i < MAX_INSN; i++)
38237 {
38238 new_list->window[i].insn = NULL;
38239 new_list->window[i].group = disp_no_group;
38240 new_list->window[i].path = no_path;
38241 new_list->window[i].byte_len = 0;
38242 new_list->window[i].imm_bytes = 0;
38243 }
38244 return;
38245 }
38246
38247 /* This function allocates and initializes a dispatch window and the
38248 list container holding a pointer to the window. */
38249
38250 static dispatch_windows *
38251 allocate_window (void)
38252 {
38253 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
38254 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
38255
38256 return new_list;
38257 }
38258
38259 /* This routine initializes the dispatch scheduling information. It
38260 initiates building dispatch scheduler tables and constructs the
38261 first dispatch window. */
38262
38263 static void
38264 init_dispatch_sched (void)
38265 {
38266 /* Allocate a dispatch list and a window. */
38267 dispatch_window_list = allocate_window ();
38268 dispatch_window_list1 = allocate_window ();
38269 init_window (0);
38270 init_window (1);
38271 }
38272
38273 /* This function returns true if a branch is detected.  The end of a basic
38274 block does not have to be a branch, but here we assume only branches end a
38275 window. */
38276
38277 static bool
38278 is_end_basic_block (enum dispatch_group group)
38279 {
38280 return group == disp_branch;
38281 }
38282
38283 /* This function is called when the end of a window processing is reached. */
38284
38285 static void
38286 process_end_window (void)
38287 {
38288 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
38289 if (dispatch_window_list->next)
38290 {
38291 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
38292 gcc_assert (dispatch_window_list->window_size
38293 + dispatch_window_list1->window_size <= 48);
38294 init_window (1);
38295 }
38296 init_window (0);
38297 }
38298
38299 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
38300 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
38301 for 48 bytes of instructions.  Note that these windows are not dispatch
38302 windows whose sizes are DISPATCH_WINDOW_SIZE.  */
38303
38304 static dispatch_windows *
38305 allocate_next_window (int window_num)
38306 {
38307 if (window_num == 0)
38308 {
38309 if (dispatch_window_list->next)
38310 init_window (1);
38311 init_window (0);
38312 return dispatch_window_list;
38313 }
38314
38315 dispatch_window_list->next = dispatch_window_list1;
38316 dispatch_window_list1->prev = dispatch_window_list;
38317
38318 return dispatch_window_list1;
38319 }
38320
38321 /* A for_each_rtx callback: count the immediate operands found in *IN_RTX into IMM_VALUES.  */
38322
38323 static int
38324 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
38325 {
38326 if (*in_rtx == 0)
38327 return 0;
38328
38329 switch (GET_CODE (*in_rtx))
38330 {
38331 case CONST:
38332 case SYMBOL_REF:
38333 case CONST_INT:
38334 (imm_values->imm)++;
38335 if (x86_64_immediate_operand (*in_rtx, SImode))
38336 (imm_values->imm32)++;
38337 else
38338 (imm_values->imm64)++;
38339 break;
38340
38341 case CONST_DOUBLE:
38342 (imm_values->imm)++;
38343 (imm_values->imm64)++;
38344 break;
38345
38346 case CODE_LABEL:
38347 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
38348 {
38349 (imm_values->imm)++;
38350 (imm_values->imm32)++;
38351 }
38352 break;
38353
38354 default:
38355 break;
38356 }
38357
38358 return 0;
38359 }
38360
38361 /* Compute number of immediate operands of an instruction. */
38362
38363 static void
38364 find_constant (rtx in_rtx, imm_info *imm_values)
38365 {
38366 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
38367 (rtx_function) find_constant_1, (void *) imm_values);
38368 }
38369
38370 /* Return total size of immediate operands of an instruction along with number
38371 of corresponding immediate-operands. It initializes its parameters to zero
38372 before calling FIND_CONSTANT.
38373 INSN is the input instruction. IMM is the total of immediates.
38374 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
38375 bit immediates. */
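/* E.g. an insn with a single CONST_INT that satisfies
   x86_64_immediate_operand in SImode yields *IMM == 1, *IMM32 == 1,
   *IMM64 == 0 and a return value of 4 bytes.  */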
38376
38377 static int
38378 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
38379 {
38380 imm_info imm_values = {0, 0, 0};
38381
38382 find_constant (insn, &imm_values);
38383 *imm = imm_values.imm;
38384 *imm32 = imm_values.imm32;
38385 *imm64 = imm_values.imm64;
38386 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
38387 }
38388
38389 /* This function indicates whether an instruction has any immediate
38390 operands.  */
38391
38392 static bool
38393 has_immediate (rtx insn)
38394 {
38395 int num_imm_operand;
38396 int num_imm32_operand;
38397 int num_imm64_operand;
38398
38399 if (insn)
38400 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38401 &num_imm64_operand);
38402 return false;
38403 }
38404
38405 /* Return single or double path for instructions. */
38406
38407 static enum insn_path
38408 get_insn_path (rtx insn)
38409 {
38410 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
38411
38412 if ((int)path == 0)
38413 return path_single;
38414
38415 if ((int)path == 1)
38416 return path_double;
38417
38418 return path_multi;
38419 }
38420
38421 /* Return insn dispatch group. */
38422
38423 static enum dispatch_group
38424 get_insn_group (rtx insn)
38425 {
38426 enum dispatch_group group = get_mem_group (insn);
38427 if (group)
38428 return group;
38429
38430 if (is_branch (insn))
38431 return disp_branch;
38432
38433 if (is_cmp (insn))
38434 return disp_cmp;
38435
38436 if (has_immediate (insn))
38437 return disp_imm;
38438
38439 if (is_prefetch (insn))
38440 return disp_prefetch;
38441
38442 return disp_no_group;
38443 }
38444
38445 /* Count number of GROUP restricted instructions in a dispatch
38446 window WINDOW_LIST. */
38447
38448 static int
38449 count_num_restricted (rtx insn, dispatch_windows *window_list)
38450 {
38451 enum dispatch_group group = get_insn_group (insn);
38452 int imm_size;
38453 int num_imm_operand;
38454 int num_imm32_operand;
38455 int num_imm64_operand;
38456
38457 if (group == disp_no_group)
38458 return 0;
38459
38460 if (group == disp_imm)
38461 {
38462 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38463 &num_imm64_operand);
38464 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
38465 || num_imm_operand + window_list->num_imm > MAX_IMM
38466 || (num_imm32_operand > 0
38467 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
38468 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
38469 || (num_imm64_operand > 0
38470 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
38471 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
38472 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
38473 && num_imm64_operand > 0
38474 && ((window_list->num_imm_64 > 0
38475 && window_list->num_insn >= 2)
38476 || window_list->num_insn >= 3)))
38477 return BIG;
38478
38479 return 1;
38480 }
38481
38482 if ((group == disp_load_store
38483 && (window_list->num_loads >= MAX_LOAD
38484 || window_list->num_stores >= MAX_STORE))
38485 || ((group == disp_load
38486 || group == disp_prefetch)
38487 && window_list->num_loads >= MAX_LOAD)
38488 || (group == disp_store
38489 && window_list->num_stores >= MAX_STORE))
38490 return BIG;
38491
38492 return 1;
38493 }
38494
38495 /* This function returns true if insn satisfies dispatch rules on the
38496 last window scheduled. */
38497
38498 static bool
38499 fits_dispatch_window (rtx insn)
38500 {
38501 dispatch_windows *window_list = dispatch_window_list;
38502 dispatch_windows *window_list_next = dispatch_window_list->next;
38503 unsigned int num_restrict;
38504 enum dispatch_group group = get_insn_group (insn);
38505 enum insn_path path = get_insn_path (insn);
38506 int sum;
38507
38508 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
38509 instructions should be given the lowest priority in the
38510 scheduling process in the Haifa scheduler to make sure they will be
38511 scheduled in the same dispatch window as the reference to them.  */
38512 if (group == disp_jcc || group == disp_cmp)
38513 return false;
38514
38515 /* Check nonrestricted. */
38516 if (group == disp_no_group || group == disp_branch)
38517 return true;
38518
38519 /* Get last dispatch window. */
38520 if (window_list_next)
38521 window_list = window_list_next;
38522
38523 if (window_list->window_num == 1)
38524 {
38525 sum = window_list->prev->window_size + window_list->window_size;
38526
38527 if (sum == 32
38528 || (min_insn_size (insn) + sum) >= 48)
38529 /* Window 1 is full. Go for next window. */
38530 return true;
38531 }
38532
38533 num_restrict = count_num_restricted (insn, window_list);
38534
38535 if (num_restrict > num_allowable_groups[group])
38536 return false;
38537
38538 /* See if it fits in the first window. */
38539 if (window_list->window_num == 0)
38540 {
38541 /* The first window should have only single and double path
38542 uops. */
38543 if (path == path_double
38544 && (window_list->num_uops + 2) > MAX_INSN)
38545 return false;
38546 else if (path != path_single)
38547 return false;
38548 }
38549 return true;
38550 }
38551
38552 /* Add an instruction INSN with NUM_UOPS micro-operations to the
38553 dispatch window WINDOW_LIST. */
38554
38555 static void
38556 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
38557 {
38558 int byte_len = min_insn_size (insn);
38559 int num_insn = window_list->num_insn;
38560 int imm_size;
38561 sched_insn_info *window = window_list->window;
38562 enum dispatch_group group = get_insn_group (insn);
38563 enum insn_path path = get_insn_path (insn);
38564 int num_imm_operand;
38565 int num_imm32_operand;
38566 int num_imm64_operand;
38567
38568 if (!window_list->violation && group != disp_cmp
38569 && !fits_dispatch_window (insn))
38570 window_list->violation = true;
38571
38572 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38573 &num_imm64_operand);
38574
38575 /* Initialize window with new instruction. */
38576 window[num_insn].insn = insn;
38577 window[num_insn].byte_len = byte_len;
38578 window[num_insn].group = group;
38579 window[num_insn].path = path;
38580 window[num_insn].imm_bytes = imm_size;
38581
38582 window_list->window_size += byte_len;
38583 window_list->num_insn = num_insn + 1;
38584 window_list->num_uops = window_list->num_uops + num_uops;
38585 window_list->imm_size += imm_size;
38586 window_list->num_imm += num_imm_operand;
38587 window_list->num_imm_32 += num_imm32_operand;
38588 window_list->num_imm_64 += num_imm64_operand;
38589
38590 if (group == disp_store)
38591 window_list->num_stores += 1;
38592 else if (group == disp_load
38593 || group == disp_prefetch)
38594 window_list->num_loads += 1;
38595 else if (group == disp_load_store)
38596 {
38597 window_list->num_stores += 1;
38598 window_list->num_loads += 1;
38599 }
38600 }
38601
38602 /* Adds a scheduled instruction, INSN, to the current dispatch window.
38603 If the total bytes of instructions or the number of instructions in
38604 the window exceed the allowable limits, it allocates a new window.  */
38605
38606 static void
38607 add_to_dispatch_window (rtx insn)
38608 {
38609 int byte_len;
38610 dispatch_windows *window_list;
38611 dispatch_windows *next_list;
38612 dispatch_windows *window0_list;
38613 enum insn_path path;
38614 enum dispatch_group insn_group;
38615 bool insn_fits;
38616 int num_insn;
38617 int num_uops;
38618 int window_num;
38619 int insn_num_uops;
38620 int sum;
38621
38622 if (INSN_CODE (insn) < 0)
38623 return;
38624
38625 byte_len = min_insn_size (insn);
38626 window_list = dispatch_window_list;
38627 next_list = window_list->next;
38628 path = get_insn_path (insn);
38629 insn_group = get_insn_group (insn);
38630
38631 /* Get the last dispatch window. */
38632 if (next_list)
38633 window_list = dispatch_window_list->next;
38634
38635 if (path == path_single)
38636 insn_num_uops = 1;
38637 else if (path == path_double)
38638 insn_num_uops = 2;
38639 else
38640 insn_num_uops = (int) path;
38641
38642 /* If the current window is full, get a new window.
38643 Window number zero is full if MAX_INSN uops are scheduled in it.
38644 Window number one is full if window zero's bytes plus window
38645 one's bytes is 32, or if adding the bytes of the new instruction
38646 makes the total greater than 48, or if it already has MAX_INSN
38647 instructions in it.  */
38648 num_insn = window_list->num_insn;
38649 num_uops = window_list->num_uops;
38650 window_num = window_list->window_num;
38651 insn_fits = fits_dispatch_window (insn);
38652
38653 if (num_insn >= MAX_INSN
38654 || num_uops + insn_num_uops > MAX_INSN
38655 || !(insn_fits))
38656 {
38657 window_num = ~window_num & 1;
38658 window_list = allocate_next_window (window_num);
38659 }
38660
38661 if (window_num == 0)
38662 {
38663 add_insn_window (insn, window_list, insn_num_uops);
38664 if (window_list->num_insn >= MAX_INSN
38665 && insn_group == disp_branch)
38666 {
38667 process_end_window ();
38668 return;
38669 }
38670 }
38671 else if (window_num == 1)
38672 {
38673 window0_list = window_list->prev;
38674 sum = window0_list->window_size + window_list->window_size;
38675 if (sum == 32
38676 || (byte_len + sum) >= 48)
38677 {
38678 process_end_window ();
38679 window_list = dispatch_window_list;
38680 }
38681
38682 add_insn_window (insn, window_list, insn_num_uops);
38683 }
38684 else
38685 gcc_unreachable ();
38686
38687 if (is_end_basic_block (insn_group))
38688 {
38689 /* End of basic block is reached; do end-basic-block processing.  */
38690 process_end_window ();
38691 return;
38692 }
38693 }
38694
38695 /* Print the dispatch window, WINDOW_NUM, to FILE. */
38696
38697 DEBUG_FUNCTION static void
38698 debug_dispatch_window_file (FILE *file, int window_num)
38699 {
38700 dispatch_windows *list;
38701 int i;
38702
38703 if (window_num == 0)
38704 list = dispatch_window_list;
38705 else
38706 list = dispatch_window_list1;
38707
38708 fprintf (file, "Window #%d:\n", list->window_num);
38709 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
38710 list->num_insn, list->num_uops, list->window_size);
38711 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38712 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
38713
38714 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
38715 list->num_stores);
38716 fprintf (file, " insn info:\n");
38717
38718 for (i = 0; i < MAX_INSN; i++)
38719 {
38720 if (!list->window[i].insn)
38721 break;
38722 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
38723 i, group_name[list->window[i].group],
38724 i, (void *)list->window[i].insn,
38725 i, list->window[i].path,
38726 i, list->window[i].byte_len,
38727 i, list->window[i].imm_bytes);
38728 }
38729 }
38730
38731 /* Print to stdout a dispatch window. */
38732
38733 DEBUG_FUNCTION void
38734 debug_dispatch_window (int window_num)
38735 {
38736 debug_dispatch_window_file (stdout, window_num);
38737 }
38738
38739 /* Print INSN dispatch information to FILE. */
38740
38741 DEBUG_FUNCTION static void
38742 debug_insn_dispatch_info_file (FILE *file, rtx insn)
38743 {
38744 int byte_len;
38745 enum insn_path path;
38746 enum dispatch_group group;
38747 int imm_size;
38748 int num_imm_operand;
38749 int num_imm32_operand;
38750 int num_imm64_operand;
38751
38752 if (INSN_CODE (insn) < 0)
38753 return;
38754
38755 byte_len = min_insn_size (insn);
38756 path = get_insn_path (insn);
38757 group = get_insn_group (insn);
38758 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38759 &num_imm64_operand);
38760
38761 fprintf (file, " insn info:\n");
38762 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
38763 group_name[group], path, byte_len);
38764 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38765 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
38766 }
38767
38768 /* Print to stdout the status of the ready list with respect to
38769 the dispatch windows. */
38770
38771 DEBUG_FUNCTION void
38772 debug_ready_dispatch (void)
38773 {
38774 int i;
38775 int no_ready = number_in_ready ();
38776
38777 fprintf (stdout, "Number of ready: %d\n", no_ready);
38778
38779 for (i = 0; i < no_ready; i++)
38780 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
38781 }
38782
38783 /* This routine is the driver of the dispatch scheduler. */
38784
38785 static void
38786 do_dispatch (rtx insn, int mode)
38787 {
38788 if (mode == DISPATCH_INIT)
38789 init_dispatch_sched ();
38790 else if (mode == ADD_TO_DISPATCH_WINDOW)
38791 add_to_dispatch_window (insn);
38792 }
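
/* Illustrative usage only (hypothetical calls), sketching the protocol
   of the driver above; INSN is ignored when MODE is DISPATCH_INIT:

     do_dispatch (NULL_RTX, DISPATCH_INIT);
     do_dispatch (insn, ADD_TO_DISPATCH_WINDOW);  */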
38793
38794 /* Return TRUE if dispatch scheduling is enabled and the query ACTION holds for INSN. */
38795
38796 static bool
38797 has_dispatch (rtx insn, int action)
38798 {
38799 if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
38800 && flag_dispatch_scheduler)
38801 switch (action)
38802 {
38803 default:
38804 return false;
38805
38806 case IS_DISPATCH_ON:
38807 return true;
38809
38810 case IS_CMP:
38811 return is_cmp (insn);
38812
38813 case DISPATCH_VIOLATION:
38814 return dispatch_violation ();
38815
38816 case FITS_DISPATCH_WINDOW:
38817 return fits_dispatch_window (insn);
38818 }
38819
38820 return false;
38821 }
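
/* Illustrative usage only (hypothetical caller): a pass that wants to
   respect the dispatch windows might query this predicate as follows.
   This is just a sketch of the interface, not how the scheduler is
   actually wired up.

     if (has_dispatch (insn, IS_DISPATCH_ON)
         && has_dispatch (insn, FITS_DISPATCH_WINDOW))
       do_dispatch (insn, ADD_TO_DISPATCH_WINDOW);  */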
38822
38823 /* Implementation of the reassociation_width target hook, used by the
38824 reassoc phase to identify the available parallelism in a
38825 reassociated tree. The statement's tree_code is passed in OPC and
38826 the type of its arguments in MODE.
38827
38828 Currently parallel reassociation is enabled only for Atom
38829 processors, and the reassociation width is set to 2 because Atom
38830 may issue up to 2 instructions per cycle.
38831
38832 The return value should be revisited if parallel reassociation is
38833 enabled for other processors. */
38834
38835 static int
38836 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
38837 enum machine_mode mode)
38838 {
38839 int res = 1;
38840
38841 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
38842 res = 2;
38843 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
38844 res = 2;
38845
38846 return res;
38847 }
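
/* Worked example (illustrative): on Atom, where
   TARGET_REASSOC_INT_TO_PARALLEL is set, an integer chain such as
   ((a + b) + (c + d)) reports a width of 2, so the reassociation pass
   may keep (a + b) and (c + d) as independent additions that can issue
   in the same cycle; with a width of 1 the chain stays serial.  */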
38848
38849 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
38850 place emms and femms instructions. */
38851
38852 static enum machine_mode
38853 ix86_preferred_simd_mode (enum machine_mode mode)
38854 {
38855 if (!TARGET_SSE)
38856 return word_mode;
38857
38858 switch (mode)
38859 {
38860 case QImode:
38861 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
38862 case HImode:
38863 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
38864 case SImode:
38865 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
38866 case DImode:
38867 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
38868
38869 case SFmode:
38870 if (TARGET_AVX && !TARGET_PREFER_AVX128)
38871 return V8SFmode;
38872 else
38873 return V4SFmode;
38874
38875 case DFmode:
38876 if (!TARGET_VECTORIZE_DOUBLE)
38877 return word_mode;
38878 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
38879 return V4DFmode;
38880 else if (TARGET_SSE2)
38881 return V2DFmode;
38882 /* FALLTHRU */
38883
38884 default:
38885 return word_mode;
38886 }
38887 }
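
/* Examples (restating the table above): when TARGET_AVX is set and
   TARGET_PREFER_AVX128 is not, SFmode vectorizes into V8SFmode (eight
   floats per vector) and SImode into V8SImode; with SSE only, the same
   scalars map to V4SFmode and V4SImode.  DFmode drops back to
   word_mode, i.e. no vectorization, when TARGET_VECTORIZE_DOUBLE is
   unset.  */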
38888
38889 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
38890 vectors. */
38891
38892 static unsigned int
38893 ix86_autovectorize_vector_sizes (void)
38894 {
38895 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
38896 }
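
/* The value returned above is a bitmask of vector sizes in bytes:
   32 | 16 asks the vectorizer to try both 32-byte (256bit) and 16-byte
   (128bit) vectors, while 0 means only the preferred SIMD mode is
   tried.  */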
38897
38898 /* Initialize the GCC target structure. */
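/* Each hook below is first #undef'd to drop the default supplied by
   target-def.h and then redefined to the i386-specific implementation;
   TARGET_INITIALIZER at the end of the file gathers all TARGET_* macros
   into the targetm structure.  */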
38899 #undef TARGET_RETURN_IN_MEMORY
38900 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
38901
38902 #undef TARGET_LEGITIMIZE_ADDRESS
38903 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
38904
38905 #undef TARGET_ATTRIBUTE_TABLE
38906 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
38907 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38908 # undef TARGET_MERGE_DECL_ATTRIBUTES
38909 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
38910 #endif
38911
38912 #undef TARGET_COMP_TYPE_ATTRIBUTES
38913 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
38914
38915 #undef TARGET_INIT_BUILTINS
38916 #define TARGET_INIT_BUILTINS ix86_init_builtins
38917 #undef TARGET_BUILTIN_DECL
38918 #define TARGET_BUILTIN_DECL ix86_builtin_decl
38919 #undef TARGET_EXPAND_BUILTIN
38920 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
38921
38922 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
38923 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
38924 ix86_builtin_vectorized_function
38925
38926 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
38927 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
38928
38929 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
38930 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
38931
38932 #undef TARGET_VECTORIZE_BUILTIN_GATHER
38933 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
38934
38935 #undef TARGET_BUILTIN_RECIPROCAL
38936 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
38937
38938 #undef TARGET_ASM_FUNCTION_EPILOGUE
38939 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
38940
38941 #undef TARGET_ENCODE_SECTION_INFO
38942 #ifndef SUBTARGET_ENCODE_SECTION_INFO
38943 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
38944 #else
38945 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
38946 #endif
38947
38948 #undef TARGET_ASM_OPEN_PAREN
38949 #define TARGET_ASM_OPEN_PAREN ""
38950 #undef TARGET_ASM_CLOSE_PAREN
38951 #define TARGET_ASM_CLOSE_PAREN ""
38952
38953 #undef TARGET_ASM_BYTE_OP
38954 #define TARGET_ASM_BYTE_OP ASM_BYTE
38955
38956 #undef TARGET_ASM_ALIGNED_HI_OP
38957 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
38958 #undef TARGET_ASM_ALIGNED_SI_OP
38959 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
38960 #ifdef ASM_QUAD
38961 #undef TARGET_ASM_ALIGNED_DI_OP
38962 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
38963 #endif
38964
38965 #undef TARGET_PROFILE_BEFORE_PROLOGUE
38966 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
38967
38968 #undef TARGET_ASM_UNALIGNED_HI_OP
38969 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
38970 #undef TARGET_ASM_UNALIGNED_SI_OP
38971 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
38972 #undef TARGET_ASM_UNALIGNED_DI_OP
38973 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
38974
38975 #undef TARGET_PRINT_OPERAND
38976 #define TARGET_PRINT_OPERAND ix86_print_operand
38977 #undef TARGET_PRINT_OPERAND_ADDRESS
38978 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
38979 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
38980 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
38981 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
38982 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
38983
38984 #undef TARGET_SCHED_INIT_GLOBAL
38985 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
38986 #undef TARGET_SCHED_ADJUST_COST
38987 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
38988 #undef TARGET_SCHED_ISSUE_RATE
38989 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
38990 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
38991 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
38992 ia32_multipass_dfa_lookahead
38993
38994 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
38995 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
38996
38997 #ifdef HAVE_AS_TLS
38998 #undef TARGET_HAVE_TLS
38999 #define TARGET_HAVE_TLS true
39000 #endif
39001 #undef TARGET_CANNOT_FORCE_CONST_MEM
39002 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
39003 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
39004 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
39005
39006 #undef TARGET_DELEGITIMIZE_ADDRESS
39007 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
39008
39009 #undef TARGET_MS_BITFIELD_LAYOUT_P
39010 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
39011
39012 #if TARGET_MACHO
39013 #undef TARGET_BINDS_LOCAL_P
39014 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
39015 #endif
39016 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39017 #undef TARGET_BINDS_LOCAL_P
39018 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
39019 #endif
39020
39021 #undef TARGET_ASM_OUTPUT_MI_THUNK
39022 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
39023 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
39024 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
39025
39026 #undef TARGET_ASM_FILE_START
39027 #define TARGET_ASM_FILE_START x86_file_start
39028
39029 #undef TARGET_OPTION_OVERRIDE
39030 #define TARGET_OPTION_OVERRIDE ix86_option_override
39031
39032 #undef TARGET_REGISTER_MOVE_COST
39033 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
39034 #undef TARGET_MEMORY_MOVE_COST
39035 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
39036 #undef TARGET_RTX_COSTS
39037 #define TARGET_RTX_COSTS ix86_rtx_costs
39038 #undef TARGET_ADDRESS_COST
39039 #define TARGET_ADDRESS_COST ix86_address_cost
39040
39041 #undef TARGET_FIXED_CONDITION_CODE_REGS
39042 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
39043 #undef TARGET_CC_MODES_COMPATIBLE
39044 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
39045
39046 #undef TARGET_MACHINE_DEPENDENT_REORG
39047 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
39048
39049 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
39050 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
39051
39052 #undef TARGET_BUILD_BUILTIN_VA_LIST
39053 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
39054
39055 #undef TARGET_ENUM_VA_LIST_P
39056 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
39057
39058 #undef TARGET_FN_ABI_VA_LIST
39059 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
39060
39061 #undef TARGET_CANONICAL_VA_LIST_TYPE
39062 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
39063
39064 #undef TARGET_EXPAND_BUILTIN_VA_START
39065 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
39066
39067 #undef TARGET_MD_ASM_CLOBBERS
39068 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
39069
39070 #undef TARGET_PROMOTE_PROTOTYPES
39071 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
39072 #undef TARGET_STRUCT_VALUE_RTX
39073 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
39074 #undef TARGET_SETUP_INCOMING_VARARGS
39075 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
39076 #undef TARGET_MUST_PASS_IN_STACK
39077 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
39078 #undef TARGET_FUNCTION_ARG_ADVANCE
39079 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
39080 #undef TARGET_FUNCTION_ARG
39081 #define TARGET_FUNCTION_ARG ix86_function_arg
39082 #undef TARGET_FUNCTION_ARG_BOUNDARY
39083 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
39084 #undef TARGET_PASS_BY_REFERENCE
39085 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
39086 #undef TARGET_INTERNAL_ARG_POINTER
39087 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
39088 #undef TARGET_UPDATE_STACK_BOUNDARY
39089 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
39090 #undef TARGET_GET_DRAP_RTX
39091 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
39092 #undef TARGET_STRICT_ARGUMENT_NAMING
39093 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
39094 #undef TARGET_STATIC_CHAIN
39095 #define TARGET_STATIC_CHAIN ix86_static_chain
39096 #undef TARGET_TRAMPOLINE_INIT
39097 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
39098 #undef TARGET_RETURN_POPS_ARGS
39099 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
39100
39101 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
39102 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
39103
39104 #undef TARGET_SCALAR_MODE_SUPPORTED_P
39105 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
39106
39107 #undef TARGET_VECTOR_MODE_SUPPORTED_P
39108 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
39109
39110 #undef TARGET_C_MODE_FOR_SUFFIX
39111 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
39112
39113 #ifdef HAVE_AS_TLS
39114 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
39115 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
39116 #endif
39117
39118 #ifdef SUBTARGET_INSERT_ATTRIBUTES
39119 #undef TARGET_INSERT_ATTRIBUTES
39120 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
39121 #endif
39122
39123 #undef TARGET_MANGLE_TYPE
39124 #define TARGET_MANGLE_TYPE ix86_mangle_type
39125
39126 #if !TARGET_MACHO
39127 #undef TARGET_STACK_PROTECT_FAIL
39128 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
39129 #endif
39130
39131 #undef TARGET_FUNCTION_VALUE
39132 #define TARGET_FUNCTION_VALUE ix86_function_value
39133
39134 #undef TARGET_FUNCTION_VALUE_REGNO_P
39135 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
39136
39137 #undef TARGET_PROMOTE_FUNCTION_MODE
39138 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
39139
39140 #undef TARGET_SECONDARY_RELOAD
39141 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
39142
39143 #undef TARGET_CLASS_MAX_NREGS
39144 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
39145
39146 #undef TARGET_PREFERRED_RELOAD_CLASS
39147 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
39148 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
39149 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
39150 #undef TARGET_CLASS_LIKELY_SPILLED_P
39151 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
39152
39153 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
39154 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
39155 ix86_builtin_vectorization_cost
39156 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
39157 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
39158 ix86_vectorize_vec_perm_const_ok
39159 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
39160 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
39161 ix86_preferred_simd_mode
39162 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
39163 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
39164 ix86_autovectorize_vector_sizes
39165
39166 #undef TARGET_SET_CURRENT_FUNCTION
39167 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
39168
39169 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
39170 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
39171
39172 #undef TARGET_OPTION_SAVE
39173 #define TARGET_OPTION_SAVE ix86_function_specific_save
39174
39175 #undef TARGET_OPTION_RESTORE
39176 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
39177
39178 #undef TARGET_OPTION_PRINT
39179 #define TARGET_OPTION_PRINT ix86_function_specific_print
39180
39181 #undef TARGET_CAN_INLINE_P
39182 #define TARGET_CAN_INLINE_P ix86_can_inline_p
39183
39184 #undef TARGET_EXPAND_TO_RTL_HOOK
39185 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
39186
39187 #undef TARGET_LEGITIMATE_ADDRESS_P
39188 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
39189
39190 #undef TARGET_LEGITIMATE_CONSTANT_P
39191 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
39192
39193 #undef TARGET_FRAME_POINTER_REQUIRED
39194 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
39195
39196 #undef TARGET_CAN_ELIMINATE
39197 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
39198
39199 #undef TARGET_EXTRA_LIVE_ON_ENTRY
39200 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
39201
39202 #undef TARGET_ASM_CODE_END
39203 #define TARGET_ASM_CODE_END ix86_code_end
39204
39205 #undef TARGET_CONDITIONAL_REGISTER_USAGE
39206 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
39207
39208 #if TARGET_MACHO
39209 #undef TARGET_INIT_LIBFUNCS
39210 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
39211 #endif
39212
39213 struct gcc_target targetm = TARGET_INITIALIZER;
39214 \f
39215 #include "gt-i386.h"