re PR target/50038 (redundant zero extensions)
gcc.git: gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "cgraph.h"
51 #include "gimple.h"
52 #include "dwarf2.h"
53 #include "df.h"
54 #include "tm-constrs.h"
55 #include "params.h"
56 #include "cselib.h"
57 #include "debug.h"
58 #include "sched-int.h"
59 #include "sbitmap.h"
60 #include "fibheap.h"
61 #include "opts.h"
62 #include "diagnostic.h"
63
64 enum upper_128bits_state
65 {
66 unknown = 0,
67 unused,
68 used
69 };
70
71 typedef struct block_info_def
72 {
73 /* State of the upper 128bits of AVX registers at exit. */
74 enum upper_128bits_state state;
75 /* TRUE if state of the upper 128bits of AVX registers is unchanged
76 in this block. */
77 bool unchanged;
78 /* TRUE if block has been processed. */
79 bool processed;
80 /* TRUE if block has been scanned. */
81 bool scanned;
82 /* Previous state of the upper 128bits of AVX registers at entry. */
83 enum upper_128bits_state prev;
84 } *block_info;
85
86 #define BLOCK_INFO(B) ((block_info) (B)->aux)
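/* Illustrative sketch: BB->aux is allocated by alloc_aux_for_blocks
   (sizeof (struct block_info_def)) in move_or_delete_vzeroupper below,
   so per-basic-block state is read and written as, for example:

     if (!BLOCK_INFO (bb)->processed)
       BLOCK_INFO (bb)->state = unused;  */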
87
88 enum call_avx256_state
89 {
90 /* Callee returns 256bit AVX register. */
91 callee_return_avx256 = -1,
92 /* Callee returns and passes 256bit AVX register. */
93 callee_return_pass_avx256,
94 /* Callee passes 256bit AVX register. */
95 callee_pass_avx256,
 96   /* Callee neither returns nor passes a 256bit AVX register, or there is
 97      no 256bit AVX register in the function return.  */
98 call_no_avx256,
99 /* vzeroupper intrinsic. */
100 vzeroupper_intrinsic
101 };
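/* Sketch of how these values are consumed: the vzeroupper UNSPEC_VOLATILE
   (presumably attached when the vzeroupper is emitted around a call) carries
   one of them as its first vector element, which move_or_delete_vzeroupper_2
   below reads back as

     avx256 = INTVAL (XVECEXP (pat, 0, 0));

   to decide whether the vzeroupper can be deleted or must be kept.  */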
102
103 /* Check if a 256bit AVX register is referenced in stores. */
104
105 static void
106 check_avx256_stores (rtx dest, const_rtx set, void *data)
107 {
108 if ((REG_P (dest)
109 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
110 || (GET_CODE (set) == SET
111 && REG_P (SET_SRC (set))
112 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
113 {
114 enum upper_128bits_state *state
115 = (enum upper_128bits_state *) data;
116 *state = used;
117 }
118 }
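/* Usage sketch: the callback above is handed to note_stores while scanning
   an insn, as move_or_delete_vzeroupper_2 below does:

     enum upper_128bits_state state = unused;
     note_stores (PATTERN (insn), check_avx256_stores, &state);

   so STATE flips to `used' as soon as a 256bit AVX register is written
   (or copied from).  */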
119
120 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
121 in basic block BB. Delete it if upper 128bit AVX registers are
122 unused. If it isn't deleted, move it to just before a jump insn.
123
124 STATE is state of the upper 128bits of AVX registers at entry. */
125
126 static void
127 move_or_delete_vzeroupper_2 (basic_block bb,
128 enum upper_128bits_state state)
129 {
130 rtx insn, bb_end;
131 rtx vzeroupper_insn = NULL_RTX;
132 rtx pat;
133 int avx256;
134 bool unchanged;
135
136 if (BLOCK_INFO (bb)->unchanged)
137 {
138 if (dump_file)
139 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
140 bb->index, state);
141
142 BLOCK_INFO (bb)->state = state;
143 return;
144 }
145
146 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
147 {
148 if (dump_file)
149 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
150 bb->index, BLOCK_INFO (bb)->state);
151 return;
152 }
153
154 BLOCK_INFO (bb)->prev = state;
155
156 if (dump_file)
157 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
158 bb->index, state);
159
160 unchanged = true;
161
 162   /* BB_END changes if the insn at the end of the block is deleted,
       so cache it up front.  */
163 bb_end = BB_END (bb);
164 insn = BB_HEAD (bb);
165 while (insn != bb_end)
166 {
167 insn = NEXT_INSN (insn);
168
169 if (!NONDEBUG_INSN_P (insn))
170 continue;
171
172 /* Move vzeroupper before jump/call. */
173 if (JUMP_P (insn) || CALL_P (insn))
174 {
175 if (!vzeroupper_insn)
176 continue;
177
178 if (PREV_INSN (insn) != vzeroupper_insn)
179 {
180 if (dump_file)
181 {
182 fprintf (dump_file, "Move vzeroupper after:\n");
183 print_rtl_single (dump_file, PREV_INSN (insn));
184 fprintf (dump_file, "before:\n");
185 print_rtl_single (dump_file, insn);
186 }
187 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
188 PREV_INSN (insn));
189 }
190 vzeroupper_insn = NULL_RTX;
191 continue;
192 }
193
194 pat = PATTERN (insn);
195
196 /* Check insn for vzeroupper intrinsic. */
197 if (GET_CODE (pat) == UNSPEC_VOLATILE
198 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
199 {
200 if (dump_file)
201 {
202 /* Found vzeroupper intrinsic. */
203 fprintf (dump_file, "Found vzeroupper:\n");
204 print_rtl_single (dump_file, insn);
205 }
206 }
207 else
208 {
209 /* Check insn for vzeroall intrinsic. */
210 if (GET_CODE (pat) == PARALLEL
211 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
212 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
213 {
214 state = unused;
215 unchanged = false;
216
217 /* Delete pending vzeroupper insertion. */
218 if (vzeroupper_insn)
219 {
220 delete_insn (vzeroupper_insn);
221 vzeroupper_insn = NULL_RTX;
222 }
223 }
224 else if (state != used)
225 {
226 note_stores (pat, check_avx256_stores, &state);
227 if (state == used)
228 unchanged = false;
229 }
230 continue;
231 }
232
233 /* Process vzeroupper intrinsic. */
234 avx256 = INTVAL (XVECEXP (pat, 0, 0));
235
236 if (state == unused)
237 {
 238	      /* Since the upper 128bits are cleared, the callee cannot be
 239		 passed a 256bit AVX register here.  We only need to check
 240		 whether the callee returns a 256bit AVX register.  */
241 if (avx256 == callee_return_avx256)
242 {
243 state = used;
244 unchanged = false;
245 }
246
247 /* Remove unnecessary vzeroupper since upper 128bits are
248 cleared. */
249 if (dump_file)
250 {
251 fprintf (dump_file, "Delete redundant vzeroupper:\n");
252 print_rtl_single (dump_file, insn);
253 }
254 delete_insn (insn);
255 }
256 else
257 {
258 /* Set state to UNUSED if callee doesn't return 256bit AVX
259 register. */
260 if (avx256 != callee_return_pass_avx256)
261 state = unused;
262
263 if (avx256 == callee_return_pass_avx256
264 || avx256 == callee_pass_avx256)
265 {
 266		  /* Must remove the vzeroupper since a 256bit AVX register
 267		     is passed to the callee.  */
268 if (dump_file)
269 {
270 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
271 print_rtl_single (dump_file, insn);
272 }
273 delete_insn (insn);
274 }
275 else
276 {
277 vzeroupper_insn = insn;
278 unchanged = false;
279 }
280 }
281 }
282
283 BLOCK_INFO (bb)->state = state;
284 BLOCK_INFO (bb)->unchanged = unchanged;
285 BLOCK_INFO (bb)->scanned = true;
286
287 if (dump_file)
288 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
289 bb->index, unchanged ? "unchanged" : "changed",
290 state);
291 }
292
293 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
294 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
 295    as UNUSED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
296 state is changed. */
297
298 static bool
299 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
300 {
301 edge e;
302 edge_iterator ei;
303 enum upper_128bits_state state, old_state, new_state;
304 bool seen_unknown;
305
306 if (dump_file)
307 fprintf (dump_file, " Process [bb %i]: status: %d\n",
308 block->index, BLOCK_INFO (block)->processed);
309
310 if (BLOCK_INFO (block)->processed)
311 return false;
312
313 state = unused;
314
315 /* Check all predecessor edges of this block. */
316 seen_unknown = false;
317 FOR_EACH_EDGE (e, ei, block->preds)
318 {
319 if (e->src == block)
320 continue;
321 switch (BLOCK_INFO (e->src)->state)
322 {
323 case unknown:
324 if (!unknown_is_unused)
 325	      seen_unknown = true;	/* FALLTHRU */
326 case unused:
327 break;
328 case used:
329 state = used;
330 goto done;
331 }
332 }
333
334 if (seen_unknown)
335 state = unknown;
336
337 done:
338 old_state = BLOCK_INFO (block)->state;
339 move_or_delete_vzeroupper_2 (block, state);
340 new_state = BLOCK_INFO (block)->state;
341
342 if (state != unknown || new_state == used)
343 BLOCK_INFO (block)->processed = true;
344
345 /* Need to rescan if the upper 128bits of AVX registers are changed
346 to USED at exit. */
347 if (new_state != old_state)
348 {
349 if (new_state == used)
350 cfun->machine->rescan_vzeroupper_p = 1;
351 return true;
352 }
353 else
354 return false;
355 }
356
357 /* Go through the instruction stream looking for vzeroupper. Delete
358 it if upper 128bit AVX registers are unused. If it isn't deleted,
359 move it to just before a jump insn. */
360
361 static void
362 move_or_delete_vzeroupper (void)
363 {
364 edge e;
365 edge_iterator ei;
366 basic_block bb;
367 fibheap_t worklist, pending, fibheap_swap;
368 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
369 int *bb_order;
370 int *rc_order;
371 int i;
372
373 /* Set up block info for each basic block. */
374 alloc_aux_for_blocks (sizeof (struct block_info_def));
375
376 /* Process outgoing edges of entry point. */
377 if (dump_file)
378 fprintf (dump_file, "Process outgoing edges of entry point\n");
379
380 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
381 {
382 move_or_delete_vzeroupper_2 (e->dest,
383 cfun->machine->caller_pass_avx256_p
384 ? used : unused);
385 BLOCK_INFO (e->dest)->processed = true;
386 }
387
388 /* Compute reverse completion order of depth first search of the CFG
389 so that the data-flow runs faster. */
390 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
391 bb_order = XNEWVEC (int, last_basic_block);
392 pre_and_rev_post_order_compute (NULL, rc_order, false);
393 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
394 bb_order[rc_order[i]] = i;
395 free (rc_order);
396
397 worklist = fibheap_new ();
398 pending = fibheap_new ();
399 visited = sbitmap_alloc (last_basic_block);
400 in_worklist = sbitmap_alloc (last_basic_block);
401 in_pending = sbitmap_alloc (last_basic_block);
402 sbitmap_zero (in_worklist);
403
404 /* Don't check outgoing edges of entry point. */
405 sbitmap_ones (in_pending);
406 FOR_EACH_BB (bb)
407 if (BLOCK_INFO (bb)->processed)
408 RESET_BIT (in_pending, bb->index);
409 else
410 {
411 move_or_delete_vzeroupper_1 (bb, false);
412 fibheap_insert (pending, bb_order[bb->index], bb);
413 }
414
415 if (dump_file)
416 fprintf (dump_file, "Check remaining basic blocks\n");
417
418 while (!fibheap_empty (pending))
419 {
420 fibheap_swap = pending;
421 pending = worklist;
422 worklist = fibheap_swap;
423 sbitmap_swap = in_pending;
424 in_pending = in_worklist;
425 in_worklist = sbitmap_swap;
426
427 sbitmap_zero (visited);
428
429 cfun->machine->rescan_vzeroupper_p = 0;
430
431 while (!fibheap_empty (worklist))
432 {
433 bb = (basic_block) fibheap_extract_min (worklist);
434 RESET_BIT (in_worklist, bb->index);
435 gcc_assert (!TEST_BIT (visited, bb->index));
436 if (!TEST_BIT (visited, bb->index))
437 {
438 edge_iterator ei;
439
440 SET_BIT (visited, bb->index);
441
442 if (move_or_delete_vzeroupper_1 (bb, false))
443 FOR_EACH_EDGE (e, ei, bb->succs)
444 {
445 if (e->dest == EXIT_BLOCK_PTR
446 || BLOCK_INFO (e->dest)->processed)
447 continue;
448
449 if (TEST_BIT (visited, e->dest->index))
450 {
451 if (!TEST_BIT (in_pending, e->dest->index))
452 {
453 /* Send E->DEST to next round. */
454 SET_BIT (in_pending, e->dest->index);
455 fibheap_insert (pending,
456 bb_order[e->dest->index],
457 e->dest);
458 }
459 }
460 else if (!TEST_BIT (in_worklist, e->dest->index))
461 {
462 /* Add E->DEST to current round. */
463 SET_BIT (in_worklist, e->dest->index);
464 fibheap_insert (worklist, bb_order[e->dest->index],
465 e->dest);
466 }
467 }
468 }
469 }
470
471 if (!cfun->machine->rescan_vzeroupper_p)
472 break;
473 }
474
475 free (bb_order);
476 fibheap_delete (worklist);
477 fibheap_delete (pending);
478 sbitmap_free (visited);
479 sbitmap_free (in_worklist);
480 sbitmap_free (in_pending);
481
482 if (dump_file)
483 fprintf (dump_file, "Process remaining basic blocks\n");
484
485 FOR_EACH_BB (bb)
486 move_or_delete_vzeroupper_1 (bb, true);
487
488 free_aux_for_blocks ();
489 }
490
491 static rtx legitimize_dllimport_symbol (rtx, bool);
492
493 #ifndef CHECK_STACK_LIMIT
494 #define CHECK_STACK_LIMIT (-1)
495 #endif
496
497 /* Return index of given mode in mult and division cost tables. */
498 #define MODE_INDEX(mode) \
499 ((mode) == QImode ? 0 \
500 : (mode) == HImode ? 1 \
501 : (mode) == SImode ? 2 \
502 : (mode) == DImode ? 3 \
503 : 4)
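/* Usage sketch, assuming the per-mode arrays below (e.g. the "cost of
   starting multiply" table, here assumed to be the mult_init field declared
   in i386.h) are indexed with this macro:

     cost = ix86_cost->mult_init[MODE_INDEX (mode)];

   QImode..DImode select slots 0..3; any other mode falls into the
   "other" slot 4.  */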
504
505 /* Processor costs (relative to an add) */
506 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
507 #define COSTS_N_BYTES(N) ((N) * 2)
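/* With the assumption above (COSTS_N_INSNS (N) == (N) * 4), this makes
   COSTS_N_BYTES (2) == COSTS_N_INSNS (1): a 2-byte add is the unit cost
   when tuning for size.  */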
508
509 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
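/* Reading the stringop entries in the cost tables below: the first member
   is the algorithm used when the block size is not known at compile time,
   followed by {max_size, algorithm} pairs for known sizes, with max_size -1
   meaning "everything larger".  For example

     {libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}

   uses rep movsl up to 256 bytes and a library call beyond that.  Each cost
   table carries two such entries per operation (presumably the 32bit and
   64bit variants); DUMMY_STRINGOP_ALGS fills a slot that does not apply.  */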
510
511 const
512 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
513 COSTS_N_BYTES (2), /* cost of an add instruction */
514 COSTS_N_BYTES (3), /* cost of a lea instruction */
515 COSTS_N_BYTES (2), /* variable shift costs */
516 COSTS_N_BYTES (3), /* constant shift costs */
517 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
518 COSTS_N_BYTES (3), /* HI */
519 COSTS_N_BYTES (3), /* SI */
520 COSTS_N_BYTES (3), /* DI */
521 COSTS_N_BYTES (5)}, /* other */
522 0, /* cost of multiply per each bit set */
523 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
524 COSTS_N_BYTES (3), /* HI */
525 COSTS_N_BYTES (3), /* SI */
526 COSTS_N_BYTES (3), /* DI */
527 COSTS_N_BYTES (5)}, /* other */
528 COSTS_N_BYTES (3), /* cost of movsx */
529 COSTS_N_BYTES (3), /* cost of movzx */
530 0, /* "large" insn */
531 2, /* MOVE_RATIO */
532 2, /* cost for loading QImode using movzbl */
533 {2, 2, 2}, /* cost of loading integer registers
534 in QImode, HImode and SImode.
535 Relative to reg-reg move (2). */
536 {2, 2, 2}, /* cost of storing integer registers */
537 2, /* cost of reg,reg fld/fst */
538 {2, 2, 2}, /* cost of loading fp registers
539 in SFmode, DFmode and XFmode */
540 {2, 2, 2}, /* cost of storing fp registers
541 in SFmode, DFmode and XFmode */
542 3, /* cost of moving MMX register */
543 {3, 3}, /* cost of loading MMX registers
544 in SImode and DImode */
545 {3, 3}, /* cost of storing MMX registers
546 in SImode and DImode */
547 3, /* cost of moving SSE register */
548 {3, 3, 3}, /* cost of loading SSE registers
549 in SImode, DImode and TImode */
550 {3, 3, 3}, /* cost of storing SSE registers
551 in SImode, DImode and TImode */
552 3, /* MMX or SSE register to integer */
553 0, /* size of l1 cache */
554 0, /* size of l2 cache */
555 0, /* size of prefetch block */
556 0, /* number of parallel prefetches */
557 2, /* Branch cost */
558 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
559 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
560 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
561 COSTS_N_BYTES (2), /* cost of FABS instruction. */
562 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
563 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
564 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
565 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 1, /* scalar_stmt_cost. */
569 1, /* scalar load_cost. */
570 1, /* scalar_store_cost. */
571 1, /* vec_stmt_cost. */
572 1, /* vec_to_scalar_cost. */
573 1, /* scalar_to_vec_cost. */
574 1, /* vec_align_load_cost. */
575 1, /* vec_unalign_load_cost. */
576 1, /* vec_store_cost. */
577 1, /* cond_taken_branch_cost. */
578 1, /* cond_not_taken_branch_cost. */
579 };
580
581 /* Processor costs (relative to an add) */
582 static const
583 struct processor_costs i386_cost = { /* 386 specific costs */
584 COSTS_N_INSNS (1), /* cost of an add instruction */
585 COSTS_N_INSNS (1), /* cost of a lea instruction */
586 COSTS_N_INSNS (3), /* variable shift costs */
587 COSTS_N_INSNS (2), /* constant shift costs */
588 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
589 COSTS_N_INSNS (6), /* HI */
590 COSTS_N_INSNS (6), /* SI */
591 COSTS_N_INSNS (6), /* DI */
592 COSTS_N_INSNS (6)}, /* other */
593 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
594 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
595 COSTS_N_INSNS (23), /* HI */
596 COSTS_N_INSNS (23), /* SI */
597 COSTS_N_INSNS (23), /* DI */
598 COSTS_N_INSNS (23)}, /* other */
599 COSTS_N_INSNS (3), /* cost of movsx */
600 COSTS_N_INSNS (2), /* cost of movzx */
601 15, /* "large" insn */
602 3, /* MOVE_RATIO */
603 4, /* cost for loading QImode using movzbl */
604 {2, 4, 2}, /* cost of loading integer registers
605 in QImode, HImode and SImode.
606 Relative to reg-reg move (2). */
607 {2, 4, 2}, /* cost of storing integer registers */
608 2, /* cost of reg,reg fld/fst */
609 {8, 8, 8}, /* cost of loading fp registers
610 in SFmode, DFmode and XFmode */
611 {8, 8, 8}, /* cost of storing fp registers
612 in SFmode, DFmode and XFmode */
613 2, /* cost of moving MMX register */
614 {4, 8}, /* cost of loading MMX registers
615 in SImode and DImode */
616 {4, 8}, /* cost of storing MMX registers
617 in SImode and DImode */
618 2, /* cost of moving SSE register */
619 {4, 8, 16}, /* cost of loading SSE registers
620 in SImode, DImode and TImode */
621 {4, 8, 16}, /* cost of storing SSE registers
622 in SImode, DImode and TImode */
623 3, /* MMX or SSE register to integer */
624 0, /* size of l1 cache */
625 0, /* size of l2 cache */
626 0, /* size of prefetch block */
627 0, /* number of parallel prefetches */
628 1, /* Branch cost */
629 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
630 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
631 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
632 COSTS_N_INSNS (22), /* cost of FABS instruction. */
633 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
634 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
635 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
636 DUMMY_STRINGOP_ALGS},
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 1, /* scalar_stmt_cost. */
640 1, /* scalar load_cost. */
641 1, /* scalar_store_cost. */
642 1, /* vec_stmt_cost. */
643 1, /* vec_to_scalar_cost. */
644 1, /* scalar_to_vec_cost. */
645 1, /* vec_align_load_cost. */
646 2, /* vec_unalign_load_cost. */
647 1, /* vec_store_cost. */
648 3, /* cond_taken_branch_cost. */
649 1, /* cond_not_taken_branch_cost. */
650 };
651
652 static const
653 struct processor_costs i486_cost = { /* 486 specific costs */
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (1), /* cost of a lea instruction */
656 COSTS_N_INSNS (3), /* variable shift costs */
657 COSTS_N_INSNS (2), /* constant shift costs */
658 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (12), /* HI */
660 COSTS_N_INSNS (12), /* SI */
661 COSTS_N_INSNS (12), /* DI */
662 COSTS_N_INSNS (12)}, /* other */
663 1, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (40), /* HI */
666 COSTS_N_INSNS (40), /* SI */
667 COSTS_N_INSNS (40), /* DI */
668 COSTS_N_INSNS (40)}, /* other */
669 COSTS_N_INSNS (3), /* cost of movsx */
670 COSTS_N_INSNS (2), /* cost of movzx */
671 15, /* "large" insn */
672 3, /* MOVE_RATIO */
673 4, /* cost for loading QImode using movzbl */
674 {2, 4, 2}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {2, 4, 2}, /* cost of storing integer registers */
678 2, /* cost of reg,reg fld/fst */
679 {8, 8, 8}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {8, 8, 8}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {4, 8}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {4, 8}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {4, 8, 16}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {4, 8, 16}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 3, /* MMX or SSE register to integer */
694 4, /* size of l1 cache. 486 has 8kB cache
695 shared for code and data, so 4kB is
696 not really precise. */
697 4, /* size of l2 cache */
698 0, /* size of prefetch block */
699 0, /* number of parallel prefetches */
700 1, /* Branch cost */
701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
702 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
703 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
704 COSTS_N_INSNS (3), /* cost of FABS instruction. */
705 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
706 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
707 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
708 DUMMY_STRINGOP_ALGS},
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 1, /* scalar_stmt_cost. */
712 1, /* scalar load_cost. */
713 1, /* scalar_store_cost. */
714 1, /* vec_stmt_cost. */
715 1, /* vec_to_scalar_cost. */
716 1, /* scalar_to_vec_cost. */
717 1, /* vec_align_load_cost. */
718 2, /* vec_unalign_load_cost. */
719 1, /* vec_store_cost. */
720 3, /* cond_taken_branch_cost. */
721 1, /* cond_not_taken_branch_cost. */
722 };
723
724 static const
725 struct processor_costs pentium_cost = {
726 COSTS_N_INSNS (1), /* cost of an add instruction */
727 COSTS_N_INSNS (1), /* cost of a lea instruction */
728 COSTS_N_INSNS (4), /* variable shift costs */
729 COSTS_N_INSNS (1), /* constant shift costs */
730 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
731 COSTS_N_INSNS (11), /* HI */
732 COSTS_N_INSNS (11), /* SI */
733 COSTS_N_INSNS (11), /* DI */
734 COSTS_N_INSNS (11)}, /* other */
735 0, /* cost of multiply per each bit set */
736 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
737 COSTS_N_INSNS (25), /* HI */
738 COSTS_N_INSNS (25), /* SI */
739 COSTS_N_INSNS (25), /* DI */
740 COSTS_N_INSNS (25)}, /* other */
741 COSTS_N_INSNS (3), /* cost of movsx */
742 COSTS_N_INSNS (2), /* cost of movzx */
743 8, /* "large" insn */
744 6, /* MOVE_RATIO */
745 6, /* cost for loading QImode using movzbl */
746 {2, 4, 2}, /* cost of loading integer registers
747 in QImode, HImode and SImode.
748 Relative to reg-reg move (2). */
749 {2, 4, 2}, /* cost of storing integer registers */
750 2, /* cost of reg,reg fld/fst */
751 {2, 2, 6}, /* cost of loading fp registers
752 in SFmode, DFmode and XFmode */
753 {4, 4, 6}, /* cost of storing fp registers
754 in SFmode, DFmode and XFmode */
755 8, /* cost of moving MMX register */
756 {8, 8}, /* cost of loading MMX registers
757 in SImode and DImode */
758 {8, 8}, /* cost of storing MMX registers
759 in SImode and DImode */
760 2, /* cost of moving SSE register */
761 {4, 8, 16}, /* cost of loading SSE registers
762 in SImode, DImode and TImode */
763 {4, 8, 16}, /* cost of storing SSE registers
764 in SImode, DImode and TImode */
765 3, /* MMX or SSE register to integer */
766 8, /* size of l1 cache. */
767 8, /* size of l2 cache */
768 0, /* size of prefetch block */
769 0, /* number of parallel prefetches */
770 2, /* Branch cost */
771 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
772 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
773 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
774 COSTS_N_INSNS (1), /* cost of FABS instruction. */
775 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
776 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
777 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
778 DUMMY_STRINGOP_ALGS},
779 {{libcall, {{-1, rep_prefix_4_byte}}},
780 DUMMY_STRINGOP_ALGS},
781 1, /* scalar_stmt_cost. */
782 1, /* scalar load_cost. */
783 1, /* scalar_store_cost. */
784 1, /* vec_stmt_cost. */
785 1, /* vec_to_scalar_cost. */
786 1, /* scalar_to_vec_cost. */
787 1, /* vec_align_load_cost. */
788 2, /* vec_unalign_load_cost. */
789 1, /* vec_store_cost. */
790 3, /* cond_taken_branch_cost. */
791 1, /* cond_not_taken_branch_cost. */
792 };
793
794 static const
795 struct processor_costs pentiumpro_cost = {
796 COSTS_N_INSNS (1), /* cost of an add instruction */
797 COSTS_N_INSNS (1), /* cost of a lea instruction */
798 COSTS_N_INSNS (1), /* variable shift costs */
799 COSTS_N_INSNS (1), /* constant shift costs */
800 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
801 COSTS_N_INSNS (4), /* HI */
802 COSTS_N_INSNS (4), /* SI */
803 COSTS_N_INSNS (4), /* DI */
804 COSTS_N_INSNS (4)}, /* other */
805 0, /* cost of multiply per each bit set */
806 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
807 COSTS_N_INSNS (17), /* HI */
808 COSTS_N_INSNS (17), /* SI */
809 COSTS_N_INSNS (17), /* DI */
810 COSTS_N_INSNS (17)}, /* other */
811 COSTS_N_INSNS (1), /* cost of movsx */
812 COSTS_N_INSNS (1), /* cost of movzx */
813 8, /* "large" insn */
814 6, /* MOVE_RATIO */
815 2, /* cost for loading QImode using movzbl */
816 {4, 4, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 {2, 2, 2}, /* cost of storing integer registers */
820 2, /* cost of reg,reg fld/fst */
821 {2, 2, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 {4, 4, 6}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 2, /* cost of moving MMX register */
826 {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 2, /* cost of moving SSE register */
831 {2, 2, 8}, /* cost of loading SSE registers
832 in SImode, DImode and TImode */
833 {2, 2, 8}, /* cost of storing SSE registers
834 in SImode, DImode and TImode */
835 3, /* MMX or SSE register to integer */
836 8, /* size of l1 cache. */
837 256, /* size of l2 cache */
838 32, /* size of prefetch block */
839 6, /* number of parallel prefetches */
840 2, /* Branch cost */
841 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
842 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
843 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
844 COSTS_N_INSNS (2), /* cost of FABS instruction. */
845 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
846 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
847 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 848	   (we ensure the alignment).  For small blocks an inline loop is still a
 849	   noticeable win; for bigger blocks either rep movsl or rep movsb is the
 850	   way to go.  Rep movsb apparently has a more expensive startup time in the
 851	   CPU, but after 4K the difference is down in the noise.  */
852 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
853 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
854 DUMMY_STRINGOP_ALGS},
855 {{rep_prefix_4_byte, {{1024, unrolled_loop},
856 {8192, rep_prefix_4_byte}, {-1, libcall}}},
857 DUMMY_STRINGOP_ALGS},
858 1, /* scalar_stmt_cost. */
859 1, /* scalar load_cost. */
860 1, /* scalar_store_cost. */
861 1, /* vec_stmt_cost. */
862 1, /* vec_to_scalar_cost. */
863 1, /* scalar_to_vec_cost. */
864 1, /* vec_align_load_cost. */
865 2, /* vec_unalign_load_cost. */
866 1, /* vec_store_cost. */
867 3, /* cond_taken_branch_cost. */
868 1, /* cond_not_taken_branch_cost. */
869 };
870
871 static const
872 struct processor_costs geode_cost = {
873 COSTS_N_INSNS (1), /* cost of an add instruction */
874 COSTS_N_INSNS (1), /* cost of a lea instruction */
875 COSTS_N_INSNS (2), /* variable shift costs */
876 COSTS_N_INSNS (1), /* constant shift costs */
877 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
878 COSTS_N_INSNS (4), /* HI */
879 COSTS_N_INSNS (7), /* SI */
880 COSTS_N_INSNS (7), /* DI */
881 COSTS_N_INSNS (7)}, /* other */
882 0, /* cost of multiply per each bit set */
883 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
884 COSTS_N_INSNS (23), /* HI */
885 COSTS_N_INSNS (39), /* SI */
886 COSTS_N_INSNS (39), /* DI */
887 COSTS_N_INSNS (39)}, /* other */
888 COSTS_N_INSNS (1), /* cost of movsx */
889 COSTS_N_INSNS (1), /* cost of movzx */
890 8, /* "large" insn */
891 4, /* MOVE_RATIO */
892 1, /* cost for loading QImode using movzbl */
893 {1, 1, 1}, /* cost of loading integer registers
894 in QImode, HImode and SImode.
895 Relative to reg-reg move (2). */
896 {1, 1, 1}, /* cost of storing integer registers */
897 1, /* cost of reg,reg fld/fst */
898 {1, 1, 1}, /* cost of loading fp registers
899 in SFmode, DFmode and XFmode */
900 {4, 6, 6}, /* cost of storing fp registers
901 in SFmode, DFmode and XFmode */
902
903 1, /* cost of moving MMX register */
904 {1, 1}, /* cost of loading MMX registers
905 in SImode and DImode */
906 {1, 1}, /* cost of storing MMX registers
907 in SImode and DImode */
908 1, /* cost of moving SSE register */
909 {1, 1, 1}, /* cost of loading SSE registers
910 in SImode, DImode and TImode */
911 {1, 1, 1}, /* cost of storing SSE registers
912 in SImode, DImode and TImode */
913 1, /* MMX or SSE register to integer */
914 64, /* size of l1 cache. */
915 128, /* size of l2 cache. */
916 32, /* size of prefetch block */
917 1, /* number of parallel prefetches */
918 1, /* Branch cost */
919 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
920 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
921 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
922 COSTS_N_INSNS (1), /* cost of FABS instruction. */
923 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
924 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
925 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
926 DUMMY_STRINGOP_ALGS},
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 1, /* scalar_stmt_cost. */
930 1, /* scalar load_cost. */
931 1, /* scalar_store_cost. */
932 1, /* vec_stmt_cost. */
933 1, /* vec_to_scalar_cost. */
934 1, /* scalar_to_vec_cost. */
935 1, /* vec_align_load_cost. */
936 2, /* vec_unalign_load_cost. */
937 1, /* vec_store_cost. */
938 3, /* cond_taken_branch_cost. */
939 1, /* cond_not_taken_branch_cost. */
940 };
941
942 static const
943 struct processor_costs k6_cost = {
944 COSTS_N_INSNS (1), /* cost of an add instruction */
945 COSTS_N_INSNS (2), /* cost of a lea instruction */
946 COSTS_N_INSNS (1), /* variable shift costs */
947 COSTS_N_INSNS (1), /* constant shift costs */
948 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
949 COSTS_N_INSNS (3), /* HI */
950 COSTS_N_INSNS (3), /* SI */
951 COSTS_N_INSNS (3), /* DI */
952 COSTS_N_INSNS (3)}, /* other */
953 0, /* cost of multiply per each bit set */
954 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
955 COSTS_N_INSNS (18), /* HI */
956 COSTS_N_INSNS (18), /* SI */
957 COSTS_N_INSNS (18), /* DI */
958 COSTS_N_INSNS (18)}, /* other */
959 COSTS_N_INSNS (2), /* cost of movsx */
960 COSTS_N_INSNS (2), /* cost of movzx */
961 8, /* "large" insn */
962 4, /* MOVE_RATIO */
963 3, /* cost for loading QImode using movzbl */
964 {4, 5, 4}, /* cost of loading integer registers
965 in QImode, HImode and SImode.
966 Relative to reg-reg move (2). */
967 {2, 3, 2}, /* cost of storing integer registers */
968 4, /* cost of reg,reg fld/fst */
969 {6, 6, 6}, /* cost of loading fp registers
970 in SFmode, DFmode and XFmode */
971 {4, 4, 4}, /* cost of storing fp registers
972 in SFmode, DFmode and XFmode */
973 2, /* cost of moving MMX register */
974 {2, 2}, /* cost of loading MMX registers
975 in SImode and DImode */
976 {2, 2}, /* cost of storing MMX registers
977 in SImode and DImode */
978 2, /* cost of moving SSE register */
979 {2, 2, 8}, /* cost of loading SSE registers
980 in SImode, DImode and TImode */
981 {2, 2, 8}, /* cost of storing SSE registers
982 in SImode, DImode and TImode */
983 6, /* MMX or SSE register to integer */
984 32, /* size of l1 cache. */
985 32, /* size of l2 cache. Some models
986 have integrated l2 cache, but
987 optimizing for k6 is not important
988 enough to worry about that. */
989 32, /* size of prefetch block */
990 1, /* number of parallel prefetches */
991 1, /* Branch cost */
992 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
993 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
994 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
995 COSTS_N_INSNS (2), /* cost of FABS instruction. */
996 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
997 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
998 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
999 DUMMY_STRINGOP_ALGS},
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 1, /* scalar_stmt_cost. */
1003 1, /* scalar load_cost. */
1004 1, /* scalar_store_cost. */
1005 1, /* vec_stmt_cost. */
1006 1, /* vec_to_scalar_cost. */
1007 1, /* scalar_to_vec_cost. */
1008 1, /* vec_align_load_cost. */
1009 2, /* vec_unalign_load_cost. */
1010 1, /* vec_store_cost. */
1011 3, /* cond_taken_branch_cost. */
1012 1, /* cond_not_taken_branch_cost. */
1013 };
1014
1015 static const
1016 struct processor_costs athlon_cost = {
1017 COSTS_N_INSNS (1), /* cost of an add instruction */
1018 COSTS_N_INSNS (2), /* cost of a lea instruction */
1019 COSTS_N_INSNS (1), /* variable shift costs */
1020 COSTS_N_INSNS (1), /* constant shift costs */
1021 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1022 COSTS_N_INSNS (5), /* HI */
1023 COSTS_N_INSNS (5), /* SI */
1024 COSTS_N_INSNS (5), /* DI */
1025 COSTS_N_INSNS (5)}, /* other */
1026 0, /* cost of multiply per each bit set */
1027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1028 COSTS_N_INSNS (26), /* HI */
1029 COSTS_N_INSNS (42), /* SI */
1030 COSTS_N_INSNS (74), /* DI */
1031 COSTS_N_INSNS (74)}, /* other */
1032 COSTS_N_INSNS (1), /* cost of movsx */
1033 COSTS_N_INSNS (1), /* cost of movzx */
1034 8, /* "large" insn */
1035 9, /* MOVE_RATIO */
1036 4, /* cost for loading QImode using movzbl */
1037 {3, 4, 3}, /* cost of loading integer registers
1038 in QImode, HImode and SImode.
1039 Relative to reg-reg move (2). */
1040 {3, 4, 3}, /* cost of storing integer registers */
1041 4, /* cost of reg,reg fld/fst */
1042 {4, 4, 12}, /* cost of loading fp registers
1043 in SFmode, DFmode and XFmode */
1044 {6, 6, 8}, /* cost of storing fp registers
1045 in SFmode, DFmode and XFmode */
1046 2, /* cost of moving MMX register */
1047 {4, 4}, /* cost of loading MMX registers
1048 in SImode and DImode */
1049 {4, 4}, /* cost of storing MMX registers
1050 in SImode and DImode */
1051 2, /* cost of moving SSE register */
1052 {4, 4, 6}, /* cost of loading SSE registers
1053 in SImode, DImode and TImode */
1054 {4, 4, 5}, /* cost of storing SSE registers
1055 in SImode, DImode and TImode */
1056 5, /* MMX or SSE register to integer */
1057 64, /* size of l1 cache. */
1058 256, /* size of l2 cache. */
1059 64, /* size of prefetch block */
1060 6, /* number of parallel prefetches */
1061 5, /* Branch cost */
1062 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1063 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1064 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1065 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1066 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1067 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1068 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1069 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1070 128 bytes for memset. */
1071 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1072 DUMMY_STRINGOP_ALGS},
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 1, /* scalar_stmt_cost. */
1076 1, /* scalar load_cost. */
1077 1, /* scalar_store_cost. */
1078 1, /* vec_stmt_cost. */
1079 1, /* vec_to_scalar_cost. */
1080 1, /* scalar_to_vec_cost. */
1081 1, /* vec_align_load_cost. */
1082 2, /* vec_unalign_load_cost. */
1083 1, /* vec_store_cost. */
1084 3, /* cond_taken_branch_cost. */
1085 1, /* cond_not_taken_branch_cost. */
1086 };
1087
1088 static const
1089 struct processor_costs k8_cost = {
1090 COSTS_N_INSNS (1), /* cost of an add instruction */
1091 COSTS_N_INSNS (2), /* cost of a lea instruction */
1092 COSTS_N_INSNS (1), /* variable shift costs */
1093 COSTS_N_INSNS (1), /* constant shift costs */
1094 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1095 COSTS_N_INSNS (4), /* HI */
1096 COSTS_N_INSNS (3), /* SI */
1097 COSTS_N_INSNS (4), /* DI */
1098 COSTS_N_INSNS (5)}, /* other */
1099 0, /* cost of multiply per each bit set */
1100 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1101 COSTS_N_INSNS (26), /* HI */
1102 COSTS_N_INSNS (42), /* SI */
1103 COSTS_N_INSNS (74), /* DI */
1104 COSTS_N_INSNS (74)}, /* other */
1105 COSTS_N_INSNS (1), /* cost of movsx */
1106 COSTS_N_INSNS (1), /* cost of movzx */
1107 8, /* "large" insn */
1108 9, /* MOVE_RATIO */
1109 4, /* cost for loading QImode using movzbl */
1110 {3, 4, 3}, /* cost of loading integer registers
1111 in QImode, HImode and SImode.
1112 Relative to reg-reg move (2). */
1113 {3, 4, 3}, /* cost of storing integer registers */
1114 4, /* cost of reg,reg fld/fst */
1115 {4, 4, 12}, /* cost of loading fp registers
1116 in SFmode, DFmode and XFmode */
1117 {6, 6, 8}, /* cost of storing fp registers
1118 in SFmode, DFmode and XFmode */
1119 2, /* cost of moving MMX register */
1120 {3, 3}, /* cost of loading MMX registers
1121 in SImode and DImode */
1122 {4, 4}, /* cost of storing MMX registers
1123 in SImode and DImode */
1124 2, /* cost of moving SSE register */
1125 {4, 3, 6}, /* cost of loading SSE registers
1126 in SImode, DImode and TImode */
1127 {4, 4, 5}, /* cost of storing SSE registers
1128 in SImode, DImode and TImode */
1129 5, /* MMX or SSE register to integer */
1130 64, /* size of l1 cache. */
1131 512, /* size of l2 cache. */
1132 64, /* size of prefetch block */
1133 /* New AMD processors never drop prefetches; if they cannot be performed
1134 immediately, they are queued. We set number of simultaneous prefetches
1135 to a large constant to reflect this (it probably is not a good idea not
1136 to limit number of prefetches at all, as their execution also takes some
1137 time). */
1138 100, /* number of parallel prefetches */
1139 3, /* Branch cost */
1140 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1141 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1142 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1143 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1144 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1145 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1146 /* K8 has optimized REP instruction for medium sized blocks, but for very
1147 small blocks it is better to use loop. For large blocks, libcall can
 1148	   do nontemporal accesses and beat inline code considerably.  */
1149 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1150 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1151 {{libcall, {{8, loop}, {24, unrolled_loop},
1152 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1153 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1154 4, /* scalar_stmt_cost. */
1155 2, /* scalar load_cost. */
1156 2, /* scalar_store_cost. */
1157 5, /* vec_stmt_cost. */
1158 0, /* vec_to_scalar_cost. */
1159 2, /* scalar_to_vec_cost. */
1160 2, /* vec_align_load_cost. */
1161 3, /* vec_unalign_load_cost. */
1162 3, /* vec_store_cost. */
1163 3, /* cond_taken_branch_cost. */
1164 2, /* cond_not_taken_branch_cost. */
1165 };
1166
1167 struct processor_costs amdfam10_cost = {
1168 COSTS_N_INSNS (1), /* cost of an add instruction */
1169 COSTS_N_INSNS (2), /* cost of a lea instruction */
1170 COSTS_N_INSNS (1), /* variable shift costs */
1171 COSTS_N_INSNS (1), /* constant shift costs */
1172 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1173 COSTS_N_INSNS (4), /* HI */
1174 COSTS_N_INSNS (3), /* SI */
1175 COSTS_N_INSNS (4), /* DI */
1176 COSTS_N_INSNS (5)}, /* other */
1177 0, /* cost of multiply per each bit set */
1178 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1179 COSTS_N_INSNS (35), /* HI */
1180 COSTS_N_INSNS (51), /* SI */
1181 COSTS_N_INSNS (83), /* DI */
1182 COSTS_N_INSNS (83)}, /* other */
1183 COSTS_N_INSNS (1), /* cost of movsx */
1184 COSTS_N_INSNS (1), /* cost of movzx */
1185 8, /* "large" insn */
1186 9, /* MOVE_RATIO */
1187 4, /* cost for loading QImode using movzbl */
1188 {3, 4, 3}, /* cost of loading integer registers
1189 in QImode, HImode and SImode.
1190 Relative to reg-reg move (2). */
1191 {3, 4, 3}, /* cost of storing integer registers */
1192 4, /* cost of reg,reg fld/fst */
1193 {4, 4, 12}, /* cost of loading fp registers
1194 in SFmode, DFmode and XFmode */
1195 {6, 6, 8}, /* cost of storing fp registers
1196 in SFmode, DFmode and XFmode */
1197 2, /* cost of moving MMX register */
1198 {3, 3}, /* cost of loading MMX registers
1199 in SImode and DImode */
1200 {4, 4}, /* cost of storing MMX registers
1201 in SImode and DImode */
1202 2, /* cost of moving SSE register */
1203 {4, 4, 3}, /* cost of loading SSE registers
1204 in SImode, DImode and TImode */
1205 {4, 4, 5}, /* cost of storing SSE registers
1206 in SImode, DImode and TImode */
1207 3, /* MMX or SSE register to integer */
1208 /* On K8:
1209 MOVD reg64, xmmreg Double FSTORE 4
1210 MOVD reg32, xmmreg Double FSTORE 4
1211 On AMDFAM10:
1212 MOVD reg64, xmmreg Double FADD 3
1213 1/1 1/1
1214 MOVD reg32, xmmreg Double FADD 3
1215 1/1 1/1 */
1216 64, /* size of l1 cache. */
1217 512, /* size of l2 cache. */
1218 64, /* size of prefetch block */
1219 /* New AMD processors never drop prefetches; if they cannot be performed
1220 immediately, they are queued. We set number of simultaneous prefetches
1221 to a large constant to reflect this (it probably is not a good idea not
1222 to limit number of prefetches at all, as their execution also takes some
1223 time). */
1224 100, /* number of parallel prefetches */
1225 2, /* Branch cost */
1226 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1227 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1228 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1229 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1230 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1231 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1232
1233 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
1234 very small blocks it is better to use loop. For large blocks, libcall can
 1235	   do nontemporal accesses and beat inline code considerably.  */
1236 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1237 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1238 {{libcall, {{8, loop}, {24, unrolled_loop},
1239 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1240 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1241 4, /* scalar_stmt_cost. */
1242 2, /* scalar load_cost. */
1243 2, /* scalar_store_cost. */
1244 6, /* vec_stmt_cost. */
1245 0, /* vec_to_scalar_cost. */
1246 2, /* scalar_to_vec_cost. */
1247 2, /* vec_align_load_cost. */
1248 2, /* vec_unalign_load_cost. */
1249 2, /* vec_store_cost. */
1250 2, /* cond_taken_branch_cost. */
1251 1, /* cond_not_taken_branch_cost. */
1252 };
1253
1254 struct processor_costs bdver1_cost = {
1255 COSTS_N_INSNS (1), /* cost of an add instruction */
1256 COSTS_N_INSNS (1), /* cost of a lea instruction */
1257 COSTS_N_INSNS (1), /* variable shift costs */
1258 COSTS_N_INSNS (1), /* constant shift costs */
1259 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1260 COSTS_N_INSNS (4), /* HI */
1261 COSTS_N_INSNS (4), /* SI */
1262 COSTS_N_INSNS (6), /* DI */
1263 COSTS_N_INSNS (6)}, /* other */
1264 0, /* cost of multiply per each bit set */
1265 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1266 COSTS_N_INSNS (35), /* HI */
1267 COSTS_N_INSNS (51), /* SI */
1268 COSTS_N_INSNS (83), /* DI */
1269 COSTS_N_INSNS (83)}, /* other */
1270 COSTS_N_INSNS (1), /* cost of movsx */
1271 COSTS_N_INSNS (1), /* cost of movzx */
1272 8, /* "large" insn */
1273 9, /* MOVE_RATIO */
1274 4, /* cost for loading QImode using movzbl */
1275 {5, 5, 4}, /* cost of loading integer registers
1276 in QImode, HImode and SImode.
1277 Relative to reg-reg move (2). */
1278 {4, 4, 4}, /* cost of storing integer registers */
1279 2, /* cost of reg,reg fld/fst */
1280 {5, 5, 12}, /* cost of loading fp registers
1281 in SFmode, DFmode and XFmode */
1282 {4, 4, 8}, /* cost of storing fp registers
1283 in SFmode, DFmode and XFmode */
1284 2, /* cost of moving MMX register */
1285 {4, 4}, /* cost of loading MMX registers
1286 in SImode and DImode */
1287 {4, 4}, /* cost of storing MMX registers
1288 in SImode and DImode */
1289 2, /* cost of moving SSE register */
1290 {4, 4, 4}, /* cost of loading SSE registers
1291 in SImode, DImode and TImode */
1292 {4, 4, 4}, /* cost of storing SSE registers
1293 in SImode, DImode and TImode */
1294 2, /* MMX or SSE register to integer */
1295 /* On K8:
1296 MOVD reg64, xmmreg Double FSTORE 4
1297 MOVD reg32, xmmreg Double FSTORE 4
1298 On AMDFAM10:
1299 MOVD reg64, xmmreg Double FADD 3
1300 1/1 1/1
1301 MOVD reg32, xmmreg Double FADD 3
1302 1/1 1/1 */
1303 16, /* size of l1 cache. */
1304 2048, /* size of l2 cache. */
1305 64, /* size of prefetch block */
1306 /* New AMD processors never drop prefetches; if they cannot be performed
1307 immediately, they are queued. We set number of simultaneous prefetches
1308 to a large constant to reflect this (it probably is not a good idea not
1309 to limit number of prefetches at all, as their execution also takes some
1310 time). */
1311 100, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1319
1320 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
1321 very small blocks it is better to use loop. For large blocks, libcall
 1322	   can do nontemporal accesses and beat inline code considerably.  */
1323 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1324 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1325 {{libcall, {{8, loop}, {24, unrolled_loop},
1326 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1327 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1328 6, /* scalar_stmt_cost. */
1329 4, /* scalar load_cost. */
1330 4, /* scalar_store_cost. */
1331 6, /* vec_stmt_cost. */
1332 0, /* vec_to_scalar_cost. */
1333 2, /* scalar_to_vec_cost. */
1334 4, /* vec_align_load_cost. */
1335 4, /* vec_unalign_load_cost. */
1336 4, /* vec_store_cost. */
1337 2, /* cond_taken_branch_cost. */
1338 1, /* cond_not_taken_branch_cost. */
1339 };
1340
1341 struct processor_costs bdver2_cost = {
1342 COSTS_N_INSNS (1), /* cost of an add instruction */
1343 COSTS_N_INSNS (1), /* cost of a lea instruction */
1344 COSTS_N_INSNS (1), /* variable shift costs */
1345 COSTS_N_INSNS (1), /* constant shift costs */
1346 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1347 COSTS_N_INSNS (4), /* HI */
1348 COSTS_N_INSNS (4), /* SI */
1349 COSTS_N_INSNS (6), /* DI */
1350 COSTS_N_INSNS (6)}, /* other */
1351 0, /* cost of multiply per each bit set */
1352 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1353 COSTS_N_INSNS (35), /* HI */
1354 COSTS_N_INSNS (51), /* SI */
1355 COSTS_N_INSNS (83), /* DI */
1356 COSTS_N_INSNS (83)}, /* other */
1357 COSTS_N_INSNS (1), /* cost of movsx */
1358 COSTS_N_INSNS (1), /* cost of movzx */
1359 8, /* "large" insn */
1360 9, /* MOVE_RATIO */
1361 4, /* cost for loading QImode using movzbl */
1362 {5, 5, 4}, /* cost of loading integer registers
1363 in QImode, HImode and SImode.
1364 Relative to reg-reg move (2). */
1365 {4, 4, 4}, /* cost of storing integer registers */
1366 2, /* cost of reg,reg fld/fst */
1367 {5, 5, 12}, /* cost of loading fp registers
1368 in SFmode, DFmode and XFmode */
1369 {4, 4, 8}, /* cost of storing fp registers
1370 in SFmode, DFmode and XFmode */
1371 2, /* cost of moving MMX register */
1372 {4, 4}, /* cost of loading MMX registers
1373 in SImode and DImode */
1374 {4, 4}, /* cost of storing MMX registers
1375 in SImode and DImode */
1376 2, /* cost of moving SSE register */
1377 {4, 4, 4}, /* cost of loading SSE registers
1378 in SImode, DImode and TImode */
1379 {4, 4, 4}, /* cost of storing SSE registers
1380 in SImode, DImode and TImode */
1381 2, /* MMX or SSE register to integer */
1382 /* On K8:
1383 MOVD reg64, xmmreg Double FSTORE 4
1384 MOVD reg32, xmmreg Double FSTORE 4
1385 On AMDFAM10:
1386 MOVD reg64, xmmreg Double FADD 3
1387 1/1 1/1
1388 MOVD reg32, xmmreg Double FADD 3
1389 1/1 1/1 */
1390 16, /* size of l1 cache. */
1391 2048, /* size of l2 cache. */
1392 64, /* size of prefetch block */
1393 /* New AMD processors never drop prefetches; if they cannot be performed
1394 immediately, they are queued. We set number of simultaneous prefetches
1395 to a large constant to reflect this (it probably is not a good idea not
1396 to limit number of prefetches at all, as their execution also takes some
1397 time). */
1398 100, /* number of parallel prefetches */
1399 2, /* Branch cost */
1400 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1401 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1402 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1403 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1404 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1405 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1406
1407 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
1408 very small blocks it is better to use loop. For large blocks, libcall
 1409	   can do nontemporal accesses and beat inline code considerably.  */
1410 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1411 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1412 {{libcall, {{8, loop}, {24, unrolled_loop},
1413 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1414 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1415 6, /* scalar_stmt_cost. */
1416 4, /* scalar load_cost. */
1417 4, /* scalar_store_cost. */
1418 6, /* vec_stmt_cost. */
1419 0, /* vec_to_scalar_cost. */
1420 2, /* scalar_to_vec_cost. */
1421 4, /* vec_align_load_cost. */
1422 4, /* vec_unalign_load_cost. */
1423 4, /* vec_store_cost. */
1424 2, /* cond_taken_branch_cost. */
1425 1, /* cond_not_taken_branch_cost. */
1426 };
1427
1428 struct processor_costs btver1_cost = {
1429 COSTS_N_INSNS (1), /* cost of an add instruction */
1430 COSTS_N_INSNS (2), /* cost of a lea instruction */
1431 COSTS_N_INSNS (1), /* variable shift costs */
1432 COSTS_N_INSNS (1), /* constant shift costs */
1433 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1434 COSTS_N_INSNS (4), /* HI */
1435 COSTS_N_INSNS (3), /* SI */
1436 COSTS_N_INSNS (4), /* DI */
1437 COSTS_N_INSNS (5)}, /* other */
1438 0, /* cost of multiply per each bit set */
1439 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1440 COSTS_N_INSNS (35), /* HI */
1441 COSTS_N_INSNS (51), /* SI */
1442 COSTS_N_INSNS (83), /* DI */
1443 COSTS_N_INSNS (83)}, /* other */
1444 COSTS_N_INSNS (1), /* cost of movsx */
1445 COSTS_N_INSNS (1), /* cost of movzx */
1446 8, /* "large" insn */
1447 9, /* MOVE_RATIO */
1448 4, /* cost for loading QImode using movzbl */
1449 {3, 4, 3}, /* cost of loading integer registers
1450 in QImode, HImode and SImode.
1451 Relative to reg-reg move (2). */
1452 {3, 4, 3}, /* cost of storing integer registers */
1453 4, /* cost of reg,reg fld/fst */
1454 {4, 4, 12}, /* cost of loading fp registers
1455 in SFmode, DFmode and XFmode */
1456 {6, 6, 8}, /* cost of storing fp registers
1457 in SFmode, DFmode and XFmode */
1458 2, /* cost of moving MMX register */
1459 {3, 3}, /* cost of loading MMX registers
1460 in SImode and DImode */
1461 {4, 4}, /* cost of storing MMX registers
1462 in SImode and DImode */
1463 2, /* cost of moving SSE register */
1464 {4, 4, 3}, /* cost of loading SSE registers
1465 in SImode, DImode and TImode */
1466 {4, 4, 5}, /* cost of storing SSE registers
1467 in SImode, DImode and TImode */
1468 3, /* MMX or SSE register to integer */
1469 /* On K8:
1470 MOVD reg64, xmmreg Double FSTORE 4
1471 MOVD reg32, xmmreg Double FSTORE 4
1472 On AMDFAM10:
1473 MOVD reg64, xmmreg Double FADD 3
1474 1/1 1/1
1475 MOVD reg32, xmmreg Double FADD 3
1476 1/1 1/1 */
1477 32, /* size of l1 cache. */
1478 512, /* size of l2 cache. */
1479 64, /* size of prefetch block */
1480 100, /* number of parallel prefetches */
1481 2, /* Branch cost */
1482 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1483 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1484 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1485 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1486 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1487 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1488
1489 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1490 very small blocks it is better to use loop. For large blocks, libcall can
 1491	   do nontemporal accesses and beat inline code considerably.  */
1492 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1493 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1494 {{libcall, {{8, loop}, {24, unrolled_loop},
1495 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1496 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1497 4, /* scalar_stmt_cost. */
1498 2, /* scalar load_cost. */
1499 2, /* scalar_store_cost. */
1500 6, /* vec_stmt_cost. */
1501 0, /* vec_to_scalar_cost. */
1502 2, /* scalar_to_vec_cost. */
1503 2, /* vec_align_load_cost. */
1504 2, /* vec_unalign_load_cost. */
1505 2, /* vec_store_cost. */
1506 2, /* cond_taken_branch_cost. */
1507 1, /* cond_not_taken_branch_cost. */
1508 };
1509
1510 static const
1511 struct processor_costs pentium4_cost = {
1512 COSTS_N_INSNS (1), /* cost of an add instruction */
1513 COSTS_N_INSNS (3), /* cost of a lea instruction */
1514 COSTS_N_INSNS (4), /* variable shift costs */
1515 COSTS_N_INSNS (4), /* constant shift costs */
1516 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1517 COSTS_N_INSNS (15), /* HI */
1518 COSTS_N_INSNS (15), /* SI */
1519 COSTS_N_INSNS (15), /* DI */
1520 COSTS_N_INSNS (15)}, /* other */
1521 0, /* cost of multiply per each bit set */
1522 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1523 COSTS_N_INSNS (56), /* HI */
1524 COSTS_N_INSNS (56), /* SI */
1525 COSTS_N_INSNS (56), /* DI */
1526 COSTS_N_INSNS (56)}, /* other */
1527 COSTS_N_INSNS (1), /* cost of movsx */
1528 COSTS_N_INSNS (1), /* cost of movzx */
1529 16, /* "large" insn */
1530 6, /* MOVE_RATIO */
1531 2, /* cost for loading QImode using movzbl */
1532 {4, 5, 4}, /* cost of loading integer registers
1533 in QImode, HImode and SImode.
1534 Relative to reg-reg move (2). */
1535 {2, 3, 2}, /* cost of storing integer registers */
1536 2, /* cost of reg,reg fld/fst */
1537 {2, 2, 6}, /* cost of loading fp registers
1538 in SFmode, DFmode and XFmode */
1539 {4, 4, 6}, /* cost of storing fp registers
1540 in SFmode, DFmode and XFmode */
1541 2, /* cost of moving MMX register */
1542 {2, 2}, /* cost of loading MMX registers
1543 in SImode and DImode */
1544 {2, 2}, /* cost of storing MMX registers
1545 in SImode and DImode */
1546 12, /* cost of moving SSE register */
1547 {12, 12, 12}, /* cost of loading SSE registers
1548 in SImode, DImode and TImode */
1549 {2, 2, 8}, /* cost of storing SSE registers
1550 in SImode, DImode and TImode */
1551 10, /* MMX or SSE register to integer */
1552 8, /* size of l1 cache. */
1553 256, /* size of l2 cache. */
1554 64, /* size of prefetch block */
1555 6, /* number of parallel prefetches */
1556 2, /* Branch cost */
1557 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1558 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1559 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1560 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1561 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1562 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1563 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1564 DUMMY_STRINGOP_ALGS},
1565 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1566 {-1, libcall}}},
1567 DUMMY_STRINGOP_ALGS},
1568 1, /* scalar_stmt_cost. */
1569 1, /* scalar load_cost. */
1570 1, /* scalar_store_cost. */
1571 1, /* vec_stmt_cost. */
1572 1, /* vec_to_scalar_cost. */
1573 1, /* scalar_to_vec_cost. */
1574 1, /* vec_align_load_cost. */
1575 2, /* vec_unalign_load_cost. */
1576 1, /* vec_store_cost. */
1577 3, /* cond_taken_branch_cost. */
1578 1, /* cond_not_taken_branch_cost. */
1579 };
1580
1581 static const
1582 struct processor_costs nocona_cost = {
1583 COSTS_N_INSNS (1), /* cost of an add instruction */
1584 COSTS_N_INSNS (1), /* cost of a lea instruction */
1585 COSTS_N_INSNS (1), /* variable shift costs */
1586 COSTS_N_INSNS (1), /* constant shift costs */
1587 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1588 COSTS_N_INSNS (10), /* HI */
1589 COSTS_N_INSNS (10), /* SI */
1590 COSTS_N_INSNS (10), /* DI */
1591 COSTS_N_INSNS (10)}, /* other */
1592 0, /* cost of multiply per each bit set */
1593 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1594 COSTS_N_INSNS (66), /* HI */
1595 COSTS_N_INSNS (66), /* SI */
1596 COSTS_N_INSNS (66), /* DI */
1597 COSTS_N_INSNS (66)}, /* other */
1598 COSTS_N_INSNS (1), /* cost of movsx */
1599 COSTS_N_INSNS (1), /* cost of movzx */
1600 16, /* "large" insn */
1601 17, /* MOVE_RATIO */
1602 4, /* cost for loading QImode using movzbl */
1603 {4, 4, 4}, /* cost of loading integer registers
1604 in QImode, HImode and SImode.
1605 Relative to reg-reg move (2). */
1606 {4, 4, 4}, /* cost of storing integer registers */
1607 3, /* cost of reg,reg fld/fst */
1608 {12, 12, 12}, /* cost of loading fp registers
1609 in SFmode, DFmode and XFmode */
1610 {4, 4, 4}, /* cost of storing fp registers
1611 in SFmode, DFmode and XFmode */
1612 6, /* cost of moving MMX register */
1613 {12, 12}, /* cost of loading MMX registers
1614 in SImode and DImode */
1615 {12, 12}, /* cost of storing MMX registers
1616 in SImode and DImode */
1617 6, /* cost of moving SSE register */
1618 {12, 12, 12}, /* cost of loading SSE registers
1619 in SImode, DImode and TImode */
1620 {12, 12, 12}, /* cost of storing SSE registers
1621 in SImode, DImode and TImode */
1622 8, /* MMX or SSE register to integer */
1623 8, /* size of l1 cache. */
1624 1024, /* size of l2 cache. */
1625 128, /* size of prefetch block */
1626 8, /* number of parallel prefetches */
1627 1, /* Branch cost */
1628 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1629 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1630 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1631 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1632 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1633 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1634 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1635 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1636 {100000, unrolled_loop}, {-1, libcall}}}},
1637 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1638 {-1, libcall}}},
1639 {libcall, {{24, loop}, {64, unrolled_loop},
1640 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1641 1, /* scalar_stmt_cost. */
1642 1, /* scalar load_cost. */
1643 1, /* scalar_store_cost. */
1644 1, /* vec_stmt_cost. */
1645 1, /* vec_to_scalar_cost. */
1646 1, /* scalar_to_vec_cost. */
1647 1, /* vec_align_load_cost. */
1648 2, /* vec_unalign_load_cost. */
1649 1, /* vec_store_cost. */
1650 3, /* cond_taken_branch_cost. */
1651 1, /* cond_not_taken_branch_cost. */
1652 };
1653
1654 static const
1655 struct processor_costs atom_cost = {
1656 COSTS_N_INSNS (1), /* cost of an add instruction */
1657 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1658 COSTS_N_INSNS (1), /* variable shift costs */
1659 COSTS_N_INSNS (1), /* constant shift costs */
1660 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1661 COSTS_N_INSNS (4), /* HI */
1662 COSTS_N_INSNS (3), /* SI */
1663 COSTS_N_INSNS (4), /* DI */
1664 COSTS_N_INSNS (2)}, /* other */
1665 0, /* cost of multiply per each bit set */
1666 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1667 COSTS_N_INSNS (26), /* HI */
1668 COSTS_N_INSNS (42), /* SI */
1669 COSTS_N_INSNS (74), /* DI */
1670 COSTS_N_INSNS (74)}, /* other */
1671 COSTS_N_INSNS (1), /* cost of movsx */
1672 COSTS_N_INSNS (1), /* cost of movzx */
1673 8, /* "large" insn */
1674 17, /* MOVE_RATIO */
1675 4, /* cost for loading QImode using movzbl */
1676 {4, 4, 4}, /* cost of loading integer registers
1677 in QImode, HImode and SImode.
1678 Relative to reg-reg move (2). */
1679 {4, 4, 4}, /* cost of storing integer registers */
1680 4, /* cost of reg,reg fld/fst */
1681 {12, 12, 12}, /* cost of loading fp registers
1682 in SFmode, DFmode and XFmode */
1683 {6, 6, 8}, /* cost of storing fp registers
1684 in SFmode, DFmode and XFmode */
1685 2, /* cost of moving MMX register */
1686 {8, 8}, /* cost of loading MMX registers
1687 in SImode and DImode */
1688 {8, 8}, /* cost of storing MMX registers
1689 in SImode and DImode */
1690 2, /* cost of moving SSE register */
1691 {8, 8, 8}, /* cost of loading SSE registers
1692 in SImode, DImode and TImode */
1693 {8, 8, 8}, /* cost of storing SSE registers
1694 in SImode, DImode and TImode */
1695 5, /* MMX or SSE register to integer */
1696 32, /* size of l1 cache. */
1697 256, /* size of l2 cache. */
1698 64, /* size of prefetch block */
1699 6, /* number of parallel prefetches */
1700 3, /* Branch cost */
1701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1702 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1703 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1704 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1705 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1706 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1707 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1708 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1709 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1710 {{libcall, {{8, loop}, {15, unrolled_loop},
1711 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1712 {libcall, {{24, loop}, {32, unrolled_loop},
1713 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1714 1, /* scalar_stmt_cost. */
1715 1, /* scalar load_cost. */
1716 1, /* scalar_store_cost. */
1717 1, /* vec_stmt_cost. */
1718 1, /* vec_to_scalar_cost. */
1719 1, /* scalar_to_vec_cost. */
1720 1, /* vec_align_load_cost. */
1721 2, /* vec_unalign_load_cost. */
1722 1, /* vec_store_cost. */
1723 3, /* cond_taken_branch_cost. */
1724 1, /* cond_not_taken_branch_cost. */
1725 };
1726
1727 /* Generic64 should produce code tuned for Nocona and K8. */
1728 static const
1729 struct processor_costs generic64_cost = {
1730 COSTS_N_INSNS (1), /* cost of an add instruction */
1731 /* On all chips taken into consideration, lea is 2 cycles or more. With
1732 this cost, however, our current implementation of synth_mult results in
1733 the use of unnecessary temporary registers, causing regressions on several
1734 SPECfp benchmarks. */
1735 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1736 COSTS_N_INSNS (1), /* variable shift costs */
1737 COSTS_N_INSNS (1), /* constant shift costs */
1738 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1739 COSTS_N_INSNS (4), /* HI */
1740 COSTS_N_INSNS (3), /* SI */
1741 COSTS_N_INSNS (4), /* DI */
1742 COSTS_N_INSNS (2)}, /* other */
1743 0, /* cost of multiply per each bit set */
1744 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1745 COSTS_N_INSNS (26), /* HI */
1746 COSTS_N_INSNS (42), /* SI */
1747 COSTS_N_INSNS (74), /* DI */
1748 COSTS_N_INSNS (74)}, /* other */
1749 COSTS_N_INSNS (1), /* cost of movsx */
1750 COSTS_N_INSNS (1), /* cost of movzx */
1751 8, /* "large" insn */
1752 17, /* MOVE_RATIO */
1753 4, /* cost for loading QImode using movzbl */
1754 {4, 4, 4}, /* cost of loading integer registers
1755 in QImode, HImode and SImode.
1756 Relative to reg-reg move (2). */
1757 {4, 4, 4}, /* cost of storing integer registers */
1758 4, /* cost of reg,reg fld/fst */
1759 {12, 12, 12}, /* cost of loading fp registers
1760 in SFmode, DFmode and XFmode */
1761 {6, 6, 8}, /* cost of storing fp registers
1762 in SFmode, DFmode and XFmode */
1763 2, /* cost of moving MMX register */
1764 {8, 8}, /* cost of loading MMX registers
1765 in SImode and DImode */
1766 {8, 8}, /* cost of storing MMX registers
1767 in SImode and DImode */
1768 2, /* cost of moving SSE register */
1769 {8, 8, 8}, /* cost of loading SSE registers
1770 in SImode, DImode and TImode */
1771 {8, 8, 8}, /* cost of storing SSE registers
1772 in SImode, DImode and TImode */
1773 5, /* MMX or SSE register to integer */
1774 32, /* size of l1 cache. */
1775 512, /* size of l2 cache. */
1776 64, /* size of prefetch block */
1777 6, /* number of parallel prefetches */
1778 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1779 value is increased to the perhaps more appropriate value of 5. */
1780 3, /* Branch cost */
1781 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1782 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1783 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1784 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1785 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1786 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1787 {DUMMY_STRINGOP_ALGS,
1788 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1789 {DUMMY_STRINGOP_ALGS,
1790 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 1, /* scalar_stmt_cost. */
1792 1, /* scalar load_cost. */
1793 1, /* scalar_store_cost. */
1794 1, /* vec_stmt_cost. */
1795 1, /* vec_to_scalar_cost. */
1796 1, /* scalar_to_vec_cost. */
1797 1, /* vec_align_load_cost. */
1798 2, /* vec_unalign_load_cost. */
1799 1, /* vec_store_cost. */
1800 3, /* cond_taken_branch_cost. */
1801 1, /* cond_not_taken_branch_cost. */
1802 };
1803
1804 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1805 Athlon and K8. */
1806 static const
1807 struct processor_costs generic32_cost = {
1808 COSTS_N_INSNS (1), /* cost of an add instruction */
1809 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1810 COSTS_N_INSNS (1), /* variable shift costs */
1811 COSTS_N_INSNS (1), /* constant shift costs */
1812 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1813 COSTS_N_INSNS (4), /* HI */
1814 COSTS_N_INSNS (3), /* SI */
1815 COSTS_N_INSNS (4), /* DI */
1816 COSTS_N_INSNS (2)}, /* other */
1817 0, /* cost of multiply per each bit set */
1818 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1819 COSTS_N_INSNS (26), /* HI */
1820 COSTS_N_INSNS (42), /* SI */
1821 COSTS_N_INSNS (74), /* DI */
1822 COSTS_N_INSNS (74)}, /* other */
1823 COSTS_N_INSNS (1), /* cost of movsx */
1824 COSTS_N_INSNS (1), /* cost of movzx */
1825 8, /* "large" insn */
1826 17, /* MOVE_RATIO */
1827 4, /* cost for loading QImode using movzbl */
1828 {4, 4, 4}, /* cost of loading integer registers
1829 in QImode, HImode and SImode.
1830 Relative to reg-reg move (2). */
1831 {4, 4, 4}, /* cost of storing integer registers */
1832 4, /* cost of reg,reg fld/fst */
1833 {12, 12, 12}, /* cost of loading fp registers
1834 in SFmode, DFmode and XFmode */
1835 {6, 6, 8}, /* cost of storing fp registers
1836 in SFmode, DFmode and XFmode */
1837 2, /* cost of moving MMX register */
1838 {8, 8}, /* cost of loading MMX registers
1839 in SImode and DImode */
1840 {8, 8}, /* cost of storing MMX registers
1841 in SImode and DImode */
1842 2, /* cost of moving SSE register */
1843 {8, 8, 8}, /* cost of loading SSE registers
1844 in SImode, DImode and TImode */
1845 {8, 8, 8}, /* cost of storing SSE registers
1846 in SImode, DImode and TImode */
1847 5, /* MMX or SSE register to integer */
1848 32, /* size of l1 cache. */
1849 256, /* size of l2 cache. */
1850 64, /* size of prefetch block */
1851 6, /* number of parallel prefetches */
1852 3, /* Branch cost */
1853 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1854 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1855 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1856 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1857 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1858 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1859 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1860 DUMMY_STRINGOP_ALGS},
1861 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1862 DUMMY_STRINGOP_ALGS},
1863 1, /* scalar_stmt_cost. */
1864 1, /* scalar load_cost. */
1865 1, /* scalar_store_cost. */
1866 1, /* vec_stmt_cost. */
1867 1, /* vec_to_scalar_cost. */
1868 1, /* scalar_to_vec_cost. */
1869 1, /* vec_align_load_cost. */
1870 2, /* vec_unalign_load_cost. */
1871 1, /* vec_store_cost. */
1872 3, /* cond_taken_branch_cost. */
1873 1, /* cond_not_taken_branch_cost. */
1874 };
1875
1876 const struct processor_costs *ix86_cost = &pentium_cost;
1877
1878 /* Processor feature/optimization bitmasks. */
1879 #define m_386 (1<<PROCESSOR_I386)
1880 #define m_486 (1<<PROCESSOR_I486)
1881 #define m_PENT (1<<PROCESSOR_PENTIUM)
1882 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1883 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1884 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1885 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1886 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1887 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1888 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1889 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1890 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1891 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1892 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1893 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1894 #define m_ATOM (1<<PROCESSOR_ATOM)
1895
1896 #define m_GEODE (1<<PROCESSOR_GEODE)
1897 #define m_K6 (1<<PROCESSOR_K6)
1898 #define m_K6_GEODE (m_K6 | m_GEODE)
1899 #define m_K8 (1<<PROCESSOR_K8)
1900 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1901 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1902 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1903 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1904 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1905 #define m_BDVER (m_BDVER1 | m_BDVER2)
1906 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1907 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1908
1909 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1910 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1911
1912 /* Generic instruction choice should be common subset of supported CPUs
1913 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1914 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1915
1916 /* Feature tests against the various tunings. */
1917 unsigned char ix86_tune_features[X86_TUNE_LAST];
1918
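/* A note on how the table below is consumed: ix86_option_override_internal
   (later in this file) converts the active -mtune= processor into a mask bit
   and derives one boolean per X86_TUNE_* index.  Roughly (a sketch of the
   existing logic, not new behavior):

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; i++)
       ix86_tune_features[i]
	 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   The entries are then read through the TARGET_* tuning macros in i386.h
   (e.g. TARGET_USE_LEAVE).  */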
1919 /* Feature tests against the various tunings used to create ix86_tune_features
1920 based on the processor mask. */
1921 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1922 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1923 negatively, so enabling it for Generic64 seems like a good code size
1924 tradeoff. We can't enable it for 32bit generic because it does not
1925 work well with PPro based chips. */
1926 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1927
1928 /* X86_TUNE_PUSH_MEMORY */
1929 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1930
1931 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1932 m_486 | m_PENT,
1933
1934 /* X86_TUNE_UNROLL_STRLEN */
1935 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1936
1937 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1938 on simulation results. But after P4 was made, no performance benefit
1939 was observed with branch hints; they also increase the code size.
1940 As a result, icc never generates branch hints. */
1941 0,
1942
1943 /* X86_TUNE_DOUBLE_WITH_ADD */
1944 ~m_386,
1945
1946 /* X86_TUNE_USE_SAHF */
1947 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1948
1949 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1950 partial dependencies. */
1951 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1952
1953 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1954 register stalls on the Generic32 compilation setting as well. However,
1955 in the current implementation partial register stalls are not eliminated
1956 very well - they can be introduced via subregs synthesized by combine
1957 and can happen in caller/callee saving sequences. Because this option
1958 pays back little on PPro based chips and is in conflict with the partial reg
1959 dependencies used by Athlon/P4 based chips, it is better to leave it off
1960 for generic32 for now. */
1961 m_PPRO,
1962
1963 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1964 m_CORE2I7 | m_GENERIC,
1965
1966 /* X86_TUNE_USE_HIMODE_FIOP */
1967 m_386 | m_486 | m_K6_GEODE,
1968
1969 /* X86_TUNE_USE_SIMODE_FIOP */
1970 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1971
1972 /* X86_TUNE_USE_MOV0 */
1973 m_K6,
1974
1975 /* X86_TUNE_USE_CLTD */
1976 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1977
1978 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1979 m_PENT4,
1980
1981 /* X86_TUNE_SPLIT_LONG_MOVES */
1982 m_PPRO,
1983
1984 /* X86_TUNE_READ_MODIFY_WRITE */
1985 ~m_PENT,
1986
1987 /* X86_TUNE_READ_MODIFY */
1988 ~(m_PENT | m_PPRO),
1989
1990 /* X86_TUNE_PROMOTE_QIMODE */
1991 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1992
1993 /* X86_TUNE_FAST_PREFIX */
1994 ~(m_386 | m_486 | m_PENT),
1995
1996 /* X86_TUNE_SINGLE_STRINGOP */
1997 m_386 | m_P4_NOCONA,
1998
1999 /* X86_TUNE_QIMODE_MATH */
2000 ~0,
2001
2002 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2003 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2004 might be considered for Generic32 if our scheme for avoiding partial
2005 stalls were more effective. */
2006 ~m_PPRO,
2007
2008 /* X86_TUNE_PROMOTE_QI_REGS */
2009 0,
2010
2011 /* X86_TUNE_PROMOTE_HI_REGS */
2012 m_PPRO,
2013
2014 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2015 over esp addition. */
2016 m_386 | m_486 | m_PENT | m_PPRO,
2017
2018 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2019 over esp addition. */
2020 m_PENT,
2021
2022 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2023 over esp subtraction. */
2024 m_386 | m_486 | m_PENT | m_K6_GEODE,
2025
2026 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
2027 over esp subtraction. */
2028 m_PENT | m_K6_GEODE,
2029
2030 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2031 for DFmode copies */
2032 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2033
2034 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2035 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2036
2037 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2038 conflict here between PPro/Pentium4 based chips, which treat 128bit
2039 SSE registers as single units, and K8 based chips, which divide SSE
2040 registers into two 64bit halves. This knob promotes all store destinations
2041 to be 128bit to allow register renaming on 128bit SSE units, but usually
2042 results in one extra microop on 64bit SSE units. Experimental results
2043 show that disabling this option on P4 brings over a 20% SPECfp regression,
2044 while enabling it on K8 brings roughly a 2.4% regression that can be partly
2045 masked by careful scheduling of moves. */
2046 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2047
2048 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2049 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2050
2051 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2052 m_COREI7 | m_BDVER,
2053
2054 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2055 m_BDVER,
2056
2057 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2058 are resolved on SSE register parts instead of whole registers, so we may
2059 maintain just the lower part of scalar values in the proper format, leaving
2060 the upper part undefined. */
2061 m_ATHLON_K8,
2062
2063 /* X86_TUNE_SSE_TYPELESS_STORES */
2064 m_AMD_MULTIPLE,
2065
2066 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2067 m_PPRO | m_P4_NOCONA,
2068
2069 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2070 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2071
2072 /* X86_TUNE_PROLOGUE_USING_MOVE */
2073 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2074
2075 /* X86_TUNE_EPILOGUE_USING_MOVE */
2076 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2077
2078 /* X86_TUNE_SHIFT1 */
2079 ~m_486,
2080
2081 /* X86_TUNE_USE_FFREEP */
2082 m_AMD_MULTIPLE,
2083
2084 /* X86_TUNE_INTER_UNIT_MOVES */
2085 ~(m_AMD_MULTIPLE | m_GENERIC),
2086
2087 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2088 ~(m_AMDFAM10 | m_BDVER),
2089
2090 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2091 than 4 branch instructions in the 16 byte window. */
2092 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2093
2094 /* X86_TUNE_SCHEDULE */
2095 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2096
2097 /* X86_TUNE_USE_BT */
2098 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2099
2100 /* X86_TUNE_USE_INCDEC */
2101 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2102
2103 /* X86_TUNE_PAD_RETURNS */
2104 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2105
2106 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2107 m_ATOM,
2108
2109 /* X86_TUNE_EXT_80387_CONSTANTS */
2110 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2111
2112 /* X86_TUNE_SHORTEN_X87_SSE */
2113 ~m_K8,
2114
2115 /* X86_TUNE_AVOID_VECTOR_DECODE */
2116 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2117
2118 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2119 and SImode multiplies, but the 386 and 486 do HImode multiplies faster. */
2120 ~(m_386 | m_486),
2121
2122 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of a 32-bit constant and memory takes
2123 the vector path on AMD machines. */
2124 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2125
2126 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of an 8-bit constant takes the vector path
2127 on AMD machines. */
2128 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2129
2130 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2131 than a MOV. */
2132 m_PENT,
2133
2134 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2135 but one byte longer. */
2136 m_PENT,
2137
2138 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
2139 operand that cannot be represented using a modRM byte. The XOR
2140 replacement is long decoded, so this split helps here as well. */
2141 m_K6,
2142
2143 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2144 from FP to FP. */
2145 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2146
2147 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2148 from integer to FP. */
2149 m_AMDFAM10,
2150
2151 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2152 with a subsequent conditional jump instruction into a single
2153 compare-and-branch uop. */
2154 m_BDVER,
2155
2156 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2157 will impact LEA instruction selection. */
2158 m_ATOM,
2159
2160 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2161 instructions. */
2162 ~m_ATOM,
2163
2164 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2165 at -O3. For the moment, the prefetching seems badly tuned for Intel
2166 chips. */
2167 m_K6_GEODE | m_AMD_MULTIPLE,
2168
2169 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2170 the auto-vectorizer. */
2171 m_BDVER,
2172
2173 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2174 during reassociation of integer computation. */
2175 m_ATOM,
2176
2177 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2178 during reassociation of fp computation. */
2179 m_ATOM
2180 };
2181
2182 /* Feature tests against the various architecture variations. */
2183 unsigned char ix86_arch_features[X86_ARCH_LAST];
2184
2185 /* Feature tests against the various architecture variations, used to create
2186 ix86_arch_features based on the processor mask. */
2187 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2188 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
2189 ~(m_386 | m_486 | m_PENT | m_K6),
2190
2191 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2192 ~m_386,
2193
2194 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2195 ~(m_386 | m_486),
2196
2197 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2198 ~m_386,
2199
2200 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2201 ~m_386,
2202 };
2203
2204 static const unsigned int x86_accumulate_outgoing_args
2205 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2206
2207 static const unsigned int x86_arch_always_fancy_math_387
2208 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2209
2210 static const unsigned int x86_avx256_split_unaligned_load
2211 = m_COREI7 | m_GENERIC;
2212
2213 static const unsigned int x86_avx256_split_unaligned_store
2214 = m_COREI7 | m_BDVER | m_GENERIC;
2215
2216 /* In case the average insn count for a single function invocation is
2217 lower than this constant, emit fast (but longer) prologue and
2218 epilogue code. */
2219 #define FAST_PROLOGUE_INSN_COUNT 20
2220
2221 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2222 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2223 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2224 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2225
2226 /* Array of the smallest class containing reg number REGNO, indexed by
2227 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2228
2229 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2230 {
2231 /* ax, dx, cx, bx */
2232 AREG, DREG, CREG, BREG,
2233 /* si, di, bp, sp */
2234 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2235 /* FP registers */
2236 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2237 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2238 /* arg pointer */
2239 NON_Q_REGS,
2240 /* flags, fpsr, fpcr, frame */
2241 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2242 /* SSE registers */
2243 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2244 SSE_REGS, SSE_REGS,
2245 /* MMX registers */
2246 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2247 MMX_REGS, MMX_REGS,
2248 /* REX registers */
2249 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2250 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251 /* SSE REX registers */
2252 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2253 SSE_REGS, SSE_REGS,
2254 };
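/* REGNO_REG_CLASS is just an index into this array; for example, with the
   layout above REGNO_REG_CLASS (AX_REG) is AREG, while REGNO_REG_CLASS
   (SP_REG) is NON_Q_REGS since %esp has no addressable QImode low part.  */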
2255
2256 /* The "default" register map used in 32bit mode. */
2257
2258 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2259 {
2260 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2261 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2262 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2263 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2264 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2265 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2266 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2267 };
2268
2269 /* The "default" register map used in 64bit mode. */
2270
2271 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2272 {
2273 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2274 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2275 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2276 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2277 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2278 8, 9, 10, 11, 12, 13, 14, 15, /* extended integer registers */
2279 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2280 };
2281
2282 /* Define the register numbers to be used in Dwarf debugging information.
2283 The SVR4 reference port C compiler uses the following register numbers
2284 in its Dwarf output code:
2285 0 for %eax (gcc regno = 0)
2286 1 for %ecx (gcc regno = 2)
2287 2 for %edx (gcc regno = 1)
2288 3 for %ebx (gcc regno = 3)
2289 4 for %esp (gcc regno = 7)
2290 5 for %ebp (gcc regno = 6)
2291 6 for %esi (gcc regno = 4)
2292 7 for %edi (gcc regno = 5)
2293 The following three DWARF register numbers are never generated by
2294 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2295 believes these numbers have these meanings.
2296 8 for %eip (no gcc equivalent)
2297 9 for %eflags (gcc regno = 17)
2298 10 for %trapno (no gcc equivalent)
2299 It is not at all clear how we should number the FP stack registers
2300 for the x86 architecture. If the version of SDB on x86/svr4 were
2301 a bit less brain dead with respect to floating-point then we would
2302 have a precedent to follow with respect to DWARF register numbers
2303 for x86 FP registers, but the SDB on x86/svr4 is so completely
2304 broken with respect to FP registers that it is hardly worth thinking
2305 of it as something to strive for compatibility with.
2306 The version of x86/svr4 SDB I have at the moment does (partially)
2307 seem to believe that DWARF register number 11 is associated with
2308 the x86 register %st(0), but that's about all. Higher DWARF
2309 register numbers don't seem to be associated with anything in
2310 particular, and even for DWARF regno 11, SDB only seems to under-
2311 stand that it should say that a variable lives in %st(0) (when
2312 asked via an `=' command) if we said it was in DWARF regno 11,
2313 but SDB still prints garbage when asked for the value of the
2314 variable in question (via a `/' command).
2315 (Also note that the labels SDB prints for various FP stack regs
2316 when doing an `x' command are all wrong.)
2317 Note that these problems generally don't affect the native SVR4
2318 C compiler because it doesn't allow the use of -O with -g and
2319 because when it is *not* optimizing, it allocates a memory
2320 location for each floating-point variable, and the memory
2321 location is what gets described in the DWARF AT_location
2322 attribute for the variable in question.
2323 Regardless of the severe mental illness of the x86/svr4 SDB, we
2324 do something sensible here and we use the following DWARF
2325 register numbers. Note that these are all stack-top-relative
2326 numbers.
2327 11 for %st(0) (gcc regno = 8)
2328 12 for %st(1) (gcc regno = 9)
2329 13 for %st(2) (gcc regno = 10)
2330 14 for %st(3) (gcc regno = 11)
2331 15 for %st(4) (gcc regno = 12)
2332 16 for %st(5) (gcc regno = 13)
2333 17 for %st(6) (gcc regno = 14)
2334 18 for %st(7) (gcc regno = 15)
2335 */
2336 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2337 {
2338 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2339 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2340 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2341 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2342 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2343 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2344 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2345 };
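/* Worked example for the SVR4 map above: gcc regno 4 (%esi) is emitted as
   DWARF register 6 (svr4_dbx_register_map[4] == 6), gcc regno 7 (%esp) as
   DWARF register 4, and the flags register (gcc regno 17) as DWARF 9, in
   line with the numbering described in the comment above.  */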
2346
2347 /* Define parameter passing and return registers. */
2348
2349 static int const x86_64_int_parameter_registers[6] =
2350 {
2351 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2352 };
2353
2354 static int const x86_64_ms_abi_int_parameter_registers[4] =
2355 {
2356 CX_REG, DX_REG, R8_REG, R9_REG
2357 };
2358
2359 static int const x86_64_int_return_registers[4] =
2360 {
2361 AX_REG, DX_REG, DI_REG, SI_REG
2362 };
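/* Example of the argument/return register assignment implied by the tables
   above, for a call such as long f (long a, long b, long c):

     SysV ABI:  a -> %rdi, b -> %rsi, c -> %rdx, return value -> %rax
     MS ABI:    a -> %rcx, b -> %rdx, c -> %r8,  return value -> %rax

   (%rdx additionally carries the upper half of a 128-bit integer return
   value in the SysV case.)  */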
2363
2364 /* Define the structure for the machine field in struct function. */
2365
2366 struct GTY(()) stack_local_entry {
2367 unsigned short mode;
2368 unsigned short n;
2369 rtx rtl;
2370 struct stack_local_entry *next;
2371 };
2372
2373 /* Structure describing stack frame layout.
2374 Stack grows downward:
2375
2376 [arguments]
2377 <- ARG_POINTER
2378 saved pc
2379
2380 saved static chain if ix86_static_chain_on_stack
2381
2382 saved frame pointer if frame_pointer_needed
2383 <- HARD_FRAME_POINTER
2384 [saved regs]
2385 <- regs_save_offset
2386 [padding0]
2387
2388 [saved SSE regs]
2389 <- sse_regs_save_offset
2390 [padding1] |
2391 | <- FRAME_POINTER
2392 [va_arg registers] |
2393 |
2394 [frame] |
2395 |
2396 [padding2] | = to_allocate
2397 <- STACK_POINTER
2398 */
2399 struct ix86_frame
2400 {
2401 int nsseregs;
2402 int nregs;
2403 int va_arg_size;
2404 int red_zone_size;
2405 int outgoing_arguments_size;
2406 HOST_WIDE_INT frame;
2407
2408 /* The offsets relative to ARG_POINTER. */
2409 HOST_WIDE_INT frame_pointer_offset;
2410 HOST_WIDE_INT hard_frame_pointer_offset;
2411 HOST_WIDE_INT stack_pointer_offset;
2412 HOST_WIDE_INT hfp_save_offset;
2413 HOST_WIDE_INT reg_save_offset;
2414 HOST_WIDE_INT sse_reg_save_offset;
2415
2416 /* When save_regs_using_mov is set, emit prologue using
2417 move instead of push instructions. */
2418 bool save_regs_using_mov;
2419 };
2420
2421 /* Which cpu are we scheduling for. */
2422 enum attr_cpu ix86_schedule;
2423
2424 /* Which cpu are we optimizing for. */
2425 enum processor_type ix86_tune;
2426
2427 /* Which instruction set architecture to use. */
2428 enum processor_type ix86_arch;
2429
2430 /* true if sse prefetch instruction is not NOOP. */
2431 int x86_prefetch_sse;
2432
2433 /* -mstackrealign option */
2434 static const char ix86_force_align_arg_pointer_string[]
2435 = "force_align_arg_pointer";
2436
2437 static rtx (*ix86_gen_leave) (void);
2438 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2439 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2440 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2441 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2442 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2443 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2445 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2446 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2447
2448 /* Preferred alignment for stack boundary in bits. */
2449 unsigned int ix86_preferred_stack_boundary;
2450
2451 /* Alignment for incoming stack boundary in bits specified at
2452 command line. */
2453 static unsigned int ix86_user_incoming_stack_boundary;
2454
2455 /* Default alignment for incoming stack boundary in bits. */
2456 static unsigned int ix86_default_incoming_stack_boundary;
2457
2458 /* Alignment for incoming stack boundary in bits. */
2459 unsigned int ix86_incoming_stack_boundary;
2460
2461 /* Calling abi specific va_list type nodes. */
2462 static GTY(()) tree sysv_va_list_type_node;
2463 static GTY(()) tree ms_va_list_type_node;
2464
2465 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2466 char internal_label_prefix[16];
2467 int internal_label_prefix_len;
2468
2469 /* Fence to use after loop using movnt. */
2470 tree x86_mfence;
2471
2472 /* Register class used for passing a given 64bit part of the argument.
2473 These represent classes as documented by the psABI, with the exception
2474 of the SSESF and SSEDF classes, which are basically the SSE class: gcc just
2475 uses an SF or DFmode move instead of DImode to avoid reformatting penalties.
2476 
2477 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2478 whenever possible (the upper half does contain padding). */
2479 enum x86_64_reg_class
2480 {
2481 X86_64_NO_CLASS,
2482 X86_64_INTEGER_CLASS,
2483 X86_64_INTEGERSI_CLASS,
2484 X86_64_SSE_CLASS,
2485 X86_64_SSESF_CLASS,
2486 X86_64_SSEDF_CLASS,
2487 X86_64_SSEUP_CLASS,
2488 X86_64_X87_CLASS,
2489 X86_64_X87UP_CLASS,
2490 X86_64_COMPLEX_X87_CLASS,
2491 X86_64_MEMORY_CLASS
2492 };
2493
2494 #define MAX_CLASSES 4
2495
2496 /* Table of constants used by fldpi, fldln2, etc.  */
2497 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2498 static bool ext_80387_constants_init = 0;
2499
2500 \f
2501 static struct machine_function * ix86_init_machine_status (void);
2502 static rtx ix86_function_value (const_tree, const_tree, bool);
2503 static bool ix86_function_value_regno_p (const unsigned int);
2504 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2505 const_tree);
2506 static rtx ix86_static_chain (const_tree, bool);
2507 static int ix86_function_regparm (const_tree, const_tree);
2508 static void ix86_compute_frame_layout (struct ix86_frame *);
2509 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2510 rtx, rtx, int);
2511 static void ix86_add_new_builtins (HOST_WIDE_INT);
2512 static tree ix86_canonical_va_list_type (tree);
2513 static void predict_jump (int);
2514 static unsigned int split_stack_prologue_scratch_regno (void);
2515 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2516
2517 enum ix86_function_specific_strings
2518 {
2519 IX86_FUNCTION_SPECIFIC_ARCH,
2520 IX86_FUNCTION_SPECIFIC_TUNE,
2521 IX86_FUNCTION_SPECIFIC_MAX
2522 };
2523
2524 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2525 const char *, enum fpmath_unit, bool);
2526 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2527 static void ix86_function_specific_save (struct cl_target_option *);
2528 static void ix86_function_specific_restore (struct cl_target_option *);
2529 static void ix86_function_specific_print (FILE *, int,
2530 struct cl_target_option *);
2531 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2532 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2533 struct gcc_options *);
2534 static bool ix86_can_inline_p (tree, tree);
2535 static void ix86_set_current_function (tree);
2536 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2537
2538 static enum calling_abi ix86_function_abi (const_tree);
2539
2540 \f
2541 #ifndef SUBTARGET32_DEFAULT_CPU
2542 #define SUBTARGET32_DEFAULT_CPU "i386"
2543 #endif
2544
2545 /* The svr4 ABI for the i386 says that records and unions are returned
2546 in memory. */
2547 #ifndef DEFAULT_PCC_STRUCT_RETURN
2548 #define DEFAULT_PCC_STRUCT_RETURN 1
2549 #endif
2550
2551 /* Whether -mtune= or -march= were specified.  */
2552 static int ix86_tune_defaulted;
2553 static int ix86_arch_specified;
2554
2555 /* Vectorization library interface and handlers. */
2556 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2557
2558 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2559 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2560
2561 /* Processor target table, indexed by processor number.  */
2562 struct ptt
2563 {
2564 const struct processor_costs *cost; /* Processor costs */
2565 const int align_loop; /* Default alignments. */
2566 const int align_loop_max_skip;
2567 const int align_jump;
2568 const int align_jump_max_skip;
2569 const int align_func;
2570 };
2571
2572 static const struct ptt processor_target_table[PROCESSOR_max] =
2573 {
2574 {&i386_cost, 4, 3, 4, 3, 4},
2575 {&i486_cost, 16, 15, 16, 15, 16},
2576 {&pentium_cost, 16, 7, 16, 7, 16},
2577 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2578 {&geode_cost, 0, 0, 0, 0, 0},
2579 {&k6_cost, 32, 7, 32, 7, 32},
2580 {&athlon_cost, 16, 7, 16, 7, 16},
2581 {&pentium4_cost, 0, 0, 0, 0, 0},
2582 {&k8_cost, 16, 7, 16, 7, 16},
2583 {&nocona_cost, 0, 0, 0, 0, 0},
2584 /* Core 2 32-bit. */
2585 {&generic32_cost, 16, 10, 16, 10, 16},
2586 /* Core 2 64-bit. */
2587 {&generic64_cost, 16, 10, 16, 10, 16},
2588 /* Core i7 32-bit. */
2589 {&generic32_cost, 16, 10, 16, 10, 16},
2590 /* Core i7 64-bit. */
2591 {&generic64_cost, 16, 10, 16, 10, 16},
2592 {&generic32_cost, 16, 7, 16, 7, 16},
2593 {&generic64_cost, 16, 10, 16, 10, 16},
2594 {&amdfam10_cost, 32, 24, 32, 7, 32},
2595 {&bdver1_cost, 32, 24, 32, 7, 32},
2596 {&bdver2_cost, 32, 24, 32, 7, 32},
2597 {&btver1_cost, 32, 24, 32, 7, 32},
2598 {&atom_cost, 16, 15, 16, 7, 16}
2599 };
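/* The table above is indexed by the PROCESSOR_* enum.
   ix86_option_override_internal later uses it to pick the active cost table
   and the alignment defaults, roughly as follows (a sketch of the existing
   logic, not new behavior):

     ix86_cost = processor_target_table[ix86_tune].cost;
     if (align_loops == 0)
       align_loops = processor_target_table[ix86_tune].align_loop;
     if (align_jumps == 0)
       align_jumps = processor_target_table[ix86_tune].align_jump;
     if (align_functions == 0)
       align_functions = processor_target_table[ix86_tune].align_func;  */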
2600
2601 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2602 {
2603 "generic",
2604 "i386",
2605 "i486",
2606 "pentium",
2607 "pentium-mmx",
2608 "pentiumpro",
2609 "pentium2",
2610 "pentium3",
2611 "pentium4",
2612 "pentium-m",
2613 "prescott",
2614 "nocona",
2615 "core2",
2616 "corei7",
2617 "atom",
2618 "geode",
2619 "k6",
2620 "k6-2",
2621 "k6-3",
2622 "athlon",
2623 "athlon-4",
2624 "k8",
2625 "amdfam10",
2626 "bdver1",
2627 "bdver2",
2628 "btver1"
2629 };
2630 \f
2631 /* Return true if a red-zone is in use. */
2632
2633 static inline bool
2634 ix86_using_red_zone (void)
2635 {
2636 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2637 }
2638 \f
2639 /* Return a string that documents the current -m options. The caller is
2640 responsible for freeing the string. */
2641
2642 static char *
2643 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2644 const char *tune, enum fpmath_unit fpmath,
2645 bool add_nl_p)
2646 {
2647 struct ix86_target_opts
2648 {
2649 const char *option; /* option string */
2650 HOST_WIDE_INT mask; /* isa mask options */
2651 };
2652
2653 /* This table is ordered so that options like -msse4.2 that imply
2654 preceding options are matched first. */
2655 static struct ix86_target_opts isa_opts[] =
2656 {
2657 { "-m64", OPTION_MASK_ISA_64BIT },
2658 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2659 { "-mfma", OPTION_MASK_ISA_FMA },
2660 { "-mxop", OPTION_MASK_ISA_XOP },
2661 { "-mlwp", OPTION_MASK_ISA_LWP },
2662 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2663 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2664 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2665 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2666 { "-msse3", OPTION_MASK_ISA_SSE3 },
2667 { "-msse2", OPTION_MASK_ISA_SSE2 },
2668 { "-msse", OPTION_MASK_ISA_SSE },
2669 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2670 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2671 { "-mmmx", OPTION_MASK_ISA_MMX },
2672 { "-mabm", OPTION_MASK_ISA_ABM },
2673 { "-mbmi", OPTION_MASK_ISA_BMI },
2674 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2675 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2676 { "-mtbm", OPTION_MASK_ISA_TBM },
2677 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2678 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2679 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2680 { "-maes", OPTION_MASK_ISA_AES },
2681 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2682 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2683 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2684 { "-mf16c", OPTION_MASK_ISA_F16C },
2685 };
2686
2687 /* Flag options. */
2688 static struct ix86_target_opts flag_opts[] =
2689 {
2690 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2691 { "-m80387", MASK_80387 },
2692 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2693 { "-malign-double", MASK_ALIGN_DOUBLE },
2694 { "-mcld", MASK_CLD },
2695 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2696 { "-mieee-fp", MASK_IEEE_FP },
2697 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2698 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2699 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2700 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2701 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2702 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2703 { "-mno-red-zone", MASK_NO_RED_ZONE },
2704 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2705 { "-mrecip", MASK_RECIP },
2706 { "-mrtd", MASK_RTD },
2707 { "-msseregparm", MASK_SSEREGPARM },
2708 { "-mstack-arg-probe", MASK_STACK_PROBE },
2709 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2710 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2711 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2712 { "-mvzeroupper", MASK_VZEROUPPER },
2713 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2714 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2715 { "-mprefer-avx128", MASK_PREFER_AVX128},
2716 };
2717
2718 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2719
2720 char isa_other[40];
2721 char target_other[40];
2722 unsigned num = 0;
2723 unsigned i, j;
2724 char *ret;
2725 char *ptr;
2726 size_t len;
2727 size_t line_len;
2728 size_t sep_len;
2729
2730 memset (opts, '\0', sizeof (opts));
2731
2732 /* Add -march= option. */
2733 if (arch)
2734 {
2735 opts[num][0] = "-march=";
2736 opts[num++][1] = arch;
2737 }
2738
2739 /* Add -mtune= option. */
2740 if (tune)
2741 {
2742 opts[num][0] = "-mtune=";
2743 opts[num++][1] = tune;
2744 }
2745
2746 /* Pick out the ISA options. */
2747 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2748 {
2749 if ((isa & isa_opts[i].mask) != 0)
2750 {
2751 opts[num++][0] = isa_opts[i].option;
2752 isa &= ~ isa_opts[i].mask;
2753 }
2754 }
2755
2756 if (isa && add_nl_p)
2757 {
2758 opts[num++][0] = isa_other;
2759 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2760 isa);
2761 }
2762
2763 /* Add flag options. */
2764 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2765 {
2766 if ((flags & flag_opts[i].mask) != 0)
2767 {
2768 opts[num++][0] = flag_opts[i].option;
2769 flags &= ~ flag_opts[i].mask;
2770 }
2771 }
2772
2773 if (flags && add_nl_p)
2774 {
2775 opts[num++][0] = target_other;
2776 sprintf (target_other, "(other flags: %#x)", flags);
2777 }
2778
2779 /* Add -fpmath= option. */
2780 if (fpmath)
2781 {
2782 opts[num][0] = "-mfpmath=";
2783 switch ((int) fpmath)
2784 {
2785 case FPMATH_387:
2786 opts[num++][1] = "387";
2787 break;
2788
2789 case FPMATH_SSE:
2790 opts[num++][1] = "sse";
2791 break;
2792
2793 case FPMATH_387 | FPMATH_SSE:
2794 opts[num++][1] = "sse+387";
2795 break;
2796
2797 default:
2798 gcc_unreachable ();
2799 }
2800 }
2801
2802 /* Any options? */
2803 if (num == 0)
2804 return NULL;
2805
2806 gcc_assert (num < ARRAY_SIZE (opts));
2807
2808 /* Size the string. */
2809 len = 0;
2810 sep_len = (add_nl_p) ? 3 : 1;
2811 for (i = 0; i < num; i++)
2812 {
2813 len += sep_len;
2814 for (j = 0; j < 2; j++)
2815 if (opts[i][j])
2816 len += strlen (opts[i][j]);
2817 }
2818
2819 /* Build the string. */
2820 ret = ptr = (char *) xmalloc (len);
2821 line_len = 0;
2822
2823 for (i = 0; i < num; i++)
2824 {
2825 size_t len2[2];
2826
2827 for (j = 0; j < 2; j++)
2828 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2829
2830 if (i != 0)
2831 {
2832 *ptr++ = ' ';
2833 line_len++;
2834
2835 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2836 {
2837 *ptr++ = '\\';
2838 *ptr++ = '\n';
2839 line_len = 0;
2840 }
2841 }
2842
2843 for (j = 0; j < 2; j++)
2844 if (opts[i][j])
2845 {
2846 memcpy (ptr, opts[i][j], len2[j]);
2847 ptr += len2[j];
2848 line_len += len2[j];
2849 }
2850 }
2851
2852 *ptr = '\0';
2853 gcc_assert (ret + len >= ptr);
2854
2855 return ret;
2856 }
2857
2858 /* Return true if profiling code should be emitted before the
2859 prologue; otherwise return false.
2860 Note: for x86 the "hotfix" case is sorried (rejected as unimplemented). */
2861 static bool
2862 ix86_profile_before_prologue (void)
2863 {
2864 return flag_fentry != 0;
2865 }
2866
2867 /* Function that is callable from the debugger to print the current
2868 options. */
2869 void
2870 ix86_debug_options (void)
2871 {
2872 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2873 ix86_arch_string, ix86_tune_string,
2874 ix86_fpmath, true);
2875
2876 if (opts)
2877 {
2878 fprintf (stderr, "%s\n\n", opts);
2879 free (opts);
2880 }
2881 else
2882 fputs ("<no options>\n\n", stderr);
2883
2884 return;
2885 }
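/* For instance, from a debugger session attached to cc1 one can do
     (gdb) call ix86_debug_options ()
   to dump the -m options currently in effect.  */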
2886 \f
2887 /* Override various settings based on options. If MAIN_ARGS_P, the
2888 options are from the command line, otherwise they are from
2889 attributes. */
2890
2891 static void
2892 ix86_option_override_internal (bool main_args_p)
2893 {
2894 int i;
2895 unsigned int ix86_arch_mask, ix86_tune_mask;
2896 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2897 const char *prefix;
2898 const char *suffix;
2899 const char *sw;
2900
2901 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2902 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2903 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2904 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2905 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2906 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2907 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2908 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2909 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2910 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2911 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2912 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2913 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2914 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2915 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2916 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2917 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2918 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2919 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2920 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2921 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2922 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2923 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2924 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2925 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2926 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2927 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2928 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2929 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2930 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2931 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2932 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2933 /* If this reaches 64, we need to widen the struct pta flags below. */
2934
2935 static struct pta
2936 {
2937 const char *const name; /* processor name or nickname. */
2938 const enum processor_type processor;
2939 const enum attr_cpu schedule;
2940 const unsigned HOST_WIDE_INT flags;
2941 }
2942 const processor_alias_table[] =
2943 {
2944 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2945 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2946 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2947 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2948 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2949 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2950 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2951 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2952 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2953 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2954 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2955 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2956 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2957 PTA_MMX | PTA_SSE},
2958 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2959 PTA_MMX | PTA_SSE},
2960 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2961 PTA_MMX | PTA_SSE | PTA_SSE2},
2962 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2963 PTA_MMX |PTA_SSE | PTA_SSE2},
2964 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2965 PTA_MMX | PTA_SSE | PTA_SSE2},
2966 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2967 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2968 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2969 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2970 | PTA_CX16 | PTA_NO_SAHF},
2971 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2972 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2973 | PTA_SSSE3 | PTA_CX16},
2974 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2975 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2976 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
2977 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2978 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2979 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2980 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2981 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2982 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2983 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2984 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2985 | PTA_RDRND | PTA_F16C},
2986 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2987 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2988 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2989 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2990 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2991 | PTA_FMA | PTA_MOVBE},
2992 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2993 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2994 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
2995 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2996 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A |PTA_PREFETCH_SSE},
2997 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2998 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2999 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3000 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3001 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3002 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3003 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3004 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3005 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3006 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3007 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3008 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3009 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3010 {"x86-64", PROCESSOR_K8, CPU_K8,
3011 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3012 {"k8", PROCESSOR_K8, CPU_K8,
3013 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3014 | PTA_SSE2 | PTA_NO_SAHF},
3015 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3016 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3017 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3018 {"opteron", PROCESSOR_K8, CPU_K8,
3019 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3020 | PTA_SSE2 | PTA_NO_SAHF},
3021 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3022 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3023 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3024 {"athlon64", PROCESSOR_K8, CPU_K8,
3025 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3026 | PTA_SSE2 | PTA_NO_SAHF},
3027 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3028 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3029 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3030 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3031 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3032 | PTA_SSE2 | PTA_NO_SAHF},
3033 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3034 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3035 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3036 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3037 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3038 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3039 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3040 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3041 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3042 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3043 | PTA_XOP | PTA_LWP},
3044 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3045 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3046 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3047 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3048 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3049 | PTA_FMA},
3050 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3051 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3052 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16},
3053 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3054 0 /* flags are only used for -march switch. */ },
3055 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3056 PTA_64BIT /* flags are only used for -march switch. */ },
3057 };
3058
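  /* The alias table above is scanned by name a little further down in this
     function; each PTA_* flag then turns on the matching OPTION_MASK_ISA_*
     bit unless the user already set that ISA bit explicitly.  A sketch of
     that existing lookup (not new behavior):

       for (i = 0; i < pta_size; i++)
	 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
	   {
	     ix86_schedule = processor_alias_table[i].schedule;
	     ix86_arch = processor_alias_table[i].processor;
	     if (processor_alias_table[i].flags & PTA_MMX
		 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
	       ix86_isa_flags |= OPTION_MASK_ISA_MMX;
	     ...and likewise for the other PTA_* bits...
	   }  */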
3059 /* -mrecip options. */
3060 static struct
3061 {
3062 const char *string; /* option name */
3063 unsigned int mask; /* mask bits to set */
3064 }
3065 const recip_options[] =
3066 {
3067 { "all", RECIP_MASK_ALL },
3068 { "none", RECIP_MASK_NONE },
3069 { "div", RECIP_MASK_DIV },
3070 { "sqrt", RECIP_MASK_SQRT },
3071 { "vec-div", RECIP_MASK_VEC_DIV },
3072 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3073 };
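  /* For example, -mrecip=vec-div,sqrt corresponds to setting
     RECIP_MASK_VEC_DIV | RECIP_MASK_SQRT in the -mrecip mask, while the
     "all" and "none" entries set or clear every bit at once.  */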
3074
3075 int const pta_size = ARRAY_SIZE (processor_alias_table);
3076
3077 /* Set up prefix/suffix so the error messages refer to either the command
3078 line argument, or the attribute(target). */
3079 if (main_args_p)
3080 {
3081 prefix = "-m";
3082 suffix = "";
3083 sw = "switch";
3084 }
3085 else
3086 {
3087 prefix = "option(\"";
3088 suffix = "\")";
3089 sw = "attribute";
3090 }
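/* Illustrative effect of the prefix/suffix choice above: a bad tuning name
   given on the command line is reported as
       bad value (foo) for -mtune= switch
   while the same name inside attribute((target("tune=foo"))) comes out as
       bad value (foo) for option("tune=") attribute  */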
3091
3092 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3093 SUBTARGET_OVERRIDE_OPTIONS;
3094 #endif
3095
3096 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3097 SUBSUBTARGET_OVERRIDE_OPTIONS;
3098 #endif
3099
3100 if (TARGET_X32)
3101 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3102
3103 /* -fPIC is the default for x86_64 Darwin. */
3104 if (TARGET_MACHO && TARGET_64BIT)
3105 flag_pic = 2;
3106
3107 /* Need to check -mtune=generic first. */
3108 if (ix86_tune_string)
3109 {
3110 if (!strcmp (ix86_tune_string, "generic")
3111 || !strcmp (ix86_tune_string, "i686")
3112 /* As special support for cross compilers we read -mtune=native
3113 as -mtune=generic. With native compilers we won't see the
3114 -mtune=native, as it was changed by the driver. */
3115 || !strcmp (ix86_tune_string, "native"))
3116 {
3117 if (TARGET_64BIT)
3118 ix86_tune_string = "generic64";
3119 else
3120 ix86_tune_string = "generic32";
3121 }
3122 /* If this call is for setting the option attribute, allow the
3123 generic32/generic64 that was previously set. */
3124 else if (!main_args_p
3125 && (!strcmp (ix86_tune_string, "generic32")
3126 || !strcmp (ix86_tune_string, "generic64")))
3127 ;
3128 else if (!strncmp (ix86_tune_string, "generic", 7))
3129 error ("bad value (%s) for %stune=%s %s",
3130 ix86_tune_string, prefix, suffix, sw);
3131 else if (!strcmp (ix86_tune_string, "x86-64"))
3132 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3133 "%stune=k8%s or %stune=generic%s instead as appropriate",
3134 prefix, suffix, prefix, suffix, prefix, suffix);
3135 }
3136 else
3137 {
3138 if (ix86_arch_string)
3139 ix86_tune_string = ix86_arch_string;
3140 if (!ix86_tune_string)
3141 {
3142 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3143 ix86_tune_defaulted = 1;
3144 }
3145
3146 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3147 need to use a sensible tune option. */
3148 if (!strcmp (ix86_tune_string, "generic")
3149 || !strcmp (ix86_tune_string, "x86-64")
3150 || !strcmp (ix86_tune_string, "i686"))
3151 {
3152 if (TARGET_64BIT)
3153 ix86_tune_string = "generic64";
3154 else
3155 ix86_tune_string = "generic32";
3156 }
3157 }
3158
3159 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3160 {
3161 /* rep; movq isn't available in 32-bit code. */
3162 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3163 ix86_stringop_alg = no_stringop;
3164 }
3165
3166 if (!ix86_arch_string)
3167 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3168 else
3169 ix86_arch_specified = 1;
3170
3171 if (!global_options_set.x_ix86_abi)
3172 ix86_abi = DEFAULT_ABI;
3173
3174 if (global_options_set.x_ix86_cmodel)
3175 {
3176 switch (ix86_cmodel)
3177 {
3178 case CM_SMALL:
3179 case CM_SMALL_PIC:
3180 if (flag_pic)
3181 ix86_cmodel = CM_SMALL_PIC;
3182 if (!TARGET_64BIT)
3183 error ("code model %qs not supported in the %s bit mode",
3184 "small", "32");
3185 break;
3186
3187 case CM_MEDIUM:
3188 case CM_MEDIUM_PIC:
3189 if (flag_pic)
3190 ix86_cmodel = CM_MEDIUM_PIC;
3191 if (!TARGET_64BIT)
3192 error ("code model %qs not supported in the %s bit mode",
3193 "medium", "32");
3194 else if (TARGET_X32)
3195 error ("code model %qs not supported in x32 mode",
3196 "medium");
3197 break;
3198
3199 case CM_LARGE:
3200 case CM_LARGE_PIC:
3201 if (flag_pic)
3202 ix86_cmodel = CM_LARGE_PIC;
3203 if (!TARGET_64BIT)
3204 error ("code model %qs not supported in the %s bit mode",
3205 "large", "32");
3206 else if (TARGET_X32)
3207 error ("code model %qs not supported in x32 mode",
3208 "medium");
3209 break;
3210
3211 case CM_32:
3212 if (flag_pic)
3213 error ("code model %s does not support PIC mode", "32");
3214 if (TARGET_64BIT)
3215 error ("code model %qs not supported in the %s bit mode",
3216 "32", "64");
3217 break;
3218
3219 case CM_KERNEL:
3220 if (flag_pic)
3221 {
3222 error ("code model %s does not support PIC mode", "kernel");
3223 ix86_cmodel = CM_32;
3224 }
3225 if (!TARGET_64BIT)
3226 error ("code model %qs not supported in the %s bit mode",
3227 "kernel", "32");
3228 break;
3229
3230 default:
3231 gcc_unreachable ();
3232 }
3233 }
3234 else
3235 {
3236 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3237 use of rip-relative addressing. This eliminates fixups that
3238 would otherwise be needed if this object is to be placed in a
3239 DLL, and is essentially just as efficient as direct addressing. */
3240 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3241 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3242 else if (TARGET_64BIT)
3243 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3244 else
3245 ix86_cmodel = CM_32;
3246 }
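/* Summary of the defaulting above (illustrative): with no explicit
   -mcmodel=, the 64-bit MS ABI gets CM_SMALL_PIC with flag_pic forced on,
   other 64-bit targets get CM_SMALL (or CM_SMALL_PIC under -fpic), and
   32-bit targets always use CM_32.  */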
3247 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3248 {
3249 error ("-masm=intel not supported in this configuration");
3250 ix86_asm_dialect = ASM_ATT;
3251 }
3252 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3253 sorry ("%i-bit mode not compiled in",
3254 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3255
3256 for (i = 0; i < pta_size; i++)
3257 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3258 {
3259 ix86_schedule = processor_alias_table[i].schedule;
3260 ix86_arch = processor_alias_table[i].processor;
3261 /* Default cpu tuning to the architecture. */
3262 ix86_tune = ix86_arch;
3263
3264 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3265 error ("CPU you selected does not support x86-64 "
3266 "instruction set");
3267
3268 if (processor_alias_table[i].flags & PTA_MMX
3269 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3270 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3271 if (processor_alias_table[i].flags & PTA_3DNOW
3272 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3273 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3274 if (processor_alias_table[i].flags & PTA_3DNOW_A
3275 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3276 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3277 if (processor_alias_table[i].flags & PTA_SSE
3278 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3279 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3280 if (processor_alias_table[i].flags & PTA_SSE2
3281 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3282 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3283 if (processor_alias_table[i].flags & PTA_SSE3
3284 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3285 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3286 if (processor_alias_table[i].flags & PTA_SSSE3
3287 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3288 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3289 if (processor_alias_table[i].flags & PTA_SSE4_1
3290 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3291 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3292 if (processor_alias_table[i].flags & PTA_SSE4_2
3293 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3294 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3295 if (processor_alias_table[i].flags & PTA_AVX
3296 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3297 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3298 if (processor_alias_table[i].flags & PTA_AVX2
3299 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3300 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3301 if (processor_alias_table[i].flags & PTA_FMA
3302 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3303 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3304 if (processor_alias_table[i].flags & PTA_SSE4A
3305 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3306 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3307 if (processor_alias_table[i].flags & PTA_FMA4
3308 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3309 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3310 if (processor_alias_table[i].flags & PTA_XOP
3311 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3312 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3313 if (processor_alias_table[i].flags & PTA_LWP
3314 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3315 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3316 if (processor_alias_table[i].flags & PTA_ABM
3317 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3318 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3319 if (processor_alias_table[i].flags & PTA_BMI
3320 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3321 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3322 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3323 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3324 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3325 if (processor_alias_table[i].flags & PTA_TBM
3326 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3327 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3328 if (processor_alias_table[i].flags & PTA_BMI2
3329 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3330 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3331 if (processor_alias_table[i].flags & PTA_CX16
3332 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3333 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3334 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3335 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3336 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3337 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3338 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3339 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3340 if (processor_alias_table[i].flags & PTA_MOVBE
3341 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3342 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3343 if (processor_alias_table[i].flags & PTA_AES
3344 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3345 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3346 if (processor_alias_table[i].flags & PTA_PCLMUL
3347 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3348 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3349 if (processor_alias_table[i].flags & PTA_FSGSBASE
3350 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3351 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3352 if (processor_alias_table[i].flags & PTA_RDRND
3353 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3354 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3355 if (processor_alias_table[i].flags & PTA_F16C
3356 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3357 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3358 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3359 x86_prefetch_sse = true;
3360
3361 break;
3362 }
3363
3364 if (!strcmp (ix86_arch_string, "generic"))
3365 error ("generic CPU can be used only for %stune=%s %s",
3366 prefix, suffix, sw);
3367 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3368 error ("bad value (%s) for %sarch=%s %s",
3369 ix86_arch_string, prefix, suffix, sw);
3370
3371 ix86_arch_mask = 1u << ix86_arch;
3372 for (i = 0; i < X86_ARCH_LAST; ++i)
3373 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3374
3375 for (i = 0; i < pta_size; i++)
3376 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3377 {
3378 ix86_schedule = processor_alias_table[i].schedule;
3379 ix86_tune = processor_alias_table[i].processor;
3380 if (TARGET_64BIT)
3381 {
3382 if (!(processor_alias_table[i].flags & PTA_64BIT))
3383 {
3384 if (ix86_tune_defaulted)
3385 {
3386 ix86_tune_string = "x86-64";
3387 for (i = 0; i < pta_size; i++)
3388 if (! strcmp (ix86_tune_string,
3389 processor_alias_table[i].name))
3390 break;
3391 ix86_schedule = processor_alias_table[i].schedule;
3392 ix86_tune = processor_alias_table[i].processor;
3393 }
3394 else
3395 error ("CPU you selected does not support x86-64 "
3396 "instruction set");
3397 }
3398 }
3399 else
3400 {
3401 /* Adjust tuning when compiling for 32-bit ABI. */
3402 switch (ix86_tune)
3403 {
3404 case PROCESSOR_GENERIC64:
3405 ix86_tune = PROCESSOR_GENERIC32;
3406 ix86_schedule = CPU_PENTIUMPRO;
3407 break;
3408
3409 case PROCESSOR_CORE2_64:
3410 ix86_tune = PROCESSOR_CORE2_32;
3411 break;
3412
3413 case PROCESSOR_COREI7_64:
3414 ix86_tune = PROCESSOR_COREI7_32;
3415 break;
3416
3417 default:
3418 break;
3419 }
3420 }
3421 /* Intel CPUs have always interpreted SSE prefetch instructions as
3422 NOPs; so, we can enable SSE prefetch instructions even when
3423 -mtune (rather than -march) points us to a processor that has them.
3424 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3425 higher processors. */
3426 if (TARGET_CMOVE
3427 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3428 x86_prefetch_sse = true;
3429 break;
3430 }
3431
3432 if (ix86_tune_specified && i == pta_size)
3433 error ("bad value (%s) for %stune=%s %s",
3434 ix86_tune_string, prefix, suffix, sw);
3435
3436 ix86_tune_mask = 1u << ix86_tune;
3437 for (i = 0; i < X86_TUNE_LAST; ++i)
3438 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3439
3440 #ifndef USE_IX86_FRAME_POINTER
3441 #define USE_IX86_FRAME_POINTER 0
3442 #endif
3443
3444 #ifndef USE_X86_64_FRAME_POINTER
3445 #define USE_X86_64_FRAME_POINTER 0
3446 #endif
3447
3448 /* Set the default values for switches whose default depends on TARGET_64BIT
3449 in case they weren't overwritten by command line options. */
3450 if (TARGET_64BIT)
3451 {
3452 if (optimize > 1 && !global_options_set.x_flag_ree)
3453 flag_ree = 1;
3454 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3455 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3456 if (flag_asynchronous_unwind_tables == 2)
3457 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3458 if (flag_pcc_struct_return == 2)
3459 flag_pcc_struct_return = 0;
3460 }
3461 else
3462 {
3463 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3464 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3465 if (flag_asynchronous_unwind_tables == 2)
3466 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3467 if (flag_pcc_struct_return == 2)
3468 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3469 }
3470
3471 if (optimize_size)
3472 ix86_cost = &ix86_size_cost;
3473 else
3474 ix86_cost = processor_target_table[ix86_tune].cost;
3475
3476 /* Arrange to set up i386_stack_locals for all functions. */
3477 init_machine_status = ix86_init_machine_status;
3478
3479 /* Validate -mregparm= value. */
3480 if (global_options_set.x_ix86_regparm)
3481 {
3482 if (TARGET_64BIT)
3483 warning (0, "-mregparm is ignored in 64-bit mode");
3484 if (ix86_regparm > REGPARM_MAX)
3485 {
3486 error ("-mregparm=%d is not between 0 and %d",
3487 ix86_regparm, REGPARM_MAX);
3488 ix86_regparm = 0;
3489 }
3490 }
3491 if (TARGET_64BIT)
3492 ix86_regparm = REGPARM_MAX;
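/* Illustrative example: -mregparm=3 in 32-bit mode lets up to three integer
   arguments be passed in registers (conventionally EAX, EDX and ECX), while
   a value above REGPARM_MAX is rejected above and reset to 0.  */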
3493
3494 /* Default align_* from the processor table. */
3495 if (align_loops == 0)
3496 {
3497 align_loops = processor_target_table[ix86_tune].align_loop;
3498 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3499 }
3500 if (align_jumps == 0)
3501 {
3502 align_jumps = processor_target_table[ix86_tune].align_jump;
3503 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3504 }
3505 if (align_functions == 0)
3506 {
3507 align_functions = processor_target_table[ix86_tune].align_func;
3508 }
3509
3510 /* Provide default for -mbranch-cost= value. */
3511 if (!global_options_set.x_ix86_branch_cost)
3512 ix86_branch_cost = ix86_cost->branch_cost;
3513
3514 if (TARGET_64BIT)
3515 {
3516 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3517
3518 /* Enable by default the SSE and MMX builtins. Do allow the user to
3519 explicitly disable any of these. In particular, disabling SSE and
3520 MMX for kernel code is extremely useful. */
3521 if (!ix86_arch_specified)
3522 ix86_isa_flags
3523 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3524 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3525
3526 if (TARGET_RTD)
3527 warning (0, "%srtd%s is ignored in 64-bit mode", prefix, suffix);
3528 }
3529 else
3530 {
3531 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3532
3533 if (!ix86_arch_specified)
3534 ix86_isa_flags
3535 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3536
3537 /* The i386 ABI does not specify a red zone. It still makes sense to use it
3538 when the programmer takes care to keep the stack from being destroyed. */
3539 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3540 target_flags |= MASK_NO_RED_ZONE;
3541 }
3542
3543 /* Keep nonleaf frame pointers. */
3544 if (flag_omit_frame_pointer)
3545 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3546 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3547 flag_omit_frame_pointer = 1;
3548
3549 /* If we're doing fast math, we don't care about comparison order
3550 wrt NaNs. This lets us use a shorter comparison sequence. */
3551 if (flag_finite_math_only)
3552 target_flags &= ~MASK_IEEE_FP;
3553
3554 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3555 since the insns won't need emulation. */
3556 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3557 target_flags &= ~MASK_NO_FANCY_MATH_387;
3558
3559 /* Likewise, if the target doesn't have a 387, or we've specified
3560 software floating point, don't use 387 inline intrinsics. */
3561 if (!TARGET_80387)
3562 target_flags |= MASK_NO_FANCY_MATH_387;
3563
3564 /* Turn on MMX builtins for -msse. */
3565 if (TARGET_SSE)
3566 {
3567 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3568 x86_prefetch_sse = true;
3569 }
3570
3571 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3572 if (TARGET_SSE4_2 || TARGET_ABM)
3573 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3574
3575 /* Turn on lzcnt instruction for -mabm. */
3576 if (TARGET_ABM)
3577 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3578
3579 /* Validate -mpreferred-stack-boundary= value or default it to
3580 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3581 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3582 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3583 {
3584 int min = (TARGET_64BIT ? 4 : 2);
3585 int max = (TARGET_SEH ? 4 : 12);
3586
3587 if (ix86_preferred_stack_boundary_arg < min
3588 || ix86_preferred_stack_boundary_arg > max)
3589 {
3590 if (min == max)
3591 error ("-mpreferred-stack-boundary is not supported "
3592 "for this target");
3593 else
3594 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3595 ix86_preferred_stack_boundary_arg, min, max);
3596 }
3597 else
3598 ix86_preferred_stack_boundary
3599 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3600 }
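/* Worked example of the check above: -mpreferred-stack-boundary=4 requests
   (1 << 4) * BITS_PER_UNIT = 128 bits, i.e. 16-byte stack alignment, which
   is also the minimum accepted in 64-bit mode.  */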
3601
3602 /* Set the default value for -mstackrealign. */
3603 if (ix86_force_align_arg_pointer == -1)
3604 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3605
3606 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3607
3608 /* Validate -mincoming-stack-boundary= value or default it to
3609 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3610 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3611 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3612 {
3613 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3614 || ix86_incoming_stack_boundary_arg > 12)
3615 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3616 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3617 else
3618 {
3619 ix86_user_incoming_stack_boundary
3620 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3621 ix86_incoming_stack_boundary
3622 = ix86_user_incoming_stack_boundary;
3623 }
3624 }
3625
3626 /* Accept -msseregparm only if at least SSE support is enabled. */
3627 if (TARGET_SSEREGPARM
3628 && ! TARGET_SSE)
3629 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3630
3631 if (global_options_set.x_ix86_fpmath)
3632 {
3633 if (ix86_fpmath & FPMATH_SSE)
3634 {
3635 if (!TARGET_SSE)
3636 {
3637 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3638 ix86_fpmath = FPMATH_387;
3639 }
3640 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3641 {
3642 warning (0, "387 instruction set disabled, using SSE arithmetics");
3643 ix86_fpmath = FPMATH_SSE;
3644 }
3645 }
3646 }
3647 else
3648 ix86_fpmath = TARGET_FPMATH_DEFAULT;
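/* Illustrative behaviour of the fpmath handling above: -mfpmath=sse with
   SSE disabled warns and falls back to FPMATH_387, and -mfpmath=sse,387
   without an 80387 warns and falls back to FPMATH_SSE.  */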
3649
3650 /* If the i387 is disabled, then do not return values in it. */
3651 if (!TARGET_80387)
3652 target_flags &= ~MASK_FLOAT_RETURNS;
3653
3654 /* Use an external vectorized library for vectorizing intrinsics. */
3655 if (global_options_set.x_ix86_veclibabi_type)
3656 switch (ix86_veclibabi_type)
3657 {
3658 case ix86_veclibabi_type_svml:
3659 ix86_veclib_handler = ix86_veclibabi_svml;
3660 break;
3661
3662 case ix86_veclibabi_type_acml:
3663 ix86_veclib_handler = ix86_veclibabi_acml;
3664 break;
3665
3666 default:
3667 gcc_unreachable ();
3668 }
3669
3670 if ((!USE_IX86_FRAME_POINTER
3671 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3672 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3673 && !optimize_size)
3674 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3675
3676 /* ??? Unwind info is not correct around the CFG unless either a frame
3677 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3678 unwind info generation to be aware of the CFG and propagating states
3679 around edges. */
3680 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3681 || flag_exceptions || flag_non_call_exceptions)
3682 && flag_omit_frame_pointer
3683 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3684 {
3685 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3686 warning (0, "unwind tables currently require either a frame pointer "
3687 "or %saccumulate-outgoing-args%s for correctness",
3688 prefix, suffix);
3689 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3690 }
3691
3692 /* If stack probes are required, the space used for large function
3693 arguments on the stack must also be probed, so enable
3694 -maccumulate-outgoing-args so this happens in the prologue. */
3695 if (TARGET_STACK_PROBE
3696 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3697 {
3698 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3699 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3700 "for correctness", prefix, suffix);
3701 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3702 }
3703
3704 /* For sane SSE instruction set generation we need the fcomi instruction.
3705 It is safe to enable all CMOVE instructions. Also, the RDRAND intrinsic
3706 expands to a sequence that includes a conditional move. */
3707 if (TARGET_SSE || TARGET_RDRND)
3708 TARGET_CMOVE = 1;
3709
3710 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3711 {
3712 char *p;
3713 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3714 p = strchr (internal_label_prefix, 'X');
3715 internal_label_prefix_len = p - internal_label_prefix;
3716 *p = '\0';
3717 }
3718
3719 /* When a scheduling description is not available, disable the scheduler pass
3720 so it won't slow down compilation and make x87 code slower. */
3721 if (!TARGET_SCHEDULE)
3722 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3723
3724 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3725 ix86_cost->simultaneous_prefetches,
3726 global_options.x_param_values,
3727 global_options_set.x_param_values);
3728 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3729 global_options.x_param_values,
3730 global_options_set.x_param_values);
3731 maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3732 global_options.x_param_values,
3733 global_options_set.x_param_values);
3734 maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3735 global_options.x_param_values,
3736 global_options_set.x_param_values);
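/* maybe_set_param_value installs these tuning-derived defaults only when the
   user has not set the parameter explicitly; e.g. (illustrative) an explicit
   --param l2-cache-size=512 takes precedence over ix86_cost->l2_cache_size.  */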
3737
3738 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3739 if (flag_prefetch_loop_arrays < 0
3740 && HAVE_prefetch
3741 && optimize >= 3
3742 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3743 flag_prefetch_loop_arrays = 1;
3744
3745 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3746 can be optimized to ap = __builtin_next_arg (0). */
3747 if (!TARGET_64BIT && !flag_split_stack)
3748 targetm.expand_builtin_va_start = NULL;
3749
3750 if (TARGET_64BIT)
3751 {
3752 ix86_gen_leave = gen_leave_rex64;
3753 ix86_gen_add3 = gen_adddi3;
3754 ix86_gen_sub3 = gen_subdi3;
3755 ix86_gen_sub3_carry = gen_subdi3_carry;
3756 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3757 ix86_gen_monitor = gen_sse3_monitor64;
3758 ix86_gen_andsp = gen_anddi3;
3759 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3760 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3761 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3762 }
3763 else
3764 {
3765 ix86_gen_leave = gen_leave;
3766 ix86_gen_add3 = gen_addsi3;
3767 ix86_gen_sub3 = gen_subsi3;
3768 ix86_gen_sub3_carry = gen_subsi3_carry;
3769 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3770 ix86_gen_monitor = gen_sse3_monitor;
3771 ix86_gen_andsp = gen_andsi3;
3772 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3773 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3774 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3775 }
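/* With the generator hooks selected above, later code can emit word-size
   independent RTL.  A minimal sketch (DST, SRC and OFF standing for
   suitably-moded rtx values):

       emit_insn (ix86_gen_add3 (dst, src, off));

   expands through gen_adddi3 in 64-bit mode and gen_addsi3 otherwise.  */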
3776
3777 #ifdef USE_IX86_CLD
3778 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3779 if (!TARGET_64BIT)
3780 target_flags |= MASK_CLD & ~target_flags_explicit;
3781 #endif
3782
3783 if (!TARGET_64BIT && flag_pic)
3784 {
3785 if (flag_fentry > 0)
3786 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3787 "with -fpic");
3788 flag_fentry = 0;
3789 }
3790 else if (TARGET_SEH)
3791 {
3792 if (flag_fentry == 0)
3793 sorry ("-mno-fentry isn%'t compatible with SEH");
3794 flag_fentry = 1;
3795 }
3796 else if (flag_fentry < 0)
3797 {
3798 #if defined(PROFILE_BEFORE_PROLOGUE)
3799 flag_fentry = 1;
3800 #else
3801 flag_fentry = 0;
3802 #endif
3803 }
3804
3805 if (TARGET_AVX)
3806 {
3807 /* When not optimizing for size, enable the vzeroupper optimization for
3808 TARGET_AVX with -fexpensive-optimizations, and split 32-byte
3809 AVX unaligned loads/stores. */
3810 if (!optimize_size)
3811 {
3812 if (flag_expensive_optimizations
3813 && !(target_flags_explicit & MASK_VZEROUPPER))
3814 target_flags |= MASK_VZEROUPPER;
3815 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3816 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3817 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3818 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3819 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3820 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3821 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3822 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3823 target_flags |= MASK_PREFER_AVX128;
3824 }
3825 }
3826 else
3827 {
3828 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3829 target_flags &= ~MASK_VZEROUPPER;
3830 }
3831
3832 if (ix86_recip_name)
3833 {
3834 char *p = ASTRDUP (ix86_recip_name);
3835 char *q;
3836 unsigned int mask, i;
3837 bool invert;
3838
3839 while ((q = strtok (p, ",")) != NULL)
3840 {
3841 p = NULL;
3842 if (*q == '!')
3843 {
3844 invert = true;
3845 q++;
3846 }
3847 else
3848 invert = false;
3849
3850 if (!strcmp (q, "default"))
3851 mask = RECIP_MASK_ALL;
3852 else
3853 {
3854 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3855 if (!strcmp (q, recip_options[i].string))
3856 {
3857 mask = recip_options[i].mask;
3858 break;
3859 }
3860
3861 if (i == ARRAY_SIZE (recip_options))
3862 {
3863 error ("unknown option for -mrecip=%s", q);
3864 invert = false;
3865 mask = RECIP_MASK_NONE;
3866 }
3867 }
3868
3869 recip_mask_explicit |= mask;
3870 if (invert)
3871 recip_mask &= ~mask;
3872 else
3873 recip_mask |= mask;
3874 }
3875 }
3876
3877 if (TARGET_RECIP)
3878 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3879 else if (target_flags_explicit & MASK_RECIP)
3880 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3881
3882 /* Save the initial options in case the user uses function-specific
3883 options. */
3884 if (main_args_p)
3885 target_option_default_node = target_option_current_node
3886 = build_target_option_node ();
3887 }
3888
3889 /* Return TRUE if VAL is passed in a register with a 256bit AVX mode. */
3890
3891 static bool
3892 function_pass_avx256_p (const_rtx val)
3893 {
3894 if (!val)
3895 return false;
3896
3897 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3898 return true;
3899
3900 if (GET_CODE (val) == PARALLEL)
3901 {
3902 int i;
3903 rtx r;
3904
3905 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3906 {
3907 r = XVECEXP (val, 0, i);
3908 if (GET_CODE (r) == EXPR_LIST
3909 && XEXP (r, 0)
3910 && REG_P (XEXP (r, 0))
3911 && (GET_MODE (XEXP (r, 0)) == OImode
3912 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
3913 return true;
3914 }
3915 }
3916
3917 return false;
3918 }
3919
3920 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3921
3922 static void
3923 ix86_option_override (void)
3924 {
3925 ix86_option_override_internal (true);
3926 }
3927
3928 /* Update register usage after having seen the compiler flags. */
3929
3930 static void
3931 ix86_conditional_register_usage (void)
3932 {
3933 int i;
3934 unsigned int j;
3935
3936 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3937 {
3938 if (fixed_regs[i] > 1)
3939 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
3940 if (call_used_regs[i] > 1)
3941 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
3942 }
3943
3944 /* The PIC register, if it exists, is fixed. */
3945 j = PIC_OFFSET_TABLE_REGNUM;
3946 if (j != INVALID_REGNUM)
3947 fixed_regs[j] = call_used_regs[j] = 1;
3948
3949 /* The 64-bit MS_ABI changes the set of call-used registers. */
3950 if (TARGET_64BIT_MS_ABI)
3951 {
3952 call_used_regs[SI_REG] = 0;
3953 call_used_regs[DI_REG] = 0;
3954 call_used_regs[XMM6_REG] = 0;
3955 call_used_regs[XMM7_REG] = 0;
3956 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3957 call_used_regs[i] = 0;
3958 }
3959
3960 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
3961 other call-clobbered regs for 64-bit. */
3962 if (TARGET_64BIT)
3963 {
3964 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3965
3966 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3967 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3968 && call_used_regs[i])
3969 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3970 }
3971
3972 /* If MMX is disabled, squash the registers. */
3973 if (! TARGET_MMX)
3974 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3975 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3976 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3977
3978 /* If SSE is disabled, squash the registers. */
3979 if (! TARGET_SSE)
3980 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3981 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3982 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3983
3984 /* If the FPU is disabled, squash the registers. */
3985 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3986 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3987 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3988 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3989
3990 /* If 32-bit, squash the 64-bit registers. */
3991 if (! TARGET_64BIT)
3992 {
3993 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3994 reg_names[i] = "";
3995 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3996 reg_names[i] = "";
3997 }
3998 }
3999
4000 \f
4001 /* Save the current options */
4002
4003 static void
4004 ix86_function_specific_save (struct cl_target_option *ptr)
4005 {
4006 ptr->arch = ix86_arch;
4007 ptr->schedule = ix86_schedule;
4008 ptr->tune = ix86_tune;
4009 ptr->branch_cost = ix86_branch_cost;
4010 ptr->tune_defaulted = ix86_tune_defaulted;
4011 ptr->arch_specified = ix86_arch_specified;
4012 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4013 ptr->ix86_target_flags_explicit = target_flags_explicit;
4014 ptr->x_recip_mask_explicit = recip_mask_explicit;
4015
4016 /* The fields are char but the variables are not; make sure the
4017 values fit in the fields. */
4018 gcc_assert (ptr->arch == ix86_arch);
4019 gcc_assert (ptr->schedule == ix86_schedule);
4020 gcc_assert (ptr->tune == ix86_tune);
4021 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4022 }
4023
4024 /* Restore the current options */
4025
4026 static void
4027 ix86_function_specific_restore (struct cl_target_option *ptr)
4028 {
4029 enum processor_type old_tune = ix86_tune;
4030 enum processor_type old_arch = ix86_arch;
4031 unsigned int ix86_arch_mask, ix86_tune_mask;
4032 int i;
4033
4034 ix86_arch = (enum processor_type) ptr->arch;
4035 ix86_schedule = (enum attr_cpu) ptr->schedule;
4036 ix86_tune = (enum processor_type) ptr->tune;
4037 ix86_branch_cost = ptr->branch_cost;
4038 ix86_tune_defaulted = ptr->tune_defaulted;
4039 ix86_arch_specified = ptr->arch_specified;
4040 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4041 target_flags_explicit = ptr->ix86_target_flags_explicit;
4042 recip_mask_explicit = ptr->x_recip_mask_explicit;
4043
4044 /* Recreate the arch feature tests if the arch changed */
4045 if (old_arch != ix86_arch)
4046 {
4047 ix86_arch_mask = 1u << ix86_arch;
4048 for (i = 0; i < X86_ARCH_LAST; ++i)
4049 ix86_arch_features[i]
4050 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4051 }
4052
4053 /* Recreate the tune optimization tests */
4054 if (old_tune != ix86_tune)
4055 {
4056 ix86_tune_mask = 1u << ix86_tune;
4057 for (i = 0; i < X86_TUNE_LAST; ++i)
4058 ix86_tune_features[i]
4059 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4060 }
4061 }
4062
4063 /* Print the current options */
4064
4065 static void
4066 ix86_function_specific_print (FILE *file, int indent,
4067 struct cl_target_option *ptr)
4068 {
4069 char *target_string
4070 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4071 NULL, NULL, ptr->x_ix86_fpmath, false);
4072
4073 fprintf (file, "%*sarch = %d (%s)\n",
4074 indent, "",
4075 ptr->arch,
4076 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4077 ? cpu_names[ptr->arch]
4078 : "<unknown>"));
4079
4080 fprintf (file, "%*stune = %d (%s)\n",
4081 indent, "",
4082 ptr->tune,
4083 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4084 ? cpu_names[ptr->tune]
4085 : "<unknown>"));
4086
4087 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4088
4089 if (target_string)
4090 {
4091 fprintf (file, "%*s%s\n", indent, "", target_string);
4092 free (target_string);
4093 }
4094 }
4095
4096 \f
4097 /* Inner function to process the attribute((target(...))): take an argument and
4098 set the current options from it. If we have a list, recursively go
4099 over the list. */
4100
4101 static bool
4102 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4103 struct gcc_options *enum_opts_set)
4104 {
4105 char *next_optstr;
4106 bool ret = true;
4107
4108 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4109 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4110 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4111 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4112 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4113
4114 enum ix86_opt_type
4115 {
4116 ix86_opt_unknown,
4117 ix86_opt_yes,
4118 ix86_opt_no,
4119 ix86_opt_str,
4120 ix86_opt_enum,
4121 ix86_opt_isa
4122 };
4123
4124 static const struct
4125 {
4126 const char *string;
4127 size_t len;
4128 enum ix86_opt_type type;
4129 int opt;
4130 int mask;
4131 } attrs[] = {
4132 /* isa options */
4133 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4134 IX86_ATTR_ISA ("abm", OPT_mabm),
4135 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4136 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4137 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4138 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4139 IX86_ATTR_ISA ("aes", OPT_maes),
4140 IX86_ATTR_ISA ("avx", OPT_mavx),
4141 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4142 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4143 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4144 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4145 IX86_ATTR_ISA ("sse", OPT_msse),
4146 IX86_ATTR_ISA ("sse2", OPT_msse2),
4147 IX86_ATTR_ISA ("sse3", OPT_msse3),
4148 IX86_ATTR_ISA ("sse4", OPT_msse4),
4149 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4150 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4151 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4152 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4153 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4154 IX86_ATTR_ISA ("fma", OPT_mfma),
4155 IX86_ATTR_ISA ("xop", OPT_mxop),
4156 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4157 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4158 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4159 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4160
4161 /* enum options */
4162 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4163
4164 /* string options */
4165 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4166 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4167
4168 /* flag options */
4169 IX86_ATTR_YES ("cld",
4170 OPT_mcld,
4171 MASK_CLD),
4172
4173 IX86_ATTR_NO ("fancy-math-387",
4174 OPT_mfancy_math_387,
4175 MASK_NO_FANCY_MATH_387),
4176
4177 IX86_ATTR_YES ("ieee-fp",
4178 OPT_mieee_fp,
4179 MASK_IEEE_FP),
4180
4181 IX86_ATTR_YES ("inline-all-stringops",
4182 OPT_minline_all_stringops,
4183 MASK_INLINE_ALL_STRINGOPS),
4184
4185 IX86_ATTR_YES ("inline-stringops-dynamically",
4186 OPT_minline_stringops_dynamically,
4187 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4188
4189 IX86_ATTR_NO ("align-stringops",
4190 OPT_mno_align_stringops,
4191 MASK_NO_ALIGN_STRINGOPS),
4192
4193 IX86_ATTR_YES ("recip",
4194 OPT_mrecip,
4195 MASK_RECIP),
4196
4197 };
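/* Illustrative walk through the matching loop below: for
   attribute((target("no-sse4.1,arch=core2"))) the "no-" prefix flips
   opt_set_p to false for the sse4.1 ISA entry, while "arch=" matches an
   ix86_opt_str entry and the remainder of the string ("core2") is saved in
   p_strings[IX86_FUNCTION_SPECIFIC_ARCH].  */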
4198
4199 /* If this is a list, recurse to get the options. */
4200 if (TREE_CODE (args) == TREE_LIST)
4201 {
4202 bool ret = true;
4203
4204 for (; args; args = TREE_CHAIN (args))
4205 if (TREE_VALUE (args)
4206 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4207 p_strings, enum_opts_set))
4208 ret = false;
4209
4210 return ret;
4211 }
4212
4213 else if (TREE_CODE (args) != STRING_CST)
4214 gcc_unreachable ();
4215
4216 /* Handle multiple arguments separated by commas. */
4217 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4218
4219 while (next_optstr && *next_optstr != '\0')
4220 {
4221 char *p = next_optstr;
4222 char *orig_p = p;
4223 char *comma = strchr (next_optstr, ',');
4224 const char *opt_string;
4225 size_t len, opt_len;
4226 int opt;
4227 bool opt_set_p;
4228 char ch;
4229 unsigned i;
4230 enum ix86_opt_type type = ix86_opt_unknown;
4231 int mask = 0;
4232
4233 if (comma)
4234 {
4235 *comma = '\0';
4236 len = comma - next_optstr;
4237 next_optstr = comma + 1;
4238 }
4239 else
4240 {
4241 len = strlen (p);
4242 next_optstr = NULL;
4243 }
4244
4245 /* Recognize no-xxx. */
4246 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4247 {
4248 opt_set_p = false;
4249 p += 3;
4250 len -= 3;
4251 }
4252 else
4253 opt_set_p = true;
4254
4255 /* Find the option. */
4256 ch = *p;
4257 opt = N_OPTS;
4258 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4259 {
4260 type = attrs[i].type;
4261 opt_len = attrs[i].len;
4262 if (ch == attrs[i].string[0]
4263 && ((type != ix86_opt_str && type != ix86_opt_enum)
4264 ? len == opt_len
4265 : len > opt_len)
4266 && memcmp (p, attrs[i].string, opt_len) == 0)
4267 {
4268 opt = attrs[i].opt;
4269 mask = attrs[i].mask;
4270 opt_string = attrs[i].string;
4271 break;
4272 }
4273 }
4274
4275 /* Process the option. */
4276 if (opt == N_OPTS)
4277 {
4278 error ("attribute(target(\"%s\")) is unknown", orig_p);
4279 ret = false;
4280 }
4281
4282 else if (type == ix86_opt_isa)
4283 {
4284 struct cl_decoded_option decoded;
4285
4286 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4287 ix86_handle_option (&global_options, &global_options_set,
4288 &decoded, input_location);
4289 }
4290
4291 else if (type == ix86_opt_yes || type == ix86_opt_no)
4292 {
4293 if (type == ix86_opt_no)
4294 opt_set_p = !opt_set_p;
4295
4296 if (opt_set_p)
4297 target_flags |= mask;
4298 else
4299 target_flags &= ~mask;
4300 }
4301
4302 else if (type == ix86_opt_str)
4303 {
4304 if (p_strings[opt])
4305 {
4306 error ("option(\"%s\") was already specified", opt_string);
4307 ret = false;
4308 }
4309 else
4310 p_strings[opt] = xstrdup (p + opt_len);
4311 }
4312
4313 else if (type == ix86_opt_enum)
4314 {
4315 bool arg_ok;
4316 int value;
4317
4318 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4319 if (arg_ok)
4320 set_option (&global_options, enum_opts_set, opt, value,
4321 p + opt_len, DK_UNSPECIFIED, input_location,
4322 global_dc);
4323 else
4324 {
4325 error ("attribute(target(\"%s\")) is unknown", orig_p);
4326 ret = false;
4327 }
4328 }
4329
4330 else
4331 gcc_unreachable ();
4332 }
4333
4334 return ret;
4335 }
4336
4337 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4338
4339 tree
4340 ix86_valid_target_attribute_tree (tree args)
4341 {
4342 const char *orig_arch_string = ix86_arch_string;
4343 const char *orig_tune_string = ix86_tune_string;
4344 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4345 int orig_tune_defaulted = ix86_tune_defaulted;
4346 int orig_arch_specified = ix86_arch_specified;
4347 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4348 tree t = NULL_TREE;
4349 int i;
4350 struct cl_target_option *def
4351 = TREE_TARGET_OPTION (target_option_default_node);
4352 struct gcc_options enum_opts_set;
4353
4354 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4355
4356 /* Process each of the options on the chain. */
4357 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4358 &enum_opts_set))
4359 return NULL_TREE;
4360
4361 /* If the changed options are different from the default, rerun
4362 ix86_option_override_internal, and then save the options away.
4363 The string options are attribute options, and will be undone
4364 when we copy the save structure. */
4365 if (ix86_isa_flags != def->x_ix86_isa_flags
4366 || target_flags != def->x_target_flags
4367 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4368 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4369 || enum_opts_set.x_ix86_fpmath)
4370 {
4371 /* If we are using the default tune= or arch=, undo the string assigned,
4372 and use the default. */
4373 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4374 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4375 else if (!orig_arch_specified)
4376 ix86_arch_string = NULL;
4377
4378 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4379 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4380 else if (orig_tune_defaulted)
4381 ix86_tune_string = NULL;
4382
4383 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4384 if (enum_opts_set.x_ix86_fpmath)
4385 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4386 else if (!TARGET_64BIT && TARGET_SSE)
4387 {
4388 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4389 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4390 }
4391
4392 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4393 ix86_option_override_internal (false);
4394
4395 /* Add any builtin functions with the new isa if any. */
4396 ix86_add_new_builtins (ix86_isa_flags);
4397
4398 /* Save the current options unless we are validating options for
4399 #pragma. */
4400 t = build_target_option_node ();
4401
4402 ix86_arch_string = orig_arch_string;
4403 ix86_tune_string = orig_tune_string;
4404 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4405
4406 /* Free up memory allocated to hold the strings */
4407 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4408 free (option_strings[i]);
4409 }
4410
4411 return t;
4412 }
4413
4414 /* Hook to validate attribute((target("string"))). */
4415
4416 static bool
4417 ix86_valid_target_attribute_p (tree fndecl,
4418 tree ARG_UNUSED (name),
4419 tree args,
4420 int ARG_UNUSED (flags))
4421 {
4422 struct cl_target_option cur_target;
4423 bool ret = true;
4424 tree old_optimize = build_optimization_node ();
4425 tree new_target, new_optimize;
4426 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4427
4428 /* If the function changed the optimization levels as well as setting target
4429 options, start with the optimizations specified. */
4430 if (func_optimize && func_optimize != old_optimize)
4431 cl_optimization_restore (&global_options,
4432 TREE_OPTIMIZATION (func_optimize));
4433
4434 /* The target attributes may also change some optimization flags, so update
4435 the optimization options if necessary. */
4436 cl_target_option_save (&cur_target, &global_options);
4437 new_target = ix86_valid_target_attribute_tree (args);
4438 new_optimize = build_optimization_node ();
4439
4440 if (!new_target)
4441 ret = false;
4442
4443 else if (fndecl)
4444 {
4445 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4446
4447 if (old_optimize != new_optimize)
4448 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4449 }
4450
4451 cl_target_option_restore (&global_options, &cur_target);
4452
4453 if (old_optimize != new_optimize)
4454 cl_optimization_restore (&global_options,
4455 TREE_OPTIMIZATION (old_optimize));
4456
4457 return ret;
4458 }
4459
4460 \f
4461 /* Hook to determine if one function can safely inline another. */
4462
4463 static bool
4464 ix86_can_inline_p (tree caller, tree callee)
4465 {
4466 bool ret = false;
4467 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4468 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4469
4470 /* If callee has no option attributes, then it is ok to inline. */
4471 if (!callee_tree)
4472 ret = true;
4473
4474 /* If caller has no option attributes but callee does, then it is not ok to
4475 inline. */
4476 else if (!caller_tree)
4477 ret = false;
4478
4479 else
4480 {
4481 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4482 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4483
4484 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4 function
4485 can inline an SSE2 function, but an SSE2 function can't inline an SSE4
4486 function. */
4487 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4488 != callee_opts->x_ix86_isa_flags)
4489 ret = false;
4490
4491 /* See if we have the same non-isa options. */
4492 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4493 ret = false;
4494
4495 /* See if arch, tune, etc. are the same. */
4496 else if (caller_opts->arch != callee_opts->arch)
4497 ret = false;
4498
4499 else if (caller_opts->tune != callee_opts->tune)
4500 ret = false;
4501
4502 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4503 ret = false;
4504
4505 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4506 ret = false;
4507
4508 else
4509 ret = true;
4510 }
4511
4512 return ret;
4513 }
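/* Illustrative consequence of the subset test above, assuming the two
   hypothetical functions differ only in their ISA flags:

       __attribute__((target("sse2")))   static int f (int);
       __attribute__((target("sse4.2")))        int g (int);

   f may be inlined into g, because f's ISA flags are a subset of g's, but
   g cannot be inlined into f.  */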
4514
4515 \f
4516 /* Remember the last target of ix86_set_current_function. */
4517 static GTY(()) tree ix86_previous_fndecl;
4518
4519 /* Establish appropriate back-end context for processing the function
4520 FNDECL. The argument might be NULL to indicate processing at top
4521 level, outside of any function scope. */
4522 static void
4523 ix86_set_current_function (tree fndecl)
4524 {
4525 /* Only change the context if the function changes. This hook is called
4526 several times in the course of compiling a function, and we don't want to
4527 slow things down too much or call target_reinit when it isn't safe. */
4528 if (fndecl && fndecl != ix86_previous_fndecl)
4529 {
4530 tree old_tree = (ix86_previous_fndecl
4531 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4532 : NULL_TREE);
4533
4534 tree new_tree = (fndecl
4535 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4536 : NULL_TREE);
4537
4538 ix86_previous_fndecl = fndecl;
4539 if (old_tree == new_tree)
4540 ;
4541
4542 else if (new_tree)
4543 {
4544 cl_target_option_restore (&global_options,
4545 TREE_TARGET_OPTION (new_tree));
4546 target_reinit ();
4547 }
4548
4549 else if (old_tree)
4550 {
4551 struct cl_target_option *def
4552 = TREE_TARGET_OPTION (target_option_current_node);
4553
4554 cl_target_option_restore (&global_options, def);
4555 target_reinit ();
4556 }
4557 }
4558 }
4559
4560 \f
4561 /* Return true if this goes in large data/bss. */
4562
4563 static bool
4564 ix86_in_large_data_p (tree exp)
4565 {
4566 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4567 return false;
4568
4569 /* Functions are never large data. */
4570 if (TREE_CODE (exp) == FUNCTION_DECL)
4571 return false;
4572
4573 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4574 {
4575 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4576 if (strcmp (section, ".ldata") == 0
4577 || strcmp (section, ".lbss") == 0)
4578 return true;
4579 return false;
4580 }
4581 else
4582 {
4583 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4584
4585 /* If this is an incomplete type with size 0, then we can't put it
4586 in data because it might be too big when completed. */
4587 if (!size || size > ix86_section_threshold)
4588 return true;
4589 }
4590
4591 return false;
4592 }
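/* Illustrative example: under -mcmodel=medium, a variable larger than
   ix86_section_threshold (adjustable with -mlarge-data-threshold=) is
   treated as large data and is placed in .ldata/.lbss by the section
   selection hooks below.  */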
4593
4594 /* Switch to the appropriate section for output of DECL.
4595 DECL is either a `VAR_DECL' node or a constant of some sort.
4596 RELOC indicates whether forming the initial value of DECL requires
4597 link-time relocations. */
4598
4599 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4600 ATTRIBUTE_UNUSED;
4601
4602 static section *
4603 x86_64_elf_select_section (tree decl, int reloc,
4604 unsigned HOST_WIDE_INT align)
4605 {
4606 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4607 && ix86_in_large_data_p (decl))
4608 {
4609 const char *sname = NULL;
4610 unsigned int flags = SECTION_WRITE;
4611 switch (categorize_decl_for_section (decl, reloc))
4612 {
4613 case SECCAT_DATA:
4614 sname = ".ldata";
4615 break;
4616 case SECCAT_DATA_REL:
4617 sname = ".ldata.rel";
4618 break;
4619 case SECCAT_DATA_REL_LOCAL:
4620 sname = ".ldata.rel.local";
4621 break;
4622 case SECCAT_DATA_REL_RO:
4623 sname = ".ldata.rel.ro";
4624 break;
4625 case SECCAT_DATA_REL_RO_LOCAL:
4626 sname = ".ldata.rel.ro.local";
4627 break;
4628 case SECCAT_BSS:
4629 sname = ".lbss";
4630 flags |= SECTION_BSS;
4631 break;
4632 case SECCAT_RODATA:
4633 case SECCAT_RODATA_MERGE_STR:
4634 case SECCAT_RODATA_MERGE_STR_INIT:
4635 case SECCAT_RODATA_MERGE_CONST:
4636 sname = ".lrodata";
4637 flags = 0;
4638 break;
4639 case SECCAT_SRODATA:
4640 case SECCAT_SDATA:
4641 case SECCAT_SBSS:
4642 gcc_unreachable ();
4643 case SECCAT_TEXT:
4644 case SECCAT_TDATA:
4645 case SECCAT_TBSS:
4646 /* We don't split these for the medium model. Place them into
4647 default sections and hope for the best. */
4648 break;
4649 }
4650 if (sname)
4651 {
4652 /* We might get called with string constants, but get_named_section
4653 doesn't like them as they are not DECLs. Also, we need to set
4654 flags in that case. */
4655 if (!DECL_P (decl))
4656 return get_section (sname, flags, NULL);
4657 return get_named_section (decl, sname, reloc);
4658 }
4659 }
4660 return default_elf_select_section (decl, reloc, align);
4661 }
4662
4663 /* Build up a unique section name, expressed as a
4664 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4665 RELOC indicates whether the initial value of EXP requires
4666 link-time relocations. */
4667
4668 static void ATTRIBUTE_UNUSED
4669 x86_64_elf_unique_section (tree decl, int reloc)
4670 {
4671 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4672 && ix86_in_large_data_p (decl))
4673 {
4674 const char *prefix = NULL;
4675 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4676 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4677
4678 switch (categorize_decl_for_section (decl, reloc))
4679 {
4680 case SECCAT_DATA:
4681 case SECCAT_DATA_REL:
4682 case SECCAT_DATA_REL_LOCAL:
4683 case SECCAT_DATA_REL_RO:
4684 case SECCAT_DATA_REL_RO_LOCAL:
4685 prefix = one_only ? ".ld" : ".ldata";
4686 break;
4687 case SECCAT_BSS:
4688 prefix = one_only ? ".lb" : ".lbss";
4689 break;
4690 case SECCAT_RODATA:
4691 case SECCAT_RODATA_MERGE_STR:
4692 case SECCAT_RODATA_MERGE_STR_INIT:
4693 case SECCAT_RODATA_MERGE_CONST:
4694 prefix = one_only ? ".lr" : ".lrodata";
4695 break;
4696 case SECCAT_SRODATA:
4697 case SECCAT_SDATA:
4698 case SECCAT_SBSS:
4699 gcc_unreachable ();
4700 case SECCAT_TEXT:
4701 case SECCAT_TDATA:
4702 case SECCAT_TBSS:
4703 /* We don't split these for the medium model. Place them into
4704 default sections and hope for the best. */
4705 break;
4706 }
4707 if (prefix)
4708 {
4709 const char *name, *linkonce;
4710 char *string;
4711
4712 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4713 name = targetm.strip_name_encoding (name);
4714
4715 /* If we're using one_only, then there needs to be a .gnu.linkonce
4716 prefix to the section name. */
4717 linkonce = one_only ? ".gnu.linkonce" : "";
4718
4719 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4720
4721 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4722 return;
4723 }
4724 }
4725 default_unique_section (decl, reloc);
4726 }
4727
4728 #ifdef COMMON_ASM_OP
4729 /* This says how to output assembler code to declare an
4730 uninitialized external linkage data object.
4731
4732 For medium model x86-64 we need to use the .largecomm directive for
4733 large objects. */
4734 void
4735 x86_elf_aligned_common (FILE *file,
4736 const char *name, unsigned HOST_WIDE_INT size,
4737 int align)
4738 {
4739 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4740 && size > (unsigned int)ix86_section_threshold)
4741 fputs (".largecomm\t", file);
4742 else
4743 fputs (COMMON_ASM_OP, file);
4744 assemble_name (file, name);
4745 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4746 size, align / BITS_PER_UNIT);
4747 }
4748 #endif
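/* Illustrative output of x86_elf_aligned_common above: a 1 MiB common
   symbol "buf" with 32-byte alignment that exceeds the section threshold
   under the medium code model comes out as

       .largecomm	buf,1048576,32

   while smaller objects keep using the regular COMMON_ASM_OP directive.  */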
4749
4750 /* Utility function for targets to use in implementing
4751 ASM_OUTPUT_ALIGNED_BSS. */
4752
4753 void
4754 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4755 const char *name, unsigned HOST_WIDE_INT size,
4756 int align)
4757 {
4758 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4759 && size > (unsigned int)ix86_section_threshold)
4760 switch_to_section (get_named_section (decl, ".lbss", 0));
4761 else
4762 switch_to_section (bss_section);
4763 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4764 #ifdef ASM_DECLARE_OBJECT_NAME
4765 last_assemble_variable_decl = decl;
4766 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4767 #else
4768 /* The standard thing is to just output a label for the object. */
4769 ASM_OUTPUT_LABEL (file, name);
4770 #endif /* ASM_DECLARE_OBJECT_NAME */
4771 ASM_OUTPUT_SKIP (file, size ? size : 1);
4772 }
4773 \f
4774 /* Decide whether we must probe the stack before any space allocation
4775 on this target. It's essentially TARGET_STACK_PROBE except when
4776 -fstack-check causes the stack to be already probed differently. */
4777
4778 bool
4779 ix86_target_stack_probe (void)
4780 {
4781 /* Do not probe the stack twice if static stack checking is enabled. */
4782 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4783 return false;
4784
4785 return TARGET_STACK_PROBE;
4786 }
4787 \f
4788 /* Decide whether we can make a sibling call to a function. DECL is the
4789 declaration of the function being targeted by the call and EXP is the
4790 CALL_EXPR representing the call. */
4791
4792 static bool
4793 ix86_function_ok_for_sibcall (tree decl, tree exp)
4794 {
4795 tree type, decl_or_type;
4796 rtx a, b;
4797
4798 /* If we are generating position-independent code, we cannot sibcall
4799 optimize any indirect call, or a direct call to a global function,
4800 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4801 if (!TARGET_MACHO
4802 && !TARGET_64BIT
4803 && flag_pic
4804 && (!decl || !targetm.binds_local_p (decl)))
4805 return false;
4806
4807 /* If we need to align the outgoing stack, then sibcalling would
4808 unalign the stack, which may break the called function. */
4809 if (ix86_minimum_incoming_stack_boundary (true)
4810 < PREFERRED_STACK_BOUNDARY)
4811 return false;
4812
4813 if (decl)
4814 {
4815 decl_or_type = decl;
4816 type = TREE_TYPE (decl);
4817 }
4818 else
4819 {
4820 /* We're looking at the CALL_EXPR, we need the type of the function. */
4821 type = CALL_EXPR_FN (exp); /* pointer expression */
4822 type = TREE_TYPE (type); /* pointer type */
4823 type = TREE_TYPE (type); /* function type */
4824 decl_or_type = type;
4825 }
4826
4827 /* Check that the return value locations are the same. Like
4828 if we are returning floats on the 80387 register stack, we cannot
4829 make a sibcall from a function that doesn't return a float to a
4830 function that does or, conversely, from a function that does return
4831 a float to a function that doesn't; the necessary stack adjustment
4832 would not be executed. This is also the place we notice
4833 differences in the return value ABI. Note that it is ok for one
4834 of the functions to have void return type as long as the return
4835 value of the other is passed in a register. */
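/* An illustrative (hypothetical) case of the x87 restriction above:
   given "double g (void);", a caller "void f (void) { g (); }" must not
   sibcall g, because the value g leaves in st(0) would then never be
   popped in f, corrupting the FP register stack for f's own caller. */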
4836 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4837 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4838 cfun->decl, false);
4839 if (STACK_REG_P (a) || STACK_REG_P (b))
4840 {
4841 if (!rtx_equal_p (a, b))
4842 return false;
4843 }
4844 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4845 {
4846 /* Disable sibcall if we need to generate vzeroupper after
4847 callee returns. */
4848 if (TARGET_VZEROUPPER
4849 && cfun->machine->callee_return_avx256_p
4850 && !cfun->machine->caller_return_avx256_p)
4851 return false;
4852 }
4853 else if (!rtx_equal_p (a, b))
4854 return false;
4855
4856 if (TARGET_64BIT)
4857 {
4858 /* The SYSV ABI has more call-clobbered registers;
4859 disallow sibcalls from MS to SYSV. */
4860 if (cfun->machine->call_abi == MS_ABI
4861 && ix86_function_type_abi (type) == SYSV_ABI)
4862 return false;
4863 }
4864 else
4865 {
4866 /* If this call is indirect, we'll need to be able to use a
4867 call-clobbered register for the address of the target function.
4868 Make sure that all such registers are not used for passing
4869 parameters. Note that DLLIMPORT functions are indirect. */
4870 if (!decl
4871 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4872 {
4873 if (ix86_function_regparm (type, NULL) >= 3)
4874 {
4875 /* ??? Need to count the actual number of registers to be used,
4876 not the possible number of registers. Fix later. */
4877 return false;
4878 }
4879 }
4880 }
4881
4882 /* Otherwise okay. That also includes certain types of indirect calls. */
4883 return true;
4884 }
4885
4886 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4887 and "sseregparm" calling convention attributes;
4888 arguments as in struct attribute_spec.handler. */
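/* For illustration only, these are the kinds of (hypothetical) user
   declarations the handler below validates:

     extern int f1 (int, int) __attribute__ ((regparm (3)));
     extern int f2 (int, int) __attribute__ ((fastcall));
     extern int f3 (int, int) __attribute__ ((stdcall));
     extern double f4 (double) __attribute__ ((sseregparm));

   Conflicting combinations, e.g. regparm together with fastcall, are
   diagnosed below. */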
4889
4890 static tree
4891 ix86_handle_cconv_attribute (tree *node, tree name,
4892 tree args,
4893 int flags ATTRIBUTE_UNUSED,
4894 bool *no_add_attrs)
4895 {
4896 if (TREE_CODE (*node) != FUNCTION_TYPE
4897 && TREE_CODE (*node) != METHOD_TYPE
4898 && TREE_CODE (*node) != FIELD_DECL
4899 && TREE_CODE (*node) != TYPE_DECL)
4900 {
4901 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4902 name);
4903 *no_add_attrs = true;
4904 return NULL_TREE;
4905 }
4906
4907 /* Can combine regparm with all attributes but fastcall and thiscall. */
4908 if (is_attribute_p ("regparm", name))
4909 {
4910 tree cst;
4911
4912 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4913 {
4914 error ("fastcall and regparm attributes are not compatible");
4915 }
4916
4917 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4918 {
4919 error ("regparm and thiscall attributes are not compatible");
4920 }
4921
4922 cst = TREE_VALUE (args);
4923 if (TREE_CODE (cst) != INTEGER_CST)
4924 {
4925 warning (OPT_Wattributes,
4926 "%qE attribute requires an integer constant argument",
4927 name);
4928 *no_add_attrs = true;
4929 }
4930 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4931 {
4932 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4933 name, REGPARM_MAX);
4934 *no_add_attrs = true;
4935 }
4936
4937 return NULL_TREE;
4938 }
4939
4940 if (TARGET_64BIT)
4941 {
4942 /* Do not warn when emulating the MS ABI. */
4943 if ((TREE_CODE (*node) != FUNCTION_TYPE
4944 && TREE_CODE (*node) != METHOD_TYPE)
4945 || ix86_function_type_abi (*node) != MS_ABI)
4946 warning (OPT_Wattributes, "%qE attribute ignored",
4947 name);
4948 *no_add_attrs = true;
4949 return NULL_TREE;
4950 }
4951
4952 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4953 if (is_attribute_p ("fastcall", name))
4954 {
4955 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4956 {
4957 error ("fastcall and cdecl attributes are not compatible");
4958 }
4959 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4960 {
4961 error ("fastcall and stdcall attributes are not compatible");
4962 }
4963 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4964 {
4965 error ("fastcall and regparm attributes are not compatible");
4966 }
4967 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4968 {
4969 error ("fastcall and thiscall attributes are not compatible");
4970 }
4971 }
4972
4973 /* Can combine stdcall with fastcall (redundant), regparm and
4974 sseregparm. */
4975 else if (is_attribute_p ("stdcall", name))
4976 {
4977 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4978 {
4979 error ("stdcall and cdecl attributes are not compatible");
4980 }
4981 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4982 {
4983 error ("stdcall and fastcall attributes are not compatible");
4984 }
4985 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4986 {
4987 error ("stdcall and thiscall attributes are not compatible");
4988 }
4989 }
4990
4991 /* Can combine cdecl with regparm and sseregparm. */
4992 else if (is_attribute_p ("cdecl", name))
4993 {
4994 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4995 {
4996 error ("stdcall and cdecl attributes are not compatible");
4997 }
4998 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4999 {
5000 error ("fastcall and cdecl attributes are not compatible");
5001 }
5002 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5003 {
5004 error ("cdecl and thiscall attributes are not compatible");
5005 }
5006 }
5007 else if (is_attribute_p ("thiscall", name))
5008 {
5009 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5010 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5011 name);
5012 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5013 {
5014 error ("stdcall and thiscall attributes are not compatible");
5015 }
5016 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5017 {
5018 error ("fastcall and thiscall attributes are not compatible");
5019 }
5020 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5021 {
5022 error ("cdecl and thiscall attributes are not compatible");
5023 }
5024 }
5025
5026 /* Can combine sseregparm with all attributes. */
5027
5028 return NULL_TREE;
5029 }
5030
5031 /* The transactional memory builtins are implicitly regparm or fastcall
5032 depending on the ABI. Override the generic do-nothing attribute that
5033 these builtins were declared with, and replace it with one of the two
5034 attributes that we expect elsewhere. */
5035
5036 static tree
5037 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5038 tree args ATTRIBUTE_UNUSED,
5039 int flags ATTRIBUTE_UNUSED,
5040 bool *no_add_attrs)
5041 {
5042 tree alt;
5043
5044 /* In no case do we want to add the placeholder attribute. */
5045 *no_add_attrs = true;
5046
5047 /* The 64-bit ABI is unchanged for transactional memory. */
5048 if (TARGET_64BIT)
5049 return NULL_TREE;
5050
5051 /* ??? Is there a better way to validate 32-bit Windows? We have
5052 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5053 if (CHECK_STACK_LIMIT > 0)
5054 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5055 else
5056 {
5057 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5058 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5059 }
5060 decl_attributes (node, alt, flags);
5061
5062 return NULL_TREE;
5063 }
5064
5065 /* This function determines the calling convention from TYPE. */
5066
5067 unsigned int
5068 ix86_get_callcvt (const_tree type)
5069 {
5070 unsigned int ret = 0;
5071 bool is_stdarg;
5072 tree attrs;
5073
5074 if (TARGET_64BIT)
5075 return IX86_CALLCVT_CDECL;
5076
5077 attrs = TYPE_ATTRIBUTES (type);
5078 if (attrs != NULL_TREE)
5079 {
5080 if (lookup_attribute ("cdecl", attrs))
5081 ret |= IX86_CALLCVT_CDECL;
5082 else if (lookup_attribute ("stdcall", attrs))
5083 ret |= IX86_CALLCVT_STDCALL;
5084 else if (lookup_attribute ("fastcall", attrs))
5085 ret |= IX86_CALLCVT_FASTCALL;
5086 else if (lookup_attribute ("thiscall", attrs))
5087 ret |= IX86_CALLCVT_THISCALL;
5088
5089 /* Regparm isn't allowed for thiscall and fastcall. */
5090 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5091 {
5092 if (lookup_attribute ("regparm", attrs))
5093 ret |= IX86_CALLCVT_REGPARM;
5094 if (lookup_attribute ("sseregparm", attrs))
5095 ret |= IX86_CALLCVT_SSEREGPARM;
5096 }
5097
5098 if (IX86_BASE_CALLCVT (ret) != 0)
5099 return ret;
5100 }
5101
5102 is_stdarg = stdarg_p (type);
5103 if (TARGET_RTD && !is_stdarg)
5104 return IX86_CALLCVT_STDCALL | ret;
5105
5106 if (ret != 0
5107 || is_stdarg
5108 || TREE_CODE (type) != METHOD_TYPE
5109 || ix86_function_type_abi (type) != MS_ABI)
5110 return IX86_CALLCVT_CDECL | ret;
5111
5112 return IX86_CALLCVT_THISCALL;
5113 }
5114
5115 /* Return 0 if the attributes for two types are incompatible, 1 if they
5116 are compatible, and 2 if they are nearly compatible (which causes a
5117 warning to be generated). */
5118
5119 static int
5120 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5121 {
5122 unsigned int ccvt1, ccvt2;
5123
5124 if (TREE_CODE (type1) != FUNCTION_TYPE
5125 && TREE_CODE (type1) != METHOD_TYPE)
5126 return 1;
5127
5128 ccvt1 = ix86_get_callcvt (type1);
5129 ccvt2 = ix86_get_callcvt (type2);
5130 if (ccvt1 != ccvt2)
5131 return 0;
5132 if (ix86_function_regparm (type1, NULL)
5133 != ix86_function_regparm (type2, NULL))
5134 return 0;
5135
5136 return 1;
5137 }
5138 \f
5139 /* Return the regparm value for a function with the indicated TYPE and DECL.
5140 DECL may be NULL when calling function indirectly
5141 or considering a libcall. */
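/* Illustrative (hypothetical) example: for

     extern int sum3 (int, int, int) __attribute__ ((regparm (3)));

   this returns 3, and the three integer arguments are passed in EAX,
   EDX and ECX instead of on the stack. */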
5142
5143 static int
5144 ix86_function_regparm (const_tree type, const_tree decl)
5145 {
5146 tree attr;
5147 int regparm;
5148 unsigned int ccvt;
5149
5150 if (TARGET_64BIT)
5151 return (ix86_function_type_abi (type) == SYSV_ABI
5152 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5153 ccvt = ix86_get_callcvt (type);
5154 regparm = ix86_regparm;
5155
5156 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5157 {
5158 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5159 if (attr)
5160 {
5161 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5162 return regparm;
5163 }
5164 }
5165 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5166 return 2;
5167 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5168 return 1;
5169
5170 /* Use register calling convention for local functions when possible. */
5171 if (decl
5172 && TREE_CODE (decl) == FUNCTION_DECL
5173 && optimize
5174 && !(profile_flag && !flag_fentry))
5175 {
5176 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5177 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5178 if (i && i->local && i->can_change_signature)
5179 {
5180 int local_regparm, globals = 0, regno;
5181
5182 /* Make sure no regparm register is taken by a
5183 fixed register variable. */
5184 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5185 if (fixed_regs[local_regparm])
5186 break;
5187
5188 /* We don't want to use regparm(3) for nested functions as
5189 these use a static chain pointer in the third argument. */
5190 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5191 local_regparm = 2;
5192
5193 /* In 32-bit mode save a register for the split stack. */
5194 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5195 local_regparm = 2;
5196
5197 /* Each fixed register usage increases register pressure,
5198 so fewer registers should be used for argument passing.
5199 This behavior can be overridden by an explicit
5200 regparm value. */
5201 for (regno = 0; regno <= DI_REG; regno++)
5202 if (fixed_regs[regno])
5203 globals++;
5204
5205 local_regparm
5206 = globals < local_regparm ? local_regparm - globals : 0;
5207
5208 if (local_regparm > regparm)
5209 regparm = local_regparm;
5210 }
5211 }
5212
5213 return regparm;
5214 }
5215
5216 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5217 DFmode (2) arguments in SSE registers for a function with the
5218 indicated TYPE and DECL. DECL may be NULL when calling function
5219 indirectly or considering a libcall. Otherwise return 0. */
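/* Illustrative (hypothetical) example: for

     extern double scale (double, double) __attribute__ ((sseregparm));

   this returns 2 when SSE is enabled, so the DFmode arguments are
   passed in SSE registers rather than in memory. */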
5220
5221 static int
5222 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5223 {
5224 gcc_assert (!TARGET_64BIT);
5225
5226 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5227 by the sseregparm attribute. */
5228 if (TARGET_SSEREGPARM
5229 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5230 {
5231 if (!TARGET_SSE)
5232 {
5233 if (warn)
5234 {
5235 if (decl)
5236 error ("calling %qD with attribute sseregparm without "
5237 "SSE/SSE2 enabled", decl);
5238 else
5239 error ("calling %qT with attribute sseregparm without "
5240 "SSE/SSE2 enabled", type);
5241 }
5242 return 0;
5243 }
5244
5245 return 2;
5246 }
5247
5248 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5249 (and DFmode for SSE2) arguments in SSE registers. */
5250 if (decl && TARGET_SSE_MATH && optimize
5251 && !(profile_flag && !flag_fentry))
5252 {
5253 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5254 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5255 if (i && i->local && i->can_change_signature)
5256 return TARGET_SSE2 ? 2 : 1;
5257 }
5258
5259 return 0;
5260 }
5261
5262 /* Return true if EAX is live at the start of the function. Used by
5263 ix86_expand_prologue to determine if we need special help before
5264 calling allocate_stack_worker. */
5265
5266 static bool
5267 ix86_eax_live_at_start_p (void)
5268 {
5269 /* Cheat. Don't bother working forward from ix86_function_regparm
5270 to the function type to whether an actual argument is located in
5271 eax. Instead just look at cfg info, which is still close enough
5272 to correct at this point. This gives false positives for broken
5273 functions that might use uninitialized data that happens to be
5274 allocated in eax, but who cares? */
5275 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5276 }
5277
5278 static bool
5279 ix86_keep_aggregate_return_pointer (tree fntype)
5280 {
5281 tree attr;
5282
5283 if (!TARGET_64BIT)
5284 {
5285 attr = lookup_attribute ("callee_pop_aggregate_return",
5286 TYPE_ATTRIBUTES (fntype));
5287 if (attr)
5288 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5289
5290 /* For 32-bit MS-ABI the default is to keep aggregate
5291 return pointer. */
5292 if (ix86_function_type_abi (fntype) == MS_ABI)
5293 return true;
5294 }
5295 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5296 }
5297
5298 /* Value is the number of bytes of arguments automatically
5299 popped when returning from a subroutine call.
5300 FUNDECL is the declaration node of the function (as a tree),
5301 FUNTYPE is the data type of the function (as a tree),
5302 or for a library call it is an identifier node for the subroutine name.
5303 SIZE is the number of bytes of arguments passed on the stack.
5304
5305 On the 80386, the RTD insn may be used to pop them if the number
5306 of args is fixed, but if the number is variable then the caller
5307 must pop them all. RTD can't be used for library calls now
5308 because the library is compiled with the Unix compiler.
5309 Use of RTD is a selectable option, since it is incompatible with
5310 standard Unix calling sequences. If the option is not selected,
5311 the caller must always pop the args.
5312
5313 The attribute stdcall is equivalent to RTD on a per module basis. */
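/* Illustrative (hypothetical) example: for

     extern void cb (int, int) __attribute__ ((stdcall));

   SIZE is 8 on ia32 and cb is not stdarg, so 8 is returned and the
   callee pops its own arguments, typically with a "ret $8". */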
5314
5315 static int
5316 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5317 {
5318 unsigned int ccvt;
5319
5320 /* None of the 64-bit ABIs pop arguments. */
5321 if (TARGET_64BIT)
5322 return 0;
5323
5324 ccvt = ix86_get_callcvt (funtype);
5325
5326 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5327 | IX86_CALLCVT_THISCALL)) != 0
5328 && ! stdarg_p (funtype))
5329 return size;
5330
5331 /* Lose any fake structure return argument if it is passed on the stack. */
5332 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5333 && !ix86_keep_aggregate_return_pointer (funtype))
5334 {
5335 int nregs = ix86_function_regparm (funtype, fundecl);
5336 if (nregs == 0)
5337 return GET_MODE_SIZE (Pmode);
5338 }
5339
5340 return 0;
5341 }
5342 \f
5343 /* Argument support functions. */
5344
5345 /* Return true when register may be used to pass function parameters. */
5346 bool
5347 ix86_function_arg_regno_p (int regno)
5348 {
5349 int i;
5350 const int *parm_regs;
5351
5352 if (!TARGET_64BIT)
5353 {
5354 if (TARGET_MACHO)
5355 return (regno < REGPARM_MAX
5356 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5357 else
5358 return (regno < REGPARM_MAX
5359 || (TARGET_MMX && MMX_REGNO_P (regno)
5360 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5361 || (TARGET_SSE && SSE_REGNO_P (regno)
5362 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5363 }
5364
5365 if (TARGET_MACHO)
5366 {
5367 if (SSE_REGNO_P (regno) && TARGET_SSE)
5368 return true;
5369 }
5370 else
5371 {
5372 if (TARGET_SSE && SSE_REGNO_P (regno)
5373 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5374 return true;
5375 }
5376
5377 /* TODO: The function should depend on current function ABI but
5378 builtins.c would need updating then. Therefore we use the
5379 default ABI. */
5380
5381 /* RAX is used as a hidden argument to varargs functions. */
5382 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5383 return true;
5384
5385 if (ix86_abi == MS_ABI)
5386 parm_regs = x86_64_ms_abi_int_parameter_registers;
5387 else
5388 parm_regs = x86_64_int_parameter_registers;
5389 for (i = 0; i < (ix86_abi == MS_ABI
5390 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5391 if (regno == parm_regs[i])
5392 return true;
5393 return false;
5394 }
5395
5396 /* Return true if we do not know how to pass TYPE solely in registers. */
5397
5398 static bool
5399 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5400 {
5401 if (must_pass_in_stack_var_size_or_pad (mode, type))
5402 return true;
5403
5404 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5405 The layout_type routine is crafty and tries to trick us into passing
5406 currently unsupported vector types on the stack by using TImode. */
5407 return (!TARGET_64BIT && mode == TImode
5408 && type && TREE_CODE (type) != VECTOR_TYPE);
5409 }
5410
5411 /* Return the size, in bytes, of the area reserved for arguments passed
5412 in registers for the function represented by FNDECL, depending on the
5413 ABI used. */
5414 int
5415 ix86_reg_parm_stack_space (const_tree fndecl)
5416 {
5417 enum calling_abi call_abi = SYSV_ABI;
5418 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5419 call_abi = ix86_function_abi (fndecl);
5420 else
5421 call_abi = ix86_function_type_abi (fndecl);
5422 if (TARGET_64BIT && call_abi == MS_ABI)
5423 return 32;
5424 return 0;
5425 }
5426
5427 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5428 calling ABI used. */
5429 enum calling_abi
5430 ix86_function_type_abi (const_tree fntype)
5431 {
5432 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5433 {
5434 enum calling_abi abi = ix86_abi;
5435 if (abi == SYSV_ABI)
5436 {
5437 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5438 abi = MS_ABI;
5439 }
5440 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5441 abi = SYSV_ABI;
5442 return abi;
5443 }
5444 return ix86_abi;
5445 }
5446
5447 static bool
5448 ix86_function_ms_hook_prologue (const_tree fn)
5449 {
5450 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5451 {
5452 if (decl_function_context (fn) != NULL_TREE)
5453 error_at (DECL_SOURCE_LOCATION (fn),
5454 "ms_hook_prologue is not compatible with nested function");
5455 else
5456 return true;
5457 }
5458 return false;
5459 }
5460
5461 static enum calling_abi
5462 ix86_function_abi (const_tree fndecl)
5463 {
5464 if (! fndecl)
5465 return ix86_abi;
5466 return ix86_function_type_abi (TREE_TYPE (fndecl));
5467 }
5468
5469 /* Return SYSV_ABI or MS_ABI, depending on CFUN, specifying the
5470 calling ABI used. */
5471 enum calling_abi
5472 ix86_cfun_abi (void)
5473 {
5474 if (! cfun)
5475 return ix86_abi;
5476 return cfun->machine->call_abi;
5477 }
5478
5479 /* Write the extra assembler code needed to declare a function properly. */
5480
5481 void
5482 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5483 tree decl)
5484 {
5485 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5486
5487 if (is_ms_hook)
5488 {
5489 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5490 unsigned int filler_cc = 0xcccccccc;
5491
5492 for (i = 0; i < filler_count; i += 4)
5493 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5494 }
5495
5496 #ifdef SUBTARGET_ASM_UNWIND_INIT
5497 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5498 #endif
5499
5500 ASM_OUTPUT_LABEL (asm_out_file, fname);
5501
5502 /* Output magic byte marker, if hot-patch attribute is set. */
5503 if (is_ms_hook)
5504 {
5505 if (TARGET_64BIT)
5506 {
5507 /* leaq 0(%rsp), %rsp */
5508 asm_fprintf (asm_out_file, ASM_BYTE
5509 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5510 }
5511 else
5512 {
5513 /* movl.s %edi, %edi
5514 push %ebp
5515 movl.s %esp, %ebp */
5516 asm_fprintf (asm_out_file, ASM_BYTE
5517 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5518 }
5519 }
5520 }
5521
5522 /* regclass.c */
5523 extern void init_regs (void);
5524
5525 /* Implementation of the call ABI switching target hook. The call
5526 register sets specific to FNDECL are selected. See also
5527 ix86_conditional_register_usage for more details. */
5528 void
5529 ix86_call_abi_override (const_tree fndecl)
5530 {
5531 if (fndecl == NULL_TREE)
5532 cfun->machine->call_abi = ix86_abi;
5533 else
5534 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5535 }
5536
5537 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5538 Avoid expensive re-initialization via init_regs each time we switch
5539 function context, since this is needed only during RTL expansion. */
5540 static void
5541 ix86_maybe_switch_abi (void)
5542 {
5543 if (TARGET_64BIT &&
5544 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5545 reinit_regs ();
5546 }
5547
5548 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5549 for a call to a function whose data type is FNTYPE.
5550 For a library call, FNTYPE is 0. */
5551
5552 void
5553 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5554 tree fntype, /* tree ptr for function decl */
5555 rtx libname, /* SYMBOL_REF of library name or 0 */
5556 tree fndecl,
5557 int caller)
5558 {
5559 struct cgraph_local_info *i;
5560 tree fnret_type;
5561
5562 memset (cum, 0, sizeof (*cum));
5563
5564 /* Initialize for the current callee. */
5565 if (caller)
5566 {
5567 cfun->machine->callee_pass_avx256_p = false;
5568 cfun->machine->callee_return_avx256_p = false;
5569 }
5570
5571 if (fndecl)
5572 {
5573 i = cgraph_local_info (fndecl);
5574 cum->call_abi = ix86_function_abi (fndecl);
5575 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5576 }
5577 else
5578 {
5579 i = NULL;
5580 cum->call_abi = ix86_function_type_abi (fntype);
5581 if (fntype)
5582 fnret_type = TREE_TYPE (fntype);
5583 else
5584 fnret_type = NULL;
5585 }
5586
5587 if (TARGET_VZEROUPPER && fnret_type)
5588 {
5589 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5590 false);
5591 if (function_pass_avx256_p (fnret_value))
5592 {
5593 /* The return value of this function uses 256bit AVX modes. */
5594 if (caller)
5595 cfun->machine->callee_return_avx256_p = true;
5596 else
5597 cfun->machine->caller_return_avx256_p = true;
5598 }
5599 }
5600
5601 cum->caller = caller;
5602
5603 /* Set up the number of registers to use for passing arguments. */
5604
5605 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5606 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5607 "or subtarget optimization implying it");
5608 cum->nregs = ix86_regparm;
5609 if (TARGET_64BIT)
5610 {
5611 cum->nregs = (cum->call_abi == SYSV_ABI
5612 ? X86_64_REGPARM_MAX
5613 : X86_64_MS_REGPARM_MAX);
5614 }
5615 if (TARGET_SSE)
5616 {
5617 cum->sse_nregs = SSE_REGPARM_MAX;
5618 if (TARGET_64BIT)
5619 {
5620 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5621 ? X86_64_SSE_REGPARM_MAX
5622 : X86_64_MS_SSE_REGPARM_MAX);
5623 }
5624 }
5625 if (TARGET_MMX)
5626 cum->mmx_nregs = MMX_REGPARM_MAX;
5627 cum->warn_avx = true;
5628 cum->warn_sse = true;
5629 cum->warn_mmx = true;
5630
5631 /* Because the type might mismatch between caller and callee, we need to
5632 use the actual type of the function for local calls.
5633 FIXME: cgraph_analyze can be told to actually record if a function uses
5634 va_start, so for local functions maybe_vaarg can be made more aggressive,
5635 helping K&R code.
5636 FIXME: once the type system is fixed, we won't need this code anymore. */
5637 if (i && i->local && i->can_change_signature)
5638 fntype = TREE_TYPE (fndecl);
5639 cum->maybe_vaarg = (fntype
5640 ? (!prototype_p (fntype) || stdarg_p (fntype))
5641 : !libname);
5642
5643 if (!TARGET_64BIT)
5644 {
5645 /* If there are variable arguments, then we won't pass anything
5646 in registers in 32-bit mode. */
5647 if (stdarg_p (fntype))
5648 {
5649 cum->nregs = 0;
5650 cum->sse_nregs = 0;
5651 cum->mmx_nregs = 0;
5652 cum->warn_avx = 0;
5653 cum->warn_sse = 0;
5654 cum->warn_mmx = 0;
5655 return;
5656 }
5657
5658 /* Use ecx and edx registers if function has fastcall attribute,
5659 else look for regparm information. */
5660 if (fntype)
5661 {
5662 unsigned int ccvt = ix86_get_callcvt (fntype);
5663 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5664 {
5665 cum->nregs = 1;
5666 cum->fastcall = 1; /* Same first register as in fastcall. */
5667 }
5668 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5669 {
5670 cum->nregs = 2;
5671 cum->fastcall = 1;
5672 }
5673 else
5674 cum->nregs = ix86_function_regparm (fntype, fndecl);
5675 }
5676
5677 /* Set up the number of SSE registers used for passing SFmode
5678 and DFmode arguments. Warn for mismatching ABI. */
5679 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5680 }
5681 }
5682
5683 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5684 But in the case of vector types, it is some vector mode.
5685
5686 When we have only some of our vector isa extensions enabled, then there
5687 are some modes for which vector_mode_supported_p is false. For these
5688 modes, the generic vector support in gcc will choose some non-vector mode
5689 in order to implement the type. By computing the natural mode, we'll
5690 select the proper ABI location for the operand and not depend on whatever
5691 the middle-end decides to do with these vector types.
5692
5693 The middle-end can't deal with vector types > 16 bytes. In this
5694 case, we return the original mode and warn about the ABI change if
5695 CUM isn't NULL. */
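/* Illustrative (hypothetical) example: for

     typedef float v4sf __attribute__ ((vector_size (16)));

   the natural mode is V4SFmode, and it is used for ABI purposes even
   when SSE is disabled and the middle-end falls back to a non-vector
   mode to implement the type. */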
5696
5697 static enum machine_mode
5698 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5699 {
5700 enum machine_mode mode = TYPE_MODE (type);
5701
5702 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5703 {
5704 HOST_WIDE_INT size = int_size_in_bytes (type);
5705 if ((size == 8 || size == 16 || size == 32)
5706 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5707 && TYPE_VECTOR_SUBPARTS (type) > 1)
5708 {
5709 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5710
5711 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5712 mode = MIN_MODE_VECTOR_FLOAT;
5713 else
5714 mode = MIN_MODE_VECTOR_INT;
5715
5716 /* Get the mode which has this inner mode and number of units. */
5717 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5718 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5719 && GET_MODE_INNER (mode) == innermode)
5720 {
5721 if (size == 32 && !TARGET_AVX)
5722 {
5723 static bool warnedavx;
5724
5725 if (cum
5726 && !warnedavx
5727 && cum->warn_avx)
5728 {
5729 warnedavx = true;
5730 warning (0, "AVX vector argument without AVX "
5731 "enabled changes the ABI");
5732 }
5733 return TYPE_MODE (type);
5734 }
5735 else
5736 return mode;
5737 }
5738
5739 gcc_unreachable ();
5740 }
5741 }
5742
5743 return mode;
5744 }
5745
5746 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5747 this may not agree with the mode that the type system has chosen for the
5748 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5749 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5750
5751 static rtx
5752 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5753 unsigned int regno)
5754 {
5755 rtx tmp;
5756
5757 if (orig_mode != BLKmode)
5758 tmp = gen_rtx_REG (orig_mode, regno);
5759 else
5760 {
5761 tmp = gen_rtx_REG (mode, regno);
5762 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5763 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5764 }
5765
5766 return tmp;
5767 }
5768
5769 /* x86-64 register passing implementation. See the x86-64 ABI for details.
5770 The goal of this code is to classify each 8-byte chunk of the incoming
5771 argument by register class and assign registers accordingly. */
5772
5773 /* Return the union class of CLASS1 and CLASS2.
5774 See the x86-64 PS ABI for details. */
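/* Illustrative (hypothetical) example of the rules below: for a
   "union { int i; float f; }" the int is INTEGERSI and the float is
   SSESF; rule #4 merges them to INTEGERSI, so the union is passed in
   an integer register. */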
5775
5776 static enum x86_64_reg_class
5777 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5778 {
5779 /* Rule #1: If both classes are equal, this is the resulting class. */
5780 if (class1 == class2)
5781 return class1;
5782
5783 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5784 the other class. */
5785 if (class1 == X86_64_NO_CLASS)
5786 return class2;
5787 if (class2 == X86_64_NO_CLASS)
5788 return class1;
5789
5790 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5791 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5792 return X86_64_MEMORY_CLASS;
5793
5794 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5795 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5796 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5797 return X86_64_INTEGERSI_CLASS;
5798 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5799 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5800 return X86_64_INTEGER_CLASS;
5801
5802 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5803 MEMORY is used. */
5804 if (class1 == X86_64_X87_CLASS
5805 || class1 == X86_64_X87UP_CLASS
5806 || class1 == X86_64_COMPLEX_X87_CLASS
5807 || class2 == X86_64_X87_CLASS
5808 || class2 == X86_64_X87UP_CLASS
5809 || class2 == X86_64_COMPLEX_X87_CLASS)
5810 return X86_64_MEMORY_CLASS;
5811
5812 /* Rule #6: Otherwise class SSE is used. */
5813 return X86_64_SSE_CLASS;
5814 }
5815
5816 /* Classify the argument of type TYPE and mode MODE.
5817 CLASSES will be filled by the register class used to pass each word
5818 of the operand. The number of words is returned. In case the parameter
5819 should be passed in memory, 0 is returned. As a special case for zero
5820 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5821
5822 BIT_OFFSET is used internally for handling records and specifies the
5823 offset, in bits, modulo 256 to avoid overflow cases.
5824
5825 See the x86-64 PS ABI for details.
5826 */
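/* Illustrative (hypothetical) example, following the rules below: for

     struct s { double d; int i; };

   the first eightbyte is classified X86_64_SSEDF_CLASS and the second
   X86_64_INTEGERSI_CLASS, so 2 is returned and the value is passed in
   one SSE and one integer register. */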
5827
5828 static int
5829 classify_argument (enum machine_mode mode, const_tree type,
5830 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5831 {
5832 HOST_WIDE_INT bytes =
5833 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5834 int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5835
5836 /* Variable sized entities are always passed/returned in memory. */
5837 if (bytes < 0)
5838 return 0;
5839
5840 if (mode != VOIDmode
5841 && targetm.calls.must_pass_in_stack (mode, type))
5842 return 0;
5843
5844 if (type && AGGREGATE_TYPE_P (type))
5845 {
5846 int i;
5847 tree field;
5848 enum x86_64_reg_class subclasses[MAX_CLASSES];
5849
5850 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5851 if (bytes > 32)
5852 return 0;
5853
5854 for (i = 0; i < words; i++)
5855 classes[i] = X86_64_NO_CLASS;
5856
5857 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
5858 signal the memory class, so handle this as a special case. */
5859 if (!words)
5860 {
5861 classes[0] = X86_64_NO_CLASS;
5862 return 1;
5863 }
5864
5865 /* Classify each field of record and merge classes. */
5866 switch (TREE_CODE (type))
5867 {
5868 case RECORD_TYPE:
5869 /* And now merge the fields of structure. */
5870 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5871 {
5872 if (TREE_CODE (field) == FIELD_DECL)
5873 {
5874 int num;
5875
5876 if (TREE_TYPE (field) == error_mark_node)
5877 continue;
5878
5879 /* Bitfields are always classified as integer. Handle them
5880 early, since later code would consider them to be
5881 misaligned integers. */
5882 if (DECL_BIT_FIELD (field))
5883 {
5884 for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5885 i < ((int_bit_position (field) + (bit_offset % 64))
5886 + tree_low_cst (DECL_SIZE (field), 0)
5887 + 63) / 8 / 8; i++)
5888 classes[i] =
5889 merge_classes (X86_64_INTEGER_CLASS,
5890 classes[i]);
5891 }
5892 else
5893 {
5894 int pos;
5895
5896 type = TREE_TYPE (field);
5897
5898 /* Flexible array member is ignored. */
5899 if (TYPE_MODE (type) == BLKmode
5900 && TREE_CODE (type) == ARRAY_TYPE
5901 && TYPE_SIZE (type) == NULL_TREE
5902 && TYPE_DOMAIN (type) != NULL_TREE
5903 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5904 == NULL_TREE))
5905 {
5906 static bool warned;
5907
5908 if (!warned && warn_psabi)
5909 {
5910 warned = true;
5911 inform (input_location,
5912 "the ABI of passing struct with"
5913 " a flexible array member has"
5914 " changed in GCC 4.4");
5915 }
5916 continue;
5917 }
5918 num = classify_argument (TYPE_MODE (type), type,
5919 subclasses,
5920 (int_bit_position (field)
5921 + bit_offset) % 256);
5922 if (!num)
5923 return 0;
5924 pos = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5925 for (i = 0; i < num && (i + pos) < words; i++)
5926 classes[i + pos] =
5927 merge_classes (subclasses[i], classes[i + pos]);
5928 }
5929 }
5930 }
5931 break;
5932
5933 case ARRAY_TYPE:
5934 /* Arrays are handled as small records. */
5935 {
5936 int num;
5937 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5938 TREE_TYPE (type), subclasses, bit_offset);
5939 if (!num)
5940 return 0;
5941
5942 /* The partial classes are now full classes. */
5943 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5944 subclasses[0] = X86_64_SSE_CLASS;
5945 if (subclasses[0] == X86_64_INTEGERSI_CLASS
5946 && !((bit_offset % 64) == 0 && bytes == 4))
5947 subclasses[0] = X86_64_INTEGER_CLASS;
5948
5949 for (i = 0; i < words; i++)
5950 classes[i] = subclasses[i % num];
5951
5952 break;
5953 }
5954 case UNION_TYPE:
5955 case QUAL_UNION_TYPE:
5956 /* Unions are similar to RECORD_TYPE but offset is always 0.
5957 */
5958 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5959 {
5960 if (TREE_CODE (field) == FIELD_DECL)
5961 {
5962 int num;
5963
5964 if (TREE_TYPE (field) == error_mark_node)
5965 continue;
5966
5967 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
5968 TREE_TYPE (field), subclasses,
5969 bit_offset);
5970 if (!num)
5971 return 0;
5972 for (i = 0; i < num; i++)
5973 classes[i] = merge_classes (subclasses[i], classes[i]);
5974 }
5975 }
5976 break;
5977
5978 default:
5979 gcc_unreachable ();
5980 }
5981
5982 if (words > 2)
5983 {
5984 /* When the size > 16 bytes, if the first eightbyte isn't
5985 X86_64_SSE_CLASS or any of the other ones isn't
5986 X86_64_SSEUP_CLASS, everything should be passed in
5987 memory. */
5988 if (classes[0] != X86_64_SSE_CLASS)
5989 return 0;
5990
5991 for (i = 1; i < words; i++)
5992 if (classes[i] != X86_64_SSEUP_CLASS)
5993 return 0;
5994 }
5995
5996 /* Final merger cleanup. */
5997 for (i = 0; i < words; i++)
5998 {
5999 /* If one class is MEMORY, everything should be passed in
6000 memory. */
6001 if (classes[i] == X86_64_MEMORY_CLASS)
6002 return 0;
6003
6004 /* X86_64_SSEUP_CLASS should always be preceded by
6005 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6006 if (classes[i] == X86_64_SSEUP_CLASS
6007 && classes[i - 1] != X86_64_SSE_CLASS
6008 && classes[i - 1] != X86_64_SSEUP_CLASS)
6009 {
6010 /* The first one should never be X86_64_SSEUP_CLASS. */
6011 gcc_assert (i != 0);
6012 classes[i] = X86_64_SSE_CLASS;
6013 }
6014
6015 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6016 everything should be passed in memory. */
6017 if (classes[i] == X86_64_X87UP_CLASS
6018 && (classes[i - 1] != X86_64_X87_CLASS))
6019 {
6020 static bool warned;
6021
6022 /* The first one should never be X86_64_X87UP_CLASS. */
6023 gcc_assert (i != 0);
6024 if (!warned && warn_psabi)
6025 {
6026 warned = true;
6027 inform (input_location,
6028 "the ABI of passing union with long double"
6029 " has changed in GCC 4.4");
6030 }
6031 return 0;
6032 }
6033 }
6034 return words;
6035 }
6036
6037 /* Compute the alignment needed. We align all types to their natural
6038 boundaries, with the exception of XFmode, which is aligned to 128 bits. */
6039 if (mode != VOIDmode && mode != BLKmode)
6040 {
6041 int mode_alignment = GET_MODE_BITSIZE (mode);
6042
6043 if (mode == XFmode)
6044 mode_alignment = 128;
6045 else if (mode == XCmode)
6046 mode_alignment = 256;
6047 if (COMPLEX_MODE_P (mode))
6048 mode_alignment /= 2;
6049 /* Misaligned fields are always returned in memory. */
6050 if (bit_offset % mode_alignment)
6051 return 0;
6052 }
6053
6054 /* For V1xx modes, just use the base mode. */
6055 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6056 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6057 mode = GET_MODE_INNER (mode);
6058
6059 /* Classification of atomic types. */
6060 switch (mode)
6061 {
6062 case SDmode:
6063 case DDmode:
6064 classes[0] = X86_64_SSE_CLASS;
6065 return 1;
6066 case TDmode:
6067 classes[0] = X86_64_SSE_CLASS;
6068 classes[1] = X86_64_SSEUP_CLASS;
6069 return 2;
6070 case DImode:
6071 case SImode:
6072 case HImode:
6073 case QImode:
6074 case CSImode:
6075 case CHImode:
6076 case CQImode:
6077 {
6078 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6079
6080 if (size <= 32)
6081 {
6082 classes[0] = X86_64_INTEGERSI_CLASS;
6083 return 1;
6084 }
6085 else if (size <= 64)
6086 {
6087 classes[0] = X86_64_INTEGER_CLASS;
6088 return 1;
6089 }
6090 else if (size <= 64+32)
6091 {
6092 classes[0] = X86_64_INTEGER_CLASS;
6093 classes[1] = X86_64_INTEGERSI_CLASS;
6094 return 2;
6095 }
6096 else if (size <= 64+64)
6097 {
6098 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6099 return 2;
6100 }
6101 else
6102 gcc_unreachable ();
6103 }
6104 case CDImode:
6105 case TImode:
6106 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6107 return 2;
6108 case COImode:
6109 case OImode:
6110 /* OImode shouldn't be used directly. */
6111 gcc_unreachable ();
6112 case CTImode:
6113 return 0;
6114 case SFmode:
6115 if (!(bit_offset % 64))
6116 classes[0] = X86_64_SSESF_CLASS;
6117 else
6118 classes[0] = X86_64_SSE_CLASS;
6119 return 1;
6120 case DFmode:
6121 classes[0] = X86_64_SSEDF_CLASS;
6122 return 1;
6123 case XFmode:
6124 classes[0] = X86_64_X87_CLASS;
6125 classes[1] = X86_64_X87UP_CLASS;
6126 return 2;
6127 case TFmode:
6128 classes[0] = X86_64_SSE_CLASS;
6129 classes[1] = X86_64_SSEUP_CLASS;
6130 return 2;
6131 case SCmode:
6132 classes[0] = X86_64_SSE_CLASS;
6133 if (!(bit_offset % 64))
6134 return 1;
6135 else
6136 {
6137 static bool warned;
6138
6139 if (!warned && warn_psabi)
6140 {
6141 warned = true;
6142 inform (input_location,
6143 "the ABI of passing structure with complex float"
6144 " member has changed in GCC 4.4");
6145 }
6146 classes[1] = X86_64_SSESF_CLASS;
6147 return 2;
6148 }
6149 case DCmode:
6150 classes[0] = X86_64_SSEDF_CLASS;
6151 classes[1] = X86_64_SSEDF_CLASS;
6152 return 2;
6153 case XCmode:
6154 classes[0] = X86_64_COMPLEX_X87_CLASS;
6155 return 1;
6156 case TCmode:
6157 /* This mode is larger than 16 bytes. */
6158 return 0;
6159 case V8SFmode:
6160 case V8SImode:
6161 case V32QImode:
6162 case V16HImode:
6163 case V4DFmode:
6164 case V4DImode:
6165 classes[0] = X86_64_SSE_CLASS;
6166 classes[1] = X86_64_SSEUP_CLASS;
6167 classes[2] = X86_64_SSEUP_CLASS;
6168 classes[3] = X86_64_SSEUP_CLASS;
6169 return 4;
6170 case V4SFmode:
6171 case V4SImode:
6172 case V16QImode:
6173 case V8HImode:
6174 case V2DFmode:
6175 case V2DImode:
6176 classes[0] = X86_64_SSE_CLASS;
6177 classes[1] = X86_64_SSEUP_CLASS;
6178 return 2;
6179 case V1TImode:
6180 case V1DImode:
6181 case V2SFmode:
6182 case V2SImode:
6183 case V4HImode:
6184 case V8QImode:
6185 classes[0] = X86_64_SSE_CLASS;
6186 return 1;
6187 case BLKmode:
6188 case VOIDmode:
6189 return 0;
6190 default:
6191 gcc_assert (VECTOR_MODE_P (mode));
6192
6193 if (bytes > 16)
6194 return 0;
6195
6196 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6197
6198 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6199 classes[0] = X86_64_INTEGERSI_CLASS;
6200 else
6201 classes[0] = X86_64_INTEGER_CLASS;
6202 classes[1] = X86_64_INTEGER_CLASS;
6203 return 1 + (bytes > 8);
6204 }
6205 }
6206
6207 /* Examine the argument and set the number of registers required in each
6208 class. Return 0 iff the parameter should be passed in memory. */
6209 static int
6210 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6211 int *int_nregs, int *sse_nregs)
6212 {
6213 enum x86_64_reg_class regclass[MAX_CLASSES];
6214 int n = classify_argument (mode, type, regclass, 0);
6215
6216 *int_nregs = 0;
6217 *sse_nregs = 0;
6218 if (!n)
6219 return 0;
6220 for (n--; n >= 0; n--)
6221 switch (regclass[n])
6222 {
6223 case X86_64_INTEGER_CLASS:
6224 case X86_64_INTEGERSI_CLASS:
6225 (*int_nregs)++;
6226 break;
6227 case X86_64_SSE_CLASS:
6228 case X86_64_SSESF_CLASS:
6229 case X86_64_SSEDF_CLASS:
6230 (*sse_nregs)++;
6231 break;
6232 case X86_64_NO_CLASS:
6233 case X86_64_SSEUP_CLASS:
6234 break;
6235 case X86_64_X87_CLASS:
6236 case X86_64_X87UP_CLASS:
6237 if (!in_return)
6238 return 0;
6239 break;
6240 case X86_64_COMPLEX_X87_CLASS:
6241 return in_return ? 2 : 0;
6242 case X86_64_MEMORY_CLASS:
6243 gcc_unreachable ();
6244 }
6245 return 1;
6246 }
6247
6248 /* Construct container for the argument used by GCC interface. See
6249 FUNCTION_ARG for the detailed description. */
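/* Continuing the illustrative "struct s { double d; int i; }" example
   from classify_argument above, the container built here would be a
   PARALLEL holding a DFmode SSE register at offset 0 and an SImode
   integer register at offset 8 (a sketch, not an exhaustive list of
   cases). */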
6250
6251 static rtx
6252 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6253 const_tree type, int in_return, int nintregs, int nsseregs,
6254 const int *intreg, int sse_regno)
6255 {
6256 /* The following variables hold the static issued_error state. */
6257 static bool issued_sse_arg_error;
6258 static bool issued_sse_ret_error;
6259 static bool issued_x87_ret_error;
6260
6261 enum machine_mode tmpmode;
6262 int bytes =
6263 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6264 enum x86_64_reg_class regclass[MAX_CLASSES];
6265 int n;
6266 int i;
6267 int nexps = 0;
6268 int needed_sseregs, needed_intregs;
6269 rtx exp[MAX_CLASSES];
6270 rtx ret;
6271
6272 n = classify_argument (mode, type, regclass, 0);
6273 if (!n)
6274 return NULL;
6275 if (!examine_argument (mode, type, in_return, &needed_intregs,
6276 &needed_sseregs))
6277 return NULL;
6278 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6279 return NULL;
6280
6281 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6282 some less clueful developer tries to use floating-point anyway. */
6283 if (needed_sseregs && !TARGET_SSE)
6284 {
6285 if (in_return)
6286 {
6287 if (!issued_sse_ret_error)
6288 {
6289 error ("SSE register return with SSE disabled");
6290 issued_sse_ret_error = true;
6291 }
6292 }
6293 else if (!issued_sse_arg_error)
6294 {
6295 error ("SSE register argument with SSE disabled");
6296 issued_sse_arg_error = true;
6297 }
6298 return NULL;
6299 }
6300
6301 /* Likewise, error if the ABI requires us to return values in the
6302 x87 registers and the user specified -mno-80387. */
6303 if (!TARGET_80387 && in_return)
6304 for (i = 0; i < n; i++)
6305 if (regclass[i] == X86_64_X87_CLASS
6306 || regclass[i] == X86_64_X87UP_CLASS
6307 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6308 {
6309 if (!issued_x87_ret_error)
6310 {
6311 error ("x87 register return with x87 disabled");
6312 issued_x87_ret_error = true;
6313 }
6314 return NULL;
6315 }
6316
6317 /* First construct the simple cases. Avoid SCmode, since we want to use
6318 a single register to pass this type. */
6319 if (n == 1 && mode != SCmode)
6320 switch (regclass[0])
6321 {
6322 case X86_64_INTEGER_CLASS:
6323 case X86_64_INTEGERSI_CLASS:
6324 return gen_rtx_REG (mode, intreg[0]);
6325 case X86_64_SSE_CLASS:
6326 case X86_64_SSESF_CLASS:
6327 case X86_64_SSEDF_CLASS:
6328 if (mode != BLKmode)
6329 return gen_reg_or_parallel (mode, orig_mode,
6330 SSE_REGNO (sse_regno));
6331 break;
6332 case X86_64_X87_CLASS:
6333 case X86_64_COMPLEX_X87_CLASS:
6334 return gen_rtx_REG (mode, FIRST_STACK_REG);
6335 case X86_64_NO_CLASS:
6336 /* Zero sized array, struct or class. */
6337 return NULL;
6338 default:
6339 gcc_unreachable ();
6340 }
6341 if (n == 2 && regclass[0] == X86_64_SSE_CLASS
6342 && regclass[1] == X86_64_SSEUP_CLASS && mode != BLKmode)
6343 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6344 if (n == 4
6345 && regclass[0] == X86_64_SSE_CLASS
6346 && regclass[1] == X86_64_SSEUP_CLASS
6347 && regclass[2] == X86_64_SSEUP_CLASS
6348 && regclass[3] == X86_64_SSEUP_CLASS
6349 && mode != BLKmode)
6350 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6351
6352 if (n == 2
6353 && regclass[0] == X86_64_X87_CLASS && regclass[1] == X86_64_X87UP_CLASS)
6354 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6355 if (n == 2 && regclass[0] == X86_64_INTEGER_CLASS
6356 && regclass[1] == X86_64_INTEGER_CLASS
6357 && (mode == CDImode || mode == TImode || mode == TFmode)
6358 && intreg[0] + 1 == intreg[1])
6359 return gen_rtx_REG (mode, intreg[0]);
6360
6361 /* Otherwise figure out the entries of the PARALLEL. */
6362 for (i = 0; i < n; i++)
6363 {
6364 int pos;
6365
6366 switch (regclass[i])
6367 {
6368 case X86_64_NO_CLASS:
6369 break;
6370 case X86_64_INTEGER_CLASS:
6371 case X86_64_INTEGERSI_CLASS:
6372 /* Merge TImodes on aligned occasions here too. */
6373 if (i * 8 + 8 > bytes)
6374 tmpmode = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6375 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6376 tmpmode = SImode;
6377 else
6378 tmpmode = DImode;
6379 /* We've requested 24 bytes we don't have mode for. Use DImode. */
6380 if (tmpmode == BLKmode)
6381 tmpmode = DImode;
6382 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6383 gen_rtx_REG (tmpmode, *intreg),
6384 GEN_INT (i*8));
6385 intreg++;
6386 break;
6387 case X86_64_SSESF_CLASS:
6388 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6389 gen_rtx_REG (SFmode,
6390 SSE_REGNO (sse_regno)),
6391 GEN_INT (i*8));
6392 sse_regno++;
6393 break;
6394 case X86_64_SSEDF_CLASS:
6395 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6396 gen_rtx_REG (DFmode,
6397 SSE_REGNO (sse_regno)),
6398 GEN_INT (i*8));
6399 sse_regno++;
6400 break;
6401 case X86_64_SSE_CLASS:
6402 pos = i;
6403 switch (n)
6404 {
6405 case 1:
6406 tmpmode = DImode;
6407 break;
6408 case 2:
6409 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6410 {
6411 tmpmode = TImode;
6412 i++;
6413 }
6414 else
6415 tmpmode = DImode;
6416 break;
6417 case 4:
6418 gcc_assert (i == 0
6419 && regclass[1] == X86_64_SSEUP_CLASS
6420 && regclass[2] == X86_64_SSEUP_CLASS
6421 && regclass[3] == X86_64_SSEUP_CLASS);
6422 tmpmode = OImode;
6423 i += 3;
6424 break;
6425 default:
6426 gcc_unreachable ();
6427 }
6428 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6429 gen_rtx_REG (tmpmode,
6430 SSE_REGNO (sse_regno)),
6431 GEN_INT (pos*8));
6432 sse_regno++;
6433 break;
6434 default:
6435 gcc_unreachable ();
6436 }
6437 }
6438
6439 /* Empty aligned struct, union or class. */
6440 if (nexps == 0)
6441 return NULL;
6442
6443 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6444 for (i = 0; i < nexps; i++)
6445 XVECEXP (ret, 0, i) = exp [i];
6446 return ret;
6447 }
6448
6449 /* Update the data in CUM to advance over an argument of mode MODE
6450 and data type TYPE. (TYPE is null for libcalls where that information
6451 may not be available.) */
6452
6453 static void
6454 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6455 const_tree type, HOST_WIDE_INT bytes,
6456 HOST_WIDE_INT words)
6457 {
6458 switch (mode)
6459 {
6460 default:
6461 break;
6462
6463 case BLKmode:
6464 if (bytes < 0)
6465 break;
6466 /* FALLTHRU */
6467
6468 case DImode:
6469 case SImode:
6470 case HImode:
6471 case QImode:
6472 cum->words += words;
6473 cum->nregs -= words;
6474 cum->regno += words;
6475
6476 if (cum->nregs <= 0)
6477 {
6478 cum->nregs = 0;
6479 cum->regno = 0;
6480 }
6481 break;
6482
6483 case OImode:
6484 /* OImode shouldn't be used directly. */
6485 gcc_unreachable ();
6486
6487 case DFmode:
6488 if (cum->float_in_sse < 2)
6489 break;
6490 case SFmode:
6491 if (cum->float_in_sse < 1)
6492 break;
6493 /* FALLTHRU */
6494
6495 case V8SFmode:
6496 case V8SImode:
6497 case V32QImode:
6498 case V16HImode:
6499 case V4DFmode:
6500 case V4DImode:
6501 case TImode:
6502 case V16QImode:
6503 case V8HImode:
6504 case V4SImode:
6505 case V2DImode:
6506 case V4SFmode:
6507 case V2DFmode:
6508 if (!type || !AGGREGATE_TYPE_P (type))
6509 {
6510 cum->sse_words += words;
6511 cum->sse_nregs -= 1;
6512 cum->sse_regno += 1;
6513 if (cum->sse_nregs <= 0)
6514 {
6515 cum->sse_nregs = 0;
6516 cum->sse_regno = 0;
6517 }
6518 }
6519 break;
6520
6521 case V8QImode:
6522 case V4HImode:
6523 case V2SImode:
6524 case V2SFmode:
6525 case V1TImode:
6526 case V1DImode:
6527 if (!type || !AGGREGATE_TYPE_P (type))
6528 {
6529 cum->mmx_words += words;
6530 cum->mmx_nregs -= 1;
6531 cum->mmx_regno += 1;
6532 if (cum->mmx_nregs <= 0)
6533 {
6534 cum->mmx_nregs = 0;
6535 cum->mmx_regno = 0;
6536 }
6537 }
6538 break;
6539 }
6540 }
6541
6542 static void
6543 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6544 const_tree type, HOST_WIDE_INT words, bool named)
6545 {
6546 int int_nregs, sse_nregs;
6547
6548 /* Unnamed 256bit vector mode parameters are passed on stack. */
6549 if (!named && VALID_AVX256_REG_MODE (mode))
6550 return;
6551
6552 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6553 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6554 {
6555 cum->nregs -= int_nregs;
6556 cum->sse_nregs -= sse_nregs;
6557 cum->regno += int_nregs;
6558 cum->sse_regno += sse_nregs;
6559 }
6560 else
6561 {
6562 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6563 cum->words = (cum->words + align - 1) & ~(align - 1);
6564 cum->words += words;
6565 }
6566 }
6567
6568 static void
6569 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6570 HOST_WIDE_INT words)
6571 {
6572 /* Otherwise, this should be passed indirectly. */
6573 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6574
6575 cum->words += words;
6576 if (cum->nregs > 0)
6577 {
6578 cum->nregs -= 1;
6579 cum->regno += 1;
6580 }
6581 }
6582
6583 /* Update the data in CUM to advance over an argument of mode MODE and
6584 data type TYPE. (TYPE is null for libcalls where that information
6585 may not be available.) */
6586
6587 static void
6588 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6589 const_tree type, bool named)
6590 {
6591 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6592 HOST_WIDE_INT bytes, words;
6593
6594 if (mode == BLKmode)
6595 bytes = int_size_in_bytes (type);
6596 else
6597 bytes = GET_MODE_SIZE (mode);
6598 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6599
6600 if (type)
6601 mode = type_natural_mode (type, NULL);
6602
6603 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6604 function_arg_advance_ms_64 (cum, bytes, words);
6605 else if (TARGET_64BIT)
6606 function_arg_advance_64 (cum, mode, type, words, named);
6607 else
6608 function_arg_advance_32 (cum, mode, type, bytes, words);
6609 }
6610
6611 /* Define where to put the arguments to a function.
6612 Value is zero to push the argument on the stack,
6613 or a hard register in which to store the argument.
6614
6615 MODE is the argument's machine mode.
6616 TYPE is the data type of the argument (as a tree).
6617 This is null for libcalls where that information may
6618 not be available.
6619 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6620 the preceding args and about the function being called.
6621 NAMED is nonzero if this argument is a named parameter
6622 (otherwise it is an extra parameter matching an ellipsis). */
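/* Illustrative (hypothetical) example for the 32-bit case below: for

     extern int f (int a, int b) __attribute__ ((fastcall));

   A is passed in ECX and B in EDX; a third integer argument would go
   on the stack. */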
6623
6624 static rtx
6625 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6626 enum machine_mode orig_mode, const_tree type,
6627 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6628 {
6629 static bool warnedsse, warnedmmx;
6630
6631 /* Avoid the AL settings for the Unix64 ABI. */
6632 if (mode == VOIDmode)
6633 return constm1_rtx;
6634
6635 switch (mode)
6636 {
6637 default:
6638 break;
6639
6640 case BLKmode:
6641 if (bytes < 0)
6642 break;
6643 /* FALLTHRU */
6644 case DImode:
6645 case SImode:
6646 case HImode:
6647 case QImode:
6648 if (words <= cum->nregs)
6649 {
6650 int regno = cum->regno;
6651
6652 /* Fastcall allocates the first two DWORD (SImode) or
6653 smaller arguments to ECX and EDX if the argument isn't
6654 an aggregate type. */
6655 if (cum->fastcall)
6656 {
6657 if (mode == BLKmode
6658 || mode == DImode
6659 || (type && AGGREGATE_TYPE_P (type)))
6660 break;
6661
6662 /* ECX, not EAX, is the first allocated register. */
6663 if (regno == AX_REG)
6664 regno = CX_REG;
6665 }
6666 return gen_rtx_REG (mode, regno);
6667 }
6668 break;
6669
6670 case DFmode:
6671 if (cum->float_in_sse < 2)
6672 break;
6673 case SFmode:
6674 if (cum->float_in_sse < 1)
6675 break;
6676 /* FALLTHRU */
6677 case TImode:
6678 /* In 32bit, we pass TImode in xmm registers. */
6679 case V16QImode:
6680 case V8HImode:
6681 case V4SImode:
6682 case V2DImode:
6683 case V4SFmode:
6684 case V2DFmode:
6685 if (!type || !AGGREGATE_TYPE_P (type))
6686 {
6687 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6688 {
6689 warnedsse = true;
6690 warning (0, "SSE vector argument without SSE enabled "
6691 "changes the ABI");
6692 }
6693 if (cum->sse_nregs)
6694 return gen_reg_or_parallel (mode, orig_mode,
6695 cum->sse_regno + FIRST_SSE_REG);
6696 }
6697 break;
6698
6699 case OImode:
6700 /* OImode shouldn't be used directly. */
6701 gcc_unreachable ();
6702
6703 case V8SFmode:
6704 case V8SImode:
6705 case V32QImode:
6706 case V16HImode:
6707 case V4DFmode:
6708 case V4DImode:
6709 if (!type || !AGGREGATE_TYPE_P (type))
6710 {
6711 if (cum->sse_nregs)
6712 return gen_reg_or_parallel (mode, orig_mode,
6713 cum->sse_regno + FIRST_SSE_REG);
6714 }
6715 break;
6716
6717 case V8QImode:
6718 case V4HImode:
6719 case V2SImode:
6720 case V2SFmode:
6721 case V1TImode:
6722 case V1DImode:
6723 if (!type || !AGGREGATE_TYPE_P (type))
6724 {
6725 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6726 {
6727 warnedmmx = true;
6728 warning (0, "MMX vector argument without MMX enabled "
6729 "changes the ABI");
6730 }
6731 if (cum->mmx_nregs)
6732 return gen_reg_or_parallel (mode, orig_mode,
6733 cum->mmx_regno + FIRST_MMX_REG);
6734 }
6735 break;
6736 }
6737
6738 return NULL_RTX;
6739 }
6740
6741 static rtx
6742 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6743 enum machine_mode orig_mode, const_tree type, bool named)
6744 {
6745 /* Handle a hidden AL argument containing the number of SSE registers
6746 used for varargs x86-64 functions. */
6747 if (mode == VOIDmode)
6748 return GEN_INT (cum->maybe_vaarg
6749 ? (cum->sse_nregs < 0
6750 ? X86_64_SSE_REGPARM_MAX
6751 : cum->sse_regno)
6752 : -1);
6753
6754 switch (mode)
6755 {
6756 default:
6757 break;
6758
6759 case V8SFmode:
6760 case V8SImode:
6761 case V32QImode:
6762 case V16HImode:
6763 case V4DFmode:
6764 case V4DImode:
6765 /* Unnamed 256bit vector mode parameters are passed on stack. */
6766 if (!named)
6767 return NULL;
6768 break;
6769 }
6770
6771 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6772 cum->sse_nregs,
6773 &x86_64_int_parameter_registers [cum->regno],
6774 cum->sse_regno);
6775 }
6776
6777 static rtx
6778 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6779 enum machine_mode orig_mode, bool named,
6780 HOST_WIDE_INT bytes)
6781 {
6782 unsigned int regno;
6783
 6784   /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
 6785      We use the value -2 to specify that the current function call is MSABI.  */
6786 if (mode == VOIDmode)
6787 return GEN_INT (-2);
6788
6789 /* If we've run out of registers, it goes on the stack. */
6790 if (cum->nregs == 0)
6791 return NULL_RTX;
6792
6793 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6794
6795 /* Only floating point modes are passed in anything but integer regs. */
6796 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6797 {
6798 if (named)
6799 regno = cum->regno + FIRST_SSE_REG;
6800 else
6801 {
6802 rtx t1, t2;
6803
6804 /* Unnamed floating parameters are passed in both the
6805 SSE and integer registers. */
6806 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6807 t2 = gen_rtx_REG (mode, regno);
6808 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6809 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6810 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6811 }
6812 }
 6813   /* Handle aggregate types passed in registers.  */
6814 if (orig_mode == BLKmode)
6815 {
6816 if (bytes > 0 && bytes <= 8)
6817 mode = (bytes > 4 ? DImode : SImode);
6818 if (mode == BLKmode)
6819 mode = DImode;
6820 }
6821
6822 return gen_reg_or_parallel (mode, orig_mode, regno);
6823 }
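
/* Illustrative sketch (not compiler code) of the Win64 cases handled above,
   assuming the usual Microsoft x64 convention:

     void f (int a, double b);   a in %rcx (slot 0), b in %xmm1 (slot 1)
     int  g (int a, ...);        an unnamed double occupies both the SSE and
                                 the integer register for its slot, hence the
                                 two-element PARALLEL built above
     struct s { char c[8]; };    an 8-byte aggregate: BLKmode is rewritten to
     void h (struct s x);        DImode and x travels in the slot's GPR.  */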
6824
6825 /* Return where to put the arguments to a function.
6826 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6827
6828 MODE is the argument's machine mode. TYPE is the data type of the
6829 argument. It is null for libcalls where that information may not be
6830 available. CUM gives information about the preceding args and about
6831 the function being called. NAMED is nonzero if this argument is a
6832 named parameter (otherwise it is an extra parameter matching an
6833 ellipsis). */
6834
6835 static rtx
6836 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6837 const_tree type, bool named)
6838 {
6839 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6840 enum machine_mode mode = omode;
6841 HOST_WIDE_INT bytes, words;
6842 rtx arg;
6843
6844 if (mode == BLKmode)
6845 bytes = int_size_in_bytes (type);
6846 else
6847 bytes = GET_MODE_SIZE (mode);
6848 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6849
6850 /* To simplify the code below, represent vector types with a vector mode
6851 even if MMX/SSE are not active. */
6852 if (type && TREE_CODE (type) == VECTOR_TYPE)
6853 mode = type_natural_mode (type, cum);
6854
6855 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6856 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6857 else if (TARGET_64BIT)
6858 arg = function_arg_64 (cum, mode, omode, type, named);
6859 else
6860 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6861
6862 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6863 {
6864 /* This argument uses 256bit AVX modes. */
6865 if (cum->caller)
6866 cfun->machine->callee_pass_avx256_p = true;
6867 else
6868 cfun->machine->caller_pass_avx256_p = true;
6869 }
6870
6871 return arg;
6872 }
6873
6874 /* A C expression that indicates when an argument must be passed by
6875 reference. If nonzero for an argument, a copy of that argument is
6876 made in memory and a pointer to the argument is passed instead of
6877 the argument itself. The pointer is passed in whatever way is
6878 appropriate for passing a pointer to that type. */
6879
6880 static bool
6881 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6882 enum machine_mode mode ATTRIBUTE_UNUSED,
6883 const_tree type, bool named ATTRIBUTE_UNUSED)
6884 {
6885 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6886
6887 /* See Windows x64 Software Convention. */
6888 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6889 {
6890 int msize = (int) GET_MODE_SIZE (mode);
6891 if (type)
6892 {
6893 /* Arrays are passed by reference. */
6894 if (TREE_CODE (type) == ARRAY_TYPE)
6895 return true;
6896
6897 if (AGGREGATE_TYPE_P (type))
6898 {
6899 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6900 are passed by reference. */
6901 msize = int_size_in_bytes (type);
6902 }
6903 }
6904
6905 /* __m128 is passed by reference. */
6906 switch (msize) {
6907 case 1: case 2: case 4: case 8:
6908 break;
6909 default:
6910 return true;
6911 }
6912 }
6913 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
 6914     return true;
6915
 6916   return false;
6917 }
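
/* For example (an illustrative note, not compiler code): under the MS x64
   convention checked above, a 12-byte struct or an __m128 argument has a
   size other than 1, 2, 4 or 8, so the caller makes a copy in memory and
   passes only its address, whereas an 8-byte struct is still passed by
   value in a register.  */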
6918
6919 /* Return true when TYPE should be 128bit aligned for 32bit argument
6920 passing ABI. XXX: This function is obsolete and is only used for
6921 checking psABI compatibility with previous versions of GCC. */
6922
6923 static bool
6924 ix86_compat_aligned_value_p (const_tree type)
6925 {
6926 enum machine_mode mode = TYPE_MODE (type);
6927 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6928 || mode == TDmode
6929 || mode == TFmode
6930 || mode == TCmode)
6931 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6932 return true;
6933 if (TYPE_ALIGN (type) < 128)
6934 return false;
6935
6936 if (AGGREGATE_TYPE_P (type))
6937 {
6938 /* Walk the aggregates recursively. */
6939 switch (TREE_CODE (type))
6940 {
6941 case RECORD_TYPE:
6942 case UNION_TYPE:
6943 case QUAL_UNION_TYPE:
6944 {
6945 tree field;
6946
6947 /* Walk all the structure fields. */
6948 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6949 {
6950 if (TREE_CODE (field) == FIELD_DECL
6951 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
6952 return true;
6953 }
6954 break;
6955 }
6956
6957 case ARRAY_TYPE:
 6958 	  /* Just in case some languages pass arrays by value.  */
6959 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
6960 return true;
6961 break;
6962
6963 default:
6964 gcc_unreachable ();
6965 }
6966 }
6967 return false;
6968 }
6969
6970 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
6971 XXX: This function is obsolete and is only used for checking psABI
6972 compatibility with previous versions of GCC. */
6973
6974 static unsigned int
6975 ix86_compat_function_arg_boundary (enum machine_mode mode,
6976 const_tree type, unsigned int align)
6977 {
6978 /* In 32bit, only _Decimal128 and __float128 are aligned to their
6979 natural boundaries. */
6980 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
6981 {
6982 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
6983 make an exception for SSE modes since these require 128bit
6984 alignment.
6985
6986 The handling here differs from field_alignment. ICC aligns MMX
6987 arguments to 4 byte boundaries, while structure fields are aligned
6988 to 8 byte boundaries. */
6989 if (!type)
6990 {
6991 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
6992 align = PARM_BOUNDARY;
6993 }
6994 else
6995 {
6996 if (!ix86_compat_aligned_value_p (type))
6997 align = PARM_BOUNDARY;
6998 }
6999 }
7000 if (align > BIGGEST_ALIGNMENT)
7001 align = BIGGEST_ALIGNMENT;
7002 return align;
7003 }
7004
7005 /* Return true when TYPE should be 128bit aligned for 32bit argument
7006 passing ABI. */
7007
7008 static bool
7009 ix86_contains_aligned_value_p (const_tree type)
7010 {
7011 enum machine_mode mode = TYPE_MODE (type);
7012
7013 if (mode == XFmode || mode == XCmode)
7014 return false;
7015
7016 if (TYPE_ALIGN (type) < 128)
7017 return false;
7018
7019 if (AGGREGATE_TYPE_P (type))
7020 {
7021 /* Walk the aggregates recursively. */
7022 switch (TREE_CODE (type))
7023 {
7024 case RECORD_TYPE:
7025 case UNION_TYPE:
7026 case QUAL_UNION_TYPE:
7027 {
7028 tree field;
7029
7030 /* Walk all the structure fields. */
7031 for (field = TYPE_FIELDS (type);
7032 field;
7033 field = DECL_CHAIN (field))
7034 {
7035 if (TREE_CODE (field) == FIELD_DECL
7036 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7037 return true;
7038 }
7039 break;
7040 }
7041
7042 case ARRAY_TYPE:
 7043 	  /* Just in case some languages pass arrays by value.  */
7044 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7045 return true;
7046 break;
7047
7048 default:
7049 gcc_unreachable ();
7050 }
7051 }
7052 else
7053 return TYPE_ALIGN (type) >= 128;
7054
7055 return false;
7056 }
7057
7058 /* Gives the alignment boundary, in bits, of an argument with the
7059 specified mode and type. */
7060
7061 static unsigned int
7062 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7063 {
7064 unsigned int align;
7065 if (type)
7066 {
 7067       /* Since the main variant type is used for the call, convert TYPE
 7068 	 to its main variant.  */
7069 type = TYPE_MAIN_VARIANT (type);
7070 align = TYPE_ALIGN (type);
7071 }
7072 else
7073 align = GET_MODE_ALIGNMENT (mode);
7074 if (align < PARM_BOUNDARY)
7075 align = PARM_BOUNDARY;
7076 else
7077 {
7078 static bool warned;
7079 unsigned int saved_align = align;
7080
7081 if (!TARGET_64BIT)
7082 {
7083 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7084 if (!type)
7085 {
7086 if (mode == XFmode || mode == XCmode)
7087 align = PARM_BOUNDARY;
7088 }
7089 else if (!ix86_contains_aligned_value_p (type))
7090 align = PARM_BOUNDARY;
7091
7092 if (align < 128)
7093 align = PARM_BOUNDARY;
7094 }
7095
7096 if (warn_psabi
7097 && !warned
7098 && align != ix86_compat_function_arg_boundary (mode, type,
7099 saved_align))
7100 {
7101 warned = true;
7102 inform (input_location,
7103 "The ABI for passing parameters with %d-byte"
7104 " alignment has changed in GCC 4.6",
7105 align / BITS_PER_UNIT);
7106 }
7107 }
7108
7109 return align;
7110 }
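
/* Worked example (illustrative only): on ia32 a double argument ends up
   with PARM_BOUNDARY (32-bit) alignment, while an __m128 argument keeps
   its 128-bit alignment.  Whenever the result computed here differs from
   ix86_compat_function_arg_boundary, the one-time note above (emitted
   under warn_psabi) reports the GCC 4.6 alignment change.  */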
7111
7112 /* Return true if N is a possible register number of function value. */
7113
7114 static bool
7115 ix86_function_value_regno_p (const unsigned int regno)
7116 {
7117 switch (regno)
7118 {
7119 case AX_REG:
7120 return true;
7121
7122 case FIRST_FLOAT_REG:
 7123       /* TODO: The function should depend on the current function's ABI,
 7124 	 but builtins.c would need updating then.  Therefore we use the
 7125 	 default ABI.  */
7126 if (TARGET_64BIT && ix86_abi == MS_ABI)
7127 return false;
7128 return TARGET_FLOAT_RETURNS_IN_80387;
7129
7130 case FIRST_SSE_REG:
7131 return TARGET_SSE;
7132
7133 case FIRST_MMX_REG:
7134 if (TARGET_MACHO || TARGET_64BIT)
7135 return false;
7136 return TARGET_MMX;
7137 }
7138
7139 return false;
7140 }
7141
7142 /* Define how to find the value returned by a function.
7143 VALTYPE is the data type of the value (as a tree).
7144 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7145 otherwise, FUNC is 0. */
7146
7147 static rtx
7148 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7149 const_tree fntype, const_tree fn)
7150 {
7151 unsigned int regno;
7152
7153 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7154 we normally prevent this case when mmx is not available. However
7155 some ABIs may require the result to be returned like DImode. */
7156 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7157 regno = FIRST_MMX_REG;
7158
7159 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7160 we prevent this case when sse is not available. However some ABIs
7161 may require the result to be returned like integer TImode. */
7162 else if (mode == TImode
7163 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7164 regno = FIRST_SSE_REG;
7165
7166 /* 32-byte vector modes in %ymm0. */
7167 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7168 regno = FIRST_SSE_REG;
7169
7170 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7171 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7172 regno = FIRST_FLOAT_REG;
7173 else
7174 /* Most things go in %eax. */
7175 regno = AX_REG;
7176
7177 /* Override FP return register with %xmm0 for local functions when
7178 SSE math is enabled or for functions with sseregparm attribute. */
7179 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7180 {
7181 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7182 if ((sse_level >= 1 && mode == SFmode)
7183 || (sse_level == 2 && mode == DFmode))
7184 regno = FIRST_SSE_REG;
7185 }
7186
7187 /* OImode shouldn't be used directly. */
7188 gcc_assert (mode != OImode);
7189
7190 return gen_rtx_REG (orig_mode, regno);
7191 }
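
/* Illustrative summary (not compiler code) of the 32-bit return register
   choices made above, assuming the relevant ISA extensions are enabled:

     int              -> %eax
     float / double   -> %st(0), overridden to %xmm0 for sseregparm or
                         local SSE-math functions as described above
     __m64            -> %mm0
     __m128 / __m256  -> %xmm0 / %ymm0                                   */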
7192
7193 static rtx
7194 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7195 const_tree valtype)
7196 {
7197 rtx ret;
7198
7199 /* Handle libcalls, which don't provide a type node. */
7200 if (valtype == NULL)
7201 {
7202 unsigned int regno;
7203
7204 switch (mode)
7205 {
7206 case SFmode:
7207 case SCmode:
7208 case DFmode:
7209 case DCmode:
7210 case TFmode:
7211 case SDmode:
7212 case DDmode:
7213 case TDmode:
7214 regno = FIRST_SSE_REG;
7215 break;
7216 case XFmode:
7217 case XCmode:
7218 regno = FIRST_FLOAT_REG;
7219 break;
7220 case TCmode:
7221 return NULL;
7222 default:
7223 regno = AX_REG;
7224 }
7225
7226 return gen_rtx_REG (mode, regno);
7227 }
7228 else if (POINTER_TYPE_P (valtype))
7229 {
7230 /* Pointers are always returned in Pmode. */
7231 mode = Pmode;
7232 }
7233
7234 ret = construct_container (mode, orig_mode, valtype, 1,
7235 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7236 x86_64_int_return_registers, 0);
7237
 7238   /* For zero sized structures, construct_container returns NULL, but we
 7239      need to keep the rest of the compiler happy by returning a meaningful value.  */
7240 if (!ret)
7241 ret = gen_rtx_REG (orig_mode, AX_REG);
7242
7243 return ret;
7244 }
7245
7246 static rtx
7247 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7248 {
7249 unsigned int regno = AX_REG;
7250
7251 if (TARGET_SSE)
7252 {
7253 switch (GET_MODE_SIZE (mode))
7254 {
7255 case 16:
7256 if((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7257 && !COMPLEX_MODE_P (mode))
7258 regno = FIRST_SSE_REG;
7259 break;
7260 case 8:
7261 case 4:
7262 if (mode == SFmode || mode == DFmode)
7263 regno = FIRST_SSE_REG;
7264 break;
7265 default:
7266 break;
7267 }
7268 }
7269 return gen_rtx_REG (orig_mode, regno);
7270 }
7271
7272 static rtx
7273 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7274 enum machine_mode orig_mode, enum machine_mode mode)
7275 {
7276 const_tree fn, fntype;
7277
7278 fn = NULL_TREE;
7279 if (fntype_or_decl && DECL_P (fntype_or_decl))
7280 fn = fntype_or_decl;
7281 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7282
7283 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7284 return function_value_ms_64 (orig_mode, mode);
7285 else if (TARGET_64BIT)
7286 return function_value_64 (orig_mode, mode, valtype);
7287 else
7288 return function_value_32 (orig_mode, mode, fntype, fn);
7289 }
7290
7291 static rtx
7292 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7293 bool outgoing ATTRIBUTE_UNUSED)
7294 {
7295 enum machine_mode mode, orig_mode;
7296
7297 orig_mode = TYPE_MODE (valtype);
7298 mode = type_natural_mode (valtype, NULL);
7299 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7300 }
7301
7302 /* Pointer function arguments and return values are promoted to Pmode. */
7303
7304 static enum machine_mode
7305 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7306 int *punsignedp, const_tree fntype,
7307 int for_return)
7308 {
7309 if (type != NULL_TREE && POINTER_TYPE_P (type))
7310 {
7311 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7312 return Pmode;
7313 }
7314 return default_promote_function_mode (type, mode, punsignedp, fntype,
7315 for_return);
7316 }
7317
7318 rtx
7319 ix86_libcall_value (enum machine_mode mode)
7320 {
7321 return ix86_function_value_1 (NULL, NULL, mode, mode);
7322 }
7323
7324 /* Return true iff type is returned in memory. */
7325
7326 static bool ATTRIBUTE_UNUSED
7327 return_in_memory_32 (const_tree type, enum machine_mode mode)
7328 {
7329 HOST_WIDE_INT size;
7330
7331 if (mode == BLKmode)
7332 return true;
7333
7334 size = int_size_in_bytes (type);
7335
7336 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7337 return false;
7338
7339 if (VECTOR_MODE_P (mode) || mode == TImode)
7340 {
7341 /* User-created vectors small enough to fit in EAX. */
7342 if (size < 8)
7343 return false;
7344
7345 /* MMX/3dNow values are returned in MM0,
 7346 	 except when it doesn't exist or the ABI prescribes otherwise.  */
7347 if (size == 8)
7348 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7349
7350 /* SSE values are returned in XMM0, except when it doesn't exist. */
7351 if (size == 16)
7352 return !TARGET_SSE;
7353
7354 /* AVX values are returned in YMM0, except when it doesn't exist. */
7355 if (size == 32)
7356 return !TARGET_AVX;
7357 }
7358
7359 if (mode == XFmode)
7360 return false;
7361
7362 if (size > 12)
7363 return true;
7364
7365 /* OImode shouldn't be used directly. */
7366 gcc_assert (mode != OImode);
7367
7368 return false;
7369 }
7370
7371 static bool ATTRIBUTE_UNUSED
7372 return_in_memory_64 (const_tree type, enum machine_mode mode)
7373 {
7374 int needed_intregs, needed_sseregs;
7375 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7376 }
7377
7378 static bool ATTRIBUTE_UNUSED
7379 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7380 {
7381 HOST_WIDE_INT size = int_size_in_bytes (type);
7382
7383 /* __m128 is returned in xmm0. */
7384 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7385 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7386 return false;
7387
 7388   /* Otherwise, the size must be exactly 1, 2, 4 or 8.  */
7389 return size != 1 && size != 2 && size != 4 && size != 8;
7390 }
7391
7392 static bool
7393 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7394 {
7395 #ifdef SUBTARGET_RETURN_IN_MEMORY
7396 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7397 #else
7398 const enum machine_mode mode = type_natural_mode (type, NULL);
7399
7400 if (TARGET_64BIT)
7401 {
7402 if (ix86_function_type_abi (fntype) == MS_ABI)
7403 return return_in_memory_ms_64 (type, mode);
7404 else
7405 return return_in_memory_64 (type, mode);
7406 }
7407 else
7408 return return_in_memory_32 (type, mode);
7409 #endif
7410 }
7411
7412 /* When returning SSE vector types, we have a choice of either
 7413    (1) being ABI incompatible with a -march switch, or
7414 (2) generating an error.
7415 Given no good solution, I think the safest thing is one warning.
7416 The user won't be able to use -Werror, but....
7417
7418 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7419 called in response to actually generating a caller or callee that
7420 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7421 via aggregate_value_p for general type probing from tree-ssa. */
7422
7423 static rtx
7424 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7425 {
7426 static bool warnedsse, warnedmmx;
7427
7428 if (!TARGET_64BIT && type)
7429 {
7430 /* Look at the return type of the function, not the function type. */
7431 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7432
7433 if (!TARGET_SSE && !warnedsse)
7434 {
7435 if (mode == TImode
7436 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7437 {
7438 warnedsse = true;
7439 warning (0, "SSE vector return without SSE enabled "
7440 "changes the ABI");
7441 }
7442 }
7443
7444 if (!TARGET_MMX && !warnedmmx)
7445 {
7446 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7447 {
7448 warnedmmx = true;
7449 warning (0, "MMX vector return without MMX enabled "
7450 "changes the ABI");
7451 }
7452 }
7453 }
7454
7455 return NULL;
7456 }
7457
7458 \f
7459 /* Create the va_list data type. */
7460
 7461 /* Returns the calling-convention-specific va_list data type.
7462 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7463
7464 static tree
7465 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7466 {
7467 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7468
7469 /* For i386 we use plain pointer to argument area. */
7470 if (!TARGET_64BIT || abi == MS_ABI)
7471 return build_pointer_type (char_type_node);
7472
7473 record = lang_hooks.types.make_type (RECORD_TYPE);
7474 type_decl = build_decl (BUILTINS_LOCATION,
7475 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7476
7477 f_gpr = build_decl (BUILTINS_LOCATION,
7478 FIELD_DECL, get_identifier ("gp_offset"),
7479 unsigned_type_node);
7480 f_fpr = build_decl (BUILTINS_LOCATION,
7481 FIELD_DECL, get_identifier ("fp_offset"),
7482 unsigned_type_node);
7483 f_ovf = build_decl (BUILTINS_LOCATION,
7484 FIELD_DECL, get_identifier ("overflow_arg_area"),
7485 ptr_type_node);
7486 f_sav = build_decl (BUILTINS_LOCATION,
7487 FIELD_DECL, get_identifier ("reg_save_area"),
7488 ptr_type_node);
7489
7490 va_list_gpr_counter_field = f_gpr;
7491 va_list_fpr_counter_field = f_fpr;
7492
7493 DECL_FIELD_CONTEXT (f_gpr) = record;
7494 DECL_FIELD_CONTEXT (f_fpr) = record;
7495 DECL_FIELD_CONTEXT (f_ovf) = record;
7496 DECL_FIELD_CONTEXT (f_sav) = record;
7497
7498 TYPE_STUB_DECL (record) = type_decl;
7499 TYPE_NAME (record) = type_decl;
7500 TYPE_FIELDS (record) = f_gpr;
7501 DECL_CHAIN (f_gpr) = f_fpr;
7502 DECL_CHAIN (f_fpr) = f_ovf;
7503 DECL_CHAIN (f_ovf) = f_sav;
7504
7505 layout_type (record);
7506
7507 /* The correct type is an array type of one element. */
7508 return build_array_type (record, build_index_type (size_zero_node));
7509 }
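
/* For reference, the record built above matches the SYSV x86-64 psABI
   va_list layout; in C it would look roughly like this (an illustrative
   sketch only, GCC builds the type through the tree nodes above):

     typedef struct __va_list_tag
     {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __va_list_tag;

     typedef __va_list_tag __builtin_va_list[1];
*/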
7510
7511 /* Setup the builtin va_list data type and for 64-bit the additional
7512 calling convention specific va_list data types. */
7513
7514 static tree
7515 ix86_build_builtin_va_list (void)
7516 {
7517 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7518
7519 /* Initialize abi specific va_list builtin types. */
7520 if (TARGET_64BIT)
7521 {
7522 tree t;
7523 if (ix86_abi == MS_ABI)
7524 {
7525 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7526 if (TREE_CODE (t) != RECORD_TYPE)
7527 t = build_variant_type_copy (t);
7528 sysv_va_list_type_node = t;
7529 }
7530 else
7531 {
7532 t = ret;
7533 if (TREE_CODE (t) != RECORD_TYPE)
7534 t = build_variant_type_copy (t);
7535 sysv_va_list_type_node = t;
7536 }
7537 if (ix86_abi != MS_ABI)
7538 {
7539 t = ix86_build_builtin_va_list_abi (MS_ABI);
7540 if (TREE_CODE (t) != RECORD_TYPE)
7541 t = build_variant_type_copy (t);
7542 ms_va_list_type_node = t;
7543 }
7544 else
7545 {
7546 t = ret;
7547 if (TREE_CODE (t) != RECORD_TYPE)
7548 t = build_variant_type_copy (t);
7549 ms_va_list_type_node = t;
7550 }
7551 }
7552
7553 return ret;
7554 }
7555
7556 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7557
7558 static void
7559 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7560 {
7561 rtx save_area, mem;
7562 alias_set_type set;
7563 int i, max;
7564
7565 /* GPR size of varargs save area. */
7566 if (cfun->va_list_gpr_size)
7567 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7568 else
7569 ix86_varargs_gpr_size = 0;
7570
7571 /* FPR size of varargs save area. We don't need it if we don't pass
7572 anything in SSE registers. */
7573 if (TARGET_SSE && cfun->va_list_fpr_size)
7574 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7575 else
7576 ix86_varargs_fpr_size = 0;
7577
7578 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7579 return;
7580
7581 save_area = frame_pointer_rtx;
7582 set = get_varargs_alias_set ();
7583
7584 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7585 if (max > X86_64_REGPARM_MAX)
7586 max = X86_64_REGPARM_MAX;
7587
7588 for (i = cum->regno; i < max; i++)
7589 {
7590 mem = gen_rtx_MEM (Pmode,
7591 plus_constant (save_area, i * UNITS_PER_WORD));
7592 MEM_NOTRAP_P (mem) = 1;
7593 set_mem_alias_set (mem, set);
7594 emit_move_insn (mem, gen_rtx_REG (Pmode,
7595 x86_64_int_parameter_registers[i]));
7596 }
7597
7598 if (ix86_varargs_fpr_size)
7599 {
7600 enum machine_mode smode;
7601 rtx label, test;
7602
 7603       /* Now emit code to save SSE registers.  The AX parameter contains the number
7604 of SSE parameter registers used to call this function, though all we
7605 actually check here is the zero/non-zero status. */
7606
7607 label = gen_label_rtx ();
7608 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7609 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7610 label));
7611
7612 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7613 we used movdqa (i.e. TImode) instead? Perhaps even better would
7614 be if we could determine the real mode of the data, via a hook
7615 into pass_stdarg. Ignore all that for now. */
7616 smode = V4SFmode;
7617 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7618 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7619
7620 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7621 if (max > X86_64_SSE_REGPARM_MAX)
7622 max = X86_64_SSE_REGPARM_MAX;
7623
7624 for (i = cum->sse_regno; i < max; ++i)
7625 {
7626 mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7627 mem = gen_rtx_MEM (smode, mem);
7628 MEM_NOTRAP_P (mem) = 1;
7629 set_mem_alias_set (mem, set);
7630 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7631
7632 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7633 }
7634
7635 emit_label (label);
7636 }
7637 }
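
/* Illustrative layout of the register save area built above (assuming
   X86_64_REGPARM_MAX == 6 and X86_64_SSE_REGPARM_MAX == 8, as in the SYSV
   x86-64 ABI):

     offset   0 ..  47   %rdi, %rsi, %rdx, %rcx, %r8, %r9  (8 bytes each)
     offset  48 .. 175   %xmm0 .. %xmm7                    (16 bytes each)

   ix86_va_start below initializes gp_offset and fp_offset to index into
   this block.  */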
7638
7639 static void
7640 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7641 {
7642 alias_set_type set = get_varargs_alias_set ();
7643 int i;
7644
 7645   /* Reset to zero, as a sysv va_arg might have been used
 7646      before.  */
7647 ix86_varargs_gpr_size = 0;
7648 ix86_varargs_fpr_size = 0;
7649
7650 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7651 {
7652 rtx reg, mem;
7653
7654 mem = gen_rtx_MEM (Pmode,
7655 plus_constant (virtual_incoming_args_rtx,
7656 i * UNITS_PER_WORD));
7657 MEM_NOTRAP_P (mem) = 1;
7658 set_mem_alias_set (mem, set);
7659
7660 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7661 emit_move_insn (mem, reg);
7662 }
7663 }
7664
7665 static void
7666 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7667 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7668 int no_rtl)
7669 {
7670 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7671 CUMULATIVE_ARGS next_cum;
7672 tree fntype;
7673
 7674   /* This argument doesn't appear to be used anymore, which is good,
 7675      because the old code here didn't suppress rtl generation.  */
7676 gcc_assert (!no_rtl);
7677
7678 if (!TARGET_64BIT)
7679 return;
7680
7681 fntype = TREE_TYPE (current_function_decl);
7682
7683 /* For varargs, we do not want to skip the dummy va_dcl argument.
7684 For stdargs, we do want to skip the last named argument. */
7685 next_cum = *cum;
7686 if (stdarg_p (fntype))
7687 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7688 true);
7689
7690 if (cum->call_abi == MS_ABI)
7691 setup_incoming_varargs_ms_64 (&next_cum);
7692 else
7693 setup_incoming_varargs_64 (&next_cum);
7694 }
7695
7696 /* Checks if TYPE is of kind va_list char *. */
7697
7698 static bool
7699 is_va_list_char_pointer (tree type)
7700 {
7701 tree canonic;
7702
7703 /* For 32-bit it is always true. */
7704 if (!TARGET_64BIT)
7705 return true;
7706 canonic = ix86_canonical_va_list_type (type);
7707 return (canonic == ms_va_list_type_node
7708 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7709 }
7710
7711 /* Implement va_start. */
7712
7713 static void
7714 ix86_va_start (tree valist, rtx nextarg)
7715 {
7716 HOST_WIDE_INT words, n_gpr, n_fpr;
7717 tree f_gpr, f_fpr, f_ovf, f_sav;
7718 tree gpr, fpr, ovf, sav, t;
7719 tree type;
7720 rtx ovf_rtx;
7721
7722 if (flag_split_stack
7723 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7724 {
7725 unsigned int scratch_regno;
7726
7727 /* When we are splitting the stack, we can't refer to the stack
7728 arguments using internal_arg_pointer, because they may be on
7729 the old stack. The split stack prologue will arrange to
7730 leave a pointer to the old stack arguments in a scratch
7731 register, which we here copy to a pseudo-register. The split
7732 stack prologue can't set the pseudo-register directly because
7733 it (the prologue) runs before any registers have been saved. */
7734
7735 scratch_regno = split_stack_prologue_scratch_regno ();
7736 if (scratch_regno != INVALID_REGNUM)
7737 {
7738 rtx reg, seq;
7739
7740 reg = gen_reg_rtx (Pmode);
7741 cfun->machine->split_stack_varargs_pointer = reg;
7742
7743 start_sequence ();
7744 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7745 seq = get_insns ();
7746 end_sequence ();
7747
7748 push_topmost_sequence ();
7749 emit_insn_after (seq, entry_of_function ());
7750 pop_topmost_sequence ();
7751 }
7752 }
7753
7754 /* Only 64bit target needs something special. */
7755 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7756 {
7757 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7758 std_expand_builtin_va_start (valist, nextarg);
7759 else
7760 {
7761 rtx va_r, next;
7762
7763 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7764 next = expand_binop (ptr_mode, add_optab,
7765 cfun->machine->split_stack_varargs_pointer,
7766 crtl->args.arg_offset_rtx,
7767 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7768 convert_move (va_r, next, 0);
7769 }
7770 return;
7771 }
7772
7773 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7774 f_fpr = DECL_CHAIN (f_gpr);
7775 f_ovf = DECL_CHAIN (f_fpr);
7776 f_sav = DECL_CHAIN (f_ovf);
7777
7778 valist = build_simple_mem_ref (valist);
7779 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7780 /* The following should be folded into the MEM_REF offset. */
7781 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7782 f_gpr, NULL_TREE);
7783 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7784 f_fpr, NULL_TREE);
7785 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7786 f_ovf, NULL_TREE);
7787 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7788 f_sav, NULL_TREE);
7789
7790 /* Count number of gp and fp argument registers used. */
7791 words = crtl->args.info.words;
7792 n_gpr = crtl->args.info.regno;
7793 n_fpr = crtl->args.info.sse_regno;
7794
7795 if (cfun->va_list_gpr_size)
7796 {
7797 type = TREE_TYPE (gpr);
7798 t = build2 (MODIFY_EXPR, type,
7799 gpr, build_int_cst (type, n_gpr * 8));
7800 TREE_SIDE_EFFECTS (t) = 1;
7801 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7802 }
7803
7804 if (TARGET_SSE && cfun->va_list_fpr_size)
7805 {
7806 type = TREE_TYPE (fpr);
7807 t = build2 (MODIFY_EXPR, type, fpr,
7808 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7809 TREE_SIDE_EFFECTS (t) = 1;
7810 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7811 }
7812
7813 /* Find the overflow area. */
7814 type = TREE_TYPE (ovf);
7815 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7816 ovf_rtx = crtl->args.internal_arg_pointer;
7817 else
7818 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7819 t = make_tree (type, ovf_rtx);
7820 if (words != 0)
7821 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7822 t = build2 (MODIFY_EXPR, type, ovf, t);
7823 TREE_SIDE_EFFECTS (t) = 1;
7824 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7825
7826 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7827 {
7828 /* Find the register save area.
 7829 	 The function prologue saves it right above the stack frame.  */
7830 type = TREE_TYPE (sav);
7831 t = make_tree (type, frame_pointer_rtx);
7832 if (!ix86_varargs_gpr_size)
7833 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7834 t = build2 (MODIFY_EXPR, type, sav, t);
7835 TREE_SIDE_EFFECTS (t) = 1;
7836 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7837 }
7838 }
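
/* Worked example (illustrative only): for

     int f (int a, ...);

   crtl->args.info records one GP register and no SSE registers used by the
   named arguments, so the code above stores gp_offset = 1 * 8 = 8 and
   fp_offset = 0 * 16 + 8 * X86_64_REGPARM_MAX = 48, points
   overflow_arg_area at the incoming stack arguments, and points
   reg_save_area at the block saved by setup_incoming_varargs_64.  */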
7839
7840 /* Implement va_arg. */
7841
7842 static tree
7843 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7844 gimple_seq *post_p)
7845 {
7846 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7847 tree f_gpr, f_fpr, f_ovf, f_sav;
7848 tree gpr, fpr, ovf, sav, t;
7849 int size, rsize;
7850 tree lab_false, lab_over = NULL_TREE;
7851 tree addr, t2;
7852 rtx container;
7853 int indirect_p = 0;
7854 tree ptrtype;
7855 enum machine_mode nat_mode;
7856 unsigned int arg_boundary;
7857
7858 /* Only 64bit target needs something special. */
7859 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7860 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7861
7862 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7863 f_fpr = DECL_CHAIN (f_gpr);
7864 f_ovf = DECL_CHAIN (f_fpr);
7865 f_sav = DECL_CHAIN (f_ovf);
7866
7867 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7868 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7869 valist = build_va_arg_indirect_ref (valist);
7870 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7871 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7872 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7873
7874 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7875 if (indirect_p)
7876 type = build_pointer_type (type);
7877 size = int_size_in_bytes (type);
7878 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7879
7880 nat_mode = type_natural_mode (type, NULL);
7881 switch (nat_mode)
7882 {
7883 case V8SFmode:
7884 case V8SImode:
7885 case V32QImode:
7886 case V16HImode:
7887 case V4DFmode:
7888 case V4DImode:
 7889       /* Unnamed 256bit vector mode parameters are passed on the stack.  */
7890 if (!TARGET_64BIT_MS_ABI)
7891 {
7892 container = NULL;
7893 break;
7894 }
7895
7896 default:
7897 container = construct_container (nat_mode, TYPE_MODE (type),
7898 type, 0, X86_64_REGPARM_MAX,
7899 X86_64_SSE_REGPARM_MAX, intreg,
7900 0);
7901 break;
7902 }
7903
7904 /* Pull the value out of the saved registers. */
7905
7906 addr = create_tmp_var (ptr_type_node, "addr");
7907
7908 if (container)
7909 {
7910 int needed_intregs, needed_sseregs;
7911 bool need_temp;
7912 tree int_addr, sse_addr;
7913
7914 lab_false = create_artificial_label (UNKNOWN_LOCATION);
7915 lab_over = create_artificial_label (UNKNOWN_LOCATION);
7916
7917 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7918
7919 need_temp = (!REG_P (container)
7920 && ((needed_intregs && TYPE_ALIGN (type) > 64)
7921 || TYPE_ALIGN (type) > 128));
7922
 7923       /* If we are passing a structure, verify that it is a consecutive block
 7924 	 in the register save area.  If not, we need to do moves.  */
7925 if (!need_temp && !REG_P (container))
7926 {
 7927 	  /* Verify that all registers are strictly consecutive.  */
7928 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7929 {
7930 int i;
7931
7932 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7933 {
7934 rtx slot = XVECEXP (container, 0, i);
7935 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7936 || INTVAL (XEXP (slot, 1)) != i * 16)
7937 need_temp = 1;
7938 }
7939 }
7940 else
7941 {
7942 int i;
7943
7944 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7945 {
7946 rtx slot = XVECEXP (container, 0, i);
7947 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
7948 || INTVAL (XEXP (slot, 1)) != i * 8)
7949 need_temp = 1;
7950 }
7951 }
7952 }
7953 if (!need_temp)
7954 {
7955 int_addr = addr;
7956 sse_addr = addr;
7957 }
7958 else
7959 {
7960 int_addr = create_tmp_var (ptr_type_node, "int_addr");
7961 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
7962 }
7963
7964 /* First ensure that we fit completely in registers. */
7965 if (needed_intregs)
7966 {
7967 t = build_int_cst (TREE_TYPE (gpr),
7968 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
7969 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
7970 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7971 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7972 gimplify_and_add (t, pre_p);
7973 }
7974 if (needed_sseregs)
7975 {
7976 t = build_int_cst (TREE_TYPE (fpr),
7977 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
7978 + X86_64_REGPARM_MAX * 8);
7979 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
7980 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7981 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7982 gimplify_and_add (t, pre_p);
7983 }
7984
7985 /* Compute index to start of area used for integer regs. */
7986 if (needed_intregs)
7987 {
7988 /* int_addr = gpr + sav; */
7989 t = fold_build_pointer_plus (sav, gpr);
7990 gimplify_assign (int_addr, t, pre_p);
7991 }
7992 if (needed_sseregs)
7993 {
7994 /* sse_addr = fpr + sav; */
7995 t = fold_build_pointer_plus (sav, fpr);
7996 gimplify_assign (sse_addr, t, pre_p);
7997 }
7998 if (need_temp)
7999 {
8000 int i, prev_size = 0;
8001 tree temp = create_tmp_var (type, "va_arg_tmp");
8002
8003 /* addr = &temp; */
8004 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8005 gimplify_assign (addr, t, pre_p);
8006
8007 for (i = 0; i < XVECLEN (container, 0); i++)
8008 {
8009 rtx slot = XVECEXP (container, 0, i);
8010 rtx reg = XEXP (slot, 0);
8011 enum machine_mode mode = GET_MODE (reg);
8012 tree piece_type;
8013 tree addr_type;
8014 tree daddr_type;
8015 tree src_addr, src;
8016 int src_offset;
8017 tree dest_addr, dest;
8018 int cur_size = GET_MODE_SIZE (mode);
8019
8020 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8021 prev_size = INTVAL (XEXP (slot, 1));
8022 if (prev_size + cur_size > size)
8023 {
8024 cur_size = size - prev_size;
8025 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8026 if (mode == BLKmode)
8027 mode = QImode;
8028 }
8029 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8030 if (mode == GET_MODE (reg))
8031 addr_type = build_pointer_type (piece_type);
8032 else
8033 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8034 true);
8035 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8036 true);
8037
8038 if (SSE_REGNO_P (REGNO (reg)))
8039 {
8040 src_addr = sse_addr;
8041 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8042 }
8043 else
8044 {
8045 src_addr = int_addr;
8046 src_offset = REGNO (reg) * 8;
8047 }
8048 src_addr = fold_convert (addr_type, src_addr);
8049 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8050
8051 dest_addr = fold_convert (daddr_type, addr);
8052 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8053 if (cur_size == GET_MODE_SIZE (mode))
8054 {
8055 src = build_va_arg_indirect_ref (src_addr);
8056 dest = build_va_arg_indirect_ref (dest_addr);
8057
8058 gimplify_assign (dest, src, pre_p);
8059 }
8060 else
8061 {
8062 tree copy
8063 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8064 3, dest_addr, src_addr,
8065 size_int (cur_size));
8066 gimplify_and_add (copy, pre_p);
8067 }
8068 prev_size += cur_size;
8069 }
8070 }
8071
8072 if (needed_intregs)
8073 {
8074 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8075 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8076 gimplify_assign (gpr, t, pre_p);
8077 }
8078
8079 if (needed_sseregs)
8080 {
8081 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8082 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8083 gimplify_assign (fpr, t, pre_p);
8084 }
8085
8086 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8087
8088 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8089 }
8090
8091 /* ... otherwise out of the overflow area. */
8092
 8093   /* When we align a parameter on the stack for the caller, if the
 8094      parameter alignment exceeds MAX_SUPPORTED_STACK_ALIGNMENT, it will
 8095      be aligned at MAX_SUPPORTED_STACK_ALIGNMENT.  Match the callee here
 8096      with the caller.  */
8097 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8098 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8099 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8100
8101 /* Care for on-stack alignment if needed. */
8102 if (arg_boundary <= 64 || size == 0)
8103 t = ovf;
8104 else
8105 {
8106 HOST_WIDE_INT align = arg_boundary / 8;
8107 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8108 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8109 build_int_cst (TREE_TYPE (t), -align));
8110 }
8111
8112 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8113 gimplify_assign (addr, t, pre_p);
8114
8115 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8116 gimplify_assign (unshare_expr (ovf), t, pre_p);
8117
8118 if (container)
8119 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8120
8121 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8122 addr = fold_convert (ptrtype, addr);
8123
8124 if (indirect_p)
8125 addr = build_va_arg_indirect_ref (addr);
8126 return build_va_arg_indirect_ref (addr);
8127 }
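
/* Worked example (illustrative only): for va_arg (ap, int) the container
   needs one integer register, so the code above emits the equivalent of

     if (ap->gp_offset >= (6 - 1 + 1) * 8)     i.e. >= 48: regs exhausted
       goto lab_false;                         fetch from the overflow area
     addr = ap->reg_save_area + ap->gp_offset;
     ap->gp_offset += 8;

   with the overflow path advancing overflow_arg_area by one word.  */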
8128 \f
8129 /* Return true if OPNUM's MEM should be matched
8130 in movabs* patterns. */
8131
8132 bool
8133 ix86_check_movabs (rtx insn, int opnum)
8134 {
8135 rtx set, mem;
8136
8137 set = PATTERN (insn);
8138 if (GET_CODE (set) == PARALLEL)
8139 set = XVECEXP (set, 0, 0);
8140 gcc_assert (GET_CODE (set) == SET);
8141 mem = XEXP (set, opnum);
8142 while (GET_CODE (mem) == SUBREG)
8143 mem = SUBREG_REG (mem);
8144 gcc_assert (MEM_P (mem));
8145 return volatile_ok || !MEM_VOLATILE_P (mem);
8146 }
8147 \f
8148 /* Initialize the table of extra 80387 mathematical constants. */
8149
8150 static void
8151 init_ext_80387_constants (void)
8152 {
8153 static const char * cst[5] =
8154 {
8155 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8156 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8157 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8158 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8159 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8160 };
8161 int i;
8162
8163 for (i = 0; i < 5; i++)
8164 {
8165 real_from_string (&ext_80387_constants_table[i], cst[i]);
8166 /* Ensure each constant is rounded to XFmode precision. */
8167 real_convert (&ext_80387_constants_table[i],
8168 XFmode, &ext_80387_constants_table[i]);
8169 }
8170
8171 ext_80387_constants_init = 1;
8172 }
8173
8174 /* Return non-zero if the constant is something that
8175 can be loaded with a special instruction. */
8176
8177 int
8178 standard_80387_constant_p (rtx x)
8179 {
8180 enum machine_mode mode = GET_MODE (x);
8181
8182 REAL_VALUE_TYPE r;
8183
8184 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8185 return -1;
8186
8187 if (x == CONST0_RTX (mode))
8188 return 1;
8189 if (x == CONST1_RTX (mode))
8190 return 2;
8191
8192 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8193
8194 /* For XFmode constants, try to find a special 80387 instruction when
8195 optimizing for size or on those CPUs that benefit from them. */
8196 if (mode == XFmode
8197 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8198 {
8199 int i;
8200
8201 if (! ext_80387_constants_init)
8202 init_ext_80387_constants ();
8203
8204 for (i = 0; i < 5; i++)
8205 if (real_identical (&r, &ext_80387_constants_table[i]))
8206 return i + 3;
8207 }
8208
 8209   /* A load of the constant -0.0 or -1.0 will be split into an
 8210      fldz;fchs or fld1;fchs sequence.  */
8211 if (real_isnegzero (&r))
8212 return 8;
8213 if (real_identical (&r, &dconstm1))
8214 return 9;
8215
8216 return 0;
8217 }
8218
8219 /* Return the opcode of the special instruction to be used to load
8220 the constant X. */
8221
8222 const char *
8223 standard_80387_constant_opcode (rtx x)
8224 {
8225 switch (standard_80387_constant_p (x))
8226 {
8227 case 1:
8228 return "fldz";
8229 case 2:
8230 return "fld1";
8231 case 3:
8232 return "fldlg2";
8233 case 4:
8234 return "fldln2";
8235 case 5:
8236 return "fldl2e";
8237 case 6:
8238 return "fldl2t";
8239 case 7:
8240 return "fldpi";
8241 case 8:
8242 case 9:
8243 return "#";
8244 default:
8245 gcc_unreachable ();
8246 }
8247 }
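
/* Illustrative mapping (a summary of the two functions above):

     standard_80387_constant_p      opcode
     1  (+0.0)                      fldz
     2  (+1.0)                      fld1
     3 .. 7 (lg2, ln2, l2e, l2t, pi)  fldlg2 / fldln2 / fldl2e / fldl2t / fldpi
     8  (-0.0), 9 (-1.0)            "#": split later into fldz;fchs or
                                    fld1;fchs                              */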
8248
8249 /* Return the CONST_DOUBLE representing the 80387 constant that is
8250 loaded by the specified special instruction. The argument IDX
8251 matches the return value from standard_80387_constant_p. */
8252
8253 rtx
8254 standard_80387_constant_rtx (int idx)
8255 {
8256 int i;
8257
8258 if (! ext_80387_constants_init)
8259 init_ext_80387_constants ();
8260
8261 switch (idx)
8262 {
8263 case 3:
8264 case 4:
8265 case 5:
8266 case 6:
8267 case 7:
8268 i = idx - 3;
8269 break;
8270
8271 default:
8272 gcc_unreachable ();
8273 }
8274
8275 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8276 XFmode);
8277 }
8278
 8279 /* Return 1 if X is all 0s and 2 if X is all 1s
 8280    in a supported SSE/AVX vector mode.  */
8281
8282 int
8283 standard_sse_constant_p (rtx x)
8284 {
8285 enum machine_mode mode = GET_MODE (x);
8286
8287 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8288 return 1;
8289 if (vector_all_ones_operand (x, mode))
8290 switch (mode)
8291 {
8292 case V16QImode:
8293 case V8HImode:
8294 case V4SImode:
8295 case V2DImode:
8296 if (TARGET_SSE2)
8297 return 2;
8298 case V32QImode:
8299 case V16HImode:
8300 case V8SImode:
8301 case V4DImode:
8302 if (TARGET_AVX2)
8303 return 2;
8304 default:
8305 break;
8306 }
8307
8308 return 0;
8309 }
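
/* For example (an illustrative note): an all-zero V4SF constant yields 1
   and is loaded with a pxor/xorps idiom, while an all-ones V4SI constant
   yields 2 and is loaded with pcmpeqd; the opcode selection itself is done
   by standard_sse_constant_opcode below.  */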
8310
8311 /* Return the opcode of the special instruction to be used to load
8312 the constant X. */
8313
8314 const char *
8315 standard_sse_constant_opcode (rtx insn, rtx x)
8316 {
8317 switch (standard_sse_constant_p (x))
8318 {
8319 case 1:
8320 switch (get_attr_mode (insn))
8321 {
8322 case MODE_TI:
8323 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8324 return "%vpxor\t%0, %d0";
8325 case MODE_V2DF:
8326 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8327 return "%vxorpd\t%0, %d0";
8328 case MODE_V4SF:
8329 return "%vxorps\t%0, %d0";
8330
8331 case MODE_OI:
8332 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8333 return "vpxor\t%x0, %x0, %x0";
8334 case MODE_V4DF:
8335 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8336 return "vxorpd\t%x0, %x0, %x0";
8337 case MODE_V8SF:
8338 return "vxorps\t%x0, %x0, %x0";
8339
8340 default:
8341 break;
8342 }
8343
8344 case 2:
8345 if (TARGET_AVX)
8346 return "vpcmpeqd\t%0, %0, %0";
8347 else
8348 return "pcmpeqd\t%0, %0";
8349
8350 default:
8351 break;
8352 }
8353 gcc_unreachable ();
8354 }
8355
 8356 /* Returns true if OP contains a symbol reference.  */
8357
8358 bool
8359 symbolic_reference_mentioned_p (rtx op)
8360 {
8361 const char *fmt;
8362 int i;
8363
8364 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8365 return true;
8366
8367 fmt = GET_RTX_FORMAT (GET_CODE (op));
8368 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8369 {
8370 if (fmt[i] == 'E')
8371 {
8372 int j;
8373
8374 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8375 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8376 return true;
8377 }
8378
8379 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8380 return true;
8381 }
8382
8383 return false;
8384 }
8385
8386 /* Return true if it is appropriate to emit `ret' instructions in the
8387 body of a function. Do this only if the epilogue is simple, needing a
8388 couple of insns. Prior to reloading, we can't tell how many registers
8389 must be saved, so return false then. Return false if there is no frame
8390 marker to de-allocate. */
8391
8392 bool
8393 ix86_can_use_return_insn_p (void)
8394 {
8395 struct ix86_frame frame;
8396
8397 if (! reload_completed || frame_pointer_needed)
8398 return 0;
8399
8400 /* Don't allow more than 32k pop, since that's all we can do
8401 with one instruction. */
8402 if (crtl->args.pops_args && crtl->args.size >= 32768)
8403 return 0;
8404
8405 ix86_compute_frame_layout (&frame);
8406 return (frame.stack_pointer_offset == UNITS_PER_WORD
8407 && (frame.nregs + frame.nsseregs) == 0);
8408 }
8409 \f
8410 /* Value should be nonzero if functions must have frame pointers.
8411 Zero means the frame pointer need not be set up (and parms may
8412 be accessed via the stack pointer) in functions that seem suitable. */
8413
8414 static bool
8415 ix86_frame_pointer_required (void)
8416 {
8417 /* If we accessed previous frames, then the generated code expects
8418 to be able to access the saved ebp value in our frame. */
8419 if (cfun->machine->accesses_prev_frame)
8420 return true;
8421
 8422   /* Several x86 OSes need a frame pointer for other reasons,
 8423      usually pertaining to setjmp.  */
8424 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8425 return true;
8426
 8427   /* For older 32-bit runtimes, setjmp requires a valid frame pointer.  */
8428 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8429 return true;
8430
8431 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8432 turns off the frame pointer by default. Turn it back on now if
8433 we've not got a leaf function. */
8434 if (TARGET_OMIT_LEAF_FRAME_POINTER
8435 && (!current_function_is_leaf
8436 || ix86_current_function_calls_tls_descriptor))
8437 return true;
8438
8439 if (crtl->profile && !flag_fentry)
8440 return true;
8441
8442 return false;
8443 }
8444
8445 /* Record that the current function accesses previous call frames. */
8446
8447 void
8448 ix86_setup_frame_addresses (void)
8449 {
8450 cfun->machine->accesses_prev_frame = 1;
8451 }
8452 \f
8453 #ifndef USE_HIDDEN_LINKONCE
8454 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8455 # define USE_HIDDEN_LINKONCE 1
8456 # else
8457 # define USE_HIDDEN_LINKONCE 0
8458 # endif
8459 #endif
8460
8461 static int pic_labels_used;
8462
8463 /* Fills in the label name that should be used for a pc thunk for
8464 the given register. */
8465
8466 static void
8467 get_pc_thunk_name (char name[32], unsigned int regno)
8468 {
8469 gcc_assert (!TARGET_64BIT);
8470
8471 if (USE_HIDDEN_LINKONCE)
8472 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8473 else
8474 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8475 }
8476
8477
 8478 /* Emit the pc thunks needed for -fpic: each thunk loads its register
 8479    with the return address of the caller and then returns.  */
8480
8481 static void
8482 ix86_code_end (void)
8483 {
8484 rtx xops[2];
8485 int regno;
8486
8487 for (regno = AX_REG; regno <= SP_REG; regno++)
8488 {
8489 char name[32];
8490 tree decl;
8491
8492 if (!(pic_labels_used & (1 << regno)))
8493 continue;
8494
8495 get_pc_thunk_name (name, regno);
8496
8497 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8498 get_identifier (name),
8499 build_function_type_list (void_type_node, NULL_TREE));
8500 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8501 NULL_TREE, void_type_node);
8502 TREE_PUBLIC (decl) = 1;
8503 TREE_STATIC (decl) = 1;
8504
8505 #if TARGET_MACHO
8506 if (TARGET_MACHO)
8507 {
8508 switch_to_section (darwin_sections[text_coal_section]);
8509 fputs ("\t.weak_definition\t", asm_out_file);
8510 assemble_name (asm_out_file, name);
8511 fputs ("\n\t.private_extern\t", asm_out_file);
8512 assemble_name (asm_out_file, name);
8513 putc ('\n', asm_out_file);
8514 ASM_OUTPUT_LABEL (asm_out_file, name);
8515 DECL_WEAK (decl) = 1;
8516 }
8517 else
8518 #endif
8519 if (USE_HIDDEN_LINKONCE)
8520 {
8521 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8522
8523 targetm.asm_out.unique_section (decl, 0);
8524 switch_to_section (get_named_section (decl, NULL, 0));
8525
8526 targetm.asm_out.globalize_label (asm_out_file, name);
8527 fputs ("\t.hidden\t", asm_out_file);
8528 assemble_name (asm_out_file, name);
8529 putc ('\n', asm_out_file);
8530 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8531 }
8532 else
8533 {
8534 switch_to_section (text_section);
8535 ASM_OUTPUT_LABEL (asm_out_file, name);
8536 }
8537
8538 DECL_INITIAL (decl) = make_node (BLOCK);
8539 current_function_decl = decl;
8540 init_function_start (decl);
8541 first_function_block_is_cold = false;
8542 /* Make sure unwind info is emitted for the thunk if needed. */
8543 final_start_function (emit_barrier (), asm_out_file, 1);
8544
8545 /* Pad stack IP move with 4 instructions (two NOPs count
8546 as one instruction). */
8547 if (TARGET_PAD_SHORT_FUNCTION)
8548 {
8549 int i = 8;
8550
8551 while (i--)
8552 fputs ("\tnop\n", asm_out_file);
8553 }
8554
8555 xops[0] = gen_rtx_REG (Pmode, regno);
8556 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8557 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8558 fputs ("\tret\n", asm_out_file);
8559 final_end_function ();
8560 init_insn_lengths ();
8561 free_after_compilation (cfun);
8562 set_cfun (NULL);
8563 current_function_decl = NULL;
8564 }
8565
8566 if (flag_split_stack)
8567 file_end_indicate_split_stack ();
8568 }
8569
8570 /* Emit code for the SET_GOT patterns. */
8571
8572 const char *
8573 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8574 {
8575 rtx xops[3];
8576
8577 xops[0] = dest;
8578
8579 if (TARGET_VXWORKS_RTP && flag_pic)
8580 {
8581 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8582 xops[2] = gen_rtx_MEM (Pmode,
8583 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8584 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8585
8586 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8587 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8588 an unadorned address. */
8589 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8590 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8591 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8592 return "";
8593 }
8594
8595 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8596
8597 if (!flag_pic)
8598 {
8599 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8600
8601 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8602
8603 #if TARGET_MACHO
8604 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8605 is what will be referenced by the Mach-O PIC subsystem. */
8606 if (!label)
8607 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8608 #endif
8609
8610 targetm.asm_out.internal_label (asm_out_file, "L",
8611 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8612 }
8613 else
8614 {
8615 char name[32];
8616 get_pc_thunk_name (name, REGNO (dest));
8617 pic_labels_used |= 1 << REGNO (dest);
8618
8619 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8620 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8621 output_asm_insn ("call\t%X2", xops);
8622 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8623 is what will be referenced by the Mach-O PIC subsystem. */
8624 #if TARGET_MACHO
8625 if (!label)
8626 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8627 else
8628 targetm.asm_out.internal_label (asm_out_file, "L",
8629 CODE_LABEL_NUMBER (label));
8630 #endif
8631 }
8632
8633 if (!TARGET_MACHO)
8634 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8635
8636 return "";
8637 }
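
/* Illustrative output (not compiler code): with -fpic on ia32, setting up
   the GOT pointer in %ebx typically expands to

     call  __x86.get_pc_thunk.bx
     addl  $_GLOBAL_OFFSET_TABLE_, %ebx

   where the thunk emitted by ix86_code_end above is simply

     __x86.get_pc_thunk.bx:
     movl  (%esp), %ebx
     ret                                                                   */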
8638
 8639 /* Generate a "push" pattern for input ARG.  */
8640
8641 static rtx
8642 gen_push (rtx arg)
8643 {
8644 struct machine_function *m = cfun->machine;
8645
8646 if (m->fs.cfa_reg == stack_pointer_rtx)
8647 m->fs.cfa_offset += UNITS_PER_WORD;
8648 m->fs.sp_offset += UNITS_PER_WORD;
8649
8650 return gen_rtx_SET (VOIDmode,
8651 gen_rtx_MEM (Pmode,
8652 gen_rtx_PRE_DEC (Pmode,
8653 stack_pointer_rtx)),
8654 arg);
8655 }
8656
 8657 /* Generate a "pop" pattern for input ARG.  */
8658
8659 static rtx
8660 gen_pop (rtx arg)
8661 {
8662 return gen_rtx_SET (VOIDmode,
8663 arg,
8664 gen_rtx_MEM (Pmode,
8665 gen_rtx_POST_INC (Pmode,
8666 stack_pointer_rtx)));
8667 }
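
/* For reference (illustrative only), on ia32 these two helpers build the
   RTL patterns

     (set (mem:SI (pre_dec:SI (reg:SI sp))) (reg:SI arg))     gen_push
     (set (reg:SI arg) (mem:SI (post_inc:SI (reg:SI sp))))    gen_pop

   and gen_push additionally keeps the frame-state offsets in
   cfun->machine->fs up to date.  */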
8668
 8669 /* Return the number of an unused call-clobbered register if one is
 8670    available for the entire function, or INVALID_REGNUM otherwise.  */
8671
8672 static unsigned int
8673 ix86_select_alt_pic_regnum (void)
8674 {
8675 if (current_function_is_leaf
8676 && !crtl->profile
8677 && !ix86_current_function_calls_tls_descriptor)
8678 {
8679 int i, drap;
8680 /* Can't use the same register for both PIC and DRAP. */
8681 if (crtl->drap_reg)
8682 drap = REGNO (crtl->drap_reg);
8683 else
8684 drap = -1;
8685 for (i = 2; i >= 0; --i)
8686 if (i != drap && !df_regs_ever_live_p (i))
8687 return i;
8688 }
8689
8690 return INVALID_REGNUM;
8691 }
8692
8693 /* Return TRUE if we need to save REGNO. */
8694
8695 static bool
8696 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8697 {
8698 if (pic_offset_table_rtx
8699 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8700 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8701 || crtl->profile
8702 || crtl->calls_eh_return
8703 || crtl->uses_const_pool))
8704 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8705
8706 if (crtl->calls_eh_return && maybe_eh_return)
8707 {
8708 unsigned i;
8709 for (i = 0; ; i++)
8710 {
8711 unsigned test = EH_RETURN_DATA_REGNO (i);
8712 if (test == INVALID_REGNUM)
8713 break;
8714 if (test == regno)
8715 return true;
8716 }
8717 }
8718
8719 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8720 return true;
8721
8722 return (df_regs_ever_live_p (regno)
8723 && !call_used_regs[regno]
8724 && !fixed_regs[regno]
8725 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8726 }
8727
 8728 /* Return the number of saved general purpose registers.  */
8729
8730 static int
8731 ix86_nsaved_regs (void)
8732 {
8733 int nregs = 0;
8734 int regno;
8735
8736 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8737 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8738 nregs ++;
8739 return nregs;
8740 }
8741
8742 /* Return the number of saved SSE registers. */
8743
8744 static int
8745 ix86_nsaved_sseregs (void)
8746 {
8747 int nregs = 0;
8748 int regno;
8749
8750 if (!TARGET_64BIT_MS_ABI)
8751 return 0;
8752 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8753 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8754 nregs ++;
8755 return nregs;
8756 }
8757
8758 /* Given FROM and TO register numbers, say whether this elimination is
8759 allowed. If stack alignment is needed, we can only replace argument
8760 pointer with hard frame pointer, or replace frame pointer with stack
8761 pointer. Otherwise, frame pointer elimination is automatically
8762 handled and all other eliminations are valid. */
8763
8764 static bool
8765 ix86_can_eliminate (const int from, const int to)
8766 {
8767 if (stack_realign_fp)
8768 return ((from == ARG_POINTER_REGNUM
8769 && to == HARD_FRAME_POINTER_REGNUM)
8770 || (from == FRAME_POINTER_REGNUM
8771 && to == STACK_POINTER_REGNUM));
8772 else
8773 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8774 }
8775
8776 /* Return the offset between two registers, one to be eliminated, and the other
8777 its replacement, at the start of a routine. */
8778
8779 HOST_WIDE_INT
8780 ix86_initial_elimination_offset (int from, int to)
8781 {
8782 struct ix86_frame frame;
8783 ix86_compute_frame_layout (&frame);
8784
8785 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8786 return frame.hard_frame_pointer_offset;
8787 else if (from == FRAME_POINTER_REGNUM
8788 && to == HARD_FRAME_POINTER_REGNUM)
8789 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8790 else
8791 {
8792 gcc_assert (to == STACK_POINTER_REGNUM);
8793
8794 if (from == ARG_POINTER_REGNUM)
8795 return frame.stack_pointer_offset;
8796
8797 gcc_assert (from == FRAME_POINTER_REGNUM);
8798 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8799 }
8800 }
8801
8802 /* In a dynamically-aligned function, we can't know the offset from
8803 stack pointer to frame pointer, so we must ensure that setjmp
8804 eliminates fp against the hard fp (%ebp) rather than trying to
8805 index from %esp up to the top of the frame across a gap that is
8806 of unknown (at compile-time) size. */
8807 static rtx
8808 ix86_builtin_setjmp_frame_value (void)
8809 {
8810 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8811 }
8812
8813 /* When using -fsplit-stack, the allocation routines set a field in
8814 the TCB to the bottom of the stack plus this much space, measured
8815 in bytes. */
8816
8817 #define SPLIT_STACK_AVAILABLE 256
8818
8819 /* Fill structure ix86_frame describing the frame of the current function. */
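/* Roughly, the layout computed below runs, from the CFA downwards, through
the return address (plus any pushed static chain and saved frame pointer),
the general purpose register save area, a 16-byte aligned SSE save area,
the va-arg save area, the local variables and the outgoing argument block;
the red zone, when usable, is then subtracted from the final
stack_pointer_offset. */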
8820
8821 static void
8822 ix86_compute_frame_layout (struct ix86_frame *frame)
8823 {
8824 unsigned int stack_alignment_needed;
8825 HOST_WIDE_INT offset;
8826 unsigned int preferred_alignment;
8827 HOST_WIDE_INT size = get_frame_size ();
8828 HOST_WIDE_INT to_allocate;
8829
8830 frame->nregs = ix86_nsaved_regs ();
8831 frame->nsseregs = ix86_nsaved_sseregs ();
8832
8833 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8834 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8835
8836 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
8837 except for function prologues and leaf functions. */
8838 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8839 && (!current_function_is_leaf || cfun->calls_alloca != 0
8840 || ix86_current_function_calls_tls_descriptor))
8841 {
8842 preferred_alignment = 16;
8843 stack_alignment_needed = 16;
8844 crtl->preferred_stack_boundary = 128;
8845 crtl->stack_alignment_needed = 128;
8846 }
8847
8848 gcc_assert (!size || stack_alignment_needed);
8849 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8850 gcc_assert (preferred_alignment <= stack_alignment_needed);
8851
8852 /* For SEH we have to limit the amount of code movement into the prologue.
8853 At present we do this via a BLOCKAGE, at which point there's very little
8854 scheduling that can be done, which means that there's very little point
8855 in doing anything except PUSHs. */
8856 if (TARGET_SEH)
8857 cfun->machine->use_fast_prologue_epilogue = false;
8858
8859 /* During reload iterations the number of registers saved can change.
8860 Recompute the value as needed. Do not recompute when the number of registers
8861 didn't change, as reload makes multiple calls to this function and does not
8862 expect the decision to change within a single iteration. */
8863 else if (!optimize_function_for_size_p (cfun)
8864 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8865 {
8866 int count = frame->nregs;
8867 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8868
8869 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8870
8871 /* The fast prologue uses move instead of push to save registers. This
8872 is significantly longer, but also executes faster as modern hardware
8873 can execute the moves in parallel, but can't do that for push/pop.
8874
8875 Be careful about choosing which prologue to emit: when the function takes
8876 many instructions to execute we may use the slow version, as well as when
8877 the function is known to be outside a hot spot (this is known only with
8878 feedback). Weight the size of the function by the number of registers
8879 to save, as it is cheap to use one or two push instructions but very
8880 slow to use many of them. */
8881 if (count)
8882 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8883 if (node->frequency < NODE_FREQUENCY_NORMAL
8884 || (flag_branch_probabilities
8885 && node->frequency < NODE_FREQUENCY_HOT))
8886 cfun->machine->use_fast_prologue_epilogue = false;
8887 else
8888 cfun->machine->use_fast_prologue_epilogue
8889 = !expensive_function_p (count);
8890 }
8891
8892 frame->save_regs_using_mov
8893 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8894 /* If static stack checking is enabled and done with probes,
8895 the registers need to be saved before allocating the frame. */
8896 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8897
8898 /* Skip return address. */
8899 offset = UNITS_PER_WORD;
8900
8901 /* Skip pushed static chain. */
8902 if (ix86_static_chain_on_stack)
8903 offset += UNITS_PER_WORD;
8904
8905 /* Skip saved base pointer. */
8906 if (frame_pointer_needed)
8907 offset += UNITS_PER_WORD;
8908 frame->hfp_save_offset = offset;
8909
8910 /* The traditional frame pointer location is at the top of the frame. */
8911 frame->hard_frame_pointer_offset = offset;
8912
8913 /* Register save area */
8914 offset += frame->nregs * UNITS_PER_WORD;
8915 frame->reg_save_offset = offset;
8916
8917 /* Align and set SSE register save area. */
8918 if (frame->nsseregs)
8919 {
8920 /* The only ABI that has saved SSE registers (Win64) also has a
8921 16-byte aligned default stack, and thus we don't need to be
8922 within the re-aligned local stack frame to save them. */
8923 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
8924 offset = (offset + 16 - 1) & -16;
8925 offset += frame->nsseregs * 16;
8926 }
8927 frame->sse_reg_save_offset = offset;
8928
8929 /* The re-aligned stack starts here. Values before this point are not
8930 directly comparable with values below this point. In order to make
8931 sure that no value happens to be the same before and after, force
8932 the alignment computation below to add a non-zero value. */
8933 if (stack_realign_fp)
8934 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8935
8936 /* Va-arg area */
8937 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8938 offset += frame->va_arg_size;
8939
8940 /* Align start of frame for local function. */
8941 if (stack_realign_fp
8942 || offset != frame->sse_reg_save_offset
8943 || size != 0
8944 || !current_function_is_leaf
8945 || cfun->calls_alloca
8946 || ix86_current_function_calls_tls_descriptor)
8947 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
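/* For example, with offset == 44 and a 16-byte stack_alignment_needed this
rounds the offset up to 48. */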
8948
8949 /* Frame pointer points here. */
8950 frame->frame_pointer_offset = offset;
8951
8952 offset += size;
8953
8954 /* Add the outgoing arguments area. It can be skipped if we eliminated
8955 all the function calls as dead code.
8956 Skipping is however impossible when the function calls alloca, as the
8957 alloca expander assumes that the last crtl->outgoing_args_size bytes
8958 of the stack frame are unused. */
8959 if (ACCUMULATE_OUTGOING_ARGS
8960 && (!current_function_is_leaf || cfun->calls_alloca
8961 || ix86_current_function_calls_tls_descriptor))
8962 {
8963 offset += crtl->outgoing_args_size;
8964 frame->outgoing_arguments_size = crtl->outgoing_args_size;
8965 }
8966 else
8967 frame->outgoing_arguments_size = 0;
8968
8969 /* Align stack boundary. Only needed if we're calling another function
8970 or using alloca. */
8971 if (!current_function_is_leaf || cfun->calls_alloca
8972 || ix86_current_function_calls_tls_descriptor)
8973 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
8974
8975 /* We've reached end of stack frame. */
8976 frame->stack_pointer_offset = offset;
8977
8978 /* Size prologue needs to allocate. */
8979 to_allocate = offset - frame->sse_reg_save_offset;
8980
8981 if ((!to_allocate && frame->nregs <= 1)
8982 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
8983 frame->save_regs_using_mov = false;
8984
8985 if (ix86_using_red_zone ()
8986 && current_function_sp_is_unchanging
8987 && current_function_is_leaf
8988 && !ix86_current_function_calls_tls_descriptor)
8989 {
8990 frame->red_zone_size = to_allocate;
8991 if (frame->save_regs_using_mov)
8992 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
8993 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
8994 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
8995 }
8996 else
8997 frame->red_zone_size = 0;
8998 frame->stack_pointer_offset -= frame->red_zone_size;
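/* On x86-64 the red zone is the 128 bytes below the stack pointer that the
psABI guarantees will not be clobbered by signal or interrupt handlers, so a
small leaf frame can live there without any explicit stack adjustment. */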
8999
9000 /* The SEH frame pointer location is near the bottom of the frame.
9001 This is enforced by the fact that the difference between the
9002 stack pointer and the frame pointer is limited to 240 bytes in
9003 the unwind data structure. */
9004 if (TARGET_SEH)
9005 {
9006 HOST_WIDE_INT diff;
9007
9008 /* If we can leave the frame pointer where it is, do so. */
9009 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9010 if (diff > 240 || (diff & 15) != 0)
9011 {
9012 /* Ideally we'd determine what portion of the local stack frame
9013 (within the constraint of the lowest 240) is most heavily used.
9014 But without that complication, simply bias the frame pointer
9015 by 128 bytes so as to maximize the amount of the local stack
9016 frame that is addressable with 8-bit offsets. */
9017 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9018 }
9019 }
9020 }
9021
9022 /* This is semi-inlined memory_address_length, but simplified
9023 since we know that we're always dealing with reg+offset, and
9024 to avoid having to create and discard all that rtl. */
9025
9026 static inline int
9027 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9028 {
9029 int len = 4;
9030
9031 if (offset == 0)
9032 {
9033 /* EBP and R13 cannot be encoded without an offset. */
9034 len = (regno == BP_REG || regno == R13_REG);
9035 }
9036 else if (IN_RANGE (offset, -128, 127))
9037 len = 1;
9038
9039 /* ESP and R12 must be encoded with a SIB byte. */
9040 if (regno == SP_REG || regno == R12_REG)
9041 len++;
9042
9043 return len;
9044 }
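/* For instance, under the encoding rules above, 0(%ebp) still needs a disp8
(length 1), 8(%esp) needs both a SIB byte and a disp8 (length 2), and
4096(%eax) needs a full disp32 (length 4). */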
9045
9046 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9047 The valid base registers are taken from CFUN->MACHINE->FS. */
9048
9049 static rtx
9050 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9051 {
9052 const struct machine_function *m = cfun->machine;
9053 rtx base_reg = NULL;
9054 HOST_WIDE_INT base_offset = 0;
9055
9056 if (m->use_fast_prologue_epilogue)
9057 {
9058 /* Choose the base register most likely to allow the most scheduling
9059 opportunities. Generally FP is valid throughout the function,
9060 while DRAP must be reloaded within the epilogue. But choose either
9061 over SP, whose addresses require a larger encoding. */
9062
9063 if (m->fs.fp_valid)
9064 {
9065 base_reg = hard_frame_pointer_rtx;
9066 base_offset = m->fs.fp_offset - cfa_offset;
9067 }
9068 else if (m->fs.drap_valid)
9069 {
9070 base_reg = crtl->drap_reg;
9071 base_offset = 0 - cfa_offset;
9072 }
9073 else if (m->fs.sp_valid)
9074 {
9075 base_reg = stack_pointer_rtx;
9076 base_offset = m->fs.sp_offset - cfa_offset;
9077 }
9078 }
9079 else
9080 {
9081 HOST_WIDE_INT toffset;
9082 int len = 16, tlen;
9083
9084 /* Choose the base register with the smallest address encoding.
9085 With a tie, choose FP > DRAP > SP. */
9086 if (m->fs.sp_valid)
9087 {
9088 base_reg = stack_pointer_rtx;
9089 base_offset = m->fs.sp_offset - cfa_offset;
9090 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9091 }
9092 if (m->fs.drap_valid)
9093 {
9094 toffset = 0 - cfa_offset;
9095 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9096 if (tlen <= len)
9097 {
9098 base_reg = crtl->drap_reg;
9099 base_offset = toffset;
9100 len = tlen;
9101 }
9102 }
9103 if (m->fs.fp_valid)
9104 {
9105 toffset = m->fs.fp_offset - cfa_offset;
9106 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9107 if (tlen <= len)
9108 {
9109 base_reg = hard_frame_pointer_rtx;
9110 base_offset = toffset;
9111 len = tlen;
9112 }
9113 }
9114 }
9115 gcc_assert (base_reg != NULL);
9116
9117 return plus_constant (base_reg, base_offset);
9118 }
9119
9120 /* Emit code to save registers in the prologue. */
9121
9122 static void
9123 ix86_emit_save_regs (void)
9124 {
9125 unsigned int regno;
9126 rtx insn;
9127
9128 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9129 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9130 {
9131 insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
9132 RTX_FRAME_RELATED_P (insn) = 1;
9133 }
9134 }
9135
9136 /* Emit a single register save at CFA - CFA_OFFSET. */
9137
9138 static void
9139 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9140 HOST_WIDE_INT cfa_offset)
9141 {
9142 struct machine_function *m = cfun->machine;
9143 rtx reg = gen_rtx_REG (mode, regno);
9144 rtx mem, addr, base, insn;
9145
9146 addr = choose_baseaddr (cfa_offset);
9147 mem = gen_frame_mem (mode, addr);
9148
9149 /* For SSE saves, we need to indicate the 128-bit alignment. */
9150 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9151
9152 insn = emit_move_insn (mem, reg);
9153 RTX_FRAME_RELATED_P (insn) = 1;
9154
9155 base = addr;
9156 if (GET_CODE (base) == PLUS)
9157 base = XEXP (base, 0);
9158 gcc_checking_assert (REG_P (base));
9159
9160 /* When saving registers into a re-aligned local stack frame, avoid
9161 any tricky guessing by dwarf2out. */
9162 if (m->fs.realigned)
9163 {
9164 gcc_checking_assert (stack_realign_drap);
9165
9166 if (regno == REGNO (crtl->drap_reg))
9167 {
9168 /* A bit of a hack. We force the DRAP register to be saved in
9169 the re-aligned stack frame, which provides us with a copy
9170 of the CFA that will last past the prologue. Install it. */
9171 gcc_checking_assert (cfun->machine->fs.fp_valid);
9172 addr = plus_constant (hard_frame_pointer_rtx,
9173 cfun->machine->fs.fp_offset - cfa_offset);
9174 mem = gen_rtx_MEM (mode, addr);
9175 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9176 }
9177 else
9178 {
9179 /* The frame pointer is a stable reference within the
9180 aligned frame. Use it. */
9181 gcc_checking_assert (cfun->machine->fs.fp_valid);
9182 addr = plus_constant (hard_frame_pointer_rtx,
9183 cfun->machine->fs.fp_offset - cfa_offset);
9184 mem = gen_rtx_MEM (mode, addr);
9185 add_reg_note (insn, REG_CFA_EXPRESSION,
9186 gen_rtx_SET (VOIDmode, mem, reg));
9187 }
9188 }
9189
9190 /* The memory may not be relative to the current CFA register,
9191 which means that we may need to generate a new pattern for
9192 use by the unwind info. */
9193 else if (base != m->fs.cfa_reg)
9194 {
9195 addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9196 mem = gen_rtx_MEM (mode, addr);
9197 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9198 }
9199 }
9200
9201 /* Emit code to save registers using MOV insns.
9202 First register is stored at CFA - CFA_OFFSET. */
9203 static void
9204 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9205 {
9206 unsigned int regno;
9207
9208 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9209 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9210 {
9211 ix86_emit_save_reg_using_mov (Pmode, regno, cfa_offset);
9212 cfa_offset -= UNITS_PER_WORD;
9213 }
9214 }
9215
9216 /* Emit code to save SSE registers using MOV insns.
9217 First register is stored at CFA - CFA_OFFSET. */
9218 static void
9219 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9220 {
9221 unsigned int regno;
9222
9223 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9224 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9225 {
9226 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9227 cfa_offset -= 16;
9228 }
9229 }
9230
9231 static GTY(()) rtx queued_cfa_restores;
9232
9233 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9234 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9235 Don't add the note if the previously saved value will be left untouched
9236 within the stack red zone until return, as unwinders can find the same value
9237 in the register and on the stack. */
9238
9239 static void
9240 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9241 {
9242 if (!crtl->shrink_wrapped
9243 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9244 return;
9245
9246 if (insn)
9247 {
9248 add_reg_note (insn, REG_CFA_RESTORE, reg);
9249 RTX_FRAME_RELATED_P (insn) = 1;
9250 }
9251 else
9252 queued_cfa_restores
9253 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9254 }
9255
9256 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9257
9258 static void
9259 ix86_add_queued_cfa_restore_notes (rtx insn)
9260 {
9261 rtx last;
9262 if (!queued_cfa_restores)
9263 return;
9264 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9265 ;
9266 XEXP (last, 1) = REG_NOTES (insn);
9267 REG_NOTES (insn) = queued_cfa_restores;
9268 queued_cfa_restores = NULL_RTX;
9269 RTX_FRAME_RELATED_P (insn) = 1;
9270 }
9271
9272 /* Expand prologue or epilogue stack adjustment.
9273 The pattern exists to put a dependency on all ebp-based memory accesses.
9274 STYLE should be negative if instructions should be marked as frame-related,
9275 zero if the %r11 register is live and cannot be freely used, and positive
9276 otherwise. */
9277
9278 static void
9279 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9280 int style, bool set_cfa)
9281 {
9282 struct machine_function *m = cfun->machine;
9283 rtx insn;
9284 bool add_frame_related_expr = false;
9285
9286 if (! TARGET_64BIT)
9287 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9288 else if (x86_64_immediate_operand (offset, DImode))
9289 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9290 else
9291 {
9292 rtx tmp;
9293 /* r11 is used by indirect sibcall return as well, set before the
9294 epilogue and used after the epilogue. */
9295 if (style)
9296 tmp = gen_rtx_REG (DImode, R11_REG);
9297 else
9298 {
9299 gcc_assert (src != hard_frame_pointer_rtx
9300 && dest != hard_frame_pointer_rtx);
9301 tmp = hard_frame_pointer_rtx;
9302 }
9303 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9304 if (style < 0)
9305 add_frame_related_expr = true;
9306
9307 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9308 }
9309
9310 insn = emit_insn (insn);
9311 if (style >= 0)
9312 ix86_add_queued_cfa_restore_notes (insn);
9313
9314 if (set_cfa)
9315 {
9316 rtx r;
9317
9318 gcc_assert (m->fs.cfa_reg == src);
9319 m->fs.cfa_offset += INTVAL (offset);
9320 m->fs.cfa_reg = dest;
9321
9322 r = gen_rtx_PLUS (Pmode, src, offset);
9323 r = gen_rtx_SET (VOIDmode, dest, r);
9324 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9325 RTX_FRAME_RELATED_P (insn) = 1;
9326 }
9327 else if (style < 0)
9328 {
9329 RTX_FRAME_RELATED_P (insn) = 1;
9330 if (add_frame_related_expr)
9331 {
9332 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9333 r = gen_rtx_SET (VOIDmode, dest, r);
9334 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9335 }
9336 }
9337
9338 if (dest == stack_pointer_rtx)
9339 {
9340 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9341 bool valid = m->fs.sp_valid;
9342
9343 if (src == hard_frame_pointer_rtx)
9344 {
9345 valid = m->fs.fp_valid;
9346 ooffset = m->fs.fp_offset;
9347 }
9348 else if (src == crtl->drap_reg)
9349 {
9350 valid = m->fs.drap_valid;
9351 ooffset = 0;
9352 }
9353 else
9354 {
9355 /* Else there are two possibilities: SP itself, which we set
9356 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9357 taken care of by hand along the eh_return path. */
9358 gcc_checking_assert (src == stack_pointer_rtx
9359 || offset == const0_rtx);
9360 }
9361
9362 m->fs.sp_offset = ooffset - INTVAL (offset);
9363 m->fs.sp_valid = valid;
9364 }
9365 }
9366
9367 /* Find an available register to be used as the dynamic realign argument
9368 pointer register. Such a register will be written in the prologue and
9369 used at the beginning of the body, so it must not be
9370 1. parameter passing register.
9371 2. GOT pointer.
9372 We reuse static-chain register if it is available. Otherwise, we
9373 use DI for i386 and R13 for x86-64. We chose R13 since it has
9374 shorter encoding.
9375
9376 Return: the regno of chosen register. */
9377
9378 static unsigned int
9379 find_drap_reg (void)
9380 {
9381 tree decl = cfun->decl;
9382
9383 if (TARGET_64BIT)
9384 {
9385 /* Use R13 for a nested function or a function that needs a static chain.
9386 Since a function with a tail call may use any caller-saved
9387 register in the epilogue, DRAP must not use a caller-saved
9388 register in that case. */
9389 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9390 return R13_REG;
9391
9392 return R10_REG;
9393 }
9394 else
9395 {
9396 /* Use DI for a nested function or a function that needs a static chain.
9397 Since a function with a tail call may use any caller-saved
9398 register in the epilogue, DRAP must not use a caller-saved
9399 register in that case. */
9400 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9401 return DI_REG;
9402
9403 /* Reuse static chain register if it isn't used for parameter
9404 passing. */
9405 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9406 {
9407 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9408 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9409 return CX_REG;
9410 }
9411 return DI_REG;
9412 }
9413 }
9414
9415 /* Return minimum incoming stack alignment. */
9416
9417 static unsigned int
9418 ix86_minimum_incoming_stack_boundary (bool sibcall)
9419 {
9420 unsigned int incoming_stack_boundary;
9421
9422 /* Prefer the one specified at command line. */
9423 if (ix86_user_incoming_stack_boundary)
9424 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9425 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9426 if -mstackrealign is used, this isn't a sibcall check, and the
9427 estimated stack alignment is 128 bits. */
9428 else if (!sibcall
9429 && !TARGET_64BIT
9430 && ix86_force_align_arg_pointer
9431 && crtl->stack_alignment_estimated == 128)
9432 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9433 else
9434 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9435
9436 /* Incoming stack alignment can be changed on individual functions
9437 via force_align_arg_pointer attribute. We use the smallest
9438 incoming stack boundary. */
9439 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9440 && lookup_attribute (ix86_force_align_arg_pointer_string,
9441 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9442 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9443
9444 /* The incoming stack frame has to be aligned at least at
9445 parm_stack_boundary. */
9446 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9447 incoming_stack_boundary = crtl->parm_stack_boundary;
9448
9449 /* The stack at the entrance of main is aligned by the runtime. We use the
9450 smallest incoming stack boundary. */
9451 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9452 && DECL_NAME (current_function_decl)
9453 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9454 && DECL_FILE_SCOPE_P (current_function_decl))
9455 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9456
9457 return incoming_stack_boundary;
9458 }
9459
9460 /* Update incoming stack boundary and estimated stack alignment. */
9461
9462 static void
9463 ix86_update_stack_boundary (void)
9464 {
9465 ix86_incoming_stack_boundary
9466 = ix86_minimum_incoming_stack_boundary (false);
9467
9468 /* x86_64 varargs need a 16-byte stack alignment for the register save
9469 area. */
9470 if (TARGET_64BIT
9471 && cfun->stdarg
9472 && crtl->stack_alignment_estimated < 128)
9473 crtl->stack_alignment_estimated = 128;
9474 }
9475
9476 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9477 needed or an rtx for DRAP otherwise. */
9478
9479 static rtx
9480 ix86_get_drap_rtx (void)
9481 {
9482 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9483 crtl->need_drap = true;
9484
9485 if (stack_realign_drap)
9486 {
9487 /* Assign DRAP to vDRAP and return vDRAP. */
9488 unsigned int regno = find_drap_reg ();
9489 rtx drap_vreg;
9490 rtx arg_ptr;
9491 rtx seq, insn;
9492
9493 arg_ptr = gen_rtx_REG (Pmode, regno);
9494 crtl->drap_reg = arg_ptr;
9495
9496 start_sequence ();
9497 drap_vreg = copy_to_reg (arg_ptr);
9498 seq = get_insns ();
9499 end_sequence ();
9500
9501 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9502 if (!optimize)
9503 {
9504 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9505 RTX_FRAME_RELATED_P (insn) = 1;
9506 }
9507 return drap_vreg;
9508 }
9509 else
9510 return NULL;
9511 }
9512
9513 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9514
9515 static rtx
9516 ix86_internal_arg_pointer (void)
9517 {
9518 return virtual_incoming_args_rtx;
9519 }
9520
9521 struct scratch_reg {
9522 rtx reg;
9523 bool saved;
9524 };
9525
9526 /* Return a short-lived scratch register for use on function entry.
9527 In 32-bit mode, it is valid only after the registers are saved
9528 in the prologue. This register must be released by means of
9529 release_scratch_register_on_entry once it is dead. */
9530
9531 static void
9532 get_scratch_register_on_entry (struct scratch_reg *sr)
9533 {
9534 int regno;
9535
9536 sr->saved = false;
9537
9538 if (TARGET_64BIT)
9539 {
9540 /* We always use R11 in 64-bit mode. */
9541 regno = R11_REG;
9542 }
9543 else
9544 {
9545 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9546 bool fastcall_p
9547 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9548 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9549 int regparm = ix86_function_regparm (fntype, decl);
9550 int drap_regno
9551 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9552
9553 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9554 for the static chain register. */
9555 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9556 && drap_regno != AX_REG)
9557 regno = AX_REG;
9558 else if (regparm < 2 && drap_regno != DX_REG)
9559 regno = DX_REG;
9560 /* ecx is the static chain register. */
9561 else if (regparm < 3 && !fastcall_p && !static_chain_p
9562 && drap_regno != CX_REG)
9563 regno = CX_REG;
9564 else if (ix86_save_reg (BX_REG, true))
9565 regno = BX_REG;
9566 /* esi is the static chain register. */
9567 else if (!(regparm == 3 && static_chain_p)
9568 && ix86_save_reg (SI_REG, true))
9569 regno = SI_REG;
9570 else if (ix86_save_reg (DI_REG, true))
9571 regno = DI_REG;
9572 else
9573 {
9574 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9575 sr->saved = true;
9576 }
9577 }
9578
9579 sr->reg = gen_rtx_REG (Pmode, regno);
9580 if (sr->saved)
9581 {
9582 rtx insn = emit_insn (gen_push (sr->reg));
9583 RTX_FRAME_RELATED_P (insn) = 1;
9584 }
9585 }
9586
9587 /* Release a scratch register obtained from the preceding function. */
9588
9589 static void
9590 release_scratch_register_on_entry (struct scratch_reg *sr)
9591 {
9592 if (sr->saved)
9593 {
9594 rtx x, insn = emit_insn (gen_pop (sr->reg));
9595
9596 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9597 RTX_FRAME_RELATED_P (insn) = 1;
9598 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9599 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9600 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9601 }
9602 }
9603
9604 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
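/* With the usual default of STACK_CHECK_PROBE_INTERVAL_EXP == 12 this is
4096 bytes, i.e. one probe per page; targets may override the exponent. */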
9605
9606 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9607
9608 static void
9609 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9610 {
9611 /* We skip the probe for the first interval + a small dope of 4 words and
9612 probe that many bytes past the specified size to maintain a protection
9613 area at the bottom of the stack. */
9614 const int dope = 4 * UNITS_PER_WORD;
9615 rtx size_rtx = GEN_INT (size), last;
9616
9617 /* See if we have a constant small number of probes to generate. If so,
9618 that's the easy case. The run-time loop is made up of 11 insns in the
9619 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9620 for n # of intervals. */
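/* E.g. assuming a 4096-byte PROBE_INTERVAL, a 12 KB allocation takes the
constant path below (three probes), while a 64 KB allocation falls through
to the run-time loop. */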
9621 if (size <= 5 * PROBE_INTERVAL)
9622 {
9623 HOST_WIDE_INT i, adjust;
9624 bool first_probe = true;
9625
9626 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9627 values of N from 1 until it exceeds SIZE. If only one probe is
9628 needed, this will not generate any code. Then adjust and probe
9629 to PROBE_INTERVAL + SIZE. */
9630 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9631 {
9632 if (first_probe)
9633 {
9634 adjust = 2 * PROBE_INTERVAL + dope;
9635 first_probe = false;
9636 }
9637 else
9638 adjust = PROBE_INTERVAL;
9639
9640 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9641 plus_constant (stack_pointer_rtx, -adjust)));
9642 emit_stack_probe (stack_pointer_rtx);
9643 }
9644
9645 if (first_probe)
9646 adjust = size + PROBE_INTERVAL + dope;
9647 else
9648 adjust = size + PROBE_INTERVAL - i;
9649
9650 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9651 plus_constant (stack_pointer_rtx, -adjust)));
9652 emit_stack_probe (stack_pointer_rtx);
9653
9654 /* Adjust back to account for the additional first interval. */
9655 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9656 plus_constant (stack_pointer_rtx,
9657 PROBE_INTERVAL + dope)));
9658 }
9659
9660 /* Otherwise, do the same as above, but in a loop. Note that we must be
9661 extra careful with variables wrapping around because we might be at
9662 the very top (or the very bottom) of the address space and we have
9663 to be able to handle this case properly; in particular, we use an
9664 equality test for the loop condition. */
9665 else
9666 {
9667 HOST_WIDE_INT rounded_size;
9668 struct scratch_reg sr;
9669
9670 get_scratch_register_on_entry (&sr);
9671
9672
9673 /* Step 1: round SIZE to the previous multiple of the interval. */
9674
9675 rounded_size = size & -PROBE_INTERVAL;
9676
9677
9678 /* Step 2: compute initial and final value of the loop counter. */
9679
9680 /* SP = SP_0 + PROBE_INTERVAL. */
9681 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9682 plus_constant (stack_pointer_rtx,
9683 - (PROBE_INTERVAL + dope))));
9684
9685 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9686 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9687 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9688 gen_rtx_PLUS (Pmode, sr.reg,
9689 stack_pointer_rtx)));
9690
9691
9692 /* Step 3: the loop
9693
9694 while (SP != LAST_ADDR)
9695 {
9696 SP = SP + PROBE_INTERVAL
9697 probe at SP
9698 }
9699
9700 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9701 values of N from 1 until it is equal to ROUNDED_SIZE. */
9702
9703 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9704
9705
9706 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9707 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9708
9709 if (size != rounded_size)
9710 {
9711 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9712 plus_constant (stack_pointer_rtx,
9713 rounded_size - size)));
9714 emit_stack_probe (stack_pointer_rtx);
9715 }
9716
9717 /* Adjust back to account for the additional first interval. */
9718 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9719 plus_constant (stack_pointer_rtx,
9720 PROBE_INTERVAL + dope)));
9721
9722 release_scratch_register_on_entry (&sr);
9723 }
9724
9725 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9726
9727 /* Even if the stack pointer isn't the CFA register, we need to correctly
9728 describe the adjustments made to it, in particular differentiate the
9729 frame-related ones from the frame-unrelated ones. */
9730 if (size > 0)
9731 {
9732 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9733 XVECEXP (expr, 0, 0)
9734 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9735 plus_constant (stack_pointer_rtx, -size));
9736 XVECEXP (expr, 0, 1)
9737 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9738 plus_constant (stack_pointer_rtx,
9739 PROBE_INTERVAL + dope + size));
9740 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9741 RTX_FRAME_RELATED_P (last) = 1;
9742
9743 cfun->machine->fs.sp_offset += size;
9744 }
9745
9746 /* Make sure nothing is scheduled before we are done. */
9747 emit_insn (gen_blockage ());
9748 }
9749
9750 /* Adjust the stack pointer up to REG while probing it. */
9751
9752 const char *
9753 output_adjust_stack_and_probe (rtx reg)
9754 {
9755 static int labelno = 0;
9756 char loop_lab[32], end_lab[32];
9757 rtx xops[2];
9758
9759 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9760 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9761
9762 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9763
9764 /* Jump to END_LAB if SP == LAST_ADDR. */
9765 xops[0] = stack_pointer_rtx;
9766 xops[1] = reg;
9767 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9768 fputs ("\tje\t", asm_out_file);
9769 assemble_name_raw (asm_out_file, end_lab);
9770 fputc ('\n', asm_out_file);
9771
9772 /* SP = SP + PROBE_INTERVAL. */
9773 xops[1] = GEN_INT (PROBE_INTERVAL);
9774 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9775
9776 /* Probe at SP. */
9777 xops[1] = const0_rtx;
9778 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9779
9780 fprintf (asm_out_file, "\tjmp\t");
9781 assemble_name_raw (asm_out_file, loop_lab);
9782 fputc ('\n', asm_out_file);
9783
9784 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9785
9786 return "";
9787 }
9788
9789 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9790 inclusive. These are offsets from the current stack pointer. */
9791
9792 static void
9793 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9794 {
9795 /* See if we have a constant small number of probes to generate. If so,
9796 that's the easy case. The run-time loop is made up of 7 insns in the
9797 generic case while the compile-time loop is made up of n insns for n #
9798 of intervals. */
9799 if (size <= 7 * PROBE_INTERVAL)
9800 {
9801 HOST_WIDE_INT i;
9802
9803 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9804 it exceeds SIZE. If only one probe is needed, this will not
9805 generate any code. Then probe at FIRST + SIZE. */
9806 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9807 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9808
9809 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9810 }
9811
9812 /* Otherwise, do the same as above, but in a loop. Note that we must be
9813 extra careful with variables wrapping around because we might be at
9814 the very top (or the very bottom) of the address space and we have
9815 to be able to handle this case properly; in particular, we use an
9816 equality test for the loop condition. */
9817 else
9818 {
9819 HOST_WIDE_INT rounded_size, last;
9820 struct scratch_reg sr;
9821
9822 get_scratch_register_on_entry (&sr);
9823
9824
9825 /* Step 1: round SIZE to the previous multiple of the interval. */
9826
9827 rounded_size = size & -PROBE_INTERVAL;
9828
9829
9830 /* Step 2: compute initial and final value of the loop counter. */
9831
9832 /* TEST_OFFSET = FIRST. */
9833 emit_move_insn (sr.reg, GEN_INT (-first));
9834
9835 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9836 last = first + rounded_size;
9837
9838
9839 /* Step 3: the loop
9840
9841 while (TEST_ADDR != LAST_ADDR)
9842 {
9843 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9844 probe at TEST_ADDR
9845 }
9846
9847 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9848 until it is equal to ROUNDED_SIZE. */
9849
9850 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9851
9852
9853 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9854 that SIZE is equal to ROUNDED_SIZE. */
9855
9856 if (size != rounded_size)
9857 emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
9858 stack_pointer_rtx,
9859 sr.reg),
9860 rounded_size - size));
9861
9862 release_scratch_register_on_entry (&sr);
9863 }
9864
9865 /* Make sure nothing is scheduled before we are done. */
9866 emit_insn (gen_blockage ());
9867 }
9868
9869 /* Probe a range of stack addresses from REG to END, inclusive. These are
9870 offsets from the current stack pointer. */
9871
9872 const char *
9873 output_probe_stack_range (rtx reg, rtx end)
9874 {
9875 static int labelno = 0;
9876 char loop_lab[32], end_lab[32];
9877 rtx xops[3];
9878
9879 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9880 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9881
9882 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9883
9884 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
9885 xops[0] = reg;
9886 xops[1] = end;
9887 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9888 fputs ("\tje\t", asm_out_file);
9889 assemble_name_raw (asm_out_file, end_lab);
9890 fputc ('\n', asm_out_file);
9891
9892 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
9893 xops[1] = GEN_INT (PROBE_INTERVAL);
9894 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9895
9896 /* Probe at TEST_ADDR. */
9897 xops[0] = stack_pointer_rtx;
9898 xops[1] = reg;
9899 xops[2] = const0_rtx;
9900 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9901
9902 fprintf (asm_out_file, "\tjmp\t");
9903 assemble_name_raw (asm_out_file, loop_lab);
9904 fputc ('\n', asm_out_file);
9905
9906 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9907
9908 return "";
9909 }
9910
9911 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
9912 to be generated in correct form. */
9913 static void
9914 ix86_finalize_stack_realign_flags (void)
9915 {
9916 /* Check if stack realign is really needed after reload, and
9917 store the result in cfun. */
9918 unsigned int incoming_stack_boundary
9919 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9920 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9921 unsigned int stack_realign = (incoming_stack_boundary
9922 < (current_function_is_leaf
9923 ? crtl->max_used_stack_slot_alignment
9924 : crtl->stack_alignment_needed));
9925
9926 if (crtl->stack_realign_finalized)
9927 {
9928 /* After stack_realign_needed is finalized, we can no longer
9929 change it. */
9930 gcc_assert (crtl->stack_realign_needed == stack_realign);
9931 return;
9932 }
9933
9934 /* If the only reason for frame_pointer_needed is that we conservatively
9935 assumed stack realignment might be needed, but in the end nothing that
9936 needed the stack alignment had been spilled, clear frame_pointer_needed
9937 and say we don't need stack realignment. */
9938 if (stack_realign
9939 && !crtl->need_drap
9940 && frame_pointer_needed
9941 && current_function_is_leaf
9942 && flag_omit_frame_pointer
9943 && current_function_sp_is_unchanging
9944 && !ix86_current_function_calls_tls_descriptor
9945 && !crtl->accesses_prior_frames
9946 && !cfun->calls_alloca
9947 && !crtl->calls_eh_return
9948 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
9949 && !ix86_frame_pointer_required ()
9950 && get_frame_size () == 0
9951 && ix86_nsaved_sseregs () == 0
9952 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
9953 {
9954 HARD_REG_SET set_up_by_prologue, prologue_used;
9955 basic_block bb;
9956
9957 CLEAR_HARD_REG_SET (prologue_used);
9958 CLEAR_HARD_REG_SET (set_up_by_prologue);
9959 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
9960 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
9961 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
9962 HARD_FRAME_POINTER_REGNUM);
9963 FOR_EACH_BB (bb)
9964 {
9965 rtx insn;
9966 FOR_BB_INSNS (bb, insn)
9967 if (NONDEBUG_INSN_P (insn)
9968 && requires_stack_frame_p (insn, prologue_used,
9969 set_up_by_prologue))
9970 {
9971 crtl->stack_realign_needed = stack_realign;
9972 crtl->stack_realign_finalized = true;
9973 return;
9974 }
9975 }
9976
9977 frame_pointer_needed = false;
9978 stack_realign = false;
9979 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
9980 crtl->stack_alignment_needed = incoming_stack_boundary;
9981 crtl->stack_alignment_estimated = incoming_stack_boundary;
9982 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
9983 crtl->preferred_stack_boundary = incoming_stack_boundary;
9984 df_finish_pass (true);
9985 df_scan_alloc (NULL);
9986 df_scan_blocks ();
9987 df_compute_regs_ever_live (true);
9988 df_analyze ();
9989 }
9990
9991 crtl->stack_realign_needed = stack_realign;
9992 crtl->stack_realign_finalized = true;
9993 }
9994
9995 /* Expand the prologue into a bunch of separate insns. */
9996
9997 void
9998 ix86_expand_prologue (void)
9999 {
10000 struct machine_function *m = cfun->machine;
10001 rtx insn, t;
10002 bool pic_reg_used;
10003 struct ix86_frame frame;
10004 HOST_WIDE_INT allocate;
10005 bool int_registers_saved;
10006
10007 ix86_finalize_stack_realign_flags ();
10008
10009 /* DRAP should not coexist with stack_realign_fp */
10010 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10011
10012 memset (&m->fs, 0, sizeof (m->fs));
10013
10014 /* Initialize CFA state for before the prologue. */
10015 m->fs.cfa_reg = stack_pointer_rtx;
10016 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10017
10018 /* Track SP offset to the CFA. We continue tracking this after we've
10019 swapped the CFA register away from SP. In the case of re-alignment
10020 this is fudged; we're interested in offsets within the local frame. */
10021 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10022 m->fs.sp_valid = true;
10023
10024 ix86_compute_frame_layout (&frame);
10025
10026 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10027 {
10028 /* We should have already generated an error for any use of
10029 ms_hook on a nested function. */
10030 gcc_checking_assert (!ix86_static_chain_on_stack);
10031
10032 /* Check if profiling is active and we shall use the profiling-before-prologue
10033 variant. If so, sorry. */
10034 if (crtl->profile && flag_fentry != 0)
10035 sorry ("ms_hook_prologue attribute isn%'t compatible "
10036 "with -mfentry for 32-bit");
10037
10038 /* In ix86_asm_output_function_label we emitted:
10039 8b ff movl.s %edi,%edi
10040 55 push %ebp
10041 8b ec movl.s %esp,%ebp
10042
10043 This matches the hookable function prologue in Win32 API
10044 functions in Microsoft Windows XP Service Pack 2 and newer.
10045 Wine uses this to enable Windows apps to hook the Win32 API
10046 functions provided by Wine.
10047
10048 What that means is that we've already set up the frame pointer. */
10049
10050 if (frame_pointer_needed
10051 && !(crtl->drap_reg && crtl->stack_realign_needed))
10052 {
10053 rtx push, mov;
10054
10055 /* We've decided to use the frame pointer already set up.
10056 Describe this to the unwinder by pretending that both
10057 push and mov insns happen right here.
10058
10059 Putting the unwind info here at the end of the ms_hook
10060 is done so that we can make absolutely certain we get
10061 the required byte sequence at the start of the function,
10062 rather than relying on an assembler that can produce
10063 the exact encoding required.
10064
10065 However it does mean (in the unpatched case) that we have
10066 a 1 insn window where the asynchronous unwind info is
10067 incorrect. However, if we placed the unwind info at
10068 its correct location we would have incorrect unwind info
10069 in the patched case. Which is probably all moot since
10070 I don't expect Wine generates dwarf2 unwind info for the
10071 system libraries that use this feature. */
10072
10073 insn = emit_insn (gen_blockage ());
10074
10075 push = gen_push (hard_frame_pointer_rtx);
10076 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10077 stack_pointer_rtx);
10078 RTX_FRAME_RELATED_P (push) = 1;
10079 RTX_FRAME_RELATED_P (mov) = 1;
10080
10081 RTX_FRAME_RELATED_P (insn) = 1;
10082 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10083 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10084
10085 /* Note that gen_push incremented m->fs.cfa_offset, even
10086 though we didn't emit the push insn here. */
10087 m->fs.cfa_reg = hard_frame_pointer_rtx;
10088 m->fs.fp_offset = m->fs.cfa_offset;
10089 m->fs.fp_valid = true;
10090 }
10091 else
10092 {
10093 /* The frame pointer is not needed so pop %ebp again.
10094 This leaves us with a pristine state. */
10095 emit_insn (gen_pop (hard_frame_pointer_rtx));
10096 }
10097 }
10098
10099 /* The first insn of a function that accepts its static chain on the
10100 stack is to push the register that would be filled in by a direct
10101 call. This insn will be skipped by the trampoline. */
10102 else if (ix86_static_chain_on_stack)
10103 {
10104 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10105 emit_insn (gen_blockage ());
10106
10107 /* We don't want to interpret this push insn as a register save,
10108 only as a stack adjustment. The real copy of the register as
10109 a save will be done later, if needed. */
10110 t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
10111 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10112 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10113 RTX_FRAME_RELATED_P (insn) = 1;
10114 }
10115
10116 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10117 DRAP is needed and stack realignment is really needed after reload. */
10118 if (stack_realign_drap)
10119 {
10120 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10121
10122 /* Only need to push parameter pointer reg if it is caller saved. */
10123 if (!call_used_regs[REGNO (crtl->drap_reg)])
10124 {
10125 /* Push arg pointer reg */
10126 insn = emit_insn (gen_push (crtl->drap_reg));
10127 RTX_FRAME_RELATED_P (insn) = 1;
10128 }
10129
10130 /* Grab the argument pointer. */
10131 t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
10132 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10133 RTX_FRAME_RELATED_P (insn) = 1;
10134 m->fs.cfa_reg = crtl->drap_reg;
10135 m->fs.cfa_offset = 0;
10136
10137 /* Align the stack. */
10138 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10139 stack_pointer_rtx,
10140 GEN_INT (-align_bytes)));
10141 RTX_FRAME_RELATED_P (insn) = 1;
10142
10143 /* Replicate the return address on the stack so that return
10144 address can be reached via (argp - 1) slot. This is needed
10145 to implement macro RETURN_ADDR_RTX and intrinsic function
10146 expand_builtin_return_addr etc. */
10147 t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
10148 t = gen_frame_mem (Pmode, t);
10149 insn = emit_insn (gen_push (t));
10150 RTX_FRAME_RELATED_P (insn) = 1;
10151
10152 /* For the purposes of frame and register save area addressing,
10153 we've started over with a new frame. */
10154 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10155 m->fs.realigned = true;
10156 }
10157
10158 if (frame_pointer_needed && !m->fs.fp_valid)
10159 {
10160 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10161 slower on all targets. Also sdb doesn't like it. */
10162 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10163 RTX_FRAME_RELATED_P (insn) = 1;
10164
10165 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10166 {
10167 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10168 RTX_FRAME_RELATED_P (insn) = 1;
10169
10170 if (m->fs.cfa_reg == stack_pointer_rtx)
10171 m->fs.cfa_reg = hard_frame_pointer_rtx;
10172 m->fs.fp_offset = m->fs.sp_offset;
10173 m->fs.fp_valid = true;
10174 }
10175 }
10176
10177 int_registers_saved = (frame.nregs == 0);
10178
10179 if (!int_registers_saved)
10180 {
10181 /* If saving registers via PUSH, do so now. */
10182 if (!frame.save_regs_using_mov)
10183 {
10184 ix86_emit_save_regs ();
10185 int_registers_saved = true;
10186 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10187 }
10188
10189 /* When using the red zone we may start register saving before allocating
10190 the stack frame, saving one cycle of the prologue. However, avoid
10191 doing this if we have to probe the stack; at least on x86_64 the
10192 stack probe can turn into a call that clobbers a red zone location. */
10193 else if (ix86_using_red_zone ()
10194 && (! TARGET_STACK_PROBE
10195 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10196 {
10197 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10198 int_registers_saved = true;
10199 }
10200 }
10201
10202 if (stack_realign_fp)
10203 {
10204 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10205 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10206
10207 /* The computation of the size of the re-aligned stack frame means
10208 that we must allocate the size of the register save area before
10209 performing the actual alignment. Otherwise we cannot guarantee
10210 that there's enough storage above the realignment point. */
10211 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10212 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10213 GEN_INT (m->fs.sp_offset
10214 - frame.sse_reg_save_offset),
10215 -1, false);
10216
10217 /* Align the stack. */
10218 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10219 stack_pointer_rtx,
10220 GEN_INT (-align_bytes)));
10221
10222 /* For the purposes of register save area addressing, the stack
10223 pointer is no longer valid. As for the value of sp_offset,
10224 see ix86_compute_frame_layout, which we need to match in order
10225 to pass verification of stack_pointer_offset at the end. */
10226 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10227 m->fs.sp_valid = false;
10228 }
10229
10230 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10231
10232 if (flag_stack_usage_info)
10233 {
10234 /* We start to count from ARG_POINTER. */
10235 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10236
10237 /* If it was realigned, take into account the fake frame. */
10238 if (stack_realign_drap)
10239 {
10240 if (ix86_static_chain_on_stack)
10241 stack_size += UNITS_PER_WORD;
10242
10243 if (!call_used_regs[REGNO (crtl->drap_reg)])
10244 stack_size += UNITS_PER_WORD;
10245
10246 /* This over-estimates by 1 minimal-stack-alignment-unit but
10247 mitigates that by counting in the new return address slot. */
10248 current_function_dynamic_stack_size
10249 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10250 }
10251
10252 current_function_static_stack_size = stack_size;
10253 }
10254
10255 /* The stack has already been decremented by the instruction calling us
10256 so probe if the size is non-negative to preserve the protection area. */
10257 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10258 {
10259 /* We expect the registers to be saved when probes are used. */
10260 gcc_assert (int_registers_saved);
10261
10262 if (STACK_CHECK_MOVING_SP)
10263 {
10264 ix86_adjust_stack_and_probe (allocate);
10265 allocate = 0;
10266 }
10267 else
10268 {
10269 HOST_WIDE_INT size = allocate;
10270
10271 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10272 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10273
10274 if (TARGET_STACK_PROBE)
10275 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10276 else
10277 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10278 }
10279 }
10280
10281 if (allocate == 0)
10282 ;
10283 else if (!ix86_target_stack_probe ()
10284 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10285 {
10286 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10287 GEN_INT (-allocate), -1,
10288 m->fs.cfa_reg == stack_pointer_rtx);
10289 }
10290 else
10291 {
10292 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10293 rtx r10 = NULL;
10294 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10295
10296 bool eax_live = false;
10297 bool r10_live = false;
10298
10299 if (TARGET_64BIT)
10300 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10301 if (!TARGET_64BIT_MS_ABI)
10302 eax_live = ix86_eax_live_at_start_p ();
10303
10304 if (eax_live)
10305 {
10306 emit_insn (gen_push (eax));
10307 allocate -= UNITS_PER_WORD;
10308 }
10309 if (r10_live)
10310 {
10311 r10 = gen_rtx_REG (Pmode, R10_REG);
10312 emit_insn (gen_push (r10));
10313 allocate -= UNITS_PER_WORD;
10314 }
10315
10316 emit_move_insn (eax, GEN_INT (allocate));
10317 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10318
10319 /* Use the fact that AX still contains ALLOCATE. */
10320 adjust_stack_insn = (TARGET_64BIT
10321 ? gen_pro_epilogue_adjust_stack_di_sub
10322 : gen_pro_epilogue_adjust_stack_si_sub);
10323
10324 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10325 stack_pointer_rtx, eax));
10326
10327 /* Note that SEH directives need to continue tracking the stack
10328 pointer even after the frame pointer has been set up. */
10329 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10330 {
10331 if (m->fs.cfa_reg == stack_pointer_rtx)
10332 m->fs.cfa_offset += allocate;
10333
10334 RTX_FRAME_RELATED_P (insn) = 1;
10335 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10336 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10337 plus_constant (stack_pointer_rtx,
10338 -allocate)));
10339 }
10340 m->fs.sp_offset += allocate;
10341
10342 if (r10_live && eax_live)
10343 {
10344 t = choose_baseaddr (m->fs.sp_offset - allocate);
10345 emit_move_insn (r10, gen_frame_mem (Pmode, t));
10346 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10347 emit_move_insn (eax, gen_frame_mem (Pmode, t));
10348 }
10349 else if (eax_live || r10_live)
10350 {
10351 t = choose_baseaddr (m->fs.sp_offset - allocate);
10352 emit_move_insn ((eax_live ? eax : r10), gen_frame_mem (Pmode, t));
10353 }
10354 }
10355 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10356
10357 /* If we haven't already set up the frame pointer, do so now. */
10358 if (frame_pointer_needed && !m->fs.fp_valid)
10359 {
10360 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10361 GEN_INT (frame.stack_pointer_offset
10362 - frame.hard_frame_pointer_offset));
10363 insn = emit_insn (insn);
10364 RTX_FRAME_RELATED_P (insn) = 1;
10365 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10366
10367 if (m->fs.cfa_reg == stack_pointer_rtx)
10368 m->fs.cfa_reg = hard_frame_pointer_rtx;
10369 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10370 m->fs.fp_valid = true;
10371 }
10372
10373 if (!int_registers_saved)
10374 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10375 if (frame.nsseregs)
10376 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10377
10378 pic_reg_used = false;
10379 if (pic_offset_table_rtx
10380 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10381 || crtl->profile))
10382 {
10383 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10384
10385 if (alt_pic_reg_used != INVALID_REGNUM)
10386 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10387
10388 pic_reg_used = true;
10389 }
10390
10391 if (pic_reg_used)
10392 {
10393 if (TARGET_64BIT)
10394 {
10395 if (ix86_cmodel == CM_LARGE_PIC)
10396 {
10397 rtx tmp_reg = gen_rtx_REG (DImode, R11_REG);
10398 rtx label = gen_label_rtx ();
10399 emit_label (label);
10400 LABEL_PRESERVE_P (label) = 1;
10401 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10402 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label));
10403 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10404 insn = emit_insn (gen_adddi3 (pic_offset_table_rtx,
10405 pic_offset_table_rtx, tmp_reg));
10406 }
10407 else
10408 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10409 }
10410 else
10411 {
10412 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10413 RTX_FRAME_RELATED_P (insn) = 1;
10414 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10415 }
10416 }
10417
10418 /* In the pic_reg_used case, make sure that the GOT load isn't deleted
10419 when mcount needs it. A blockage to avoid call movement across the
10420 mcount call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10421 note. */
10422 if (crtl->profile && !flag_fentry && pic_reg_used)
10423 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10424
10425 if (crtl->drap_reg && !crtl->stack_realign_needed)
10426 {
10427 /* The vDRAP register was set up, but after reload it turns out that
10428 stack realignment isn't necessary; emit the prologue to set up the
10429 DRAP without the stack realignment adjustment. */
10430 t = choose_baseaddr (0);
10431 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10432 }
10433
10434 /* Prevent instructions from being scheduled into the register save push
10435 sequence when access to the redzone area is done through the frame pointer.
10436 The offset between the frame pointer and the stack pointer is calculated
10437 relative to the value of the stack pointer at the end of the function
10438 prologue, and moving instructions that access the redzone area via the
10439 frame pointer into the push sequence violates this assumption. */
10440 if (frame_pointer_needed && frame.red_zone_size)
10441 emit_insn (gen_memory_blockage ());
10442
10443 /* Emit cld instruction if stringops are used in the function. */
10444 if (TARGET_CLD && ix86_current_function_needs_cld)
10445 emit_insn (gen_cld ());
10446
10447 /* SEH requires that the prologue end within 256 bytes of the start of
10448 the function. Prevent instruction schedules that would extend that.
10449 Further, prevent alloca modifications to the stack pointer from being
10450 combined with prologue modifications. */
10451 if (TARGET_SEH)
10452 emit_insn (gen_prologue_use (stack_pointer_rtx));
10453 }
10454
10455 /* Emit code to restore REG using a POP insn. */
10456
10457 static void
10458 ix86_emit_restore_reg_using_pop (rtx reg)
10459 {
10460 struct machine_function *m = cfun->machine;
10461 rtx insn = emit_insn (gen_pop (reg));
10462
10463 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10464 m->fs.sp_offset -= UNITS_PER_WORD;
10465
10466 if (m->fs.cfa_reg == crtl->drap_reg
10467 && REGNO (reg) == REGNO (crtl->drap_reg))
10468 {
10469 /* Previously we'd represented the CFA as an expression
10470 like *(%ebp - 8). We've just popped that value from
10471 the stack, which means we need to reset the CFA to
10472 the drap register. This will remain until we restore
10473 the stack pointer. */
10474 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10475 RTX_FRAME_RELATED_P (insn) = 1;
10476
10477 /* This means that the DRAP register is valid for addressing too. */
10478 m->fs.drap_valid = true;
10479 return;
10480 }
10481
10482 if (m->fs.cfa_reg == stack_pointer_rtx)
10483 {
10484 rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10485 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10486 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10487 RTX_FRAME_RELATED_P (insn) = 1;
10488
10489 m->fs.cfa_offset -= UNITS_PER_WORD;
10490 }
10491
10492 /* When the frame pointer is the CFA, and we pop it, we are
10493 swapping back to the stack pointer as the CFA. This happens
10494 for stack frames that don't allocate other data, so we assume
10495 the stack pointer is now pointing at the return address, i.e.
10496 the function entry state, which makes the offset be 1 word. */
10497 if (reg == hard_frame_pointer_rtx)
10498 {
10499 m->fs.fp_valid = false;
10500 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10501 {
10502 m->fs.cfa_reg = stack_pointer_rtx;
10503 m->fs.cfa_offset -= UNITS_PER_WORD;
10504
10505 add_reg_note (insn, REG_CFA_DEF_CFA,
10506 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10507 GEN_INT (m->fs.cfa_offset)));
10508 RTX_FRAME_RELATED_P (insn) = 1;
10509 }
10510 }
10511 }
10512
10513 /* Emit code to restore saved registers using POP insns. */
10514
10515 static void
10516 ix86_emit_restore_regs_using_pop (void)
10517 {
10518 unsigned int regno;
10519
10520 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10521 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10522 ix86_emit_restore_reg_using_pop (gen_rtx_REG (Pmode, regno));
10523 }
10524
10525 /* Emit code and notes for the LEAVE instruction. */
10526
10527 static void
10528 ix86_emit_leave (void)
10529 {
10530 struct machine_function *m = cfun->machine;
10531 rtx insn = emit_insn (ix86_gen_leave ());
10532
10533 ix86_add_queued_cfa_restore_notes (insn);
10534
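/* The leave insn copies the frame pointer into the stack pointer and then
   pops the saved frame pointer, so afterwards only the stack pointer is
   valid, one word closer to the CFA than the frame pointer was.  */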
10535 gcc_assert (m->fs.fp_valid);
10536 m->fs.sp_valid = true;
10537 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10538 m->fs.fp_valid = false;
10539
10540 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10541 {
10542 m->fs.cfa_reg = stack_pointer_rtx;
10543 m->fs.cfa_offset = m->fs.sp_offset;
10544
10545 add_reg_note (insn, REG_CFA_DEF_CFA,
10546 plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10547 RTX_FRAME_RELATED_P (insn) = 1;
10548 }
10549 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10550 m->fs.fp_offset);
10551 }
10552
10553 /* Emit code to restore saved registers using MOV insns.
10554 First register is restored from CFA - CFA_OFFSET. */
10555 static void
10556 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10557 bool maybe_eh_return)
10558 {
10559 struct machine_function *m = cfun->machine;
10560 unsigned int regno;
10561
10562 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10563 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10564 {
10565 rtx reg = gen_rtx_REG (Pmode, regno);
10566 rtx insn, mem;
10567
10568 mem = choose_baseaddr (cfa_offset);
10569 mem = gen_frame_mem (Pmode, mem);
10570 insn = emit_move_insn (reg, mem);
10571
10572 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10573 {
10574 /* Previously we'd represented the CFA as an expression
10575 like *(%ebp - 8). We've just popped that value from
10576 the stack, which means we need to reset the CFA to
10577 the drap register. This will remain until we restore
10578 the stack pointer. */
10579 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10580 RTX_FRAME_RELATED_P (insn) = 1;
10581
10582 /* This means that the DRAP register is valid for addressing. */
10583 m->fs.drap_valid = true;
10584 }
10585 else
10586 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10587
10588 cfa_offset -= UNITS_PER_WORD;
10589 }
10590 }
10591
10592 /* Emit code to restore saved SSE registers using MOV insns.
10593 First register is restored from CFA - CFA_OFFSET. */
10594 static void
10595 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10596 bool maybe_eh_return)
10597 {
10598 unsigned int regno;
10599
10600 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10601 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10602 {
10603 rtx reg = gen_rtx_REG (V4SFmode, regno);
10604 rtx mem;
10605
10606 mem = choose_baseaddr (cfa_offset);
10607 mem = gen_rtx_MEM (V4SFmode, mem);
10608 set_mem_align (mem, 128);
10609 emit_move_insn (reg, mem);
10610
10611 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10612
10613 cfa_offset -= 16;
10614 }
10615 }
10616
10617 /* Emit vzeroupper if needed. */
10618
10619 void
10620 ix86_maybe_emit_epilogue_vzeroupper (void)
10621 {
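/* Skip noreturn functions (TREE_THIS_VOLATILE on the decl) and functions
   where a 256bit AVX value is live across the return.  */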
10622 if (TARGET_VZEROUPPER
10623 && !TREE_THIS_VOLATILE (cfun->decl)
10624 && !cfun->machine->caller_return_avx256_p)
10625 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10626 }
10627
10628 /* Restore function stack, frame, and registers. */
10629
10630 void
10631 ix86_expand_epilogue (int style)
10632 {
10633 struct machine_function *m = cfun->machine;
10634 struct machine_frame_state frame_state_save = m->fs;
10635 struct ix86_frame frame;
10636 bool restore_regs_via_mov;
10637 bool using_drap;
10638
10639 ix86_finalize_stack_realign_flags ();
10640 ix86_compute_frame_layout (&frame);
10641
10642 m->fs.sp_valid = (!frame_pointer_needed
10643 || (current_function_sp_is_unchanging
10644 && !stack_realign_fp));
10645 gcc_assert (!m->fs.sp_valid
10646 || m->fs.sp_offset == frame.stack_pointer_offset);
10647
10648 /* The FP must be valid if the frame pointer is present. */
10649 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10650 gcc_assert (!m->fs.fp_valid
10651 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10652
10653 /* We must have *some* valid pointer to the stack frame. */
10654 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10655
10656 /* The DRAP is never valid at this point. */
10657 gcc_assert (!m->fs.drap_valid);
10658
10659 /* See the comment about red zone and frame
10660 pointer usage in ix86_expand_prologue. */
10661 if (frame_pointer_needed && frame.red_zone_size)
10662 emit_insn (gen_memory_blockage ());
10663
10664 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10665 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10666
10667 /* Determine the CFA offset of the end of the red-zone. */
10668 m->fs.red_zone_offset = 0;
10669 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10670 {
10671 /* The red-zone begins below the return address. */
10672 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10673
10674 /* When the register save area is in the aligned portion of
10675 the stack, determine the maximum runtime displacement that
10676 matches up with the aligned frame. */
10677 if (stack_realign_drap)
10678 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10679 + UNITS_PER_WORD);
10680 }
10681
10682 /* Special care must be taken for the normal return case of a function
10683 using eh_return: the eax and edx registers are marked as saved, but
10684 not restored along this path. Adjust the save location to match. */
10685 if (crtl->calls_eh_return && style != 2)
10686 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10687
10688 /* EH_RETURN requires the use of moves to function properly. */
10689 if (crtl->calls_eh_return)
10690 restore_regs_via_mov = true;
10691 /* SEH requires the use of pops to identify the epilogue. */
10692 else if (TARGET_SEH)
10693 restore_regs_via_mov = false;
10694 /* If we're only restoring one register and sp is not valid, then
10695 use a move instruction to restore the register, since it's
10696 less work than reloading sp and popping the register. */
10697 else if (!m->fs.sp_valid && frame.nregs <= 1)
10698 restore_regs_via_mov = true;
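/* Honor the target preference for epilogue moves when using the fast
   prologue/epilogue and either more than one register must be restored
   or the stack pointer is not already at the register save area.  */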
10699 else if (TARGET_EPILOGUE_USING_MOVE
10700 && cfun->machine->use_fast_prologue_epilogue
10701 && (frame.nregs > 1
10702 || m->fs.sp_offset != frame.reg_save_offset))
10703 restore_regs_via_mov = true;
10704 else if (frame_pointer_needed
10705 && !frame.nregs
10706 && m->fs.sp_offset != frame.reg_save_offset)
10707 restore_regs_via_mov = true;
10708 else if (frame_pointer_needed
10709 && TARGET_USE_LEAVE
10710 && cfun->machine->use_fast_prologue_epilogue
10711 && frame.nregs == 1)
10712 restore_regs_via_mov = true;
10713 else
10714 restore_regs_via_mov = false;
10715
10716 if (restore_regs_via_mov || frame.nsseregs)
10717 {
10718 /* Ensure that the entire register save area is addressable via
10719 the stack pointer, if we will restore via sp. */
10720 if (TARGET_64BIT
10721 && m->fs.sp_offset > 0x7fffffff
10722 && !(m->fs.fp_valid || m->fs.drap_valid)
10723 && (frame.nsseregs + frame.nregs) != 0)
10724 {
10725 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10726 GEN_INT (m->fs.sp_offset
10727 - frame.sse_reg_save_offset),
10728 style,
10729 m->fs.cfa_reg == stack_pointer_rtx);
10730 }
10731 }
10732
10733 /* If there are any SSE registers to restore, then we have to do it
10734 via moves, since there's obviously no pop for SSE regs. */
10735 if (frame.nsseregs)
10736 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10737 style == 2);
10738
10739 if (restore_regs_via_mov)
10740 {
10741 rtx t;
10742
10743 if (frame.nregs)
10744 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10745
10746 /* eh_return epilogues need %ecx added to the stack pointer. */
10747 if (style == 2)
10748 {
10749 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10750
10751 /* Stack align doesn't work with eh_return. */
10752 gcc_assert (!stack_realign_drap);
10753 /* Neither do regparm nested functions. */
10754 gcc_assert (!ix86_static_chain_on_stack);
10755
10756 if (frame_pointer_needed)
10757 {
10758 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10759 t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10760 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10761
10762 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10763 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10764
10765 /* Note that we use SA as a temporary CFA, as the return
10766 address is at the proper place relative to it. We
10767 pretend this happens at the FP restore insn because
10768 prior to this insn the FP would be stored at the wrong
10769 offset relative to SA, and after this insn we have no
10770 other reasonable register to use for the CFA. We don't
10771 bother resetting the CFA to the SP for the duration of
10772 the return insn. */
10773 add_reg_note (insn, REG_CFA_DEF_CFA,
10774 plus_constant (sa, UNITS_PER_WORD));
10775 ix86_add_queued_cfa_restore_notes (insn);
10776 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10777 RTX_FRAME_RELATED_P (insn) = 1;
10778
10779 m->fs.cfa_reg = sa;
10780 m->fs.cfa_offset = UNITS_PER_WORD;
10781 m->fs.fp_valid = false;
10782
10783 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10784 const0_rtx, style, false);
10785 }
10786 else
10787 {
10788 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10789 t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10790 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10791 ix86_add_queued_cfa_restore_notes (insn);
10792
10793 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10794 if (m->fs.cfa_offset != UNITS_PER_WORD)
10795 {
10796 m->fs.cfa_offset = UNITS_PER_WORD;
10797 add_reg_note (insn, REG_CFA_DEF_CFA,
10798 plus_constant (stack_pointer_rtx,
10799 UNITS_PER_WORD));
10800 RTX_FRAME_RELATED_P (insn) = 1;
10801 }
10802 }
10803 m->fs.sp_offset = UNITS_PER_WORD;
10804 m->fs.sp_valid = true;
10805 }
10806 }
10807 else
10808 {
10809 /* SEH requires that the function end with (1) a stack adjustment
10810 if necessary, (2) a sequence of pops, and (3) a return or
10811 jump instruction. Prevent insns from the function body from
10812 being scheduled into this sequence. */
10813 if (TARGET_SEH)
10814 {
10815 /* Prevent a catch region from being adjacent to the standard
10816 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
10817 several other flags that would be interesting to test are
10818 set up yet. */
10819 if (flag_non_call_exceptions)
10820 emit_insn (gen_nops (const1_rtx));
10821 else
10822 emit_insn (gen_blockage ());
10823 }
10824
10825 /* First step is to deallocate the stack frame so that we can
10826 pop the registers. */
10827 if (!m->fs.sp_valid)
10828 {
10829 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10830 GEN_INT (m->fs.fp_offset
10831 - frame.reg_save_offset),
10832 style, false);
10833 }
10834 else if (m->fs.sp_offset != frame.reg_save_offset)
10835 {
10836 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10837 GEN_INT (m->fs.sp_offset
10838 - frame.reg_save_offset),
10839 style,
10840 m->fs.cfa_reg == stack_pointer_rtx);
10841 }
10842
10843 ix86_emit_restore_regs_using_pop ();
10844 }
10845
10846 /* If we used a frame pointer and haven't already got rid of it,
10847 then do so now. */
10848 if (m->fs.fp_valid)
10849 {
10850 /* If the stack pointer is valid and pointing at the frame
10851 pointer store address, then we only need a pop. */
10852 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10853 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10854 /* Leave results in shorter dependency chains on CPUs that are
10855 able to grok it fast. */
10856 else if (TARGET_USE_LEAVE
10857 || optimize_function_for_size_p (cfun)
10858 || !cfun->machine->use_fast_prologue_epilogue)
10859 ix86_emit_leave ();
10860 else
10861 {
10862 pro_epilogue_adjust_stack (stack_pointer_rtx,
10863 hard_frame_pointer_rtx,
10864 const0_rtx, style, !using_drap);
10865 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10866 }
10867 }
10868
10869 if (using_drap)
10870 {
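/* Recover the stack pointer from the DRAP register, accounting for the
   return address and, when present, the stored static chain and the
   saved DRAP register itself.  */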
10871 int param_ptr_offset = UNITS_PER_WORD;
10872 rtx insn;
10873
10874 gcc_assert (stack_realign_drap);
10875
10876 if (ix86_static_chain_on_stack)
10877 param_ptr_offset += UNITS_PER_WORD;
10878 if (!call_used_regs[REGNO (crtl->drap_reg)])
10879 param_ptr_offset += UNITS_PER_WORD;
10880
10881 insn = emit_insn (gen_rtx_SET
10882 (VOIDmode, stack_pointer_rtx,
10883 gen_rtx_PLUS (Pmode,
10884 crtl->drap_reg,
10885 GEN_INT (-param_ptr_offset))));
10886 m->fs.cfa_reg = stack_pointer_rtx;
10887 m->fs.cfa_offset = param_ptr_offset;
10888 m->fs.sp_offset = param_ptr_offset;
10889 m->fs.realigned = false;
10890
10891 add_reg_note (insn, REG_CFA_DEF_CFA,
10892 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10893 GEN_INT (param_ptr_offset)));
10894 RTX_FRAME_RELATED_P (insn) = 1;
10895
10896 if (!call_used_regs[REGNO (crtl->drap_reg)])
10897 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10898 }
10899
10900 /* At this point the stack pointer must be valid, and we must have
10901 restored all of the registers. We may not have deallocated the
10902 entire stack frame. We've delayed this until now because it may
10903 be possible to merge the local stack deallocation with the
10904 deallocation forced by ix86_static_chain_on_stack. */
10905 gcc_assert (m->fs.sp_valid);
10906 gcc_assert (!m->fs.fp_valid);
10907 gcc_assert (!m->fs.realigned);
10908 if (m->fs.sp_offset != UNITS_PER_WORD)
10909 {
10910 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10911 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10912 style, true);
10913 }
10914 else
10915 ix86_add_queued_cfa_restore_notes (get_last_insn ());
10916
10917 /* Sibcall epilogues don't want a return instruction. */
10918 if (style == 0)
10919 {
10920 m->fs = frame_state_save;
10921 return;
10922 }
10923
10924 /* Emit vzeroupper if needed. */
10925 ix86_maybe_emit_epilogue_vzeroupper ();
10926
10927 if (crtl->args.pops_args && crtl->args.size)
10928 {
10929 rtx popc = GEN_INT (crtl->args.pops_args);
10930
10931 /* i386 can only pop 64K bytes. If asked to pop more, pop return
10932 address, do explicit add, and jump indirectly to the caller. */
10933
10934 if (crtl->args.pops_args >= 65536)
10935 {
10936 rtx ecx = gen_rtx_REG (SImode, CX_REG);
10937 rtx insn;
10938
10939 /* There is no "pascal" calling convention in any 64bit ABI. */
10940 gcc_assert (!TARGET_64BIT);
10941
10942 insn = emit_insn (gen_pop (ecx));
10943 m->fs.cfa_offset -= UNITS_PER_WORD;
10944 m->fs.sp_offset -= UNITS_PER_WORD;
10945
10946 add_reg_note (insn, REG_CFA_ADJUST_CFA,
10947 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
10948 add_reg_note (insn, REG_CFA_REGISTER,
10949 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
10950 RTX_FRAME_RELATED_P (insn) = 1;
10951
10952 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10953 popc, -1, true);
10954 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
10955 }
10956 else
10957 emit_jump_insn (gen_simple_return_pop_internal (popc));
10958 }
10959 else
10960 emit_jump_insn (gen_simple_return_internal ());
10961
10962 /* Restore the state back to the state from the prologue,
10963 so that it's correct for the next epilogue. */
10964 m->fs = frame_state_save;
10965 }
10966
10967 /* Reset from the function's potential modifications. */
10968
10969 static void
10970 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
10971 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
10972 {
10973 if (pic_offset_table_rtx)
10974 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
10975 #if TARGET_MACHO
10976 /* Mach-O doesn't support labels at the end of objects, so if
10977 it looks like we might want one, insert a NOP. */
10978 {
10979 rtx insn = get_last_insn ();
10980 rtx deleted_debug_label = NULL_RTX;
10981 while (insn
10982 && NOTE_P (insn)
10983 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
10984 {
10985 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
10986 notes only, instead set their CODE_LABEL_NUMBER to -1,
10987 otherwise there would be code generation differences
10988 in between -g and -g0. */
10989 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
10990 deleted_debug_label = insn;
10991 insn = PREV_INSN (insn);
10992 }
10993 if (insn
10994 && (LABEL_P (insn)
10995 || (NOTE_P (insn)
10996 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
10997 fputs ("\tnop\n", file);
10998 else if (deleted_debug_label)
10999 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11000 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11001 CODE_LABEL_NUMBER (insn) = -1;
11002 }
11003 #endif
11004
11005 }
11006
11007 /* Return a scratch register to use in the split stack prologue. The
11008 split stack prologue is used for -fsplit-stack. It consists of the
11009 first instructions in the function, even before the regular prologue.
11010 The scratch register can be any caller-saved register which is not
11011 used for parameters or for the static chain. */
11012
11013 static unsigned int
11014 split_stack_prologue_scratch_regno (void)
11015 {
11016 if (TARGET_64BIT)
11017 return R11_REG;
11018 else
11019 {
11020 bool is_fastcall;
11021 int regparm;
11022
11023 is_fastcall = (lookup_attribute ("fastcall",
11024 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11025 != NULL);
11026 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11027
11028 if (is_fastcall)
11029 {
11030 if (DECL_STATIC_CHAIN (cfun->decl))
11031 {
11032 sorry ("-fsplit-stack does not support fastcall with "
11033 "nested function");
11034 return INVALID_REGNUM;
11035 }
11036 return AX_REG;
11037 }
11038 else if (regparm < 3)
11039 {
11040 if (!DECL_STATIC_CHAIN (cfun->decl))
11041 return CX_REG;
11042 else
11043 {
11044 if (regparm >= 2)
11045 {
11046 sorry ("-fsplit-stack does not support 2 register "
11047 " parameters for a nested function");
11048 return INVALID_REGNUM;
11049 }
11050 return DX_REG;
11051 }
11052 }
11053 else
11054 {
11055 /* FIXME: We could make this work by pushing a register
11056 around the addition and comparison. */
11057 sorry ("-fsplit-stack does not support 3 register parameters");
11058 return INVALID_REGNUM;
11059 }
11060 }
11061 }
11062
11063 /* A SYMBOL_REF for the function which allocates new stack space for
11064 -fsplit-stack. */
11065
11066 static GTY(()) rtx split_stack_fn;
11067
11068 /* A SYMBOL_REF for the more stack function when using the large
11069 model. */
11070
11071 static GTY(()) rtx split_stack_fn_large;
11072
11073 /* Handle -fsplit-stack. These are the first instructions in the
11074 function, even before the regular prologue. */
11075
11076 void
11077 ix86_expand_split_stack_prologue (void)
11078 {
11079 struct ix86_frame frame;
11080 HOST_WIDE_INT allocate;
11081 unsigned HOST_WIDE_INT args_size;
11082 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11083 rtx scratch_reg = NULL_RTX;
11084 rtx varargs_label = NULL_RTX;
11085 rtx fn;
11086
11087 gcc_assert (flag_split_stack && reload_completed);
11088
11089 ix86_finalize_stack_realign_flags ();
11090 ix86_compute_frame_layout (&frame);
11091 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11092
11093 /* This is the label we will branch to if we have enough stack
11094 space. We expect the basic block reordering pass to reverse this
11095 branch if optimizing, so that we branch in the unlikely case. */
11096 label = gen_label_rtx ();
11097
11098 /* We need to compare the stack pointer minus the frame size with
11099 the stack boundary in the TCB. The stack boundary always gives
11100 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11101 can compare directly. Otherwise we need to do an addition. */
11102
11103 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11104 UNSPEC_STACK_CHECK);
11105 limit = gen_rtx_CONST (Pmode, limit);
11106 limit = gen_rtx_MEM (Pmode, limit);
11107 if (allocate < SPLIT_STACK_AVAILABLE)
11108 current = stack_pointer_rtx;
11109 else
11110 {
11111 unsigned int scratch_regno;
11112 rtx offset;
11113
11114 /* We need a scratch register to hold the stack pointer minus
11115 the required frame size. Since this is the very start of the
11116 function, the scratch register can be any caller-saved
11117 register which is not used for parameters. */
11118 offset = GEN_INT (- allocate);
11119 scratch_regno = split_stack_prologue_scratch_regno ();
11120 if (scratch_regno == INVALID_REGNUM)
11121 return;
11122 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11123 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11124 {
11125 /* We don't use ix86_gen_add3 in this case because it will
11126 want to split to lea, but when not optimizing the insn
11127 will not be split after this point. */
11128 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11129 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11130 offset)));
11131 }
11132 else
11133 {
11134 emit_move_insn (scratch_reg, offset);
11135 emit_insn (gen_adddi3 (scratch_reg, scratch_reg,
11136 stack_pointer_rtx));
11137 }
11138 current = scratch_reg;
11139 }
11140
11141 ix86_expand_branch (GEU, current, limit, label);
11142 jump_insn = get_last_insn ();
11143 JUMP_LABEL (jump_insn) = label;
11144
11145 /* Mark the jump as very likely to be taken. */
11146 add_reg_note (jump_insn, REG_BR_PROB,
11147 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11148
11149 if (split_stack_fn == NULL_RTX)
11150 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11151 fn = split_stack_fn;
11152
11153 /* Get more stack space. We pass in the desired stack space and the
11154 size of the arguments to copy to the new stack. In 32-bit mode
11155 we push the parameters; __morestack will return on a new stack
11156 anyhow. In 64-bit mode we pass the parameters in r10 and
11157 r11. */
11158 allocate_rtx = GEN_INT (allocate);
11159 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11160 call_fusage = NULL_RTX;
11161 if (TARGET_64BIT)
11162 {
11163 rtx reg10, reg11;
11164
11165 reg10 = gen_rtx_REG (Pmode, R10_REG);
11166 reg11 = gen_rtx_REG (Pmode, R11_REG);
11167
11168 /* If this function uses a static chain, it will be in %r10.
11169 Preserve it across the call to __morestack. */
11170 if (DECL_STATIC_CHAIN (cfun->decl))
11171 {
11172 rtx rax;
11173
11174 rax = gen_rtx_REG (Pmode, AX_REG);
11175 emit_move_insn (rax, reg10);
11176 use_reg (&call_fusage, rax);
11177 }
11178
11179 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11180 {
11181 HOST_WIDE_INT argval;
11182
11183 /* When using the large model we need to load the address
11184 into a register, and we've run out of registers. So we
11185 switch to a different calling convention, and we call a
11186 different function: __morestack_large. We pass the
11187 argument size in the upper 32 bits of r10 and pass the
11188 frame size in the lower 32 bits. */
11189 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11190 gcc_assert ((args_size & 0xffffffff) == args_size);
11191
11192 if (split_stack_fn_large == NULL_RTX)
11193 split_stack_fn_large =
11194 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11195
11196 if (ix86_cmodel == CM_LARGE_PIC)
11197 {
11198 rtx label, x;
11199
11200 label = gen_label_rtx ();
11201 emit_label (label);
11202 LABEL_PRESERVE_P (label) = 1;
11203 emit_insn (gen_set_rip_rex64 (reg10, label));
11204 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11205 emit_insn (gen_adddi3 (reg10, reg10, reg11));
11206 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11207 UNSPEC_GOT);
11208 x = gen_rtx_CONST (Pmode, x);
11209 emit_move_insn (reg11, x);
11210 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11211 x = gen_const_mem (Pmode, x);
11212 emit_move_insn (reg11, x);
11213 }
11214 else
11215 emit_move_insn (reg11, split_stack_fn_large);
11216
11217 fn = reg11;
11218
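/* Pack ARGS_SIZE into the upper 32 bits and ALLOCATE into the lower
   32 bits; the shift is done in two steps so it remains well defined
   even on hosts where HOST_WIDE_INT is only 32 bits wide.  */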
11219 argval = ((args_size << 16) << 16) + allocate;
11220 emit_move_insn (reg10, GEN_INT (argval));
11221 }
11222 else
11223 {
11224 emit_move_insn (reg10, allocate_rtx);
11225 emit_move_insn (reg11, GEN_INT (args_size));
11226 use_reg (&call_fusage, reg11);
11227 }
11228
11229 use_reg (&call_fusage, reg10);
11230 }
11231 else
11232 {
11233 emit_insn (gen_push (GEN_INT (args_size)));
11234 emit_insn (gen_push (allocate_rtx));
11235 }
11236 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11237 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11238 NULL_RTX, false);
11239 add_function_usage_to (call_insn, call_fusage);
11240
11241 /* In order to make call/return prediction work right, we now need
11242 to execute a return instruction. See
11243 libgcc/config/i386/morestack.S for the details on how this works.
11244
11245 For flow purposes gcc must not see this as a return
11246 instruction--we need control flow to continue at the subsequent
11247 label. Therefore, we use an unspec. */
11248 gcc_assert (crtl->args.pops_args < 65536);
11249 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11250
11251 /* If we are in 64-bit mode and this function uses a static chain,
11252 we saved %r10 in %rax before calling __morestack. */
11253 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11254 emit_move_insn (gen_rtx_REG (Pmode, R10_REG),
11255 gen_rtx_REG (Pmode, AX_REG));
11256
11257 /* If this function calls va_start, we need to store a pointer to
11258 the arguments on the old stack, because they may not have been
11259 all copied to the new stack. At this point the old stack can be
11260 found at the frame pointer value used by __morestack, because
11261 __morestack has set that up before calling back to us. Here we
11262 store that pointer in a scratch register, and in
11263 ix86_expand_prologue we store the scratch register in a stack
11264 slot. */
11265 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11266 {
11267 unsigned int scratch_regno;
11268 rtx frame_reg;
11269 int words;
11270
11271 scratch_regno = split_stack_prologue_scratch_regno ();
11272 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11273 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11274
11275 /* 64-bit:
11276 fp -> old fp value
11277 return address within this function
11278 return address of caller of this function
11279 stack arguments
11280 So we add three words to get to the stack arguments.
11281
11282 32-bit:
11283 fp -> old fp value
11284 return address within this function
11285 first argument to __morestack
11286 second argument to __morestack
11287 return address of caller of this function
11288 stack arguments
11289 So we add five words to get to the stack arguments.
11290 */
11291 words = TARGET_64BIT ? 3 : 5;
11292 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11293 gen_rtx_PLUS (Pmode, frame_reg,
11294 GEN_INT (words * UNITS_PER_WORD))));
11295
11296 varargs_label = gen_label_rtx ();
11297 emit_jump_insn (gen_jump (varargs_label));
11298 JUMP_LABEL (get_last_insn ()) = varargs_label;
11299
11300 emit_barrier ();
11301 }
11302
11303 emit_label (label);
11304 LABEL_NUSES (label) = 1;
11305
11306 /* If this function calls va_start, we now have to set the scratch
11307 register for the case where we do not call __morestack. In this
11308 case we need to set it based on the stack pointer. */
11309 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11310 {
11311 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11312 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11313 GEN_INT (UNITS_PER_WORD))));
11314
11315 emit_label (varargs_label);
11316 LABEL_NUSES (varargs_label) = 1;
11317 }
11318 }
11319
11320 /* We may have to tell the dataflow pass that the split stack prologue
11321 is initializing a scratch register. */
11322
11323 static void
11324 ix86_live_on_entry (bitmap regs)
11325 {
11326 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11327 {
11328 gcc_assert (flag_split_stack);
11329 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11330 }
11331 }
11332 \f
11333 /* Determine if OP is a suitable SUBREG RTX for an address. */
11334
11335 static bool
11336 ix86_address_subreg_operand (rtx op)
11337 {
11338 enum machine_mode mode;
11339
11340 if (!REG_P (op))
11341 return false;
11342
11343 mode = GET_MODE (op);
11344
11345 if (GET_MODE_CLASS (mode) != MODE_INT)
11346 return false;
11347
11348 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11349 failures when the register is one word out of a two word structure. */
11350 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11351 return false;
11352
11353 /* Allow only SUBREGs of non-eliminable hard registers. */
11354 return register_no_elim_operand (op, mode);
11355 }
11356
11357 /* Extract the parts of an RTL expression that is a valid memory address
11358 for an instruction. Return 0 if the structure of the address is
11359 grossly off. Return -1 if the address contains ASHIFT, so it is not
11360 strictly valid, but is still used for computing the length of the lea instruction. */
11361
11362 int
11363 ix86_decompose_address (rtx addr, struct ix86_address *out)
11364 {
11365 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11366 rtx base_reg, index_reg;
11367 HOST_WIDE_INT scale = 1;
11368 rtx scale_rtx = NULL_RTX;
11369 rtx tmp;
11370 int retval = 1;
11371 enum ix86_address_seg seg = SEG_DEFAULT;
11372
11373 /* Allow zero-extended SImode addresses;
11374 they will be emitted with the addr32 prefix. */
11375 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11376 {
11377 if (GET_CODE (addr) == ZERO_EXTEND
11378 && GET_MODE (XEXP (addr, 0)) == SImode)
11379 addr = XEXP (addr, 0);
11380 else if (GET_CODE (addr) == AND
11381 && const_32bit_mask (XEXP (addr, 1), DImode))
11382 {
11383 addr = XEXP (addr, 0);
11384
11385 /* Strip subreg. */
11386 if (GET_CODE (addr) == SUBREG
11387 && GET_MODE (SUBREG_REG (addr)) == SImode)
11388 addr = SUBREG_REG (addr);
11389 }
11390 }
11391
11392 if (REG_P (addr))
11393 base = addr;
11394 else if (GET_CODE (addr) == SUBREG)
11395 {
11396 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11397 base = addr;
11398 else
11399 return 0;
11400 }
11401 else if (GET_CODE (addr) == PLUS)
11402 {
11403 rtx addends[4], op;
11404 int n = 0, i;
11405
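/* Walk down the chain of PLUS operands, collecting at most four addends:
   base, index (possibly scaled), displacement and a segment UNSPEC.  */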
11406 op = addr;
11407 do
11408 {
11409 if (n >= 4)
11410 return 0;
11411 addends[n++] = XEXP (op, 1);
11412 op = XEXP (op, 0);
11413 }
11414 while (GET_CODE (op) == PLUS);
11415 if (n >= 4)
11416 return 0;
11417 addends[n] = op;
11418
11419 for (i = n; i >= 0; --i)
11420 {
11421 op = addends[i];
11422 switch (GET_CODE (op))
11423 {
11424 case MULT:
11425 if (index)
11426 return 0;
11427 index = XEXP (op, 0);
11428 scale_rtx = XEXP (op, 1);
11429 break;
11430
11431 case ASHIFT:
11432 if (index)
11433 return 0;
11434 index = XEXP (op, 0);
11435 tmp = XEXP (op, 1);
11436 if (!CONST_INT_P (tmp))
11437 return 0;
11438 scale = INTVAL (tmp);
11439 if ((unsigned HOST_WIDE_INT) scale > 3)
11440 return 0;
11441 scale = 1 << scale;
11442 break;
11443
11444 case UNSPEC:
11445 if (XINT (op, 1) == UNSPEC_TP
11446 && TARGET_TLS_DIRECT_SEG_REFS
11447 && seg == SEG_DEFAULT)
11448 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11449 else
11450 return 0;
11451 break;
11452
11453 case SUBREG:
11454 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11455 return 0;
11456 /* FALLTHRU */
11457
11458 case REG:
11459 if (!base)
11460 base = op;
11461 else if (!index)
11462 index = op;
11463 else
11464 return 0;
11465 break;
11466
11467 case CONST:
11468 case CONST_INT:
11469 case SYMBOL_REF:
11470 case LABEL_REF:
11471 if (disp)
11472 return 0;
11473 disp = op;
11474 break;
11475
11476 default:
11477 return 0;
11478 }
11479 }
11480 }
11481 else if (GET_CODE (addr) == MULT)
11482 {
11483 index = XEXP (addr, 0); /* index*scale */
11484 scale_rtx = XEXP (addr, 1);
11485 }
11486 else if (GET_CODE (addr) == ASHIFT)
11487 {
11488 /* We're called for lea too, which implements ashift on occasion. */
11489 index = XEXP (addr, 0);
11490 tmp = XEXP (addr, 1);
11491 if (!CONST_INT_P (tmp))
11492 return 0;
11493 scale = INTVAL (tmp);
11494 if ((unsigned HOST_WIDE_INT) scale > 3)
11495 return 0;
11496 scale = 1 << scale;
11497 retval = -1;
11498 }
11499 else
11500 disp = addr; /* displacement */
11501
11502 if (index)
11503 {
11504 if (REG_P (index))
11505 ;
11506 else if (GET_CODE (index) == SUBREG
11507 && ix86_address_subreg_operand (SUBREG_REG (index)))
11508 ;
11509 else
11510 return 0;
11511 }
11512
11513 /* Extract the integral value of scale. */
11514 if (scale_rtx)
11515 {
11516 if (!CONST_INT_P (scale_rtx))
11517 return 0;
11518 scale = INTVAL (scale_rtx);
11519 }
11520
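/* Strip any SUBREG wrappers so the special cases below can inspect the
   underlying hard registers.  */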
11521 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11522 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11523
11524 /* Avoid useless 0 displacement. */
11525 if (disp == const0_rtx && (base || index))
11526 disp = NULL_RTX;
11527
11528 /* Allow the arg pointer and stack pointer as the index if there is no scaling. */
11529 if (base_reg && index_reg && scale == 1
11530 && (index_reg == arg_pointer_rtx
11531 || index_reg == frame_pointer_rtx
11532 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11533 {
11534 rtx tmp;
11535 tmp = base, base = index, index = tmp;
11536 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11537 }
11538
11539 /* Special case: %ebp cannot be encoded as a base without a displacement.
11540 Similarly %r13. */
11541 if (!disp
11542 && base_reg
11543 && (base_reg == hard_frame_pointer_rtx
11544 || base_reg == frame_pointer_rtx
11545 || base_reg == arg_pointer_rtx
11546 || (REG_P (base_reg)
11547 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11548 || REGNO (base_reg) == R13_REG))))
11549 disp = const0_rtx;
11550
11551 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
11552 Avoid this by transforming it to [%esi+0].
11553 Reload calls address legitimization without cfun defined, so we need
11554 to test cfun for being non-NULL. */
11555 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11556 && base_reg && !index_reg && !disp
11557 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11558 disp = const0_rtx;
11559
11560 /* Special case: encode reg+reg instead of reg*2. */
11561 if (!base && index && scale == 2)
11562 base = index, base_reg = index_reg, scale = 1;
11563
11564 /* Special case: scaling cannot be encoded without base or displacement. */
11565 if (!base && !disp && index && scale != 1)
11566 disp = const0_rtx;
11567
11568 out->base = base;
11569 out->index = index;
11570 out->disp = disp;
11571 out->scale = scale;
11572 out->seg = seg;
11573
11574 return retval;
11575 }
11576 \f
11577 /* Return the cost of the memory address X.
11578 For i386, it is better to use a complex address than let gcc copy
11579 the address into a reg and make a new pseudo. But not if the address
11580 requires two regs - that would mean more pseudos with longer
11581 lifetimes. */
11582 static int
11583 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11584 {
11585 struct ix86_address parts;
11586 int cost = 1;
11587 int ok = ix86_decompose_address (x, &parts);
11588
11589 gcc_assert (ok);
11590
11591 if (parts.base && GET_CODE (parts.base) == SUBREG)
11592 parts.base = SUBREG_REG (parts.base);
11593 if (parts.index && GET_CODE (parts.index) == SUBREG)
11594 parts.index = SUBREG_REG (parts.index);
11595
11596 /* Attempt to minimize number of registers in the address. */
11597 if ((parts.base
11598 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11599 || (parts.index
11600 && (!REG_P (parts.index)
11601 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11602 cost++;
11603
11604 if (parts.base
11605 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11606 && parts.index
11607 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11608 && parts.base != parts.index)
11609 cost++;
11610
11611 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11612 since its predecode logic can't detect the length of such instructions
11613 and decoding degenerates to vector decoding. Increase the cost of such
11614 addresses here. The penalty is at least 2 cycles. It may be worthwhile
11615 to split such addresses or even refuse them altogether.
11616 
11617 The following addressing modes are affected:
11618 [base+scale*index]
11619 [scale*index+disp]
11620 [base+index]
11621 
11622 The first and last case may be avoidable by explicitly coding the zero
11623 into the memory address, but I don't have an AMD-K6 machine handy to
11624 check this theory. */
11625
11626 if (TARGET_K6
11627 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11628 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11629 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11630 cost += 10;
11631
11632 return cost;
11633 }
11634 \f
11635 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
11636 this is used to form addresses to local data when -fPIC is in
11637 use. */
11638
11639 static bool
11640 darwin_local_data_pic (rtx disp)
11641 {
11642 return (GET_CODE (disp) == UNSPEC
11643 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11644 }
11645
11646 /* Determine if a given RTX is a valid constant. We already know this
11647 satisfies CONSTANT_P. */
11648
11649 static bool
11650 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11651 {
11652 switch (GET_CODE (x))
11653 {
11654 case CONST:
11655 x = XEXP (x, 0);
11656
11657 if (GET_CODE (x) == PLUS)
11658 {
11659 if (!CONST_INT_P (XEXP (x, 1)))
11660 return false;
11661 x = XEXP (x, 0);
11662 }
11663
11664 if (TARGET_MACHO && darwin_local_data_pic (x))
11665 return true;
11666
11667 /* Only some unspecs are valid as "constants". */
11668 if (GET_CODE (x) == UNSPEC)
11669 switch (XINT (x, 1))
11670 {
11671 case UNSPEC_GOT:
11672 case UNSPEC_GOTOFF:
11673 case UNSPEC_PLTOFF:
11674 return TARGET_64BIT;
11675 case UNSPEC_TPOFF:
11676 case UNSPEC_NTPOFF:
11677 x = XVECEXP (x, 0, 0);
11678 return (GET_CODE (x) == SYMBOL_REF
11679 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11680 case UNSPEC_DTPOFF:
11681 x = XVECEXP (x, 0, 0);
11682 return (GET_CODE (x) == SYMBOL_REF
11683 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11684 default:
11685 return false;
11686 }
11687
11688 /* We must have drilled down to a symbol. */
11689 if (GET_CODE (x) == LABEL_REF)
11690 return true;
11691 if (GET_CODE (x) != SYMBOL_REF)
11692 return false;
11693 /* FALLTHRU */
11694
11695 case SYMBOL_REF:
11696 /* TLS symbols are never valid. */
11697 if (SYMBOL_REF_TLS_MODEL (x))
11698 return false;
11699
11700 /* DLLIMPORT symbols are never valid. */
11701 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11702 && SYMBOL_REF_DLLIMPORT_P (x))
11703 return false;
11704
11705 #if TARGET_MACHO
11706 /* mdynamic-no-pic */
11707 if (MACHO_DYNAMIC_NO_PIC_P)
11708 return machopic_symbol_defined_p (x);
11709 #endif
11710 break;
11711
11712 case CONST_DOUBLE:
11713 if (GET_MODE (x) == TImode
11714 && x != CONST0_RTX (TImode)
11715 && !TARGET_64BIT)
11716 return false;
11717 break;
11718
11719 case CONST_VECTOR:
11720 if (!standard_sse_constant_p (x))
11721 return false;
11722
11723 default:
11724 break;
11725 }
11726
11727 /* Otherwise we handle everything else in the move patterns. */
11728 return true;
11729 }
11730
11731 /* Determine if it's legal to put X into the constant pool. This
11732 is not possible for the address of thread-local symbols, which
11733 is checked above. */
11734
11735 static bool
11736 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11737 {
11738 /* We can always put integral constants and vectors in memory. */
11739 switch (GET_CODE (x))
11740 {
11741 case CONST_INT:
11742 case CONST_DOUBLE:
11743 case CONST_VECTOR:
11744 return false;
11745
11746 default:
11747 break;
11748 }
11749 return !ix86_legitimate_constant_p (mode, x);
11750 }
11751
11752
11753 /* Nonzero if the constant value X is a legitimate general operand
11754 when generating PIC code. It is given that flag_pic is on and
11755 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11756
11757 bool
11758 legitimate_pic_operand_p (rtx x)
11759 {
11760 rtx inner;
11761
11762 switch (GET_CODE (x))
11763 {
11764 case CONST:
11765 inner = XEXP (x, 0);
11766 if (GET_CODE (inner) == PLUS
11767 && CONST_INT_P (XEXP (inner, 1)))
11768 inner = XEXP (inner, 0);
11769
11770 /* Only some unspecs are valid as "constants". */
11771 if (GET_CODE (inner) == UNSPEC)
11772 switch (XINT (inner, 1))
11773 {
11774 case UNSPEC_GOT:
11775 case UNSPEC_GOTOFF:
11776 case UNSPEC_PLTOFF:
11777 return TARGET_64BIT;
11778 case UNSPEC_TPOFF:
11779 x = XVECEXP (inner, 0, 0);
11780 return (GET_CODE (x) == SYMBOL_REF
11781 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11782 case UNSPEC_MACHOPIC_OFFSET:
11783 return legitimate_pic_address_disp_p (x);
11784 default:
11785 return false;
11786 }
11787 /* FALLTHRU */
11788
11789 case SYMBOL_REF:
11790 case LABEL_REF:
11791 return legitimate_pic_address_disp_p (x);
11792
11793 default:
11794 return true;
11795 }
11796 }
11797
11798 /* Determine if a given CONST RTX is a valid memory displacement
11799 in PIC mode. */
11800
11801 bool
11802 legitimate_pic_address_disp_p (rtx disp)
11803 {
11804 bool saw_plus;
11805
11806 /* In 64bit mode we can allow direct addresses of symbols and labels
11807 when they are not dynamic symbols. */
11808 if (TARGET_64BIT)
11809 {
11810 rtx op0 = disp, op1;
11811
11812 switch (GET_CODE (disp))
11813 {
11814 case LABEL_REF:
11815 return true;
11816
11817 case CONST:
11818 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11819 break;
11820 op0 = XEXP (XEXP (disp, 0), 0);
11821 op1 = XEXP (XEXP (disp, 0), 1);
11822 if (!CONST_INT_P (op1)
11823 || INTVAL (op1) >= 16*1024*1024
11824 || INTVAL (op1) < -16*1024*1024)
11825 break;
11826 if (GET_CODE (op0) == LABEL_REF)
11827 return true;
11828 if (GET_CODE (op0) != SYMBOL_REF)
11829 break;
11830 /* FALLTHRU */
11831
11832 case SYMBOL_REF:
11833 /* TLS references should always be enclosed in UNSPEC. */
11834 if (SYMBOL_REF_TLS_MODEL (op0))
11835 return false;
11836 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11837 && ix86_cmodel != CM_LARGE_PIC)
11838 return true;
11839 break;
11840
11841 default:
11842 break;
11843 }
11844 }
11845 if (GET_CODE (disp) != CONST)
11846 return false;
11847 disp = XEXP (disp, 0);
11848
11849 if (TARGET_64BIT)
11850 {
11851 /* It is unsafe to allow PLUS expressions. This limits the allowed
11852 distance of GOT table references. We should not need these anyway. */
11853 if (GET_CODE (disp) != UNSPEC
11854 || (XINT (disp, 1) != UNSPEC_GOTPCREL
11855 && XINT (disp, 1) != UNSPEC_GOTOFF
11856 && XINT (disp, 1) != UNSPEC_PCREL
11857 && XINT (disp, 1) != UNSPEC_PLTOFF))
11858 return false;
11859
11860 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11861 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11862 return false;
11863 return true;
11864 }
11865
11866 saw_plus = false;
11867 if (GET_CODE (disp) == PLUS)
11868 {
11869 if (!CONST_INT_P (XEXP (disp, 1)))
11870 return false;
11871 disp = XEXP (disp, 0);
11872 saw_plus = true;
11873 }
11874
11875 if (TARGET_MACHO && darwin_local_data_pic (disp))
11876 return true;
11877
11878 if (GET_CODE (disp) != UNSPEC)
11879 return false;
11880
11881 switch (XINT (disp, 1))
11882 {
11883 case UNSPEC_GOT:
11884 if (saw_plus)
11885 return false;
11886 /* We need to check for both symbols and labels because VxWorks loads
11887 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
11888 details. */
11889 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11890 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
11891 case UNSPEC_GOTOFF:
11892 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
11893 While the ABI also specifies a 32bit relocation, we don't produce it
11894 in the small PIC model at all. */
11895 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11896 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
11897 && !TARGET_64BIT)
11898 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
11899 return false;
11900 case UNSPEC_GOTTPOFF:
11901 case UNSPEC_GOTNTPOFF:
11902 case UNSPEC_INDNTPOFF:
11903 if (saw_plus)
11904 return false;
11905 disp = XVECEXP (disp, 0, 0);
11906 return (GET_CODE (disp) == SYMBOL_REF
11907 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
11908 case UNSPEC_NTPOFF:
11909 disp = XVECEXP (disp, 0, 0);
11910 return (GET_CODE (disp) == SYMBOL_REF
11911 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
11912 case UNSPEC_DTPOFF:
11913 disp = XVECEXP (disp, 0, 0);
11914 return (GET_CODE (disp) == SYMBOL_REF
11915 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
11916 }
11917
11918 return false;
11919 }
11920
11921 /* Recognizes RTL expressions that are valid memory addresses for an
11922 instruction. The MODE argument is the machine mode for the MEM
11923 expression that wants to use this address.
11924
11925 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
11926 convert common non-canonical forms to canonical form so that they will
11927 be recognized. */
11928
11929 static bool
11930 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
11931 rtx addr, bool strict)
11932 {
11933 struct ix86_address parts;
11934 rtx base, index, disp;
11935 HOST_WIDE_INT scale;
11936
11937 if (ix86_decompose_address (addr, &parts) <= 0)
11938 /* Decomposition failed. */
11939 return false;
11940
11941 base = parts.base;
11942 index = parts.index;
11943 disp = parts.disp;
11944 scale = parts.scale;
11945
11946 /* Validate base register. */
11947 if (base)
11948 {
11949 rtx reg;
11950
11951 if (REG_P (base))
11952 reg = base;
11953 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
11954 reg = SUBREG_REG (base);
11955 else
11956 /* Base is not a register. */
11957 return false;
11958
11959 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
11960 return false;
11961
11962 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
11963 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
11964 /* Base is not valid. */
11965 return false;
11966 }
11967
11968 /* Validate index register. */
11969 if (index)
11970 {
11971 rtx reg;
11972
11973 if (REG_P (index))
11974 reg = index;
11975 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
11976 reg = SUBREG_REG (index);
11977 else
11978 /* Index is not a register. */
11979 return false;
11980
11981 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
11982 return false;
11983
11984 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
11985 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
11986 /* Index is not valid. */
11987 return false;
11988 }
11989
11990 /* Index and base should have the same mode. */
11991 if (base && index
11992 && GET_MODE (base) != GET_MODE (index))
11993 return false;
11994
11995 /* Validate scale factor. */
11996 if (scale != 1)
11997 {
11998 if (!index)
11999 /* Scale without index. */
12000 return false;
12001
12002 if (scale != 2 && scale != 4 && scale != 8)
12003 /* Scale is not a valid multiplier. */
12004 return false;
12005 }
12006
12007 /* Validate displacement. */
12008 if (disp)
12009 {
12010 if (GET_CODE (disp) == CONST
12011 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12012 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12013 switch (XINT (XEXP (disp, 0), 1))
12014 {
12015 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit when
12016 used. While the ABI also specifies 32bit relocations, we don't produce
12017 them at all and use IP-relative addressing instead. */
12018 case UNSPEC_GOT:
12019 case UNSPEC_GOTOFF:
12020 gcc_assert (flag_pic);
12021 if (!TARGET_64BIT)
12022 goto is_legitimate_pic;
12023
12024 /* 64bit address unspec. */
12025 return false;
12026
12027 case UNSPEC_GOTPCREL:
12028 case UNSPEC_PCREL:
12029 gcc_assert (flag_pic);
12030 goto is_legitimate_pic;
12031
12032 case UNSPEC_GOTTPOFF:
12033 case UNSPEC_GOTNTPOFF:
12034 case UNSPEC_INDNTPOFF:
12035 case UNSPEC_NTPOFF:
12036 case UNSPEC_DTPOFF:
12037 break;
12038
12039 case UNSPEC_STACK_CHECK:
12040 gcc_assert (flag_split_stack);
12041 break;
12042
12043 default:
12044 /* Invalid address unspec. */
12045 return false;
12046 }
12047
12048 else if (SYMBOLIC_CONST (disp)
12049 && (flag_pic
12050 || (TARGET_MACHO
12051 #if TARGET_MACHO
12052 && MACHOPIC_INDIRECT
12053 && !machopic_operand_p (disp)
12054 #endif
12055 )))
12056 {
12057
12058 is_legitimate_pic:
12059 if (TARGET_64BIT && (index || base))
12060 {
12061 /* foo@dtpoff(%rX) is ok. */
12062 if (GET_CODE (disp) != CONST
12063 || GET_CODE (XEXP (disp, 0)) != PLUS
12064 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12065 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12066 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12067 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12068 /* Non-constant pic memory reference. */
12069 return false;
12070 }
12071 else if ((!TARGET_MACHO || flag_pic)
12072 && ! legitimate_pic_address_disp_p (disp))
12073 /* Displacement is an invalid pic construct. */
12074 return false;
12075 #if TARGET_MACHO
12076 else if (MACHO_DYNAMIC_NO_PIC_P
12077 && !ix86_legitimate_constant_p (Pmode, disp))
12078 /* Displacement must be referenced via a non_lazy_pointer. */
12079 return false;
12080 #endif
12081
12082 /* This code used to verify that a symbolic pic displacement
12083 includes the pic_offset_table_rtx register.
12084
12085 While this is a good idea, unfortunately these constructs may
12086 be created by "adds using lea" optimization for incorrect
12087 code like:
12088
12089 int a;
12090 int foo(int i)
12091 {
12092 return *(&a+i);
12093 }
12094
12095 This code is nonsensical, but results in addressing the
12096 GOT table with a pic_offset_table_rtx base. We can't
12097 just refuse it easily, since it gets matched by the
12098 "addsi3" pattern, which later gets split to lea when the
12099 output register differs from the input. While this
12100 could be handled by a separate addsi pattern for this case
12101 that never results in lea, disabling this test seems to be
12102 the easier and correct fix for the crash. */
12103 }
12104 else if (GET_CODE (disp) != LABEL_REF
12105 && !CONST_INT_P (disp)
12106 && (GET_CODE (disp) != CONST
12107 || !ix86_legitimate_constant_p (Pmode, disp))
12108 && (GET_CODE (disp) != SYMBOL_REF
12109 || !ix86_legitimate_constant_p (Pmode, disp)))
12110 /* Displacement is not constant. */
12111 return false;
12112 else if (TARGET_64BIT
12113 && !x86_64_immediate_operand (disp, VOIDmode))
12114 /* Displacement is out of range. */
12115 return false;
12116 }
12117
12118 /* Everything looks valid. */
12119 return true;
12120 }
12121
12122 /* Determine if a given RTX is a valid constant address. */
12123
12124 bool
12125 constant_address_p (rtx x)
12126 {
12127 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12128 }
12129 \f
12130 /* Return a unique alias set for the GOT. */
12131
12132 static alias_set_type
12133 ix86_GOT_alias_set (void)
12134 {
12135 static alias_set_type set = -1;
12136 if (set == -1)
12137 set = new_alias_set ();
12138 return set;
12139 }
12140
12141 /* Return a legitimate reference for ORIG (an address) using the
12142 register REG. If REG is 0, a new pseudo is generated.
12143
12144 There are two types of references that must be handled:
12145
12146 1. Global data references must load the address from the GOT, via
12147 the PIC reg. An insn is emitted to do this load, and the reg is
12148 returned.
12149
12150 2. Static data references, constant pool addresses, and code labels
12151 compute the address as an offset from the GOT, whose base is in
12152 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12153 differentiate them from global data objects. The returned
12154 address is the PIC reg + an unspec constant.
12155
12156 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12157 reg also appears in the address. */
12158
12159 static rtx
12160 legitimize_pic_address (rtx orig, rtx reg)
12161 {
12162 rtx addr = orig;
12163 rtx new_rtx = orig;
12164 rtx base;
12165
12166 #if TARGET_MACHO
12167 if (TARGET_MACHO && !TARGET_64BIT)
12168 {
12169 if (reg == 0)
12170 reg = gen_reg_rtx (Pmode);
12171 /* Use the generic Mach-O PIC machinery. */
12172 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12173 }
12174 #endif
12175
12176 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12177 new_rtx = addr;
12178 else if (TARGET_64BIT
12179 && ix86_cmodel != CM_SMALL_PIC
12180 && gotoff_operand (addr, Pmode))
12181 {
12182 rtx tmpreg;
12183 /* This symbol may be referenced via a displacement from the PIC
12184 base address (@GOTOFF). */
12185
12186 if (reload_in_progress)
12187 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12188 if (GET_CODE (addr) == CONST)
12189 addr = XEXP (addr, 0);
12190 if (GET_CODE (addr) == PLUS)
12191 {
12192 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12193 UNSPEC_GOTOFF);
12194 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12195 }
12196 else
12197 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12198 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12199 if (!reg)
12200 tmpreg = gen_reg_rtx (Pmode);
12201 else
12202 tmpreg = reg;
12203 emit_move_insn (tmpreg, new_rtx);
12204
12205 if (reg != 0)
12206 {
12207 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12208 tmpreg, 1, OPTAB_DIRECT);
12209 new_rtx = reg;
12210 }
12211 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12212 }
12213 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12214 {
12215 /* This symbol may be referenced via a displacement from the PIC
12216 base address (@GOTOFF). */
12217
12218 if (reload_in_progress)
12219 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12220 if (GET_CODE (addr) == CONST)
12221 addr = XEXP (addr, 0);
12222 if (GET_CODE (addr) == PLUS)
12223 {
12224 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12225 UNSPEC_GOTOFF);
12226 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12227 }
12228 else
12229 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12230 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12231 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12232
12233 if (reg != 0)
12234 {
12235 emit_move_insn (reg, new_rtx);
12236 new_rtx = reg;
12237 }
12238 }
12239 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12240 /* We can't use @GOTOFF for text labels on VxWorks;
12241 see gotoff_operand. */
12242 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12243 {
12244 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12245 {
12246 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12247 return legitimize_dllimport_symbol (addr, true);
12248 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12249 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12250 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12251 {
12252 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12253 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12254 }
12255 }
12256
12257 /* For x64 PE-COFF there is no GOT table, so we use the address
12258 directly. */
12259 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12260 {
12261 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12262 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12263
12264 if (reg == 0)
12265 reg = gen_reg_rtx (Pmode);
12266 emit_move_insn (reg, new_rtx);
12267 new_rtx = reg;
12268 }
12269 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12270 {
12271 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12272 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12273 new_rtx = gen_const_mem (Pmode, new_rtx);
12274 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12275
12276 if (reg == 0)
12277 reg = gen_reg_rtx (Pmode);
12278 /* Use gen_movsi directly, otherwise the address is loaded
12279 into a register for CSE. We don't want to CSE these addresses;
12280 instead we CSE addresses from the GOT table, so skip this. */
12281 emit_insn (gen_movsi (reg, new_rtx));
12282 new_rtx = reg;
12283 }
12284 else
12285 {
12286 /* This symbol must be referenced via a load from the
12287 Global Offset Table (@GOT). */
12288
12289 if (reload_in_progress)
12290 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12291 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12292 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12293 if (TARGET_64BIT)
12294 new_rtx = force_reg (Pmode, new_rtx);
12295 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12296 new_rtx = gen_const_mem (Pmode, new_rtx);
12297 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12298
12299 if (reg == 0)
12300 reg = gen_reg_rtx (Pmode);
12301 emit_move_insn (reg, new_rtx);
12302 new_rtx = reg;
12303 }
12304 }
12305 else
12306 {
12307 if (CONST_INT_P (addr)
12308 && !x86_64_immediate_operand (addr, VOIDmode))
12309 {
12310 if (reg)
12311 {
12312 emit_move_insn (reg, addr);
12313 new_rtx = reg;
12314 }
12315 else
12316 new_rtx = force_reg (Pmode, addr);
12317 }
12318 else if (GET_CODE (addr) == CONST)
12319 {
12320 addr = XEXP (addr, 0);
12321
12322 /* We must match what we generated earlier. Assume the only
12323 unspecs that can get here are ours. Not that we could do
12324 anything with them anyway.... */
12325 if (GET_CODE (addr) == UNSPEC
12326 || (GET_CODE (addr) == PLUS
12327 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12328 return orig;
12329 gcc_assert (GET_CODE (addr) == PLUS);
12330 }
12331 if (GET_CODE (addr) == PLUS)
12332 {
12333 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12334
12335 /* Check first to see if this is a constant offset from a @GOTOFF
12336 symbol reference. */
12337 if (gotoff_operand (op0, Pmode)
12338 && CONST_INT_P (op1))
12339 {
12340 if (!TARGET_64BIT)
12341 {
12342 if (reload_in_progress)
12343 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12344 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12345 UNSPEC_GOTOFF);
12346 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12347 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12348 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12349
12350 if (reg != 0)
12351 {
12352 emit_move_insn (reg, new_rtx);
12353 new_rtx = reg;
12354 }
12355 }
12356 else
12357 {
12358 if (INTVAL (op1) < -16*1024*1024
12359 || INTVAL (op1) >= 16*1024*1024)
12360 {
12361 if (!x86_64_immediate_operand (op1, Pmode))
12362 op1 = force_reg (Pmode, op1);
12363 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12364 }
12365 }
12366 }
12367 else
12368 {
12369 base = legitimize_pic_address (XEXP (addr, 0), reg);
12370 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12371 base == reg ? NULL_RTX : reg);
12372
12373 if (CONST_INT_P (new_rtx))
12374 new_rtx = plus_constant (base, INTVAL (new_rtx));
12375 else
12376 {
12377 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12378 {
12379 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12380 new_rtx = XEXP (new_rtx, 1);
12381 }
12382 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12383 }
12384 }
12385 }
12386 }
12387 return new_rtx;
12388 }
12389 \f
12390 /* Load the thread pointer. If TO_REG is true, force it into a register. */
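/* On GNU/Linux the thread pointer lives behind a segment register
   (%gs in 32-bit mode, %fs in 64-bit mode), so the UNSPEC_TP access
   built below typically ends up as a reference to %gs:0 or %fs:0.  */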
12391
12392 static rtx
12393 get_thread_pointer (bool to_reg)
12394 {
12395 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12396
12397 if (GET_MODE (tp) != Pmode)
12398 tp = convert_to_mode (Pmode, tp, 1);
12399
12400 if (to_reg)
12401 tp = copy_addr_to_reg (tp);
12402
12403 return tp;
12404 }
12405
12406 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12407
12408 static GTY(()) rtx ix86_tls_symbol;
12409
12410 static rtx
12411 ix86_tls_get_addr (void)
12412 {
12413 if (!ix86_tls_symbol)
12414 {
12415 const char *sym
12416 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12417 ? "___tls_get_addr" : "__tls_get_addr");
12418
12419 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12420 }
12421
12422 return ix86_tls_symbol;
12423 }
12424
12425 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12426
12427 static GTY(()) rtx ix86_tls_module_base_symbol;
12428
12429 rtx
12430 ix86_tls_module_base (void)
12431 {
12432 if (!ix86_tls_module_base_symbol)
12433 {
12434 ix86_tls_module_base_symbol
12435 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12436
12437 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12438 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12439 }
12440
12441 return ix86_tls_module_base_symbol;
12442 }
12443
12444 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12445 false if we expect this to be used for a memory address and true if
12446 we expect to load the address into a register. */
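/* As a rough illustration (x86-64 GNU/Linux, AT&T syntax) of the kind
   of sequences the models below expand to; the precise code depends on
   TARGET_GNU2_TLS and the other flags checked in the function:

	global dynamic:	leaq	x@tlsgd(%rip), %rdi
			call	__tls_get_addr@PLT
	initial exec:	movq	x@gottpoff(%rip), %rax
			movq	%fs:(%rax), ...
	local exec:	movq	%fs:0, %rax
			...	x@tpoff(%rax) ...  */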
12447
12448 static rtx
12449 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12450 {
12451 rtx dest, base, off;
12452 rtx pic = NULL_RTX, tp = NULL_RTX;
12453 int type;
12454
12455 switch (model)
12456 {
12457 case TLS_MODEL_GLOBAL_DYNAMIC:
12458 dest = gen_reg_rtx (Pmode);
12459
12460 if (!TARGET_64BIT)
12461 {
12462 if (flag_pic)
12463 pic = pic_offset_table_rtx;
12464 else
12465 {
12466 pic = gen_reg_rtx (Pmode);
12467 emit_insn (gen_set_got (pic));
12468 }
12469 }
12470
12471 if (TARGET_GNU2_TLS)
12472 {
12473 if (TARGET_64BIT)
12474 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12475 else
12476 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12477
12478 tp = get_thread_pointer (true);
12479 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12480
12481 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12482 }
12483 else
12484 {
12485 rtx caddr = ix86_tls_get_addr ();
12486
12487 if (TARGET_64BIT)
12488 {
12489 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12490
12491 start_sequence ();
12492 emit_call_insn (gen_tls_global_dynamic_64 (rax, x, caddr));
12493 insns = get_insns ();
12494 end_sequence ();
12495
12496 RTL_CONST_CALL_P (insns) = 1;
12497 emit_libcall_block (insns, dest, rax, x);
12498 }
12499 else
12500 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12501 }
12502 break;
12503
12504 case TLS_MODEL_LOCAL_DYNAMIC:
12505 base = gen_reg_rtx (Pmode);
12506
12507 if (!TARGET_64BIT)
12508 {
12509 if (flag_pic)
12510 pic = pic_offset_table_rtx;
12511 else
12512 {
12513 pic = gen_reg_rtx (Pmode);
12514 emit_insn (gen_set_got (pic));
12515 }
12516 }
12517
12518 if (TARGET_GNU2_TLS)
12519 {
12520 rtx tmp = ix86_tls_module_base ();
12521
12522 if (TARGET_64BIT)
12523 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12524 else
12525 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12526
12527 tp = get_thread_pointer (true);
12528 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12529 gen_rtx_MINUS (Pmode, tmp, tp));
12530 }
12531 else
12532 {
12533 rtx caddr = ix86_tls_get_addr ();
12534
12535 if (TARGET_64BIT)
12536 {
12537 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12538
12539 start_sequence ();
12540 emit_call_insn (gen_tls_local_dynamic_base_64 (rax, caddr));
12541 insns = get_insns ();
12542 end_sequence ();
12543
12544 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12545 share the LD_BASE result with other LD model accesses. */
12546 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12547 UNSPEC_TLS_LD_BASE);
12548
12549 RTL_CONST_CALL_P (insns) = 1;
12550 emit_libcall_block (insns, base, rax, eqv);
12551 }
12552 else
12553 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12554 }
12555
12556 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12557 off = gen_rtx_CONST (Pmode, off);
12558
12559 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12560
12561 if (TARGET_GNU2_TLS)
12562 {
12563 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12564
12565 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12566 }
12567 break;
12568
12569 case TLS_MODEL_INITIAL_EXEC:
12570 if (TARGET_64BIT)
12571 {
12572 if (TARGET_SUN_TLS)
12573 {
12574 /* The Sun linker took the AMD64 TLS spec literally
12575 and can only handle %rax as the destination of the
12576 initial-exec code sequence. */
12577
12578 dest = gen_reg_rtx (Pmode);
12579 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12580 return dest;
12581 }
12582
12583 pic = NULL;
12584 type = UNSPEC_GOTNTPOFF;
12585 }
12586 else if (flag_pic)
12587 {
12588 if (reload_in_progress)
12589 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12590 pic = pic_offset_table_rtx;
12591 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12592 }
12593 else if (!TARGET_ANY_GNU_TLS)
12594 {
12595 pic = gen_reg_rtx (Pmode);
12596 emit_insn (gen_set_got (pic));
12597 type = UNSPEC_GOTTPOFF;
12598 }
12599 else
12600 {
12601 pic = NULL;
12602 type = UNSPEC_INDNTPOFF;
12603 }
12604
12605 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
12606 off = gen_rtx_CONST (Pmode, off);
12607 if (pic)
12608 off = gen_rtx_PLUS (Pmode, pic, off);
12609 off = gen_const_mem (Pmode, off);
12610 set_mem_alias_set (off, ix86_GOT_alias_set ());
12611
12612 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12613 {
12614 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12615 off = force_reg (Pmode, off);
12616 return gen_rtx_PLUS (Pmode, base, off);
12617 }
12618 else
12619 {
12620 base = get_thread_pointer (true);
12621 dest = gen_reg_rtx (Pmode);
12622 emit_insn (gen_subsi3 (dest, base, off));
12623 }
12624 break;
12625
12626 case TLS_MODEL_LOCAL_EXEC:
12627 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12628 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12629 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12630 off = gen_rtx_CONST (Pmode, off);
12631
12632 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12633 {
12634 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12635 return gen_rtx_PLUS (Pmode, base, off);
12636 }
12637 else
12638 {
12639 base = get_thread_pointer (true);
12640 dest = gen_reg_rtx (Pmode);
12641 emit_insn (gen_subsi3 (dest, base, off));
12642 }
12643 break;
12644
12645 default:
12646 gcc_unreachable ();
12647 }
12648
12649 return dest;
12650 }
12651
12652 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12653 to symbol DECL. */
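/* For illustration: a reference to __declspec(dllimport) int foo is
   redirected through the import table slot, i.e. roughly *__imp__foo
   (or *__imp_foo when no user label prefix applies), matching the
   prefix selection in get_dllimport_decl below.  */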
12654
12655 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12656 htab_t dllimport_map;
12657
12658 static tree
12659 get_dllimport_decl (tree decl)
12660 {
12661 struct tree_map *h, in;
12662 void **loc;
12663 const char *name;
12664 const char *prefix;
12665 size_t namelen, prefixlen;
12666 char *imp_name;
12667 tree to;
12668 rtx rtl;
12669
12670 if (!dllimport_map)
12671 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12672
12673 in.hash = htab_hash_pointer (decl);
12674 in.base.from = decl;
12675 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12676 h = (struct tree_map *) *loc;
12677 if (h)
12678 return h->to;
12679
12680 *loc = h = ggc_alloc_tree_map ();
12681 h->hash = in.hash;
12682 h->base.from = decl;
12683 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12684 VAR_DECL, NULL, ptr_type_node);
12685 DECL_ARTIFICIAL (to) = 1;
12686 DECL_IGNORED_P (to) = 1;
12687 DECL_EXTERNAL (to) = 1;
12688 TREE_READONLY (to) = 1;
12689
12690 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12691 name = targetm.strip_name_encoding (name);
12692 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12693 ? "*__imp_" : "*__imp__";
12694 namelen = strlen (name);
12695 prefixlen = strlen (prefix);
12696 imp_name = (char *) alloca (namelen + prefixlen + 1);
12697 memcpy (imp_name, prefix, prefixlen);
12698 memcpy (imp_name + prefixlen, name, namelen + 1);
12699
12700 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12701 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12702 SET_SYMBOL_REF_DECL (rtl, to);
12703 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12704
12705 rtl = gen_const_mem (Pmode, rtl);
12706 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12707
12708 SET_DECL_RTL (to, rtl);
12709 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12710
12711 return to;
12712 }
12713
12714 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12715 true if we require the result be a register. */
12716
12717 static rtx
12718 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12719 {
12720 tree imp_decl;
12721 rtx x;
12722
12723 gcc_assert (SYMBOL_REF_DECL (symbol));
12724 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12725
12726 x = DECL_RTL (imp_decl);
12727 if (want_reg)
12728 x = force_reg (Pmode, x);
12729 return x;
12730 }
12731
12732 /* Try machine-dependent ways of modifying an illegitimate address
12733 to be legitimate. If we find one, return the new, valid address.
12734 This macro is used in only one place: `memory_address' in explow.c.
12735
12736 OLDX is the address as it was before break_out_memory_refs was called.
12737 In some cases it is useful to look at this to decide what needs to be done.
12738
12739 It is always safe for this macro to do nothing. It exists to recognize
12740 opportunities to optimize the output.
12741
12742 For the 80386, we handle X+REG by loading X into a register R and
12743 using R+REG. R will go in a general reg and indexing will be used.
12744 However, if REG is a broken-out memory address or multiplication,
12745 nothing needs to be done because REG can certainly go in a general reg.
12746
12747 When -fpic is used, special handling is needed for symbolic references.
12748 See comments by legitimize_pic_address in i386.c for details. */
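/* For example, the canonicalizations below rewrite an address such as

	(plus (ashift (reg) (const_int 2)) (reg))

   into

	(plus (mult (reg) (const_int 4)) (reg))

   so that it matches the scaled base+index addressing mode.  */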
12749
12750 static rtx
12751 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12752 enum machine_mode mode)
12753 {
12754 int changed = 0;
12755 unsigned log;
12756
12757 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12758 if (log)
12759 return legitimize_tls_address (x, (enum tls_model) log, false);
12760 if (GET_CODE (x) == CONST
12761 && GET_CODE (XEXP (x, 0)) == PLUS
12762 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12763 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12764 {
12765 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12766 (enum tls_model) log, false);
12767 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12768 }
12769
12770 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12771 {
12772 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12773 return legitimize_dllimport_symbol (x, true);
12774 if (GET_CODE (x) == CONST
12775 && GET_CODE (XEXP (x, 0)) == PLUS
12776 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12777 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12778 {
12779 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12780 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12781 }
12782 }
12783
12784 if (flag_pic && SYMBOLIC_CONST (x))
12785 return legitimize_pic_address (x, 0);
12786
12787 #if TARGET_MACHO
12788 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12789 return machopic_indirect_data_reference (x, 0);
12790 #endif
12791
12792 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12793 if (GET_CODE (x) == ASHIFT
12794 && CONST_INT_P (XEXP (x, 1))
12795 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
12796 {
12797 changed = 1;
12798 log = INTVAL (XEXP (x, 1));
12799 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
12800 GEN_INT (1 << log));
12801 }
12802
12803 if (GET_CODE (x) == PLUS)
12804 {
12805 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12806
12807 if (GET_CODE (XEXP (x, 0)) == ASHIFT
12808 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12809 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
12810 {
12811 changed = 1;
12812 log = INTVAL (XEXP (XEXP (x, 0), 1));
12813 XEXP (x, 0) = gen_rtx_MULT (Pmode,
12814 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
12815 GEN_INT (1 << log));
12816 }
12817
12818 if (GET_CODE (XEXP (x, 1)) == ASHIFT
12819 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
12820 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
12821 {
12822 changed = 1;
12823 log = INTVAL (XEXP (XEXP (x, 1), 1));
12824 XEXP (x, 1) = gen_rtx_MULT (Pmode,
12825 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
12826 GEN_INT (1 << log));
12827 }
12828
12829 /* Put multiply first if it isn't already. */
12830 if (GET_CODE (XEXP (x, 1)) == MULT)
12831 {
12832 rtx tmp = XEXP (x, 0);
12833 XEXP (x, 0) = XEXP (x, 1);
12834 XEXP (x, 1) = tmp;
12835 changed = 1;
12836 }
12837
12838 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
12839 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
12840 created by virtual register instantiation, register elimination, and
12841 similar optimizations. */
12842 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
12843 {
12844 changed = 1;
12845 x = gen_rtx_PLUS (Pmode,
12846 gen_rtx_PLUS (Pmode, XEXP (x, 0),
12847 XEXP (XEXP (x, 1), 0)),
12848 XEXP (XEXP (x, 1), 1));
12849 }
12850
12851 /* Canonicalize
12852 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
12853 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
12854 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
12855 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12856 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
12857 && CONSTANT_P (XEXP (x, 1)))
12858 {
12859 rtx constant;
12860 rtx other = NULL_RTX;
12861
12862 if (CONST_INT_P (XEXP (x, 1)))
12863 {
12864 constant = XEXP (x, 1);
12865 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
12866 }
12867 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
12868 {
12869 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
12870 other = XEXP (x, 1);
12871 }
12872 else
12873 constant = 0;
12874
12875 if (constant)
12876 {
12877 changed = 1;
12878 x = gen_rtx_PLUS (Pmode,
12879 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
12880 XEXP (XEXP (XEXP (x, 0), 1), 0)),
12881 plus_constant (other, INTVAL (constant)));
12882 }
12883 }
12884
12885 if (changed && ix86_legitimate_address_p (mode, x, false))
12886 return x;
12887
12888 if (GET_CODE (XEXP (x, 0)) == MULT)
12889 {
12890 changed = 1;
12891 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
12892 }
12893
12894 if (GET_CODE (XEXP (x, 1)) == MULT)
12895 {
12896 changed = 1;
12897 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
12898 }
12899
12900 if (changed
12901 && REG_P (XEXP (x, 1))
12902 && REG_P (XEXP (x, 0)))
12903 return x;
12904
12905 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
12906 {
12907 changed = 1;
12908 x = legitimize_pic_address (x, 0);
12909 }
12910
12911 if (changed && ix86_legitimate_address_p (mode, x, false))
12912 return x;
12913
12914 if (REG_P (XEXP (x, 0)))
12915 {
12916 rtx temp = gen_reg_rtx (Pmode);
12917 rtx val = force_operand (XEXP (x, 1), temp);
12918 if (val != temp)
12919 {
12920 if (GET_MODE (val) != Pmode)
12921 val = convert_to_mode (Pmode, val, 1);
12922 emit_move_insn (temp, val);
12923 }
12924
12925 XEXP (x, 1) = temp;
12926 return x;
12927 }
12928
12929 else if (REG_P (XEXP (x, 1)))
12930 {
12931 rtx temp = gen_reg_rtx (Pmode);
12932 rtx val = force_operand (XEXP (x, 0), temp);
12933 if (val != temp)
12934 {
12935 if (GET_MODE (val) != Pmode)
12936 val = convert_to_mode (Pmode, val, 1);
12937 emit_move_insn (temp, val);
12938 }
12939
12940 XEXP (x, 0) = temp;
12941 return x;
12942 }
12943 }
12944
12945 return x;
12946 }
12947 \f
12948 /* Print an integer constant expression in assembler syntax. Addition
12949 and subtraction are the only arithmetic that may appear in these
12950 expressions. FILE is the stdio stream to write to, X is the rtx, and
12951 CODE is the operand print code from the output string. */
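/* For example, (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)) is
   printed as "foo@GOTOFF", and (const (plus (unspec [(symbol_ref "foo")]
   UNSPEC_GOT) (const_int 4))) as "4+foo@GOT", since the integer constant
   is emitted first (see the PLUS and UNSPEC cases below).  */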
12952
12953 static void
12954 output_pic_addr_const (FILE *file, rtx x, int code)
12955 {
12956 char buf[256];
12957
12958 switch (GET_CODE (x))
12959 {
12960 case PC:
12961 gcc_assert (flag_pic);
12962 putc ('.', file);
12963 break;
12964
12965 case SYMBOL_REF:
12966 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
12967 output_addr_const (file, x);
12968 else
12969 {
12970 const char *name = XSTR (x, 0);
12971
12972 /* Mark the decl as referenced so that cgraph will
12973 output the function. */
12974 if (SYMBOL_REF_DECL (x))
12975 mark_decl_referenced (SYMBOL_REF_DECL (x));
12976
12977 #if TARGET_MACHO
12978 if (MACHOPIC_INDIRECT
12979 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
12980 name = machopic_indirection_name (x, /*stub_p=*/true);
12981 #endif
12982 assemble_name (file, name);
12983 }
12984 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12985 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
12986 fputs ("@PLT", file);
12987 break;
12988
12989 case LABEL_REF:
12990 x = XEXP (x, 0);
12991 /* FALLTHRU */
12992 case CODE_LABEL:
12993 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
12994 assemble_name (asm_out_file, buf);
12995 break;
12996
12997 case CONST_INT:
12998 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
12999 break;
13000
13001 case CONST:
13002 /* This used to output parentheses around the expression,
13003 but that does not work on the 386 (either ATT or BSD assembler). */
13004 output_pic_addr_const (file, XEXP (x, 0), code);
13005 break;
13006
13007 case CONST_DOUBLE:
13008 if (GET_MODE (x) == VOIDmode)
13009 {
13010 /* We can use %d if the number is <32 bits and positive. */
13011 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13012 fprintf (file, "0x%lx%08lx",
13013 (unsigned long) CONST_DOUBLE_HIGH (x),
13014 (unsigned long) CONST_DOUBLE_LOW (x));
13015 else
13016 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13017 }
13018 else
13019 /* We can't handle floating point constants;
13020 TARGET_PRINT_OPERAND must handle them. */
13021 output_operand_lossage ("floating constant misused");
13022 break;
13023
13024 case PLUS:
13025 /* Some assemblers need integer constants to appear first. */
13026 if (CONST_INT_P (XEXP (x, 0)))
13027 {
13028 output_pic_addr_const (file, XEXP (x, 0), code);
13029 putc ('+', file);
13030 output_pic_addr_const (file, XEXP (x, 1), code);
13031 }
13032 else
13033 {
13034 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13035 output_pic_addr_const (file, XEXP (x, 1), code);
13036 putc ('+', file);
13037 output_pic_addr_const (file, XEXP (x, 0), code);
13038 }
13039 break;
13040
13041 case MINUS:
13042 if (!TARGET_MACHO)
13043 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13044 output_pic_addr_const (file, XEXP (x, 0), code);
13045 putc ('-', file);
13046 output_pic_addr_const (file, XEXP (x, 1), code);
13047 if (!TARGET_MACHO)
13048 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13049 break;
13050
13051 case UNSPEC:
13052 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13053 {
13054 bool f = i386_asm_output_addr_const_extra (file, x);
13055 gcc_assert (f);
13056 break;
13057 }
13058
13059 gcc_assert (XVECLEN (x, 0) == 1);
13060 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13061 switch (XINT (x, 1))
13062 {
13063 case UNSPEC_GOT:
13064 fputs ("@GOT", file);
13065 break;
13066 case UNSPEC_GOTOFF:
13067 fputs ("@GOTOFF", file);
13068 break;
13069 case UNSPEC_PLTOFF:
13070 fputs ("@PLTOFF", file);
13071 break;
13072 case UNSPEC_PCREL:
13073 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13074 "(%rip)" : "[rip]", file);
13075 break;
13076 case UNSPEC_GOTPCREL:
13077 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13078 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13079 break;
13080 case UNSPEC_GOTTPOFF:
13081 /* FIXME: This might be @TPOFF in Sun ld too. */
13082 fputs ("@gottpoff", file);
13083 break;
13084 case UNSPEC_TPOFF:
13085 fputs ("@tpoff", file);
13086 break;
13087 case UNSPEC_NTPOFF:
13088 if (TARGET_64BIT)
13089 fputs ("@tpoff", file);
13090 else
13091 fputs ("@ntpoff", file);
13092 break;
13093 case UNSPEC_DTPOFF:
13094 fputs ("@dtpoff", file);
13095 break;
13096 case UNSPEC_GOTNTPOFF:
13097 if (TARGET_64BIT)
13098 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13099 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13100 else
13101 fputs ("@gotntpoff", file);
13102 break;
13103 case UNSPEC_INDNTPOFF:
13104 fputs ("@indntpoff", file);
13105 break;
13106 #if TARGET_MACHO
13107 case UNSPEC_MACHOPIC_OFFSET:
13108 putc ('-', file);
13109 machopic_output_function_base_name (file);
13110 break;
13111 #endif
13112 default:
13113 output_operand_lossage ("invalid UNSPEC as operand");
13114 break;
13115 }
13116 break;
13117
13118 default:
13119 output_operand_lossage ("invalid expression as operand");
13120 }
13121 }
13122
13123 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13124 We need to emit DTP-relative relocations. */
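/* For a 4-byte request this emits roughly

	.long	x@dtpoff

   and for an 8-byte request

	.long	x@dtpoff, 0

   with the exact directive coming from ASM_LONG.  */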
13125
13126 static void ATTRIBUTE_UNUSED
13127 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13128 {
13129 fputs (ASM_LONG, file);
13130 output_addr_const (file, x);
13131 fputs ("@dtpoff", file);
13132 switch (size)
13133 {
13134 case 4:
13135 break;
13136 case 8:
13137 fputs (", 0", file);
13138 break;
13139 default:
13140 gcc_unreachable ();
13141 }
13142 }
13143
13144 /* Return true if X is a representation of the PIC register. This copes
13145 with calls from ix86_find_base_term, where the register might have
13146 been replaced by a cselib value. */
13147
13148 static bool
13149 ix86_pic_register_p (rtx x)
13150 {
13151 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13152 return (pic_offset_table_rtx
13153 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13154 else
13155 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13156 }
13157
13158 /* Helper function for ix86_delegitimize_address.
13159 Attempt to delegitimize TLS local-exec accesses. */
13160
13161 static rtx
13162 ix86_delegitimize_tls_address (rtx orig_x)
13163 {
13164 rtx x = orig_x, unspec;
13165 struct ix86_address addr;
13166
13167 if (!TARGET_TLS_DIRECT_SEG_REFS)
13168 return orig_x;
13169 if (MEM_P (x))
13170 x = XEXP (x, 0);
13171 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13172 return orig_x;
13173 if (ix86_decompose_address (x, &addr) == 0
13174 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13175 || addr.disp == NULL_RTX
13176 || GET_CODE (addr.disp) != CONST)
13177 return orig_x;
13178 unspec = XEXP (addr.disp, 0);
13179 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13180 unspec = XEXP (unspec, 0);
13181 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13182 return orig_x;
13183 x = XVECEXP (unspec, 0, 0);
13184 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13185 if (unspec != XEXP (addr.disp, 0))
13186 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13187 if (addr.index)
13188 {
13189 rtx idx = addr.index;
13190 if (addr.scale != 1)
13191 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13192 x = gen_rtx_PLUS (Pmode, idx, x);
13193 }
13194 if (addr.base)
13195 x = gen_rtx_PLUS (Pmode, addr.base, x);
13196 if (MEM_P (orig_x))
13197 x = replace_equiv_address_nv (orig_x, x);
13198 return x;
13199 }
13200
13201 /* In the name of slightly smaller debug output, and to cater to
13202 general assembler lossage, recognize PIC+GOTOFF and turn it back
13203 into a direct symbol reference.
13204
13205 On Darwin, this is necessary to avoid a crash, because Darwin
13206 has a different PIC label for each routine but the DWARF debugging
13207 information is not associated with any particular routine, so it's
13208 necessary to remove references to the PIC label from RTL stored by
13209 the DWARF output code. */
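/* A sketch of the common case: an address of the form

	(plus (reg ebx) (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)))

   where the register is the PIC register is turned back into plain
   (symbol_ref "foo"), with any constant or register addend re-added
   on top of the result.  */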
13210
13211 static rtx
13212 ix86_delegitimize_address (rtx x)
13213 {
13214 rtx orig_x = delegitimize_mem_from_attrs (x);
13215 /* addend is NULL or some rtx if x is something+GOTOFF where
13216 something doesn't include the PIC register. */
13217 rtx addend = NULL_RTX;
13218 /* reg_addend is NULL or a multiple of some register. */
13219 rtx reg_addend = NULL_RTX;
13220 /* const_addend is NULL or a const_int. */
13221 rtx const_addend = NULL_RTX;
13222 /* This is the result, or NULL. */
13223 rtx result = NULL_RTX;
13224
13225 x = orig_x;
13226
13227 if (MEM_P (x))
13228 x = XEXP (x, 0);
13229
13230 if (TARGET_64BIT)
13231 {
13232 if (GET_CODE (x) != CONST
13233 || GET_CODE (XEXP (x, 0)) != UNSPEC
13234 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13235 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13236 || !MEM_P (orig_x))
13237 return ix86_delegitimize_tls_address (orig_x);
13238 x = XVECEXP (XEXP (x, 0), 0, 0);
13239 if (GET_MODE (orig_x) != GET_MODE (x))
13240 {
13241 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13242 GET_MODE (x), 0);
13243 if (x == NULL_RTX)
13244 return orig_x;
13245 }
13246 return x;
13247 }
13248
13249 if (GET_CODE (x) != PLUS
13250 || GET_CODE (XEXP (x, 1)) != CONST)
13251 return ix86_delegitimize_tls_address (orig_x);
13252
13253 if (ix86_pic_register_p (XEXP (x, 0)))
13254 /* %ebx + GOT/GOTOFF */
13255 ;
13256 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13257 {
13258 /* %ebx + %reg * scale + GOT/GOTOFF */
13259 reg_addend = XEXP (x, 0);
13260 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13261 reg_addend = XEXP (reg_addend, 1);
13262 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13263 reg_addend = XEXP (reg_addend, 0);
13264 else
13265 {
13266 reg_addend = NULL_RTX;
13267 addend = XEXP (x, 0);
13268 }
13269 }
13270 else
13271 addend = XEXP (x, 0);
13272
13273 x = XEXP (XEXP (x, 1), 0);
13274 if (GET_CODE (x) == PLUS
13275 && CONST_INT_P (XEXP (x, 1)))
13276 {
13277 const_addend = XEXP (x, 1);
13278 x = XEXP (x, 0);
13279 }
13280
13281 if (GET_CODE (x) == UNSPEC
13282 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13283 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13284 result = XVECEXP (x, 0, 0);
13285
13286 if (TARGET_MACHO && darwin_local_data_pic (x)
13287 && !MEM_P (orig_x))
13288 result = XVECEXP (x, 0, 0);
13289
13290 if (! result)
13291 return ix86_delegitimize_tls_address (orig_x);
13292
13293 if (const_addend)
13294 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13295 if (reg_addend)
13296 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13297 if (addend)
13298 {
13299 /* If the rest of original X doesn't involve the PIC register, add
13300 addend and subtract pic_offset_table_rtx. This can happen e.g.
13301 for code like:
13302 leal (%ebx, %ecx, 4), %ecx
13303 ...
13304 movl foo@GOTOFF(%ecx), %edx
13305 in which case we return (%ecx - %ebx) + foo. */
13306 if (pic_offset_table_rtx)
13307 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13308 pic_offset_table_rtx),
13309 result);
13310 else
13311 return orig_x;
13312 }
13313 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13314 {
13315 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13316 if (result == NULL_RTX)
13317 return orig_x;
13318 }
13319 return result;
13320 }
13321
13322 /* If X is a machine specific address (i.e. a symbol or label being
13323 referenced as a displacement from the GOT implemented using an
13324 UNSPEC), then return the base term. Otherwise return X. */
13325
13326 rtx
13327 ix86_find_base_term (rtx x)
13328 {
13329 rtx term;
13330
13331 if (TARGET_64BIT)
13332 {
13333 if (GET_CODE (x) != CONST)
13334 return x;
13335 term = XEXP (x, 0);
13336 if (GET_CODE (term) == PLUS
13337 && (CONST_INT_P (XEXP (term, 1))
13338 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13339 term = XEXP (term, 0);
13340 if (GET_CODE (term) != UNSPEC
13341 || (XINT (term, 1) != UNSPEC_GOTPCREL
13342 && XINT (term, 1) != UNSPEC_PCREL))
13343 return x;
13344
13345 return XVECEXP (term, 0, 0);
13346 }
13347
13348 return ix86_delegitimize_address (x);
13349 }
13350 \f
13351 static void
13352 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13353 int fp, FILE *file)
13354 {
13355 const char *suffix;
13356
13357 if (mode == CCFPmode || mode == CCFPUmode)
13358 {
13359 code = ix86_fp_compare_code_to_integer (code);
13360 mode = CCmode;
13361 }
13362 if (reverse)
13363 code = reverse_condition (code);
13364
13365 switch (code)
13366 {
13367 case EQ:
13368 switch (mode)
13369 {
13370 case CCAmode:
13371 suffix = "a";
13372 break;
13373
13374 case CCCmode:
13375 suffix = "c";
13376 break;
13377
13378 case CCOmode:
13379 suffix = "o";
13380 break;
13381
13382 case CCSmode:
13383 suffix = "s";
13384 break;
13385
13386 default:
13387 suffix = "e";
13388 }
13389 break;
13390 case NE:
13391 switch (mode)
13392 {
13393 case CCAmode:
13394 suffix = "na";
13395 break;
13396
13397 case CCCmode:
13398 suffix = "nc";
13399 break;
13400
13401 case CCOmode:
13402 suffix = "no";
13403 break;
13404
13405 case CCSmode:
13406 suffix = "ns";
13407 break;
13408
13409 default:
13410 suffix = "ne";
13411 }
13412 break;
13413 case GT:
13414 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13415 suffix = "g";
13416 break;
13417 case GTU:
13418 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13419 Those same assemblers have the same but opposite lossage on cmov. */
13420 if (mode == CCmode)
13421 suffix = fp ? "nbe" : "a";
13422 else if (mode == CCCmode)
13423 suffix = "b";
13424 else
13425 gcc_unreachable ();
13426 break;
13427 case LT:
13428 switch (mode)
13429 {
13430 case CCNOmode:
13431 case CCGOCmode:
13432 suffix = "s";
13433 break;
13434
13435 case CCmode:
13436 case CCGCmode:
13437 suffix = "l";
13438 break;
13439
13440 default:
13441 gcc_unreachable ();
13442 }
13443 break;
13444 case LTU:
13445 gcc_assert (mode == CCmode || mode == CCCmode);
13446 suffix = "b";
13447 break;
13448 case GE:
13449 switch (mode)
13450 {
13451 case CCNOmode:
13452 case CCGOCmode:
13453 suffix = "ns";
13454 break;
13455
13456 case CCmode:
13457 case CCGCmode:
13458 suffix = "ge";
13459 break;
13460
13461 default:
13462 gcc_unreachable ();
13463 }
13464 break;
13465 case GEU:
13466 /* ??? As above. */
13467 gcc_assert (mode == CCmode || mode == CCCmode);
13468 suffix = fp ? "nb" : "ae";
13469 break;
13470 case LE:
13471 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13472 suffix = "le";
13473 break;
13474 case LEU:
13475 /* ??? As above. */
13476 if (mode == CCmode)
13477 suffix = "be";
13478 else if (mode == CCCmode)
13479 suffix = fp ? "nb" : "ae";
13480 else
13481 gcc_unreachable ();
13482 break;
13483 case UNORDERED:
13484 suffix = fp ? "u" : "p";
13485 break;
13486 case ORDERED:
13487 suffix = fp ? "nu" : "np";
13488 break;
13489 default:
13490 gcc_unreachable ();
13491 }
13492 fputs (suffix, file);
13493 }
13494
13495 /* Print the name of register X to FILE based on its machine mode and number.
13496 If CODE is 'w', pretend the mode is HImode.
13497 If CODE is 'b', pretend the mode is QImode.
13498 If CODE is 'k', pretend the mode is SImode.
13499 If CODE is 'q', pretend the mode is DImode.
13500 If CODE is 'x', pretend the mode is V4SFmode.
13501 If CODE is 't', pretend the mode is V8SFmode.
13502 If CODE is 'h', pretend the reg is the 'high' byte register.
13503 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
13504 If CODE is 'd', duplicate the operand for an AVX instruction.
13505 */
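/* For example, if x is (reg:SI 0) (i.e. %eax in AT&T syntax), code 'b'
   prints "%al", 'w' prints "%ax", 'k' prints "%eax" and 'q' prints
   "%rax" (the latter only in 64-bit mode).  */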
13506
13507 void
13508 print_reg (rtx x, int code, FILE *file)
13509 {
13510 const char *reg;
13511 bool duplicated = code == 'd' && TARGET_AVX;
13512
13513 gcc_assert (x == pc_rtx
13514 || (REGNO (x) != ARG_POINTER_REGNUM
13515 && REGNO (x) != FRAME_POINTER_REGNUM
13516 && REGNO (x) != FLAGS_REG
13517 && REGNO (x) != FPSR_REG
13518 && REGNO (x) != FPCR_REG));
13519
13520 if (ASSEMBLER_DIALECT == ASM_ATT)
13521 putc ('%', file);
13522
13523 if (x == pc_rtx)
13524 {
13525 gcc_assert (TARGET_64BIT);
13526 fputs ("rip", file);
13527 return;
13528 }
13529
13530 if (code == 'w' || MMX_REG_P (x))
13531 code = 2;
13532 else if (code == 'b')
13533 code = 1;
13534 else if (code == 'k')
13535 code = 4;
13536 else if (code == 'q')
13537 code = 8;
13538 else if (code == 'y')
13539 code = 3;
13540 else if (code == 'h')
13541 code = 0;
13542 else if (code == 'x')
13543 code = 16;
13544 else if (code == 't')
13545 code = 32;
13546 else
13547 code = GET_MODE_SIZE (GET_MODE (x));
13548
13549 /* Irritatingly, AMD extended registers use a different naming
13550 convention from the normal registers: "r%d[bwd]" */
13551 if (REX_INT_REG_P (x))
13552 {
13553 gcc_assert (TARGET_64BIT);
13554 putc ('r', file);
13555 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
13556 switch (code)
13557 {
13558 case 0:
13559 error ("extended registers have no high halves");
13560 break;
13561 case 1:
13562 putc ('b', file);
13563 break;
13564 case 2:
13565 putc ('w', file);
13566 break;
13567 case 4:
13568 putc ('d', file);
13569 break;
13570 case 8:
13571 /* no suffix */
13572 break;
13573 default:
13574 error ("unsupported operand size for extended register");
13575 break;
13576 }
13577 return;
13578 }
13579
13580 reg = NULL;
13581 switch (code)
13582 {
13583 case 3:
13584 if (STACK_TOP_P (x))
13585 {
13586 reg = "st(0)";
13587 break;
13588 }
13589 /* FALLTHRU */
13590 case 8:
13591 case 4:
13592 case 12:
13593 if (! ANY_FP_REG_P (x))
13594 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13595 /* FALLTHRU */
13596 case 16:
13597 case 2:
13598 normal:
13599 reg = hi_reg_name[REGNO (x)];
13600 break;
13601 case 1:
13602 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13603 goto normal;
13604 reg = qi_reg_name[REGNO (x)];
13605 break;
13606 case 0:
13607 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13608 goto normal;
13609 reg = qi_high_reg_name[REGNO (x)];
13610 break;
13611 case 32:
13612 if (SSE_REG_P (x))
13613 {
13614 gcc_assert (!duplicated);
13615 putc ('y', file);
13616 fputs (hi_reg_name[REGNO (x)] + 1, file);
13617 return;
13618 }
13619 break;
13620 default:
13621 gcc_unreachable ();
13622 }
13623
13624 fputs (reg, file);
13625 if (duplicated)
13626 {
13627 if (ASSEMBLER_DIALECT == ASM_ATT)
13628 fprintf (file, ", %%%s", reg);
13629 else
13630 fprintf (file, ", %s", reg);
13631 }
13632 }
13633
13634 /* Locate some local-dynamic symbol still in use by this function
13635 so that we can print its name in some tls_local_dynamic_base
13636 pattern. */
13637
13638 static int
13639 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13640 {
13641 rtx x = *px;
13642
13643 if (GET_CODE (x) == SYMBOL_REF
13644 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13645 {
13646 cfun->machine->some_ld_name = XSTR (x, 0);
13647 return 1;
13648 }
13649
13650 return 0;
13651 }
13652
13653 static const char *
13654 get_some_local_dynamic_name (void)
13655 {
13656 rtx insn;
13657
13658 if (cfun->machine->some_ld_name)
13659 return cfun->machine->some_ld_name;
13660
13661 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13662 if (NONDEBUG_INSN_P (insn)
13663 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13664 return cfun->machine->some_ld_name;
13665
13666 return NULL;
13667 }
13668
13669 /* Meaning of CODE:
13670 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13671 C -- print opcode suffix for set/cmov insn.
13672 c -- like C, but print reversed condition
13673 F,f -- likewise, but for floating-point.
13674 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13675 otherwise nothing
13676 R -- print the prefix for register names.
13677 z -- print the opcode suffix for the size of the current operand.
13678 Z -- likewise, with special suffixes for x87 instructions.
13679 * -- print a star (in certain assembler syntax)
13680 A -- print an absolute memory reference.
13681 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13682 s -- print a shift double count, followed by the assembler's argument
13683 delimiter.
13684 b -- print the QImode name of the register for the indicated operand.
13685 %b0 would print %al if operands[0] is reg 0.
13686 w -- likewise, print the HImode name of the register.
13687 k -- likewise, print the SImode name of the register.
13688 q -- likewise, print the DImode name of the register.
13689 x -- likewise, print the V4SFmode name of the register.
13690 t -- likewise, print the V8SFmode name of the register.
13691 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13692 y -- print "st(0)" instead of "st" as a register.
13693 d -- print duplicated register operand for AVX instruction.
13694 D -- print condition for SSE cmp instruction.
13695 P -- if PIC, print an @PLT suffix.
13696 p -- print raw symbol name.
13697 X -- don't print any sort of PIC '@' suffix for a symbol.
13698 & -- print some in-use local-dynamic symbol name.
13699 H -- print a memory address offset by 8; used for sse high-parts
13700 Y -- print condition for XOP pcom* instruction.
13701 + -- print a branch hint as 'cs' or 'ds' prefix
13702 ; -- print a semicolon (after prefixes due to bug in older gas).
13703 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13704 @ -- print the segment register of a thread base pointer load
13705 */
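/* For example, in a hypothetical output template

	"mov%z0\t{%1, %0|%0, %1}"

   %z0 expands to 'b', 'w', 'l' or 'q' according to the size of
   operand 0, and "%b1" would print the QImode name of operand 1
   (e.g. "%al").  See i386.md for the real templates.  */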
13706
13707 void
13708 ix86_print_operand (FILE *file, rtx x, int code)
13709 {
13710 if (code)
13711 {
13712 switch (code)
13713 {
13714 case '*':
13715 if (ASSEMBLER_DIALECT == ASM_ATT)
13716 putc ('*', file);
13717 return;
13718
13719 case '&':
13720 {
13721 const char *name = get_some_local_dynamic_name ();
13722 if (name == NULL)
13723 output_operand_lossage ("'%%&' used without any "
13724 "local dynamic TLS references");
13725 else
13726 assemble_name (file, name);
13727 return;
13728 }
13729
13730 case 'A':
13731 switch (ASSEMBLER_DIALECT)
13732 {
13733 case ASM_ATT:
13734 putc ('*', file);
13735 break;
13736
13737 case ASM_INTEL:
13738 /* Intel syntax. For absolute addresses, registers should not
13739 be surrounded by brackets. */
13740 if (!REG_P (x))
13741 {
13742 putc ('[', file);
13743 ix86_print_operand (file, x, 0);
13744 putc (']', file);
13745 return;
13746 }
13747 break;
13748
13749 default:
13750 gcc_unreachable ();
13751 }
13752
13753 ix86_print_operand (file, x, 0);
13754 return;
13755
13756
13757 case 'L':
13758 if (ASSEMBLER_DIALECT == ASM_ATT)
13759 putc ('l', file);
13760 return;
13761
13762 case 'W':
13763 if (ASSEMBLER_DIALECT == ASM_ATT)
13764 putc ('w', file);
13765 return;
13766
13767 case 'B':
13768 if (ASSEMBLER_DIALECT == ASM_ATT)
13769 putc ('b', file);
13770 return;
13771
13772 case 'Q':
13773 if (ASSEMBLER_DIALECT == ASM_ATT)
13774 putc ('l', file);
13775 return;
13776
13777 case 'S':
13778 if (ASSEMBLER_DIALECT == ASM_ATT)
13779 putc ('s', file);
13780 return;
13781
13782 case 'T':
13783 if (ASSEMBLER_DIALECT == ASM_ATT)
13784 putc ('t', file);
13785 return;
13786
13787 case 'z':
13788 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13789 {
13790 /* Opcodes don't get size suffixes when using Intel syntax. */
13791 if (ASSEMBLER_DIALECT == ASM_INTEL)
13792 return;
13793
13794 switch (GET_MODE_SIZE (GET_MODE (x)))
13795 {
13796 case 1:
13797 putc ('b', file);
13798 return;
13799
13800 case 2:
13801 putc ('w', file);
13802 return;
13803
13804 case 4:
13805 putc ('l', file);
13806 return;
13807
13808 case 8:
13809 putc ('q', file);
13810 return;
13811
13812 default:
13813 output_operand_lossage
13814 ("invalid operand size for operand code '%c'", code);
13815 return;
13816 }
13817 }
13818
13819 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13820 warning
13821 (0, "non-integer operand used with operand code '%c'", code);
13822 /* FALLTHRU */
13823
13824 case 'Z':
13825 /* 387 opcodes don't get size suffixes when using Intel syntax. */
13826 if (ASSEMBLER_DIALECT == ASM_INTEL)
13827 return;
13828
13829 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13830 {
13831 switch (GET_MODE_SIZE (GET_MODE (x)))
13832 {
13833 case 2:
13834 #ifdef HAVE_AS_IX86_FILDS
13835 putc ('s', file);
13836 #endif
13837 return;
13838
13839 case 4:
13840 putc ('l', file);
13841 return;
13842
13843 case 8:
13844 #ifdef HAVE_AS_IX86_FILDQ
13845 putc ('q', file);
13846 #else
13847 fputs ("ll", file);
13848 #endif
13849 return;
13850
13851 default:
13852 break;
13853 }
13854 }
13855 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13856 {
13857 /* 387 opcodes don't get size suffixes
13858 if the operands are registers. */
13859 if (STACK_REG_P (x))
13860 return;
13861
13862 switch (GET_MODE_SIZE (GET_MODE (x)))
13863 {
13864 case 4:
13865 putc ('s', file);
13866 return;
13867
13868 case 8:
13869 putc ('l', file);
13870 return;
13871
13872 case 12:
13873 case 16:
13874 putc ('t', file);
13875 return;
13876
13877 default:
13878 break;
13879 }
13880 }
13881 else
13882 {
13883 output_operand_lossage
13884 ("invalid operand type used with operand code '%c'", code);
13885 return;
13886 }
13887
13888 output_operand_lossage
13889 ("invalid operand size for operand code '%c'", code);
13890 return;
13891
13892 case 'd':
13893 case 'b':
13894 case 'w':
13895 case 'k':
13896 case 'q':
13897 case 'h':
13898 case 't':
13899 case 'y':
13900 case 'x':
13901 case 'X':
13902 case 'P':
13903 case 'p':
13904 break;
13905
13906 case 's':
13907 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
13908 {
13909 ix86_print_operand (file, x, 0);
13910 fputs (", ", file);
13911 }
13912 return;
13913
13914 case 'D':
13915 /* Little bit of braindamage here. The SSE compare instructions
13916 use completely different names for the comparisons than the
13917 fp conditional moves do. */
13918 if (TARGET_AVX)
13919 {
13920 switch (GET_CODE (x))
13921 {
13922 case EQ:
13923 fputs ("eq", file);
13924 break;
13925 case UNEQ:
13926 fputs ("eq_us", file);
13927 break;
13928 case LT:
13929 fputs ("lt", file);
13930 break;
13931 case UNLT:
13932 fputs ("nge", file);
13933 break;
13934 case LE:
13935 fputs ("le", file);
13936 break;
13937 case UNLE:
13938 fputs ("ngt", file);
13939 break;
13940 case UNORDERED:
13941 fputs ("unord", file);
13942 break;
13943 case NE:
13944 fputs ("neq", file);
13945 break;
13946 case LTGT:
13947 fputs ("neq_oq", file);
13948 break;
13949 case GE:
13950 fputs ("ge", file);
13951 break;
13952 case UNGE:
13953 fputs ("nlt", file);
13954 break;
13955 case GT:
13956 fputs ("gt", file);
13957 break;
13958 case UNGT:
13959 fputs ("nle", file);
13960 break;
13961 case ORDERED:
13962 fputs ("ord", file);
13963 break;
13964 default:
13965 output_operand_lossage ("operand is not a condition code, "
13966 "invalid operand code 'D'");
13967 return;
13968 }
13969 }
13970 else
13971 {
13972 switch (GET_CODE (x))
13973 {
13974 case EQ:
13975 case UNEQ:
13976 fputs ("eq", file);
13977 break;
13978 case LT:
13979 case UNLT:
13980 fputs ("lt", file);
13981 break;
13982 case LE:
13983 case UNLE:
13984 fputs ("le", file);
13985 break;
13986 case UNORDERED:
13987 fputs ("unord", file);
13988 break;
13989 case NE:
13990 case LTGT:
13991 fputs ("neq", file);
13992 break;
13993 case UNGE:
13994 case GE:
13995 fputs ("nlt", file);
13996 break;
13997 case UNGT:
13998 case GT:
13999 fputs ("nle", file);
14000 break;
14001 case ORDERED:
14002 fputs ("ord", file);
14003 break;
14004 default:
14005 output_operand_lossage ("operand is not a condition code, "
14006 "invalid operand code 'D'");
14007 return;
14008 }
14009 }
14010 return;
14011 case 'O':
14012 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14013 if (ASSEMBLER_DIALECT == ASM_ATT)
14014 {
14015 switch (GET_MODE (x))
14016 {
14017 case HImode: putc ('w', file); break;
14018 case SImode:
14019 case SFmode: putc ('l', file); break;
14020 case DImode:
14021 case DFmode: putc ('q', file); break;
14022 default: gcc_unreachable ();
14023 }
14024 putc ('.', file);
14025 }
14026 #endif
14027 return;
14028 case 'C':
14029 if (!COMPARISON_P (x))
14030 {
14031 output_operand_lossage ("operand is neither a constant nor a "
14032 "condition code, invalid operand code "
14033 "'C'");
14034 return;
14035 }
14036 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
14037 return;
14038 case 'F':
14039 if (!COMPARISON_P (x))
14040 {
14041 output_operand_lossage ("operand is neither a constant nor a "
14042 "condition code, invalid operand code "
14043 "'F'");
14044 return;
14045 }
14046 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14047 if (ASSEMBLER_DIALECT == ASM_ATT)
14048 putc ('.', file);
14049 #endif
14050 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
14051 return;
14052
14053 /* Like above, but reverse condition */
14054 case 'c':
14055 /* Check to see if argument to %c is really a constant
14056 and not a condition code which needs to be reversed. */
14057 if (!COMPARISON_P (x))
14058 {
14059 output_operand_lossage ("operand is neither a constant nor a "
14060 "condition code, invalid operand "
14061 "code 'c'");
14062 return;
14063 }
14064 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
14065 return;
14066 case 'f':
14067 if (!COMPARISON_P (x))
14068 {
14069 output_operand_lossage ("operand is neither a constant nor a "
14070 "condition code, invalid operand "
14071 "code 'f'");
14072 return;
14073 }
14074 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14075 if (ASSEMBLER_DIALECT == ASM_ATT)
14076 putc ('.', file);
14077 #endif
14078 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
14079 return;
14080
14081 case 'H':
14082 /* It doesn't actually matter what mode we use here, as we're
14083 only going to use this for printing. */
14084 x = adjust_address_nv (x, DImode, 8);
14085 break;
14086
14087 case '+':
14088 {
14089 rtx x;
14090
14091 if (!optimize
14092 || optimize_function_for_size_p (cfun) || !TARGET_BRANCH_PREDICTION_HINTS)
14093 return;
14094
14095 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14096 if (x)
14097 {
14098 int pred_val = INTVAL (XEXP (x, 0));
14099
14100 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14101 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14102 {
14103 int taken = pred_val > REG_BR_PROB_BASE / 2;
14104 int cputaken = final_forward_branch_p (current_output_insn) == 0;
14105
14106 /* Emit hints only when the default branch prediction
14107 heuristics would fail. */
14108 if (taken != cputaken)
14109 {
14110 /* We use 3e (DS) prefix for taken branches and
14111 2e (CS) prefix for not taken branches. */
14112 if (taken)
14113 fputs ("ds ; ", file);
14114 else
14115 fputs ("cs ; ", file);
14116 }
14117 }
14118 }
14119 return;
14120 }
14121
14122 case 'Y':
14123 switch (GET_CODE (x))
14124 {
14125 case NE:
14126 fputs ("neq", file);
14127 break;
14128 case EQ:
14129 fputs ("eq", file);
14130 break;
14131 case GE:
14132 case GEU:
14133 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14134 break;
14135 case GT:
14136 case GTU:
14137 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14138 break;
14139 case LE:
14140 case LEU:
14141 fputs ("le", file);
14142 break;
14143 case LT:
14144 case LTU:
14145 fputs ("lt", file);
14146 break;
14147 case UNORDERED:
14148 fputs ("unord", file);
14149 break;
14150 case ORDERED:
14151 fputs ("ord", file);
14152 break;
14153 case UNEQ:
14154 fputs ("ueq", file);
14155 break;
14156 case UNGE:
14157 fputs ("nlt", file);
14158 break;
14159 case UNGT:
14160 fputs ("nle", file);
14161 break;
14162 case UNLE:
14163 fputs ("ule", file);
14164 break;
14165 case UNLT:
14166 fputs ("ult", file);
14167 break;
14168 case LTGT:
14169 fputs ("une", file);
14170 break;
14171 default:
14172 output_operand_lossage ("operand is not a condition code, "
14173 "invalid operand code 'Y'");
14174 return;
14175 }
14176 return;
14177
14178 case ';':
14179 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14180 putc (';', file);
14181 #endif
14182 return;
14183
14184 case '@':
14185 if (ASSEMBLER_DIALECT == ASM_ATT)
14186 putc ('%', file);
14187
14188 /* The kernel uses a different segment register for performance
14189 reasons; this way a system call does not have to trash the
14190 userspace segment register, which would be expensive. */
14191 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14192 fputs ("fs", file);
14193 else
14194 fputs ("gs", file);
14195 return;
14196
14197 case '~':
14198 putc (TARGET_AVX2 ? 'i' : 'f', file);
14199 return;
14200
14201 default:
14202 output_operand_lossage ("invalid operand code '%c'", code);
14203 }
14204 }
14205
14206 if (REG_P (x))
14207 print_reg (x, code, file);
14208
14209 else if (MEM_P (x))
14210 {
14211 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14212 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14213 && GET_MODE (x) != BLKmode)
14214 {
14215 const char * size;
14216 switch (GET_MODE_SIZE (GET_MODE (x)))
14217 {
14218 case 1: size = "BYTE"; break;
14219 case 2: size = "WORD"; break;
14220 case 4: size = "DWORD"; break;
14221 case 8: size = "QWORD"; break;
14222 case 12: size = "TBYTE"; break;
14223 case 16:
14224 if (GET_MODE (x) == XFmode)
14225 size = "TBYTE";
14226 else
14227 size = "XMMWORD";
14228 break;
14229 case 32: size = "YMMWORD"; break;
14230 default:
14231 gcc_unreachable ();
14232 }
14233
14234 /* Check for explicit size override (codes 'b', 'w', 'k',
14235 'q' and 'x') */
14236 if (code == 'b')
14237 size = "BYTE";
14238 else if (code == 'w')
14239 size = "WORD";
14240 else if (code == 'k')
14241 size = "DWORD";
14242 else if (code == 'q')
14243 size = "QWORD";
14244 else if (code == 'x')
14245 size = "XMMWORD";
14246
14247 fputs (size, file);
14248 fputs (" PTR ", file);
14249 }
14250
14251 x = XEXP (x, 0);
14252 /* Avoid (%rip) for call operands. */
14253 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14254 && !CONST_INT_P (x))
14255 output_addr_const (file, x);
14256 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14257 output_operand_lossage ("invalid constraints for operand");
14258 else
14259 output_address (x);
14260 }
14261
14262 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14263 {
14264 REAL_VALUE_TYPE r;
14265 long l;
14266
14267 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14268 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14269
14270 if (ASSEMBLER_DIALECT == ASM_ATT)
14271 putc ('$', file);
14272 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14273 if (code == 'q')
14274 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14275 else
14276 fprintf (file, "0x%08x", (unsigned int) l);
14277 }
14278
14279 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14280 {
14281 REAL_VALUE_TYPE r;
14282 long l[2];
14283
14284 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14285 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14286
14287 if (ASSEMBLER_DIALECT == ASM_ATT)
14288 putc ('$', file);
14289 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14290 }
14291
14292 /* These float cases don't actually occur as immediate operands. */
14293 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14294 {
14295 char dstr[30];
14296
14297 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14298 fputs (dstr, file);
14299 }
14300
14301 else
14302 {
14303 /* We have patterns that allow zero sets of memory, for instance.
14304 In 64-bit mode, we should probably support all 8-byte vectors,
14305 since we can in fact encode that into an immediate. */
14306 if (GET_CODE (x) == CONST_VECTOR)
14307 {
14308 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14309 x = const0_rtx;
14310 }
14311
14312 if (code != 'P' && code != 'p')
14313 {
14314 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14315 {
14316 if (ASSEMBLER_DIALECT == ASM_ATT)
14317 putc ('$', file);
14318 }
14319 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14320 || GET_CODE (x) == LABEL_REF)
14321 {
14322 if (ASSEMBLER_DIALECT == ASM_ATT)
14323 putc ('$', file);
14324 else
14325 fputs ("OFFSET FLAT:", file);
14326 }
14327 }
14328 if (CONST_INT_P (x))
14329 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14330 else if (flag_pic || MACHOPIC_INDIRECT)
14331 output_pic_addr_const (file, x, code);
14332 else
14333 output_addr_const (file, x);
14334 }
14335 }
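
/* Illustrative sketch (editorial addition, not part of the original sources):
   for a 4-byte memory operand printed with no override in Intel syntax, the
   code above emits "DWORD PTR [...]", while an explicit 'q' override forces
   "QWORD PTR [...]".  In AT&T syntax no size prefix is printed at all.  */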
14336
14337 static bool
14338 ix86_print_operand_punct_valid_p (unsigned char code)
14339 {
14340 return (code == '@' || code == '*' || code == '+'
14341 || code == '&' || code == ';' || code == '~');
14342 }
14343 \f
14344 /* Print a memory operand whose address is ADDR. */
14345
14346 static void
14347 ix86_print_operand_address (FILE *file, rtx addr)
14348 {
14349 struct ix86_address parts;
14350 rtx base, index, disp;
14351 int scale;
14352 int ok;
14353 bool vsib = false;
14354
14355 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14356 {
14357 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14358 gcc_assert (parts.index == NULL_RTX);
14359 parts.index = XVECEXP (addr, 0, 1);
14360 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14361 addr = XVECEXP (addr, 0, 0);
14362 vsib = true;
14363 }
14364 else
14365 ok = ix86_decompose_address (addr, &parts);
14366
14367 gcc_assert (ok);
14368
14369 if (parts.base && GET_CODE (parts.base) == SUBREG)
14370 {
14371 rtx tmp = SUBREG_REG (parts.base);
14372 parts.base = simplify_subreg (GET_MODE (parts.base),
14373 tmp, GET_MODE (tmp), 0);
14374 }
14375
14376 if (parts.index && GET_CODE (parts.index) == SUBREG)
14377 {
14378 rtx tmp = SUBREG_REG (parts.index);
14379 parts.index = simplify_subreg (GET_MODE (parts.index),
14380 tmp, GET_MODE (tmp), 0);
14381 }
14382
14383 base = parts.base;
14384 index = parts.index;
14385 disp = parts.disp;
14386 scale = parts.scale;
14387
14388 switch (parts.seg)
14389 {
14390 case SEG_DEFAULT:
14391 break;
14392 case SEG_FS:
14393 case SEG_GS:
14394 if (ASSEMBLER_DIALECT == ASM_ATT)
14395 putc ('%', file);
14396 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14397 break;
14398 default:
14399 gcc_unreachable ();
14400 }
14401
14402 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14403 if (TARGET_64BIT && !base && !index)
14404 {
14405 rtx symbol = disp;
14406
14407 if (GET_CODE (disp) == CONST
14408 && GET_CODE (XEXP (disp, 0)) == PLUS
14409 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14410 symbol = XEXP (XEXP (disp, 0), 0);
14411
14412 if (GET_CODE (symbol) == LABEL_REF
14413 || (GET_CODE (symbol) == SYMBOL_REF
14414 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14415 base = pc_rtx;
14416 }
14417 if (!base && !index)
14418 {
14419 /* A displacement-only address requires special attention.  */
14420
14421 if (CONST_INT_P (disp))
14422 {
14423 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14424 fputs ("ds:", file);
14425 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14426 }
14427 else if (flag_pic)
14428 output_pic_addr_const (file, disp, 0);
14429 else
14430 output_addr_const (file, disp);
14431 }
14432 else
14433 {
14434 int code = 0;
14435
14436 /* Print SImode registers for zero-extended addresses to force
14437 addr32 prefix. Otherwise print DImode registers to avoid it. */
14438 if (TARGET_64BIT)
14439 code = ((GET_CODE (addr) == ZERO_EXTEND
14440 || GET_CODE (addr) == AND)
14441 ? 'l'
14442 : 'q');
14443
14444 if (ASSEMBLER_DIALECT == ASM_ATT)
14445 {
14446 if (disp)
14447 {
14448 if (flag_pic)
14449 output_pic_addr_const (file, disp, 0);
14450 else if (GET_CODE (disp) == LABEL_REF)
14451 output_asm_label (disp);
14452 else
14453 output_addr_const (file, disp);
14454 }
14455
14456 putc ('(', file);
14457 if (base)
14458 print_reg (base, code, file);
14459 if (index)
14460 {
14461 putc (',', file);
14462 print_reg (index, vsib ? 0 : code, file);
14463 if (scale != 1 || vsib)
14464 fprintf (file, ",%d", scale);
14465 }
14466 putc (')', file);
14467 }
14468 else
14469 {
14470 rtx offset = NULL_RTX;
14471
14472 if (disp)
14473 {
14474 /* Pull out the offset of a symbol; print any symbol itself. */
14475 if (GET_CODE (disp) == CONST
14476 && GET_CODE (XEXP (disp, 0)) == PLUS
14477 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14478 {
14479 offset = XEXP (XEXP (disp, 0), 1);
14480 disp = gen_rtx_CONST (VOIDmode,
14481 XEXP (XEXP (disp, 0), 0));
14482 }
14483
14484 if (flag_pic)
14485 output_pic_addr_const (file, disp, 0);
14486 else if (GET_CODE (disp) == LABEL_REF)
14487 output_asm_label (disp);
14488 else if (CONST_INT_P (disp))
14489 offset = disp;
14490 else
14491 output_addr_const (file, disp);
14492 }
14493
14494 putc ('[', file);
14495 if (base)
14496 {
14497 print_reg (base, code, file);
14498 if (offset)
14499 {
14500 if (INTVAL (offset) >= 0)
14501 putc ('+', file);
14502 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14503 }
14504 }
14505 else if (offset)
14506 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14507 else
14508 putc ('0', file);
14509
14510 if (index)
14511 {
14512 putc ('+', file);
14513 print_reg (index, vsib ? 0 : code, file);
14514 if (scale != 1 || vsib)
14515 fprintf (file, "*%d", scale);
14516 }
14517 putc (']', file);
14518 }
14519 }
14520 }
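
/* Illustrative sketch (editorial addition): for an address with base rbx,
   index rcx, scale 4 and displacement 8, the AT&T branch above prints
   "8(%rbx,%rcx,4)" while the Intel branch prints "[rbx+8+rcx*4]".  */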
14521
14522 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14523
14524 static bool
14525 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14526 {
14527 rtx op;
14528
14529 if (GET_CODE (x) != UNSPEC)
14530 return false;
14531
14532 op = XVECEXP (x, 0, 0);
14533 switch (XINT (x, 1))
14534 {
14535 case UNSPEC_GOTTPOFF:
14536 output_addr_const (file, op);
14537 /* FIXME: This might be @TPOFF in Sun ld. */
14538 fputs ("@gottpoff", file);
14539 break;
14540 case UNSPEC_TPOFF:
14541 output_addr_const (file, op);
14542 fputs ("@tpoff", file);
14543 break;
14544 case UNSPEC_NTPOFF:
14545 output_addr_const (file, op);
14546 if (TARGET_64BIT)
14547 fputs ("@tpoff", file);
14548 else
14549 fputs ("@ntpoff", file);
14550 break;
14551 case UNSPEC_DTPOFF:
14552 output_addr_const (file, op);
14553 fputs ("@dtpoff", file);
14554 break;
14555 case UNSPEC_GOTNTPOFF:
14556 output_addr_const (file, op);
14557 if (TARGET_64BIT)
14558 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14559 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14560 else
14561 fputs ("@gotntpoff", file);
14562 break;
14563 case UNSPEC_INDNTPOFF:
14564 output_addr_const (file, op);
14565 fputs ("@indntpoff", file);
14566 break;
14567 #if TARGET_MACHO
14568 case UNSPEC_MACHOPIC_OFFSET:
14569 output_addr_const (file, op);
14570 putc ('-', file);
14571 machopic_output_function_base_name (file);
14572 break;
14573 #endif
14574
14575 case UNSPEC_STACK_CHECK:
14576 {
14577 int offset;
14578
14579 gcc_assert (flag_split_stack);
14580
14581 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14582 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14583 #else
14584 gcc_unreachable ();
14585 #endif
14586
14587 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14588 }
14589 break;
14590
14591 default:
14592 return false;
14593 }
14594
14595 return true;
14596 }
14597 \f
14598 /* Split one or more double-mode RTL references into pairs of half-mode
14599 references. The RTL can be REG, offsettable MEM, integer constant, or
14600 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14601 split and "num" is its length. lo_half and hi_half are output arrays
14602 that parallel "operands". */
14603
14604 void
14605 split_double_mode (enum machine_mode mode, rtx operands[],
14606 int num, rtx lo_half[], rtx hi_half[])
14607 {
14608 enum machine_mode half_mode;
14609 unsigned int byte;
14610
14611 switch (mode)
14612 {
14613 case TImode:
14614 half_mode = DImode;
14615 break;
14616 case DImode:
14617 half_mode = SImode;
14618 break;
14619 default:
14620 gcc_unreachable ();
14621 }
14622
14623 byte = GET_MODE_SIZE (half_mode);
14624
14625 while (num--)
14626 {
14627 rtx op = operands[num];
14628
14629 /* simplify_subreg refuses to split volatile memory addresses,
14630 but we still have to handle them here.  */
14631 if (MEM_P (op))
14632 {
14633 lo_half[num] = adjust_address (op, half_mode, 0);
14634 hi_half[num] = adjust_address (op, half_mode, byte);
14635 }
14636 else
14637 {
14638 lo_half[num] = simplify_gen_subreg (half_mode, op,
14639 GET_MODE (op) == VOIDmode
14640 ? mode : GET_MODE (op), 0);
14641 hi_half[num] = simplify_gen_subreg (half_mode, op,
14642 GET_MODE (op) == VOIDmode
14643 ? mode : GET_MODE (op), byte);
14644 }
14645 }
14646 }
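
/* Illustrative sketch (editorial addition): splitting a single offsettable
   DImode memory operand on a 32-bit target.  With half_mode == SImode and
   byte == 4, the loop above produces

     lo_half[0] = adjust_address (op, SImode, 0);
     hi_half[0] = adjust_address (op, SImode, 4);

   Register and constant operands go through simplify_gen_subreg instead.  */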
14647 \f
14648 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14649 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14650 is the expression of the binary operation. The output may either be
14651 emitted here, or returned to the caller, like all output_* functions.
14652
14653 There is no guarantee that the operands are the same mode, as they
14654 might be within FLOAT or FLOAT_EXTEND expressions. */
14655
14656 #ifndef SYSV386_COMPAT
14657 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14658 wants to fix the assemblers because that causes incompatibility
14659 with gcc. No-one wants to fix gcc because that causes
14660 incompatibility with assemblers... You can use the option of
14661 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14662 #define SYSV386_COMPAT 1
14663 #endif
14664
14665 const char *
14666 output_387_binary_op (rtx insn, rtx *operands)
14667 {
14668 static char buf[40];
14669 const char *p;
14670 const char *ssep;
14671 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14672
14673 #ifdef ENABLE_CHECKING
14674 /* Even if we do not want to check the inputs, this documents the input
14675 constraints, which helps in understanding the following code.  */
14676 if (STACK_REG_P (operands[0])
14677 && ((REG_P (operands[1])
14678 && REGNO (operands[0]) == REGNO (operands[1])
14679 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14680 || (REG_P (operands[2])
14681 && REGNO (operands[0]) == REGNO (operands[2])
14682 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14683 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14684 ; /* ok */
14685 else
14686 gcc_assert (is_sse);
14687 #endif
14688
14689 switch (GET_CODE (operands[3]))
14690 {
14691 case PLUS:
14692 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14693 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14694 p = "fiadd";
14695 else
14696 p = "fadd";
14697 ssep = "vadd";
14698 break;
14699
14700 case MINUS:
14701 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14702 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14703 p = "fisub";
14704 else
14705 p = "fsub";
14706 ssep = "vsub";
14707 break;
14708
14709 case MULT:
14710 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14711 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14712 p = "fimul";
14713 else
14714 p = "fmul";
14715 ssep = "vmul";
14716 break;
14717
14718 case DIV:
14719 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14720 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14721 p = "fidiv";
14722 else
14723 p = "fdiv";
14724 ssep = "vdiv";
14725 break;
14726
14727 default:
14728 gcc_unreachable ();
14729 }
14730
14731 if (is_sse)
14732 {
14733 if (TARGET_AVX)
14734 {
14735 strcpy (buf, ssep);
14736 if (GET_MODE (operands[0]) == SFmode)
14737 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14738 else
14739 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14740 }
14741 else
14742 {
14743 strcpy (buf, ssep + 1);
14744 if (GET_MODE (operands[0]) == SFmode)
14745 strcat (buf, "ss\t{%2, %0|%0, %2}");
14746 else
14747 strcat (buf, "sd\t{%2, %0|%0, %2}");
14748 }
14749 return buf;
14750 }
14751 strcpy (buf, p);
14752
14753 switch (GET_CODE (operands[3]))
14754 {
14755 case MULT:
14756 case PLUS:
14757 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
14758 {
14759 rtx temp = operands[2];
14760 operands[2] = operands[1];
14761 operands[1] = temp;
14762 }
14763
14764 /* We now know operands[0] == operands[1].  */
14765
14766 if (MEM_P (operands[2]))
14767 {
14768 p = "%Z2\t%2";
14769 break;
14770 }
14771
14772 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14773 {
14774 if (STACK_TOP_P (operands[0]))
14775 /* How is it that we are storing to a dead operand[2]?
14776 Well, presumably operands[1] is dead too. We can't
14777 store the result to st(0) as st(0) gets popped on this
14778 instruction. Instead store to operands[2] (which I
14779 think has to be st(1)). st(1) will be popped later.
14780 gcc <= 2.8.1 didn't have this check and generated
14781 assembly code that the Unixware assembler rejected. */
14782 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14783 else
14784 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14785 break;
14786 }
14787
14788 if (STACK_TOP_P (operands[0]))
14789 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14790 else
14791 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14792 break;
14793
14794 case MINUS:
14795 case DIV:
14796 if (MEM_P (operands[1]))
14797 {
14798 p = "r%Z1\t%1";
14799 break;
14800 }
14801
14802 if (MEM_P (operands[2]))
14803 {
14804 p = "%Z2\t%2";
14805 break;
14806 }
14807
14808 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14809 {
14810 #if SYSV386_COMPAT
14811 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
14812 derived assemblers, confusingly reverse the direction of
14813 the operation for fsub{r} and fdiv{r} when the
14814 destination register is not st(0). The Intel assembler
14815 doesn't have this brain damage. Read !SYSV386_COMPAT to
14816 figure out what the hardware really does. */
14817 if (STACK_TOP_P (operands[0]))
14818 p = "{p\t%0, %2|rp\t%2, %0}";
14819 else
14820 p = "{rp\t%2, %0|p\t%0, %2}";
14821 #else
14822 if (STACK_TOP_P (operands[0]))
14823 /* As above for fmul/fadd, we can't store to st(0). */
14824 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14825 else
14826 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14827 #endif
14828 break;
14829 }
14830
14831 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
14832 {
14833 #if SYSV386_COMPAT
14834 if (STACK_TOP_P (operands[0]))
14835 p = "{rp\t%0, %1|p\t%1, %0}";
14836 else
14837 p = "{p\t%1, %0|rp\t%0, %1}";
14838 #else
14839 if (STACK_TOP_P (operands[0]))
14840 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
14841 else
14842 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
14843 #endif
14844 break;
14845 }
14846
14847 if (STACK_TOP_P (operands[0]))
14848 {
14849 if (STACK_TOP_P (operands[1]))
14850 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14851 else
14852 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
14853 break;
14854 }
14855 else if (STACK_TOP_P (operands[1]))
14856 {
14857 #if SYSV386_COMPAT
14858 p = "{\t%1, %0|r\t%0, %1}";
14859 #else
14860 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
14861 #endif
14862 }
14863 else
14864 {
14865 #if SYSV386_COMPAT
14866 p = "{r\t%2, %0|\t%0, %2}";
14867 #else
14868 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14869 #endif
14870 }
14871 break;
14872
14873 default:
14874 gcc_unreachable ();
14875 }
14876
14877 strcat (buf, p);
14878 return buf;
14879 }
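
/* Illustrative sketch (editorial addition): a few templates the routine
   above can return, assuming a PLUS in operands[3]:

     SSE scalar, AVX enabled, DFmode:   "vaddsd\t{%2, %1, %0|%0, %1, %2}"
     SSE scalar, non-AVX, SFmode:       "addss\t{%2, %0|%0, %2}"
     x87, reg op reg, result in st(0):  "fadd\t{%y2, %0|%0, %y2}"
     x87, memory operand 2:             "fadd%Z2\t%2"

   The {att|intel} braces are expanded later according to the selected
   assembler dialect.  */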
14880
14881 /* Return needed mode for entity in optimize_mode_switching pass. */
14882
14883 int
14884 ix86_mode_needed (int entity, rtx insn)
14885 {
14886 enum attr_i387_cw mode;
14887
14888 /* The mode UNINITIALIZED is used to store the control word after a
14889 function call or ASM pattern.  The mode ANY specifies that the function
14890 has no requirements on the control word and makes no changes in the
14891 bits we are interested in.  */
14892
14893 if (CALL_P (insn)
14894 || (NONJUMP_INSN_P (insn)
14895 && (asm_noperands (PATTERN (insn)) >= 0
14896 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
14897 return I387_CW_UNINITIALIZED;
14898
14899 if (recog_memoized (insn) < 0)
14900 return I387_CW_ANY;
14901
14902 mode = get_attr_i387_cw (insn);
14903
14904 switch (entity)
14905 {
14906 case I387_TRUNC:
14907 if (mode == I387_CW_TRUNC)
14908 return mode;
14909 break;
14910
14911 case I387_FLOOR:
14912 if (mode == I387_CW_FLOOR)
14913 return mode;
14914 break;
14915
14916 case I387_CEIL:
14917 if (mode == I387_CW_CEIL)
14918 return mode;
14919 break;
14920
14921 case I387_MASK_PM:
14922 if (mode == I387_CW_MASK_PM)
14923 return mode;
14924 break;
14925
14926 default:
14927 gcc_unreachable ();
14928 }
14929
14930 return I387_CW_ANY;
14931 }
14932
14933 /* Output code to initialize the control word copies used by trunc?f?i and
14934 rounding patterns.  MODE selects which control word variant (truncation,
14935 floor, ceiling or precision mask) is stored into its stack slot.  */
14936
14937 void
14938 emit_i387_cw_initialization (int mode)
14939 {
14940 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
14941 rtx new_mode;
14942
14943 enum ix86_stack_slot slot;
14944
14945 rtx reg = gen_reg_rtx (HImode);
14946
14947 emit_insn (gen_x86_fnstcw_1 (stored_mode));
14948 emit_move_insn (reg, copy_rtx (stored_mode));
14949
14950 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
14951 || optimize_function_for_size_p (cfun))
14952 {
14953 switch (mode)
14954 {
14955 case I387_CW_TRUNC:
14956 /* round toward zero (truncate) */
14957 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
14958 slot = SLOT_CW_TRUNC;
14959 break;
14960
14961 case I387_CW_FLOOR:
14962 /* round down toward -oo */
14963 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
14964 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
14965 slot = SLOT_CW_FLOOR;
14966 break;
14967
14968 case I387_CW_CEIL:
14969 /* round up toward +oo */
14970 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
14971 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
14972 slot = SLOT_CW_CEIL;
14973 break;
14974
14975 case I387_CW_MASK_PM:
14976 /* mask precision exception for nearbyint() */
14977 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
14978 slot = SLOT_CW_MASK_PM;
14979 break;
14980
14981 default:
14982 gcc_unreachable ();
14983 }
14984 }
14985 else
14986 {
14987 switch (mode)
14988 {
14989 case I387_CW_TRUNC:
14990 /* round toward zero (truncate) */
14991 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
14992 slot = SLOT_CW_TRUNC;
14993 break;
14994
14995 case I387_CW_FLOOR:
14996 /* round down toward -oo */
14997 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
14998 slot = SLOT_CW_FLOOR;
14999 break;
15000
15001 case I387_CW_CEIL:
15002 /* round up toward +oo */
15003 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15004 slot = SLOT_CW_CEIL;
15005 break;
15006
15007 case I387_CW_MASK_PM:
15008 /* mask precision exception for nearbyint() */
15009 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15010 slot = SLOT_CW_MASK_PM;
15011 break;
15012
15013 default:
15014 gcc_unreachable ();
15015 }
15016 }
15017
15018 gcc_assert (slot < MAX_386_STACK_LOCALS);
15019
15020 new_mode = assign_386_stack_local (HImode, slot);
15021 emit_move_insn (new_mode, reg);
15022 }
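
/* Editorial note (added): the magic constants above select the x87 control
   word rounding-control field (bits 11:10) and the precision-exception mask
   (bit 5):

     0x0c00  RC = 11  round toward zero (truncate)
     0x0400  RC = 01  round down toward -inf (floor)
     0x0800  RC = 10  round up toward +inf (ceiling)
     0x0020  PM = 1   mask the precision exception

   The gen_movsi_insv_1 variants set the same RC value by rewriting the high
   byte of the stored control word.  */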
15023
15024 /* Output code for INSN to convert a float to a signed int. OPERANDS
15025 are the insn operands. The output may be [HSD]Imode and the input
15026 operand may be [SDX]Fmode. */
15027
15028 const char *
15029 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15030 {
15031 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15032 int dimode_p = GET_MODE (operands[0]) == DImode;
15033 int round_mode = get_attr_i387_cw (insn);
15034
15035 /* Jump through a hoop or two for DImode, since the hardware has no
15036 non-popping instruction. We used to do this a different way, but
15037 that was somewhat fragile and broke with post-reload splitters. */
15038 if ((dimode_p || fisttp) && !stack_top_dies)
15039 output_asm_insn ("fld\t%y1", operands);
15040
15041 gcc_assert (STACK_TOP_P (operands[1]));
15042 gcc_assert (MEM_P (operands[0]));
15043 gcc_assert (GET_MODE (operands[1]) != TFmode);
15044
15045 if (fisttp)
15046 output_asm_insn ("fisttp%Z0\t%0", operands);
15047 else
15048 {
15049 if (round_mode != I387_CW_ANY)
15050 output_asm_insn ("fldcw\t%3", operands);
15051 if (stack_top_dies || dimode_p)
15052 output_asm_insn ("fistp%Z0\t%0", operands);
15053 else
15054 output_asm_insn ("fist%Z0\t%0", operands);
15055 if (round_mode != I387_CW_ANY)
15056 output_asm_insn ("fldcw\t%2", operands);
15057 }
15058
15059 return "";
15060 }
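
/* Illustrative sketch (editorial addition): for a DImode result without
   fisttp, when the value must survive in the register stack, the routine
   above emits roughly

     fld    %st(0)          ; duplicate, because the store below always pops
     fldcw  <truncating CW>  ; operand 3
     fistp  <64-bit mem>     ; operand 0
     fldcw  <saved CW>       ; operand 2

   With SSE3's fisttp the control-word juggling disappears entirely.  */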
15061
15062 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15063 have the values zero or one, indicates the ffreep insn's operand
15064 from the OPERANDS array. */
15065
15066 static const char *
15067 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15068 {
15069 if (TARGET_USE_FFREEP)
15070 #ifdef HAVE_AS_IX86_FFREEP
15071 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15072 #else
15073 {
15074 static char retval[32];
15075 int regno = REGNO (operands[opno]);
15076
15077 gcc_assert (FP_REGNO_P (regno));
15078
15079 regno -= FIRST_STACK_REG;
15080
15081 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15082 return retval;
15083 }
15084 #endif
15085
15086 return opno ? "fstp\t%y1" : "fstp\t%y0";
15087 }
15088
15089
15090 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15091 should be used. UNORDERED_P is true when fucom should be used. */
15092
15093 const char *
15094 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15095 {
15096 int stack_top_dies;
15097 rtx cmp_op0, cmp_op1;
15098 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15099
15100 if (eflags_p)
15101 {
15102 cmp_op0 = operands[0];
15103 cmp_op1 = operands[1];
15104 }
15105 else
15106 {
15107 cmp_op0 = operands[1];
15108 cmp_op1 = operands[2];
15109 }
15110
15111 if (is_sse)
15112 {
15113 if (GET_MODE (operands[0]) == SFmode)
15114 if (unordered_p)
15115 return "%vucomiss\t{%1, %0|%0, %1}";
15116 else
15117 return "%vcomiss\t{%1, %0|%0, %1}";
15118 else
15119 if (unordered_p)
15120 return "%vucomisd\t{%1, %0|%0, %1}";
15121 else
15122 return "%vcomisd\t{%1, %0|%0, %1}";
15123 }
15124
15125 gcc_assert (STACK_TOP_P (cmp_op0));
15126
15127 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15128
15129 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15130 {
15131 if (stack_top_dies)
15132 {
15133 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15134 return output_387_ffreep (operands, 1);
15135 }
15136 else
15137 return "ftst\n\tfnstsw\t%0";
15138 }
15139
15140 if (STACK_REG_P (cmp_op1)
15141 && stack_top_dies
15142 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15143 && REGNO (cmp_op1) != FIRST_STACK_REG)
15144 {
15145 /* If the top of the 387 stack dies, and the other operand is
15146 also a stack register that dies, then this must be an
15147 `fcompp' float compare.  */
15148
15149 if (eflags_p)
15150 {
15151 /* There is no double popping fcomi variant. Fortunately,
15152 eflags is immune from the fstp's cc clobbering. */
15153 if (unordered_p)
15154 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15155 else
15156 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15157 return output_387_ffreep (operands, 0);
15158 }
15159 else
15160 {
15161 if (unordered_p)
15162 return "fucompp\n\tfnstsw\t%0";
15163 else
15164 return "fcompp\n\tfnstsw\t%0";
15165 }
15166 }
15167 else
15168 {
15169 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15170
15171 static const char * const alt[16] =
15172 {
15173 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15174 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15175 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15176 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15177
15178 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15179 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15180 NULL,
15181 NULL,
15182
15183 "fcomi\t{%y1, %0|%0, %y1}",
15184 "fcomip\t{%y1, %0|%0, %y1}",
15185 "fucomi\t{%y1, %0|%0, %y1}",
15186 "fucomip\t{%y1, %0|%0, %y1}",
15187
15188 NULL,
15189 NULL,
15190 NULL,
15191 NULL
15192 };
15193
15194 int mask;
15195 const char *ret;
15196
15197 mask = eflags_p << 3;
15198 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15199 mask |= unordered_p << 1;
15200 mask |= stack_top_dies;
15201
15202 gcc_assert (mask < 16);
15203 ret = alt[mask];
15204 gcc_assert (ret);
15205
15206 return ret;
15207 }
15208 }
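
/* Editorial example (added): with eflags_p = 1, a floating-point mode,
   unordered_p = 1 and a dying stack top, the mask above is
   (1<<3) | (0<<2) | (1<<1) | 1 = 11, selecting
   "fucomip\t{%y1, %0|%0, %y1}" from the table.  */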
15209
15210 void
15211 ix86_output_addr_vec_elt (FILE *file, int value)
15212 {
15213 const char *directive = ASM_LONG;
15214
15215 #ifdef ASM_QUAD
15216 if (TARGET_LP64)
15217 directive = ASM_QUAD;
15218 #else
15219 gcc_assert (!TARGET_64BIT);
15220 #endif
15221
15222 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15223 }
15224
15225 void
15226 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15227 {
15228 const char *directive = ASM_LONG;
15229
15230 #ifdef ASM_QUAD
15231 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15232 directive = ASM_QUAD;
15233 #else
15234 gcc_assert (!TARGET_64BIT);
15235 #endif
15236 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15237 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15238 fprintf (file, "%s%s%d-%s%d\n",
15239 directive, LPREFIX, value, LPREFIX, rel);
15240 else if (HAVE_AS_GOTOFF_IN_DATA)
15241 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15242 #if TARGET_MACHO
15243 else if (TARGET_MACHO)
15244 {
15245 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15246 machopic_output_function_base_name (file);
15247 putc ('\n', file);
15248 }
15249 #endif
15250 else
15251 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15252 GOT_SYMBOL_NAME, LPREFIX, value);
15253 }
15254 \f
15255 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15256 for the target. */
15257
15258 void
15259 ix86_expand_clear (rtx dest)
15260 {
15261 rtx tmp;
15262
15263 /* We play register width games, which are only valid after reload. */
15264 gcc_assert (reload_completed);
15265
15266 /* Avoid HImode and its attendant prefix byte. */
15267 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15268 dest = gen_rtx_REG (SImode, REGNO (dest));
15269 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15270
15271 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15272 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15273 {
15274 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15275 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15276 }
15277
15278 emit_insn (tmp);
15279 }
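
/* Illustrative sketch (editorial addition): for a DImode destination when
   optimizing for speed, the routine above emits the equivalent of

     (parallel [(set (reg:DI ax) (const_int 0))
                (clobber (reg:CC flags))])

   which assembles to "xorl %eax, %eax" (the upper 32 bits are zeroed
   implicitly), instead of a mov of zero, which encodes longer.  */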
15280
15281 /* X is an unchanging MEM. If it is a constant pool reference, return
15282 the constant pool rtx, else NULL. */
15283
15284 rtx
15285 maybe_get_pool_constant (rtx x)
15286 {
15287 x = ix86_delegitimize_address (XEXP (x, 0));
15288
15289 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15290 return get_pool_constant (x);
15291
15292 return NULL_RTX;
15293 }
15294
15295 void
15296 ix86_expand_move (enum machine_mode mode, rtx operands[])
15297 {
15298 rtx op0, op1;
15299 enum tls_model model;
15300
15301 op0 = operands[0];
15302 op1 = operands[1];
15303
15304 if (GET_CODE (op1) == SYMBOL_REF)
15305 {
15306 model = SYMBOL_REF_TLS_MODEL (op1);
15307 if (model)
15308 {
15309 op1 = legitimize_tls_address (op1, model, true);
15310 op1 = force_operand (op1, op0);
15311 if (op1 == op0)
15312 return;
15313 if (GET_MODE (op1) != mode)
15314 op1 = convert_to_mode (mode, op1, 1);
15315 }
15316 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15317 && SYMBOL_REF_DLLIMPORT_P (op1))
15318 op1 = legitimize_dllimport_symbol (op1, false);
15319 }
15320 else if (GET_CODE (op1) == CONST
15321 && GET_CODE (XEXP (op1, 0)) == PLUS
15322 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15323 {
15324 rtx addend = XEXP (XEXP (op1, 0), 1);
15325 rtx symbol = XEXP (XEXP (op1, 0), 0);
15326 rtx tmp = NULL;
15327
15328 model = SYMBOL_REF_TLS_MODEL (symbol);
15329 if (model)
15330 tmp = legitimize_tls_address (symbol, model, true);
15331 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15332 && SYMBOL_REF_DLLIMPORT_P (symbol))
15333 tmp = legitimize_dllimport_symbol (symbol, true);
15334
15335 if (tmp)
15336 {
15337 tmp = force_operand (tmp, NULL);
15338 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15339 op0, 1, OPTAB_DIRECT);
15340 if (tmp == op0)
15341 return;
15342 if (GET_MODE (tmp) != mode)
15343 op1 = convert_to_mode (mode, tmp, 1);
15344 }
15345 }
15346
15347 if ((flag_pic || MACHOPIC_INDIRECT)
15348 && symbolic_operand (op1, mode))
15349 {
15350 if (TARGET_MACHO && !TARGET_64BIT)
15351 {
15352 #if TARGET_MACHO
15353 /* dynamic-no-pic */
15354 if (MACHOPIC_INDIRECT)
15355 {
15356 rtx temp = ((reload_in_progress
15357 || ((op0 && REG_P (op0))
15358 && mode == Pmode))
15359 ? op0 : gen_reg_rtx (Pmode));
15360 op1 = machopic_indirect_data_reference (op1, temp);
15361 if (MACHOPIC_PURE)
15362 op1 = machopic_legitimize_pic_address (op1, mode,
15363 temp == op1 ? 0 : temp);
15364 }
15365 if (op0 != op1 && GET_CODE (op0) != MEM)
15366 {
15367 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15368 emit_insn (insn);
15369 return;
15370 }
15371 if (GET_CODE (op0) == MEM)
15372 op1 = force_reg (Pmode, op1);
15373 else
15374 {
15375 rtx temp = op0;
15376 if (GET_CODE (temp) != REG)
15377 temp = gen_reg_rtx (Pmode);
15378 temp = legitimize_pic_address (op1, temp);
15379 if (temp == op0)
15380 return;
15381 op1 = temp;
15382 }
15383 /* dynamic-no-pic */
15384 #endif
15385 }
15386 else
15387 {
15388 if (MEM_P (op0))
15389 op1 = force_reg (mode, op1);
15390 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15391 {
15392 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15393 op1 = legitimize_pic_address (op1, reg);
15394 if (op0 == op1)
15395 return;
15396 if (GET_MODE (op1) != mode)
15397 op1 = convert_to_mode (mode, op1, 1);
15398 }
15399 }
15400 }
15401 else
15402 {
15403 if (MEM_P (op0)
15404 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15405 || !push_operand (op0, mode))
15406 && MEM_P (op1))
15407 op1 = force_reg (mode, op1);
15408
15409 if (push_operand (op0, mode)
15410 && ! general_no_elim_operand (op1, mode))
15411 op1 = copy_to_mode_reg (mode, op1);
15412
15413 /* Force large constants in 64bit compilation into register
15414 to get them CSEed. */
15415 if (can_create_pseudo_p ()
15416 && (mode == DImode) && TARGET_64BIT
15417 && immediate_operand (op1, mode)
15418 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15419 && !register_operand (op0, mode)
15420 && optimize)
15421 op1 = copy_to_mode_reg (mode, op1);
15422
15423 if (can_create_pseudo_p ()
15424 && FLOAT_MODE_P (mode)
15425 && GET_CODE (op1) == CONST_DOUBLE)
15426 {
15427 /* If we are loading a floating point constant to a register,
15428 force the value to memory now, since we'll get better code
15429 out the back end. */
15430
15431 op1 = validize_mem (force_const_mem (mode, op1));
15432 if (!register_operand (op0, mode))
15433 {
15434 rtx temp = gen_reg_rtx (mode);
15435 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15436 emit_move_insn (op0, temp);
15437 return;
15438 }
15439 }
15440 }
15441
15442 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15443 }
15444
15445 void
15446 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15447 {
15448 rtx op0 = operands[0], op1 = operands[1];
15449 unsigned int align = GET_MODE_ALIGNMENT (mode);
15450
15451 /* Force constants other than zero into memory.  We do not know how
15452 the instructions used to build constants modify the upper 64 bits
15453 of the register; once we have that information we may be able
15454 to handle some of them more efficiently.  */
15455 if (can_create_pseudo_p ()
15456 && register_operand (op0, mode)
15457 && (CONSTANT_P (op1)
15458 || (GET_CODE (op1) == SUBREG
15459 && CONSTANT_P (SUBREG_REG (op1))))
15460 && !standard_sse_constant_p (op1))
15461 op1 = validize_mem (force_const_mem (mode, op1));
15462
15463 /* We need to check memory alignment for SSE mode since attribute
15464 can make operands unaligned. */
15465 if (can_create_pseudo_p ()
15466 && SSE_REG_MODE_P (mode)
15467 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15468 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15469 {
15470 rtx tmp[2];
15471
15472 /* ix86_expand_vector_move_misalign() does not like constants ... */
15473 if (CONSTANT_P (op1)
15474 || (GET_CODE (op1) == SUBREG
15475 && CONSTANT_P (SUBREG_REG (op1))))
15476 op1 = validize_mem (force_const_mem (mode, op1));
15477
15478 /* ... nor both arguments in memory. */
15479 if (!register_operand (op0, mode)
15480 && !register_operand (op1, mode))
15481 op1 = force_reg (mode, op1);
15482
15483 tmp[0] = op0; tmp[1] = op1;
15484 ix86_expand_vector_move_misalign (mode, tmp);
15485 return;
15486 }
15487
15488 /* Make operand1 a register if it isn't already. */
15489 if (can_create_pseudo_p ()
15490 && !register_operand (op0, mode)
15491 && !register_operand (op1, mode))
15492 {
15493 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15494 return;
15495 }
15496
15497 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15498 }
15499
15500 /* Split 32-byte AVX unaligned load and store if needed. */
15501
15502 static void
15503 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15504 {
15505 rtx m;
15506 rtx (*extract) (rtx, rtx, rtx);
15507 rtx (*move_unaligned) (rtx, rtx);
15508 enum machine_mode mode;
15509
15510 switch (GET_MODE (op0))
15511 {
15512 default:
15513 gcc_unreachable ();
15514 case V32QImode:
15515 extract = gen_avx_vextractf128v32qi;
15516 move_unaligned = gen_avx_movdqu256;
15517 mode = V16QImode;
15518 break;
15519 case V8SFmode:
15520 extract = gen_avx_vextractf128v8sf;
15521 move_unaligned = gen_avx_movups256;
15522 mode = V4SFmode;
15523 break;
15524 case V4DFmode:
15525 extract = gen_avx_vextractf128v4df;
15526 move_unaligned = gen_avx_movupd256;
15527 mode = V2DFmode;
15528 break;
15529 }
15530
15531 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15532 {
15533 rtx r = gen_reg_rtx (mode);
15534 m = adjust_address (op1, mode, 0);
15535 emit_move_insn (r, m);
15536 m = adjust_address (op1, mode, 16);
15537 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15538 emit_move_insn (op0, r);
15539 }
15540 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15541 {
15542 m = adjust_address (op0, mode, 0);
15543 emit_insn (extract (m, op1, const0_rtx));
15544 m = adjust_address (op0, mode, 16);
15545 emit_insn (extract (m, op1, const1_rtx));
15546 }
15547 else
15548 emit_insn (move_unaligned (op0, op1));
15549 }
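
/* Illustrative sketch (editorial addition): for a misaligned 32-byte load
   with TARGET_AVX256_SPLIT_UNALIGNED_LOAD, the code above loads the two
   16-byte halves separately and concatenates them, assembling to roughly

     vmovups     (%rax), %xmm0
     vinsertf128 $1, 16(%rax), %ymm0, %ymm0

   and a split store similarly writes the two halves via vextractf128.  */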
15550
15551 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15552 straight to ix86_expand_vector_move. */
15553 /* Code generation for scalar reg-reg moves of single and double precision data:
15554 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15555 movaps reg, reg
15556 else
15557 movss reg, reg
15558 if (x86_sse_partial_reg_dependency == true)
15559 movapd reg, reg
15560 else
15561 movsd reg, reg
15562
15563 Code generation for scalar loads of double precision data:
15564 if (x86_sse_split_regs == true)
15565 movlpd mem, reg (gas syntax)
15566 else
15567 movsd mem, reg
15568
15569 Code generation for unaligned packed loads of single precision data
15570 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15571 if (x86_sse_unaligned_move_optimal)
15572 movups mem, reg
15573
15574 if (x86_sse_partial_reg_dependency == true)
15575 {
15576 xorps reg, reg
15577 movlps mem, reg
15578 movhps mem+8, reg
15579 }
15580 else
15581 {
15582 movlps mem, reg
15583 movhps mem+8, reg
15584 }
15585
15586 Code generation for unaligned packed loads of double precision data
15587 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15588 if (x86_sse_unaligned_move_optimal)
15589 movupd mem, reg
15590
15591 if (x86_sse_split_regs == true)
15592 {
15593 movlpd mem, reg
15594 movhpd mem+8, reg
15595 }
15596 else
15597 {
15598 movsd mem, reg
15599 movhpd mem+8, reg
15600 }
15601 */
15602
15603 void
15604 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15605 {
15606 rtx op0, op1, m;
15607
15608 op0 = operands[0];
15609 op1 = operands[1];
15610
15611 if (TARGET_AVX)
15612 {
15613 switch (GET_MODE_CLASS (mode))
15614 {
15615 case MODE_VECTOR_INT:
15616 case MODE_INT:
15617 switch (GET_MODE_SIZE (mode))
15618 {
15619 case 16:
15620 /* If we're optimizing for size, movups is the smallest. */
15621 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15622 {
15623 op0 = gen_lowpart (V4SFmode, op0);
15624 op1 = gen_lowpart (V4SFmode, op1);
15625 emit_insn (gen_sse_movups (op0, op1));
15626 return;
15627 }
15628 op0 = gen_lowpart (V16QImode, op0);
15629 op1 = gen_lowpart (V16QImode, op1);
15630 emit_insn (gen_sse2_movdqu (op0, op1));
15631 break;
15632 case 32:
15633 op0 = gen_lowpart (V32QImode, op0);
15634 op1 = gen_lowpart (V32QImode, op1);
15635 ix86_avx256_split_vector_move_misalign (op0, op1);
15636 break;
15637 default:
15638 gcc_unreachable ();
15639 }
15640 break;
15641 case MODE_VECTOR_FLOAT:
15642 op0 = gen_lowpart (mode, op0);
15643 op1 = gen_lowpart (mode, op1);
15644
15645 switch (mode)
15646 {
15647 case V4SFmode:
15648 emit_insn (gen_sse_movups (op0, op1));
15649 break;
15650 case V8SFmode:
15651 ix86_avx256_split_vector_move_misalign (op0, op1);
15652 break;
15653 case V2DFmode:
15654 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15655 {
15656 op0 = gen_lowpart (V4SFmode, op0);
15657 op1 = gen_lowpart (V4SFmode, op1);
15658 emit_insn (gen_sse_movups (op0, op1));
15659 return;
15660 }
15661 emit_insn (gen_sse2_movupd (op0, op1));
15662 break;
15663 case V4DFmode:
15664 ix86_avx256_split_vector_move_misalign (op0, op1);
15665 break;
15666 default:
15667 gcc_unreachable ();
15668 }
15669 break;
15670
15671 default:
15672 gcc_unreachable ();
15673 }
15674
15675 return;
15676 }
15677
15678 if (MEM_P (op1))
15679 {
15680 /* If we're optimizing for size, movups is the smallest. */
15681 if (optimize_insn_for_size_p ()
15682 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15683 {
15684 op0 = gen_lowpart (V4SFmode, op0);
15685 op1 = gen_lowpart (V4SFmode, op1);
15686 emit_insn (gen_sse_movups (op0, op1));
15687 return;
15688 }
15689
15690 /* ??? If we have typed data, then it would appear that using
15691 movdqu is the only way to get unaligned data loaded with
15692 integer type. */
15693 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15694 {
15695 op0 = gen_lowpart (V16QImode, op0);
15696 op1 = gen_lowpart (V16QImode, op1);
15697 emit_insn (gen_sse2_movdqu (op0, op1));
15698 return;
15699 }
15700
15701 if (TARGET_SSE2 && mode == V2DFmode)
15702 {
15703 rtx zero;
15704
15705 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15706 {
15707 op0 = gen_lowpart (V2DFmode, op0);
15708 op1 = gen_lowpart (V2DFmode, op1);
15709 emit_insn (gen_sse2_movupd (op0, op1));
15710 return;
15711 }
15712
15713 /* When SSE registers are split into halves, we can avoid
15714 writing to the top half twice. */
15715 if (TARGET_SSE_SPLIT_REGS)
15716 {
15717 emit_clobber (op0);
15718 zero = op0;
15719 }
15720 else
15721 {
15722 /* ??? Not sure about the best option for the Intel chips.
15723 The following would seem to satisfy; the register is
15724 entirely cleared, breaking the dependency chain. We
15725 then store to the upper half, with a dependency depth
15726 of one. A rumor has it that Intel recommends two movsd
15727 followed by an unpacklpd, but this is unconfirmed. And
15728 given that the dependency depth of the unpacklpd would
15729 still be one, I'm not sure why this would be better. */
15730 zero = CONST0_RTX (V2DFmode);
15731 }
15732
15733 m = adjust_address (op1, DFmode, 0);
15734 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15735 m = adjust_address (op1, DFmode, 8);
15736 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15737 }
15738 else
15739 {
15740 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15741 {
15742 op0 = gen_lowpart (V4SFmode, op0);
15743 op1 = gen_lowpart (V4SFmode, op1);
15744 emit_insn (gen_sse_movups (op0, op1));
15745 return;
15746 }
15747
15748 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15749 emit_move_insn (op0, CONST0_RTX (mode));
15750 else
15751 emit_clobber (op0);
15752
15753 if (mode != V4SFmode)
15754 op0 = gen_lowpart (V4SFmode, op0);
15755 m = adjust_address (op1, V2SFmode, 0);
15756 emit_insn (gen_sse_loadlps (op0, op0, m));
15757 m = adjust_address (op1, V2SFmode, 8);
15758 emit_insn (gen_sse_loadhps (op0, op0, m));
15759 }
15760 }
15761 else if (MEM_P (op0))
15762 {
15763 /* If we're optimizing for size, movups is the smallest. */
15764 if (optimize_insn_for_size_p ()
15765 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15766 {
15767 op0 = gen_lowpart (V4SFmode, op0);
15768 op1 = gen_lowpart (V4SFmode, op1);
15769 emit_insn (gen_sse_movups (op0, op1));
15770 return;
15771 }
15772
15773 /* ??? Similar to above, only less clear because of quote
15774 typeless stores unquote. */
15775 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
15776 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15777 {
15778 op0 = gen_lowpart (V16QImode, op0);
15779 op1 = gen_lowpart (V16QImode, op1);
15780 emit_insn (gen_sse2_movdqu (op0, op1));
15781 return;
15782 }
15783
15784 if (TARGET_SSE2 && mode == V2DFmode)
15785 {
15786 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15787 {
15788 op0 = gen_lowpart (V2DFmode, op0);
15789 op1 = gen_lowpart (V2DFmode, op1);
15790 emit_insn (gen_sse2_movupd (op0, op1));
15791 }
15792 else
15793 {
15794 m = adjust_address (op0, DFmode, 0);
15795 emit_insn (gen_sse2_storelpd (m, op1));
15796 m = adjust_address (op0, DFmode, 8);
15797 emit_insn (gen_sse2_storehpd (m, op1));
15798 }
15799 }
15800 else
15801 {
15802 if (mode != V4SFmode)
15803 op1 = gen_lowpart (V4SFmode, op1);
15804
15805 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15806 {
15807 op0 = gen_lowpart (V4SFmode, op0);
15808 emit_insn (gen_sse_movups (op0, op1));
15809 }
15810 else
15811 {
15812 m = adjust_address (op0, V2SFmode, 0);
15813 emit_insn (gen_sse_storelps (m, op1));
15814 m = adjust_address (op0, V2SFmode, 8);
15815 emit_insn (gen_sse_storehps (m, op1));
15816 }
15817 }
15818 }
15819 else
15820 gcc_unreachable ();
15821 }
15822
15823 /* Expand a push in MODE. This is some mode for which we do not support
15824 proper push instructions, at least from the registers that we expect
15825 the value to live in. */
15826
15827 void
15828 ix86_expand_push (enum machine_mode mode, rtx x)
15829 {
15830 rtx tmp;
15831
15832 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
15833 GEN_INT (-GET_MODE_SIZE (mode)),
15834 stack_pointer_rtx, 1, OPTAB_DIRECT);
15835 if (tmp != stack_pointer_rtx)
15836 emit_move_insn (stack_pointer_rtx, tmp);
15837
15838 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
15839
15840 /* When we push an operand onto the stack, it has to be aligned at least
15841 at the function argument boundary.  However, since we don't have
15842 the argument type, we can't determine the actual argument
15843 boundary.  */
15844 emit_move_insn (tmp, x);
15845 }
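
/* Illustrative sketch (editorial addition): pushing a 16-byte mode through
   the routine above expands to an explicit stack-pointer adjustment followed
   by an ordinary store, roughly

     sub  $16, %rsp
     <move the MODE value to (%rsp)>

   since there is no native push instruction for such operands.  */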
15846
15847 /* Helper function of ix86_fixup_binary_operands to canonicalize
15848 operand order. Returns true if the operands should be swapped. */
15849
15850 static bool
15851 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
15852 rtx operands[])
15853 {
15854 rtx dst = operands[0];
15855 rtx src1 = operands[1];
15856 rtx src2 = operands[2];
15857
15858 /* If the operation is not commutative, we can't do anything. */
15859 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
15860 return false;
15861
15862 /* Highest priority is that src1 should match dst. */
15863 if (rtx_equal_p (dst, src1))
15864 return false;
15865 if (rtx_equal_p (dst, src2))
15866 return true;
15867
15868 /* Next highest priority is that immediate constants come second. */
15869 if (immediate_operand (src2, mode))
15870 return false;
15871 if (immediate_operand (src1, mode))
15872 return true;
15873
15874 /* Lowest priority is that memory references should come second. */
15875 if (MEM_P (src2))
15876 return false;
15877 if (MEM_P (src1))
15878 return true;
15879
15880 return false;
15881 }
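
/* Editorial example (added): for a commutative PLUS with dst = r1,
   src1 = (const_int 4) and src2 = r1, the predicate above returns true, so
   the caller swaps src1/src2 and the pattern sees the matching register
   first and the immediate second.  */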
15882
15883
15884 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
15885 destination to use for the operation. If different from the true
15886 destination in operands[0], a copy operation will be required. */
15887
15888 rtx
15889 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
15890 rtx operands[])
15891 {
15892 rtx dst = operands[0];
15893 rtx src1 = operands[1];
15894 rtx src2 = operands[2];
15895
15896 /* Canonicalize operand order. */
15897 if (ix86_swap_binary_operands_p (code, mode, operands))
15898 {
15899 rtx temp;
15900
15901 /* It is invalid to swap operands of different modes. */
15902 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
15903
15904 temp = src1;
15905 src1 = src2;
15906 src2 = temp;
15907 }
15908
15909 /* Both source operands cannot be in memory. */
15910 if (MEM_P (src1) && MEM_P (src2))
15911 {
15912 /* Optimization: Only read from memory once. */
15913 if (rtx_equal_p (src1, src2))
15914 {
15915 src2 = force_reg (mode, src2);
15916 src1 = src2;
15917 }
15918 else
15919 src2 = force_reg (mode, src2);
15920 }
15921
15922 /* If the destination is memory, and we do not have matching source
15923 operands, do things in registers. */
15924 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
15925 dst = gen_reg_rtx (mode);
15926
15927 /* Source 1 cannot be a constant. */
15928 if (CONSTANT_P (src1))
15929 src1 = force_reg (mode, src1);
15930
15931 /* Source 1 cannot be a non-matching memory. */
15932 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
15933 src1 = force_reg (mode, src1);
15934
15935 /* Improve address combine. */
15936 if (code == PLUS
15937 && GET_MODE_CLASS (mode) == MODE_INT
15938 && MEM_P (src2))
15939 src2 = force_reg (mode, src2);
15940
15941 operands[1] = src1;
15942 operands[2] = src2;
15943 return dst;
15944 }
15945
15946 /* Similarly, but assume that the destination has already been
15947 set up properly. */
15948
15949 void
15950 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
15951 enum machine_mode mode, rtx operands[])
15952 {
15953 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
15954 gcc_assert (dst == operands[0]);
15955 }
15956
15957 /* Attempt to expand a binary operator.  Make the expansion closer to the
15958 actual machine than just general_operand, which would allow 3 separate
15959 memory references (one output, two inputs) in a single insn.  */
15960
15961 void
15962 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
15963 rtx operands[])
15964 {
15965 rtx src1, src2, dst, op, clob;
15966
15967 dst = ix86_fixup_binary_operands (code, mode, operands);
15968 src1 = operands[1];
15969 src2 = operands[2];
15970
15971 /* Emit the instruction. */
15972
15973 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
15974 if (reload_in_progress)
15975 {
15976 /* Reload doesn't know about the flags register, and doesn't know that
15977 it doesn't want to clobber it. We can only do this with PLUS. */
15978 gcc_assert (code == PLUS);
15979 emit_insn (op);
15980 }
15981 else if (reload_completed
15982 && code == PLUS
15983 && !rtx_equal_p (dst, src1))
15984 {
15985 /* This is going to be an LEA; avoid splitting it later. */
15986 emit_insn (op);
15987 }
15988 else
15989 {
15990 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15991 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
15992 }
15993
15994 /* Fix up the destination if needed. */
15995 if (dst != operands[0])
15996 emit_move_insn (operands[0], dst);
15997 }
15998
15999 /* Return TRUE or FALSE depending on whether the binary operator meets the
16000 appropriate constraints. */
16001
16002 bool
16003 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16004 rtx operands[3])
16005 {
16006 rtx dst = operands[0];
16007 rtx src1 = operands[1];
16008 rtx src2 = operands[2];
16009
16010 /* Both source operands cannot be in memory. */
16011 if (MEM_P (src1) && MEM_P (src2))
16012 return false;
16013
16014 /* Canonicalize operand order for commutative operators. */
16015 if (ix86_swap_binary_operands_p (code, mode, operands))
16016 {
16017 rtx temp = src1;
16018 src1 = src2;
16019 src2 = temp;
16020 }
16021
16022 /* If the destination is memory, we must have a matching source operand. */
16023 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16024 return false;
16025
16026 /* Source 1 cannot be a constant. */
16027 if (CONSTANT_P (src1))
16028 return false;
16029
16030 /* Source 1 cannot be a non-matching memory. */
16031 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16032 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16033 return (code == AND
16034 && (mode == HImode
16035 || mode == SImode
16036 || (TARGET_64BIT && mode == DImode))
16037 && satisfies_constraint_L (src2));
16038
16039 return true;
16040 }
16041
16042 /* Attempt to expand a unary operator.  Make the expansion closer to the
16043 actual machine than just general_operand, which would allow 2 separate
16044 memory references (one output, one input) in a single insn.  */
16045
16046 void
16047 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16048 rtx operands[])
16049 {
16050 int matching_memory;
16051 rtx src, dst, op, clob;
16052
16053 dst = operands[0];
16054 src = operands[1];
16055
16056 /* If the destination is memory, and we do not have matching source
16057 operands, do things in registers. */
16058 matching_memory = 0;
16059 if (MEM_P (dst))
16060 {
16061 if (rtx_equal_p (dst, src))
16062 matching_memory = 1;
16063 else
16064 dst = gen_reg_rtx (mode);
16065 }
16066
16067 /* When source operand is memory, destination must match. */
16068 if (MEM_P (src) && !matching_memory)
16069 src = force_reg (mode, src);
16070
16071 /* Emit the instruction. */
16072
16073 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16074 if (reload_in_progress || code == NOT)
16075 {
16076 /* Reload doesn't know about the flags register, and doesn't know that
16077 it doesn't want to clobber it. */
16078 gcc_assert (code == NOT);
16079 emit_insn (op);
16080 }
16081 else
16082 {
16083 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16084 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16085 }
16086
16087 /* Fix up the destination if needed. */
16088 if (dst != operands[0])
16089 emit_move_insn (operands[0], dst);
16090 }
16091
16092 /* Split a 32-bit/64-bit divmod into an 8-bit unsigned divmod when both
16093 the dividend and the divisor are within the range [0-255].  */
16094
16095 void
16096 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16097 bool signed_p)
16098 {
16099 rtx end_label, qimode_label;
16100 rtx insn, div, mod;
16101 rtx scratch, tmp0, tmp1, tmp2;
16102 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16103 rtx (*gen_zero_extend) (rtx, rtx);
16104 rtx (*gen_test_ccno_1) (rtx, rtx);
16105
16106 switch (mode)
16107 {
16108 case SImode:
16109 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16110 gen_test_ccno_1 = gen_testsi_ccno_1;
16111 gen_zero_extend = gen_zero_extendqisi2;
16112 break;
16113 case DImode:
16114 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16115 gen_test_ccno_1 = gen_testdi_ccno_1;
16116 gen_zero_extend = gen_zero_extendqidi2;
16117 break;
16118 default:
16119 gcc_unreachable ();
16120 }
16121
16122 end_label = gen_label_rtx ();
16123 qimode_label = gen_label_rtx ();
16124
16125 scratch = gen_reg_rtx (mode);
16126
16127 /* Use 8-bit unsigned divmod if both the dividend and the divisor are
16128 within the range [0-255].  */
16129 emit_move_insn (scratch, operands[2]);
16130 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16131 scratch, 1, OPTAB_DIRECT);
16132 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16133 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16134 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16135 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16136 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16137 pc_rtx);
16138 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16139 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16140 JUMP_LABEL (insn) = qimode_label;
16141
16142 /* Generate the original signed/unsigned divmod.  */
16143 div = gen_divmod4_1 (operands[0], operands[1],
16144 operands[2], operands[3]);
16145 emit_insn (div);
16146
16147 /* Branch to the end. */
16148 emit_jump_insn (gen_jump (end_label));
16149 emit_barrier ();
16150
16151 /* Generate 8bit unsigned divide. */
16152 emit_label (qimode_label);
16153 /* Don't use operands[0] for result of 8bit divide since not all
16154 registers support QImode ZERO_EXTRACT. */
16155 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16156 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16157 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16158 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16159
16160 if (signed_p)
16161 {
16162 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16163 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16164 }
16165 else
16166 {
16167 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16168 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16169 }
16170
16171 /* Extract remainder from AH. */
16172 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16173 if (REG_P (operands[1]))
16174 insn = emit_move_insn (operands[1], tmp1);
16175 else
16176 {
16177 /* Need a new scratch register since the old one has result
16178 of 8bit divide. */
16179 scratch = gen_reg_rtx (mode);
16180 emit_move_insn (scratch, tmp1);
16181 insn = emit_move_insn (operands[1], scratch);
16182 }
16183 set_unique_reg_note (insn, REG_EQUAL, mod);
16184
16185 /* Zero extend quotient from AL. */
16186 tmp1 = gen_lowpart (QImode, tmp0);
16187 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16188 set_unique_reg_note (insn, REG_EQUAL, div);
16189
16190 emit_label (end_label);
16191 }
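
/* Illustrative sketch (editorial addition): for a SImode unsigned division,
   the split above generates control flow roughly equivalent to

     mov    dividend, %tmp
     or     divisor, %tmp
     testl  $-256, %tmp          ; both values in [0, 255]?
     je     .Lqimode
     <full 32-bit div/idiv>       ; quotient -> operands[0], remainder -> operands[1]
     jmp    .Ldone
   .Lqimode:
     <8-bit divb>                 ; AL = quotient, AH = remainder
     movzbl %al, quotient
     <copy AH to remainder>
   .Ldone:

   The label names and %tmp are placeholders for the generated pseudos and
   labels.  */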
16192
16193 #define LEA_MAX_STALL (3)
16194 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16195
16196 /* Increase the given DISTANCE in half-cycles according to
16197 dependencies between PREV and NEXT instructions.
16198 Add 1 half-cycle if there is no dependency and
16199 go to the next cycle if there is some dependency.  */
16200
16201 static unsigned int
16202 increase_distance (rtx prev, rtx next, unsigned int distance)
16203 {
16204 df_ref *use_rec;
16205 df_ref *def_rec;
16206
16207 if (!prev || !next)
16208 return distance + (distance & 1) + 2;
16209
16210 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16211 return distance + 1;
16212
16213 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16214 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16215 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16216 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16217 return distance + (distance & 1) + 2;
16218
16219 return distance + 1;
16220 }
16221
16222 /* Function checks if instruction INSN defines register number
16223 REGNO1 or REGNO2. */
16224
16225 static bool
16226 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16227 rtx insn)
16228 {
16229 df_ref *def_rec;
16230
16231 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16232 if (DF_REF_REG_DEF_P (*def_rec)
16233 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16234 && (regno1 == DF_REF_REGNO (*def_rec)
16235 || regno2 == DF_REF_REGNO (*def_rec)))
16236 {
16237 return true;
16238 }
16239
16240 return false;
16241 }
16242
16243 /* Function checks if instruction INSN uses register number
16244 REGNO as a part of address expression. */
16245
16246 static bool
16247 insn_uses_reg_mem (unsigned int regno, rtx insn)
16248 {
16249 df_ref *use_rec;
16250
16251 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16252 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16253 return true;
16254
16255 return false;
16256 }
16257
16258 /* Search backward for non-agu definition of register number REGNO1
16259 or register number REGNO2 in basic block starting from instruction
16260 START up to head of basic block or instruction INSN.
16261
16262 Function puts true value into *FOUND var if definition was found
16263 and false otherwise.
16264
16265 Distance in half-cycles between START and found instruction or head
16266 of BB is added to DISTANCE and returned. */
16267
16268 static int
16269 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16270 rtx insn, int distance,
16271 rtx start, bool *found)
16272 {
16273 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16274 rtx prev = start;
16275 rtx next = NULL;
16276
16277 *found = false;
16278
16279 while (prev
16280 && prev != insn
16281 && distance < LEA_SEARCH_THRESHOLD)
16282 {
16283 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16284 {
16285 distance = increase_distance (prev, next, distance);
16286 if (insn_defines_reg (regno1, regno2, prev))
16287 {
16288 if (recog_memoized (prev) < 0
16289 || get_attr_type (prev) != TYPE_LEA)
16290 {
16291 *found = true;
16292 return distance;
16293 }
16294 }
16295
16296 next = prev;
16297 }
16298 if (prev == BB_HEAD (bb))
16299 break;
16300
16301 prev = PREV_INSN (prev);
16302 }
16303
16304 return distance;
16305 }
16306
16307 /* Search backward for non-agu definition of register number REGNO1
16308 or register number REGNO2 in INSN's basic block until
16309 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16310 2. Reach neighbour BBs boundary, or
16311 3. Reach agu definition.
16312 Returns the distance between the non-agu definition point and INSN.
16313 If no definition point, returns -1. */
16314
16315 static int
16316 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16317 rtx insn)
16318 {
16319 basic_block bb = BLOCK_FOR_INSN (insn);
16320 int distance = 0;
16321 bool found = false;
16322
16323 if (insn != BB_HEAD (bb))
16324 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16325 distance, PREV_INSN (insn),
16326 &found);
16327
16328 if (!found && distance < LEA_SEARCH_THRESHOLD)
16329 {
16330 edge e;
16331 edge_iterator ei;
16332 bool simple_loop = false;
16333
16334 FOR_EACH_EDGE (e, ei, bb->preds)
16335 if (e->src == bb)
16336 {
16337 simple_loop = true;
16338 break;
16339 }
16340
16341 if (simple_loop)
16342 distance = distance_non_agu_define_in_bb (regno1, regno2,
16343 insn, distance,
16344 BB_END (bb), &found);
16345 else
16346 {
16347 int shortest_dist = -1;
16348 bool found_in_bb = false;
16349
16350 FOR_EACH_EDGE (e, ei, bb->preds)
16351 {
16352 int bb_dist
16353 = distance_non_agu_define_in_bb (regno1, regno2,
16354 insn, distance,
16355 BB_END (e->src),
16356 &found_in_bb);
16357 if (found_in_bb)
16358 {
16359 if (shortest_dist < 0)
16360 shortest_dist = bb_dist;
16361 else if (bb_dist > 0)
16362 shortest_dist = MIN (bb_dist, shortest_dist);
16363
16364 found = true;
16365 }
16366 }
16367
16368 distance = shortest_dist;
16369 }
16370 }
16371
16372 /* get_attr_type may modify recog data. We want to make sure
16373 that recog data is valid for instruction INSN, on which
16374 distance_non_agu_define is called. INSN is unchanged here. */
16375 extract_insn_cached (insn);
16376
16377 if (!found)
16378 return -1;
16379
16380 return distance >> 1;
16381 }
16382
16383 /* Return the distance in half-cycles between INSN and the next
16384 insn that uses register number REGNO in a memory address, added
16385 to DISTANCE. Return -1 if REGNO is set.
16386
16387 Put true value into *FOUND if register usage was found and
16388 false otherwise.
16389 Put true value into *REDEFINED if register redefinition was
16390 found and false otherwise. */
16391
16392 static int
16393 distance_agu_use_in_bb (unsigned int regno,
16394 rtx insn, int distance, rtx start,
16395 bool *found, bool *redefined)
16396 {
16397 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16398 rtx next = start;
16399 rtx prev = NULL;
16400
16401 *found = false;
16402 *redefined = false;
16403
16404 while (next
16405 && next != insn
16406 && distance < LEA_SEARCH_THRESHOLD)
16407 {
16408 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16409 {
16410 distance = increase_distance (prev, next, distance);
16411 if (insn_uses_reg_mem (regno, next))
16412 {
16413 /* Return DISTANCE if OP0 is used in memory
16414 address in NEXT. */
16415 *found = true;
16416 return distance;
16417 }
16418
16419 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16420 {
16421 /* Return -1 if OP0 is set in NEXT. */
16422 *redefined = true;
16423 return -1;
16424 }
16425
16426 prev = next;
16427 }
16428
16429 if (next == BB_END (bb))
16430 break;
16431
16432 next = NEXT_INSN (next);
16433 }
16434
16435 return distance;
16436 }
16437
16438 /* Return the distance between INSN and the next insn that uses
16439 register number REGNO0 in a memory address. Return -1 if no such
16440 use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set. */
16441
16442 static int
16443 distance_agu_use (unsigned int regno0, rtx insn)
16444 {
16445 basic_block bb = BLOCK_FOR_INSN (insn);
16446 int distance = 0;
16447 bool found = false;
16448 bool redefined = false;
16449
16450 if (insn != BB_END (bb))
16451 distance = distance_agu_use_in_bb (regno0, insn, distance,
16452 NEXT_INSN (insn),
16453 &found, &redefined);
16454
16455 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16456 {
16457 edge e;
16458 edge_iterator ei;
16459 bool simple_loop = false;
16460
16461 FOR_EACH_EDGE (e, ei, bb->succs)
16462 if (e->dest == bb)
16463 {
16464 simple_loop = true;
16465 break;
16466 }
16467
16468 if (simple_loop)
16469 distance = distance_agu_use_in_bb (regno0, insn,
16470 distance, BB_HEAD (bb),
16471 &found, &redefined);
16472 else
16473 {
16474 int shortest_dist = -1;
16475 bool found_in_bb = false;
16476 bool redefined_in_bb = false;
16477
16478 FOR_EACH_EDGE (e, ei, bb->succs)
16479 {
16480 int bb_dist
16481 = distance_agu_use_in_bb (regno0, insn,
16482 distance, BB_HEAD (e->dest),
16483 &found_in_bb, &redefined_in_bb);
16484 if (found_in_bb)
16485 {
16486 if (shortest_dist < 0)
16487 shortest_dist = bb_dist;
16488 else if (bb_dist > 0)
16489 shortest_dist = MIN (bb_dist, shortest_dist);
16490
16491 found = true;
16492 }
16493 }
16494
16495 distance = shortest_dist;
16496 }
16497 }
16498
16499 if (!found || redefined)
16500 return -1;
16501
16502 return distance >> 1;
16503 }
16504
16505 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16506 there is a choice between LEA and ADD.
16507 Negative value: ADD is preferred over LEA
16508 Zero: Neutral
16509 Positive value: LEA is preferred over ADD. */
16510 #define IX86_LEA_PRIORITY 0
16511
16512 /* Return true if using lea INSN has a performance advantage over
16513 the equivalent instruction sequence. That sequence has SPLIT_COST
16514 cycles higher latency than the lea itself. */
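/* Worked example (illustrative, with IX86_LEA_PRIORITY == 0): if the last
   non-AGU definition of an input register is 1 cycle back (dist_define == 1)
   and the lea result is used in an address 2 cycles ahead (dist_use == 2)
   with SPLIT_COST == 1, then dist_define becomes 1 + 1 = 2 and 2 >= 2, so
   the lea is kept.  Were the address use 3 cycles away instead, 2 >= 3
   would fail and the caller would split the lea.  */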
16515
16516 bool
16517 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16518 unsigned int regno2, unsigned int split_cost)
16519 {
16520 int dist_define, dist_use;
16521
16522 dist_define = distance_non_agu_define (regno1, regno2, insn);
16523 dist_use = distance_agu_use (regno0, insn);
16524
16525 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16526 {
16527 /* If there is no non-AGU operand definition, no AGU
16528 operand usage and the split cost is 0, then both the lea
16529 and non-lea variants have the same priority. Currently
16530 we prefer lea for 64-bit code and non-lea for 32-bit
16531 code. */
16532 if (dist_use < 0 && split_cost == 0)
16533 return TARGET_64BIT || IX86_LEA_PRIORITY;
16534 else
16535 return true;
16536 }
16537
16538 /* The longer the definition distance, the more preferable lea is.
16539 Here we adjust it to take into account the splitting cost and
16540 lea priority. */
16541 dist_define += split_cost + IX86_LEA_PRIORITY;
16542
16543 /* If there is no use in a memory address then we just check
16544 that the split cost does not exceed the AGU stall. */
16545 if (dist_use < 0)
16546 return dist_define >= LEA_MAX_STALL;
16547
16548 /* If this insn has both backward non-agu dependence and forward
16549 agu dependence, the one with the shorter distance takes effect. */
16550 return dist_define >= dist_use;
16551 }
16552
16553 /* Return true if it is legal to clobber flags by INSN and
16554 false otherwise. */
16555
16556 static bool
16557 ix86_ok_to_clobber_flags (rtx insn)
16558 {
16559 basic_block bb = BLOCK_FOR_INSN (insn);
16560 df_ref *use;
16561 bitmap live;
16562
16563 while (insn)
16564 {
16565 if (NONDEBUG_INSN_P (insn))
16566 {
16567 for (use = DF_INSN_USES (insn); *use; use++)
16568 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16569 return false;
16570
16571 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16572 return true;
16573 }
16574
16575 if (insn == BB_END (bb))
16576 break;
16577
16578 insn = NEXT_INSN (insn);
16579 }
16580
16581 live = df_get_live_out (bb);
16582 return !REGNO_REG_SET_P (live, FLAGS_REG);
16583 }
16584
16585 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16586 move and add to avoid AGU stalls. */
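/* Illustrative shape of the split (hypothetical registers): a
   three-operand add such as "lea (%rbx,%rcx), %rax" becomes roughly
   "mov %rbx, %rax; add %rcx, %rax", trading the AGU-executed lea for
   two ALU instructions.  */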
16587
16588 bool
16589 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16590 {
16591 unsigned int regno0 = true_regnum (operands[0]);
16592 unsigned int regno1 = true_regnum (operands[1]);
16593 unsigned int regno2 = true_regnum (operands[2]);
16594
16595 /* Check if we need to optimize. */
16596 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16597 return false;
16598
16599 /* Check it is correct to split here. */
16600 if (!ix86_ok_to_clobber_flags (insn))
16601 return false;
16602
16603 /* We only need to split adds with a non-destructive
16604 destination operand. */
16605 if (regno0 == regno1 || regno0 == regno2)
16606 return false;
16607 else
16608 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16609 }
16610
16611 /* Return true if we should emit lea instruction instead of mov
16612 instruction. */
16613
16614 bool
16615 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16616 {
16617 unsigned int regno0;
16618 unsigned int regno1;
16619
16620 /* Check if we need to optimize. */
16621 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16622 return false;
16623
16624 /* Use lea for reg to reg moves only. */
16625 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16626 return false;
16627
16628 regno0 = true_regnum (operands[0]);
16629 regno1 = true_regnum (operands[1]);
16630
16631 return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
16632 }
16633
16634 /* Return true if we need to split lea into a sequence of
16635 instructions to avoid AGU stalls. */
16636
16637 bool
16638 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16639 {
16640 unsigned int regno0 = true_regnum (operands[0]);
16641 unsigned int regno1 = -1;
16642 unsigned int regno2 = -1;
16643 unsigned int split_cost = 0;
16644 struct ix86_address parts;
16645 int ok;
16646
16647 /* Check if we need to optimize. */
16648 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16649 return false;
16650
16651 /* Check it is correct to split here. */
16652 if (!ix86_ok_to_clobber_flags (insn))
16653 return false;
16654
16655 ok = ix86_decompose_address (operands[1], &parts);
16656 gcc_assert (ok);
16657
16658 /* We should not split into adds if a non-legitimate pic
16659 operand is used as the displacement. */
16660 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16661 return false;
16662
16663 if (parts.base)
16664 regno1 = true_regnum (parts.base);
16665 if (parts.index)
16666 regno2 = true_regnum (parts.index);
16667
16668 /* Compute how many cycles we will add to execution time
16669 if we split the lea into a sequence of instructions. */
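/* Worked example (illustrative): for "lea disp(base,index,4), dest" with
   dest distinct from base and index, the split needs a mov (+1), an add
   of base (+1), a shift for the scale (+1) and an add of disp (+1),
   minus the cost of the lea itself (-1), i.e. a split cost of 3.  */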
16670 if (parts.base || parts.index)
16671 {
16672 /* Have to use a mov instruction if the non-destructive
16673 destination form is used. */
16674 if (regno1 != regno0 && regno2 != regno0)
16675 split_cost += 1;
16676
16677 /* Have to add index to base if both exist. */
16678 if (parts.base && parts.index)
16679 split_cost += 1;
16680
16681 /* Have to use shift and adds if scale is 2 or greater. */
16682 if (parts.scale > 1)
16683 {
16684 if (regno0 != regno1)
16685 split_cost += 1;
16686 else if (regno2 == regno0)
16687 split_cost += 4;
16688 else
16689 split_cost += parts.scale;
16690 }
16691
16692 /* Have to use an add instruction with an immediate if
16693 disp is non-zero. */
16694 if (parts.disp && parts.disp != const0_rtx)
16695 split_cost += 1;
16696
16697 /* Subtract the price of lea. */
16698 split_cost -= 1;
16699 }
16700
16701 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16702 }
16703
16704 /* Emit x86 binary operand CODE in mode MODE, where the first operand
16705 matches destination. RTX includes clobber of FLAGS_REG. */
16706
16707 static void
16708 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16709 rtx dst, rtx src)
16710 {
16711 rtx op, clob;
16712
16713 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16714 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16715
16716 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16717 }
16718
16719 /* Split lea instructions into a sequence of instructions
16720 which are executed on ALU to avoid AGU stalls.
16721 It is assumed that it is allowed to clobber flags register
16722 at lea position. */
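/* Illustrative example (hypothetical registers) of the kind of sequence
   produced here: "lea 8(%rbx,%rcx,4), %rax", with %rax distinct from
   both inputs, becomes roughly

	mov %rcx, %rax
	shl $2, %rax
	add %rbx, %rax
	add $8, %rax  */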
16723
16724 extern void
16725 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16726 {
16727 unsigned int regno0 = true_regnum (operands[0]);
16728 unsigned int regno1 = INVALID_REGNUM;
16729 unsigned int regno2 = INVALID_REGNUM;
16730 struct ix86_address parts;
16731 rtx tmp;
16732 int ok, adds;
16733
16734 ok = ix86_decompose_address (operands[1], &parts);
16735 gcc_assert (ok);
16736
16737 if (parts.base)
16738 {
16739 if (GET_MODE (parts.base) != mode)
16740 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16741 regno1 = true_regnum (parts.base);
16742 }
16743
16744 if (parts.index)
16745 {
16746 if (GET_MODE (parts.index) != mode)
16747 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16748 regno2 = true_regnum (parts.index);
16749 }
16750
16751 if (parts.scale > 1)
16752 {
16753 /* Case r1 = r1 + ... */
16754 if (regno1 == regno0)
16755 {
16756 /* If we have the case r1 = r1 + C * r1 then we
16757 would have to use multiplication, which is very
16758 expensive. Assume the cost model is wrong if we
16759 reach such a case here. */
16760 gcc_assert (regno2 != regno0);
16761
16762 for (adds = parts.scale; adds > 0; adds--)
16763 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16764 }
16765 else
16766 {
16767 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
16768 if (regno0 != regno2)
16769 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16770
16771 /* Use shift for scaling. */
16772 ix86_emit_binop (ASHIFT, mode, operands[0],
16773 GEN_INT (exact_log2 (parts.scale)));
16774
16775 if (parts.base)
16776 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
16777
16778 if (parts.disp && parts.disp != const0_rtx)
16779 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16780 }
16781 }
16782 else if (!parts.base && !parts.index)
16783 {
16784 gcc_assert (parts.disp);
16785 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
16786 }
16787 else
16788 {
16789 if (!parts.base)
16790 {
16791 if (regno0 != regno2)
16792 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16793 }
16794 else if (!parts.index)
16795 {
16796 if (regno0 != regno1)
16797 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16798 }
16799 else
16800 {
16801 if (regno0 == regno1)
16802 tmp = parts.index;
16803 else if (regno0 == regno2)
16804 tmp = parts.base;
16805 else
16806 {
16807 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16808 tmp = parts.index;
16809 }
16810
16811 ix86_emit_binop (PLUS, mode, operands[0], tmp);
16812 }
16813
16814 if (parts.disp && parts.disp != const0_rtx)
16815 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16816 }
16817 }
16818
16819 /* Return true if it is ok to optimize an ADD operation to an LEA
16820 operation to avoid flag register consumption. For most processors,
16821 ADD is faster than LEA. For processors like ATOM, if the
16822 destination register of the LEA holds an actual address which will be
16823 used soon, LEA is better; otherwise ADD is better. */
16824
16825 bool
16826 ix86_lea_for_add_ok (rtx insn, rtx operands[])
16827 {
16828 unsigned int regno0 = true_regnum (operands[0]);
16829 unsigned int regno1 = true_regnum (operands[1]);
16830 unsigned int regno2 = true_regnum (operands[2]);
16831
16832 /* If a = b + c, (a!=b && a!=c), must use lea form. */
16833 if (regno0 != regno1 && regno0 != regno2)
16834 return true;
16835
16836 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16837 return false;
16838
16839 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
16840 }
16841
16842 /* Return true if destination reg of SET_BODY is shift count of
16843 USE_BODY. */
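/* For example (illustrative): if SET_BODY is the body of "mov %eax, %ecx"
   and USE_BODY is the body of "shl %cl, %edx", the set's destination %ecx
   is the shift count of the use, so true is returned.  */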
16844
16845 static bool
16846 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
16847 {
16848 rtx set_dest;
16849 rtx shift_rtx;
16850 int i;
16851
16852 /* Retrieve destination of SET_BODY. */
16853 switch (GET_CODE (set_body))
16854 {
16855 case SET:
16856 set_dest = SET_DEST (set_body);
16857 if (!set_dest || !REG_P (set_dest))
16858 return false;
16859 break;
16860 case PARALLEL:
16861 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
16862 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
16863 use_body))
16864 return true;
16865 default:
16866 return false;
16867 break;
16868 }
16869
16870 /* Retrieve shift count of USE_BODY. */
16871 switch (GET_CODE (use_body))
16872 {
16873 case SET:
16874 shift_rtx = XEXP (use_body, 1);
16875 break;
16876 case PARALLEL:
16877 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
16878 if (ix86_dep_by_shift_count_body (set_body,
16879 XVECEXP (use_body, 0, i)))
16880 return true;
16881 default:
16882 return false;
16883 break;
16884 }
16885
16886 if (shift_rtx
16887 && (GET_CODE (shift_rtx) == ASHIFT
16888 || GET_CODE (shift_rtx) == LSHIFTRT
16889 || GET_CODE (shift_rtx) == ASHIFTRT
16890 || GET_CODE (shift_rtx) == ROTATE
16891 || GET_CODE (shift_rtx) == ROTATERT))
16892 {
16893 rtx shift_count = XEXP (shift_rtx, 1);
16894
16895 /* Return true if shift count is dest of SET_BODY. */
16896 if (REG_P (shift_count)
16897 && true_regnum (set_dest) == true_regnum (shift_count))
16898 return true;
16899 }
16900
16901 return false;
16902 }
16903
16904 /* Return true if destination reg of SET_INSN is shift count of
16905 USE_INSN. */
16906
16907 bool
16908 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
16909 {
16910 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
16911 PATTERN (use_insn));
16912 }
16913
16914 /* Return TRUE or FALSE depending on whether the unary operator meets the
16915 appropriate constraints. */
16916
16917 bool
16918 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
16919 enum machine_mode mode ATTRIBUTE_UNUSED,
16920 rtx operands[2] ATTRIBUTE_UNUSED)
16921 {
16922 /* If one of operands is memory, source and destination must match. */
16923 if ((MEM_P (operands[0])
16924 || MEM_P (operands[1]))
16925 && ! rtx_equal_p (operands[0], operands[1]))
16926 return false;
16927 return true;
16928 }
16929
16930 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
16931 are ok, keeping in mind the possible movddup alternative. */
16932
16933 bool
16934 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
16935 {
16936 if (MEM_P (operands[0]))
16937 return rtx_equal_p (operands[0], operands[1 + high]);
16938 if (MEM_P (operands[1]) && MEM_P (operands[2]))
16939 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
16940 return true;
16941 }
16942
16943 /* Post-reload splitter for converting an SF or DFmode value in an
16944 SSE register into an unsigned SImode. */
16945
16946 void
16947 ix86_split_convert_uns_si_sse (rtx operands[])
16948 {
16949 enum machine_mode vecmode;
16950 rtx value, large, zero_or_two31, input, two31, x;
16951
16952 large = operands[1];
16953 zero_or_two31 = operands[2];
16954 input = operands[3];
16955 two31 = operands[4];
16956 vecmode = GET_MODE (large);
16957 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
16958
16959 /* Load up the value into the low element. We must ensure that the other
16960 elements are valid floats -- zero is the easiest such value. */
16961 if (MEM_P (input))
16962 {
16963 if (vecmode == V4SFmode)
16964 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
16965 else
16966 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
16967 }
16968 else
16969 {
16970 input = gen_rtx_REG (vecmode, REGNO (input));
16971 emit_move_insn (value, CONST0_RTX (vecmode));
16972 if (vecmode == V4SFmode)
16973 emit_insn (gen_sse_movss (value, value, input));
16974 else
16975 emit_insn (gen_sse2_movsd (value, value, input));
16976 }
16977
16978 emit_move_insn (large, two31);
16979 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
16980
16981 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
16982 emit_insn (gen_rtx_SET (VOIDmode, large, x));
16983
16984 x = gen_rtx_AND (vecmode, zero_or_two31, large);
16985 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
16986
16987 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
16988 emit_insn (gen_rtx_SET (VOIDmode, value, x));
16989
16990 large = gen_rtx_REG (V4SImode, REGNO (large));
16991 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
16992
16993 x = gen_rtx_REG (V4SImode, REGNO (value));
16994 if (vecmode == V4SFmode)
16995 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
16996 else
16997 emit_insn (gen_sse2_cvttpd2dq (x, value));
16998 value = x;
16999
17000 emit_insn (gen_xorv4si3 (value, value, large));
17001 }
17002
17003 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17004 Expects the 64-bit DImode to be supplied in a pair of integral
17005 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17006 -mfpmath=sse, !optimize_size only. */
17007
17008 void
17009 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17010 {
17011 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17012 rtx int_xmm, fp_xmm;
17013 rtx biases, exponents;
17014 rtx x;
17015
17016 int_xmm = gen_reg_rtx (V4SImode);
17017 if (TARGET_INTER_UNIT_MOVES)
17018 emit_insn (gen_movdi_to_sse (int_xmm, input));
17019 else if (TARGET_SSE_SPLIT_REGS)
17020 {
17021 emit_clobber (int_xmm);
17022 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17023 }
17024 else
17025 {
17026 x = gen_reg_rtx (V2DImode);
17027 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17028 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17029 }
17030
17031 x = gen_rtx_CONST_VECTOR (V4SImode,
17032 gen_rtvec (4, GEN_INT (0x43300000UL),
17033 GEN_INT (0x45300000UL),
17034 const0_rtx, const0_rtx));
17035 exponents = validize_mem (force_const_mem (V4SImode, x));
17036
17037 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17038 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17039
17040 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17041 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17042 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17043 (0x1.0p84 + double(fp_value_hi_xmm)).
17044 Note these exponents differ by 32. */
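/* Worked example (illustrative): for the input 0x0000000300000002
   (hi = 3, lo = 2) the two doubles formed above are 0x1.0p52 + 2 and
   0x1.0p84 + 3 * 0x1.0p32.  After the bias subtraction below they become
   2.0 and 3 * 2^32, and their sum is 12884901890.0, the exact value of
   the unsigned 64-bit input.  */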
17045
17046 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17047
17048 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17049 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17050 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17051 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17052 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17053 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17054 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17055 biases = validize_mem (force_const_mem (V2DFmode, biases));
17056 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17057
17058 /* Add the upper and lower DFmode values together. */
17059 if (TARGET_SSE3)
17060 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17061 else
17062 {
17063 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17064 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17065 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17066 }
17067
17068 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17069 }
17070
17071 /* Not used, but eases macroization of patterns. */
17072 void
17073 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17074 rtx input ATTRIBUTE_UNUSED)
17075 {
17076 gcc_unreachable ();
17077 }
17078
17079 /* Convert an unsigned SImode value into a DFmode. Only currently used
17080 for SSE, but applicable anywhere. */
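/* A sketch of the trick used below: the input is shifted into signed
   range by adding -2^31 (with wraparound), converted with the ordinary
   signed SImode->DFmode conversion, and then 2^31 is added back as a
   double.  E.g. input 0xffffffff wraps to 0x7fffffff == 2147483647,
   converts to 2147483647.0, and adding 2^31 yields 4294967295.0.  */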
17081
17082 void
17083 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17084 {
17085 REAL_VALUE_TYPE TWO31r;
17086 rtx x, fp;
17087
17088 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17089 NULL, 1, OPTAB_DIRECT);
17090
17091 fp = gen_reg_rtx (DFmode);
17092 emit_insn (gen_floatsidf2 (fp, x));
17093
17094 real_ldexp (&TWO31r, &dconst1, 31);
17095 x = const_double_from_real_value (TWO31r, DFmode);
17096
17097 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17098 if (x != target)
17099 emit_move_insn (target, x);
17100 }
17101
17102 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17103 32-bit mode; otherwise we have a direct convert instruction. */
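/* A sketch of the computation below: the result is
   (double) (signed) hi * 0x1.0p32 + (double) (unsigned) lo, where hi and
   lo are the high and low SImode halves of the input.  */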
17104
17105 void
17106 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17107 {
17108 REAL_VALUE_TYPE TWO32r;
17109 rtx fp_lo, fp_hi, x;
17110
17111 fp_lo = gen_reg_rtx (DFmode);
17112 fp_hi = gen_reg_rtx (DFmode);
17113
17114 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17115
17116 real_ldexp (&TWO32r, &dconst1, 32);
17117 x = const_double_from_real_value (TWO32r, DFmode);
17118 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17119
17120 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17121
17122 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17123 0, OPTAB_DIRECT);
17124 if (x != target)
17125 emit_move_insn (target, x);
17126 }
17127
17128 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17129 For x86_32, -mfpmath=sse, !optimize_size only. */
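/* A sketch of the computation below: the input is split into 16-bit
   halves, each of which converts to SFmode exactly, and is recombined as
   (float) (input >> 16) * 65536.0 + (float) (input & 0xffff).  E.g.
   0x12345678 gives 4660 * 65536 + 22136 == 305419896, up to the final
   SFmode rounding.  */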
17130 void
17131 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17132 {
17133 REAL_VALUE_TYPE ONE16r;
17134 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17135
17136 real_ldexp (&ONE16r, &dconst1, 16);
17137 x = const_double_from_real_value (ONE16r, SFmode);
17138 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
17139 NULL, 0, OPTAB_DIRECT);
17140 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
17141 NULL, 0, OPTAB_DIRECT);
17142 fp_hi = gen_reg_rtx (SFmode);
17143 fp_lo = gen_reg_rtx (SFmode);
17144 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17145 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17146 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17147 0, OPTAB_DIRECT);
17148 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17149 0, OPTAB_DIRECT);
17150 if (!rtx_equal_p (target, fp_hi))
17151 emit_move_insn (target, fp_hi);
17152 }
17153
17154 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17155 a vector of unsigned ints VAL to vector of floats TARGET. */
17156
17157 void
17158 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17159 {
17160 rtx tmp[8];
17161 REAL_VALUE_TYPE TWO16r;
17162 enum machine_mode intmode = GET_MODE (val);
17163 enum machine_mode fltmode = GET_MODE (target);
17164 rtx (*cvt) (rtx, rtx);
17165
17166 if (intmode == V4SImode)
17167 cvt = gen_floatv4siv4sf2;
17168 else
17169 cvt = gen_floatv8siv8sf2;
17170 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17171 tmp[0] = force_reg (intmode, tmp[0]);
17172 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17173 OPTAB_DIRECT);
17174 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17175 NULL_RTX, 1, OPTAB_DIRECT);
17176 tmp[3] = gen_reg_rtx (fltmode);
17177 emit_insn (cvt (tmp[3], tmp[1]));
17178 tmp[4] = gen_reg_rtx (fltmode);
17179 emit_insn (cvt (tmp[4], tmp[2]));
17180 real_ldexp (&TWO16r, &dconst1, 16);
17181 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17182 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17183 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17184 OPTAB_DIRECT);
17185 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17186 OPTAB_DIRECT);
17187 if (tmp[7] != target)
17188 emit_move_insn (target, tmp[7]);
17189 }
17190
17191 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17192 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17193 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17194 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
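/* Worked example (illustrative, one lane): for val == 3e9, which is
   >= 0x1p31, the returned lane is 3e9 - 2^31 == 852516352.0, the signed
   conversion then gives 852516352, and xoring in the 0x80000000 produced
   in *XORP restores 3000000000.  A lane below 0x1p31 is left unchanged
   and its xor mask is 0.  */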
17195
17196 rtx
17197 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17198 {
17199 REAL_VALUE_TYPE TWO31r;
17200 rtx two31r, tmp[4];
17201 enum machine_mode mode = GET_MODE (val);
17202 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17203 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17204 rtx (*cmp) (rtx, rtx, rtx, rtx);
17205 int i;
17206
17207 for (i = 0; i < 3; i++)
17208 tmp[i] = gen_reg_rtx (mode);
17209 real_ldexp (&TWO31r, &dconst1, 31);
17210 two31r = const_double_from_real_value (TWO31r, scalarmode);
17211 two31r = ix86_build_const_vector (mode, 1, two31r);
17212 two31r = force_reg (mode, two31r);
17213 switch (mode)
17214 {
17215 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17216 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17217 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17218 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17219 default: gcc_unreachable ();
17220 }
17221 tmp[3] = gen_rtx_LE (mode, two31r, val);
17222 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17223 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17224 0, OPTAB_DIRECT);
17225 if (intmode == V4SImode || TARGET_AVX2)
17226 *xorp = expand_simple_binop (intmode, ASHIFT,
17227 gen_lowpart (intmode, tmp[0]),
17228 GEN_INT (31), NULL_RTX, 0,
17229 OPTAB_DIRECT);
17230 else
17231 {
17232 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17233 two31 = ix86_build_const_vector (intmode, 1, two31);
17234 *xorp = expand_simple_binop (intmode, AND,
17235 gen_lowpart (intmode, tmp[0]),
17236 two31, NULL_RTX, 0,
17237 OPTAB_DIRECT);
17238 }
17239 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17240 0, OPTAB_DIRECT);
17241 }
17242
17243 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17244 then replicate the value for all elements of the vector
17245 register. */
17246
17247 rtx
17248 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17249 {
17250 int i, n_elt;
17251 rtvec v;
17252 enum machine_mode scalar_mode;
17253
17254 switch (mode)
17255 {
17256 case V32QImode:
17257 case V16QImode:
17258 case V16HImode:
17259 case V8HImode:
17260 case V8SImode:
17261 case V4SImode:
17262 case V4DImode:
17263 case V2DImode:
17264 gcc_assert (vect);
17265 case V8SFmode:
17266 case V4SFmode:
17267 case V4DFmode:
17268 case V2DFmode:
17269 n_elt = GET_MODE_NUNITS (mode);
17270 v = rtvec_alloc (n_elt);
17271 scalar_mode = GET_MODE_INNER (mode);
17272
17273 RTVEC_ELT (v, 0) = value;
17274
17275 for (i = 1; i < n_elt; ++i)
17276 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17277
17278 return gen_rtx_CONST_VECTOR (mode, v);
17279
17280 default:
17281 gcc_unreachable ();
17282 }
17283 }
17284
17285 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17286 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17287 for an SSE register. If VECT is true, then replicate the mask for
17288 all elements of the vector register. If INVERT is true, then create
17289 a mask excluding the sign bit. */
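/* For example (illustrative): for V4SFmode with VECT true this builds
   { 0x80000000, 0x80000000, 0x80000000, 0x80000000 } viewed as floats;
   with INVERT true it builds { 0x7fffffff, ... } instead.  */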
17290
17291 rtx
17292 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17293 {
17294 enum machine_mode vec_mode, imode;
17295 HOST_WIDE_INT hi, lo;
17296 int shift = 63;
17297 rtx v;
17298 rtx mask;
17299
17300 /* Find the sign bit, sign extended to 2*HWI. */
17301 switch (mode)
17302 {
17303 case V8SImode:
17304 case V4SImode:
17305 case V8SFmode:
17306 case V4SFmode:
17307 vec_mode = mode;
17308 mode = GET_MODE_INNER (mode);
17309 imode = SImode;
17310 lo = 0x80000000, hi = lo < 0;
17311 break;
17312
17313 case V4DImode:
17314 case V2DImode:
17315 case V4DFmode:
17316 case V2DFmode:
17317 vec_mode = mode;
17318 mode = GET_MODE_INNER (mode);
17319 imode = DImode;
17320 if (HOST_BITS_PER_WIDE_INT >= 64)
17321 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17322 else
17323 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17324 break;
17325
17326 case TImode:
17327 case TFmode:
17328 vec_mode = VOIDmode;
17329 if (HOST_BITS_PER_WIDE_INT >= 64)
17330 {
17331 imode = TImode;
17332 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17333 }
17334 else
17335 {
17336 rtvec vec;
17337
17338 imode = DImode;
17339 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17340
17341 if (invert)
17342 {
17343 lo = ~lo, hi = ~hi;
17344 v = constm1_rtx;
17345 }
17346 else
17347 v = const0_rtx;
17348
17349 mask = immed_double_const (lo, hi, imode);
17350
17351 vec = gen_rtvec (2, v, mask);
17352 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17353 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17354
17355 return v;
17356 }
17357 break;
17358
17359 default:
17360 gcc_unreachable ();
17361 }
17362
17363 if (invert)
17364 lo = ~lo, hi = ~hi;
17365
17366 /* Force this value into the low part of a fp vector constant. */
17367 mask = immed_double_const (lo, hi, imode);
17368 mask = gen_lowpart (mode, mask);
17369
17370 if (vec_mode == VOIDmode)
17371 return force_reg (mode, mask);
17372
17373 v = ix86_build_const_vector (vec_mode, vect, mask);
17374 return force_reg (vec_mode, v);
17375 }
17376
17377 /* Generate code for floating point ABS or NEG. */
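/* Conceptually (a sketch; the actual bit operations are emitted by the
   splitters that consume the mask built here): with SSE, abs (x) clears
   the sign bit by ANDing with the inverted sign-bit mask (0x7fffffff...)
   and neg (x) flips it by XORing with the sign-bit mask (0x80000000...).  */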
17378
17379 void
17380 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17381 rtx operands[])
17382 {
17383 rtx mask, set, dst, src;
17384 bool use_sse = false;
17385 bool vector_mode = VECTOR_MODE_P (mode);
17386 enum machine_mode vmode = mode;
17387
17388 if (vector_mode)
17389 use_sse = true;
17390 else if (mode == TFmode)
17391 use_sse = true;
17392 else if (TARGET_SSE_MATH)
17393 {
17394 use_sse = SSE_FLOAT_MODE_P (mode);
17395 if (mode == SFmode)
17396 vmode = V4SFmode;
17397 else if (mode == DFmode)
17398 vmode = V2DFmode;
17399 }
17400
17401 /* NEG and ABS performed with SSE use bitwise mask operations.
17402 Create the appropriate mask now. */
17403 if (use_sse)
17404 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17405 else
17406 mask = NULL_RTX;
17407
17408 dst = operands[0];
17409 src = operands[1];
17410
17411 set = gen_rtx_fmt_e (code, mode, src);
17412 set = gen_rtx_SET (VOIDmode, dst, set);
17413
17414 if (mask)
17415 {
17416 rtx use, clob;
17417 rtvec par;
17418
17419 use = gen_rtx_USE (VOIDmode, mask);
17420 if (vector_mode)
17421 par = gen_rtvec (2, set, use);
17422 else
17423 {
17424 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17425 par = gen_rtvec (3, set, use, clob);
17426 }
17427 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17428 }
17429 else
17430 emit_insn (set);
17431 }
17432
17433 /* Expand a copysign operation. Special case operand 0 being a constant. */
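/* Conceptually (a sketch):
   copysign (x, y) == (x & ~SIGNMASK) | (y & SIGNMASK),
   where SIGNMASK has only the sign bit set in each element; the code
   below builds the masks and leaves the bit operations to the
   copysign*3_const and copysign*3_var patterns.  */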
17434
17435 void
17436 ix86_expand_copysign (rtx operands[])
17437 {
17438 enum machine_mode mode, vmode;
17439 rtx dest, op0, op1, mask, nmask;
17440
17441 dest = operands[0];
17442 op0 = operands[1];
17443 op1 = operands[2];
17444
17445 mode = GET_MODE (dest);
17446
17447 if (mode == SFmode)
17448 vmode = V4SFmode;
17449 else if (mode == DFmode)
17450 vmode = V2DFmode;
17451 else
17452 vmode = mode;
17453
17454 if (GET_CODE (op0) == CONST_DOUBLE)
17455 {
17456 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17457
17458 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17459 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17460
17461 if (mode == SFmode || mode == DFmode)
17462 {
17463 if (op0 == CONST0_RTX (mode))
17464 op0 = CONST0_RTX (vmode);
17465 else
17466 {
17467 rtx v = ix86_build_const_vector (vmode, false, op0);
17468
17469 op0 = force_reg (vmode, v);
17470 }
17471 }
17472 else if (op0 != CONST0_RTX (mode))
17473 op0 = force_reg (mode, op0);
17474
17475 mask = ix86_build_signbit_mask (vmode, 0, 0);
17476
17477 if (mode == SFmode)
17478 copysign_insn = gen_copysignsf3_const;
17479 else if (mode == DFmode)
17480 copysign_insn = gen_copysigndf3_const;
17481 else
17482 copysign_insn = gen_copysigntf3_const;
17483
17484 emit_insn (copysign_insn (dest, op0, op1, mask));
17485 }
17486 else
17487 {
17488 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17489
17490 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17491 mask = ix86_build_signbit_mask (vmode, 0, 0);
17492
17493 if (mode == SFmode)
17494 copysign_insn = gen_copysignsf3_var;
17495 else if (mode == DFmode)
17496 copysign_insn = gen_copysigndf3_var;
17497 else
17498 copysign_insn = gen_copysigntf3_var;
17499
17500 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17501 }
17502 }
17503
17504 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17505 be a constant, and so has already been expanded into a vector constant. */
17506
17507 void
17508 ix86_split_copysign_const (rtx operands[])
17509 {
17510 enum machine_mode mode, vmode;
17511 rtx dest, op0, mask, x;
17512
17513 dest = operands[0];
17514 op0 = operands[1];
17515 mask = operands[3];
17516
17517 mode = GET_MODE (dest);
17518 vmode = GET_MODE (mask);
17519
17520 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17521 x = gen_rtx_AND (vmode, dest, mask);
17522 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17523
17524 if (op0 != CONST0_RTX (vmode))
17525 {
17526 x = gen_rtx_IOR (vmode, dest, op0);
17527 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17528 }
17529 }
17530
17531 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17532 so we have to do two masks. */
17533
17534 void
17535 ix86_split_copysign_var (rtx operands[])
17536 {
17537 enum machine_mode mode, vmode;
17538 rtx dest, scratch, op0, op1, mask, nmask, x;
17539
17540 dest = operands[0];
17541 scratch = operands[1];
17542 op0 = operands[2];
17543 op1 = operands[3];
17544 nmask = operands[4];
17545 mask = operands[5];
17546
17547 mode = GET_MODE (dest);
17548 vmode = GET_MODE (mask);
17549
17550 if (rtx_equal_p (op0, op1))
17551 {
17552 /* Shouldn't happen often (it's useless, obviously), but when it does
17553 we'd generate incorrect code if we continue below. */
17554 emit_move_insn (dest, op0);
17555 return;
17556 }
17557
17558 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17559 {
17560 gcc_assert (REGNO (op1) == REGNO (scratch));
17561
17562 x = gen_rtx_AND (vmode, scratch, mask);
17563 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17564
17565 dest = mask;
17566 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17567 x = gen_rtx_NOT (vmode, dest);
17568 x = gen_rtx_AND (vmode, x, op0);
17569 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17570 }
17571 else
17572 {
17573 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17574 {
17575 x = gen_rtx_AND (vmode, scratch, mask);
17576 }
17577 else /* alternative 2,4 */
17578 {
17579 gcc_assert (REGNO (mask) == REGNO (scratch));
17580 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17581 x = gen_rtx_AND (vmode, scratch, op1);
17582 }
17583 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17584
17585 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17586 {
17587 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17588 x = gen_rtx_AND (vmode, dest, nmask);
17589 }
17590 else /* alternative 3,4 */
17591 {
17592 gcc_assert (REGNO (nmask) == REGNO (dest));
17593 dest = nmask;
17594 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17595 x = gen_rtx_AND (vmode, dest, op0);
17596 }
17597 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17598 }
17599
17600 x = gen_rtx_IOR (vmode, dest, scratch);
17601 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17602 }
17603
17604 /* Return TRUE or FALSE depending on whether the first SET in INSN
17605 has source and destination with matching CC modes, and that the
17606 CC mode is at least as constrained as REQ_MODE. */
17607
17608 bool
17609 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17610 {
17611 rtx set;
17612 enum machine_mode set_mode;
17613
17614 set = PATTERN (insn);
17615 if (GET_CODE (set) == PARALLEL)
17616 set = XVECEXP (set, 0, 0);
17617 gcc_assert (GET_CODE (set) == SET);
17618 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17619
17620 set_mode = GET_MODE (SET_DEST (set));
17621 switch (set_mode)
17622 {
17623 case CCNOmode:
17624 if (req_mode != CCNOmode
17625 && (req_mode != CCmode
17626 || XEXP (SET_SRC (set), 1) != const0_rtx))
17627 return false;
17628 break;
17629 case CCmode:
17630 if (req_mode == CCGCmode)
17631 return false;
17632 /* FALLTHRU */
17633 case CCGCmode:
17634 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17635 return false;
17636 /* FALLTHRU */
17637 case CCGOCmode:
17638 if (req_mode == CCZmode)
17639 return false;
17640 /* FALLTHRU */
17641 case CCZmode:
17642 break;
17643
17644 case CCAmode:
17645 case CCCmode:
17646 case CCOmode:
17647 case CCSmode:
17648 if (set_mode != req_mode)
17649 return false;
17650 break;
17651
17652 default:
17653 gcc_unreachable ();
17654 }
17655
17656 return GET_MODE (SET_SRC (set)) == set_mode;
17657 }
17658
17659 /* Generate insn patterns to do an integer compare of OPERANDS. */
17660
17661 static rtx
17662 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17663 {
17664 enum machine_mode cmpmode;
17665 rtx tmp, flags;
17666
17667 cmpmode = SELECT_CC_MODE (code, op0, op1);
17668 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17669
17670 /* This is very simple, but making the interface the same as in the
17671 FP case makes the rest of the code easier. */
17672 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17673 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17674
17675 /* Return the test that should be put into the flags user, i.e.
17676 the bcc, scc, or cmov instruction. */
17677 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17678 }
17679
17680 /* Figure out whether to use ordered or unordered fp comparisons.
17681 Return the appropriate mode to use. */
17682
17683 enum machine_mode
17684 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17685 {
17686 /* ??? In order to make all comparisons reversible, we do all comparisons
17687 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17688 between trapping and nontrapping forms of comparisons, we can make
17689 inequality comparisons trapping again, since that results in better code
17690 when using FCOM based compares. */
17691 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17692 }
17693
17694 enum machine_mode
17695 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17696 {
17697 enum machine_mode mode = GET_MODE (op0);
17698
17699 if (SCALAR_FLOAT_MODE_P (mode))
17700 {
17701 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17702 return ix86_fp_compare_mode (code);
17703 }
17704
17705 switch (code)
17706 {
17707 /* Only zero flag is needed. */
17708 case EQ: /* ZF=0 */
17709 case NE: /* ZF!=0 */
17710 return CCZmode;
17711 /* Codes needing carry flag. */
17712 case GEU: /* CF=0 */
17713 case LTU: /* CF=1 */
17714 /* Detect overflow checks. They need just the carry flag. */
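/* E.g. (illustrative) the unsigned overflow idiom "if (a + b < a)"
   compares (plus (a) (b)) against a; only the carry flag matters, so
   CCCmode is sufficient.  */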
17715 if (GET_CODE (op0) == PLUS
17716 && rtx_equal_p (op1, XEXP (op0, 0)))
17717 return CCCmode;
17718 else
17719 return CCmode;
17720 case GTU: /* CF=0 & ZF=0 */
17721 case LEU: /* CF=1 | ZF=1 */
17722 /* Detect overflow checks. They need just the carry flag. */
17723 if (GET_CODE (op0) == MINUS
17724 && rtx_equal_p (op1, XEXP (op0, 0)))
17725 return CCCmode;
17726 else
17727 return CCmode;
17728 /* Codes possibly doable only with sign flag when
17729 comparing against zero. */
17730 case GE: /* SF=OF or SF=0 */
17731 case LT: /* SF<>OF or SF=1 */
17732 if (op1 == const0_rtx)
17733 return CCGOCmode;
17734 else
17735 /* For other cases Carry flag is not required. */
17736 return CCGCmode;
17737 /* Codes doable only with sign flag when comparing
17738 against zero, but we miss jump instruction for it
17739 so we need to use relational tests against overflow
17740 that thus needs to be zero. */
17741 case GT: /* ZF=0 & SF=OF */
17742 case LE: /* ZF=1 | SF<>OF */
17743 if (op1 == const0_rtx)
17744 return CCNOmode;
17745 else
17746 return CCGCmode;
17747 /* The strcmp pattern does (use flags) and combine may ask us for the
17748 proper mode. */
17749 case USE:
17750 return CCmode;
17751 default:
17752 gcc_unreachable ();
17753 }
17754 }
17755
17756 /* Return the fixed registers used for condition codes. */
17757
17758 static bool
17759 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17760 {
17761 *p1 = FLAGS_REG;
17762 *p2 = FPSR_REG;
17763 return true;
17764 }
17765
17766 /* If two condition code modes are compatible, return a condition code
17767 mode which is compatible with both. Otherwise, return
17768 VOIDmode. */
17769
17770 static enum machine_mode
17771 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
17772 {
17773 if (m1 == m2)
17774 return m1;
17775
17776 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
17777 return VOIDmode;
17778
17779 if ((m1 == CCGCmode && m2 == CCGOCmode)
17780 || (m1 == CCGOCmode && m2 == CCGCmode))
17781 return CCGCmode;
17782
17783 switch (m1)
17784 {
17785 default:
17786 gcc_unreachable ();
17787
17788 case CCmode:
17789 case CCGCmode:
17790 case CCGOCmode:
17791 case CCNOmode:
17792 case CCAmode:
17793 case CCCmode:
17794 case CCOmode:
17795 case CCSmode:
17796 case CCZmode:
17797 switch (m2)
17798 {
17799 default:
17800 return VOIDmode;
17801
17802 case CCmode:
17803 case CCGCmode:
17804 case CCGOCmode:
17805 case CCNOmode:
17806 case CCAmode:
17807 case CCCmode:
17808 case CCOmode:
17809 case CCSmode:
17810 case CCZmode:
17811 return CCmode;
17812 }
17813
17814 case CCFPmode:
17815 case CCFPUmode:
17816 /* These are only compatible with themselves, which we already
17817 checked above. */
17818 return VOIDmode;
17819 }
17820 }
17821
17822
17823 /* Return a comparison we can do that is equivalent to
17824 swap_condition (code), apart possibly from orderedness.
17825 However, never change orderedness if TARGET_IEEE_FP; return
17826 UNKNOWN in that case if necessary. */
17827
17828 static enum rtx_code
17829 ix86_fp_swap_condition (enum rtx_code code)
17830 {
17831 switch (code)
17832 {
17833 case GT: /* GTU - CF=0 & ZF=0 */
17834 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
17835 case GE: /* GEU - CF=0 */
17836 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
17837 case UNLT: /* LTU - CF=1 */
17838 return TARGET_IEEE_FP ? UNKNOWN : GT;
17839 case UNLE: /* LEU - CF=1 | ZF=1 */
17840 return TARGET_IEEE_FP ? UNKNOWN : GE;
17841 default:
17842 return swap_condition (code);
17843 }
17844 }
17845
17846 /* Return the cost of comparison CODE using the best strategy for performance.
17847 All of the following functions use the number of instructions as the cost metric.
17848 In the future this should be tweaked to compute bytes for optimize_size and
17849 to take into account the performance of various instructions on various CPUs. */
17850
17851 static int
17852 ix86_fp_comparison_cost (enum rtx_code code)
17853 {
17854 int arith_cost;
17855
17856 /* The cost of code using bit-twiddling on %ah. */
17857 switch (code)
17858 {
17859 case UNLE:
17860 case UNLT:
17861 case LTGT:
17862 case GT:
17863 case GE:
17864 case UNORDERED:
17865 case ORDERED:
17866 case UNEQ:
17867 arith_cost = 4;
17868 break;
17869 case LT:
17870 case NE:
17871 case EQ:
17872 case UNGE:
17873 arith_cost = TARGET_IEEE_FP ? 5 : 4;
17874 break;
17875 case LE:
17876 case UNGT:
17877 arith_cost = TARGET_IEEE_FP ? 6 : 4;
17878 break;
17879 default:
17880 gcc_unreachable ();
17881 }
17882
17883 switch (ix86_fp_comparison_strategy (code))
17884 {
17885 case IX86_FPCMP_COMI:
17886 return arith_cost > 4 ? 3 : 2;
17887 case IX86_FPCMP_SAHF:
17888 return arith_cost > 4 ? 4 : 3;
17889 default:
17890 return arith_cost;
17891 }
17892 }
17893
17894 /* Return the strategy to use for floating-point comparisons. We assume that
17895 fcomi is always preferable where available, since that is also true when
17896 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
17897
17898 enum ix86_fpcmp_strategy
17899 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
17900 {
17901 /* Do fcomi/sahf based test when profitable. */
17902
17903 if (TARGET_CMOVE)
17904 return IX86_FPCMP_COMI;
17905
17906 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
17907 return IX86_FPCMP_SAHF;
17908
17909 return IX86_FPCMP_ARITH;
17910 }
17911
17912 /* Swap, force into registers, or otherwise massage the two operands
17913 to a fp comparison. The operands are updated in place; the new
17914 comparison code is returned. */
17915
17916 static enum rtx_code
17917 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
17918 {
17919 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
17920 rtx op0 = *pop0, op1 = *pop1;
17921 enum machine_mode op_mode = GET_MODE (op0);
17922 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
17923
17924 /* All of the unordered compare instructions only work on registers.
17925 The same is true of the fcomi compare instructions. The XFmode
17926 compare instructions require registers except when comparing
17927 against zero or when converting operand 1 from fixed point to
17928 floating point. */
17929
17930 if (!is_sse
17931 && (fpcmp_mode == CCFPUmode
17932 || (op_mode == XFmode
17933 && ! (standard_80387_constant_p (op0) == 1
17934 || standard_80387_constant_p (op1) == 1)
17935 && GET_CODE (op1) != FLOAT)
17936 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
17937 {
17938 op0 = force_reg (op_mode, op0);
17939 op1 = force_reg (op_mode, op1);
17940 }
17941 else
17942 {
17943 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
17944 things around if they appear profitable, otherwise force op0
17945 into a register. */
17946
17947 if (standard_80387_constant_p (op0) == 0
17948 || (MEM_P (op0)
17949 && ! (standard_80387_constant_p (op1) == 0
17950 || MEM_P (op1))))
17951 {
17952 enum rtx_code new_code = ix86_fp_swap_condition (code);
17953 if (new_code != UNKNOWN)
17954 {
17955 rtx tmp;
17956 tmp = op0, op0 = op1, op1 = tmp;
17957 code = new_code;
17958 }
17959 }
17960
17961 if (!REG_P (op0))
17962 op0 = force_reg (op_mode, op0);
17963
17964 if (CONSTANT_P (op1))
17965 {
17966 int tmp = standard_80387_constant_p (op1);
17967 if (tmp == 0)
17968 op1 = validize_mem (force_const_mem (op_mode, op1));
17969 else if (tmp == 1)
17970 {
17971 if (TARGET_CMOVE)
17972 op1 = force_reg (op_mode, op1);
17973 }
17974 else
17975 op1 = force_reg (op_mode, op1);
17976 }
17977 }
17978
17979 /* Try to rearrange the comparison to make it cheaper. */
17980 if (ix86_fp_comparison_cost (code)
17981 > ix86_fp_comparison_cost (swap_condition (code))
17982 && (REG_P (op1) || can_create_pseudo_p ()))
17983 {
17984 rtx tmp;
17985 tmp = op0, op0 = op1, op1 = tmp;
17986 code = swap_condition (code);
17987 if (!REG_P (op0))
17988 op0 = force_reg (op_mode, op0);
17989 }
17990
17991 *pop0 = op0;
17992 *pop1 = op1;
17993 return code;
17994 }
17995
17996 /* Convert comparison codes we use to represent FP comparison to integer
17997 code that will result in proper branch. Return UNKNOWN if no such code
17998 is available. */
17999
18000 enum rtx_code
18001 ix86_fp_compare_code_to_integer (enum rtx_code code)
18002 {
18003 switch (code)
18004 {
18005 case GT:
18006 return GTU;
18007 case GE:
18008 return GEU;
18009 case ORDERED:
18010 case UNORDERED:
18011 return code;
18012 break;
18013 case UNEQ:
18014 return EQ;
18015 break;
18016 case UNLT:
18017 return LTU;
18018 break;
18019 case UNLE:
18020 return LEU;
18021 break;
18022 case LTGT:
18023 return NE;
18024 break;
18025 default:
18026 return UNKNOWN;
18027 }
18028 }
18029
18030 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18031
18032 static rtx
18033 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18034 {
18035 enum machine_mode fpcmp_mode, intcmp_mode;
18036 rtx tmp, tmp2;
18037
18038 fpcmp_mode = ix86_fp_compare_mode (code);
18039 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18040
18041 /* Do fcomi/sahf based test when profitable. */
18042 switch (ix86_fp_comparison_strategy (code))
18043 {
18044 case IX86_FPCMP_COMI:
18045 intcmp_mode = fpcmp_mode;
18046 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18047 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18048 tmp);
18049 emit_insn (tmp);
18050 break;
18051
18052 case IX86_FPCMP_SAHF:
18053 intcmp_mode = fpcmp_mode;
18054 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18055 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18056 tmp);
18057
18058 if (!scratch)
18059 scratch = gen_reg_rtx (HImode);
18060 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18061 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18062 break;
18063
18064 case IX86_FPCMP_ARITH:
18065 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18066 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18067 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18068 if (!scratch)
18069 scratch = gen_reg_rtx (HImode);
18070 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18071
18072 /* In the unordered case, we have to check C2 for NaN's, which
18073 doesn't happen to work out to anything nice combination-wise.
18074 So do some bit twiddling on the value we've got in AH to come
18075 up with an appropriate set of condition codes. */
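/* For reference (illustrative): after fnstsw the x87 condition bits end
   up in AH as C0 == 0x01, C2 == 0x04 and C3 == 0x40, so the 0x45 masks
   below test C0|C2|C3 and 0x05 tests C0|C2.  */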
18076
18077 intcmp_mode = CCNOmode;
18078 switch (code)
18079 {
18080 case GT:
18081 case UNGT:
18082 if (code == GT || !TARGET_IEEE_FP)
18083 {
18084 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18085 code = EQ;
18086 }
18087 else
18088 {
18089 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18090 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18091 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18092 intcmp_mode = CCmode;
18093 code = GEU;
18094 }
18095 break;
18096 case LT:
18097 case UNLT:
18098 if (code == LT && TARGET_IEEE_FP)
18099 {
18100 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18101 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18102 intcmp_mode = CCmode;
18103 code = EQ;
18104 }
18105 else
18106 {
18107 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18108 code = NE;
18109 }
18110 break;
18111 case GE:
18112 case UNGE:
18113 if (code == GE || !TARGET_IEEE_FP)
18114 {
18115 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18116 code = EQ;
18117 }
18118 else
18119 {
18120 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18121 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18122 code = NE;
18123 }
18124 break;
18125 case LE:
18126 case UNLE:
18127 if (code == LE && TARGET_IEEE_FP)
18128 {
18129 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18130 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18131 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18132 intcmp_mode = CCmode;
18133 code = LTU;
18134 }
18135 else
18136 {
18137 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18138 code = NE;
18139 }
18140 break;
18141 case EQ:
18142 case UNEQ:
18143 if (code == EQ && TARGET_IEEE_FP)
18144 {
18145 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18146 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18147 intcmp_mode = CCmode;
18148 code = EQ;
18149 }
18150 else
18151 {
18152 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18153 code = NE;
18154 }
18155 break;
18156 case NE:
18157 case LTGT:
18158 if (code == NE && TARGET_IEEE_FP)
18159 {
18160 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18161 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18162 GEN_INT (0x40)));
18163 code = NE;
18164 }
18165 else
18166 {
18167 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18168 code = EQ;
18169 }
18170 break;
18171
18172 case UNORDERED:
18173 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18174 code = NE;
18175 break;
18176 case ORDERED:
18177 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18178 code = EQ;
18179 break;
18180
18181 default:
18182 gcc_unreachable ();
18183 }
18184 break;
18185
18186 default:
18187 gcc_unreachable ();
18188 }
18189
18190 /* Return the test that should be put into the flags user, i.e.
18191 the bcc, scc, or cmov instruction. */
18192 return gen_rtx_fmt_ee (code, VOIDmode,
18193 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18194 const0_rtx);
18195 }
18196
18197 static rtx
18198 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18199 {
18200 rtx ret;
18201
18202 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18203 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18204
18205 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18206 {
18207 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18208 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18209 }
18210 else
18211 ret = ix86_expand_int_compare (code, op0, op1);
18212
18213 return ret;
18214 }
18215
18216 void
18217 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18218 {
18219 enum machine_mode mode = GET_MODE (op0);
18220 rtx tmp;
18221
18222 switch (mode)
18223 {
18224 case SFmode:
18225 case DFmode:
18226 case XFmode:
18227 case QImode:
18228 case HImode:
18229 case SImode:
18230 simple:
18231 tmp = ix86_expand_compare (code, op0, op1);
18232 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18233 gen_rtx_LABEL_REF (VOIDmode, label),
18234 pc_rtx);
18235 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18236 return;
18237
18238 case DImode:
18239 if (TARGET_64BIT)
18240 goto simple;
18241 case TImode:
18242 /* Expand a double-word (DImode or TImode) branch into multiple compare+branch. */
18243 {
18244 rtx lo[2], hi[2], label2;
18245 enum rtx_code code1, code2, code3;
18246 enum machine_mode submode;
18247
18248 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18249 {
18250 tmp = op0, op0 = op1, op1 = tmp;
18251 code = swap_condition (code);
18252 }
18253
18254 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18255 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18256
18257 submode = mode == DImode ? SImode : DImode;
18258
18259 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18260 avoid two branches. This costs one extra insn, so disable when
18261 optimizing for size. */
18262
18263 if ((code == EQ || code == NE)
18264 && (!optimize_insn_for_size_p ()
18265 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18266 {
18267 rtx xor0, xor1;
18268
18269 xor1 = hi[0];
18270 if (hi[1] != const0_rtx)
18271 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18272 NULL_RTX, 0, OPTAB_WIDEN);
18273
18274 xor0 = lo[0];
18275 if (lo[1] != const0_rtx)
18276 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18277 NULL_RTX, 0, OPTAB_WIDEN);
18278
18279 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18280 NULL_RTX, 0, OPTAB_WIDEN);
18281
18282 ix86_expand_branch (code, tmp, const0_rtx, label);
18283 return;
18284 }
18285
18286 /* Otherwise, if we are doing less-than or greater-or-equal-than,
18287 op1 is a constant and the low word is zero, then we can just
18288 examine the high word. Similarly for low word -1 and
18289 less-or-equal-than or greater-than. */
18290
18291 if (CONST_INT_P (hi[1]))
18292 switch (code)
18293 {
18294 case LT: case LTU: case GE: case GEU:
18295 if (lo[1] == const0_rtx)
18296 {
18297 ix86_expand_branch (code, hi[0], hi[1], label);
18298 return;
18299 }
18300 break;
18301 case LE: case LEU: case GT: case GTU:
18302 if (lo[1] == constm1_rtx)
18303 {
18304 ix86_expand_branch (code, hi[0], hi[1], label);
18305 return;
18306 }
18307 break;
18308 default:
18309 break;
18310 }
18311
18312 /* Otherwise, we need two or three jumps. */
18313
18314 label2 = gen_label_rtx ();
18315
18316 code1 = code;
18317 code2 = swap_condition (code);
18318 code3 = unsigned_condition (code);
18319
18320 switch (code)
18321 {
18322 case LT: case GT: case LTU: case GTU:
18323 break;
18324
18325 case LE: code1 = LT; code2 = GT; break;
18326 case GE: code1 = GT; code2 = LT; break;
18327 case LEU: code1 = LTU; code2 = GTU; break;
18328 case GEU: code1 = GTU; code2 = LTU; break;
18329
18330 case EQ: code1 = UNKNOWN; code2 = NE; break;
18331 case NE: code2 = UNKNOWN; break;
18332
18333 default:
18334 gcc_unreachable ();
18335 }
18336
18337 /*
18338 * a < b =>
18339 * if (hi(a) < hi(b)) goto true;
18340 * if (hi(a) > hi(b)) goto false;
18341 * if (lo(a) < lo(b)) goto true;
18342 * false:
18343 */
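
/* Note that the low-word comparison (code3) must be unsigned regardless
   of the signedness of CODE: by the time the low words are compared the
   high words are known to be equal, and the low words carry no sign
   information of their own; unsigned_condition above provides exactly
   that.  */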
18344
18345 if (code1 != UNKNOWN)
18346 ix86_expand_branch (code1, hi[0], hi[1], label);
18347 if (code2 != UNKNOWN)
18348 ix86_expand_branch (code2, hi[0], hi[1], label2);
18349
18350 ix86_expand_branch (code3, lo[0], lo[1], label);
18351
18352 if (code2 != UNKNOWN)
18353 emit_label (label2);
18354 return;
18355 }
18356
18357 default:
18358 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18359 goto simple;
18360 }
18361 }
18362
18363 /* Split branch based on floating point condition. */
18364 void
18365 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18366 rtx target1, rtx target2, rtx tmp, rtx pushed)
18367 {
18368 rtx condition;
18369 rtx i;
18370
18371 if (target2 != pc_rtx)
18372 {
18373 rtx tmp = target2;
18374 code = reverse_condition_maybe_unordered (code);
18375 target2 = target1;
18376 target1 = tmp;
18377 }
18378
18379 condition = ix86_expand_fp_compare (code, op1, op2,
18380 tmp);
18381
18382 /* Remove pushed operand from stack. */
18383 if (pushed)
18384 ix86_free_from_memory (GET_MODE (pushed));
18385
18386 i = emit_jump_insn (gen_rtx_SET
18387 (VOIDmode, pc_rtx,
18388 gen_rtx_IF_THEN_ELSE (VOIDmode,
18389 condition, target1, target2)));
18390 if (split_branch_probability >= 0)
18391 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18392 }
18393
18394 void
18395 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18396 {
18397 rtx ret;
18398
18399 gcc_assert (GET_MODE (dest) == QImode);
18400
18401 ret = ix86_expand_compare (code, op0, op1);
18402 PUT_MODE (ret, QImode);
18403 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18404 }
18405
18406 /* Expand a comparison setting or clearing the carry flag. Return true
18407 when successful and set *POP to the comparison operation. */
18408 static bool
18409 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18410 {
18411 enum machine_mode mode =
18412 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18413
18414 /* Do not handle double-mode compares that go through the special path. */
18415 if (mode == (TARGET_64BIT ? TImode : DImode))
18416 return false;
18417
18418 if (SCALAR_FLOAT_MODE_P (mode))
18419 {
18420 rtx compare_op, compare_seq;
18421
18422 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18423
18424 /* Shortcut: the following common codes never translate
18425 into carry flag compares. */
18426 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18427 || code == ORDERED || code == UNORDERED)
18428 return false;
18429
18430 /* These comparisons require the zero flag; swap operands so they don't. */
18431 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18432 && !TARGET_IEEE_FP)
18433 {
18434 rtx tmp = op0;
18435 op0 = op1;
18436 op1 = tmp;
18437 code = swap_condition (code);
18438 }
18439
18440 /* Try to expand the comparison and verify that we end up with a
18441 carry flag based comparison. This fails to be true only when we
18442 decide to expand the comparison using arithmetic, which is not a
18443 common scenario. */
18444 start_sequence ();
18445 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18446 compare_seq = get_insns ();
18447 end_sequence ();
18448
18449 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18450 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18451 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18452 else
18453 code = GET_CODE (compare_op);
18454
18455 if (code != LTU && code != GEU)
18456 return false;
18457
18458 emit_insn (compare_seq);
18459 *pop = compare_op;
18460 return true;
18461 }
18462
18463 if (!INTEGRAL_MODE_P (mode))
18464 return false;
18465
18466 switch (code)
18467 {
18468 case LTU:
18469 case GEU:
18470 break;
18471
18472 /* Convert a==0 into (unsigned)a<1. */
18473 case EQ:
18474 case NE:
18475 if (op1 != const0_rtx)
18476 return false;
18477 op1 = const1_rtx;
18478 code = (code == EQ ? LTU : GEU);
18479 break;
18480
18481 /* Convert a>b into b<a or a>=b+1. */
18482 case GTU:
18483 case LEU:
18484 if (CONST_INT_P (op1))
18485 {
18486 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18487 /* Bail out on overflow. We could still swap the operands, but that
18488 would force loading of the constant into a register. */
18489 if (op1 == const0_rtx
18490 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18491 return false;
18492 code = (code == GTU ? GEU : LTU);
18493 }
18494 else
18495 {
18496 rtx tmp = op1;
18497 op1 = op0;
18498 op0 = tmp;
18499 code = (code == GTU ? LTU : GEU);
18500 }
18501 break;
18502
18503 /* Convert a>=0 into (unsigned)a<0x80000000. */
18504 case LT:
18505 case GE:
18506 if (mode == DImode || op1 != const0_rtx)
18507 return false;
18508 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18509 code = (code == LT ? GEU : LTU);
18510 break;
18511 case LE:
18512 case GT:
18513 if (mode == DImode || op1 != constm1_rtx)
18514 return false;
18515 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18516 code = (code == LE ? GEU : LTU);
18517 break;
18518
18519 default:
18520 return false;
18521 }
18522 /* Swapping operands may cause a constant to appear as the first operand. */
18523 if (!nonimmediate_operand (op0, VOIDmode))
18524 {
18525 if (!can_create_pseudo_p ())
18526 return false;
18527 op0 = force_reg (mode, op0);
18528 }
18529 *pop = ix86_expand_compare (code, op0, op1);
18530 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18531 return true;
18532 }
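
/* A rough sketch of the integer rewrites performed above (b constant
   where noted, SIGN_BIT standing for 1 << (mode bitsize - 1)):

     a == 0  ->  (unsigned) a <  1
     a != 0  ->  (unsigned) a >= 1
     a >  b  ->  (unsigned) a >= b + 1   (b constant, no overflow)
     a <= b  ->  (unsigned) a <  b + 1   (b constant, no overflow)
     a >= 0  ->  (unsigned) a <  SIGN_BIT
     a <  0  ->  (unsigned) a >= SIGN_BIT

   Each of these is an LTU/GEU test, i.e. a plain carry flag check after
   a cmp.  */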
18533
18534 bool
18535 ix86_expand_int_movcc (rtx operands[])
18536 {
18537 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18538 rtx compare_seq, compare_op;
18539 enum machine_mode mode = GET_MODE (operands[0]);
18540 bool sign_bit_compare_p = false;
18541 rtx op0 = XEXP (operands[1], 0);
18542 rtx op1 = XEXP (operands[1], 1);
18543
18544 start_sequence ();
18545 compare_op = ix86_expand_compare (code, op0, op1);
18546 compare_seq = get_insns ();
18547 end_sequence ();
18548
18549 compare_code = GET_CODE (compare_op);
18550
18551 if ((op1 == const0_rtx && (code == GE || code == LT))
18552 || (op1 == constm1_rtx && (code == GT || code == LE)))
18553 sign_bit_compare_p = true;
18554
18555 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18556 HImode insns, we'd be swallowed in word prefix ops. */
18557
18558 if ((mode != HImode || TARGET_FAST_PREFIX)
18559 && (mode != (TARGET_64BIT ? TImode : DImode))
18560 && CONST_INT_P (operands[2])
18561 && CONST_INT_P (operands[3]))
18562 {
18563 rtx out = operands[0];
18564 HOST_WIDE_INT ct = INTVAL (operands[2]);
18565 HOST_WIDE_INT cf = INTVAL (operands[3]);
18566 HOST_WIDE_INT diff;
18567
18568 diff = ct - cf;
18569 /* Sign bit compares are better done using shifts than by using
18570 sbb. */
18571 if (sign_bit_compare_p
18572 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18573 {
18574 /* Detect overlap between destination and compare sources. */
18575 rtx tmp = out;
18576
18577 if (!sign_bit_compare_p)
18578 {
18579 rtx flags;
18580 bool fpcmp = false;
18581
18582 compare_code = GET_CODE (compare_op);
18583
18584 flags = XEXP (compare_op, 0);
18585
18586 if (GET_MODE (flags) == CCFPmode
18587 || GET_MODE (flags) == CCFPUmode)
18588 {
18589 fpcmp = true;
18590 compare_code
18591 = ix86_fp_compare_code_to_integer (compare_code);
18592 }
18593
18594 /* To simplify the rest of the code, restrict to the GEU case. */
18595 if (compare_code == LTU)
18596 {
18597 HOST_WIDE_INT tmp = ct;
18598 ct = cf;
18599 cf = tmp;
18600 compare_code = reverse_condition (compare_code);
18601 code = reverse_condition (code);
18602 }
18603 else
18604 {
18605 if (fpcmp)
18606 PUT_CODE (compare_op,
18607 reverse_condition_maybe_unordered
18608 (GET_CODE (compare_op)));
18609 else
18610 PUT_CODE (compare_op,
18611 reverse_condition (GET_CODE (compare_op)));
18612 }
18613 diff = ct - cf;
18614
18615 if (reg_overlap_mentioned_p (out, op0)
18616 || reg_overlap_mentioned_p (out, op1))
18617 tmp = gen_reg_rtx (mode);
18618
18619 if (mode == DImode)
18620 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18621 else
18622 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18623 flags, compare_op));
18624 }
18625 else
18626 {
18627 if (code == GT || code == GE)
18628 code = reverse_condition (code);
18629 else
18630 {
18631 HOST_WIDE_INT tmp = ct;
18632 ct = cf;
18633 cf = tmp;
18634 diff = ct - cf;
18635 }
18636 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18637 }
18638
18639 if (diff == 1)
18640 {
18641 /*
18642 * cmpl op0,op1
18643 * sbbl dest,dest
18644 * [addl dest, ct]
18645 *
18646 * Size 5 - 8.
18647 */
18648 if (ct)
18649 tmp = expand_simple_binop (mode, PLUS,
18650 tmp, GEN_INT (ct),
18651 copy_rtx (tmp), 1, OPTAB_DIRECT);
18652 }
18653 else if (cf == -1)
18654 {
18655 /*
18656 * cmpl op0,op1
18657 * sbbl dest,dest
18658 * orl $ct, dest
18659 *
18660 * Size 8.
18661 */
18662 tmp = expand_simple_binop (mode, IOR,
18663 tmp, GEN_INT (ct),
18664 copy_rtx (tmp), 1, OPTAB_DIRECT);
18665 }
18666 else if (diff == -1 && ct)
18667 {
18668 /*
18669 * cmpl op0,op1
18670 * sbbl dest,dest
18671 * notl dest
18672 * [addl dest, cf]
18673 *
18674 * Size 8 - 11.
18675 */
18676 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18677 if (cf)
18678 tmp = expand_simple_binop (mode, PLUS,
18679 copy_rtx (tmp), GEN_INT (cf),
18680 copy_rtx (tmp), 1, OPTAB_DIRECT);
18681 }
18682 else
18683 {
18684 /*
18685 * cmpl op0,op1
18686 * sbbl dest,dest
18687 * [notl dest]
18688 * andl cf - ct, dest
18689 * [addl dest, ct]
18690 *
18691 * Size 8 - 11.
18692 */
18693
18694 if (cf == 0)
18695 {
18696 cf = ct;
18697 ct = 0;
18698 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18699 }
18700
18701 tmp = expand_simple_binop (mode, AND,
18702 copy_rtx (tmp),
18703 gen_int_mode (cf - ct, mode),
18704 copy_rtx (tmp), 1, OPTAB_DIRECT);
18705 if (ct)
18706 tmp = expand_simple_binop (mode, PLUS,
18707 copy_rtx (tmp), GEN_INT (ct),
18708 copy_rtx (tmp), 1, OPTAB_DIRECT);
18709 }
18710
18711 if (!rtx_equal_p (tmp, out))
18712 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18713
18714 return true;
18715 }
18716
18717 if (diff < 0)
18718 {
18719 enum machine_mode cmp_mode = GET_MODE (op0);
18720
18721 HOST_WIDE_INT tmp;
18722 tmp = ct, ct = cf, cf = tmp;
18723 diff = -diff;
18724
18725 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18726 {
18727 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18728
18729 /* We may be reversing an unordered compare to a normal compare, which
18730 is not valid in general (we may convert a non-trapping condition
18731 to a trapping one); however, on i386 we currently emit all
18732 comparisons unordered. */
18733 compare_code = reverse_condition_maybe_unordered (compare_code);
18734 code = reverse_condition_maybe_unordered (code);
18735 }
18736 else
18737 {
18738 compare_code = reverse_condition (compare_code);
18739 code = reverse_condition (code);
18740 }
18741 }
18742
18743 compare_code = UNKNOWN;
18744 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18745 && CONST_INT_P (op1))
18746 {
18747 if (op1 == const0_rtx
18748 && (code == LT || code == GE))
18749 compare_code = code;
18750 else if (op1 == constm1_rtx)
18751 {
18752 if (code == LE)
18753 compare_code = LT;
18754 else if (code == GT)
18755 compare_code = GE;
18756 }
18757 }
18758
18759 /* Optimize dest = (op0 < 0) ? -1 : cf. */
18760 if (compare_code != UNKNOWN
18761 && GET_MODE (op0) == GET_MODE (out)
18762 && (cf == -1 || ct == -1))
18763 {
18764 /* If the lea code below could be used, only optimize
18765 if it results in a 2 insn sequence. */
18766
18767 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
18768 || diff == 3 || diff == 5 || diff == 9)
18769 || (compare_code == LT && ct == -1)
18770 || (compare_code == GE && cf == -1))
18771 {
18772 /*
18773 * notl op1 (if necessary)
18774 * sarl $31, op1
18775 * orl cf, op1
18776 */
18777 if (ct != -1)
18778 {
18779 cf = ct;
18780 ct = -1;
18781 code = reverse_condition (code);
18782 }
18783
18784 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18785
18786 out = expand_simple_binop (mode, IOR,
18787 out, GEN_INT (cf),
18788 out, 1, OPTAB_DIRECT);
18789 if (out != operands[0])
18790 emit_move_insn (operands[0], out);
18791
18792 return true;
18793 }
18794 }
18795
18796
18797 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
18798 || diff == 3 || diff == 5 || diff == 9)
18799 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
18800 && (mode != DImode
18801 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
18802 {
18803 /*
18804 * xorl dest,dest
18805 * cmpl op1,op2
18806 * setcc dest
18807 * lea cf(dest*(ct-cf)),dest
18808 *
18809 * Size 14.
18810 *
18811 * This also catches the degenerate setcc-only case.
18812 */
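
	 /* As a purely illustrative example, for "r = (a < b) ? 7 : 3"
	    (signed a, b) we have ct = 7, cf = 3, diff = 4, and the
	    emitted sequence is roughly

	       xorl  %r, %r
	       cmpl  b, a
	       setl  %r
	       leal  3(,%r,4), %r

	    where %r stands for whatever register ends up holding the
	    result; r*4 + 3 yields 7 when the condition holds and 3
	    otherwise.  */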
18813
18814 rtx tmp;
18815 int nops;
18816
18817 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18818
18819 nops = 0;
18820 /* On x86_64 the lea instruction operates on Pmode, so we need
18821 to get the arithmetic done in the proper mode to match. */
18822 if (diff == 1)
18823 tmp = copy_rtx (out);
18824 else
18825 {
18826 rtx out1;
18827 out1 = copy_rtx (out);
18828 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
18829 nops++;
18830 if (diff & 1)
18831 {
18832 tmp = gen_rtx_PLUS (mode, tmp, out1);
18833 nops++;
18834 }
18835 }
18836 if (cf != 0)
18837 {
18838 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
18839 nops++;
18840 }
18841 if (!rtx_equal_p (tmp, out))
18842 {
18843 if (nops == 1)
18844 out = force_operand (tmp, copy_rtx (out));
18845 else
18846 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
18847 }
18848 if (!rtx_equal_p (out, operands[0]))
18849 emit_move_insn (operands[0], copy_rtx (out));
18850
18851 return true;
18852 }
18853
18854 /*
18855 * General case: Jumpful:
18856 * xorl dest,dest cmpl op1, op2
18857 * cmpl op1, op2 movl ct, dest
18858 * setcc dest jcc 1f
18859 * decl dest movl cf, dest
18860 * andl (cf-ct),dest 1:
18861 * addl ct,dest
18862 *
18863 * Size 20. Size 14.
18864 *
18865 * This is reasonably steep, but branch mispredict costs are
18866 * high on modern cpus, so consider failing only if optimizing
18867 * for space.
18868 */
18869
18870 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18871 && BRANCH_COST (optimize_insn_for_speed_p (),
18872 false) >= 2)
18873 {
18874 if (cf == 0)
18875 {
18876 enum machine_mode cmp_mode = GET_MODE (op0);
18877
18878 cf = ct;
18879 ct = 0;
18880
18881 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18882 {
18883 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18884
18885 /* We may be reversing an unordered compare to a normal compare,
18886 which is not valid in general (we may convert a non-trapping
18887 condition to a trapping one); however, on i386 we currently
18888 emit all comparisons unordered. */
18889 code = reverse_condition_maybe_unordered (code);
18890 }
18891 else
18892 {
18893 code = reverse_condition (code);
18894 if (compare_code != UNKNOWN)
18895 compare_code = reverse_condition (compare_code);
18896 }
18897 }
18898
18899 if (compare_code != UNKNOWN)
18900 {
18901 /* notl op1 (if needed)
18902 sarl $31, op1
18903 andl (cf-ct), op1
18904 addl ct, op1
18905
18906 For x < 0 (resp. x <= -1) there will be no notl,
18907 so if possible swap the constants to get rid of the
18908 complement.
18909 True/false will be -1/0 while code below (store flag
18910 followed by decrement) is 0/-1, so the constants need
18911 to be exchanged once more. */
18912
18913 if (compare_code == GE || !cf)
18914 {
18915 code = reverse_condition (code);
18916 compare_code = LT;
18917 }
18918 else
18919 {
18920 HOST_WIDE_INT tmp = cf;
18921 cf = ct;
18922 ct = tmp;
18923 }
18924
18925 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18926 }
18927 else
18928 {
18929 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18930
18931 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
18932 constm1_rtx,
18933 copy_rtx (out), 1, OPTAB_DIRECT);
18934 }
18935
18936 out = expand_simple_binop (mode, AND, copy_rtx (out),
18937 gen_int_mode (cf - ct, mode),
18938 copy_rtx (out), 1, OPTAB_DIRECT);
18939 if (ct)
18940 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
18941 copy_rtx (out), 1, OPTAB_DIRECT);
18942 if (!rtx_equal_p (out, operands[0]))
18943 emit_move_insn (operands[0], copy_rtx (out));
18944
18945 return true;
18946 }
18947 }
18948
18949 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18950 {
18951 /* Try a few things more with specific constants and a variable. */
18952
18953 optab op;
18954 rtx var, orig_out, out, tmp;
18955
18956 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
18957 return false;
18958
18959 /* If one of the two operands is an interesting constant, load a
18960 constant with the above and mask it in with a logical operation. */
18961
18962 if (CONST_INT_P (operands[2]))
18963 {
18964 var = operands[3];
18965 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
18966 operands[3] = constm1_rtx, op = and_optab;
18967 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
18968 operands[3] = const0_rtx, op = ior_optab;
18969 else
18970 return false;
18971 }
18972 else if (CONST_INT_P (operands[3]))
18973 {
18974 var = operands[2];
18975 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
18976 operands[2] = constm1_rtx, op = and_optab;
18977 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
18978 operands[2] = const0_rtx, op = ior_optab;
18979 else
18980 return false;
18981 }
18982 else
18983 return false;
18984
18985 orig_out = operands[0];
18986 tmp = gen_reg_rtx (mode);
18987 operands[0] = tmp;
18988
18989 /* Recurse to get the constant loaded. */
18990 if (ix86_expand_int_movcc (operands) == 0)
18991 return false;
18992
18993 /* Mask in the interesting variable. */
18994 out = expand_binop (mode, op, var, tmp, orig_out, 0,
18995 OPTAB_WIDEN);
18996 if (!rtx_equal_p (out, orig_out))
18997 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
18998
18999 return true;
19000 }
19001
19002 /*
19003 * For comparison with above,
19004 *
19005 * movl cf,dest
19006 * movl ct,tmp
19007 * cmpl op1,op2
19008 * cmovcc tmp,dest
19009 *
19010 * Size 15.
19011 */
19012
19013 if (! nonimmediate_operand (operands[2], mode))
19014 operands[2] = force_reg (mode, operands[2]);
19015 if (! nonimmediate_operand (operands[3], mode))
19016 operands[3] = force_reg (mode, operands[3]);
19017
19018 if (! register_operand (operands[2], VOIDmode)
19019 && (mode == QImode
19020 || ! register_operand (operands[3], VOIDmode)))
19021 operands[2] = force_reg (mode, operands[2]);
19022
19023 if (mode == QImode
19024 && ! register_operand (operands[3], VOIDmode))
19025 operands[3] = force_reg (mode, operands[3]);
19026
19027 emit_insn (compare_seq);
19028 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19029 gen_rtx_IF_THEN_ELSE (mode,
19030 compare_op, operands[2],
19031 operands[3])));
19032 return true;
19033 }
19034
19035 /* Swap, force into registers, or otherwise massage the two operands
19036 to an sse comparison with a mask result. Thus we differ a bit from
19037 ix86_prepare_fp_compare_args which expects to produce a flags result.
19038
19039 The DEST operand exists to help determine whether to commute commutative
19040 operators. The POP0/POP1 operands are updated in place. The new
19041 comparison code is returned, or UNKNOWN if not implementable. */
19042
19043 static enum rtx_code
19044 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19045 rtx *pop0, rtx *pop1)
19046 {
19047 rtx tmp;
19048
19049 switch (code)
19050 {
19051 case LTGT:
19052 case UNEQ:
19053 /* AVX supports all the needed comparisons. */
19054 if (TARGET_AVX)
19055 break;
19056 /* We have no LTGT as an operator. We could implement it with
19057 NE & ORDERED, but this requires an extra temporary. It's
19058 not clear that it's worth it. */
19059 return UNKNOWN;
19060
19061 case LT:
19062 case LE:
19063 case UNGT:
19064 case UNGE:
19065 /* These are supported directly. */
19066 break;
19067
19068 case EQ:
19069 case NE:
19070 case UNORDERED:
19071 case ORDERED:
19072 /* AVX has 3 operand comparisons, no need to swap anything. */
19073 if (TARGET_AVX)
19074 break;
19075 /* For commutative operators, try to canonicalize the destination
19076 operand to be first in the comparison - this helps reload to
19077 avoid extra moves. */
19078 if (!dest || !rtx_equal_p (dest, *pop1))
19079 break;
19080 /* FALLTHRU */
19081
19082 case GE:
19083 case GT:
19084 case UNLE:
19085 case UNLT:
19086 /* These are not supported directly before AVX, and furthermore
19087 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19088 comparison operands to transform into something that is
19089 supported. */
19090 tmp = *pop0;
19091 *pop0 = *pop1;
19092 *pop1 = tmp;
19093 code = swap_condition (code);
19094 break;
19095
19096 default:
19097 gcc_unreachable ();
19098 }
19099
19100 return code;
19101 }
19102
19103 /* Detect conditional moves that exactly match min/max operational
19104 semantics. Note that this is IEEE safe, as long as we don't
19105 interchange the operands.
19106
19107 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19108 and TRUE if the operation is successful and instructions are emitted. */
19109
19110 static bool
19111 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19112 rtx cmp_op1, rtx if_true, rtx if_false)
19113 {
19114 enum machine_mode mode;
19115 bool is_min;
19116 rtx tmp;
19117
19118 if (code == LT)
19119 ;
19120 else if (code == UNGE)
19121 {
19122 tmp = if_true;
19123 if_true = if_false;
19124 if_false = tmp;
19125 }
19126 else
19127 return false;
19128
19129 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19130 is_min = true;
19131 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19132 is_min = false;
19133 else
19134 return false;
19135
19136 mode = GET_MODE (dest);
19137
19138 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19139 but MODE may be a vector mode and thus not appropriate. */
19140 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19141 {
19142 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19143 rtvec v;
19144
19145 if_true = force_reg (mode, if_true);
19146 v = gen_rtvec (2, if_true, if_false);
19147 tmp = gen_rtx_UNSPEC (mode, v, u);
19148 }
19149 else
19150 {
19151 code = is_min ? SMIN : SMAX;
19152 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19153 }
19154
19155 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19156 return true;
19157 }
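
/* Why the operand order matters above: minss/maxss and their packed
   forms are not commutative -- when the operands are unordered, or both
   are zero (of either sign), the second source operand is returned.
   With LT (resp. UNGE) and the operands left in their original order
   this matches the source-level semantics of "a < b ? a : b" even for
   NaNs and signed zeros, which is what makes the transformation IEEE
   safe as noted in the comment before this function.  */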
19158
19159 /* Expand an sse vector comparison. Return the register with the result. */
19160
19161 static rtx
19162 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19163 rtx op_true, rtx op_false)
19164 {
19165 enum machine_mode mode = GET_MODE (dest);
19166 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19167 rtx x;
19168
19169 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19170 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19171 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19172
19173 if (optimize
19174 || reg_overlap_mentioned_p (dest, op_true)
19175 || reg_overlap_mentioned_p (dest, op_false))
19176 dest = gen_reg_rtx (mode);
19177
19178 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19179 if (cmp_mode != mode)
19180 {
19181 x = force_reg (cmp_mode, x);
19182 convert_move (dest, x, false);
19183 }
19184 else
19185 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19186
19187 return dest;
19188 }
19189
19190 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19191 operations. This is used for both scalar and vector conditional moves. */
19192
19193 static void
19194 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19195 {
19196 enum machine_mode mode = GET_MODE (dest);
19197 rtx t2, t3, x;
19198
19199 if (vector_all_ones_operand (op_true, mode)
19200 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19201 {
19202 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19203 }
19204 else if (op_false == CONST0_RTX (mode))
19205 {
19206 op_true = force_reg (mode, op_true);
19207 x = gen_rtx_AND (mode, cmp, op_true);
19208 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19209 }
19210 else if (op_true == CONST0_RTX (mode))
19211 {
19212 op_false = force_reg (mode, op_false);
19213 x = gen_rtx_NOT (mode, cmp);
19214 x = gen_rtx_AND (mode, x, op_false);
19215 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19216 }
19217 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19218 {
19219 op_false = force_reg (mode, op_false);
19220 x = gen_rtx_IOR (mode, cmp, op_false);
19221 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19222 }
19223 else if (TARGET_XOP)
19224 {
19225 op_true = force_reg (mode, op_true);
19226
19227 if (!nonimmediate_operand (op_false, mode))
19228 op_false = force_reg (mode, op_false);
19229
19230 emit_insn (gen_rtx_SET (mode, dest,
19231 gen_rtx_IF_THEN_ELSE (mode, cmp,
19232 op_true,
19233 op_false)));
19234 }
19235 else
19236 {
19237 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19238
19239 if (!nonimmediate_operand (op_true, mode))
19240 op_true = force_reg (mode, op_true);
19241
19242 op_false = force_reg (mode, op_false);
19243
19244 switch (mode)
19245 {
19246 case V4SFmode:
19247 if (TARGET_SSE4_1)
19248 gen = gen_sse4_1_blendvps;
19249 break;
19250 case V2DFmode:
19251 if (TARGET_SSE4_1)
19252 gen = gen_sse4_1_blendvpd;
19253 break;
19254 case V16QImode:
19255 case V8HImode:
19256 case V4SImode:
19257 case V2DImode:
19258 if (TARGET_SSE4_1)
19259 {
19260 gen = gen_sse4_1_pblendvb;
19261 dest = gen_lowpart (V16QImode, dest);
19262 op_false = gen_lowpart (V16QImode, op_false);
19263 op_true = gen_lowpart (V16QImode, op_true);
19264 cmp = gen_lowpart (V16QImode, cmp);
19265 }
19266 break;
19267 case V8SFmode:
19268 if (TARGET_AVX)
19269 gen = gen_avx_blendvps256;
19270 break;
19271 case V4DFmode:
19272 if (TARGET_AVX)
19273 gen = gen_avx_blendvpd256;
19274 break;
19275 case V32QImode:
19276 case V16HImode:
19277 case V8SImode:
19278 case V4DImode:
19279 if (TARGET_AVX2)
19280 {
19281 gen = gen_avx2_pblendvb;
19282 dest = gen_lowpart (V32QImode, dest);
19283 op_false = gen_lowpart (V32QImode, op_false);
19284 op_true = gen_lowpart (V32QImode, op_true);
19285 cmp = gen_lowpart (V32QImode, cmp);
19286 }
19287 break;
19288 default:
19289 break;
19290 }
19291
19292 if (gen != NULL)
19293 emit_insn (gen (dest, op_false, op_true, cmp));
19294 else
19295 {
19296 op_true = force_reg (mode, op_true);
19297
19298 t2 = gen_reg_rtx (mode);
19299 if (optimize)
19300 t3 = gen_reg_rtx (mode);
19301 else
19302 t3 = dest;
19303
19304 x = gen_rtx_AND (mode, op_true, cmp);
19305 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19306
19307 x = gen_rtx_NOT (mode, cmp);
19308 x = gen_rtx_AND (mode, x, op_false);
19309 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19310
19311 x = gen_rtx_IOR (mode, t3, t2);
19312 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19313 }
19314 }
19315 }
19316
19317 /* Expand a floating-point conditional move. Return true if successful. */
19318
19319 bool
19320 ix86_expand_fp_movcc (rtx operands[])
19321 {
19322 enum machine_mode mode = GET_MODE (operands[0]);
19323 enum rtx_code code = GET_CODE (operands[1]);
19324 rtx tmp, compare_op;
19325 rtx op0 = XEXP (operands[1], 0);
19326 rtx op1 = XEXP (operands[1], 1);
19327
19328 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19329 {
19330 enum machine_mode cmode;
19331
19332 /* Since we've no cmove for sse registers, don't force bad register
19333 allocation just to gain access to it. Deny movcc when the
19334 comparison mode doesn't match the move mode. */
19335 cmode = GET_MODE (op0);
19336 if (cmode == VOIDmode)
19337 cmode = GET_MODE (op1);
19338 if (cmode != mode)
19339 return false;
19340
19341 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19342 if (code == UNKNOWN)
19343 return false;
19344
19345 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19346 operands[2], operands[3]))
19347 return true;
19348
19349 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19350 operands[2], operands[3]);
19351 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19352 return true;
19353 }
19354
19355 /* The floating point conditional move instructions don't directly
19356 support conditions resulting from a signed integer comparison. */
19357
19358 compare_op = ix86_expand_compare (code, op0, op1);
19359 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19360 {
19361 tmp = gen_reg_rtx (QImode);
19362 ix86_expand_setcc (tmp, code, op0, op1);
19363
19364 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19365 }
19366
19367 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19368 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19369 operands[2], operands[3])));
19370
19371 return true;
19372 }
19373
19374 /* Expand a floating-point vector conditional move; a vcond operation
19375 rather than a movcc operation. */
19376
19377 bool
19378 ix86_expand_fp_vcond (rtx operands[])
19379 {
19380 enum rtx_code code = GET_CODE (operands[3]);
19381 rtx cmp;
19382
19383 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19384 &operands[4], &operands[5]);
19385 if (code == UNKNOWN)
19386 {
19387 rtx temp;
19388 switch (GET_CODE (operands[3]))
19389 {
19390 case LTGT:
19391 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19392 operands[5], operands[0], operands[0]);
19393 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19394 operands[5], operands[1], operands[2]);
19395 code = AND;
19396 break;
19397 case UNEQ:
19398 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19399 operands[5], operands[0], operands[0]);
19400 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19401 operands[5], operands[1], operands[2]);
19402 code = IOR;
19403 break;
19404 default:
19405 gcc_unreachable ();
19406 }
19407 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19408 OPTAB_DIRECT);
19409 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19410 return true;
19411 }
19412
19413 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19414 operands[5], operands[1], operands[2]))
19415 return true;
19416
19417 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19418 operands[1], operands[2]);
19419 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19420 return true;
19421 }
19422
19423 /* Expand a signed/unsigned integral vector conditional move. */
19424
19425 bool
19426 ix86_expand_int_vcond (rtx operands[])
19427 {
19428 enum machine_mode data_mode = GET_MODE (operands[0]);
19429 enum machine_mode mode = GET_MODE (operands[4]);
19430 enum rtx_code code = GET_CODE (operands[3]);
19431 bool negate = false;
19432 rtx x, cop0, cop1;
19433
19434 cop0 = operands[4];
19435 cop1 = operands[5];
19436
19437 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19438 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
19439 if ((code == LT || code == GE)
19440 && data_mode == mode
19441 && cop1 == CONST0_RTX (mode)
19442 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19443 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19444 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19445 && (GET_MODE_SIZE (data_mode) == 16
19446 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19447 {
19448 rtx negop = operands[2 - (code == LT)];
19449 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19450 if (negop == CONST1_RTX (data_mode))
19451 {
19452 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19453 operands[0], 1, OPTAB_DIRECT);
19454 if (res != operands[0])
19455 emit_move_insn (operands[0], res);
19456 return true;
19457 }
19458 else if (GET_MODE_INNER (data_mode) != DImode
19459 && vector_all_ones_operand (negop, data_mode))
19460 {
19461 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19462 operands[0], 0, OPTAB_DIRECT);
19463 if (res != operands[0])
19464 emit_move_insn (operands[0], res);
19465 return true;
19466 }
19467 }
19468
19469 if (!nonimmediate_operand (cop1, mode))
19470 cop1 = force_reg (mode, cop1);
19471 if (!general_operand (operands[1], data_mode))
19472 operands[1] = force_reg (data_mode, operands[1]);
19473 if (!general_operand (operands[2], data_mode))
19474 operands[2] = force_reg (data_mode, operands[2]);
19475
19476 /* XOP supports all of the comparisons on all 128-bit vector int types. */
19477 if (TARGET_XOP
19478 && (mode == V16QImode || mode == V8HImode
19479 || mode == V4SImode || mode == V2DImode))
19480 ;
19481 else
19482 {
19483 /* Canonicalize the comparison to EQ, GT, GTU. */
19484 switch (code)
19485 {
19486 case EQ:
19487 case GT:
19488 case GTU:
19489 break;
19490
19491 case NE:
19492 case LE:
19493 case LEU:
19494 code = reverse_condition (code);
19495 negate = true;
19496 break;
19497
19498 case GE:
19499 case GEU:
19500 code = reverse_condition (code);
19501 negate = true;
19502 /* FALLTHRU */
19503
19504 case LT:
19505 case LTU:
19506 code = swap_condition (code);
19507 x = cop0, cop0 = cop1, cop1 = x;
19508 break;
19509
19510 default:
19511 gcc_unreachable ();
19512 }
19513
19514 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19515 if (mode == V2DImode)
19516 {
19517 switch (code)
19518 {
19519 case EQ:
19520 /* SSE4.1 supports EQ. */
19521 if (!TARGET_SSE4_1)
19522 return false;
19523 break;
19524
19525 case GT:
19526 case GTU:
19527 /* SSE4.2 supports GT/GTU. */
19528 if (!TARGET_SSE4_2)
19529 return false;
19530 break;
19531
19532 default:
19533 gcc_unreachable ();
19534 }
19535 }
19536
19537 /* Unsigned parallel compare is not supported by the hardware.
19538 Play some tricks to turn this into a signed comparison, or a
19539 comparison against zero. */
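
      /* The trick used below for the dword/qword cases is the usual
	 sign-bit bias: with two's complement values

	   (unsigned) a > (unsigned) b
	     <==>  (signed) (a - SIGN_BIT) > (signed) (b - SIGN_BIT)

	 so subtracting the sign-bit mask from both operands lets us use
	 the signed pcmpgt instructions.  The byte/word cases instead use
	 an unsigned saturating subtract and compare the result against
	 zero.  */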
19540 if (code == GTU)
19541 {
19542 cop0 = force_reg (mode, cop0);
19543
19544 switch (mode)
19545 {
19546 case V8SImode:
19547 case V4DImode:
19548 case V4SImode:
19549 case V2DImode:
19550 {
19551 rtx t1, t2, mask;
19552 rtx (*gen_sub3) (rtx, rtx, rtx);
19553
19554 switch (mode)
19555 {
19556 case V8SImode: gen_sub3 = gen_subv8si3; break;
19557 case V4DImode: gen_sub3 = gen_subv4di3; break;
19558 case V4SImode: gen_sub3 = gen_subv4si3; break;
19559 case V2DImode: gen_sub3 = gen_subv2di3; break;
19560 default:
19561 gcc_unreachable ();
19562 }
19563 /* Subtract (-(INT MAX) - 1) from both operands to make
19564 them signed. */
19565 mask = ix86_build_signbit_mask (mode, true, false);
19566 t1 = gen_reg_rtx (mode);
19567 emit_insn (gen_sub3 (t1, cop0, mask));
19568
19569 t2 = gen_reg_rtx (mode);
19570 emit_insn (gen_sub3 (t2, cop1, mask));
19571
19572 cop0 = t1;
19573 cop1 = t2;
19574 code = GT;
19575 }
19576 break;
19577
19578 case V32QImode:
19579 case V16HImode:
19580 case V16QImode:
19581 case V8HImode:
19582 /* Perform a parallel unsigned saturating subtraction. */
19583 x = gen_reg_rtx (mode);
19584 emit_insn (gen_rtx_SET (VOIDmode, x,
19585 gen_rtx_US_MINUS (mode, cop0, cop1)));
19586
19587 cop0 = x;
19588 cop1 = CONST0_RTX (mode);
19589 code = EQ;
19590 negate = !negate;
19591 break;
19592
19593 default:
19594 gcc_unreachable ();
19595 }
19596 }
19597 }
19598
19599 /* Allow the comparison to be done in one mode, but the movcc to
19600 happen in another mode. */
19601 if (data_mode == mode)
19602 {
19603 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19604 operands[1+negate], operands[2-negate]);
19605 }
19606 else
19607 {
19608 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19609 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19610 code, cop0, cop1,
19611 operands[1+negate], operands[2-negate]);
19612 x = gen_lowpart (data_mode, x);
19613 }
19614
19615 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19616 operands[2-negate]);
19617 return true;
19618 }
19619
19620 /* Expand a variable vector permutation. */
19621
19622 void
19623 ix86_expand_vec_perm (rtx operands[])
19624 {
19625 rtx target = operands[0];
19626 rtx op0 = operands[1];
19627 rtx op1 = operands[2];
19628 rtx mask = operands[3];
19629 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19630 enum machine_mode mode = GET_MODE (op0);
19631 enum machine_mode maskmode = GET_MODE (mask);
19632 int w, e, i;
19633 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19634
19635 /* Number of elements in the vector. */
19636 w = GET_MODE_NUNITS (mode);
19637 e = GET_MODE_UNIT_SIZE (mode);
19638 gcc_assert (w <= 32);
19639
19640 if (TARGET_AVX2)
19641 {
19642 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19643 {
19644 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19645 a constant shuffle operand. With a tiny bit of effort we can
19646 use VPERMD instead. A re-interpretation stall for V4DFmode is
19647 unfortunate but there's no avoiding it.
19648 Similarly for V16HImode we don't have instructions for variable
19649 shuffling, while for V32QImode we can, after preparing suitable
19650 masks, use vpshufb; vpshufb; vpermq; vpor. */
19651
19652 if (mode == V16HImode)
19653 {
19654 maskmode = mode = V32QImode;
19655 w = 32;
19656 e = 1;
19657 }
19658 else
19659 {
19660 maskmode = mode = V8SImode;
19661 w = 8;
19662 e = 4;
19663 }
19664 t1 = gen_reg_rtx (maskmode);
19665
19666 /* Replicate the low bits of the V4DImode mask into V8SImode:
19667 mask = { A B C D }
19668 t1 = { A A B B C C D D }. */
19669 for (i = 0; i < w / 2; ++i)
19670 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19671 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19672 vt = force_reg (maskmode, vt);
19673 mask = gen_lowpart (maskmode, mask);
19674 if (maskmode == V8SImode)
19675 emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
19676 else
19677 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19678
19679 /* Multiply the shuffle indices by two. */
19680 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19681 OPTAB_DIRECT);
19682
19683 /* Add one to the odd shuffle indices:
19684 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19685 for (i = 0; i < w / 2; ++i)
19686 {
19687 vec[i * 2] = const0_rtx;
19688 vec[i * 2 + 1] = const1_rtx;
19689 }
19690 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19691 vt = force_const_mem (maskmode, vt);
19692 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19693 OPTAB_DIRECT);
19694
19695 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19696 operands[3] = mask = t1;
19697 target = gen_lowpart (mode, target);
19698 op0 = gen_lowpart (mode, op0);
19699 op1 = gen_lowpart (mode, op1);
19700 }
19701
19702 switch (mode)
19703 {
19704 case V8SImode:
19705 /* The VPERMD and VPERMPS instructions already properly ignore
19706 the high bits of the shuffle elements. No need for us to
19707 perform an AND ourselves. */
19708 if (one_operand_shuffle)
19709 emit_insn (gen_avx2_permvarv8si (target, mask, op0));
19710 else
19711 {
19712 t1 = gen_reg_rtx (V8SImode);
19713 t2 = gen_reg_rtx (V8SImode);
19714 emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
19715 emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
19716 goto merge_two;
19717 }
19718 return;
19719
19720 case V8SFmode:
19721 mask = gen_lowpart (V8SFmode, mask);
19722 if (one_operand_shuffle)
19723 emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
19724 else
19725 {
19726 t1 = gen_reg_rtx (V8SFmode);
19727 t2 = gen_reg_rtx (V8SFmode);
19728 emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
19729 emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
19730 goto merge_two;
19731 }
19732 return;
19733
19734 case V4SImode:
19735 /* By combining the two 128-bit input vectors into one 256-bit
19736 input vector, we can use VPERMD and VPERMPS for the full
19737 two-operand shuffle. */
19738 t1 = gen_reg_rtx (V8SImode);
19739 t2 = gen_reg_rtx (V8SImode);
19740 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19741 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19742 emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
19743 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19744 return;
19745
19746 case V4SFmode:
19747 t1 = gen_reg_rtx (V8SFmode);
19748 t2 = gen_reg_rtx (V8SFmode);
19749 mask = gen_lowpart (V4SFmode, mask);
19750 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19751 emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
19752 emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
19753 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19754 return;
19755
19756 case V32QImode:
19757 t1 = gen_reg_rtx (V32QImode);
19758 t2 = gen_reg_rtx (V32QImode);
19759 t3 = gen_reg_rtx (V32QImode);
19760 vt2 = GEN_INT (128);
19761 for (i = 0; i < 32; i++)
19762 vec[i] = vt2;
19763 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19764 vt = force_reg (V32QImode, vt);
19765 for (i = 0; i < 32; i++)
19766 vec[i] = i < 16 ? vt2 : const0_rtx;
19767 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19768 vt2 = force_reg (V32QImode, vt2);
19769 /* From mask create two adjusted masks, which contain the same
19770 bits as mask in the low 7 bits of each vector element.
19771 The first mask will have the most significant bit clear
19772 if it requests element from the same 128-bit lane
19773 and MSB set if it requests element from the other 128-bit lane.
19774 The second mask will have the opposite values of the MSB,
19775 and additionally will have its 128-bit lanes swapped.
19776 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
19777 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
19778 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
19779 stands for the other 12 bytes. */
19780 /* The bit that tells whether an element is from the same lane or the
19781 other lane is bit 4, so shift it up by 3 to the MSB position. */
19782 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
19783 gen_lowpart (V4DImode, mask),
19784 GEN_INT (3)));
19785 /* Clear MSB bits from the mask just in case it had them set. */
19786 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
19787 /* After this t1 will have MSB set for elements from other lane. */
19788 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
19789 /* Clear bits other than MSB. */
19790 emit_insn (gen_andv32qi3 (t1, t1, vt));
19791 /* Or in the lower bits from mask into t3. */
19792 emit_insn (gen_iorv32qi3 (t3, t1, t2));
19793 /* And invert MSB bits in t1, so MSB is set for elements from the same
19794 lane. */
19795 emit_insn (gen_xorv32qi3 (t1, t1, vt));
19796 /* Swap 128-bit lanes in t3. */
19797 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19798 gen_lowpart (V4DImode, t3),
19799 const2_rtx, GEN_INT (3),
19800 const0_rtx, const1_rtx));
19801 /* And or in the lower bits from mask into t1. */
19802 emit_insn (gen_iorv32qi3 (t1, t1, t2));
19803 if (one_operand_shuffle)
19804 {
19805 /* Each of these shuffles will put 0s in places where
19806 element from the other 128-bit lane is needed, otherwise
19807 will shuffle in the requested value. */
19808 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
19809 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
19810 /* For t3 the 128-bit lanes are swapped again. */
19811 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19812 gen_lowpart (V4DImode, t3),
19813 const2_rtx, GEN_INT (3),
19814 const0_rtx, const1_rtx));
19815 /* And oring both together leads to the result. */
19816 emit_insn (gen_iorv32qi3 (target, t1, t3));
19817 return;
19818 }
19819
19820 t4 = gen_reg_rtx (V32QImode);
19821 /* Similar to the above one_operand_shuffle code,
19822 just repeated twice for each operand. The merge_two:
19823 code below will merge the two results together. */
19824 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
19825 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
19826 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
19827 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
19828 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
19829 gen_lowpart (V4DImode, t4),
19830 const2_rtx, GEN_INT (3),
19831 const0_rtx, const1_rtx));
19832 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19833 gen_lowpart (V4DImode, t3),
19834 const2_rtx, GEN_INT (3),
19835 const0_rtx, const1_rtx));
19836 emit_insn (gen_iorv32qi3 (t4, t2, t4));
19837 emit_insn (gen_iorv32qi3 (t3, t1, t3));
19838 t1 = t4;
19839 t2 = t3;
19840 goto merge_two;
19841
19842 default:
19843 gcc_assert (GET_MODE_SIZE (mode) <= 16);
19844 break;
19845 }
19846 }
19847
19848 if (TARGET_XOP)
19849 {
19850 /* The XOP VPPERM insn supports three inputs. By ignoring the
19851 one_operand_shuffle special case, we avoid creating another
19852 set of constant vectors in memory. */
19853 one_operand_shuffle = false;
19854
19855 /* mask = mask & {2*w-1, ...} */
19856 vt = GEN_INT (2*w - 1);
19857 }
19858 else
19859 {
19860 /* mask = mask & {w-1, ...} */
19861 vt = GEN_INT (w - 1);
19862 }
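
  /* With XOP, the pperm control selects among the 2*w concatenated
     elements of both inputs, hence the 2*w-1 mask; the SSSE3 pshufb
     path only ever indexes a single w-element input, hence w-1.  */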
19863
19864 for (i = 0; i < w; i++)
19865 vec[i] = vt;
19866 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19867 mask = expand_simple_binop (maskmode, AND, mask, vt,
19868 NULL_RTX, 0, OPTAB_DIRECT);
19869
19870 /* For non-QImode operations, convert the word permutation control
19871 into a byte permutation control. */
19872 if (mode != V16QImode)
19873 {
19874 mask = expand_simple_binop (maskmode, ASHIFT, mask,
19875 GEN_INT (exact_log2 (e)),
19876 NULL_RTX, 0, OPTAB_DIRECT);
19877
19878 /* Convert mask to vector of chars. */
19879 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
19880
19881 /* Replicate each of the input bytes into byte positions:
19882 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
19883 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
19884 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
19885 for (i = 0; i < 16; ++i)
19886 vec[i] = GEN_INT (i/e * e);
19887 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
19888 vt = force_const_mem (V16QImode, vt);
19889 if (TARGET_XOP)
19890 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
19891 else
19892 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
19893
19894 /* Convert it into the byte positions by doing
19895 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
19896 for (i = 0; i < 16; ++i)
19897 vec[i] = GEN_INT (i % e);
19898 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
19899 vt = force_const_mem (V16QImode, vt);
19900 emit_insn (gen_addv16qi3 (mask, mask, vt));
19901 }
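
      /* As a small worked example of the conversion above, take a V4SI
	 shuffle (e = 4) and a control element with value 3: the shift
	 turns it into 12, the pshufb replication spreads that 12 over
	 the four control bytes of the element, and adding { 0, 1, 2, 3 }
	 produces the byte indices { 12, 13, 14, 15 }, i.e. exactly the
	 bytes of source element 3.  */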
19902
19903 /* The actual shuffle operations all operate on V16QImode. */
19904 op0 = gen_lowpart (V16QImode, op0);
19905 op1 = gen_lowpart (V16QImode, op1);
19906 target = gen_lowpart (V16QImode, target);
19907
19908 if (TARGET_XOP)
19909 {
19910 emit_insn (gen_xop_pperm (target, op0, op1, mask));
19911 }
19912 else if (one_operand_shuffle)
19913 {
19914 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
19915 }
19916 else
19917 {
19918 rtx xops[6];
19919 bool ok;
19920
19921 /* Shuffle the two input vectors independently. */
19922 t1 = gen_reg_rtx (V16QImode);
19923 t2 = gen_reg_rtx (V16QImode);
19924 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
19925 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
19926
19927 merge_two:
19928 /* Then merge them together. The key is whether any given control
19929 element contained a bit set that indicates the second word. */
19930 mask = operands[3];
19931 vt = GEN_INT (w);
19932 if (maskmode == V2DImode && !TARGET_SSE4_1)
19933 {
19934 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
19935 more shuffle to convert the V2DI input mask into a V4SI
19936 input mask. At which point the masking that expand_int_vcond
19937 performs will work as desired. */
19938 rtx t3 = gen_reg_rtx (V4SImode);
19939 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
19940 const0_rtx, const0_rtx,
19941 const2_rtx, const2_rtx));
19942 mask = t3;
19943 maskmode = V4SImode;
19944 e = w = 4;
19945 }
19946
19947 for (i = 0; i < w; i++)
19948 vec[i] = vt;
19949 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19950 vt = force_reg (maskmode, vt);
19951 mask = expand_simple_binop (maskmode, AND, mask, vt,
19952 NULL_RTX, 0, OPTAB_DIRECT);
19953
19954 xops[0] = gen_lowpart (mode, operands[0]);
19955 xops[1] = gen_lowpart (mode, t2);
19956 xops[2] = gen_lowpart (mode, t1);
19957 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
19958 xops[4] = mask;
19959 xops[5] = vt;
19960 ok = ix86_expand_int_vcond (xops);
19961 gcc_assert (ok);
19962 }
19963 }
19964
19965 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
19966 true if we should do zero extension, else sign extension. HIGH_P is
19967 true if we want the N/2 high elements, else the low elements. */
19968
19969 void
19970 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
19971 {
19972 enum machine_mode imode = GET_MODE (operands[1]);
19973 rtx tmp, dest;
19974
19975 if (TARGET_SSE4_1)
19976 {
19977 rtx (*unpack)(rtx, rtx);
19978 rtx (*extract)(rtx, rtx) = NULL;
19979 enum machine_mode halfmode = BLKmode;
19980
19981 switch (imode)
19982 {
19983 case V32QImode:
19984 if (unsigned_p)
19985 unpack = gen_avx2_zero_extendv16qiv16hi2;
19986 else
19987 unpack = gen_avx2_sign_extendv16qiv16hi2;
19988 halfmode = V16QImode;
19989 extract
19990 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
19991 break;
19992 case V16HImode:
19993 if (unsigned_p)
19994 unpack = gen_avx2_zero_extendv8hiv8si2;
19995 else
19996 unpack = gen_avx2_sign_extendv8hiv8si2;
19997 halfmode = V8HImode;
19998 extract
19999 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20000 break;
20001 case V8SImode:
20002 if (unsigned_p)
20003 unpack = gen_avx2_zero_extendv4siv4di2;
20004 else
20005 unpack = gen_avx2_sign_extendv4siv4di2;
20006 halfmode = V4SImode;
20007 extract
20008 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20009 break;
20010 case V16QImode:
20011 if (unsigned_p)
20012 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20013 else
20014 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20015 break;
20016 case V8HImode:
20017 if (unsigned_p)
20018 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20019 else
20020 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20021 break;
20022 case V4SImode:
20023 if (unsigned_p)
20024 unpack = gen_sse4_1_zero_extendv2siv2di2;
20025 else
20026 unpack = gen_sse4_1_sign_extendv2siv2di2;
20027 break;
20028 default:
20029 gcc_unreachable ();
20030 }
20031
20032 if (GET_MODE_SIZE (imode) == 32)
20033 {
20034 tmp = gen_reg_rtx (halfmode);
20035 emit_insn (extract (tmp, operands[1]));
20036 }
20037 else if (high_p)
20038 {
20039 /* Shift the higher 8 bytes to the lower 8 bytes. */
20040 tmp = gen_reg_rtx (imode);
20041 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20042 gen_lowpart (V1TImode, operands[1]),
20043 GEN_INT (64)));
20044 }
20045 else
20046 tmp = operands[1];
20047
20048 emit_insn (unpack (operands[0], tmp));
20049 }
20050 else
20051 {
20052 rtx (*unpack)(rtx, rtx, rtx);
20053
20054 switch (imode)
20055 {
20056 case V16QImode:
20057 if (high_p)
20058 unpack = gen_vec_interleave_highv16qi;
20059 else
20060 unpack = gen_vec_interleave_lowv16qi;
20061 break;
20062 case V8HImode:
20063 if (high_p)
20064 unpack = gen_vec_interleave_highv8hi;
20065 else
20066 unpack = gen_vec_interleave_lowv8hi;
20067 break;
20068 case V4SImode:
20069 if (high_p)
20070 unpack = gen_vec_interleave_highv4si;
20071 else
20072 unpack = gen_vec_interleave_lowv4si;
20073 break;
20074 default:
20075 gcc_unreachable ();
20076 }
20077
20078 dest = gen_lowpart (imode, operands[0]);
20079
20080 if (unsigned_p)
20081 tmp = force_reg (imode, CONST0_RTX (imode));
20082 else
20083 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20084 operands[1], pc_rtx, pc_rtx);
20085
20086 emit_insn (unpack (dest, operands[1], tmp));
20087 }
20088 }
20089
20090 /* Expand conditional increment or decrement using adc/sbb instructions.
20091 The default case using setcc followed by the conditional move can be
20092 done by generic code. */
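/* For illustration (a sketch, not the exact emitted assembly): an unsigned
   conditional increment such as

       if (a < b) x++;      or equivalently      x += (a < b);

   can be expanded here as a compare that sets the carry flag followed by an
   add-with-carry of zero, roughly

       cmpl  %ebx, %eax
       adcl  $0, %ecx

   and the decrement / reversed-condition cases use sbb analogously.  */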
20093 bool
20094 ix86_expand_int_addcc (rtx operands[])
20095 {
20096 enum rtx_code code = GET_CODE (operands[1]);
20097 rtx flags;
20098 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20099 rtx compare_op;
20100 rtx val = const0_rtx;
20101 bool fpcmp = false;
20102 enum machine_mode mode;
20103 rtx op0 = XEXP (operands[1], 0);
20104 rtx op1 = XEXP (operands[1], 1);
20105
20106 if (operands[3] != const1_rtx
20107 && operands[3] != constm1_rtx)
20108 return false;
20109 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20110 return false;
20111 code = GET_CODE (compare_op);
20112
20113 flags = XEXP (compare_op, 0);
20114
20115 if (GET_MODE (flags) == CCFPmode
20116 || GET_MODE (flags) == CCFPUmode)
20117 {
20118 fpcmp = true;
20119 code = ix86_fp_compare_code_to_integer (code);
20120 }
20121
20122 if (code != LTU)
20123 {
20124 val = constm1_rtx;
20125 if (fpcmp)
20126 PUT_CODE (compare_op,
20127 reverse_condition_maybe_unordered
20128 (GET_CODE (compare_op)));
20129 else
20130 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20131 }
20132
20133 mode = GET_MODE (operands[0]);
20134
20135 /* Construct either adc or sbb insn. */
20136 if ((code == LTU) == (operands[3] == constm1_rtx))
20137 {
20138 switch (mode)
20139 {
20140 case QImode:
20141 insn = gen_subqi3_carry;
20142 break;
20143 case HImode:
20144 insn = gen_subhi3_carry;
20145 break;
20146 case SImode:
20147 insn = gen_subsi3_carry;
20148 break;
20149 case DImode:
20150 insn = gen_subdi3_carry;
20151 break;
20152 default:
20153 gcc_unreachable ();
20154 }
20155 }
20156 else
20157 {
20158 switch (mode)
20159 {
20160 case QImode:
20161 insn = gen_addqi3_carry;
20162 break;
20163 case HImode:
20164 insn = gen_addhi3_carry;
20165 break;
20166 case SImode:
20167 insn = gen_addsi3_carry;
20168 break;
20169 case DImode:
20170 insn = gen_adddi3_carry;
20171 break;
20172 default:
20173 gcc_unreachable ();
20174 }
20175 }
20176 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20177
20178 return true;
20179 }
20180
20181
20182 /* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
20183 but works for floating-point parameters and non-offsettable memories.
20184 For pushes, it returns just stack offsets; the values will be saved
20185 in the right order.  At most four parts are generated. */
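/* For example (following the size computation below): on a 32-bit target a
   DFmode value is split into two SImode parts, XFmode into three and TFmode
   into four; on a 64-bit target XFmode and TFmode each split into two parts.  */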
20186
20187 static int
20188 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20189 {
20190 int size;
20191
20192 if (!TARGET_64BIT)
20193 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20194 else
20195 size = (GET_MODE_SIZE (mode) + 4) / 8;
20196
20197 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20198 gcc_assert (size >= 2 && size <= 4);
20199
20200 /* Optimize constant pool references into immediates.  This is used by fp
20201 moves, which force all constants to memory to allow combining. */
20202 if (MEM_P (operand) && MEM_READONLY_P (operand))
20203 {
20204 rtx tmp = maybe_get_pool_constant (operand);
20205 if (tmp)
20206 operand = tmp;
20207 }
20208
20209 if (MEM_P (operand) && !offsettable_memref_p (operand))
20210 {
20211 /* The only non-offsettable memories we handle are pushes. */
20212 int ok = push_operand (operand, VOIDmode);
20213
20214 gcc_assert (ok);
20215
20216 operand = copy_rtx (operand);
20217 PUT_MODE (operand, Pmode);
20218 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20219 return size;
20220 }
20221
20222 if (GET_CODE (operand) == CONST_VECTOR)
20223 {
20224 enum machine_mode imode = int_mode_for_mode (mode);
20225 /* Caution: if we looked through a constant pool memory above,
20226 the operand may actually have a different mode now. That's
20227 ok, since we want to pun this all the way back to an integer. */
20228 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20229 gcc_assert (operand != NULL);
20230 mode = imode;
20231 }
20232
20233 if (!TARGET_64BIT)
20234 {
20235 if (mode == DImode)
20236 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20237 else
20238 {
20239 int i;
20240
20241 if (REG_P (operand))
20242 {
20243 gcc_assert (reload_completed);
20244 for (i = 0; i < size; i++)
20245 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20246 }
20247 else if (offsettable_memref_p (operand))
20248 {
20249 operand = adjust_address (operand, SImode, 0);
20250 parts[0] = operand;
20251 for (i = 1; i < size; i++)
20252 parts[i] = adjust_address (operand, SImode, 4 * i);
20253 }
20254 else if (GET_CODE (operand) == CONST_DOUBLE)
20255 {
20256 REAL_VALUE_TYPE r;
20257 long l[4];
20258
20259 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20260 switch (mode)
20261 {
20262 case TFmode:
20263 real_to_target (l, &r, mode);
20264 parts[3] = gen_int_mode (l[3], SImode);
20265 parts[2] = gen_int_mode (l[2], SImode);
20266 break;
20267 case XFmode:
20268 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20269 parts[2] = gen_int_mode (l[2], SImode);
20270 break;
20271 case DFmode:
20272 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20273 break;
20274 default:
20275 gcc_unreachable ();
20276 }
20277 parts[1] = gen_int_mode (l[1], SImode);
20278 parts[0] = gen_int_mode (l[0], SImode);
20279 }
20280 else
20281 gcc_unreachable ();
20282 }
20283 }
20284 else
20285 {
20286 if (mode == TImode)
20287 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20288 if (mode == XFmode || mode == TFmode)
20289 {
20290 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20291 if (REG_P (operand))
20292 {
20293 gcc_assert (reload_completed);
20294 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20295 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20296 }
20297 else if (offsettable_memref_p (operand))
20298 {
20299 operand = adjust_address (operand, DImode, 0);
20300 parts[0] = operand;
20301 parts[1] = adjust_address (operand, upper_mode, 8);
20302 }
20303 else if (GET_CODE (operand) == CONST_DOUBLE)
20304 {
20305 REAL_VALUE_TYPE r;
20306 long l[4];
20307
20308 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20309 real_to_target (l, &r, mode);
20310
20311 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20312 if (HOST_BITS_PER_WIDE_INT >= 64)
20313 parts[0]
20314 = gen_int_mode
20315 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20316 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20317 DImode);
20318 else
20319 parts[0] = immed_double_const (l[0], l[1], DImode);
20320
20321 if (upper_mode == SImode)
20322 parts[1] = gen_int_mode (l[2], SImode);
20323 else if (HOST_BITS_PER_WIDE_INT >= 64)
20324 parts[1]
20325 = gen_int_mode
20326 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20327 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20328 DImode);
20329 else
20330 parts[1] = immed_double_const (l[2], l[3], DImode);
20331 }
20332 else
20333 gcc_unreachable ();
20334 }
20335 }
20336
20337 return size;
20338 }
20339
20340 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20341 All required insns are emitted here; operands 2-5 are used to hold the
20342 destination parts and operands 6-9 the source parts, in the
20343 correct order. */
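/* As an illustration (a sketch, not literal RTL): on a 32-bit target a
   DImode register-to-memory move

       (set (mem:DI addr) (reg:DI x))

   is split into two SImode moves of the low and high halves, emitted in an
   order chosen below so that a source address register is not clobbered
   before it is used.  */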
20344
20345 void
20346 ix86_split_long_move (rtx operands[])
20347 {
20348 rtx part[2][4];
20349 int nparts, i, j;
20350 int push = 0;
20351 int collisions = 0;
20352 enum machine_mode mode = GET_MODE (operands[0]);
20353 bool collisionparts[4];
20354
20355 /* The DFmode expanders may ask us to move a double.
20356 For a 64-bit target this is a single move.  By hiding that fact
20357 here we simplify the i386.md splitters. */
20358 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20359 {
20360 /* Optimize constant pool references into immediates.  This is used by
20361 fp moves, which force all constants to memory to allow combining. */
20362
20363 if (MEM_P (operands[1])
20364 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20365 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20366 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20367 if (push_operand (operands[0], VOIDmode))
20368 {
20369 operands[0] = copy_rtx (operands[0]);
20370 PUT_MODE (operands[0], Pmode);
20371 }
20372 else
20373 operands[0] = gen_lowpart (DImode, operands[0]);
20374 operands[1] = gen_lowpart (DImode, operands[1]);
20375 emit_move_insn (operands[0], operands[1]);
20376 return;
20377 }
20378
20379 /* The only non-offsettable memory we handle is push. */
20380 if (push_operand (operands[0], VOIDmode))
20381 push = 1;
20382 else
20383 gcc_assert (!MEM_P (operands[0])
20384 || offsettable_memref_p (operands[0]));
20385
20386 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20387 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20388
20389 /* When emitting push, take care for source operands on the stack. */
20390 if (push && MEM_P (operands[1])
20391 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20392 {
20393 rtx src_base = XEXP (part[1][nparts - 1], 0);
20394
20395 /* Compensate for the stack decrement by 4. */
20396 if (!TARGET_64BIT && nparts == 3
20397 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20398 src_base = plus_constant (src_base, 4);
20399
20400 /* src_base refers to the stack pointer and is
20401 automatically decreased by the emitted pushes. */
20402 for (i = 0; i < nparts; i++)
20403 part[1][i] = change_address (part[1][i],
20404 GET_MODE (part[1][i]), src_base);
20405 }
20406
20407 /* We need to do the copy in the right order in case an address register
20408 of the source overlaps the destination. */
20409 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20410 {
20411 rtx tmp;
20412
20413 for (i = 0; i < nparts; i++)
20414 {
20415 collisionparts[i]
20416 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20417 if (collisionparts[i])
20418 collisions++;
20419 }
20420
20421 /* Collision in the middle part can be handled by reordering. */
20422 if (collisions == 1 && nparts == 3 && collisionparts [1])
20423 {
20424 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20425 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20426 }
20427 else if (collisions == 1
20428 && nparts == 4
20429 && (collisionparts [1] || collisionparts [2]))
20430 {
20431 if (collisionparts [1])
20432 {
20433 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20434 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20435 }
20436 else
20437 {
20438 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20439 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20440 }
20441 }
20442
20443 /* If there are more collisions, we can't handle it by reordering.
20444 Do an lea to the last part and use only one colliding move. */
20445 else if (collisions > 1)
20446 {
20447 rtx base;
20448
20449 collisions = 1;
20450
20451 base = part[0][nparts - 1];
20452
20453 /* Handle the case when the last part isn't valid for lea.
20454 Happens in 64-bit mode storing the 12-byte XFmode. */
20455 if (GET_MODE (base) != Pmode)
20456 base = gen_rtx_REG (Pmode, REGNO (base));
20457
20458 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20459 part[1][0] = replace_equiv_address (part[1][0], base);
20460 for (i = 1; i < nparts; i++)
20461 {
20462 tmp = plus_constant (base, UNITS_PER_WORD * i);
20463 part[1][i] = replace_equiv_address (part[1][i], tmp);
20464 }
20465 }
20466 }
20467
20468 if (push)
20469 {
20470 if (!TARGET_64BIT)
20471 {
20472 if (nparts == 3)
20473 {
20474 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20475 emit_insn (gen_addsi3 (stack_pointer_rtx,
20476 stack_pointer_rtx, GEN_INT (-4)));
20477 emit_move_insn (part[0][2], part[1][2]);
20478 }
20479 else if (nparts == 4)
20480 {
20481 emit_move_insn (part[0][3], part[1][3]);
20482 emit_move_insn (part[0][2], part[1][2]);
20483 }
20484 }
20485 else
20486 {
20487 /* In 64-bit mode we don't have a 32-bit push available.  If this is a
20488 register, that is OK - we will just use the larger counterpart.  We also
20489 retype memory - these come from an attempt to avoid a REX prefix on
20490 moving the second half of a TFmode value. */
20491 if (GET_MODE (part[1][1]) == SImode)
20492 {
20493 switch (GET_CODE (part[1][1]))
20494 {
20495 case MEM:
20496 part[1][1] = adjust_address (part[1][1], DImode, 0);
20497 break;
20498
20499 case REG:
20500 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20501 break;
20502
20503 default:
20504 gcc_unreachable ();
20505 }
20506
20507 if (GET_MODE (part[1][0]) == SImode)
20508 part[1][0] = part[1][1];
20509 }
20510 }
20511 emit_move_insn (part[0][1], part[1][1]);
20512 emit_move_insn (part[0][0], part[1][0]);
20513 return;
20514 }
20515
20516 /* Choose the correct order so as not to overwrite the source before it is copied. */
20517 if ((REG_P (part[0][0])
20518 && REG_P (part[1][1])
20519 && (REGNO (part[0][0]) == REGNO (part[1][1])
20520 || (nparts == 3
20521 && REGNO (part[0][0]) == REGNO (part[1][2]))
20522 || (nparts == 4
20523 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20524 || (collisions > 0
20525 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20526 {
20527 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20528 {
20529 operands[2 + i] = part[0][j];
20530 operands[6 + i] = part[1][j];
20531 }
20532 }
20533 else
20534 {
20535 for (i = 0; i < nparts; i++)
20536 {
20537 operands[2 + i] = part[0][i];
20538 operands[6 + i] = part[1][i];
20539 }
20540 }
20541
20542 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20543 if (optimize_insn_for_size_p ())
20544 {
20545 for (j = 0; j < nparts - 1; j++)
20546 if (CONST_INT_P (operands[6 + j])
20547 && operands[6 + j] != const0_rtx
20548 && REG_P (operands[2 + j]))
20549 for (i = j; i < nparts - 1; i++)
20550 if (CONST_INT_P (operands[7 + i])
20551 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20552 operands[7 + i] = operands[2 + j];
20553 }
20554
20555 for (i = 0; i < nparts; i++)
20556 emit_move_insn (operands[2 + i], operands[6 + i]);
20557
20558 return;
20559 }
20560
20561 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20562 left shift by a constant, either using a single shift or
20563 a sequence of add instructions. */
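/* For instance, when adds are cheap enough a left shift of the half-word
   operand by 2 may be emitted as two self-adds (e.g. "addl %eax, %eax"
   twice) rather than a single shift instruction; a sketch, the exact
   choice depends on the cost table and -Os.  */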
20564
20565 static void
20566 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20567 {
20568 rtx (*insn)(rtx, rtx, rtx);
20569
20570 if (count == 1
20571 || (count * ix86_cost->add <= ix86_cost->shift_const
20572 && !optimize_insn_for_size_p ()))
20573 {
20574 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20575 while (count-- > 0)
20576 emit_insn (insn (operand, operand, operand));
20577 }
20578 else
20579 {
20580 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20581 emit_insn (insn (operand, operand, GEN_INT (count)));
20582 }
20583 }
20584
20585 void
20586 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20587 {
20588 rtx (*gen_ashl3)(rtx, rtx, rtx);
20589 rtx (*gen_shld)(rtx, rtx, rtx);
20590 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20591
20592 rtx low[2], high[2];
20593 int count;
20594
20595 if (CONST_INT_P (operands[2]))
20596 {
20597 split_double_mode (mode, operands, 2, low, high);
20598 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20599
20600 if (count >= half_width)
20601 {
20602 emit_move_insn (high[0], low[1]);
20603 emit_move_insn (low[0], const0_rtx);
20604
20605 if (count > half_width)
20606 ix86_expand_ashl_const (high[0], count - half_width, mode);
20607 }
20608 else
20609 {
20610 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20611
20612 if (!rtx_equal_p (operands[0], operands[1]))
20613 emit_move_insn (operands[0], operands[1]);
20614
20615 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20616 ix86_expand_ashl_const (low[0], count, mode);
20617 }
20618 return;
20619 }
20620
20621 split_double_mode (mode, operands, 1, low, high);
20622
20623 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20624
20625 if (operands[1] == const1_rtx)
20626 {
20627 /* Assuming we've chosen QImode-capable registers, 1 << N
20628 can be done with two 32/64-bit shifts, no branches, no cmoves. */
20629 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20630 {
20631 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20632
20633 ix86_expand_clear (low[0]);
20634 ix86_expand_clear (high[0]);
20635 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20636
20637 d = gen_lowpart (QImode, low[0]);
20638 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20639 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20640 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20641
20642 d = gen_lowpart (QImode, high[0]);
20643 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20644 s = gen_rtx_NE (QImode, flags, const0_rtx);
20645 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20646 }
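      /* The branch above emits, roughly (a sketch for the DImode case):
             xorl  low, low
             xorl  high, high
             testb $32, %cl
             sete  low_byte
             setne high_byte
         after which both halves are shifted left by the count (which the
         hardware masks mod 32), leaving the single 1 bit in the correct
         half.  */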
20647
20648 /* Otherwise, we can get the same results by manually performing
20649 a bit extract operation on bit 5/6, and then performing the two
20650 shifts. The two methods of getting 0/1 into low/high are exactly
20651 the same size. Avoiding the shift in the bit extract case helps
20652 pentium4 a bit; no one else seems to care much either way. */
20653 else
20654 {
20655 enum machine_mode half_mode;
20656 rtx (*gen_lshr3)(rtx, rtx, rtx);
20657 rtx (*gen_and3)(rtx, rtx, rtx);
20658 rtx (*gen_xor3)(rtx, rtx, rtx);
20659 HOST_WIDE_INT bits;
20660 rtx x;
20661
20662 if (mode == DImode)
20663 {
20664 half_mode = SImode;
20665 gen_lshr3 = gen_lshrsi3;
20666 gen_and3 = gen_andsi3;
20667 gen_xor3 = gen_xorsi3;
20668 bits = 5;
20669 }
20670 else
20671 {
20672 half_mode = DImode;
20673 gen_lshr3 = gen_lshrdi3;
20674 gen_and3 = gen_anddi3;
20675 gen_xor3 = gen_xordi3;
20676 bits = 6;
20677 }
20678
20679 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20680 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20681 else
20682 x = gen_lowpart (half_mode, operands[2]);
20683 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20684
20685 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20686 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20687 emit_move_insn (low[0], high[0]);
20688 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20689 }
20690
20691 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20692 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20693 return;
20694 }
20695
20696 if (operands[1] == constm1_rtx)
20697 {
20698 /* For -1 << N, we can avoid the shld instruction, because we
20699 know that we're shifting 0...31/63 ones into a -1. */
20700 emit_move_insn (low[0], constm1_rtx);
20701 if (optimize_insn_for_size_p ())
20702 emit_move_insn (high[0], low[0]);
20703 else
20704 emit_move_insn (high[0], constm1_rtx);
20705 }
20706 else
20707 {
20708 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20709
20710 if (!rtx_equal_p (operands[0], operands[1]))
20711 emit_move_insn (operands[0], operands[1]);
20712
20713 split_double_mode (mode, operands, 1, low, high);
20714 emit_insn (gen_shld (high[0], low[0], operands[2]));
20715 }
20716
20717 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20718
20719 if (TARGET_CMOVE && scratch)
20720 {
20721 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20722 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20723
20724 ix86_expand_clear (scratch);
20725 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20726 }
20727 else
20728 {
20729 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20730 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20731
20732 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20733 }
20734 }
20735
20736 void
20737 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20738 {
20739 rtx (*gen_ashr3)(rtx, rtx, rtx)
20740 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20741 rtx (*gen_shrd)(rtx, rtx, rtx);
20742 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20743
20744 rtx low[2], high[2];
20745 int count;
20746
20747 if (CONST_INT_P (operands[2]))
20748 {
20749 split_double_mode (mode, operands, 2, low, high);
20750 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20751
20752 if (count == GET_MODE_BITSIZE (mode) - 1)
20753 {
20754 emit_move_insn (high[0], high[1]);
20755 emit_insn (gen_ashr3 (high[0], high[0],
20756 GEN_INT (half_width - 1)));
20757 emit_move_insn (low[0], high[0]);
20758
20759 }
20760 else if (count >= half_width)
20761 {
20762 emit_move_insn (low[0], high[1]);
20763 emit_move_insn (high[0], low[0]);
20764 emit_insn (gen_ashr3 (high[0], high[0],
20765 GEN_INT (half_width - 1)));
20766
20767 if (count > half_width)
20768 emit_insn (gen_ashr3 (low[0], low[0],
20769 GEN_INT (count - half_width)));
20770 }
20771 else
20772 {
20773 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20774
20775 if (!rtx_equal_p (operands[0], operands[1]))
20776 emit_move_insn (operands[0], operands[1]);
20777
20778 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20779 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
20780 }
20781 }
20782 else
20783 {
20784 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20785
20786 if (!rtx_equal_p (operands[0], operands[1]))
20787 emit_move_insn (operands[0], operands[1]);
20788
20789 split_double_mode (mode, operands, 1, low, high);
20790
20791 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20792 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
20793
20794 if (TARGET_CMOVE && scratch)
20795 {
20796 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20797 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20798
20799 emit_move_insn (scratch, high[0]);
20800 emit_insn (gen_ashr3 (scratch, scratch,
20801 GEN_INT (half_width - 1)));
20802 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20803 scratch));
20804 }
20805 else
20806 {
20807 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
20808 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
20809
20810 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
20811 }
20812 }
20813 }
20814
20815 void
20816 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
20817 {
20818 rtx (*gen_lshr3)(rtx, rtx, rtx)
20819 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
20820 rtx (*gen_shrd)(rtx, rtx, rtx);
20821 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20822
20823 rtx low[2], high[2];
20824 int count;
20825
20826 if (CONST_INT_P (operands[2]))
20827 {
20828 split_double_mode (mode, operands, 2, low, high);
20829 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20830
20831 if (count >= half_width)
20832 {
20833 emit_move_insn (low[0], high[1]);
20834 ix86_expand_clear (high[0]);
20835
20836 if (count > half_width)
20837 emit_insn (gen_lshr3 (low[0], low[0],
20838 GEN_INT (count - half_width)));
20839 }
20840 else
20841 {
20842 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20843
20844 if (!rtx_equal_p (operands[0], operands[1]))
20845 emit_move_insn (operands[0], operands[1]);
20846
20847 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20848 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
20849 }
20850 }
20851 else
20852 {
20853 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20854
20855 if (!rtx_equal_p (operands[0], operands[1]))
20856 emit_move_insn (operands[0], operands[1]);
20857
20858 split_double_mode (mode, operands, 1, low, high);
20859
20860 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20861 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
20862
20863 if (TARGET_CMOVE && scratch)
20864 {
20865 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20866 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20867
20868 ix86_expand_clear (scratch);
20869 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20870 scratch));
20871 }
20872 else
20873 {
20874 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20875 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20876
20877 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
20878 }
20879 }
20880 }
20881
20882 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
20883 static void
20884 predict_jump (int prob)
20885 {
20886 rtx insn = get_last_insn ();
20887 gcc_assert (JUMP_P (insn));
20888 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
20889 }
20890
20891 /* Helper function for the string operations below.  Test whether VARIABLE
20892 is aligned to VALUE bytes.  If it is, jump to the returned label. */
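/* For example (a sketch of the emitted code), a call such as
   ix86_expand_aligntest (destptr, 4, false) produces roughly

       testl $4, %edi
       je    .Llabel

   i.e. the code following the call is skipped when that alignment bit of
   the pointer is clear; the returned label must later be emitted by the
   caller.  */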
20893 static rtx
20894 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
20895 {
20896 rtx label = gen_label_rtx ();
20897 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
20898 if (GET_MODE (variable) == DImode)
20899 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
20900 else
20901 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
20902 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
20903 1, label);
20904 if (epilogue)
20905 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20906 else
20907 predict_jump (REG_BR_PROB_BASE * 90 / 100);
20908 return label;
20909 }
20910
20911 /* Decrease COUNTREG by VALUE. */
20912 static void
20913 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
20914 {
20915 rtx (*gen_add)(rtx, rtx, rtx)
20916 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
20917
20918 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
20919 }
20920
20921 /* Zero extend EXP, which may be in SImode, to a Pmode register. */
20922 rtx
20923 ix86_zero_extend_to_Pmode (rtx exp)
20924 {
20925 rtx r;
20926 if (GET_MODE (exp) == VOIDmode)
20927 return force_reg (Pmode, exp);
20928 if (GET_MODE (exp) == Pmode)
20929 return copy_to_mode_reg (Pmode, exp);
20930 r = gen_reg_rtx (Pmode);
20931 emit_insn (gen_zero_extendsidi2 (r, exp));
20932 return r;
20933 }
20934
20935 /* Divide COUNTREG by SCALE. */
20936 static rtx
20937 scale_counter (rtx countreg, int scale)
20938 {
20939 rtx sc;
20940
20941 if (scale == 1)
20942 return countreg;
20943 if (CONST_INT_P (countreg))
20944 return GEN_INT (INTVAL (countreg) / scale);
20945 gcc_assert (REG_P (countreg));
20946
20947 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
20948 GEN_INT (exact_log2 (scale)),
20949 NULL, 1, OPTAB_DIRECT);
20950 return sc;
20951 }
20952
20953 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
20954 DImode for constant loop counts. */
20955
20956 static enum machine_mode
20957 counter_mode (rtx count_exp)
20958 {
20959 if (GET_MODE (count_exp) != VOIDmode)
20960 return GET_MODE (count_exp);
20961 if (!CONST_INT_P (count_exp))
20962 return Pmode;
20963 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
20964 return DImode;
20965 return SImode;
20966 }
20967
20968 /* When SRCPTR is non-NULL, output a simple loop that moves memory pointed
20969 to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the
20970 overall size is COUNT, specified in bytes.  When SRCPTR is NULL, output the
20971 equivalent loop to set memory to VALUE (which is expected to be in MODE).
20972
20973 The size is rounded down to a whole number of the chunk size moved at once.
20974 SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info. */
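/* Schematically, the emitted code has this shape (a sketch, not literal RTL):

       size = count & ~(piece - 1);    iter = 0;
     top:
       <move (or store VALUE into) "piece" bytes at dest+iter (and src+iter)>
       iter += piece;
       if (iter < size) goto top;
       dest += iter;  src += iter;
     out:

   where piece = GET_MODE_SIZE (mode) * unroll.  */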
20975
20976
20977 static void
20978 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
20979 rtx destptr, rtx srcptr, rtx value,
20980 rtx count, enum machine_mode mode, int unroll,
20981 int expected_size)
20982 {
20983 rtx out_label, top_label, iter, tmp;
20984 enum machine_mode iter_mode = counter_mode (count);
20985 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
20986 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
20987 rtx size;
20988 rtx x_addr;
20989 rtx y_addr;
20990 int i;
20991
20992 top_label = gen_label_rtx ();
20993 out_label = gen_label_rtx ();
20994 iter = gen_reg_rtx (iter_mode);
20995
20996 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
20997 NULL, 1, OPTAB_DIRECT);
20998 /* Those two should combine. */
20999 if (piece_size == const1_rtx)
21000 {
21001 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21002 true, out_label);
21003 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21004 }
21005 emit_move_insn (iter, const0_rtx);
21006
21007 emit_label (top_label);
21008
21009 tmp = convert_modes (Pmode, iter_mode, iter, true);
21010 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21011 destmem = change_address (destmem, mode, x_addr);
21012
21013 if (srcmem)
21014 {
21015 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21016 srcmem = change_address (srcmem, mode, y_addr);
21017
21018 /* When unrolling for chips that reorder memory reads and writes,
21019 we can save registers by using a single temporary.
21020 Also, using 4 temporaries is overkill in 32-bit mode. */
21021 if (!TARGET_64BIT && 0)
21022 {
21023 for (i = 0; i < unroll; i++)
21024 {
21025 if (i)
21026 {
21027 destmem =
21028 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21029 srcmem =
21030 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21031 }
21032 emit_move_insn (destmem, srcmem);
21033 }
21034 }
21035 else
21036 {
21037 rtx tmpreg[4];
21038 gcc_assert (unroll <= 4);
21039 for (i = 0; i < unroll; i++)
21040 {
21041 tmpreg[i] = gen_reg_rtx (mode);
21042 if (i)
21043 {
21044 srcmem =
21045 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21046 }
21047 emit_move_insn (tmpreg[i], srcmem);
21048 }
21049 for (i = 0; i < unroll; i++)
21050 {
21051 if (i)
21052 {
21053 destmem =
21054 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21055 }
21056 emit_move_insn (destmem, tmpreg[i]);
21057 }
21058 }
21059 }
21060 else
21061 for (i = 0; i < unroll; i++)
21062 {
21063 if (i)
21064 destmem =
21065 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21066 emit_move_insn (destmem, value);
21067 }
21068
21069 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21070 true, OPTAB_LIB_WIDEN);
21071 if (tmp != iter)
21072 emit_move_insn (iter, tmp);
21073
21074 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21075 true, top_label);
21076 if (expected_size != -1)
21077 {
21078 expected_size /= GET_MODE_SIZE (mode) * unroll;
21079 if (expected_size == 0)
21080 predict_jump (0);
21081 else if (expected_size > REG_BR_PROB_BASE)
21082 predict_jump (REG_BR_PROB_BASE - 1);
21083 else
21084 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21085 }
21086 else
21087 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21088 iter = ix86_zero_extend_to_Pmode (iter);
21089 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21090 true, OPTAB_LIB_WIDEN);
21091 if (tmp != destptr)
21092 emit_move_insn (destptr, tmp);
21093 if (srcptr)
21094 {
21095 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21096 true, OPTAB_LIB_WIDEN);
21097 if (tmp != srcptr)
21098 emit_move_insn (srcptr, tmp);
21099 }
21100 emit_label (out_label);
21101 }
21102
21103 /* Output "rep; mov" instruction.
21104 Arguments have the same meaning as for the previous function. */
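/* For example (roughly; a sketch of the resulting assembly), with MODE ==
   SImode and a non-constant count, the count is scaled down by 4 and
   something like

       shrl $2, %ecx
       rep movsl

   is emitted, with %edi, %esi and %ecx standing for DESTPTR, SRCPTR and
   the scaled count.  */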
21105 static void
21106 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21107 rtx destptr, rtx srcptr,
21108 rtx count,
21109 enum machine_mode mode)
21110 {
21111 rtx destexp;
21112 rtx srcexp;
21113 rtx countreg;
21114 HOST_WIDE_INT rounded_count;
21115
21116 /* If the size is known to be a multiple of 4, it is shorter to use rep movsl. */
21117 if (mode == QImode && CONST_INT_P (count)
21118 && !(INTVAL (count) & 3))
21119 mode = SImode;
21120
21121 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21122 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21123 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21124 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21125 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21126 if (mode != QImode)
21127 {
21128 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21129 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21130 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21131 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21132 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21133 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21134 }
21135 else
21136 {
21137 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21138 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21139 }
21140 if (CONST_INT_P (count))
21141 {
21142 rounded_count = (INTVAL (count)
21143 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21144 destmem = shallow_copy_rtx (destmem);
21145 srcmem = shallow_copy_rtx (srcmem);
21146 set_mem_size (destmem, rounded_count);
21147 set_mem_size (srcmem, rounded_count);
21148 }
21149 else
21150 {
21151 if (MEM_SIZE_KNOWN_P (destmem))
21152 clear_mem_size (destmem);
21153 if (MEM_SIZE_KNOWN_P (srcmem))
21154 clear_mem_size (srcmem);
21155 }
21156 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21157 destexp, srcexp));
21158 }
21159
21160 /* Output "rep; stos" instruction.
21161 Arguments have the same meaning as for the previous function. */
21162 static void
21163 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21164 rtx count, enum machine_mode mode,
21165 rtx orig_value)
21166 {
21167 rtx destexp;
21168 rtx countreg;
21169 HOST_WIDE_INT rounded_count;
21170
21171 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21172 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21173 value = force_reg (mode, gen_lowpart (mode, value));
21174 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21175 if (mode != QImode)
21176 {
21177 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21178 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21179 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21180 }
21181 else
21182 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21183 if (orig_value == const0_rtx && CONST_INT_P (count))
21184 {
21185 rounded_count = (INTVAL (count)
21186 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21187 destmem = shallow_copy_rtx (destmem);
21188 set_mem_size (destmem, rounded_count);
21189 }
21190 else if (MEM_SIZE_KNOWN_P (destmem))
21191 clear_mem_size (destmem);
21192 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21193 }
21194
21195 static void
21196 emit_strmov (rtx destmem, rtx srcmem,
21197 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21198 {
21199 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21200 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21201 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21202 }
21203
21204 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
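/* For instance, a known residual count of 7 bytes on a 32-bit target is
   copied with one SImode, one HImode and one QImode move (at offsets 0, 4
   and 6); for a small variable count a chain of alignment tests is used
   instead.  */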
21205 static void
21206 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21207 rtx destptr, rtx srcptr, rtx count, int max_size)
21208 {
21209 rtx src, dest;
21210 if (CONST_INT_P (count))
21211 {
21212 HOST_WIDE_INT countval = INTVAL (count);
21213 int offset = 0;
21214
21215 if ((countval & 0x10) && max_size > 16)
21216 {
21217 if (TARGET_64BIT)
21218 {
21219 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21220 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21221 }
21222 else
21223 gcc_unreachable ();
21224 offset += 16;
21225 }
21226 if ((countval & 0x08) && max_size > 8)
21227 {
21228 if (TARGET_64BIT)
21229 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21230 else
21231 {
21232 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21233 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21234 }
21235 offset += 8;
21236 }
21237 if ((countval & 0x04) && max_size > 4)
21238 {
21239 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21240 offset += 4;
21241 }
21242 if ((countval & 0x02) && max_size > 2)
21243 {
21244 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21245 offset += 2;
21246 }
21247 if ((countval & 0x01) && max_size > 1)
21248 {
21249 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21250 offset += 1;
21251 }
21252 return;
21253 }
21254 if (max_size > 8)
21255 {
21256 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21257 count, 1, OPTAB_DIRECT);
21258 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21259 count, QImode, 1, 4);
21260 return;
21261 }
21262
21263 /* When single stringop instructions are available, we can cheaply advance
21264 the dest and src pointers.  Otherwise we save code size by maintaining
21265 an offset (zero is readily available from the preceding rep operation)
21266 and using x86 addressing modes. */
21267 if (TARGET_SINGLE_STRINGOP)
21268 {
21269 if (max_size > 4)
21270 {
21271 rtx label = ix86_expand_aligntest (count, 4, true);
21272 src = change_address (srcmem, SImode, srcptr);
21273 dest = change_address (destmem, SImode, destptr);
21274 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21275 emit_label (label);
21276 LABEL_NUSES (label) = 1;
21277 }
21278 if (max_size > 2)
21279 {
21280 rtx label = ix86_expand_aligntest (count, 2, true);
21281 src = change_address (srcmem, HImode, srcptr);
21282 dest = change_address (destmem, HImode, destptr);
21283 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21284 emit_label (label);
21285 LABEL_NUSES (label) = 1;
21286 }
21287 if (max_size > 1)
21288 {
21289 rtx label = ix86_expand_aligntest (count, 1, true);
21290 src = change_address (srcmem, QImode, srcptr);
21291 dest = change_address (destmem, QImode, destptr);
21292 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21293 emit_label (label);
21294 LABEL_NUSES (label) = 1;
21295 }
21296 }
21297 else
21298 {
21299 rtx offset = force_reg (Pmode, const0_rtx);
21300 rtx tmp;
21301
21302 if (max_size > 4)
21303 {
21304 rtx label = ix86_expand_aligntest (count, 4, true);
21305 src = change_address (srcmem, SImode, srcptr);
21306 dest = change_address (destmem, SImode, destptr);
21307 emit_move_insn (dest, src);
21308 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21309 true, OPTAB_LIB_WIDEN);
21310 if (tmp != offset)
21311 emit_move_insn (offset, tmp);
21312 emit_label (label);
21313 LABEL_NUSES (label) = 1;
21314 }
21315 if (max_size > 2)
21316 {
21317 rtx label = ix86_expand_aligntest (count, 2, true);
21318 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21319 src = change_address (srcmem, HImode, tmp);
21320 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21321 dest = change_address (destmem, HImode, tmp);
21322 emit_move_insn (dest, src);
21323 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21324 true, OPTAB_LIB_WIDEN);
21325 if (tmp != offset)
21326 emit_move_insn (offset, tmp);
21327 emit_label (label);
21328 LABEL_NUSES (label) = 1;
21329 }
21330 if (max_size > 1)
21331 {
21332 rtx label = ix86_expand_aligntest (count, 1, true);
21333 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21334 src = change_address (srcmem, QImode, tmp);
21335 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21336 dest = change_address (destmem, QImode, tmp);
21337 emit_move_insn (dest, src);
21338 emit_label (label);
21339 LABEL_NUSES (label) = 1;
21340 }
21341 }
21342 }
21343
21344 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21345 static void
21346 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21347 rtx count, int max_size)
21348 {
21349 count =
21350 expand_simple_binop (counter_mode (count), AND, count,
21351 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21352 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21353 gen_lowpart (QImode, value), count, QImode,
21354 1, max_size / 2);
21355 }
21356
21357 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21358 static void
21359 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21360 {
21361 rtx dest;
21362
21363 if (CONST_INT_P (count))
21364 {
21365 HOST_WIDE_INT countval = INTVAL (count);
21366 int offset = 0;
21367
21368 if ((countval & 0x10) && max_size > 16)
21369 {
21370 if (TARGET_64BIT)
21371 {
21372 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21373 emit_insn (gen_strset (destptr, dest, value));
21374 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21375 emit_insn (gen_strset (destptr, dest, value));
21376 }
21377 else
21378 gcc_unreachable ();
21379 offset += 16;
21380 }
21381 if ((countval & 0x08) && max_size > 8)
21382 {
21383 if (TARGET_64BIT)
21384 {
21385 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21386 emit_insn (gen_strset (destptr, dest, value));
21387 }
21388 else
21389 {
21390 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21391 emit_insn (gen_strset (destptr, dest, value));
21392 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21393 emit_insn (gen_strset (destptr, dest, value));
21394 }
21395 offset += 8;
21396 }
21397 if ((countval & 0x04) && max_size > 4)
21398 {
21399 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21400 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21401 offset += 4;
21402 }
21403 if ((countval & 0x02) && max_size > 2)
21404 {
21405 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21406 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21407 offset += 2;
21408 }
21409 if ((countval & 0x01) && max_size > 1)
21410 {
21411 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21412 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21413 offset += 1;
21414 }
21415 return;
21416 }
21417 if (max_size > 32)
21418 {
21419 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21420 return;
21421 }
21422 if (max_size > 16)
21423 {
21424 rtx label = ix86_expand_aligntest (count, 16, true);
21425 if (TARGET_64BIT)
21426 {
21427 dest = change_address (destmem, DImode, destptr);
21428 emit_insn (gen_strset (destptr, dest, value));
21429 emit_insn (gen_strset (destptr, dest, value));
21430 }
21431 else
21432 {
21433 dest = change_address (destmem, SImode, destptr);
21434 emit_insn (gen_strset (destptr, dest, value));
21435 emit_insn (gen_strset (destptr, dest, value));
21436 emit_insn (gen_strset (destptr, dest, value));
21437 emit_insn (gen_strset (destptr, dest, value));
21438 }
21439 emit_label (label);
21440 LABEL_NUSES (label) = 1;
21441 }
21442 if (max_size > 8)
21443 {
21444 rtx label = ix86_expand_aligntest (count, 8, true);
21445 if (TARGET_64BIT)
21446 {
21447 dest = change_address (destmem, DImode, destptr);
21448 emit_insn (gen_strset (destptr, dest, value));
21449 }
21450 else
21451 {
21452 dest = change_address (destmem, SImode, destptr);
21453 emit_insn (gen_strset (destptr, dest, value));
21454 emit_insn (gen_strset (destptr, dest, value));
21455 }
21456 emit_label (label);
21457 LABEL_NUSES (label) = 1;
21458 }
21459 if (max_size > 4)
21460 {
21461 rtx label = ix86_expand_aligntest (count, 4, true);
21462 dest = change_address (destmem, SImode, destptr);
21463 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21464 emit_label (label);
21465 LABEL_NUSES (label) = 1;
21466 }
21467 if (max_size > 2)
21468 {
21469 rtx label = ix86_expand_aligntest (count, 2, true);
21470 dest = change_address (destmem, HImode, destptr);
21471 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21472 emit_label (label);
21473 LABEL_NUSES (label) = 1;
21474 }
21475 if (max_size > 1)
21476 {
21477 rtx label = ix86_expand_aligntest (count, 1, true);
21478 dest = change_address (destmem, QImode, destptr);
21479 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21480 emit_label (label);
21481 LABEL_NUSES (label) = 1;
21482 }
21483 }
21484
21485 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN, to
21486 DESIRED_ALIGNMENT. */
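/* For example, raising the destination alignment from 1 to 8 emits up to
   three guarded single moves: a QImode, a HImode and an SImode copy, each
   executed only when the corresponding low bit of DESTPTR is set, with
   COUNT decremented accordingly.  */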
21487 static void
21488 expand_movmem_prologue (rtx destmem, rtx srcmem,
21489 rtx destptr, rtx srcptr, rtx count,
21490 int align, int desired_alignment)
21491 {
21492 if (align <= 1 && desired_alignment > 1)
21493 {
21494 rtx label = ix86_expand_aligntest (destptr, 1, false);
21495 srcmem = change_address (srcmem, QImode, srcptr);
21496 destmem = change_address (destmem, QImode, destptr);
21497 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21498 ix86_adjust_counter (count, 1);
21499 emit_label (label);
21500 LABEL_NUSES (label) = 1;
21501 }
21502 if (align <= 2 && desired_alignment > 2)
21503 {
21504 rtx label = ix86_expand_aligntest (destptr, 2, false);
21505 srcmem = change_address (srcmem, HImode, srcptr);
21506 destmem = change_address (destmem, HImode, destptr);
21507 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21508 ix86_adjust_counter (count, 2);
21509 emit_label (label);
21510 LABEL_NUSES (label) = 1;
21511 }
21512 if (align <= 4 && desired_alignment > 4)
21513 {
21514 rtx label = ix86_expand_aligntest (destptr, 4, false);
21515 srcmem = change_address (srcmem, SImode, srcptr);
21516 destmem = change_address (destmem, SImode, destptr);
21517 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21518 ix86_adjust_counter (count, 4);
21519 emit_label (label);
21520 LABEL_NUSES (label) = 1;
21521 }
21522 gcc_assert (desired_alignment <= 8);
21523 }
21524
21525 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
21526 ALIGN_BYTES is how many bytes need to be copied. */
21527 static rtx
21528 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21529 int desired_align, int align_bytes)
21530 {
21531 rtx src = *srcp;
21532 rtx orig_dst = dst;
21533 rtx orig_src = src;
21534 int off = 0;
21535 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21536 if (src_align_bytes >= 0)
21537 src_align_bytes = desired_align - src_align_bytes;
21538 if (align_bytes & 1)
21539 {
21540 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21541 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21542 off = 1;
21543 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21544 }
21545 if (align_bytes & 2)
21546 {
21547 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21548 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21549 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21550 set_mem_align (dst, 2 * BITS_PER_UNIT);
21551 if (src_align_bytes >= 0
21552 && (src_align_bytes & 1) == (align_bytes & 1)
21553 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21554 set_mem_align (src, 2 * BITS_PER_UNIT);
21555 off = 2;
21556 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21557 }
21558 if (align_bytes & 4)
21559 {
21560 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21561 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21562 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21563 set_mem_align (dst, 4 * BITS_PER_UNIT);
21564 if (src_align_bytes >= 0)
21565 {
21566 unsigned int src_align = 0;
21567 if ((src_align_bytes & 3) == (align_bytes & 3))
21568 src_align = 4;
21569 else if ((src_align_bytes & 1) == (align_bytes & 1))
21570 src_align = 2;
21571 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21572 set_mem_align (src, src_align * BITS_PER_UNIT);
21573 }
21574 off = 4;
21575 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21576 }
21577 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21578 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21579 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21580 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21581 if (src_align_bytes >= 0)
21582 {
21583 unsigned int src_align = 0;
21584 if ((src_align_bytes & 7) == (align_bytes & 7))
21585 src_align = 8;
21586 else if ((src_align_bytes & 3) == (align_bytes & 3))
21587 src_align = 4;
21588 else if ((src_align_bytes & 1) == (align_bytes & 1))
21589 src_align = 2;
21590 if (src_align > (unsigned int) desired_align)
21591 src_align = desired_align;
21592 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21593 set_mem_align (src, src_align * BITS_PER_UNIT);
21594 }
21595 if (MEM_SIZE_KNOWN_P (orig_dst))
21596 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21597 if (MEM_SIZE_KNOWN_P (orig_src))
21598 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21599 *srcp = src;
21600 return dst;
21601 }
21602
21603 /* Set enough bytes of DEST to align DEST, known to be aligned by ALIGN, to
21604 DESIRED_ALIGNMENT. */
21605 static void
21606 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21607 int align, int desired_alignment)
21608 {
21609 if (align <= 1 && desired_alignment > 1)
21610 {
21611 rtx label = ix86_expand_aligntest (destptr, 1, false);
21612 destmem = change_address (destmem, QImode, destptr);
21613 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21614 ix86_adjust_counter (count, 1);
21615 emit_label (label);
21616 LABEL_NUSES (label) = 1;
21617 }
21618 if (align <= 2 && desired_alignment > 2)
21619 {
21620 rtx label = ix86_expand_aligntest (destptr, 2, false);
21621 destmem = change_address (destmem, HImode, destptr);
21622 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21623 ix86_adjust_counter (count, 2);
21624 emit_label (label);
21625 LABEL_NUSES (label) = 1;
21626 }
21627 if (align <= 4 && desired_alignment > 4)
21628 {
21629 rtx label = ix86_expand_aligntest (destptr, 4, false);
21630 destmem = change_address (destmem, SImode, destptr);
21631 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21632 ix86_adjust_counter (count, 4);
21633 emit_label (label);
21634 LABEL_NUSES (label) = 1;
21635 }
21636 gcc_assert (desired_alignment <= 8);
21637 }
21638
21639 /* Set enough bytes of DST to align DST, known to be aligned by ALIGN, to
21640 DESIRED_ALIGN.  ALIGN_BYTES is how many bytes need to be stored. */
21641 static rtx
21642 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21643 int desired_align, int align_bytes)
21644 {
21645 int off = 0;
21646 rtx orig_dst = dst;
21647 if (align_bytes & 1)
21648 {
21649 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21650 off = 1;
21651 emit_insn (gen_strset (destreg, dst,
21652 gen_lowpart (QImode, value)));
21653 }
21654 if (align_bytes & 2)
21655 {
21656 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21657 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21658 set_mem_align (dst, 2 * BITS_PER_UNIT);
21659 off = 2;
21660 emit_insn (gen_strset (destreg, dst,
21661 gen_lowpart (HImode, value)));
21662 }
21663 if (align_bytes & 4)
21664 {
21665 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21666 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21667 set_mem_align (dst, 4 * BITS_PER_UNIT);
21668 off = 4;
21669 emit_insn (gen_strset (destreg, dst,
21670 gen_lowpart (SImode, value)));
21671 }
21672 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21673 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21674 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21675 if (MEM_SIZE_KNOWN_P (orig_dst))
21676 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21677 return dst;
21678 }
21679
21680 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
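/* The per-processor cost tables consulted below provide, separately for
   memcpy and memset and for 32- and 64-bit code, a small array of
   (max block size, algorithm) pairs plus an algorithm for unknown sizes;
   for example an entry such as {256, rep_prefix_4_byte} (illustrative
   values only) means blocks of up to 256 bytes should use "rep movsl".  */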
21681 static enum stringop_alg
21682 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21683 int *dynamic_check)
21684 {
21685 const struct stringop_algs * algs;
21686 bool optimize_for_speed;
21687 /* Algorithms using the rep prefix want at least edi and ecx;
21688 additionally, memset wants eax and memcpy wants esi. Don't
21689 consider such algorithms if the user has appropriated those
21690 registers for their own purposes. */
21691 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21692 || (memset
21693 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21694
21695 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21696 || (alg != rep_prefix_1_byte \
21697 && alg != rep_prefix_4_byte \
21698 && alg != rep_prefix_8_byte))
21699 const struct processor_costs *cost;
21700
21701 /* Even if the string operation call is cold, we still might spend a lot
21702 of time processing large blocks. */
21703 if (optimize_function_for_size_p (cfun)
21704 || (optimize_insn_for_size_p ()
21705 && expected_size != -1 && expected_size < 256))
21706 optimize_for_speed = false;
21707 else
21708 optimize_for_speed = true;
21709
21710 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21711
21712 *dynamic_check = -1;
21713 if (memset)
21714 algs = &cost->memset[TARGET_64BIT != 0];
21715 else
21716 algs = &cost->memcpy[TARGET_64BIT != 0];
21717 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21718 return ix86_stringop_alg;
21719 /* rep; movq or rep; movl is the smallest variant. */
21720 else if (!optimize_for_speed)
21721 {
21722 if (!count || (count & 3))
21723 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21724 else
21725 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21726 }
21727 /* Very tiny blocks are best handled via the loop; REP is expensive to set up.
21728 */
21729 else if (expected_size != -1 && expected_size < 4)
21730 return loop_1_byte;
21731 else if (expected_size != -1)
21732 {
21733 unsigned int i;
21734 enum stringop_alg alg = libcall;
21735 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21736 {
21737 /* We get here if the algorithms that were not libcall-based
21738 were rep-prefix based and we are unable to use rep prefixes
21739 based on global register usage. Break out of the loop and
21740 use the heuristic below. */
21741 if (algs->size[i].max == 0)
21742 break;
21743 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
21744 {
21745 enum stringop_alg candidate = algs->size[i].alg;
21746
21747 if (candidate != libcall && ALG_USABLE_P (candidate))
21748 alg = candidate;
21749 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
21750 the last non-libcall inline algorithm. */
21751 if (TARGET_INLINE_ALL_STRINGOPS)
21752 {
21753 /* When the current size is best copied by a libcall,
21754 but we are still forced to inline, run the heuristic below
21755 that will pick code for medium-sized blocks. */
21756 if (alg != libcall)
21757 return alg;
21758 break;
21759 }
21760 else if (ALG_USABLE_P (candidate))
21761 return candidate;
21762 }
21763 }
21764 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
21765 }
21766 /* When asked to inline the call anyway, try to pick a meaningful choice.
21767 We look for the maximal size of block that is faster to copy by hand and
21768 take blocks of at most that size, guessing that the average size will
21769 be roughly half of the block.
21770
21771 If this turns out to be bad, we might simply specify the preferred
21772 choice in ix86_costs. */
21773 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21774 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
21775 {
21776 int max = -1;
21777 enum stringop_alg alg;
21778 int i;
21779 bool any_alg_usable_p = true;
21780
21781 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21782 {
21783 enum stringop_alg candidate = algs->size[i].alg;
21784 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
21785
21786 if (candidate != libcall && candidate
21787 && ALG_USABLE_P (candidate))
21788 max = algs->size[i].max;
21789 }
21790 /* If there aren't any usable algorithms, then recursing on
21791 smaller sizes isn't going to find anything. Just return the
21792 simple byte-at-a-time copy loop. */
21793 if (!any_alg_usable_p)
21794 {
21795 /* Pick something reasonable. */
21796 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21797 *dynamic_check = 128;
21798 return loop_1_byte;
21799 }
21800 if (max == -1)
21801 max = 4096;
21802 alg = decide_alg (count, max / 2, memset, dynamic_check);
21803 gcc_assert (*dynamic_check == -1);
21804 gcc_assert (alg != libcall);
21805 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21806 *dynamic_check = max;
21807 return alg;
21808 }
21809 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
21810 #undef ALG_USABLE_P
21811 }
21812
21813 /* Decide on alignment. We know that the operand is already aligned to ALIGN
21814 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
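/* For instance, rep_prefix_4_byte tuned for PentiumPro asks for 8-byte
 alignment so whole cachelines can be streamed, while loop_1_byte never
 needs more than byte alignment. */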
21815 static int
21816 decide_alignment (int align,
21817 enum stringop_alg alg,
21818 int expected_size)
21819 {
21820 int desired_align = 0;
21821 switch (alg)
21822 {
21823 case no_stringop:
21824 gcc_unreachable ();
21825 case loop:
21826 case unrolled_loop:
21827 desired_align = GET_MODE_SIZE (Pmode);
21828 break;
21829 case rep_prefix_8_byte:
21830 desired_align = 8;
21831 break;
21832 case rep_prefix_4_byte:
21833 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
21834 copying whole cachelines at once. */
21835 if (TARGET_PENTIUMPRO)
21836 desired_align = 8;
21837 else
21838 desired_align = 4;
21839 break;
21840 case rep_prefix_1_byte:
21841 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
21842 copying whole cachelines at once. */
21843 if (TARGET_PENTIUMPRO)
21844 desired_align = 8;
21845 else
21846 desired_align = 1;
21847 break;
21848 case loop_1_byte:
21849 desired_align = 1;
21850 break;
21851 case libcall:
21852 return 0;
21853 }
21854
21855 if (optimize_size)
21856 desired_align = 1;
21857 if (desired_align < align)
21858 desired_align = align;
21859 if (expected_size != -1 && expected_size < 4)
21860 desired_align = align;
21861 return desired_align;
21862 }
21863
21864 /* Return the smallest power of 2 greater than VAL. */
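/* For example, smallest_pow2_greater_than (7) == 8 and
 smallest_pow2_greater_than (8) == 16; the result is always strictly
 greater than VAL. */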
21865 static int
21866 smallest_pow2_greater_than (int val)
21867 {
21868 int ret = 1;
21869 while (ret <= val)
21870 ret <<= 1;
21871 return ret;
21872 }
21873
21874 /* Expand string move (memcpy) operation. Use i386 string operations
21875 when profitable. expand_setmem contains similar code. The code
21876 depends upon architecture, block size and alignment, but always has
21877 the same overall structure:
21878
21879 1) Prologue guard: Conditional that jumps to the epilogue for small
21880 blocks that can be handled by the epilogue alone. This is faster
21881 but also needed for correctness, since the prologue assumes the block
21882 is larger than the desired alignment.
21883
21884 Optional dynamic check for size and libcall for large
21885 blocks is emitted here too, with -minline-stringops-dynamically.
21886
21887 2) Prologue: copy first few bytes in order to get destination
21888 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
21889 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
21890 copied. We emit either a jump tree on power of two sized
21891 blocks, or a byte loop.
21892
21893 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
21894 with specified algorithm.
21895
21896 4) Epilogue: code copying tail of the block that is too small to be
21897 handled by main body (or up to size guarded by prologue guard). */
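/* As a rough illustration only (the exact code depends on the chosen
 algorithm, tuning and options), a variable-size copy expanded with
 rep_prefix_4_byte and DESIRED_ALIGN == 4 is laid out as:
 (1) cmp $4, count; jb epilogue
 (2) byte copies until the destination is 4-byte aligned
 (3) rep movsl
 (4) copy the count & 3 trailing bytes. */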
21898
21899 bool
21900 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
21901 rtx expected_align_exp, rtx expected_size_exp)
21902 {
21903 rtx destreg;
21904 rtx srcreg;
21905 rtx label = NULL;
21906 rtx tmp;
21907 rtx jump_around_label = NULL;
21908 HOST_WIDE_INT align = 1;
21909 unsigned HOST_WIDE_INT count = 0;
21910 HOST_WIDE_INT expected_size = -1;
21911 int size_needed = 0, epilogue_size_needed;
21912 int desired_align = 0, align_bytes = 0;
21913 enum stringop_alg alg;
21914 int dynamic_check;
21915 bool need_zero_guard = false;
21916
21917 if (CONST_INT_P (align_exp))
21918 align = INTVAL (align_exp);
21919 /* i386 can do misaligned access at reasonably increased cost. */
21920 if (CONST_INT_P (expected_align_exp)
21921 && INTVAL (expected_align_exp) > align)
21922 align = INTVAL (expected_align_exp);
21923 /* ALIGN is the minimum of destination and source alignment, but we care here
21924 just about destination alignment. */
21925 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
21926 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
21927
21928 if (CONST_INT_P (count_exp))
21929 count = expected_size = INTVAL (count_exp);
21930 if (CONST_INT_P (expected_size_exp) && count == 0)
21931 expected_size = INTVAL (expected_size_exp);
21932
21933 /* Make sure we don't need to care about overflow later on. */
21934 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
21935 return false;
21936
21937 /* Step 0: Decide on preferred algorithm, desired alignment and
21938 size of chunks to be copied by main loop. */
21939
21940 alg = decide_alg (count, expected_size, false, &dynamic_check);
21941 desired_align = decide_alignment (align, alg, expected_size);
21942
21943 if (!TARGET_ALIGN_STRINGOPS)
21944 align = desired_align;
21945
21946 if (alg == libcall)
21947 return false;
21948 gcc_assert (alg != no_stringop);
21949 if (!count)
21950 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
21951 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21952 srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
21953 switch (alg)
21954 {
21955 case libcall:
21956 case no_stringop:
21957 gcc_unreachable ();
21958 case loop:
21959 need_zero_guard = true;
21960 size_needed = GET_MODE_SIZE (Pmode);
21961 break;
21962 case unrolled_loop:
21963 need_zero_guard = true;
21964 size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
21965 break;
21966 case rep_prefix_8_byte:
21967 size_needed = 8;
21968 break;
21969 case rep_prefix_4_byte:
21970 size_needed = 4;
21971 break;
21972 case rep_prefix_1_byte:
21973 size_needed = 1;
21974 break;
21975 case loop_1_byte:
21976 need_zero_guard = true;
21977 size_needed = 1;
21978 break;
21979 }
21980
21981 epilogue_size_needed = size_needed;
21982
21983 /* Step 1: Prologue guard. */
21984
21985 /* Alignment code needs count to be in register. */
21986 if (CONST_INT_P (count_exp) && desired_align > align)
21987 {
21988 if (INTVAL (count_exp) > desired_align
21989 && INTVAL (count_exp) > size_needed)
21990 {
21991 align_bytes
21992 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
21993 if (align_bytes <= 0)
21994 align_bytes = 0;
21995 else
21996 align_bytes = desired_align - align_bytes;
21997 }
21998 if (align_bytes == 0)
21999 count_exp = force_reg (counter_mode (count_exp), count_exp);
22000 }
22001 gcc_assert (desired_align >= 1 && align >= 1);
22002
22003 /* Ensure that alignment prologue won't copy past end of block. */
22004 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22005 {
22006 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22007 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22008 Make sure it is a power of 2. */
22009 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22010
22011 if (count)
22012 {
22013 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22014 {
22015 /* If main algorithm works on QImode, no epilogue is needed.
22016 For small sizes just don't align anything. */
22017 if (size_needed == 1)
22018 desired_align = align;
22019 else
22020 goto epilogue;
22021 }
22022 }
22023 else
22024 {
22025 label = gen_label_rtx ();
22026 emit_cmp_and_jump_insns (count_exp,
22027 GEN_INT (epilogue_size_needed),
22028 LTU, 0, counter_mode (count_exp), 1, label);
22029 if (expected_size == -1 || expected_size < epilogue_size_needed)
22030 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22031 else
22032 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22033 }
22034 }
22035
22036 /* Emit code to decide on runtime whether library call or inline should be
22037 used. */
22038 if (dynamic_check != -1)
22039 {
22040 if (CONST_INT_P (count_exp))
22041 {
22042 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22043 {
22044 emit_block_move_via_libcall (dst, src, count_exp, false);
22045 count_exp = const0_rtx;
22046 goto epilogue;
22047 }
22048 }
22049 else
22050 {
22051 rtx hot_label = gen_label_rtx ();
22052 jump_around_label = gen_label_rtx ();
22053 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22054 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22055 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22056 emit_block_move_via_libcall (dst, src, count_exp, false);
22057 emit_jump (jump_around_label);
22058 emit_label (hot_label);
22059 }
22060 }
22061
22062 /* Step 2: Alignment prologue. */
22063
22064 if (desired_align > align)
22065 {
22066 if (align_bytes == 0)
22067 {
22068 /* Except for the first move in the epilogue, we no longer know
22069 the constant offset in aliasing info. It doesn't seem worth
22070 the pain to maintain it for the first move, so throw away
22071 the info early. */
22072 src = change_address (src, BLKmode, srcreg);
22073 dst = change_address (dst, BLKmode, destreg);
22074 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22075 desired_align);
22076 }
22077 else
22078 {
22079 /* If we know how many bytes need to be stored before dst is
22080 sufficiently aligned, maintain aliasing info accurately. */
22081 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22082 desired_align, align_bytes);
22083 count_exp = plus_constant (count_exp, -align_bytes);
22084 count -= align_bytes;
22085 }
22086 if (need_zero_guard
22087 && (count < (unsigned HOST_WIDE_INT) size_needed
22088 || (align_bytes == 0
22089 && count < ((unsigned HOST_WIDE_INT) size_needed
22090 + desired_align - align))))
22091 {
22092 /* It is possible that we copied enough so the main loop will not
22093 execute. */
22094 gcc_assert (size_needed > 1);
22095 if (label == NULL_RTX)
22096 label = gen_label_rtx ();
22097 emit_cmp_and_jump_insns (count_exp,
22098 GEN_INT (size_needed),
22099 LTU, 0, counter_mode (count_exp), 1, label);
22100 if (expected_size == -1
22101 || expected_size < (desired_align - align) / 2 + size_needed)
22102 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22103 else
22104 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22105 }
22106 }
22107 if (label && size_needed == 1)
22108 {
22109 emit_label (label);
22110 LABEL_NUSES (label) = 1;
22111 label = NULL;
22112 epilogue_size_needed = 1;
22113 }
22114 else if (label == NULL_RTX)
22115 epilogue_size_needed = size_needed;
22116
22117 /* Step 3: Main loop. */
22118
22119 switch (alg)
22120 {
22121 case libcall:
22122 case no_stringop:
22123 gcc_unreachable ();
22124 case loop_1_byte:
22125 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22126 count_exp, QImode, 1, expected_size);
22127 break;
22128 case loop:
22129 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22130 count_exp, Pmode, 1, expected_size);
22131 break;
22132 case unrolled_loop:
22133 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22134 registers for 4 temporaries anyway. */
22135 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22136 count_exp, Pmode, TARGET_64BIT ? 4 : 2,
22137 expected_size);
22138 break;
22139 case rep_prefix_8_byte:
22140 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22141 DImode);
22142 break;
22143 case rep_prefix_4_byte:
22144 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22145 SImode);
22146 break;
22147 case rep_prefix_1_byte:
22148 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22149 QImode);
22150 break;
22151 }
22152 /* Properly adjust the offset of src and dest memory for aliasing. */
22153 if (CONST_INT_P (count_exp))
22154 {
22155 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22156 (count / size_needed) * size_needed);
22157 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22158 (count / size_needed) * size_needed);
22159 }
22160 else
22161 {
22162 src = change_address (src, BLKmode, srcreg);
22163 dst = change_address (dst, BLKmode, destreg);
22164 }
22165
22166 /* Step 4: Epilogue to copy the remaining bytes. */
22167 epilogue:
22168 if (label)
22169 {
22170 /* When the main loop is done, COUNT_EXP might hold the original count,
22171 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22172 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22173 bytes. Compensate if needed. */
22174
22175 if (size_needed < epilogue_size_needed)
22176 {
22177 tmp =
22178 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22179 GEN_INT (size_needed - 1), count_exp, 1,
22180 OPTAB_DIRECT);
22181 if (tmp != count_exp)
22182 emit_move_insn (count_exp, tmp);
22183 }
22184 emit_label (label);
22185 LABEL_NUSES (label) = 1;
22186 }
22187
22188 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22189 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22190 epilogue_size_needed);
22191 if (jump_around_label)
22192 emit_label (jump_around_label);
22193 return true;
22194 }
22195
22196 /* Helper function for memset. For QImode value 0xXY produce
22197 0xXYXYXYXY of the width specified by MODE. This is essentially
22198 a * 0x01010101, but we can do slightly better than
22199 synth_mult by unwinding the sequence by hand on CPUs with
22200 slow multiply. */
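/* For example, VAL == 0xAB promoted to SImode yields 0xABABABAB, either
 by multiplying the zero-extended value by 0x01010101 or, on CPUs with
 slow multiply, by the shift-and-or sequence v |= v << 8; v |= v << 16
 (plus v |= v << 32 for DImode). */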
22201 static rtx
22202 promote_duplicated_reg (enum machine_mode mode, rtx val)
22203 {
22204 enum machine_mode valmode = GET_MODE (val);
22205 rtx tmp;
22206 int nops = mode == DImode ? 3 : 2;
22207
22208 gcc_assert (mode == SImode || mode == DImode);
22209 if (val == const0_rtx)
22210 return copy_to_mode_reg (mode, const0_rtx);
22211 if (CONST_INT_P (val))
22212 {
22213 HOST_WIDE_INT v = INTVAL (val) & 255;
22214
22215 v |= v << 8;
22216 v |= v << 16;
22217 if (mode == DImode)
22218 v |= (v << 16) << 16;
22219 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22220 }
22221
22222 if (valmode == VOIDmode)
22223 valmode = QImode;
22224 if (valmode != QImode)
22225 val = gen_lowpart (QImode, val);
22226 if (mode == QImode)
22227 return val;
22228 if (!TARGET_PARTIAL_REG_STALL)
22229 nops--;
22230 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22231 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22232 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22233 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22234 {
22235 rtx reg = convert_modes (mode, QImode, val, true);
22236 tmp = promote_duplicated_reg (mode, const1_rtx);
22237 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22238 OPTAB_DIRECT);
22239 }
22240 else
22241 {
22242 rtx reg = convert_modes (mode, QImode, val, true);
22243
22244 if (!TARGET_PARTIAL_REG_STALL)
22245 if (mode == SImode)
22246 emit_insn (gen_movsi_insv_1 (reg, reg));
22247 else
22248 emit_insn (gen_movdi_insv_1 (reg, reg));
22249 else
22250 {
22251 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22252 NULL, 1, OPTAB_DIRECT);
22253 reg =
22254 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22255 }
22256 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22257 NULL, 1, OPTAB_DIRECT);
22258 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22259 if (mode == SImode)
22260 return reg;
22261 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22262 NULL, 1, OPTAB_DIRECT);
22263 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22264 return reg;
22265 }
22266 }
22267
22268 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
22269 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
22270 alignment from ALIGN to DESIRED_ALIGN. */
22271 static rtx
22272 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22273 {
22274 rtx promoted_val;
22275
22276 if (TARGET_64BIT
22277 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22278 promoted_val = promote_duplicated_reg (DImode, val);
22279 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22280 promoted_val = promote_duplicated_reg (SImode, val);
22281 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22282 promoted_val = promote_duplicated_reg (HImode, val);
22283 else
22284 promoted_val = val;
22285
22286 return promoted_val;
22287 }
22288
22289 /* Expand string clear operation (bzero). Use i386 string operations when
22290 profitable. See expand_movmem comment for explanation of individual
22291 steps performed. */
22292 bool
22293 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22294 rtx expected_align_exp, rtx expected_size_exp)
22295 {
22296 rtx destreg;
22297 rtx label = NULL;
22298 rtx tmp;
22299 rtx jump_around_label = NULL;
22300 HOST_WIDE_INT align = 1;
22301 unsigned HOST_WIDE_INT count = 0;
22302 HOST_WIDE_INT expected_size = -1;
22303 int size_needed = 0, epilogue_size_needed;
22304 int desired_align = 0, align_bytes = 0;
22305 enum stringop_alg alg;
22306 rtx promoted_val = NULL;
22307 bool force_loopy_epilogue = false;
22308 int dynamic_check;
22309 bool need_zero_guard = false;
22310
22311 if (CONST_INT_P (align_exp))
22312 align = INTVAL (align_exp);
22313 /* i386 can do misaligned access at reasonably increased cost. */
22314 if (CONST_INT_P (expected_align_exp)
22315 && INTVAL (expected_align_exp) > align)
22316 align = INTVAL (expected_align_exp);
22317 if (CONST_INT_P (count_exp))
22318 count = expected_size = INTVAL (count_exp);
22319 if (CONST_INT_P (expected_size_exp) && count == 0)
22320 expected_size = INTVAL (expected_size_exp);
22321
22322 /* Make sure we don't need to care about overflow later on. */
22323 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22324 return false;
22325
22326 /* Step 0: Decide on preferred algorithm, desired alignment and
22327 size of chunks to be copied by main loop. */
22328
22329 alg = decide_alg (count, expected_size, true, &dynamic_check);
22330 desired_align = decide_alignment (align, alg, expected_size);
22331
22332 if (!TARGET_ALIGN_STRINGOPS)
22333 align = desired_align;
22334
22335 if (alg == libcall)
22336 return false;
22337 gcc_assert (alg != no_stringop);
22338 if (!count)
22339 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22340 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22341 switch (alg)
22342 {
22343 case libcall:
22344 case no_stringop:
22345 gcc_unreachable ();
22346 case loop:
22347 need_zero_guard = true;
22348 size_needed = GET_MODE_SIZE (Pmode);
22349 break;
22350 case unrolled_loop:
22351 need_zero_guard = true;
22352 size_needed = GET_MODE_SIZE (Pmode) * 4;
22353 break;
22354 case rep_prefix_8_byte:
22355 size_needed = 8;
22356 break;
22357 case rep_prefix_4_byte:
22358 size_needed = 4;
22359 break;
22360 case rep_prefix_1_byte:
22361 size_needed = 1;
22362 break;
22363 case loop_1_byte:
22364 need_zero_guard = true;
22365 size_needed = 1;
22366 break;
22367 }
22368 epilogue_size_needed = size_needed;
22369
22370 /* Step 1: Prologue guard. */
22371
22372 /* Alignment code needs count to be in register. */
22373 if (CONST_INT_P (count_exp) && desired_align > align)
22374 {
22375 if (INTVAL (count_exp) > desired_align
22376 && INTVAL (count_exp) > size_needed)
22377 {
22378 align_bytes
22379 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22380 if (align_bytes <= 0)
22381 align_bytes = 0;
22382 else
22383 align_bytes = desired_align - align_bytes;
22384 }
22385 if (align_bytes == 0)
22386 {
22387 enum machine_mode mode = SImode;
22388 if (TARGET_64BIT && (count & ~0xffffffff))
22389 mode = DImode;
22390 count_exp = force_reg (mode, count_exp);
22391 }
22392 }
22393 /* Do the cheap promotion to allow better CSE across the
22394 main loop and epilogue (i.e., one load of the big constant in
22395 front of all the code). */
22396 if (CONST_INT_P (val_exp))
22397 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22398 desired_align, align);
22399 /* Ensure that alignment prologue won't copy past end of block. */
22400 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22401 {
22402 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22403 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22404 Make sure it is a power of 2. */
22405 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22406
22407 /* To improve performance of small blocks, we jump around the VAL
22408 promoting code. This means that if the promoted VAL is not constant,
22409 we might not use it in the epilogue and have to use the byte
22410 loop variant. */
22411 if (epilogue_size_needed > 2 && !promoted_val)
22412 force_loopy_epilogue = true;
22413 if (count)
22414 {
22415 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22416 {
22417 /* If main algorithm works on QImode, no epilogue is needed.
22418 For small sizes just don't align anything. */
22419 if (size_needed == 1)
22420 desired_align = align;
22421 else
22422 goto epilogue;
22423 }
22424 }
22425 else
22426 {
22427 label = gen_label_rtx ();
22428 emit_cmp_and_jump_insns (count_exp,
22429 GEN_INT (epilogue_size_needed),
22430 LTU, 0, counter_mode (count_exp), 1, label);
22431 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22432 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22433 else
22434 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22435 }
22436 }
22437 if (dynamic_check != -1)
22438 {
22439 rtx hot_label = gen_label_rtx ();
22440 jump_around_label = gen_label_rtx ();
22441 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22442 LEU, 0, counter_mode (count_exp), 1, hot_label);
22443 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22444 set_storage_via_libcall (dst, count_exp, val_exp, false);
22445 emit_jump (jump_around_label);
22446 emit_label (hot_label);
22447 }
22448
22449 /* Step 2: Alignment prologue. */
22450
22451 /* Do the expensive promotion once we branched off the small blocks. */
22452 if (!promoted_val)
22453 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22454 desired_align, align);
22455 gcc_assert (desired_align >= 1 && align >= 1);
22456
22457 if (desired_align > align)
22458 {
22459 if (align_bytes == 0)
22460 {
22461 /* Except for the first move in the epilogue, we no longer know
22462 the constant offset in aliasing info. It doesn't seem worth
22463 the pain to maintain it for the first move, so throw away
22464 the info early. */
22465 dst = change_address (dst, BLKmode, destreg);
22466 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22467 desired_align);
22468 }
22469 else
22470 {
22471 /* If we know how many bytes need to be stored before dst is
22472 sufficiently aligned, maintain aliasing info accurately. */
22473 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22474 desired_align, align_bytes);
22475 count_exp = plus_constant (count_exp, -align_bytes);
22476 count -= align_bytes;
22477 }
22478 if (need_zero_guard
22479 && (count < (unsigned HOST_WIDE_INT) size_needed
22480 || (align_bytes == 0
22481 && count < ((unsigned HOST_WIDE_INT) size_needed
22482 + desired_align - align))))
22483 {
22484 /* It is possible that we copied enough so the main loop will not
22485 execute. */
22486 gcc_assert (size_needed > 1);
22487 if (label == NULL_RTX)
22488 label = gen_label_rtx ();
22489 emit_cmp_and_jump_insns (count_exp,
22490 GEN_INT (size_needed),
22491 LTU, 0, counter_mode (count_exp), 1, label);
22492 if (expected_size == -1
22493 || expected_size < (desired_align - align) / 2 + size_needed)
22494 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22495 else
22496 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22497 }
22498 }
22499 if (label && size_needed == 1)
22500 {
22501 emit_label (label);
22502 LABEL_NUSES (label) = 1;
22503 label = NULL;
22504 promoted_val = val_exp;
22505 epilogue_size_needed = 1;
22506 }
22507 else if (label == NULL_RTX)
22508 epilogue_size_needed = size_needed;
22509
22510 /* Step 3: Main loop. */
22511
22512 switch (alg)
22513 {
22514 case libcall:
22515 case no_stringop:
22516 gcc_unreachable ();
22517 case loop_1_byte:
22518 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22519 count_exp, QImode, 1, expected_size);
22520 break;
22521 case loop:
22522 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22523 count_exp, Pmode, 1, expected_size);
22524 break;
22525 case unrolled_loop:
22526 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22527 count_exp, Pmode, 4, expected_size);
22528 break;
22529 case rep_prefix_8_byte:
22530 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22531 DImode, val_exp);
22532 break;
22533 case rep_prefix_4_byte:
22534 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22535 SImode, val_exp);
22536 break;
22537 case rep_prefix_1_byte:
22538 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22539 QImode, val_exp);
22540 break;
22541 }
22542 /* Properly adjust the offset of src and dest memory for aliasing. */
22543 if (CONST_INT_P (count_exp))
22544 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22545 (count / size_needed) * size_needed);
22546 else
22547 dst = change_address (dst, BLKmode, destreg);
22548
22549 /* Step 4: Epilogue to copy the remaining bytes. */
22550
22551 if (label)
22552 {
22553 /* When the main loop is done, COUNT_EXP might hold the original count,
22554 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22555 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22556 bytes. Compensate if needed. */
22557
22558 if (size_needed < epilogue_size_needed)
22559 {
22560 tmp =
22561 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22562 GEN_INT (size_needed - 1), count_exp, 1,
22563 OPTAB_DIRECT);
22564 if (tmp != count_exp)
22565 emit_move_insn (count_exp, tmp);
22566 }
22567 emit_label (label);
22568 LABEL_NUSES (label) = 1;
22569 }
22570 epilogue:
22571 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22572 {
22573 if (force_loopy_epilogue)
22574 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22575 epilogue_size_needed);
22576 else
22577 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22578 epilogue_size_needed);
22579 }
22580 if (jump_around_label)
22581 emit_label (jump_around_label);
22582 return true;
22583 }
22584
22585 /* Expand the appropriate insns for doing strlen if not just doing
22586 repnz; scasb
22587
22588 out = result, initialized with the start address
22589 align_rtx = alignment of the address.
22590 scratch = scratch register, initialized with the start address when
22591 not aligned, otherwise undefined
22592
22593 This is just the body. It needs the initializations mentioned above and
22594 some address computing at the end. These things are done in i386.md. */
22595
22596 static void
22597 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22598 {
22599 int align;
22600 rtx tmp;
22601 rtx align_2_label = NULL_RTX;
22602 rtx align_3_label = NULL_RTX;
22603 rtx align_4_label = gen_label_rtx ();
22604 rtx end_0_label = gen_label_rtx ();
22605 rtx mem;
22606 rtx tmpreg = gen_reg_rtx (SImode);
22607 rtx scratch = gen_reg_rtx (SImode);
22608 rtx cmp;
22609
22610 align = 0;
22611 if (CONST_INT_P (align_rtx))
22612 align = INTVAL (align_rtx);
22613
22614 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22615
22616 /* Is there a known alignment and is it less than 4? */
22617 if (align < 4)
22618 {
22619 rtx scratch1 = gen_reg_rtx (Pmode);
22620 emit_move_insn (scratch1, out);
22621 /* Is there a known alignment and is it not 2? */
22622 if (align != 2)
22623 {
22624 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22625 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22626
22627 /* Leave just the 3 lower bits. */
22628 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22629 NULL_RTX, 0, OPTAB_WIDEN);
22630
22631 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22632 Pmode, 1, align_4_label);
22633 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22634 Pmode, 1, align_2_label);
22635 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22636 Pmode, 1, align_3_label);
22637 }
22638 else
22639 {
22640 /* Since the alignment is 2, we have to check 2 or 0 bytes;
22641 check if it is aligned to 4 bytes. */
22642
22643 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22644 NULL_RTX, 0, OPTAB_WIDEN);
22645
22646 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22647 Pmode, 1, align_4_label);
22648 }
22649
22650 mem = change_address (src, QImode, out);
22651
22652 /* Now compare the bytes. */
22653
22654 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
22655 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22656 QImode, 1, end_0_label);
22657
22658 /* Increment the address. */
22659 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22660
22661 /* Not needed with an alignment of 2 */
22662 if (align != 2)
22663 {
22664 emit_label (align_2_label);
22665
22666 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22667 end_0_label);
22668
22669 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22670
22671 emit_label (align_3_label);
22672 }
22673
22674 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22675 end_0_label);
22676
22677 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22678 }
22679
22680 /* Generate loop to check 4 bytes at a time. It is not a good idea
22681 to align this loop; that only makes the code larger and does not
22682 speed it up. */
22683 emit_label (align_4_label);
22684
22685 mem = change_address (src, SImode, out);
22686 emit_move_insn (scratch, mem);
22687 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22688
22689 /* This formula yields a nonzero result iff one of the bytes is zero.
22690 This saves three branches inside the loop and many cycles. */
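/* Concretely (an illustrative walk-through, not emitted code): with X in
 SCRATCH the sequence computes (X - 0x01010101) & ~X & 0x80808080.
 Subtracting 0x01010101 sets the high bit of any byte of X that was
 zero, and ~X masks out bytes whose high bit was already set, so the
 result is nonzero iff X contains a zero byte. E.g. X = 0x12003456
 yields 0x00800000, while X = 0x12343456 yields 0. */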
22691
22692 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22693 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22694 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22695 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22696 gen_int_mode (0x80808080, SImode)));
22697 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22698 align_4_label);
22699
22700 if (TARGET_CMOVE)
22701 {
22702 rtx reg = gen_reg_rtx (SImode);
22703 rtx reg2 = gen_reg_rtx (Pmode);
22704 emit_move_insn (reg, tmpreg);
22705 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22706
22707 /* If zero is not in the first two bytes, move two bytes forward. */
22708 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22709 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22710 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22711 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22712 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22713 reg,
22714 tmpreg)));
22715 /* Emit lea manually to avoid clobbering of flags. */
22716 emit_insn (gen_rtx_SET (SImode, reg2,
22717 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22718
22719 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22720 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22721 emit_insn (gen_rtx_SET (VOIDmode, out,
22722 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22723 reg2,
22724 out)));
22725 }
22726 else
22727 {
22728 rtx end_2_label = gen_label_rtx ();
22729 /* Is zero in the first two bytes? */
22730
22731 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22732 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22733 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
22734 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22735 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
22736 pc_rtx);
22737 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
22738 JUMP_LABEL (tmp) = end_2_label;
22739
22740 /* Not in the first two. Move two bytes forward. */
22741 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
22742 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
22743
22744 emit_label (end_2_label);
22745
22746 }
22747
22748 /* Avoid branch in fixing the byte. */
22749 tmpreg = gen_lowpart (QImode, tmpreg);
22750 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
22751 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
22752 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
22753 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
22754
22755 emit_label (end_0_label);
22756 }
22757
22758 /* Expand strlen. */
22759
22760 bool
22761 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
22762 {
22763 rtx addr, scratch1, scratch2, scratch3, scratch4;
22764
22765 /* The generic case of strlen expander is long. Avoid expanding it
22766 unless TARGET_INLINE_ALL_STRINGOPS. */
22767
22768 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22769 && !TARGET_INLINE_ALL_STRINGOPS
22770 && !optimize_insn_for_size_p ()
22771 && (!CONST_INT_P (align) || INTVAL (align) < 4))
22772 return false;
22773
22774 addr = force_reg (Pmode, XEXP (src, 0));
22775 scratch1 = gen_reg_rtx (Pmode);
22776
22777 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22778 && !optimize_insn_for_size_p ())
22779 {
22780 /* Well it seems that some optimizer does not combine a call like
22781 foo(strlen(bar), strlen(bar));
22782 when the move and the subtraction are done here. It does calculate
22783 the length just once when these instructions are done inside of
22784 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
22785 often used and I use one fewer register for the lifetime of
22786 output_strlen_unroll() this is better. */
22787
22788 emit_move_insn (out, addr);
22789
22790 ix86_expand_strlensi_unroll_1 (out, src, align);
22791
22792 /* strlensi_unroll_1 returns the address of the zero at the end of
22793 the string, like memchr(), so compute the length by subtracting
22794 the start address. */
22795 emit_insn (ix86_gen_sub3 (out, out, addr));
22796 }
22797 else
22798 {
22799 rtx unspec;
22800
22801 /* Can't use this if the user has appropriated eax, ecx, or edi. */
22802 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
22803 return false;
22804
22805 scratch2 = gen_reg_rtx (Pmode);
22806 scratch3 = gen_reg_rtx (Pmode);
22807 scratch4 = force_reg (Pmode, constm1_rtx);
22808
22809 emit_move_insn (scratch3, addr);
22810 eoschar = force_reg (QImode, eoschar);
22811
22812 src = replace_equiv_address_nv (src, scratch3);
22813
22814 /* If .md starts supporting :P, this can be done in .md. */
22815 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
22816 scratch4), UNSPEC_SCAS);
22817 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
22818 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
22819 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
22820 }
22821 return true;
22822 }
22823
22824 /* For a given symbol (function) construct code to compute the address
22825 of its PLT entry in the large x86-64 PIC model. */
22826 rtx
22827 construct_plt_address (rtx symbol)
22828 {
22829 rtx tmp = gen_reg_rtx (Pmode);
22830 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
22831
22832 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
22833 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
22834
22835 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
22836 emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
22837 return tmp;
22838 }
22839
22840 rtx
22841 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
22842 rtx callarg2,
22843 rtx pop, bool sibcall)
22844 {
22845 /* We need to represent that the SI, DI and XMM6-XMM15 registers are
22846 clobbered by SysV calls made from MS ABI code. */
22847 static int clobbered_registers[] = {
22848 XMM6_REG, XMM7_REG, XMM8_REG,
22849 XMM9_REG, XMM10_REG, XMM11_REG,
22850 XMM12_REG, XMM13_REG, XMM14_REG,
22851 XMM15_REG, SI_REG, DI_REG
22852 };
22853 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
22854 rtx use = NULL, call;
22855 unsigned int vec_len;
22856
22857 if (pop == const0_rtx)
22858 pop = NULL;
22859 gcc_assert (!TARGET_64BIT || !pop);
22860
22861 if (TARGET_MACHO && !TARGET_64BIT)
22862 {
22863 #if TARGET_MACHO
22864 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
22865 fnaddr = machopic_indirect_call_target (fnaddr);
22866 #endif
22867 }
22868 else
22869 {
22870 /* Static functions and indirect calls don't need the pic register. */
22871 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
22872 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22873 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
22874 use_reg (&use, pic_offset_table_rtx);
22875 }
22876
22877 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
22878 {
22879 rtx al = gen_rtx_REG (QImode, AX_REG);
22880 emit_move_insn (al, callarg2);
22881 use_reg (&use, al);
22882 }
22883
22884 if (ix86_cmodel == CM_LARGE_PIC
22885 && MEM_P (fnaddr)
22886 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22887 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
22888 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
22889 else if (sibcall
22890 ? !sibcall_insn_operand (XEXP (fnaddr, 0), Pmode)
22891 : !call_insn_operand (XEXP (fnaddr, 0), Pmode))
22892 {
22893 fnaddr = XEXP (fnaddr, 0);
22894 if (GET_MODE (fnaddr) != Pmode)
22895 fnaddr = convert_to_mode (Pmode, fnaddr, 1);
22896 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (Pmode, fnaddr));
22897 }
22898
22899 vec_len = 0;
22900 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
22901 if (retval)
22902 call = gen_rtx_SET (VOIDmode, retval, call);
22903 vec[vec_len++] = call;
22904
22905 if (pop)
22906 {
22907 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
22908 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
22909 vec[vec_len++] = pop;
22910 }
22911
22912 if (TARGET_64BIT_MS_ABI
22913 && (!callarg2 || INTVAL (callarg2) != -2))
22914 {
22915 unsigned i;
22916
22917 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
22918 UNSPEC_MS_TO_SYSV_CALL);
22919
22920 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
22921 vec[vec_len++]
22922 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
22923 ? TImode : DImode,
22924 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
22925 ? TImode : DImode,
22926 clobbered_registers[i]));
22927 }
22928
22929 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
22930 if (TARGET_VZEROUPPER)
22931 {
22932 int avx256;
22933 if (cfun->machine->callee_pass_avx256_p)
22934 {
22935 if (cfun->machine->callee_return_avx256_p)
22936 avx256 = callee_return_pass_avx256;
22937 else
22938 avx256 = callee_pass_avx256;
22939 }
22940 else if (cfun->machine->callee_return_avx256_p)
22941 avx256 = callee_return_avx256;
22942 else
22943 avx256 = call_no_avx256;
22944
22945 if (reload_completed)
22946 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
22947 else
22948 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
22949 gen_rtvec (1, GEN_INT (avx256)),
22950 UNSPEC_CALL_NEEDS_VZEROUPPER);
22951 }
22952
22953 if (vec_len > 1)
22954 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
22955 call = emit_call_insn (call);
22956 if (use)
22957 CALL_INSN_FUNCTION_USAGE (call) = use;
22958
22959 return call;
22960 }
22961
22962 void
22963 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
22964 {
22965 rtx pat = PATTERN (insn);
22966 rtvec vec = XVEC (pat, 0);
22967 int len = GET_NUM_ELEM (vec) - 1;
22968
22969 /* Strip off the last entry of the parallel. */
22970 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
22971 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
22972 if (len == 1)
22973 pat = RTVEC_ELT (vec, 0);
22974 else
22975 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
22976
22977 emit_insn (gen_avx_vzeroupper (vzeroupper));
22978 emit_call_insn (pat);
22979 }
22980
22981 /* Output the assembly for a call instruction. */
22982
22983 const char *
22984 ix86_output_call_insn (rtx insn, rtx call_op)
22985 {
22986 bool direct_p = constant_call_address_operand (call_op, Pmode);
22987 bool seh_nop_p = false;
22988 const char *xasm;
22989
22990 if (SIBLING_CALL_P (insn))
22991 {
22992 if (direct_p)
22993 xasm = "jmp\t%P0";
22994 /* SEH epilogue detection requires the indirect branch case
22995 to include REX.W. */
22996 else if (TARGET_SEH)
22997 xasm = "rex.W jmp %A0";
22998 else
22999 xasm = "jmp\t%A0";
23000
23001 output_asm_insn (xasm, &call_op);
23002 return "";
23003 }
23004
23005 /* SEH unwinding can require an extra nop to be emitted in several
23006 circumstances. Determine if we have one of those. */
23007 if (TARGET_SEH)
23008 {
23009 rtx i;
23010
23011 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23012 {
23013 /* If we get to another real insn, we don't need the nop. */
23014 if (INSN_P (i))
23015 break;
23016
23017 /* If we get to the epilogue note, prevent a catch region from
23018 being adjacent to the standard epilogue sequence. If non-call
23019 exceptions are enabled, we'll have done this during epilogue emission. */
23020 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23021 && !flag_non_call_exceptions
23022 && !can_throw_internal (insn))
23023 {
23024 seh_nop_p = true;
23025 break;
23026 }
23027 }
23028
23029 /* If we didn't find a real insn following the call, prevent the
23030 unwinder from looking into the next function. */
23031 if (i == NULL)
23032 seh_nop_p = true;
23033 }
23034
23035 if (direct_p)
23036 xasm = "call\t%P0";
23037 else
23038 xasm = "call\t%A0";
23039
23040 output_asm_insn (xasm, &call_op);
23041
23042 if (seh_nop_p)
23043 return "nop";
23044
23045 return "";
23046 }
23047 \f
23048 /* Clear stack slot assignments remembered from previous functions.
23049 This is called from INIT_EXPANDERS once before RTL is emitted for each
23050 function. */
23051
23052 static struct machine_function *
23053 ix86_init_machine_status (void)
23054 {
23055 struct machine_function *f;
23056
23057 f = ggc_alloc_cleared_machine_function ();
23058 f->use_fast_prologue_epilogue_nregs = -1;
23059 f->tls_descriptor_call_expanded_p = 0;
23060 f->call_abi = ix86_abi;
23061
23062 return f;
23063 }
23064
23065 /* Return a MEM corresponding to a stack slot with mode MODE.
23066 Allocate a new slot if necessary.
23067
23068 The RTL for a function can have several slots available: N is
23069 which slot to use. */
23070
23071 rtx
23072 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23073 {
23074 struct stack_local_entry *s;
23075
23076 gcc_assert (n < MAX_386_STACK_LOCALS);
23077
23078 /* Virtual slot is valid only before vregs are instantiated. */
23079 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23080
23081 for (s = ix86_stack_locals; s; s = s->next)
23082 if (s->mode == mode && s->n == n)
23083 return validize_mem (copy_rtx (s->rtl));
23084
23085 s = ggc_alloc_stack_local_entry ();
23086 s->n = n;
23087 s->mode = mode;
23088 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23089
23090 s->next = ix86_stack_locals;
23091 ix86_stack_locals = s;
23092 return validize_mem (s->rtl);
23093 }
23094 \f
23095 /* Calculate the length of the memory address in the instruction encoding.
23096 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23097 or other prefixes. */
23098
23099 int
23100 memory_address_length (rtx addr)
23101 {
23102 struct ix86_address parts;
23103 rtx base, index, disp;
23104 int len;
23105 int ok;
23106
23107 if (GET_CODE (addr) == PRE_DEC
23108 || GET_CODE (addr) == POST_INC
23109 || GET_CODE (addr) == PRE_MODIFY
23110 || GET_CODE (addr) == POST_MODIFY)
23111 return 0;
23112
23113 ok = ix86_decompose_address (addr, &parts);
23114 gcc_assert (ok);
23115
23116 if (parts.base && GET_CODE (parts.base) == SUBREG)
23117 parts.base = SUBREG_REG (parts.base);
23118 if (parts.index && GET_CODE (parts.index) == SUBREG)
23119 parts.index = SUBREG_REG (parts.index);
23120
23121 base = parts.base;
23122 index = parts.index;
23123 disp = parts.disp;
23124
23125 /* Add length of addr32 prefix. */
23126 len = (GET_CODE (addr) == ZERO_EXTEND
23127 || GET_CODE (addr) == AND);
23128
23129 /* Rule of thumb:
23130 - esp as the base always wants an index,
23131 - ebp as the base always wants a displacement,
23132 - r12 as the base always wants an index,
23133 - r13 as the base always wants a displacement. */
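/* Illustrative encodings: (%esp) cannot use the one-byte modrm form and
 needs a SIB byte, and mod 00 with %ebp as the base actually means
 disp32, so (%ebp) is emitted as 0(%ebp) with a one-byte displacement.
 %r12 and %r13 behave the same way in 64-bit code because they share
 the low three register-number bits with %esp and %ebp. */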
23134
23135 /* Register Indirect. */
23136 if (base && !index && !disp)
23137 {
23138 /* esp (for its index) and ebp (for its displacement) need
23139 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23140 code. */
23141 if (REG_P (addr)
23142 && (addr == arg_pointer_rtx
23143 || addr == frame_pointer_rtx
23144 || REGNO (addr) == SP_REG
23145 || REGNO (addr) == BP_REG
23146 || REGNO (addr) == R12_REG
23147 || REGNO (addr) == R13_REG))
23148 len = 1;
23149 }
23150
23151 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23152 is not disp32, but disp32(%rip), so for disp32
23153 SIB byte is needed, unless print_operand_address
23154 optimizes it into disp32(%rip) or (%rip) is implied
23155 by UNSPEC. */
23156 else if (disp && !base && !index)
23157 {
23158 len = 4;
23159 if (TARGET_64BIT)
23160 {
23161 rtx symbol = disp;
23162
23163 if (GET_CODE (disp) == CONST)
23164 symbol = XEXP (disp, 0);
23165 if (GET_CODE (symbol) == PLUS
23166 && CONST_INT_P (XEXP (symbol, 1)))
23167 symbol = XEXP (symbol, 0);
23168
23169 if (GET_CODE (symbol) != LABEL_REF
23170 && (GET_CODE (symbol) != SYMBOL_REF
23171 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23172 && (GET_CODE (symbol) != UNSPEC
23173 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23174 && XINT (symbol, 1) != UNSPEC_PCREL
23175 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23176 len += 1;
23177 }
23178 }
23179
23180 else
23181 {
23182 /* Find the length of the displacement constant. */
23183 if (disp)
23184 {
23185 if (base && satisfies_constraint_K (disp))
23186 len = 1;
23187 else
23188 len = 4;
23189 }
23190 /* ebp always wants a displacement. Similarly r13. */
23191 else if (base && REG_P (base)
23192 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23193 len = 1;
23194
23195 /* An index requires the two-byte modrm form.... */
23196 if (index
23197 /* ...like esp (or r12), which always wants an index. */
23198 || base == arg_pointer_rtx
23199 || base == frame_pointer_rtx
23200 || (base && REG_P (base)
23201 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23202 len += 1;
23203 }
23204
23205 switch (parts.seg)
23206 {
23207 case SEG_FS:
23208 case SEG_GS:
23209 len += 1;
23210 break;
23211 default:
23212 break;
23213 }
23214
23215 return len;
23216 }
23217
23218 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23219 is set, expect that the insn has an 8-bit immediate alternative. */
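/* For example (illustrative): "addl $5, %eax" can use the imm8 short form
 and contributes 1 byte, while "addl $1000, %eax" needs a full 32-bit
 immediate and contributes 4 bytes; DImode immediates are likewise
 encoded as 32-bit sign-extended values. */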
23220 int
23221 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23222 {
23223 int len = 0;
23224 int i;
23225 extract_insn_cached (insn);
23226 for (i = recog_data.n_operands - 1; i >= 0; --i)
23227 if (CONSTANT_P (recog_data.operand[i]))
23228 {
23229 enum attr_mode mode = get_attr_mode (insn);
23230
23231 gcc_assert (!len);
23232 if (shortform && CONST_INT_P (recog_data.operand[i]))
23233 {
23234 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23235 switch (mode)
23236 {
23237 case MODE_QI:
23238 len = 1;
23239 continue;
23240 case MODE_HI:
23241 ival = trunc_int_for_mode (ival, HImode);
23242 break;
23243 case MODE_SI:
23244 ival = trunc_int_for_mode (ival, SImode);
23245 break;
23246 default:
23247 break;
23248 }
23249 if (IN_RANGE (ival, -128, 127))
23250 {
23251 len = 1;
23252 continue;
23253 }
23254 }
23255 switch (mode)
23256 {
23257 case MODE_QI:
23258 len = 1;
23259 break;
23260 case MODE_HI:
23261 len = 2;
23262 break;
23263 case MODE_SI:
23264 len = 4;
23265 break;
23266 /* Immediates for DImode instructions are encoded as 32bit sign extended values. */
23267 case MODE_DI:
23268 len = 4;
23269 break;
23270 default:
23271 fatal_insn ("unknown insn mode", insn);
23272 }
23273 }
23274 return len;
23275 }
23276 /* Compute default value for "length_address" attribute. */
23277 int
23278 ix86_attr_length_address_default (rtx insn)
23279 {
23280 int i;
23281
23282 if (get_attr_type (insn) == TYPE_LEA)
23283 {
23284 rtx set = PATTERN (insn), addr;
23285
23286 if (GET_CODE (set) == PARALLEL)
23287 set = XVECEXP (set, 0, 0);
23288
23289 gcc_assert (GET_CODE (set) == SET);
23290
23291 addr = SET_SRC (set);
23292 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23293 {
23294 if (GET_CODE (addr) == ZERO_EXTEND)
23295 addr = XEXP (addr, 0);
23296 if (GET_CODE (addr) == SUBREG)
23297 addr = SUBREG_REG (addr);
23298 }
23299
23300 return memory_address_length (addr);
23301 }
23302
23303 extract_insn_cached (insn);
23304 for (i = recog_data.n_operands - 1; i >= 0; --i)
23305 if (MEM_P (recog_data.operand[i]))
23306 {
23307 constrain_operands_cached (reload_completed);
23308 if (which_alternative != -1)
23309 {
23310 const char *constraints = recog_data.constraints[i];
23311 int alt = which_alternative;
23312
23313 while (*constraints == '=' || *constraints == '+')
23314 constraints++;
23315 while (alt-- > 0)
23316 while (*constraints++ != ',')
23317 ;
23318 /* Skip ignored operands. */
23319 if (*constraints == 'X')
23320 continue;
23321 }
23322 return memory_address_length (XEXP (recog_data.operand[i], 0));
23323 }
23324 return 0;
23325 }
23326
23327 /* Compute default value for "length_vex" attribute. It includes
23328 2 or 3 byte VEX prefix and 1 opcode byte. */
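/* Illustrative examples: vaddps %xmm1, %xmm2, %xmm3 fits in the 2-byte
 (C5) VEX prefix, while anything that needs VEX.W, VEX.X, VEX.B or an
 opcode map other than 0f must use the 3-byte (C4) form; the checks
 below approximate that rule. */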
23329
23330 int
23331 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23332 {
23333 int i;
23334
23335 /* Only the 0f opcode map can use the 2 byte VEX prefix; the VEX.W
23336 bit requires the 3 byte VEX prefix. */
23337 if (!has_0f_opcode || has_vex_w)
23338 return 3 + 1;
23339
23340 /* We can always use 2 byte VEX prefix in 32bit. */
23341 if (!TARGET_64BIT)
23342 return 2 + 1;
23343
23344 extract_insn_cached (insn);
23345
23346 for (i = recog_data.n_operands - 1; i >= 0; --i)
23347 if (REG_P (recog_data.operand[i]))
23348 {
23349 /* REX.W bit uses 3 byte VEX prefix. */
23350 if (GET_MODE (recog_data.operand[i]) == DImode
23351 && GENERAL_REG_P (recog_data.operand[i]))
23352 return 3 + 1;
23353 }
23354 else
23355 {
23356 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23357 if (MEM_P (recog_data.operand[i])
23358 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23359 return 3 + 1;
23360 }
23361
23362 return 2 + 1;
23363 }
23364 \f
23365 /* Return the maximum number of instructions a cpu can issue. */
23366
23367 static int
23368 ix86_issue_rate (void)
23369 {
23370 switch (ix86_tune)
23371 {
23372 case PROCESSOR_PENTIUM:
23373 case PROCESSOR_ATOM:
23374 case PROCESSOR_K6:
23375 return 2;
23376
23377 case PROCESSOR_PENTIUMPRO:
23378 case PROCESSOR_PENTIUM4:
23379 case PROCESSOR_CORE2_32:
23380 case PROCESSOR_CORE2_64:
23381 case PROCESSOR_COREI7_32:
23382 case PROCESSOR_COREI7_64:
23383 case PROCESSOR_ATHLON:
23384 case PROCESSOR_K8:
23385 case PROCESSOR_AMDFAM10:
23386 case PROCESSOR_NOCONA:
23387 case PROCESSOR_GENERIC32:
23388 case PROCESSOR_GENERIC64:
23389 case PROCESSOR_BDVER1:
23390 case PROCESSOR_BDVER2:
23391 case PROCESSOR_BTVER1:
23392 return 3;
23393
23394 default:
23395 return 1;
23396 }
23397 }
23398
23399 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
23400 by DEP_INSN and nothing else set by DEP_INSN. */
23401
23402 static bool
23403 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23404 {
23405 rtx set, set2;
23406
23407 /* Simplify the test for uninteresting insns. */
23408 if (insn_type != TYPE_SETCC
23409 && insn_type != TYPE_ICMOV
23410 && insn_type != TYPE_FCMOV
23411 && insn_type != TYPE_IBR)
23412 return false;
23413
23414 if ((set = single_set (dep_insn)) != 0)
23415 {
23416 set = SET_DEST (set);
23417 set2 = NULL_RTX;
23418 }
23419 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23420 && XVECLEN (PATTERN (dep_insn), 0) == 2
23421 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23422 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23423 {
23424 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23425 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23426 }
23427 else
23428 return false;
23429
23430 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23431 return false;
23432
23433 /* This test is true if the dependent insn reads the flags but
23434 not any other potentially set register. */
23435 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23436 return false;
23437
23438 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23439 return false;
23440
23441 return true;
23442 }
23443
23444 /* Return true iff USE_INSN has a memory address with operands set by
23445 SET_INSN. */
23446
23447 bool
23448 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23449 {
23450 int i;
23451 extract_insn_cached (use_insn);
23452 for (i = recog_data.n_operands - 1; i >= 0; --i)
23453 if (MEM_P (recog_data.operand[i]))
23454 {
23455 rtx addr = XEXP (recog_data.operand[i], 0);
23456 return modified_in_p (addr, set_insn) != 0;
23457 }
23458 return false;
23459 }
23460
23461 static int
23462 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23463 {
23464 enum attr_type insn_type, dep_insn_type;
23465 enum attr_memory memory;
23466 rtx set, set2;
23467 int dep_insn_code_number;
23468
23469 /* Anti and output dependencies have zero cost on all CPUs. */
23470 if (REG_NOTE_KIND (link) != 0)
23471 return 0;
23472
23473 dep_insn_code_number = recog_memoized (dep_insn);
23474
23475 /* If we can't recognize the insns, we can't really do anything. */
23476 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23477 return cost;
23478
23479 insn_type = get_attr_type (insn);
23480 dep_insn_type = get_attr_type (dep_insn);
23481
23482 switch (ix86_tune)
23483 {
23484 case PROCESSOR_PENTIUM:
23485 /* Address Generation Interlock adds a cycle of latency. */
23486 if (insn_type == TYPE_LEA)
23487 {
23488 rtx addr = PATTERN (insn);
23489
23490 if (GET_CODE (addr) == PARALLEL)
23491 addr = XVECEXP (addr, 0, 0);
23492
23493 gcc_assert (GET_CODE (addr) == SET);
23494
23495 addr = SET_SRC (addr);
23496 if (modified_in_p (addr, dep_insn))
23497 cost += 1;
23498 }
23499 else if (ix86_agi_dependent (dep_insn, insn))
23500 cost += 1;
23501
23502 /* ??? Compares pair with jump/setcc. */
23503 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23504 cost = 0;
23505
23506 /* Floating point stores require value to be ready one cycle earlier. */
23507 if (insn_type == TYPE_FMOV
23508 && get_attr_memory (insn) == MEMORY_STORE
23509 && !ix86_agi_dependent (dep_insn, insn))
23510 cost += 1;
23511 break;
23512
23513 case PROCESSOR_PENTIUMPRO:
23514 memory = get_attr_memory (insn);
23515
23516 /* INT->FP conversion is expensive. */
23517 if (get_attr_fp_int_src (dep_insn))
23518 cost += 5;
23519
23520 /* There is one cycle extra latency between an FP op and a store. */
23521 if (insn_type == TYPE_FMOV
23522 && (set = single_set (dep_insn)) != NULL_RTX
23523 && (set2 = single_set (insn)) != NULL_RTX
23524 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23525 && MEM_P (SET_DEST (set2)))
23526 cost += 1;
23527
23528 /* Show ability of reorder buffer to hide latency of load by executing
23529 in parallel with previous instruction in case
23530 previous instruction is not needed to compute the address. */
23531 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23532 && !ix86_agi_dependent (dep_insn, insn))
23533 {
23534 /* Claim moves to take one cycle, as the core can issue one load
23535 at a time and the next load can start a cycle later. */
23536 if (dep_insn_type == TYPE_IMOV
23537 || dep_insn_type == TYPE_FMOV)
23538 cost = 1;
23539 else if (cost > 1)
23540 cost--;
23541 }
23542 break;
23543
23544 case PROCESSOR_K6:
23545 memory = get_attr_memory (insn);
23546
23547 /* The esp dependency is resolved before the instruction is really
23548 finished. */
23549 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23550 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23551 return 1;
23552
23553 /* INT->FP conversion is expensive. */
23554 if (get_attr_fp_int_src (dep_insn))
23555 cost += 5;
23556
23557 /* Show ability of reorder buffer to hide latency of load by executing
23558 in parallel with previous instruction in case
23559 previous instruction is not needed to compute the address. */
23560 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23561 && !ix86_agi_dependent (dep_insn, insn))
23562 {
23563 /* Claim moves to take one cycle, as the core can issue one load
23564 at a time and the next load can start a cycle later. */
23565 if (dep_insn_type == TYPE_IMOV
23566 || dep_insn_type == TYPE_FMOV)
23567 cost = 1;
23568 else if (cost > 2)
23569 cost -= 2;
23570 else
23571 cost = 1;
23572 }
23573 break;
23574
23575 case PROCESSOR_ATHLON:
23576 case PROCESSOR_K8:
23577 case PROCESSOR_AMDFAM10:
23578 case PROCESSOR_BDVER1:
23579 case PROCESSOR_BDVER2:
23580 case PROCESSOR_BTVER1:
23581 case PROCESSOR_ATOM:
23582 case PROCESSOR_GENERIC32:
23583 case PROCESSOR_GENERIC64:
23584 memory = get_attr_memory (insn);
23585
23586 /* Show ability of reorder buffer to hide latency of load by executing
23587 in parallel with previous instruction in case
23588 previous instruction is not needed to compute the address. */
23589 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23590 && !ix86_agi_dependent (dep_insn, insn))
23591 {
23592 enum attr_unit unit = get_attr_unit (insn);
23593 int loadcost = 3;
23594
23595 /* Because of the difference between the lengths of the integer and
23596 floating unit pipeline preparation stages, the memory operands
23597 for floating point are cheaper.
23598 
23599 ??? For Athlon the difference is most probably 2. */
23600 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23601 loadcost = 3;
23602 else
23603 loadcost = TARGET_ATHLON ? 2 : 0;
23604
23605 if (cost >= loadcost)
23606 cost -= loadcost;
23607 else
23608 cost = 0;
23609 }
23610 break;
23611 default:
23612 break;
23613 }
23614
23615 return cost;
23616 }
23617
23618 /* How many alternative schedules to try. This should be as wide as the
23619 scheduling freedom in the DFA, but no wider. Making this value too
23620 large results in extra work for the scheduler. */
23621
23622 static int
23623 ia32_multipass_dfa_lookahead (void)
23624 {
23625 switch (ix86_tune)
23626 {
23627 case PROCESSOR_PENTIUM:
23628 return 2;
23629
23630 case PROCESSOR_PENTIUMPRO:
23631 case PROCESSOR_K6:
23632 return 1;
23633
23634 case PROCESSOR_CORE2_32:
23635 case PROCESSOR_CORE2_64:
23636 case PROCESSOR_COREI7_32:
23637 case PROCESSOR_COREI7_64:
23638 /* Generally, we want haifa-sched:max_issue() to look ahead as far
23639 as the number of instructions that can be executed in one cycle, i.e.,
23640 issue_rate. I wonder why tuning for many CPUs does not do this. */
23641 return ix86_issue_rate ();
23642
23643 default:
23644 return 0;
23645 }
23646 }
23647
23648 \f
23649
23650 /* Model the decoder of Core 2/i7.
23651 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
23652 track the instruction fetch block boundaries and make sure that long
23653 (9+ byte) instructions are assigned to D0. */
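/* An illustrative sketch of the policy the filter below implements, using
   the parameter values set in ix86_sched_init_global: with a 16-byte ifetch
   block and at most 6 insns decoded per cycle, an insn is masked out of the
   ready set for the current cycle if it would push the running block length
   past 16 bytes, if the decoder already holds 6 insns, or if it is longer
   than 8 bytes and would not be the first insn issued this cycle (only the
   first decoder, D0, handles such long insns).  This is a simplified model,
   not an exact description of the hardware.  */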
23654
23655 /* Maximum length of an insn that can be handled by
23656 a secondary decoder unit. '8' for Core 2/i7. */
23657 static int core2i7_secondary_decoder_max_insn_size;
23658
23659 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
23660 '16' for Core 2/i7. */
23661 static int core2i7_ifetch_block_size;
23662
23663 /* Maximum number of instructions decoder can handle per cycle.
23664 '6' for Core 2/i7. */
23665 static int core2i7_ifetch_block_max_insns;
23666
23667 typedef struct ix86_first_cycle_multipass_data_ *
23668 ix86_first_cycle_multipass_data_t;
23669 typedef const struct ix86_first_cycle_multipass_data_ *
23670 const_ix86_first_cycle_multipass_data_t;
23671
23672 /* A variable to store target state across calls to max_issue within
23673 one cycle. */
23674 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
23675 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
23676
23677 /* Initialize DATA. */
23678 static void
23679 core2i7_first_cycle_multipass_init (void *_data)
23680 {
23681 ix86_first_cycle_multipass_data_t data
23682 = (ix86_first_cycle_multipass_data_t) _data;
23683
23684 data->ifetch_block_len = 0;
23685 data->ifetch_block_n_insns = 0;
23686 data->ready_try_change = NULL;
23687 data->ready_try_change_size = 0;
23688 }
23689
23690 /* The cycle has just advanced; reset the ifetch block counts. */
23691 static void
23692 core2i7_dfa_post_advance_cycle (void)
23693 {
23694 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
23695
23696 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23697
23698 data->ifetch_block_len = 0;
23699 data->ifetch_block_n_insns = 0;
23700 }
23701
23702 static int min_insn_size (rtx);
23703
23704 /* Filter out insns from ready_try that the core will not be able to issue
23705 on the current cycle due to decoder restrictions. */
23706 static void
23707 core2i7_first_cycle_multipass_filter_ready_try
23708 (const_ix86_first_cycle_multipass_data_t data,
23709 char *ready_try, int n_ready, bool first_cycle_insn_p)
23710 {
23711 while (n_ready--)
23712 {
23713 rtx insn;
23714 int insn_size;
23715
23716 if (ready_try[n_ready])
23717 continue;
23718
23719 insn = get_ready_element (n_ready);
23720 insn_size = min_insn_size (insn);
23721
23722 if (/* If this is too long an insn for a secondary decoder ... */
23723 (!first_cycle_insn_p
23724 && insn_size > core2i7_secondary_decoder_max_insn_size)
23725 /* ... or it would not fit into the ifetch block ... */
23726 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
23727 /* ... or the decoder is full already ... */
23728 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
23729 /* ... mask the insn out. */
23730 {
23731 ready_try[n_ready] = 1;
23732
23733 if (data->ready_try_change)
23734 SET_BIT (data->ready_try_change, n_ready);
23735 }
23736 }
23737 }
23738
23739 /* Prepare for a new round of multipass lookahead scheduling. */
23740 static void
23741 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
23742 bool first_cycle_insn_p)
23743 {
23744 ix86_first_cycle_multipass_data_t data
23745 = (ix86_first_cycle_multipass_data_t) _data;
23746 const_ix86_first_cycle_multipass_data_t prev_data
23747 = ix86_first_cycle_multipass_data;
23748
23749 /* Restore the state from the end of the previous round. */
23750 data->ifetch_block_len = prev_data->ifetch_block_len;
23751 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
23752
23753 /* Filter instructions that cannot be issued on current cycle due to
23754 decoder restrictions. */
23755 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23756 first_cycle_insn_p);
23757 }
23758
23759 /* INSN is being issued in the current solution. Account for its impact on
23760 the decoder model. */
23761 static void
23762 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
23763 rtx insn, const void *_prev_data)
23764 {
23765 ix86_first_cycle_multipass_data_t data
23766 = (ix86_first_cycle_multipass_data_t) _data;
23767 const_ix86_first_cycle_multipass_data_t prev_data
23768 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
23769
23770 int insn_size = min_insn_size (insn);
23771
23772 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
23773 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
23774 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
23775 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23776
23777 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
23778 if (!data->ready_try_change)
23779 {
23780 data->ready_try_change = sbitmap_alloc (n_ready);
23781 data->ready_try_change_size = n_ready;
23782 }
23783 else if (data->ready_try_change_size < n_ready)
23784 {
23785 data->ready_try_change = sbitmap_resize (data->ready_try_change,
23786 n_ready, 0);
23787 data->ready_try_change_size = n_ready;
23788 }
23789 sbitmap_zero (data->ready_try_change);
23790
23791 /* Filter out insns from ready_try that the core will not be able to issue
23792 on the current cycle due to decoder restrictions. */
23793 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23794 false);
23795 }
23796
23797 /* Revert the effect on ready_try. */
23798 static void
23799 core2i7_first_cycle_multipass_backtrack (const void *_data,
23800 char *ready_try,
23801 int n_ready ATTRIBUTE_UNUSED)
23802 {
23803 const_ix86_first_cycle_multipass_data_t data
23804 = (const_ix86_first_cycle_multipass_data_t) _data;
23805 unsigned int i = 0;
23806 sbitmap_iterator sbi;
23807
23808 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
23809 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
23810 {
23811 ready_try[i] = 0;
23812 }
23813 }
23814
23815 /* Save the result of multipass lookahead scheduling for the next round. */
23816 static void
23817 core2i7_first_cycle_multipass_end (const void *_data)
23818 {
23819 const_ix86_first_cycle_multipass_data_t data
23820 = (const_ix86_first_cycle_multipass_data_t) _data;
23821 ix86_first_cycle_multipass_data_t next_data
23822 = ix86_first_cycle_multipass_data;
23823
23824 if (data != NULL)
23825 {
23826 next_data->ifetch_block_len = data->ifetch_block_len;
23827 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
23828 }
23829 }
23830
23831 /* Deallocate target data. */
23832 static void
23833 core2i7_first_cycle_multipass_fini (void *_data)
23834 {
23835 ix86_first_cycle_multipass_data_t data
23836 = (ix86_first_cycle_multipass_data_t) _data;
23837
23838 if (data->ready_try_change)
23839 {
23840 sbitmap_free (data->ready_try_change);
23841 data->ready_try_change = NULL;
23842 data->ready_try_change_size = 0;
23843 }
23844 }
23845
23846 /* Prepare for scheduling pass. */
23847 static void
23848 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
23849 int verbose ATTRIBUTE_UNUSED,
23850 int max_uid ATTRIBUTE_UNUSED)
23851 {
23852 /* Install scheduling hooks for current CPU. Some of these hooks are used
23853 in time-critical parts of the scheduler, so we only set them up when
23854 they are actually used. */
23855 switch (ix86_tune)
23856 {
23857 case PROCESSOR_CORE2_32:
23858 case PROCESSOR_CORE2_64:
23859 case PROCESSOR_COREI7_32:
23860 case PROCESSOR_COREI7_64:
23861 targetm.sched.dfa_post_advance_cycle
23862 = core2i7_dfa_post_advance_cycle;
23863 targetm.sched.first_cycle_multipass_init
23864 = core2i7_first_cycle_multipass_init;
23865 targetm.sched.first_cycle_multipass_begin
23866 = core2i7_first_cycle_multipass_begin;
23867 targetm.sched.first_cycle_multipass_issue
23868 = core2i7_first_cycle_multipass_issue;
23869 targetm.sched.first_cycle_multipass_backtrack
23870 = core2i7_first_cycle_multipass_backtrack;
23871 targetm.sched.first_cycle_multipass_end
23872 = core2i7_first_cycle_multipass_end;
23873 targetm.sched.first_cycle_multipass_fini
23874 = core2i7_first_cycle_multipass_fini;
23875
23876 /* Set decoder parameters. */
23877 core2i7_secondary_decoder_max_insn_size = 8;
23878 core2i7_ifetch_block_size = 16;
23879 core2i7_ifetch_block_max_insns = 6;
23880 break;
23881
23882 default:
23883 targetm.sched.dfa_post_advance_cycle = NULL;
23884 targetm.sched.first_cycle_multipass_init = NULL;
23885 targetm.sched.first_cycle_multipass_begin = NULL;
23886 targetm.sched.first_cycle_multipass_issue = NULL;
23887 targetm.sched.first_cycle_multipass_backtrack = NULL;
23888 targetm.sched.first_cycle_multipass_end = NULL;
23889 targetm.sched.first_cycle_multipass_fini = NULL;
23890 break;
23891 }
23892 }
23893
23894 \f
23895 /* Compute the alignment given to a constant that is being placed in memory.
23896 EXP is the constant and ALIGN is the alignment that the object would
23897 ordinarily have.
23898 The value of this function is used instead of that alignment to align
23899 the object. */
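/* Illustrative examples of the policy below: a DFmode constant that would
   ordinarily get only 32-bit alignment is given 64-bit alignment, a constant
   whose mode satisfies ALIGN_MODE_128 is given 128-bit alignment, and a
   sufficiently long string constant (31 bytes or more) is given word
   alignment when not optimizing for size.  */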
23900
23901 int
23902 ix86_constant_alignment (tree exp, int align)
23903 {
23904 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
23905 || TREE_CODE (exp) == INTEGER_CST)
23906 {
23907 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
23908 return 64;
23909 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
23910 return 128;
23911 }
23912 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
23913 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
23914 return BITS_PER_WORD;
23915
23916 return align;
23917 }
23918
23919 /* Compute the alignment for a static variable.
23920 TYPE is the data type, and ALIGN is the alignment that
23921 the object would ordinarily have. The value of this function is used
23922 instead of that alignment to align the object. */
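/* Worked example (illustrative): for a 300-byte global aggregate compiled
   without -Os, max_align is MIN (256, MAX_OFILE_ALIGNMENT), typically 256
   bits; the object's size exceeds that and its default alignment is below
   it, so the code below bumps the object to 256-bit (32-byte) alignment.  */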
23923
23924 int
23925 ix86_data_alignment (tree type, int align)
23926 {
23927 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
23928
23929 if (AGGREGATE_TYPE_P (type)
23930 && TYPE_SIZE (type)
23931 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23932 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
23933 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
23934 && align < max_align)
23935 align = max_align;
23936
23937 /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
23938 to a 16-byte boundary. */
23939 if (TARGET_64BIT)
23940 {
23941 if (AGGREGATE_TYPE_P (type)
23942 && TYPE_SIZE (type)
23943 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23944 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
23945 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
23946 return 128;
23947 }
23948
23949 if (TREE_CODE (type) == ARRAY_TYPE)
23950 {
23951 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
23952 return 64;
23953 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
23954 return 128;
23955 }
23956 else if (TREE_CODE (type) == COMPLEX_TYPE)
23957 {
23958
23959 if (TYPE_MODE (type) == DCmode && align < 64)
23960 return 64;
23961 if ((TYPE_MODE (type) == XCmode
23962 || TYPE_MODE (type) == TCmode) && align < 128)
23963 return 128;
23964 }
23965 else if ((TREE_CODE (type) == RECORD_TYPE
23966 || TREE_CODE (type) == UNION_TYPE
23967 || TREE_CODE (type) == QUAL_UNION_TYPE)
23968 && TYPE_FIELDS (type))
23969 {
23970 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
23971 return 64;
23972 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
23973 return 128;
23974 }
23975 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
23976 || TREE_CODE (type) == INTEGER_TYPE)
23977 {
23978 if (TYPE_MODE (type) == DFmode && align < 64)
23979 return 64;
23980 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
23981 return 128;
23982 }
23983
23984 return align;
23985 }
23986
23987 /* Compute the alignment for a local variable or a stack slot. EXP is
23988 the data type or decl itself, MODE is the widest mode available and
23989 ALIGN is the alignment that the object would ordinarily have. The
23990 value of this macro is used instead of that alignment to align the
23991 object. */
23992
23993 unsigned int
23994 ix86_local_alignment (tree exp, enum machine_mode mode,
23995 unsigned int align)
23996 {
23997 tree type, decl;
23998
23999 if (exp && DECL_P (exp))
24000 {
24001 type = TREE_TYPE (exp);
24002 decl = exp;
24003 }
24004 else
24005 {
24006 type = exp;
24007 decl = NULL;
24008 }
24009
24010 /* Don't do dynamic stack realignment for long long objects with
24011 -mpreferred-stack-boundary=2. */
24012 if (!TARGET_64BIT
24013 && align == 64
24014 && ix86_preferred_stack_boundary < 64
24015 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24016 && (!type || !TYPE_USER_ALIGN (type))
24017 && (!decl || !DECL_USER_ALIGN (decl)))
24018 align = 32;
24019
24020 /* If TYPE is NULL, we are allocating a stack slot for caller-save
24021 register in MODE. We will return the largest alignment of XF
24022 and DF. */
24023 if (!type)
24024 {
24025 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24026 align = GET_MODE_ALIGNMENT (DFmode);
24027 return align;
24028 }
24029
24030 /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
24031 to a 16-byte boundary. The exact wording is:
24032
24033 An array uses the same alignment as its elements, except that a local or
24034 global array variable of length at least 16 bytes or
24035 a C99 variable-length array variable always has alignment of at least 16 bytes.
24036
24037 This was added to allow use of aligned SSE instructions on arrays. The
24038 rule is meant for static storage (where the compiler cannot do the
24039 analysis by itself). We follow it for automatic variables only when
24040 convenient: we fully control everything in the function being compiled,
24041 and functions from other units cannot rely on the alignment.
24042 
24043 Exclude the va_list type. It is the common case of a local array where
24044 we cannot benefit from the alignment. */
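/* Illustrative example: a local "double buf[4]" (32 bytes) on x86-64 with
   SSE enabled and optimizing for speed is given 128-bit alignment by the
   check below, even though its element type only needs 64-bit alignment,
   so aligned vector loads and stores can be used on it.  */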
24045 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24046 && TARGET_SSE)
24047 {
24048 if (AGGREGATE_TYPE_P (type)
24049 && (va_list_type_node == NULL_TREE
24050 || (TYPE_MAIN_VARIANT (type)
24051 != TYPE_MAIN_VARIANT (va_list_type_node)))
24052 && TYPE_SIZE (type)
24053 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24054 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24055 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24056 return 128;
24057 }
24058 if (TREE_CODE (type) == ARRAY_TYPE)
24059 {
24060 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24061 return 64;
24062 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24063 return 128;
24064 }
24065 else if (TREE_CODE (type) == COMPLEX_TYPE)
24066 {
24067 if (TYPE_MODE (type) == DCmode && align < 64)
24068 return 64;
24069 if ((TYPE_MODE (type) == XCmode
24070 || TYPE_MODE (type) == TCmode) && align < 128)
24071 return 128;
24072 }
24073 else if ((TREE_CODE (type) == RECORD_TYPE
24074 || TREE_CODE (type) == UNION_TYPE
24075 || TREE_CODE (type) == QUAL_UNION_TYPE)
24076 && TYPE_FIELDS (type))
24077 {
24078 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24079 return 64;
24080 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24081 return 128;
24082 }
24083 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24084 || TREE_CODE (type) == INTEGER_TYPE)
24085 {
24086
24087 if (TYPE_MODE (type) == DFmode && align < 64)
24088 return 64;
24089 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24090 return 128;
24091 }
24092 return align;
24093 }
24094
24095 /* Compute the minimum required alignment for dynamic stack realignment
24096 purposes for a local variable, parameter or a stack slot. EXP is
24097 the data type or decl itself, MODE is its mode and ALIGN is the
24098 alignment that the object would ordinarily have. */
24099
24100 unsigned int
24101 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24102 unsigned int align)
24103 {
24104 tree type, decl;
24105
24106 if (exp && DECL_P (exp))
24107 {
24108 type = TREE_TYPE (exp);
24109 decl = exp;
24110 }
24111 else
24112 {
24113 type = exp;
24114 decl = NULL;
24115 }
24116
24117 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24118 return align;
24119
24120 /* Don't do dynamic stack realignment for long long objects with
24121 -mpreferred-stack-boundary=2. */
24122 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24123 && (!type || !TYPE_USER_ALIGN (type))
24124 && (!decl || !DECL_USER_ALIGN (decl)))
24125 return 32;
24126
24127 return align;
24128 }
24129 \f
24130 /* Find a location for the static chain incoming to a nested function.
24131 This is a register, unless all free registers are used by arguments. */
24132
24133 static rtx
24134 ix86_static_chain (const_tree fndecl, bool incoming_p)
24135 {
24136 unsigned regno;
24137
24138 if (!DECL_STATIC_CHAIN (fndecl))
24139 return NULL;
24140
24141 if (TARGET_64BIT)
24142 {
24143 /* We always use R10 in 64-bit mode. */
24144 regno = R10_REG;
24145 }
24146 else
24147 {
24148 tree fntype;
24149 unsigned int ccvt;
24150
24151 /* By default in 32-bit mode we use ECX to pass the static chain. */
24152 regno = CX_REG;
24153
24154 fntype = TREE_TYPE (fndecl);
24155 ccvt = ix86_get_callcvt (fntype);
24156 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24157 {
24158 /* Fastcall functions use ecx/edx for arguments, which leaves
24159 us with EAX for the static chain.
24160 Thiscall functions use ecx for arguments, which also
24161 leaves us with EAX for the static chain. */
24162 regno = AX_REG;
24163 }
24164 else if (ix86_function_regparm (fntype, fndecl) == 3)
24165 {
24166 /* For regparm 3, we have no free call-clobbered registers in
24167 which to store the static chain. In order to implement this,
24168 we have the trampoline push the static chain to the stack.
24169 However, we can't push a value below the return address when
24170 we call the nested function directly, so we have to use an
24171 alternate entry point. For this we use ESI, and have the
24172 alternate entry point push ESI, so that things appear the
24173 same once we're executing the nested function. */
24174 if (incoming_p)
24175 {
24176 if (fndecl == current_function_decl)
24177 ix86_static_chain_on_stack = true;
24178 return gen_frame_mem (SImode,
24179 plus_constant (arg_pointer_rtx, -8));
24180 }
24181 regno = SI_REG;
24182 }
24183 }
24184
24185 return gen_rtx_REG (Pmode, regno);
24186 }
24187
24188 /* Emit RTL insns to initialize the variable parts of a trampoline.
24189 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24190 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24191 to be passed to the target function. */
24192
24193 static void
24194 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24195 {
24196 rtx mem, fnaddr;
24197 int opcode;
24198 int offset = 0;
24199
24200 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24201
24202 if (TARGET_64BIT)
24203 {
24204 int size;
24205
24206 /* Load the function address to r11. Try to load address using
24207 the shorter movl instead of movabs. We may want to support
24208 movq for kernel mode, but kernel does not use trampolines at
24209 the moment. */
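/* For reference, an illustrative byte-level sketch of the 64-bit trampoline
   written below, shown with the movabs encodings (the shorter movl
   encodings, 41 bb for a function address that fits in 32 bits and 41 ba
   for the static chain on x32, are used where the code below chooses them):
	49 bb <imm64>	movabs $fnaddr, %r11
	49 ba <imm64>	movabs $chain,  %r10
	49 ff e3	jmp    *%r11
	90		nop	; pads the final write to a 32-bit store  */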
24210 if (x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24211 {
24212 fnaddr = copy_to_mode_reg (DImode, fnaddr);
24213
24214 mem = adjust_address (m_tramp, HImode, offset);
24215 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24216
24217 mem = adjust_address (m_tramp, SImode, offset + 2);
24218 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24219 offset += 6;
24220 }
24221 else
24222 {
24223 mem = adjust_address (m_tramp, HImode, offset);
24224 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24225
24226 mem = adjust_address (m_tramp, DImode, offset + 2);
24227 emit_move_insn (mem, fnaddr);
24228 offset += 10;
24229 }
24230
24231 /* Load static chain using movabs to r10. Use the
24232 shorter movl instead of movabs for x32. */
24233 if (TARGET_X32)
24234 {
24235 opcode = 0xba41;
24236 size = 6;
24237 }
24238 else
24239 {
24240 opcode = 0xba49;
24241 size = 10;
24242 }
24243
24244 mem = adjust_address (m_tramp, HImode, offset);
24245 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24246
24247 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24248 emit_move_insn (mem, chain_value);
24249 offset += size;
24250
24251 /* Jump to r11; the last (unused) byte is a nop, only there to
24252 pad the write out to a single 32-bit store. */
24253 mem = adjust_address (m_tramp, SImode, offset);
24254 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24255 offset += 4;
24256 }
24257 else
24258 {
24259 rtx disp, chain;
24260
24261 /* Depending on the static chain location, either load a register
24262 with a constant, or push the constant to the stack. All of the
24263 instructions are the same size. */
24264 chain = ix86_static_chain (fndecl, true);
24265 if (REG_P (chain))
24266 {
24267 switch (REGNO (chain))
24268 {
24269 case AX_REG:
24270 opcode = 0xb8; break;
24271 case CX_REG:
24272 opcode = 0xb9; break;
24273 default:
24274 gcc_unreachable ();
24275 }
24276 }
24277 else
24278 opcode = 0x68;
24279
24280 mem = adjust_address (m_tramp, QImode, offset);
24281 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24282
24283 mem = adjust_address (m_tramp, SImode, offset + 1);
24284 emit_move_insn (mem, chain_value);
24285 offset += 5;
24286
24287 mem = adjust_address (m_tramp, QImode, offset);
24288 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24289
24290 mem = adjust_address (m_tramp, SImode, offset + 1);
24291
24292 /* Compute offset from the end of the jmp to the target function.
24293 In the case in which the trampoline stores the static chain on
24294 the stack, we need to skip the first insn which pushes the
24295 (call-saved) register static chain; this push is 1 byte. */
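/* Illustrative sketch of the 32-bit trampoline assembled above, for the
   common case where the static chain lives in %ecx:
	b9 <imm32>	movl  $chain, %ecx
	e9 <rel32>	jmp   <target>
   The first byte is b8 when the chain is in %eax (fastcall/thiscall) and
   68 (pushl $chain) in the regparm-3 case; in the latter case the
   displacement computed below is biased by one byte so the jump lands just
   past the one-byte push at the nested function's alternate entry point.  */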
24296 offset += 5;
24297 disp = expand_binop (SImode, sub_optab, fnaddr,
24298 plus_constant (XEXP (m_tramp, 0),
24299 offset - (MEM_P (chain) ? 1 : 0)),
24300 NULL_RTX, 1, OPTAB_DIRECT);
24301 emit_move_insn (mem, disp);
24302 }
24303
24304 gcc_assert (offset <= TRAMPOLINE_SIZE);
24305
24306 #ifdef HAVE_ENABLE_EXECUTE_STACK
24307 #ifdef CHECK_EXECUTE_STACK_ENABLED
24308 if (CHECK_EXECUTE_STACK_ENABLED)
24309 #endif
24310 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24311 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24312 #endif
24313 }
24314 \f
24315 /* The following file contains several enumerations and data structures
24316 built from the definitions in i386-builtin-types.def. */
24317
24318 #include "i386-builtin-types.inc"
24319
24320 /* Table for the ix86 builtin non-function types. */
24321 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24322
24323 /* Retrieve an element from the above table, building some of
24324 the types lazily. */
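/* Note on the layout assumed below: the type codes generated in
   i386-builtin-types.inc are ordered as primitive types, then vector types,
   then pointer types, then pointer-to-const types; the index arithmetic in
   this function relies on that ordering.  */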
24325
24326 static tree
24327 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24328 {
24329 unsigned int index;
24330 tree type, itype;
24331
24332 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24333
24334 type = ix86_builtin_type_tab[(int) tcode];
24335 if (type != NULL)
24336 return type;
24337
24338 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24339 if (tcode <= IX86_BT_LAST_VECT)
24340 {
24341 enum machine_mode mode;
24342
24343 index = tcode - IX86_BT_LAST_PRIM - 1;
24344 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24345 mode = ix86_builtin_type_vect_mode[index];
24346
24347 type = build_vector_type_for_mode (itype, mode);
24348 }
24349 else
24350 {
24351 int quals;
24352
24353 index = tcode - IX86_BT_LAST_VECT - 1;
24354 if (tcode <= IX86_BT_LAST_PTR)
24355 quals = TYPE_UNQUALIFIED;
24356 else
24357 quals = TYPE_QUAL_CONST;
24358
24359 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24360 if (quals != TYPE_UNQUALIFIED)
24361 itype = build_qualified_type (itype, quals);
24362
24363 type = build_pointer_type (itype);
24364 }
24365
24366 ix86_builtin_type_tab[(int) tcode] = type;
24367 return type;
24368 }
24369
24370 /* Table for the ix86 builtin function types. */
24371 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24372
24373 /* Retrieve an element from the above table, building some of
24374 the types lazily. */
24375
24376 static tree
24377 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24378 {
24379 tree type;
24380
24381 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24382
24383 type = ix86_builtin_func_type_tab[(int) tcode];
24384 if (type != NULL)
24385 return type;
24386
24387 if (tcode <= IX86_BT_LAST_FUNC)
24388 {
24389 unsigned start = ix86_builtin_func_start[(int) tcode];
24390 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24391 tree rtype, atype, args = void_list_node;
24392 unsigned i;
24393
24394 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24395 for (i = after - 1; i > start; --i)
24396 {
24397 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24398 args = tree_cons (NULL, atype, args);
24399 }
24400
24401 type = build_function_type (rtype, args);
24402 }
24403 else
24404 {
24405 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24406 enum ix86_builtin_func_type icode;
24407
24408 icode = ix86_builtin_func_alias_base[index];
24409 type = ix86_get_builtin_func_type (icode);
24410 }
24411
24412 ix86_builtin_func_type_tab[(int) tcode] = type;
24413 return type;
24414 }
24415
24416
24417 /* Codes for all the SSE/MMX builtins. */
24418 enum ix86_builtins
24419 {
24420 IX86_BUILTIN_ADDPS,
24421 IX86_BUILTIN_ADDSS,
24422 IX86_BUILTIN_DIVPS,
24423 IX86_BUILTIN_DIVSS,
24424 IX86_BUILTIN_MULPS,
24425 IX86_BUILTIN_MULSS,
24426 IX86_BUILTIN_SUBPS,
24427 IX86_BUILTIN_SUBSS,
24428
24429 IX86_BUILTIN_CMPEQPS,
24430 IX86_BUILTIN_CMPLTPS,
24431 IX86_BUILTIN_CMPLEPS,
24432 IX86_BUILTIN_CMPGTPS,
24433 IX86_BUILTIN_CMPGEPS,
24434 IX86_BUILTIN_CMPNEQPS,
24435 IX86_BUILTIN_CMPNLTPS,
24436 IX86_BUILTIN_CMPNLEPS,
24437 IX86_BUILTIN_CMPNGTPS,
24438 IX86_BUILTIN_CMPNGEPS,
24439 IX86_BUILTIN_CMPORDPS,
24440 IX86_BUILTIN_CMPUNORDPS,
24441 IX86_BUILTIN_CMPEQSS,
24442 IX86_BUILTIN_CMPLTSS,
24443 IX86_BUILTIN_CMPLESS,
24444 IX86_BUILTIN_CMPNEQSS,
24445 IX86_BUILTIN_CMPNLTSS,
24446 IX86_BUILTIN_CMPNLESS,
24447 IX86_BUILTIN_CMPNGTSS,
24448 IX86_BUILTIN_CMPNGESS,
24449 IX86_BUILTIN_CMPORDSS,
24450 IX86_BUILTIN_CMPUNORDSS,
24451
24452 IX86_BUILTIN_COMIEQSS,
24453 IX86_BUILTIN_COMILTSS,
24454 IX86_BUILTIN_COMILESS,
24455 IX86_BUILTIN_COMIGTSS,
24456 IX86_BUILTIN_COMIGESS,
24457 IX86_BUILTIN_COMINEQSS,
24458 IX86_BUILTIN_UCOMIEQSS,
24459 IX86_BUILTIN_UCOMILTSS,
24460 IX86_BUILTIN_UCOMILESS,
24461 IX86_BUILTIN_UCOMIGTSS,
24462 IX86_BUILTIN_UCOMIGESS,
24463 IX86_BUILTIN_UCOMINEQSS,
24464
24465 IX86_BUILTIN_CVTPI2PS,
24466 IX86_BUILTIN_CVTPS2PI,
24467 IX86_BUILTIN_CVTSI2SS,
24468 IX86_BUILTIN_CVTSI642SS,
24469 IX86_BUILTIN_CVTSS2SI,
24470 IX86_BUILTIN_CVTSS2SI64,
24471 IX86_BUILTIN_CVTTPS2PI,
24472 IX86_BUILTIN_CVTTSS2SI,
24473 IX86_BUILTIN_CVTTSS2SI64,
24474
24475 IX86_BUILTIN_MAXPS,
24476 IX86_BUILTIN_MAXSS,
24477 IX86_BUILTIN_MINPS,
24478 IX86_BUILTIN_MINSS,
24479
24480 IX86_BUILTIN_LOADUPS,
24481 IX86_BUILTIN_STOREUPS,
24482 IX86_BUILTIN_MOVSS,
24483
24484 IX86_BUILTIN_MOVHLPS,
24485 IX86_BUILTIN_MOVLHPS,
24486 IX86_BUILTIN_LOADHPS,
24487 IX86_BUILTIN_LOADLPS,
24488 IX86_BUILTIN_STOREHPS,
24489 IX86_BUILTIN_STORELPS,
24490
24491 IX86_BUILTIN_MASKMOVQ,
24492 IX86_BUILTIN_MOVMSKPS,
24493 IX86_BUILTIN_PMOVMSKB,
24494
24495 IX86_BUILTIN_MOVNTPS,
24496 IX86_BUILTIN_MOVNTQ,
24497
24498 IX86_BUILTIN_LOADDQU,
24499 IX86_BUILTIN_STOREDQU,
24500
24501 IX86_BUILTIN_PACKSSWB,
24502 IX86_BUILTIN_PACKSSDW,
24503 IX86_BUILTIN_PACKUSWB,
24504
24505 IX86_BUILTIN_PADDB,
24506 IX86_BUILTIN_PADDW,
24507 IX86_BUILTIN_PADDD,
24508 IX86_BUILTIN_PADDQ,
24509 IX86_BUILTIN_PADDSB,
24510 IX86_BUILTIN_PADDSW,
24511 IX86_BUILTIN_PADDUSB,
24512 IX86_BUILTIN_PADDUSW,
24513 IX86_BUILTIN_PSUBB,
24514 IX86_BUILTIN_PSUBW,
24515 IX86_BUILTIN_PSUBD,
24516 IX86_BUILTIN_PSUBQ,
24517 IX86_BUILTIN_PSUBSB,
24518 IX86_BUILTIN_PSUBSW,
24519 IX86_BUILTIN_PSUBUSB,
24520 IX86_BUILTIN_PSUBUSW,
24521
24522 IX86_BUILTIN_PAND,
24523 IX86_BUILTIN_PANDN,
24524 IX86_BUILTIN_POR,
24525 IX86_BUILTIN_PXOR,
24526
24527 IX86_BUILTIN_PAVGB,
24528 IX86_BUILTIN_PAVGW,
24529
24530 IX86_BUILTIN_PCMPEQB,
24531 IX86_BUILTIN_PCMPEQW,
24532 IX86_BUILTIN_PCMPEQD,
24533 IX86_BUILTIN_PCMPGTB,
24534 IX86_BUILTIN_PCMPGTW,
24535 IX86_BUILTIN_PCMPGTD,
24536
24537 IX86_BUILTIN_PMADDWD,
24538
24539 IX86_BUILTIN_PMAXSW,
24540 IX86_BUILTIN_PMAXUB,
24541 IX86_BUILTIN_PMINSW,
24542 IX86_BUILTIN_PMINUB,
24543
24544 IX86_BUILTIN_PMULHUW,
24545 IX86_BUILTIN_PMULHW,
24546 IX86_BUILTIN_PMULLW,
24547
24548 IX86_BUILTIN_PSADBW,
24549 IX86_BUILTIN_PSHUFW,
24550
24551 IX86_BUILTIN_PSLLW,
24552 IX86_BUILTIN_PSLLD,
24553 IX86_BUILTIN_PSLLQ,
24554 IX86_BUILTIN_PSRAW,
24555 IX86_BUILTIN_PSRAD,
24556 IX86_BUILTIN_PSRLW,
24557 IX86_BUILTIN_PSRLD,
24558 IX86_BUILTIN_PSRLQ,
24559 IX86_BUILTIN_PSLLWI,
24560 IX86_BUILTIN_PSLLDI,
24561 IX86_BUILTIN_PSLLQI,
24562 IX86_BUILTIN_PSRAWI,
24563 IX86_BUILTIN_PSRADI,
24564 IX86_BUILTIN_PSRLWI,
24565 IX86_BUILTIN_PSRLDI,
24566 IX86_BUILTIN_PSRLQI,
24567
24568 IX86_BUILTIN_PUNPCKHBW,
24569 IX86_BUILTIN_PUNPCKHWD,
24570 IX86_BUILTIN_PUNPCKHDQ,
24571 IX86_BUILTIN_PUNPCKLBW,
24572 IX86_BUILTIN_PUNPCKLWD,
24573 IX86_BUILTIN_PUNPCKLDQ,
24574
24575 IX86_BUILTIN_SHUFPS,
24576
24577 IX86_BUILTIN_RCPPS,
24578 IX86_BUILTIN_RCPSS,
24579 IX86_BUILTIN_RSQRTPS,
24580 IX86_BUILTIN_RSQRTPS_NR,
24581 IX86_BUILTIN_RSQRTSS,
24582 IX86_BUILTIN_RSQRTF,
24583 IX86_BUILTIN_SQRTPS,
24584 IX86_BUILTIN_SQRTPS_NR,
24585 IX86_BUILTIN_SQRTSS,
24586
24587 IX86_BUILTIN_UNPCKHPS,
24588 IX86_BUILTIN_UNPCKLPS,
24589
24590 IX86_BUILTIN_ANDPS,
24591 IX86_BUILTIN_ANDNPS,
24592 IX86_BUILTIN_ORPS,
24593 IX86_BUILTIN_XORPS,
24594
24595 IX86_BUILTIN_EMMS,
24596 IX86_BUILTIN_LDMXCSR,
24597 IX86_BUILTIN_STMXCSR,
24598 IX86_BUILTIN_SFENCE,
24599
24600 /* 3DNow! Original */
24601 IX86_BUILTIN_FEMMS,
24602 IX86_BUILTIN_PAVGUSB,
24603 IX86_BUILTIN_PF2ID,
24604 IX86_BUILTIN_PFACC,
24605 IX86_BUILTIN_PFADD,
24606 IX86_BUILTIN_PFCMPEQ,
24607 IX86_BUILTIN_PFCMPGE,
24608 IX86_BUILTIN_PFCMPGT,
24609 IX86_BUILTIN_PFMAX,
24610 IX86_BUILTIN_PFMIN,
24611 IX86_BUILTIN_PFMUL,
24612 IX86_BUILTIN_PFRCP,
24613 IX86_BUILTIN_PFRCPIT1,
24614 IX86_BUILTIN_PFRCPIT2,
24615 IX86_BUILTIN_PFRSQIT1,
24616 IX86_BUILTIN_PFRSQRT,
24617 IX86_BUILTIN_PFSUB,
24618 IX86_BUILTIN_PFSUBR,
24619 IX86_BUILTIN_PI2FD,
24620 IX86_BUILTIN_PMULHRW,
24621
24622 /* 3DNow! Athlon Extensions */
24623 IX86_BUILTIN_PF2IW,
24624 IX86_BUILTIN_PFNACC,
24625 IX86_BUILTIN_PFPNACC,
24626 IX86_BUILTIN_PI2FW,
24627 IX86_BUILTIN_PSWAPDSI,
24628 IX86_BUILTIN_PSWAPDSF,
24629
24630 /* SSE2 */
24631 IX86_BUILTIN_ADDPD,
24632 IX86_BUILTIN_ADDSD,
24633 IX86_BUILTIN_DIVPD,
24634 IX86_BUILTIN_DIVSD,
24635 IX86_BUILTIN_MULPD,
24636 IX86_BUILTIN_MULSD,
24637 IX86_BUILTIN_SUBPD,
24638 IX86_BUILTIN_SUBSD,
24639
24640 IX86_BUILTIN_CMPEQPD,
24641 IX86_BUILTIN_CMPLTPD,
24642 IX86_BUILTIN_CMPLEPD,
24643 IX86_BUILTIN_CMPGTPD,
24644 IX86_BUILTIN_CMPGEPD,
24645 IX86_BUILTIN_CMPNEQPD,
24646 IX86_BUILTIN_CMPNLTPD,
24647 IX86_BUILTIN_CMPNLEPD,
24648 IX86_BUILTIN_CMPNGTPD,
24649 IX86_BUILTIN_CMPNGEPD,
24650 IX86_BUILTIN_CMPORDPD,
24651 IX86_BUILTIN_CMPUNORDPD,
24652 IX86_BUILTIN_CMPEQSD,
24653 IX86_BUILTIN_CMPLTSD,
24654 IX86_BUILTIN_CMPLESD,
24655 IX86_BUILTIN_CMPNEQSD,
24656 IX86_BUILTIN_CMPNLTSD,
24657 IX86_BUILTIN_CMPNLESD,
24658 IX86_BUILTIN_CMPORDSD,
24659 IX86_BUILTIN_CMPUNORDSD,
24660
24661 IX86_BUILTIN_COMIEQSD,
24662 IX86_BUILTIN_COMILTSD,
24663 IX86_BUILTIN_COMILESD,
24664 IX86_BUILTIN_COMIGTSD,
24665 IX86_BUILTIN_COMIGESD,
24666 IX86_BUILTIN_COMINEQSD,
24667 IX86_BUILTIN_UCOMIEQSD,
24668 IX86_BUILTIN_UCOMILTSD,
24669 IX86_BUILTIN_UCOMILESD,
24670 IX86_BUILTIN_UCOMIGTSD,
24671 IX86_BUILTIN_UCOMIGESD,
24672 IX86_BUILTIN_UCOMINEQSD,
24673
24674 IX86_BUILTIN_MAXPD,
24675 IX86_BUILTIN_MAXSD,
24676 IX86_BUILTIN_MINPD,
24677 IX86_BUILTIN_MINSD,
24678
24679 IX86_BUILTIN_ANDPD,
24680 IX86_BUILTIN_ANDNPD,
24681 IX86_BUILTIN_ORPD,
24682 IX86_BUILTIN_XORPD,
24683
24684 IX86_BUILTIN_SQRTPD,
24685 IX86_BUILTIN_SQRTSD,
24686
24687 IX86_BUILTIN_UNPCKHPD,
24688 IX86_BUILTIN_UNPCKLPD,
24689
24690 IX86_BUILTIN_SHUFPD,
24691
24692 IX86_BUILTIN_LOADUPD,
24693 IX86_BUILTIN_STOREUPD,
24694 IX86_BUILTIN_MOVSD,
24695
24696 IX86_BUILTIN_LOADHPD,
24697 IX86_BUILTIN_LOADLPD,
24698
24699 IX86_BUILTIN_CVTDQ2PD,
24700 IX86_BUILTIN_CVTDQ2PS,
24701
24702 IX86_BUILTIN_CVTPD2DQ,
24703 IX86_BUILTIN_CVTPD2PI,
24704 IX86_BUILTIN_CVTPD2PS,
24705 IX86_BUILTIN_CVTTPD2DQ,
24706 IX86_BUILTIN_CVTTPD2PI,
24707
24708 IX86_BUILTIN_CVTPI2PD,
24709 IX86_BUILTIN_CVTSI2SD,
24710 IX86_BUILTIN_CVTSI642SD,
24711
24712 IX86_BUILTIN_CVTSD2SI,
24713 IX86_BUILTIN_CVTSD2SI64,
24714 IX86_BUILTIN_CVTSD2SS,
24715 IX86_BUILTIN_CVTSS2SD,
24716 IX86_BUILTIN_CVTTSD2SI,
24717 IX86_BUILTIN_CVTTSD2SI64,
24718
24719 IX86_BUILTIN_CVTPS2DQ,
24720 IX86_BUILTIN_CVTPS2PD,
24721 IX86_BUILTIN_CVTTPS2DQ,
24722
24723 IX86_BUILTIN_MOVNTI,
24724 IX86_BUILTIN_MOVNTI64,
24725 IX86_BUILTIN_MOVNTPD,
24726 IX86_BUILTIN_MOVNTDQ,
24727
24728 IX86_BUILTIN_MOVQ128,
24729
24730 /* SSE2 MMX */
24731 IX86_BUILTIN_MASKMOVDQU,
24732 IX86_BUILTIN_MOVMSKPD,
24733 IX86_BUILTIN_PMOVMSKB128,
24734
24735 IX86_BUILTIN_PACKSSWB128,
24736 IX86_BUILTIN_PACKSSDW128,
24737 IX86_BUILTIN_PACKUSWB128,
24738
24739 IX86_BUILTIN_PADDB128,
24740 IX86_BUILTIN_PADDW128,
24741 IX86_BUILTIN_PADDD128,
24742 IX86_BUILTIN_PADDQ128,
24743 IX86_BUILTIN_PADDSB128,
24744 IX86_BUILTIN_PADDSW128,
24745 IX86_BUILTIN_PADDUSB128,
24746 IX86_BUILTIN_PADDUSW128,
24747 IX86_BUILTIN_PSUBB128,
24748 IX86_BUILTIN_PSUBW128,
24749 IX86_BUILTIN_PSUBD128,
24750 IX86_BUILTIN_PSUBQ128,
24751 IX86_BUILTIN_PSUBSB128,
24752 IX86_BUILTIN_PSUBSW128,
24753 IX86_BUILTIN_PSUBUSB128,
24754 IX86_BUILTIN_PSUBUSW128,
24755
24756 IX86_BUILTIN_PAND128,
24757 IX86_BUILTIN_PANDN128,
24758 IX86_BUILTIN_POR128,
24759 IX86_BUILTIN_PXOR128,
24760
24761 IX86_BUILTIN_PAVGB128,
24762 IX86_BUILTIN_PAVGW128,
24763
24764 IX86_BUILTIN_PCMPEQB128,
24765 IX86_BUILTIN_PCMPEQW128,
24766 IX86_BUILTIN_PCMPEQD128,
24767 IX86_BUILTIN_PCMPGTB128,
24768 IX86_BUILTIN_PCMPGTW128,
24769 IX86_BUILTIN_PCMPGTD128,
24770
24771 IX86_BUILTIN_PMADDWD128,
24772
24773 IX86_BUILTIN_PMAXSW128,
24774 IX86_BUILTIN_PMAXUB128,
24775 IX86_BUILTIN_PMINSW128,
24776 IX86_BUILTIN_PMINUB128,
24777
24778 IX86_BUILTIN_PMULUDQ,
24779 IX86_BUILTIN_PMULUDQ128,
24780 IX86_BUILTIN_PMULHUW128,
24781 IX86_BUILTIN_PMULHW128,
24782 IX86_BUILTIN_PMULLW128,
24783
24784 IX86_BUILTIN_PSADBW128,
24785 IX86_BUILTIN_PSHUFHW,
24786 IX86_BUILTIN_PSHUFLW,
24787 IX86_BUILTIN_PSHUFD,
24788
24789 IX86_BUILTIN_PSLLDQI128,
24790 IX86_BUILTIN_PSLLWI128,
24791 IX86_BUILTIN_PSLLDI128,
24792 IX86_BUILTIN_PSLLQI128,
24793 IX86_BUILTIN_PSRAWI128,
24794 IX86_BUILTIN_PSRADI128,
24795 IX86_BUILTIN_PSRLDQI128,
24796 IX86_BUILTIN_PSRLWI128,
24797 IX86_BUILTIN_PSRLDI128,
24798 IX86_BUILTIN_PSRLQI128,
24799
24800 IX86_BUILTIN_PSLLDQ128,
24801 IX86_BUILTIN_PSLLW128,
24802 IX86_BUILTIN_PSLLD128,
24803 IX86_BUILTIN_PSLLQ128,
24804 IX86_BUILTIN_PSRAW128,
24805 IX86_BUILTIN_PSRAD128,
24806 IX86_BUILTIN_PSRLW128,
24807 IX86_BUILTIN_PSRLD128,
24808 IX86_BUILTIN_PSRLQ128,
24809
24810 IX86_BUILTIN_PUNPCKHBW128,
24811 IX86_BUILTIN_PUNPCKHWD128,
24812 IX86_BUILTIN_PUNPCKHDQ128,
24813 IX86_BUILTIN_PUNPCKHQDQ128,
24814 IX86_BUILTIN_PUNPCKLBW128,
24815 IX86_BUILTIN_PUNPCKLWD128,
24816 IX86_BUILTIN_PUNPCKLDQ128,
24817 IX86_BUILTIN_PUNPCKLQDQ128,
24818
24819 IX86_BUILTIN_CLFLUSH,
24820 IX86_BUILTIN_MFENCE,
24821 IX86_BUILTIN_LFENCE,
24822 IX86_BUILTIN_PAUSE,
24823
24824 IX86_BUILTIN_BSRSI,
24825 IX86_BUILTIN_BSRDI,
24826 IX86_BUILTIN_RDPMC,
24827 IX86_BUILTIN_RDTSC,
24828 IX86_BUILTIN_RDTSCP,
24829 IX86_BUILTIN_ROLQI,
24830 IX86_BUILTIN_ROLHI,
24831 IX86_BUILTIN_RORQI,
24832 IX86_BUILTIN_RORHI,
24833
24834 /* SSE3. */
24835 IX86_BUILTIN_ADDSUBPS,
24836 IX86_BUILTIN_HADDPS,
24837 IX86_BUILTIN_HSUBPS,
24838 IX86_BUILTIN_MOVSHDUP,
24839 IX86_BUILTIN_MOVSLDUP,
24840 IX86_BUILTIN_ADDSUBPD,
24841 IX86_BUILTIN_HADDPD,
24842 IX86_BUILTIN_HSUBPD,
24843 IX86_BUILTIN_LDDQU,
24844
24845 IX86_BUILTIN_MONITOR,
24846 IX86_BUILTIN_MWAIT,
24847
24848 /* SSSE3. */
24849 IX86_BUILTIN_PHADDW,
24850 IX86_BUILTIN_PHADDD,
24851 IX86_BUILTIN_PHADDSW,
24852 IX86_BUILTIN_PHSUBW,
24853 IX86_BUILTIN_PHSUBD,
24854 IX86_BUILTIN_PHSUBSW,
24855 IX86_BUILTIN_PMADDUBSW,
24856 IX86_BUILTIN_PMULHRSW,
24857 IX86_BUILTIN_PSHUFB,
24858 IX86_BUILTIN_PSIGNB,
24859 IX86_BUILTIN_PSIGNW,
24860 IX86_BUILTIN_PSIGND,
24861 IX86_BUILTIN_PALIGNR,
24862 IX86_BUILTIN_PABSB,
24863 IX86_BUILTIN_PABSW,
24864 IX86_BUILTIN_PABSD,
24865
24866 IX86_BUILTIN_PHADDW128,
24867 IX86_BUILTIN_PHADDD128,
24868 IX86_BUILTIN_PHADDSW128,
24869 IX86_BUILTIN_PHSUBW128,
24870 IX86_BUILTIN_PHSUBD128,
24871 IX86_BUILTIN_PHSUBSW128,
24872 IX86_BUILTIN_PMADDUBSW128,
24873 IX86_BUILTIN_PMULHRSW128,
24874 IX86_BUILTIN_PSHUFB128,
24875 IX86_BUILTIN_PSIGNB128,
24876 IX86_BUILTIN_PSIGNW128,
24877 IX86_BUILTIN_PSIGND128,
24878 IX86_BUILTIN_PALIGNR128,
24879 IX86_BUILTIN_PABSB128,
24880 IX86_BUILTIN_PABSW128,
24881 IX86_BUILTIN_PABSD128,
24882
24883 /* AMDFAM10 - SSE4A New Instructions. */
24884 IX86_BUILTIN_MOVNTSD,
24885 IX86_BUILTIN_MOVNTSS,
24886 IX86_BUILTIN_EXTRQI,
24887 IX86_BUILTIN_EXTRQ,
24888 IX86_BUILTIN_INSERTQI,
24889 IX86_BUILTIN_INSERTQ,
24890
24891 /* SSE4.1. */
24892 IX86_BUILTIN_BLENDPD,
24893 IX86_BUILTIN_BLENDPS,
24894 IX86_BUILTIN_BLENDVPD,
24895 IX86_BUILTIN_BLENDVPS,
24896 IX86_BUILTIN_PBLENDVB128,
24897 IX86_BUILTIN_PBLENDW128,
24898
24899 IX86_BUILTIN_DPPD,
24900 IX86_BUILTIN_DPPS,
24901
24902 IX86_BUILTIN_INSERTPS128,
24903
24904 IX86_BUILTIN_MOVNTDQA,
24905 IX86_BUILTIN_MPSADBW128,
24906 IX86_BUILTIN_PACKUSDW128,
24907 IX86_BUILTIN_PCMPEQQ,
24908 IX86_BUILTIN_PHMINPOSUW128,
24909
24910 IX86_BUILTIN_PMAXSB128,
24911 IX86_BUILTIN_PMAXSD128,
24912 IX86_BUILTIN_PMAXUD128,
24913 IX86_BUILTIN_PMAXUW128,
24914
24915 IX86_BUILTIN_PMINSB128,
24916 IX86_BUILTIN_PMINSD128,
24917 IX86_BUILTIN_PMINUD128,
24918 IX86_BUILTIN_PMINUW128,
24919
24920 IX86_BUILTIN_PMOVSXBW128,
24921 IX86_BUILTIN_PMOVSXBD128,
24922 IX86_BUILTIN_PMOVSXBQ128,
24923 IX86_BUILTIN_PMOVSXWD128,
24924 IX86_BUILTIN_PMOVSXWQ128,
24925 IX86_BUILTIN_PMOVSXDQ128,
24926
24927 IX86_BUILTIN_PMOVZXBW128,
24928 IX86_BUILTIN_PMOVZXBD128,
24929 IX86_BUILTIN_PMOVZXBQ128,
24930 IX86_BUILTIN_PMOVZXWD128,
24931 IX86_BUILTIN_PMOVZXWQ128,
24932 IX86_BUILTIN_PMOVZXDQ128,
24933
24934 IX86_BUILTIN_PMULDQ128,
24935 IX86_BUILTIN_PMULLD128,
24936
24937 IX86_BUILTIN_ROUNDSD,
24938 IX86_BUILTIN_ROUNDSS,
24939
24940 IX86_BUILTIN_ROUNDPD,
24941 IX86_BUILTIN_ROUNDPS,
24942
24943 IX86_BUILTIN_FLOORPD,
24944 IX86_BUILTIN_CEILPD,
24945 IX86_BUILTIN_TRUNCPD,
24946 IX86_BUILTIN_RINTPD,
24947 IX86_BUILTIN_ROUNDPD_AZ,
24948
24949 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
24950 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
24951 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
24952
24953 IX86_BUILTIN_FLOORPS,
24954 IX86_BUILTIN_CEILPS,
24955 IX86_BUILTIN_TRUNCPS,
24956 IX86_BUILTIN_RINTPS,
24957 IX86_BUILTIN_ROUNDPS_AZ,
24958
24959 IX86_BUILTIN_FLOORPS_SFIX,
24960 IX86_BUILTIN_CEILPS_SFIX,
24961 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
24962
24963 IX86_BUILTIN_PTESTZ,
24964 IX86_BUILTIN_PTESTC,
24965 IX86_BUILTIN_PTESTNZC,
24966
24967 IX86_BUILTIN_VEC_INIT_V2SI,
24968 IX86_BUILTIN_VEC_INIT_V4HI,
24969 IX86_BUILTIN_VEC_INIT_V8QI,
24970 IX86_BUILTIN_VEC_EXT_V2DF,
24971 IX86_BUILTIN_VEC_EXT_V2DI,
24972 IX86_BUILTIN_VEC_EXT_V4SF,
24973 IX86_BUILTIN_VEC_EXT_V4SI,
24974 IX86_BUILTIN_VEC_EXT_V8HI,
24975 IX86_BUILTIN_VEC_EXT_V2SI,
24976 IX86_BUILTIN_VEC_EXT_V4HI,
24977 IX86_BUILTIN_VEC_EXT_V16QI,
24978 IX86_BUILTIN_VEC_SET_V2DI,
24979 IX86_BUILTIN_VEC_SET_V4SF,
24980 IX86_BUILTIN_VEC_SET_V4SI,
24981 IX86_BUILTIN_VEC_SET_V8HI,
24982 IX86_BUILTIN_VEC_SET_V4HI,
24983 IX86_BUILTIN_VEC_SET_V16QI,
24984
24985 IX86_BUILTIN_VEC_PACK_SFIX,
24986 IX86_BUILTIN_VEC_PACK_SFIX256,
24987
24988 /* SSE4.2. */
24989 IX86_BUILTIN_CRC32QI,
24990 IX86_BUILTIN_CRC32HI,
24991 IX86_BUILTIN_CRC32SI,
24992 IX86_BUILTIN_CRC32DI,
24993
24994 IX86_BUILTIN_PCMPESTRI128,
24995 IX86_BUILTIN_PCMPESTRM128,
24996 IX86_BUILTIN_PCMPESTRA128,
24997 IX86_BUILTIN_PCMPESTRC128,
24998 IX86_BUILTIN_PCMPESTRO128,
24999 IX86_BUILTIN_PCMPESTRS128,
25000 IX86_BUILTIN_PCMPESTRZ128,
25001 IX86_BUILTIN_PCMPISTRI128,
25002 IX86_BUILTIN_PCMPISTRM128,
25003 IX86_BUILTIN_PCMPISTRA128,
25004 IX86_BUILTIN_PCMPISTRC128,
25005 IX86_BUILTIN_PCMPISTRO128,
25006 IX86_BUILTIN_PCMPISTRS128,
25007 IX86_BUILTIN_PCMPISTRZ128,
25008
25009 IX86_BUILTIN_PCMPGTQ,
25010
25011 /* AES instructions */
25012 IX86_BUILTIN_AESENC128,
25013 IX86_BUILTIN_AESENCLAST128,
25014 IX86_BUILTIN_AESDEC128,
25015 IX86_BUILTIN_AESDECLAST128,
25016 IX86_BUILTIN_AESIMC128,
25017 IX86_BUILTIN_AESKEYGENASSIST128,
25018
25019 /* PCLMUL instruction */
25020 IX86_BUILTIN_PCLMULQDQ128,
25021
25022 /* AVX */
25023 IX86_BUILTIN_ADDPD256,
25024 IX86_BUILTIN_ADDPS256,
25025 IX86_BUILTIN_ADDSUBPD256,
25026 IX86_BUILTIN_ADDSUBPS256,
25027 IX86_BUILTIN_ANDPD256,
25028 IX86_BUILTIN_ANDPS256,
25029 IX86_BUILTIN_ANDNPD256,
25030 IX86_BUILTIN_ANDNPS256,
25031 IX86_BUILTIN_BLENDPD256,
25032 IX86_BUILTIN_BLENDPS256,
25033 IX86_BUILTIN_BLENDVPD256,
25034 IX86_BUILTIN_BLENDVPS256,
25035 IX86_BUILTIN_DIVPD256,
25036 IX86_BUILTIN_DIVPS256,
25037 IX86_BUILTIN_DPPS256,
25038 IX86_BUILTIN_HADDPD256,
25039 IX86_BUILTIN_HADDPS256,
25040 IX86_BUILTIN_HSUBPD256,
25041 IX86_BUILTIN_HSUBPS256,
25042 IX86_BUILTIN_MAXPD256,
25043 IX86_BUILTIN_MAXPS256,
25044 IX86_BUILTIN_MINPD256,
25045 IX86_BUILTIN_MINPS256,
25046 IX86_BUILTIN_MULPD256,
25047 IX86_BUILTIN_MULPS256,
25048 IX86_BUILTIN_ORPD256,
25049 IX86_BUILTIN_ORPS256,
25050 IX86_BUILTIN_SHUFPD256,
25051 IX86_BUILTIN_SHUFPS256,
25052 IX86_BUILTIN_SUBPD256,
25053 IX86_BUILTIN_SUBPS256,
25054 IX86_BUILTIN_XORPD256,
25055 IX86_BUILTIN_XORPS256,
25056 IX86_BUILTIN_CMPSD,
25057 IX86_BUILTIN_CMPSS,
25058 IX86_BUILTIN_CMPPD,
25059 IX86_BUILTIN_CMPPS,
25060 IX86_BUILTIN_CMPPD256,
25061 IX86_BUILTIN_CMPPS256,
25062 IX86_BUILTIN_CVTDQ2PD256,
25063 IX86_BUILTIN_CVTDQ2PS256,
25064 IX86_BUILTIN_CVTPD2PS256,
25065 IX86_BUILTIN_CVTPS2DQ256,
25066 IX86_BUILTIN_CVTPS2PD256,
25067 IX86_BUILTIN_CVTTPD2DQ256,
25068 IX86_BUILTIN_CVTPD2DQ256,
25069 IX86_BUILTIN_CVTTPS2DQ256,
25070 IX86_BUILTIN_EXTRACTF128PD256,
25071 IX86_BUILTIN_EXTRACTF128PS256,
25072 IX86_BUILTIN_EXTRACTF128SI256,
25073 IX86_BUILTIN_VZEROALL,
25074 IX86_BUILTIN_VZEROUPPER,
25075 IX86_BUILTIN_VPERMILVARPD,
25076 IX86_BUILTIN_VPERMILVARPS,
25077 IX86_BUILTIN_VPERMILVARPD256,
25078 IX86_BUILTIN_VPERMILVARPS256,
25079 IX86_BUILTIN_VPERMILPD,
25080 IX86_BUILTIN_VPERMILPS,
25081 IX86_BUILTIN_VPERMILPD256,
25082 IX86_BUILTIN_VPERMILPS256,
25083 IX86_BUILTIN_VPERMIL2PD,
25084 IX86_BUILTIN_VPERMIL2PS,
25085 IX86_BUILTIN_VPERMIL2PD256,
25086 IX86_BUILTIN_VPERMIL2PS256,
25087 IX86_BUILTIN_VPERM2F128PD256,
25088 IX86_BUILTIN_VPERM2F128PS256,
25089 IX86_BUILTIN_VPERM2F128SI256,
25090 IX86_BUILTIN_VBROADCASTSS,
25091 IX86_BUILTIN_VBROADCASTSD256,
25092 IX86_BUILTIN_VBROADCASTSS256,
25093 IX86_BUILTIN_VBROADCASTPD256,
25094 IX86_BUILTIN_VBROADCASTPS256,
25095 IX86_BUILTIN_VINSERTF128PD256,
25096 IX86_BUILTIN_VINSERTF128PS256,
25097 IX86_BUILTIN_VINSERTF128SI256,
25098 IX86_BUILTIN_LOADUPD256,
25099 IX86_BUILTIN_LOADUPS256,
25100 IX86_BUILTIN_STOREUPD256,
25101 IX86_BUILTIN_STOREUPS256,
25102 IX86_BUILTIN_LDDQU256,
25103 IX86_BUILTIN_MOVNTDQ256,
25104 IX86_BUILTIN_MOVNTPD256,
25105 IX86_BUILTIN_MOVNTPS256,
25106 IX86_BUILTIN_LOADDQU256,
25107 IX86_BUILTIN_STOREDQU256,
25108 IX86_BUILTIN_MASKLOADPD,
25109 IX86_BUILTIN_MASKLOADPS,
25110 IX86_BUILTIN_MASKSTOREPD,
25111 IX86_BUILTIN_MASKSTOREPS,
25112 IX86_BUILTIN_MASKLOADPD256,
25113 IX86_BUILTIN_MASKLOADPS256,
25114 IX86_BUILTIN_MASKSTOREPD256,
25115 IX86_BUILTIN_MASKSTOREPS256,
25116 IX86_BUILTIN_MOVSHDUP256,
25117 IX86_BUILTIN_MOVSLDUP256,
25118 IX86_BUILTIN_MOVDDUP256,
25119
25120 IX86_BUILTIN_SQRTPD256,
25121 IX86_BUILTIN_SQRTPS256,
25122 IX86_BUILTIN_SQRTPS_NR256,
25123 IX86_BUILTIN_RSQRTPS256,
25124 IX86_BUILTIN_RSQRTPS_NR256,
25125
25126 IX86_BUILTIN_RCPPS256,
25127
25128 IX86_BUILTIN_ROUNDPD256,
25129 IX86_BUILTIN_ROUNDPS256,
25130
25131 IX86_BUILTIN_FLOORPD256,
25132 IX86_BUILTIN_CEILPD256,
25133 IX86_BUILTIN_TRUNCPD256,
25134 IX86_BUILTIN_RINTPD256,
25135 IX86_BUILTIN_ROUNDPD_AZ256,
25136
25137 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25138 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25139 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25140
25141 IX86_BUILTIN_FLOORPS256,
25142 IX86_BUILTIN_CEILPS256,
25143 IX86_BUILTIN_TRUNCPS256,
25144 IX86_BUILTIN_RINTPS256,
25145 IX86_BUILTIN_ROUNDPS_AZ256,
25146
25147 IX86_BUILTIN_FLOORPS_SFIX256,
25148 IX86_BUILTIN_CEILPS_SFIX256,
25149 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25150
25151 IX86_BUILTIN_UNPCKHPD256,
25152 IX86_BUILTIN_UNPCKLPD256,
25153 IX86_BUILTIN_UNPCKHPS256,
25154 IX86_BUILTIN_UNPCKLPS256,
25155
25156 IX86_BUILTIN_SI256_SI,
25157 IX86_BUILTIN_PS256_PS,
25158 IX86_BUILTIN_PD256_PD,
25159 IX86_BUILTIN_SI_SI256,
25160 IX86_BUILTIN_PS_PS256,
25161 IX86_BUILTIN_PD_PD256,
25162
25163 IX86_BUILTIN_VTESTZPD,
25164 IX86_BUILTIN_VTESTCPD,
25165 IX86_BUILTIN_VTESTNZCPD,
25166 IX86_BUILTIN_VTESTZPS,
25167 IX86_BUILTIN_VTESTCPS,
25168 IX86_BUILTIN_VTESTNZCPS,
25169 IX86_BUILTIN_VTESTZPD256,
25170 IX86_BUILTIN_VTESTCPD256,
25171 IX86_BUILTIN_VTESTNZCPD256,
25172 IX86_BUILTIN_VTESTZPS256,
25173 IX86_BUILTIN_VTESTCPS256,
25174 IX86_BUILTIN_VTESTNZCPS256,
25175 IX86_BUILTIN_PTESTZ256,
25176 IX86_BUILTIN_PTESTC256,
25177 IX86_BUILTIN_PTESTNZC256,
25178
25179 IX86_BUILTIN_MOVMSKPD256,
25180 IX86_BUILTIN_MOVMSKPS256,
25181
25182 /* AVX2 */
25183 IX86_BUILTIN_MPSADBW256,
25184 IX86_BUILTIN_PABSB256,
25185 IX86_BUILTIN_PABSW256,
25186 IX86_BUILTIN_PABSD256,
25187 IX86_BUILTIN_PACKSSDW256,
25188 IX86_BUILTIN_PACKSSWB256,
25189 IX86_BUILTIN_PACKUSDW256,
25190 IX86_BUILTIN_PACKUSWB256,
25191 IX86_BUILTIN_PADDB256,
25192 IX86_BUILTIN_PADDW256,
25193 IX86_BUILTIN_PADDD256,
25194 IX86_BUILTIN_PADDQ256,
25195 IX86_BUILTIN_PADDSB256,
25196 IX86_BUILTIN_PADDSW256,
25197 IX86_BUILTIN_PADDUSB256,
25198 IX86_BUILTIN_PADDUSW256,
25199 IX86_BUILTIN_PALIGNR256,
25200 IX86_BUILTIN_AND256I,
25201 IX86_BUILTIN_ANDNOT256I,
25202 IX86_BUILTIN_PAVGB256,
25203 IX86_BUILTIN_PAVGW256,
25204 IX86_BUILTIN_PBLENDVB256,
25205 IX86_BUILTIN_PBLENDVW256,
25206 IX86_BUILTIN_PCMPEQB256,
25207 IX86_BUILTIN_PCMPEQW256,
25208 IX86_BUILTIN_PCMPEQD256,
25209 IX86_BUILTIN_PCMPEQQ256,
25210 IX86_BUILTIN_PCMPGTB256,
25211 IX86_BUILTIN_PCMPGTW256,
25212 IX86_BUILTIN_PCMPGTD256,
25213 IX86_BUILTIN_PCMPGTQ256,
25214 IX86_BUILTIN_PHADDW256,
25215 IX86_BUILTIN_PHADDD256,
25216 IX86_BUILTIN_PHADDSW256,
25217 IX86_BUILTIN_PHSUBW256,
25218 IX86_BUILTIN_PHSUBD256,
25219 IX86_BUILTIN_PHSUBSW256,
25220 IX86_BUILTIN_PMADDUBSW256,
25221 IX86_BUILTIN_PMADDWD256,
25222 IX86_BUILTIN_PMAXSB256,
25223 IX86_BUILTIN_PMAXSW256,
25224 IX86_BUILTIN_PMAXSD256,
25225 IX86_BUILTIN_PMAXUB256,
25226 IX86_BUILTIN_PMAXUW256,
25227 IX86_BUILTIN_PMAXUD256,
25228 IX86_BUILTIN_PMINSB256,
25229 IX86_BUILTIN_PMINSW256,
25230 IX86_BUILTIN_PMINSD256,
25231 IX86_BUILTIN_PMINUB256,
25232 IX86_BUILTIN_PMINUW256,
25233 IX86_BUILTIN_PMINUD256,
25234 IX86_BUILTIN_PMOVMSKB256,
25235 IX86_BUILTIN_PMOVSXBW256,
25236 IX86_BUILTIN_PMOVSXBD256,
25237 IX86_BUILTIN_PMOVSXBQ256,
25238 IX86_BUILTIN_PMOVSXWD256,
25239 IX86_BUILTIN_PMOVSXWQ256,
25240 IX86_BUILTIN_PMOVSXDQ256,
25241 IX86_BUILTIN_PMOVZXBW256,
25242 IX86_BUILTIN_PMOVZXBD256,
25243 IX86_BUILTIN_PMOVZXBQ256,
25244 IX86_BUILTIN_PMOVZXWD256,
25245 IX86_BUILTIN_PMOVZXWQ256,
25246 IX86_BUILTIN_PMOVZXDQ256,
25247 IX86_BUILTIN_PMULDQ256,
25248 IX86_BUILTIN_PMULHRSW256,
25249 IX86_BUILTIN_PMULHUW256,
25250 IX86_BUILTIN_PMULHW256,
25251 IX86_BUILTIN_PMULLW256,
25252 IX86_BUILTIN_PMULLD256,
25253 IX86_BUILTIN_PMULUDQ256,
25254 IX86_BUILTIN_POR256,
25255 IX86_BUILTIN_PSADBW256,
25256 IX86_BUILTIN_PSHUFB256,
25257 IX86_BUILTIN_PSHUFD256,
25258 IX86_BUILTIN_PSHUFHW256,
25259 IX86_BUILTIN_PSHUFLW256,
25260 IX86_BUILTIN_PSIGNB256,
25261 IX86_BUILTIN_PSIGNW256,
25262 IX86_BUILTIN_PSIGND256,
25263 IX86_BUILTIN_PSLLDQI256,
25264 IX86_BUILTIN_PSLLWI256,
25265 IX86_BUILTIN_PSLLW256,
25266 IX86_BUILTIN_PSLLDI256,
25267 IX86_BUILTIN_PSLLD256,
25268 IX86_BUILTIN_PSLLQI256,
25269 IX86_BUILTIN_PSLLQ256,
25270 IX86_BUILTIN_PSRAWI256,
25271 IX86_BUILTIN_PSRAW256,
25272 IX86_BUILTIN_PSRADI256,
25273 IX86_BUILTIN_PSRAD256,
25274 IX86_BUILTIN_PSRLDQI256,
25275 IX86_BUILTIN_PSRLWI256,
25276 IX86_BUILTIN_PSRLW256,
25277 IX86_BUILTIN_PSRLDI256,
25278 IX86_BUILTIN_PSRLD256,
25279 IX86_BUILTIN_PSRLQI256,
25280 IX86_BUILTIN_PSRLQ256,
25281 IX86_BUILTIN_PSUBB256,
25282 IX86_BUILTIN_PSUBW256,
25283 IX86_BUILTIN_PSUBD256,
25284 IX86_BUILTIN_PSUBQ256,
25285 IX86_BUILTIN_PSUBSB256,
25286 IX86_BUILTIN_PSUBSW256,
25287 IX86_BUILTIN_PSUBUSB256,
25288 IX86_BUILTIN_PSUBUSW256,
25289 IX86_BUILTIN_PUNPCKHBW256,
25290 IX86_BUILTIN_PUNPCKHWD256,
25291 IX86_BUILTIN_PUNPCKHDQ256,
25292 IX86_BUILTIN_PUNPCKHQDQ256,
25293 IX86_BUILTIN_PUNPCKLBW256,
25294 IX86_BUILTIN_PUNPCKLWD256,
25295 IX86_BUILTIN_PUNPCKLDQ256,
25296 IX86_BUILTIN_PUNPCKLQDQ256,
25297 IX86_BUILTIN_PXOR256,
25298 IX86_BUILTIN_MOVNTDQA256,
25299 IX86_BUILTIN_VBROADCASTSS_PS,
25300 IX86_BUILTIN_VBROADCASTSS_PS256,
25301 IX86_BUILTIN_VBROADCASTSD_PD256,
25302 IX86_BUILTIN_VBROADCASTSI256,
25303 IX86_BUILTIN_PBLENDD256,
25304 IX86_BUILTIN_PBLENDD128,
25305 IX86_BUILTIN_PBROADCASTB256,
25306 IX86_BUILTIN_PBROADCASTW256,
25307 IX86_BUILTIN_PBROADCASTD256,
25308 IX86_BUILTIN_PBROADCASTQ256,
25309 IX86_BUILTIN_PBROADCASTB128,
25310 IX86_BUILTIN_PBROADCASTW128,
25311 IX86_BUILTIN_PBROADCASTD128,
25312 IX86_BUILTIN_PBROADCASTQ128,
25313 IX86_BUILTIN_VPERMVARSI256,
25314 IX86_BUILTIN_VPERMDF256,
25315 IX86_BUILTIN_VPERMVARSF256,
25316 IX86_BUILTIN_VPERMDI256,
25317 IX86_BUILTIN_VPERMTI256,
25318 IX86_BUILTIN_VEXTRACT128I256,
25319 IX86_BUILTIN_VINSERT128I256,
25320 IX86_BUILTIN_MASKLOADD,
25321 IX86_BUILTIN_MASKLOADQ,
25322 IX86_BUILTIN_MASKLOADD256,
25323 IX86_BUILTIN_MASKLOADQ256,
25324 IX86_BUILTIN_MASKSTORED,
25325 IX86_BUILTIN_MASKSTOREQ,
25326 IX86_BUILTIN_MASKSTORED256,
25327 IX86_BUILTIN_MASKSTOREQ256,
25328 IX86_BUILTIN_PSLLVV4DI,
25329 IX86_BUILTIN_PSLLVV2DI,
25330 IX86_BUILTIN_PSLLVV8SI,
25331 IX86_BUILTIN_PSLLVV4SI,
25332 IX86_BUILTIN_PSRAVV8SI,
25333 IX86_BUILTIN_PSRAVV4SI,
25334 IX86_BUILTIN_PSRLVV4DI,
25335 IX86_BUILTIN_PSRLVV2DI,
25336 IX86_BUILTIN_PSRLVV8SI,
25337 IX86_BUILTIN_PSRLVV4SI,
25338
25339 IX86_BUILTIN_GATHERSIV2DF,
25340 IX86_BUILTIN_GATHERSIV4DF,
25341 IX86_BUILTIN_GATHERDIV2DF,
25342 IX86_BUILTIN_GATHERDIV4DF,
25343 IX86_BUILTIN_GATHERSIV4SF,
25344 IX86_BUILTIN_GATHERSIV8SF,
25345 IX86_BUILTIN_GATHERDIV4SF,
25346 IX86_BUILTIN_GATHERDIV8SF,
25347 IX86_BUILTIN_GATHERSIV2DI,
25348 IX86_BUILTIN_GATHERSIV4DI,
25349 IX86_BUILTIN_GATHERDIV2DI,
25350 IX86_BUILTIN_GATHERDIV4DI,
25351 IX86_BUILTIN_GATHERSIV4SI,
25352 IX86_BUILTIN_GATHERSIV8SI,
25353 IX86_BUILTIN_GATHERDIV4SI,
25354 IX86_BUILTIN_GATHERDIV8SI,
25355
25356 /* Alternate 4 element gather for the vectorizer where
25357 all operands are 32-byte wide. */
25358 IX86_BUILTIN_GATHERALTSIV4DF,
25359 IX86_BUILTIN_GATHERALTDIV8SF,
25360 IX86_BUILTIN_GATHERALTSIV4DI,
25361 IX86_BUILTIN_GATHERALTDIV8SI,
25362
25363 /* TFmode support builtins. */
25364 IX86_BUILTIN_INFQ,
25365 IX86_BUILTIN_HUGE_VALQ,
25366 IX86_BUILTIN_FABSQ,
25367 IX86_BUILTIN_COPYSIGNQ,
25368
25369 /* Vectorizer support builtins. */
25370 IX86_BUILTIN_CPYSGNPS,
25371 IX86_BUILTIN_CPYSGNPD,
25372 IX86_BUILTIN_CPYSGNPS256,
25373 IX86_BUILTIN_CPYSGNPD256,
25374
25375 /* FMA4 instructions. */
25376 IX86_BUILTIN_VFMADDSS,
25377 IX86_BUILTIN_VFMADDSD,
25378 IX86_BUILTIN_VFMADDPS,
25379 IX86_BUILTIN_VFMADDPD,
25380 IX86_BUILTIN_VFMADDPS256,
25381 IX86_BUILTIN_VFMADDPD256,
25382 IX86_BUILTIN_VFMADDSUBPS,
25383 IX86_BUILTIN_VFMADDSUBPD,
25384 IX86_BUILTIN_VFMADDSUBPS256,
25385 IX86_BUILTIN_VFMADDSUBPD256,
25386
25387 /* FMA3 instructions. */
25388 IX86_BUILTIN_VFMADDSS3,
25389 IX86_BUILTIN_VFMADDSD3,
25390
25391 /* XOP instructions. */
25392 IX86_BUILTIN_VPCMOV,
25393 IX86_BUILTIN_VPCMOV_V2DI,
25394 IX86_BUILTIN_VPCMOV_V4SI,
25395 IX86_BUILTIN_VPCMOV_V8HI,
25396 IX86_BUILTIN_VPCMOV_V16QI,
25397 IX86_BUILTIN_VPCMOV_V4SF,
25398 IX86_BUILTIN_VPCMOV_V2DF,
25399 IX86_BUILTIN_VPCMOV256,
25400 IX86_BUILTIN_VPCMOV_V4DI256,
25401 IX86_BUILTIN_VPCMOV_V8SI256,
25402 IX86_BUILTIN_VPCMOV_V16HI256,
25403 IX86_BUILTIN_VPCMOV_V32QI256,
25404 IX86_BUILTIN_VPCMOV_V8SF256,
25405 IX86_BUILTIN_VPCMOV_V4DF256,
25406
25407 IX86_BUILTIN_VPPERM,
25408
25409 IX86_BUILTIN_VPMACSSWW,
25410 IX86_BUILTIN_VPMACSWW,
25411 IX86_BUILTIN_VPMACSSWD,
25412 IX86_BUILTIN_VPMACSWD,
25413 IX86_BUILTIN_VPMACSSDD,
25414 IX86_BUILTIN_VPMACSDD,
25415 IX86_BUILTIN_VPMACSSDQL,
25416 IX86_BUILTIN_VPMACSSDQH,
25417 IX86_BUILTIN_VPMACSDQL,
25418 IX86_BUILTIN_VPMACSDQH,
25419 IX86_BUILTIN_VPMADCSSWD,
25420 IX86_BUILTIN_VPMADCSWD,
25421
25422 IX86_BUILTIN_VPHADDBW,
25423 IX86_BUILTIN_VPHADDBD,
25424 IX86_BUILTIN_VPHADDBQ,
25425 IX86_BUILTIN_VPHADDWD,
25426 IX86_BUILTIN_VPHADDWQ,
25427 IX86_BUILTIN_VPHADDDQ,
25428 IX86_BUILTIN_VPHADDUBW,
25429 IX86_BUILTIN_VPHADDUBD,
25430 IX86_BUILTIN_VPHADDUBQ,
25431 IX86_BUILTIN_VPHADDUWD,
25432 IX86_BUILTIN_VPHADDUWQ,
25433 IX86_BUILTIN_VPHADDUDQ,
25434 IX86_BUILTIN_VPHSUBBW,
25435 IX86_BUILTIN_VPHSUBWD,
25436 IX86_BUILTIN_VPHSUBDQ,
25437
25438 IX86_BUILTIN_VPROTB,
25439 IX86_BUILTIN_VPROTW,
25440 IX86_BUILTIN_VPROTD,
25441 IX86_BUILTIN_VPROTQ,
25442 IX86_BUILTIN_VPROTB_IMM,
25443 IX86_BUILTIN_VPROTW_IMM,
25444 IX86_BUILTIN_VPROTD_IMM,
25445 IX86_BUILTIN_VPROTQ_IMM,
25446
25447 IX86_BUILTIN_VPSHLB,
25448 IX86_BUILTIN_VPSHLW,
25449 IX86_BUILTIN_VPSHLD,
25450 IX86_BUILTIN_VPSHLQ,
25451 IX86_BUILTIN_VPSHAB,
25452 IX86_BUILTIN_VPSHAW,
25453 IX86_BUILTIN_VPSHAD,
25454 IX86_BUILTIN_VPSHAQ,
25455
25456 IX86_BUILTIN_VFRCZSS,
25457 IX86_BUILTIN_VFRCZSD,
25458 IX86_BUILTIN_VFRCZPS,
25459 IX86_BUILTIN_VFRCZPD,
25460 IX86_BUILTIN_VFRCZPS256,
25461 IX86_BUILTIN_VFRCZPD256,
25462
25463 IX86_BUILTIN_VPCOMEQUB,
25464 IX86_BUILTIN_VPCOMNEUB,
25465 IX86_BUILTIN_VPCOMLTUB,
25466 IX86_BUILTIN_VPCOMLEUB,
25467 IX86_BUILTIN_VPCOMGTUB,
25468 IX86_BUILTIN_VPCOMGEUB,
25469 IX86_BUILTIN_VPCOMFALSEUB,
25470 IX86_BUILTIN_VPCOMTRUEUB,
25471
25472 IX86_BUILTIN_VPCOMEQUW,
25473 IX86_BUILTIN_VPCOMNEUW,
25474 IX86_BUILTIN_VPCOMLTUW,
25475 IX86_BUILTIN_VPCOMLEUW,
25476 IX86_BUILTIN_VPCOMGTUW,
25477 IX86_BUILTIN_VPCOMGEUW,
25478 IX86_BUILTIN_VPCOMFALSEUW,
25479 IX86_BUILTIN_VPCOMTRUEUW,
25480
25481 IX86_BUILTIN_VPCOMEQUD,
25482 IX86_BUILTIN_VPCOMNEUD,
25483 IX86_BUILTIN_VPCOMLTUD,
25484 IX86_BUILTIN_VPCOMLEUD,
25485 IX86_BUILTIN_VPCOMGTUD,
25486 IX86_BUILTIN_VPCOMGEUD,
25487 IX86_BUILTIN_VPCOMFALSEUD,
25488 IX86_BUILTIN_VPCOMTRUEUD,
25489
25490 IX86_BUILTIN_VPCOMEQUQ,
25491 IX86_BUILTIN_VPCOMNEUQ,
25492 IX86_BUILTIN_VPCOMLTUQ,
25493 IX86_BUILTIN_VPCOMLEUQ,
25494 IX86_BUILTIN_VPCOMGTUQ,
25495 IX86_BUILTIN_VPCOMGEUQ,
25496 IX86_BUILTIN_VPCOMFALSEUQ,
25497 IX86_BUILTIN_VPCOMTRUEUQ,
25498
25499 IX86_BUILTIN_VPCOMEQB,
25500 IX86_BUILTIN_VPCOMNEB,
25501 IX86_BUILTIN_VPCOMLTB,
25502 IX86_BUILTIN_VPCOMLEB,
25503 IX86_BUILTIN_VPCOMGTB,
25504 IX86_BUILTIN_VPCOMGEB,
25505 IX86_BUILTIN_VPCOMFALSEB,
25506 IX86_BUILTIN_VPCOMTRUEB,
25507
25508 IX86_BUILTIN_VPCOMEQW,
25509 IX86_BUILTIN_VPCOMNEW,
25510 IX86_BUILTIN_VPCOMLTW,
25511 IX86_BUILTIN_VPCOMLEW,
25512 IX86_BUILTIN_VPCOMGTW,
25513 IX86_BUILTIN_VPCOMGEW,
25514 IX86_BUILTIN_VPCOMFALSEW,
25515 IX86_BUILTIN_VPCOMTRUEW,
25516
25517 IX86_BUILTIN_VPCOMEQD,
25518 IX86_BUILTIN_VPCOMNED,
25519 IX86_BUILTIN_VPCOMLTD,
25520 IX86_BUILTIN_VPCOMLED,
25521 IX86_BUILTIN_VPCOMGTD,
25522 IX86_BUILTIN_VPCOMGED,
25523 IX86_BUILTIN_VPCOMFALSED,
25524 IX86_BUILTIN_VPCOMTRUED,
25525
25526 IX86_BUILTIN_VPCOMEQQ,
25527 IX86_BUILTIN_VPCOMNEQ,
25528 IX86_BUILTIN_VPCOMLTQ,
25529 IX86_BUILTIN_VPCOMLEQ,
25530 IX86_BUILTIN_VPCOMGTQ,
25531 IX86_BUILTIN_VPCOMGEQ,
25532 IX86_BUILTIN_VPCOMFALSEQ,
25533 IX86_BUILTIN_VPCOMTRUEQ,
25534
25535 /* LWP instructions. */
25536 IX86_BUILTIN_LLWPCB,
25537 IX86_BUILTIN_SLWPCB,
25538 IX86_BUILTIN_LWPVAL32,
25539 IX86_BUILTIN_LWPVAL64,
25540 IX86_BUILTIN_LWPINS32,
25541 IX86_BUILTIN_LWPINS64,
25542
25543 IX86_BUILTIN_CLZS,
25544
25545 /* BMI instructions. */
25546 IX86_BUILTIN_BEXTR32,
25547 IX86_BUILTIN_BEXTR64,
25548 IX86_BUILTIN_CTZS,
25549
25550 /* TBM instructions. */
25551 IX86_BUILTIN_BEXTRI32,
25552 IX86_BUILTIN_BEXTRI64,
25553
25554 /* BMI2 instructions. */
25555 IX86_BUILTIN_BZHI32,
25556 IX86_BUILTIN_BZHI64,
25557 IX86_BUILTIN_PDEP32,
25558 IX86_BUILTIN_PDEP64,
25559 IX86_BUILTIN_PEXT32,
25560 IX86_BUILTIN_PEXT64,
25561
25562 /* FSGSBASE instructions. */
25563 IX86_BUILTIN_RDFSBASE32,
25564 IX86_BUILTIN_RDFSBASE64,
25565 IX86_BUILTIN_RDGSBASE32,
25566 IX86_BUILTIN_RDGSBASE64,
25567 IX86_BUILTIN_WRFSBASE32,
25568 IX86_BUILTIN_WRFSBASE64,
25569 IX86_BUILTIN_WRGSBASE32,
25570 IX86_BUILTIN_WRGSBASE64,
25571
25572 /* RDRND instructions. */
25573 IX86_BUILTIN_RDRAND16_STEP,
25574 IX86_BUILTIN_RDRAND32_STEP,
25575 IX86_BUILTIN_RDRAND64_STEP,
25576
25577 /* F16C instructions. */
25578 IX86_BUILTIN_CVTPH2PS,
25579 IX86_BUILTIN_CVTPH2PS256,
25580 IX86_BUILTIN_CVTPS2PH,
25581 IX86_BUILTIN_CVTPS2PH256,
25582
25583 /* CFString built-in for Darwin. */
25584 IX86_BUILTIN_CFSTRING,
25585
25586 IX86_BUILTIN_MAX
25587 };
25588
25589 /* Table for the ix86 builtin decls. */
25590 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25591
25592 /* Table of all of the builtin functions that are possible with different ISAs
25593 but are waiting to be built until a function is declared to use that
25594 ISA. */
25595 struct builtin_isa {
25596 const char *name; /* function name */
25597 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25598 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25599 bool const_p; /* true if the declaration is constant */
25600 bool set_and_not_built_p; /* true if deferred: recorded but not yet built */
25601 };
25602
25603 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
25604
25605
25606 /* Add an ix86 target builtin function with MASK, NAME, TCODE and CODE. Save
25607 MASK, the set of isa_flags this builtin needs, in the ix86_builtins_isa
25608 array. Store the function decl in the ix86_builtins array. Return the
25609 function decl, or NULL_TREE if the builtin was not added.
25610
25611 If the front end has a special hook for builtin functions, delay adding
25612 builtin functions that aren't in the current ISA until the ISA is changed
25613 with function specific optimization. Doing so can save about 300K for the
25614 default compiler. When the builtin is expanded, check at that time whether
25615 it is valid.
25616
25617 If the front end doesn't have a special hook, record all builtins, even
25618 those that aren't in the current ISA, in case the user uses function
25619 specific options for a different ISA, so that we don't get scope errors
25620 if a builtin is added in the middle of a function scope. */
25621
25622 static inline tree
25623 def_builtin (HOST_WIDE_INT mask, const char *name,
25624 enum ix86_builtin_func_type tcode,
25625 enum ix86_builtins code)
25626 {
25627 tree decl = NULL_TREE;
25628
25629 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25630 {
25631 ix86_builtins_isa[(int) code].isa = mask;
25632
25633 mask &= ~OPTION_MASK_ISA_64BIT;
25634 if (mask == 0
25635 || (mask & ix86_isa_flags) != 0
25636 || (lang_hooks.builtin_function
25637 == lang_hooks.builtin_function_ext_scope))
25639 {
25640 tree type = ix86_get_builtin_func_type (tcode);
25641 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25642 NULL, NULL_TREE);
25643 ix86_builtins[(int) code] = decl;
25644 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25645 }
25646 else
25647 {
25648 ix86_builtins[(int) code] = NULL_TREE;
25649 ix86_builtins_isa[(int) code].tcode = tcode;
25650 ix86_builtins_isa[(int) code].name = name;
25651 ix86_builtins_isa[(int) code].const_p = false;
25652 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25653 }
25654 }
25655
25656 return decl;
25657 }
25658
25659 /* Like def_builtin, but also marks the function decl "const". */
25660
25661 static inline tree
25662 def_builtin_const (HOST_WIDE_INT mask, const char *name,
25663 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
25664 {
25665 tree decl = def_builtin (mask, name, tcode, code);
25666 if (decl)
25667 TREE_READONLY (decl) = 1;
25668 else
25669 ix86_builtins_isa[(int) code].const_p = true;
25670
25671 return decl;
25672 }
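
/* Illustrative sketch, not part of the original sources: a typical use of
   the two helpers above when registering a target builtin. The mask, name,
   type and code below are taken from the tables later in this file and are
   shown only as an example:

     def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_addps",
                        V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_ADDPS);

   If SSE is enabled (or the front end uses the external-scope builtin
   hook), the decl is built immediately; otherwise it is only recorded in
   ix86_builtins_isa and built later by ix86_add_new_builtins once the ISA
   becomes available. */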
25673
25674 /* Add any new builtin functions for a given ISA that may not have been
25675 declared. This saves a bit of space compared to adding all of the
25676 declarations to the tree even when they are not used. */
25677
25678 static void
25679 ix86_add_new_builtins (HOST_WIDE_INT isa)
25680 {
25681 int i;
25682
25683 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
25684 {
25685 if ((ix86_builtins_isa[i].isa & isa) != 0
25686 && ix86_builtins_isa[i].set_and_not_built_p)
25687 {
25688 tree decl, type;
25689
25690 /* Don't define the builtin again. */
25691 ix86_builtins_isa[i].set_and_not_built_p = false;
25692
25693 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
25694 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
25695 type, i, BUILT_IN_MD, NULL,
25696 NULL_TREE);
25697
25698 ix86_builtins[i] = decl;
25699 if (ix86_builtins_isa[i].const_p)
25700 TREE_READONLY (decl) = 1;
25701 }
25702 }
25703 }
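
/* Illustrative sketch, an assumption rather than part of the original
   sources: the deferred declarations recorded by def_builtin are typically
   materialized when the effective ISA changes, e.g. while processing a
   function-specific option or a target attribute such as
   __attribute__((target("avx"))). The option-handling code can then simply
   pass the newly enabled flags here:

     ix86_add_new_builtins (ix86_isa_flags);

   Only entries whose set_and_not_built_p flag is still true and whose isa
   mask intersects the passed flags are built. */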
25704
25705 /* Bits for builtin_description.flag. */
25706
25707 /* Set when we don't support the comparison natively, and should
25708 swap the comparison operands in order to support it. */
25709 #define BUILTIN_DESC_SWAP_OPERANDS 1
25710
25711 struct builtin_description
25712 {
25713 const HOST_WIDE_INT mask;
25714 const enum insn_code icode;
25715 const char *const name;
25716 const enum ix86_builtins code;
25717 const enum rtx_code comparison;
25718 const int flag;
25719 };
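
/* Illustrative sketch, an assumption rather than part of the original
   sources: the bdesc_* tables below are walked when the builtins are
   initialized and each entry is registered through the helpers above, with
   the flag field carrying either the function type or, as for bdesc_comi,
   comparison information such as BUILTIN_DESC_SWAP_OPERANDS. Roughly:

     const struct builtin_description *d;
     size_t i;

     for (i = 0, d = bdesc_args; i < ARRAY_SIZE (bdesc_args); i++, d++)
       if (d->name)
         def_builtin_const (d->mask, d->name,
                            (enum ix86_builtin_func_type) d->flag, d->code);
  */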
25720
25721 static const struct builtin_description bdesc_comi[] =
25722 {
25723 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
25724 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
25725 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
25726 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
25727 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
25728 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
25729 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
25730 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
25731 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
25732 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
25733 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
25734 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
25735 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
25736 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
25737 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
25738 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
25739 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
25740 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
25741 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
25742 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
25743 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
25744 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
25745 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
25746 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
25747 };
25748
25749 static const struct builtin_description bdesc_pcmpestr[] =
25750 {
25751 /* SSE4.2 */
25752 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
25753 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
25754 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
25755 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
25756 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
25757 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
25758 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
25759 };
25760
25761 static const struct builtin_description bdesc_pcmpistr[] =
25762 {
25763 /* SSE4.2 */
25764 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
25765 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
25766 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
25767 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
25768 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
25769 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
25770 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
25771 };
25772
25773 /* Special builtins with variable number of arguments. */
25774 static const struct builtin_description bdesc_special_args[] =
25775 {
25776 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
25777 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
25778 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
25779
25780 /* MMX */
25781 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25782
25783 /* 3DNow! */
25784 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25785
25786 /* SSE */
25787 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25788 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25789 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25790
25791 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25792 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25793 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25794 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25795
25796 /* SSE or 3DNow!A */
25797 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25798 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
25799
25800 /* SSE2 */
25801 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25802 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25803 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25804 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
25805 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25806 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
25807 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
25808 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
25809 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
25810 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25811
25812 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25813 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25814
25815 /* SSE3 */
25816 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25817
25818 /* SSE4.1 */
25819 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
25820
25821 /* SSE4A */
25822 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25823 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25824
25825 /* AVX */
25826 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
25827 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
25828
25829 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25830 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25831 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25832 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
25833 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
25834
25835 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25836 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25837 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25838 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25839 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25840 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
25841 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25842
25843 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
25844 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25845 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25846
25847 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
25848 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
25849 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
25850 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
25851 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
25852 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
25853 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
25854 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
25855
25856 /* AVX2 */
25857 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
25858 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
25859 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
25860 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
25861 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
25862 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
25863 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
25864 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
25865 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
25866
25867 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
25868 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
25869 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
25870 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
25871 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
25872 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
25873
25874 /* FSGSBASE */
25875 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25876 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25877 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25878 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25879 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25880 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25881 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25882 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25883 };
25884
25885 /* Builtins with variable number of arguments. */
25886 static const struct builtin_description bdesc_args[] =
25887 {
25888 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
25889 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
25890 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
25891 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
25892 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
25893 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
25894 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
25895
25896 /* MMX */
25897 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25898 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25899 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25900 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25901 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25902 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25903
25904 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25905 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25906 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25907 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25908 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25909 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25910 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25911 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25912
25913 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25914 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25915
25916 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25917 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25918 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25919 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25920
25921 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25922 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25923 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25924 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25925 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25926 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25927
25928 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25929 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25930 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25931 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25932 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25933 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25934
25935 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
25936 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
25937 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
25938
25939 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
25940
25941 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25942 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25943 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
25944 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25945 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25946 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
25947
25948 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25949 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25950 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
25951 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25952 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25953 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
25954
25955 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25956 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25957 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25958 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25959
25960 /* 3DNow! */
25961 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
25962 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
25963 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25964 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25965
25966 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25967 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25968 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25969 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
25970 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
25971 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
25972 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25973 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25974 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25975 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25976 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25977 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25978 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25979 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25980 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25981
25982 /* 3DNow!A */
25983 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
25984 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
25985 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
25986 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25987 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25988 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
25989
25990 /* SSE */
25991 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
25992 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25993 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25994 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25995 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25996 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
25997 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
25998 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
25999 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26000 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26001 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26002 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26003
26004 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26005
26006 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26007 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26008 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26009 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26010 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26011 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26012 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26013 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26014
26015 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26016 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26017 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26018 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26019 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26020 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26021 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26022 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26023 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26024 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26025 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26026 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26027 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26028 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26029 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26030 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26031 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26032 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26033 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26034 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26035 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26036 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26037
26038 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26039 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26040 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26041 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26042
26043 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26044 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26045 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26046 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26047
26048 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26049
26050 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26051 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26052 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26053 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26054 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26055
26056 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26057 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26058 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26059
26060 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26061
26062 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26063 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26064 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26065
26066 /* SSE MMX or 3DNow!A */
26067 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26068 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26069 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26070
26071 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26072 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26073 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26074 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26075
26076 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26077 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26078
26079 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26080
26081 /* SSE2 */
26082 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26083
26084 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26085 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26086 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26087 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26088 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26089
26090 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26091 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26092 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26093 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26094 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26095
26096 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26097
26098 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26099 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26100 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26101 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26102
26103 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26104 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26105 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26106
26107 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26108 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26109 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26110 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26111 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26112 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26113 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26114 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26115
26116 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26117 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26118 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26119 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26120 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26121 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26122 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26123 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26124 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26125 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26126 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26127 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26128 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26129 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26130 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26131 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26132 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26133 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26134 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26135 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26136
26137 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26138 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26139 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26140 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26141
26142 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26143 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26144 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26145 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26146
26147 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26148
26149 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26150 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26151 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26152
26153 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26154
26155 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26156 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26157 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26158 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26159 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26160 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26161 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26162 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26163
26164 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26165 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26166 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26167 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26168 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26169 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26170 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26171 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26172
26173 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26174 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26175
26176 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26177 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26178 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26179 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26180
26181 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26182 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26183
26184 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26185 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26186 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26187 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26188 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26189 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26190
26191 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26192 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26193 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26194 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26195
26196 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26197 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26198 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26199 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26200 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26201 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26202 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26203 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26204
26205 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26206 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26207 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26208
26209 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26210 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26211
26212 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26213 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26214
26215 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26216
26217 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26218 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26219 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26220 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26221
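  /* Shift builtins below: the *_INT_CONVERT function types mark the
     whole-register byte shifts, whose operands are reinterpreted in the
     insn's wider mode (V1TImode here); the *_COUNT types mark a final
     shift-count operand that may be either an 8-bit immediate or an
     XMM register.  */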
26222 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26223 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26224 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26225 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26226 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26227 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26228 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26229
26230 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26231 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26232 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26233 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26234 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26235 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26236 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26237
26238 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26239 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26240 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26241 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26242
26243 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26244 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26245 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26246
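  /* V2DF_FTYPE_V2DF_VEC_MERGE: sqrtsd takes a single user operand; the
     expander reuses it for the vec_merge pass-through operand, so the
     upper element of the result is copied from the argument.  */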
26247 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26248
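  /* Entries with a null name are skipped when this table is walked to
     register builtins (they are declared elsewhere), but they are still
     matched by their IX86_BUILTIN code when a call is expanded.  */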
26249 { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26250 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26251
26252 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26253
26254 /* SSE2 MMX */
26255 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26256 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26257
26258 /* SSE3 */
26259 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26260 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26261
26262 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26263 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26264 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26265 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26266 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26267 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26268
26269 /* SSSE3 */
26270 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26271 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26272 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26273 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26274 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26275 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26276
26277 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26278 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26279 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26280 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26281 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26282 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26283 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26284 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26285 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26286 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26287 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26288 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26289 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26290 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26291 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26292 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26293 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26294 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26295 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26296 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26297 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26298 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26299 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26300 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26301
26302 /* SSSE3 (palignr) */
26303 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26304 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26305
26306 /* SSE4.1 */
26307 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26308 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26309 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26310 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26311 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26312 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26313 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26314 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26315 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26316 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26317
26318 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26319 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26320 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26321 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26322 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26323 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26324 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26325 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26326 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26327 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26328 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26329 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26330 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26331
26332 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26333 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26334 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26335 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26336 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26337 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26338 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26339 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26340 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26341 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26342 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26343 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26344
26345 /* SSE4.1 (round and ptest) */
26346 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26347 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26348 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26349 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26350
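  /* The floor/ceil/trunc/rint variants all reuse CODE_FOR_sse4_1_roundpd;
     the rounding mode (ROUND_FLOOR, ROUND_CEIL, ...) is carried in the
     descriptor's comparison-code slot and, via the *_ROUND function types,
     becomes the immediate operand of the round insn.  */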
26351 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26352 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26353 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26354 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26355
26356 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26357 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26358
26359 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26360 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26361
26362 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26363 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26364 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26365 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26366
26367 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26368 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26369
26370 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26371 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26372
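  /* For ptest (and likewise the AVX vtest/ptest256 entries further down)
     the comparison-code slot selects which flag of the PTEST result is
     tested: EQ -> ZF (testz), LTU -> CF (testc), GTU -> neither flag set
     (testnzc).  */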
26373 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26374 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26375 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26376
26377 /* SSE4.2 */
26378 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26379 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26380 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26381 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26382 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26383
26384 /* SSE4A */
26385 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26386 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26387 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26388 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26389
26390 /* AES */
26391 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26392 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26393
26394 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26395 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26396 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26397 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26398
26399 /* PCLMUL */
26400 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26401
26402 /* AVX */
26403 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26404 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26405 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26406 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26407 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26408 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26409 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26410 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26411 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26412 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26413 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26414 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26415 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26416 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26417 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26418 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26419 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26420 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26421 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26422 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26423 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26424 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26425 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26426 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26427 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26428 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26429
26430 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26431 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26432 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26433 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26434
26435 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26436 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26437 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26438 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26439 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26440 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26441 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26442 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26443 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26444 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26445 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26446 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26447 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26448 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26449 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26450 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26451 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26452 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26453 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26454 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26455 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26456 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26457 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26458 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26459 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26460 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26461 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26462 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26463 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26464 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26465 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26466 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26467 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26468 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26469
26470 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26471 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26472 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26473
26474 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26475 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26476 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26477 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26478 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26479
26480 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26481
26482 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26483 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26484
26485 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26486 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26487 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26488 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26489
26490 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26491 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26492
26493 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26494 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26495
26496 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26497 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26498 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26499 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26500
26501 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
26502 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
26503
26504 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26505 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26506
26507 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26508 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26509 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26510 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26511
26512 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26513 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26514 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26515 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26516 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26517 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26518
26519 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26520 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26521 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26522 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26523 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26524 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26525 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26526 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26527 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26528 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26529 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26530 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26531 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26532 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26533 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26534
26535 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26536 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26537
26538 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26539 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26540
26541 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26542
26543 /* AVX2 */
26544 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26545 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26546 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26547 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26548 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26549 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26550 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26551 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26552 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26553 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26554 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26555 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26556 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26557 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26558 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26559 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26560 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26561 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26562 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26563 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26564 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26565 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26566 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26567 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26568 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26569 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26570 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26571 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26572 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26573 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26574 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26575 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26576 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26577 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26578 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26579 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26580 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26581 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26582 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26583 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26584 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26585 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26586 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26587 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26588 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26589 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26590 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26591 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26592 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26593 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26594 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26595 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26596 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26597 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26598 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26599 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26600 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26601 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26602 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26603 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26604 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26605 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26606 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26607 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26608 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26609 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26610 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26611 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26612 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26613 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26614 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26615 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26616 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26617 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26618 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26619 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26620 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26621 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26622 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26623 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26624 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26625 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26626 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26627 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26628 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26629 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26630 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26631 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26632 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26633 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26634 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26635 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26636 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26637 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26638 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26639 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26640 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26641 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26642 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26643 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26644 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26645 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26646 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26647 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26648 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26649 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26650 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26651 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26652 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26653 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26654 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26655 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26656 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26657 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26658 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26659 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26660 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26661 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26662 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26663 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
26664 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26665 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
26666 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
26667 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26668 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26669 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26670 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26671 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26672 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26673 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26674 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26675 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26676 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
26677 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
26678 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
26679 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
26680 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26681 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26682 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26683 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26684 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26685 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26686 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26687 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26688 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26689 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26690
26691 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26692
26693 /* BMI */
26694 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26695 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26696 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26697
26698 /* TBM */
26699 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26700 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26701
26702 /* F16C */
26703 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
26704 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
26705 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
26706 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
26707
26708 /* BMI2 */
26709 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26710 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26711 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26712 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26713 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26714 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26715 };
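/* For illustration (a reading aid, not part of the original table): each
   bdesc_args entry describes one ordinary builtin, and the
   ix86_builtin_func_type in its last field encodes the C prototype the
   builtin is registered with.  The BMI entry

     { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs",
       IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 }

   is therefore registered by ix86_init_mmx_sse_builtins below as roughly

     unsigned short __builtin_ctzs (unsigned short);

   i.e. UINT16_FTYPE_UINT16 reads "returns a 16-bit unsigned value, takes
   one 16-bit unsigned argument".  */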
26716
26717 /* FMA4 and XOP. */
26718 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
26719 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
26720 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
26721 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
26722 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
26723 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
26724 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
26725 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
26726 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
26727 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
26728 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
26729 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
26730 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
26731 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
26732 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
26733 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
26734 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
26735 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
26736 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
26737 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
26738 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
26739 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
26740 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
26741 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
26742 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
26743 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
26744 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
26745 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
26746 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
26747 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
26748 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
26749 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
26750 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
26751 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
26752 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
26753 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
26754 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
26755 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
26756 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
26757 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
26758 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
26759 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
26760 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
26761 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
26762 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
26763 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
26764 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
26765 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
26766 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
26767 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
26768 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
26769 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
26770
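/* A reading aid for the MULTI_ARG_* aliases above (illustrative only):
   each alias is shorthand for an ix86_builtin_func_type, so e.g.
   MULTI_ARG_3_SF stands for V4SF_FTYPE_V4SF_V4SF_V4SF and means the
   builtin takes three V4SF vector operands and returns a V4SF result.
   The _CMP and _TF variants do not add an operand; the extra comparison
   code (EQ, LTU, PCOM_FALSE, ...) is carried in the table entry's
   comparison field instead.  */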
26771 static const struct builtin_description bdesc_multi_arg[] =
26772 {
26773 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
26774 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
26775 UNKNOWN, (int)MULTI_ARG_3_SF },
26776 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
26777 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
26778 UNKNOWN, (int)MULTI_ARG_3_DF },
26779
26780 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
26781 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
26782 UNKNOWN, (int)MULTI_ARG_3_SF },
26783 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
26784 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
26785 UNKNOWN, (int)MULTI_ARG_3_DF },
26786
26787 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
26788 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
26789 UNKNOWN, (int)MULTI_ARG_3_SF },
26790 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
26791 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
26792 UNKNOWN, (int)MULTI_ARG_3_DF },
26793 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
26794 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
26795 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26796 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
26797 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
26798 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26799
26800 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
26801 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
26802 UNKNOWN, (int)MULTI_ARG_3_SF },
26803 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
26804 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
26805 UNKNOWN, (int)MULTI_ARG_3_DF },
26806 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
26807 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
26808 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26809 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
26810 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
26811 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26812
26813 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
26814 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
26815 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
26816 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
26817 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
26818 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
26819 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
26820
26821 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26822 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26823 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
26824 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
26825 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
26826 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
26827 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
26828
26829 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
26830
26831 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26832 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26833 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26834 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26835 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26836 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26837 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26838 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26839 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26840 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26841 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26842 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26843
26844 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26845 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
26846 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
26847 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
26848 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
26849 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
26850 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
26851 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
26852 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26853 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
26854 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
26855 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
26856 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26857 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
26858 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
26859 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
26860
26861 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
26862 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
26863 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
26864 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
26865 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
26866 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
26867
26868 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26869 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26870 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26871 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26872 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26873 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26874 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26875 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26876 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26877 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26878 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26879 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26880 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26881 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26882 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26883
26884 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
26885 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26886 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26887 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
26888 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
26889 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
26890 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
26891
26892 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
26893 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
26894 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
26895 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
26896 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
26897 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
26898 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
26899
26900 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
26901 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
26902 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
26903 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
26904 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
26905 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
26906 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
26907
26908 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
26909 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
26910 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
26911 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
26912 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
26913 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
26914 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
26915
26916 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
26917 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
26918 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
26919 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
26920 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
26921 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
26922 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
26923
26924 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
26925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
26926 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
26927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
26928 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
26929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
26930 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
26931
26932 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
26933 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
26934 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
26935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
26936 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
26937 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
26938 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
26939
26940 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
26941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
26942 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
26943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
26944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
26945 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
26946 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
26947
26948 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
26949 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
26950 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
26951 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
26952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub", IX86_BUILTIN_VPCOMFALSEUB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
26953 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw", IX86_BUILTIN_VPCOMFALSEUW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
26954 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud", IX86_BUILTIN_VPCOMFALSEUD, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
26955 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq", IX86_BUILTIN_VPCOMFALSEUQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
26956
26957 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
26958 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
26959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
26960 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
26961 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
26962 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
26963 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
26964 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
26965
26966 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
26967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
26968 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
26969 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
26970
26971 };
26972 \f
26973 /* TM vector builtins. */
26974
26975 /* Reuse the existing x86-specific `struct builtin_description' rather than
26976 introducing a new structure; add casts to make the TM builtin codes fit. */
26977 static const struct builtin_description bdesc_tm[] =
26978 {
26979 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
26980 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
26981 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
26982 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
26983 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
26984 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
26985 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
26986
26987 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
26988 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
26989 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
26990 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
26991 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
26992 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
26993 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
26994
26995 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
26996 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
26997 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
26998 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
26999 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27000 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27001 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27002
27003 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27004 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27005 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27006 };
27007
27008 /* TM callbacks. */
27009
27010 /* Return the builtin decl needed to load a vector of TYPE. */
27011
27012 static tree
27013 ix86_builtin_tm_load (tree type)
27014 {
27015 if (TREE_CODE (type) == VECTOR_TYPE)
27016 {
27017 switch (tree_low_cst (TYPE_SIZE (type), 1))
27018 {
27019 case 64:
27020 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27021 case 128:
27022 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27023 case 256:
27024 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27025 }
27026 }
27027 return NULL_TREE;
27028 }
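/* A sketch of how the hook above resolves in practice: for a 128-bit
   vector type such as

     typedef float v4sf __attribute__ ((vector_size (16)));

   TYPE_SIZE is 128 bits, so ix86_builtin_tm_load returns the decl for
   BUILT_IN_TM_LOAD_M128 (the "__builtin__ITM_RM128" entry in bdesc_tm
   above); 64- and 256-bit vectors map to the M64 and M256 variants, and
   any other size yields NULL_TREE.  */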
27029
27030 /* Return the builtin decl needed to store a vector of TYPE. */
27031
27032 static tree
27033 ix86_builtin_tm_store (tree type)
27034 {
27035 if (TREE_CODE (type) == VECTOR_TYPE)
27036 {
27037 switch (tree_low_cst (TYPE_SIZE (type), 1))
27038 {
27039 case 64:
27040 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27041 case 128:
27042 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27043 case 256:
27044 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27045 }
27046 }
27047 return NULL_TREE;
27048 }
27049 \f
27050 /* Initialize the transactional memory vector load/store builtins. */
27051
27052 static void
27053 ix86_init_tm_builtins (void)
27054 {
27055 enum ix86_builtin_func_type ftype;
27056 const struct builtin_description *d;
27057 size_t i;
27058 tree decl;
27059 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27060 tree attrs_log, attrs_type_log;
27061
27062 if (!flag_tm)
27063 return;
27064
27065 /* If there are no builtins defined, we must be compiling in a
27066 language without trans-mem support. */
27067 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27068 return;
27069
27070 /* Use whatever attributes a normal TM load has. */
27071 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27072 attrs_load = DECL_ATTRIBUTES (decl);
27073 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27074 /* Use whatever attributes a normal TM store has. */
27075 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27076 attrs_store = DECL_ATTRIBUTES (decl);
27077 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27078 /* Use whatever attributes a normal TM log has. */
27079 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27080 attrs_log = DECL_ATTRIBUTES (decl);
27081 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27082
27083 for (i = 0, d = bdesc_tm;
27084 i < ARRAY_SIZE (bdesc_tm);
27085 i++, d++)
27086 {
27087 if ((d->mask & ix86_isa_flags) != 0
27088 || (lang_hooks.builtin_function
27089 == lang_hooks.builtin_function_ext_scope))
27090 {
27091 tree type, attrs, attrs_type;
27092 enum built_in_function code = (enum built_in_function) d->code;
27093
27094 ftype = (enum ix86_builtin_func_type) d->flag;
27095 type = ix86_get_builtin_func_type (ftype);
27096
27097 if (BUILTIN_TM_LOAD_P (code))
27098 {
27099 attrs = attrs_load;
27100 attrs_type = attrs_type_load;
27101 }
27102 else if (BUILTIN_TM_STORE_P (code))
27103 {
27104 attrs = attrs_store;
27105 attrs_type = attrs_type_store;
27106 }
27107 else
27108 {
27109 attrs = attrs_log;
27110 attrs_type = attrs_type_log;
27111 }
27112 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27113 /* The name without the "__builtin_" prefix,
27114 so the function can also be called directly. */
27115 d->name + strlen ("__builtin_"),
27116 attrs);
27117 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
27118 set the TYPE_ATTRIBUTES. */
27119 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27120
27121 set_builtin_decl (code, decl, false);
27122 }
27123 }
27124 }
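/* A worked example of the registration loop above (illustrative): for
   the bdesc_tm entry "__builtin__ITM_WM256" the loop looks up the
   VOID_FTYPE_PV8SF_V8SF function type, borrows the attributes of the
   generic BUILT_IN_TM_STORE_1 builtin, and registers the function under
   two names: "__builtin__ITM_WM256" and, via the
   d->name + strlen ("__builtin_") argument, the bare "_ITM_WM256" that
   the TM runtime calls directly.  */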
27125
27126 /* Set up all the MMX/SSE builtins, including builtins for instructions that
27127 are not in the current target ISA, so that the user can compile particular
27128 modules with target-specific options that differ from the command-line
27129 options. */
27130 static void
27131 ix86_init_mmx_sse_builtins (void)
27132 {
27133 const struct builtin_description * d;
27134 enum ix86_builtin_func_type ftype;
27135 size_t i;
27136
27137 /* Add all special builtins with variable number of operands. */
27138 for (i = 0, d = bdesc_special_args;
27139 i < ARRAY_SIZE (bdesc_special_args);
27140 i++, d++)
27141 {
27142 if (d->name == 0)
27143 continue;
27144
27145 ftype = (enum ix86_builtin_func_type) d->flag;
27146 def_builtin (d->mask, d->name, ftype, d->code);
27147 }
27148
27149 /* Add all builtins with variable number of operands. */
27150 for (i = 0, d = bdesc_args;
27151 i < ARRAY_SIZE (bdesc_args);
27152 i++, d++)
27153 {
27154 if (d->name == 0)
27155 continue;
27156
27157 ftype = (enum ix86_builtin_func_type) d->flag;
27158 def_builtin_const (d->mask, d->name, ftype, d->code);
27159 }
27160
27161 /* pcmpestr[im] insns. */
27162 for (i = 0, d = bdesc_pcmpestr;
27163 i < ARRAY_SIZE (bdesc_pcmpestr);
27164 i++, d++)
27165 {
27166 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27167 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27168 else
27169 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27170 def_builtin_const (d->mask, d->name, ftype, d->code);
27171 }
27172
27173 /* pcmpistr[im] insns. */
27174 for (i = 0, d = bdesc_pcmpistr;
27175 i < ARRAY_SIZE (bdesc_pcmpistr);
27176 i++, d++)
27177 {
27178 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27179 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27180 else
27181 ftype = INT_FTYPE_V16QI_V16QI_INT;
27182 def_builtin_const (d->mask, d->name, ftype, d->code);
27183 }
27184
27185 /* comi/ucomi insns. */
27186 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27187 {
27188 if (d->mask == OPTION_MASK_ISA_SSE2)
27189 ftype = INT_FTYPE_V2DF_V2DF;
27190 else
27191 ftype = INT_FTYPE_V4SF_V4SF;
27192 def_builtin_const (d->mask, d->name, ftype, d->code);
27193 }
27194
27195 /* SSE */
27196 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27197 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27198 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27199 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27200
27201 /* SSE or 3DNow!A */
27202 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27203 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27204 IX86_BUILTIN_MASKMOVQ);
27205
27206 /* SSE2 */
27207 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27208 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27209
27210 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27211 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27212 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27213 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27214
27215 /* SSE3. */
27216 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27217 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27218 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27219 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27220
27221 /* AES */
27222 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27223 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27224 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27225 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27226 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27227 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27228 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27229 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27230 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27231 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27232 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27233 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27234
27235 /* PCLMUL */
27236 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27237 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27238
27239 /* RDRND */
27240 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27241 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27242 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27243 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27244 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27245 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27246 IX86_BUILTIN_RDRAND64_STEP);
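/* Usage sketch for the RDRND step builtins just defined (the function
   name is only for the example; the retry loop follows the documented
   RDRAND protocol, where a zero return means no random value was
   delivered):

     unsigned int
     get_random_u32 (void)
     {
       unsigned int val;
       while (!__builtin_ia32_rdrand32_step (&val))
         ;
       return val;
     }
*/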
27247
27248 /* AVX2 */
27249 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27250 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27251 IX86_BUILTIN_GATHERSIV2DF);
27252
27253 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27254 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27255 IX86_BUILTIN_GATHERSIV4DF);
27256
27257 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27258 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27259 IX86_BUILTIN_GATHERDIV2DF);
27260
27261 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27262 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27263 IX86_BUILTIN_GATHERDIV4DF);
27264
27265 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27266 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27267 IX86_BUILTIN_GATHERSIV4SF);
27268
27269 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27270 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27271 IX86_BUILTIN_GATHERSIV8SF);
27272
27273 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27274 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27275 IX86_BUILTIN_GATHERDIV4SF);
27276
27277 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27278 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27279 IX86_BUILTIN_GATHERDIV8SF);
27280
27281 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27282 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27283 IX86_BUILTIN_GATHERSIV2DI);
27284
27285 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27286 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27287 IX86_BUILTIN_GATHERSIV4DI);
27288
27289 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27290 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27291 IX86_BUILTIN_GATHERDIV2DI);
27292
27293 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27294 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27295 IX86_BUILTIN_GATHERDIV4DI);
27296
27297 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27298 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27299 IX86_BUILTIN_GATHERSIV4SI);
27300
27301 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27302 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27303 IX86_BUILTIN_GATHERSIV8SI);
27304
27305 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27306 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27307 IX86_BUILTIN_GATHERDIV4SI);
27308
27309 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27310 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27311 IX86_BUILTIN_GATHERDIV8SI);
27312
27313 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
27314 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27315 IX86_BUILTIN_GATHERALTSIV4DF);
27316
27317 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
27318 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27319 IX86_BUILTIN_GATHERALTDIV8SF);
27320
27321 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
27322 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27323 IX86_BUILTIN_GATHERALTSIV4DI);
27324
27325 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
27326 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27327 IX86_BUILTIN_GATHERALTDIV8SI);
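/* Reading guide for the gather builtins above (an informal summary of
   the operand roles, inferred from the function types): e.g.
   V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT for __builtin_ia32_gathersiv2df
   means (source/merge vector, base pointer, index vector, mask vector,
   scale immediate), matching the AVX2 VGATHER* instructions these expand
   to.  The *alt* variants defined last cover the mixed 32/64-bit
   element/index combinations intended for the auto-vectorizer.  */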
27328
27329 /* MMX access to the vec_init patterns. */
27330 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27331 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27332
27333 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27334 V4HI_FTYPE_HI_HI_HI_HI,
27335 IX86_BUILTIN_VEC_INIT_V4HI);
27336
27337 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27338 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27339 IX86_BUILTIN_VEC_INIT_V8QI);
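/* Minimal usage sketch for the vec_init builtins (the typedef name is
   only for the example; MMX must be enabled):

     typedef int v2si __attribute__ ((vector_size (8)));

     v2si
     make_v2si (int a, int b)
     {
       return __builtin_ia32_vec_init_v2si (a, b);
     }

   per V2SI_FTYPE_INT_INT above; the v4hi and v8qi variants take four and
   eight scalar elements respectively.  */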
27340
27341 /* Access to the vec_extract patterns. */
27342 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27343 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27344 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27345 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27346 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27347 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27348 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27349 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27350 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27351 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27352
27353 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27354 "__builtin_ia32_vec_ext_v4hi",
27355 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27356
27357 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27358 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27359
27360 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27361 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
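/* Likewise for vec_ext (illustrative only): FLOAT_FTYPE_V4SF_INT means
   the constant element index is the second argument, e.g.

     typedef float v4sf __attribute__ ((vector_size (16)));

     float
     third_element (v4sf x)
     {
       return __builtin_ia32_vec_ext_v4sf (x, 2);
     }
*/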
27362
27363 /* Access to the vec_set patterns. */
27364 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27365 "__builtin_ia32_vec_set_v2di",
27366 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27367
27368 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27369 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27370
27371 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27372 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27373
27374 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27375 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27376
27377 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27378 "__builtin_ia32_vec_set_v4hi",
27379 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27380
27381 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27382 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
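/* And for vec_set (illustrative only): V8HI_FTYPE_V8HI_HI_INT means
   "insert the scalar into the lane given by the constant index and
   return the new vector", e.g.

     typedef short v8hi __attribute__ ((vector_size (16)));

     v8hi
     set_lane0 (v8hi x, short s)
     {
       return __builtin_ia32_vec_set_v8hi (x, s, 0);
     }
*/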
27383
27384 /* Add the FMA4/XOP multi-arg builtins. */
27385 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27386 {
27387 if (d->name == 0)
27388 continue;
27389
27390 ftype = (enum ix86_builtin_func_type) d->flag;
27391 def_builtin_const (d->mask, d->name, ftype, d->code);
27392 }
27393 }
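/* The reason the loops above define builtins even for ISAs that are not
   enabled on the command line (see the comment before this function) is
   the target attribute.  A user-level sketch, assuming the file as a
   whole is compiled without -mavx2:

     typedef int v8si __attribute__ ((vector_size (32)));

     __attribute__ ((target ("avx2"))) v8si
     mul8 (v8si a, v8si b)
     {
       return __builtin_ia32_pmulld256 (a, b);
     }

   per the V8SI_FTYPE_V8SI_V8SI entry for __builtin_ia32_pmulld256 in
   bdesc_args above.  */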
27394
27395 /* Helper for ix86_init_builtins: define the ms_abi/sysv_abi va_start/va_end/va_copy builtins. */
27396
27397 static void
27398 ix86_init_builtins_va_builtins_abi (void)
27399 {
27400 tree ms_va_ref, sysv_va_ref;
27401 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
27402 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
27403 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
27404 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
27405
27406 if (!TARGET_64BIT)
27407 return;
27408 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
27409 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
27410 ms_va_ref = build_reference_type (ms_va_list_type_node);
27411 sysv_va_ref =
27412 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
27413
27414 fnvoid_va_end_ms =
27415 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27416 fnvoid_va_start_ms =
27417 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27418 fnvoid_va_end_sysv =
27419 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
27420 fnvoid_va_start_sysv =
27421 build_varargs_function_type_list (void_type_node, sysv_va_ref,
27422 NULL_TREE);
27423 fnvoid_va_copy_ms =
27424 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
27425 NULL_TREE);
27426 fnvoid_va_copy_sysv =
27427 build_function_type_list (void_type_node, sysv_va_ref,
27428 sysv_va_ref, NULL_TREE);
27429
27430 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
27431 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
27432 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
27433 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
27434 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
27435 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
27436 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
27437 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27438 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
27439 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27440 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
27441 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27442 }
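/* Illustrative note, not from the original sources: once the builtins
   above are registered, 64-bit code can walk an ms_abi variadic
   argument list explicitly, e.g.

       int __attribute__ ((ms_abi))
       sum (int n, ...)
       {
         __builtin_ms_va_list ap;
         int i, s = 0;

         __builtin_ms_va_start (ap, n);
         for (i = 0; i < n; i++)
           s += __builtin_va_arg (ap, int);
         __builtin_ms_va_end (ap);
         return s;
       }

   and likewise for the __builtin_sysv_va_* counterparts.  */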
27443
27444 static void
27445 ix86_init_builtin_types (void)
27446 {
27447 tree float128_type_node, float80_type_node;
27448
27449 /* The __float80 type. */
27450 float80_type_node = long_double_type_node;
27451 if (TYPE_MODE (float80_type_node) != XFmode)
27452 {
27453 /* The __float80 type. */
27454 float80_type_node = make_node (REAL_TYPE);
27455
27456 TYPE_PRECISION (float80_type_node) = 80;
27457 layout_type (float80_type_node);
27458 }
27459 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
27460
27461 /* The __float128 type. */
27462 float128_type_node = make_node (REAL_TYPE);
27463 TYPE_PRECISION (float128_type_node) = 128;
27464 layout_type (float128_type_node);
27465 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
27466
27467 /* This macro is built by i386-builtin-types.awk. */
27468 DEFINE_BUILTIN_PRIMITIVE_TYPES;
27469 }
27470
27471 static void
27472 ix86_init_builtins (void)
27473 {
27474 tree t;
27475
27476 ix86_init_builtin_types ();
27477
27478 /* TFmode support builtins. */
27479 def_builtin_const (0, "__builtin_infq",
27480 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
27481 def_builtin_const (0, "__builtin_huge_valq",
27482 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
27483
27484 /* We will expand them to normal calls if SSE2 isn't available, since
27485 they are used by libgcc.  */
27486 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
27487 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
27488 BUILT_IN_MD, "__fabstf2", NULL_TREE);
27489 TREE_READONLY (t) = 1;
27490 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
27491
27492 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
27493 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
27494 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
27495 TREE_READONLY (t) = 1;
27496 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
27497
27498 ix86_init_tm_builtins ();
27499 ix86_init_mmx_sse_builtins ();
27500
27501 if (TARGET_LP64)
27502 ix86_init_builtins_va_builtins_abi ();
27503
27504 #ifdef SUBTARGET_INIT_BUILTINS
27505 SUBTARGET_INIT_BUILTINS;
27506 #endif
27507 }
27508
27509 /* Return the ix86 builtin for CODE. */
27510
27511 static tree
27512 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
27513 {
27514 if (code >= IX86_BUILTIN_MAX)
27515 return error_mark_node;
27516
27517 return ix86_builtins[code];
27518 }
27519
27520 /* Errors in the source file can cause expand_expr to return const0_rtx
27521 where we expect a vector. To avoid crashing, use one of the vector
27522 clear instructions. */
27523 static rtx
27524 safe_vector_operand (rtx x, enum machine_mode mode)
27525 {
27526 if (x == const0_rtx)
27527 x = CONST0_RTX (mode);
27528 return x;
27529 }
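/* Illustrative note, not from the original sources: for a V4SFmode
   operand this substitutes CONST0_RTX (V4SFmode), i.e. a const_vector
   of four 0.0 elements, so later predicates still see an operand of
   the expected vector mode.  */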
27530
27531 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
27532
27533 static rtx
27534 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
27535 {
27536 rtx pat;
27537 tree arg0 = CALL_EXPR_ARG (exp, 0);
27538 tree arg1 = CALL_EXPR_ARG (exp, 1);
27539 rtx op0 = expand_normal (arg0);
27540 rtx op1 = expand_normal (arg1);
27541 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27542 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27543 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
27544
27545 if (VECTOR_MODE_P (mode0))
27546 op0 = safe_vector_operand (op0, mode0);
27547 if (VECTOR_MODE_P (mode1))
27548 op1 = safe_vector_operand (op1, mode1);
27549
27550 if (optimize || !target
27551 || GET_MODE (target) != tmode
27552 || !insn_data[icode].operand[0].predicate (target, tmode))
27553 target = gen_reg_rtx (tmode);
27554
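/* Note added for exposition: the insn wants a TImode operand here but
   the builtin supplied an SImode value; load it into the low element
   of a V4SI register and view that register as TImode.  */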
27555 if (GET_MODE (op1) == SImode && mode1 == TImode)
27556 {
27557 rtx x = gen_reg_rtx (V4SImode);
27558 emit_insn (gen_sse2_loadd (x, op1));
27559 op1 = gen_lowpart (TImode, x);
27560 }
27561
27562 if (!insn_data[icode].operand[1].predicate (op0, mode0))
27563 op0 = copy_to_mode_reg (mode0, op0);
27564 if (!insn_data[icode].operand[2].predicate (op1, mode1))
27565 op1 = copy_to_mode_reg (mode1, op1);
27566
27567 pat = GEN_FCN (icode) (target, op0, op1);
27568 if (! pat)
27569 return 0;
27570
27571 emit_insn (pat);
27572
27573 return target;
27574 }
27575
27576 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
27577
27578 static rtx
27579 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
27580 enum ix86_builtin_func_type m_type,
27581 enum rtx_code sub_code)
27582 {
27583 rtx pat;
27584 int i;
27585 int nargs;
27586 bool comparison_p = false;
27587 bool tf_p = false;
27588 bool last_arg_constant = false;
27589 int num_memory = 0;
27590 struct {
27591 rtx op;
27592 enum machine_mode mode;
27593 } args[4];
27594
27595 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27596
27597 switch (m_type)
27598 {
27599 case MULTI_ARG_4_DF2_DI_I:
27600 case MULTI_ARG_4_DF2_DI_I1:
27601 case MULTI_ARG_4_SF2_SI_I:
27602 case MULTI_ARG_4_SF2_SI_I1:
27603 nargs = 4;
27604 last_arg_constant = true;
27605 break;
27606
27607 case MULTI_ARG_3_SF:
27608 case MULTI_ARG_3_DF:
27609 case MULTI_ARG_3_SF2:
27610 case MULTI_ARG_3_DF2:
27611 case MULTI_ARG_3_DI:
27612 case MULTI_ARG_3_SI:
27613 case MULTI_ARG_3_SI_DI:
27614 case MULTI_ARG_3_HI:
27615 case MULTI_ARG_3_HI_SI:
27616 case MULTI_ARG_3_QI:
27617 case MULTI_ARG_3_DI2:
27618 case MULTI_ARG_3_SI2:
27619 case MULTI_ARG_3_HI2:
27620 case MULTI_ARG_3_QI2:
27621 nargs = 3;
27622 break;
27623
27624 case MULTI_ARG_2_SF:
27625 case MULTI_ARG_2_DF:
27626 case MULTI_ARG_2_DI:
27627 case MULTI_ARG_2_SI:
27628 case MULTI_ARG_2_HI:
27629 case MULTI_ARG_2_QI:
27630 nargs = 2;
27631 break;
27632
27633 case MULTI_ARG_2_DI_IMM:
27634 case MULTI_ARG_2_SI_IMM:
27635 case MULTI_ARG_2_HI_IMM:
27636 case MULTI_ARG_2_QI_IMM:
27637 nargs = 2;
27638 last_arg_constant = true;
27639 break;
27640
27641 case MULTI_ARG_1_SF:
27642 case MULTI_ARG_1_DF:
27643 case MULTI_ARG_1_SF2:
27644 case MULTI_ARG_1_DF2:
27645 case MULTI_ARG_1_DI:
27646 case MULTI_ARG_1_SI:
27647 case MULTI_ARG_1_HI:
27648 case MULTI_ARG_1_QI:
27649 case MULTI_ARG_1_SI_DI:
27650 case MULTI_ARG_1_HI_DI:
27651 case MULTI_ARG_1_HI_SI:
27652 case MULTI_ARG_1_QI_DI:
27653 case MULTI_ARG_1_QI_SI:
27654 case MULTI_ARG_1_QI_HI:
27655 nargs = 1;
27656 break;
27657
27658 case MULTI_ARG_2_DI_CMP:
27659 case MULTI_ARG_2_SI_CMP:
27660 case MULTI_ARG_2_HI_CMP:
27661 case MULTI_ARG_2_QI_CMP:
27662 nargs = 2;
27663 comparison_p = true;
27664 break;
27665
27666 case MULTI_ARG_2_SF_TF:
27667 case MULTI_ARG_2_DF_TF:
27668 case MULTI_ARG_2_DI_TF:
27669 case MULTI_ARG_2_SI_TF:
27670 case MULTI_ARG_2_HI_TF:
27671 case MULTI_ARG_2_QI_TF:
27672 nargs = 2;
27673 tf_p = true;
27674 break;
27675
27676 default:
27677 gcc_unreachable ();
27678 }
27679
27680 if (optimize || !target
27681 || GET_MODE (target) != tmode
27682 || !insn_data[icode].operand[0].predicate (target, tmode))
27683 target = gen_reg_rtx (tmode);
27684
27685 gcc_assert (nargs <= 4);
27686
27687 for (i = 0; i < nargs; i++)
27688 {
27689 tree arg = CALL_EXPR_ARG (exp, i);
27690 rtx op = expand_normal (arg);
27691 int adjust = (comparison_p) ? 1 : 0;
27692 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
27693
27694 if (last_arg_constant && i == nargs - 1)
27695 {
27696 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
27697 {
27698 enum insn_code new_icode = icode;
27699 switch (icode)
27700 {
27701 case CODE_FOR_xop_vpermil2v2df3:
27702 case CODE_FOR_xop_vpermil2v4sf3:
27703 case CODE_FOR_xop_vpermil2v4df3:
27704 case CODE_FOR_xop_vpermil2v8sf3:
27705 error ("the last argument must be a 2-bit immediate");
27706 return gen_reg_rtx (tmode);
27707 case CODE_FOR_xop_rotlv2di3:
27708 new_icode = CODE_FOR_rotlv2di3;
27709 goto xop_rotl;
27710 case CODE_FOR_xop_rotlv4si3:
27711 new_icode = CODE_FOR_rotlv4si3;
27712 goto xop_rotl;
27713 case CODE_FOR_xop_rotlv8hi3:
27714 new_icode = CODE_FOR_rotlv8hi3;
27715 goto xop_rotl;
27716 case CODE_FOR_xop_rotlv16qi3:
27717 new_icode = CODE_FOR_rotlv16qi3;
27718 xop_rotl:
27719 if (CONST_INT_P (op))
27720 {
27721 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
27722 op = GEN_INT (INTVAL (op) & mask);
27723 gcc_checking_assert
27724 (insn_data[icode].operand[i + 1].predicate (op, mode));
27725 }
27726 else
27727 {
27728 gcc_checking_assert
27729 (nargs == 2
27730 && insn_data[new_icode].operand[0].mode == tmode
27731 && insn_data[new_icode].operand[1].mode == tmode
27732 && insn_data[new_icode].operand[2].mode == mode
27733 && insn_data[new_icode].operand[0].predicate
27734 == insn_data[icode].operand[0].predicate
27735 && insn_data[new_icode].operand[1].predicate
27736 == insn_data[icode].operand[1].predicate);
27737 icode = new_icode;
27738 goto non_constant;
27739 }
27740 break;
27741 default:
27742 gcc_unreachable ();
27743 }
27744 }
27745 }
27746 else
27747 {
27748 non_constant:
27749 if (VECTOR_MODE_P (mode))
27750 op = safe_vector_operand (op, mode);
27751
27752 /* If we aren't optimizing, only allow one memory operand to be
27753 generated. */
27754 if (memory_operand (op, mode))
27755 num_memory++;
27756
27757 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
27758
27759 if (optimize
27760 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
27761 || num_memory > 1)
27762 op = force_reg (mode, op);
27763 }
27764
27765 args[i].op = op;
27766 args[i].mode = mode;
27767 }
27768
27769 switch (nargs)
27770 {
27771 case 1:
27772 pat = GEN_FCN (icode) (target, args[0].op);
27773 break;
27774
27775 case 2:
27776 if (tf_p)
27777 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
27778 GEN_INT ((int)sub_code));
27779 else if (! comparison_p)
27780 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
27781 else
27782 {
27783 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
27784 args[0].op,
27785 args[1].op);
27786
27787 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
27788 }
27789 break;
27790
27791 case 3:
27792 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
27793 break;
27794
27795 case 4:
27796 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
27797 break;
27798
27799 default:
27800 gcc_unreachable ();
27801 }
27802
27803 if (! pat)
27804 return 0;
27805
27806 emit_insn (pat);
27807 return target;
27808 }
27809
27810 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
27811 insns with vec_merge. */
27812
27813 static rtx
27814 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
27815 rtx target)
27816 {
27817 rtx pat;
27818 tree arg0 = CALL_EXPR_ARG (exp, 0);
27819 rtx op1, op0 = expand_normal (arg0);
27820 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27821 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27822
27823 if (optimize || !target
27824 || GET_MODE (target) != tmode
27825 || !insn_data[icode].operand[0].predicate (target, tmode))
27826 target = gen_reg_rtx (tmode);
27827
27828 if (VECTOR_MODE_P (mode0))
27829 op0 = safe_vector_operand (op0, mode0);
27830
27831 if ((optimize && !register_operand (op0, mode0))
27832 || !insn_data[icode].operand[1].predicate (op0, mode0))
27833 op0 = copy_to_mode_reg (mode0, op0);
27834
27835 op1 = op0;
27836 if (!insn_data[icode].operand[2].predicate (op1, mode0))
27837 op1 = copy_to_mode_reg (mode0, op1);
27838
27839 pat = GEN_FCN (icode) (target, op0, op1);
27840 if (! pat)
27841 return 0;
27842 emit_insn (pat);
27843 return target;
27844 }
27845
27846 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
27847
27848 static rtx
27849 ix86_expand_sse_compare (const struct builtin_description *d,
27850 tree exp, rtx target, bool swap)
27851 {
27852 rtx pat;
27853 tree arg0 = CALL_EXPR_ARG (exp, 0);
27854 tree arg1 = CALL_EXPR_ARG (exp, 1);
27855 rtx op0 = expand_normal (arg0);
27856 rtx op1 = expand_normal (arg1);
27857 rtx op2;
27858 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
27859 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
27860 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
27861 enum rtx_code comparison = d->comparison;
27862
27863 if (VECTOR_MODE_P (mode0))
27864 op0 = safe_vector_operand (op0, mode0);
27865 if (VECTOR_MODE_P (mode1))
27866 op1 = safe_vector_operand (op1, mode1);
27867
27868 /* Swap operands if we have a comparison that isn't available in
27869 hardware. */
27870 if (swap)
27871 {
27872 rtx tmp = gen_reg_rtx (mode1);
27873 emit_move_insn (tmp, op1);
27874 op1 = op0;
27875 op0 = tmp;
27876 }
27877
27878 if (optimize || !target
27879 || GET_MODE (target) != tmode
27880 || !insn_data[d->icode].operand[0].predicate (target, tmode))
27881 target = gen_reg_rtx (tmode);
27882
27883 if ((optimize && !register_operand (op0, mode0))
27884 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
27885 op0 = copy_to_mode_reg (mode0, op0);
27886 if ((optimize && !register_operand (op1, mode1))
27887 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
27888 op1 = copy_to_mode_reg (mode1, op1);
27889
27890 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
27891 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
27892 if (! pat)
27893 return 0;
27894 emit_insn (pat);
27895 return target;
27896 }
27897
27898 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
27899
27900 static rtx
27901 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
27902 rtx target)
27903 {
27904 rtx pat;
27905 tree arg0 = CALL_EXPR_ARG (exp, 0);
27906 tree arg1 = CALL_EXPR_ARG (exp, 1);
27907 rtx op0 = expand_normal (arg0);
27908 rtx op1 = expand_normal (arg1);
27909 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
27910 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
27911 enum rtx_code comparison = d->comparison;
27912
27913 if (VECTOR_MODE_P (mode0))
27914 op0 = safe_vector_operand (op0, mode0);
27915 if (VECTOR_MODE_P (mode1))
27916 op1 = safe_vector_operand (op1, mode1);
27917
27918 /* Swap operands if we have a comparison that isn't available in
27919 hardware. */
27920 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
27921 {
27922 rtx tmp = op1;
27923 op1 = op0;
27924 op0 = tmp;
27925 }
27926
27927 target = gen_reg_rtx (SImode);
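/* Note added for exposition: build the result as an SImode pseudo that
   is first cleared and whose low QImode part is then set from the flags
   comparison emitted below, so the value returned is already a
   zero-extended 0/1 in SImode.  */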
27928 emit_move_insn (target, const0_rtx);
27929 target = gen_rtx_SUBREG (QImode, target, 0);
27930
27931 if ((optimize && !register_operand (op0, mode0))
27932 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
27933 op0 = copy_to_mode_reg (mode0, op0);
27934 if ((optimize && !register_operand (op1, mode1))
27935 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
27936 op1 = copy_to_mode_reg (mode1, op1);
27937
27938 pat = GEN_FCN (d->icode) (op0, op1);
27939 if (! pat)
27940 return 0;
27941 emit_insn (pat);
27942 emit_insn (gen_rtx_SET (VOIDmode,
27943 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27944 gen_rtx_fmt_ee (comparison, QImode,
27945 SET_DEST (pat),
27946 const0_rtx)));
27947
27948 return SUBREG_REG (target);
27949 }
27950
27951 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
27952
27953 static rtx
27954 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
27955 rtx target)
27956 {
27957 rtx pat;
27958 tree arg0 = CALL_EXPR_ARG (exp, 0);
27959 rtx op1, op0 = expand_normal (arg0);
27960 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
27961 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
27962
27963 if (optimize || target == 0
27964 || GET_MODE (target) != tmode
27965 || !insn_data[d->icode].operand[0].predicate (target, tmode))
27966 target = gen_reg_rtx (tmode);
27967
27968 if (VECTOR_MODE_P (mode0))
27969 op0 = safe_vector_operand (op0, mode0);
27970
27971 if ((optimize && !register_operand (op0, mode0))
27972 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
27973 op0 = copy_to_mode_reg (mode0, op0);
27974
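/* Note added for exposition: for the round builtins the
   builtin_description "comparison" field is not an rtx_code; it holds
   the rounding-mode immediate that becomes the insn's last operand.  */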
27975 op1 = GEN_INT (d->comparison);
27976
27977 pat = GEN_FCN (d->icode) (target, op0, op1);
27978 if (! pat)
27979 return 0;
27980 emit_insn (pat);
27981 return target;
27982 }
27983
27984 static rtx
27985 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
27986 tree exp, rtx target)
27987 {
27988 rtx pat;
27989 tree arg0 = CALL_EXPR_ARG (exp, 0);
27990 tree arg1 = CALL_EXPR_ARG (exp, 1);
27991 rtx op0 = expand_normal (arg0);
27992 rtx op1 = expand_normal (arg1);
27993 rtx op2;
27994 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
27995 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
27996 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
27997
27998 if (optimize || target == 0
27999 || GET_MODE (target) != tmode
28000 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28001 target = gen_reg_rtx (tmode);
28002
28003 op0 = safe_vector_operand (op0, mode0);
28004 op1 = safe_vector_operand (op1, mode1);
28005
28006 if ((optimize && !register_operand (op0, mode0))
28007 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28008 op0 = copy_to_mode_reg (mode0, op0);
28009 if ((optimize && !register_operand (op1, mode1))
28010 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28011 op1 = copy_to_mode_reg (mode1, op1);
28012
28013 op2 = GEN_INT (d->comparison);
28014
28015 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28016 if (! pat)
28017 return 0;
28018 emit_insn (pat);
28019 return target;
28020 }
28021
28022 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
28023
28024 static rtx
28025 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28026 rtx target)
28027 {
28028 rtx pat;
28029 tree arg0 = CALL_EXPR_ARG (exp, 0);
28030 tree arg1 = CALL_EXPR_ARG (exp, 1);
28031 rtx op0 = expand_normal (arg0);
28032 rtx op1 = expand_normal (arg1);
28033 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28034 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28035 enum rtx_code comparison = d->comparison;
28036
28037 if (VECTOR_MODE_P (mode0))
28038 op0 = safe_vector_operand (op0, mode0);
28039 if (VECTOR_MODE_P (mode1))
28040 op1 = safe_vector_operand (op1, mode1);
28041
28042 target = gen_reg_rtx (SImode);
28043 emit_move_insn (target, const0_rtx);
28044 target = gen_rtx_SUBREG (QImode, target, 0);
28045
28046 if ((optimize && !register_operand (op0, mode0))
28047 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28048 op0 = copy_to_mode_reg (mode0, op0);
28049 if ((optimize && !register_operand (op1, mode1))
28050 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28051 op1 = copy_to_mode_reg (mode1, op1);
28052
28053 pat = GEN_FCN (d->icode) (op0, op1);
28054 if (! pat)
28055 return 0;
28056 emit_insn (pat);
28057 emit_insn (gen_rtx_SET (VOIDmode,
28058 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28059 gen_rtx_fmt_ee (comparison, QImode,
28060 SET_DEST (pat),
28061 const0_rtx)));
28062
28063 return SUBREG_REG (target);
28064 }
28065
28066 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
28067
28068 static rtx
28069 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28070 tree exp, rtx target)
28071 {
28072 rtx pat;
28073 tree arg0 = CALL_EXPR_ARG (exp, 0);
28074 tree arg1 = CALL_EXPR_ARG (exp, 1);
28075 tree arg2 = CALL_EXPR_ARG (exp, 2);
28076 tree arg3 = CALL_EXPR_ARG (exp, 3);
28077 tree arg4 = CALL_EXPR_ARG (exp, 4);
28078 rtx scratch0, scratch1;
28079 rtx op0 = expand_normal (arg0);
28080 rtx op1 = expand_normal (arg1);
28081 rtx op2 = expand_normal (arg2);
28082 rtx op3 = expand_normal (arg3);
28083 rtx op4 = expand_normal (arg4);
28084 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28085
28086 tmode0 = insn_data[d->icode].operand[0].mode;
28087 tmode1 = insn_data[d->icode].operand[1].mode;
28088 modev2 = insn_data[d->icode].operand[2].mode;
28089 modei3 = insn_data[d->icode].operand[3].mode;
28090 modev4 = insn_data[d->icode].operand[4].mode;
28091 modei5 = insn_data[d->icode].operand[5].mode;
28092 modeimm = insn_data[d->icode].operand[6].mode;
28093
28094 if (VECTOR_MODE_P (modev2))
28095 op0 = safe_vector_operand (op0, modev2);
28096 if (VECTOR_MODE_P (modev4))
28097 op2 = safe_vector_operand (op2, modev4);
28098
28099 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28100 op0 = copy_to_mode_reg (modev2, op0);
28101 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28102 op1 = copy_to_mode_reg (modei3, op1);
28103 if ((optimize && !register_operand (op2, modev4))
28104 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28105 op2 = copy_to_mode_reg (modev4, op2);
28106 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28107 op3 = copy_to_mode_reg (modei5, op3);
28108
28109 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28110 {
28111 error ("the fifth argument must be an 8-bit immediate");
28112 return const0_rtx;
28113 }
28114
28115 if (d->code == IX86_BUILTIN_PCMPESTRI128)
28116 {
28117 if (optimize || !target
28118 || GET_MODE (target) != tmode0
28119 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28120 target = gen_reg_rtx (tmode0);
28121
28122 scratch1 = gen_reg_rtx (tmode1);
28123
28124 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
28125 }
28126 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
28127 {
28128 if (optimize || !target
28129 || GET_MODE (target) != tmode1
28130 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28131 target = gen_reg_rtx (tmode1);
28132
28133 scratch0 = gen_reg_rtx (tmode0);
28134
28135 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28136 }
28137 else
28138 {
28139 gcc_assert (d->flag);
28140
28141 scratch0 = gen_reg_rtx (tmode0);
28142 scratch1 = gen_reg_rtx (tmode1);
28143
28144 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28145 }
28146
28147 if (! pat)
28148 return 0;
28149
28150 emit_insn (pat);
28151
28152 if (d->flag)
28153 {
28154 target = gen_reg_rtx (SImode);
28155 emit_move_insn (target, const0_rtx);
28156 target = gen_rtx_SUBREG (QImode, target, 0);
28157
28158 emit_insn
28159 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28160 gen_rtx_fmt_ee (EQ, QImode,
28161 gen_rtx_REG ((enum machine_mode) d->flag,
28162 FLAGS_REG),
28163 const0_rtx)));
28164 return SUBREG_REG (target);
28165 }
28166 else
28167 return target;
28168 }
28169
28170
28171 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
28172
28173 static rtx
28174 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28175 tree exp, rtx target)
28176 {
28177 rtx pat;
28178 tree arg0 = CALL_EXPR_ARG (exp, 0);
28179 tree arg1 = CALL_EXPR_ARG (exp, 1);
28180 tree arg2 = CALL_EXPR_ARG (exp, 2);
28181 rtx scratch0, scratch1;
28182 rtx op0 = expand_normal (arg0);
28183 rtx op1 = expand_normal (arg1);
28184 rtx op2 = expand_normal (arg2);
28185 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28186
28187 tmode0 = insn_data[d->icode].operand[0].mode;
28188 tmode1 = insn_data[d->icode].operand[1].mode;
28189 modev2 = insn_data[d->icode].operand[2].mode;
28190 modev3 = insn_data[d->icode].operand[3].mode;
28191 modeimm = insn_data[d->icode].operand[4].mode;
28192
28193 if (VECTOR_MODE_P (modev2))
28194 op0 = safe_vector_operand (op0, modev2);
28195 if (VECTOR_MODE_P (modev3))
28196 op1 = safe_vector_operand (op1, modev3);
28197
28198 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28199 op0 = copy_to_mode_reg (modev2, op0);
28200 if ((optimize && !register_operand (op1, modev3))
28201 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28202 op1 = copy_to_mode_reg (modev3, op1);
28203
28204 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28205 {
28206 error ("the third argument must be an 8-bit immediate");
28207 return const0_rtx;
28208 }
28209
28210 if (d->code == IX86_BUILTIN_PCMPISTRI128)
28211 {
28212 if (optimize || !target
28213 || GET_MODE (target) != tmode0
28214 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28215 target = gen_reg_rtx (tmode0);
28216
28217 scratch1 = gen_reg_rtx (tmode1);
28218
28219 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28220 }
28221 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28222 {
28223 if (optimize || !target
28224 || GET_MODE (target) != tmode1
28225 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28226 target = gen_reg_rtx (tmode1);
28227
28228 scratch0 = gen_reg_rtx (tmode0);
28229
28230 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28231 }
28232 else
28233 {
28234 gcc_assert (d->flag);
28235
28236 scratch0 = gen_reg_rtx (tmode0);
28237 scratch1 = gen_reg_rtx (tmode1);
28238
28239 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28240 }
28241
28242 if (! pat)
28243 return 0;
28244
28245 emit_insn (pat);
28246
28247 if (d->flag)
28248 {
28249 target = gen_reg_rtx (SImode);
28250 emit_move_insn (target, const0_rtx);
28251 target = gen_rtx_SUBREG (QImode, target, 0);
28252
28253 emit_insn
28254 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28255 gen_rtx_fmt_ee (EQ, QImode,
28256 gen_rtx_REG ((enum machine_mode) d->flag,
28257 FLAGS_REG),
28258 const0_rtx)));
28259 return SUBREG_REG (target);
28260 }
28261 else
28262 return target;
28263 }
28264
28265 /* Subroutine of ix86_expand_builtin to take care of insns with
28266 variable number of operands. */
28267
28268 static rtx
28269 ix86_expand_args_builtin (const struct builtin_description *d,
28270 tree exp, rtx target)
28271 {
28272 rtx pat, real_target;
28273 unsigned int i, nargs;
28274 unsigned int nargs_constant = 0;
28275 int num_memory = 0;
28276 struct
28277 {
28278 rtx op;
28279 enum machine_mode mode;
28280 } args[4];
28281 bool last_arg_count = false;
28282 enum insn_code icode = d->icode;
28283 const struct insn_data_d *insn_p = &insn_data[icode];
28284 enum machine_mode tmode = insn_p->operand[0].mode;
28285 enum machine_mode rmode = VOIDmode;
28286 bool swap = false;
28287 enum rtx_code comparison = d->comparison;
28288
28289 switch ((enum ix86_builtin_func_type) d->flag)
28290 {
28291 case V2DF_FTYPE_V2DF_ROUND:
28292 case V4DF_FTYPE_V4DF_ROUND:
28293 case V4SF_FTYPE_V4SF_ROUND:
28294 case V8SF_FTYPE_V8SF_ROUND:
28295 case V4SI_FTYPE_V4SF_ROUND:
28296 case V8SI_FTYPE_V8SF_ROUND:
28297 return ix86_expand_sse_round (d, exp, target);
28298 case V4SI_FTYPE_V2DF_V2DF_ROUND:
28299 case V8SI_FTYPE_V4DF_V4DF_ROUND:
28300 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
28301 case INT_FTYPE_V8SF_V8SF_PTEST:
28302 case INT_FTYPE_V4DI_V4DI_PTEST:
28303 case INT_FTYPE_V4DF_V4DF_PTEST:
28304 case INT_FTYPE_V4SF_V4SF_PTEST:
28305 case INT_FTYPE_V2DI_V2DI_PTEST:
28306 case INT_FTYPE_V2DF_V2DF_PTEST:
28307 return ix86_expand_sse_ptest (d, exp, target);
28308 case FLOAT128_FTYPE_FLOAT128:
28309 case FLOAT_FTYPE_FLOAT:
28310 case INT_FTYPE_INT:
28311 case UINT64_FTYPE_INT:
28312 case UINT16_FTYPE_UINT16:
28313 case INT64_FTYPE_INT64:
28314 case INT64_FTYPE_V4SF:
28315 case INT64_FTYPE_V2DF:
28316 case INT_FTYPE_V16QI:
28317 case INT_FTYPE_V8QI:
28318 case INT_FTYPE_V8SF:
28319 case INT_FTYPE_V4DF:
28320 case INT_FTYPE_V4SF:
28321 case INT_FTYPE_V2DF:
28322 case INT_FTYPE_V32QI:
28323 case V16QI_FTYPE_V16QI:
28324 case V8SI_FTYPE_V8SF:
28325 case V8SI_FTYPE_V4SI:
28326 case V8HI_FTYPE_V8HI:
28327 case V8HI_FTYPE_V16QI:
28328 case V8QI_FTYPE_V8QI:
28329 case V8SF_FTYPE_V8SF:
28330 case V8SF_FTYPE_V8SI:
28331 case V8SF_FTYPE_V4SF:
28332 case V8SF_FTYPE_V8HI:
28333 case V4SI_FTYPE_V4SI:
28334 case V4SI_FTYPE_V16QI:
28335 case V4SI_FTYPE_V4SF:
28336 case V4SI_FTYPE_V8SI:
28337 case V4SI_FTYPE_V8HI:
28338 case V4SI_FTYPE_V4DF:
28339 case V4SI_FTYPE_V2DF:
28340 case V4HI_FTYPE_V4HI:
28341 case V4DF_FTYPE_V4DF:
28342 case V4DF_FTYPE_V4SI:
28343 case V4DF_FTYPE_V4SF:
28344 case V4DF_FTYPE_V2DF:
28345 case V4SF_FTYPE_V4SF:
28346 case V4SF_FTYPE_V4SI:
28347 case V4SF_FTYPE_V8SF:
28348 case V4SF_FTYPE_V4DF:
28349 case V4SF_FTYPE_V8HI:
28350 case V4SF_FTYPE_V2DF:
28351 case V2DI_FTYPE_V2DI:
28352 case V2DI_FTYPE_V16QI:
28353 case V2DI_FTYPE_V8HI:
28354 case V2DI_FTYPE_V4SI:
28355 case V2DF_FTYPE_V2DF:
28356 case V2DF_FTYPE_V4SI:
28357 case V2DF_FTYPE_V4DF:
28358 case V2DF_FTYPE_V4SF:
28359 case V2DF_FTYPE_V2SI:
28360 case V2SI_FTYPE_V2SI:
28361 case V2SI_FTYPE_V4SF:
28362 case V2SI_FTYPE_V2SF:
28363 case V2SI_FTYPE_V2DF:
28364 case V2SF_FTYPE_V2SF:
28365 case V2SF_FTYPE_V2SI:
28366 case V32QI_FTYPE_V32QI:
28367 case V32QI_FTYPE_V16QI:
28368 case V16HI_FTYPE_V16HI:
28369 case V16HI_FTYPE_V8HI:
28370 case V8SI_FTYPE_V8SI:
28371 case V16HI_FTYPE_V16QI:
28372 case V8SI_FTYPE_V16QI:
28373 case V4DI_FTYPE_V16QI:
28374 case V8SI_FTYPE_V8HI:
28375 case V4DI_FTYPE_V8HI:
28376 case V4DI_FTYPE_V4SI:
28377 case V4DI_FTYPE_V2DI:
28378 nargs = 1;
28379 break;
28380 case V4SF_FTYPE_V4SF_VEC_MERGE:
28381 case V2DF_FTYPE_V2DF_VEC_MERGE:
28382 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
28383 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
28384 case V16QI_FTYPE_V16QI_V16QI:
28385 case V16QI_FTYPE_V8HI_V8HI:
28386 case V8QI_FTYPE_V8QI_V8QI:
28387 case V8QI_FTYPE_V4HI_V4HI:
28388 case V8HI_FTYPE_V8HI_V8HI:
28389 case V8HI_FTYPE_V16QI_V16QI:
28390 case V8HI_FTYPE_V4SI_V4SI:
28391 case V8SF_FTYPE_V8SF_V8SF:
28392 case V8SF_FTYPE_V8SF_V8SI:
28393 case V4SI_FTYPE_V4SI_V4SI:
28394 case V4SI_FTYPE_V8HI_V8HI:
28395 case V4SI_FTYPE_V4SF_V4SF:
28396 case V4SI_FTYPE_V2DF_V2DF:
28397 case V4HI_FTYPE_V4HI_V4HI:
28398 case V4HI_FTYPE_V8QI_V8QI:
28399 case V4HI_FTYPE_V2SI_V2SI:
28400 case V4DF_FTYPE_V4DF_V4DF:
28401 case V4DF_FTYPE_V4DF_V4DI:
28402 case V4SF_FTYPE_V4SF_V4SF:
28403 case V4SF_FTYPE_V4SF_V4SI:
28404 case V4SF_FTYPE_V4SF_V2SI:
28405 case V4SF_FTYPE_V4SF_V2DF:
28406 case V4SF_FTYPE_V4SF_DI:
28407 case V4SF_FTYPE_V4SF_SI:
28408 case V2DI_FTYPE_V2DI_V2DI:
28409 case V2DI_FTYPE_V16QI_V16QI:
28410 case V2DI_FTYPE_V4SI_V4SI:
28411 case V2DI_FTYPE_V2DI_V16QI:
28412 case V2DI_FTYPE_V2DF_V2DF:
28413 case V2SI_FTYPE_V2SI_V2SI:
28414 case V2SI_FTYPE_V4HI_V4HI:
28415 case V2SI_FTYPE_V2SF_V2SF:
28416 case V2DF_FTYPE_V2DF_V2DF:
28417 case V2DF_FTYPE_V2DF_V4SF:
28418 case V2DF_FTYPE_V2DF_V2DI:
28419 case V2DF_FTYPE_V2DF_DI:
28420 case V2DF_FTYPE_V2DF_SI:
28421 case V2SF_FTYPE_V2SF_V2SF:
28422 case V1DI_FTYPE_V1DI_V1DI:
28423 case V1DI_FTYPE_V8QI_V8QI:
28424 case V1DI_FTYPE_V2SI_V2SI:
28425 case V32QI_FTYPE_V16HI_V16HI:
28426 case V16HI_FTYPE_V8SI_V8SI:
28427 case V32QI_FTYPE_V32QI_V32QI:
28428 case V16HI_FTYPE_V32QI_V32QI:
28429 case V16HI_FTYPE_V16HI_V16HI:
28430 case V8SI_FTYPE_V4DF_V4DF:
28431 case V8SI_FTYPE_V8SI_V8SI:
28432 case V8SI_FTYPE_V16HI_V16HI:
28433 case V4DI_FTYPE_V4DI_V4DI:
28434 case V4DI_FTYPE_V8SI_V8SI:
28435 if (comparison == UNKNOWN)
28436 return ix86_expand_binop_builtin (icode, exp, target);
28437 nargs = 2;
28438 break;
28439 case V4SF_FTYPE_V4SF_V4SF_SWAP:
28440 case V2DF_FTYPE_V2DF_V2DF_SWAP:
28441 gcc_assert (comparison != UNKNOWN);
28442 nargs = 2;
28443 swap = true;
28444 break;
28445 case V16HI_FTYPE_V16HI_V8HI_COUNT:
28446 case V16HI_FTYPE_V16HI_SI_COUNT:
28447 case V8SI_FTYPE_V8SI_V4SI_COUNT:
28448 case V8SI_FTYPE_V8SI_SI_COUNT:
28449 case V4DI_FTYPE_V4DI_V2DI_COUNT:
28450 case V4DI_FTYPE_V4DI_INT_COUNT:
28451 case V8HI_FTYPE_V8HI_V8HI_COUNT:
28452 case V8HI_FTYPE_V8HI_SI_COUNT:
28453 case V4SI_FTYPE_V4SI_V4SI_COUNT:
28454 case V4SI_FTYPE_V4SI_SI_COUNT:
28455 case V4HI_FTYPE_V4HI_V4HI_COUNT:
28456 case V4HI_FTYPE_V4HI_SI_COUNT:
28457 case V2DI_FTYPE_V2DI_V2DI_COUNT:
28458 case V2DI_FTYPE_V2DI_SI_COUNT:
28459 case V2SI_FTYPE_V2SI_V2SI_COUNT:
28460 case V2SI_FTYPE_V2SI_SI_COUNT:
28461 case V1DI_FTYPE_V1DI_V1DI_COUNT:
28462 case V1DI_FTYPE_V1DI_SI_COUNT:
28463 nargs = 2;
28464 last_arg_count = true;
28465 break;
28466 case UINT64_FTYPE_UINT64_UINT64:
28467 case UINT_FTYPE_UINT_UINT:
28468 case UINT_FTYPE_UINT_USHORT:
28469 case UINT_FTYPE_UINT_UCHAR:
28470 case UINT16_FTYPE_UINT16_INT:
28471 case UINT8_FTYPE_UINT8_INT:
28472 nargs = 2;
28473 break;
28474 case V2DI_FTYPE_V2DI_INT_CONVERT:
28475 nargs = 2;
28476 rmode = V1TImode;
28477 nargs_constant = 1;
28478 break;
28479 case V4DI_FTYPE_V4DI_INT_CONVERT:
28480 nargs = 2;
28481 rmode = V2TImode;
28482 nargs_constant = 1;
28483 break;
28484 case V8HI_FTYPE_V8HI_INT:
28485 case V8HI_FTYPE_V8SF_INT:
28486 case V8HI_FTYPE_V4SF_INT:
28487 case V8SF_FTYPE_V8SF_INT:
28488 case V4SI_FTYPE_V4SI_INT:
28489 case V4SI_FTYPE_V8SI_INT:
28490 case V4HI_FTYPE_V4HI_INT:
28491 case V4DF_FTYPE_V4DF_INT:
28492 case V4SF_FTYPE_V4SF_INT:
28493 case V4SF_FTYPE_V8SF_INT:
28494 case V2DI_FTYPE_V2DI_INT:
28495 case V2DF_FTYPE_V2DF_INT:
28496 case V2DF_FTYPE_V4DF_INT:
28497 case V16HI_FTYPE_V16HI_INT:
28498 case V8SI_FTYPE_V8SI_INT:
28499 case V4DI_FTYPE_V4DI_INT:
28500 case V2DI_FTYPE_V4DI_INT:
28501 nargs = 2;
28502 nargs_constant = 1;
28503 break;
28504 case V16QI_FTYPE_V16QI_V16QI_V16QI:
28505 case V8SF_FTYPE_V8SF_V8SF_V8SF:
28506 case V4DF_FTYPE_V4DF_V4DF_V4DF:
28507 case V4SF_FTYPE_V4SF_V4SF_V4SF:
28508 case V2DF_FTYPE_V2DF_V2DF_V2DF:
28509 case V32QI_FTYPE_V32QI_V32QI_V32QI:
28510 nargs = 3;
28511 break;
28512 case V32QI_FTYPE_V32QI_V32QI_INT:
28513 case V16HI_FTYPE_V16HI_V16HI_INT:
28514 case V16QI_FTYPE_V16QI_V16QI_INT:
28515 case V4DI_FTYPE_V4DI_V4DI_INT:
28516 case V8HI_FTYPE_V8HI_V8HI_INT:
28517 case V8SI_FTYPE_V8SI_V8SI_INT:
28518 case V8SI_FTYPE_V8SI_V4SI_INT:
28519 case V8SF_FTYPE_V8SF_V8SF_INT:
28520 case V8SF_FTYPE_V8SF_V4SF_INT:
28521 case V4SI_FTYPE_V4SI_V4SI_INT:
28522 case V4DF_FTYPE_V4DF_V4DF_INT:
28523 case V4DF_FTYPE_V4DF_V2DF_INT:
28524 case V4SF_FTYPE_V4SF_V4SF_INT:
28525 case V2DI_FTYPE_V2DI_V2DI_INT:
28526 case V4DI_FTYPE_V4DI_V2DI_INT:
28527 case V2DF_FTYPE_V2DF_V2DF_INT:
28528 nargs = 3;
28529 nargs_constant = 1;
28530 break;
28531 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
28532 nargs = 3;
28533 rmode = V4DImode;
28534 nargs_constant = 1;
28535 break;
28536 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
28537 nargs = 3;
28538 rmode = V2DImode;
28539 nargs_constant = 1;
28540 break;
28541 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
28542 nargs = 3;
28543 rmode = DImode;
28544 nargs_constant = 1;
28545 break;
28546 case V2DI_FTYPE_V2DI_UINT_UINT:
28547 nargs = 3;
28548 nargs_constant = 2;
28549 break;
28550 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
28551 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
28552 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
28553 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
28554 nargs = 4;
28555 nargs_constant = 1;
28556 break;
28557 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
28558 nargs = 4;
28559 nargs_constant = 2;
28560 break;
28561 default:
28562 gcc_unreachable ();
28563 }
28564
28565 gcc_assert (nargs <= ARRAY_SIZE (args));
28566
28567 if (comparison != UNKNOWN)
28568 {
28569 gcc_assert (nargs == 2);
28570 return ix86_expand_sse_compare (d, exp, target, swap);
28571 }
28572
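/* Note added for exposition: rmode, when set by the _CONVERT cases
   above, is the mode in which the value handed back to the caller is
   allocated; the insn itself still writes its natural tmode result,
   obtained below as a subreg of that register.  */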
28573 if (rmode == VOIDmode || rmode == tmode)
28574 {
28575 if (optimize
28576 || target == 0
28577 || GET_MODE (target) != tmode
28578 || !insn_p->operand[0].predicate (target, tmode))
28579 target = gen_reg_rtx (tmode);
28580 real_target = target;
28581 }
28582 else
28583 {
28584 target = gen_reg_rtx (rmode);
28585 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
28586 }
28587
28588 for (i = 0; i < nargs; i++)
28589 {
28590 tree arg = CALL_EXPR_ARG (exp, i);
28591 rtx op = expand_normal (arg);
28592 enum machine_mode mode = insn_p->operand[i + 1].mode;
28593 bool match = insn_p->operand[i + 1].predicate (op, mode);
28594
28595 if (last_arg_count && (i + 1) == nargs)
28596 {
28597 /* SIMD shift insns take either an 8-bit immediate or a
28598 register as the count.  But builtin functions take an int as
28599 the count.  If the count doesn't match, we put it in a register.  */
28600 if (!match)
28601 {
28602 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
28603 if (!insn_p->operand[i + 1].predicate (op, mode))
28604 op = copy_to_reg (op);
28605 }
28606 }
28607 else if ((nargs - i) <= nargs_constant)
28608 {
28609 if (!match)
28610 switch (icode)
28611 {
28612 case CODE_FOR_avx2_inserti128:
28613 case CODE_FOR_avx2_extracti128:
28614 error ("the last argument must be a 1-bit immediate");
28615 return const0_rtx;
28616
28617 case CODE_FOR_sse4_1_roundsd:
28618 case CODE_FOR_sse4_1_roundss:
28619
28620 case CODE_FOR_sse4_1_roundpd:
28621 case CODE_FOR_sse4_1_roundps:
28622 case CODE_FOR_avx_roundpd256:
28623 case CODE_FOR_avx_roundps256:
28624
28625 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
28626 case CODE_FOR_sse4_1_roundps_sfix:
28627 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
28628 case CODE_FOR_avx_roundps_sfix256:
28629
28630 case CODE_FOR_sse4_1_blendps:
28631 case CODE_FOR_avx_blendpd256:
28632 case CODE_FOR_avx_vpermilv4df:
28633 error ("the last argument must be a 4-bit immediate");
28634 return const0_rtx;
28635
28636 case CODE_FOR_sse4_1_blendpd:
28637 case CODE_FOR_avx_vpermilv2df:
28638 case CODE_FOR_xop_vpermil2v2df3:
28639 case CODE_FOR_xop_vpermil2v4sf3:
28640 case CODE_FOR_xop_vpermil2v4df3:
28641 case CODE_FOR_xop_vpermil2v8sf3:
28642 error ("the last argument must be a 2-bit immediate");
28643 return const0_rtx;
28644
28645 case CODE_FOR_avx_vextractf128v4df:
28646 case CODE_FOR_avx_vextractf128v8sf:
28647 case CODE_FOR_avx_vextractf128v8si:
28648 case CODE_FOR_avx_vinsertf128v4df:
28649 case CODE_FOR_avx_vinsertf128v8sf:
28650 case CODE_FOR_avx_vinsertf128v8si:
28651 error ("the last argument must be a 1-bit immediate");
28652 return const0_rtx;
28653
28654 case CODE_FOR_avx_vmcmpv2df3:
28655 case CODE_FOR_avx_vmcmpv4sf3:
28656 case CODE_FOR_avx_cmpv2df3:
28657 case CODE_FOR_avx_cmpv4sf3:
28658 case CODE_FOR_avx_cmpv4df3:
28659 case CODE_FOR_avx_cmpv8sf3:
28660 error ("the last argument must be a 5-bit immediate");
28661 return const0_rtx;
28662
28663 default:
28664 switch (nargs_constant)
28665 {
28666 case 2:
28667 if ((nargs - i) == nargs_constant)
28668 {
28669 error ("the next to last argument must be an 8-bit immediate");
28670 break;
28671 }
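/* FALLTHRU */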
28672 case 1:
28673 error ("the last argument must be an 8-bit immediate");
28674 break;
28675 default:
28676 gcc_unreachable ();
28677 }
28678 return const0_rtx;
28679 }
28680 }
28681 else
28682 {
28683 if (VECTOR_MODE_P (mode))
28684 op = safe_vector_operand (op, mode);
28685
28686 /* If we aren't optimizing, only allow one memory operand to
28687 be generated. */
28688 if (memory_operand (op, mode))
28689 num_memory++;
28690
28691 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
28692 {
28693 if (optimize || !match || num_memory > 1)
28694 op = copy_to_mode_reg (mode, op);
28695 }
28696 else
28697 {
28698 op = copy_to_reg (op);
28699 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
28700 }
28701 }
28702
28703 args[i].op = op;
28704 args[i].mode = mode;
28705 }
28706
28707 switch (nargs)
28708 {
28709 case 1:
28710 pat = GEN_FCN (icode) (real_target, args[0].op);
28711 break;
28712 case 2:
28713 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
28714 break;
28715 case 3:
28716 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28717 args[2].op);
28718 break;
28719 case 4:
28720 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28721 args[2].op, args[3].op);
28722 break;
28723 default:
28724 gcc_unreachable ();
28725 }
28726
28727 if (! pat)
28728 return 0;
28729
28730 emit_insn (pat);
28731 return target;
28732 }
28733
28734 /* Subroutine of ix86_expand_builtin to take care of special insns
28735 with variable number of operands. */
28736
28737 static rtx
28738 ix86_expand_special_args_builtin (const struct builtin_description *d,
28739 tree exp, rtx target)
28740 {
28741 tree arg;
28742 rtx pat, op;
28743 unsigned int i, nargs, arg_adjust, memory;
28744 struct
28745 {
28746 rtx op;
28747 enum machine_mode mode;
28748 } args[3];
28749 enum insn_code icode = d->icode;
28750 bool last_arg_constant = false;
28751 const struct insn_data_d *insn_p = &insn_data[icode];
28752 enum machine_mode tmode = insn_p->operand[0].mode;
28753 enum { load, store } klass;
28754
28755 switch ((enum ix86_builtin_func_type) d->flag)
28756 {
28757 case VOID_FTYPE_VOID:
28758 if (icode == CODE_FOR_avx_vzeroupper)
28759 target = GEN_INT (vzeroupper_intrinsic);
28760 emit_insn (GEN_FCN (icode) (target));
28761 return 0;
28762 case VOID_FTYPE_UINT64:
28763 case VOID_FTYPE_UNSIGNED:
28764 nargs = 0;
28765 klass = store;
28766 memory = 0;
28767 break;
28768 case UINT64_FTYPE_VOID:
28769 case UNSIGNED_FTYPE_VOID:
28770 nargs = 0;
28771 klass = load;
28772 memory = 0;
28773 break;
28774 case UINT64_FTYPE_PUNSIGNED:
28775 case V2DI_FTYPE_PV2DI:
28776 case V4DI_FTYPE_PV4DI:
28777 case V32QI_FTYPE_PCCHAR:
28778 case V16QI_FTYPE_PCCHAR:
28779 case V8SF_FTYPE_PCV4SF:
28780 case V8SF_FTYPE_PCFLOAT:
28781 case V4SF_FTYPE_PCFLOAT:
28782 case V4DF_FTYPE_PCV2DF:
28783 case V4DF_FTYPE_PCDOUBLE:
28784 case V2DF_FTYPE_PCDOUBLE:
28785 case VOID_FTYPE_PVOID:
28786 nargs = 1;
28787 klass = load;
28788 memory = 0;
28789 break;
28790 case VOID_FTYPE_PV2SF_V4SF:
28791 case VOID_FTYPE_PV4DI_V4DI:
28792 case VOID_FTYPE_PV2DI_V2DI:
28793 case VOID_FTYPE_PCHAR_V32QI:
28794 case VOID_FTYPE_PCHAR_V16QI:
28795 case VOID_FTYPE_PFLOAT_V8SF:
28796 case VOID_FTYPE_PFLOAT_V4SF:
28797 case VOID_FTYPE_PDOUBLE_V4DF:
28798 case VOID_FTYPE_PDOUBLE_V2DF:
28799 case VOID_FTYPE_PLONGLONG_LONGLONG:
28800 case VOID_FTYPE_PULONGLONG_ULONGLONG:
28801 case VOID_FTYPE_PINT_INT:
28802 nargs = 1;
28803 klass = store;
28804 /* Reserve memory operand for target. */
28805 memory = ARRAY_SIZE (args);
28806 break;
28807 case V4SF_FTYPE_V4SF_PCV2SF:
28808 case V2DF_FTYPE_V2DF_PCDOUBLE:
28809 nargs = 2;
28810 klass = load;
28811 memory = 1;
28812 break;
28813 case V8SF_FTYPE_PCV8SF_V8SI:
28814 case V4DF_FTYPE_PCV4DF_V4DI:
28815 case V4SF_FTYPE_PCV4SF_V4SI:
28816 case V2DF_FTYPE_PCV2DF_V2DI:
28817 case V8SI_FTYPE_PCV8SI_V8SI:
28818 case V4DI_FTYPE_PCV4DI_V4DI:
28819 case V4SI_FTYPE_PCV4SI_V4SI:
28820 case V2DI_FTYPE_PCV2DI_V2DI:
28821 nargs = 2;
28822 klass = load;
28823 memory = 0;
28824 break;
28825 case VOID_FTYPE_PV8SF_V8SI_V8SF:
28826 case VOID_FTYPE_PV4DF_V4DI_V4DF:
28827 case VOID_FTYPE_PV4SF_V4SI_V4SF:
28828 case VOID_FTYPE_PV2DF_V2DI_V2DF:
28829 case VOID_FTYPE_PV8SI_V8SI_V8SI:
28830 case VOID_FTYPE_PV4DI_V4DI_V4DI:
28831 case VOID_FTYPE_PV4SI_V4SI_V4SI:
28832 case VOID_FTYPE_PV2DI_V2DI_V2DI:
28833 nargs = 2;
28834 klass = store;
28835 /* Reserve memory operand for target. */
28836 memory = ARRAY_SIZE (args);
28837 break;
28838 case VOID_FTYPE_UINT_UINT_UINT:
28839 case VOID_FTYPE_UINT64_UINT_UINT:
28840 case UCHAR_FTYPE_UINT_UINT_UINT:
28841 case UCHAR_FTYPE_UINT64_UINT_UINT:
28842 nargs = 3;
28843 klass = load;
28844 memory = ARRAY_SIZE (args);
28845 last_arg_constant = true;
28846 break;
28847 default:
28848 gcc_unreachable ();
28849 }
28850
28851 gcc_assert (nargs <= ARRAY_SIZE (args));
28852
28853 if (klass == store)
28854 {
28855 arg = CALL_EXPR_ARG (exp, 0);
28856 op = expand_normal (arg);
28857 gcc_assert (target == 0);
28858 if (memory)
28859 {
28860 if (GET_MODE (op) != Pmode)
28861 op = convert_to_mode (Pmode, op, 1);
28862 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
28863 }
28864 else
28865 target = force_reg (tmode, op);
28866 arg_adjust = 1;
28867 }
28868 else
28869 {
28870 arg_adjust = 0;
28871 if (optimize
28872 || target == 0
28873 || GET_MODE (target) != tmode
28874 || !insn_p->operand[0].predicate (target, tmode))
28875 target = gen_reg_rtx (tmode);
28876 }
28877
28878 for (i = 0; i < nargs; i++)
28879 {
28880 enum machine_mode mode = insn_p->operand[i + 1].mode;
28881 bool match;
28882
28883 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
28884 op = expand_normal (arg);
28885 match = insn_p->operand[i + 1].predicate (op, mode);
28886
28887 if (last_arg_constant && (i + 1) == nargs)
28888 {
28889 if (!match)
28890 {
28891 if (icode == CODE_FOR_lwp_lwpvalsi3
28892 || icode == CODE_FOR_lwp_lwpinssi3
28893 || icode == CODE_FOR_lwp_lwpvaldi3
28894 || icode == CODE_FOR_lwp_lwpinsdi3)
28895 error ("the last argument must be a 32-bit immediate");
28896 else
28897 error ("the last argument must be an 8-bit immediate");
28898 return const0_rtx;
28899 }
28900 }
28901 else
28902 {
28903 if (i == memory)
28904 {
28905 /* This must be the memory operand. */
28906 if (GET_MODE (op) != Pmode)
28907 op = convert_to_mode (Pmode, op, 1);
28908 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
28909 gcc_assert (GET_MODE (op) == mode
28910 || GET_MODE (op) == VOIDmode);
28911 }
28912 else
28913 {
28914 /* This must be a register.  */
28915 if (VECTOR_MODE_P (mode))
28916 op = safe_vector_operand (op, mode);
28917
28918 gcc_assert (GET_MODE (op) == mode
28919 || GET_MODE (op) == VOIDmode);
28920 op = copy_to_mode_reg (mode, op);
28921 }
28922 }
28923
28924 args[i].op = op;
28925 args[i].mode = mode;
28926 }
28927
28928 switch (nargs)
28929 {
28930 case 0:
28931 pat = GEN_FCN (icode) (target);
28932 break;
28933 case 1:
28934 pat = GEN_FCN (icode) (target, args[0].op);
28935 break;
28936 case 2:
28937 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28938 break;
28939 case 3:
28940 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28941 break;
28942 default:
28943 gcc_unreachable ();
28944 }
28945
28946 if (! pat)
28947 return 0;
28948 emit_insn (pat);
28949 return klass == store ? 0 : target;
28950 }
28951
28952 /* Return the integer constant in ARG. Constrain it to be in the range
28953 of the subparts of VEC_TYPE; issue an error if not. */
28954
28955 static int
28956 get_element_number (tree vec_type, tree arg)
28957 {
28958 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
28959
28960 if (!host_integerp (arg, 1)
28961 || (elt = tree_low_cst (arg, 1), elt > max))
28962 {
28963 error ("selector must be an integer constant in the range 0..%wi", max);
28964 return 0;
28965 }
28966
28967 return elt;
28968 }
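/* Illustrative note, not from the original sources: for a V4SF vector
   TYPE_VECTOR_SUBPARTS is 4, so the valid selectors are 0..3; a call
   such as __builtin_ia32_vec_ext_v4sf (x, 7) is diagnosed here and
   element 0 is used instead so that expansion can continue.  */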
28969
28970 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
28971 ix86_expand_vector_init. We DO have language-level syntax for this, in
28972 the form of (type){ init-list }. Except that since we can't place emms
28973 instructions from inside the compiler, we can't allow the use of MMX
28974 registers unless the user explicitly asks for it. So we do *not* define
28975 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
28976 we have builtins invoked by mmintrin.h that give us license to emit
28977 these sorts of instructions. */
28978
28979 static rtx
28980 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
28981 {
28982 enum machine_mode tmode = TYPE_MODE (type);
28983 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
28984 int i, n_elt = GET_MODE_NUNITS (tmode);
28985 rtvec v = rtvec_alloc (n_elt);
28986
28987 gcc_assert (VECTOR_MODE_P (tmode));
28988 gcc_assert (call_expr_nargs (exp) == n_elt);
28989
28990 for (i = 0; i < n_elt; ++i)
28991 {
28992 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
28993 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
28994 }
28995
28996 if (!target || !register_operand (target, tmode))
28997 target = gen_reg_rtx (tmode);
28998
28999 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29000 return target;
29001 }
29002
29003 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29004 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
29005 had a language-level syntax for referencing vector elements. */
29006
29007 static rtx
29008 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29009 {
29010 enum machine_mode tmode, mode0;
29011 tree arg0, arg1;
29012 int elt;
29013 rtx op0;
29014
29015 arg0 = CALL_EXPR_ARG (exp, 0);
29016 arg1 = CALL_EXPR_ARG (exp, 1);
29017
29018 op0 = expand_normal (arg0);
29019 elt = get_element_number (TREE_TYPE (arg0), arg1);
29020
29021 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29022 mode0 = TYPE_MODE (TREE_TYPE (arg0));
29023 gcc_assert (VECTOR_MODE_P (mode0));
29024
29025 op0 = force_reg (mode0, op0);
29026
29027 if (optimize || !target || !register_operand (target, tmode))
29028 target = gen_reg_rtx (tmode);
29029
29030 ix86_expand_vector_extract (true, target, op0, elt);
29031
29032 return target;
29033 }
29034
29035 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29036 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
29037 a language-level syntax for referencing vector elements. */
29038
29039 static rtx
29040 ix86_expand_vec_set_builtin (tree exp)
29041 {
29042 enum machine_mode tmode, mode1;
29043 tree arg0, arg1, arg2;
29044 int elt;
29045 rtx op0, op1, target;
29046
29047 arg0 = CALL_EXPR_ARG (exp, 0);
29048 arg1 = CALL_EXPR_ARG (exp, 1);
29049 arg2 = CALL_EXPR_ARG (exp, 2);
29050
29051 tmode = TYPE_MODE (TREE_TYPE (arg0));
29052 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29053 gcc_assert (VECTOR_MODE_P (tmode));
29054
29055 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29056 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29057 elt = get_element_number (TREE_TYPE (arg0), arg2);
29058
29059 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29060 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29061
29062 op0 = force_reg (tmode, op0);
29063 op1 = force_reg (mode1, op1);
29064
29065 /* OP0 is the source of these builtin functions and shouldn't be
29066 modified.  Create a copy, use it, and return it as the target.  */
29067 target = gen_reg_rtx (tmode);
29068 emit_move_insn (target, op0);
29069 ix86_expand_vector_set (true, target, op1, elt);
29070
29071 return target;
29072 }
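/* Illustrative note, not from the original sources: the vec_set
   builtins back the *_insert_* intrinsics; assuming the usual header
   implementation, _mm_insert_epi16 is roughly

       __m128i _mm_insert_epi16 (__m128i __A, int __D, const int __N)
       {
         return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi) __A,
                                                       __D, __N);
       }
   */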
29073
29074 /* Expand an expression EXP that calls a built-in function,
29075 with result going to TARGET if that's convenient
29076 (and in mode MODE if that's convenient).
29077 SUBTARGET may be used as the target for computing one of EXP's operands.
29078 IGNORE is nonzero if the value is to be ignored. */
29079
29080 static rtx
29081 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29082 enum machine_mode mode ATTRIBUTE_UNUSED,
29083 int ignore ATTRIBUTE_UNUSED)
29084 {
29085 const struct builtin_description *d;
29086 size_t i;
29087 enum insn_code icode;
29088 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29089 tree arg0, arg1, arg2, arg3, arg4;
29090 rtx op0, op1, op2, op3, op4, pat;
29091 enum machine_mode mode0, mode1, mode2, mode3, mode4;
29092 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29093
29094 /* Determine whether the builtin function is available under the current ISA.
29095 Originally the builtin was not created if it wasn't applicable to the
29096 current ISA based on the command line switches. With function specific
29097 options, we need to check in the context of the function making the call
29098 whether it is supported. */
29099 if (ix86_builtins_isa[fcode].isa
29100 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
29101 {
29102 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
29103 NULL, (enum fpmath_unit) 0, false);
29104
29105 if (!opts)
29106 error ("%qE needs unknown isa option", fndecl);
29107 else
29108 {
29109 gcc_assert (opts != NULL);
29110 error ("%qE needs isa option %s", fndecl, opts);
29111 free (opts);
29112 }
29113 return const0_rtx;
29114 }
29115
29116 switch (fcode)
29117 {
29118 case IX86_BUILTIN_MASKMOVQ:
29119 case IX86_BUILTIN_MASKMOVDQU:
29120 icode = (fcode == IX86_BUILTIN_MASKMOVQ
29121 ? CODE_FOR_mmx_maskmovq
29122 : CODE_FOR_sse2_maskmovdqu);
29123 /* Note the arg order is different from the operand order. */
29124 arg1 = CALL_EXPR_ARG (exp, 0);
29125 arg2 = CALL_EXPR_ARG (exp, 1);
29126 arg0 = CALL_EXPR_ARG (exp, 2);
29127 op0 = expand_normal (arg0);
29128 op1 = expand_normal (arg1);
29129 op2 = expand_normal (arg2);
29130 mode0 = insn_data[icode].operand[0].mode;
29131 mode1 = insn_data[icode].operand[1].mode;
29132 mode2 = insn_data[icode].operand[2].mode;
29133
29134 if (GET_MODE (op0) != Pmode)
29135 op0 = convert_to_mode (Pmode, op0, 1);
29136 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
29137
29138 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29139 op0 = copy_to_mode_reg (mode0, op0);
29140 if (!insn_data[icode].operand[1].predicate (op1, mode1))
29141 op1 = copy_to_mode_reg (mode1, op1);
29142 if (!insn_data[icode].operand[2].predicate (op2, mode2))
29143 op2 = copy_to_mode_reg (mode2, op2);
29144 pat = GEN_FCN (icode) (op0, op1, op2);
29145 if (! pat)
29146 return 0;
29147 emit_insn (pat);
29148 return 0;
29149
29150 case IX86_BUILTIN_LDMXCSR:
29151 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29152 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29153 emit_move_insn (target, op0);
29154 emit_insn (gen_sse_ldmxcsr (target));
29155 return 0;
29156
29157 case IX86_BUILTIN_STMXCSR:
29158 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29159 emit_insn (gen_sse_stmxcsr (target));
29160 return copy_to_mode_reg (SImode, target);
29161
29162 case IX86_BUILTIN_CLFLUSH:
29163 arg0 = CALL_EXPR_ARG (exp, 0);
29164 op0 = expand_normal (arg0);
29165 icode = CODE_FOR_sse2_clflush;
29166 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29167 {
29168 if (GET_MODE (op0) != Pmode)
29169 op0 = convert_to_mode (Pmode, op0, 1);
29170 op0 = force_reg (Pmode, op0);
29171 }
29172
29173 emit_insn (gen_sse2_clflush (op0));
29174 return 0;
29175
29176 case IX86_BUILTIN_MONITOR:
29177 arg0 = CALL_EXPR_ARG (exp, 0);
29178 arg1 = CALL_EXPR_ARG (exp, 1);
29179 arg2 = CALL_EXPR_ARG (exp, 2);
29180 op0 = expand_normal (arg0);
29181 op1 = expand_normal (arg1);
29182 op2 = expand_normal (arg2);
29183 if (!REG_P (op0))
29184 {
29185 if (GET_MODE (op0) != Pmode)
29186 op0 = convert_to_mode (Pmode, op0, 1);
29187 op0 = force_reg (Pmode, op0);
29188 }
29189 if (!REG_P (op1))
29190 op1 = copy_to_mode_reg (SImode, op1);
29191 if (!REG_P (op2))
29192 op2 = copy_to_mode_reg (SImode, op2);
29193 emit_insn (ix86_gen_monitor (op0, op1, op2));
29194 return 0;
29195
29196 case IX86_BUILTIN_MWAIT:
29197 arg0 = CALL_EXPR_ARG (exp, 0);
29198 arg1 = CALL_EXPR_ARG (exp, 1);
29199 op0 = expand_normal (arg0);
29200 op1 = expand_normal (arg1);
29201 if (!REG_P (op0))
29202 op0 = copy_to_mode_reg (SImode, op0);
29203 if (!REG_P (op1))
29204 op1 = copy_to_mode_reg (SImode, op1);
29205 emit_insn (gen_sse3_mwait (op0, op1));
29206 return 0;
29207
29208 case IX86_BUILTIN_VEC_INIT_V2SI:
29209 case IX86_BUILTIN_VEC_INIT_V4HI:
29210 case IX86_BUILTIN_VEC_INIT_V8QI:
29211 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29212
29213 case IX86_BUILTIN_VEC_EXT_V2DF:
29214 case IX86_BUILTIN_VEC_EXT_V2DI:
29215 case IX86_BUILTIN_VEC_EXT_V4SF:
29216 case IX86_BUILTIN_VEC_EXT_V4SI:
29217 case IX86_BUILTIN_VEC_EXT_V8HI:
29218 case IX86_BUILTIN_VEC_EXT_V2SI:
29219 case IX86_BUILTIN_VEC_EXT_V4HI:
29220 case IX86_BUILTIN_VEC_EXT_V16QI:
29221 return ix86_expand_vec_ext_builtin (exp, target);
29222
29223 case IX86_BUILTIN_VEC_SET_V2DI:
29224 case IX86_BUILTIN_VEC_SET_V4SF:
29225 case IX86_BUILTIN_VEC_SET_V4SI:
29226 case IX86_BUILTIN_VEC_SET_V8HI:
29227 case IX86_BUILTIN_VEC_SET_V4HI:
29228 case IX86_BUILTIN_VEC_SET_V16QI:
29229 return ix86_expand_vec_set_builtin (exp);
29230
29231 case IX86_BUILTIN_INFQ:
29232 case IX86_BUILTIN_HUGE_VALQ:
29233 {
29234 REAL_VALUE_TYPE inf;
29235 rtx tmp;
29236
29237 real_inf (&inf);
29238 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29239
29240 tmp = validize_mem (force_const_mem (mode, tmp));
29241
29242 if (target == 0)
29243 target = gen_reg_rtx (mode);
29244
29245 emit_move_insn (target, tmp);
29246 return target;
29247 }
29248
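/* For example (a sketch):

     __float128 x = __builtin_infq ();

   simply loads the +Inf bit pattern from the constant pool into the
   target register.  */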
29249 case IX86_BUILTIN_LLWPCB:
29250 arg0 = CALL_EXPR_ARG (exp, 0);
29251 op0 = expand_normal (arg0);
29252 icode = CODE_FOR_lwp_llwpcb;
29253 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29254 {
29255 if (GET_MODE (op0) != Pmode)
29256 op0 = convert_to_mode (Pmode, op0, 1);
29257 op0 = force_reg (Pmode, op0);
29258 }
29259 emit_insn (gen_lwp_llwpcb (op0));
29260 return 0;
29261
29262 case IX86_BUILTIN_SLWPCB:
29263 icode = CODE_FOR_lwp_slwpcb;
29264 if (!target
29265 || !insn_data[icode].operand[0].predicate (target, Pmode))
29266 target = gen_reg_rtx (Pmode);
29267 emit_insn (gen_lwp_slwpcb (target));
29268 return target;
29269
29270 case IX86_BUILTIN_BEXTRI32:
29271 case IX86_BUILTIN_BEXTRI64:
29272 arg0 = CALL_EXPR_ARG (exp, 0);
29273 arg1 = CALL_EXPR_ARG (exp, 1);
29274 op0 = expand_normal (arg0);
29275 op1 = expand_normal (arg1);
29276 icode = (fcode == IX86_BUILTIN_BEXTRI32
29277 ? CODE_FOR_tbm_bextri_si
29278 : CODE_FOR_tbm_bextri_di);
29279 if (!CONST_INT_P (op1))
29280 {
29281 error ("last argument must be an immediate");
29282 return const0_rtx;
29283 }
29284 else
29285 {
29286 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29287 unsigned char lsb_index = INTVAL (op1) & 0xFF;
29288 op1 = GEN_INT (length);
29289 op2 = GEN_INT (lsb_index);
29290 pat = GEN_FCN (icode) (target, op0, op1, op2);
29291 if (pat)
29292 emit_insn (pat);
29293 return target;
29294 }
29295
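/* The TBM bextri immediate packs the field start in bits 7:0 and the
   field length in bits 15:8; a sketch, assuming the tbmintrin.h
   spelling:

     r = __bextri_u32 (x, (8 << 8) | 4);   extract 8 bits starting at bit 4

   The code above splits that immediate into its length and lsb_index
   parts before emitting the insn.  */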
29296 case IX86_BUILTIN_RDRAND16_STEP:
29297 icode = CODE_FOR_rdrandhi_1;
29298 mode0 = HImode;
29299 goto rdrand_step;
29300
29301 case IX86_BUILTIN_RDRAND32_STEP:
29302 icode = CODE_FOR_rdrandsi_1;
29303 mode0 = SImode;
29304 goto rdrand_step;
29305
29306 case IX86_BUILTIN_RDRAND64_STEP:
29307 icode = CODE_FOR_rdranddi_1;
29308 mode0 = DImode;
29309
29310 rdrand_step:
29311 op0 = gen_reg_rtx (mode0);
29312 emit_insn (GEN_FCN (icode) (op0));
29313
29314 arg0 = CALL_EXPR_ARG (exp, 0);
29315 op1 = expand_normal (arg0);
29316 if (!address_operand (op1, VOIDmode))
29317 {
29318 op1 = convert_memory_address (Pmode, op1);
29319 op1 = copy_addr_to_reg (op1);
29320 }
29321 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
29322
29323 op1 = gen_reg_rtx (SImode);
29324 emit_move_insn (op1, CONST1_RTX (SImode));
29325
29326 /* Emit SImode conditional move. */
29327 if (mode0 == HImode)
29328 {
29329 op2 = gen_reg_rtx (SImode);
29330 emit_insn (gen_zero_extendhisi2 (op2, op0));
29331 }
29332 else if (mode0 == SImode)
29333 op2 = op0;
29334 else
29335 op2 = gen_rtx_SUBREG (SImode, op0, 0);
29336
29337 if (target == 0)
29338 target = gen_reg_rtx (SImode);
29339
29340 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
29341 const0_rtx);
29342 emit_insn (gen_rtx_SET (VOIDmode, target,
29343 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
29344 return target;
29345
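/* A sketch of the source-level use this expansion supports, assuming
   the immintrin.h spelling:

     unsigned int val;
     if (_rdrand32_step (&val))
       use (val);             use () is a hypothetical consumer

   rdrand sets the carry flag on success and clears the destination on
   failure, so the conditional move above yields 1 on success and the
   (zeroed) output otherwise, giving the builtin's int return value.  */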
29346 case IX86_BUILTIN_GATHERSIV2DF:
29347 icode = CODE_FOR_avx2_gathersiv2df;
29348 goto gather_gen;
29349 case IX86_BUILTIN_GATHERSIV4DF:
29350 icode = CODE_FOR_avx2_gathersiv4df;
29351 goto gather_gen;
29352 case IX86_BUILTIN_GATHERDIV2DF:
29353 icode = CODE_FOR_avx2_gatherdiv2df;
29354 goto gather_gen;
29355 case IX86_BUILTIN_GATHERDIV4DF:
29356 icode = CODE_FOR_avx2_gatherdiv4df;
29357 goto gather_gen;
29358 case IX86_BUILTIN_GATHERSIV4SF:
29359 icode = CODE_FOR_avx2_gathersiv4sf;
29360 goto gather_gen;
29361 case IX86_BUILTIN_GATHERSIV8SF:
29362 icode = CODE_FOR_avx2_gathersiv8sf;
29363 goto gather_gen;
29364 case IX86_BUILTIN_GATHERDIV4SF:
29365 icode = CODE_FOR_avx2_gatherdiv4sf;
29366 goto gather_gen;
29367 case IX86_BUILTIN_GATHERDIV8SF:
29368 icode = CODE_FOR_avx2_gatherdiv8sf;
29369 goto gather_gen;
29370 case IX86_BUILTIN_GATHERSIV2DI:
29371 icode = CODE_FOR_avx2_gathersiv2di;
29372 goto gather_gen;
29373 case IX86_BUILTIN_GATHERSIV4DI:
29374 icode = CODE_FOR_avx2_gathersiv4di;
29375 goto gather_gen;
29376 case IX86_BUILTIN_GATHERDIV2DI:
29377 icode = CODE_FOR_avx2_gatherdiv2di;
29378 goto gather_gen;
29379 case IX86_BUILTIN_GATHERDIV4DI:
29380 icode = CODE_FOR_avx2_gatherdiv4di;
29381 goto gather_gen;
29382 case IX86_BUILTIN_GATHERSIV4SI:
29383 icode = CODE_FOR_avx2_gathersiv4si;
29384 goto gather_gen;
29385 case IX86_BUILTIN_GATHERSIV8SI:
29386 icode = CODE_FOR_avx2_gathersiv8si;
29387 goto gather_gen;
29388 case IX86_BUILTIN_GATHERDIV4SI:
29389 icode = CODE_FOR_avx2_gatherdiv4si;
29390 goto gather_gen;
29391 case IX86_BUILTIN_GATHERDIV8SI:
29392 icode = CODE_FOR_avx2_gatherdiv8si;
29393 goto gather_gen;
29394 case IX86_BUILTIN_GATHERALTSIV4DF:
29395 icode = CODE_FOR_avx2_gathersiv4df;
29396 goto gather_gen;
29397 case IX86_BUILTIN_GATHERALTDIV8SF:
29398 icode = CODE_FOR_avx2_gatherdiv8sf;
29399 goto gather_gen;
29400 case IX86_BUILTIN_GATHERALTSIV4DI:
29401 icode = CODE_FOR_avx2_gathersiv4di;
29402 goto gather_gen;
29403 case IX86_BUILTIN_GATHERALTDIV8SI:
29404 icode = CODE_FOR_avx2_gatherdiv8si;
29405 goto gather_gen;
29406
29407 gather_gen:
29408 arg0 = CALL_EXPR_ARG (exp, 0);
29409 arg1 = CALL_EXPR_ARG (exp, 1);
29410 arg2 = CALL_EXPR_ARG (exp, 2);
29411 arg3 = CALL_EXPR_ARG (exp, 3);
29412 arg4 = CALL_EXPR_ARG (exp, 4);
29413 op0 = expand_normal (arg0);
29414 op1 = expand_normal (arg1);
29415 op2 = expand_normal (arg2);
29416 op3 = expand_normal (arg3);
29417 op4 = expand_normal (arg4);
29418 /* Note the arg order is different from the operand order. */
29419 mode0 = insn_data[icode].operand[1].mode;
29420 mode2 = insn_data[icode].operand[3].mode;
29421 mode3 = insn_data[icode].operand[4].mode;
29422 mode4 = insn_data[icode].operand[5].mode;
29423
29424 if (target == NULL_RTX
29425 || GET_MODE (target) != insn_data[icode].operand[0].mode)
29426 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
29427 else
29428 subtarget = target;
29429
29430 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
29431 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
29432 {
29433 rtx half = gen_reg_rtx (V4SImode);
29434 if (!nonimmediate_operand (op2, V8SImode))
29435 op2 = copy_to_mode_reg (V8SImode, op2);
29436 emit_insn (gen_vec_extract_lo_v8si (half, op2));
29437 op2 = half;
29438 }
29439 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
29440 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
29441 {
29442 rtx (*gen) (rtx, rtx);
29443 rtx half = gen_reg_rtx (mode0);
29444 if (mode0 == V4SFmode)
29445 gen = gen_vec_extract_lo_v8sf;
29446 else
29447 gen = gen_vec_extract_lo_v8si;
29448 if (!nonimmediate_operand (op0, GET_MODE (op0)))
29449 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
29450 emit_insn (gen (half, op0));
29451 op0 = half;
29452 if (!nonimmediate_operand (op3, GET_MODE (op3)))
29453 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
29454 emit_insn (gen (half, op3));
29455 op3 = half;
29456 }
29457
29458 /* Force the memory operand to use only a base register here.  We
29459 don't want to do this to the memory operands of other builtin
29460 functions. */
29461 if (GET_MODE (op1) != Pmode)
29462 op1 = convert_to_mode (Pmode, op1, 1);
29463 op1 = force_reg (Pmode, op1);
29464
29465 if (!insn_data[icode].operand[1].predicate (op0, mode0))
29466 op0 = copy_to_mode_reg (mode0, op0);
29467 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
29468 op1 = copy_to_mode_reg (Pmode, op1);
29469 if (!insn_data[icode].operand[3].predicate (op2, mode2))
29470 op2 = copy_to_mode_reg (mode2, op2);
29471 if (!insn_data[icode].operand[4].predicate (op3, mode3))
29472 op3 = copy_to_mode_reg (mode3, op3);
29473 if (!insn_data[icode].operand[5].predicate (op4, mode4))
29474 {
29475 error ("last argument must be scale 1, 2, 4, 8");
29476 return const0_rtx;
29477 }
29478
29479 /* Optimize. If mask is known to have all high bits set,
29480 replace op0 with pc_rtx to signal that the instruction
29481 overwrites the whole destination and doesn't use its
29482 previous contents. */
29483 if (optimize)
29484 {
29485 if (TREE_CODE (arg3) == VECTOR_CST)
29486 {
29487 tree elt;
29488 unsigned int negative = 0;
29489 for (elt = TREE_VECTOR_CST_ELTS (arg3);
29490 elt; elt = TREE_CHAIN (elt))
29491 {
29492 tree cst = TREE_VALUE (elt);
29493 if (TREE_CODE (cst) == INTEGER_CST
29494 && tree_int_cst_sign_bit (cst))
29495 negative++;
29496 else if (TREE_CODE (cst) == REAL_CST
29497 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
29498 negative++;
29499 }
29500 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
29501 op0 = pc_rtx;
29502 }
29503 else if (TREE_CODE (arg3) == SSA_NAME)
29504 {
29505 /* Recognize also when mask is like:
29506 __v2df src = _mm_setzero_pd ();
29507 __v2df mask = _mm_cmpeq_pd (src, src);
29508 or
29509 __v8sf src = _mm256_setzero_ps ();
29510 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
29511 as that is a cheaper way to load all ones into
29512 a register than having to load a constant from
29513 memory. */
29514 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
29515 if (is_gimple_call (def_stmt))
29516 {
29517 tree fndecl = gimple_call_fndecl (def_stmt);
29518 if (fndecl
29519 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29520 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
29521 {
29522 case IX86_BUILTIN_CMPPD:
29523 case IX86_BUILTIN_CMPPS:
29524 case IX86_BUILTIN_CMPPD256:
29525 case IX86_BUILTIN_CMPPS256:
29526 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
29527 break;
29528 /* FALLTHRU */
29529 case IX86_BUILTIN_CMPEQPD:
29530 case IX86_BUILTIN_CMPEQPS:
29531 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
29532 && initializer_zerop (gimple_call_arg (def_stmt,
29533 1)))
29534 op0 = pc_rtx;
29535 break;
29536 default:
29537 break;
29538 }
29539 }
29540 }
29541 }
29542
29543 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
29544 if (! pat)
29545 return const0_rtx;
29546 emit_insn (pat);
29547
29548 if (fcode == IX86_BUILTIN_GATHERDIV8SF
29549 || fcode == IX86_BUILTIN_GATHERDIV8SI)
29550 {
29551 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
29552 ? V4SFmode : V4SImode;
29553 if (target == NULL_RTX)
29554 target = gen_reg_rtx (tmode);
29555 if (tmode == V4SFmode)
29556 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
29557 else
29558 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
29559 }
29560 else
29561 target = subtarget;
29562
29563 return target;
29564
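/* A sketch of a source form that reaches the gather expansion above,
   assuming the avx2intrin.h spelling:

     __m256d r = _mm256_i32gather_pd (base, idx, 8);

   i.e. IX86_BUILTIN_GATHERSIV4DF.  Its all-ones mask is normally
   recognized by the optimization above, so op0 becomes pc_rtx and the
   gather insn carries no merge dependence on the old destination.  */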
29565 default:
29566 break;
29567 }
29568
29569 for (i = 0, d = bdesc_special_args;
29570 i < ARRAY_SIZE (bdesc_special_args);
29571 i++, d++)
29572 if (d->code == fcode)
29573 return ix86_expand_special_args_builtin (d, exp, target);
29574
29575 for (i = 0, d = bdesc_args;
29576 i < ARRAY_SIZE (bdesc_args);
29577 i++, d++)
29578 if (d->code == fcode)
29579 switch (fcode)
29580 {
29581 case IX86_BUILTIN_FABSQ:
29582 case IX86_BUILTIN_COPYSIGNQ:
29583 if (!TARGET_SSE2)
29584 /* Emit a normal call if SSE2 isn't available. */
29585 return expand_call (exp, target, ignore);
29586 default:
29587 return ix86_expand_args_builtin (d, exp, target);
29588 }
29589
29590 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29591 if (d->code == fcode)
29592 return ix86_expand_sse_comi (d, exp, target);
29593
29594 for (i = 0, d = bdesc_pcmpestr;
29595 i < ARRAY_SIZE (bdesc_pcmpestr);
29596 i++, d++)
29597 if (d->code == fcode)
29598 return ix86_expand_sse_pcmpestr (d, exp, target);
29599
29600 for (i = 0, d = bdesc_pcmpistr;
29601 i < ARRAY_SIZE (bdesc_pcmpistr);
29602 i++, d++)
29603 if (d->code == fcode)
29604 return ix86_expand_sse_pcmpistr (d, exp, target);
29605
29606 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29607 if (d->code == fcode)
29608 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
29609 (enum ix86_builtin_func_type)
29610 d->flag, d->comparison);
29611
29612 gcc_unreachable ();
29613 }
29614
29615 /* Returns a function decl for a vectorized version of the builtin function
29616 FNDECL, taking vectors of type TYPE_IN and producing vectors of type
29617 TYPE_OUT, or NULL_TREE if such a vectorized version is not available. */
29618
29619 static tree
29620 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
29621 tree type_in)
29622 {
29623 enum machine_mode in_mode, out_mode;
29624 int in_n, out_n;
29625 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
29626
29627 if (TREE_CODE (type_out) != VECTOR_TYPE
29628 || TREE_CODE (type_in) != VECTOR_TYPE
29629 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
29630 return NULL_TREE;
29631
29632 out_mode = TYPE_MODE (TREE_TYPE (type_out));
29633 out_n = TYPE_VECTOR_SUBPARTS (type_out);
29634 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29635 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29636
29637 switch (fn)
29638 {
29639 case BUILT_IN_SQRT:
29640 if (out_mode == DFmode && in_mode == DFmode)
29641 {
29642 if (out_n == 2 && in_n == 2)
29643 return ix86_builtins[IX86_BUILTIN_SQRTPD];
29644 else if (out_n == 4 && in_n == 4)
29645 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
29646 }
29647 break;
29648
29649 case BUILT_IN_SQRTF:
29650 if (out_mode == SFmode && in_mode == SFmode)
29651 {
29652 if (out_n == 4 && in_n == 4)
29653 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
29654 else if (out_n == 8 && in_n == 8)
29655 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
29656 }
29657 break;
29658
29659 case BUILT_IN_IFLOOR:
29660 case BUILT_IN_LFLOOR:
29661 case BUILT_IN_LLFLOOR:
29662 /* The round insn does not trap on denormals. */
29663 if (flag_trapping_math || !TARGET_ROUND)
29664 break;
29665
29666 if (out_mode == SImode && in_mode == DFmode)
29667 {
29668 if (out_n == 4 && in_n == 2)
29669 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
29670 else if (out_n == 8 && in_n == 4)
29671 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
29672 }
29673 break;
29674
29675 case BUILT_IN_IFLOORF:
29676 case BUILT_IN_LFLOORF:
29677 case BUILT_IN_LLFLOORF:
29678 /* The round insn does not trap on denormals. */
29679 if (flag_trapping_math || !TARGET_ROUND)
29680 break;
29681
29682 if (out_mode == SImode && in_mode == SFmode)
29683 {
29684 if (out_n == 4 && in_n == 4)
29685 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
29686 else if (out_n == 8 && in_n == 8)
29687 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
29688 }
29689 break;
29690
29691 case BUILT_IN_ICEIL:
29692 case BUILT_IN_LCEIL:
29693 case BUILT_IN_LLCEIL:
29694 /* The round insn does not trap on denormals. */
29695 if (flag_trapping_math || !TARGET_ROUND)
29696 break;
29697
29698 if (out_mode == SImode && in_mode == DFmode)
29699 {
29700 if (out_n == 4 && in_n == 2)
29701 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
29702 else if (out_n == 8 && in_n == 4)
29703 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
29704 }
29705 break;
29706
29707 case BUILT_IN_ICEILF:
29708 case BUILT_IN_LCEILF:
29709 case BUILT_IN_LLCEILF:
29710 /* The round insn does not trap on denormals. */
29711 if (flag_trapping_math || !TARGET_ROUND)
29712 break;
29713
29714 if (out_mode == SImode && in_mode == SFmode)
29715 {
29716 if (out_n == 4 && in_n == 4)
29717 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
29718 else if (out_n == 8 && in_n == 8)
29719 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
29720 }
29721 break;
29722
29723 case BUILT_IN_IRINT:
29724 case BUILT_IN_LRINT:
29725 case BUILT_IN_LLRINT:
29726 if (out_mode == SImode && in_mode == DFmode)
29727 {
29728 if (out_n == 4 && in_n == 2)
29729 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
29730 else if (out_n == 8 && in_n == 4)
29731 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
29732 }
29733 break;
29734
29735 case BUILT_IN_IRINTF:
29736 case BUILT_IN_LRINTF:
29737 case BUILT_IN_LLRINTF:
29738 if (out_mode == SImode && in_mode == SFmode)
29739 {
29740 if (out_n == 4 && in_n == 4)
29741 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
29742 else if (out_n == 8 && in_n == 8)
29743 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
29744 }
29745 break;
29746
29747 case BUILT_IN_IROUND:
29748 case BUILT_IN_LROUND:
29749 case BUILT_IN_LLROUND:
29750 /* The round insn does not trap on denormals. */
29751 if (flag_trapping_math || !TARGET_ROUND)
29752 break;
29753
29754 if (out_mode == SImode && in_mode == DFmode)
29755 {
29756 if (out_n == 4 && in_n == 2)
29757 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
29758 else if (out_n == 8 && in_n == 4)
29759 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
29760 }
29761 break;
29762
29763 case BUILT_IN_IROUNDF:
29764 case BUILT_IN_LROUNDF:
29765 case BUILT_IN_LLROUNDF:
29766 /* The round insn does not trap on denormals. */
29767 if (flag_trapping_math || !TARGET_ROUND)
29768 break;
29769
29770 if (out_mode == SImode && in_mode == SFmode)
29771 {
29772 if (out_n == 4 && in_n == 4)
29773 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
29774 else if (out_n == 8 && in_n == 8)
29775 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
29776 }
29777 break;
29778
29779 case BUILT_IN_COPYSIGN:
29780 if (out_mode == DFmode && in_mode == DFmode)
29781 {
29782 if (out_n == 2 && in_n == 2)
29783 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
29784 else if (out_n == 4 && in_n == 4)
29785 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
29786 }
29787 break;
29788
29789 case BUILT_IN_COPYSIGNF:
29790 if (out_mode == SFmode && in_mode == SFmode)
29791 {
29792 if (out_n == 4 && in_n == 4)
29793 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
29794 else if (out_n == 8 && in_n == 8)
29795 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
29796 }
29797 break;
29798
29799 case BUILT_IN_FLOOR:
29800 /* The round insn does not trap on denormals. */
29801 if (flag_trapping_math || !TARGET_ROUND)
29802 break;
29803
29804 if (out_mode == DFmode && in_mode == DFmode)
29805 {
29806 if (out_n == 2 && in_n == 2)
29807 return ix86_builtins[IX86_BUILTIN_FLOORPD];
29808 else if (out_n == 4 && in_n == 4)
29809 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
29810 }
29811 break;
29812
29813 case BUILT_IN_FLOORF:
29814 /* The round insn does not trap on denormals. */
29815 if (flag_trapping_math || !TARGET_ROUND)
29816 break;
29817
29818 if (out_mode == SFmode && in_mode == SFmode)
29819 {
29820 if (out_n == 4 && in_n == 4)
29821 return ix86_builtins[IX86_BUILTIN_FLOORPS];
29822 else if (out_n == 8 && in_n == 8)
29823 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
29824 }
29825 break;
29826
29827 case BUILT_IN_CEIL:
29828 /* The round insn does not trap on denormals. */
29829 if (flag_trapping_math || !TARGET_ROUND)
29830 break;
29831
29832 if (out_mode == DFmode && in_mode == DFmode)
29833 {
29834 if (out_n == 2 && in_n == 2)
29835 return ix86_builtins[IX86_BUILTIN_CEILPD];
29836 else if (out_n == 4 && in_n == 4)
29837 return ix86_builtins[IX86_BUILTIN_CEILPD256];
29838 }
29839 break;
29840
29841 case BUILT_IN_CEILF:
29842 /* The round insn does not trap on denormals. */
29843 if (flag_trapping_math || !TARGET_ROUND)
29844 break;
29845
29846 if (out_mode == SFmode && in_mode == SFmode)
29847 {
29848 if (out_n == 4 && in_n == 4)
29849 return ix86_builtins[IX86_BUILTIN_CEILPS];
29850 else if (out_n == 8 && in_n == 8)
29851 return ix86_builtins[IX86_BUILTIN_CEILPS256];
29852 }
29853 break;
29854
29855 case BUILT_IN_TRUNC:
29856 /* The round insn does not trap on denormals. */
29857 if (flag_trapping_math || !TARGET_ROUND)
29858 break;
29859
29860 if (out_mode == DFmode && in_mode == DFmode)
29861 {
29862 if (out_n == 2 && in_n == 2)
29863 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
29864 else if (out_n == 4 && in_n == 4)
29865 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
29866 }
29867 break;
29868
29869 case BUILT_IN_TRUNCF:
29870 /* The round insn does not trap on denormals. */
29871 if (flag_trapping_math || !TARGET_ROUND)
29872 break;
29873
29874 if (out_mode == SFmode && in_mode == SFmode)
29875 {
29876 if (out_n == 4 && in_n == 4)
29877 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
29878 else if (out_n == 8 && in_n == 8)
29879 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
29880 }
29881 break;
29882
29883 case BUILT_IN_RINT:
29884 /* The round insn does not trap on denormals. */
29885 if (flag_trapping_math || !TARGET_ROUND)
29886 break;
29887
29888 if (out_mode == DFmode && in_mode == DFmode)
29889 {
29890 if (out_n == 2 && in_n == 2)
29891 return ix86_builtins[IX86_BUILTIN_RINTPD];
29892 else if (out_n == 4 && in_n == 4)
29893 return ix86_builtins[IX86_BUILTIN_RINTPD256];
29894 }
29895 break;
29896
29897 case BUILT_IN_RINTF:
29898 /* The round insn does not trap on denormals. */
29899 if (flag_trapping_math || !TARGET_ROUND)
29900 break;
29901
29902 if (out_mode == SFmode && in_mode == SFmode)
29903 {
29904 if (out_n == 4 && in_n == 4)
29905 return ix86_builtins[IX86_BUILTIN_RINTPS];
29906 else if (out_n == 8 && in_n == 8)
29907 return ix86_builtins[IX86_BUILTIN_RINTPS256];
29908 }
29909 break;
29910
29911 case BUILT_IN_ROUND:
29912 /* The round insn does not trap on denormals. */
29913 if (flag_trapping_math || !TARGET_ROUND)
29914 break;
29915
29916 if (out_mode == DFmode && in_mode == DFmode)
29917 {
29918 if (out_n == 2 && in_n == 2)
29919 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
29920 else if (out_n == 4 && in_n == 4)
29921 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
29922 }
29923 break;
29924
29925 case BUILT_IN_ROUNDF:
29926 /* The round insn does not trap on denormals. */
29927 if (flag_trapping_math || !TARGET_ROUND)
29928 break;
29929
29930 if (out_mode == SFmode && in_mode == SFmode)
29931 {
29932 if (out_n == 4 && in_n == 4)
29933 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
29934 else if (out_n == 8 && in_n == 8)
29935 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
29936 }
29937 break;
29938
29939 case BUILT_IN_FMA:
29940 if (out_mode == DFmode && in_mode == DFmode)
29941 {
29942 if (out_n == 2 && in_n == 2)
29943 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
29944 if (out_n == 4 && in_n == 4)
29945 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
29946 }
29947 break;
29948
29949 case BUILT_IN_FMAF:
29950 if (out_mode == SFmode && in_mode == SFmode)
29951 {
29952 if (out_n == 4 && in_n == 4)
29953 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
29954 if (out_n == 8 && in_n == 8)
29955 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
29956 }
29957 break;
29958
29959 default:
29960 break;
29961 }
29962
29963 /* Dispatch to a handler for a vectorization library. */
29964 if (ix86_veclib_handler)
29965 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
29966 type_in);
29967
29968 return NULL_TREE;
29969 }
29970
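/* As an example of the hook above (a sketch): with -O3 -mavx, a loop

     for (i = 0; i < n; i++)
       a[i] = __builtin_sqrt (b[i]);   with double a[], b[]

   queries BUILT_IN_SQRT with V4DF in/out vector types and receives
   IX86_BUILTIN_SQRTPD256, i.e. a 256-bit vsqrtpd expansion.  */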
29971 /* Handler for an SVML-style interface to
29972 a library with vectorized intrinsics. */
29973
29974 static tree
29975 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
29976 {
29977 char name[20];
29978 tree fntype, new_fndecl, args;
29979 unsigned arity;
29980 const char *bname;
29981 enum machine_mode el_mode, in_mode;
29982 int n, in_n;
29983
29984 /* The SVML is suitable for unsafe math only. */
29985 if (!flag_unsafe_math_optimizations)
29986 return NULL_TREE;
29987
29988 el_mode = TYPE_MODE (TREE_TYPE (type_out));
29989 n = TYPE_VECTOR_SUBPARTS (type_out);
29990 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29991 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29992 if (el_mode != in_mode
29993 || n != in_n)
29994 return NULL_TREE;
29995
29996 switch (fn)
29997 {
29998 case BUILT_IN_EXP:
29999 case BUILT_IN_LOG:
30000 case BUILT_IN_LOG10:
30001 case BUILT_IN_POW:
30002 case BUILT_IN_TANH:
30003 case BUILT_IN_TAN:
30004 case BUILT_IN_ATAN:
30005 case BUILT_IN_ATAN2:
30006 case BUILT_IN_ATANH:
30007 case BUILT_IN_CBRT:
30008 case BUILT_IN_SINH:
30009 case BUILT_IN_SIN:
30010 case BUILT_IN_ASINH:
30011 case BUILT_IN_ASIN:
30012 case BUILT_IN_COSH:
30013 case BUILT_IN_COS:
30014 case BUILT_IN_ACOSH:
30015 case BUILT_IN_ACOS:
30016 if (el_mode != DFmode || n != 2)
30017 return NULL_TREE;
30018 break;
30019
30020 case BUILT_IN_EXPF:
30021 case BUILT_IN_LOGF:
30022 case BUILT_IN_LOG10F:
30023 case BUILT_IN_POWF:
30024 case BUILT_IN_TANHF:
30025 case BUILT_IN_TANF:
30026 case BUILT_IN_ATANF:
30027 case BUILT_IN_ATAN2F:
30028 case BUILT_IN_ATANHF:
30029 case BUILT_IN_CBRTF:
30030 case BUILT_IN_SINHF:
30031 case BUILT_IN_SINF:
30032 case BUILT_IN_ASINHF:
30033 case BUILT_IN_ASINF:
30034 case BUILT_IN_COSHF:
30035 case BUILT_IN_COSF:
30036 case BUILT_IN_ACOSHF:
30037 case BUILT_IN_ACOSF:
30038 if (el_mode != SFmode || n != 4)
30039 return NULL_TREE;
30040 break;
30041
30042 default:
30043 return NULL_TREE;
30044 }
30045
30046 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30047
30048 if (fn == BUILT_IN_LOGF)
30049 strcpy (name, "vmlsLn4");
30050 else if (fn == BUILT_IN_LOG)
30051 strcpy (name, "vmldLn2");
30052 else if (n == 4)
30053 {
30054 sprintf (name, "vmls%s", bname+10);
30055 name[strlen (name)-1] = '4';
30056 }
30057 else
30058 sprintf (name, "vmld%s2", bname+10);
30059
30060 /* Convert to uppercase. */
30061 name[4] &= ~0x20;
30062
30063 arity = 0;
30064 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30065 args;
30066 args = TREE_CHAIN (args))
30067 arity++;
30068
30069 if (arity == 1)
30070 fntype = build_function_type_list (type_out, type_in, NULL);
30071 else
30072 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30073
30074 /* Build a function declaration for the vectorized function. */
30075 new_fndecl = build_decl (BUILTINS_LOCATION,
30076 FUNCTION_DECL, get_identifier (name), fntype);
30077 TREE_PUBLIC (new_fndecl) = 1;
30078 DECL_EXTERNAL (new_fndecl) = 1;
30079 DECL_IS_NOVOPS (new_fndecl) = 1;
30080 TREE_READONLY (new_fndecl) = 1;
30081
30082 return new_fndecl;
30083 }
30084
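/* Example of the SVML name mangling above (a sketch): BUILT_IN_SINF on
   4 floats yields "vmlsSin4", BUILT_IN_SIN on 2 doubles yields
   "vmldSin2", while log/logf are special-cased to "vmldLn2" and
   "vmlsLn4".  */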
30085 /* Handler for an ACML-style interface to
30086 a library with vectorized intrinsics. */
30087
30088 static tree
30089 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
30090 {
30091 char name[20] = "__vr.._";
30092 tree fntype, new_fndecl, args;
30093 unsigned arity;
30094 const char *bname;
30095 enum machine_mode el_mode, in_mode;
30096 int n, in_n;
30097
30098 /* The ACML is 64-bit only and suitable for unsafe math only, as
30099 it does not correctly support parts of IEEE arithmetic, such as
30100 denormals, with the required precision. */
30101 if (!TARGET_64BIT
30102 || !flag_unsafe_math_optimizations)
30103 return NULL_TREE;
30104
30105 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30106 n = TYPE_VECTOR_SUBPARTS (type_out);
30107 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30108 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30109 if (el_mode != in_mode
30110 || n != in_n)
30111 return NULL_TREE;
30112
30113 switch (fn)
30114 {
30115 case BUILT_IN_SIN:
30116 case BUILT_IN_COS:
30117 case BUILT_IN_EXP:
30118 case BUILT_IN_LOG:
30119 case BUILT_IN_LOG2:
30120 case BUILT_IN_LOG10:
30121 name[4] = 'd';
30122 name[5] = '2';
30123 if (el_mode != DFmode
30124 || n != 2)
30125 return NULL_TREE;
30126 break;
30127
30128 case BUILT_IN_SINF:
30129 case BUILT_IN_COSF:
30130 case BUILT_IN_EXPF:
30131 case BUILT_IN_POWF:
30132 case BUILT_IN_LOGF:
30133 case BUILT_IN_LOG2F:
30134 case BUILT_IN_LOG10F:
30135 name[4] = 's';
30136 name[5] = '4';
30137 if (el_mode != SFmode
30138 || n != 4)
30139 return NULL_TREE;
30140 break;
30141
30142 default:
30143 return NULL_TREE;
30144 }
30145
30146 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30147 sprintf (name + 7, "%s", bname+10);
30148
30149 arity = 0;
30150 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30151 args;
30152 args = TREE_CHAIN (args))
30153 arity++;
30154
30155 if (arity == 1)
30156 fntype = build_function_type_list (type_out, type_in, NULL);
30157 else
30158 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30159
30160 /* Build a function declaration for the vectorized function. */
30161 new_fndecl = build_decl (BUILTINS_LOCATION,
30162 FUNCTION_DECL, get_identifier (name), fntype);
30163 TREE_PUBLIC (new_fndecl) = 1;
30164 DECL_EXTERNAL (new_fndecl) = 1;
30165 DECL_IS_NOVOPS (new_fndecl) = 1;
30166 TREE_READONLY (new_fndecl) = 1;
30167
30168 return new_fndecl;
30169 }
30170
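/* Example of the resulting ACML names (a sketch): BUILT_IN_SIN on
   2 doubles yields "__vrd2_sin", BUILT_IN_SINF on 4 floats yields
   "__vrs4_sinf".  */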
30171 /* Returns a decl of a function that implements a gather load with
30172 memory vector type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
30173 Return NULL_TREE if it is not available. */
30174
30175 static tree
30176 ix86_vectorize_builtin_gather (const_tree mem_vectype,
30177 const_tree index_type, int scale)
30178 {
30179 bool si;
30180 enum ix86_builtins code;
30181
30182 if (! TARGET_AVX2)
30183 return NULL_TREE;
30184
30185 if ((TREE_CODE (index_type) != INTEGER_TYPE
30186 && !POINTER_TYPE_P (index_type))
30187 || (TYPE_MODE (index_type) != SImode
30188 && TYPE_MODE (index_type) != DImode))
30189 return NULL_TREE;
30190
30191 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
30192 return NULL_TREE;
30193
30194 /* The v*gather* insns sign-extend the index to pointer mode. */
30195 if (TYPE_PRECISION (index_type) < POINTER_SIZE
30196 && TYPE_UNSIGNED (index_type))
30197 return NULL_TREE;
30198
30199 if (scale <= 0
30200 || scale > 8
30201 || (scale & (scale - 1)) != 0)
30202 return NULL_TREE;
30203
30204 si = TYPE_MODE (index_type) == SImode;
30205 switch (TYPE_MODE (mem_vectype))
30206 {
30207 case V2DFmode:
30208 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
30209 break;
30210 case V4DFmode:
30211 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
30212 break;
30213 case V2DImode:
30214 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
30215 break;
30216 case V4DImode:
30217 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
30218 break;
30219 case V4SFmode:
30220 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
30221 break;
30222 case V8SFmode:
30223 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
30224 break;
30225 case V4SImode:
30226 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
30227 break;
30228 case V8SImode:
30229 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
30230 break;
30231 default:
30232 return NULL_TREE;
30233 }
30234
30235 return ix86_builtins[code];
30236 }
30237
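/* For instance (a sketch), vectorizing

     for (i = 0; i < n; i++)
       a[i] = b[idx[i]];   with double a[], b[] and int idx[]

   under -mavx2 queries this hook with a V4DF memory vector type and an
   SImode index type, and gets back IX86_BUILTIN_GATHERALTSIV4DF so the
   load can be done with vgatherdpd.  */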
30238 /* Returns a decl for a target-specific builtin that implements the
30239 reciprocal of the function FN, or NULL_TREE if not available. */
30240
30241 static tree
30242 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
30243 bool sqrt ATTRIBUTE_UNUSED)
30244 {
30245 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
30246 && flag_finite_math_only && !flag_trapping_math
30247 && flag_unsafe_math_optimizations))
30248 return NULL_TREE;
30249
30250 if (md_fn)
30251 /* Machine dependent builtins. */
30252 switch (fn)
30253 {
30254 /* Vectorized version of sqrt to rsqrt conversion. */
30255 case IX86_BUILTIN_SQRTPS_NR:
30256 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
30257
30258 case IX86_BUILTIN_SQRTPS_NR256:
30259 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
30260
30261 default:
30262 return NULL_TREE;
30263 }
30264 else
30265 /* Normal builtins. */
30266 switch (fn)
30267 {
30268 /* Sqrt to rsqrt conversion. */
30269 case BUILT_IN_SQRTF:
30270 return ix86_builtins[IX86_BUILTIN_RSQRTF];
30271
30272 default:
30273 return NULL_TREE;
30274 }
30275 }
30276 \f
30277 /* Helper for avx_vpermilps256_operand et al. This is also used by
30278 the expansion functions to turn the parallel back into a mask.
30279 The return value is 0 for no match and the imm8+1 for a match. */
30280
30281 int
30282 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
30283 {
30284 unsigned i, nelt = GET_MODE_NUNITS (mode);
30285 unsigned mask = 0;
30286 unsigned char ipar[8];
30287
30288 if (XVECLEN (par, 0) != (int) nelt)
30289 return 0;
30290
30291 /* Validate that all of the elements are constants, and not totally
30292 out of range. Copy the data into an integral array to make the
30293 subsequent checks easier. */
30294 for (i = 0; i < nelt; ++i)
30295 {
30296 rtx er = XVECEXP (par, 0, i);
30297 unsigned HOST_WIDE_INT ei;
30298
30299 if (!CONST_INT_P (er))
30300 return 0;
30301 ei = INTVAL (er);
30302 if (ei >= nelt)
30303 return 0;
30304 ipar[i] = ei;
30305 }
30306
30307 switch (mode)
30308 {
30309 case V4DFmode:
30310 /* In the 256-bit DFmode case, we can only move elements within
30311 a 128-bit lane. */
30312 for (i = 0; i < 2; ++i)
30313 {
30314 if (ipar[i] >= 2)
30315 return 0;
30316 mask |= ipar[i] << i;
30317 }
30318 for (i = 2; i < 4; ++i)
30319 {
30320 if (ipar[i] < 2)
30321 return 0;
30322 mask |= (ipar[i] - 2) << i;
30323 }
30324 break;
30325
30326 case V8SFmode:
30327 /* In the 256-bit SFmode case, we have full freedom of movement
30328 within the low 128-bit lane, but the high 128-bit lane must
30329 mirror the exact same pattern. */
30330 for (i = 0; i < 4; ++i)
30331 if (ipar[i] + 4 != ipar[i + 4])
30332 return 0;
30333 nelt = 4;
30334 /* FALLTHRU */
30335
30336 case V2DFmode:
30337 case V4SFmode:
30338 /* In the 128-bit case, we've full freedom in the placement of
30339 the elements from the source operand. */
30340 for (i = 0; i < nelt; ++i)
30341 mask |= ipar[i] << (i * (nelt / 2));
30342 break;
30343
30344 default:
30345 gcc_unreachable ();
30346 }
30347
30348 /* Make sure success has a non-zero value by adding one. */
30349 return mask + 1;
30350 }
30351
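/* Example (a sketch): for V4SFmode the selector (parallel [3 2 1 0])
   encodes as imm8 0x1b, the usual full-reverse vpermilps mask, and the
   function returns 0x1b + 1 = 0x1c.  */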
30352 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
30353 the expansion functions to turn the parallel back into a mask.
30354 The return value is 0 for no match and the imm8+1 for a match. */
30355
30356 int
30357 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
30358 {
30359 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
30360 unsigned mask = 0;
30361 unsigned char ipar[8];
30362
30363 if (XVECLEN (par, 0) != (int) nelt)
30364 return 0;
30365
30366 /* Validate that all of the elements are constants, and not totally
30367 out of range. Copy the data into an integral array to make the
30368 subsequent checks easier. */
30369 for (i = 0; i < nelt; ++i)
30370 {
30371 rtx er = XVECEXP (par, 0, i);
30372 unsigned HOST_WIDE_INT ei;
30373
30374 if (!CONST_INT_P (er))
30375 return 0;
30376 ei = INTVAL (er);
30377 if (ei >= 2 * nelt)
30378 return 0;
30379 ipar[i] = ei;
30380 }
30381
30382 /* Validate that each half of the permute selects consecutive elements. */
30383 for (i = 0; i < nelt2 - 1; ++i)
30384 if (ipar[i] + 1 != ipar[i + 1])
30385 return 0;
30386 for (i = nelt2; i < nelt - 1; ++i)
30387 if (ipar[i] + 1 != ipar[i + 1])
30388 return 0;
30389
30390 /* Reconstruct the mask. */
30391 for (i = 0; i < 2; ++i)
30392 {
30393 unsigned e = ipar[i * nelt2];
30394 if (e % nelt2)
30395 return 0;
30396 e /= nelt2;
30397 mask |= e << (i * 4);
30398 }
30399
30400 /* Make sure success has a non-zero value by adding one. */
30401 return mask + 1;
30402 }
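/* Example (a sketch): for V8SFmode the selector
   (parallel [8 9 10 11 0 1 2 3]) encodes as imm8 0x02, selecting the
   low half of the second source for the low lane and the low half of
   the first source for the high lane, and the function returns
   0x02 + 1 = 3.  */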
30403 \f
30404 /* Store OPERAND to the memory after reload is completed. This means
30405 that we can't easily use assign_stack_local. */
30406 rtx
30407 ix86_force_to_memory (enum machine_mode mode, rtx operand)
30408 {
30409 rtx result;
30410
30411 gcc_assert (reload_completed);
30412 if (ix86_using_red_zone ())
30413 {
30414 result = gen_rtx_MEM (mode,
30415 gen_rtx_PLUS (Pmode,
30416 stack_pointer_rtx,
30417 GEN_INT (-RED_ZONE_SIZE)));
30418 emit_move_insn (result, operand);
30419 }
30420 else if (TARGET_64BIT)
30421 {
30422 switch (mode)
30423 {
30424 case HImode:
30425 case SImode:
30426 operand = gen_lowpart (DImode, operand);
30427 /* FALLTHRU */
30428 case DImode:
30429 emit_insn (
30430 gen_rtx_SET (VOIDmode,
30431 gen_rtx_MEM (DImode,
30432 gen_rtx_PRE_DEC (DImode,
30433 stack_pointer_rtx)),
30434 operand));
30435 break;
30436 default:
30437 gcc_unreachable ();
30438 }
30439 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30440 }
30441 else
30442 {
30443 switch (mode)
30444 {
30445 case DImode:
30446 {
30447 rtx operands[2];
30448 split_double_mode (mode, &operand, 1, operands, operands + 1);
30449 emit_insn (
30450 gen_rtx_SET (VOIDmode,
30451 gen_rtx_MEM (SImode,
30452 gen_rtx_PRE_DEC (Pmode,
30453 stack_pointer_rtx)),
30454 operands[1]));
30455 emit_insn (
30456 gen_rtx_SET (VOIDmode,
30457 gen_rtx_MEM (SImode,
30458 gen_rtx_PRE_DEC (Pmode,
30459 stack_pointer_rtx)),
30460 operands[0]));
30461 }
30462 break;
30463 case HImode:
30464 /* Store HImodes as SImodes. */
30465 operand = gen_lowpart (SImode, operand);
30466 /* FALLTHRU */
30467 case SImode:
30468 emit_insn (
30469 gen_rtx_SET (VOIDmode,
30470 gen_rtx_MEM (GET_MODE (operand),
30471 gen_rtx_PRE_DEC (SImode,
30472 stack_pointer_rtx)),
30473 operand));
30474 break;
30475 default:
30476 gcc_unreachable ();
30477 }
30478 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30479 }
30480 return result;
30481 }
30482
30483 /* Free operand from the memory. */
30484 void
30485 ix86_free_from_memory (enum machine_mode mode)
30486 {
30487 if (!ix86_using_red_zone ())
30488 {
30489 int size;
30490
30491 if (mode == DImode || TARGET_64BIT)
30492 size = 8;
30493 else
30494 size = 4;
30495 /* Use LEA to deallocate stack space. In peephole2 it will be converted
30496 to a pop or add instruction if registers are available. */
30497 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
30498 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
30499 GEN_INT (size))));
30500 }
30501 }
30502
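/* A sketch of how a caller pairs the two helpers above:

     rtx mem = ix86_force_to_memory (DImode, operand);
     ... use MEM as a memory operand ...
     ix86_free_from_memory (DImode);

   i.e. a red-zone store or push, followed later by restoring the stack
   pointer.  */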
30503 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
30504
30505 Put float CONST_DOUBLE in the constant pool instead of fp regs.
30506 QImode must go into class Q_REGS.
30507 Narrow ALL_REGS to GENERAL_REGS. This allows movsf and
30508 movdf to do mem-to-mem moves through integer regs. */
30509
30510 static reg_class_t
30511 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
30512 {
30513 enum machine_mode mode = GET_MODE (x);
30514
30515 /* We're only allowed to return a subclass of CLASS. Many of the
30516 following checks fail for NO_REGS, so eliminate that early. */
30517 if (regclass == NO_REGS)
30518 return NO_REGS;
30519
30520 /* All classes can load zeros. */
30521 if (x == CONST0_RTX (mode))
30522 return regclass;
30523
30524 /* Force constants into memory if we are loading a (nonzero) constant into
30525 an MMX or SSE register. This is because there are no MMX/SSE instructions
30526 to load from a constant. */
30527 if (CONSTANT_P (x)
30528 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
30529 return NO_REGS;
30530
30531 /* Prefer SSE regs only, if we can use them for math. */
30532 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
30533 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
30534
30535 /* Floating-point constants need more complex checks. */
30536 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
30537 {
30538 /* General regs can load everything. */
30539 if (reg_class_subset_p (regclass, GENERAL_REGS))
30540 return regclass;
30541
30542 /* Floats can load 0 and 1 plus some others. Note that we eliminated
30543 zero above. We only want to wind up preferring 80387 registers if
30544 we plan on doing computation with them. */
30545 if (TARGET_80387
30546 && standard_80387_constant_p (x) > 0)
30547 {
30548 /* Limit class to non-sse. */
30549 if (regclass == FLOAT_SSE_REGS)
30550 return FLOAT_REGS;
30551 if (regclass == FP_TOP_SSE_REGS)
30552 return FP_TOP_REG;
30553 if (regclass == FP_SECOND_SSE_REGS)
30554 return FP_SECOND_REG;
30555 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
30556 return regclass;
30557 }
30558
30559 return NO_REGS;
30560 }
30561
30562 /* Generally when we see PLUS here, it's the function invariant
30563 (plus soft-fp const_int), which can only be computed into general
30564 regs. */
30565 if (GET_CODE (x) == PLUS)
30566 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
30567
30568 /* QImode constants are easy to load, but non-constant QImode data
30569 must go into Q_REGS. */
30570 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
30571 {
30572 if (reg_class_subset_p (regclass, Q_REGS))
30573 return regclass;
30574 if (reg_class_subset_p (Q_REGS, regclass))
30575 return Q_REGS;
30576 return NO_REGS;
30577 }
30578
30579 return regclass;
30580 }
30581
30582 /* Discourage putting floating-point values in SSE registers unless
30583 SSE math is being used, and likewise for the 387 registers. */
30584 static reg_class_t
30585 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
30586 {
30587 enum machine_mode mode = GET_MODE (x);
30588
30589 /* Restrict the output reload class to the register bank that we are doing
30590 math on. Rather than returning a class that is not a subset of CLASS,
30591 reject the alternative by returning NO_REGS; if reload cannot honor this,
it will still use its own choice. */
30592 mode = GET_MODE (x);
30593 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
30594 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
30595
30596 if (X87_FLOAT_MODE_P (mode))
30597 {
30598 if (regclass == FP_TOP_SSE_REGS)
30599 return FP_TOP_REG;
30600 else if (regclass == FP_SECOND_SSE_REGS)
30601 return FP_SECOND_REG;
30602 else
30603 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
30604 }
30605
30606 return regclass;
30607 }
30608
30609 static reg_class_t
30610 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
30611 enum machine_mode mode, secondary_reload_info *sri)
30612 {
30613 /* Double-word spills from general registers to non-offsettable memory
30614 references (zero-extended addresses) require special handling. */
30615 if (TARGET_64BIT
30616 && MEM_P (x)
30617 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
30618 && rclass == GENERAL_REGS
30619 && !offsettable_memref_p (x))
30620 {
30621 sri->icode = (in_p
30622 ? CODE_FOR_reload_noff_load
30623 : CODE_FOR_reload_noff_store);
30624 /* Add the cost of moving address to a temporary. */
30625 sri->extra_cost = 1;
30626
30627 return NO_REGS;
30628 }
30629
30630 /* QImode spills from non-QI registers require an
30631 intermediate register on 32-bit targets. */
30632 if (!TARGET_64BIT
30633 && !in_p && mode == QImode
30634 && (rclass == GENERAL_REGS
30635 || rclass == LEGACY_REGS
30636 || rclass == INDEX_REGS))
30637 {
30638 int regno;
30639
30640 if (REG_P (x))
30641 regno = REGNO (x);
30642 else
30643 regno = -1;
30644
30645 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
30646 regno = true_regnum (x);
30647
30648 /* Return Q_REGS if the operand is in memory. */
30649 if (regno == -1)
30650 return Q_REGS;
30651 }
30652
30653 /* This condition handles corner case where an expression involving
30654 pointers gets vectorized. We're trying to use the address of a
30655 stack slot as a vector initializer.
30656
30657 (set (reg:V2DI 74 [ vect_cst_.2 ])
30658 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
30659
30660 Eventually frame gets turned into sp+offset like this:
30661
30662 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30663 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30664 (const_int 392 [0x188]))))
30665
30666 That later gets turned into:
30667
30668 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30669 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30670 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
30671
30672 We'll have the following reload recorded:
30673
30674 Reload 0: reload_in (DI) =
30675 (plus:DI (reg/f:DI 7 sp)
30676 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
30677 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30678 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
30679 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
30680 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30681 reload_reg_rtx: (reg:V2DI 22 xmm1)
30682
30683 Which isn't going to work since SSE instructions can't handle scalar
30684 additions. Returning GENERAL_REGS forces the addition into integer
30685 register and reload can handle subsequent reloads without problems. */
30686
30687 if (in_p && GET_CODE (x) == PLUS
30688 && SSE_CLASS_P (rclass)
30689 && SCALAR_INT_MODE_P (mode))
30690 return GENERAL_REGS;
30691
30692 return NO_REGS;
30693 }
30694
30695 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
30696
30697 static bool
30698 ix86_class_likely_spilled_p (reg_class_t rclass)
30699 {
30700 switch (rclass)
30701 {
30702 case AREG:
30703 case DREG:
30704 case CREG:
30705 case BREG:
30706 case AD_REGS:
30707 case SIREG:
30708 case DIREG:
30709 case SSE_FIRST_REG:
30710 case FP_TOP_REG:
30711 case FP_SECOND_REG:
30712 return true;
30713
30714 default:
30715 break;
30716 }
30717
30718 return false;
30719 }
30720
30721 /* If we are copying between general and FP registers, we need a memory
30722 location. The same is true for SSE and MMX registers.
30723
30724 To optimize register_move_cost performance, allow inline variant.
30725
30726 The check can't work reliably when one of the CLASSES is a class containing
30727 registers from multiple units (SSE, MMX, integer). We avoid this by never
30728 combining those units in a single alternative in the machine description.
30729 Ensure that this constraint holds to avoid unexpected surprises.
30730
30731 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
30732 enforce these sanity checks. */
30733
30734 static inline bool
30735 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30736 enum machine_mode mode, int strict)
30737 {
30738 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
30739 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
30740 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
30741 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
30742 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
30743 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
30744 {
30745 gcc_assert (!strict);
30746 return true;
30747 }
30748
30749 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
30750 return true;
30751
30752 /* ??? This is a lie. We do have moves between mmx and general registers,
30753 and between mmx and sse2. But by saying we need secondary memory we
30754 discourage the register allocator from using the mmx registers unless needed. */
30755 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
30756 return true;
30757
30758 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30759 {
30760 /* SSE1 doesn't have any direct moves from other classes. */
30761 if (!TARGET_SSE2)
30762 return true;
30763
30764 /* If the target says that inter-unit moves are more expensive
30765 than moving through memory, then don't generate them. */
30766 if (!TARGET_INTER_UNIT_MOVES)
30767 return true;
30768
30769 /* Between SSE and general, we have moves no larger than word size. */
30770 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
30771 return true;
30772 }
30773
30774 return false;
30775 }
30776
30777 bool
30778 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30779 enum machine_mode mode, int strict)
30780 {
30781 return inline_secondary_memory_needed (class1, class2, mode, strict);
30782 }
30783
30784 /* Implement the TARGET_CLASS_MAX_NREGS hook.
30785
30786 On the 80386, this is the size of MODE in words,
30787 except in the FP regs, where a single reg is always enough. */
30788
30789 static unsigned char
30790 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
30791 {
30792 if (MAYBE_INTEGER_CLASS_P (rclass))
30793 {
30794 if (mode == XFmode)
30795 return (TARGET_64BIT ? 2 : 3);
30796 else if (mode == XCmode)
30797 return (TARGET_64BIT ? 4 : 6);
30798 else
30799 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
30800 }
30801 else
30802 {
30803 if (COMPLEX_MODE_P (mode))
30804 return 2;
30805 else
30806 return 1;
30807 }
30808 }
30809
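/* Example of the values returned above (a sketch): XFmode in an integer
   class needs 3 registers on ia32 but 2 on x86-64, XCmode needs 6 or 4,
   while in the FP or SSE classes a single register suffices (2 for a
   complex mode).  */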
30810 /* Return true if the registers in CLASS cannot represent the change from
30811 modes FROM to TO. */
30812
30813 bool
30814 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
30815 enum reg_class regclass)
30816 {
30817 if (from == to)
30818 return false;
30819
30820 /* x87 registers can't do subreg at all, as all values are reformatted
30821 to extended precision. */
30822 if (MAYBE_FLOAT_CLASS_P (regclass))
30823 return true;
30824
30825 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
30826 {
30827 /* Vector registers do not support QI or HImode loads. If we don't
30828 disallow a change to these modes, reload will assume it's ok to
30829 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
30830 the vec_dupv4hi pattern. */
30831 if (GET_MODE_SIZE (from) < 4)
30832 return true;
30833
30834 /* Vector registers do not support subreg with nonzero offsets, which
30835 are otherwise valid for integer registers. Since we can't see
30836 whether we have a nonzero offset from here, prohibit all
30837 nonparadoxical subregs changing size. */
30838 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
30839 return true;
30840 }
30841
30842 return false;
30843 }
30844
30845 /* Return the cost of moving data of mode M between a
30846 register and memory. A value of 2 is the default; this cost is
30847 relative to those in `REGISTER_MOVE_COST'.
30848
30849 This function is used extensively by register_move_cost, which is used to
30850 build tables at startup, so it is kept inline.
30851 When IN is 2, return the maximum of the in and out move costs.
30852
30853 If moving between registers and memory is more expensive than
30854 between two registers, you should define this macro to express the
30855 relative cost.
30856
30857 Also model the increased cost of moving QImode registers in
30858 non-Q_REGS classes.
30859 */
30860 static inline int
30861 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
30862 int in)
30863 {
30864 int cost;
30865 if (FLOAT_CLASS_P (regclass))
30866 {
30867 int index;
30868 switch (mode)
30869 {
30870 case SFmode:
30871 index = 0;
30872 break;
30873 case DFmode:
30874 index = 1;
30875 break;
30876 case XFmode:
30877 index = 2;
30878 break;
30879 default:
30880 return 100;
30881 }
30882 if (in == 2)
30883 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
30884 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
30885 }
30886 if (SSE_CLASS_P (regclass))
30887 {
30888 int index;
30889 switch (GET_MODE_SIZE (mode))
30890 {
30891 case 4:
30892 index = 0;
30893 break;
30894 case 8:
30895 index = 1;
30896 break;
30897 case 16:
30898 index = 2;
30899 break;
30900 default:
30901 return 100;
30902 }
30903 if (in == 2)
30904 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
30905 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
30906 }
30907 if (MMX_CLASS_P (regclass))
30908 {
30909 int index;
30910 switch (GET_MODE_SIZE (mode))
30911 {
30912 case 4:
30913 index = 0;
30914 break;
30915 case 8:
30916 index = 1;
30917 break;
30918 default:
30919 return 100;
30920 }
30921 if (in == 2)
30922 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
30923 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
30924 }
30925 switch (GET_MODE_SIZE (mode))
30926 {
30927 case 1:
30928 if (Q_CLASS_P (regclass) || TARGET_64BIT)
30929 {
30930 if (!in)
30931 return ix86_cost->int_store[0];
30932 if (TARGET_PARTIAL_REG_DEPENDENCY
30933 && optimize_function_for_speed_p (cfun))
30934 cost = ix86_cost->movzbl_load;
30935 else
30936 cost = ix86_cost->int_load[0];
30937 if (in == 2)
30938 return MAX (cost, ix86_cost->int_store[0]);
30939 return cost;
30940 }
30941 else
30942 {
30943 if (in == 2)
30944 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
30945 if (in)
30946 return ix86_cost->movzbl_load;
30947 else
30948 return ix86_cost->int_store[0] + 4;
30949 }
30950 break;
30951 case 2:
30952 if (in == 2)
30953 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
30954 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
30955 default:
30956 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
30957 if (mode == TFmode)
30958 mode = XFmode;
30959 if (in == 2)
30960 cost = MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]);
30961 else if (in)
30962 cost = ix86_cost->int_load[2];
30963 else
30964 cost = ix86_cost->int_store[2];
30965 return (cost * (((int) GET_MODE_SIZE (mode)
30966 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
30967 }
30968 }
30969
30970 static int
30971 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
30972 bool in)
30973 {
30974 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
30975 }
30976
30977
30978 /* Return the cost of moving data from a register in class CLASS1 to
30979 one in class CLASS2.
30980
30981 It is not required that the cost always equal 2 when FROM is the same as TO;
30982 on some machines it is expensive to move between registers if they are not
30983 general registers. */
30984
30985 static int
30986 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
30987 reg_class_t class2_i)
30988 {
30989 enum reg_class class1 = (enum reg_class) class1_i;
30990 enum reg_class class2 = (enum reg_class) class2_i;
30991
30992 /* In case we require secondary memory, compute the cost of the store
30993 followed by the load. To avoid bad register allocation choices, we need
30994 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
30995
30996 if (inline_secondary_memory_needed (class1, class2, mode, 0))
30997 {
30998 int cost = 1;
30999
31000 cost += inline_memory_move_cost (mode, class1, 2);
31001 cost += inline_memory_move_cost (mode, class2, 2);
31002
31003 /* When copying from a general purpose register we may emit multiple
31004 stores followed by a single load, causing a memory size mismatch stall.
31005 Count this as an arbitrarily high cost of 20. */
31006 if (targetm.class_max_nregs (class1, mode)
31007 > targetm.class_max_nregs (class2, mode))
31008 cost += 20;
31009
31010 /* In the case of FP/MMX moves, the registers actually overlap, and we
31011 have to switch modes in order to treat them differently. */
31012 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31013 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31014 cost += 20;
31015
31016 return cost;
31017 }
31018
31019 /* Moves between SSE/MMX and integer unit are expensive. */
31020 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31021 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31022
31023 /* ??? By keeping the returned value relatively high, we limit the number
31024 of moves between integer and MMX/SSE registers for all targets.
31025 Additionally, a high value prevents problems with x86_modes_tieable_p(),
31026 where integer modes in MMX/SSE registers are not tieable
31027 because of missing QImode and HImode moves to, from or between
31028 MMX/SSE registers. */
31029 return MAX (8, ix86_cost->mmxsse_to_integer);
31030
31031 if (MAYBE_FLOAT_CLASS_P (class1))
31032 return ix86_cost->fp_move;
31033 if (MAYBE_SSE_CLASS_P (class1))
31034 return ix86_cost->sse_move;
31035 if (MAYBE_MMX_CLASS_P (class1))
31036 return ix86_cost->mmx_move;
31037 return 2;
31038 }
31039
31040 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31041 MODE. */
31042
31043 bool
31044 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31045 {
31046 /* Only the flags register can hold CCmode values, and it can hold nothing else. */
31047 if (CC_REGNO_P (regno))
31048 return GET_MODE_CLASS (mode) == MODE_CC;
31049 if (GET_MODE_CLASS (mode) == MODE_CC
31050 || GET_MODE_CLASS (mode) == MODE_RANDOM
31051 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31052 return false;
31053 if (FP_REGNO_P (regno))
31054 return VALID_FP_MODE_P (mode);
31055 if (SSE_REGNO_P (regno))
31056 {
31057 /* We implement the move patterns for all vector modes into and
31058 out of SSE registers, even when no operation instructions
31059 are available. OImode move is available only when AVX is
31060 enabled. */
31061 return ((TARGET_AVX && mode == OImode)
31062 || VALID_AVX256_REG_MODE (mode)
31063 || VALID_SSE_REG_MODE (mode)
31064 || VALID_SSE2_REG_MODE (mode)
31065 || VALID_MMX_REG_MODE (mode)
31066 || VALID_MMX_REG_MODE_3DNOW (mode));
31067 }
31068 if (MMX_REGNO_P (regno))
31069 {
31070 /* We implement the move patterns for 3DNOW modes even in MMX mode,
31071 so if the register is available at all, then we can move data of
31072 the given mode into or out of it. */
31073 return (VALID_MMX_REG_MODE (mode)
31074 || VALID_MMX_REG_MODE_3DNOW (mode));
31075 }
31076
31077 if (mode == QImode)
31078 {
31079 /* Take care with QImode values - they can live in non-QI regs,
31080 but doing so can cause partial register stalls. */
31081 if (regno <= BX_REG || TARGET_64BIT)
31082 return true;
31083 if (!TARGET_PARTIAL_REG_STALL)
31084 return true;
31085 return !can_create_pseudo_p ();
31086 }
31087 /* We handle both integer and floats in the general purpose registers. */
31088 else if (VALID_INT_MODE_P (mode))
31089 return true;
31090 else if (VALID_FP_MODE_P (mode))
31091 return true;
31092 else if (VALID_DFP_MODE_P (mode))
31093 return true;
31094 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
31095 on to use that value in smaller contexts, this can easily force a
31096 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
31097 supporting DImode, allow it. */
31098 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
31099 return true;
31100
31101 return false;
31102 }
31103
31104 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
31105 tieable integer mode. */
31106
31107 static bool
31108 ix86_tieable_integer_mode_p (enum machine_mode mode)
31109 {
31110 switch (mode)
31111 {
31112 case HImode:
31113 case SImode:
31114 return true;
31115
31116 case QImode:
31117 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
31118
31119 case DImode:
31120 return TARGET_64BIT;
31121
31122 default:
31123 return false;
31124 }
31125 }
31126
31127 /* Return true if MODE1 is accessible in a register that can hold MODE2
31128 without copying. That is, all register classes that can hold MODE2
31129 can also hold MODE1. */
31130
31131 bool
31132 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
31133 {
31134 if (mode1 == mode2)
31135 return true;
31136
31137 if (ix86_tieable_integer_mode_p (mode1)
31138 && ix86_tieable_integer_mode_p (mode2))
31139 return true;
31140
31141 /* MODE2 being XFmode implies fp stack or general regs, which means we
31142 can tie any smaller floating point modes to it. Note that we do not
31143 tie this with TFmode. */
31144 if (mode2 == XFmode)
31145 return mode1 == SFmode || mode1 == DFmode;
31146
31147 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
31148 that we can tie it with SFmode. */
31149 if (mode2 == DFmode)
31150 return mode1 == SFmode;
31151
31152 /* If MODE2 is only appropriate for an SSE register, then tie with
31153 any other mode acceptable to SSE registers. */
31154 if (GET_MODE_SIZE (mode2) == 16
31155 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31156 return (GET_MODE_SIZE (mode1) == 16
31157 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31158
31159 /* If MODE2 is appropriate for an MMX register, then tie
31160 with any other mode acceptable to MMX registers. */
31161 if (GET_MODE_SIZE (mode2) == 8
31162 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
31163 return (GET_MODE_SIZE (mode1) == 8
31164 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
31165
31166 return false;
31167 }
31168
31169 /* Compute a (partial) cost for rtx X. Return true if the complete
31170 cost has been computed, and false if subexpressions should be
31171 scanned. In either case, *TOTAL contains the cost result. */
31172
31173 static bool
31174 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
31175 bool speed)
31176 {
31177 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
31178 enum machine_mode mode = GET_MODE (x);
31179 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
31180
31181 switch (code)
31182 {
31183 case CONST_INT:
31184 case CONST:
31185 case LABEL_REF:
31186 case SYMBOL_REF:
31187 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
31188 *total = 3;
31189 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
31190 *total = 2;
31191 else if (flag_pic && SYMBOLIC_CONST (x)
31192 && (!TARGET_64BIT
31193 || (GET_CODE (x) != LABEL_REF
31194 && (GET_CODE (x) != SYMBOL_REF
31195 || !SYMBOL_REF_LOCAL_P (x)))))
31196 *total = 1;
31197 else
31198 *total = 0;
31199 return true;
31200
31201 case CONST_DOUBLE:
31202 if (mode == VOIDmode)
31203 *total = 0;
31204 else
31205 switch (standard_80387_constant_p (x))
31206 {
31207 case 1: /* 0.0 */
31208 *total = 1;
31209 break;
31210 default: /* Other constants */
31211 *total = 2;
31212 break;
31213 case 0:
31214 case -1:
31215 /* Start with (MEM (SYMBOL_REF)), since that's where
31216 it'll probably end up. Add a penalty for size. */
31217 *total = (COSTS_N_INSNS (1)
31218 + (flag_pic != 0 && !TARGET_64BIT)
31219 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
31220 break;
31221 }
31222 return true;
31223
31224 case ZERO_EXTEND:
31225 /* Zero extension is often completely free on x86_64, so make
31226 it as cheap as possible. */
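      /* For instance, on x86_64 a write to a 32-bit register implicitly
         zeroes the upper 32 bits ("movl %eax, %eax" already performs a full
         DImode zero extension), which is why the SImode -> DImode case
         below costs only 1.  */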
31227 if (TARGET_64BIT && mode == DImode
31228 && GET_MODE (XEXP (x, 0)) == SImode)
31229 *total = 1;
31230 else if (TARGET_ZERO_EXTEND_WITH_AND)
31231 *total = cost->add;
31232 else
31233 *total = cost->movzx;
31234 return false;
31235
31236 case SIGN_EXTEND:
31237 *total = cost->movsx;
31238 return false;
31239
31240 case ASHIFT:
31241 if (CONST_INT_P (XEXP (x, 1))
31242 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
31243 {
31244 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31245 if (value == 1)
31246 {
31247 *total = cost->add;
31248 return false;
31249 }
31250 if ((value == 2 || value == 3)
31251 && cost->lea <= cost->shift_const)
31252 {
31253 *total = cost->lea;
31254 return false;
31255 }
31256 }
31257 /* FALLTHRU */
31258
31259 case ROTATE:
31260 case ASHIFTRT:
31261 case LSHIFTRT:
31262 case ROTATERT:
31263 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
31264 {
31265 if (CONST_INT_P (XEXP (x, 1)))
31266 {
31267 if (INTVAL (XEXP (x, 1)) > 32)
31268 *total = cost->shift_const + COSTS_N_INSNS (2);
31269 else
31270 *total = cost->shift_const * 2;
31271 }
31272 else
31273 {
31274 if (GET_CODE (XEXP (x, 1)) == AND)
31275 *total = cost->shift_var * 2;
31276 else
31277 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
31278 }
31279 }
31280 else
31281 {
31282 if (CONST_INT_P (XEXP (x, 1)))
31283 *total = cost->shift_const;
31284 else
31285 *total = cost->shift_var;
31286 }
31287 return false;
31288
31289 case FMA:
31290 {
31291 rtx sub;
31292
31293 gcc_assert (FLOAT_MODE_P (mode));
31294 gcc_assert (TARGET_FMA || TARGET_FMA4);
31295
31296 /* ??? SSE scalar/vector cost should be used here. */
31297 /* ??? Bald assumption that fma has the same cost as fmul. */
31298 *total = cost->fmul;
31299 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
31300
31301 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
31302 sub = XEXP (x, 0);
31303 if (GET_CODE (sub) == NEG)
31304 sub = XEXP (sub, 0);
31305 *total += rtx_cost (sub, FMA, 0, speed);
31306
31307 sub = XEXP (x, 2);
31308 if (GET_CODE (sub) == NEG)
31309 sub = XEXP (sub, 0);
31310 *total += rtx_cost (sub, FMA, 2, speed);
31311 return true;
31312 }
31313
31314 case MULT:
31315 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31316 {
31317 /* ??? SSE scalar cost should be used here. */
31318 *total = cost->fmul;
31319 return false;
31320 }
31321 else if (X87_FLOAT_MODE_P (mode))
31322 {
31323 *total = cost->fmul;
31324 return false;
31325 }
31326 else if (FLOAT_MODE_P (mode))
31327 {
31328 /* ??? SSE vector cost should be used here. */
31329 *total = cost->fmul;
31330 return false;
31331 }
31332 else
31333 {
31334 rtx op0 = XEXP (x, 0);
31335 rtx op1 = XEXP (x, 1);
31336 int nbits;
31337 if (CONST_INT_P (XEXP (x, 1)))
31338 {
31339 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
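              /* The loop below computes the population count of VALUE; e.g. a
                 constant multiplier of 10 (binary 1010) gives NBITS == 2, so the
                 multiply is costed as mult_init + 2 * mult_bit plus the operand
                 costs.  */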
31340 for (nbits = 0; value != 0; value &= value - 1)
31341 nbits++;
31342 }
31343 else
31344 /* This is arbitrary. */
31345 nbits = 7;
31346
31347 /* Compute costs correctly for widening multiplication. */
31348 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
31349 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
31350 == GET_MODE_SIZE (mode))
31351 {
31352 int is_mulwiden = 0;
31353 enum machine_mode inner_mode = GET_MODE (op0);
31354
31355 if (GET_CODE (op0) == GET_CODE (op1))
31356 is_mulwiden = 1, op1 = XEXP (op1, 0);
31357 else if (CONST_INT_P (op1))
31358 {
31359 if (GET_CODE (op0) == SIGN_EXTEND)
31360 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
31361 == INTVAL (op1);
31362 else
31363 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
31364 }
31365
31366 if (is_mulwiden)
31367 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
31368 }
31369
31370 *total = (cost->mult_init[MODE_INDEX (mode)]
31371 + nbits * cost->mult_bit
31372 + rtx_cost (op0, outer_code, opno, speed)
31373 + rtx_cost (op1, outer_code, opno, speed));
31374
31375 return true;
31376 }
31377
31378 case DIV:
31379 case UDIV:
31380 case MOD:
31381 case UMOD:
31382 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31383 /* ??? SSE cost should be used here. */
31384 *total = cost->fdiv;
31385 else if (X87_FLOAT_MODE_P (mode))
31386 *total = cost->fdiv;
31387 else if (FLOAT_MODE_P (mode))
31388 /* ??? SSE vector cost should be used here. */
31389 *total = cost->fdiv;
31390 else
31391 *total = cost->divide[MODE_INDEX (mode)];
31392 return false;
31393
31394 case PLUS:
31395 if (GET_MODE_CLASS (mode) == MODE_INT
31396 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
31397 {
31398 if (GET_CODE (XEXP (x, 0)) == PLUS
31399 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
31400 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
31401 && CONSTANT_P (XEXP (x, 1)))
31402 {
31403 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
31404 if (val == 2 || val == 4 || val == 8)
31405 {
31406 *total = cost->lea;
31407 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31408 outer_code, opno, speed);
31409 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
31410 outer_code, opno, speed);
31411 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31412 return true;
31413 }
31414 }
31415 else if (GET_CODE (XEXP (x, 0)) == MULT
31416 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
31417 {
31418 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
31419 if (val == 2 || val == 4 || val == 8)
31420 {
31421 *total = cost->lea;
31422 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31423 outer_code, opno, speed);
31424 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31425 return true;
31426 }
31427 }
31428 else if (GET_CODE (XEXP (x, 0)) == PLUS)
31429 {
31430 *total = cost->lea;
31431 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31432 outer_code, opno, speed);
31433 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31434 outer_code, opno, speed);
31435 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31436 return true;
31437 }
31438 }
31439 /* FALLTHRU */
31440
31441 case MINUS:
31442 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31443 {
31444 /* ??? SSE cost should be used here. */
31445 *total = cost->fadd;
31446 return false;
31447 }
31448 else if (X87_FLOAT_MODE_P (mode))
31449 {
31450 *total = cost->fadd;
31451 return false;
31452 }
31453 else if (FLOAT_MODE_P (mode))
31454 {
31455 /* ??? SSE vector cost should be used here. */
31456 *total = cost->fadd;
31457 return false;
31458 }
31459 /* FALLTHRU */
31460
31461 case AND:
31462 case IOR:
31463 case XOR:
31464 if (!TARGET_64BIT && mode == DImode)
31465 {
31466 *total = (cost->add * 2
31467 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
31468 << (GET_MODE (XEXP (x, 0)) != DImode))
31469 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
31470 << (GET_MODE (XEXP (x, 1)) != DImode)));
31471 return true;
31472 }
31473 /* FALLTHRU */
31474
31475 case NEG:
31476 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31477 {
31478 /* ??? SSE cost should be used here. */
31479 *total = cost->fchs;
31480 return false;
31481 }
31482 else if (X87_FLOAT_MODE_P (mode))
31483 {
31484 *total = cost->fchs;
31485 return false;
31486 }
31487 else if (FLOAT_MODE_P (mode))
31488 {
31489 /* ??? SSE vector cost should be used here. */
31490 *total = cost->fchs;
31491 return false;
31492 }
31493 /* FALLTHRU */
31494
31495 case NOT:
31496 if (!TARGET_64BIT && mode == DImode)
31497 *total = cost->add * 2;
31498 else
31499 *total = cost->add;
31500 return false;
31501
31502 case COMPARE:
31503 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
31504 && XEXP (XEXP (x, 0), 1) == const1_rtx
31505 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
31506 && XEXP (x, 1) == const0_rtx)
31507 {
31508 /* This kind of construct is implemented using test[bwl].
31509 Treat it as if we had an AND. */
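          /* For example, a single-bit test such as "if (x & (1 << 5))"
             matches this shape and is costed roughly like "testl $32, x"
             (illustrative).  */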
31510 *total = (cost->add
31511 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
31512 + rtx_cost (const1_rtx, outer_code, opno, speed));
31513 return true;
31514 }
31515 return false;
31516
31517 case FLOAT_EXTEND:
31518 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
31519 *total = 0;
31520 return false;
31521
31522 case ABS:
31523 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31524 /* ??? SSE cost should be used here. */
31525 *total = cost->fabs;
31526 else if (X87_FLOAT_MODE_P (mode))
31527 *total = cost->fabs;
31528 else if (FLOAT_MODE_P (mode))
31529 /* ??? SSE vector cost should be used here. */
31530 *total = cost->fabs;
31531 return false;
31532
31533 case SQRT:
31534 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31535 /* ??? SSE cost should be used here. */
31536 *total = cost->fsqrt;
31537 else if (X87_FLOAT_MODE_P (mode))
31538 *total = cost->fsqrt;
31539 else if (FLOAT_MODE_P (mode))
31540 /* ??? SSE vector cost should be used here. */
31541 *total = cost->fsqrt;
31542 return false;
31543
31544 case UNSPEC:
31545 if (XINT (x, 1) == UNSPEC_TP)
31546 *total = 0;
31547 return false;
31548
31549 case VEC_SELECT:
31550 case VEC_CONCAT:
31551 case VEC_MERGE:
31552 case VEC_DUPLICATE:
31553 /* ??? Assume all of these vector manipulation patterns are
31554 recognizable, in which case they all have pretty much the
31555 same cost. */
31556 *total = COSTS_N_INSNS (1);
31557 return true;
31558
31559 default:
31560 return false;
31561 }
31562 }
31563
31564 #if TARGET_MACHO
31565
31566 static int current_machopic_label_num;
31567
31568 /* Given a symbol name and its associated stub, write out the
31569 definition of the stub. */
31570
31571 void
31572 machopic_output_stub (FILE *file, const char *symb, const char *stub)
31573 {
31574 unsigned int length;
31575 char *binder_name, *symbol_name, lazy_ptr_name[32];
31576 int label = ++current_machopic_label_num;
31577
31578 /* For 64-bit we shouldn't get here. */
31579 gcc_assert (!TARGET_64BIT);
31580
31581 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
31582 symb = targetm.strip_name_encoding (symb);
31583
31584 length = strlen (stub);
31585 binder_name = XALLOCAVEC (char, length + 32);
31586 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
31587
31588 length = strlen (symb);
31589 symbol_name = XALLOCAVEC (char, length + 32);
31590 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
31591
31592 sprintf (lazy_ptr_name, "L%d$lz", label);
31593
31594 if (MACHOPIC_ATT_STUB)
31595 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
31596 else if (MACHOPIC_PURE)
31597 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
31598 else
31599 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
31600
31601 fprintf (file, "%s:\n", stub);
31602 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31603
31604 if (MACHOPIC_ATT_STUB)
31605 {
31606 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
31607 }
31608 else if (MACHOPIC_PURE)
31609 {
31610 /* PIC stub. */
31611 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31612 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
31613 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
31614 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
31615 label, lazy_ptr_name, label);
31616 fprintf (file, "\tjmp\t*%%ecx\n");
31617 }
31618 else
31619 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
31620
31621 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
31622 it needs no stub-binding-helper. */
31623 if (MACHOPIC_ATT_STUB)
31624 return;
31625
31626 fprintf (file, "%s:\n", binder_name);
31627
31628 if (MACHOPIC_PURE)
31629 {
31630 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
31631 fprintf (file, "\tpushl\t%%ecx\n");
31632 }
31633 else
31634 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
31635
31636 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
31637
31638 /* N.B. Keep the correspondence of these
31639 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
31640 old-pic/new-pic/non-pic stubs; altering this will break
31641 compatibility with existing dylibs. */
31642 if (MACHOPIC_PURE)
31643 {
31644 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31645 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
31646 }
31647 else
31648 /* 16-byte -mdynamic-no-pic stub. */
31649 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
31650
31651 fprintf (file, "%s:\n", lazy_ptr_name);
31652 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31653 fprintf (file, ASM_LONG "%s\n", binder_name);
31654 }
31655 #endif /* TARGET_MACHO */
31656
31657 /* Order the registers for register allocator. */
31658
31659 void
31660 x86_order_regs_for_local_alloc (void)
31661 {
31662 int pos = 0;
31663 int i;
31664
31665 /* First allocate the local general purpose registers. */
31666 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31667 if (GENERAL_REGNO_P (i) && call_used_regs[i])
31668 reg_alloc_order [pos++] = i;
31669
31670 /* Global general purpose registers. */
31671 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31672 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
31673 reg_alloc_order [pos++] = i;
31674
31675 /* x87 registers come first in case we are doing FP math
31676 using them. */
31677 if (!TARGET_SSE_MATH)
31678 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31679 reg_alloc_order [pos++] = i;
31680
31681 /* SSE registers. */
31682 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
31683 reg_alloc_order [pos++] = i;
31684 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
31685 reg_alloc_order [pos++] = i;
31686
31687 /* x87 registers. */
31688 if (TARGET_SSE_MATH)
31689 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31690 reg_alloc_order [pos++] = i;
31691
31692 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
31693 reg_alloc_order [pos++] = i;
31694
31695 /* Initialize the rest of the array, as we do not allocate some registers
31696 at all. */
31697 while (pos < FIRST_PSEUDO_REGISTER)
31698 reg_alloc_order [pos++] = 0;
31699 }
31700
31701 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
31702 in struct attribute_spec handler. */
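/* A sketch of the intended use (illustrative, not taken from this file):

     struct S { int a, b, c; };
     struct S f (void)
       __attribute__ ((callee_pop_aggregate_return (1)));

   The single integer argument selects whether the callee pops the hidden
   aggregate-return pointer (1) or leaves that to the caller (0); any other
   value is diagnosed below, and the attribute is 32-bit only.  */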
31703 static tree
31704 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
31705 tree args,
31706 int flags ATTRIBUTE_UNUSED,
31707 bool *no_add_attrs)
31708 {
31709 if (TREE_CODE (*node) != FUNCTION_TYPE
31710 && TREE_CODE (*node) != METHOD_TYPE
31711 && TREE_CODE (*node) != FIELD_DECL
31712 && TREE_CODE (*node) != TYPE_DECL)
31713 {
31714 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31715 name);
31716 *no_add_attrs = true;
31717 return NULL_TREE;
31718 }
31719 if (TARGET_64BIT)
31720 {
31721 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
31722 name);
31723 *no_add_attrs = true;
31724 return NULL_TREE;
31725 }
31726 if (is_attribute_p ("callee_pop_aggregate_return", name))
31727 {
31728 tree cst;
31729
31730 cst = TREE_VALUE (args);
31731 if (TREE_CODE (cst) != INTEGER_CST)
31732 {
31733 warning (OPT_Wattributes,
31734 "%qE attribute requires an integer constant argument",
31735 name);
31736 *no_add_attrs = true;
31737 }
31738 else if (compare_tree_int (cst, 0) != 0
31739 && compare_tree_int (cst, 1) != 0)
31740 {
31741 warning (OPT_Wattributes,
31742 "argument to %qE attribute is neither zero, nor one",
31743 name);
31744 *no_add_attrs = true;
31745 }
31746
31747 return NULL_TREE;
31748 }
31749
31750 return NULL_TREE;
31751 }
31752
31753 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
31754 struct attribute_spec.handler. */
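/* For example (illustrative):

     void f (int) __attribute__ ((ms_abi));
     void g (int) __attribute__ ((sysv_abi));

   Putting both attributes on the same function type is rejected with the
   error emitted below.  */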
31755 static tree
31756 ix86_handle_abi_attribute (tree *node, tree name,
31757 tree args ATTRIBUTE_UNUSED,
31758 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31759 {
31760 if (TREE_CODE (*node) != FUNCTION_TYPE
31761 && TREE_CODE (*node) != METHOD_TYPE
31762 && TREE_CODE (*node) != FIELD_DECL
31763 && TREE_CODE (*node) != TYPE_DECL)
31764 {
31765 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31766 name);
31767 *no_add_attrs = true;
31768 return NULL_TREE;
31769 }
31770
31771 /* Reject combinations of the mutually exclusive ms_abi and sysv_abi. */
31772 if (is_attribute_p ("ms_abi", name))
31773 {
31774 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
31775 {
31776 error ("ms_abi and sysv_abi attributes are not compatible");
31777 }
31778
31779 return NULL_TREE;
31780 }
31781 else if (is_attribute_p ("sysv_abi", name))
31782 {
31783 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
31784 {
31785 error ("ms_abi and sysv_abi attributes are not compatible");
31786 }
31787
31788 return NULL_TREE;
31789 }
31790
31791 return NULL_TREE;
31792 }
31793
31794 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
31795 struct attribute_spec.handler. */
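/* For example (illustrative):

     struct s { char c; double d; } __attribute__ ((ms_struct));

   Applying both ms_struct and gcc_struct to the same type is diagnosed
   as incompatible below.  */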
31796 static tree
31797 ix86_handle_struct_attribute (tree *node, tree name,
31798 tree args ATTRIBUTE_UNUSED,
31799 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31800 {
31801 tree *type = NULL;
31802 if (DECL_P (*node))
31803 {
31804 if (TREE_CODE (*node) == TYPE_DECL)
31805 type = &TREE_TYPE (*node);
31806 }
31807 else
31808 type = node;
31809
31810 if (!(type && (TREE_CODE (*type) == RECORD_TYPE
31811 || TREE_CODE (*type) == UNION_TYPE)))
31812 {
31813 warning (OPT_Wattributes, "%qE attribute ignored",
31814 name);
31815 *no_add_attrs = true;
31816 }
31817
31818 else if ((is_attribute_p ("ms_struct", name)
31819 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
31820 || ((is_attribute_p ("gcc_struct", name)
31821 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
31822 {
31823 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
31824 name);
31825 *no_add_attrs = true;
31826 }
31827
31828 return NULL_TREE;
31829 }
31830
31831 static tree
31832 ix86_handle_fndecl_attribute (tree *node, tree name,
31833 tree args ATTRIBUTE_UNUSED,
31834 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31835 {
31836 if (TREE_CODE (*node) != FUNCTION_DECL)
31837 {
31838 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31839 name);
31840 *no_add_attrs = true;
31841 }
31842 return NULL_TREE;
31843 }
31844
31845 static bool
31846 ix86_ms_bitfield_layout_p (const_tree record_type)
31847 {
31848 return ((TARGET_MS_BITFIELD_LAYOUT
31849 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
31850 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
31851 }
31852
31853 /* Return an expression indicating where the "this" parameter is
31854 located on entry to FUNCTION. */
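/* For example, under the 64-bit SysV ABI "this" arrives in %rdi (or %rsi
   when a hidden aggregate-return pointer occupies %rdi); with 32-bit
   fastcall it arrives in %ecx; and in the plain stack-based 32-bit ABI it
   is the first argument on the stack.  (Illustrative summary of the cases
   handled below.)  */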
31855
31856 static rtx
31857 x86_this_parameter (tree function)
31858 {
31859 tree type = TREE_TYPE (function);
31860 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
31861 int nregs;
31862
31863 if (TARGET_64BIT)
31864 {
31865 const int *parm_regs;
31866
31867 if (ix86_function_type_abi (type) == MS_ABI)
31868 parm_regs = x86_64_ms_abi_int_parameter_registers;
31869 else
31870 parm_regs = x86_64_int_parameter_registers;
31871 return gen_rtx_REG (DImode, parm_regs[aggr]);
31872 }
31873
31874 nregs = ix86_function_regparm (type, function);
31875
31876 if (nregs > 0 && !stdarg_p (type))
31877 {
31878 int regno;
31879 unsigned int ccvt = ix86_get_callcvt (type);
31880
31881 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31882 regno = aggr ? DX_REG : CX_REG;
31883 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31884 {
31885 regno = CX_REG;
31886 if (aggr)
31887 return gen_rtx_MEM (SImode,
31888 plus_constant (stack_pointer_rtx, 4));
31889 }
31890 else
31891 {
31892 regno = AX_REG;
31893 if (aggr)
31894 {
31895 regno = DX_REG;
31896 if (nregs == 1)
31897 return gen_rtx_MEM (SImode,
31898 plus_constant (stack_pointer_rtx, 4));
31899 }
31900 }
31901 return gen_rtx_REG (SImode, regno);
31902 }
31903
31904 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
31905 }
31906
31907 /* Determine whether x86_output_mi_thunk can succeed. */
31908
31909 static bool
31910 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
31911 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
31912 HOST_WIDE_INT vcall_offset, const_tree function)
31913 {
31914 /* 64-bit can handle anything. */
31915 if (TARGET_64BIT)
31916 return true;
31917
31918 /* For 32-bit, everything's fine if we have one free register. */
31919 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
31920 return true;
31921
31922 /* Need a free register for vcall_offset. */
31923 if (vcall_offset)
31924 return false;
31925
31926 /* Need a free register for GOT references. */
31927 if (flag_pic && !targetm.binds_local_p (function))
31928 return false;
31929
31930 /* Otherwise ok. */
31931 return true;
31932 }
31933
31934 /* Output the assembler code for a thunk function. THUNK_DECL is the
31935 declaration for the thunk function itself, FUNCTION is the decl for
31936 the target function. DELTA is an immediate constant offset to be
31937 added to THIS. If VCALL_OFFSET is nonzero, the word at
31938 *(*this + vcall_offset) should be added to THIS. */
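/* For instance, a simple 64-bit SysV thunk with DELTA == 8 and no
   VCALL_OFFSET comes out roughly as (illustrative):

       addq    $8, %rdi
       jmp     target_function

   i.e. adjust the incoming "this" pointer and tail-call the real function;
   the code below builds this adjust-then-sibcall sequence, with extra
   loads when VCALL_OFFSET is nonzero.  */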
31939
31940 static void
31941 x86_output_mi_thunk (FILE *file,
31942 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
31943 HOST_WIDE_INT vcall_offset, tree function)
31944 {
31945 rtx this_param = x86_this_parameter (function);
31946 rtx this_reg, tmp, fnaddr;
31947
31948 emit_note (NOTE_INSN_PROLOGUE_END);
31949
31950 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
31951 pull it in now and let DELTA benefit. */
31952 if (REG_P (this_param))
31953 this_reg = this_param;
31954 else if (vcall_offset)
31955 {
31956 /* Put the this parameter into %eax. */
31957 this_reg = gen_rtx_REG (Pmode, AX_REG);
31958 emit_move_insn (this_reg, this_param);
31959 }
31960 else
31961 this_reg = NULL_RTX;
31962
31963 /* Adjust the this parameter by a fixed constant. */
31964 if (delta)
31965 {
31966 rtx delta_rtx = GEN_INT (delta);
31967 rtx delta_dst = this_reg ? this_reg : this_param;
31968
31969 if (TARGET_64BIT)
31970 {
31971 if (!x86_64_general_operand (delta_rtx, Pmode))
31972 {
31973 tmp = gen_rtx_REG (Pmode, R10_REG);
31974 emit_move_insn (tmp, delta_rtx);
31975 delta_rtx = tmp;
31976 }
31977 }
31978
31979 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
31980 }
31981
31982 /* Adjust the this parameter by a value stored in the vtable. */
31983 if (vcall_offset)
31984 {
31985 rtx vcall_addr, vcall_mem, this_mem;
31986 unsigned int tmp_regno;
31987
31988 if (TARGET_64BIT)
31989 tmp_regno = R10_REG;
31990 else
31991 {
31992 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
31993 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
31994 tmp_regno = AX_REG;
31995 else
31996 tmp_regno = CX_REG;
31997 }
31998 tmp = gen_rtx_REG (Pmode, tmp_regno);
31999
32000 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
32001 if (Pmode != ptr_mode)
32002 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
32003 emit_move_insn (tmp, this_mem);
32004
32005 /* Adjust the this parameter. */
32006 vcall_addr = plus_constant (tmp, vcall_offset);
32007 if (TARGET_64BIT
32008 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
32009 {
32010 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
32011 emit_move_insn (tmp2, GEN_INT (vcall_offset));
32012 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
32013 }
32014
32015 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
32016 if (Pmode != ptr_mode)
32017 emit_insn (gen_addsi_1_zext (this_reg,
32018 gen_rtx_REG (ptr_mode,
32019 REGNO (this_reg)),
32020 vcall_mem));
32021 else
32022 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
32023 }
32024
32025 /* If necessary, drop THIS back to its stack slot. */
32026 if (this_reg && this_reg != this_param)
32027 emit_move_insn (this_param, this_reg);
32028
32029 fnaddr = XEXP (DECL_RTL (function), 0);
32030 if (TARGET_64BIT)
32031 {
32032 if (!flag_pic || targetm.binds_local_p (function)
32033 || cfun->machine->call_abi == MS_ABI)
32034 ;
32035 else
32036 {
32037 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
32038 tmp = gen_rtx_CONST (Pmode, tmp);
32039 fnaddr = gen_rtx_MEM (Pmode, tmp);
32040 }
32041 }
32042 else
32043 {
32044 if (!flag_pic || targetm.binds_local_p (function))
32045 ;
32046 #if TARGET_MACHO
32047 else if (TARGET_MACHO)
32048 {
32049 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
32050 fnaddr = XEXP (fnaddr, 0);
32051 }
32052 #endif /* TARGET_MACHO */
32053 else
32054 {
32055 tmp = gen_rtx_REG (Pmode, CX_REG);
32056 output_set_got (tmp, NULL_RTX);
32057
32058 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
32059 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
32060 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
32061 }
32062 }
32063
32064 /* Our sibling call patterns do not allow memories, because we have no
32065 predicate that can distinguish between frame and non-frame memory.
32066 For our purposes here, we can get away with (ab)using a jump pattern,
32067 because we're going to do no optimization. */
32068 if (MEM_P (fnaddr))
32069 emit_jump_insn (gen_indirect_jump (fnaddr));
32070 else
32071 {
32072 tmp = gen_rtx_MEM (QImode, fnaddr);
32073 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
32074 tmp = emit_call_insn (tmp);
32075 SIBLING_CALL_P (tmp) = 1;
32076 }
32077 emit_barrier ();
32078
32079 /* Emit just enough of rest_of_compilation to get the insns emitted.
32080 Note that use_thunk calls assemble_start_function et al. */
32081 tmp = get_insns ();
32082 insn_locators_alloc ();
32083 shorten_branches (tmp);
32084 final_start_function (tmp, file, 1);
32085 final (tmp, file, 1);
32086 final_end_function ();
32087 }
32088
32089 static void
32090 x86_file_start (void)
32091 {
32092 default_file_start ();
32093 #if TARGET_MACHO
32094 darwin_file_start ();
32095 #endif
32096 if (X86_FILE_START_VERSION_DIRECTIVE)
32097 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
32098 if (X86_FILE_START_FLTUSED)
32099 fputs ("\t.global\t__fltused\n", asm_out_file);
32100 if (ix86_asm_dialect == ASM_INTEL)
32101 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
32102 }
32103
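/* Compute the alignment of FIELD for 32-bit targets.  Unless -malign-double
   is in effect, 8-byte scalars such as double and long long are capped at
   32-bit alignment; COMPUTED is the alignment already chosen by the
   generic code.  */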
32104 int
32105 x86_field_alignment (tree field, int computed)
32106 {
32107 enum machine_mode mode;
32108 tree type = TREE_TYPE (field);
32109
32110 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
32111 return computed;
32112 mode = TYPE_MODE (strip_array_types (type));
32113 if (mode == DFmode || mode == DCmode
32114 || GET_MODE_CLASS (mode) == MODE_INT
32115 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
32116 return MIN (32, computed);
32117 return computed;
32118 }
32119
32120 /* Output assembler code to FILE to increment profiler label # LABELNO
32121 for profiling a function entry. */
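/* For example, a 32-bit non-PIC compilation emits roughly (illustrative;
   the exact label prefix, register and function name come from LPREFIX,
   PROFILE_COUNT_REGISTER and MCOUNT_NAME):

       movl    $LP0, %edx
       call    mcount

   The PIC and 64-bit variants below differ only in how the counter and
   the mcount routine are addressed.  */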
32122 void
32123 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
32124 {
32125 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
32126 : MCOUNT_NAME);
32127
32128 if (TARGET_64BIT)
32129 {
32130 #ifndef NO_PROFILE_COUNTERS
32131 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
32132 #endif
32133
32134 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
32135 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
32136 else
32137 fprintf (file, "\tcall\t%s\n", mcount_name);
32138 }
32139 else if (flag_pic)
32140 {
32141 #ifndef NO_PROFILE_COUNTERS
32142 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
32143 LPREFIX, labelno);
32144 #endif
32145 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
32146 }
32147 else
32148 {
32149 #ifndef NO_PROFILE_COUNTERS
32150 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
32151 LPREFIX, labelno);
32152 #endif
32153 fprintf (file, "\tcall\t%s\n", mcount_name);
32154 }
32155 }
32156
32157 /* We don't have exact information about the insn sizes, but we may assume
32158 quite safely that we are informed about all 1 byte insns and memory
32159 address sizes. This is enough to eliminate unnecessary padding in
32160 99% of cases. */
32161
32162 static int
32163 min_insn_size (rtx insn)
32164 {
32165 int l = 0, len;
32166
32167 if (!INSN_P (insn) || !active_insn_p (insn))
32168 return 0;
32169
32170 /* Discard alignments we've emitted, as well as jump table data. */
32171 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
32172 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
32173 return 0;
32174 if (JUMP_TABLE_DATA_P (insn))
32175 return 0;
32176
32177 /* Important case - calls are always 5 bytes.
32178 It is common to have many calls in a row. */
32179 if (CALL_P (insn)
32180 && symbolic_reference_mentioned_p (PATTERN (insn))
32181 && !SIBLING_CALL_P (insn))
32182 return 5;
32183 len = get_attr_length (insn);
32184 if (len <= 1)
32185 return 1;
32186
32187 /* For normal instructions we rely on get_attr_length being exact,
32188 with a few exceptions. */
32189 if (!JUMP_P (insn))
32190 {
32191 enum attr_type type = get_attr_type (insn);
32192
32193 switch (type)
32194 {
32195 case TYPE_MULTI:
32196 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
32197 || asm_noperands (PATTERN (insn)) >= 0)
32198 return 0;
32199 break;
32200 case TYPE_OTHER:
32201 case TYPE_FCMP:
32202 break;
32203 default:
32204 /* Otherwise trust get_attr_length. */
32205 return len;
32206 }
32207
32208 l = get_attr_length_address (insn);
32209 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
32210 l = 4;
32211 }
32212 if (l)
32213 return 1+l;
32214 else
32215 return 2;
32216 }
32217
32218 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32219
32220 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
32221 window. */
32222
32223 static void
32224 ix86_avoid_jump_mispredicts (void)
32225 {
32226 rtx insn, start = get_insns ();
32227 int nbytes = 0, njumps = 0;
32228 int isjump = 0;
32229
32230 /* Look for all minimal intervals of instructions containing 4 jumps.
32231 The intervals are bounded by START and INSN. NBYTES is the total
32232 size of instructions in the interval including INSN and not including
32233 START. When NBYTES is smaller than 16 bytes, it is possible
32234 that the end of START and INSN end up in the same 16-byte window.
32235
32236 The smallest offset in the window at which INSN can start corresponds to
32237 START ending at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
32238 We therefore add a p2align to the 16-byte window with a max skip of
32239 15 - NBYTES + sizeof (INSN). */
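   /* Worked example (illustrative): with NBYTES == 12 and
      sizeof (INSN) == 2, the pad emitted before INSN uses a max skip of
      15 - 12 + 2 = 5 bytes, enough to push INSN out of the 16-byte window
      it would otherwise share with the three earlier jumps.  */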
32240 for (insn = start; insn; insn = NEXT_INSN (insn))
32241 {
32242 int min_size;
32243
32244 if (LABEL_P (insn))
32245 {
32246 int align = label_to_alignment (insn);
32247 int max_skip = label_to_max_skip (insn);
32248
32249 if (max_skip > 15)
32250 max_skip = 15;
32251 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
32252 already in the current 16 byte page, because otherwise
32253 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
32254 bytes to reach 16 byte boundary. */
32255 if (align <= 0
32256 || (align <= 3 && max_skip != (1 << align) - 1))
32257 max_skip = 0;
32258 if (dump_file)
32259 fprintf (dump_file, "Label %i with max_skip %i\n",
32260 INSN_UID (insn), max_skip);
32261 if (max_skip)
32262 {
32263 while (nbytes + max_skip >= 16)
32264 {
32265 start = NEXT_INSN (start);
32266 if ((JUMP_P (start)
32267 && GET_CODE (PATTERN (start)) != ADDR_VEC
32268 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32269 || CALL_P (start))
32270 njumps--, isjump = 1;
32271 else
32272 isjump = 0;
32273 nbytes -= min_insn_size (start);
32274 }
32275 }
32276 continue;
32277 }
32278
32279 min_size = min_insn_size (insn);
32280 nbytes += min_size;
32281 if (dump_file)
32282 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
32283 INSN_UID (insn), min_size);
32284 if ((JUMP_P (insn)
32285 && GET_CODE (PATTERN (insn)) != ADDR_VEC
32286 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
32287 || CALL_P (insn))
32288 njumps++;
32289 else
32290 continue;
32291
32292 while (njumps > 3)
32293 {
32294 start = NEXT_INSN (start);
32295 if ((JUMP_P (start)
32296 && GET_CODE (PATTERN (start)) != ADDR_VEC
32297 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32298 || CALL_P (start))
32299 njumps--, isjump = 1;
32300 else
32301 isjump = 0;
32302 nbytes -= min_insn_size (start);
32303 }
32304 gcc_assert (njumps >= 0);
32305 if (dump_file)
32306 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
32307 INSN_UID (start), INSN_UID (insn), nbytes);
32308
32309 if (njumps == 3 && isjump && nbytes < 16)
32310 {
32311 int padsize = 15 - nbytes + min_insn_size (insn);
32312
32313 if (dump_file)
32314 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
32315 INSN_UID (insn), padsize);
32316 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
32317 }
32318 }
32319 }
32320 #endif
32321
32322 /* AMD Athlon works faster
32323 when RET is not the destination of a conditional jump or directly preceded
32324 by another jump instruction. We avoid the penalty by replacing the RET
32325 with a longer form of the return instruction in such cases. */
32326 static void
32327 ix86_pad_returns (void)
32328 {
32329 edge e;
32330 edge_iterator ei;
32331
32332 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32333 {
32334 basic_block bb = e->src;
32335 rtx ret = BB_END (bb);
32336 rtx prev;
32337 bool replace = false;
32338
32339 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
32340 || optimize_bb_for_size_p (bb))
32341 continue;
32342 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
32343 if (active_insn_p (prev) || LABEL_P (prev))
32344 break;
32345 if (prev && LABEL_P (prev))
32346 {
32347 edge e;
32348 edge_iterator ei;
32349
32350 FOR_EACH_EDGE (e, ei, bb->preds)
32351 if (EDGE_FREQUENCY (e) && e->src->index >= 0
32352 && !(e->flags & EDGE_FALLTHRU))
32353 replace = true;
32354 }
32355 if (!replace)
32356 {
32357 prev = prev_active_insn (ret);
32358 if (prev
32359 && ((JUMP_P (prev) && any_condjump_p (prev))
32360 || CALL_P (prev)))
32361 replace = true;
32362 /* Empty functions get a branch mispredict even when
32363 the jump destination is not visible to us. */
32364 if (!prev && !optimize_function_for_size_p (cfun))
32365 replace = true;
32366 }
32367 if (replace)
32368 {
32369 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
32370 delete_insn (ret);
32371 }
32372 }
32373 }
32374
32375 /* Count the minimum number of instructions in BB. Return 4 if the
32376 number of instructions >= 4. */
32377
32378 static int
32379 ix86_count_insn_bb (basic_block bb)
32380 {
32381 rtx insn;
32382 int insn_count = 0;
32383
32384 /* Count number of instructions in this block. Return 4 if the number
32385 of instructions >= 4. */
32386 FOR_BB_INSNS (bb, insn)
32387 {
32388 /* This only happens in exit blocks. */
32389 if (JUMP_P (insn)
32390 && ANY_RETURN_P (PATTERN (insn)))
32391 break;
32392
32393 if (NONDEBUG_INSN_P (insn)
32394 && GET_CODE (PATTERN (insn)) != USE
32395 && GET_CODE (PATTERN (insn)) != CLOBBER)
32396 {
32397 insn_count++;
32398 if (insn_count >= 4)
32399 return insn_count;
32400 }
32401 }
32402
32403 return insn_count;
32404 }
32405
32406
32407 /* Count the minimum number of instructions in a code path ending in BB.
32408 Return 4 if the number of instructions >= 4. */
32409
32410 static int
32411 ix86_count_insn (basic_block bb)
32412 {
32413 edge e;
32414 edge_iterator ei;
32415 int min_prev_count;
32416
32417 /* Only bother counting instructions along paths with no
32418 more than 2 basic blocks between entry and exit. Given
32419 that BB has an edge to exit, determine if a predecessor
32420 of BB has an edge from entry. If so, compute the number
32421 of instructions in the predecessor block. If there
32422 happen to be multiple such blocks, compute the minimum. */
32423 min_prev_count = 4;
32424 FOR_EACH_EDGE (e, ei, bb->preds)
32425 {
32426 edge prev_e;
32427 edge_iterator prev_ei;
32428
32429 if (e->src == ENTRY_BLOCK_PTR)
32430 {
32431 min_prev_count = 0;
32432 break;
32433 }
32434 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
32435 {
32436 if (prev_e->src == ENTRY_BLOCK_PTR)
32437 {
32438 int count = ix86_count_insn_bb (e->src);
32439 if (count < min_prev_count)
32440 min_prev_count = count;
32441 break;
32442 }
32443 }
32444 }
32445
32446 if (min_prev_count < 4)
32447 min_prev_count += ix86_count_insn_bb (bb);
32448
32449 return min_prev_count;
32450 }
32451
32452 /* Pad short functions to 4 instructions. */
32453
32454 static void
32455 ix86_pad_short_function (void)
32456 {
32457 edge e;
32458 edge_iterator ei;
32459
32460 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32461 {
32462 rtx ret = BB_END (e->src);
32463 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
32464 {
32465 int insn_count = ix86_count_insn (e->src);
32466
32467 /* Pad short function. */
32468 if (insn_count < 4)
32469 {
32470 rtx insn = ret;
32471
32472 /* Find epilogue. */
32473 while (insn
32474 && (!NOTE_P (insn)
32475 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
32476 insn = PREV_INSN (insn);
32477
32478 if (!insn)
32479 insn = ret;
32480
32481 /* Two NOPs count as one instruction. */
32482 insn_count = 2 * (4 - insn_count);
32483 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
32484 }
32485 }
32486 }
32487 }
32488
32489 /* Implement machine specific optimizations. We implement padding of returns
32490 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
32491 static void
32492 ix86_reorg (void)
32493 {
32494 /* We are freeing block_for_insn in the toplev to keep compatibility
32495 with old MDEP_REORGS that are not CFG based. Recompute it now. */
32496 compute_bb_for_insn ();
32497
32498 /* Run the vzeroupper optimization if needed. */
32499 if (TARGET_VZEROUPPER)
32500 move_or_delete_vzeroupper ();
32501
32502 if (optimize && optimize_function_for_speed_p (cfun))
32503 {
32504 if (TARGET_PAD_SHORT_FUNCTION)
32505 ix86_pad_short_function ();
32506 else if (TARGET_PAD_RETURNS)
32507 ix86_pad_returns ();
32508 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32509 if (TARGET_FOUR_JUMP_LIMIT)
32510 ix86_avoid_jump_mispredicts ();
32511 #endif
32512 }
32513 }
32514
32515 /* Return nonzero when a QImode register that must be represented via a REX
32516 prefix is used. */
32517 bool
32518 x86_extended_QIreg_mentioned_p (rtx insn)
32519 {
32520 int i;
32521 extract_insn_cached (insn);
32522 for (i = 0; i < recog_data.n_operands; i++)
32523 if (REG_P (recog_data.operand[i])
32524 && REGNO (recog_data.operand[i]) > BX_REG)
32525 return true;
32526 return false;
32527 }
32528
32529 /* Return nonzero when P points to a register encoded via a REX prefix.
32530 Called via for_each_rtx. */
32531 static int
32532 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
32533 {
32534 unsigned int regno;
32535 if (!REG_P (*p))
32536 return 0;
32537 regno = REGNO (*p);
32538 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
32539 }
32540
32541 /* Return true when INSN mentions register that must be encoded using REX
32542 prefix. */
32543 bool
32544 x86_extended_reg_mentioned_p (rtx insn)
32545 {
32546 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
32547 extended_reg_mentioned_1, NULL);
32548 }
32549
32550 /* If profitable, negate (without causing overflow) integer constant
32551 of mode MODE at location LOC. Return true in this case. */
32552 bool
32553 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
32554 {
32555 HOST_WIDE_INT val;
32556
32557 if (!CONST_INT_P (*loc))
32558 return false;
32559
32560 switch (mode)
32561 {
32562 case DImode:
32563 /* DImode x86_64 constants must fit in 32 bits. */
32564 gcc_assert (x86_64_immediate_operand (*loc, mode));
32565
32566 mode = SImode;
32567 break;
32568
32569 case SImode:
32570 case HImode:
32571 case QImode:
32572 break;
32573
32574 default:
32575 gcc_unreachable ();
32576 }
32577
32578 /* Avoid overflows. */
32579 if (mode_signbit_p (mode, *loc))
32580 return false;
32581
32582 val = INTVAL (*loc);
32583
32584 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
32585 Exception: -128 encodes smaller than 128, so swap the sign and the op. */
32586 if ((val < 0 && val != -128)
32587 || val == 128)
32588 {
32589 *loc = GEN_INT (-val);
32590 return true;
32591 }
32592
32593 return false;
32594 }
32595
32596 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
32597 optabs would emit if we didn't have TFmode patterns. */
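/* The emitted RTL corresponds to this C sketch for a 64-bit input
   (illustrative):

     if ((long) x >= 0)
       result = (double) x;
     else
       {
         unsigned long half = (x >> 1) | (x & 1);
         result = (double) half;
         result = result + result;
       }

   The OR with the low bit when halving keeps the final rounding correct.  */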
32598
32599 void
32600 x86_emit_floatuns (rtx operands[2])
32601 {
32602 rtx neglab, donelab, i0, i1, f0, in, out;
32603 enum machine_mode mode, inmode;
32604
32605 inmode = GET_MODE (operands[1]);
32606 gcc_assert (inmode == SImode || inmode == DImode);
32607
32608 out = operands[0];
32609 in = force_reg (inmode, operands[1]);
32610 mode = GET_MODE (out);
32611 neglab = gen_label_rtx ();
32612 donelab = gen_label_rtx ();
32613 f0 = gen_reg_rtx (mode);
32614
32615 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
32616
32617 expand_float (out, in, 0);
32618
32619 emit_jump_insn (gen_jump (donelab));
32620 emit_barrier ();
32621
32622 emit_label (neglab);
32623
32624 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
32625 1, OPTAB_DIRECT);
32626 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
32627 1, OPTAB_DIRECT);
32628 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
32629
32630 expand_float (f0, i0, 0);
32631
32632 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
32633
32634 emit_label (donelab);
32635 }
32636 \f
32637 /* AVX2 does support 32-byte integer vector operations,
32638 thus the longest vector we are faced with is V32QImode. */
32639 #define MAX_VECT_LEN 32
32640
32641 struct expand_vec_perm_d
32642 {
32643 rtx target, op0, op1;
32644 unsigned char perm[MAX_VECT_LEN];
32645 enum machine_mode vmode;
32646 unsigned char nelt;
32647 bool testing_p;
32648 };
32649
32650 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
32651 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
32652
32653 /* Get a vector mode of the same size as the original but with elements
32654 twice as wide. This is only guaranteed to apply to integral vectors. */
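/* For example, V8HImode (8 x 16 bits) maps to V4SImode (4 x 32 bits),
   keeping the overall 128-bit size.  */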
32655
32656 static inline enum machine_mode
32657 get_mode_wider_vector (enum machine_mode o)
32658 {
32659 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
32660 enum machine_mode n = GET_MODE_WIDER_MODE (o);
32661 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
32662 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
32663 return n;
32664 }
32665
32666 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32667 with all elements equal to VAR. Return true if successful. */
32668
32669 static bool
32670 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
32671 rtx target, rtx val)
32672 {
32673 bool ok;
32674
32675 switch (mode)
32676 {
32677 case V2SImode:
32678 case V2SFmode:
32679 if (!mmx_ok)
32680 return false;
32681 /* FALLTHRU */
32682
32683 case V4DFmode:
32684 case V4DImode:
32685 case V8SFmode:
32686 case V8SImode:
32687 case V2DFmode:
32688 case V2DImode:
32689 case V4SFmode:
32690 case V4SImode:
32691 {
32692 rtx insn, dup;
32693
32694 /* First attempt to recognize VAL as-is. */
32695 dup = gen_rtx_VEC_DUPLICATE (mode, val);
32696 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
32697 if (recog_memoized (insn) < 0)
32698 {
32699 rtx seq;
32700 /* If that fails, force VAL into a register. */
32701
32702 start_sequence ();
32703 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
32704 seq = get_insns ();
32705 end_sequence ();
32706 if (seq)
32707 emit_insn_before (seq, insn);
32708
32709 ok = recog_memoized (insn) >= 0;
32710 gcc_assert (ok);
32711 }
32712 }
32713 return true;
32714
32715 case V4HImode:
32716 if (!mmx_ok)
32717 return false;
32718 if (TARGET_SSE || TARGET_3DNOW_A)
32719 {
32720 rtx x;
32721
32722 val = gen_lowpart (SImode, val);
32723 x = gen_rtx_TRUNCATE (HImode, val);
32724 x = gen_rtx_VEC_DUPLICATE (mode, x);
32725 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32726 return true;
32727 }
32728 goto widen;
32729
32730 case V8QImode:
32731 if (!mmx_ok)
32732 return false;
32733 goto widen;
32734
32735 case V8HImode:
32736 if (TARGET_SSE2)
32737 {
32738 struct expand_vec_perm_d dperm;
32739 rtx tmp1, tmp2;
32740
32741 permute:
32742 memset (&dperm, 0, sizeof (dperm));
32743 dperm.target = target;
32744 dperm.vmode = mode;
32745 dperm.nelt = GET_MODE_NUNITS (mode);
32746 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
32747
32748 /* Extend to SImode using a paradoxical SUBREG. */
32749 tmp1 = gen_reg_rtx (SImode);
32750 emit_move_insn (tmp1, gen_lowpart (SImode, val));
32751
32752 /* Insert the SImode value as low element of a V4SImode vector. */
32753 tmp2 = gen_lowpart (V4SImode, dperm.op0);
32754 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
32755
32756 ok = (expand_vec_perm_1 (&dperm)
32757 || expand_vec_perm_broadcast_1 (&dperm));
32758 gcc_assert (ok);
32759 return ok;
32760 }
32761 goto widen;
32762
32763 case V16QImode:
32764 if (TARGET_SSE2)
32765 goto permute;
32766 goto widen;
32767
32768 widen:
32769 /* Replicate the value once into the next wider mode and recurse. */
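      /* For example, splatting a QImode value into V8QImode first builds the
         HImode value VAL | (VAL << 8) and then recurses to splat that into
         V4HImode (illustrative).  */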
32770 {
32771 enum machine_mode smode, wsmode, wvmode;
32772 rtx x;
32773
32774 smode = GET_MODE_INNER (mode);
32775 wvmode = get_mode_wider_vector (mode);
32776 wsmode = GET_MODE_INNER (wvmode);
32777
32778 val = convert_modes (wsmode, smode, val, true);
32779 x = expand_simple_binop (wsmode, ASHIFT, val,
32780 GEN_INT (GET_MODE_BITSIZE (smode)),
32781 NULL_RTX, 1, OPTAB_LIB_WIDEN);
32782 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
32783
32784 x = gen_lowpart (wvmode, target);
32785 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
32786 gcc_assert (ok);
32787 return ok;
32788 }
32789
32790 case V16HImode:
32791 case V32QImode:
32792 {
32793 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
32794 rtx x = gen_reg_rtx (hvmode);
32795
32796 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
32797 gcc_assert (ok);
32798
32799 x = gen_rtx_VEC_CONCAT (mode, x, x);
32800 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32801 }
32802 return true;
32803
32804 default:
32805 return false;
32806 }
32807 }
32808
32809 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32810 whose ONE_VAR element is VAR, and other elements are zero. Return true
32811 if successful. */
32812
32813 static bool
32814 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
32815 rtx target, rtx var, int one_var)
32816 {
32817 enum machine_mode vsimode;
32818 rtx new_target;
32819 rtx x, tmp;
32820 bool use_vector_set = false;
32821
32822 switch (mode)
32823 {
32824 case V2DImode:
32825 /* For SSE4.1, we normally use vector set. But if the second
32826 element is zero and inter-unit moves are OK, we use movq
32827 instead. */
32828 use_vector_set = (TARGET_64BIT
32829 && TARGET_SSE4_1
32830 && !(TARGET_INTER_UNIT_MOVES
32831 && one_var == 0));
32832 break;
32833 case V16QImode:
32834 case V4SImode:
32835 case V4SFmode:
32836 use_vector_set = TARGET_SSE4_1;
32837 break;
32838 case V8HImode:
32839 use_vector_set = TARGET_SSE2;
32840 break;
32841 case V4HImode:
32842 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
32843 break;
32844 case V32QImode:
32845 case V16HImode:
32846 case V8SImode:
32847 case V8SFmode:
32848 case V4DFmode:
32849 use_vector_set = TARGET_AVX;
32850 break;
32851 case V4DImode:
32852 /* Use ix86_expand_vector_set in 64bit mode only. */
32853 use_vector_set = TARGET_AVX && TARGET_64BIT;
32854 break;
32855 default:
32856 break;
32857 }
32858
32859 if (use_vector_set)
32860 {
32861 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
32862 var = force_reg (GET_MODE_INNER (mode), var);
32863 ix86_expand_vector_set (mmx_ok, target, var, one_var);
32864 return true;
32865 }
32866
32867 switch (mode)
32868 {
32869 case V2SFmode:
32870 case V2SImode:
32871 if (!mmx_ok)
32872 return false;
32873 /* FALLTHRU */
32874
32875 case V2DFmode:
32876 case V2DImode:
32877 if (one_var != 0)
32878 return false;
32879 var = force_reg (GET_MODE_INNER (mode), var);
32880 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
32881 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32882 return true;
32883
32884 case V4SFmode:
32885 case V4SImode:
32886 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
32887 new_target = gen_reg_rtx (mode);
32888 else
32889 new_target = target;
32890 var = force_reg (GET_MODE_INNER (mode), var);
32891 x = gen_rtx_VEC_DUPLICATE (mode, var);
32892 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
32893 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
32894 if (one_var != 0)
32895 {
32896 /* We need to shuffle the value to the correct position, so
32897 create a new pseudo to store the intermediate result. */
32898
32899 /* With SSE2, we can use the integer shuffle insns. */
32900 if (mode != V4SFmode && TARGET_SSE2)
32901 {
32902 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
32903 const1_rtx,
32904 GEN_INT (one_var == 1 ? 0 : 1),
32905 GEN_INT (one_var == 2 ? 0 : 1),
32906 GEN_INT (one_var == 3 ? 0 : 1)));
32907 if (target != new_target)
32908 emit_move_insn (target, new_target);
32909 return true;
32910 }
32911
32912 /* Otherwise convert the intermediate result to V4SFmode and
32913 use the SSE1 shuffle instructions. */
32914 if (mode != V4SFmode)
32915 {
32916 tmp = gen_reg_rtx (V4SFmode);
32917 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
32918 }
32919 else
32920 tmp = new_target;
32921
32922 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
32923 const1_rtx,
32924 GEN_INT (one_var == 1 ? 0 : 1),
32925 GEN_INT (one_var == 2 ? 0+4 : 1+4),
32926 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
32927
32928 if (mode != V4SFmode)
32929 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
32930 else if (tmp != target)
32931 emit_move_insn (target, tmp);
32932 }
32933 else if (target != new_target)
32934 emit_move_insn (target, new_target);
32935 return true;
32936
32937 case V8HImode:
32938 case V16QImode:
32939 vsimode = V4SImode;
32940 goto widen;
32941 case V4HImode:
32942 case V8QImode:
32943 if (!mmx_ok)
32944 return false;
32945 vsimode = V2SImode;
32946 goto widen;
32947 widen:
32948 if (one_var != 0)
32949 return false;
32950
32951 /* Zero extend the variable element to SImode and recurse. */
32952 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
32953
32954 x = gen_reg_rtx (vsimode);
32955 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
32956 var, one_var))
32957 gcc_unreachable ();
32958
32959 emit_move_insn (target, gen_lowpart (mode, x));
32960 return true;
32961
32962 default:
32963 return false;
32964 }
32965 }
32966
32967 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32968 consisting of the values in VALS. It is known that all elements
32969 except ONE_VAR are constants. Return true if successful. */
32970
32971 static bool
32972 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
32973 rtx target, rtx vals, int one_var)
32974 {
32975 rtx var = XVECEXP (vals, 0, one_var);
32976 enum machine_mode wmode;
32977 rtx const_vec, x;
32978
32979 const_vec = copy_rtx (vals);
32980 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
32981 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
32982
32983 switch (mode)
32984 {
32985 case V2DFmode:
32986 case V2DImode:
32987 case V2SFmode:
32988 case V2SImode:
32989 /* For the two-element vectors, it's just as easy to use
32990 the general case. */
32991 return false;
32992
32993 case V4DImode:
32994 /* Use ix86_expand_vector_set in 64bit mode only. */
32995 if (!TARGET_64BIT)
32996 return false;
32997 case V4DFmode:
32998 case V8SFmode:
32999 case V8SImode:
33000 case V16HImode:
33001 case V32QImode:
33002 case V4SFmode:
33003 case V4SImode:
33004 case V8HImode:
33005 case V4HImode:
33006 break;
33007
33008 case V16QImode:
33009 if (TARGET_SSE4_1)
33010 break;
33011 wmode = V8HImode;
33012 goto widen;
33013 case V8QImode:
33014 wmode = V4HImode;
33015 goto widen;
33016 widen:
33017 /* There's no way to set one QImode entry easily. Combine
33018 the variable value with its adjacent constant value, and
33019 promote to an HImode set. */
33020 x = XVECEXP (vals, 0, one_var ^ 1);
33021 if (one_var & 1)
33022 {
33023 var = convert_modes (HImode, QImode, var, true);
33024 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
33025 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33026 x = GEN_INT (INTVAL (x) & 0xff);
33027 }
33028 else
33029 {
33030 var = convert_modes (HImode, QImode, var, true);
33031 x = gen_int_mode (INTVAL (x) << 8, HImode);
33032 }
33033 if (x != const0_rtx)
33034 var = expand_simple_binop (HImode, IOR, var, x, var,
33035 1, OPTAB_LIB_WIDEN);
33036
33037 x = gen_reg_rtx (wmode);
33038 emit_move_insn (x, gen_lowpart (wmode, const_vec));
33039 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
33040
33041 emit_move_insn (target, gen_lowpart (mode, x));
33042 return true;
33043
33044 default:
33045 return false;
33046 }
33047
33048 emit_move_insn (target, const_vec);
33049 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33050 return true;
33051 }
33052
33053 /* A subroutine of ix86_expand_vector_init_general. Use vector
33054 concatenate to handle the most general case: all values variable,
33055 and none identical. */
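/* For illustration, building eight variable SFmode values into a V8SFmode
   vector proceeds bottom-up, roughly as follows (a sketch of the recursion,
   not literal code; the pairs are built backwards to help RA, see PR 36222):
     first[3] = {op6, op7}; ... first[0] = {op0, op1};   four V2SFmode pairs
     second[0] = concat (first[0], first[1]);            two V4SFmode halves
     second[1] = concat (first[2], first[3]);
     target = concat (second[0], second[1]);             final V8SFmode value  */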
33056
33057 static void
33058 ix86_expand_vector_init_concat (enum machine_mode mode,
33059 rtx target, rtx *ops, int n)
33060 {
33061 enum machine_mode cmode, hmode = VOIDmode;
33062 rtx first[8], second[4];
33063 rtvec v;
33064 int i, j;
33065
33066 switch (n)
33067 {
33068 case 2:
33069 switch (mode)
33070 {
33071 case V8SImode:
33072 cmode = V4SImode;
33073 break;
33074 case V8SFmode:
33075 cmode = V4SFmode;
33076 break;
33077 case V4DImode:
33078 cmode = V2DImode;
33079 break;
33080 case V4DFmode:
33081 cmode = V2DFmode;
33082 break;
33083 case V4SImode:
33084 cmode = V2SImode;
33085 break;
33086 case V4SFmode:
33087 cmode = V2SFmode;
33088 break;
33089 case V2DImode:
33090 cmode = DImode;
33091 break;
33092 case V2SImode:
33093 cmode = SImode;
33094 break;
33095 case V2DFmode:
33096 cmode = DFmode;
33097 break;
33098 case V2SFmode:
33099 cmode = SFmode;
33100 break;
33101 default:
33102 gcc_unreachable ();
33103 }
33104
33105 if (!register_operand (ops[1], cmode))
33106 ops[1] = force_reg (cmode, ops[1]);
33107 if (!register_operand (ops[0], cmode))
33108 ops[0] = force_reg (cmode, ops[0]);
33109 emit_insn (gen_rtx_SET (VOIDmode, target,
33110 gen_rtx_VEC_CONCAT (mode, ops[0],
33111 ops[1])));
33112 break;
33113
33114 case 4:
33115 switch (mode)
33116 {
33117 case V4DImode:
33118 cmode = V2DImode;
33119 break;
33120 case V4DFmode:
33121 cmode = V2DFmode;
33122 break;
33123 case V4SImode:
33124 cmode = V2SImode;
33125 break;
33126 case V4SFmode:
33127 cmode = V2SFmode;
33128 break;
33129 default:
33130 gcc_unreachable ();
33131 }
33132 goto half;
33133
33134 case 8:
33135 switch (mode)
33136 {
33137 case V8SImode:
33138 cmode = V2SImode;
33139 hmode = V4SImode;
33140 break;
33141 case V8SFmode:
33142 cmode = V2SFmode;
33143 hmode = V4SFmode;
33144 break;
33145 default:
33146 gcc_unreachable ();
33147 }
33148 goto half;
33149
33150 half:
33151 /* FIXME: We process inputs backward to help RA. PR 36222. */
33152 i = n - 1;
33153 j = (n >> 1) - 1;
33154 for (; i > 0; i -= 2, j--)
33155 {
33156 first[j] = gen_reg_rtx (cmode);
33157 v = gen_rtvec (2, ops[i - 1], ops[i]);
33158 ix86_expand_vector_init (false, first[j],
33159 gen_rtx_PARALLEL (cmode, v));
33160 }
33161
33162 n >>= 1;
33163 if (n > 2)
33164 {
33165 gcc_assert (hmode != VOIDmode);
33166 for (i = j = 0; i < n; i += 2, j++)
33167 {
33168 second[j] = gen_reg_rtx (hmode);
33169 ix86_expand_vector_init_concat (hmode, second [j],
33170 &first [i], 2);
33171 }
33172 n >>= 1;
33173 ix86_expand_vector_init_concat (mode, target, second, n);
33174 }
33175 else
33176 ix86_expand_vector_init_concat (mode, target, first, n);
33177 break;
33178
33179 default:
33180 gcc_unreachable ();
33181 }
33182 }
33183
33184 /* A subroutine of ix86_expand_vector_init_general. Use vector
33185 interleave to handle the most general case: all values variable,
33186 and none identical. */
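/* For illustration, with MODE == V8HImode and N == 4 the merge tree looks
   roughly like this (a sketch; element 0 is leftmost, '.' is a don't-care):
     v0 = {e0 e1 . . . . . .}   v1 = {e2 e3 . . . . . .}
     v2 = {e4 e5 . . . . . .}   v3 = {e6 e7 . . . . . .}
     w0 = interleave_low_v4si (v0, v1) = {e0 e1 e2 e3 . . . .}
     w1 = interleave_low_v4si (v2, v3) = {e4 e5 e6 e7 . . . .}
     target = interleave_low_v2di (w0, w1) = {e0 e1 e2 e3 e4 e5 e6 e7}  */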
33187
33188 static void
33189 ix86_expand_vector_init_interleave (enum machine_mode mode,
33190 rtx target, rtx *ops, int n)
33191 {
33192 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
33193 int i, j;
33194 rtx op0, op1;
33195 rtx (*gen_load_even) (rtx, rtx, rtx);
33196 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
33197 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
33198
33199 switch (mode)
33200 {
33201 case V8HImode:
33202 gen_load_even = gen_vec_setv8hi;
33203 gen_interleave_first_low = gen_vec_interleave_lowv4si;
33204 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33205 inner_mode = HImode;
33206 first_imode = V4SImode;
33207 second_imode = V2DImode;
33208 third_imode = VOIDmode;
33209 break;
33210 case V16QImode:
33211 gen_load_even = gen_vec_setv16qi;
33212 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
33213 gen_interleave_second_low = gen_vec_interleave_lowv4si;
33214 inner_mode = QImode;
33215 first_imode = V8HImode;
33216 second_imode = V4SImode;
33217 third_imode = V2DImode;
33218 break;
33219 default:
33220 gcc_unreachable ();
33221 }
33222
33223 for (i = 0; i < n; i++)
33224 {
33225 /* Extend the odd element to SImode using a paradoxical SUBREG. */
33226 op0 = gen_reg_rtx (SImode);
33227 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
33228
33229 /* Insert the SImode value as low element of V4SImode vector. */
33230 op1 = gen_reg_rtx (V4SImode);
33231 op0 = gen_rtx_VEC_MERGE (V4SImode,
33232 gen_rtx_VEC_DUPLICATE (V4SImode,
33233 op0),
33234 CONST0_RTX (V4SImode),
33235 const1_rtx);
33236 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
33237
33238 /* Cast the V4SImode vector back to a vector in the original mode. */
33239 op0 = gen_reg_rtx (mode);
33240 emit_move_insn (op0, gen_lowpart (mode, op1));
33241
33242 /* Load even elements into the second position. */
33243 emit_insn (gen_load_even (op0,
33244 force_reg (inner_mode,
33245 ops [i + i + 1]),
33246 const1_rtx));
33247
33248 /* Cast vector to FIRST_IMODE vector. */
33249 ops[i] = gen_reg_rtx (first_imode);
33250 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
33251 }
33252
33253 /* Interleave low FIRST_IMODE vectors. */
33254 for (i = j = 0; i < n; i += 2, j++)
33255 {
33256 op0 = gen_reg_rtx (first_imode);
33257 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
33258
33259 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
33260 ops[j] = gen_reg_rtx (second_imode);
33261 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
33262 }
33263
33264 /* Interleave low SECOND_IMODE vectors. */
33265 switch (second_imode)
33266 {
33267 case V4SImode:
33268 for (i = j = 0; i < n / 2; i += 2, j++)
33269 {
33270 op0 = gen_reg_rtx (second_imode);
33271 emit_insn (gen_interleave_second_low (op0, ops[i],
33272 ops[i + 1]));
33273
33274 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
33275 vector. */
33276 ops[j] = gen_reg_rtx (third_imode);
33277 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
33278 }
33279 second_imode = V2DImode;
33280 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33281 /* FALLTHRU */
33282
33283 case V2DImode:
33284 op0 = gen_reg_rtx (second_imode);
33285 emit_insn (gen_interleave_second_low (op0, ops[0],
33286 ops[1]));
33287
33288 /* Cast the SECOND_IMODE vector back to a vector in the original
33289 mode. */
33290 emit_insn (gen_rtx_SET (VOIDmode, target,
33291 gen_lowpart (mode, op0)));
33292 break;
33293
33294 default:
33295 gcc_unreachable ();
33296 }
33297 }
33298
33299 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
33300 all values variable, and none identical. */
33301
33302 static void
33303 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
33304 rtx target, rtx vals)
33305 {
33306 rtx ops[32], op0, op1;
33307 enum machine_mode half_mode = VOIDmode;
33308 int n, i;
33309
33310 switch (mode)
33311 {
33312 case V2SFmode:
33313 case V2SImode:
33314 if (!mmx_ok && !TARGET_SSE)
33315 break;
33316 /* FALLTHRU */
33317
33318 case V8SFmode:
33319 case V8SImode:
33320 case V4DFmode:
33321 case V4DImode:
33322 case V4SFmode:
33323 case V4SImode:
33324 case V2DFmode:
33325 case V2DImode:
33326 n = GET_MODE_NUNITS (mode);
33327 for (i = 0; i < n; i++)
33328 ops[i] = XVECEXP (vals, 0, i);
33329 ix86_expand_vector_init_concat (mode, target, ops, n);
33330 return;
33331
33332 case V32QImode:
33333 half_mode = V16QImode;
33334 goto half;
33335
33336 case V16HImode:
33337 half_mode = V8HImode;
33338 goto half;
33339
33340 half:
33341 n = GET_MODE_NUNITS (mode);
33342 for (i = 0; i < n; i++)
33343 ops[i] = XVECEXP (vals, 0, i);
33344 op0 = gen_reg_rtx (half_mode);
33345 op1 = gen_reg_rtx (half_mode);
33346 ix86_expand_vector_init_interleave (half_mode, op0, ops,
33347 n >> 2);
33348 ix86_expand_vector_init_interleave (half_mode, op1,
33349 &ops [n >> 1], n >> 2);
33350 emit_insn (gen_rtx_SET (VOIDmode, target,
33351 gen_rtx_VEC_CONCAT (mode, op0, op1)));
33352 return;
33353
33354 case V16QImode:
33355 if (!TARGET_SSE4_1)
33356 break;
33357 /* FALLTHRU */
33358
33359 case V8HImode:
33360 if (!TARGET_SSE2)
33361 break;
33362
33363 /* Don't use ix86_expand_vector_init_interleave if we can't
33364 move from GPR to SSE register directly. */
33365 if (!TARGET_INTER_UNIT_MOVES)
33366 break;
33367
33368 n = GET_MODE_NUNITS (mode);
33369 for (i = 0; i < n; i++)
33370 ops[i] = XVECEXP (vals, 0, i);
33371 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
33372 return;
33373
33374 case V4HImode:
33375 case V8QImode:
33376 break;
33377
33378 default:
33379 gcc_unreachable ();
33380 }
33381
33382 {
33383 int i, j, n_elts, n_words, n_elt_per_word;
33384 enum machine_mode inner_mode;
33385 rtx words[4], shift;
33386
33387 inner_mode = GET_MODE_INNER (mode);
33388 n_elts = GET_MODE_NUNITS (mode);
33389 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
33390 n_elt_per_word = n_elts / n_words;
33391 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
33392
33393 for (i = 0; i < n_words; ++i)
33394 {
33395 rtx word = NULL_RTX;
33396
33397 for (j = 0; j < n_elt_per_word; ++j)
33398 {
33399 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
33400 elt = convert_modes (word_mode, inner_mode, elt, true);
33401
33402 if (j == 0)
33403 word = elt;
33404 else
33405 {
33406 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
33407 word, 1, OPTAB_LIB_WIDEN);
33408 word = expand_simple_binop (word_mode, IOR, word, elt,
33409 word, 1, OPTAB_LIB_WIDEN);
33410 }
33411 }
33412
33413 words[i] = word;
33414 }
33415
33416 if (n_words == 1)
33417 emit_move_insn (target, gen_lowpart (mode, words[0]));
33418 else if (n_words == 2)
33419 {
33420 rtx tmp = gen_reg_rtx (mode);
33421 emit_clobber (tmp);
33422 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
33423 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
33424 emit_move_insn (target, tmp);
33425 }
33426 else if (n_words == 4)
33427 {
33428 rtx tmp = gen_reg_rtx (V4SImode);
33429 gcc_assert (word_mode == SImode);
33430 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
33431 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
33432 emit_move_insn (target, gen_lowpart (mode, tmp));
33433 }
33434 else
33435 gcc_unreachable ();
33436 }
33437 }
33438
33439 /* Initialize vector TARGET via VALS. Suppress the use of MMX
33440 instructions unless MMX_OK is true. */
33441
33442 void
33443 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
33444 {
33445 enum machine_mode mode = GET_MODE (target);
33446 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33447 int n_elts = GET_MODE_NUNITS (mode);
33448 int n_var = 0, one_var = -1;
33449 bool all_same = true, all_const_zero = true;
33450 int i;
33451 rtx x;
33452
33453 for (i = 0; i < n_elts; ++i)
33454 {
33455 x = XVECEXP (vals, 0, i);
33456 if (!(CONST_INT_P (x)
33457 || GET_CODE (x) == CONST_DOUBLE
33458 || GET_CODE (x) == CONST_FIXED))
33459 n_var++, one_var = i;
33460 else if (x != CONST0_RTX (inner_mode))
33461 all_const_zero = false;
33462 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
33463 all_same = false;
33464 }
33465
33466 /* Constants are best loaded from the constant pool. */
33467 if (n_var == 0)
33468 {
33469 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
33470 return;
33471 }
33472
33473 /* If all values are identical, broadcast the value. */
33474 if (all_same
33475 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
33476 XVECEXP (vals, 0, 0)))
33477 return;
33478
33479 /* Values where only one field is non-constant are best loaded from
33480 the pool and overwritten via move later. */
33481 if (n_var == 1)
33482 {
33483 if (all_const_zero
33484 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
33485 XVECEXP (vals, 0, one_var),
33486 one_var))
33487 return;
33488
33489 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
33490 return;
33491 }
33492
33493 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
33494 }
33495
33496 void
33497 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
33498 {
33499 enum machine_mode mode = GET_MODE (target);
33500 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33501 enum machine_mode half_mode;
33502 bool use_vec_merge = false;
33503 rtx tmp;
33504 static rtx (*gen_extract[6][2]) (rtx, rtx)
33505 = {
33506 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
33507 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
33508 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
33509 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
33510 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
33511 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
33512 };
33513 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
33514 = {
33515 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
33516 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
33517 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
33518 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
33519 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
33520 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
33521 };
33522 int i, j, n;
33523
33524 switch (mode)
33525 {
33526 case V2SFmode:
33527 case V2SImode:
33528 if (mmx_ok)
33529 {
33530 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33531 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
33532 if (elt == 0)
33533 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33534 else
33535 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33536 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33537 return;
33538 }
33539 break;
33540
33541 case V2DImode:
33542 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
33543 if (use_vec_merge)
33544 break;
33545
33546 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33547 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
33548 if (elt == 0)
33549 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33550 else
33551 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33552 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33553 return;
33554
33555 case V2DFmode:
33556 {
33557 rtx op0, op1;
33558
33559 /* For the two element vectors, we implement a VEC_CONCAT with
33560 the extraction of the other element. */
33561
33562 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
33563 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
33564
33565 if (elt == 0)
33566 op0 = val, op1 = tmp;
33567 else
33568 op0 = tmp, op1 = val;
33569
33570 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
33571 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33572 }
33573 return;
33574
33575 case V4SFmode:
33576 use_vec_merge = TARGET_SSE4_1;
33577 if (use_vec_merge)
33578 break;
33579
33580 switch (elt)
33581 {
33582 case 0:
33583 use_vec_merge = true;
33584 break;
33585
33586 case 1:
33587 /* tmp = target = A B C D */
33588 tmp = copy_to_reg (target);
33589 /* target = A A B B */
33590 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
33591 /* target = X A B B */
33592 ix86_expand_vector_set (false, target, val, 0);
33593 /* target = A X C D */
33594 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33595 const1_rtx, const0_rtx,
33596 GEN_INT (2+4), GEN_INT (3+4)));
33597 return;
33598
33599 case 2:
33600 /* tmp = target = A B C D */
33601 tmp = copy_to_reg (target);
33602 /* tmp = X B C D */
33603 ix86_expand_vector_set (false, tmp, val, 0);
33604 /* target = A B X D */
33605 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33606 const0_rtx, const1_rtx,
33607 GEN_INT (0+4), GEN_INT (3+4)));
33608 return;
33609
33610 case 3:
33611 /* tmp = target = A B C D */
33612 tmp = copy_to_reg (target);
33613 /* tmp = X B C D */
33614 ix86_expand_vector_set (false, tmp, val, 0);
33615 /* target = A B C X */
33616 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33617 const0_rtx, const1_rtx,
33618 GEN_INT (2+4), GEN_INT (0+4)));
33619 return;
33620
33621 default:
33622 gcc_unreachable ();
33623 }
33624 break;
33625
33626 case V4SImode:
33627 use_vec_merge = TARGET_SSE4_1;
33628 if (use_vec_merge)
33629 break;
33630
33631 /* Element 0 handled by vec_merge below. */
33632 if (elt == 0)
33633 {
33634 use_vec_merge = true;
33635 break;
33636 }
33637
33638 if (TARGET_SSE2)
33639 {
33640 /* With SSE2, use integer shuffles to swap element 0 and ELT,
33641 store into element 0, then shuffle them back. */
33642
33643 rtx order[4];
33644
33645 order[0] = GEN_INT (elt);
33646 order[1] = const1_rtx;
33647 order[2] = const2_rtx;
33648 order[3] = GEN_INT (3);
33649 order[elt] = const0_rtx;
33650
33651 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33652 order[1], order[2], order[3]));
33653
33654 ix86_expand_vector_set (false, target, val, 0);
33655
33656 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33657 order[1], order[2], order[3]));
33658 }
33659 else
33660 {
33661 /* For SSE1, we have to reuse the V4SF code. */
33662 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
33663 gen_lowpart (SFmode, val), elt);
33664 }
33665 return;
33666
33667 case V8HImode:
33668 use_vec_merge = TARGET_SSE2;
33669 break;
33670 case V4HImode:
33671 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33672 break;
33673
33674 case V16QImode:
33675 use_vec_merge = TARGET_SSE4_1;
33676 break;
33677
33678 case V8QImode:
33679 break;
33680
33681 case V32QImode:
33682 half_mode = V16QImode;
33683 j = 0;
33684 n = 16;
33685 goto half;
33686
33687 case V16HImode:
33688 half_mode = V8HImode;
33689 j = 1;
33690 n = 8;
33691 goto half;
33692
33693 case V8SImode:
33694 half_mode = V4SImode;
33695 j = 2;
33696 n = 4;
33697 goto half;
33698
33699 case V4DImode:
33700 half_mode = V2DImode;
33701 j = 3;
33702 n = 2;
33703 goto half;
33704
33705 case V8SFmode:
33706 half_mode = V4SFmode;
33707 j = 4;
33708 n = 4;
33709 goto half;
33710
33711 case V4DFmode:
33712 half_mode = V2DFmode;
33713 j = 5;
33714 n = 2;
33715 goto half;
33716
33717 half:
33718 /* Compute offset. */
33719 i = elt / n;
33720 elt %= n;
33721
33722 gcc_assert (i <= 1);
33723
33724 /* Extract the half. */
33725 tmp = gen_reg_rtx (half_mode);
33726 emit_insn (gen_extract[j][i] (tmp, target));
33727
33728 /* Put val in tmp at elt. */
33729 ix86_expand_vector_set (false, tmp, val, elt);
33730
33731 /* Put it back. */
33732 emit_insn (gen_insert[j][i] (target, target, tmp));
33733 return;
33734
33735 default:
33736 break;
33737 }
33738
33739 if (use_vec_merge)
33740 {
33741 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
33742 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
33743 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33744 }
33745 else
33746 {
33747 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33748
33749 emit_move_insn (mem, target);
33750
33751 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33752 emit_move_insn (tmp, val);
33753
33754 emit_move_insn (target, mem);
33755 }
33756 }
33757
33758 void
33759 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
33760 {
33761 enum machine_mode mode = GET_MODE (vec);
33762 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33763 bool use_vec_extr = false;
33764 rtx tmp;
33765
33766 switch (mode)
33767 {
33768 case V2SImode:
33769 case V2SFmode:
33770 if (!mmx_ok)
33771 break;
33772 /* FALLTHRU */
33773
33774 case V2DFmode:
33775 case V2DImode:
33776 use_vec_extr = true;
33777 break;
33778
33779 case V4SFmode:
33780 use_vec_extr = TARGET_SSE4_1;
33781 if (use_vec_extr)
33782 break;
33783
33784 switch (elt)
33785 {
33786 case 0:
33787 tmp = vec;
33788 break;
33789
33790 case 1:
33791 case 3:
33792 tmp = gen_reg_rtx (mode);
33793 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
33794 GEN_INT (elt), GEN_INT (elt),
33795 GEN_INT (elt+4), GEN_INT (elt+4)));
33796 break;
33797
33798 case 2:
33799 tmp = gen_reg_rtx (mode);
33800 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
33801 break;
33802
33803 default:
33804 gcc_unreachable ();
33805 }
33806 vec = tmp;
33807 use_vec_extr = true;
33808 elt = 0;
33809 break;
33810
33811 case V4SImode:
33812 use_vec_extr = TARGET_SSE4_1;
33813 if (use_vec_extr)
33814 break;
33815
33816 if (TARGET_SSE2)
33817 {
33818 switch (elt)
33819 {
33820 case 0:
33821 tmp = vec;
33822 break;
33823
33824 case 1:
33825 case 3:
33826 tmp = gen_reg_rtx (mode);
33827 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
33828 GEN_INT (elt), GEN_INT (elt),
33829 GEN_INT (elt), GEN_INT (elt)));
33830 break;
33831
33832 case 2:
33833 tmp = gen_reg_rtx (mode);
33834 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
33835 break;
33836
33837 default:
33838 gcc_unreachable ();
33839 }
33840 vec = tmp;
33841 use_vec_extr = true;
33842 elt = 0;
33843 }
33844 else
33845 {
33846 /* For SSE1, we have to reuse the V4SF code. */
33847 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
33848 gen_lowpart (V4SFmode, vec), elt);
33849 return;
33850 }
33851 break;
33852
33853 case V8HImode:
33854 use_vec_extr = TARGET_SSE2;
33855 break;
33856 case V4HImode:
33857 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33858 break;
33859
33860 case V16QImode:
33861 use_vec_extr = TARGET_SSE4_1;
33862 break;
33863
33864 case V8SFmode:
33865 if (TARGET_AVX)
33866 {
33867 tmp = gen_reg_rtx (V4SFmode);
33868 if (elt < 4)
33869 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
33870 else
33871 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
33872 ix86_expand_vector_extract (false, target, tmp, elt & 3);
33873 return;
33874 }
33875 break;
33876
33877 case V4DFmode:
33878 if (TARGET_AVX)
33879 {
33880 tmp = gen_reg_rtx (V2DFmode);
33881 if (elt < 2)
33882 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
33883 else
33884 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
33885 ix86_expand_vector_extract (false, target, tmp, elt & 1);
33886 return;
33887 }
33888 break;
33889
33890 case V32QImode:
33891 if (TARGET_AVX)
33892 {
33893 tmp = gen_reg_rtx (V16QImode);
33894 if (elt < 16)
33895 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
33896 else
33897 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
33898 ix86_expand_vector_extract (false, target, tmp, elt & 15);
33899 return;
33900 }
33901 break;
33902
33903 case V16HImode:
33904 if (TARGET_AVX)
33905 {
33906 tmp = gen_reg_rtx (V8HImode);
33907 if (elt < 8)
33908 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
33909 else
33910 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
33911 ix86_expand_vector_extract (false, target, tmp, elt & 7);
33912 return;
33913 }
33914 break;
33915
33916 case V8SImode:
33917 if (TARGET_AVX)
33918 {
33919 tmp = gen_reg_rtx (V4SImode);
33920 if (elt < 4)
33921 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
33922 else
33923 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
33924 ix86_expand_vector_extract (false, target, tmp, elt & 3);
33925 return;
33926 }
33927 break;
33928
33929 case V4DImode:
33930 if (TARGET_AVX)
33931 {
33932 tmp = gen_reg_rtx (V2DImode);
33933 if (elt < 2)
33934 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
33935 else
33936 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
33937 ix86_expand_vector_extract (false, target, tmp, elt & 1);
33938 return;
33939 }
33940 break;
33941
33942 case V8QImode:
33943 /* ??? Could extract the appropriate HImode element and shift. */
33944 default:
33945 break;
33946 }
33947
33948 if (use_vec_extr)
33949 {
33950 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
33951 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
33952
33953 /* Let the rtl optimizers know about the zero extension performed. */
33954 if (inner_mode == QImode || inner_mode == HImode)
33955 {
33956 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
33957 target = gen_lowpart (SImode, target);
33958 }
33959
33960 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33961 }
33962 else
33963 {
33964 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33965
33966 emit_move_insn (mem, vec);
33967
33968 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33969 emit_move_insn (target, tmp);
33970 }
33971 }
33972
33973 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
33974 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
33975 The upper bits of DEST are undefined, though they shouldn't cause
33976 exceptions (some bits from src or all zeros are ok). */
33977
33978 static void
33979 emit_reduc_half (rtx dest, rtx src, int i)
33980 {
33981 rtx tem;
33982 switch (GET_MODE (src))
33983 {
33984 case V4SFmode:
33985 if (i == 128)
33986 tem = gen_sse_movhlps (dest, src, src);
33987 else
33988 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
33989 GEN_INT (1 + 4), GEN_INT (1 + 4));
33990 break;
33991 case V2DFmode:
33992 tem = gen_vec_interleave_highv2df (dest, src, src);
33993 break;
33994 case V16QImode:
33995 case V8HImode:
33996 case V4SImode:
33997 case V2DImode:
33998 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
33999 gen_lowpart (V1TImode, src),
34000 GEN_INT (i / 2));
34001 break;
34002 case V8SFmode:
34003 if (i == 256)
34004 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
34005 else
34006 tem = gen_avx_shufps256 (dest, src, src,
34007 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
34008 break;
34009 case V4DFmode:
34010 if (i == 256)
34011 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
34012 else
34013 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
34014 break;
34015 case V32QImode:
34016 case V16HImode:
34017 case V8SImode:
34018 case V4DImode:
34019 if (i == 256)
34020 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
34021 gen_lowpart (V4DImode, src),
34022 gen_lowpart (V4DImode, src),
34023 const1_rtx);
34024 else
34025 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
34026 gen_lowpart (V2TImode, src),
34027 GEN_INT (i / 2));
34028 break;
34029 default:
34030 gcc_unreachable ();
34031 }
34032 emit_insn (tem);
34033 }
34034
34035 /* Expand a vector reduction. FN is the binary pattern to reduce;
34036 DEST is the destination; IN is the input vector. */
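/* For illustration, reducing a V4SFmode vector {a b c d} with FN == plus
   takes two halving steps, roughly (a sketch; element 0 is leftmost and
   '.' is a don't-care):
     i = 128:  half = {c d . .}       vec  = half + vec = {a+c b+d . .}
     i =  64:  half = {b+d . . .}     dest = half + vec = {a+b+c+d . . .}
   so element 0 of DEST holds the full reduction.  */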
34037
34038 void
34039 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
34040 {
34041 rtx half, dst, vec = in;
34042 enum machine_mode mode = GET_MODE (in);
34043 int i;
34044
34045 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
34046 if (TARGET_SSE4_1
34047 && mode == V8HImode
34048 && fn == gen_uminv8hi3)
34049 {
34050 emit_insn (gen_sse4_1_phminposuw (dest, in));
34051 return;
34052 }
34053
34054 for (i = GET_MODE_BITSIZE (mode);
34055 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
34056 i >>= 1)
34057 {
34058 half = gen_reg_rtx (mode);
34059 emit_reduc_half (half, vec, i);
34060 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
34061 dst = dest;
34062 else
34063 dst = gen_reg_rtx (mode);
34064 emit_insn (fn (dst, half, vec));
34065 vec = dst;
34066 }
34067 }
34068 \f
34069 /* Target hook for scalar_mode_supported_p. */
34070 static bool
34071 ix86_scalar_mode_supported_p (enum machine_mode mode)
34072 {
34073 if (DECIMAL_FLOAT_MODE_P (mode))
34074 return default_decimal_float_supported_p ();
34075 else if (mode == TFmode)
34076 return true;
34077 else
34078 return default_scalar_mode_supported_p (mode);
34079 }
34080
34081 /* Implements target hook vector_mode_supported_p. */
34082 static bool
34083 ix86_vector_mode_supported_p (enum machine_mode mode)
34084 {
34085 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34086 return true;
34087 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34088 return true;
34089 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34090 return true;
34091 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
34092 return true;
34093 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
34094 return true;
34095 return false;
34096 }
34097
34098 /* Target hook for c_mode_for_suffix. */
34099 static enum machine_mode
34100 ix86_c_mode_for_suffix (char suffix)
34101 {
34102 if (suffix == 'q')
34103 return TFmode;
34104 if (suffix == 'w')
34105 return XFmode;
34106
34107 return VOIDmode;
34108 }
34109
34110 /* Worker function for TARGET_MD_ASM_CLOBBERS.
34111
34112 We do this in the new i386 backend to maintain source compatibility
34113 with the old cc0-based compiler. */
34114
34115 static tree
34116 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
34117 tree inputs ATTRIBUTE_UNUSED,
34118 tree clobbers)
34119 {
34120 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
34121 clobbers);
34122 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
34123 clobbers);
34124 return clobbers;
34125 }
34126
34127 /* Implements the target hook targetm.asm.encode_section_info. */
34128
34129 static void ATTRIBUTE_UNUSED
34130 ix86_encode_section_info (tree decl, rtx rtl, int first)
34131 {
34132 default_encode_section_info (decl, rtl, first);
34133
34134 if (TREE_CODE (decl) == VAR_DECL
34135 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
34136 && ix86_in_large_data_p (decl))
34137 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
34138 }
34139
34140 /* Worker function for REVERSE_CONDITION. */
34141
34142 enum rtx_code
34143 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
34144 {
34145 return (mode != CCFPmode && mode != CCFPUmode
34146 ? reverse_condition (code)
34147 : reverse_condition_maybe_unordered (code));
34148 }
34149
34150 /* Output code to perform an x87 FP register move, from OPERANDS[1]
34151 to OPERANDS[0]. */
34152
34153 const char *
34154 output_387_reg_move (rtx insn, rtx *operands)
34155 {
34156 if (REG_P (operands[0]))
34157 {
34158 if (REG_P (operands[1])
34159 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34160 {
34161 if (REGNO (operands[0]) == FIRST_STACK_REG)
34162 return output_387_ffreep (operands, 0);
34163 return "fstp\t%y0";
34164 }
34165 if (STACK_TOP_P (operands[0]))
34166 return "fld%Z1\t%y1";
34167 return "fst\t%y0";
34168 }
34169 else if (MEM_P (operands[0]))
34170 {
34171 gcc_assert (REG_P (operands[1]));
34172 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34173 return "fstp%Z0\t%y0";
34174 else
34175 {
34176 /* There is no non-popping store to memory for XFmode.
34177 So if we need one, follow the store with a load. */
34178 if (GET_MODE (operands[0]) == XFmode)
34179 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
34180 else
34181 return "fst%Z0\t%y0";
34182 }
34183 }
34184 else
34185 gcc_unreachable();
34186 }
34187
34188 /* Output code to perform a conditional jump to LABEL if the C2 flag in
34189 the FP status register is set. */
34190
34191 void
34192 ix86_emit_fp_unordered_jump (rtx label)
34193 {
34194 rtx reg = gen_reg_rtx (HImode);
34195 rtx temp;
34196
34197 emit_insn (gen_x86_fnstsw_1 (reg));
34198
34199 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
34200 {
34201 emit_insn (gen_x86_sahf_1 (reg));
34202
34203 temp = gen_rtx_REG (CCmode, FLAGS_REG);
34204 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
34205 }
34206 else
34207 {
34208 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
34209
34210 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
34211 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
34212 }
34213
34214 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
34215 gen_rtx_LABEL_REF (VOIDmode, label),
34216 pc_rtx);
34217 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
34218
34219 emit_jump_insn (temp);
34220 predict_jump (REG_BR_PROB_BASE * 10 / 100);
34221 }
34222
34223 /* Output code to perform a log1p XFmode calculation. */
34224
34225 void ix86_emit_i387_log1p (rtx op0, rtx op1)
34226 {
34227 rtx label1 = gen_label_rtx ();
34228 rtx label2 = gen_label_rtx ();
34229
34230 rtx tmp = gen_reg_rtx (XFmode);
34231 rtx tmp2 = gen_reg_rtx (XFmode);
34232 rtx test;
34233
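 /* A note on the magic threshold below (an assumption based on the x87
    documentation, not on this file): fyl2xp1 is only specified for
    |x| < 1 - sqrt(2)/2 ~= 0.29289321881..., so within that range we use
	op0 = fldln2 * fyl2xp1 (op1)
    directly, and otherwise fall back to
	op0 = fldln2 * fyl2x (1.0 + op1).  */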
34234 emit_insn (gen_absxf2 (tmp, op1));
34235 test = gen_rtx_GE (VOIDmode, tmp,
34236 CONST_DOUBLE_FROM_REAL_VALUE (
34237 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
34238 XFmode));
34239 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
34240
34241 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34242 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
34243 emit_jump (label2);
34244
34245 emit_label (label1);
34246 emit_move_insn (tmp, CONST1_RTX (XFmode));
34247 emit_insn (gen_addxf3 (tmp, op1, tmp));
34248 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34249 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
34250
34251 emit_label (label2);
34252 }
34253
34254 /* Output x87 code to compute round from OP1, storing the result into OP0. */
34255 void ix86_emit_i387_round (rtx op0, rtx op1)
34256 {
34257 enum machine_mode inmode = GET_MODE (op1);
34258 enum machine_mode outmode = GET_MODE (op0);
34259 rtx e1, e2, res, tmp, tmp1, half;
34260 rtx scratch = gen_reg_rtx (HImode);
34261 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
34262 rtx jump_label = gen_label_rtx ();
34263 rtx insn;
34264 rtx (*gen_abs) (rtx, rtx);
34265 rtx (*gen_neg) (rtx, rtx);
34266
34267 switch (inmode)
34268 {
34269 case SFmode:
34270 gen_abs = gen_abssf2;
34271 break;
34272 case DFmode:
34273 gen_abs = gen_absdf2;
34274 break;
34275 case XFmode:
34276 gen_abs = gen_absxf2;
34277 break;
34278 default:
34279 gcc_unreachable ();
34280 }
34281
34282 switch (outmode)
34283 {
34284 case SFmode:
34285 gen_neg = gen_negsf2;
34286 break;
34287 case DFmode:
34288 gen_neg = gen_negdf2;
34289 break;
34290 case XFmode:
34291 gen_neg = gen_negxf2;
34292 break;
34293 case HImode:
34294 gen_neg = gen_neghi2;
34295 break;
34296 case SImode:
34297 gen_neg = gen_negsi2;
34298 break;
34299 case DImode:
34300 gen_neg = gen_negdi2;
34301 break;
34302 default:
34303 gcc_unreachable ();
34304 }
34305
34306 e1 = gen_reg_rtx (inmode);
34307 e2 = gen_reg_rtx (inmode);
34308 res = gen_reg_rtx (outmode);
34309
34310 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
34311
34312 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
34313
34314 /* scratch = fxam(op1) */
34315 emit_insn (gen_rtx_SET (VOIDmode, scratch,
34316 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
34317 UNSPEC_FXAM)));
34318 /* e1 = fabs(op1) */
34319 emit_insn (gen_abs (e1, op1));
34320
34321 /* e2 = e1 + 0.5 */
34322 half = force_reg (inmode, half);
34323 emit_insn (gen_rtx_SET (VOIDmode, e2,
34324 gen_rtx_PLUS (inmode, e1, half)));
34325
34326 /* res = floor(e2) */
34327 if (inmode != XFmode)
34328 {
34329 tmp1 = gen_reg_rtx (XFmode);
34330
34331 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
34332 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
34333 }
34334 else
34335 tmp1 = e2;
34336
34337 switch (outmode)
34338 {
34339 case SFmode:
34340 case DFmode:
34341 {
34342 rtx tmp0 = gen_reg_rtx (XFmode);
34343
34344 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
34345
34346 emit_insn (gen_rtx_SET (VOIDmode, res,
34347 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
34348 UNSPEC_TRUNC_NOOP)));
34349 }
34350 break;
34351 case XFmode:
34352 emit_insn (gen_frndintxf2_floor (res, tmp1));
34353 break;
34354 case HImode:
34355 emit_insn (gen_lfloorxfhi2 (res, tmp1));
34356 break;
34357 case SImode:
34358 emit_insn (gen_lfloorxfsi2 (res, tmp1));
34359 break;
34360 case DImode:
34361 emit_insn (gen_lfloorxfdi2 (res, tmp1));
34362 break;
34363 default:
34364 gcc_unreachable ();
34365 }
34366
34367 /* flags = signbit(a) */
34368 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
34369
34370 /* if (flags) then res = -res */
34371 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
34372 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
34373 gen_rtx_LABEL_REF (VOIDmode, jump_label),
34374 pc_rtx);
34375 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34376 predict_jump (REG_BR_PROB_BASE * 50 / 100);
34377 JUMP_LABEL (insn) = jump_label;
34378
34379 emit_insn (gen_neg (res, res));
34380
34381 emit_label (jump_label);
34382 LABEL_NUSES (jump_label) = 1;
34383
34384 emit_move_insn (op0, res);
34385 }
34386
34387 /* Output code to perform a Newton-Raphson approximation of a single precision
34388 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
34389
34390 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
34391 {
34392 rtx x0, x1, e0, e1;
34393
34394 x0 = gen_reg_rtx (mode);
34395 e0 = gen_reg_rtx (mode);
34396 e1 = gen_reg_rtx (mode);
34397 x1 = gen_reg_rtx (mode);
34398
34399 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
34400
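 /* This is one Newton-Raphson step for f(x) = 1/x - b, starting from the
    hardware estimate x0 = rcp(b).  Written out (a sketch, not literal code):
	x1 = x0 * (2 - b*x0) = (x0 + x0) - (b*x0*x0)
    which roughly doubles the number of correct bits of the ~12-bit
    rcpss/rcpps estimate.  */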
34401 b = force_reg (mode, b);
34402
34403 /* x0 = rcp(b) estimate */
34404 emit_insn (gen_rtx_SET (VOIDmode, x0,
34405 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
34406 UNSPEC_RCP)));
34407 /* e0 = x0 * b */
34408 emit_insn (gen_rtx_SET (VOIDmode, e0,
34409 gen_rtx_MULT (mode, x0, b)));
34410
34411 /* e0 = x0 * e0 */
34412 emit_insn (gen_rtx_SET (VOIDmode, e0,
34413 gen_rtx_MULT (mode, x0, e0)));
34414
34415 /* e1 = x0 + x0 */
34416 emit_insn (gen_rtx_SET (VOIDmode, e1,
34417 gen_rtx_PLUS (mode, x0, x0)));
34418
34419 /* x1 = e1 - e0 */
34420 emit_insn (gen_rtx_SET (VOIDmode, x1,
34421 gen_rtx_MINUS (mode, e1, e0)));
34422
34423 /* res = a * x1 */
34424 emit_insn (gen_rtx_SET (VOIDmode, res,
34425 gen_rtx_MULT (mode, a, x1)));
34426 }
34427
34428 /* Output code to perform a Newton-Raphson approximation of a
34429 single precision floating point [reciprocal] square root. */
34430
34431 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
34432 bool recip)
34433 {
34434 rtx x0, e0, e1, e2, e3, mthree, mhalf;
34435 REAL_VALUE_TYPE r;
34436
34437 x0 = gen_reg_rtx (mode);
34438 e0 = gen_reg_rtx (mode);
34439 e1 = gen_reg_rtx (mode);
34440 e2 = gen_reg_rtx (mode);
34441 e3 = gen_reg_rtx (mode);
34442
34443 real_from_integer (&r, VOIDmode, -3, -1, 0);
34444 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34445
34446 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
34447 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34448
34449 if (VECTOR_MODE_P (mode))
34450 {
34451 mthree = ix86_build_const_vector (mode, true, mthree);
34452 mhalf = ix86_build_const_vector (mode, true, mhalf);
34453 }
34454
34455 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
34456 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
34457
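 /* This is one Newton-Raphson step for f(x) = 1/x**2 - a, starting from the
    hardware estimate x0 = rsqrt(a).  Written out (a sketch, not literal code):
	x1 = x0 * (3 - a*x0*x0) / 2 = -0.5 * x0 * (a*x0*x0 - 3.0)
    and multiplying the refined 1/sqrt(a) by a once more yields sqrt(a).  */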
34458 a = force_reg (mode, a);
34459
34460 /* x0 = rsqrt(a) estimate */
34461 emit_insn (gen_rtx_SET (VOIDmode, x0,
34462 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
34463 UNSPEC_RSQRT)));
34464
34465 /* If a == 0.0, mask out the infinite rsqrt estimate to prevent a NaN result for sqrt (0.0). */
34466 if (!recip)
34467 {
34468 rtx zero, mask;
34469
34470 zero = gen_reg_rtx (mode);
34471 mask = gen_reg_rtx (mode);
34472
34473 zero = force_reg (mode, CONST0_RTX(mode));
34474 emit_insn (gen_rtx_SET (VOIDmode, mask,
34475 gen_rtx_NE (mode, zero, a)));
34476
34477 emit_insn (gen_rtx_SET (VOIDmode, x0,
34478 gen_rtx_AND (mode, x0, mask)));
34479 }
34480
34481 /* e0 = x0 * a */
34482 emit_insn (gen_rtx_SET (VOIDmode, e0,
34483 gen_rtx_MULT (mode, x0, a)));
34484 /* e1 = e0 * x0 */
34485 emit_insn (gen_rtx_SET (VOIDmode, e1,
34486 gen_rtx_MULT (mode, e0, x0)));
34487
34488 /* e2 = e1 - 3. */
34489 mthree = force_reg (mode, mthree);
34490 emit_insn (gen_rtx_SET (VOIDmode, e2,
34491 gen_rtx_PLUS (mode, e1, mthree)));
34492
34493 mhalf = force_reg (mode, mhalf);
34494 if (recip)
34495 /* e3 = -.5 * x0 */
34496 emit_insn (gen_rtx_SET (VOIDmode, e3,
34497 gen_rtx_MULT (mode, x0, mhalf)));
34498 else
34499 /* e3 = -.5 * e0 */
34500 emit_insn (gen_rtx_SET (VOIDmode, e3,
34501 gen_rtx_MULT (mode, e0, mhalf)));
34502 /* ret = e2 * e3 */
34503 emit_insn (gen_rtx_SET (VOIDmode, res,
34504 gen_rtx_MULT (mode, e2, e3)));
34505 }
34506
34507 #ifdef TARGET_SOLARIS
34508 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
34509
34510 static void
34511 i386_solaris_elf_named_section (const char *name, unsigned int flags,
34512 tree decl)
34513 {
34514 /* With Binutils 2.15, the "@unwind" marker must be specified on
34515 every occurrence of the ".eh_frame" section, not just the first
34516 one. */
34517 if (TARGET_64BIT
34518 && strcmp (name, ".eh_frame") == 0)
34519 {
34520 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
34521 flags & SECTION_WRITE ? "aw" : "a");
34522 return;
34523 }
34524
34525 #ifndef USE_GAS
34526 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
34527 {
34528 solaris_elf_asm_comdat_section (name, flags, decl);
34529 return;
34530 }
34531 #endif
34532
34533 default_elf_asm_named_section (name, flags, decl);
34534 }
34535 #endif /* TARGET_SOLARIS */
34536
34537 /* Return the mangling of TYPE if it is an extended fundamental type. */
34538
34539 static const char *
34540 ix86_mangle_type (const_tree type)
34541 {
34542 type = TYPE_MAIN_VARIANT (type);
34543
34544 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
34545 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
34546 return NULL;
34547
34548 switch (TYPE_MODE (type))
34549 {
34550 case TFmode:
34551 /* __float128 is "g". */
34552 return "g";
34553 case XFmode:
34554 /* "long double" or __float80 is "e". */
34555 return "e";
34556 default:
34557 return NULL;
34558 }
34559 }
34560
34561 /* For 32-bit code we can save PIC register setup by using
34562 __stack_chk_fail_local hidden function instead of calling
34563 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
34564 register, so it is better to call __stack_chk_fail directly. */
34565
34566 static tree ATTRIBUTE_UNUSED
34567 ix86_stack_protect_fail (void)
34568 {
34569 return TARGET_64BIT
34570 ? default_external_stack_protect_fail ()
34571 : default_hidden_stack_protect_fail ();
34572 }
34573
34574 /* Select a format to encode pointers in exception handling data. CODE
34575 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
34576 true if the symbol may be affected by dynamic relocations.
34577
34578 ??? All x86 object file formats are capable of representing this.
34579 After all, the relocation needed is the same as for the call insn.
34580 Whether or not a particular assembler allows us to enter such, I
34581 guess we'll have to see. */
34582 int
34583 asm_preferred_eh_data_format (int code, int global)
34584 {
34585 if (flag_pic)
34586 {
34587 int type = DW_EH_PE_sdata8;
34588 if (!TARGET_64BIT
34589 || ix86_cmodel == CM_SMALL_PIC
34590 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
34591 type = DW_EH_PE_sdata4;
34592 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
34593 }
34594 if (ix86_cmodel == CM_SMALL
34595 || (ix86_cmodel == CM_MEDIUM && code))
34596 return DW_EH_PE_udata4;
34597 return DW_EH_PE_absptr;
34598 }
34599 \f
34600 /* Expand copysign: copy the sign of SIGN onto the positive value
34601 ABS_VALUE, storing the result in RESULT. If MASK is non-null, it
34602 shall be a mask that masks out the sign-bit. */
34603 static void
34604 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
34605 {
34606 enum machine_mode mode = GET_MODE (sign);
34607 rtx sgn = gen_reg_rtx (mode);
34608 if (mask == NULL_RTX)
34609 {
34610 enum machine_mode vmode;
34611
34612 if (mode == SFmode)
34613 vmode = V4SFmode;
34614 else if (mode == DFmode)
34615 vmode = V2DFmode;
34616 else
34617 vmode = mode;
34618
34619 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
34620 if (!VECTOR_MODE_P (mode))
34621 {
34622 /* We need to generate a scalar mode mask in this case. */
34623 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34624 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34625 mask = gen_reg_rtx (mode);
34626 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34627 }
34628 }
34629 else
34630 mask = gen_rtx_NOT (mode, mask);
34631 emit_insn (gen_rtx_SET (VOIDmode, sgn,
34632 gen_rtx_AND (mode, mask, sign)));
34633 emit_insn (gen_rtx_SET (VOIDmode, result,
34634 gen_rtx_IOR (mode, abs_value, sgn)));
34635 }
34636
34637 /* Expand fabs (OP0) and return a new rtx that holds the result. The
34638 mask for masking out the sign-bit is stored in *SMASK, if that is
34639 non-null. */
34640 static rtx
34641 ix86_expand_sse_fabs (rtx op0, rtx *smask)
34642 {
34643 enum machine_mode vmode, mode = GET_MODE (op0);
34644 rtx xa, mask;
34645
34646 xa = gen_reg_rtx (mode);
34647 if (mode == SFmode)
34648 vmode = V4SFmode;
34649 else if (mode == DFmode)
34650 vmode = V2DFmode;
34651 else
34652 vmode = mode;
34653 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
34654 if (!VECTOR_MODE_P (mode))
34655 {
34656 /* We need to generate a scalar mode mask in this case. */
34657 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34658 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34659 mask = gen_reg_rtx (mode);
34660 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34661 }
34662 emit_insn (gen_rtx_SET (VOIDmode, xa,
34663 gen_rtx_AND (mode, op0, mask)));
34664
34665 if (smask)
34666 *smask = mask;
34667
34668 return xa;
34669 }
34670
34671 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
34672 swapping the operands if SWAP_OPERANDS is true. The expanded
34673 code is a forward jump to a newly created label in case the
34674 comparison is true. The generated label rtx is returned. */
34675 static rtx
34676 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
34677 bool swap_operands)
34678 {
34679 rtx label, tmp;
34680
34681 if (swap_operands)
34682 {
34683 tmp = op0;
34684 op0 = op1;
34685 op1 = tmp;
34686 }
34687
34688 label = gen_label_rtx ();
34689 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
34690 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34691 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
34692 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
34693 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
34694 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
34695 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34696 JUMP_LABEL (tmp) = label;
34697
34698 return label;
34699 }
34700
34701 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
34702 using comparison code CODE. Operands are swapped for the comparison if
34703 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
34704 static rtx
34705 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
34706 bool swap_operands)
34707 {
34708 rtx (*insn)(rtx, rtx, rtx, rtx);
34709 enum machine_mode mode = GET_MODE (op0);
34710 rtx mask = gen_reg_rtx (mode);
34711
34712 if (swap_operands)
34713 {
34714 rtx tmp = op0;
34715 op0 = op1;
34716 op1 = tmp;
34717 }
34718
34719 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
34720
34721 emit_insn (insn (mask, op0, op1,
34722 gen_rtx_fmt_ee (code, mode, op0, op1)));
34723 return mask;
34724 }
34725
34726 /* Generate and return a rtx of mode MODE for 2**n where n is the number
34727 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
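/* The expanders below use this constant for the classic add-and-subtract
   rounding trick (a sketch, shown for DFmode; SFmode uses 2**23):
     t = x + 2**52;    all fraction bits fall off, so t is x rounded to an
     r = t - 2**52;    integer, and r is that integer back in normal range
   The callers first bail out when fabs (x) >= 2**52, which is already
   integral.  */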
34728 static rtx
34729 ix86_gen_TWO52 (enum machine_mode mode)
34730 {
34731 REAL_VALUE_TYPE TWO52r;
34732 rtx TWO52;
34733
34734 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
34735 TWO52 = const_double_from_real_value (TWO52r, mode);
34736 TWO52 = force_reg (mode, TWO52);
34737
34738 return TWO52;
34739 }
34740
34741 /* Expand SSE sequence for computing lround from OP1 storing
34742 into OP0. */
34743 void
34744 ix86_expand_lround (rtx op0, rtx op1)
34745 {
34746 /* C code for the stuff we're doing below:
34747 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
34748 return (long)tmp;
34749 */
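 /* Why nextafter (0.5, 0.0) rather than plain 0.5 (a sketch of the corner
    case): for the largest double just below 0.5, x + 0.5 rounds up to 1.0
    and would truncate to 1, while x + nextafter (0.5, 0.0) stays below 1.0
    and truncates to the correct 0.  For x == 0.5 the sum still rounds up to
    1.0, so halfway values keep rounding away from zero.  */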
34750 enum machine_mode mode = GET_MODE (op1);
34751 const struct real_format *fmt;
34752 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34753 rtx adj;
34754
34755 /* load nextafter (0.5, 0.0) */
34756 fmt = REAL_MODE_FORMAT (mode);
34757 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34758 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34759
34760 /* adj = copysign (0.5, op1) */
34761 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
34762 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
34763
34764 /* adj = op1 + adj */
34765 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
34766
34767 /* op0 = (imode)adj */
34768 expand_fix (op0, adj, 0);
34769 }
34770
34771 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1,
34772 storing the result into OPERAND0. */
34773 void
34774 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
34775 {
34776 /* C code for the stuff we're doing below (for do_floor):
34777 xi = (long)op1;
34778 xi -= (double)xi > op1 ? 1 : 0;
34779 return xi;
34780 */
34781 enum machine_mode fmode = GET_MODE (op1);
34782 enum machine_mode imode = GET_MODE (op0);
34783 rtx ireg, freg, label, tmp;
34784
34785 /* reg = (long)op1 */
34786 ireg = gen_reg_rtx (imode);
34787 expand_fix (ireg, op1, 0);
34788
34789 /* freg = (double)reg */
34790 freg = gen_reg_rtx (fmode);
34791 expand_float (freg, ireg, 0);
34792
34793 /* ireg = (freg > op1) ? ireg - 1 : ireg */
34794 label = ix86_expand_sse_compare_and_jump (UNLE,
34795 freg, op1, !do_floor);
34796 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
34797 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
34798 emit_move_insn (ireg, tmp);
34799
34800 emit_label (label);
34801 LABEL_NUSES (label) = 1;
34802
34803 emit_move_insn (op0, ireg);
34804 }
34805
34806 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
34807 result in OPERAND0. */
34808 void
34809 ix86_expand_rint (rtx operand0, rtx operand1)
34810 {
34811 /* C code for the stuff we're doing below:
34812 xa = fabs (operand1);
34813 if (!isless (xa, 2**52))
34814 return operand1;
34815 xa = xa + 2**52 - 2**52;
34816 return copysign (xa, operand1);
34817 */
34818 enum machine_mode mode = GET_MODE (operand0);
34819 rtx res, xa, label, TWO52, mask;
34820
34821 res = gen_reg_rtx (mode);
34822 emit_move_insn (res, operand1);
34823
34824 /* xa = abs (operand1) */
34825 xa = ix86_expand_sse_fabs (res, &mask);
34826
34827 /* if (!isless (xa, TWO52)) goto label; */
34828 TWO52 = ix86_gen_TWO52 (mode);
34829 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34830
34831 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34832 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34833
34834 ix86_sse_copysign_to_positive (res, xa, res, mask);
34835
34836 emit_label (label);
34837 LABEL_NUSES (label) = 1;
34838
34839 emit_move_insn (operand0, res);
34840 }
34841
34842 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
34843 into OPERAND0. */
34844 void
34845 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
34846 {
34847 /* C code for the stuff we expand below.
34848 double xa = fabs (x), x2;
34849 if (!isless (xa, TWO52))
34850 return x;
34851 xa = xa + TWO52 - TWO52;
34852 x2 = copysign (xa, x);
34853 Compensate. Floor:
34854 if (x2 > x)
34855 x2 -= 1;
34856 Compensate. Ceil:
34857 if (x2 < x)
34858 x2 -= -1;
34859 return x2;
34860 */
34861 enum machine_mode mode = GET_MODE (operand0);
34862 rtx xa, TWO52, tmp, label, one, res, mask;
34863
34864 TWO52 = ix86_gen_TWO52 (mode);
34865
34866 /* Temporary for holding the result, initialized to the input
34867 operand to ease control flow. */
34868 res = gen_reg_rtx (mode);
34869 emit_move_insn (res, operand1);
34870
34871 /* xa = abs (operand1) */
34872 xa = ix86_expand_sse_fabs (res, &mask);
34873
34874 /* if (!isless (xa, TWO52)) goto label; */
34875 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34876
34877 /* xa = xa + TWO52 - TWO52; */
34878 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34879 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34880
34881 /* xa = copysign (xa, operand1) */
34882 ix86_sse_copysign_to_positive (xa, xa, res, mask);
34883
34884 /* generate 1.0 or -1.0 */
34885 one = force_reg (mode,
34886 const_double_from_real_value (do_floor
34887 ? dconst1 : dconstm1, mode));
34888
34889 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
34890 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
34891 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34892 gen_rtx_AND (mode, one, tmp)));
34893 /* We always need to subtract here to preserve signed zero. */
34894 tmp = expand_simple_binop (mode, MINUS,
34895 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34896 emit_move_insn (res, tmp);
34897
34898 emit_label (label);
34899 LABEL_NUSES (label) = 1;
34900
34901 emit_move_insn (operand0, res);
34902 }
34903
34904 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
34905 into OPERAND0. */
34906 void
34907 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
34908 {
34909 /* C code for the stuff we expand below.
34910 double xa = fabs (x), x2;
34911 if (!isless (xa, TWO52))
34912 return x;
34913 x2 = (double)(long)x;
34914 Compensate. Floor:
34915 if (x2 > x)
34916 x2 -= 1;
34917 Compensate. Ceil:
34918 if (x2 < x)
34919 x2 += 1;
34920 if (HONOR_SIGNED_ZEROS (mode))
34921 return copysign (x2, x);
34922 return x2;
34923 */
34924 enum machine_mode mode = GET_MODE (operand0);
34925 rtx xa, xi, TWO52, tmp, label, one, res, mask;
34926
34927 TWO52 = ix86_gen_TWO52 (mode);
34928
34929 /* Temporary for holding the result, initialized to the input
34930 operand to ease control flow. */
34931 res = gen_reg_rtx (mode);
34932 emit_move_insn (res, operand1);
34933
34934 /* xa = abs (operand1) */
34935 xa = ix86_expand_sse_fabs (res, &mask);
34936
34937 /* if (!isless (xa, TWO52)) goto label; */
34938 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34939
34940 /* xa = (double)(long)x */
34941 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
34942 expand_fix (xi, res, 0);
34943 expand_float (xa, xi, 0);
34944
34945 /* generate 1.0 */
34946 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
34947
34948 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
34949 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
34950 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34951 gen_rtx_AND (mode, one, tmp)));
34952 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
34953 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34954 emit_move_insn (res, tmp);
34955
34956 if (HONOR_SIGNED_ZEROS (mode))
34957 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
34958
34959 emit_label (label);
34960 LABEL_NUSES (label) = 1;
34961
34962 emit_move_insn (operand0, res);
34963 }
34964
34965 /* Expand SSE sequence for computing round from OPERAND1 storing
34966 into OPERAND0. Sequence that works without relying on DImode truncation
34967 via cvttsd2siq that is only available on 64bit targets. */
34968 void
34969 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
34970 {
34971 /* C code for the stuff we expand below.
34972 double xa = fabs (x), xa2, x2;
34973 if (!isless (xa, TWO52))
34974 return x;
34975 Using the absolute value and copying back sign makes
34976 -0.0 -> -0.0 correct.
34977 xa2 = xa + TWO52 - TWO52;
34978 Compensate.
34979 dxa = xa2 - xa;
34980 if (dxa <= -0.5)
34981 xa2 += 1;
34982 else if (dxa > 0.5)
34983 xa2 -= 1;
34984 x2 = copysign (xa2, x);
34985 return x2;
34986 */
34987 enum machine_mode mode = GET_MODE (operand0);
34988 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
34989
34990 TWO52 = ix86_gen_TWO52 (mode);
34991
34992 /* Temporary for holding the result, initialized to the input
34993 operand to ease control flow. */
34994 res = gen_reg_rtx (mode);
34995 emit_move_insn (res, operand1);
34996
34997 /* xa = abs (operand1) */
34998 xa = ix86_expand_sse_fabs (res, &mask);
34999
35000 /* if (!isless (xa, TWO52)) goto label; */
35001 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35002
35003 /* xa2 = xa + TWO52 - TWO52; */
35004 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35005 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
35006
35007 /* dxa = xa2 - xa; */
35008 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
35009
35010 /* generate 0.5, 1.0 and -0.5 */
35011 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
35012 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
35013 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
35014 0, OPTAB_DIRECT);
35015
35016 /* Compensate. */
35017 tmp = gen_reg_rtx (mode);
35018 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
35019 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
35020 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35021 gen_rtx_AND (mode, one, tmp)));
35022 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35023 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
35024 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
35025 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35026 gen_rtx_AND (mode, one, tmp)));
35027 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35028
35029 /* res = copysign (xa2, operand1) */
35030 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
35031
35032 emit_label (label);
35033 LABEL_NUSES (label) = 1;
35034
35035 emit_move_insn (operand0, res);
35036 }
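
/* Illustrative note (exposition only): the xa + TWO52 - TWO52 idiom above
   relies on the fact that for 0 <= xa < 2^52 adding and subtracting 2^52
   leaves the value rounded to an integer in the prevailing
   round-to-nearest-even mode.  For example, with xa = 2.5:

     xa2 = xa + 0x1p52 - 0x1p52;     2.0 under round-to-even
     dxa = xa2 - xa;                 -0.5
     if (dxa <= -0.5) xa2 += 1.0;    3.0, i.e. round half away from zero

   which is exactly the compensation emitted by the RTL above.  */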
35037
35038 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35039 into OPERAND0. */
35040 void
35041 ix86_expand_trunc (rtx operand0, rtx operand1)
35042 {
35043 /* C code for SSE variant we expand below.
35044 double xa = fabs (x), x2;
35045 if (!isless (xa, TWO52))
35046 return x;
35047 x2 = (double)(long)x;
35048 if (HONOR_SIGNED_ZEROS (mode))
35049 return copysign (x2, x);
35050 return x2;
35051 */
35052 enum machine_mode mode = GET_MODE (operand0);
35053 rtx xa, xi, TWO52, label, res, mask;
35054
35055 TWO52 = ix86_gen_TWO52 (mode);
35056
35057 /* Temporary for holding the result, initialized to the input
35058 operand to ease control flow. */
35059 res = gen_reg_rtx (mode);
35060 emit_move_insn (res, operand1);
35061
35062 /* xa = abs (operand1) */
35063 xa = ix86_expand_sse_fabs (res, &mask);
35064
35065 /* if (!isless (xa, TWO52)) goto label; */
35066 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35067
35068 /* x = (double)(long)x */
35069 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35070 expand_fix (xi, res, 0);
35071 expand_float (res, xi, 0);
35072
35073 if (HONOR_SIGNED_ZEROS (mode))
35074 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35075
35076 emit_label (label);
35077 LABEL_NUSES (label) = 1;
35078
35079 emit_move_insn (operand0, res);
35080 }
35081
35082 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35083 into OPERAND0, without relying on DImode truncation via cvttsd2siq. */
35084 void
35085 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
35086 {
35087 enum machine_mode mode = GET_MODE (operand0);
35088 rtx xa, mask, TWO52, label, one, res, smask, tmp;
35089
35090 /* C code for SSE variant we expand below.
35091 double xa = fabs (x), xa2, x2;
35092 if (!isless (xa, TWO52))
35093 return x;
35094 xa2 = xa + TWO52 - TWO52;
35095 Compensate:
35096 if (xa2 > xa)
35097 xa2 -= 1.0;
35098 x2 = copysign (xa2, x);
35099 return x2;
35100 */
35101
35102 TWO52 = ix86_gen_TWO52 (mode);
35103
35104 /* Temporary for holding the result, initialized to the input
35105 operand to ease control flow. */
35106 res = gen_reg_rtx (mode);
35107 emit_move_insn (res, operand1);
35108
35109 /* xa = abs (operand1) */
35110 xa = ix86_expand_sse_fabs (res, &smask);
35111
35112 /* if (!isless (xa, TWO52)) goto label; */
35113 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35114
35115 /* res = xa + TWO52 - TWO52; */
35116 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35117 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
35118 emit_move_insn (res, tmp);
35119
35120 /* generate 1.0 */
35121 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35122
35123 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
35124 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
35125 emit_insn (gen_rtx_SET (VOIDmode, mask,
35126 gen_rtx_AND (mode, mask, one)));
35127 tmp = expand_simple_binop (mode, MINUS,
35128 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
35129 emit_move_insn (res, tmp);
35130
35131 /* res = copysign (res, operand1) */
35132 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
35133
35134 emit_label (label);
35135 LABEL_NUSES (label) = 1;
35136
35137 emit_move_insn (operand0, res);
35138 }
35139
35140 /* Expand SSE sequence for computing round from OPERAND1 storing
35141 into OPERAND0. */
35142 void
35143 ix86_expand_round (rtx operand0, rtx operand1)
35144 {
35145 /* C code for the stuff we're doing below:
35146 double xa = fabs (x);
35147 if (!isless (xa, TWO52))
35148 return x;
35149 xa = (double)(long)(xa + nextafter (0.5, 0.0));
35150 return copysign (xa, x);
35151 */
35152 enum machine_mode mode = GET_MODE (operand0);
35153 rtx res, TWO52, xa, label, xi, half, mask;
35154 const struct real_format *fmt;
35155 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35156
35157 /* Temporary for holding the result, initialized to the input
35158 operand to ease control flow. */
35159 res = gen_reg_rtx (mode);
35160 emit_move_insn (res, operand1);
35161
35162 TWO52 = ix86_gen_TWO52 (mode);
35163 xa = ix86_expand_sse_fabs (res, &mask);
35164 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35165
35166 /* load nextafter (0.5, 0.0) */
35167 fmt = REAL_MODE_FORMAT (mode);
35168 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35169 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35170
35171 /* xa = xa + 0.5 */
35172 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
35173 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
35174
35175 /* xa = (double)(int64_t)xa */
35176 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35177 expand_fix (xi, xa, 0);
35178 expand_float (xa, xi, 0);
35179
35180 /* res = copysign (xa, operand1) */
35181 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
35182
35183 emit_label (label);
35184 LABEL_NUSES (label) = 1;
35185
35186 emit_move_insn (operand0, res);
35187 }
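
/* Illustrative note (exposition only): nextafter (0.5, 0.0) is used above
   instead of 0.5 because adding 0.5 can round up across an integer
   boundary.  E.g. for xa equal to the largest double below 0.5:

     (long) (xa + 0.5)                          == 1   wrong
     (long) (xa + nextafter (0.5, 0.0))         == 0   correct

   since xa + 0.5 rounds to 1.0 under round-to-nearest-even, while adding
   the predecessor of 0.5 keeps the sum below 1.0.  */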
35188
35189 /* Expand SSE sequence for computing round
35190 from OP1 storing into OP0 using sse4 round insn. */
35191 void
35192 ix86_expand_round_sse4 (rtx op0, rtx op1)
35193 {
35194 enum machine_mode mode = GET_MODE (op0);
35195 rtx e1, e2, res, half;
35196 const struct real_format *fmt;
35197 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35198 rtx (*gen_copysign) (rtx, rtx, rtx);
35199 rtx (*gen_round) (rtx, rtx, rtx);
35200
35201 switch (mode)
35202 {
35203 case SFmode:
35204 gen_copysign = gen_copysignsf3;
35205 gen_round = gen_sse4_1_roundsf2;
35206 break;
35207 case DFmode:
35208 gen_copysign = gen_copysigndf3;
35209 gen_round = gen_sse4_1_rounddf2;
35210 break;
35211 default:
35212 gcc_unreachable ();
35213 }
35214
35215 /* round (a) = trunc (a + copysign (0.5, a)) */
35216
35217 /* load nextafter (0.5, 0.0) */
35218 fmt = REAL_MODE_FORMAT (mode);
35219 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35220 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35221 half = const_double_from_real_value (pred_half, mode);
35222
35223 /* e1 = copysign (0.5, op1) */
35224 e1 = gen_reg_rtx (mode);
35225 emit_insn (gen_copysign (e1, half, op1));
35226
35227 /* e2 = op1 + e1 */
35228 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
35229
35230 /* res = trunc (e2) */
35231 res = gen_reg_rtx (mode);
35232 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
35233
35234 emit_move_insn (op0, res);
35235 }
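
/* Scalar equivalent of the sequence above (illustrative sketch only;
   the surrounding function name is hypothetical, not a GCC helper):

     double half = nextafter (0.5, 0.0);
     double e1 = copysign (half, x);
     double e2 = x + e1;
     result = trunc (e2);

   Here trunc corresponds to the SSE4.1 round insn with ROUND_TRUNC;
   copysign makes the single sequence correct for negative inputs, and the
   predecessor of 0.5 avoids rounding across an integer boundary for
   inputs just below n + 0.5.  */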
35236 \f
35237
35238 /* Table of valid machine attributes. */
35239 static const struct attribute_spec ix86_attribute_table[] =
35240 {
35241 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
35242 affects_type_identity } */
35243 /* Stdcall attribute says callee is responsible for popping arguments
35244 if they are not variable. */
35245 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35246 true },
35247 /* Fastcall attribute says callee is responsible for popping arguments
35248 if they are not variable. */
35249 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35250 true },
35251 /* Thiscall attribute says callee is responsible for popping arguments
35252 if they are not variable. */
35253 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35254 true },
35255 /* Cdecl attribute says the callee is a normal C declaration */
35256 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35257 true },
35258 /* Regparm attribute specifies how many integer arguments are to be
35259 passed in registers. */
35260 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
35261 true },
35262 /* Sseregparm attribute says we are using x86_64 calling conventions
35263 for FP arguments. */
35264 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35265 true },
35266 /* The transactional memory builtins are implicitly regparm or fastcall
35267 depending on the ABI. Override the generic do-nothing attribute that
35268 these builtins were declared with. */
35269 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
35270 true },
35271 /* force_align_arg_pointer says this function realigns the stack at entry. */
35272 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
35273 false, true, true, ix86_handle_cconv_attribute, false },
35274 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
35275 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
35276 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
35277 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
35278 false },
35279 #endif
35280 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35281 false },
35282 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35283 false },
35284 #ifdef SUBTARGET_ATTRIBUTE_TABLE
35285 SUBTARGET_ATTRIBUTE_TABLE,
35286 #endif
35287 /* ms_abi and sysv_abi calling convention function attributes. */
35288 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35289 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35290 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
35291 false },
35292 { "callee_pop_aggregate_return", 1, 1, false, true, true,
35293 ix86_handle_callee_pop_aggregate_return, true },
35294 /* End element. */
35295 { NULL, 0, 0, false, false, false, NULL, false }
35296 };
35297
35298 /* Implement targetm.vectorize.builtin_vectorization_cost. */
35299 static int
35300 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
35301 tree vectype ATTRIBUTE_UNUSED,
35302 int misalign ATTRIBUTE_UNUSED)
35303 {
35304 switch (type_of_cost)
35305 {
35306 case scalar_stmt:
35307 return ix86_cost->scalar_stmt_cost;
35308
35309 case scalar_load:
35310 return ix86_cost->scalar_load_cost;
35311
35312 case scalar_store:
35313 return ix86_cost->scalar_store_cost;
35314
35315 case vector_stmt:
35316 return ix86_cost->vec_stmt_cost;
35317
35318 case vector_load:
35319 return ix86_cost->vec_align_load_cost;
35320
35321 case vector_store:
35322 return ix86_cost->vec_store_cost;
35323
35324 case vec_to_scalar:
35325 return ix86_cost->vec_to_scalar_cost;
35326
35327 case scalar_to_vec:
35328 return ix86_cost->scalar_to_vec_cost;
35329
35330 case unaligned_load:
35331 case unaligned_store:
35332 return ix86_cost->vec_unalign_load_cost;
35333
35334 case cond_branch_taken:
35335 return ix86_cost->cond_taken_branch_cost;
35336
35337 case cond_branch_not_taken:
35338 return ix86_cost->cond_not_taken_branch_cost;
35339
35340 case vec_perm:
35341 return 1;
35342
35343 default:
35344 gcc_unreachable ();
35345 }
35346 }
35347
35348 /* Construct (set target (vec_select op0 (parallel perm))) and
35349 return true if that's a valid instruction in the active ISA. */
35350
35351 static bool
35352 expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
35353 {
35354 rtx rperm[MAX_VECT_LEN], x;
35355 unsigned i;
35356
35357 for (i = 0; i < nelt; ++i)
35358 rperm[i] = GEN_INT (perm[i]);
35359
35360 x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
35361 x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
35362 x = gen_rtx_SET (VOIDmode, target, x);
35363
35364 x = emit_insn (x);
35365 if (recog_memoized (x) < 0)
35366 {
35367 remove_insn (x);
35368 return false;
35369 }
35370 return true;
35371 }
35372
35373 /* Similar, but generate a vec_concat from op0 and op1 as well. */
35374
35375 static bool
35376 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
35377 const unsigned char *perm, unsigned nelt)
35378 {
35379 enum machine_mode v2mode;
35380 rtx x;
35381
35382 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
35383 x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
35384 return expand_vselect (target, x, perm, nelt);
35385 }
35386
35387 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35388 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
35389
35390 static bool
35391 expand_vec_perm_blend (struct expand_vec_perm_d *d)
35392 {
35393 enum machine_mode vmode = d->vmode;
35394 unsigned i, mask, nelt = d->nelt;
35395 rtx target, op0, op1, x;
35396 rtx rperm[32], vperm;
35397
35398 if (d->op0 == d->op1)
35399 return false;
35400 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
35401 ;
35402 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
35403 ;
35404 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
35405 ;
35406 else
35407 return false;
35408
35409 /* This is a blend, not a permute. Elements must stay in their
35410 respective lanes. */
35411 for (i = 0; i < nelt; ++i)
35412 {
35413 unsigned e = d->perm[i];
35414 if (!(e == i || e == i + nelt))
35415 return false;
35416 }
35417
35418 if (d->testing_p)
35419 return true;
35420
35421 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
35422 decision should be extracted elsewhere, so that we only try that
35423 sequence once all budget==3 options have been tried. */
35424 target = d->target;
35425 op0 = d->op0;
35426 op1 = d->op1;
35427 mask = 0;
35428
35429 switch (vmode)
35430 {
35431 case V4DFmode:
35432 case V8SFmode:
35433 case V2DFmode:
35434 case V4SFmode:
35435 case V8HImode:
35436 case V8SImode:
35437 for (i = 0; i < nelt; ++i)
35438 mask |= (d->perm[i] >= nelt) << i;
35439 break;
35440
35441 case V2DImode:
35442 for (i = 0; i < 2; ++i)
35443 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
35444 vmode = V8HImode;
35445 goto do_subreg;
35446
35447 case V4SImode:
35448 for (i = 0; i < 4; ++i)
35449 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35450 vmode = V8HImode;
35451 goto do_subreg;
35452
35453 case V16QImode:
35454 /* See if bytes move in pairs so we can use pblendw with
35455 an immediate argument, rather than pblendvb with a vector
35456 argument. */
35457 for (i = 0; i < 16; i += 2)
35458 if (d->perm[i] + 1 != d->perm[i + 1])
35459 {
35460 use_pblendvb:
35461 for (i = 0; i < nelt; ++i)
35462 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
35463
35464 finish_pblendvb:
35465 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
35466 vperm = force_reg (vmode, vperm);
35467
35468 if (GET_MODE_SIZE (vmode) == 16)
35469 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
35470 else
35471 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
35472 return true;
35473 }
35474
35475 for (i = 0; i < 8; ++i)
35476 mask |= (d->perm[i * 2] >= 16) << i;
35477 vmode = V8HImode;
35478 /* FALLTHRU */
35479
35480 do_subreg:
35481 target = gen_lowpart (vmode, target);
35482 op0 = gen_lowpart (vmode, op0);
35483 op1 = gen_lowpart (vmode, op1);
35484 break;
35485
35486 case V32QImode:
35487 /* See if bytes move in pairs. If not, vpblendvb must be used. */
35488 for (i = 0; i < 32; i += 2)
35489 if (d->perm[i] + 1 != d->perm[i + 1])
35490 goto use_pblendvb;
35491 /* See if bytes move in quadruplets. If yes, vpblendd
35492 with immediate can be used. */
35493 for (i = 0; i < 32; i += 4)
35494 if (d->perm[i] + 2 != d->perm[i + 2])
35495 break;
35496 if (i < 32)
35497 {
35498 /* See if bytes move the same in both lanes. If yes,
35499 vpblendw with immediate can be used. */
35500 for (i = 0; i < 16; i += 2)
35501 if (d->perm[i] + 16 != d->perm[i + 16])
35502 goto use_pblendvb;
35503
35504 /* Use vpblendw. */
35505 for (i = 0; i < 16; ++i)
35506 mask |= (d->perm[i * 2] >= 32) << i;
35507 vmode = V16HImode;
35508 goto do_subreg;
35509 }
35510
35511 /* Use vpblendd. */
35512 for (i = 0; i < 8; ++i)
35513 mask |= (d->perm[i * 4] >= 32) << i;
35514 vmode = V8SImode;
35515 goto do_subreg;
35516
35517 case V16HImode:
35518 /* See if words move in pairs. If yes, vpblendd can be used. */
35519 for (i = 0; i < 16; i += 2)
35520 if (d->perm[i] + 1 != d->perm[i + 1])
35521 break;
35522 if (i < 16)
35523 {
35524 /* See if words move the same in both lanes. If not,
35525 vpblendvb must be used. */
35526 for (i = 0; i < 8; i++)
35527 if (d->perm[i] + 8 != d->perm[i + 8])
35528 {
35529 /* Use vpblendvb. */
35530 for (i = 0; i < 32; ++i)
35531 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
35532
35533 vmode = V32QImode;
35534 nelt = 32;
35535 target = gen_lowpart (vmode, target);
35536 op0 = gen_lowpart (vmode, op0);
35537 op1 = gen_lowpart (vmode, op1);
35538 goto finish_pblendvb;
35539 }
35540
35541 /* Use vpblendw. */
35542 for (i = 0; i < 16; ++i)
35543 mask |= (d->perm[i] >= 16) << i;
35544 break;
35545 }
35546
35547 /* Use vpblendd. */
35548 for (i = 0; i < 8; ++i)
35549 mask |= (d->perm[i * 2] >= 16) << i;
35550 vmode = V8SImode;
35551 goto do_subreg;
35552
35553 case V4DImode:
35554 /* Use vpblendd. */
35555 for (i = 0; i < 4; ++i)
35556 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35557 vmode = V8SImode;
35558 goto do_subreg;
35559
35560 default:
35561 gcc_unreachable ();
35562 }
35563
35564 /* This matches five different patterns with the different modes. */
35565 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
35566 x = gen_rtx_SET (VOIDmode, target, x);
35567 emit_insn (x);
35568
35569 return true;
35570 }
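
/* Illustrative example (not emitted code): a V4SImode blend with
   perm = { 0, 5, 2, 7 } takes elements 1 and 3 from op1, so the loop
   above builds mask = (3 << 2) | (3 << 6) = 0xcc and the operation is
   retried as a V8HImode pblendw selecting word pairs { 2, 3 } and
   { 6, 7 } from op1.  */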
35571
35572 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35573 in terms of the variable form of vpermilps.
35574
35575 Note that we will have already failed the immediate input vpermilps,
35576 which requires that the high and low part shuffle be identical; the
35577 variable form doesn't require that. */
35578
35579 static bool
35580 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
35581 {
35582 rtx rperm[8], vperm;
35583 unsigned i;
35584
35585 if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
35586 return false;
35587
35588 /* We can only permute within the 128-bit lane. */
35589 for (i = 0; i < 8; ++i)
35590 {
35591 unsigned e = d->perm[i];
35592 if (i < 4 ? e >= 4 : e < 4)
35593 return false;
35594 }
35595
35596 if (d->testing_p)
35597 return true;
35598
35599 for (i = 0; i < 8; ++i)
35600 {
35601 unsigned e = d->perm[i];
35602
35603 /* Within each 128-bit lane, the elements of op0 are numbered
35604 from 0 and the elements of op1 are numbered from 4. */
35605 if (e >= 8 + 4)
35606 e -= 8;
35607 else if (e >= 4)
35608 e -= 4;
35609
35610 rperm[i] = GEN_INT (e);
35611 }
35612
35613 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
35614 vperm = force_reg (V8SImode, vperm);
35615 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
35616
35617 return true;
35618 }
35619
35620 /* Return true if permutation D can be performed as VMODE permutation
35621 instead. */
35622
35623 static bool
35624 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
35625 {
35626 unsigned int i, j, chunk;
35627
35628 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
35629 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
35630 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
35631 return false;
35632
35633 if (GET_MODE_NUNITS (vmode) >= d->nelt)
35634 return true;
35635
35636 chunk = d->nelt / GET_MODE_NUNITS (vmode);
35637 for (i = 0; i < d->nelt; i += chunk)
35638 if (d->perm[i] & (chunk - 1))
35639 return false;
35640 else
35641 for (j = 1; j < chunk; ++j)
35642 if (d->perm[i] + j != d->perm[i + j])
35643 return false;
35644
35645 return true;
35646 }
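
/* For example (illustrative only), the V16QImode permutation
   { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 } moves whole
   4-byte chunks, so valid_perm_using_mode_p (V4SImode, d) holds and the
   shuffle can instead be performed as the V4SImode permutation
   { 1, 0, 3, 2 }.  */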
35647
35648 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35649 in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128. */
35650
35651 static bool
35652 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
35653 {
35654 unsigned i, nelt, eltsz, mask;
35655 unsigned char perm[32];
35656 enum machine_mode vmode = V16QImode;
35657 rtx rperm[32], vperm, target, op0, op1;
35658
35659 nelt = d->nelt;
35660
35661 if (d->op0 != d->op1)
35662 {
35663 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
35664 {
35665 if (TARGET_AVX2
35666 && valid_perm_using_mode_p (V2TImode, d))
35667 {
35668 if (d->testing_p)
35669 return true;
35670
35671 /* Use vperm2i128 insn. The pattern uses
35672 V4DImode instead of V2TImode. */
35673 target = gen_lowpart (V4DImode, d->target);
35674 op0 = gen_lowpart (V4DImode, d->op0);
35675 op1 = gen_lowpart (V4DImode, d->op1);
35676 rperm[0]
35677 = GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
35678 || ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
35679 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
35680 return true;
35681 }
35682 return false;
35683 }
35684 }
35685 else
35686 {
35687 if (GET_MODE_SIZE (d->vmode) == 16)
35688 {
35689 if (!TARGET_SSSE3)
35690 return false;
35691 }
35692 else if (GET_MODE_SIZE (d->vmode) == 32)
35693 {
35694 if (!TARGET_AVX2)
35695 return false;
35696
35697 /* V4DImode should be already handled through
35698 expand_vselect by vpermq instruction. */
35699 gcc_assert (d->vmode != V4DImode);
35700
35701 vmode = V32QImode;
35702 if (d->vmode == V8SImode
35703 || d->vmode == V16HImode
35704 || d->vmode == V32QImode)
35705 {
35706 /* First see if vpermq can be used for
35707 V8SImode/V16HImode/V32QImode. */
35708 if (valid_perm_using_mode_p (V4DImode, d))
35709 {
35710 for (i = 0; i < 4; i++)
35711 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
35712 if (d->testing_p)
35713 return true;
35714 return expand_vselect (gen_lowpart (V4DImode, d->target),
35715 gen_lowpart (V4DImode, d->op0),
35716 perm, 4);
35717 }
35718
35719 /* Next see if vpermd can be used. */
35720 if (valid_perm_using_mode_p (V8SImode, d))
35721 vmode = V8SImode;
35722 }
35723
35724 if (vmode == V32QImode)
35725 {
35726 /* vpshufb only works within 128-bit lanes; it is not
35727 possible to shuffle bytes between the lanes. */
35728 for (i = 0; i < nelt; ++i)
35729 if ((d->perm[i] ^ i) & (nelt / 2))
35730 return false;
35731 }
35732 }
35733 else
35734 return false;
35735 }
35736
35737 if (d->testing_p)
35738 return true;
35739
35740 if (vmode == V8SImode)
35741 for (i = 0; i < 8; ++i)
35742 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
35743 else
35744 {
35745 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35746 if (d->op0 != d->op1)
35747 mask = 2 * nelt - 1;
35748 else if (vmode == V16QImode)
35749 mask = nelt - 1;
35750 else
35751 mask = nelt / 2 - 1;
35752
35753 for (i = 0; i < nelt; ++i)
35754 {
35755 unsigned j, e = d->perm[i] & mask;
35756 for (j = 0; j < eltsz; ++j)
35757 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
35758 }
35759 }
35760
35761 vperm = gen_rtx_CONST_VECTOR (vmode,
35762 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
35763 vperm = force_reg (vmode, vperm);
35764
35765 target = gen_lowpart (vmode, d->target);
35766 op0 = gen_lowpart (vmode, d->op0);
35767 if (d->op0 == d->op1)
35768 {
35769 if (vmode == V16QImode)
35770 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
35771 else if (vmode == V32QImode)
35772 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
35773 else
35774 emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
35775 }
35776 else
35777 {
35778 op1 = gen_lowpart (vmode, d->op1);
35779 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
35780 }
35781
35782 return true;
35783 }
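
/* Illustrative example of the byte-mask construction above (such a simple
   case would in practice already be handled by pshufd via expand_vselect):
   a single-operand V4SImode permutation { 2, 1, 0, 3 } has eltsz == 4 and
   mask == 3, so the V16QImode pshufb control becomes
   { 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15 }.  */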
35784
35785 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
35786 in a single instruction. */
35787
35788 static bool
35789 expand_vec_perm_1 (struct expand_vec_perm_d *d)
35790 {
35791 unsigned i, nelt = d->nelt;
35792 unsigned char perm2[MAX_VECT_LEN];
35793
35794 /* Check plain VEC_SELECT first, because AVX has instructions that could
35795 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
35796 input where SEL+CONCAT may not. */
35797 if (d->op0 == d->op1)
35798 {
35799 int mask = nelt - 1;
35800 bool identity_perm = true;
35801 bool broadcast_perm = true;
35802
35803 for (i = 0; i < nelt; i++)
35804 {
35805 perm2[i] = d->perm[i] & mask;
35806 if (perm2[i] != i)
35807 identity_perm = false;
35808 if (perm2[i])
35809 broadcast_perm = false;
35810 }
35811
35812 if (identity_perm)
35813 {
35814 if (!d->testing_p)
35815 emit_move_insn (d->target, d->op0);
35816 return true;
35817 }
35818 else if (broadcast_perm && TARGET_AVX2)
35819 {
35820 /* Use vpbroadcast{b,w,d}. */
35821 rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
35822 switch (d->vmode)
35823 {
35824 case V32QImode:
35825 op = gen_lowpart (V16QImode, op);
35826 gen = gen_avx2_pbroadcastv32qi;
35827 break;
35828 case V16HImode:
35829 op = gen_lowpart (V8HImode, op);
35830 gen = gen_avx2_pbroadcastv16hi;
35831 break;
35832 case V8SImode:
35833 op = gen_lowpart (V4SImode, op);
35834 gen = gen_avx2_pbroadcastv8si;
35835 break;
35836 case V16QImode:
35837 gen = gen_avx2_pbroadcastv16qi;
35838 break;
35839 case V8HImode:
35840 gen = gen_avx2_pbroadcastv8hi;
35841 break;
35842 /* For other modes, prefer the other shuffles this function creates. */
35843 default: break;
35844 }
35845 if (gen != NULL)
35846 {
35847 if (!d->testing_p)
35848 emit_insn (gen (d->target, op));
35849 return true;
35850 }
35851 }
35852
35853 if (expand_vselect (d->target, d->op0, perm2, nelt))
35854 return true;
35855
35856 /* There are plenty of patterns in sse.md that are written for
35857 SEL+CONCAT and are not replicated for a single op. Perhaps
35858 that should be changed, to avoid the nastiness here. */
35859
35860 /* Recognize interleave style patterns, which means incrementing
35861 every other permutation operand. */
35862 for (i = 0; i < nelt; i += 2)
35863 {
35864 perm2[i] = d->perm[i] & mask;
35865 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
35866 }
35867 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35868 return true;
35869
35870 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
35871 if (nelt >= 4)
35872 {
35873 for (i = 0; i < nelt; i += 4)
35874 {
35875 perm2[i + 0] = d->perm[i + 0] & mask;
35876 perm2[i + 1] = d->perm[i + 1] & mask;
35877 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
35878 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
35879 }
35880
35881 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35882 return true;
35883 }
35884 }
35885
35886 /* Finally, try the fully general two operand permute. */
35887 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
35888 return true;
35889
35890 /* Recognize interleave style patterns with reversed operands. */
35891 if (d->op0 != d->op1)
35892 {
35893 for (i = 0; i < nelt; ++i)
35894 {
35895 unsigned e = d->perm[i];
35896 if (e >= nelt)
35897 e -= nelt;
35898 else
35899 e += nelt;
35900 perm2[i] = e;
35901 }
35902
35903 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
35904 return true;
35905 }
35906
35907 /* Try the SSE4.1 blend variable merge instructions. */
35908 if (expand_vec_perm_blend (d))
35909 return true;
35910
35911 /* Try one of the AVX vpermil variable permutations. */
35912 if (expand_vec_perm_vpermil (d))
35913 return true;
35914
35915 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
35916 vpshufb, vpermd or vpermq variable permutation. */
35917 if (expand_vec_perm_pshufb (d))
35918 return true;
35919
35920 return false;
35921 }
35922
35923 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35924 in terms of a pair of pshuflw + pshufhw instructions. */
35925
35926 static bool
35927 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
35928 {
35929 unsigned char perm2[MAX_VECT_LEN];
35930 unsigned i;
35931 bool ok;
35932
35933 if (d->vmode != V8HImode || d->op0 != d->op1)
35934 return false;
35935
35936 /* The two permutations only operate in 64-bit lanes. */
35937 for (i = 0; i < 4; ++i)
35938 if (d->perm[i] >= 4)
35939 return false;
35940 for (i = 4; i < 8; ++i)
35941 if (d->perm[i] < 4)
35942 return false;
35943
35944 if (d->testing_p)
35945 return true;
35946
35947 /* Emit the pshuflw. */
35948 memcpy (perm2, d->perm, 4);
35949 for (i = 4; i < 8; ++i)
35950 perm2[i] = i;
35951 ok = expand_vselect (d->target, d->op0, perm2, 8);
35952 gcc_assert (ok);
35953
35954 /* Emit the pshufhw. */
35955 memcpy (perm2 + 4, d->perm + 4, 4);
35956 for (i = 0; i < 4; ++i)
35957 perm2[i] = i;
35958 ok = expand_vselect (d->target, d->target, perm2, 8);
35959 gcc_assert (ok);
35960
35961 return true;
35962 }
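
/* Illustrative example: the single-operand V8HImode permutation
   { 2, 0, 3, 1, 5, 7, 4, 6 } keeps the low and high four words within
   their own 64-bit halves, so it is emitted as pshuflw with
   { 2, 0, 3, 1, 4, 5, 6, 7 } followed by pshufhw with
   { 0, 1, 2, 3, 5, 7, 4, 6 }.  */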
35963
35964 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
35965 the permutation using the SSSE3 palignr instruction. This succeeds
35966 when all of the elements in PERM fit within one vector and we merely
35967 need to shift them down so that a single vector permutation has a
35968 chance to succeed. */
35969
35970 static bool
35971 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
35972 {
35973 unsigned i, nelt = d->nelt;
35974 unsigned min, max;
35975 bool in_order, ok;
35976 rtx shift;
35977
35978 /* Even with AVX, palignr only operates on 128-bit vectors. */
35979 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
35980 return false;
35981
35982 min = nelt, max = 0;
35983 for (i = 0; i < nelt; ++i)
35984 {
35985 unsigned e = d->perm[i];
35986 if (e < min)
35987 min = e;
35988 if (e > max)
35989 max = e;
35990 }
35991 if (min == 0 || max - min >= nelt)
35992 return false;
35993
35994 /* Given that we have SSSE3, we know we'll be able to implement the
35995 single operand permutation after the palignr with pshufb. */
35996 if (d->testing_p)
35997 return true;
35998
35999 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
36000 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
36001 gen_lowpart (TImode, d->op1),
36002 gen_lowpart (TImode, d->op0), shift));
36003
36004 d->op0 = d->op1 = d->target;
36005
36006 in_order = true;
36007 for (i = 0; i < nelt; ++i)
36008 {
36009 unsigned e = d->perm[i] - min;
36010 if (e != i)
36011 in_order = false;
36012 d->perm[i] = e;
36013 }
36014
36015 /* Test for the degenerate case where the alignment by itself
36016 produces the desired permutation. */
36017 if (in_order)
36018 return true;
36019
36020 ok = expand_vec_perm_1 (d);
36021 gcc_assert (ok);
36022
36023 return ok;
36024 }
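
/* Illustrative example: for the two-operand V8HImode permutation
   { 3, 4, 5, 6, 7, 8, 9, 10 } we have min == 3 and max == 10, so the
   palignr above shifts the op1:op0 concatenation right by 3 * 16 bits;
   the residual permutation then becomes the identity { 0, ..., 7 } and
   the in_order check returns without a further shuffle.  */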
36025
36026 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36027 a two vector permutation into a single vector permutation by using
36028 an interleave operation to merge the vectors. */
36029
36030 static bool
36031 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
36032 {
36033 struct expand_vec_perm_d dremap, dfinal;
36034 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
36035 unsigned HOST_WIDE_INT contents;
36036 unsigned char remap[2 * MAX_VECT_LEN];
36037 rtx seq;
36038 bool ok, same_halves = false;
36039
36040 if (GET_MODE_SIZE (d->vmode) == 16)
36041 {
36042 if (d->op0 == d->op1)
36043 return false;
36044 }
36045 else if (GET_MODE_SIZE (d->vmode) == 32)
36046 {
36047 if (!TARGET_AVX)
36048 return false;
36049 /* For 32-byte modes allow even d->op0 == d->op1.
36050 The lack of cross-lane shuffling in some instructions
36051 might prevent a single insn shuffle. */
36052 }
36053 else
36054 return false;
36055
36056 /* Examine from whence the elements come. */
36057 contents = 0;
36058 for (i = 0; i < nelt; ++i)
36059 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
36060
36061 memset (remap, 0xff, sizeof (remap));
36062 dremap = *d;
36063
36064 if (GET_MODE_SIZE (d->vmode) == 16)
36065 {
36066 unsigned HOST_WIDE_INT h1, h2, h3, h4;
36067
36068 /* Split the two input vectors into 4 halves. */
36069 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
36070 h2 = h1 << nelt2;
36071 h3 = h2 << nelt2;
36072 h4 = h3 << nelt2;
36073
36074 /* If the elements are all from the low halves, use interleave low; similarly,
36075 use interleave high for the high halves. If the elements are from mis-matched
36076 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
36077 if ((contents & (h1 | h3)) == contents)
36078 {
36079 /* punpckl* */
36080 for (i = 0; i < nelt2; ++i)
36081 {
36082 remap[i] = i * 2;
36083 remap[i + nelt] = i * 2 + 1;
36084 dremap.perm[i * 2] = i;
36085 dremap.perm[i * 2 + 1] = i + nelt;
36086 }
36087 if (!TARGET_SSE2 && d->vmode == V4SImode)
36088 dremap.vmode = V4SFmode;
36089 }
36090 else if ((contents & (h2 | h4)) == contents)
36091 {
36092 /* punpckh* */
36093 for (i = 0; i < nelt2; ++i)
36094 {
36095 remap[i + nelt2] = i * 2;
36096 remap[i + nelt + nelt2] = i * 2 + 1;
36097 dremap.perm[i * 2] = i + nelt2;
36098 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
36099 }
36100 if (!TARGET_SSE2 && d->vmode == V4SImode)
36101 dremap.vmode = V4SFmode;
36102 }
36103 else if ((contents & (h1 | h4)) == contents)
36104 {
36105 /* shufps */
36106 for (i = 0; i < nelt2; ++i)
36107 {
36108 remap[i] = i;
36109 remap[i + nelt + nelt2] = i + nelt2;
36110 dremap.perm[i] = i;
36111 dremap.perm[i + nelt2] = i + nelt + nelt2;
36112 }
36113 if (nelt != 4)
36114 {
36115 /* shufpd */
36116 dremap.vmode = V2DImode;
36117 dremap.nelt = 2;
36118 dremap.perm[0] = 0;
36119 dremap.perm[1] = 3;
36120 }
36121 }
36122 else if ((contents & (h2 | h3)) == contents)
36123 {
36124 /* shufps */
36125 for (i = 0; i < nelt2; ++i)
36126 {
36127 remap[i + nelt2] = i;
36128 remap[i + nelt] = i + nelt2;
36129 dremap.perm[i] = i + nelt2;
36130 dremap.perm[i + nelt2] = i + nelt;
36131 }
36132 if (nelt != 4)
36133 {
36134 /* shufpd */
36135 dremap.vmode = V2DImode;
36136 dremap.nelt = 2;
36137 dremap.perm[0] = 1;
36138 dremap.perm[1] = 2;
36139 }
36140 }
36141 else
36142 return false;
36143 }
36144 else
36145 {
36146 unsigned int nelt4 = nelt / 4, nzcnt = 0;
36147 unsigned HOST_WIDE_INT q[8];
36148 unsigned int nonzero_halves[4];
36149
36150 /* Split the two input vectors into 8 quarters. */
36151 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
36152 for (i = 1; i < 8; ++i)
36153 q[i] = q[0] << (nelt4 * i);
36154 for (i = 0; i < 4; ++i)
36155 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
36156 {
36157 nonzero_halves[nzcnt] = i;
36158 ++nzcnt;
36159 }
36160
36161 if (nzcnt == 1)
36162 {
36163 gcc_assert (d->op0 == d->op1);
36164 nonzero_halves[1] = nonzero_halves[0];
36165 same_halves = true;
36166 }
36167 else if (d->op0 == d->op1)
36168 {
36169 gcc_assert (nonzero_halves[0] == 0);
36170 gcc_assert (nonzero_halves[1] == 1);
36171 }
36172
36173 if (nzcnt <= 2)
36174 {
36175 if (d->perm[0] / nelt2 == nonzero_halves[1])
36176 {
36177 /* Attempt to increase the likelihood that the dfinal
36178 shuffle will be intra-lane. */
36179 char tmph = nonzero_halves[0];
36180 nonzero_halves[0] = nonzero_halves[1];
36181 nonzero_halves[1] = tmph;
36182 }
36183
36184 /* vperm2f128 or vperm2i128. */
36185 for (i = 0; i < nelt2; ++i)
36186 {
36187 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
36188 remap[i + nonzero_halves[0] * nelt2] = i;
36189 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
36190 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
36191 }
36192
36193 if (d->vmode != V8SFmode
36194 && d->vmode != V4DFmode
36195 && d->vmode != V8SImode)
36196 {
36197 dremap.vmode = V8SImode;
36198 dremap.nelt = 8;
36199 for (i = 0; i < 4; ++i)
36200 {
36201 dremap.perm[i] = i + nonzero_halves[0] * 4;
36202 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
36203 }
36204 }
36205 }
36206 else if (d->op0 == d->op1)
36207 return false;
36208 else if (TARGET_AVX2
36209 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
36210 {
36211 /* vpunpckl* */
36212 for (i = 0; i < nelt4; ++i)
36213 {
36214 remap[i] = i * 2;
36215 remap[i + nelt] = i * 2 + 1;
36216 remap[i + nelt2] = i * 2 + nelt2;
36217 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
36218 dremap.perm[i * 2] = i;
36219 dremap.perm[i * 2 + 1] = i + nelt;
36220 dremap.perm[i * 2 + nelt2] = i + nelt2;
36221 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
36222 }
36223 }
36224 else if (TARGET_AVX2
36225 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
36226 {
36227 /* vpunpckh* */
36228 for (i = 0; i < nelt4; ++i)
36229 {
36230 remap[i + nelt4] = i * 2;
36231 remap[i + nelt + nelt4] = i * 2 + 1;
36232 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
36233 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
36234 dremap.perm[i * 2] = i + nelt4;
36235 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
36236 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
36237 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
36238 }
36239 }
36240 else
36241 return false;
36242 }
36243
36244 /* Use the remapping array set up above to move the elements from their
36245 swizzled locations into their final destinations. */
36246 dfinal = *d;
36247 for (i = 0; i < nelt; ++i)
36248 {
36249 unsigned e = remap[d->perm[i]];
36250 gcc_assert (e < nelt);
36251 /* If same_halves is true, both halves of the remapped vector are the
36252 same. Avoid cross-lane accesses if possible. */
36253 if (same_halves && i >= nelt2)
36254 {
36255 gcc_assert (e < nelt2);
36256 dfinal.perm[i] = e + nelt2;
36257 }
36258 else
36259 dfinal.perm[i] = e;
36260 }
36261 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
36262 dfinal.op1 = dfinal.op0;
36263 dremap.target = dfinal.op0;
36264
36265 /* Test if the final remap can be done with a single insn. For V4SFmode or
36266 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
36267 start_sequence ();
36268 ok = expand_vec_perm_1 (&dfinal);
36269 seq = get_insns ();
36270 end_sequence ();
36271
36272 if (!ok)
36273 return false;
36274
36275 if (d->testing_p)
36276 return true;
36277
36278 if (dremap.vmode != dfinal.vmode)
36279 {
36280 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
36281 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
36282 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
36283 }
36284
36285 ok = expand_vec_perm_1 (&dremap);
36286 gcc_assert (ok);
36287
36288 emit_insn (seq);
36289 return true;
36290 }
36291
36292 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36293 a single vector cross-lane permutation into vpermq followed
36294 by any of the single insn permutations. */
36295
36296 static bool
36297 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
36298 {
36299 struct expand_vec_perm_d dremap, dfinal;
36300 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
36301 unsigned contents[2];
36302 bool ok;
36303
36304 if (!(TARGET_AVX2
36305 && (d->vmode == V32QImode || d->vmode == V16HImode)
36306 && d->op0 == d->op1))
36307 return false;
36308
36309 contents[0] = 0;
36310 contents[1] = 0;
36311 for (i = 0; i < nelt2; ++i)
36312 {
36313 contents[0] |= 1u << (d->perm[i] / nelt4);
36314 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
36315 }
36316
36317 for (i = 0; i < 2; ++i)
36318 {
36319 unsigned int cnt = 0;
36320 for (j = 0; j < 4; ++j)
36321 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
36322 return false;
36323 }
36324
36325 if (d->testing_p)
36326 return true;
36327
36328 dremap = *d;
36329 dremap.vmode = V4DImode;
36330 dremap.nelt = 4;
36331 dremap.target = gen_reg_rtx (V4DImode);
36332 dremap.op0 = gen_lowpart (V4DImode, d->op0);
36333 dremap.op1 = dremap.op0;
36334 for (i = 0; i < 2; ++i)
36335 {
36336 unsigned int cnt = 0;
36337 for (j = 0; j < 4; ++j)
36338 if ((contents[i] & (1u << j)) != 0)
36339 dremap.perm[2 * i + cnt++] = j;
36340 for (; cnt < 2; ++cnt)
36341 dremap.perm[2 * i + cnt] = 0;
36342 }
36343
36344 dfinal = *d;
36345 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
36346 dfinal.op1 = dfinal.op0;
36347 for (i = 0, j = 0; i < nelt; ++i)
36348 {
36349 if (i == nelt2)
36350 j = 2;
36351 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
36352 if ((d->perm[i] / nelt4) == dremap.perm[j])
36353 ;
36354 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
36355 dfinal.perm[i] |= nelt4;
36356 else
36357 gcc_unreachable ();
36358 }
36359
36360 ok = expand_vec_perm_1 (&dremap);
36361 gcc_assert (ok);
36362
36363 ok = expand_vec_perm_1 (&dfinal);
36364 gcc_assert (ok);
36365
36366 return true;
36367 }
36368
36369 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36370 a two vector permutation using 2 intra-lane interleave insns
36371 and cross-lane shuffle for 32-byte vectors. */
36372
36373 static bool
36374 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
36375 {
36376 unsigned i, nelt;
36377 rtx (*gen) (rtx, rtx, rtx);
36378
36379 if (d->op0 == d->op1)
36380 return false;
36381 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
36382 ;
36383 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
36384 ;
36385 else
36386 return false;
36387
36388 nelt = d->nelt;
36389 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
36390 return false;
36391 for (i = 0; i < nelt; i += 2)
36392 if (d->perm[i] != d->perm[0] + i / 2
36393 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
36394 return false;
36395
36396 if (d->testing_p)
36397 return true;
36398
36399 switch (d->vmode)
36400 {
36401 case V32QImode:
36402 if (d->perm[0])
36403 gen = gen_vec_interleave_highv32qi;
36404 else
36405 gen = gen_vec_interleave_lowv32qi;
36406 break;
36407 case V16HImode:
36408 if (d->perm[0])
36409 gen = gen_vec_interleave_highv16hi;
36410 else
36411 gen = gen_vec_interleave_lowv16hi;
36412 break;
36413 case V8SImode:
36414 if (d->perm[0])
36415 gen = gen_vec_interleave_highv8si;
36416 else
36417 gen = gen_vec_interleave_lowv8si;
36418 break;
36419 case V4DImode:
36420 if (d->perm[0])
36421 gen = gen_vec_interleave_highv4di;
36422 else
36423 gen = gen_vec_interleave_lowv4di;
36424 break;
36425 case V8SFmode:
36426 if (d->perm[0])
36427 gen = gen_vec_interleave_highv8sf;
36428 else
36429 gen = gen_vec_interleave_lowv8sf;
36430 break;
36431 case V4DFmode:
36432 if (d->perm[0])
36433 gen = gen_vec_interleave_highv4df;
36434 else
36435 gen = gen_vec_interleave_lowv4df;
36436 break;
36437 default:
36438 gcc_unreachable ();
36439 }
36440
36441 emit_insn (gen (d->target, d->op0, d->op1));
36442 return true;
36443 }
36444
36445 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
36446 permutation with two pshufb insns and an ior. We should have already
36447 failed all two instruction sequences. */
36448
36449 static bool
36450 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
36451 {
36452 rtx rperm[2][16], vperm, l, h, op, m128;
36453 unsigned int i, nelt, eltsz;
36454
36455 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36456 return false;
36457 gcc_assert (d->op0 != d->op1);
36458
36459 nelt = d->nelt;
36460 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36461
36462 /* Generate two permutation masks. If the required element is within
36463 the given vector it is shuffled into the proper lane. If the required
36464 element is in the other vector, force a zero into the lane by setting
36465 bit 7 in the permutation mask. */
36466 m128 = GEN_INT (-128);
36467 for (i = 0; i < nelt; ++i)
36468 {
36469 unsigned j, e = d->perm[i];
36470 unsigned which = (e >= nelt);
36471 if (e >= nelt)
36472 e -= nelt;
36473
36474 for (j = 0; j < eltsz; ++j)
36475 {
36476 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
36477 rperm[1-which][i*eltsz + j] = m128;
36478 }
36479 }
36480
36481 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
36482 vperm = force_reg (V16QImode, vperm);
36483
36484 l = gen_reg_rtx (V16QImode);
36485 op = gen_lowpart (V16QImode, d->op0);
36486 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
36487
36488 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
36489 vperm = force_reg (V16QImode, vperm);
36490
36491 h = gen_reg_rtx (V16QImode);
36492 op = gen_lowpart (V16QImode, d->op1);
36493 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
36494
36495 op = gen_lowpart (V16QImode, d->target);
36496 emit_insn (gen_iorv16qi3 (op, l, h));
36497
36498 return true;
36499 }
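
/* Illustrative example: for the two-operand V8HImode extract-even
   permutation { 0, 2, 4, 6, 8, 10, 12, 14 } (eltsz == 2), the first mask
   places op0 bytes { 0, 1, 4, 5, 8, 9, 12, 13 } in the low eight result
   bytes and is -128 elsewhere, the second mask mirrors that for op1 in
   the high eight bytes, and the final ior merges the two zero-filled
   pshufb results.  */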
36500
36501 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
36502 with two vpshufb insns, vpermq and vpor. We should have already failed
36503 all two or three instruction sequences. */
36504
36505 static bool
36506 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
36507 {
36508 rtx rperm[2][32], vperm, l, h, hp, op, m128;
36509 unsigned int i, nelt, eltsz;
36510
36511 if (!TARGET_AVX2
36512 || d->op0 != d->op1
36513 || (d->vmode != V32QImode && d->vmode != V16HImode))
36514 return false;
36515
36516 if (d->testing_p)
36517 return true;
36518
36519 nelt = d->nelt;
36520 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36521
36522 /* Generate two permutation masks. If the required element is within
36523 the same lane, it is shuffled in. If the required element is from the
36524 other lane, force a zero by setting bit 7 in the permutation mask.
36525 The other mask has a non-negative element wherever an element
36526 is requested from the other lane, but its index is also moved to the other lane,
36527 so that the result of vpshufb can have the two V2TImode halves
36528 swapped. */
36529 m128 = GEN_INT (-128);
36530 for (i = 0; i < nelt; ++i)
36531 {
36532 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36533 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36534
36535 for (j = 0; j < eltsz; ++j)
36536 {
36537 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
36538 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
36539 }
36540 }
36541
36542 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36543 vperm = force_reg (V32QImode, vperm);
36544
36545 h = gen_reg_rtx (V32QImode);
36546 op = gen_lowpart (V32QImode, d->op0);
36547 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36548
36549 /* Swap the 128-bit lanes of h into hp. */
36550 hp = gen_reg_rtx (V4DImode);
36551 op = gen_lowpart (V4DImode, h);
36552 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
36553 const1_rtx));
36554
36555 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36556 vperm = force_reg (V32QImode, vperm);
36557
36558 l = gen_reg_rtx (V32QImode);
36559 op = gen_lowpart (V32QImode, d->op0);
36560 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36561
36562 op = gen_lowpart (V32QImode, d->target);
36563 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
36564
36565 return true;
36566 }
36567
36568 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
36569 and extract-odd permutations of two V32QImode or V16HImode operands
36570 with two vpshufb insns, vpor and vpermq. We should have already
36571 failed all two or three instruction sequences. */
36572
36573 static bool
36574 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
36575 {
36576 rtx rperm[2][32], vperm, l, h, ior, op, m128;
36577 unsigned int i, nelt, eltsz;
36578
36579 if (!TARGET_AVX2
36580 || d->op0 == d->op1
36581 || (d->vmode != V32QImode && d->vmode != V16HImode))
36582 return false;
36583
36584 for (i = 0; i < d->nelt; ++i)
36585 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
36586 return false;
36587
36588 if (d->testing_p)
36589 return true;
36590
36591 nelt = d->nelt;
36592 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36593
36594 /* Generate two permutation masks. In the first permutation mask
36595 the first quarter will contain indexes for the first half
36596 of the op0, the second quarter will contain bit 7 set, third quarter
36597 will contain indexes for the second half of the op0 and the
36598 last quarter bit 7 set. In the second permutation mask
36599 the first quarter will contain bit 7 set, the second quarter
36600 indexes for the first half of the op1, the third quarter bit 7 set
36601 and last quarter indexes for the second half of the op1.
36602 I.e. the first mask e.g. for V32QImode extract even will be:
36603 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
36604 (all values masked with 0xf except for -128) and second mask
36605 for extract even will be
36606 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
36607 m128 = GEN_INT (-128);
36608 for (i = 0; i < nelt; ++i)
36609 {
36610 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36611 unsigned which = d->perm[i] >= nelt;
36612 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
36613
36614 for (j = 0; j < eltsz; ++j)
36615 {
36616 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
36617 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
36618 }
36619 }
36620
36621 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36622 vperm = force_reg (V32QImode, vperm);
36623
36624 l = gen_reg_rtx (V32QImode);
36625 op = gen_lowpart (V32QImode, d->op0);
36626 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36627
36628 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36629 vperm = force_reg (V32QImode, vperm);
36630
36631 h = gen_reg_rtx (V32QImode);
36632 op = gen_lowpart (V32QImode, d->op1);
36633 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36634
36635 ior = gen_reg_rtx (V32QImode);
36636 emit_insn (gen_iorv32qi3 (ior, l, h));
36637
36638 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
36639 op = gen_lowpart (V4DImode, d->target);
36640 ior = gen_lowpart (V4DImode, ior);
36641 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
36642 const1_rtx, GEN_INT (3)));
36643
36644 return true;
36645 }
36646
36647 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
36648 and extract-odd permutations. */
36649
36650 static bool
36651 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
36652 {
36653 rtx t1, t2, t3;
36654
36655 switch (d->vmode)
36656 {
36657 case V4DFmode:
36658 t1 = gen_reg_rtx (V4DFmode);
36659 t2 = gen_reg_rtx (V4DFmode);
36660
36661 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36662 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
36663 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
36664
36665 /* Now an unpck[lh]pd will produce the result required. */
36666 if (odd)
36667 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
36668 else
36669 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
36670 emit_insn (t3);
36671 break;
36672
36673 case V8SFmode:
36674 {
36675 int mask = odd ? 0xdd : 0x88;
36676
36677 t1 = gen_reg_rtx (V8SFmode);
36678 t2 = gen_reg_rtx (V8SFmode);
36679 t3 = gen_reg_rtx (V8SFmode);
36680
36681 /* Shuffle within the 128-bit lanes to produce:
36682 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
36683 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
36684 GEN_INT (mask)));
36685
36686 /* Shuffle the lanes around to produce:
36687 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
36688 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
36689 GEN_INT (0x3)));
36690
36691 /* Shuffle within the 128-bit lanes to produce:
36692 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
36693 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
36694
36695 /* Shuffle within the 128-bit lanes to produce:
36696 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
36697 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
36698
36699 /* Shuffle the lanes around to produce:
36700 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
36701 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
36702 GEN_INT (0x20)));
36703 }
36704 break;
36705
36706 case V2DFmode:
36707 case V4SFmode:
36708 case V2DImode:
36709 case V4SImode:
36710 /* These are always directly implementable by expand_vec_perm_1. */
36711 gcc_unreachable ();
36712
36713 case V8HImode:
36714 if (TARGET_SSSE3)
36715 return expand_vec_perm_pshufb2 (d);
36716 else
36717 {
36718 /* We need 2*log2(N)-1 operations to achieve odd/even
36719 with interleave. */
36720 t1 = gen_reg_rtx (V8HImode);
36721 t2 = gen_reg_rtx (V8HImode);
36722 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
36723 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
36724 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
36725 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
36726 if (odd)
36727 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
36728 else
36729 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
36730 emit_insn (t3);
36731 }
36732 break;
36733
36734 case V16QImode:
36735 if (TARGET_SSSE3)
36736 return expand_vec_perm_pshufb2 (d);
36737 else
36738 {
36739 t1 = gen_reg_rtx (V16QImode);
36740 t2 = gen_reg_rtx (V16QImode);
36741 t3 = gen_reg_rtx (V16QImode);
36742 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
36743 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
36744 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
36745 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
36746 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
36747 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
36748 if (odd)
36749 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
36750 else
36751 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
36752 emit_insn (t3);
36753 }
36754 break;
36755
36756 case V16HImode:
36757 case V32QImode:
36758 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
36759
36760 case V4DImode:
36761 if (!TARGET_AVX2)
36762 {
36763 struct expand_vec_perm_d d_copy = *d;
36764 d_copy.vmode = V4DFmode;
36765 d_copy.target = gen_lowpart (V4DFmode, d->target);
36766 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
36767 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
36768 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36769 }
36770
36771 t1 = gen_reg_rtx (V4DImode);
36772 t2 = gen_reg_rtx (V4DImode);
36773
36774 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36775 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
36776 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
36777
36778 /* Now an vpunpck[lh]qdq will produce the result required. */
36779 if (odd)
36780 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
36781 else
36782 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
36783 emit_insn (t3);
36784 break;
36785
36786 case V8SImode:
36787 if (!TARGET_AVX2)
36788 {
36789 struct expand_vec_perm_d d_copy = *d;
36790 d_copy.vmode = V8SFmode;
36791 d_copy.target = gen_lowpart (V8SFmode, d->target);
36792 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
36793 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
36794 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36795 }
36796
36797 t1 = gen_reg_rtx (V8SImode);
36798 t2 = gen_reg_rtx (V8SImode);
36799
36800 /* Shuffle the lanes around into
36801 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
36802 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
36803 gen_lowpart (V4DImode, d->op0),
36804 gen_lowpart (V4DImode, d->op1),
36805 GEN_INT (0x20)));
36806 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
36807 gen_lowpart (V4DImode, d->op0),
36808 gen_lowpart (V4DImode, d->op1),
36809 GEN_INT (0x31)));
36810
36811 /* Swap the 2nd and 3rd position in each lane into
36812 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
36813 emit_insn (gen_avx2_pshufdv3 (t1, t1,
36814 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36815 emit_insn (gen_avx2_pshufdv3 (t2, t2,
36816 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36817
36818 /* Now an vpunpck[lh]qdq will produce
36819 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
36820 if (odd)
36821 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
36822 gen_lowpart (V4DImode, t1),
36823 gen_lowpart (V4DImode, t2));
36824 else
36825 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
36826 gen_lowpart (V4DImode, t1),
36827 gen_lowpart (V4DImode, t2));
36828 emit_insn (t3);
36829 break;
36830
36831 default:
36832 gcc_unreachable ();
36833 }
36834
36835 return true;
36836 }
36837
36838 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
36839 extract-even and extract-odd permutations. */
36840
36841 static bool
36842 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
36843 {
36844 unsigned i, odd, nelt = d->nelt;
36845
36846 odd = d->perm[0];
36847 if (odd != 0 && odd != 1)
36848 return false;
36849
36850 for (i = 1; i < nelt; ++i)
36851 if (d->perm[i] != 2 * i + odd)
36852 return false;
36853
36854 return expand_vec_perm_even_odd_1 (d, odd);
36855 }
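
/* For example (illustrative only), the V8HImode permutation
   { 1, 3, 5, 7, 9, 11, 13, 15 } has odd == 1 and satisfies
   perm[i] == 2 * i + odd for all i, so it is handed to
   expand_vec_perm_even_odd_1 as an extract-odd request.  */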
36856
36857 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
36858 permutations. We assume that expand_vec_perm_1 has already failed. */
36859
36860 static bool
36861 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
36862 {
36863 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
36864 enum machine_mode vmode = d->vmode;
36865 unsigned char perm2[4];
36866 rtx op0 = d->op0;
36867 bool ok;
36868
36869 switch (vmode)
36870 {
36871 case V4DFmode:
36872 case V8SFmode:
36873 /* These are special-cased in sse.md so that we can optionally
36874 use the vbroadcast instruction. They expand to two insns
36875 if the input happens to be in a register. */
36876 gcc_unreachable ();
36877
36878 case V2DFmode:
36879 case V2DImode:
36880 case V4SFmode:
36881 case V4SImode:
36882 /* These are always implementable using standard shuffle patterns. */
36883 gcc_unreachable ();
36884
36885 case V8HImode:
36886 case V16QImode:
36887 /* These can be implemented via interleave. We save one insn by
36888 stopping once we have promoted to V4SImode and then use pshufd. */
36889 do
36890 {
36891 optab otab = vec_interleave_low_optab;
36892
36893 if (elt >= nelt2)
36894 {
36895 otab = vec_interleave_high_optab;
36896 elt -= nelt2;
36897 }
36898 nelt2 /= 2;
36899
36900 op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT);
36901 vmode = get_mode_wider_vector (vmode);
36902 op0 = gen_lowpart (vmode, op0);
36903 }
36904 while (vmode != V4SImode);
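      /* OP0 is now V4SImode and ELT indexes the SImode element that holds
	 the value to broadcast; a single pshufd-style vselect below
	 replicates it across the vector.  */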
36905
36906 memset (perm2, elt, 4);
36907 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
36908 gcc_assert (ok);
36909 return true;
36910
36911 case V32QImode:
36912 case V16HImode:
36913 case V8SImode:
36914 case V4DImode:
36915 /* For AVX2, broadcasts of the first element should already have been
36916 handled by expand_vec_perm_1 via vpbroadcast* or vpermq. */
36917 gcc_assert (!TARGET_AVX2 || d->perm[0]);
36918 return false;
36919
36920 default:
36921 gcc_unreachable ();
36922 }
36923 }
36924
36925 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
36926 broadcast permutations. */
36927
36928 static bool
36929 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
36930 {
36931 unsigned i, elt, nelt = d->nelt;
36932
36933 if (d->op0 != d->op1)
36934 return false;
36935
36936 elt = d->perm[0];
36937 for (i = 1; i < nelt; ++i)
36938 if (d->perm[i] != elt)
36939 return false;
36940
36941 return expand_vec_perm_broadcast_1 (d);
36942 }
36943
36944 /* Implement arbitrary permutation of two V32QImode and V16HImode operands
36945 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
36946 all the shorter instruction sequences. */
36947
36948 static bool
36949 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
36950 {
36951 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
36952 unsigned int i, nelt, eltsz;
36953 bool used[4];
36954
36955 if (!TARGET_AVX2
36956 || d->op0 == d->op1
36957 || (d->vmode != V32QImode && d->vmode != V16HImode))
36958 return false;
36959
36960 if (d->testing_p)
36961 return true;
36962
36963 nelt = d->nelt;
36964 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36965
36966 /* Generate 4 permutation masks. If the required element is within
36967 the same lane, it is shuffled in. If the required element is from
36968 the other lane, force a zero by setting bit 7 in the permutation
36969 mask. In the other mask, an element is non-negative only if it is
36970 requested from the other lane; it is also moved to the other lane,
36971 so that the result of vpshufb can have its two V2TImode halves
36972 swapped. */
36973 m128 = GEN_INT (-128);
36974 for (i = 0; i < 32; ++i)
36975 {
36976 rperm[0][i] = m128;
36977 rperm[1][i] = m128;
36978 rperm[2][i] = m128;
36979 rperm[3][i] = m128;
36980 }
36981 used[0] = false;
36982 used[1] = false;
36983 used[2] = false;
36984 used[3] = false;
36985 for (i = 0; i < nelt; ++i)
36986 {
36987 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36988 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36989 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
36990
36991 for (j = 0; j < eltsz; ++j)
36992 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
36993 used[which] = true;
36994 }
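  /* rperm[0]/rperm[1] cover elements taken from op0 and rperm[2]/rperm[3]
     those taken from op1; the odd-numbered masks collect the elements that
     must cross the 128-bit lane boundary and are applied to the
     lane-swapped copies (h[]) below.  */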
36995
36996 for (i = 0; i < 2; ++i)
36997 {
36998 if (!used[2 * i + 1])
36999 {
37000 h[i] = NULL_RTX;
37001 continue;
37002 }
37003 vperm = gen_rtx_CONST_VECTOR (V32QImode,
37004 gen_rtvec_v (32, rperm[2 * i + 1]));
37005 vperm = force_reg (V32QImode, vperm);
37006 h[i] = gen_reg_rtx (V32QImode);
37007 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37008 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
37009 }
37010
37011 /* Swap the 128-bit lanes of h[X]. */
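  /* The vpermq element order { 2, 3, 0, 1 } exchanges the two 128-bit
     halves of the register.  */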
37012 for (i = 0; i < 2; ++i)
37013 {
37014 if (h[i] == NULL_RTX)
37015 continue;
37016 op = gen_reg_rtx (V4DImode);
37017 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
37018 const2_rtx, GEN_INT (3), const0_rtx,
37019 const1_rtx));
37020 h[i] = gen_lowpart (V32QImode, op);
37021 }
37022
37023 for (i = 0; i < 2; ++i)
37024 {
37025 if (!used[2 * i])
37026 {
37027 l[i] = NULL_RTX;
37028 continue;
37029 }
37030 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
37031 vperm = force_reg (V32QImode, vperm);
37032 l[i] = gen_reg_rtx (V32QImode);
37033 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37034 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
37035 }
37036
37037 for (i = 0; i < 2; ++i)
37038 {
37039 if (h[i] && l[i])
37040 {
37041 op = gen_reg_rtx (V32QImode);
37042 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
37043 l[i] = op;
37044 }
37045 else if (h[i])
37046 l[i] = h[i];
37047 }
37048
37049 gcc_assert (l[0] && l[1]);
37050 op = gen_lowpart (V32QImode, d->target);
37051 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
37052 return true;
37053 }
37054
37055 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
37056 With all of the interface bits taken care of, perform the expansion
37057 in D and return true on success. */
37058
37059 static bool
37060 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
37061 {
37062 /* Try a single instruction expansion. */
37063 if (expand_vec_perm_1 (d))
37064 return true;
37065
37066 /* Try sequences of two instructions. */
37067
37068 if (expand_vec_perm_pshuflw_pshufhw (d))
37069 return true;
37070
37071 if (expand_vec_perm_palignr (d))
37072 return true;
37073
37074 if (expand_vec_perm_interleave2 (d))
37075 return true;
37076
37077 if (expand_vec_perm_broadcast (d))
37078 return true;
37079
37080 if (expand_vec_perm_vpermq_perm_1 (d))
37081 return true;
37082
37083 /* Try sequences of three instructions. */
37084
37085 if (expand_vec_perm_pshufb2 (d))
37086 return true;
37087
37088 if (expand_vec_perm_interleave3 (d))
37089 return true;
37090
37091 /* Try sequences of four instructions. */
37092
37093 if (expand_vec_perm_vpshufb2_vpermq (d))
37094 return true;
37095
37096 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
37097 return true;
37098
37099 /* ??? Look for narrow permutations whose element orderings would
37100 allow the promotion to a wider mode. */
37101
37102 /* ??? Look for sequences of interleave or a wider permute that place
37103 the data into the correct lanes for a half-vector shuffle like
37104 pshuf[lh]w or vpermilps. */
37105
37106 /* ??? Look for sequences of interleave that produce the desired results.
37107 The combinatorics of punpck[lh] get pretty ugly... */
37108
37109 if (expand_vec_perm_even_odd (d))
37110 return true;
37111
37112 /* Even longer sequences. */
37113 if (expand_vec_perm_vpshufb4_vpermq2 (d))
37114 return true;
37115
37116 return false;
37117 }
37118
37119 bool
37120 ix86_expand_vec_perm_const (rtx operands[4])
37121 {
37122 struct expand_vec_perm_d d;
37123 unsigned char perm[MAX_VECT_LEN];
37124 int i, nelt, which;
37125 rtx sel;
37126
37127 d.target = operands[0];
37128 d.op0 = operands[1];
37129 d.op1 = operands[2];
37130 sel = operands[3];
37131
37132 d.vmode = GET_MODE (d.target);
37133 gcc_assert (VECTOR_MODE_P (d.vmode));
37134 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37135 d.testing_p = false;
37136
37137 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
37138 gcc_assert (XVECLEN (sel, 0) == nelt);
37139 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
37140
37141 for (i = which = 0; i < nelt; ++i)
37142 {
37143 rtx e = XVECEXP (sel, 0, i);
37144 int ei = INTVAL (e) & (2 * nelt - 1);
37145
37146 which |= (ei < nelt ? 1 : 2);
37147 d.perm[i] = ei;
37148 perm[i] = ei;
37149 }
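  /* WHICH now has bit 0 set if any element selects from the first input
     vector and bit 1 set if any selects from the second; the switch below
     folds single-input masks onto op0.  */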
37150
37151 switch (which)
37152 {
37153 default:
37154 gcc_unreachable();
37155
37156 case 3:
37157 if (!rtx_equal_p (d.op0, d.op1))
37158 break;
37159
37160 /* The elements of PERM do not suggest that only the first operand
37161 is used, but both operands are identical. Allow easier matching
37162 of the permutation by folding the permutation into the single
37163 input vector. */
37164 for (i = 0; i < nelt; ++i)
37165 if (d.perm[i] >= nelt)
37166 d.perm[i] -= nelt;
37167 /* FALLTHRU */
37168
37169 case 1:
37170 d.op1 = d.op0;
37171 break;
37172
37173 case 2:
37174 for (i = 0; i < nelt; ++i)
37175 d.perm[i] -= nelt;
37176 d.op0 = d.op1;
37177 break;
37178 }
37179
37180 if (ix86_expand_vec_perm_const_1 (&d))
37181 return true;
37182
37183 /* If the mask says both arguments are needed, but they are the same,
37184 the above tried to expand with d.op0 == d.op1. If that didn't work,
37185 retry with d.op0 != d.op1 as that is what testing has been done with. */
37186 if (which == 3 && d.op0 == d.op1)
37187 {
37188 rtx seq;
37189 bool ok;
37190
37191 memcpy (d.perm, perm, sizeof (perm));
37192 d.op1 = gen_reg_rtx (d.vmode);
37193 start_sequence ();
37194 ok = ix86_expand_vec_perm_const_1 (&d);
37195 seq = get_insns ();
37196 end_sequence ();
37197 if (ok)
37198 {
37199 emit_move_insn (d.op1, d.op0);
37200 emit_insn (seq);
37201 return true;
37202 }
37203 }
37204
37205 return false;
37206 }
37207
37208 /* Implement targetm.vectorize.vec_perm_const_ok. */
37209
37210 static bool
37211 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
37212 const unsigned char *sel)
37213 {
37214 struct expand_vec_perm_d d;
37215 unsigned int i, nelt, which;
37216 bool ret, one_vec;
37217
37218 d.vmode = vmode;
37219 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37220 d.testing_p = true;
37221
37222 /* Given sufficient ISA support we can just return true here
37223 for selected vector modes. */
37224 if (GET_MODE_SIZE (d.vmode) == 16)
37225 {
37226 /* All implementable with a single vpperm insn. */
37227 if (TARGET_XOP)
37228 return true;
37229 /* All implementable with 2 pshufb + 1 ior. */
37230 if (TARGET_SSSE3)
37231 return true;
37232 /* All implementable with shufpd or unpck[lh]pd. */
37233 if (d.nelt == 2)
37234 return true;
37235 }
37236
37237 /* Extract the values from the vector CST into the permutation
37238 array in D. */
37239 memcpy (d.perm, sel, nelt);
37240 for (i = which = 0; i < nelt; ++i)
37241 {
37242 unsigned char e = d.perm[i];
37243 gcc_assert (e < 2 * nelt);
37244 which |= (e < nelt ? 1 : 2);
37245 }
37246
37247 /* For all elements from the second vector, fold them to the first. */
37248 if (which == 2)
37249 for (i = 0; i < nelt; ++i)
37250 d.perm[i] -= nelt;
37251
37252 /* Check whether the mask can be applied to the vector type. */
37253 one_vec = (which != 3);
37254
37255 /* Implementable with shufps or pshufd. */
37256 if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
37257 return true;
37258
37259 /* Otherwise we have to go through the motions and see if we can
37260 figure out how to generate the requested permutation. */
37261 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
37262 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
37263 if (!one_vec)
37264 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
37265
37266 start_sequence ();
37267 ret = ix86_expand_vec_perm_const_1 (&d);
37268 end_sequence ();
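  /* The insns generated while testing are collected into a sequence and
     simply discarded; only whether the expansion succeeded matters here.  */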
37269
37270 return ret;
37271 }
37272
37273 void
37274 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
37275 {
37276 struct expand_vec_perm_d d;
37277 unsigned i, nelt;
37278
37279 d.target = targ;
37280 d.op0 = op0;
37281 d.op1 = op1;
37282 d.vmode = GET_MODE (targ);
37283 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37284 d.testing_p = false;
37285
37286 for (i = 0; i < nelt; ++i)
37287 d.perm[i] = i * 2 + odd;
37288
37289 /* We'll either be able to implement the permutation directly... */
37290 if (expand_vec_perm_1 (&d))
37291 return;
37292
37293 /* ... or we use the special-case patterns. */
37294 expand_vec_perm_even_odd_1 (&d, odd);
37295 }
37296
37297 /* Expand an insert into a vector register through pinsr insn.
37298 Return true if successful. */
37299
37300 bool
37301 ix86_expand_pinsr (rtx *operands)
37302 {
37303 rtx dst = operands[0];
37304 rtx src = operands[3];
37305
37306 unsigned int size = INTVAL (operands[1]);
37307 unsigned int pos = INTVAL (operands[2]);
37308
37309 if (GET_CODE (dst) == SUBREG)
37310 {
37311 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
37312 dst = SUBREG_REG (dst);
37313 }
37314
37315 if (GET_CODE (src) == SUBREG)
37316 src = SUBREG_REG (src);
37317
37318 switch (GET_MODE (dst))
37319 {
37320 case V16QImode:
37321 case V8HImode:
37322 case V4SImode:
37323 case V2DImode:
37324 {
37325 enum machine_mode srcmode, dstmode;
37326 rtx (*pinsr)(rtx, rtx, rtx, rtx);
37327
37328 srcmode = mode_for_size (size, MODE_INT, 0);
37329
37330 switch (srcmode)
37331 {
37332 case QImode:
37333 if (!TARGET_SSE4_1)
37334 return false;
37335 dstmode = V16QImode;
37336 pinsr = gen_sse4_1_pinsrb;
37337 break;
37338
37339 case HImode:
37340 if (!TARGET_SSE2)
37341 return false;
37342 dstmode = V8HImode;
37343 pinsr = gen_sse2_pinsrw;
37344 break;
37345
37346 case SImode:
37347 if (!TARGET_SSE4_1)
37348 return false;
37349 dstmode = V4SImode;
37350 pinsr = gen_sse4_1_pinsrd;
37351 break;
37352
37353 case DImode:
37354 gcc_assert (TARGET_64BIT);
37355 if (!TARGET_SSE4_1)
37356 return false;
37357 dstmode = V2DImode;
37358 pinsr = gen_sse4_1_pinsrq;
37359 break;
37360
37361 default:
37362 return false;
37363 }
37364
37365 dst = gen_lowpart (dstmode, dst);
37366 src = gen_lowpart (srcmode, src);
37367
37368 pos /= size;
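	/* POS is now an element index; the pinsr patterns used here take
	   the position as a one-hot selection mask, hence the 1 << pos
	   below.  */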
37369
37370 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
37371 return true;
37372 }
37373
37374 default:
37375 return false;
37376 }
37377 }
37378 \f
37379 /* This function returns the calling abi specific va_list type node.
37380 It returns the FNDECL specific va_list type. */
37381
37382 static tree
37383 ix86_fn_abi_va_list (tree fndecl)
37384 {
37385 if (!TARGET_64BIT)
37386 return va_list_type_node;
37387 gcc_assert (fndecl != NULL_TREE);
37388
37389 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
37390 return ms_va_list_type_node;
37391 else
37392 return sysv_va_list_type_node;
37393 }
37394
37395 /* Returns the canonical va_list type specified by TYPE. If there
37396 is no valid TYPE provided, it returns NULL_TREE. */
37397
37398 static tree
37399 ix86_canonical_va_list_type (tree type)
37400 {
37401 tree wtype, htype;
37402
37403 /* Resolve references and pointers to va_list type. */
37404 if (TREE_CODE (type) == MEM_REF)
37405 type = TREE_TYPE (type);
37406 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
37407 type = TREE_TYPE (type);
37408 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
37409 type = TREE_TYPE (type);
37410
37411 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
37412 {
37413 wtype = va_list_type_node;
37414 gcc_assert (wtype != NULL_TREE);
37415 htype = type;
37416 if (TREE_CODE (wtype) == ARRAY_TYPE)
37417 {
37418 /* If va_list is an array type, the argument may have decayed
37419 to a pointer type, e.g. by being passed to another function.
37420 In that case, unwrap both types so that we can compare the
37421 underlying records. */
37422 if (TREE_CODE (htype) == ARRAY_TYPE
37423 || POINTER_TYPE_P (htype))
37424 {
37425 wtype = TREE_TYPE (wtype);
37426 htype = TREE_TYPE (htype);
37427 }
37428 }
37429 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37430 return va_list_type_node;
37431 wtype = sysv_va_list_type_node;
37432 gcc_assert (wtype != NULL_TREE);
37433 htype = type;
37434 if (TREE_CODE (wtype) == ARRAY_TYPE)
37435 {
37436 /* If va_list is an array type, the argument may have decayed
37437 to a pointer type, e.g. by being passed to another function.
37438 In that case, unwrap both types so that we can compare the
37439 underlying records. */
37440 if (TREE_CODE (htype) == ARRAY_TYPE
37441 || POINTER_TYPE_P (htype))
37442 {
37443 wtype = TREE_TYPE (wtype);
37444 htype = TREE_TYPE (htype);
37445 }
37446 }
37447 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37448 return sysv_va_list_type_node;
37449 wtype = ms_va_list_type_node;
37450 gcc_assert (wtype != NULL_TREE);
37451 htype = type;
37452 if (TREE_CODE (wtype) == ARRAY_TYPE)
37453 {
37454 /* If va_list is an array type, the argument may have decayed
37455 to a pointer type, e.g. by being passed to another function.
37456 In that case, unwrap both types so that we can compare the
37457 underlying records. */
37458 if (TREE_CODE (htype) == ARRAY_TYPE
37459 || POINTER_TYPE_P (htype))
37460 {
37461 wtype = TREE_TYPE (wtype);
37462 htype = TREE_TYPE (htype);
37463 }
37464 }
37465 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37466 return ms_va_list_type_node;
37467 return NULL_TREE;
37468 }
37469 return std_canonical_va_list_type (type);
37470 }
37471
37472 /* Iterate through the target-specific builtin types for va_list.
37473 IDX denotes the iterator, *PTREE is set to the result type of
37474 the va_list builtin, and *PNAME to its internal type.
37475 Returns zero if there is no element for this index, otherwise
37476 IDX should be increased upon the next call.
37477 Note, do not iterate a base builtin's name like __builtin_va_list.
37478 Used from c_common_nodes_and_builtins. */
37479
37480 static int
37481 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
37482 {
37483 if (TARGET_64BIT)
37484 {
37485 switch (idx)
37486 {
37487 default:
37488 break;
37489
37490 case 0:
37491 *ptree = ms_va_list_type_node;
37492 *pname = "__builtin_ms_va_list";
37493 return 1;
37494
37495 case 1:
37496 *ptree = sysv_va_list_type_node;
37497 *pname = "__builtin_sysv_va_list";
37498 return 1;
37499 }
37500 }
37501
37502 return 0;
37503 }
37504
37505 #undef TARGET_SCHED_DISPATCH
37506 #define TARGET_SCHED_DISPATCH has_dispatch
37507 #undef TARGET_SCHED_DISPATCH_DO
37508 #define TARGET_SCHED_DISPATCH_DO do_dispatch
37509 #undef TARGET_SCHED_REASSOCIATION_WIDTH
37510 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
37511
37512 /* The size of the dispatch window is the total number of bytes of
37513 object code allowed in a window. */
37514 #define DISPATCH_WINDOW_SIZE 16
37515
37516 /* Number of dispatch windows considered for scheduling. */
37517 #define MAX_DISPATCH_WINDOWS 3
37518
37519 /* Maximum number of instructions in a window. */
37520 #define MAX_INSN 4
37521
37522 /* Maximum number of immediate operands in a window. */
37523 #define MAX_IMM 4
37524
37525 /* Maximum number of immediate bits allowed in a window. */
37526 #define MAX_IMM_SIZE 128
37527
37528 /* Maximum number of 32 bit immediates allowed in a window. */
37529 #define MAX_IMM_32 4
37530
37531 /* Maximum number of 64 bit immediates allowed in a window. */
37532 #define MAX_IMM_64 2
37533
37534 /* Maximum total of loads or prefetches allowed in a window. */
37535 #define MAX_LOAD 2
37536
37537 /* Maximum total of stores allowed in a window. */
37538 #define MAX_STORE 1
37539
37540 #undef BIG
37541 #define BIG 100
37542
37543
37544 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
37545 enum dispatch_group {
37546 disp_no_group = 0,
37547 disp_load,
37548 disp_store,
37549 disp_load_store,
37550 disp_prefetch,
37551 disp_imm,
37552 disp_imm_32,
37553 disp_imm_64,
37554 disp_branch,
37555 disp_cmp,
37556 disp_jcc,
37557 disp_last
37558 };
37559
37560 /* Number of allowable groups in a dispatch window. It is an array
37561 indexed by dispatch_group enum. 100 is used as a big number,
37562 because the number of these kinds of operations does not have any
37563 effect in a dispatch window, but we need them for other reasons in
37564 the table. */
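/* The entries below correspond, in order, to disp_no_group, disp_load,
   disp_store, disp_load_store, disp_prefetch, disp_imm, disp_imm_32,
   disp_imm_64, disp_branch, disp_cmp and disp_jcc.  */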
37565 static unsigned int num_allowable_groups[disp_last] = {
37566 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
37567 };
37568
37569 char group_name[disp_last + 1][16] = {
37570 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
37571 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
37572 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
37573 };
37574
37575 /* Instruction path. */
37576 enum insn_path {
37577 no_path = 0,
37578 path_single, /* Single micro op. */
37579 path_double, /* Double micro op. */
37580 path_multi, /* Instructions with more than 2 micro ops. */
37581 last_path
37582 };
37583
37584 /* sched_insn_info defines a window to the instructions scheduled in
37585 the basic block. It contains a pointer to the insn_info table and
37586 the instruction scheduled.
37587
37588 Windows are allocated for each basic block and are linked
37589 together. */
37590 typedef struct sched_insn_info_s {
37591 rtx insn;
37592 enum dispatch_group group;
37593 enum insn_path path;
37594 int byte_len;
37595 int imm_bytes;
37596 } sched_insn_info;
37597
37598 /* Linked list of dispatch windows. This is a doubly linked list of
37599 dispatch windows of a basic block. It contains information about
37600 the number of uops in the window and the total number of
37601 instructions and of bytes in the object code for this dispatch
37602 window. */
37603 typedef struct dispatch_windows_s {
37604 int num_insn; /* Number of insn in the window. */
37605 int num_uops; /* Number of uops in the window. */
37606 int window_size; /* Number of bytes in the window. */
37607 int window_num; /* Window number, either 0 or 1. */
37608 int num_imm; /* Number of immediates in an insn. */
37609 int num_imm_32; /* Number of 32 bit immediates in an insn. */
37610 int num_imm_64; /* Number of 64 bit immediates in an insn. */
37611 int imm_size; /* Total immediates in the window. */
37612 int num_loads; /* Total memory loads in the window. */
37613 int num_stores; /* Total memory stores in the window. */
37614 int violation; /* Violation exists in window. */
37615 sched_insn_info *window; /* Pointer to the window. */
37616 struct dispatch_windows_s *next;
37617 struct dispatch_windows_s *prev;
37618 } dispatch_windows;
37619
37620 /* Immediate values used in an insn. */
37621 typedef struct imm_info_s
37622 {
37623 int imm;
37624 int imm32;
37625 int imm64;
37626 } imm_info;
37627
37628 static dispatch_windows *dispatch_window_list;
37629 static dispatch_windows *dispatch_window_list1;
37630
37631 /* Get dispatch group of insn. */
37632
37633 static enum dispatch_group
37634 get_mem_group (rtx insn)
37635 {
37636 enum attr_memory memory;
37637
37638 if (INSN_CODE (insn) < 0)
37639 return disp_no_group;
37640 memory = get_attr_memory (insn);
37641 if (memory == MEMORY_STORE)
37642 return disp_store;
37643
37644 if (memory == MEMORY_LOAD)
37645 return disp_load;
37646
37647 if (memory == MEMORY_BOTH)
37648 return disp_load_store;
37649
37650 return disp_no_group;
37651 }
37652
37653 /* Return true if insn is a compare instruction. */
37654
37655 static bool
37656 is_cmp (rtx insn)
37657 {
37658 enum attr_type type;
37659
37660 type = get_attr_type (insn);
37661 return (type == TYPE_TEST
37662 || type == TYPE_ICMP
37663 || type == TYPE_FCMP
37664 || GET_CODE (PATTERN (insn)) == COMPARE);
37665 }
37666
37667 /* Return true if a dispatch violation was encountered. */
37668
37669 static bool
37670 dispatch_violation (void)
37671 {
37672 if (dispatch_window_list->next)
37673 return dispatch_window_list->next->violation;
37674 return dispatch_window_list->violation;
37675 }
37676
37677 /* Return true if insn is a branch instruction. */
37678
37679 static bool
37680 is_branch (rtx insn)
37681 {
37682 return (CALL_P (insn) || JUMP_P (insn));
37683 }
37684
37685 /* Return true if insn is a prefetch instruction. */
37686
37687 static bool
37688 is_prefetch (rtx insn)
37689 {
37690 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
37691 }
37692
37693 /* This function initializes a dispatch window and the list container holding a
37694 pointer to the window. */
37695
37696 static void
37697 init_window (int window_num)
37698 {
37699 int i;
37700 dispatch_windows *new_list;
37701
37702 if (window_num == 0)
37703 new_list = dispatch_window_list;
37704 else
37705 new_list = dispatch_window_list1;
37706
37707 new_list->num_insn = 0;
37708 new_list->num_uops = 0;
37709 new_list->window_size = 0;
37710 new_list->next = NULL;
37711 new_list->prev = NULL;
37712 new_list->window_num = window_num;
37713 new_list->num_imm = 0;
37714 new_list->num_imm_32 = 0;
37715 new_list->num_imm_64 = 0;
37716 new_list->imm_size = 0;
37717 new_list->num_loads = 0;
37718 new_list->num_stores = 0;
37719 new_list->violation = false;
37720
37721 for (i = 0; i < MAX_INSN; i++)
37722 {
37723 new_list->window[i].insn = NULL;
37724 new_list->window[i].group = disp_no_group;
37725 new_list->window[i].path = no_path;
37726 new_list->window[i].byte_len = 0;
37727 new_list->window[i].imm_bytes = 0;
37728 }
37729 return;
37730 }
37731
37732 /* This function allocates and initializes a dispatch window and the
37733 list container holding a pointer to the window. */
37734
37735 static dispatch_windows *
37736 allocate_window (void)
37737 {
37738 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
37739 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
37740
37741 return new_list;
37742 }
37743
37744 /* This routine initializes the dispatch scheduling information. It
37745 initiates building dispatch scheduler tables and constructs the
37746 first dispatch window. */
37747
37748 static void
37749 init_dispatch_sched (void)
37750 {
37751 /* Allocate a dispatch list and a window. */
37752 dispatch_window_list = allocate_window ();
37753 dispatch_window_list1 = allocate_window ();
37754 init_window (0);
37755 init_window (1);
37756 }
37757
37758 /* This function returns true if a branch is detected. End of a basic block
37759 does not have to be a branch, but here we assume only branches end a
37760 window. */
37761
37762 static bool
37763 is_end_basic_block (enum dispatch_group group)
37764 {
37765 return group == disp_branch;
37766 }
37767
37768 /* This function is called when the end of a window processing is reached. */
37769
37770 static void
37771 process_end_window (void)
37772 {
37773 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
37774 if (dispatch_window_list->next)
37775 {
37776 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
37777 gcc_assert (dispatch_window_list->window_size
37778 + dispatch_window_list1->window_size <= 48);
37779 init_window (1);
37780 }
37781 init_window (0);
37782 }
37783
37784 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
37785 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
37786 for 48 bytes of instructions. Note that these windows are not dispatch
37787 windows of size DISPATCH_WINDOW_SIZE. */
37788
37789 static dispatch_windows *
37790 allocate_next_window (int window_num)
37791 {
37792 if (window_num == 0)
37793 {
37794 if (dispatch_window_list->next)
37795 init_window (1);
37796 init_window (0);
37797 return dispatch_window_list;
37798 }
37799
37800 dispatch_window_list->next = dispatch_window_list1;
37801 dispatch_window_list1->prev = dispatch_window_list;
37802
37803 return dispatch_window_list1;
37804 }
37805
37806 /* Increment the number of immediate operands of an instruction. */
37807
37808 static int
37809 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
37810 {
37811 if (*in_rtx == 0)
37812 return 0;
37813
37814 switch ( GET_CODE (*in_rtx))
37815 {
37816 case CONST:
37817 case SYMBOL_REF:
37818 case CONST_INT:
37819 (imm_values->imm)++;
37820 if (x86_64_immediate_operand (*in_rtx, SImode))
37821 (imm_values->imm32)++;
37822 else
37823 (imm_values->imm64)++;
37824 break;
37825
37826 case CONST_DOUBLE:
37827 (imm_values->imm)++;
37828 (imm_values->imm64)++;
37829 break;
37830
37831 case CODE_LABEL:
37832 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
37833 {
37834 (imm_values->imm)++;
37835 (imm_values->imm32)++;
37836 }
37837 break;
37838
37839 default:
37840 break;
37841 }
37842
37843 return 0;
37844 }
37845
37846 /* Compute number of immediate operands of an instruction. */
37847
37848 static void
37849 find_constant (rtx in_rtx, imm_info *imm_values)
37850 {
37851 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
37852 (rtx_function) find_constant_1, (void *) imm_values);
37853 }
37854
37855 /* Return total size of immediate operands of an instruction along with number
37856 of corresponding immediate operands. It initializes its parameters to zero
37857 before calling FIND_CONSTANT.
37858 INSN is the input instruction. IMM is the total of immediates.
37859 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
37860 bit immediates. */
37861
37862 static int
37863 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
37864 {
37865 imm_info imm_values = {0, 0, 0};
37866
37867 find_constant (insn, &imm_values);
37868 *imm = imm_values.imm;
37869 *imm32 = imm_values.imm32;
37870 *imm64 = imm_values.imm64;
37871 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
37872 }
37873
37874 /* This function returns true if INSN has at least one immediate
37875 operand. */
37876
37877 static bool
37878 has_immediate (rtx insn)
37879 {
37880 int num_imm_operand;
37881 int num_imm32_operand;
37882 int num_imm64_operand;
37883
37884 if (insn)
37885 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37886 &num_imm64_operand);
37887 return false;
37888 }
37889
37890 /* Return single or double path for instructions. */
37891
37892 static enum insn_path
37893 get_insn_path (rtx insn)
37894 {
37895 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
37896
37897 if ((int)path == 0)
37898 return path_single;
37899
37900 if ((int)path == 1)
37901 return path_double;
37902
37903 return path_multi;
37904 }
37905
37906 /* Return insn dispatch group. */
37907
37908 static enum dispatch_group
37909 get_insn_group (rtx insn)
37910 {
37911 enum dispatch_group group = get_mem_group (insn);
37912 if (group)
37913 return group;
37914
37915 if (is_branch (insn))
37916 return disp_branch;
37917
37918 if (is_cmp (insn))
37919 return disp_cmp;
37920
37921 if (has_immediate (insn))
37922 return disp_imm;
37923
37924 if (is_prefetch (insn))
37925 return disp_prefetch;
37926
37927 return disp_no_group;
37928 }
37929
37930 /* Count number of GROUP restricted instructions in a dispatch
37931 window WINDOW_LIST. */
37932
37933 static int
37934 count_num_restricted (rtx insn, dispatch_windows *window_list)
37935 {
37936 enum dispatch_group group = get_insn_group (insn);
37937 int imm_size;
37938 int num_imm_operand;
37939 int num_imm32_operand;
37940 int num_imm64_operand;
37941
37942 if (group == disp_no_group)
37943 return 0;
37944
37945 if (group == disp_imm)
37946 {
37947 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37948 &num_imm64_operand);
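      /* If this insn's immediates would overflow any per-window limit on
	 immediate count or total size, return BIG so the caller treats the
	 insn as not fitting in the window.  */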
37949 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
37950 || num_imm_operand + window_list->num_imm > MAX_IMM
37951 || (num_imm32_operand > 0
37952 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
37953 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
37954 || (num_imm64_operand > 0
37955 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
37956 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
37957 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
37958 && num_imm64_operand > 0
37959 && ((window_list->num_imm_64 > 0
37960 && window_list->num_insn >= 2)
37961 || window_list->num_insn >= 3)))
37962 return BIG;
37963
37964 return 1;
37965 }
37966
37967 if ((group == disp_load_store
37968 && (window_list->num_loads >= MAX_LOAD
37969 || window_list->num_stores >= MAX_STORE))
37970 || ((group == disp_load
37971 || group == disp_prefetch)
37972 && window_list->num_loads >= MAX_LOAD)
37973 || (group == disp_store
37974 && window_list->num_stores >= MAX_STORE))
37975 return BIG;
37976
37977 return 1;
37978 }
37979
37980 /* This function returns true if insn satisfies dispatch rules on the
37981 last window scheduled. */
37982
37983 static bool
37984 fits_dispatch_window (rtx insn)
37985 {
37986 dispatch_windows *window_list = dispatch_window_list;
37987 dispatch_windows *window_list_next = dispatch_window_list->next;
37988 unsigned int num_restrict;
37989 enum dispatch_group group = get_insn_group (insn);
37990 enum insn_path path = get_insn_path (insn);
37991 int sum;
37992
37993 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
37994 instructions should be given the lowest priority in the
37995 scheduling process in the Haifa scheduler to make sure they will be
37996 scheduled in the same dispatch window as the reference to them. */
37997 if (group == disp_jcc || group == disp_cmp)
37998 return false;
37999
38000 /* Check nonrestricted. */
38001 if (group == disp_no_group || group == disp_branch)
38002 return true;
38003
38004 /* Get last dispatch window. */
38005 if (window_list_next)
38006 window_list = window_list_next;
38007
38008 if (window_list->window_num == 1)
38009 {
38010 sum = window_list->prev->window_size + window_list->window_size;
38011
38012 if (sum == 32
38013 || (min_insn_size (insn) + sum) >= 48)
38014 /* Window 1 is full. Go for next window. */
38015 return true;
38016 }
38017
38018 num_restrict = count_num_restricted (insn, window_list);
38019
38020 if (num_restrict > num_allowable_groups[group])
38021 return false;
38022
38023 /* See if it fits in the first window. */
38024 if (window_list->window_num == 0)
38025 {
38026 /* The first window should have only single and double path
38027 uops. */
38028 if (path == path_double
38029 && (window_list->num_uops + 2) > MAX_INSN)
38030 return false;
38031 else if (path != path_single)
38032 return false;
38033 }
38034 return true;
38035 }
38036
38037 /* Add an instruction INSN with NUM_UOPS micro-operations to the
38038 dispatch window WINDOW_LIST. */
38039
38040 static void
38041 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
38042 {
38043 int byte_len = min_insn_size (insn);
38044 int num_insn = window_list->num_insn;
38045 int imm_size;
38046 sched_insn_info *window = window_list->window;
38047 enum dispatch_group group = get_insn_group (insn);
38048 enum insn_path path = get_insn_path (insn);
38049 int num_imm_operand;
38050 int num_imm32_operand;
38051 int num_imm64_operand;
38052
38053 if (!window_list->violation && group != disp_cmp
38054 && !fits_dispatch_window (insn))
38055 window_list->violation = true;
38056
38057 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38058 &num_imm64_operand);
38059
38060 /* Initialize window with new instruction. */
38061 window[num_insn].insn = insn;
38062 window[num_insn].byte_len = byte_len;
38063 window[num_insn].group = group;
38064 window[num_insn].path = path;
38065 window[num_insn].imm_bytes = imm_size;
38066
38067 window_list->window_size += byte_len;
38068 window_list->num_insn = num_insn + 1;
38069 window_list->num_uops = window_list->num_uops + num_uops;
38070 window_list->imm_size += imm_size;
38071 window_list->num_imm += num_imm_operand;
38072 window_list->num_imm_32 += num_imm32_operand;
38073 window_list->num_imm_64 += num_imm64_operand;
38074
38075 if (group == disp_store)
38076 window_list->num_stores += 1;
38077 else if (group == disp_load
38078 || group == disp_prefetch)
38079 window_list->num_loads += 1;
38080 else if (group == disp_load_store)
38081 {
38082 window_list->num_stores += 1;
38083 window_list->num_loads += 1;
38084 }
38085 }
38086
38087 /* Adds a scheduled instruction, INSN, to the current dispatch window.
38088 If the total bytes of instructions or the number of instructions in
38089 the window would exceed the allowable limits, a new window is allocated. */
38090
38091 static void
38092 add_to_dispatch_window (rtx insn)
38093 {
38094 int byte_len;
38095 dispatch_windows *window_list;
38096 dispatch_windows *next_list;
38097 dispatch_windows *window0_list;
38098 enum insn_path path;
38099 enum dispatch_group insn_group;
38100 bool insn_fits;
38101 int num_insn;
38102 int num_uops;
38103 int window_num;
38104 int insn_num_uops;
38105 int sum;
38106
38107 if (INSN_CODE (insn) < 0)
38108 return;
38109
38110 byte_len = min_insn_size (insn);
38111 window_list = dispatch_window_list;
38112 next_list = window_list->next;
38113 path = get_insn_path (insn);
38114 insn_group = get_insn_group (insn);
38115
38116 /* Get the last dispatch window. */
38117 if (next_list)
38118 window_list = dispatch_window_list->next;
38119
38120 if (path == path_single)
38121 insn_num_uops = 1;
38122 else if (path == path_double)
38123 insn_num_uops = 2;
38124 else
38125 insn_num_uops = (int) path;
38126
38127 /* If the current window is full, get a new window.
38128 Window number zero is full if MAX_INSN uops are scheduled in it.
38129 Window number one is full if window zero's bytes plus window
38130 one's bytes is 32, or if the bytes of the new instruction added
38131 to the total make it greater than 48, or if it already has MAX_INSN
38132 instructions in it. */
38133 num_insn = window_list->num_insn;
38134 num_uops = window_list->num_uops;
38135 window_num = window_list->window_num;
38136 insn_fits = fits_dispatch_window (insn);
38137
38138 if (num_insn >= MAX_INSN
38139 || num_uops + insn_num_uops > MAX_INSN
38140 || !(insn_fits))
38141 {
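      /* Toggle between window 0 and window 1.  */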
38142 window_num = ~window_num & 1;
38143 window_list = allocate_next_window (window_num);
38144 }
38145
38146 if (window_num == 0)
38147 {
38148 add_insn_window (insn, window_list, insn_num_uops);
38149 if (window_list->num_insn >= MAX_INSN
38150 && insn_group == disp_branch)
38151 {
38152 process_end_window ();
38153 return;
38154 }
38155 }
38156 else if (window_num == 1)
38157 {
38158 window0_list = window_list->prev;
38159 sum = window0_list->window_size + window_list->window_size;
38160 if (sum == 32
38161 || (byte_len + sum) >= 48)
38162 {
38163 process_end_window ();
38164 window_list = dispatch_window_list;
38165 }
38166
38167 add_insn_window (insn, window_list, insn_num_uops);
38168 }
38169 else
38170 gcc_unreachable ();
38171
38172 if (is_end_basic_block (insn_group))
38173 {
38174 /* End of basic block is reached; do end-basic-block processing. */
38175 process_end_window ();
38176 return;
38177 }
38178 }
38179
38180 /* Print the dispatch window, WINDOW_NUM, to FILE. */
38181
38182 DEBUG_FUNCTION static void
38183 debug_dispatch_window_file (FILE *file, int window_num)
38184 {
38185 dispatch_windows *list;
38186 int i;
38187
38188 if (window_num == 0)
38189 list = dispatch_window_list;
38190 else
38191 list = dispatch_window_list1;
38192
38193 fprintf (file, "Window #%d:\n", list->window_num);
38194 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
38195 list->num_insn, list->num_uops, list->window_size);
38196 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38197 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
38198
38199 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
38200 list->num_stores);
38201 fprintf (file, " insn info:\n");
38202
38203 for (i = 0; i < MAX_INSN; i++)
38204 {
38205 if (!list->window[i].insn)
38206 break;
38207 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
38208 i, group_name[list->window[i].group],
38209 i, (void *)list->window[i].insn,
38210 i, list->window[i].path,
38211 i, list->window[i].byte_len,
38212 i, list->window[i].imm_bytes);
38213 }
38214 }
38215
38216 /* Print to stdout a dispatch window. */
38217
38218 DEBUG_FUNCTION void
38219 debug_dispatch_window (int window_num)
38220 {
38221 debug_dispatch_window_file (stdout, window_num);
38222 }
38223
38224 /* Print INSN dispatch information to FILE. */
38225
38226 DEBUG_FUNCTION static void
38227 debug_insn_dispatch_info_file (FILE *file, rtx insn)
38228 {
38229 int byte_len;
38230 enum insn_path path;
38231 enum dispatch_group group;
38232 int imm_size;
38233 int num_imm_operand;
38234 int num_imm32_operand;
38235 int num_imm64_operand;
38236
38237 if (INSN_CODE (insn) < 0)
38238 return;
38239
38240 byte_len = min_insn_size (insn);
38241 path = get_insn_path (insn);
38242 group = get_insn_group (insn);
38243 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38244 &num_imm64_operand);
38245
38246 fprintf (file, " insn info:\n");
38247 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
38248 group_name[group], path, byte_len);
38249 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38250 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
38251 }
38252
38253 /* Print to STDERR the status of the ready list with respect to
38254 dispatch windows. */
38255
38256 DEBUG_FUNCTION void
38257 debug_ready_dispatch (void)
38258 {
38259 int i;
38260 int no_ready = number_in_ready ();
38261
38262 fprintf (stdout, "Number of ready: %d\n", no_ready);
38263
38264 for (i = 0; i < no_ready; i++)
38265 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
38266 }
38267
38268 /* This routine is the driver of the dispatch scheduler. */
38269
38270 static void
38271 do_dispatch (rtx insn, int mode)
38272 {
38273 if (mode == DISPATCH_INIT)
38274 init_dispatch_sched ();
38275 else if (mode == ADD_TO_DISPATCH_WINDOW)
38276 add_to_dispatch_window (insn);
38277 }
38278
38279 /* Return TRUE if Dispatch Scheduling is supported. */
38280
38281 static bool
38282 has_dispatch (rtx insn, int action)
38283 {
38284 if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
38285 && flag_dispatch_scheduler)
38286 switch (action)
38287 {
38288 default:
38289 return false;
38290
38291 case IS_DISPATCH_ON:
38292 return true;
38293 break;
38294
38295 case IS_CMP:
38296 return is_cmp (insn);
38297
38298 case DISPATCH_VIOLATION:
38299 return dispatch_violation ();
38300
38301 case FITS_DISPATCH_WINDOW:
38302 return fits_dispatch_window (insn);
38303 }
38304
38305 return false;
38306 }
38307
38308 /* Implementation of reassociation_width target hook used by
38309 reassoc phase to identify parallelism level in reassociated
38310 tree. The statement's tree_code is passed in OPC. The argument's type
38311 is passed in MODE.
38312
38313 Currently parallel reassociation is enabled for Atom
38314 processors only and we set reassociation width to be 2
38315 because Atom may issue up to 2 instructions per cycle.
38316
38317 Return value should be fixed if parallel reassociation is
38318 enabled for other processors. */
38319
38320 static int
38321 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
38322 enum machine_mode mode)
38323 {
38324 int res = 1;
38325
38326 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
38327 res = 2;
38328 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
38329 res = 2;
38330
38331 return res;
38332 }
38333
38334 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
38335 place emms and femms instructions. */
38336
38337 static enum machine_mode
38338 ix86_preferred_simd_mode (enum machine_mode mode)
38339 {
38340 if (!TARGET_SSE)
38341 return word_mode;
38342
38343 switch (mode)
38344 {
38345 case QImode:
38346 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
38347 case HImode:
38348 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
38349 case SImode:
38350 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
38351 case DImode:
38352 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
38353
38354 case SFmode:
38355 if (TARGET_AVX && !TARGET_PREFER_AVX128)
38356 return V8SFmode;
38357 else
38358 return V4SFmode;
38359
38360 case DFmode:
38361 if (!TARGET_VECTORIZE_DOUBLE)
38362 return word_mode;
38363 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
38364 return V4DFmode;
38365 else if (TARGET_SSE2)
38366 return V2DFmode;
38367 /* FALLTHRU */
38368
38369 default:
38370 return word_mode;
38371 }
38372 }
38373
38374 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
38375 vectors. */
38376
38377 static unsigned int
38378 ix86_autovectorize_vector_sizes (void)
38379 {
38380 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
38381 }
38382
38383 /* Initialize the GCC target structure. */
38384 #undef TARGET_RETURN_IN_MEMORY
38385 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
38386
38387 #undef TARGET_LEGITIMIZE_ADDRESS
38388 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
38389
38390 #undef TARGET_ATTRIBUTE_TABLE
38391 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
38392 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38393 # undef TARGET_MERGE_DECL_ATTRIBUTES
38394 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
38395 #endif
38396
38397 #undef TARGET_COMP_TYPE_ATTRIBUTES
38398 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
38399
38400 #undef TARGET_INIT_BUILTINS
38401 #define TARGET_INIT_BUILTINS ix86_init_builtins
38402 #undef TARGET_BUILTIN_DECL
38403 #define TARGET_BUILTIN_DECL ix86_builtin_decl
38404 #undef TARGET_EXPAND_BUILTIN
38405 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
38406
38407 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
38408 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
38409 ix86_builtin_vectorized_function
38410
38411 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
38412 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
38413
38414 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
38415 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
38416
38417 #undef TARGET_VECTORIZE_BUILTIN_GATHER
38418 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
38419
38420 #undef TARGET_BUILTIN_RECIPROCAL
38421 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
38422
38423 #undef TARGET_ASM_FUNCTION_EPILOGUE
38424 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
38425
38426 #undef TARGET_ENCODE_SECTION_INFO
38427 #ifndef SUBTARGET_ENCODE_SECTION_INFO
38428 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
38429 #else
38430 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
38431 #endif
38432
38433 #undef TARGET_ASM_OPEN_PAREN
38434 #define TARGET_ASM_OPEN_PAREN ""
38435 #undef TARGET_ASM_CLOSE_PAREN
38436 #define TARGET_ASM_CLOSE_PAREN ""
38437
38438 #undef TARGET_ASM_BYTE_OP
38439 #define TARGET_ASM_BYTE_OP ASM_BYTE
38440
38441 #undef TARGET_ASM_ALIGNED_HI_OP
38442 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
38443 #undef TARGET_ASM_ALIGNED_SI_OP
38444 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
38445 #ifdef ASM_QUAD
38446 #undef TARGET_ASM_ALIGNED_DI_OP
38447 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
38448 #endif
38449
38450 #undef TARGET_PROFILE_BEFORE_PROLOGUE
38451 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
38452
38453 #undef TARGET_ASM_UNALIGNED_HI_OP
38454 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
38455 #undef TARGET_ASM_UNALIGNED_SI_OP
38456 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
38457 #undef TARGET_ASM_UNALIGNED_DI_OP
38458 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
38459
38460 #undef TARGET_PRINT_OPERAND
38461 #define TARGET_PRINT_OPERAND ix86_print_operand
38462 #undef TARGET_PRINT_OPERAND_ADDRESS
38463 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
38464 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
38465 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
38466 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
38467 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
38468
38469 #undef TARGET_SCHED_INIT_GLOBAL
38470 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
38471 #undef TARGET_SCHED_ADJUST_COST
38472 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
38473 #undef TARGET_SCHED_ISSUE_RATE
38474 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
38475 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
38476 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
38477 ia32_multipass_dfa_lookahead
38478
38479 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
38480 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
38481
38482 #ifdef HAVE_AS_TLS
38483 #undef TARGET_HAVE_TLS
38484 #define TARGET_HAVE_TLS true
38485 #endif
38486 #undef TARGET_CANNOT_FORCE_CONST_MEM
38487 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
38488 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
38489 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
38490
38491 #undef TARGET_DELEGITIMIZE_ADDRESS
38492 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
38493
38494 #undef TARGET_MS_BITFIELD_LAYOUT_P
38495 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
38496
38497 #if TARGET_MACHO
38498 #undef TARGET_BINDS_LOCAL_P
38499 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
38500 #endif
38501 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38502 #undef TARGET_BINDS_LOCAL_P
38503 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
38504 #endif
38505
38506 #undef TARGET_ASM_OUTPUT_MI_THUNK
38507 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
38508 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
38509 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
38510
38511 #undef TARGET_ASM_FILE_START
38512 #define TARGET_ASM_FILE_START x86_file_start
38513
38514 #undef TARGET_OPTION_OVERRIDE
38515 #define TARGET_OPTION_OVERRIDE ix86_option_override
38516
38517 #undef TARGET_REGISTER_MOVE_COST
38518 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
38519 #undef TARGET_MEMORY_MOVE_COST
38520 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
38521 #undef TARGET_RTX_COSTS
38522 #define TARGET_RTX_COSTS ix86_rtx_costs
38523 #undef TARGET_ADDRESS_COST
38524 #define TARGET_ADDRESS_COST ix86_address_cost
38525
38526 #undef TARGET_FIXED_CONDITION_CODE_REGS
38527 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
38528 #undef TARGET_CC_MODES_COMPATIBLE
38529 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
38530
38531 #undef TARGET_MACHINE_DEPENDENT_REORG
38532 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
38533
38534 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
38535 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
38536
38537 #undef TARGET_BUILD_BUILTIN_VA_LIST
38538 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
38539
38540 #undef TARGET_ENUM_VA_LIST_P
38541 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
38542
38543 #undef TARGET_FN_ABI_VA_LIST
38544 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
38545
38546 #undef TARGET_CANONICAL_VA_LIST_TYPE
38547 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
38548
38549 #undef TARGET_EXPAND_BUILTIN_VA_START
38550 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
38551
38552 #undef TARGET_MD_ASM_CLOBBERS
38553 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
38554
38555 #undef TARGET_PROMOTE_PROTOTYPES
38556 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
38557 #undef TARGET_STRUCT_VALUE_RTX
38558 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
38559 #undef TARGET_SETUP_INCOMING_VARARGS
38560 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
38561 #undef TARGET_MUST_PASS_IN_STACK
38562 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
38563 #undef TARGET_FUNCTION_ARG_ADVANCE
38564 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
38565 #undef TARGET_FUNCTION_ARG
38566 #define TARGET_FUNCTION_ARG ix86_function_arg
38567 #undef TARGET_FUNCTION_ARG_BOUNDARY
38568 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
38569 #undef TARGET_PASS_BY_REFERENCE
38570 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
38571 #undef TARGET_INTERNAL_ARG_POINTER
38572 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
38573 #undef TARGET_UPDATE_STACK_BOUNDARY
38574 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
38575 #undef TARGET_GET_DRAP_RTX
38576 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
38577 #undef TARGET_STRICT_ARGUMENT_NAMING
38578 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
38579 #undef TARGET_STATIC_CHAIN
38580 #define TARGET_STATIC_CHAIN ix86_static_chain
38581 #undef TARGET_TRAMPOLINE_INIT
38582 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
38583 #undef TARGET_RETURN_POPS_ARGS
38584 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
38585
38586 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
38587 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
38588
38589 #undef TARGET_SCALAR_MODE_SUPPORTED_P
38590 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
38591
38592 #undef TARGET_VECTOR_MODE_SUPPORTED_P
38593 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
38594
38595 #undef TARGET_C_MODE_FOR_SUFFIX
38596 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
38597
38598 #ifdef HAVE_AS_TLS
38599 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
38600 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
38601 #endif
38602
38603 #ifdef SUBTARGET_INSERT_ATTRIBUTES
38604 #undef TARGET_INSERT_ATTRIBUTES
38605 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
38606 #endif
38607
38608 #undef TARGET_MANGLE_TYPE
38609 #define TARGET_MANGLE_TYPE ix86_mangle_type
38610
38611 #ifndef TARGET_MACHO
38612 #undef TARGET_STACK_PROTECT_FAIL
38613 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
38614 #endif
38615
38616 #undef TARGET_FUNCTION_VALUE
38617 #define TARGET_FUNCTION_VALUE ix86_function_value
38618
38619 #undef TARGET_FUNCTION_VALUE_REGNO_P
38620 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
38621
38622 #undef TARGET_PROMOTE_FUNCTION_MODE
38623 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
38624
38625 #undef TARGET_SECONDARY_RELOAD
38626 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
38627
38628 #undef TARGET_CLASS_MAX_NREGS
38629 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
38630
38631 #undef TARGET_PREFERRED_RELOAD_CLASS
38632 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
38633 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
38634 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
38635 #undef TARGET_CLASS_LIKELY_SPILLED_P
38636 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
38637
38638 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
38639 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
38640 ix86_builtin_vectorization_cost
38641 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
38642 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
38643 ix86_vectorize_vec_perm_const_ok
38644 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
38645 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
38646 ix86_preferred_simd_mode
38647 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
38648 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
38649 ix86_autovectorize_vector_sizes
38650
38651 #undef TARGET_SET_CURRENT_FUNCTION
38652 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
38653
38654 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
38655 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
38656
38657 #undef TARGET_OPTION_SAVE
38658 #define TARGET_OPTION_SAVE ix86_function_specific_save
38659
38660 #undef TARGET_OPTION_RESTORE
38661 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
38662
38663 #undef TARGET_OPTION_PRINT
38664 #define TARGET_OPTION_PRINT ix86_function_specific_print
38665
38666 #undef TARGET_CAN_INLINE_P
38667 #define TARGET_CAN_INLINE_P ix86_can_inline_p
38668
38669 #undef TARGET_EXPAND_TO_RTL_HOOK
38670 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
38671
38672 #undef TARGET_LEGITIMATE_ADDRESS_P
38673 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
38674
38675 #undef TARGET_LEGITIMATE_CONSTANT_P
38676 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
38677
38678 #undef TARGET_FRAME_POINTER_REQUIRED
38679 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
38680
38681 #undef TARGET_CAN_ELIMINATE
38682 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
38683
38684 #undef TARGET_EXTRA_LIVE_ON_ENTRY
38685 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
38686
38687 #undef TARGET_ASM_CODE_END
38688 #define TARGET_ASM_CODE_END ix86_code_end
38689
38690 #undef TARGET_CONDITIONAL_REGISTER_USAGE
38691 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
38692
38693 #if TARGET_MACHO
38694 #undef TARGET_INIT_LIBFUNCS
38695 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
38696 #endif
38697
38698 struct gcc_target targetm = TARGET_INITIALIZER;
38699 \f
38700 #include "gt-i386.h"