1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "cgraph.h"
51 #include "gimple.h"
52 #include "dwarf2.h"
53 #include "df.h"
54 #include "tm-constrs.h"
55 #include "params.h"
56 #include "cselib.h"
57 #include "debug.h"
58 #include "sched-int.h"
59 #include "sbitmap.h"
60 #include "fibheap.h"
61 #include "opts.h"
62 #include "diagnostic.h"
63
64 enum upper_128bits_state
65 {
66 unknown = 0,
67 unused,
68 used
69 };
70
71 typedef struct block_info_def
72 {
73 /* State of the upper 128bits of AVX registers at exit. */
74 enum upper_128bits_state state;
75 /* TRUE if state of the upper 128bits of AVX registers is unchanged
76 in this block. */
77 bool unchanged;
78 /* TRUE if block has been processed. */
79 bool processed;
80 /* TRUE if block has been scanned. */
81 bool scanned;
82 /* Previous state of the upper 128bits of AVX registers at entry. */
83 enum upper_128bits_state prev;
84 } *block_info;
85
86 #define BLOCK_INFO(B) ((block_info) (B)->aux)
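
/* The vzeroupper pass below keeps this data in each basic block's AUX field:
   move_or_delete_vzeroupper allocates it with
   alloc_aux_for_blocks (sizeof (struct block_info_def)) and releases it with
   free_aux_for_blocks; all accesses then go through this macro, e.g.
   BLOCK_INFO (bb)->state = used.  */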
87
88 enum call_avx256_state
89 {
90 /* Callee returns 256bit AVX register. */
91 callee_return_avx256 = -1,
92 /* Callee returns and passes 256bit AVX register. */
93 callee_return_pass_avx256,
94 /* Callee passes 256bit AVX register. */
95 callee_pass_avx256,
  96    /* Callee neither returns nor passes a 256bit AVX register, or no
  97       256bit AVX register is used in the function return.  */
98 call_no_avx256,
99 /* vzeroupper intrinsic. */
100 vzeroupper_intrinsic
101 };
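
/* move_or_delete_vzeroupper_2 below recovers one of these values from the
   operand of a vzeroupper UNSPEC_VOLATILE pattern with
   INTVAL (XVECEXP (pat, 0, 0)) and uses it to decide whether the vzeroupper
   can be deleted or has to stay.  */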
102
103 /* Check if a 256bit AVX register is referenced in stores. */
104
105 static void
106 check_avx256_stores (rtx dest, const_rtx set, void *data)
107 {
108 if ((REG_P (dest)
109 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
110 || (GET_CODE (set) == SET
111 && REG_P (SET_SRC (set))
112 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
113 {
114 enum upper_128bits_state *state
115 = (enum upper_128bits_state *) data;
116 *state = used;
117 }
118 }
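
/* A minimal usage sketch, mirroring the call in move_or_delete_vzeroupper_2
   below: the function is handed to note_stores, which invokes it once per
   store destination in an insn pattern.

     enum upper_128bits_state state = unused;
     note_stores (PATTERN (insn), check_avx256_stores, &state);

   Afterwards STATE is USED if a 256bit AVX register was stored to, or was
   the source of a SET.  */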
119
120 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
121 in basic block BB. Delete it if upper 128bit AVX registers are
 122    unused.  If it isn't deleted, move it to just before a jump or call insn.
123
124 STATE is state of the upper 128bits of AVX registers at entry. */
125
126 static void
127 move_or_delete_vzeroupper_2 (basic_block bb,
128 enum upper_128bits_state state)
129 {
130 rtx insn, bb_end;
131 rtx vzeroupper_insn = NULL_RTX;
132 rtx pat;
133 int avx256;
134 bool unchanged;
135
136 if (BLOCK_INFO (bb)->unchanged)
137 {
138 if (dump_file)
139 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
140 bb->index, state);
141
142 BLOCK_INFO (bb)->state = state;
143 return;
144 }
145
146 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
147 {
148 if (dump_file)
149 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
150 bb->index, BLOCK_INFO (bb)->state);
151 return;
152 }
153
154 BLOCK_INFO (bb)->prev = state;
155
156 if (dump_file)
157 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
158 bb->index, state);
159
160 unchanged = true;
161
162 /* BB_END changes when it is deleted. */
163 bb_end = BB_END (bb);
164 insn = BB_HEAD (bb);
165 while (insn != bb_end)
166 {
167 insn = NEXT_INSN (insn);
168
169 if (!NONDEBUG_INSN_P (insn))
170 continue;
171
172 /* Move vzeroupper before jump/call. */
173 if (JUMP_P (insn) || CALL_P (insn))
174 {
175 if (!vzeroupper_insn)
176 continue;
177
178 if (PREV_INSN (insn) != vzeroupper_insn)
179 {
180 if (dump_file)
181 {
182 fprintf (dump_file, "Move vzeroupper after:\n");
183 print_rtl_single (dump_file, PREV_INSN (insn));
184 fprintf (dump_file, "before:\n");
185 print_rtl_single (dump_file, insn);
186 }
187 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
188 PREV_INSN (insn));
189 }
190 vzeroupper_insn = NULL_RTX;
191 continue;
192 }
193
194 pat = PATTERN (insn);
195
196 /* Check insn for vzeroupper intrinsic. */
197 if (GET_CODE (pat) == UNSPEC_VOLATILE
198 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
199 {
200 if (dump_file)
201 {
202 /* Found vzeroupper intrinsic. */
203 fprintf (dump_file, "Found vzeroupper:\n");
204 print_rtl_single (dump_file, insn);
205 }
206 }
207 else
208 {
209 /* Check insn for vzeroall intrinsic. */
210 if (GET_CODE (pat) == PARALLEL
211 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
212 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
213 {
214 state = unused;
215 unchanged = false;
216
217 /* Delete pending vzeroupper insertion. */
218 if (vzeroupper_insn)
219 {
220 delete_insn (vzeroupper_insn);
221 vzeroupper_insn = NULL_RTX;
222 }
223 }
224 else if (state != used)
225 {
226 note_stores (pat, check_avx256_stores, &state);
227 if (state == used)
228 unchanged = false;
229 }
230 continue;
231 }
232
233 /* Process vzeroupper intrinsic. */
234 avx256 = INTVAL (XVECEXP (pat, 0, 0));
235
236 if (state == unused)
237 {
 238 	  /* Since the upper 128bits are cleared, the callee cannot be passed
 239 	     a 256bit AVX register.  We only need to check whether the callee
 240 	     returns a 256bit AVX register.  */
241 if (avx256 == callee_return_avx256)
242 {
243 state = used;
244 unchanged = false;
245 }
246
247 /* Remove unnecessary vzeroupper since upper 128bits are
248 cleared. */
249 if (dump_file)
250 {
251 fprintf (dump_file, "Delete redundant vzeroupper:\n");
252 print_rtl_single (dump_file, insn);
253 }
254 delete_insn (insn);
255 }
256 else
257 {
258 /* Set state to UNUSED if callee doesn't return 256bit AVX
259 register. */
260 if (avx256 != callee_return_pass_avx256)
261 state = unused;
262
263 if (avx256 == callee_return_pass_avx256
264 || avx256 == callee_pass_avx256)
265 {
 266 	      /* Must remove the vzeroupper since the callee is passed a
 267 		 256bit AVX register.  */
268 if (dump_file)
269 {
270 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
271 print_rtl_single (dump_file, insn);
272 }
273 delete_insn (insn);
274 }
275 else
276 {
277 vzeroupper_insn = insn;
278 unchanged = false;
279 }
280 }
281 }
282
283 BLOCK_INFO (bb)->state = state;
284 BLOCK_INFO (bb)->unchanged = unchanged;
285 BLOCK_INFO (bb)->scanned = true;
286
287 if (dump_file)
288 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
289 bb->index, unchanged ? "unchanged" : "changed",
290 state);
291 }
292
293 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
294 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
 295    as UNUSED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
296 state is changed. */
297
298 static bool
299 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
300 {
301 edge e;
302 edge_iterator ei;
303 enum upper_128bits_state state, old_state, new_state;
304 bool seen_unknown;
305
306 if (dump_file)
307 fprintf (dump_file, " Process [bb %i]: status: %d\n",
308 block->index, BLOCK_INFO (block)->processed);
309
310 if (BLOCK_INFO (block)->processed)
311 return false;
312
313 state = unused;
314
315 /* Check all predecessor edges of this block. */
316 seen_unknown = false;
317 FOR_EACH_EDGE (e, ei, block->preds)
318 {
319 if (e->src == block)
320 continue;
321 switch (BLOCK_INFO (e->src)->state)
322 {
323 case unknown:
324 if (!unknown_is_unused)
325 seen_unknown = true;
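	  /* FALLTHRU */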
326 case unused:
327 break;
328 case used:
329 state = used;
330 goto done;
331 }
332 }
333
334 if (seen_unknown)
335 state = unknown;
336
337 done:
338 old_state = BLOCK_INFO (block)->state;
339 move_or_delete_vzeroupper_2 (block, state);
340 new_state = BLOCK_INFO (block)->state;
341
342 if (state != unknown || new_state == used)
343 BLOCK_INFO (block)->processed = true;
344
345 /* Need to rescan if the upper 128bits of AVX registers are changed
346 to USED at exit. */
347 if (new_state != old_state)
348 {
349 if (new_state == used)
350 cfun->machine->rescan_vzeroupper_p = 1;
351 return true;
352 }
353 else
354 return false;
355 }
356
357 /* Go through the instruction stream looking for vzeroupper. Delete
358 it if upper 128bit AVX registers are unused. If it isn't deleted,
 359    move it to just before a jump or call insn.  */
360
361 static void
362 move_or_delete_vzeroupper (void)
363 {
364 edge e;
365 edge_iterator ei;
366 basic_block bb;
367 fibheap_t worklist, pending, fibheap_swap;
368 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
369 int *bb_order;
370 int *rc_order;
371 int i;
372
373 /* Set up block info for each basic block. */
374 alloc_aux_for_blocks (sizeof (struct block_info_def));
375
376 /* Process outgoing edges of entry point. */
377 if (dump_file)
378 fprintf (dump_file, "Process outgoing edges of entry point\n");
379
380 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
381 {
382 move_or_delete_vzeroupper_2 (e->dest,
383 cfun->machine->caller_pass_avx256_p
384 ? used : unused);
385 BLOCK_INFO (e->dest)->processed = true;
386 }
387
388 /* Compute reverse completion order of depth first search of the CFG
389 so that the data-flow runs faster. */
390 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
391 bb_order = XNEWVEC (int, last_basic_block);
392 pre_and_rev_post_order_compute (NULL, rc_order, false);
393 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
394 bb_order[rc_order[i]] = i;
395 free (rc_order);
396
397 worklist = fibheap_new ();
398 pending = fibheap_new ();
399 visited = sbitmap_alloc (last_basic_block);
400 in_worklist = sbitmap_alloc (last_basic_block);
401 in_pending = sbitmap_alloc (last_basic_block);
402 sbitmap_zero (in_worklist);
403
404 /* Don't check outgoing edges of entry point. */
405 sbitmap_ones (in_pending);
406 FOR_EACH_BB (bb)
407 if (BLOCK_INFO (bb)->processed)
408 RESET_BIT (in_pending, bb->index);
409 else
410 {
411 move_or_delete_vzeroupper_1 (bb, false);
412 fibheap_insert (pending, bb_order[bb->index], bb);
413 }
414
415 if (dump_file)
416 fprintf (dump_file, "Check remaining basic blocks\n");
417
418 while (!fibheap_empty (pending))
419 {
420 fibheap_swap = pending;
421 pending = worklist;
422 worklist = fibheap_swap;
423 sbitmap_swap = in_pending;
424 in_pending = in_worklist;
425 in_worklist = sbitmap_swap;
426
427 sbitmap_zero (visited);
428
429 cfun->machine->rescan_vzeroupper_p = 0;
430
431 while (!fibheap_empty (worklist))
432 {
433 bb = (basic_block) fibheap_extract_min (worklist);
434 RESET_BIT (in_worklist, bb->index);
435 gcc_assert (!TEST_BIT (visited, bb->index));
436 if (!TEST_BIT (visited, bb->index))
437 {
438 edge_iterator ei;
439
440 SET_BIT (visited, bb->index);
441
442 if (move_or_delete_vzeroupper_1 (bb, false))
443 FOR_EACH_EDGE (e, ei, bb->succs)
444 {
445 if (e->dest == EXIT_BLOCK_PTR
446 || BLOCK_INFO (e->dest)->processed)
447 continue;
448
449 if (TEST_BIT (visited, e->dest->index))
450 {
451 if (!TEST_BIT (in_pending, e->dest->index))
452 {
453 /* Send E->DEST to next round. */
454 SET_BIT (in_pending, e->dest->index);
455 fibheap_insert (pending,
456 bb_order[e->dest->index],
457 e->dest);
458 }
459 }
460 else if (!TEST_BIT (in_worklist, e->dest->index))
461 {
462 /* Add E->DEST to current round. */
463 SET_BIT (in_worklist, e->dest->index);
464 fibheap_insert (worklist, bb_order[e->dest->index],
465 e->dest);
466 }
467 }
468 }
469 }
470
471 if (!cfun->machine->rescan_vzeroupper_p)
472 break;
473 }
474
475 free (bb_order);
476 fibheap_delete (worklist);
477 fibheap_delete (pending);
478 sbitmap_free (visited);
479 sbitmap_free (in_worklist);
480 sbitmap_free (in_pending);
481
482 if (dump_file)
483 fprintf (dump_file, "Process remaining basic blocks\n");
484
485 FOR_EACH_BB (bb)
486 move_or_delete_vzeroupper_1 (bb, true);
487
488 free_aux_for_blocks ();
489 }
490
491 static rtx legitimize_dllimport_symbol (rtx, bool);
492
493 #ifndef CHECK_STACK_LIMIT
494 #define CHECK_STACK_LIMIT (-1)
495 #endif
496
497 /* Return index of given mode in mult and division cost tables. */
498 #define MODE_INDEX(mode) \
499 ((mode) == QImode ? 0 \
500 : (mode) == HImode ? 1 \
501 : (mode) == SImode ? 2 \
502 : (mode) == DImode ? 3 \
503 : 4)
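
/* For example, MODE_INDEX (SImode) evaluates to 2, selecting the SImode slot
   of the five-element (QImode, HImode, SImode, DImode, other) multiply and
   divide cost arrays in the processor_costs tables below.  */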
504
505 /* Processor costs (relative to an add) */
506 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
507 #define COSTS_N_BYTES(N) ((N) * 2)
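
/* Under that assumption, COSTS_N_BYTES (2) == COSTS_N_INSNS (1): a two-byte
   instruction is costed like a single add when tuning for size.  */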
508
509 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
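
/* Roughly, each stringop_algs initializer below (see struct stringop_algs in
   i386.h) gives the algorithm used when the block size is unknown at compile
   time, followed by {max, alg} pairs applying to blocks of up to MAX bytes,
   with -1 standing for all larger sizes.  The first initializer of each pair
   is used for 32-bit code and the second for 64-bit code, which is why the
   32-bit-only tables fall back to DUMMY_STRINGOP_ALGS for the latter.  */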
510
511 const
512 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
513 COSTS_N_BYTES (2), /* cost of an add instruction */
514 COSTS_N_BYTES (3), /* cost of a lea instruction */
515 COSTS_N_BYTES (2), /* variable shift costs */
516 COSTS_N_BYTES (3), /* constant shift costs */
517 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
518 COSTS_N_BYTES (3), /* HI */
519 COSTS_N_BYTES (3), /* SI */
520 COSTS_N_BYTES (3), /* DI */
521 COSTS_N_BYTES (5)}, /* other */
522 0, /* cost of multiply per each bit set */
523 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
524 COSTS_N_BYTES (3), /* HI */
525 COSTS_N_BYTES (3), /* SI */
526 COSTS_N_BYTES (3), /* DI */
527 COSTS_N_BYTES (5)}, /* other */
528 COSTS_N_BYTES (3), /* cost of movsx */
529 COSTS_N_BYTES (3), /* cost of movzx */
530 0, /* "large" insn */
531 2, /* MOVE_RATIO */
532 2, /* cost for loading QImode using movzbl */
533 {2, 2, 2}, /* cost of loading integer registers
534 in QImode, HImode and SImode.
535 Relative to reg-reg move (2). */
536 {2, 2, 2}, /* cost of storing integer registers */
537 2, /* cost of reg,reg fld/fst */
538 {2, 2, 2}, /* cost of loading fp registers
539 in SFmode, DFmode and XFmode */
540 {2, 2, 2}, /* cost of storing fp registers
541 in SFmode, DFmode and XFmode */
542 3, /* cost of moving MMX register */
543 {3, 3}, /* cost of loading MMX registers
544 in SImode and DImode */
545 {3, 3}, /* cost of storing MMX registers
546 in SImode and DImode */
547 3, /* cost of moving SSE register */
548 {3, 3, 3}, /* cost of loading SSE registers
549 in SImode, DImode and TImode */
550 {3, 3, 3}, /* cost of storing SSE registers
551 in SImode, DImode and TImode */
552 3, /* MMX or SSE register to integer */
553 0, /* size of l1 cache */
554 0, /* size of l2 cache */
555 0, /* size of prefetch block */
556 0, /* number of parallel prefetches */
557 2, /* Branch cost */
558 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
559 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
560 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
561 COSTS_N_BYTES (2), /* cost of FABS instruction. */
562 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
563 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
564 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
565 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 1, /* scalar_stmt_cost. */
569 1, /* scalar load_cost. */
570 1, /* scalar_store_cost. */
571 1, /* vec_stmt_cost. */
572 1, /* vec_to_scalar_cost. */
573 1, /* scalar_to_vec_cost. */
574 1, /* vec_align_load_cost. */
575 1, /* vec_unalign_load_cost. */
576 1, /* vec_store_cost. */
577 1, /* cond_taken_branch_cost. */
578 1, /* cond_not_taken_branch_cost. */
579 };
580
581 /* Processor costs (relative to an add) */
582 static const
583 struct processor_costs i386_cost = { /* 386 specific costs */
584 COSTS_N_INSNS (1), /* cost of an add instruction */
585 COSTS_N_INSNS (1), /* cost of a lea instruction */
586 COSTS_N_INSNS (3), /* variable shift costs */
587 COSTS_N_INSNS (2), /* constant shift costs */
588 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
589 COSTS_N_INSNS (6), /* HI */
590 COSTS_N_INSNS (6), /* SI */
591 COSTS_N_INSNS (6), /* DI */
592 COSTS_N_INSNS (6)}, /* other */
593 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
594 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
595 COSTS_N_INSNS (23), /* HI */
596 COSTS_N_INSNS (23), /* SI */
597 COSTS_N_INSNS (23), /* DI */
598 COSTS_N_INSNS (23)}, /* other */
599 COSTS_N_INSNS (3), /* cost of movsx */
600 COSTS_N_INSNS (2), /* cost of movzx */
601 15, /* "large" insn */
602 3, /* MOVE_RATIO */
603 4, /* cost for loading QImode using movzbl */
604 {2, 4, 2}, /* cost of loading integer registers
605 in QImode, HImode and SImode.
606 Relative to reg-reg move (2). */
607 {2, 4, 2}, /* cost of storing integer registers */
608 2, /* cost of reg,reg fld/fst */
609 {8, 8, 8}, /* cost of loading fp registers
610 in SFmode, DFmode and XFmode */
611 {8, 8, 8}, /* cost of storing fp registers
612 in SFmode, DFmode and XFmode */
613 2, /* cost of moving MMX register */
614 {4, 8}, /* cost of loading MMX registers
615 in SImode and DImode */
616 {4, 8}, /* cost of storing MMX registers
617 in SImode and DImode */
618 2, /* cost of moving SSE register */
619 {4, 8, 16}, /* cost of loading SSE registers
620 in SImode, DImode and TImode */
621 {4, 8, 16}, /* cost of storing SSE registers
622 in SImode, DImode and TImode */
623 3, /* MMX or SSE register to integer */
624 0, /* size of l1 cache */
625 0, /* size of l2 cache */
626 0, /* size of prefetch block */
627 0, /* number of parallel prefetches */
628 1, /* Branch cost */
629 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
630 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
631 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
632 COSTS_N_INSNS (22), /* cost of FABS instruction. */
633 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
634 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
635 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
636 DUMMY_STRINGOP_ALGS},
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 1, /* scalar_stmt_cost. */
640 1, /* scalar load_cost. */
641 1, /* scalar_store_cost. */
642 1, /* vec_stmt_cost. */
643 1, /* vec_to_scalar_cost. */
644 1, /* scalar_to_vec_cost. */
645 1, /* vec_align_load_cost. */
646 2, /* vec_unalign_load_cost. */
647 1, /* vec_store_cost. */
648 3, /* cond_taken_branch_cost. */
649 1, /* cond_not_taken_branch_cost. */
650 };
651
652 static const
653 struct processor_costs i486_cost = { /* 486 specific costs */
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (1), /* cost of a lea instruction */
656 COSTS_N_INSNS (3), /* variable shift costs */
657 COSTS_N_INSNS (2), /* constant shift costs */
658 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (12), /* HI */
660 COSTS_N_INSNS (12), /* SI */
661 COSTS_N_INSNS (12), /* DI */
662 COSTS_N_INSNS (12)}, /* other */
663 1, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (40), /* HI */
666 COSTS_N_INSNS (40), /* SI */
667 COSTS_N_INSNS (40), /* DI */
668 COSTS_N_INSNS (40)}, /* other */
669 COSTS_N_INSNS (3), /* cost of movsx */
670 COSTS_N_INSNS (2), /* cost of movzx */
671 15, /* "large" insn */
672 3, /* MOVE_RATIO */
673 4, /* cost for loading QImode using movzbl */
674 {2, 4, 2}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {2, 4, 2}, /* cost of storing integer registers */
678 2, /* cost of reg,reg fld/fst */
679 {8, 8, 8}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {8, 8, 8}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {4, 8}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {4, 8}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {4, 8, 16}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {4, 8, 16}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 3, /* MMX or SSE register to integer */
694 4, /* size of l1 cache. 486 has 8kB cache
695 shared for code and data, so 4kB is
696 not really precise. */
697 4, /* size of l2 cache */
698 0, /* size of prefetch block */
699 0, /* number of parallel prefetches */
700 1, /* Branch cost */
701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
702 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
703 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
704 COSTS_N_INSNS (3), /* cost of FABS instruction. */
705 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
706 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
707 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
708 DUMMY_STRINGOP_ALGS},
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 1, /* scalar_stmt_cost. */
712 1, /* scalar load_cost. */
713 1, /* scalar_store_cost. */
714 1, /* vec_stmt_cost. */
715 1, /* vec_to_scalar_cost. */
716 1, /* scalar_to_vec_cost. */
717 1, /* vec_align_load_cost. */
718 2, /* vec_unalign_load_cost. */
719 1, /* vec_store_cost. */
720 3, /* cond_taken_branch_cost. */
721 1, /* cond_not_taken_branch_cost. */
722 };
723
724 static const
725 struct processor_costs pentium_cost = {
726 COSTS_N_INSNS (1), /* cost of an add instruction */
727 COSTS_N_INSNS (1), /* cost of a lea instruction */
728 COSTS_N_INSNS (4), /* variable shift costs */
729 COSTS_N_INSNS (1), /* constant shift costs */
730 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
731 COSTS_N_INSNS (11), /* HI */
732 COSTS_N_INSNS (11), /* SI */
733 COSTS_N_INSNS (11), /* DI */
734 COSTS_N_INSNS (11)}, /* other */
735 0, /* cost of multiply per each bit set */
736 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
737 COSTS_N_INSNS (25), /* HI */
738 COSTS_N_INSNS (25), /* SI */
739 COSTS_N_INSNS (25), /* DI */
740 COSTS_N_INSNS (25)}, /* other */
741 COSTS_N_INSNS (3), /* cost of movsx */
742 COSTS_N_INSNS (2), /* cost of movzx */
743 8, /* "large" insn */
744 6, /* MOVE_RATIO */
745 6, /* cost for loading QImode using movzbl */
746 {2, 4, 2}, /* cost of loading integer registers
747 in QImode, HImode and SImode.
748 Relative to reg-reg move (2). */
749 {2, 4, 2}, /* cost of storing integer registers */
750 2, /* cost of reg,reg fld/fst */
751 {2, 2, 6}, /* cost of loading fp registers
752 in SFmode, DFmode and XFmode */
753 {4, 4, 6}, /* cost of storing fp registers
754 in SFmode, DFmode and XFmode */
755 8, /* cost of moving MMX register */
756 {8, 8}, /* cost of loading MMX registers
757 in SImode and DImode */
758 {8, 8}, /* cost of storing MMX registers
759 in SImode and DImode */
760 2, /* cost of moving SSE register */
761 {4, 8, 16}, /* cost of loading SSE registers
762 in SImode, DImode and TImode */
763 {4, 8, 16}, /* cost of storing SSE registers
764 in SImode, DImode and TImode */
765 3, /* MMX or SSE register to integer */
766 8, /* size of l1 cache. */
767 8, /* size of l2 cache */
768 0, /* size of prefetch block */
769 0, /* number of parallel prefetches */
770 2, /* Branch cost */
771 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
772 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
773 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
774 COSTS_N_INSNS (1), /* cost of FABS instruction. */
775 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
776 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
777 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
778 DUMMY_STRINGOP_ALGS},
779 {{libcall, {{-1, rep_prefix_4_byte}}},
780 DUMMY_STRINGOP_ALGS},
781 1, /* scalar_stmt_cost. */
782 1, /* scalar load_cost. */
783 1, /* scalar_store_cost. */
784 1, /* vec_stmt_cost. */
785 1, /* vec_to_scalar_cost. */
786 1, /* scalar_to_vec_cost. */
787 1, /* vec_align_load_cost. */
788 2, /* vec_unalign_load_cost. */
789 1, /* vec_store_cost. */
790 3, /* cond_taken_branch_cost. */
791 1, /* cond_not_taken_branch_cost. */
792 };
793
794 static const
795 struct processor_costs pentiumpro_cost = {
796 COSTS_N_INSNS (1), /* cost of an add instruction */
797 COSTS_N_INSNS (1), /* cost of a lea instruction */
798 COSTS_N_INSNS (1), /* variable shift costs */
799 COSTS_N_INSNS (1), /* constant shift costs */
800 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
801 COSTS_N_INSNS (4), /* HI */
802 COSTS_N_INSNS (4), /* SI */
803 COSTS_N_INSNS (4), /* DI */
804 COSTS_N_INSNS (4)}, /* other */
805 0, /* cost of multiply per each bit set */
806 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
807 COSTS_N_INSNS (17), /* HI */
808 COSTS_N_INSNS (17), /* SI */
809 COSTS_N_INSNS (17), /* DI */
810 COSTS_N_INSNS (17)}, /* other */
811 COSTS_N_INSNS (1), /* cost of movsx */
812 COSTS_N_INSNS (1), /* cost of movzx */
813 8, /* "large" insn */
814 6, /* MOVE_RATIO */
815 2, /* cost for loading QImode using movzbl */
816 {4, 4, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 {2, 2, 2}, /* cost of storing integer registers */
820 2, /* cost of reg,reg fld/fst */
821 {2, 2, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 {4, 4, 6}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 2, /* cost of moving MMX register */
826 {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 2, /* cost of moving SSE register */
831 {2, 2, 8}, /* cost of loading SSE registers
832 in SImode, DImode and TImode */
833 {2, 2, 8}, /* cost of storing SSE registers
834 in SImode, DImode and TImode */
835 3, /* MMX or SSE register to integer */
836 8, /* size of l1 cache. */
837 256, /* size of l2 cache */
838 32, /* size of prefetch block */
839 6, /* number of parallel prefetches */
840 2, /* Branch cost */
841 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
842 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
843 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
844 COSTS_N_INSNS (2), /* cost of FABS instruction. */
845 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
846 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
847 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 848    (we ensure the alignment).  For small blocks an inline loop is still a
 849    noticeable win; for bigger blocks either rep movsl or rep movsb is the
 850    way to go.  Rep movsb apparently has a more expensive startup time in the
 851    CPU, but after 4K the difference is down in the noise.  */
852 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
853 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
854 DUMMY_STRINGOP_ALGS},
855 {{rep_prefix_4_byte, {{1024, unrolled_loop},
856 {8192, rep_prefix_4_byte}, {-1, libcall}}},
857 DUMMY_STRINGOP_ALGS},
858 1, /* scalar_stmt_cost. */
859 1, /* scalar load_cost. */
860 1, /* scalar_store_cost. */
861 1, /* vec_stmt_cost. */
862 1, /* vec_to_scalar_cost. */
863 1, /* scalar_to_vec_cost. */
864 1, /* vec_align_load_cost. */
865 2, /* vec_unalign_load_cost. */
866 1, /* vec_store_cost. */
867 3, /* cond_taken_branch_cost. */
868 1, /* cond_not_taken_branch_cost. */
869 };
870
871 static const
872 struct processor_costs geode_cost = {
873 COSTS_N_INSNS (1), /* cost of an add instruction */
874 COSTS_N_INSNS (1), /* cost of a lea instruction */
875 COSTS_N_INSNS (2), /* variable shift costs */
876 COSTS_N_INSNS (1), /* constant shift costs */
877 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
878 COSTS_N_INSNS (4), /* HI */
879 COSTS_N_INSNS (7), /* SI */
880 COSTS_N_INSNS (7), /* DI */
881 COSTS_N_INSNS (7)}, /* other */
882 0, /* cost of multiply per each bit set */
883 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
884 COSTS_N_INSNS (23), /* HI */
885 COSTS_N_INSNS (39), /* SI */
886 COSTS_N_INSNS (39), /* DI */
887 COSTS_N_INSNS (39)}, /* other */
888 COSTS_N_INSNS (1), /* cost of movsx */
889 COSTS_N_INSNS (1), /* cost of movzx */
890 8, /* "large" insn */
891 4, /* MOVE_RATIO */
892 1, /* cost for loading QImode using movzbl */
893 {1, 1, 1}, /* cost of loading integer registers
894 in QImode, HImode and SImode.
895 Relative to reg-reg move (2). */
896 {1, 1, 1}, /* cost of storing integer registers */
897 1, /* cost of reg,reg fld/fst */
898 {1, 1, 1}, /* cost of loading fp registers
899 in SFmode, DFmode and XFmode */
900 {4, 6, 6}, /* cost of storing fp registers
901 in SFmode, DFmode and XFmode */
902
903 1, /* cost of moving MMX register */
904 {1, 1}, /* cost of loading MMX registers
905 in SImode and DImode */
906 {1, 1}, /* cost of storing MMX registers
907 in SImode and DImode */
908 1, /* cost of moving SSE register */
909 {1, 1, 1}, /* cost of loading SSE registers
910 in SImode, DImode and TImode */
911 {1, 1, 1}, /* cost of storing SSE registers
912 in SImode, DImode and TImode */
913 1, /* MMX or SSE register to integer */
914 64, /* size of l1 cache. */
915 128, /* size of l2 cache. */
916 32, /* size of prefetch block */
917 1, /* number of parallel prefetches */
918 1, /* Branch cost */
919 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
920 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
921 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
922 COSTS_N_INSNS (1), /* cost of FABS instruction. */
923 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
924 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
925 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
926 DUMMY_STRINGOP_ALGS},
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 1, /* scalar_stmt_cost. */
930 1, /* scalar load_cost. */
931 1, /* scalar_store_cost. */
932 1, /* vec_stmt_cost. */
933 1, /* vec_to_scalar_cost. */
934 1, /* scalar_to_vec_cost. */
935 1, /* vec_align_load_cost. */
936 2, /* vec_unalign_load_cost. */
937 1, /* vec_store_cost. */
938 3, /* cond_taken_branch_cost. */
939 1, /* cond_not_taken_branch_cost. */
940 };
941
942 static const
943 struct processor_costs k6_cost = {
944 COSTS_N_INSNS (1), /* cost of an add instruction */
945 COSTS_N_INSNS (2), /* cost of a lea instruction */
946 COSTS_N_INSNS (1), /* variable shift costs */
947 COSTS_N_INSNS (1), /* constant shift costs */
948 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
949 COSTS_N_INSNS (3), /* HI */
950 COSTS_N_INSNS (3), /* SI */
951 COSTS_N_INSNS (3), /* DI */
952 COSTS_N_INSNS (3)}, /* other */
953 0, /* cost of multiply per each bit set */
954 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
955 COSTS_N_INSNS (18), /* HI */
956 COSTS_N_INSNS (18), /* SI */
957 COSTS_N_INSNS (18), /* DI */
958 COSTS_N_INSNS (18)}, /* other */
959 COSTS_N_INSNS (2), /* cost of movsx */
960 COSTS_N_INSNS (2), /* cost of movzx */
961 8, /* "large" insn */
962 4, /* MOVE_RATIO */
963 3, /* cost for loading QImode using movzbl */
964 {4, 5, 4}, /* cost of loading integer registers
965 in QImode, HImode and SImode.
966 Relative to reg-reg move (2). */
967 {2, 3, 2}, /* cost of storing integer registers */
968 4, /* cost of reg,reg fld/fst */
969 {6, 6, 6}, /* cost of loading fp registers
970 in SFmode, DFmode and XFmode */
971 {4, 4, 4}, /* cost of storing fp registers
972 in SFmode, DFmode and XFmode */
973 2, /* cost of moving MMX register */
974 {2, 2}, /* cost of loading MMX registers
975 in SImode and DImode */
976 {2, 2}, /* cost of storing MMX registers
977 in SImode and DImode */
978 2, /* cost of moving SSE register */
979 {2, 2, 8}, /* cost of loading SSE registers
980 in SImode, DImode and TImode */
981 {2, 2, 8}, /* cost of storing SSE registers
982 in SImode, DImode and TImode */
983 6, /* MMX or SSE register to integer */
984 32, /* size of l1 cache. */
985 32, /* size of l2 cache. Some models
986 have integrated l2 cache, but
987 optimizing for k6 is not important
988 enough to worry about that. */
989 32, /* size of prefetch block */
990 1, /* number of parallel prefetches */
991 1, /* Branch cost */
992 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
993 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
994 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
995 COSTS_N_INSNS (2), /* cost of FABS instruction. */
996 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
997 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
998 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
999 DUMMY_STRINGOP_ALGS},
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 1, /* scalar_stmt_cost. */
1003 1, /* scalar load_cost. */
1004 1, /* scalar_store_cost. */
1005 1, /* vec_stmt_cost. */
1006 1, /* vec_to_scalar_cost. */
1007 1, /* scalar_to_vec_cost. */
1008 1, /* vec_align_load_cost. */
1009 2, /* vec_unalign_load_cost. */
1010 1, /* vec_store_cost. */
1011 3, /* cond_taken_branch_cost. */
1012 1, /* cond_not_taken_branch_cost. */
1013 };
1014
1015 static const
1016 struct processor_costs athlon_cost = {
1017 COSTS_N_INSNS (1), /* cost of an add instruction */
1018 COSTS_N_INSNS (2), /* cost of a lea instruction */
1019 COSTS_N_INSNS (1), /* variable shift costs */
1020 COSTS_N_INSNS (1), /* constant shift costs */
1021 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1022 COSTS_N_INSNS (5), /* HI */
1023 COSTS_N_INSNS (5), /* SI */
1024 COSTS_N_INSNS (5), /* DI */
1025 COSTS_N_INSNS (5)}, /* other */
1026 0, /* cost of multiply per each bit set */
1027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1028 COSTS_N_INSNS (26), /* HI */
1029 COSTS_N_INSNS (42), /* SI */
1030 COSTS_N_INSNS (74), /* DI */
1031 COSTS_N_INSNS (74)}, /* other */
1032 COSTS_N_INSNS (1), /* cost of movsx */
1033 COSTS_N_INSNS (1), /* cost of movzx */
1034 8, /* "large" insn */
1035 9, /* MOVE_RATIO */
1036 4, /* cost for loading QImode using movzbl */
1037 {3, 4, 3}, /* cost of loading integer registers
1038 in QImode, HImode and SImode.
1039 Relative to reg-reg move (2). */
1040 {3, 4, 3}, /* cost of storing integer registers */
1041 4, /* cost of reg,reg fld/fst */
1042 {4, 4, 12}, /* cost of loading fp registers
1043 in SFmode, DFmode and XFmode */
1044 {6, 6, 8}, /* cost of storing fp registers
1045 in SFmode, DFmode and XFmode */
1046 2, /* cost of moving MMX register */
1047 {4, 4}, /* cost of loading MMX registers
1048 in SImode and DImode */
1049 {4, 4}, /* cost of storing MMX registers
1050 in SImode and DImode */
1051 2, /* cost of moving SSE register */
1052 {4, 4, 6}, /* cost of loading SSE registers
1053 in SImode, DImode and TImode */
1054 {4, 4, 5}, /* cost of storing SSE registers
1055 in SImode, DImode and TImode */
1056 5, /* MMX or SSE register to integer */
1057 64, /* size of l1 cache. */
1058 256, /* size of l2 cache. */
1059 64, /* size of prefetch block */
1060 6, /* number of parallel prefetches */
1061 5, /* Branch cost */
1062 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1063 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1064 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1065 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1066 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1067 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1068 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1069 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1070 128 bytes for memset. */
1071 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1072 DUMMY_STRINGOP_ALGS},
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 1, /* scalar_stmt_cost. */
1076 1, /* scalar load_cost. */
1077 1, /* scalar_store_cost. */
1078 1, /* vec_stmt_cost. */
1079 1, /* vec_to_scalar_cost. */
1080 1, /* scalar_to_vec_cost. */
1081 1, /* vec_align_load_cost. */
1082 2, /* vec_unalign_load_cost. */
1083 1, /* vec_store_cost. */
1084 3, /* cond_taken_branch_cost. */
1085 1, /* cond_not_taken_branch_cost. */
1086 };
1087
1088 static const
1089 struct processor_costs k8_cost = {
1090 COSTS_N_INSNS (1), /* cost of an add instruction */
1091 COSTS_N_INSNS (2), /* cost of a lea instruction */
1092 COSTS_N_INSNS (1), /* variable shift costs */
1093 COSTS_N_INSNS (1), /* constant shift costs */
1094 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1095 COSTS_N_INSNS (4), /* HI */
1096 COSTS_N_INSNS (3), /* SI */
1097 COSTS_N_INSNS (4), /* DI */
1098 COSTS_N_INSNS (5)}, /* other */
1099 0, /* cost of multiply per each bit set */
1100 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1101 COSTS_N_INSNS (26), /* HI */
1102 COSTS_N_INSNS (42), /* SI */
1103 COSTS_N_INSNS (74), /* DI */
1104 COSTS_N_INSNS (74)}, /* other */
1105 COSTS_N_INSNS (1), /* cost of movsx */
1106 COSTS_N_INSNS (1), /* cost of movzx */
1107 8, /* "large" insn */
1108 9, /* MOVE_RATIO */
1109 4, /* cost for loading QImode using movzbl */
1110 {3, 4, 3}, /* cost of loading integer registers
1111 in QImode, HImode and SImode.
1112 Relative to reg-reg move (2). */
1113 {3, 4, 3}, /* cost of storing integer registers */
1114 4, /* cost of reg,reg fld/fst */
1115 {4, 4, 12}, /* cost of loading fp registers
1116 in SFmode, DFmode and XFmode */
1117 {6, 6, 8}, /* cost of storing fp registers
1118 in SFmode, DFmode and XFmode */
1119 2, /* cost of moving MMX register */
1120 {3, 3}, /* cost of loading MMX registers
1121 in SImode and DImode */
1122 {4, 4}, /* cost of storing MMX registers
1123 in SImode and DImode */
1124 2, /* cost of moving SSE register */
1125 {4, 3, 6}, /* cost of loading SSE registers
1126 in SImode, DImode and TImode */
1127 {4, 4, 5}, /* cost of storing SSE registers
1128 in SImode, DImode and TImode */
1129 5, /* MMX or SSE register to integer */
1130 64, /* size of l1 cache. */
1131 512, /* size of l2 cache. */
1132 64, /* size of prefetch block */
1133 /* New AMD processors never drop prefetches; if they cannot be performed
1134 immediately, they are queued. We set number of simultaneous prefetches
1135 to a large constant to reflect this (it probably is not a good idea not
1136 to limit number of prefetches at all, as their execution also takes some
1137 time). */
1138 100, /* number of parallel prefetches */
1139 3, /* Branch cost */
1140 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1141 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1142 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1143 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1144 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1145 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1146    /* K8 has an optimized REP instruction for medium-sized blocks, but for very
1147       small blocks it is better to use a loop.  For large blocks, libcall can
1148       do nontemporal accesses and beat inline considerably.  */
1149 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1150 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1151 {{libcall, {{8, loop}, {24, unrolled_loop},
1152 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1153 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1154 4, /* scalar_stmt_cost. */
1155 2, /* scalar load_cost. */
1156 2, /* scalar_store_cost. */
1157 5, /* vec_stmt_cost. */
1158 0, /* vec_to_scalar_cost. */
1159 2, /* scalar_to_vec_cost. */
1160 2, /* vec_align_load_cost. */
1161 3, /* vec_unalign_load_cost. */
1162 3, /* vec_store_cost. */
1163 3, /* cond_taken_branch_cost. */
1164 2, /* cond_not_taken_branch_cost. */
1165 };
1166
1167 struct processor_costs amdfam10_cost = {
1168 COSTS_N_INSNS (1), /* cost of an add instruction */
1169 COSTS_N_INSNS (2), /* cost of a lea instruction */
1170 COSTS_N_INSNS (1), /* variable shift costs */
1171 COSTS_N_INSNS (1), /* constant shift costs */
1172 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1173 COSTS_N_INSNS (4), /* HI */
1174 COSTS_N_INSNS (3), /* SI */
1175 COSTS_N_INSNS (4), /* DI */
1176 COSTS_N_INSNS (5)}, /* other */
1177 0, /* cost of multiply per each bit set */
1178 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1179 COSTS_N_INSNS (35), /* HI */
1180 COSTS_N_INSNS (51), /* SI */
1181 COSTS_N_INSNS (83), /* DI */
1182 COSTS_N_INSNS (83)}, /* other */
1183 COSTS_N_INSNS (1), /* cost of movsx */
1184 COSTS_N_INSNS (1), /* cost of movzx */
1185 8, /* "large" insn */
1186 9, /* MOVE_RATIO */
1187 4, /* cost for loading QImode using movzbl */
1188 {3, 4, 3}, /* cost of loading integer registers
1189 in QImode, HImode and SImode.
1190 Relative to reg-reg move (2). */
1191 {3, 4, 3}, /* cost of storing integer registers */
1192 4, /* cost of reg,reg fld/fst */
1193 {4, 4, 12}, /* cost of loading fp registers
1194 in SFmode, DFmode and XFmode */
1195 {6, 6, 8}, /* cost of storing fp registers
1196 in SFmode, DFmode and XFmode */
1197 2, /* cost of moving MMX register */
1198 {3, 3}, /* cost of loading MMX registers
1199 in SImode and DImode */
1200 {4, 4}, /* cost of storing MMX registers
1201 in SImode and DImode */
1202 2, /* cost of moving SSE register */
1203 {4, 4, 3}, /* cost of loading SSE registers
1204 in SImode, DImode and TImode */
1205 {4, 4, 5}, /* cost of storing SSE registers
1206 in SImode, DImode and TImode */
1207 3, /* MMX or SSE register to integer */
1208 /* On K8:
1209 MOVD reg64, xmmreg Double FSTORE 4
1210 MOVD reg32, xmmreg Double FSTORE 4
1211 On AMDFAM10:
1212 MOVD reg64, xmmreg Double FADD 3
1213 1/1 1/1
1214 MOVD reg32, xmmreg Double FADD 3
1215 1/1 1/1 */
1216 64, /* size of l1 cache. */
1217 512, /* size of l2 cache. */
1218 64, /* size of prefetch block */
1219 /* New AMD processors never drop prefetches; if they cannot be performed
1220 immediately, they are queued. We set number of simultaneous prefetches
1221 to a large constant to reflect this (it probably is not a good idea not
1222 to limit number of prefetches at all, as their execution also takes some
1223 time). */
1224 100, /* number of parallel prefetches */
1225 2, /* Branch cost */
1226 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1227 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1228 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1229 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1230 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1231 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1232
1233    /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1234       very small blocks it is better to use a loop.  For large blocks, libcall can
1235       do nontemporal accesses and beat inline considerably.  */
1236 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1237 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1238 {{libcall, {{8, loop}, {24, unrolled_loop},
1239 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1240 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1241 4, /* scalar_stmt_cost. */
1242 2, /* scalar load_cost. */
1243 2, /* scalar_store_cost. */
1244 6, /* vec_stmt_cost. */
1245 0, /* vec_to_scalar_cost. */
1246 2, /* scalar_to_vec_cost. */
1247 2, /* vec_align_load_cost. */
1248 2, /* vec_unalign_load_cost. */
1249 2, /* vec_store_cost. */
1250 2, /* cond_taken_branch_cost. */
1251 1, /* cond_not_taken_branch_cost. */
1252 };
1253
1254 struct processor_costs bdver1_cost = {
1255 COSTS_N_INSNS (1), /* cost of an add instruction */
1256 COSTS_N_INSNS (1), /* cost of a lea instruction */
1257 COSTS_N_INSNS (1), /* variable shift costs */
1258 COSTS_N_INSNS (1), /* constant shift costs */
1259 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1260 COSTS_N_INSNS (4), /* HI */
1261 COSTS_N_INSNS (4), /* SI */
1262 COSTS_N_INSNS (6), /* DI */
1263 COSTS_N_INSNS (6)}, /* other */
1264 0, /* cost of multiply per each bit set */
1265 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1266 COSTS_N_INSNS (35), /* HI */
1267 COSTS_N_INSNS (51), /* SI */
1268 COSTS_N_INSNS (83), /* DI */
1269 COSTS_N_INSNS (83)}, /* other */
1270 COSTS_N_INSNS (1), /* cost of movsx */
1271 COSTS_N_INSNS (1), /* cost of movzx */
1272 8, /* "large" insn */
1273 9, /* MOVE_RATIO */
1274 4, /* cost for loading QImode using movzbl */
1275 {5, 5, 4}, /* cost of loading integer registers
1276 in QImode, HImode and SImode.
1277 Relative to reg-reg move (2). */
1278 {4, 4, 4}, /* cost of storing integer registers */
1279 2, /* cost of reg,reg fld/fst */
1280 {5, 5, 12}, /* cost of loading fp registers
1281 in SFmode, DFmode and XFmode */
1282 {4, 4, 8}, /* cost of storing fp registers
1283 in SFmode, DFmode and XFmode */
1284 2, /* cost of moving MMX register */
1285 {4, 4}, /* cost of loading MMX registers
1286 in SImode and DImode */
1287 {4, 4}, /* cost of storing MMX registers
1288 in SImode and DImode */
1289 2, /* cost of moving SSE register */
1290 {4, 4, 4}, /* cost of loading SSE registers
1291 in SImode, DImode and TImode */
1292 {4, 4, 4}, /* cost of storing SSE registers
1293 in SImode, DImode and TImode */
1294 2, /* MMX or SSE register to integer */
1295 /* On K8:
1296 MOVD reg64, xmmreg Double FSTORE 4
1297 MOVD reg32, xmmreg Double FSTORE 4
1298 On AMDFAM10:
1299 MOVD reg64, xmmreg Double FADD 3
1300 1/1 1/1
1301 MOVD reg32, xmmreg Double FADD 3
1302 1/1 1/1 */
1303 16, /* size of l1 cache. */
1304 2048, /* size of l2 cache. */
1305 64, /* size of prefetch block */
1306 /* New AMD processors never drop prefetches; if they cannot be performed
1307 immediately, they are queued. We set number of simultaneous prefetches
1308 to a large constant to reflect this (it probably is not a good idea not
1309 to limit number of prefetches at all, as their execution also takes some
1310 time). */
1311 100, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1319
1320    /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
1321       very small blocks it is better to use a loop.  For large blocks, libcall
1322       can do nontemporal accesses and beat inline considerably.  */
1323 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1324 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1325 {{libcall, {{8, loop}, {24, unrolled_loop},
1326 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1327 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1328 6, /* scalar_stmt_cost. */
1329 4, /* scalar load_cost. */
1330 4, /* scalar_store_cost. */
1331 6, /* vec_stmt_cost. */
1332 0, /* vec_to_scalar_cost. */
1333 2, /* scalar_to_vec_cost. */
1334 4, /* vec_align_load_cost. */
1335 4, /* vec_unalign_load_cost. */
1336 4, /* vec_store_cost. */
1337 2, /* cond_taken_branch_cost. */
1338 1, /* cond_not_taken_branch_cost. */
1339 };
1340
1341 struct processor_costs bdver2_cost = {
1342 COSTS_N_INSNS (1), /* cost of an add instruction */
1343 COSTS_N_INSNS (1), /* cost of a lea instruction */
1344 COSTS_N_INSNS (1), /* variable shift costs */
1345 COSTS_N_INSNS (1), /* constant shift costs */
1346 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1347 COSTS_N_INSNS (4), /* HI */
1348 COSTS_N_INSNS (4), /* SI */
1349 COSTS_N_INSNS (6), /* DI */
1350 COSTS_N_INSNS (6)}, /* other */
1351 0, /* cost of multiply per each bit set */
1352 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1353 COSTS_N_INSNS (35), /* HI */
1354 COSTS_N_INSNS (51), /* SI */
1355 COSTS_N_INSNS (83), /* DI */
1356 COSTS_N_INSNS (83)}, /* other */
1357 COSTS_N_INSNS (1), /* cost of movsx */
1358 COSTS_N_INSNS (1), /* cost of movzx */
1359 8, /* "large" insn */
1360 9, /* MOVE_RATIO */
1361 4, /* cost for loading QImode using movzbl */
1362 {5, 5, 4}, /* cost of loading integer registers
1363 in QImode, HImode and SImode.
1364 Relative to reg-reg move (2). */
1365 {4, 4, 4}, /* cost of storing integer registers */
1366 2, /* cost of reg,reg fld/fst */
1367 {5, 5, 12}, /* cost of loading fp registers
1368 in SFmode, DFmode and XFmode */
1369 {4, 4, 8}, /* cost of storing fp registers
1370 in SFmode, DFmode and XFmode */
1371 2, /* cost of moving MMX register */
1372 {4, 4}, /* cost of loading MMX registers
1373 in SImode and DImode */
1374 {4, 4}, /* cost of storing MMX registers
1375 in SImode and DImode */
1376 2, /* cost of moving SSE register */
1377 {4, 4, 4}, /* cost of loading SSE registers
1378 in SImode, DImode and TImode */
1379 {4, 4, 4}, /* cost of storing SSE registers
1380 in SImode, DImode and TImode */
1381 2, /* MMX or SSE register to integer */
1382 /* On K8:
1383 MOVD reg64, xmmreg Double FSTORE 4
1384 MOVD reg32, xmmreg Double FSTORE 4
1385 On AMDFAM10:
1386 MOVD reg64, xmmreg Double FADD 3
1387 1/1 1/1
1388 MOVD reg32, xmmreg Double FADD 3
1389 1/1 1/1 */
1390 16, /* size of l1 cache. */
1391 2048, /* size of l2 cache. */
1392 64, /* size of prefetch block */
1393 /* New AMD processors never drop prefetches; if they cannot be performed
1394 immediately, they are queued. We set number of simultaneous prefetches
1395 to a large constant to reflect this (it probably is not a good idea not
1396 to limit number of prefetches at all, as their execution also takes some
1397 time). */
1398 100, /* number of parallel prefetches */
1399 2, /* Branch cost */
1400 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1401 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1402 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1403 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1404 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1405 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1406
1407    /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1408       very small blocks it is better to use a loop.  For large blocks, libcall
1409       can do nontemporal accesses and beat inline considerably.  */
1410 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1411 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1412 {{libcall, {{8, loop}, {24, unrolled_loop},
1413 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1414 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1415 6, /* scalar_stmt_cost. */
1416 4, /* scalar load_cost. */
1417 4, /* scalar_store_cost. */
1418 6, /* vec_stmt_cost. */
1419 0, /* vec_to_scalar_cost. */
1420 2, /* scalar_to_vec_cost. */
1421 4, /* vec_align_load_cost. */
1422 4, /* vec_unalign_load_cost. */
1423 4, /* vec_store_cost. */
1424 2, /* cond_taken_branch_cost. */
1425 1, /* cond_not_taken_branch_cost. */
1426 };
1427
1428 struct processor_costs btver1_cost = {
1429 COSTS_N_INSNS (1), /* cost of an add instruction */
1430 COSTS_N_INSNS (2), /* cost of a lea instruction */
1431 COSTS_N_INSNS (1), /* variable shift costs */
1432 COSTS_N_INSNS (1), /* constant shift costs */
1433 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1434 COSTS_N_INSNS (4), /* HI */
1435 COSTS_N_INSNS (3), /* SI */
1436 COSTS_N_INSNS (4), /* DI */
1437 COSTS_N_INSNS (5)}, /* other */
1438 0, /* cost of multiply per each bit set */
1439 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1440 COSTS_N_INSNS (35), /* HI */
1441 COSTS_N_INSNS (51), /* SI */
1442 COSTS_N_INSNS (83), /* DI */
1443 COSTS_N_INSNS (83)}, /* other */
1444 COSTS_N_INSNS (1), /* cost of movsx */
1445 COSTS_N_INSNS (1), /* cost of movzx */
1446 8, /* "large" insn */
1447 9, /* MOVE_RATIO */
1448 4, /* cost for loading QImode using movzbl */
1449 {3, 4, 3}, /* cost of loading integer registers
1450 in QImode, HImode and SImode.
1451 Relative to reg-reg move (2). */
1452 {3, 4, 3}, /* cost of storing integer registers */
1453 4, /* cost of reg,reg fld/fst */
1454 {4, 4, 12}, /* cost of loading fp registers
1455 in SFmode, DFmode and XFmode */
1456 {6, 6, 8}, /* cost of storing fp registers
1457 in SFmode, DFmode and XFmode */
1458 2, /* cost of moving MMX register */
1459 {3, 3}, /* cost of loading MMX registers
1460 in SImode and DImode */
1461 {4, 4}, /* cost of storing MMX registers
1462 in SImode and DImode */
1463 2, /* cost of moving SSE register */
1464 {4, 4, 3}, /* cost of loading SSE registers
1465 in SImode, DImode and TImode */
1466 {4, 4, 5}, /* cost of storing SSE registers
1467 in SImode, DImode and TImode */
1468 3, /* MMX or SSE register to integer */
1469 /* On K8:
1470 MOVD reg64, xmmreg Double FSTORE 4
1471 MOVD reg32, xmmreg Double FSTORE 4
1472 On AMDFAM10:
1473 MOVD reg64, xmmreg Double FADD 3
1474 1/1 1/1
1475 MOVD reg32, xmmreg Double FADD 3
1476 1/1 1/1 */
1477 32, /* size of l1 cache. */
1478 512, /* size of l2 cache. */
1479 64, /* size of prefetch block */
1480 100, /* number of parallel prefetches */
1481 2, /* Branch cost */
1482 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1483 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1484 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1485 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1486 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1487 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1488
1489    /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1490       very small blocks it is better to use a loop.  For large blocks, libcall can
1491       do nontemporal accesses and beat inline considerably.  */
1492 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1493 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1494 {{libcall, {{8, loop}, {24, unrolled_loop},
1495 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1496 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1497 4, /* scalar_stmt_cost. */
1498 2, /* scalar load_cost. */
1499 2, /* scalar_store_cost. */
1500 6, /* vec_stmt_cost. */
1501 0, /* vec_to_scalar_cost. */
1502 2, /* scalar_to_vec_cost. */
1503 2, /* vec_align_load_cost. */
1504 2, /* vec_unalign_load_cost. */
1505 2, /* vec_store_cost. */
1506 2, /* cond_taken_branch_cost. */
1507 1, /* cond_not_taken_branch_cost. */
1508 };
1509
1510 static const
1511 struct processor_costs pentium4_cost = {
1512 COSTS_N_INSNS (1), /* cost of an add instruction */
1513 COSTS_N_INSNS (3), /* cost of a lea instruction */
1514 COSTS_N_INSNS (4), /* variable shift costs */
1515 COSTS_N_INSNS (4), /* constant shift costs */
1516 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1517 COSTS_N_INSNS (15), /* HI */
1518 COSTS_N_INSNS (15), /* SI */
1519 COSTS_N_INSNS (15), /* DI */
1520 COSTS_N_INSNS (15)}, /* other */
1521 0, /* cost of multiply per each bit set */
1522 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1523 COSTS_N_INSNS (56), /* HI */
1524 COSTS_N_INSNS (56), /* SI */
1525 COSTS_N_INSNS (56), /* DI */
1526 COSTS_N_INSNS (56)}, /* other */
1527 COSTS_N_INSNS (1), /* cost of movsx */
1528 COSTS_N_INSNS (1), /* cost of movzx */
1529 16, /* "large" insn */
1530 6, /* MOVE_RATIO */
1531 2, /* cost for loading QImode using movzbl */
1532 {4, 5, 4}, /* cost of loading integer registers
1533 in QImode, HImode and SImode.
1534 Relative to reg-reg move (2). */
1535 {2, 3, 2}, /* cost of storing integer registers */
1536 2, /* cost of reg,reg fld/fst */
1537 {2, 2, 6}, /* cost of loading fp registers
1538 in SFmode, DFmode and XFmode */
1539 {4, 4, 6}, /* cost of storing fp registers
1540 in SFmode, DFmode and XFmode */
1541 2, /* cost of moving MMX register */
1542 {2, 2}, /* cost of loading MMX registers
1543 in SImode and DImode */
1544 {2, 2}, /* cost of storing MMX registers
1545 in SImode and DImode */
1546 12, /* cost of moving SSE register */
1547 {12, 12, 12}, /* cost of loading SSE registers
1548 in SImode, DImode and TImode */
1549 {2, 2, 8}, /* cost of storing SSE registers
1550 in SImode, DImode and TImode */
1551 10, /* MMX or SSE register to integer */
1552 8, /* size of l1 cache. */
1553 256, /* size of l2 cache. */
1554 64, /* size of prefetch block */
1555 6, /* number of parallel prefetches */
1556 2, /* Branch cost */
1557 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1558 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1559 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1560 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1561 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1562 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1563 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1564 DUMMY_STRINGOP_ALGS},
1565 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1566 {-1, libcall}}},
1567 DUMMY_STRINGOP_ALGS},
1568 1, /* scalar_stmt_cost. */
1569 1, /* scalar_load_cost. */
1570 1, /* scalar_store_cost. */
1571 1, /* vec_stmt_cost. */
1572 1, /* vec_to_scalar_cost. */
1573 1, /* scalar_to_vec_cost. */
1574 1, /* vec_align_load_cost. */
1575 2, /* vec_unalign_load_cost. */
1576 1, /* vec_store_cost. */
1577 3, /* cond_taken_branch_cost. */
1578 1, /* cond_not_taken_branch_cost. */
1579 };
1580
1581 static const
1582 struct processor_costs nocona_cost = {
1583 COSTS_N_INSNS (1), /* cost of an add instruction */
1584 COSTS_N_INSNS (1), /* cost of a lea instruction */
1585 COSTS_N_INSNS (1), /* variable shift costs */
1586 COSTS_N_INSNS (1), /* constant shift costs */
1587 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1588 COSTS_N_INSNS (10), /* HI */
1589 COSTS_N_INSNS (10), /* SI */
1590 COSTS_N_INSNS (10), /* DI */
1591 COSTS_N_INSNS (10)}, /* other */
1592 0, /* cost of multiply per each bit set */
1593 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1594 COSTS_N_INSNS (66), /* HI */
1595 COSTS_N_INSNS (66), /* SI */
1596 COSTS_N_INSNS (66), /* DI */
1597 COSTS_N_INSNS (66)}, /* other */
1598 COSTS_N_INSNS (1), /* cost of movsx */
1599 COSTS_N_INSNS (1), /* cost of movzx */
1600 16, /* "large" insn */
1601 17, /* MOVE_RATIO */
1602 4, /* cost for loading QImode using movzbl */
1603 {4, 4, 4}, /* cost of loading integer registers
1604 in QImode, HImode and SImode.
1605 Relative to reg-reg move (2). */
1606 {4, 4, 4}, /* cost of storing integer registers */
1607 3, /* cost of reg,reg fld/fst */
1608 {12, 12, 12}, /* cost of loading fp registers
1609 in SFmode, DFmode and XFmode */
1610 {4, 4, 4}, /* cost of storing fp registers
1611 in SFmode, DFmode and XFmode */
1612 6, /* cost of moving MMX register */
1613 {12, 12}, /* cost of loading MMX registers
1614 in SImode and DImode */
1615 {12, 12}, /* cost of storing MMX registers
1616 in SImode and DImode */
1617 6, /* cost of moving SSE register */
1618 {12, 12, 12}, /* cost of loading SSE registers
1619 in SImode, DImode and TImode */
1620 {12, 12, 12}, /* cost of storing SSE registers
1621 in SImode, DImode and TImode */
1622 8, /* MMX or SSE register to integer */
1623 8, /* size of l1 cache. */
1624 1024, /* size of l2 cache. */
1625 128, /* size of prefetch block */
1626 8, /* number of parallel prefetches */
1627 1, /* Branch cost */
1628 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1629 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1630 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1631 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1632 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1633 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1634 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1635 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1636 {100000, unrolled_loop}, {-1, libcall}}}},
1637 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1638 {-1, libcall}}},
1639 {libcall, {{24, loop}, {64, unrolled_loop},
1640 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1641 1, /* scalar_stmt_cost. */
1642 1, /* scalar_load_cost. */
1643 1, /* scalar_store_cost. */
1644 1, /* vec_stmt_cost. */
1645 1, /* vec_to_scalar_cost. */
1646 1, /* scalar_to_vec_cost. */
1647 1, /* vec_align_load_cost. */
1648 2, /* vec_unalign_load_cost. */
1649 1, /* vec_store_cost. */
1650 3, /* cond_taken_branch_cost. */
1651 1, /* cond_not_taken_branch_cost. */
1652 };
1653
1654 static const
1655 struct processor_costs atom_cost = {
1656 COSTS_N_INSNS (1), /* cost of an add instruction */
1657 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1658 COSTS_N_INSNS (1), /* variable shift costs */
1659 COSTS_N_INSNS (1), /* constant shift costs */
1660 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1661 COSTS_N_INSNS (4), /* HI */
1662 COSTS_N_INSNS (3), /* SI */
1663 COSTS_N_INSNS (4), /* DI */
1664 COSTS_N_INSNS (2)}, /* other */
1665 0, /* cost of multiply per each bit set */
1666 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1667 COSTS_N_INSNS (26), /* HI */
1668 COSTS_N_INSNS (42), /* SI */
1669 COSTS_N_INSNS (74), /* DI */
1670 COSTS_N_INSNS (74)}, /* other */
1671 COSTS_N_INSNS (1), /* cost of movsx */
1672 COSTS_N_INSNS (1), /* cost of movzx */
1673 8, /* "large" insn */
1674 17, /* MOVE_RATIO */
1675 4, /* cost for loading QImode using movzbl */
1676 {4, 4, 4}, /* cost of loading integer registers
1677 in QImode, HImode and SImode.
1678 Relative to reg-reg move (2). */
1679 {4, 4, 4}, /* cost of storing integer registers */
1680 4, /* cost of reg,reg fld/fst */
1681 {12, 12, 12}, /* cost of loading fp registers
1682 in SFmode, DFmode and XFmode */
1683 {6, 6, 8}, /* cost of storing fp registers
1684 in SFmode, DFmode and XFmode */
1685 2, /* cost of moving MMX register */
1686 {8, 8}, /* cost of loading MMX registers
1687 in SImode and DImode */
1688 {8, 8}, /* cost of storing MMX registers
1689 in SImode and DImode */
1690 2, /* cost of moving SSE register */
1691 {8, 8, 8}, /* cost of loading SSE registers
1692 in SImode, DImode and TImode */
1693 {8, 8, 8}, /* cost of storing SSE registers
1694 in SImode, DImode and TImode */
1695 5, /* MMX or SSE register to integer */
1696 32, /* size of l1 cache. */
1697 256, /* size of l2 cache. */
1698 64, /* size of prefetch block */
1699 6, /* number of parallel prefetches */
1700 3, /* Branch cost */
1701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1702 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1703 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1704 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1705 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1706 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1707 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1708 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1709 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1710 {{libcall, {{8, loop}, {15, unrolled_loop},
1711 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1712 {libcall, {{24, loop}, {32, unrolled_loop},
1713 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1714 1, /* scalar_stmt_cost. */
1715 1, /* scalar_load_cost. */
1716 1, /* scalar_store_cost. */
1717 1, /* vec_stmt_cost. */
1718 1, /* vec_to_scalar_cost. */
1719 1, /* scalar_to_vec_cost. */
1720 1, /* vec_align_load_cost. */
1721 2, /* vec_unalign_load_cost. */
1722 1, /* vec_store_cost. */
1723 3, /* cond_taken_branch_cost. */
1724 1, /* cond_not_taken_branch_cost. */
1725 };
1726
1727 /* Generic64 should produce code tuned for Nocona and K8. */
1728 static const
1729 struct processor_costs generic64_cost = {
1730 COSTS_N_INSNS (1), /* cost of an add instruction */
1731 /* On all chips taken into consideration, lea is 2 cycles or more. With
1732 this cost, however, our current implementation of synth_mult results in
1733 the use of unnecessary temporary registers, causing regressions on several
1734 SPECfp benchmarks. */
1735 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1736 COSTS_N_INSNS (1), /* variable shift costs */
1737 COSTS_N_INSNS (1), /* constant shift costs */
1738 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1739 COSTS_N_INSNS (4), /* HI */
1740 COSTS_N_INSNS (3), /* SI */
1741 COSTS_N_INSNS (4), /* DI */
1742 COSTS_N_INSNS (2)}, /* other */
1743 0, /* cost of multiply per each bit set */
1744 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1745 COSTS_N_INSNS (26), /* HI */
1746 COSTS_N_INSNS (42), /* SI */
1747 COSTS_N_INSNS (74), /* DI */
1748 COSTS_N_INSNS (74)}, /* other */
1749 COSTS_N_INSNS (1), /* cost of movsx */
1750 COSTS_N_INSNS (1), /* cost of movzx */
1751 8, /* "large" insn */
1752 17, /* MOVE_RATIO */
1753 4, /* cost for loading QImode using movzbl */
1754 {4, 4, 4}, /* cost of loading integer registers
1755 in QImode, HImode and SImode.
1756 Relative to reg-reg move (2). */
1757 {4, 4, 4}, /* cost of storing integer registers */
1758 4, /* cost of reg,reg fld/fst */
1759 {12, 12, 12}, /* cost of loading fp registers
1760 in SFmode, DFmode and XFmode */
1761 {6, 6, 8}, /* cost of storing fp registers
1762 in SFmode, DFmode and XFmode */
1763 2, /* cost of moving MMX register */
1764 {8, 8}, /* cost of loading MMX registers
1765 in SImode and DImode */
1766 {8, 8}, /* cost of storing MMX registers
1767 in SImode and DImode */
1768 2, /* cost of moving SSE register */
1769 {8, 8, 8}, /* cost of loading SSE registers
1770 in SImode, DImode and TImode */
1771 {8, 8, 8}, /* cost of storing SSE registers
1772 in SImode, DImode and TImode */
1773 5, /* MMX or SSE register to integer */
1774 32, /* size of l1 cache. */
1775 512, /* size of l2 cache. */
1776 64, /* size of prefetch block */
1777 6, /* number of parallel prefetches */
1778 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1779 value is increased to the perhaps more appropriate value of 5. */
1780 3, /* Branch cost */
1781 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1782 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1783 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1784 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1785 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1786 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1787 {DUMMY_STRINGOP_ALGS,
1788 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1789 {DUMMY_STRINGOP_ALGS,
1790 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 1, /* scalar_stmt_cost. */
1792 1, /* scalar_load_cost. */
1793 1, /* scalar_store_cost. */
1794 1, /* vec_stmt_cost. */
1795 1, /* vec_to_scalar_cost. */
1796 1, /* scalar_to_vec_cost. */
1797 1, /* vec_align_load_cost. */
1798 2, /* vec_unalign_load_cost. */
1799 1, /* vec_store_cost. */
1800 3, /* cond_taken_branch_cost. */
1801 1, /* cond_not_taken_branch_cost. */
1802 };
1803
1804 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1805 Athlon and K8. */
1806 static const
1807 struct processor_costs generic32_cost = {
1808 COSTS_N_INSNS (1), /* cost of an add instruction */
1809 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1810 COSTS_N_INSNS (1), /* variable shift costs */
1811 COSTS_N_INSNS (1), /* constant shift costs */
1812 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1813 COSTS_N_INSNS (4), /* HI */
1814 COSTS_N_INSNS (3), /* SI */
1815 COSTS_N_INSNS (4), /* DI */
1816 COSTS_N_INSNS (2)}, /* other */
1817 0, /* cost of multiply per each bit set */
1818 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1819 COSTS_N_INSNS (26), /* HI */
1820 COSTS_N_INSNS (42), /* SI */
1821 COSTS_N_INSNS (74), /* DI */
1822 COSTS_N_INSNS (74)}, /* other */
1823 COSTS_N_INSNS (1), /* cost of movsx */
1824 COSTS_N_INSNS (1), /* cost of movzx */
1825 8, /* "large" insn */
1826 17, /* MOVE_RATIO */
1827 4, /* cost for loading QImode using movzbl */
1828 {4, 4, 4}, /* cost of loading integer registers
1829 in QImode, HImode and SImode.
1830 Relative to reg-reg move (2). */
1831 {4, 4, 4}, /* cost of storing integer registers */
1832 4, /* cost of reg,reg fld/fst */
1833 {12, 12, 12}, /* cost of loading fp registers
1834 in SFmode, DFmode and XFmode */
1835 {6, 6, 8}, /* cost of storing fp registers
1836 in SFmode, DFmode and XFmode */
1837 2, /* cost of moving MMX register */
1838 {8, 8}, /* cost of loading MMX registers
1839 in SImode and DImode */
1840 {8, 8}, /* cost of storing MMX registers
1841 in SImode and DImode */
1842 2, /* cost of moving SSE register */
1843 {8, 8, 8}, /* cost of loading SSE registers
1844 in SImode, DImode and TImode */
1845 {8, 8, 8}, /* cost of storing SSE registers
1846 in SImode, DImode and TImode */
1847 5, /* MMX or SSE register to integer */
1848 32, /* size of l1 cache. */
1849 256, /* size of l2 cache. */
1850 64, /* size of prefetch block */
1851 6, /* number of parallel prefetches */
1852 3, /* Branch cost */
1853 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1854 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1855 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1856 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1857 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1858 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1859 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1860 DUMMY_STRINGOP_ALGS},
1861 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1862 DUMMY_STRINGOP_ALGS},
1863 1, /* scalar_stmt_cost. */
1864 1, /* scalar load_cost. */
1865 1, /* scalar_store_cost. */
1866 1, /* vec_stmt_cost. */
1867 1, /* vec_to_scalar_cost. */
1868 1, /* scalar_to_vec_cost. */
1869 1, /* vec_align_load_cost. */
1870 2, /* vec_unalign_load_cost. */
1871 1, /* vec_store_cost. */
1872 3, /* cond_taken_branch_cost. */
1873 1, /* cond_not_taken_branch_cost. */
1874 };
1875
1876 const struct processor_costs *ix86_cost = &pentium_cost;
1877
1878 /* Processor feature/optimization bitmasks. */
1879 #define m_386 (1<<PROCESSOR_I386)
1880 #define m_486 (1<<PROCESSOR_I486)
1881 #define m_PENT (1<<PROCESSOR_PENTIUM)
1882 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1883 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1884 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1885 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1886 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1887 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1888 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1889 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1890 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1891 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1892 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1893 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1894 #define m_ATOM (1<<PROCESSOR_ATOM)
1895
1896 #define m_GEODE (1<<PROCESSOR_GEODE)
1897 #define m_K6 (1<<PROCESSOR_K6)
1898 #define m_K6_GEODE (m_K6 | m_GEODE)
1899 #define m_K8 (1<<PROCESSOR_K8)
1900 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1901 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1902 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1903 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1904 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1905 #define m_BDVER (m_BDVER1 | m_BDVER2)
1906 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1907 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1908
1909 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1910 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1911
1912 /* Generic instruction choice should be common subset of supported CPUs
1913 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1914 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1915
1916 /* Feature tests against the various tunings. */
1917 unsigned char ix86_tune_features[X86_TUNE_LAST];
1918
1919 /* Feature tests against the various tunings used to create ix86_tune_features
1920 based on the processor mask. */
1921 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1922 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1923 negatively, so enabling it for Generic64 seems like a good code-size
1924 tradeoff. We can't enable it for 32-bit generic because it does not
1925 work well with PPro-based chips. */
1926 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1927
1928 /* X86_TUNE_PUSH_MEMORY */
1929 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1930
1931 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1932 m_486 | m_PENT,
1933
1934 /* X86_TUNE_UNROLL_STRLEN */
1935 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1936
1937 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1938 on simulation results. But after P4 was made, no performance benefit
1939 was observed with branch hints; they also increase the code size.
1940 As a result, icc never generates branch hints. */
1941 0,
1942
1943 /* X86_TUNE_DOUBLE_WITH_ADD */
1944 ~m_386,
1945
1946 /* X86_TUNE_USE_SAHF */
1947 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1948
1949 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1950 partial dependencies. */
1951 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1952
1953 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1954 register stalls on the Generic32 compilation setting as well. However,
1955 in the current implementation the partial register stalls are not
1956 eliminated very well - they can be introduced via subregs synthesized by
1957 combine and can happen in caller/callee saving sequences. Because this
1958 option pays back little on PPro-based chips and conflicts with the
1959 partial-register dependencies used by Athlon/P4-based chips, it is better
1960 to leave it off for generic32 for now. */
1961 m_PPRO,
1962
1963 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1964 m_CORE2I7 | m_GENERIC,
1965
1966 /* X86_TUNE_USE_HIMODE_FIOP */
1967 m_386 | m_486 | m_K6_GEODE,
1968
1969 /* X86_TUNE_USE_SIMODE_FIOP */
1970 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1971
1972 /* X86_TUNE_USE_MOV0 */
1973 m_K6,
1974
1975 /* X86_TUNE_USE_CLTD */
1976 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1977
1978 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1979 m_PENT4,
1980
1981 /* X86_TUNE_SPLIT_LONG_MOVES */
1982 m_PPRO,
1983
1984 /* X86_TUNE_READ_MODIFY_WRITE */
1985 ~m_PENT,
1986
1987 /* X86_TUNE_READ_MODIFY */
1988 ~(m_PENT | m_PPRO),
1989
1990 /* X86_TUNE_PROMOTE_QIMODE */
1991 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1992
1993 /* X86_TUNE_FAST_PREFIX */
1994 ~(m_386 | m_486 | m_PENT),
1995
1996 /* X86_TUNE_SINGLE_STRINGOP */
1997 m_386 | m_P4_NOCONA,
1998
1999 /* X86_TUNE_QIMODE_MATH */
2000 ~0,
2001
2002 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2003 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL, this option
2004 might be considered for Generic32 if our scheme for avoiding partial
2005 stalls were more effective. */
2006 ~m_PPRO,
2007
2008 /* X86_TUNE_PROMOTE_QI_REGS */
2009 0,
2010
2011 /* X86_TUNE_PROMOTE_HI_REGS */
2012 m_PPRO,
2013
2014 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2015 over esp addition. */
2016 m_386 | m_486 | m_PENT | m_PPRO,
2017
2018 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2019 over esp addition. */
2020 m_PENT,
2021
2022 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2023 over esp subtraction. */
2024 m_386 | m_486 | m_PENT | m_K6_GEODE,
2025
2026 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
2027 over esp subtraction. */
2028 m_PENT | m_K6_GEODE,
2029
2030 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2031 for DFmode copies */
2032 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2033
2034 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2035 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2036
2037 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2038 conflict here between PPro/Pentium4 based chips that treat 128bit
2039 SSE registers as single units versus K8 based chips that divide SSE
2040 registers into two 64bit halves. This knob promotes all store destinations
2041 to be 128bit to allow register renaming on 128bit SSE units, but usually
2042 results in one extra microop on 64bit SSE units. Experimental results
2043 show that disabling this option on P4 brings over 20% SPECfp regression,
2044 while enabling it on K8 brings roughly 2.4% regression that can be partly
2045 masked by careful scheduling of moves. */
2046 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2047
2048 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2049 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2050
2051 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2052 m_COREI7 | m_BDVER,
2053
2054 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2055 m_BDVER,
2056
2057 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2058 are resolved on SSE register parts instead of whole registers, so we may
2059 maintain just the lower part of scalar values in the proper format, leaving
2060 the upper part undefined. */
2061 m_ATHLON_K8,
2062
2063 /* X86_TUNE_SSE_TYPELESS_STORES */
2064 m_AMD_MULTIPLE,
2065
2066 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2067 m_PPRO | m_P4_NOCONA,
2068
2069 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2070 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2071
2072 /* X86_TUNE_PROLOGUE_USING_MOVE */
2073 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2074
2075 /* X86_TUNE_EPILOGUE_USING_MOVE */
2076 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2077
2078 /* X86_TUNE_SHIFT1 */
2079 ~m_486,
2080
2081 /* X86_TUNE_USE_FFREEP */
2082 m_AMD_MULTIPLE,
2083
2084 /* X86_TUNE_INTER_UNIT_MOVES */
2085 ~(m_AMD_MULTIPLE | m_GENERIC),
2086
2087 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2088 ~(m_AMDFAM10 | m_BDVER),
2089
2090 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2091 than 4 branch instructions in the 16 byte window. */
2092 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2093
2094 /* X86_TUNE_SCHEDULE */
2095 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2096
2097 /* X86_TUNE_USE_BT */
2098 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2099
2100 /* X86_TUNE_USE_INCDEC */
2101 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2102
2103 /* X86_TUNE_PAD_RETURNS */
2104 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2105
2106 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2107 m_ATOM,
2108
2109 /* X86_TUNE_EXT_80387_CONSTANTS */
2110 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2111
2112 /* X86_TUNE_SHORTEN_X87_SSE */
2113 ~m_K8,
2114
2115 /* X86_TUNE_AVOID_VECTOR_DECODE */
2116 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2117
2118 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2119 and SImode multiply, but the 386 and 486 do HImode multiply faster. */
2120 ~(m_386 | m_486),
2121
2122 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2123 vector path on AMD machines. */
2124 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2125
2126 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2127 machines. */
2128 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2129
2130 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2131 than a MOV. */
2132 m_PENT,
2133
2134 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2135 but one byte longer. */
2136 m_PENT,
2137
2138 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
2139 operand that cannot be represented using a modRM byte. The XOR
2140 replacement is long decoded, so this split helps here as well. */
2141 m_K6,
2142
2143 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2144 from FP to FP. */
2145 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2146
2147 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2148 from integer to FP. */
2149 m_AMDFAM10,
2150
2151 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2152 with a subsequent conditional jump instruction into a single
2153 compare-and-branch uop. */
2154 m_BDVER,
2155
2156 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2157 will impact LEA instruction selection. */
2158 m_ATOM,
2159
2160 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2161 instructions. */
2162 ~m_ATOM,
2163
2164 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2165 at -O3. For the moment, the prefetching seems badly tuned for Intel
2166 chips. */
2167 m_K6_GEODE | m_AMD_MULTIPLE,
2168
2169 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2170 the auto-vectorizer. */
2171 m_BDVER,
2172
2173 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2174 during reassociation of integer computation. */
2175 m_ATOM,
2176
2177 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2178 during reassociation of fp computation. */
2179 m_ATOM
2180 };
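/* As a rough illustration of how the table above is consumed: later in
   this file, ix86_option_override_internal builds the mask of the selected
   -mtune processor and tests each entry against it.  A minimal sketch of
   that step (not part of the build; the example_* name is purely
   illustrative):  */
#if 0
static void
example_fill_tune_features (void)
{
  unsigned int ix86_tune_mask = 1u << ix86_tune;
  unsigned int i;

  for (i = 0; i < X86_TUNE_LAST; ++i)
    ix86_tune_features[i]
      = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
}
#endif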
2181
2182 /* Feature tests against the various architecture variations. */
2183 unsigned char ix86_arch_features[X86_ARCH_LAST];
2184
2185 /* Feature tests against the various architecture variations, used to create
2186 ix86_arch_features based on the processor mask. */
2187 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2188 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
2189 ~(m_386 | m_486 | m_PENT | m_K6),
2190
2191 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2192 ~m_386,
2193
2194 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2195 ~(m_386 | m_486),
2196
2197 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2198 ~m_386,
2199
2200 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2201 ~m_386,
2202 };
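/* ix86_arch_features is filled in the same way, using the mask of the
   -march processor (1u << ix86_arch) against the table above.  */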
2203
2204 static const unsigned int x86_accumulate_outgoing_args
2205 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2206
2207 static const unsigned int x86_arch_always_fancy_math_387
2208 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2209
2210 static const unsigned int x86_avx256_split_unaligned_load
2211 = m_COREI7 | m_GENERIC;
2212
2213 static const unsigned int x86_avx256_split_unaligned_store
2214 = m_COREI7 | m_BDVER | m_GENERIC;
2215
2216 /* In case the average insn count for a single function invocation is
2217 lower than this constant, emit fast (but longer) prologue and
2218 epilogue code. */
2219 #define FAST_PROLOGUE_INSN_COUNT 20
2220
2221 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2222 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2223 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2224 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2225
2226 /* Array of the smallest class containing reg number REGNO, indexed by
2227 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2228
2229 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2230 {
2231 /* ax, dx, cx, bx */
2232 AREG, DREG, CREG, BREG,
2233 /* si, di, bp, sp */
2234 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2235 /* FP registers */
2236 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2237 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2238 /* arg pointer */
2239 NON_Q_REGS,
2240 /* flags, fpsr, fpcr, frame */
2241 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2242 /* SSE registers */
2243 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2244 SSE_REGS, SSE_REGS,
2245 /* MMX registers */
2246 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2247 MMX_REGS, MMX_REGS,
2248 /* REX registers */
2249 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2250 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251 /* SSE REX registers */
2252 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2253 SSE_REGS, SSE_REGS,
2254 };
2255
2256 /* The "default" register map used in 32bit mode. */
2257
2258 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2259 {
2260 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2261 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2262 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2263 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2264 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2265 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2266 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2267 };
2268
2269 /* The "default" register map used in 64bit mode. */
2270
2271 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2272 {
2273 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2274 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2275 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2276 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2277 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2278 8,9,10,11,12,13,14,15, /* extended integer registers */
2279 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2280 };
2281
2282 /* Define the register numbers to be used in Dwarf debugging information.
2283 The SVR4 reference port C compiler uses the following register numbers
2284 in its Dwarf output code:
2285 0 for %eax (gcc regno = 0)
2286 1 for %ecx (gcc regno = 2)
2287 2 for %edx (gcc regno = 1)
2288 3 for %ebx (gcc regno = 3)
2289 4 for %esp (gcc regno = 7)
2290 5 for %ebp (gcc regno = 6)
2291 6 for %esi (gcc regno = 4)
2292 7 for %edi (gcc regno = 5)
2293 The following three DWARF register numbers are never generated by
2294 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2295 believes these numbers have these meanings.
2296 8 for %eip (no gcc equivalent)
2297 9 for %eflags (gcc regno = 17)
2298 10 for %trapno (no gcc equivalent)
2299 It is not at all clear how we should number the FP stack registers
2300 for the x86 architecture. If the version of SDB on x86/svr4 were
2301 a bit less brain dead with respect to floating-point then we would
2302 have a precedent to follow with respect to DWARF register numbers
2303 for x86 FP registers, but the SDB on x86/svr4 is so completely
2304 broken with respect to FP registers that it is hardly worth thinking
2305 of it as something to strive for compatibility with.
2306 The version of x86/svr4 SDB I have at the moment does (partially)
2307 seem to believe that DWARF register number 11 is associated with
2308 the x86 register %st(0), but that's about all. Higher DWARF
2309 register numbers don't seem to be associated with anything in
2310 particular, and even for DWARF regno 11, SDB only seems to under-
2311 stand that it should say that a variable lives in %st(0) (when
2312 asked via an `=' command) if we said it was in DWARF regno 11,
2313 but SDB still prints garbage when asked for the value of the
2314 variable in question (via a `/' command).
2315 (Also note that the labels SDB prints for various FP stack regs
2316 when doing an `x' command are all wrong.)
2317 Note that these problems generally don't affect the native SVR4
2318 C compiler because it doesn't allow the use of -O with -g and
2319 because when it is *not* optimizing, it allocates a memory
2320 location for each floating-point variable, and the memory
2321 location is what gets described in the DWARF AT_location
2322 attribute for the variable in question.
2323 Regardless of the severe mental illness of the x86/svr4 SDB, we
2324 do something sensible here and we use the following DWARF
2325 register numbers. Note that these are all stack-top-relative
2326 numbers.
2327 11 for %st(0) (gcc regno = 8)
2328 12 for %st(1) (gcc regno = 9)
2329 13 for %st(2) (gcc regno = 10)
2330 14 for %st(3) (gcc regno = 11)
2331 15 for %st(4) (gcc regno = 12)
2332 16 for %st(5) (gcc regno = 13)
2333 17 for %st(6) (gcc regno = 14)
2334 18 for %st(7) (gcc regno = 15)
2335 */
2336 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2337 {
2338 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2339 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2340 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2341 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2342 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2343 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2344 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2345 };
2346
2347 /* Define parameter passing and return registers. */
2348
2349 static int const x86_64_int_parameter_registers[6] =
2350 {
2351 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2352 };
2353
2354 static int const x86_64_ms_abi_int_parameter_registers[4] =
2355 {
2356 CX_REG, DX_REG, R8_REG, R9_REG
2357 };
2358
2359 static int const x86_64_int_return_registers[4] =
2360 {
2361 AX_REG, DX_REG, DI_REG, SI_REG
2362 };
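/* For reference: the SysV x86-64 psABI assigns integer arguments to the
   registers above in order (%rdi, %rsi, %rdx, %rcx, %r8, %r9), so in
   e.g. "int f (int a, int b)" a arrives in %edi and b in %esi; the MS
   ABI uses %rcx, %rdx, %r8 and %r9 instead.  Integer return values start
   in %rax (AX_REG) and spill over into %rdx (DX_REG).  */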
2363
2364 /* Define the structure for the machine field in struct function. */
2365
2366 struct GTY(()) stack_local_entry {
2367 unsigned short mode;
2368 unsigned short n;
2369 rtx rtl;
2370 struct stack_local_entry *next;
2371 };
2372
2373 /* Structure describing stack frame layout.
2374 Stack grows downward:
2375
2376 [arguments]
2377 <- ARG_POINTER
2378 saved pc
2379
2380 saved static chain if ix86_static_chain_on_stack
2381
2382 saved frame pointer if frame_pointer_needed
2383 <- HARD_FRAME_POINTER
2384 [saved regs]
2385 <- regs_save_offset
2386 [padding0]
2387
2388 [saved SSE regs]
2389 <- sse_regs_save_offset
2390 [padding1] |
2391 | <- FRAME_POINTER
2392 [va_arg registers] |
2393 |
2394 [frame] |
2395 |
2396 [padding2] | = to_allocate
2397 <- STACK_POINTER
2398 */
2399 struct ix86_frame
2400 {
2401 int nsseregs;
2402 int nregs;
2403 int va_arg_size;
2404 int red_zone_size;
2405 int outgoing_arguments_size;
2406 HOST_WIDE_INT frame;
2407
2408 /* The offsets relative to ARG_POINTER. */
2409 HOST_WIDE_INT frame_pointer_offset;
2410 HOST_WIDE_INT hard_frame_pointer_offset;
2411 HOST_WIDE_INT stack_pointer_offset;
2412 HOST_WIDE_INT hfp_save_offset;
2413 HOST_WIDE_INT reg_save_offset;
2414 HOST_WIDE_INT sse_reg_save_offset;
2415
2416 /* When save_regs_using_mov is set, emit prologue using
2417 move instead of push instructions. */
2418 bool save_regs_using_mov;
2419 };
2420
2421 /* Which cpu are we scheduling for. */
2422 enum attr_cpu ix86_schedule;
2423
2424 /* Which cpu are we optimizing for. */
2425 enum processor_type ix86_tune;
2426
2427 /* Which instruction set architecture to use. */
2428 enum processor_type ix86_arch;
2429
2430 /* true if sse prefetch instruction is not NOOP. */
2431 int x86_prefetch_sse;
2432
2433 /* -mstackrealign option */
2434 static const char ix86_force_align_arg_pointer_string[]
2435 = "force_align_arg_pointer";
2436
2437 static rtx (*ix86_gen_leave) (void);
2438 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2439 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2440 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2441 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2442 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2443 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2445 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2446 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
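/* These hooks are bound to the SImode or DImode insn generators once the
   word size is known (done in ix86_option_override_internal).  A minimal
   sketch of the idea, shown only for the add hook (not part of the build;
   the example_* name is purely illustrative):  */
#if 0
static void
example_bind_gen_hooks (void)
{
  if (TARGET_64BIT)
    ix86_gen_add3 = gen_adddi3;
  else
    ix86_gen_add3 = gen_addsi3;
}
#endif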
2447
2448 /* Preferred alignment for stack boundary in bits. */
2449 unsigned int ix86_preferred_stack_boundary;
2450
2451 /* Alignment for incoming stack boundary in bits specified at
2452 command line. */
2453 static unsigned int ix86_user_incoming_stack_boundary;
2454
2455 /* Default alignment for incoming stack boundary in bits. */
2456 static unsigned int ix86_default_incoming_stack_boundary;
2457
2458 /* Alignment for incoming stack boundary in bits. */
2459 unsigned int ix86_incoming_stack_boundary;
2460
2461 /* Calling abi specific va_list type nodes. */
2462 static GTY(()) tree sysv_va_list_type_node;
2463 static GTY(()) tree ms_va_list_type_node;
2464
2465 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2466 char internal_label_prefix[16];
2467 int internal_label_prefix_len;
2468
2469 /* Fence to use after loop using movnt. */
2470 tree x86_mfence;
2471
2472 /* Register class used for passing a given 64bit part of the argument.
2473 These represent classes as documented by the psABI, with the exception of
2474 the SSESF and SSEDF classes, which are basically the SSE class, except that
2475 gcc will use SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
2476
2477 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2478 whenever possible (the upper half then contains only padding). */
2479 enum x86_64_reg_class
2480 {
2481 X86_64_NO_CLASS,
2482 X86_64_INTEGER_CLASS,
2483 X86_64_INTEGERSI_CLASS,
2484 X86_64_SSE_CLASS,
2485 X86_64_SSESF_CLASS,
2486 X86_64_SSEDF_CLASS,
2487 X86_64_SSEUP_CLASS,
2488 X86_64_X87_CLASS,
2489 X86_64_X87UP_CLASS,
2490 X86_64_COMPLEX_X87_CLASS,
2491 X86_64_MEMORY_CLASS
2492 };
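/* Worked example of the classification (a sketch; the real logic is in
   classify_argument further below): under the x86-64 psABI a 16-byte
   struct { double d; long l; } is split into two eightbytes, the first
   classified as X86_64_SSEDF_CLASS (passed in an XMM register) and the
   second as X86_64_INTEGER_CLASS (passed in a general register).  */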
2493
2494 #define MAX_CLASSES 4
2495
2496 /* Table of constants used by fldpi, fldln2, etc.... */
2497 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2498 static bool ext_80387_constants_init = 0;
2499
2500 \f
2501 static struct machine_function * ix86_init_machine_status (void);
2502 static rtx ix86_function_value (const_tree, const_tree, bool);
2503 static bool ix86_function_value_regno_p (const unsigned int);
2504 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2505 const_tree);
2506 static rtx ix86_static_chain (const_tree, bool);
2507 static int ix86_function_regparm (const_tree, const_tree);
2508 static void ix86_compute_frame_layout (struct ix86_frame *);
2509 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2510 rtx, rtx, int);
2511 static void ix86_add_new_builtins (HOST_WIDE_INT);
2512 static tree ix86_canonical_va_list_type (tree);
2513 static void predict_jump (int);
2514 static unsigned int split_stack_prologue_scratch_regno (void);
2515 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2516
2517 enum ix86_function_specific_strings
2518 {
2519 IX86_FUNCTION_SPECIFIC_ARCH,
2520 IX86_FUNCTION_SPECIFIC_TUNE,
2521 IX86_FUNCTION_SPECIFIC_MAX
2522 };
2523
2524 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2525 const char *, enum fpmath_unit, bool);
2526 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2527 static void ix86_function_specific_save (struct cl_target_option *);
2528 static void ix86_function_specific_restore (struct cl_target_option *);
2529 static void ix86_function_specific_print (FILE *, int,
2530 struct cl_target_option *);
2531 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2532 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2533 struct gcc_options *);
2534 static bool ix86_can_inline_p (tree, tree);
2535 static void ix86_set_current_function (tree);
2536 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2537
2538 static enum calling_abi ix86_function_abi (const_tree);
2539
2540 \f
2541 #ifndef SUBTARGET32_DEFAULT_CPU
2542 #define SUBTARGET32_DEFAULT_CPU "i386"
2543 #endif
2544
2545 /* The svr4 ABI for the i386 says that records and unions are returned
2546 in memory. */
2547 #ifndef DEFAULT_PCC_STRUCT_RETURN
2548 #define DEFAULT_PCC_STRUCT_RETURN 1
2549 #endif
2550
2551 /* Whether -mtune= or -march= were specified */
2552 static int ix86_tune_defaulted;
2553 static int ix86_arch_specified;
2554
2555 /* Vectorization library interface and handlers. */
2556 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2557
2558 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2559 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2560
2561 /* Processor target table, indexed by processor number */
2562 struct ptt
2563 {
2564 const struct processor_costs *cost; /* Processor costs */
2565 const int align_loop; /* Default alignments. */
2566 const int align_loop_max_skip;
2567 const int align_jump;
2568 const int align_jump_max_skip;
2569 const int align_func;
2570 };
2571
2572 static const struct ptt processor_target_table[PROCESSOR_max] =
2573 {
2574 {&i386_cost, 4, 3, 4, 3, 4},
2575 {&i486_cost, 16, 15, 16, 15, 16},
2576 {&pentium_cost, 16, 7, 16, 7, 16},
2577 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2578 {&geode_cost, 0, 0, 0, 0, 0},
2579 {&k6_cost, 32, 7, 32, 7, 32},
2580 {&athlon_cost, 16, 7, 16, 7, 16},
2581 {&pentium4_cost, 0, 0, 0, 0, 0},
2582 {&k8_cost, 16, 7, 16, 7, 16},
2583 {&nocona_cost, 0, 0, 0, 0, 0},
2584 /* Core 2 32-bit. */
2585 {&generic32_cost, 16, 10, 16, 10, 16},
2586 /* Core 2 64-bit. */
2587 {&generic64_cost, 16, 10, 16, 10, 16},
2588 /* Core i7 32-bit. */
2589 {&generic32_cost, 16, 10, 16, 10, 16},
2590 /* Core i7 64-bit. */
2591 {&generic64_cost, 16, 10, 16, 10, 16},
2592 {&generic32_cost, 16, 7, 16, 7, 16},
2593 {&generic64_cost, 16, 10, 16, 10, 16},
2594 {&amdfam10_cost, 32, 24, 32, 7, 32},
2595 {&bdver1_cost, 32, 24, 32, 7, 32},
2596 {&bdver2_cost, 32, 24, 32, 7, 32},
2597 {&btver1_cost, 32, 24, 32, 7, 32},
2598 {&atom_cost, 16, 15, 16, 7, 16}
2599 };
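/* The alignment columns above are only defaults; a rough sketch of how
   they are applied later in ix86_option_override_internal when the user
   did not give explicit -falign-* options (not part of the build):  */
#if 0
  if (align_loops == 0)
    {
      align_loops = processor_target_table[ix86_tune].align_loop;
      align_loops_max_skip
	= processor_target_table[ix86_tune].align_loop_max_skip;
    }
#endif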
2600
2601 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2602 {
2603 "generic",
2604 "i386",
2605 "i486",
2606 "pentium",
2607 "pentium-mmx",
2608 "pentiumpro",
2609 "pentium2",
2610 "pentium3",
2611 "pentium4",
2612 "pentium-m",
2613 "prescott",
2614 "nocona",
2615 "core2",
2616 "corei7",
2617 "atom",
2618 "geode",
2619 "k6",
2620 "k6-2",
2621 "k6-3",
2622 "athlon",
2623 "athlon-4",
2624 "k8",
2625 "amdfam10",
2626 "bdver1",
2627 "bdver2",
2628 "btver1"
2629 };
2630 \f
2631 /* Return true if a red-zone is in use. */
2632
2633 static inline bool
2634 ix86_using_red_zone (void)
2635 {
2636 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2637 }
2638 \f
2639 /* Return a string that documents the current -m options. The caller is
2640 responsible for freeing the string. */
2641
2642 static char *
2643 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2644 const char *tune, enum fpmath_unit fpmath,
2645 bool add_nl_p)
2646 {
2647 struct ix86_target_opts
2648 {
2649 const char *option; /* option string */
2650 HOST_WIDE_INT mask; /* isa mask options */
2651 };
2652
2653 /* This table is ordered so that options like -msse4.2 that imply
2654 preceding options are matched first. */
2655 static struct ix86_target_opts isa_opts[] =
2656 {
2657 { "-m64", OPTION_MASK_ISA_64BIT },
2658 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2659 { "-mfma", OPTION_MASK_ISA_FMA },
2660 { "-mxop", OPTION_MASK_ISA_XOP },
2661 { "-mlwp", OPTION_MASK_ISA_LWP },
2662 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2663 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2664 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2665 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2666 { "-msse3", OPTION_MASK_ISA_SSE3 },
2667 { "-msse2", OPTION_MASK_ISA_SSE2 },
2668 { "-msse", OPTION_MASK_ISA_SSE },
2669 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2670 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2671 { "-mmmx", OPTION_MASK_ISA_MMX },
2672 { "-mabm", OPTION_MASK_ISA_ABM },
2673 { "-mbmi", OPTION_MASK_ISA_BMI },
2674 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2675 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2676 { "-mtbm", OPTION_MASK_ISA_TBM },
2677 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2678 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2679 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2680 { "-maes", OPTION_MASK_ISA_AES },
2681 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2682 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2683 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2684 { "-mf16c", OPTION_MASK_ISA_F16C },
2685 };
2686
2687 /* Flag options. */
2688 static struct ix86_target_opts flag_opts[] =
2689 {
2690 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2691 { "-m80387", MASK_80387 },
2692 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2693 { "-malign-double", MASK_ALIGN_DOUBLE },
2694 { "-mcld", MASK_CLD },
2695 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2696 { "-mieee-fp", MASK_IEEE_FP },
2697 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2698 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2699 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2700 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2701 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2702 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2703 { "-mno-red-zone", MASK_NO_RED_ZONE },
2704 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2705 { "-mrecip", MASK_RECIP },
2706 { "-mrtd", MASK_RTD },
2707 { "-msseregparm", MASK_SSEREGPARM },
2708 { "-mstack-arg-probe", MASK_STACK_PROBE },
2709 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2710 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2711 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2712 { "-mvzeroupper", MASK_VZEROUPPER },
2713 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2714 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2715 { "-mprefer-avx128", MASK_PREFER_AVX128},
2716 };
2717
2718 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2719
2720 char isa_other[40];
2721 char target_other[40];
2722 unsigned num = 0;
2723 unsigned i, j;
2724 char *ret;
2725 char *ptr;
2726 size_t len;
2727 size_t line_len;
2728 size_t sep_len;
2729
2730 memset (opts, '\0', sizeof (opts));
2731
2732 /* Add -march= option. */
2733 if (arch)
2734 {
2735 opts[num][0] = "-march=";
2736 opts[num++][1] = arch;
2737 }
2738
2739 /* Add -mtune= option. */
2740 if (tune)
2741 {
2742 opts[num][0] = "-mtune=";
2743 opts[num++][1] = tune;
2744 }
2745
2746 /* Pick out the ISA options. */
2747 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2748 {
2749 if ((isa & isa_opts[i].mask) != 0)
2750 {
2751 opts[num++][0] = isa_opts[i].option;
2752 isa &= ~ isa_opts[i].mask;
2753 }
2754 }
2755
2756 if (isa && add_nl_p)
2757 {
2758 opts[num++][0] = isa_other;
2759 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2760 isa);
2761 }
2762
2763 /* Add flag options. */
2764 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2765 {
2766 if ((flags & flag_opts[i].mask) != 0)
2767 {
2768 opts[num++][0] = flag_opts[i].option;
2769 flags &= ~ flag_opts[i].mask;
2770 }
2771 }
2772
2773 if (flags && add_nl_p)
2774 {
2775 opts[num++][0] = target_other;
2776 sprintf (target_other, "(other flags: %#x)", flags);
2777 }
2778
2779 /* Add -fpmath= option. */
2780 if (fpmath)
2781 {
2782 opts[num][0] = "-mfpmath=";
2783 switch ((int) fpmath)
2784 {
2785 case FPMATH_387:
2786 opts[num++][1] = "387";
2787 break;
2788
2789 case FPMATH_SSE:
2790 opts[num++][1] = "sse";
2791 break;
2792
2793 case FPMATH_387 | FPMATH_SSE:
2794 opts[num++][1] = "sse+387";
2795 break;
2796
2797 default:
2798 gcc_unreachable ();
2799 }
2800 }
2801
2802 /* Any options? */
2803 if (num == 0)
2804 return NULL;
2805
2806 gcc_assert (num < ARRAY_SIZE (opts));
2807
2808 /* Size the string. */
2809 len = 0;
2810 sep_len = (add_nl_p) ? 3 : 1;
2811 for (i = 0; i < num; i++)
2812 {
2813 len += sep_len;
2814 for (j = 0; j < 2; j++)
2815 if (opts[i][j])
2816 len += strlen (opts[i][j]);
2817 }
2818
2819 /* Build the string. */
2820 ret = ptr = (char *) xmalloc (len);
2821 line_len = 0;
2822
2823 for (i = 0; i < num; i++)
2824 {
2825 size_t len2[2];
2826
2827 for (j = 0; j < 2; j++)
2828 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2829
2830 if (i != 0)
2831 {
2832 *ptr++ = ' ';
2833 line_len++;
2834
2835 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2836 {
2837 *ptr++ = '\\';
2838 *ptr++ = '\n';
2839 line_len = 0;
2840 }
2841 }
2842
2843 for (j = 0; j < 2; j++)
2844 if (opts[i][j])
2845 {
2846 memcpy (ptr, opts[i][j], len2[j]);
2847 ptr += len2[j];
2848 line_len += len2[j];
2849 }
2850 }
2851
2852 *ptr = '\0';
2853 gcc_assert (ret + len >= ptr);
2854
2855 return ret;
2856 }
2857
2858 /* Return true if profiling code should be emitted before the prologue;
2859 otherwise return false. Note: for x86 hot patching ("hotfix", as with
2860 -mfentry) the profiling call has to come before the prologue. */
2861 static bool
2862 ix86_profile_before_prologue (void)
2863 {
2864 return flag_fentry != 0;
2865 }
2866
2867 /* Function that is callable from the debugger to print the current
2868 options. */
2869 void
2870 ix86_debug_options (void)
2871 {
2872 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2873 ix86_arch_string, ix86_tune_string,
2874 ix86_fpmath, true);
2875
2876 if (opts)
2877 {
2878 fprintf (stderr, "%s\n\n", opts);
2879 free (opts);
2880 }
2881 else
2882 fputs ("<no options>\n\n", stderr);
2883
2884 return;
2885 }
2886 \f
2887 /* Override various settings based on options. If MAIN_ARGS_P, the
2888 options are from the command line, otherwise they are from
2889 attributes. */
2890
2891 static void
2892 ix86_option_override_internal (bool main_args_p)
2893 {
2894 int i;
2895 unsigned int ix86_arch_mask, ix86_tune_mask;
2896 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2897 const char *prefix;
2898 const char *suffix;
2899 const char *sw;
2900
2901 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2902 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2903 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2904 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2905 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2906 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2907 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2908 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2909 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2910 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2911 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2912 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2913 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2914 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2915 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2916 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2917 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2918 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2919 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2920 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2921 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2922 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2923 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2924 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2925 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2926 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2927 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2928 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2929 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2930 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2931 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2932 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2933 /* if this reaches 64, need to widen struct pta flags below */
2934
2935 static struct pta
2936 {
2937 const char *const name; /* processor name or nickname. */
2938 const enum processor_type processor;
2939 const enum attr_cpu schedule;
2940 const unsigned HOST_WIDE_INT flags;
2941 }
2942 const processor_alias_table[] =
2943 {
2944 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2945 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2946 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2947 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2948 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2949 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2950 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2951 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2952 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2953 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2954 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2955 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2956 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2957 PTA_MMX | PTA_SSE},
2958 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2959 PTA_MMX | PTA_SSE},
2960 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2961 PTA_MMX | PTA_SSE | PTA_SSE2},
2962 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2963 PTA_MMX |PTA_SSE | PTA_SSE2},
2964 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2965 PTA_MMX | PTA_SSE | PTA_SSE2},
2966 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2967 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2968 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2969 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2970 | PTA_CX16 | PTA_NO_SAHF},
2971 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2972 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2973 | PTA_SSSE3 | PTA_CX16},
2974 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2975 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2976 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
2977 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2978 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2979 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2980 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2981 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2982 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2983 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2984 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2985 | PTA_RDRND | PTA_F16C},
2986 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2987 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2988 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2989 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2990 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2991 | PTA_FMA | PTA_MOVBE},
2992 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2993 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2994 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
2995 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2996 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A |PTA_PREFETCH_SSE},
2997 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2998 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2999 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3000 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3001 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3002 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3003 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3004 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3005 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3006 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3007 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3008 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3009 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3010 {"x86-64", PROCESSOR_K8, CPU_K8,
3011 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3012 {"k8", PROCESSOR_K8, CPU_K8,
3013 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3014 | PTA_SSE2 | PTA_NO_SAHF},
3015 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3016 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3017 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3018 {"opteron", PROCESSOR_K8, CPU_K8,
3019 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3020 | PTA_SSE2 | PTA_NO_SAHF},
3021 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3022 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3023 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3024 {"athlon64", PROCESSOR_K8, CPU_K8,
3025 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3026 | PTA_SSE2 | PTA_NO_SAHF},
3027 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3028 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3029 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3030 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3031 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3032 | PTA_SSE2 | PTA_NO_SAHF},
3033 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3034 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3035 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3036 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3037 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3038 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3039 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3040 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3041 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3042 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3043 | PTA_XOP | PTA_LWP},
3044 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3045 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3046 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3047 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3048 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3049 | PTA_FMA},
3050 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3051 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3052 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16},
3053 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3054 0 /* flags are only used for -march switch. */ },
3055 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3056 PTA_64BIT /* flags are only used for -march switch. */ },
3057 };
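/* A rough sketch (not part of the build) of how the alias table above is
   consumed: the -march string is looked up by name, the processor and
   schedule fields select ix86_arch and ix86_schedule, and each PTA_* flag
   enables the matching ISA bit unless the user set that bit explicitly.  */
#if 0
  for (i = 0; i < pta_size; i++)
    if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
      {
	ix86_schedule = processor_alias_table[i].schedule;
	ix86_arch = processor_alias_table[i].processor;

	if (processor_alias_table[i].flags & PTA_SSE2
	    && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
	  ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
	/* ...and similarly for the other PTA_* flags.  */
	break;
      }
#endif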
3058
3059 /* -mrecip options. */
3060 static struct
3061 {
3062 const char *string; /* option name */
3063 unsigned int mask; /* mask bits to set */
3064 }
3065 const recip_options[] =
3066 {
3067 { "all", RECIP_MASK_ALL },
3068 { "none", RECIP_MASK_NONE },
3069 { "div", RECIP_MASK_DIV },
3070 { "sqrt", RECIP_MASK_SQRT },
3071 { "vec-div", RECIP_MASK_VEC_DIV },
3072 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3073 };
3074
3075 int const pta_size = ARRAY_SIZE (processor_alias_table);
3076
3077 /* Set up prefix/suffix so the error messages refer to either the command
3078 line argument, or the attribute(target). */
3079 if (main_args_p)
3080 {
3081 prefix = "-m";
3082 suffix = "";
3083 sw = "switch";
3084 }
3085 else
3086 {
3087 prefix = "option(\"";
3088 suffix = "\")";
3089 sw = "attribute";
3090 }
3091
3092 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3093 SUBTARGET_OVERRIDE_OPTIONS;
3094 #endif
3095
3096 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3097 SUBSUBTARGET_OVERRIDE_OPTIONS;
3098 #endif
3099
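/* The x32 ABI uses 32-bit pointers on top of the 64-bit instruction set,
   so -mx32 implies the 64-bit ISA flag. */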
3100 if (TARGET_X32)
3101 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3102
3103 /* -fPIC is the default for x86_64 Darwin. */
3104 if (TARGET_MACHO && TARGET_64BIT)
3105 flag_pic = 2;
3106
3107 /* Need to check -mtune=generic first. */
3108 if (ix86_tune_string)
3109 {
3110 if (!strcmp (ix86_tune_string, "generic")
3111 || !strcmp (ix86_tune_string, "i686")
3112 /* As special support for cross compilers we read -mtune=native
3113 as -mtune=generic. With native compilers we won't see the
3114 -mtune=native, as it was changed by the driver. */
3115 || !strcmp (ix86_tune_string, "native"))
3116 {
3117 if (TARGET_64BIT)
3118 ix86_tune_string = "generic64";
3119 else
3120 ix86_tune_string = "generic32";
3121 }
3122 /* If this call is for setting the option attribute, allow the
3123 generic32/generic64 that was previously set. */
3124 else if (!main_args_p
3125 && (!strcmp (ix86_tune_string, "generic32")
3126 || !strcmp (ix86_tune_string, "generic64")))
3127 ;
3128 else if (!strncmp (ix86_tune_string, "generic", 7))
3129 error ("bad value (%s) for %stune=%s %s",
3130 ix86_tune_string, prefix, suffix, sw);
3131 else if (!strcmp (ix86_tune_string, "x86-64"))
3132 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3133 "%stune=k8%s or %stune=generic%s instead as appropriate",
3134 prefix, suffix, prefix, suffix, prefix, suffix);
3135 }
3136 else
3137 {
3138 if (ix86_arch_string)
3139 ix86_tune_string = ix86_arch_string;
3140 if (!ix86_tune_string)
3141 {
3142 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3143 ix86_tune_defaulted = 1;
3144 }
3145
3146 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3147 need to use a sensible tune option. */
3148 if (!strcmp (ix86_tune_string, "generic")
3149 || !strcmp (ix86_tune_string, "x86-64")
3150 || !strcmp (ix86_tune_string, "i686"))
3151 {
3152 if (TARGET_64BIT)
3153 ix86_tune_string = "generic64";
3154 else
3155 ix86_tune_string = "generic32";
3156 }
3157 }
3158
3159 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3160 {
3161 /* rep; movq isn't available in 32-bit code. */
3162 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3163 ix86_stringop_alg = no_stringop;
3164 }
3165
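/* Default -march= to x86-64 for 64-bit compilers and to the configured
   32-bit subtarget default otherwise; remember whether the user gave
   -march explicitly. */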
3166 if (!ix86_arch_string)
3167 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3168 else
3169 ix86_arch_specified = 1;
3170
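/* Default the function calling ABI (SysV vs. MS) when -mabi= was not
   given explicitly. */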
3171 if (!global_options_set.x_ix86_abi)
3172 ix86_abi = DEFAULT_ABI;
3173
3174 if (global_options_set.x_ix86_cmodel)
3175 {
3176 switch (ix86_cmodel)
3177 {
3178 case CM_SMALL:
3179 case CM_SMALL_PIC:
3180 if (flag_pic)
3181 ix86_cmodel = CM_SMALL_PIC;
3182 if (!TARGET_64BIT)
3183 error ("code model %qs not supported in the %s bit mode",
3184 "small", "32");
3185 break;
3186
3187 case CM_MEDIUM:
3188 case CM_MEDIUM_PIC:
3189 if (flag_pic)
3190 ix86_cmodel = CM_MEDIUM_PIC;
3191 if (!TARGET_64BIT)
3192 error ("code model %qs not supported in the %s bit mode",
3193 "medium", "32");
3194 else if (TARGET_X32)
3195 error ("code model %qs not supported in x32 mode",
3196 "medium");
3197 break;
3198
3199 case CM_LARGE:
3200 case CM_LARGE_PIC:
3201 if (flag_pic)
3202 ix86_cmodel = CM_LARGE_PIC;
3203 if (!TARGET_64BIT)
3204 error ("code model %qs not supported in the %s bit mode",
3205 "large", "32");
3206 else if (TARGET_X32)
3207 error ("code model %qs not supported in x32 mode",
3208 "medium");
3209 break;
3210
3211 case CM_32:
3212 if (flag_pic)
3213 error ("code model %s does not support PIC mode", "32");
3214 if (TARGET_64BIT)
3215 error ("code model %qs not supported in the %s bit mode",
3216 "32", "64");
3217 break;
3218
3219 case CM_KERNEL:
3220 if (flag_pic)
3221 {
3222 error ("code model %s does not support PIC mode", "kernel");
3223 ix86_cmodel = CM_32;
3224 }
3225 if (!TARGET_64BIT)
3226 error ("code model %qs not supported in the %s bit mode",
3227 "kernel", "32");
3228 break;
3229
3230 default:
3231 gcc_unreachable ();
3232 }
3233 }
3234 else
3235 {
3236 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3237 use of rip-relative addressing. This eliminates fixups that
3238 would otherwise be needed if this object is to be placed in a
3239 DLL, and is essentially just as efficient as direct addressing. */
3240 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3241 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3242 else if (TARGET_64BIT)
3243 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3244 else
3245 ix86_cmodel = CM_32;
3246 }
3247 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3248 {
3249 error ("-masm=intel not supported in this configuration");
3250 ix86_asm_dialect = ASM_ATT;
3251 }
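/* Reject a request for a 32-bit or 64-bit mode that this compiler was
   not configured to support. */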
3252 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3253 sorry ("%i-bit mode not compiled in",
3254 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3255
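/* Look the -march= CPU up in the alias table; it determines the
   architecture, the default tuning and the ISA extensions to enable
   (unless the user overrode those explicitly). */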
3256 for (i = 0; i < pta_size; i++)
3257 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3258 {
3259 ix86_schedule = processor_alias_table[i].schedule;
3260 ix86_arch = processor_alias_table[i].processor;
3261 /* Default cpu tuning to the architecture. */
3262 ix86_tune = ix86_arch;
3263
3264 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3265 error ("CPU you selected does not support x86-64 "
3266 "instruction set");
3267
3268 if (processor_alias_table[i].flags & PTA_MMX
3269 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3270 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3271 if (processor_alias_table[i].flags & PTA_3DNOW
3272 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3273 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3274 if (processor_alias_table[i].flags & PTA_3DNOW_A
3275 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3276 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3277 if (processor_alias_table[i].flags & PTA_SSE
3278 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3279 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3280 if (processor_alias_table[i].flags & PTA_SSE2
3281 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3282 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3283 if (processor_alias_table[i].flags & PTA_SSE3
3284 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3285 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3286 if (processor_alias_table[i].flags & PTA_SSSE3
3287 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3288 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3289 if (processor_alias_table[i].flags & PTA_SSE4_1
3290 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3291 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3292 if (processor_alias_table[i].flags & PTA_SSE4_2
3293 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3294 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3295 if (processor_alias_table[i].flags & PTA_AVX
3296 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3297 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3298 if (processor_alias_table[i].flags & PTA_AVX2
3299 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3300 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3301 if (processor_alias_table[i].flags & PTA_FMA
3302 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3303 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3304 if (processor_alias_table[i].flags & PTA_SSE4A
3305 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3306 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3307 if (processor_alias_table[i].flags & PTA_FMA4
3308 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3309 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3310 if (processor_alias_table[i].flags & PTA_XOP
3311 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3312 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3313 if (processor_alias_table[i].flags & PTA_LWP
3314 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3315 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3316 if (processor_alias_table[i].flags & PTA_ABM
3317 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3318 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3319 if (processor_alias_table[i].flags & PTA_BMI
3320 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3321 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3322 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3323 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3324 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3325 if (processor_alias_table[i].flags & PTA_TBM
3326 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3327 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3328 if (processor_alias_table[i].flags & PTA_BMI2
3329 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3330 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3331 if (processor_alias_table[i].flags & PTA_CX16
3332 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3333 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3334 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3335 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3336 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3337 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3338 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3339 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3340 if (processor_alias_table[i].flags & PTA_MOVBE
3341 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3342 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3343 if (processor_alias_table[i].flags & PTA_AES
3344 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3345 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3346 if (processor_alias_table[i].flags & PTA_PCLMUL
3347 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3348 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3349 if (processor_alias_table[i].flags & PTA_FSGSBASE
3350 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3351 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3352 if (processor_alias_table[i].flags & PTA_RDRND
3353 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3354 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3355 if (processor_alias_table[i].flags & PTA_F16C
3356 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3357 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3358 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3359 x86_prefetch_sse = true;
3360
3361 break;
3362 }
3363
3364 if (!strcmp (ix86_arch_string, "generic"))
3365 error ("generic CPU can be used only for %stune=%s %s",
3366 prefix, suffix, sw);
3367 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3368 error ("bad value (%s) for %sarch=%s %s",
3369 ix86_arch_string, prefix, suffix, sw);
3370
3371 ix86_arch_mask = 1u << ix86_arch;
3372 for (i = 0; i < X86_ARCH_LAST; ++i)
3373 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3374
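/* Look the -mtune= CPU up in the alias table; it only affects scheduling
   and tuning, and is adjusted below when the chosen CPU does not match
   the 32-bit/64-bit compilation mode. */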
3375 for (i = 0; i < pta_size; i++)
3376 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3377 {
3378 ix86_schedule = processor_alias_table[i].schedule;
3379 ix86_tune = processor_alias_table[i].processor;
3380 if (TARGET_64BIT)
3381 {
3382 if (!(processor_alias_table[i].flags & PTA_64BIT))
3383 {
3384 if (ix86_tune_defaulted)
3385 {
3386 ix86_tune_string = "x86-64";
3387 for (i = 0; i < pta_size; i++)
3388 if (! strcmp (ix86_tune_string,
3389 processor_alias_table[i].name))
3390 break;
3391 ix86_schedule = processor_alias_table[i].schedule;
3392 ix86_tune = processor_alias_table[i].processor;
3393 }
3394 else
3395 error ("CPU you selected does not support x86-64 "
3396 "instruction set");
3397 }
3398 }
3399 else
3400 {
3401 /* Adjust tuning when compiling for 32-bit ABI. */
3402 switch (ix86_tune)
3403 {
3404 case PROCESSOR_GENERIC64:
3405 ix86_tune = PROCESSOR_GENERIC32;
3406 ix86_schedule = CPU_PENTIUMPRO;
3407 break;
3408
3409 case PROCESSOR_CORE2_64:
3410 ix86_tune = PROCESSOR_CORE2_32;
3411 break;
3412
3413 case PROCESSOR_COREI7_64:
3414 ix86_tune = PROCESSOR_COREI7_32;
3415 break;
3416
3417 default:
3418 break;
3419 }
3420 }
3421 /* Intel CPUs have always interpreted SSE prefetch instructions as
3422 NOPs; so, we can enable SSE prefetch instructions even when
3423 -mtune (rather than -march) points us to a processor that has them.
3424 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3425 higher processors. */
3426 if (TARGET_CMOVE
3427 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3428 x86_prefetch_sse = true;
3429 break;
3430 }
3431
3432 if (ix86_tune_specified && i == pta_size)
3433 error ("bad value (%s) for %stune=%s %s",
3434 ix86_tune_string, prefix, suffix, sw);
3435
3436 ix86_tune_mask = 1u << ix86_tune;
3437 for (i = 0; i < X86_TUNE_LAST; ++i)
3438 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3439
3440 #ifndef USE_IX86_FRAME_POINTER
3441 #define USE_IX86_FRAME_POINTER 0
3442 #endif
3443
3444 #ifndef USE_X86_64_FRAME_POINTER
3445 #define USE_X86_64_FRAME_POINTER 0
3446 #endif
3447
3448 /* Set the default values for switches whose default depends on TARGET_64BIT
3449 in case they weren't overwritten by command line options. */
3450 if (TARGET_64BIT)
3451 {
3452 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3453 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3454 if (flag_asynchronous_unwind_tables == 2)
3455 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3456 if (flag_pcc_struct_return == 2)
3457 flag_pcc_struct_return = 0;
3458 }
3459 else
3460 {
3461 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3462 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3463 if (flag_asynchronous_unwind_tables == 2)
3464 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3465 if (flag_pcc_struct_return == 2)
3466 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3467 }
3468
3469 if (optimize_size)
3470 ix86_cost = &ix86_size_cost;
3471 else
3472 ix86_cost = processor_target_table[ix86_tune].cost;
3473
3474 /* Arrange to set up i386_stack_locals for all functions. */
3475 init_machine_status = ix86_init_machine_status;
3476
3477 /* Validate -mregparm= value. */
3478 if (global_options_set.x_ix86_regparm)
3479 {
3480 if (TARGET_64BIT)
3481 warning (0, "-mregparm is ignored in 64-bit mode");
3482 if (ix86_regparm > REGPARM_MAX)
3483 {
3484 error ("-mregparm=%d is not between 0 and %d",
3485 ix86_regparm, REGPARM_MAX);
3486 ix86_regparm = 0;
3487 }
3488 }
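/* The 64-bit ABI always passes the first integer arguments in registers,
   so -mregparm has no effect there; use the maximum. */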
3489 if (TARGET_64BIT)
3490 ix86_regparm = REGPARM_MAX;
3491
3492 /* Default align_* from the processor table. */
3493 if (align_loops == 0)
3494 {
3495 align_loops = processor_target_table[ix86_tune].align_loop;
3496 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3497 }
3498 if (align_jumps == 0)
3499 {
3500 align_jumps = processor_target_table[ix86_tune].align_jump;
3501 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3502 }
3503 if (align_functions == 0)
3504 {
3505 align_functions = processor_target_table[ix86_tune].align_func;
3506 }
3507
3508 /* Provide default for -mbranch-cost= value. */
3509 if (!global_options_set.x_ix86_branch_cost)
3510 ix86_branch_cost = ix86_cost->branch_cost;
3511
3512 if (TARGET_64BIT)
3513 {
3514 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3515
3516 /* Enable by default the SSE and MMX builtins. Do allow the user to
3517 explicitly disable any of these. In particular, disabling SSE and
3518 MMX for kernel code is extremely useful. */
3519 if (!ix86_arch_specified)
3520 ix86_isa_flags
3521 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3522 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3523
3524 if (TARGET_RTD)
3525 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3526 }
3527 else
3528 {
3529 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3530
3531 if (!ix86_arch_specified)
3532 ix86_isa_flags
3533 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3534
3535 /* The i386 ABI does not specify a red zone. It still makes sense to use
3536 it when the programmer takes care to keep the stack from being destroyed. */
3537 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3538 target_flags |= MASK_NO_RED_ZONE;
3539 }
3540
3541 /* Keep nonleaf frame pointers. */
3542 if (flag_omit_frame_pointer)
3543 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3544 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3545 flag_omit_frame_pointer = 1;
3546
3547 /* If we're doing fast math, we don't care about comparison order
3548 wrt NaNs. This lets us use a shorter comparison sequence. */
3549 if (flag_finite_math_only)
3550 target_flags &= ~MASK_IEEE_FP;
3551
3552 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3553 since the insns won't need emulation. */
3554 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3555 target_flags &= ~MASK_NO_FANCY_MATH_387;
3556
3557 /* Likewise, if the target doesn't have a 387, or we've specified
3558 software floating point, don't use 387 inline intrinsics. */
3559 if (!TARGET_80387)
3560 target_flags |= MASK_NO_FANCY_MATH_387;
3561
3562 /* Turn on MMX builtins for -msse. */
3563 if (TARGET_SSE)
3564 {
3565 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3566 x86_prefetch_sse = true;
3567 }
3568
3569 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3570 if (TARGET_SSE4_2 || TARGET_ABM)
3571 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3572
3573 /* Turn on lzcnt instruction for -mabm. */
3574 if (TARGET_ABM)
3575 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3576
3577 /* Validate -mpreferred-stack-boundary= value or default it to
3578 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3579 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3580 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3581 {
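/* The argument is the log2 of the boundary in bytes: 64-bit code needs
   at least 16-byte alignment, 32-bit at least 4 bytes, and SEH targets
   are limited to 16 bytes. */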
3582 int min = (TARGET_64BIT ? 4 : 2);
3583 int max = (TARGET_SEH ? 4 : 12);
3584
3585 if (ix86_preferred_stack_boundary_arg < min
3586 || ix86_preferred_stack_boundary_arg > max)
3587 {
3588 if (min == max)
3589 error ("-mpreferred-stack-boundary is not supported "
3590 "for this target");
3591 else
3592 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3593 ix86_preferred_stack_boundary_arg, min, max);
3594 }
3595 else
3596 ix86_preferred_stack_boundary
3597 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3598 }
3599
3600 /* Set the default value for -mstackrealign. */
3601 if (ix86_force_align_arg_pointer == -1)
3602 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3603
3604 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3605
3606 /* Validate -mincoming-stack-boundary= value or default it to
3607 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3608 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3609 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3610 {
3611 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3612 || ix86_incoming_stack_boundary_arg > 12)
3613 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3614 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3615 else
3616 {
3617 ix86_user_incoming_stack_boundary
3618 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3619 ix86_incoming_stack_boundary
3620 = ix86_user_incoming_stack_boundary;
3621 }
3622 }
3623
3624 /* Accept -msseregparm only if at least SSE support is enabled. */
3625 if (TARGET_SSEREGPARM
3626 && ! TARGET_SSE)
3627 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3628
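/* Check the -mfpmath= selection against the enabled instruction sets and
   fall back to whichever unit is actually available. */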
3629 if (global_options_set.x_ix86_fpmath)
3630 {
3631 if (ix86_fpmath & FPMATH_SSE)
3632 {
3633 if (!TARGET_SSE)
3634 {
3635 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3636 ix86_fpmath = FPMATH_387;
3637 }
3638 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3639 {
3640 warning (0, "387 instruction set disabled, using SSE arithmetics");
3641 ix86_fpmath = FPMATH_SSE;
3642 }
3643 }
3644 }
3645 else
3646 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3647
3648 /* If the i387 is disabled, then do not return values in it. */
3649 if (!TARGET_80387)
3650 target_flags &= ~MASK_FLOAT_RETURNS;
3651
3652 /* Use external vectorized library in vectorizing intrinsics. */
3653 if (global_options_set.x_ix86_veclibabi_type)
3654 switch (ix86_veclibabi_type)
3655 {
3656 case ix86_veclibabi_type_svml:
3657 ix86_veclib_handler = ix86_veclibabi_svml;
3658 break;
3659
3660 case ix86_veclibabi_type_acml:
3661 ix86_veclib_handler = ix86_veclibabi_acml;
3662 break;
3663
3664 default:
3665 gcc_unreachable ();
3666 }
3667
3668 if ((!USE_IX86_FRAME_POINTER
3669 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3670 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3671 && !optimize_size)
3672 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3673
3674 /* ??? Unwind info is not correct around the CFG unless either a frame
3675 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3676 unwind info generation to be aware of the CFG and propagating states
3677 around edges. */
3678 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3679 || flag_exceptions || flag_non_call_exceptions)
3680 && flag_omit_frame_pointer
3681 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3682 {
3683 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3684 warning (0, "unwind tables currently require either a frame pointer "
3685 "or %saccumulate-outgoing-args%s for correctness",
3686 prefix, suffix);
3687 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3688 }
3689
3690 /* If stack probes are required, the space used for large function
3691 arguments on the stack must also be probed, so enable
3692 -maccumulate-outgoing-args so this happens in the prologue. */
3693 if (TARGET_STACK_PROBE
3694 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3695 {
3696 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3697 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3698 "for correctness", prefix, suffix);
3699 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3700 }
3701
3702 /* For sane SSE instruction set generation we need the fcomi instruction.
3703 It is safe to enable all CMOVE instructions. Also, the RDRAND intrinsic
3704 expands to a sequence that includes a conditional move. */
3705 if (TARGET_SSE || TARGET_RDRND)
3706 TARGET_CMOVE = 1;
3707
3708 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3709 {
3710 char *p;
3711 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3712 p = strchr (internal_label_prefix, 'X');
3713 internal_label_prefix_len = p - internal_label_prefix;
3714 *p = '\0';
3715 }
3716
3717 /* When no scheduling description is available, disable the scheduler pass
3718 so it won't slow down the compilation and make x87 code slower. */
3719 if (!TARGET_SCHEDULE)
3720 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3721
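/* Seed the prefetch and cache-size params from the cost table of the CPU
   we are tuning for; maybe_set_param_value leaves user-specified values
   alone. */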
3722 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3723 ix86_cost->simultaneous_prefetches,
3724 global_options.x_param_values,
3725 global_options_set.x_param_values);
3726 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3727 global_options.x_param_values,
3728 global_options_set.x_param_values);
3729 maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3730 global_options.x_param_values,
3731 global_options_set.x_param_values);
3732 maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3733 global_options.x_param_values,
3734 global_options_set.x_param_values);
3735
3736 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3737 if (flag_prefetch_loop_arrays < 0
3738 && HAVE_prefetch
3739 && optimize >= 3
3740 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3741 flag_prefetch_loop_arrays = 1;
3742
3743 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3744 can be optimized to ap = __builtin_next_arg (0). */
3745 if (!TARGET_64BIT && !flag_split_stack)
3746 targetm.expand_builtin_va_start = NULL;
3747
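/* Select the DImode or SImode variants of the RTL expander helpers used
   later for prologue/epilogue and builtin expansion. */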
3748 if (TARGET_64BIT)
3749 {
3750 ix86_gen_leave = gen_leave_rex64;
3751 ix86_gen_add3 = gen_adddi3;
3752 ix86_gen_sub3 = gen_subdi3;
3753 ix86_gen_sub3_carry = gen_subdi3_carry;
3754 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3755 ix86_gen_monitor = gen_sse3_monitor64;
3756 ix86_gen_andsp = gen_anddi3;
3757 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3758 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3759 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3760 }
3761 else
3762 {
3763 ix86_gen_leave = gen_leave;
3764 ix86_gen_add3 = gen_addsi3;
3765 ix86_gen_sub3 = gen_subsi3;
3766 ix86_gen_sub3_carry = gen_subsi3_carry;
3767 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3768 ix86_gen_monitor = gen_sse3_monitor;
3769 ix86_gen_andsp = gen_andsi3;
3770 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3771 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3772 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3773 }
3774
3775 #ifdef USE_IX86_CLD
3776 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3777 if (!TARGET_64BIT)
3778 target_flags |= MASK_CLD & ~target_flags_explicit;
3779 #endif
3780
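/* -mfentry is not supported for 32-bit PIC code, is required by SEH, and
   otherwise defaults to PROFILE_BEFORE_PROLOGUE. */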
3781 if (!TARGET_64BIT && flag_pic)
3782 {
3783 if (flag_fentry > 0)
3784 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3785 "with -fpic");
3786 flag_fentry = 0;
3787 }
3788 else if (TARGET_SEH)
3789 {
3790 if (flag_fentry == 0)
3791 sorry ("-mno-fentry isn%'t compatible with SEH");
3792 flag_fentry = 1;
3793 }
3794 else if (flag_fentry < 0)
3795 {
3796 #if defined(PROFILE_BEFORE_PROLOGUE)
3797 flag_fentry = 1;
3798 #else
3799 flag_fentry = 0;
3800 #endif
3801 }
3802
3803 if (TARGET_AVX)
3804 {
3805 /* When not optimizing for size, enable the vzeroupper optimization for
3806 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3807 AVX unaligned loads/stores. */
3808 if (!optimize_size)
3809 {
3810 if (flag_expensive_optimizations
3811 && !(target_flags_explicit & MASK_VZEROUPPER))
3812 target_flags |= MASK_VZEROUPPER;
3813 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3814 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3815 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3816 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3817 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3818 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3819 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3820 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3821 target_flags |= MASK_PREFER_AVX128;
3822 }
3823 }
3824 else
3825 {
3826 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3827 target_flags &= ~MASK_VZEROUPPER;
3828 }
3829
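/* Parse the comma-separated -mrecip= list. Each entry may be prefixed
   with '!' to turn the corresponding approximation off, and "default"
   stands for all of them. */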
3830 if (ix86_recip_name)
3831 {
3832 char *p = ASTRDUP (ix86_recip_name);
3833 char *q;
3834 unsigned int mask, i;
3835 bool invert;
3836
3837 while ((q = strtok (p, ",")) != NULL)
3838 {
3839 p = NULL;
3840 if (*q == '!')
3841 {
3842 invert = true;
3843 q++;
3844 }
3845 else
3846 invert = false;
3847
3848 if (!strcmp (q, "default"))
3849 mask = RECIP_MASK_ALL;
3850 else
3851 {
3852 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3853 if (!strcmp (q, recip_options[i].string))
3854 {
3855 mask = recip_options[i].mask;
3856 break;
3857 }
3858
3859 if (i == ARRAY_SIZE (recip_options))
3860 {
3861 error ("unknown option for -mrecip=%s", q);
3862 invert = false;
3863 mask = RECIP_MASK_NONE;
3864 }
3865 }
3866
3867 recip_mask_explicit |= mask;
3868 if (invert)
3869 recip_mask &= ~mask;
3870 else
3871 recip_mask |= mask;
3872 }
3873 }
3874
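/* Plain -mrecip (or -mno-recip) turns every approximation on (or off)
   except those the user listed individually in -mrecip=. */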
3875 if (TARGET_RECIP)
3876 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3877 else if (target_flags_explicit & MASK_RECIP)
3878 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3879
3880 /* Save the initial options in case the user specifies function specific
3881 options later. */
3882 if (main_args_p)
3883 target_option_default_node = target_option_current_node
3884 = build_target_option_node ();
3885 }
3886
3887 /* Return TRUE if VAL is passed in register with 256bit AVX modes. */
3888
3889 static bool
3890 function_pass_avx256_p (const_rtx val)
3891 {
3892 if (!val)
3893 return false;
3894
3895 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3896 return true;
3897
3898 if (GET_CODE (val) == PARALLEL)
3899 {
3900 int i;
3901 rtx r;
3902
3903 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3904 {
3905 r = XVECEXP (val, 0, i);
3906 if (GET_CODE (r) == EXPR_LIST
3907 && XEXP (r, 0)
3908 && REG_P (XEXP (r, 0))
3909 && (GET_MODE (XEXP (r, 0)) == OImode
3910 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
3911 return true;
3912 }
3913 }
3914
3915 return false;
3916 }
3917
3918 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3919
3920 static void
3921 ix86_option_override (void)
3922 {
3923 ix86_option_override_internal (true);
3924 }
3925
3926 /* Update register usage after having seen the compiler flags. */
3927
3928 static void
3929 ix86_conditional_register_usage (void)
3930 {
3931 int i;
3932 unsigned int j;
3933
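/* Values greater than 1 in fixed_regs/call_used_regs encode a
   conditional default: 2 applies only in 32-bit mode, 3 only in
   64-bit mode. */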
3934 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3935 {
3936 if (fixed_regs[i] > 1)
3937 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
3938 if (call_used_regs[i] > 1)
3939 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
3940 }
3941
3942 /* The PIC register, if it exists, is fixed. */
3943 j = PIC_OFFSET_TABLE_REGNUM;
3944 if (j != INVALID_REGNUM)
3945 fixed_regs[j] = call_used_regs[j] = 1;
3946
3947 /* The 64-bit MS_ABI changes the set of call-used registers. */
3948 if (TARGET_64BIT_MS_ABI)
3949 {
3950 call_used_regs[SI_REG] = 0;
3951 call_used_regs[DI_REG] = 0;
3952 call_used_regs[XMM6_REG] = 0;
3953 call_used_regs[XMM7_REG] = 0;
3954 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3955 call_used_regs[i] = 0;
3956 }
3957
3958 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
3959 other call-clobbered regs for 64-bit. */
3960 if (TARGET_64BIT)
3961 {
3962 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3963
3964 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3965 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3966 && call_used_regs[i])
3967 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3968 }
3969
3970 /* If MMX is disabled, squash the registers. */
3971 if (! TARGET_MMX)
3972 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3973 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3974 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3975
3976 /* If SSE is disabled, squash the registers. */
3977 if (! TARGET_SSE)
3978 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3979 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3980 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3981
3982 /* If the FPU is disabled, squash the registers. */
3983 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3984 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3985 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3986 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3987
3988 /* If 32-bit, squash the 64-bit registers. */
3989 if (! TARGET_64BIT)
3990 {
3991 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3992 reg_names[i] = "";
3993 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3994 reg_names[i] = "";
3995 }
3996 }
3997
3998 \f
3999 /* Save the current options */
4000
4001 static void
4002 ix86_function_specific_save (struct cl_target_option *ptr)
4003 {
4004 ptr->arch = ix86_arch;
4005 ptr->schedule = ix86_schedule;
4006 ptr->tune = ix86_tune;
4007 ptr->branch_cost = ix86_branch_cost;
4008 ptr->tune_defaulted = ix86_tune_defaulted;
4009 ptr->arch_specified = ix86_arch_specified;
4010 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4011 ptr->ix86_target_flags_explicit = target_flags_explicit;
4012 ptr->x_recip_mask_explicit = recip_mask_explicit;
4013
4014 /* The fields are char but the variables are not; make sure the
4015 values fit in the fields. */
4016 gcc_assert (ptr->arch == ix86_arch);
4017 gcc_assert (ptr->schedule == ix86_schedule);
4018 gcc_assert (ptr->tune == ix86_tune);
4019 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4020 }
4021
4022 /* Restore the current options */
4023
4024 static void
4025 ix86_function_specific_restore (struct cl_target_option *ptr)
4026 {
4027 enum processor_type old_tune = ix86_tune;
4028 enum processor_type old_arch = ix86_arch;
4029 unsigned int ix86_arch_mask, ix86_tune_mask;
4030 int i;
4031
4032 ix86_arch = (enum processor_type) ptr->arch;
4033 ix86_schedule = (enum attr_cpu) ptr->schedule;
4034 ix86_tune = (enum processor_type) ptr->tune;
4035 ix86_branch_cost = ptr->branch_cost;
4036 ix86_tune_defaulted = ptr->tune_defaulted;
4037 ix86_arch_specified = ptr->arch_specified;
4038 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4039 target_flags_explicit = ptr->ix86_target_flags_explicit;
4040 recip_mask_explicit = ptr->x_recip_mask_explicit;
4041
4042 /* Recreate the arch feature tests if the arch changed */
4043 if (old_arch != ix86_arch)
4044 {
4045 ix86_arch_mask = 1u << ix86_arch;
4046 for (i = 0; i < X86_ARCH_LAST; ++i)
4047 ix86_arch_features[i]
4048 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4049 }
4050
4051 /* Recreate the tune optimization tests */
4052 if (old_tune != ix86_tune)
4053 {
4054 ix86_tune_mask = 1u << ix86_tune;
4055 for (i = 0; i < X86_TUNE_LAST; ++i)
4056 ix86_tune_features[i]
4057 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4058 }
4059 }
4060
4061 /* Print the current options */
4062
4063 static void
4064 ix86_function_specific_print (FILE *file, int indent,
4065 struct cl_target_option *ptr)
4066 {
4067 char *target_string
4068 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4069 NULL, NULL, ptr->x_ix86_fpmath, false);
4070
4071 fprintf (file, "%*sarch = %d (%s)\n",
4072 indent, "",
4073 ptr->arch,
4074 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4075 ? cpu_names[ptr->arch]
4076 : "<unknown>"));
4077
4078 fprintf (file, "%*stune = %d (%s)\n",
4079 indent, "",
4080 ptr->tune,
4081 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4082 ? cpu_names[ptr->tune]
4083 : "<unknown>"));
4084
4085 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4086
4087 if (target_string)
4088 {
4089 fprintf (file, "%*s%s\n", indent, "", target_string);
4090 free (target_string);
4091 }
4092 }
4093
4094 \f
4095 /* Inner function to process the attribute((target(...))); it takes an argument
4096 and sets the current options from that argument. If we have a list, recursively
4097 go over the list. */
4098
4099 static bool
4100 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4101 struct gcc_options *enum_opts_set)
4102 {
4103 char *next_optstr;
4104 bool ret = true;
4105
4106 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4107 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4108 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4109 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4110 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4111
4112 enum ix86_opt_type
4113 {
4114 ix86_opt_unknown,
4115 ix86_opt_yes,
4116 ix86_opt_no,
4117 ix86_opt_str,
4118 ix86_opt_enum,
4119 ix86_opt_isa
4120 };
4121
4122 static const struct
4123 {
4124 const char *string;
4125 size_t len;
4126 enum ix86_opt_type type;
4127 int opt;
4128 int mask;
4129 } attrs[] = {
4130 /* isa options */
4131 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4132 IX86_ATTR_ISA ("abm", OPT_mabm),
4133 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4134 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4135 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4136 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4137 IX86_ATTR_ISA ("aes", OPT_maes),
4138 IX86_ATTR_ISA ("avx", OPT_mavx),
4139 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4140 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4141 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4142 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4143 IX86_ATTR_ISA ("sse", OPT_msse),
4144 IX86_ATTR_ISA ("sse2", OPT_msse2),
4145 IX86_ATTR_ISA ("sse3", OPT_msse3),
4146 IX86_ATTR_ISA ("sse4", OPT_msse4),
4147 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4148 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4149 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4150 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4151 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4152 IX86_ATTR_ISA ("fma", OPT_mfma),
4153 IX86_ATTR_ISA ("xop", OPT_mxop),
4154 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4155 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4156 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4157 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4158
4159 /* enum options */
4160 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4161
4162 /* string options */
4163 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4164 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4165
4166 /* flag options */
4167 IX86_ATTR_YES ("cld",
4168 OPT_mcld,
4169 MASK_CLD),
4170
4171 IX86_ATTR_NO ("fancy-math-387",
4172 OPT_mfancy_math_387,
4173 MASK_NO_FANCY_MATH_387),
4174
4175 IX86_ATTR_YES ("ieee-fp",
4176 OPT_mieee_fp,
4177 MASK_IEEE_FP),
4178
4179 IX86_ATTR_YES ("inline-all-stringops",
4180 OPT_minline_all_stringops,
4181 MASK_INLINE_ALL_STRINGOPS),
4182
4183 IX86_ATTR_YES ("inline-stringops-dynamically",
4184 OPT_minline_stringops_dynamically,
4185 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4186
4187 IX86_ATTR_NO ("align-stringops",
4188 OPT_mno_align_stringops,
4189 MASK_NO_ALIGN_STRINGOPS),
4190
4191 IX86_ATTR_YES ("recip",
4192 OPT_mrecip,
4193 MASK_RECIP),
4194
4195 };
4196
4197 /* If this is a list, recurse to get the options. */
4198 if (TREE_CODE (args) == TREE_LIST)
4199 {
4200 bool ret = true;
4201
4202 for (; args; args = TREE_CHAIN (args))
4203 if (TREE_VALUE (args)
4204 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4205 p_strings, enum_opts_set))
4206 ret = false;
4207
4208 return ret;
4209 }
4210
4211 else if (TREE_CODE (args) != STRING_CST)
4212 gcc_unreachable ();
4213
4214 /* Handle multiple arguments separated by commas. */
4215 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4216
4217 while (next_optstr && *next_optstr != '\0')
4218 {
4219 char *p = next_optstr;
4220 char *orig_p = p;
4221 char *comma = strchr (next_optstr, ',');
4222 const char *opt_string;
4223 size_t len, opt_len;
4224 int opt;
4225 bool opt_set_p;
4226 char ch;
4227 unsigned i;
4228 enum ix86_opt_type type = ix86_opt_unknown;
4229 int mask = 0;
4230
4231 if (comma)
4232 {
4233 *comma = '\0';
4234 len = comma - next_optstr;
4235 next_optstr = comma + 1;
4236 }
4237 else
4238 {
4239 len = strlen (p);
4240 next_optstr = NULL;
4241 }
4242
4243 /* Recognize no-xxx. */
4244 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4245 {
4246 opt_set_p = false;
4247 p += 3;
4248 len -= 3;
4249 }
4250 else
4251 opt_set_p = true;
4252
4253 /* Find the option. */
4254 ch = *p;
4255 opt = N_OPTS;
4256 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4257 {
4258 type = attrs[i].type;
4259 opt_len = attrs[i].len;
4260 if (ch == attrs[i].string[0]
4261 && ((type != ix86_opt_str && type != ix86_opt_enum)
4262 ? len == opt_len
4263 : len > opt_len)
4264 && memcmp (p, attrs[i].string, opt_len) == 0)
4265 {
4266 opt = attrs[i].opt;
4267 mask = attrs[i].mask;
4268 opt_string = attrs[i].string;
4269 break;
4270 }
4271 }
4272
4273 /* Process the option. */
4274 if (opt == N_OPTS)
4275 {
4276 error ("attribute(target(\"%s\")) is unknown", orig_p);
4277 ret = false;
4278 }
4279
4280 else if (type == ix86_opt_isa)
4281 {
4282 struct cl_decoded_option decoded;
4283
4284 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4285 ix86_handle_option (&global_options, &global_options_set,
4286 &decoded, input_location);
4287 }
4288
4289 else if (type == ix86_opt_yes || type == ix86_opt_no)
4290 {
4291 if (type == ix86_opt_no)
4292 opt_set_p = !opt_set_p;
4293
4294 if (opt_set_p)
4295 target_flags |= mask;
4296 else
4297 target_flags &= ~mask;
4298 }
4299
4300 else if (type == ix86_opt_str)
4301 {
4302 if (p_strings[opt])
4303 {
4304 error ("option(\"%s\") was already specified", opt_string);
4305 ret = false;
4306 }
4307 else
4308 p_strings[opt] = xstrdup (p + opt_len);
4309 }
4310
4311 else if (type == ix86_opt_enum)
4312 {
4313 bool arg_ok;
4314 int value;
4315
4316 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4317 if (arg_ok)
4318 set_option (&global_options, enum_opts_set, opt, value,
4319 p + opt_len, DK_UNSPECIFIED, input_location,
4320 global_dc);
4321 else
4322 {
4323 error ("attribute(target(\"%s\")) is unknown", orig_p);
4324 ret = false;
4325 }
4326 }
4327
4328 else
4329 gcc_unreachable ();
4330 }
4331
4332 return ret;
4333 }
4334
4335 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4336
4337 tree
4338 ix86_valid_target_attribute_tree (tree args)
4339 {
4340 const char *orig_arch_string = ix86_arch_string;
4341 const char *orig_tune_string = ix86_tune_string;
4342 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4343 int orig_tune_defaulted = ix86_tune_defaulted;
4344 int orig_arch_specified = ix86_arch_specified;
4345 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4346 tree t = NULL_TREE;
4347 int i;
4348 struct cl_target_option *def
4349 = TREE_TARGET_OPTION (target_option_default_node);
4350 struct gcc_options enum_opts_set;
4351
4352 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4353
4354 /* Process each of the options on the chain. */
4355 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4356 &enum_opts_set))
4357 return NULL_TREE;
4358
4359 /* If the changed options are different from the default, rerun
4360 ix86_option_override_internal, and then save the options away.
4361 The string options are attribute options, and will be undone
4362 when we copy the save structure. */
4363 if (ix86_isa_flags != def->x_ix86_isa_flags
4364 || target_flags != def->x_target_flags
4365 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4366 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4367 || enum_opts_set.x_ix86_fpmath)
4368 {
4369 /* If we are using the default tune= or arch=, undo the string assigned,
4370 and use the default. */
4371 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4372 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4373 else if (!orig_arch_specified)
4374 ix86_arch_string = NULL;
4375
4376 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4377 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4378 else if (orig_tune_defaulted)
4379 ix86_tune_string = NULL;
4380
4381 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4382 if (enum_opts_set.x_ix86_fpmath)
4383 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4384 else if (!TARGET_64BIT && TARGET_SSE)
4385 {
4386 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4387 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4388 }
4389
4390 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4391 ix86_option_override_internal (false);
4392
4393 /* Add any builtin functions with the new isa if any. */
4394 ix86_add_new_builtins (ix86_isa_flags);
4395
4396 /* Save the current options unless we are validating options for
4397 #pragma. */
4398 t = build_target_option_node ();
4399
4400 ix86_arch_string = orig_arch_string;
4401 ix86_tune_string = orig_tune_string;
4402 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4403
4404 /* Free up memory allocated to hold the strings */
4405 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4406 free (option_strings[i]);
4407 }
4408
4409 return t;
4410 }
4411
4412 /* Hook to validate attribute((target("string"))). */
4413
4414 static bool
4415 ix86_valid_target_attribute_p (tree fndecl,
4416 tree ARG_UNUSED (name),
4417 tree args,
4418 int ARG_UNUSED (flags))
4419 {
4420 struct cl_target_option cur_target;
4421 bool ret = true;
4422 tree old_optimize = build_optimization_node ();
4423 tree new_target, new_optimize;
4424 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4425
4426 /* If the function changed the optimization levels as well as setting target
4427 options, start with the optimizations specified. */
4428 if (func_optimize && func_optimize != old_optimize)
4429 cl_optimization_restore (&global_options,
4430 TREE_OPTIMIZATION (func_optimize));
4431
4432 /* The target attributes may also change some optimization flags, so update
4433 the optimization options if necessary. */
4434 cl_target_option_save (&cur_target, &global_options);
4435 new_target = ix86_valid_target_attribute_tree (args);
4436 new_optimize = build_optimization_node ();
4437
4438 if (!new_target)
4439 ret = false;
4440
4441 else if (fndecl)
4442 {
4443 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4444
4445 if (old_optimize != new_optimize)
4446 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4447 }
4448
4449 cl_target_option_restore (&global_options, &cur_target);
4450
4451 if (old_optimize != new_optimize)
4452 cl_optimization_restore (&global_options,
4453 TREE_OPTIMIZATION (old_optimize));
4454
4455 return ret;
4456 }
4457
4458 \f
4459 /* Hook to determine if one function can safely inline another. */
4460
4461 static bool
4462 ix86_can_inline_p (tree caller, tree callee)
4463 {
4464 bool ret = false;
4465 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4466 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4467
4468 /* If callee has no option attributes, then it is ok to inline. */
4469 if (!callee_tree)
4470 ret = true;
4471
4472 /* If caller has no option attributes, but callee does then it is not ok to
4473 inline. */
4474 else if (!caller_tree)
4475 ret = false;
4476
4477 else
4478 {
4479 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4480 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4481
4482 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
4483 function can inline an SSE2 function, but an SSE2 function can't inline
4484 an SSE4 function. */
4485 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4486 != callee_opts->x_ix86_isa_flags)
4487 ret = false;
4488
4489 /* See if we have the same non-isa options. */
4490 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4491 ret = false;
4492
4493 /* See if arch, tune, etc. are the same. */
4494 else if (caller_opts->arch != callee_opts->arch)
4495 ret = false;
4496
4497 else if (caller_opts->tune != callee_opts->tune)
4498 ret = false;
4499
4500 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4501 ret = false;
4502
4503 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4504 ret = false;
4505
4506 else
4507 ret = true;
4508 }
4509
4510 return ret;
4511 }
4512
4513 \f
4514 /* Remember the last target of ix86_set_current_function. */
4515 static GTY(()) tree ix86_previous_fndecl;
4516
4517 /* Establish appropriate back-end context for processing the function
4518 FNDECL. The argument might be NULL to indicate processing at top
4519 level, outside of any function scope. */
4520 static void
4521 ix86_set_current_function (tree fndecl)
4522 {
4523 /* Only change the context if the function changes. This hook is called
4524 several times in the course of compiling a function, and we don't want to
4525 slow things down too much or call target_reinit when it isn't safe. */
4526 if (fndecl && fndecl != ix86_previous_fndecl)
4527 {
4528 tree old_tree = (ix86_previous_fndecl
4529 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4530 : NULL_TREE);
4531
4532 tree new_tree = (fndecl
4533 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4534 : NULL_TREE);
4535
4536 ix86_previous_fndecl = fndecl;
4537 if (old_tree == new_tree)
4538 ;
4539
4540 else if (new_tree)
4541 {
4542 cl_target_option_restore (&global_options,
4543 TREE_TARGET_OPTION (new_tree));
4544 target_reinit ();
4545 }
4546
4547 else if (old_tree)
4548 {
4549 struct cl_target_option *def
4550 = TREE_TARGET_OPTION (target_option_current_node);
4551
4552 cl_target_option_restore (&global_options, def);
4553 target_reinit ();
4554 }
4555 }
4556 }
4557
4558 \f
4559 /* Return true if this goes in large data/bss. */
4560
4561 static bool
4562 ix86_in_large_data_p (tree exp)
4563 {
4564 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4565 return false;
4566
4567 /* Functions are never large data. */
4568 if (TREE_CODE (exp) == FUNCTION_DECL)
4569 return false;
4570
4571 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4572 {
4573 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4574 if (strcmp (section, ".ldata") == 0
4575 || strcmp (section, ".lbss") == 0)
4576 return true;
4577 return false;
4578 }
4579 else
4580 {
4581 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4582
4583 /* If this is an incomplete type with size 0, then we can't put it
4584 in data because it might be too big when completed. */
4585 if (!size || size > ix86_section_threshold)
4586 return true;
4587 }
4588
4589 return false;
4590 }
4591
4592 /* Switch to the appropriate section for output of DECL.
4593 DECL is either a `VAR_DECL' node or a constant of some sort.
4594 RELOC indicates whether forming the initial value of DECL requires
4595 link-time relocations. */
4596
4597 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4598 ATTRIBUTE_UNUSED;
4599
4600 static section *
4601 x86_64_elf_select_section (tree decl, int reloc,
4602 unsigned HOST_WIDE_INT align)
4603 {
4604 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4605 && ix86_in_large_data_p (decl))
4606 {
4607 const char *sname = NULL;
4608 unsigned int flags = SECTION_WRITE;
4609 switch (categorize_decl_for_section (decl, reloc))
4610 {
4611 case SECCAT_DATA:
4612 sname = ".ldata";
4613 break;
4614 case SECCAT_DATA_REL:
4615 sname = ".ldata.rel";
4616 break;
4617 case SECCAT_DATA_REL_LOCAL:
4618 sname = ".ldata.rel.local";
4619 break;
4620 case SECCAT_DATA_REL_RO:
4621 sname = ".ldata.rel.ro";
4622 break;
4623 case SECCAT_DATA_REL_RO_LOCAL:
4624 sname = ".ldata.rel.ro.local";
4625 break;
4626 case SECCAT_BSS:
4627 sname = ".lbss";
4628 flags |= SECTION_BSS;
4629 break;
4630 case SECCAT_RODATA:
4631 case SECCAT_RODATA_MERGE_STR:
4632 case SECCAT_RODATA_MERGE_STR_INIT:
4633 case SECCAT_RODATA_MERGE_CONST:
4634 sname = ".lrodata";
4635 flags = 0;
4636 break;
4637 case SECCAT_SRODATA:
4638 case SECCAT_SDATA:
4639 case SECCAT_SBSS:
4640 gcc_unreachable ();
4641 case SECCAT_TEXT:
4642 case SECCAT_TDATA:
4643 case SECCAT_TBSS:
4644 /* We don't split these for the medium model. Place them into
4645 default sections and hope for the best. */
4646 break;
4647 }
4648 if (sname)
4649 {
4650 /* We might get called with string constants, but get_named_section
4651 doesn't like them as they are not DECLs. Also, we need to set
4652 flags in that case. */
4653 if (!DECL_P (decl))
4654 return get_section (sname, flags, NULL);
4655 return get_named_section (decl, sname, reloc);
4656 }
4657 }
4658 return default_elf_select_section (decl, reloc, align);
4659 }
4660
4661 /* Build up a unique section name, expressed as a
4662 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4663 RELOC indicates whether the initial value of EXP requires
4664 link-time relocations. */
4665
4666 static void ATTRIBUTE_UNUSED
4667 x86_64_elf_unique_section (tree decl, int reloc)
4668 {
4669 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4670 && ix86_in_large_data_p (decl))
4671 {
4672 const char *prefix = NULL;
4673 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4674 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4675
4676 switch (categorize_decl_for_section (decl, reloc))
4677 {
4678 case SECCAT_DATA:
4679 case SECCAT_DATA_REL:
4680 case SECCAT_DATA_REL_LOCAL:
4681 case SECCAT_DATA_REL_RO:
4682 case SECCAT_DATA_REL_RO_LOCAL:
4683 prefix = one_only ? ".ld" : ".ldata";
4684 break;
4685 case SECCAT_BSS:
4686 prefix = one_only ? ".lb" : ".lbss";
4687 break;
4688 case SECCAT_RODATA:
4689 case SECCAT_RODATA_MERGE_STR:
4690 case SECCAT_RODATA_MERGE_STR_INIT:
4691 case SECCAT_RODATA_MERGE_CONST:
4692 prefix = one_only ? ".lr" : ".lrodata";
4693 break;
4694 case SECCAT_SRODATA:
4695 case SECCAT_SDATA:
4696 case SECCAT_SBSS:
4697 gcc_unreachable ();
4698 case SECCAT_TEXT:
4699 case SECCAT_TDATA:
4700 case SECCAT_TBSS:
4701 /* We don't split these for the medium model. Place them into
4702 default sections and hope for the best. */
4703 break;
4704 }
4705 if (prefix)
4706 {
4707 const char *name, *linkonce;
4708 char *string;
4709
4710 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4711 name = targetm.strip_name_encoding (name);
4712
4713 /* If we're using one_only, then there needs to be a .gnu.linkonce
4714 prefix to the section name. */
4715 linkonce = one_only ? ".gnu.linkonce" : "";
4716
4717 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4718
4719 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4720 return;
4721 }
4722 }
4723 default_unique_section (decl, reloc);
4724 }
4725
4726 #ifdef COMMON_ASM_OP
4727 /* This says how to output assembler code to declare an
4728 uninitialized external linkage data object.
4729
4730 For medium model x86-64 we need to use .largecomm opcode for
4731 large objects. */
4732 void
4733 x86_elf_aligned_common (FILE *file,
4734 const char *name, unsigned HOST_WIDE_INT size,
4735 int align)
4736 {
4737 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4738 && size > (unsigned int)ix86_section_threshold)
4739 fputs (".largecomm\t", file);
4740 else
4741 fputs (COMMON_ASM_OP, file);
4742 assemble_name (file, name);
4743 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4744 size, align / BITS_PER_UNIT);
4745 }
4746 #endif
4747
4748 /* Utility function for targets to use in implementing
4749 ASM_OUTPUT_ALIGNED_BSS. */
4750
4751 void
4752 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4753 const char *name, unsigned HOST_WIDE_INT size,
4754 int align)
4755 {
4756 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4757 && size > (unsigned int)ix86_section_threshold)
4758 switch_to_section (get_named_section (decl, ".lbss", 0));
4759 else
4760 switch_to_section (bss_section);
4761 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4762 #ifdef ASM_DECLARE_OBJECT_NAME
4763 last_assemble_variable_decl = decl;
4764 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4765 #else
4766 /* Standard thing is just output label for the object. */
4767 ASM_OUTPUT_LABEL (file, name);
4768 #endif /* ASM_DECLARE_OBJECT_NAME */
4769 ASM_OUTPUT_SKIP (file, size ? size : 1);
4770 }
4771 \f
4772 /* Decide whether we must probe the stack before any space allocation
4773 on this target. It's essentially TARGET_STACK_PROBE except when
4774 -fstack-check causes the stack to be already probed differently. */
4775
4776 bool
4777 ix86_target_stack_probe (void)
4778 {
4779 /* Do not probe the stack twice if static stack checking is enabled. */
4780 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4781 return false;
4782
4783 return TARGET_STACK_PROBE;
4784 }
4785 \f
4786 /* Decide whether we can make a sibling call to a function. DECL is the
4787 declaration of the function being targeted by the call and EXP is the
4788 CALL_EXPR representing the call. */
4789
4790 static bool
4791 ix86_function_ok_for_sibcall (tree decl, tree exp)
4792 {
4793 tree type, decl_or_type;
4794 rtx a, b;
4795
4796 /* If we are generating position-independent code, we cannot sibcall
4797 optimize any indirect call, or a direct call to a global function,
4798 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4799 if (!TARGET_MACHO
4800 && !TARGET_64BIT
4801 && flag_pic
4802 && (!decl || !targetm.binds_local_p (decl)))
4803 return false;
4804
4805 /* If we need to align the outgoing stack, then sibcalling would
4806 unalign the stack, which may break the called function. */
4807 if (ix86_minimum_incoming_stack_boundary (true)
4808 < PREFERRED_STACK_BOUNDARY)
4809 return false;
4810
4811 if (decl)
4812 {
4813 decl_or_type = decl;
4814 type = TREE_TYPE (decl);
4815 }
4816 else
4817 {
4818 /* We're looking at the CALL_EXPR, we need the type of the function. */
4819 type = CALL_EXPR_FN (exp); /* pointer expression */
4820 type = TREE_TYPE (type); /* pointer type */
4821 type = TREE_TYPE (type); /* function type */
4822 decl_or_type = type;
4823 }
4824
4825 /* Check that the return value locations are the same. Like
4826 if we are returning floats on the 80387 register stack, we cannot
4827 make a sibcall from a function that doesn't return a float to a
4828 function that does or, conversely, from a function that does return
4829 a float to a function that doesn't; the necessary stack adjustment
4830 would not be executed. This is also the place we notice
4831 differences in the return value ABI. Note that it is ok for one
4832 of the functions to have void return type as long as the return
4833 value of the other is passed in a register. */
4834 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4835 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4836 cfun->decl, false);
4837 if (STACK_REG_P (a) || STACK_REG_P (b))
4838 {
4839 if (!rtx_equal_p (a, b))
4840 return false;
4841 }
4842 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4843 {
4844 /* Disable sibcall if we need to generate vzeroupper after
4845 callee returns. */
4846 if (TARGET_VZEROUPPER
4847 && cfun->machine->callee_return_avx256_p
4848 && !cfun->machine->caller_return_avx256_p)
4849 return false;
4850 }
4851 else if (!rtx_equal_p (a, b))
4852 return false;
4853
4854 if (TARGET_64BIT)
4855 {
4856 /* The SYSV ABI has more call-clobbered registers;
4857 disallow sibcalls from MS to SYSV. */
4858 if (cfun->machine->call_abi == MS_ABI
4859 && ix86_function_type_abi (type) == SYSV_ABI)
4860 return false;
4861 }
4862 else
4863 {
4864 /* If this call is indirect, we'll need to be able to use a
4865 call-clobbered register for the address of the target function.
4866 Make sure that all such registers are not used for passing
4867 parameters. Note that DLLIMPORT functions are indirect. */
4868 if (!decl
4869 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4870 {
4871 if (ix86_function_regparm (type, NULL) >= 3)
4872 {
4873 /* ??? Need to count the actual number of registers to be used,
4874 not the possible number of registers. Fix later. */
4875 return false;
4876 }
4877 }
4878 }
4879
4880 /* Otherwise okay. That also includes certain types of indirect calls. */
4881 return true;
4882 }
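/* Illustrative example (not part of the original sources): with -m32 -fPIC,
   a call such as

       extern int bar (int);
       int foo (int x) { return bar (x); }

   is rejected for sibcall optimization by the PIC check above, because the
   direct call to the global function "bar" goes through the PLT and %ebx
   must stay live.  The names "foo" and "bar" are hypothetical.  */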
4883
4884 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4885 and "sseregparm" calling convention attributes;
4886 arguments as in struct attribute_spec.handler. */
4887
4888 static tree
4889 ix86_handle_cconv_attribute (tree *node, tree name,
4890 tree args,
4891 int flags ATTRIBUTE_UNUSED,
4892 bool *no_add_attrs)
4893 {
4894 if (TREE_CODE (*node) != FUNCTION_TYPE
4895 && TREE_CODE (*node) != METHOD_TYPE
4896 && TREE_CODE (*node) != FIELD_DECL
4897 && TREE_CODE (*node) != TYPE_DECL)
4898 {
4899 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4900 name);
4901 *no_add_attrs = true;
4902 return NULL_TREE;
4903 }
4904
4905 /* Can combine regparm with all attributes but fastcall and thiscall. */
4906 if (is_attribute_p ("regparm", name))
4907 {
4908 tree cst;
4909
4910 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4911 {
4912 error ("fastcall and regparm attributes are not compatible");
4913 }
4914
4915 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4916 {
4917 error ("regparm and thiscall attributes are not compatible");
4918 }
4919
4920 cst = TREE_VALUE (args);
4921 if (TREE_CODE (cst) != INTEGER_CST)
4922 {
4923 warning (OPT_Wattributes,
4924 "%qE attribute requires an integer constant argument",
4925 name);
4926 *no_add_attrs = true;
4927 }
4928 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4929 {
4930 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4931 name, REGPARM_MAX);
4932 *no_add_attrs = true;
4933 }
4934
4935 return NULL_TREE;
4936 }
4937
4938 if (TARGET_64BIT)
4939 {
4940 /* Do not warn when emulating the MS ABI. */
4941 if ((TREE_CODE (*node) != FUNCTION_TYPE
4942 && TREE_CODE (*node) != METHOD_TYPE)
4943 || ix86_function_type_abi (*node) != MS_ABI)
4944 warning (OPT_Wattributes, "%qE attribute ignored",
4945 name);
4946 *no_add_attrs = true;
4947 return NULL_TREE;
4948 }
4949
4950 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4951 if (is_attribute_p ("fastcall", name))
4952 {
4953 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4954 {
4955 error ("fastcall and cdecl attributes are not compatible");
4956 }
4957 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4958 {
4959 error ("fastcall and stdcall attributes are not compatible");
4960 }
4961 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4962 {
4963 error ("fastcall and regparm attributes are not compatible");
4964 }
4965 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4966 {
4967 error ("fastcall and thiscall attributes are not compatible");
4968 }
4969 }
4970
4971 /* Can combine stdcall with fastcall (redundant), regparm and
4972 sseregparm. */
4973 else if (is_attribute_p ("stdcall", name))
4974 {
4975 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4976 {
4977 error ("stdcall and cdecl attributes are not compatible");
4978 }
4979 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4980 {
4981 error ("stdcall and fastcall attributes are not compatible");
4982 }
4983 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4984 {
4985 error ("stdcall and thiscall attributes are not compatible");
4986 }
4987 }
4988
4989 /* Can combine cdecl with regparm and sseregparm. */
4990 else if (is_attribute_p ("cdecl", name))
4991 {
4992 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4993 {
4994 error ("stdcall and cdecl attributes are not compatible");
4995 }
4996 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4997 {
4998 error ("fastcall and cdecl attributes are not compatible");
4999 }
5000 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5001 {
5002 error ("cdecl and thiscall attributes are not compatible");
5003 }
5004 }
5005 else if (is_attribute_p ("thiscall", name))
5006 {
5007 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5008 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5009 name);
5010 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5011 {
5012 error ("stdcall and thiscall attributes are not compatible");
5013 }
5014 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5015 {
5016 error ("fastcall and thiscall attributes are not compatible");
5017 }
5018 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5019 {
5020 error ("cdecl and thiscall attributes are not compatible");
5021 }
5022 }
5023
5024 /* Can combine sseregparm with all attributes. */
5025
5026 return NULL_TREE;
5027 }
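/* Illustrative example (not part of the original sources): a declaration
   combining incompatible calling-convention attributes, e.g.

       int __attribute__ ((fastcall, stdcall)) f (int, int);

   is diagnosed by the handler above ("fastcall and stdcall attributes are
   not compatible"), while a combination such as

       int __attribute__ ((regparm (2), sseregparm)) g (int, float);

   is accepted, since regparm and sseregparm may be combined.  "f" and "g"
   are hypothetical names.  */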
5028
5029 /* The transactional memory builtins are implicitly regparm or fastcall
5030 depending on the ABI. Override the generic do-nothing attribute that
5031 these builtins were declared with, and replace it with one of the two
5032 attributes that we expect elsewhere. */
5033
5034 static tree
5035 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5036 tree args ATTRIBUTE_UNUSED,
5037 int flags ATTRIBUTE_UNUSED,
5038 bool *no_add_attrs)
5039 {
5040 tree alt;
5041
5042 /* In no case do we want to add the placeholder attribute. */
5043 *no_add_attrs = true;
5044
5045 /* The 64-bit ABI is unchanged for transactional memory. */
5046 if (TARGET_64BIT)
5047 return NULL_TREE;
5048
5049 /* ??? Is there a better way to validate 32-bit Windows? We have
5050 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5051 if (CHECK_STACK_LIMIT > 0)
5052 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5053 else
5054 {
5055 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5056 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5057 }
5058 decl_attributes (node, alt, flags);
5059
5060 return NULL_TREE;
5061 }
5062
5063 /* This function determines from TYPE the calling-convention. */
5064
5065 unsigned int
5066 ix86_get_callcvt (const_tree type)
5067 {
5068 unsigned int ret = 0;
5069 bool is_stdarg;
5070 tree attrs;
5071
5072 if (TARGET_64BIT)
5073 return IX86_CALLCVT_CDECL;
5074
5075 attrs = TYPE_ATTRIBUTES (type);
5076 if (attrs != NULL_TREE)
5077 {
5078 if (lookup_attribute ("cdecl", attrs))
5079 ret |= IX86_CALLCVT_CDECL;
5080 else if (lookup_attribute ("stdcall", attrs))
5081 ret |= IX86_CALLCVT_STDCALL;
5082 else if (lookup_attribute ("fastcall", attrs))
5083 ret |= IX86_CALLCVT_FASTCALL;
5084 else if (lookup_attribute ("thiscall", attrs))
5085 ret |= IX86_CALLCVT_THISCALL;
5086
5087 /* Regparm isn't allowed for thiscall and fastcall. */
5088 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5089 {
5090 if (lookup_attribute ("regparm", attrs))
5091 ret |= IX86_CALLCVT_REGPARM;
5092 if (lookup_attribute ("sseregparm", attrs))
5093 ret |= IX86_CALLCVT_SSEREGPARM;
5094 }
5095
5096 if (IX86_BASE_CALLCVT (ret) != 0)
5097 return ret;
5098 }
5099
5100 is_stdarg = stdarg_p (type);
5101 if (TARGET_RTD && !is_stdarg)
5102 return IX86_CALLCVT_STDCALL | ret;
5103
5104 if (ret != 0
5105 || is_stdarg
5106 || TREE_CODE (type) != METHOD_TYPE
5107 || ix86_function_type_abi (type) != MS_ABI)
5108 return IX86_CALLCVT_CDECL | ret;
5109
5110 return IX86_CALLCVT_THISCALL;
5111 }
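/* Illustrative example (not part of the original sources): for a 32-bit
   function type declared as

       int __attribute__ ((stdcall, regparm (3))) f (int, int, int);

   the routine above returns IX86_CALLCVT_STDCALL | IX86_CALLCVT_REGPARM,
   whereas for a plain prototyped function without attributes (and without
   -mrtd) it returns IX86_CALLCVT_CDECL.  "f" is a hypothetical name.  */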
5112
5113 /* Return 0 if the attributes for two types are incompatible, 1 if they
5114 are compatible, and 2 if they are nearly compatible (which causes a
5115 warning to be generated). */
5116
5117 static int
5118 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5119 {
5120 unsigned int ccvt1, ccvt2;
5121
5122 if (TREE_CODE (type1) != FUNCTION_TYPE
5123 && TREE_CODE (type1) != METHOD_TYPE)
5124 return 1;
5125
5126 ccvt1 = ix86_get_callcvt (type1);
5127 ccvt2 = ix86_get_callcvt (type2);
5128 if (ccvt1 != ccvt2)
5129 return 0;
5130 if (ix86_function_regparm (type1, NULL)
5131 != ix86_function_regparm (type2, NULL))
5132 return 0;
5133
5134 return 1;
5135 }
5136 \f
5137 /* Return the regparm value for a function with the indicated TYPE and DECL.
5138 DECL may be NULL when calling function indirectly
5139 or considering a libcall. */
5140
5141 static int
5142 ix86_function_regparm (const_tree type, const_tree decl)
5143 {
5144 tree attr;
5145 int regparm;
5146 unsigned int ccvt;
5147
5148 if (TARGET_64BIT)
5149 return (ix86_function_type_abi (type) == SYSV_ABI
5150 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5151 ccvt = ix86_get_callcvt (type);
5152 regparm = ix86_regparm;
5153
5154 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5155 {
5156 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5157 if (attr)
5158 {
5159 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5160 return regparm;
5161 }
5162 }
5163 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5164 return 2;
5165 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5166 return 1;
5167
5168 /* Use register calling convention for local functions when possible. */
5169 if (decl
5170 && TREE_CODE (decl) == FUNCTION_DECL
5171 && optimize
5172 && !(profile_flag && !flag_fentry))
5173 {
5174 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5175 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5176 if (i && i->local && i->can_change_signature)
5177 {
5178 int local_regparm, globals = 0, regno;
5179
5180 /* Make sure no regparm register is taken by a
5181 fixed register variable. */
5182 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5183 if (fixed_regs[local_regparm])
5184 break;
5185
5186 /* We don't want to use regparm(3) for nested functions as
5187 these use a static chain pointer in the third argument. */
5188 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5189 local_regparm = 2;
5190
5191 /* In 32-bit mode save a register for the split stack. */
5192 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5193 local_regparm = 2;
5194
5195 /* Each fixed register usage increases register pressure,
5196 so fewer registers should be used for argument passing.
5197 This functionality can be overridden by an explicit
5198 regparm value. */
5199 for (regno = 0; regno <= DI_REG; regno++)
5200 if (fixed_regs[regno])
5201 globals++;
5202
5203 local_regparm
5204 = globals < local_regparm ? local_regparm - globals : 0;
5205
5206 if (local_regparm > regparm)
5207 regparm = local_regparm;
5208 }
5209 }
5210
5211 return regparm;
5212 }
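/* Illustrative example (not part of the original sources): when optimizing,
   a purely local 32-bit function such as

       static int sum3 (int a, int b, int c) { return a + b + c; }

   whose address never escapes may be given a local regparm of up to 3 by
   the code above (unless a fixed register, a static chain or -fsplit-stack
   reduces the count), so its arguments arrive in registers instead of on
   the stack.  "sum3" is a hypothetical name.  */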
5213
5214 /* Return 1 or 2 if we can pass up to SSE_REGPARM_MAX SFmode (1) or both
5215 SFmode and DFmode (2) arguments in SSE registers for a function with the
5216 indicated TYPE and DECL. DECL may be NULL when calling a function
5217 indirectly or considering a libcall. Otherwise return 0. */
5218
5219 static int
5220 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5221 {
5222 gcc_assert (!TARGET_64BIT);
5223
5224 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5225 by the sseregparm attribute. */
5226 if (TARGET_SSEREGPARM
5227 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5228 {
5229 if (!TARGET_SSE)
5230 {
5231 if (warn)
5232 {
5233 if (decl)
5234 error ("calling %qD with attribute sseregparm without "
5235 "SSE/SSE2 enabled", decl);
5236 else
5237 error ("calling %qT with attribute sseregparm without "
5238 "SSE/SSE2 enabled", type);
5239 }
5240 return 0;
5241 }
5242
5243 return 2;
5244 }
5245
5246 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5247 (and DFmode for SSE2) arguments in SSE registers. */
5248 if (decl && TARGET_SSE_MATH && optimize
5249 && !(profile_flag && !flag_fentry))
5250 {
5251 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5252 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5253 if (i && i->local && i->can_change_signature)
5254 return TARGET_SSE2 ? 2 : 1;
5255 }
5256
5257 return 0;
5258 }
5259
5260 /* Return true if EAX is live at the start of the function. Used by
5261 ix86_expand_prologue to determine if we need special help before
5262 calling allocate_stack_worker. */
5263
5264 static bool
5265 ix86_eax_live_at_start_p (void)
5266 {
5267 /* Cheat. Don't bother working forward from ix86_function_regparm
5268 to the function type to whether an actual argument is located in
5269 eax. Instead just look at cfg info, which is still close enough
5270 to correct at this point. This gives false positives for broken
5271 functions that might use uninitialized data that happens to be
5272 allocated in eax, but who cares? */
5273 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5274 }
5275
5276 static bool
5277 ix86_keep_aggregate_return_pointer (tree fntype)
5278 {
5279 tree attr;
5280
5281 if (!TARGET_64BIT)
5282 {
5283 attr = lookup_attribute ("callee_pop_aggregate_return",
5284 TYPE_ATTRIBUTES (fntype));
5285 if (attr)
5286 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5287
5288 /* For 32-bit MS-ABI the default is to keep aggregate
5289 return pointer. */
5290 if (ix86_function_type_abi (fntype) == MS_ABI)
5291 return true;
5292 }
5293 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5294 }
5295
5296 /* Value is the number of bytes of arguments automatically
5297 popped when returning from a subroutine call.
5298 FUNDECL is the declaration node of the function (as a tree),
5299 FUNTYPE is the data type of the function (as a tree),
5300 or for a library call it is an identifier node for the subroutine name.
5301 SIZE is the number of bytes of arguments passed on the stack.
5302
5303 On the 80386, the RTD insn may be used to pop them if the number
5304 of args is fixed, but if the number is variable then the caller
5305 must pop them all. RTD can't be used for library calls now
5306 because the library is compiled with the Unix compiler.
5307 Use of RTD is a selectable option, since it is incompatible with
5308 standard Unix calling sequences. If the option is not selected,
5309 the caller must always pop the args.
5310
5311 The attribute stdcall is equivalent to RTD on a per module basis. */
5312
5313 static int
5314 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5315 {
5316 unsigned int ccvt;
5317
5318 /* None of the 64-bit ABIs pop arguments. */
5319 if (TARGET_64BIT)
5320 return 0;
5321
5322 ccvt = ix86_get_callcvt (funtype);
5323
5324 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5325 | IX86_CALLCVT_THISCALL)) != 0
5326 && ! stdarg_p (funtype))
5327 return size;
5328
5329 /* Lose any fake structure return argument if it is passed on the stack. */
5330 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5331 && !ix86_keep_aggregate_return_pointer (funtype))
5332 {
5333 int nregs = ix86_function_regparm (funtype, fundecl);
5334 if (nregs == 0)
5335 return GET_MODE_SIZE (Pmode);
5336 }
5337
5338 return 0;
5339 }
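/* Illustrative example (not part of the original sources): for a 32-bit
   function declared as

       void __attribute__ ((stdcall)) f (int a, int b);

   the routine above returns 8, so the callee pops its two stack arguments
   (e.g. with "ret $8"); for the same prototype without the attribute and
   without -mrtd it returns 0 and the caller pops the arguments.  "f" is a
   hypothetical name.  */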
5340 \f
5341 /* Argument support functions. */
5342
5343 /* Return true when register may be used to pass function parameters. */
5344 bool
5345 ix86_function_arg_regno_p (int regno)
5346 {
5347 int i;
5348 const int *parm_regs;
5349
5350 if (!TARGET_64BIT)
5351 {
5352 if (TARGET_MACHO)
5353 return (regno < REGPARM_MAX
5354 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5355 else
5356 return (regno < REGPARM_MAX
5357 || (TARGET_MMX && MMX_REGNO_P (regno)
5358 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5359 || (TARGET_SSE && SSE_REGNO_P (regno)
5360 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5361 }
5362
5363 if (TARGET_MACHO)
5364 {
5365 if (SSE_REGNO_P (regno) && TARGET_SSE)
5366 return true;
5367 }
5368 else
5369 {
5370 if (TARGET_SSE && SSE_REGNO_P (regno)
5371 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5372 return true;
5373 }
5374
5375 /* TODO: The function should depend on current function ABI but
5376 builtins.c would need updating then. Therefore we use the
5377 default ABI. */
5378
5379 /* RAX is used as hidden argument to va_arg functions. */
5380 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5381 return true;
5382
5383 if (ix86_abi == MS_ABI)
5384 parm_regs = x86_64_ms_abi_int_parameter_registers;
5385 else
5386 parm_regs = x86_64_int_parameter_registers;
5387 for (i = 0; i < (ix86_abi == MS_ABI
5388 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5389 if (regno == parm_regs[i])
5390 return true;
5391 return false;
5392 }
5393
5394 /* Return true if we do not know how to pass TYPE solely in registers. */
5395
5396 static bool
5397 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5398 {
5399 if (must_pass_in_stack_var_size_or_pad (mode, type))
5400 return true;
5401
5402 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5403 The layout_type routine is crafty and tries to trick us into passing
5404 currently unsupported vector types on the stack by using TImode. */
5405 return (!TARGET_64BIT && mode == TImode
5406 && type && TREE_CODE (type) != VECTOR_TYPE);
5407 }
5408
5409 /* Return the size, in bytes, of the area reserved for arguments passed
5410 in registers for the function represented by FNDECL, which depends on the
5411 ABI used. */
5412 int
5413 ix86_reg_parm_stack_space (const_tree fndecl)
5414 {
5415 enum calling_abi call_abi = SYSV_ABI;
5416 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5417 call_abi = ix86_function_abi (fndecl);
5418 else
5419 call_abi = ix86_function_type_abi (fndecl);
5420 if (TARGET_64BIT && call_abi == MS_ABI)
5421 return 32;
5422 return 0;
5423 }
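/* Illustrative note (not part of the original sources): the 32 bytes
   returned above for the 64-bit MS ABI correspond to the "shadow space"
   (home area) of four 8-byte slots that the caller reserves on the stack
   for the four register-passed arguments; the SysV ABI reserves no such
   area, hence the 0.  */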
5424
5425 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5426 call ABI used. */
5427 enum calling_abi
5428 ix86_function_type_abi (const_tree fntype)
5429 {
5430 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5431 {
5432 enum calling_abi abi = ix86_abi;
5433 if (abi == SYSV_ABI)
5434 {
5435 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5436 abi = MS_ABI;
5437 }
5438 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5439 abi = SYSV_ABI;
5440 return abi;
5441 }
5442 return ix86_abi;
5443 }
5444
5445 static bool
5446 ix86_function_ms_hook_prologue (const_tree fn)
5447 {
5448 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5449 {
5450 if (decl_function_context (fn) != NULL_TREE)
5451 error_at (DECL_SOURCE_LOCATION (fn),
5452 "ms_hook_prologue is not compatible with nested function");
5453 else
5454 return true;
5455 }
5456 return false;
5457 }
5458
5459 static enum calling_abi
5460 ix86_function_abi (const_tree fndecl)
5461 {
5462 if (! fndecl)
5463 return ix86_abi;
5464 return ix86_function_type_abi (TREE_TYPE (fndecl));
5465 }
5466
5467 /* Return SYSV_ABI or MS_ABI, depending on CFUN, specifying the
5468 call ABI used. */
5469 enum calling_abi
5470 ix86_cfun_abi (void)
5471 {
5472 if (! cfun)
5473 return ix86_abi;
5474 return cfun->machine->call_abi;
5475 }
5476
5477 /* Write the extra assembler code needed to declare a function properly. */
5478
5479 void
5480 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5481 tree decl)
5482 {
5483 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5484
5485 if (is_ms_hook)
5486 {
5487 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5488 unsigned int filler_cc = 0xcccccccc;
5489
5490 for (i = 0; i < filler_count; i += 4)
5491 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5492 }
5493
5494 #ifdef SUBTARGET_ASM_UNWIND_INIT
5495 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5496 #endif
5497
5498 ASM_OUTPUT_LABEL (asm_out_file, fname);
5499
5500 /* Output magic byte marker, if hot-patch attribute is set. */
5501 if (is_ms_hook)
5502 {
5503 if (TARGET_64BIT)
5504 {
5505 /* leaq [%rsp + 0], %rsp */
5506 asm_fprintf (asm_out_file, ASM_BYTE
5507 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5508 }
5509 else
5510 {
5511 /* movl.s %edi, %edi
5512 push %ebp
5513 movl.s %esp, %ebp */
5514 asm_fprintf (asm_out_file, ASM_BYTE
5515 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5516 }
5517 }
5518 }
5519
5520 /* regclass.c */
5521 extern void init_regs (void);
5522
5523 /* Implementation of the call ABI switching target hook. The call
5524 register sets specific to FNDECL are selected. See also
5525 ix86_conditional_register_usage for more details. */
5526 void
5527 ix86_call_abi_override (const_tree fndecl)
5528 {
5529 if (fndecl == NULL_TREE)
5530 cfun->machine->call_abi = ix86_abi;
5531 else
5532 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5533 }
5534
5535 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5536 Avoid the expensive re-initialization of init_regs each time we switch
5537 function context, since this is needed only during RTL expansion. */
5538 static void
5539 ix86_maybe_switch_abi (void)
5540 {
5541 if (TARGET_64BIT &&
5542 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5543 reinit_regs ();
5544 }
5545
5546 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5547 for a call to a function whose data type is FNTYPE.
5548 For a library call, FNTYPE is 0. */
5549
5550 void
5551 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5552 tree fntype, /* tree ptr for function decl */
5553 rtx libname, /* SYMBOL_REF of library name or 0 */
5554 tree fndecl,
5555 int caller)
5556 {
5557 struct cgraph_local_info *i;
5558 tree fnret_type;
5559
5560 memset (cum, 0, sizeof (*cum));
5561
5562 /* Initialize for the current callee. */
5563 if (caller)
5564 {
5565 cfun->machine->callee_pass_avx256_p = false;
5566 cfun->machine->callee_return_avx256_p = false;
5567 }
5568
5569 if (fndecl)
5570 {
5571 i = cgraph_local_info (fndecl);
5572 cum->call_abi = ix86_function_abi (fndecl);
5573 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5574 }
5575 else
5576 {
5577 i = NULL;
5578 cum->call_abi = ix86_function_type_abi (fntype);
5579 if (fntype)
5580 fnret_type = TREE_TYPE (fntype);
5581 else
5582 fnret_type = NULL;
5583 }
5584
5585 if (TARGET_VZEROUPPER && fnret_type)
5586 {
5587 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5588 false);
5589 if (function_pass_avx256_p (fnret_value))
5590 {
5591 /* The return value of this function uses 256bit AVX modes. */
5592 if (caller)
5593 cfun->machine->callee_return_avx256_p = true;
5594 else
5595 cfun->machine->caller_return_avx256_p = true;
5596 }
5597 }
5598
5599 cum->caller = caller;
5600
5601 /* Set up the number of registers to use for passing arguments. */
5602
5603 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5604 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5605 "or subtarget optimization implying it");
5606 cum->nregs = ix86_regparm;
5607 if (TARGET_64BIT)
5608 {
5609 cum->nregs = (cum->call_abi == SYSV_ABI
5610 ? X86_64_REGPARM_MAX
5611 : X86_64_MS_REGPARM_MAX);
5612 }
5613 if (TARGET_SSE)
5614 {
5615 cum->sse_nregs = SSE_REGPARM_MAX;
5616 if (TARGET_64BIT)
5617 {
5618 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5619 ? X86_64_SSE_REGPARM_MAX
5620 : X86_64_MS_SSE_REGPARM_MAX);
5621 }
5622 }
5623 if (TARGET_MMX)
5624 cum->mmx_nregs = MMX_REGPARM_MAX;
5625 cum->warn_avx = true;
5626 cum->warn_sse = true;
5627 cum->warn_mmx = true;
5628
5629 /* Because types might mismatch between caller and callee, we need to
5630 use the actual type of the function for local calls.
5631 FIXME: cgraph_analyze can be told to actually record whether a function
5632 uses va_start, so for local functions maybe_vaarg can be made more
5633 aggressive, helping K&R code.
5634 FIXME: once the type system is fixed, we won't need this code anymore. */
5635 if (i && i->local && i->can_change_signature)
5636 fntype = TREE_TYPE (fndecl);
5637 cum->maybe_vaarg = (fntype
5638 ? (!prototype_p (fntype) || stdarg_p (fntype))
5639 : !libname);
5640
5641 if (!TARGET_64BIT)
5642 {
5643 /* If there are variable arguments, then we won't pass anything
5644 in registers in 32-bit mode. */
5645 if (stdarg_p (fntype))
5646 {
5647 cum->nregs = 0;
5648 cum->sse_nregs = 0;
5649 cum->mmx_nregs = 0;
5650 cum->warn_avx = 0;
5651 cum->warn_sse = 0;
5652 cum->warn_mmx = 0;
5653 return;
5654 }
5655
5656 /* Use ecx and edx registers if function has fastcall attribute,
5657 else look for regparm information. */
5658 if (fntype)
5659 {
5660 unsigned int ccvt = ix86_get_callcvt (fntype);
5661 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5662 {
5663 cum->nregs = 1;
5664 cum->fastcall = 1; /* Same first register as in fastcall. */
5665 }
5666 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5667 {
5668 cum->nregs = 2;
5669 cum->fastcall = 1;
5670 }
5671 else
5672 cum->nregs = ix86_function_regparm (fntype, fndecl);
5673 }
5674
5675 /* Set up the number of SSE registers used for passing SFmode
5676 and DFmode arguments. Warn for mismatching ABI. */
5677 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5678 }
5679 }
5680
5681 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5682 But in the case of vector types, it is some vector mode.
5683
5684 When we have only some of our vector isa extensions enabled, then there
5685 are some modes for which vector_mode_supported_p is false. For these
5686 modes, the generic vector support in gcc will choose some non-vector mode
5687 in order to implement the type. By computing the natural mode, we'll
5688 select the proper ABI location for the operand and not depend on whatever
5689 the middle-end decides to do with these vector types.
5690
5691 The middle-end can't deal with vector types larger than 16 bytes. In this
5692 case, we return the original mode and warn about the ABI change if CUM
5693 isn't NULL. */
5694
5695 static enum machine_mode
5696 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5697 {
5698 enum machine_mode mode = TYPE_MODE (type);
5699
5700 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5701 {
5702 HOST_WIDE_INT size = int_size_in_bytes (type);
5703 if ((size == 8 || size == 16 || size == 32)
5704 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5705 && TYPE_VECTOR_SUBPARTS (type) > 1)
5706 {
5707 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5708
5709 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5710 mode = MIN_MODE_VECTOR_FLOAT;
5711 else
5712 mode = MIN_MODE_VECTOR_INT;
5713
5714 /* Get the mode which has this inner mode and number of units. */
5715 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5716 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5717 && GET_MODE_INNER (mode) == innermode)
5718 {
5719 if (size == 32 && !TARGET_AVX)
5720 {
5721 static bool warnedavx;
5722
5723 if (cum
5724 && !warnedavx
5725 && cum->warn_avx)
5726 {
5727 warnedavx = true;
5728 warning (0, "AVX vector argument without AVX "
5729 "enabled changes the ABI");
5730 }
5731 return TYPE_MODE (type);
5732 }
5733 else
5734 return mode;
5735 }
5736
5737 gcc_unreachable ();
5738 }
5739 }
5740
5741 return mode;
5742 }
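/* Illustrative example (not part of the original sources): for a type such
   as

       typedef float v8sf __attribute__ ((vector_size (32)));

   the routine above yields V8SFmode when AVX is enabled; without -mavx it
   returns the mode the middle-end chose for the type and, the first time
   and only if CUM->warn_avx is set, warns that the ABI changes.  "v8sf" is
   a hypothetical name.  */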
5743
5744 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5745 this may not agree with the mode that the type system has chosen for the
5746 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5747 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5748
5749 static rtx
5750 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5751 unsigned int regno)
5752 {
5753 rtx tmp;
5754
5755 if (orig_mode != BLKmode)
5756 tmp = gen_rtx_REG (orig_mode, regno);
5757 else
5758 {
5759 tmp = gen_rtx_REG (mode, regno);
5760 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5761 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5762 }
5763
5764 return tmp;
5765 }
5766
5767 /* x86-64 register passing implementation. See the x86-64 ABI for details.
5768 The goal of this code is to classify each 8-byte chunk of the incoming
5769 argument by register class and assign registers accordingly. */
5770
5771 /* Return the union class of CLASS1 and CLASS2.
5772 See the x86-64 PS ABI for details. */
5773
5774 static enum x86_64_reg_class
5775 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5776 {
5777 /* Rule #1: If both classes are equal, this is the resulting class. */
5778 if (class1 == class2)
5779 return class1;
5780
5781 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5782 the other class. */
5783 if (class1 == X86_64_NO_CLASS)
5784 return class2;
5785 if (class2 == X86_64_NO_CLASS)
5786 return class1;
5787
5788 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5789 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5790 return X86_64_MEMORY_CLASS;
5791
5792 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5793 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5794 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5795 return X86_64_INTEGERSI_CLASS;
5796 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5797 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5798 return X86_64_INTEGER_CLASS;
5799
5800 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5801 MEMORY is used. */
5802 if (class1 == X86_64_X87_CLASS
5803 || class1 == X86_64_X87UP_CLASS
5804 || class1 == X86_64_COMPLEX_X87_CLASS
5805 || class2 == X86_64_X87_CLASS
5806 || class2 == X86_64_X87UP_CLASS
5807 || class2 == X86_64_COMPLEX_X87_CLASS)
5808 return X86_64_MEMORY_CLASS;
5809
5810 /* Rule #6: Otherwise class SSE is used. */
5811 return X86_64_SSE_CLASS;
5812 }
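/* Illustrative examples (not part of the original sources) of the rules
   above: merging X86_64_INTEGERSI_CLASS with X86_64_SSESF_CLASS yields
   X86_64_INTEGERSI_CLASS (rule #4); merging X86_64_SSE_CLASS with
   X86_64_X87_CLASS yields X86_64_MEMORY_CLASS (rule #5); merging
   X86_64_SSESF_CLASS with X86_64_SSEDF_CLASS yields X86_64_SSE_CLASS
   (rule #6).  */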
5813
5814 /* Classify the argument of type TYPE and mode MODE.
5815 CLASSES will be filled by the register class used to pass each word
5816 of the operand. The number of words is returned. In case the parameter
5817 should be passed in memory, 0 is returned. As a special case for zero
5818 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5819
5820 BIT_OFFSET is used internally for handling records and specifies the
5821 offset in bits modulo 256 to avoid overflow cases.
5822
5823 See the x86-64 PS ABI for details.
5824 */
5825
5826 static int
5827 classify_argument (enum machine_mode mode, const_tree type,
5828 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5829 {
5830 HOST_WIDE_INT bytes =
5831 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5832 int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5833
5834 /* Variable sized entities are always passed/returned in memory. */
5835 if (bytes < 0)
5836 return 0;
5837
5838 if (mode != VOIDmode
5839 && targetm.calls.must_pass_in_stack (mode, type))
5840 return 0;
5841
5842 if (type && AGGREGATE_TYPE_P (type))
5843 {
5844 int i;
5845 tree field;
5846 enum x86_64_reg_class subclasses[MAX_CLASSES];
5847
5848 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5849 if (bytes > 32)
5850 return 0;
5851
5852 for (i = 0; i < words; i++)
5853 classes[i] = X86_64_NO_CLASS;
5854
5855 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
5856 signal the memory class, so handle it as a special case. */
5857 if (!words)
5858 {
5859 classes[0] = X86_64_NO_CLASS;
5860 return 1;
5861 }
5862
5863 /* Classify each field of record and merge classes. */
5864 switch (TREE_CODE (type))
5865 {
5866 case RECORD_TYPE:
5867 /* And now merge the fields of structure. */
5868 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5869 {
5870 if (TREE_CODE (field) == FIELD_DECL)
5871 {
5872 int num;
5873
5874 if (TREE_TYPE (field) == error_mark_node)
5875 continue;
5876
5877 /* Bitfields are always classified as integer. Handle them
5878 early, since later code would consider them to be
5879 misaligned integers. */
5880 if (DECL_BIT_FIELD (field))
5881 {
5882 for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5883 i < ((int_bit_position (field) + (bit_offset % 64))
5884 + tree_low_cst (DECL_SIZE (field), 0)
5885 + 63) / 8 / 8; i++)
5886 classes[i] =
5887 merge_classes (X86_64_INTEGER_CLASS,
5888 classes[i]);
5889 }
5890 else
5891 {
5892 int pos;
5893
5894 type = TREE_TYPE (field);
5895
5896 /* Flexible array member is ignored. */
5897 if (TYPE_MODE (type) == BLKmode
5898 && TREE_CODE (type) == ARRAY_TYPE
5899 && TYPE_SIZE (type) == NULL_TREE
5900 && TYPE_DOMAIN (type) != NULL_TREE
5901 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5902 == NULL_TREE))
5903 {
5904 static bool warned;
5905
5906 if (!warned && warn_psabi)
5907 {
5908 warned = true;
5909 inform (input_location,
5910 "the ABI of passing struct with"
5911 " a flexible array member has"
5912 " changed in GCC 4.4");
5913 }
5914 continue;
5915 }
5916 num = classify_argument (TYPE_MODE (type), type,
5917 subclasses,
5918 (int_bit_position (field)
5919 + bit_offset) % 256);
5920 if (!num)
5921 return 0;
5922 pos = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5923 for (i = 0; i < num && (i + pos) < words; i++)
5924 classes[i + pos] =
5925 merge_classes (subclasses[i], classes[i + pos]);
5926 }
5927 }
5928 }
5929 break;
5930
5931 case ARRAY_TYPE:
5932 /* Arrays are handled as small records. */
5933 {
5934 int num;
5935 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5936 TREE_TYPE (type), subclasses, bit_offset);
5937 if (!num)
5938 return 0;
5939
5940 /* The partial classes are now full classes. */
5941 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5942 subclasses[0] = X86_64_SSE_CLASS;
5943 if (subclasses[0] == X86_64_INTEGERSI_CLASS
5944 && !((bit_offset % 64) == 0 && bytes == 4))
5945 subclasses[0] = X86_64_INTEGER_CLASS;
5946
5947 for (i = 0; i < words; i++)
5948 classes[i] = subclasses[i % num];
5949
5950 break;
5951 }
5952 case UNION_TYPE:
5953 case QUAL_UNION_TYPE:
5954 /* Unions are similar to RECORD_TYPE but offset is always 0.
5955 */
5956 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5957 {
5958 if (TREE_CODE (field) == FIELD_DECL)
5959 {
5960 int num;
5961
5962 if (TREE_TYPE (field) == error_mark_node)
5963 continue;
5964
5965 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
5966 TREE_TYPE (field), subclasses,
5967 bit_offset);
5968 if (!num)
5969 return 0;
5970 for (i = 0; i < num; i++)
5971 classes[i] = merge_classes (subclasses[i], classes[i]);
5972 }
5973 }
5974 break;
5975
5976 default:
5977 gcc_unreachable ();
5978 }
5979
5980 if (words > 2)
5981 {
5982 /* When the size is > 16 bytes, if the first class isn't
5983 X86_64_SSE_CLASS or any of the others isn't
5984 X86_64_SSEUP_CLASS, everything should be passed in
5985 memory. */
5986 if (classes[0] != X86_64_SSE_CLASS)
5987 return 0;
5988
5989 for (i = 1; i < words; i++)
5990 if (classes[i] != X86_64_SSEUP_CLASS)
5991 return 0;
5992 }
5993
5994 /* Final merger cleanup. */
5995 for (i = 0; i < words; i++)
5996 {
5997 /* If one class is MEMORY, everything should be passed in
5998 memory. */
5999 if (classes[i] == X86_64_MEMORY_CLASS)
6000 return 0;
6001
6002 /* The X86_64_SSEUP_CLASS should be always preceded by
6003 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6004 if (classes[i] == X86_64_SSEUP_CLASS
6005 && classes[i - 1] != X86_64_SSE_CLASS
6006 && classes[i - 1] != X86_64_SSEUP_CLASS)
6007 {
6008 /* The first one should never be X86_64_SSEUP_CLASS. */
6009 gcc_assert (i != 0);
6010 classes[i] = X86_64_SSE_CLASS;
6011 }
6012
6013 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6014 everything should be passed in memory. */
6015 if (classes[i] == X86_64_X87UP_CLASS
6016 && (classes[i - 1] != X86_64_X87_CLASS))
6017 {
6018 static bool warned;
6019
6020 /* The first one should never be X86_64_X87UP_CLASS. */
6021 gcc_assert (i != 0);
6022 if (!warned && warn_psabi)
6023 {
6024 warned = true;
6025 inform (input_location,
6026 "the ABI of passing union with long double"
6027 " has changed in GCC 4.4");
6028 }
6029 return 0;
6030 }
6031 }
6032 return words;
6033 }
6034
6035 /* Compute the alignment needed. We align all types to natural boundaries
6036 with the exception of XFmode, which is aligned to 64 bits. */
6037 if (mode != VOIDmode && mode != BLKmode)
6038 {
6039 int mode_alignment = GET_MODE_BITSIZE (mode);
6040
6041 if (mode == XFmode)
6042 mode_alignment = 128;
6043 else if (mode == XCmode)
6044 mode_alignment = 256;
6045 if (COMPLEX_MODE_P (mode))
6046 mode_alignment /= 2;
6047 /* Misaligned fields are always returned in memory. */
6048 if (bit_offset % mode_alignment)
6049 return 0;
6050 }
6051
6052 /* For V1xx modes, just use the base mode. */
6053 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6054 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6055 mode = GET_MODE_INNER (mode);
6056
6057 /* Classification of atomic types. */
6058 switch (mode)
6059 {
6060 case SDmode:
6061 case DDmode:
6062 classes[0] = X86_64_SSE_CLASS;
6063 return 1;
6064 case TDmode:
6065 classes[0] = X86_64_SSE_CLASS;
6066 classes[1] = X86_64_SSEUP_CLASS;
6067 return 2;
6068 case DImode:
6069 case SImode:
6070 case HImode:
6071 case QImode:
6072 case CSImode:
6073 case CHImode:
6074 case CQImode:
6075 {
6076 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6077
6078 if (size <= 32)
6079 {
6080 classes[0] = X86_64_INTEGERSI_CLASS;
6081 return 1;
6082 }
6083 else if (size <= 64)
6084 {
6085 classes[0] = X86_64_INTEGER_CLASS;
6086 return 1;
6087 }
6088 else if (size <= 64+32)
6089 {
6090 classes[0] = X86_64_INTEGER_CLASS;
6091 classes[1] = X86_64_INTEGERSI_CLASS;
6092 return 2;
6093 }
6094 else if (size <= 64+64)
6095 {
6096 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6097 return 2;
6098 }
6099 else
6100 gcc_unreachable ();
6101 }
6102 case CDImode:
6103 case TImode:
6104 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6105 return 2;
6106 case COImode:
6107 case OImode:
6108 /* OImode shouldn't be used directly. */
6109 gcc_unreachable ();
6110 case CTImode:
6111 return 0;
6112 case SFmode:
6113 if (!(bit_offset % 64))
6114 classes[0] = X86_64_SSESF_CLASS;
6115 else
6116 classes[0] = X86_64_SSE_CLASS;
6117 return 1;
6118 case DFmode:
6119 classes[0] = X86_64_SSEDF_CLASS;
6120 return 1;
6121 case XFmode:
6122 classes[0] = X86_64_X87_CLASS;
6123 classes[1] = X86_64_X87UP_CLASS;
6124 return 2;
6125 case TFmode:
6126 classes[0] = X86_64_SSE_CLASS;
6127 classes[1] = X86_64_SSEUP_CLASS;
6128 return 2;
6129 case SCmode:
6130 classes[0] = X86_64_SSE_CLASS;
6131 if (!(bit_offset % 64))
6132 return 1;
6133 else
6134 {
6135 static bool warned;
6136
6137 if (!warned && warn_psabi)
6138 {
6139 warned = true;
6140 inform (input_location,
6141 "the ABI of passing structure with complex float"
6142 " member has changed in GCC 4.4");
6143 }
6144 classes[1] = X86_64_SSESF_CLASS;
6145 return 2;
6146 }
6147 case DCmode:
6148 classes[0] = X86_64_SSEDF_CLASS;
6149 classes[1] = X86_64_SSEDF_CLASS;
6150 return 2;
6151 case XCmode:
6152 classes[0] = X86_64_COMPLEX_X87_CLASS;
6153 return 1;
6154 case TCmode:
6155 /* This mode is larger than 16 bytes. */
6156 return 0;
6157 case V8SFmode:
6158 case V8SImode:
6159 case V32QImode:
6160 case V16HImode:
6161 case V4DFmode:
6162 case V4DImode:
6163 classes[0] = X86_64_SSE_CLASS;
6164 classes[1] = X86_64_SSEUP_CLASS;
6165 classes[2] = X86_64_SSEUP_CLASS;
6166 classes[3] = X86_64_SSEUP_CLASS;
6167 return 4;
6168 case V4SFmode:
6169 case V4SImode:
6170 case V16QImode:
6171 case V8HImode:
6172 case V2DFmode:
6173 case V2DImode:
6174 classes[0] = X86_64_SSE_CLASS;
6175 classes[1] = X86_64_SSEUP_CLASS;
6176 return 2;
6177 case V1TImode:
6178 case V1DImode:
6179 case V2SFmode:
6180 case V2SImode:
6181 case V4HImode:
6182 case V8QImode:
6183 classes[0] = X86_64_SSE_CLASS;
6184 return 1;
6185 case BLKmode:
6186 case VOIDmode:
6187 return 0;
6188 default:
6189 gcc_assert (VECTOR_MODE_P (mode));
6190
6191 if (bytes > 16)
6192 return 0;
6193
6194 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6195
6196 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6197 classes[0] = X86_64_INTEGERSI_CLASS;
6198 else
6199 classes[0] = X86_64_INTEGER_CLASS;
6200 classes[1] = X86_64_INTEGER_CLASS;
6201 return 1 + (bytes > 8);
6202 }
6203 }
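/* Illustrative example (not part of the original sources): for a 16-byte
   structure such as

       struct s { double d; int i; };

   the classification above produces two eightbytes, classes[0] =
   X86_64_SSEDF_CLASS for the double and classes[1] =
   X86_64_INTEGERSI_CLASS for the int, so the value travels partly in an
   SSE register and partly in an integer register.  "s" is a hypothetical
   name.  */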
6204
6205 /* Examine the argument and set the number of registers required in each
6206 class. Return 0 iff the parameter should be passed in memory. */
6207 static int
6208 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6209 int *int_nregs, int *sse_nregs)
6210 {
6211 enum x86_64_reg_class regclass[MAX_CLASSES];
6212 int n = classify_argument (mode, type, regclass, 0);
6213
6214 *int_nregs = 0;
6215 *sse_nregs = 0;
6216 if (!n)
6217 return 0;
6218 for (n--; n >= 0; n--)
6219 switch (regclass[n])
6220 {
6221 case X86_64_INTEGER_CLASS:
6222 case X86_64_INTEGERSI_CLASS:
6223 (*int_nregs)++;
6224 break;
6225 case X86_64_SSE_CLASS:
6226 case X86_64_SSESF_CLASS:
6227 case X86_64_SSEDF_CLASS:
6228 (*sse_nregs)++;
6229 break;
6230 case X86_64_NO_CLASS:
6231 case X86_64_SSEUP_CLASS:
6232 break;
6233 case X86_64_X87_CLASS:
6234 case X86_64_X87UP_CLASS:
6235 if (!in_return)
6236 return 0;
6237 break;
6238 case X86_64_COMPLEX_X87_CLASS:
6239 return in_return ? 2 : 0;
6240 case X86_64_MEMORY_CLASS:
6241 gcc_unreachable ();
6242 }
6243 return 1;
6244 }
6245
6246 /* Construct container for the argument used by GCC interface. See
6247 FUNCTION_ARG for the detailed description. */
6248
6249 static rtx
6250 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6251 const_tree type, int in_return, int nintregs, int nsseregs,
6252 const int *intreg, int sse_regno)
6253 {
6254 /* The following variables hold the static issued_error state. */
6255 static bool issued_sse_arg_error;
6256 static bool issued_sse_ret_error;
6257 static bool issued_x87_ret_error;
6258
6259 enum machine_mode tmpmode;
6260 int bytes =
6261 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6262 enum x86_64_reg_class regclass[MAX_CLASSES];
6263 int n;
6264 int i;
6265 int nexps = 0;
6266 int needed_sseregs, needed_intregs;
6267 rtx exp[MAX_CLASSES];
6268 rtx ret;
6269
6270 n = classify_argument (mode, type, regclass, 0);
6271 if (!n)
6272 return NULL;
6273 if (!examine_argument (mode, type, in_return, &needed_intregs,
6274 &needed_sseregs))
6275 return NULL;
6276 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6277 return NULL;
6278
6279 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6280 some less clueful developer tries to use floating-point anyway. */
6281 if (needed_sseregs && !TARGET_SSE)
6282 {
6283 if (in_return)
6284 {
6285 if (!issued_sse_ret_error)
6286 {
6287 error ("SSE register return with SSE disabled");
6288 issued_sse_ret_error = true;
6289 }
6290 }
6291 else if (!issued_sse_arg_error)
6292 {
6293 error ("SSE register argument with SSE disabled");
6294 issued_sse_arg_error = true;
6295 }
6296 return NULL;
6297 }
6298
6299 /* Likewise, error if the ABI requires us to return values in the
6300 x87 registers and the user specified -mno-80387. */
6301 if (!TARGET_80387 && in_return)
6302 for (i = 0; i < n; i++)
6303 if (regclass[i] == X86_64_X87_CLASS
6304 || regclass[i] == X86_64_X87UP_CLASS
6305 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6306 {
6307 if (!issued_x87_ret_error)
6308 {
6309 error ("x87 register return with x87 disabled");
6310 issued_x87_ret_error = true;
6311 }
6312 return NULL;
6313 }
6314
6315 /* First construct the simple cases. Avoid SCmode, since we want to use
6316 a single register to pass this type. */
6317 if (n == 1 && mode != SCmode)
6318 switch (regclass[0])
6319 {
6320 case X86_64_INTEGER_CLASS:
6321 case X86_64_INTEGERSI_CLASS:
6322 return gen_rtx_REG (mode, intreg[0]);
6323 case X86_64_SSE_CLASS:
6324 case X86_64_SSESF_CLASS:
6325 case X86_64_SSEDF_CLASS:
6326 if (mode != BLKmode)
6327 return gen_reg_or_parallel (mode, orig_mode,
6328 SSE_REGNO (sse_regno));
6329 break;
6330 case X86_64_X87_CLASS:
6331 case X86_64_COMPLEX_X87_CLASS:
6332 return gen_rtx_REG (mode, FIRST_STACK_REG);
6333 case X86_64_NO_CLASS:
6334 /* Zero sized array, struct or class. */
6335 return NULL;
6336 default:
6337 gcc_unreachable ();
6338 }
6339 if (n == 2 && regclass[0] == X86_64_SSE_CLASS
6340 && regclass[1] == X86_64_SSEUP_CLASS && mode != BLKmode)
6341 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6342 if (n == 4
6343 && regclass[0] == X86_64_SSE_CLASS
6344 && regclass[1] == X86_64_SSEUP_CLASS
6345 && regclass[2] == X86_64_SSEUP_CLASS
6346 && regclass[3] == X86_64_SSEUP_CLASS
6347 && mode != BLKmode)
6348 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6349
6350 if (n == 2
6351 && regclass[0] == X86_64_X87_CLASS && regclass[1] == X86_64_X87UP_CLASS)
6352 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6353 if (n == 2 && regclass[0] == X86_64_INTEGER_CLASS
6354 && regclass[1] == X86_64_INTEGER_CLASS
6355 && (mode == CDImode || mode == TImode || mode == TFmode)
6356 && intreg[0] + 1 == intreg[1])
6357 return gen_rtx_REG (mode, intreg[0]);
6358
6359 /* Otherwise figure out the entries of the PARALLEL. */
6360 for (i = 0; i < n; i++)
6361 {
6362 int pos;
6363
6364 switch (regclass[i])
6365 {
6366 case X86_64_NO_CLASS:
6367 break;
6368 case X86_64_INTEGER_CLASS:
6369 case X86_64_INTEGERSI_CLASS:
6370 /* Merge TImodes on aligned occasions here too. */
6371 if (i * 8 + 8 > bytes)
6372 tmpmode = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6373 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6374 tmpmode = SImode;
6375 else
6376 tmpmode = DImode;
6377 /* We've requested 24 bytes for which we don't have a mode. Use DImode. */
6378 if (tmpmode == BLKmode)
6379 tmpmode = DImode;
6380 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6381 gen_rtx_REG (tmpmode, *intreg),
6382 GEN_INT (i*8));
6383 intreg++;
6384 break;
6385 case X86_64_SSESF_CLASS:
6386 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6387 gen_rtx_REG (SFmode,
6388 SSE_REGNO (sse_regno)),
6389 GEN_INT (i*8));
6390 sse_regno++;
6391 break;
6392 case X86_64_SSEDF_CLASS:
6393 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6394 gen_rtx_REG (DFmode,
6395 SSE_REGNO (sse_regno)),
6396 GEN_INT (i*8));
6397 sse_regno++;
6398 break;
6399 case X86_64_SSE_CLASS:
6400 pos = i;
6401 switch (n)
6402 {
6403 case 1:
6404 tmpmode = DImode;
6405 break;
6406 case 2:
6407 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6408 {
6409 tmpmode = TImode;
6410 i++;
6411 }
6412 else
6413 tmpmode = DImode;
6414 break;
6415 case 4:
6416 gcc_assert (i == 0
6417 && regclass[1] == X86_64_SSEUP_CLASS
6418 && regclass[2] == X86_64_SSEUP_CLASS
6419 && regclass[3] == X86_64_SSEUP_CLASS);
6420 tmpmode = OImode;
6421 i += 3;
6422 break;
6423 default:
6424 gcc_unreachable ();
6425 }
6426 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6427 gen_rtx_REG (tmpmode,
6428 SSE_REGNO (sse_regno)),
6429 GEN_INT (pos*8));
6430 sse_regno++;
6431 break;
6432 default:
6433 gcc_unreachable ();
6434 }
6435 }
6436
6437 /* Empty aligned struct, union or class. */
6438 if (nexps == 0)
6439 return NULL;
6440
6441 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6442 for (i = 0; i < nexps; i++)
6443 XVECEXP (ret, 0, i) = exp [i];
6444 return ret;
6445 }
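/* Illustrative example (not part of the original sources): for the
   two-eightbyte struct { double d; int i; } discussed above, the routine
   builds a PARALLEL with an EXPR_LIST placing a DFmode SSE register at
   byte offset 0 and an EXPR_LIST placing an SImode integer register at
   byte offset 8, matching the classification returned by
   classify_argument.  */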
6446
6447 /* Update the data in CUM to advance over an argument of mode MODE
6448 and data type TYPE. (TYPE is null for libcalls where that information
6449 may not be available.) */
6450
6451 static void
6452 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6453 const_tree type, HOST_WIDE_INT bytes,
6454 HOST_WIDE_INT words)
6455 {
6456 switch (mode)
6457 {
6458 default:
6459 break;
6460
6461 case BLKmode:
6462 if (bytes < 0)
6463 break;
6464 /* FALLTHRU */
6465
6466 case DImode:
6467 case SImode:
6468 case HImode:
6469 case QImode:
6470 cum->words += words;
6471 cum->nregs -= words;
6472 cum->regno += words;
6473
6474 if (cum->nregs <= 0)
6475 {
6476 cum->nregs = 0;
6477 cum->regno = 0;
6478 }
6479 break;
6480
6481 case OImode:
6482 /* OImode shouldn't be used directly. */
6483 gcc_unreachable ();
6484
6485 case DFmode:
6486 if (cum->float_in_sse < 2)
6487 break;
6488 case SFmode:
6489 if (cum->float_in_sse < 1)
6490 break;
6491 /* FALLTHRU */
6492
6493 case V8SFmode:
6494 case V8SImode:
6495 case V32QImode:
6496 case V16HImode:
6497 case V4DFmode:
6498 case V4DImode:
6499 case TImode:
6500 case V16QImode:
6501 case V8HImode:
6502 case V4SImode:
6503 case V2DImode:
6504 case V4SFmode:
6505 case V2DFmode:
6506 if (!type || !AGGREGATE_TYPE_P (type))
6507 {
6508 cum->sse_words += words;
6509 cum->sse_nregs -= 1;
6510 cum->sse_regno += 1;
6511 if (cum->sse_nregs <= 0)
6512 {
6513 cum->sse_nregs = 0;
6514 cum->sse_regno = 0;
6515 }
6516 }
6517 break;
6518
6519 case V8QImode:
6520 case V4HImode:
6521 case V2SImode:
6522 case V2SFmode:
6523 case V1TImode:
6524 case V1DImode:
6525 if (!type || !AGGREGATE_TYPE_P (type))
6526 {
6527 cum->mmx_words += words;
6528 cum->mmx_nregs -= 1;
6529 cum->mmx_regno += 1;
6530 if (cum->mmx_nregs <= 0)
6531 {
6532 cum->mmx_nregs = 0;
6533 cum->mmx_regno = 0;
6534 }
6535 }
6536 break;
6537 }
6538 }
6539
6540 static void
6541 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6542 const_tree type, HOST_WIDE_INT words, bool named)
6543 {
6544 int int_nregs, sse_nregs;
6545
6546 /* Unnamed 256bit vector mode parameters are passed on stack. */
6547 if (!named && VALID_AVX256_REG_MODE (mode))
6548 return;
6549
6550 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6551 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6552 {
6553 cum->nregs -= int_nregs;
6554 cum->sse_nregs -= sse_nregs;
6555 cum->regno += int_nregs;
6556 cum->sse_regno += sse_nregs;
6557 }
6558 else
6559 {
6560 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6561 cum->words = (cum->words + align - 1) & ~(align - 1);
6562 cum->words += words;
6563 }
6564 }
6565
6566 static void
6567 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6568 HOST_WIDE_INT words)
6569 {
6570 /* Otherwise, this should be passed indirectly. */
6571 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6572
6573 cum->words += words;
6574 if (cum->nregs > 0)
6575 {
6576 cum->nregs -= 1;
6577 cum->regno += 1;
6578 }
6579 }
6580
6581 /* Update the data in CUM to advance over an argument of mode MODE and
6582 data type TYPE. (TYPE is null for libcalls where that information
6583 may not be available.) */
6584
6585 static void
6586 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6587 const_tree type, bool named)
6588 {
6589 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6590 HOST_WIDE_INT bytes, words;
6591
6592 if (mode == BLKmode)
6593 bytes = int_size_in_bytes (type);
6594 else
6595 bytes = GET_MODE_SIZE (mode);
6596 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6597
6598 if (type)
6599 mode = type_natural_mode (type, NULL);
6600
6601 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6602 function_arg_advance_ms_64 (cum, bytes, words);
6603 else if (TARGET_64BIT)
6604 function_arg_advance_64 (cum, mode, type, words, named);
6605 else
6606 function_arg_advance_32 (cum, mode, type, bytes, words);
6607 }
6608
6609 /* Define where to put the arguments to a function.
6610 Value is zero to push the argument on the stack,
6611 or a hard register in which to store the argument.
6612
6613 MODE is the argument's machine mode.
6614 TYPE is the data type of the argument (as a tree).
6615 This is null for libcalls where that information may
6616 not be available.
6617 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6618 the preceding args and about the function being called.
6619 NAMED is nonzero if this argument is a named parameter
6620 (otherwise it is an extra parameter matching an ellipsis). */
6621
6622 static rtx
6623 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6624 enum machine_mode orig_mode, const_tree type,
6625 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6626 {
6627 static bool warnedsse, warnedmmx;
6628
6629 /* Avoid the AL settings for the Unix64 ABI. */
6630 if (mode == VOIDmode)
6631 return constm1_rtx;
6632
6633 switch (mode)
6634 {
6635 default:
6636 break;
6637
6638 case BLKmode:
6639 if (bytes < 0)
6640 break;
6641 /* FALLTHRU */
6642 case DImode:
6643 case SImode:
6644 case HImode:
6645 case QImode:
6646 if (words <= cum->nregs)
6647 {
6648 int regno = cum->regno;
6649
6650 /* Fastcall allocates the first two DWORD (SImode) or
6651 smaller arguments to ECX and EDX if the argument isn't an
6652 aggregate type. */
6653 if (cum->fastcall)
6654 {
6655 if (mode == BLKmode
6656 || mode == DImode
6657 || (type && AGGREGATE_TYPE_P (type)))
6658 break;
6659
6660 /* ECX, not EAX, is the first allocated register. */
6661 if (regno == AX_REG)
6662 regno = CX_REG;
6663 }
6664 return gen_rtx_REG (mode, regno);
6665 }
6666 break;
6667
6668 case DFmode:
6669 if (cum->float_in_sse < 2)
6670 break;
6671 case SFmode:
6672 if (cum->float_in_sse < 1)
6673 break;
6674 /* FALLTHRU */
6675 case TImode:
6676 /* In 32-bit mode, we pass TImode in XMM registers. */
6677 case V16QImode:
6678 case V8HImode:
6679 case V4SImode:
6680 case V2DImode:
6681 case V4SFmode:
6682 case V2DFmode:
6683 if (!type || !AGGREGATE_TYPE_P (type))
6684 {
6685 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6686 {
6687 warnedsse = true;
6688 warning (0, "SSE vector argument without SSE enabled "
6689 "changes the ABI");
6690 }
6691 if (cum->sse_nregs)
6692 return gen_reg_or_parallel (mode, orig_mode,
6693 cum->sse_regno + FIRST_SSE_REG);
6694 }
6695 break;
6696
6697 case OImode:
6698 /* OImode shouldn't be used directly. */
6699 gcc_unreachable ();
6700
6701 case V8SFmode:
6702 case V8SImode:
6703 case V32QImode:
6704 case V16HImode:
6705 case V4DFmode:
6706 case V4DImode:
6707 if (!type || !AGGREGATE_TYPE_P (type))
6708 {
6709 if (cum->sse_nregs)
6710 return gen_reg_or_parallel (mode, orig_mode,
6711 cum->sse_regno + FIRST_SSE_REG);
6712 }
6713 break;
6714
6715 case V8QImode:
6716 case V4HImode:
6717 case V2SImode:
6718 case V2SFmode:
6719 case V1TImode:
6720 case V1DImode:
6721 if (!type || !AGGREGATE_TYPE_P (type))
6722 {
6723 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6724 {
6725 warnedmmx = true;
6726 warning (0, "MMX vector argument without MMX enabled "
6727 "changes the ABI");
6728 }
6729 if (cum->mmx_nregs)
6730 return gen_reg_or_parallel (mode, orig_mode,
6731 cum->mmx_regno + FIRST_MMX_REG);
6732 }
6733 break;
6734 }
6735
6736 return NULL_RTX;
6737 }
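/* Illustrative example (not part of the original sources): for a 32-bit
   fastcall function

       int __attribute__ ((fastcall)) f (int a, int b, int c);

   the code above puts "a" in ECX (via the AX_REG -> CX_REG remapping) and
   "b" in EDX, while "c" falls back to the stack once the two register
   slots are used up.  "f" is a hypothetical name.  */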
6738
6739 static rtx
6740 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6741 enum machine_mode orig_mode, const_tree type, bool named)
6742 {
6743 /* Handle a hidden AL argument containing the number of registers
6744 for varargs x86-64 functions. */
6745 if (mode == VOIDmode)
6746 return GEN_INT (cum->maybe_vaarg
6747 ? (cum->sse_nregs < 0
6748 ? X86_64_SSE_REGPARM_MAX
6749 : cum->sse_regno)
6750 : -1);
6751
6752 switch (mode)
6753 {
6754 default:
6755 break;
6756
6757 case V8SFmode:
6758 case V8SImode:
6759 case V32QImode:
6760 case V16HImode:
6761 case V4DFmode:
6762 case V4DImode:
6763 /* Unnamed 256bit vector mode parameters are passed on stack. */
6764 if (!named)
6765 return NULL;
6766 break;
6767 }
6768
6769 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6770 cum->sse_nregs,
6771 &x86_64_int_parameter_registers [cum->regno],
6772 cum->sse_regno);
6773 }
6774
6775 static rtx
6776 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6777 enum machine_mode orig_mode, bool named,
6778 HOST_WIDE_INT bytes)
6779 {
6780 unsigned int regno;
6781
6782 /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call.
6783 We use value of -2 to specify that current function call is MSABI. */
6784 if (mode == VOIDmode)
6785 return GEN_INT (-2);
6786
6787 /* If we've run out of registers, it goes on the stack. */
6788 if (cum->nregs == 0)
6789 return NULL_RTX;
6790
6791 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6792
6793 /* Only floating point modes are passed in anything but integer regs. */
6794 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6795 {
6796 if (named)
6797 regno = cum->regno + FIRST_SSE_REG;
6798 else
6799 {
6800 rtx t1, t2;
6801
6802 /* Unnamed floating parameters are passed in both the
6803 SSE and integer registers. */
6804 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6805 t2 = gen_rtx_REG (mode, regno);
6806 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6807 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6808 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6809 }
6810 }
6811 /* Handle aggregated types passed in register. */
6812 if (orig_mode == BLKmode)
6813 {
6814 if (bytes > 0 && bytes <= 8)
6815 mode = (bytes > 4 ? DImode : SImode);
6816 if (mode == BLKmode)
6817 mode = DImode;
6818 }
6819
6820 return gen_reg_or_parallel (mode, orig_mode, regno);
6821 }
6822
6823 /* Return where to put the arguments to a function.
6824 Return zero to push the argument on the stack,
 or a hard register in which to store the argument.
6825
6826 MODE is the argument's machine mode. TYPE is the data type of the
6827 argument. It is null for libcalls where that information may not be
6828 available. CUM gives information about the preceding args and about
6829 the function being called. NAMED is nonzero if this argument is a
6830 named parameter (otherwise it is an extra parameter matching an
6831 ellipsis). */
6832
6833 static rtx
6834 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6835 const_tree type, bool named)
6836 {
6837 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6838 enum machine_mode mode = omode;
6839 HOST_WIDE_INT bytes, words;
6840 rtx arg;
6841
6842 if (mode == BLKmode)
6843 bytes = int_size_in_bytes (type);
6844 else
6845 bytes = GET_MODE_SIZE (mode);
6846 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6847
6848 /* To simplify the code below, represent vector types with a vector mode
6849 even if MMX/SSE are not active. */
6850 if (type && TREE_CODE (type) == VECTOR_TYPE)
6851 mode = type_natural_mode (type, cum);
6852
6853 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6854 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6855 else if (TARGET_64BIT)
6856 arg = function_arg_64 (cum, mode, omode, type, named);
6857 else
6858 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6859
6860 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6861 {
6862 /* This argument uses 256bit AVX modes. */
6863 if (cum->caller)
6864 cfun->machine->callee_pass_avx256_p = true;
6865 else
6866 cfun->machine->caller_pass_avx256_p = true;
6867 }
6868
6869 return arg;
6870 }
6871
6872 /* A C expression that indicates when an argument must be passed by
6873 reference. If nonzero for an argument, a copy of that argument is
6874 made in memory and a pointer to the argument is passed instead of
6875 the argument itself. The pointer is passed in whatever way is
6876 appropriate for passing a pointer to that type. */
6877
6878 static bool
6879 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6880 enum machine_mode mode ATTRIBUTE_UNUSED,
6881 const_tree type, bool named ATTRIBUTE_UNUSED)
6882 {
6883 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6884
6885 /* See Windows x64 Software Convention. */
6886 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6887 {
6888 int msize = (int) GET_MODE_SIZE (mode);
6889 if (type)
6890 {
6891 /* Arrays are passed by reference. */
6892 if (TREE_CODE (type) == ARRAY_TYPE)
6893 return true;
6894
6895 if (AGGREGATE_TYPE_P (type))
6896 {
6897 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6898 are passed by reference. */
6899 msize = int_size_in_bytes (type);
6900 }
6901 }
6902
6903 /* __m128 and anything else whose size is not 1, 2, 4 or 8 bytes is passed by reference. */
6904 switch (msize) {
6905 case 1: case 2: case 4: case 8:
6906 break;
6907 default:
6908 return true;
6909 }
6910 }
6911 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6912 return 1;
6913
6914 return 0;
6915 }
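/* Illustrative note (not part of the original source): a minimal sketch of
   how the Win64 rule above plays out on the caller side.  The struct names
   below are hypothetical and only serve as examples.

     struct s8  { long long a; };   // 8 bytes  -> passed by value (possibly in a register)
     struct s12 { int a, b, c; };   // 12 bytes -> caller makes a copy, a pointer is passed
     void callee (struct s8 x, struct s12 y, __m128 z);
     // x arrives by value; y and z arrive as pointers to caller-side copies.  */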
6916
6917 /* Return true when TYPE should be 128bit aligned for 32bit argument
6918 passing ABI. XXX: This function is obsolete and is only used for
6919 checking psABI compatibility with previous versions of GCC. */
6920
6921 static bool
6922 ix86_compat_aligned_value_p (const_tree type)
6923 {
6924 enum machine_mode mode = TYPE_MODE (type);
6925 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6926 || mode == TDmode
6927 || mode == TFmode
6928 || mode == TCmode)
6929 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6930 return true;
6931 if (TYPE_ALIGN (type) < 128)
6932 return false;
6933
6934 if (AGGREGATE_TYPE_P (type))
6935 {
6936 /* Walk the aggregates recursively. */
6937 switch (TREE_CODE (type))
6938 {
6939 case RECORD_TYPE:
6940 case UNION_TYPE:
6941 case QUAL_UNION_TYPE:
6942 {
6943 tree field;
6944
6945 /* Walk all the structure fields. */
6946 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6947 {
6948 if (TREE_CODE (field) == FIELD_DECL
6949 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
6950 return true;
6951 }
6952 break;
6953 }
6954
6955 case ARRAY_TYPE:
6956 /* Just in case some languages pass arrays by value. */
6957 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
6958 return true;
6959 break;
6960
6961 default:
6962 gcc_unreachable ();
6963 }
6964 }
6965 return false;
6966 }
6967
6968 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
6969 XXX: This function is obsolete and is only used for checking psABI
6970 compatibility with previous versions of GCC. */
6971
6972 static unsigned int
6973 ix86_compat_function_arg_boundary (enum machine_mode mode,
6974 const_tree type, unsigned int align)
6975 {
6976 /* In 32bit, only _Decimal128 and __float128 are aligned to their
6977 natural boundaries. */
6978 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
6979 {
6980 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
6981 make an exception for SSE modes since these require 128bit
6982 alignment.
6983
6984 The handling here differs from field_alignment. ICC aligns MMX
6985 arguments to 4 byte boundaries, while structure fields are aligned
6986 to 8 byte boundaries. */
6987 if (!type)
6988 {
6989 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
6990 align = PARM_BOUNDARY;
6991 }
6992 else
6993 {
6994 if (!ix86_compat_aligned_value_p (type))
6995 align = PARM_BOUNDARY;
6996 }
6997 }
6998 if (align > BIGGEST_ALIGNMENT)
6999 align = BIGGEST_ALIGNMENT;
7000 return align;
7001 }
7002
7003 /* Return true when TYPE should be 128bit aligned for 32bit argument
7004 passing ABI. */
7005
7006 static bool
7007 ix86_contains_aligned_value_p (const_tree type)
7008 {
7009 enum machine_mode mode = TYPE_MODE (type);
7010
7011 if (mode == XFmode || mode == XCmode)
7012 return false;
7013
7014 if (TYPE_ALIGN (type) < 128)
7015 return false;
7016
7017 if (AGGREGATE_TYPE_P (type))
7018 {
7019 /* Walk the aggregates recursively. */
7020 switch (TREE_CODE (type))
7021 {
7022 case RECORD_TYPE:
7023 case UNION_TYPE:
7024 case QUAL_UNION_TYPE:
7025 {
7026 tree field;
7027
7028 /* Walk all the structure fields. */
7029 for (field = TYPE_FIELDS (type);
7030 field;
7031 field = DECL_CHAIN (field))
7032 {
7033 if (TREE_CODE (field) == FIELD_DECL
7034 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7035 return true;
7036 }
7037 break;
7038 }
7039
7040 case ARRAY_TYPE:
7041 /* Just in case some languages pass arrays by value. */
7042 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7043 return true;
7044 break;
7045
7046 default:
7047 gcc_unreachable ();
7048 }
7049 }
7050 else
7051 return TYPE_ALIGN (type) >= 128;
7052
7053 return false;
7054 }
7055
7056 /* Gives the alignment boundary, in bits, of an argument with the
7057 specified mode and type. */
7058
7059 static unsigned int
7060 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7061 {
7062 unsigned int align;
7063 if (type)
7064 {
7065 /* The main variant type is used for argument passing, so convert
7066 TYPE to its main variant. */
7067 type = TYPE_MAIN_VARIANT (type);
7068 align = TYPE_ALIGN (type);
7069 }
7070 else
7071 align = GET_MODE_ALIGNMENT (mode);
7072 if (align < PARM_BOUNDARY)
7073 align = PARM_BOUNDARY;
7074 else
7075 {
7076 static bool warned;
7077 unsigned int saved_align = align;
7078
7079 if (!TARGET_64BIT)
7080 {
7081 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7082 if (!type)
7083 {
7084 if (mode == XFmode || mode == XCmode)
7085 align = PARM_BOUNDARY;
7086 }
7087 else if (!ix86_contains_aligned_value_p (type))
7088 align = PARM_BOUNDARY;
7089
7090 if (align < 128)
7091 align = PARM_BOUNDARY;
7092 }
7093
7094 if (warn_psabi
7095 && !warned
7096 && align != ix86_compat_function_arg_boundary (mode, type,
7097 saved_align))
7098 {
7099 warned = true;
7100 inform (input_location,
7101 "The ABI for passing parameters with %d-byte"
7102 " alignment has changed in GCC 4.6",
7103 align / BITS_PER_UNIT);
7104 }
7105 }
7106
7107 return align;
7108 }
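/* Illustrative note (not part of the original source): a rough sketch of
   what the boundary computation above yields under common settings; the
   exact values depend on the target flags in effect.

     // 32-bit, SSE enabled:
     //   double             -> PARM_BOUNDARY (32 bits, i.e. 4 bytes)
     //   __m128 / V4SFmode  -> 128 bits (16 bytes)
     // 64-bit SysV:
     //   __m256 / V8SFmode  -> 256 bits (32 bytes)  */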
7109
7110 /* Return true if N is a possible register number of function value. */
7111
7112 static bool
7113 ix86_function_value_regno_p (const unsigned int regno)
7114 {
7115 switch (regno)
7116 {
7117 case AX_REG:
7118 return true;
7119
7120 case FIRST_FLOAT_REG:
7121 /* TODO: The function should depend on current function ABI but
7122 builtins.c would need updating then. Therefore we use the
7123 default ABI. */
7124 if (TARGET_64BIT && ix86_abi == MS_ABI)
7125 return false;
7126 return TARGET_FLOAT_RETURNS_IN_80387;
7127
7128 case FIRST_SSE_REG:
7129 return TARGET_SSE;
7130
7131 case FIRST_MMX_REG:
7132 if (TARGET_MACHO || TARGET_64BIT)
7133 return false;
7134 return TARGET_MMX;
7135 }
7136
7137 return false;
7138 }
7139
7140 /* Define how to find the value returned by a function.
7141 VALTYPE is the data type of the value (as a tree).
7142 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7143 otherwise, FUNC is 0. */
7144
7145 static rtx
7146 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7147 const_tree fntype, const_tree fn)
7148 {
7149 unsigned int regno;
7150
7151 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7152 we normally prevent this case when mmx is not available. However
7153 some ABIs may require the result to be returned like DImode. */
7154 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7155 regno = FIRST_MMX_REG;
7156
7157 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7158 we prevent this case when sse is not available. However some ABIs
7159 may require the result to be returned like integer TImode. */
7160 else if (mode == TImode
7161 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7162 regno = FIRST_SSE_REG;
7163
7164 /* 32-byte vector modes in %ymm0. */
7165 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7166 regno = FIRST_SSE_REG;
7167
7168 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7169 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7170 regno = FIRST_FLOAT_REG;
7171 else
7172 /* Most things go in %eax. */
7173 regno = AX_REG;
7174
7175 /* Override FP return register with %xmm0 for local functions when
7176 SSE math is enabled or for functions with sseregparm attribute. */
7177 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7178 {
7179 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7180 if ((sse_level >= 1 && mode == SFmode)
7181 || (sse_level == 2 && mode == DFmode))
7182 regno = FIRST_SSE_REG;
7183 }
7184
7185 /* OImode shouldn't be used directly. */
7186 gcc_assert (mode != OImode);
7187
7188 return gen_rtx_REG (orig_mode, regno);
7189 }
7190
7191 static rtx
7192 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7193 const_tree valtype)
7194 {
7195 rtx ret;
7196
7197 /* Handle libcalls, which don't provide a type node. */
7198 if (valtype == NULL)
7199 {
7200 unsigned int regno;
7201
7202 switch (mode)
7203 {
7204 case SFmode:
7205 case SCmode:
7206 case DFmode:
7207 case DCmode:
7208 case TFmode:
7209 case SDmode:
7210 case DDmode:
7211 case TDmode:
7212 regno = FIRST_SSE_REG;
7213 break;
7214 case XFmode:
7215 case XCmode:
7216 regno = FIRST_FLOAT_REG;
7217 break;
7218 case TCmode:
7219 return NULL;
7220 default:
7221 regno = AX_REG;
7222 }
7223
7224 return gen_rtx_REG (mode, regno);
7225 }
7226 else if (POINTER_TYPE_P (valtype))
7227 {
7228 /* Pointers are always returned in Pmode. */
7229 mode = Pmode;
7230 }
7231
7232 ret = construct_container (mode, orig_mode, valtype, 1,
7233 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7234 x86_64_int_return_registers, 0);
7235
7236 /* For zero-sized structures, construct_container returns NULL, but we
7237 need to keep the rest of the compiler happy by returning a meaningful value. */
7238 if (!ret)
7239 ret = gen_rtx_REG (orig_mode, AX_REG);
7240
7241 return ret;
7242 }
7243
7244 static rtx
7245 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7246 {
7247 unsigned int regno = AX_REG;
7248
7249 if (TARGET_SSE)
7250 {
7251 switch (GET_MODE_SIZE (mode))
7252 {
7253 case 16:
7254 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7255 && !COMPLEX_MODE_P (mode))
7256 regno = FIRST_SSE_REG;
7257 break;
7258 case 8:
7259 case 4:
7260 if (mode == SFmode || mode == DFmode)
7261 regno = FIRST_SSE_REG;
7262 break;
7263 default:
7264 break;
7265 }
7266 }
7267 return gen_rtx_REG (orig_mode, regno);
7268 }
7269
7270 static rtx
7271 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7272 enum machine_mode orig_mode, enum machine_mode mode)
7273 {
7274 const_tree fn, fntype;
7275
7276 fn = NULL_TREE;
7277 if (fntype_or_decl && DECL_P (fntype_or_decl))
7278 fn = fntype_or_decl;
7279 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7280
7281 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7282 return function_value_ms_64 (orig_mode, mode);
7283 else if (TARGET_64BIT)
7284 return function_value_64 (orig_mode, mode, valtype);
7285 else
7286 return function_value_32 (orig_mode, mode, fntype, fn);
7287 }
7288
7289 static rtx
7290 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7291 bool outgoing ATTRIBUTE_UNUSED)
7292 {
7293 enum machine_mode mode, orig_mode;
7294
7295 orig_mode = TYPE_MODE (valtype);
7296 mode = type_natural_mode (valtype, NULL);
7297 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7298 }
7299
7300 /* Pointer function arguments and return values are promoted to Pmode. */
7301
7302 static enum machine_mode
7303 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7304 int *punsignedp, const_tree fntype,
7305 int for_return)
7306 {
7307 if (type != NULL_TREE && POINTER_TYPE_P (type))
7308 {
7309 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7310 return Pmode;
7311 }
7312 return default_promote_function_mode (type, mode, punsignedp, fntype,
7313 for_return);
7314 }
7315
7316 rtx
7317 ix86_libcall_value (enum machine_mode mode)
7318 {
7319 return ix86_function_value_1 (NULL, NULL, mode, mode);
7320 }
7321
7322 /* Return true iff type is returned in memory. */
7323
7324 static bool ATTRIBUTE_UNUSED
7325 return_in_memory_32 (const_tree type, enum machine_mode mode)
7326 {
7327 HOST_WIDE_INT size;
7328
7329 if (mode == BLKmode)
7330 return true;
7331
7332 size = int_size_in_bytes (type);
7333
7334 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7335 return false;
7336
7337 if (VECTOR_MODE_P (mode) || mode == TImode)
7338 {
7339 /* User-created vectors small enough to fit in EAX. */
7340 if (size < 8)
7341 return false;
7342
7343 /* MMX/3dNow values are returned in MM0,
7344 except when it doesn't exist or the ABI prescribes otherwise. */
7345 if (size == 8)
7346 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7347
7348 /* SSE values are returned in XMM0, except when it doesn't exist. */
7349 if (size == 16)
7350 return !TARGET_SSE;
7351
7352 /* AVX values are returned in YMM0, except when it doesn't exist. */
7353 if (size == 32)
7354 return !TARGET_AVX;
7355 }
7356
7357 if (mode == XFmode)
7358 return false;
7359
7360 if (size > 12)
7361 return true;
7362
7363 /* OImode shouldn't be used directly. */
7364 gcc_assert (mode != OImode);
7365
7366 return false;
7367 }
7368
7369 static bool ATTRIBUTE_UNUSED
7370 return_in_memory_64 (const_tree type, enum machine_mode mode)
7371 {
7372 int needed_intregs, needed_sseregs;
7373 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7374 }
7375
7376 static bool ATTRIBUTE_UNUSED
7377 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7378 {
7379 HOST_WIDE_INT size = int_size_in_bytes (type);
7380
7381 /* __m128 is returned in xmm0. */
7382 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7383 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7384 return false;
7385
7386 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7387 return size != 1 && size != 2 && size != 4 && size != 8;
7388 }
7389
7390 static bool
7391 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7392 {
7393 #ifdef SUBTARGET_RETURN_IN_MEMORY
7394 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7395 #else
7396 const enum machine_mode mode = type_natural_mode (type, NULL);
7397
7398 if (TARGET_64BIT)
7399 {
7400 if (ix86_function_type_abi (fntype) == MS_ABI)
7401 return return_in_memory_ms_64 (type, mode);
7402 else
7403 return return_in_memory_64 (type, mode);
7404 }
7405 else
7406 return return_in_memory_32 (type, mode);
7407 #endif
7408 }
7409
7410 /* When returning SSE vector types, we have a choice of either
7411 (1) being ABI incompatible with a -march switch, or
7412 (2) generating an error.
7413 Given no good solution, I think the safest thing is one warning.
7414 The user won't be able to use -Werror, but....
7415
7416 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7417 called in response to actually generating a caller or callee that
7418 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7419 via aggregate_value_p for general type probing from tree-ssa. */
7420
7421 static rtx
7422 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7423 {
7424 static bool warnedsse, warnedmmx;
7425
7426 if (!TARGET_64BIT && type)
7427 {
7428 /* Look at the return type of the function, not the function type. */
7429 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7430
7431 if (!TARGET_SSE && !warnedsse)
7432 {
7433 if (mode == TImode
7434 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7435 {
7436 warnedsse = true;
7437 warning (0, "SSE vector return without SSE enabled "
7438 "changes the ABI");
7439 }
7440 }
7441
7442 if (!TARGET_MMX && !warnedmmx)
7443 {
7444 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7445 {
7446 warnedmmx = true;
7447 warning (0, "MMX vector return without MMX enabled "
7448 "changes the ABI");
7449 }
7450 }
7451 }
7452
7453 return NULL;
7454 }
7455
7456 \f
7457 /* Create the va_list data type. */
7458
7459 /* Returns the calling-convention-specific va_list data type.
7460 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7461
7462 static tree
7463 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7464 {
7465 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7466
7467 /* For i386 we use plain pointer to argument area. */
7468 if (!TARGET_64BIT || abi == MS_ABI)
7469 return build_pointer_type (char_type_node);
7470
7471 record = lang_hooks.types.make_type (RECORD_TYPE);
7472 type_decl = build_decl (BUILTINS_LOCATION,
7473 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7474
7475 f_gpr = build_decl (BUILTINS_LOCATION,
7476 FIELD_DECL, get_identifier ("gp_offset"),
7477 unsigned_type_node);
7478 f_fpr = build_decl (BUILTINS_LOCATION,
7479 FIELD_DECL, get_identifier ("fp_offset"),
7480 unsigned_type_node);
7481 f_ovf = build_decl (BUILTINS_LOCATION,
7482 FIELD_DECL, get_identifier ("overflow_arg_area"),
7483 ptr_type_node);
7484 f_sav = build_decl (BUILTINS_LOCATION,
7485 FIELD_DECL, get_identifier ("reg_save_area"),
7486 ptr_type_node);
7487
7488 va_list_gpr_counter_field = f_gpr;
7489 va_list_fpr_counter_field = f_fpr;
7490
7491 DECL_FIELD_CONTEXT (f_gpr) = record;
7492 DECL_FIELD_CONTEXT (f_fpr) = record;
7493 DECL_FIELD_CONTEXT (f_ovf) = record;
7494 DECL_FIELD_CONTEXT (f_sav) = record;
7495
7496 TYPE_STUB_DECL (record) = type_decl;
7497 TYPE_NAME (record) = type_decl;
7498 TYPE_FIELDS (record) = f_gpr;
7499 DECL_CHAIN (f_gpr) = f_fpr;
7500 DECL_CHAIN (f_fpr) = f_ovf;
7501 DECL_CHAIN (f_ovf) = f_sav;
7502
7503 layout_type (record);
7504
7505 /* The correct type is an array type of one element. */
7506 return build_array_type (record, build_index_type (size_zero_node));
7507 }
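/* Illustrative note (not part of the original source): the record built
   above corresponds to the well-known x86-64 SysV psABI va_list layout,
   shown here only as a sketch:

     typedef struct __va_list_tag {
       unsigned int gp_offset;       // offset into reg_save_area for GPR args
       unsigned int fp_offset;       // offset into reg_save_area for XMM args
       void *overflow_arg_area;      // next argument passed on the stack
       void *reg_save_area;          // start of the register save area
     } __va_list_tag;
     typedef __va_list_tag __builtin_va_list[1];   // array type of one element  */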
7508
7509 /* Set up the builtin va_list data type and, for 64-bit, the additional
7510 calling-convention-specific va_list data types. */
7511
7512 static tree
7513 ix86_build_builtin_va_list (void)
7514 {
7515 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7516
7517 /* Initialize abi specific va_list builtin types. */
7518 if (TARGET_64BIT)
7519 {
7520 tree t;
7521 if (ix86_abi == MS_ABI)
7522 {
7523 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7524 if (TREE_CODE (t) != RECORD_TYPE)
7525 t = build_variant_type_copy (t);
7526 sysv_va_list_type_node = t;
7527 }
7528 else
7529 {
7530 t = ret;
7531 if (TREE_CODE (t) != RECORD_TYPE)
7532 t = build_variant_type_copy (t);
7533 sysv_va_list_type_node = t;
7534 }
7535 if (ix86_abi != MS_ABI)
7536 {
7537 t = ix86_build_builtin_va_list_abi (MS_ABI);
7538 if (TREE_CODE (t) != RECORD_TYPE)
7539 t = build_variant_type_copy (t);
7540 ms_va_list_type_node = t;
7541 }
7542 else
7543 {
7544 t = ret;
7545 if (TREE_CODE (t) != RECORD_TYPE)
7546 t = build_variant_type_copy (t);
7547 ms_va_list_type_node = t;
7548 }
7549 }
7550
7551 return ret;
7552 }
7553
7554 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7555
7556 static void
7557 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7558 {
7559 rtx save_area, mem;
7560 alias_set_type set;
7561 int i, max;
7562
7563 /* GPR size of varargs save area. */
7564 if (cfun->va_list_gpr_size)
7565 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7566 else
7567 ix86_varargs_gpr_size = 0;
7568
7569 /* FPR size of varargs save area. We don't need it if we don't pass
7570 anything in SSE registers. */
7571 if (TARGET_SSE && cfun->va_list_fpr_size)
7572 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7573 else
7574 ix86_varargs_fpr_size = 0;
7575
7576 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7577 return;
7578
7579 save_area = frame_pointer_rtx;
7580 set = get_varargs_alias_set ();
7581
7582 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7583 if (max > X86_64_REGPARM_MAX)
7584 max = X86_64_REGPARM_MAX;
7585
7586 for (i = cum->regno; i < max; i++)
7587 {
7588 mem = gen_rtx_MEM (Pmode,
7589 plus_constant (save_area, i * UNITS_PER_WORD));
7590 MEM_NOTRAP_P (mem) = 1;
7591 set_mem_alias_set (mem, set);
7592 emit_move_insn (mem, gen_rtx_REG (Pmode,
7593 x86_64_int_parameter_registers[i]));
7594 }
7595
7596 if (ix86_varargs_fpr_size)
7597 {
7598 enum machine_mode smode;
7599 rtx label, test;
7600
7601 /* Now emit code to save SSE registers. The AX parameter contains number
7602 of SSE parameter registers used to call this function, though all we
7603 actually check here is the zero/non-zero status. */
7604
7605 label = gen_label_rtx ();
7606 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7607 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7608 label));
7609
7610 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7611 we used movdqa (i.e. TImode) instead? Perhaps even better would
7612 be if we could determine the real mode of the data, via a hook
7613 into pass_stdarg. Ignore all that for now. */
7614 smode = V4SFmode;
7615 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7616 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7617
7618 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7619 if (max > X86_64_SSE_REGPARM_MAX)
7620 max = X86_64_SSE_REGPARM_MAX;
7621
7622 for (i = cum->sse_regno; i < max; ++i)
7623 {
7624 mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7625 mem = gen_rtx_MEM (smode, mem);
7626 MEM_NOTRAP_P (mem) = 1;
7627 set_mem_alias_set (mem, set);
7628 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7629
7630 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7631 }
7632
7633 emit_label (label);
7634 }
7635 }
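/* Illustrative note (not part of the original source): a sketch of the
   register save area laid out by the code above, assuming the full
   X86_64_REGPARM_MAX (6) GPR slots and X86_64_SSE_REGPARM_MAX (8) XMM
   slots are allocated:

     offset   0 ..  47 : rdi, rsi, rdx, rcx, r8, r9   (6 x 8 bytes)
     offset  48 .. 175 : xmm0 .. xmm7                 (8 x 16 bytes)

   The XMM slots are written only when AL is non-zero at function entry.  */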
7636
7637 static void
7638 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7639 {
7640 alias_set_type set = get_varargs_alias_set ();
7641 int i;
7642
7643 /* Reset to zero, as a SysV va_arg may have been used
7644 before. */
7645 ix86_varargs_gpr_size = 0;
7646 ix86_varargs_fpr_size = 0;
7647
7648 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7649 {
7650 rtx reg, mem;
7651
7652 mem = gen_rtx_MEM (Pmode,
7653 plus_constant (virtual_incoming_args_rtx,
7654 i * UNITS_PER_WORD));
7655 MEM_NOTRAP_P (mem) = 1;
7656 set_mem_alias_set (mem, set);
7657
7658 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7659 emit_move_insn (mem, reg);
7660 }
7661 }
7662
7663 static void
7664 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7665 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7666 int no_rtl)
7667 {
7668 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7669 CUMULATIVE_ARGS next_cum;
7670 tree fntype;
7671
7672 /* This argument doesn't appear to be used anymore, which is good,
7673 because the old code here didn't suppress RTL generation. */
7674 gcc_assert (!no_rtl);
7675
7676 if (!TARGET_64BIT)
7677 return;
7678
7679 fntype = TREE_TYPE (current_function_decl);
7680
7681 /* For varargs, we do not want to skip the dummy va_dcl argument.
7682 For stdargs, we do want to skip the last named argument. */
7683 next_cum = *cum;
7684 if (stdarg_p (fntype))
7685 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7686 true);
7687
7688 if (cum->call_abi == MS_ABI)
7689 setup_incoming_varargs_ms_64 (&next_cum);
7690 else
7691 setup_incoming_varargs_64 (&next_cum);
7692 }
7693
7694 /* Return true if TYPE is a va_list of kind char *. */
7695
7696 static bool
7697 is_va_list_char_pointer (tree type)
7698 {
7699 tree canonic;
7700
7701 /* For 32-bit it is always true. */
7702 if (!TARGET_64BIT)
7703 return true;
7704 canonic = ix86_canonical_va_list_type (type);
7705 return (canonic == ms_va_list_type_node
7706 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7707 }
7708
7709 /* Implement va_start. */
7710
7711 static void
7712 ix86_va_start (tree valist, rtx nextarg)
7713 {
7714 HOST_WIDE_INT words, n_gpr, n_fpr;
7715 tree f_gpr, f_fpr, f_ovf, f_sav;
7716 tree gpr, fpr, ovf, sav, t;
7717 tree type;
7718 rtx ovf_rtx;
7719
7720 if (flag_split_stack
7721 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7722 {
7723 unsigned int scratch_regno;
7724
7725 /* When we are splitting the stack, we can't refer to the stack
7726 arguments using internal_arg_pointer, because they may be on
7727 the old stack. The split stack prologue will arrange to
7728 leave a pointer to the old stack arguments in a scratch
7729 register, which we here copy to a pseudo-register. The split
7730 stack prologue can't set the pseudo-register directly because
7731 it (the prologue) runs before any registers have been saved. */
7732
7733 scratch_regno = split_stack_prologue_scratch_regno ();
7734 if (scratch_regno != INVALID_REGNUM)
7735 {
7736 rtx reg, seq;
7737
7738 reg = gen_reg_rtx (Pmode);
7739 cfun->machine->split_stack_varargs_pointer = reg;
7740
7741 start_sequence ();
7742 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7743 seq = get_insns ();
7744 end_sequence ();
7745
7746 push_topmost_sequence ();
7747 emit_insn_after (seq, entry_of_function ());
7748 pop_topmost_sequence ();
7749 }
7750 }
7751
7752 /* Only 64bit target needs something special. */
7753 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7754 {
7755 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7756 std_expand_builtin_va_start (valist, nextarg);
7757 else
7758 {
7759 rtx va_r, next;
7760
7761 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7762 next = expand_binop (ptr_mode, add_optab,
7763 cfun->machine->split_stack_varargs_pointer,
7764 crtl->args.arg_offset_rtx,
7765 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7766 convert_move (va_r, next, 0);
7767 }
7768 return;
7769 }
7770
7771 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7772 f_fpr = DECL_CHAIN (f_gpr);
7773 f_ovf = DECL_CHAIN (f_fpr);
7774 f_sav = DECL_CHAIN (f_ovf);
7775
7776 valist = build_simple_mem_ref (valist);
7777 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7778 /* The following should be folded into the MEM_REF offset. */
7779 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7780 f_gpr, NULL_TREE);
7781 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7782 f_fpr, NULL_TREE);
7783 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7784 f_ovf, NULL_TREE);
7785 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7786 f_sav, NULL_TREE);
7787
7788 /* Count number of gp and fp argument registers used. */
7789 words = crtl->args.info.words;
7790 n_gpr = crtl->args.info.regno;
7791 n_fpr = crtl->args.info.sse_regno;
7792
7793 if (cfun->va_list_gpr_size)
7794 {
7795 type = TREE_TYPE (gpr);
7796 t = build2 (MODIFY_EXPR, type,
7797 gpr, build_int_cst (type, n_gpr * 8));
7798 TREE_SIDE_EFFECTS (t) = 1;
7799 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7800 }
7801
7802 if (TARGET_SSE && cfun->va_list_fpr_size)
7803 {
7804 type = TREE_TYPE (fpr);
7805 t = build2 (MODIFY_EXPR, type, fpr,
7806 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7807 TREE_SIDE_EFFECTS (t) = 1;
7808 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7809 }
7810
7811 /* Find the overflow area. */
7812 type = TREE_TYPE (ovf);
7813 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7814 ovf_rtx = crtl->args.internal_arg_pointer;
7815 else
7816 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7817 t = make_tree (type, ovf_rtx);
7818 if (words != 0)
7819 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7820 t = build2 (MODIFY_EXPR, type, ovf, t);
7821 TREE_SIDE_EFFECTS (t) = 1;
7822 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7823
7824 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7825 {
7826 /* Find the register save area.
7827 The function prologue saves it right above the stack frame. */
7828 type = TREE_TYPE (sav);
7829 t = make_tree (type, frame_pointer_rtx);
7830 if (!ix86_varargs_gpr_size)
7831 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7832 t = build2 (MODIFY_EXPR, type, sav, t);
7833 TREE_SIDE_EFFECTS (t) = 1;
7834 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7835 }
7836 }
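/* Illustrative note (not part of the original source): a sketch of the
   initial counter values the code above stores, for a hypothetical
   prototype such as

     int f (int a, double b, ...);

   One GPR (a) and one XMM register (b) are consumed by the named
   arguments, so va_start leaves gp_offset = 1 * 8 = 8 and
   fp_offset = 8 * X86_64_REGPARM_MAX + 1 * 16 = 48 + 16 = 64.  */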
7837
7838 /* Implement va_arg. */
7839
7840 static tree
7841 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7842 gimple_seq *post_p)
7843 {
7844 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7845 tree f_gpr, f_fpr, f_ovf, f_sav;
7846 tree gpr, fpr, ovf, sav, t;
7847 int size, rsize;
7848 tree lab_false, lab_over = NULL_TREE;
7849 tree addr, t2;
7850 rtx container;
7851 int indirect_p = 0;
7852 tree ptrtype;
7853 enum machine_mode nat_mode;
7854 unsigned int arg_boundary;
7855
7856 /* Only 64bit target needs something special. */
7857 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7858 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7859
7860 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7861 f_fpr = DECL_CHAIN (f_gpr);
7862 f_ovf = DECL_CHAIN (f_fpr);
7863 f_sav = DECL_CHAIN (f_ovf);
7864
7865 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7866 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7867 valist = build_va_arg_indirect_ref (valist);
7868 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7869 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7870 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7871
7872 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7873 if (indirect_p)
7874 type = build_pointer_type (type);
7875 size = int_size_in_bytes (type);
7876 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7877
7878 nat_mode = type_natural_mode (type, NULL);
7879 switch (nat_mode)
7880 {
7881 case V8SFmode:
7882 case V8SImode:
7883 case V32QImode:
7884 case V16HImode:
7885 case V4DFmode:
7886 case V4DImode:
7887 /* Unnamed 256bit vector mode parameters are passed on stack. */
7888 if (!TARGET_64BIT_MS_ABI)
7889 {
7890 container = NULL;
7891 break;
7892 }
7893
7894 default:
7895 container = construct_container (nat_mode, TYPE_MODE (type),
7896 type, 0, X86_64_REGPARM_MAX,
7897 X86_64_SSE_REGPARM_MAX, intreg,
7898 0);
7899 break;
7900 }
7901
7902 /* Pull the value out of the saved registers. */
7903
7904 addr = create_tmp_var (ptr_type_node, "addr");
7905
7906 if (container)
7907 {
7908 int needed_intregs, needed_sseregs;
7909 bool need_temp;
7910 tree int_addr, sse_addr;
7911
7912 lab_false = create_artificial_label (UNKNOWN_LOCATION);
7913 lab_over = create_artificial_label (UNKNOWN_LOCATION);
7914
7915 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7916
7917 need_temp = (!REG_P (container)
7918 && ((needed_intregs && TYPE_ALIGN (type) > 64)
7919 || TYPE_ALIGN (type) > 128));
7920
7921 /* In case we are passing a structure, verify that it is a consecutive
7922 block on the register save area. If not, we need to do moves. */
7923 if (!need_temp && !REG_P (container))
7924 {
7925 /* Verify that all registers are strictly consecutive. */
7926 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7927 {
7928 int i;
7929
7930 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7931 {
7932 rtx slot = XVECEXP (container, 0, i);
7933 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7934 || INTVAL (XEXP (slot, 1)) != i * 16)
7935 need_temp = 1;
7936 }
7937 }
7938 else
7939 {
7940 int i;
7941
7942 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7943 {
7944 rtx slot = XVECEXP (container, 0, i);
7945 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
7946 || INTVAL (XEXP (slot, 1)) != i * 8)
7947 need_temp = 1;
7948 }
7949 }
7950 }
7951 if (!need_temp)
7952 {
7953 int_addr = addr;
7954 sse_addr = addr;
7955 }
7956 else
7957 {
7958 int_addr = create_tmp_var (ptr_type_node, "int_addr");
7959 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
7960 }
7961
7962 /* First ensure that we fit completely in registers. */
7963 if (needed_intregs)
7964 {
7965 t = build_int_cst (TREE_TYPE (gpr),
7966 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
7967 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
7968 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7969 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7970 gimplify_and_add (t, pre_p);
7971 }
7972 if (needed_sseregs)
7973 {
7974 t = build_int_cst (TREE_TYPE (fpr),
7975 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
7976 + X86_64_REGPARM_MAX * 8);
7977 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
7978 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7979 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7980 gimplify_and_add (t, pre_p);
7981 }
7982
7983 /* Compute index to start of area used for integer regs. */
7984 if (needed_intregs)
7985 {
7986 /* int_addr = gpr + sav; */
7987 t = fold_build_pointer_plus (sav, gpr);
7988 gimplify_assign (int_addr, t, pre_p);
7989 }
7990 if (needed_sseregs)
7991 {
7992 /* sse_addr = fpr + sav; */
7993 t = fold_build_pointer_plus (sav, fpr);
7994 gimplify_assign (sse_addr, t, pre_p);
7995 }
7996 if (need_temp)
7997 {
7998 int i, prev_size = 0;
7999 tree temp = create_tmp_var (type, "va_arg_tmp");
8000
8001 /* addr = &temp; */
8002 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8003 gimplify_assign (addr, t, pre_p);
8004
8005 for (i = 0; i < XVECLEN (container, 0); i++)
8006 {
8007 rtx slot = XVECEXP (container, 0, i);
8008 rtx reg = XEXP (slot, 0);
8009 enum machine_mode mode = GET_MODE (reg);
8010 tree piece_type;
8011 tree addr_type;
8012 tree daddr_type;
8013 tree src_addr, src;
8014 int src_offset;
8015 tree dest_addr, dest;
8016 int cur_size = GET_MODE_SIZE (mode);
8017
8018 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8019 prev_size = INTVAL (XEXP (slot, 1));
8020 if (prev_size + cur_size > size)
8021 {
8022 cur_size = size - prev_size;
8023 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8024 if (mode == BLKmode)
8025 mode = QImode;
8026 }
8027 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8028 if (mode == GET_MODE (reg))
8029 addr_type = build_pointer_type (piece_type);
8030 else
8031 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8032 true);
8033 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8034 true);
8035
8036 if (SSE_REGNO_P (REGNO (reg)))
8037 {
8038 src_addr = sse_addr;
8039 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8040 }
8041 else
8042 {
8043 src_addr = int_addr;
8044 src_offset = REGNO (reg) * 8;
8045 }
8046 src_addr = fold_convert (addr_type, src_addr);
8047 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8048
8049 dest_addr = fold_convert (daddr_type, addr);
8050 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8051 if (cur_size == GET_MODE_SIZE (mode))
8052 {
8053 src = build_va_arg_indirect_ref (src_addr);
8054 dest = build_va_arg_indirect_ref (dest_addr);
8055
8056 gimplify_assign (dest, src, pre_p);
8057 }
8058 else
8059 {
8060 tree copy
8061 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8062 3, dest_addr, src_addr,
8063 size_int (cur_size));
8064 gimplify_and_add (copy, pre_p);
8065 }
8066 prev_size += cur_size;
8067 }
8068 }
8069
8070 if (needed_intregs)
8071 {
8072 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8073 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8074 gimplify_assign (gpr, t, pre_p);
8075 }
8076
8077 if (needed_sseregs)
8078 {
8079 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8080 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8081 gimplify_assign (fpr, t, pre_p);
8082 }
8083
8084 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8085
8086 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8087 }
8088
8089 /* ... otherwise out of the overflow area. */
8090
8091 /* When the caller aligns a parameter on the stack, an alignment
8092 beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
8093 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here with the
8094 caller. */
8095 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8096 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8097 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8098
8099 /* Care for on-stack alignment if needed. */
8100 if (arg_boundary <= 64 || size == 0)
8101 t = ovf;
8102 else
8103 {
8104 HOST_WIDE_INT align = arg_boundary / 8;
8105 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8106 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8107 build_int_cst (TREE_TYPE (t), -align));
8108 }
8109
8110 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8111 gimplify_assign (addr, t, pre_p);
8112
8113 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8114 gimplify_assign (unshare_expr (ovf), t, pre_p);
8115
8116 if (container)
8117 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8118
8119 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8120 addr = fold_convert (ptrtype, addr);
8121
8122 if (indirect_p)
8123 addr = build_va_arg_indirect_ref (addr);
8124 return build_va_arg_indirect_ref (addr);
8125 }
8126 \f
8127 /* Return true if OPNUM's MEM should be matched
8128 in movabs* patterns. */
8129
8130 bool
8131 ix86_check_movabs (rtx insn, int opnum)
8132 {
8133 rtx set, mem;
8134
8135 set = PATTERN (insn);
8136 if (GET_CODE (set) == PARALLEL)
8137 set = XVECEXP (set, 0, 0);
8138 gcc_assert (GET_CODE (set) == SET);
8139 mem = XEXP (set, opnum);
8140 while (GET_CODE (mem) == SUBREG)
8141 mem = SUBREG_REG (mem);
8142 gcc_assert (MEM_P (mem));
8143 return volatile_ok || !MEM_VOLATILE_P (mem);
8144 }
8145 \f
8146 /* Initialize the table of extra 80387 mathematical constants. */
8147
8148 static void
8149 init_ext_80387_constants (void)
8150 {
8151 static const char * cst[5] =
8152 {
8153 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8154 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8155 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8156 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8157 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8158 };
8159 int i;
8160
8161 for (i = 0; i < 5; i++)
8162 {
8163 real_from_string (&ext_80387_constants_table[i], cst[i]);
8164 /* Ensure each constant is rounded to XFmode precision. */
8165 real_convert (&ext_80387_constants_table[i],
8166 XFmode, &ext_80387_constants_table[i]);
8167 }
8168
8169 ext_80387_constants_init = 1;
8170 }
8171
8172 /* Return non-zero if the constant is something that
8173 can be loaded with a special instruction. */
8174
8175 int
8176 standard_80387_constant_p (rtx x)
8177 {
8178 enum machine_mode mode = GET_MODE (x);
8179
8180 REAL_VALUE_TYPE r;
8181
8182 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8183 return -1;
8184
8185 if (x == CONST0_RTX (mode))
8186 return 1;
8187 if (x == CONST1_RTX (mode))
8188 return 2;
8189
8190 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8191
8192 /* For XFmode constants, try to find a special 80387 instruction when
8193 optimizing for size or on those CPUs that benefit from them. */
8194 if (mode == XFmode
8195 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8196 {
8197 int i;
8198
8199 if (! ext_80387_constants_init)
8200 init_ext_80387_constants ();
8201
8202 for (i = 0; i < 5; i++)
8203 if (real_identical (&r, &ext_80387_constants_table[i]))
8204 return i + 3;
8205 }
8206
8207 /* Load of the constant -0.0 or -1.0 will be split as
8208 fldz;fchs or fld1;fchs sequence. */
8209 if (real_isnegzero (&r))
8210 return 8;
8211 if (real_identical (&r, &dconstm1))
8212 return 9;
8213
8214 return 0;
8215 }
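/* Illustrative note (not part of the original source): a sketch of the
   return codes produced above and the instructions they correspond to
   (see standard_80387_constant_opcode below):

     1 -> 0.0 (fldz)          3..7 -> lg2, ln2, l2e, l2t, pi table entries
     2 -> 1.0 (fld1)          8, 9 -> -0.0 / -1.0, split into fldz/fld1 + fchs  */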
8216
8217 /* Return the opcode of the special instruction to be used to load
8218 the constant X. */
8219
8220 const char *
8221 standard_80387_constant_opcode (rtx x)
8222 {
8223 switch (standard_80387_constant_p (x))
8224 {
8225 case 1:
8226 return "fldz";
8227 case 2:
8228 return "fld1";
8229 case 3:
8230 return "fldlg2";
8231 case 4:
8232 return "fldln2";
8233 case 5:
8234 return "fldl2e";
8235 case 6:
8236 return "fldl2t";
8237 case 7:
8238 return "fldpi";
8239 case 8:
8240 case 9:
8241 return "#";
8242 default:
8243 gcc_unreachable ();
8244 }
8245 }
8246
8247 /* Return the CONST_DOUBLE representing the 80387 constant that is
8248 loaded by the specified special instruction. The argument IDX
8249 matches the return value from standard_80387_constant_p. */
8250
8251 rtx
8252 standard_80387_constant_rtx (int idx)
8253 {
8254 int i;
8255
8256 if (! ext_80387_constants_init)
8257 init_ext_80387_constants ();
8258
8259 switch (idx)
8260 {
8261 case 3:
8262 case 4:
8263 case 5:
8264 case 6:
8265 case 7:
8266 i = idx - 3;
8267 break;
8268
8269 default:
8270 gcc_unreachable ();
8271 }
8272
8273 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8274 XFmode);
8275 }
8276
8277 /* Return 1 if X is all 0s, 2 if X is all 1s, and 0 otherwise,
8278 for a supported SSE/AVX vector mode. */
8279
8280 int
8281 standard_sse_constant_p (rtx x)
8282 {
8283 enum machine_mode mode = GET_MODE (x);
8284
8285 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8286 return 1;
8287 if (vector_all_ones_operand (x, mode))
8288 switch (mode)
8289 {
8290 case V16QImode:
8291 case V8HImode:
8292 case V4SImode:
8293 case V2DImode:
8294 if (TARGET_SSE2)
8295 return 2;
8296 case V32QImode:
8297 case V16HImode:
8298 case V8SImode:
8299 case V4DImode:
8300 if (TARGET_AVX2)
8301 return 2;
8302 default:
8303 break;
8304 }
8305
8306 return 0;
8307 }
8308
8309 /* Return the opcode of the special instruction to be used to load
8310 the constant X. */
8311
8312 const char *
8313 standard_sse_constant_opcode (rtx insn, rtx x)
8314 {
8315 switch (standard_sse_constant_p (x))
8316 {
8317 case 1:
8318 switch (get_attr_mode (insn))
8319 {
8320 case MODE_TI:
8321 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8322 return "%vpxor\t%0, %d0";
8323 case MODE_V2DF:
8324 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8325 return "%vxorpd\t%0, %d0";
8326 case MODE_V4SF:
8327 return "%vxorps\t%0, %d0";
8328
8329 case MODE_OI:
8330 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8331 return "vpxor\t%x0, %x0, %x0";
8332 case MODE_V4DF:
8333 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8334 return "vxorpd\t%x0, %x0, %x0";
8335 case MODE_V8SF:
8336 return "vxorps\t%x0, %x0, %x0";
8337
8338 default:
8339 break;
8340 }
8341
8342 case 2:
8343 if (TARGET_AVX)
8344 return "vpcmpeqd\t%0, %0, %0";
8345 else
8346 return "pcmpeqd\t%0, %0";
8347
8348 default:
8349 break;
8350 }
8351 gcc_unreachable ();
8352 }
8353
8354 /* Returns true if OP contains a symbol reference. */
8355
8356 bool
8357 symbolic_reference_mentioned_p (rtx op)
8358 {
8359 const char *fmt;
8360 int i;
8361
8362 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8363 return true;
8364
8365 fmt = GET_RTX_FORMAT (GET_CODE (op));
8366 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8367 {
8368 if (fmt[i] == 'E')
8369 {
8370 int j;
8371
8372 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8373 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8374 return true;
8375 }
8376
8377 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8378 return true;
8379 }
8380
8381 return false;
8382 }
8383
8384 /* Return true if it is appropriate to emit `ret' instructions in the
8385 body of a function. Do this only if the epilogue is simple, needing a
8386 couple of insns. Prior to reloading, we can't tell how many registers
8387 must be saved, so return false then. Return false if there is no frame
8388 marker to de-allocate. */
8389
8390 bool
8391 ix86_can_use_return_insn_p (void)
8392 {
8393 struct ix86_frame frame;
8394
8395 if (! reload_completed || frame_pointer_needed)
8396 return 0;
8397
8398 /* Don't allow more than 32k pop, since that's all we can do
8399 with one instruction. */
8400 if (crtl->args.pops_args && crtl->args.size >= 32768)
8401 return 0;
8402
8403 ix86_compute_frame_layout (&frame);
8404 return (frame.stack_pointer_offset == UNITS_PER_WORD
8405 && (frame.nregs + frame.nsseregs) == 0);
8406 }
8407 \f
8408 /* Value should be nonzero if functions must have frame pointers.
8409 Zero means the frame pointer need not be set up (and parms may
8410 be accessed via the stack pointer) in functions that seem suitable. */
8411
8412 static bool
8413 ix86_frame_pointer_required (void)
8414 {
8415 /* If we accessed previous frames, then the generated code expects
8416 to be able to access the saved ebp value in our frame. */
8417 if (cfun->machine->accesses_prev_frame)
8418 return true;
8419
8420 /* Several x86 OSes need a frame pointer for other reasons,
8421 usually pertaining to setjmp. */
8422 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8423 return true;
8424
8425 /* For older 32-bit runtimes setjmp requires a valid frame pointer. */
8426 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8427 return true;
8428
8429 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8430 turns off the frame pointer by default. Turn it back on now if
8431 we've not got a leaf function. */
8432 if (TARGET_OMIT_LEAF_FRAME_POINTER
8433 && (!current_function_is_leaf
8434 || ix86_current_function_calls_tls_descriptor))
8435 return true;
8436
8437 if (crtl->profile && !flag_fentry)
8438 return true;
8439
8440 return false;
8441 }
8442
8443 /* Record that the current function accesses previous call frames. */
8444
8445 void
8446 ix86_setup_frame_addresses (void)
8447 {
8448 cfun->machine->accesses_prev_frame = 1;
8449 }
8450 \f
8451 #ifndef USE_HIDDEN_LINKONCE
8452 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8453 # define USE_HIDDEN_LINKONCE 1
8454 # else
8455 # define USE_HIDDEN_LINKONCE 0
8456 # endif
8457 #endif
8458
8459 static int pic_labels_used;
8460
8461 /* Fills in the label name that should be used for a pc thunk for
8462 the given register. */
8463
8464 static void
8465 get_pc_thunk_name (char name[32], unsigned int regno)
8466 {
8467 gcc_assert (!TARGET_64BIT);
8468
8469 if (USE_HIDDEN_LINKONCE)
8470 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8471 else
8472 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8473 }
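/* Illustrative note (not part of the original source): with
   USE_HIDDEN_LINKONCE the thunk that reloads the PIC base into, say,
   %ebx is named "__x86.get_pc_thunk.bx" (reg_names[regno] supplies the
   suffix); otherwise an internal label built from the prefix "LPR" and
   the register number is used instead.  */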
8474
8475
8476 /* This function generates code for -fpic that loads the target
8477 register with the return address of the caller and then returns. */
8478
8479 static void
8480 ix86_code_end (void)
8481 {
8482 rtx xops[2];
8483 int regno;
8484
8485 for (regno = AX_REG; regno <= SP_REG; regno++)
8486 {
8487 char name[32];
8488 tree decl;
8489
8490 if (!(pic_labels_used & (1 << regno)))
8491 continue;
8492
8493 get_pc_thunk_name (name, regno);
8494
8495 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8496 get_identifier (name),
8497 build_function_type_list (void_type_node, NULL_TREE));
8498 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8499 NULL_TREE, void_type_node);
8500 TREE_PUBLIC (decl) = 1;
8501 TREE_STATIC (decl) = 1;
8502
8503 #if TARGET_MACHO
8504 if (TARGET_MACHO)
8505 {
8506 switch_to_section (darwin_sections[text_coal_section]);
8507 fputs ("\t.weak_definition\t", asm_out_file);
8508 assemble_name (asm_out_file, name);
8509 fputs ("\n\t.private_extern\t", asm_out_file);
8510 assemble_name (asm_out_file, name);
8511 putc ('\n', asm_out_file);
8512 ASM_OUTPUT_LABEL (asm_out_file, name);
8513 DECL_WEAK (decl) = 1;
8514 }
8515 else
8516 #endif
8517 if (USE_HIDDEN_LINKONCE)
8518 {
8519 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8520
8521 targetm.asm_out.unique_section (decl, 0);
8522 switch_to_section (get_named_section (decl, NULL, 0));
8523
8524 targetm.asm_out.globalize_label (asm_out_file, name);
8525 fputs ("\t.hidden\t", asm_out_file);
8526 assemble_name (asm_out_file, name);
8527 putc ('\n', asm_out_file);
8528 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8529 }
8530 else
8531 {
8532 switch_to_section (text_section);
8533 ASM_OUTPUT_LABEL (asm_out_file, name);
8534 }
8535
8536 DECL_INITIAL (decl) = make_node (BLOCK);
8537 current_function_decl = decl;
8538 init_function_start (decl);
8539 first_function_block_is_cold = false;
8540 /* Make sure unwind info is emitted for the thunk if needed. */
8541 final_start_function (emit_barrier (), asm_out_file, 1);
8542
8543 /* Pad stack IP move with 4 instructions (two NOPs count
8544 as one instruction). */
8545 if (TARGET_PAD_SHORT_FUNCTION)
8546 {
8547 int i = 8;
8548
8549 while (i--)
8550 fputs ("\tnop\n", asm_out_file);
8551 }
8552
8553 xops[0] = gen_rtx_REG (Pmode, regno);
8554 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8555 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8556 fputs ("\tret\n", asm_out_file);
8557 final_end_function ();
8558 init_insn_lengths ();
8559 free_after_compilation (cfun);
8560 set_cfun (NULL);
8561 current_function_decl = NULL;
8562 }
8563
8564 if (flag_split_stack)
8565 file_end_indicate_split_stack ();
8566 }
8567
8568 /* Emit code for the SET_GOT patterns. */
8569
8570 const char *
8571 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8572 {
8573 rtx xops[3];
8574
8575 xops[0] = dest;
8576
8577 if (TARGET_VXWORKS_RTP && flag_pic)
8578 {
8579 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8580 xops[2] = gen_rtx_MEM (Pmode,
8581 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8582 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8583
8584 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8585 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8586 an unadorned address. */
8587 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8588 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8589 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8590 return "";
8591 }
8592
8593 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8594
8595 if (!flag_pic)
8596 {
8597 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8598
8599 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8600
8601 #if TARGET_MACHO
8602 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8603 is what will be referenced by the Mach-O PIC subsystem. */
8604 if (!label)
8605 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8606 #endif
8607
8608 targetm.asm_out.internal_label (asm_out_file, "L",
8609 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8610 }
8611 else
8612 {
8613 char name[32];
8614 get_pc_thunk_name (name, REGNO (dest));
8615 pic_labels_used |= 1 << REGNO (dest);
8616
8617 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8618 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8619 output_asm_insn ("call\t%X2", xops);
8620 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8621 is what will be referenced by the Mach-O PIC subsystem. */
8622 #if TARGET_MACHO
8623 if (!label)
8624 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8625 else
8626 targetm.asm_out.internal_label (asm_out_file, "L",
8627 CODE_LABEL_NUMBER (label));
8628 #endif
8629 }
8630
8631 if (!TARGET_MACHO)
8632 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8633
8634 return "";
8635 }
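/* Illustrative note (not part of the original source): for the common
   -fpic case on 32-bit ELF targets, the pattern emitted above typically
   looks like

     call  __x86.get_pc_thunk.bx
     addl  $_GLOBAL_OFFSET_TABLE_, %ebx

   i.e. the thunk loads the return address into the destination register
   and the add rebases it to the GOT (GOT_SYMBOL_NAME).  */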
8636
8637 /* Generate a "push" pattern for input ARG. */
8638
8639 static rtx
8640 gen_push (rtx arg)
8641 {
8642 struct machine_function *m = cfun->machine;
8643
8644 if (m->fs.cfa_reg == stack_pointer_rtx)
8645 m->fs.cfa_offset += UNITS_PER_WORD;
8646 m->fs.sp_offset += UNITS_PER_WORD;
8647
8648 return gen_rtx_SET (VOIDmode,
8649 gen_rtx_MEM (Pmode,
8650 gen_rtx_PRE_DEC (Pmode,
8651 stack_pointer_rtx)),
8652 arg);
8653 }
8654
8655 /* Generate a "pop" pattern for input ARG. */
8656
8657 static rtx
8658 gen_pop (rtx arg)
8659 {
8660 return gen_rtx_SET (VOIDmode,
8661 arg,
8662 gen_rtx_MEM (Pmode,
8663 gen_rtx_POST_INC (Pmode,
8664 stack_pointer_rtx)));
8665 }
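/* Illustrative note (not part of the original source): the two helpers
   above build the usual push/pop RTL shapes, roughly

     (set (mem:SI (pre_dec:SI (reg:SI sp))) (reg:SI ax))   ; gen_push
     (set (reg:SI ax) (mem:SI (post_inc:SI (reg:SI sp))))  ; gen_pop

   (shown here for 32-bit SImode; gen_push additionally updates the
   frame-state offsets used for CFI tracking).  */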
8666
8667 /* Return the number of an unused call-clobbered register that is
8668 available for the entire function, or INVALID_REGNUM if there is none. */
8669
8670 static unsigned int
8671 ix86_select_alt_pic_regnum (void)
8672 {
8673 if (current_function_is_leaf
8674 && !crtl->profile
8675 && !ix86_current_function_calls_tls_descriptor)
8676 {
8677 int i, drap;
8678 /* Can't use the same register for both PIC and DRAP. */
8679 if (crtl->drap_reg)
8680 drap = REGNO (crtl->drap_reg);
8681 else
8682 drap = -1;
8683 for (i = 2; i >= 0; --i)
8684 if (i != drap && !df_regs_ever_live_p (i))
8685 return i;
8686 }
8687
8688 return INVALID_REGNUM;
8689 }
8690
8691 /* Return TRUE if we need to save REGNO. */
8692
8693 static bool
8694 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8695 {
8696 if (pic_offset_table_rtx
8697 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8698 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8699 || crtl->profile
8700 || crtl->calls_eh_return
8701 || crtl->uses_const_pool))
8702 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8703
8704 if (crtl->calls_eh_return && maybe_eh_return)
8705 {
8706 unsigned i;
8707 for (i = 0; ; i++)
8708 {
8709 unsigned test = EH_RETURN_DATA_REGNO (i);
8710 if (test == INVALID_REGNUM)
8711 break;
8712 if (test == regno)
8713 return true;
8714 }
8715 }
8716
8717 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8718 return true;
8719
8720 return (df_regs_ever_live_p (regno)
8721 && !call_used_regs[regno]
8722 && !fixed_regs[regno]
8723 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8724 }
8725
8726 /* Return the number of saved general purpose registers. */
8727
8728 static int
8729 ix86_nsaved_regs (void)
8730 {
8731 int nregs = 0;
8732 int regno;
8733
8734 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8735 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8736 nregs ++;
8737 return nregs;
8738 }
8739
8740 /* Return number of saved SSE registers. */
8741
8742 static int
8743 ix86_nsaved_sseregs (void)
8744 {
8745 int nregs = 0;
8746 int regno;
8747
8748 if (!TARGET_64BIT_MS_ABI)
8749 return 0;
8750 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8751 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8752 nregs ++;
8753 return nregs;
8754 }
8755
8756 /* Given FROM and TO register numbers, say whether this elimination is
8757 allowed. If stack alignment is needed, we can only replace argument
8758 pointer with hard frame pointer, or replace frame pointer with stack
8759 pointer. Otherwise, frame pointer elimination is automatically
8760 handled and all other eliminations are valid. */
8761
8762 static bool
8763 ix86_can_eliminate (const int from, const int to)
8764 {
8765 if (stack_realign_fp)
8766 return ((from == ARG_POINTER_REGNUM
8767 && to == HARD_FRAME_POINTER_REGNUM)
8768 || (from == FRAME_POINTER_REGNUM
8769 && to == STACK_POINTER_REGNUM));
8770 else
8771 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8772 }
8773
8774 /* Return the offset between two registers, one to be eliminated, and the other
8775 its replacement, at the start of a routine. */
8776
8777 HOST_WIDE_INT
8778 ix86_initial_elimination_offset (int from, int to)
8779 {
8780 struct ix86_frame frame;
8781 ix86_compute_frame_layout (&frame);
8782
8783 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8784 return frame.hard_frame_pointer_offset;
8785 else if (from == FRAME_POINTER_REGNUM
8786 && to == HARD_FRAME_POINTER_REGNUM)
8787 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8788 else
8789 {
8790 gcc_assert (to == STACK_POINTER_REGNUM);
8791
8792 if (from == ARG_POINTER_REGNUM)
8793 return frame.stack_pointer_offset;
8794
8795 gcc_assert (from == FRAME_POINTER_REGNUM);
8796 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8797 }
8798 }
8799
8800 /* In a dynamically-aligned function, we can't know the offset from
8801 stack pointer to frame pointer, so we must ensure that setjmp
8802 eliminates fp against the hard fp (%ebp) rather than trying to
8803 index from %esp up to the top of the frame across a gap that is
8804 of unknown (at compile-time) size. */
8805 static rtx
8806 ix86_builtin_setjmp_frame_value (void)
8807 {
8808 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8809 }
8810
8811 /* When using -fsplit-stack, the allocation routines set a field in
8812 the TCB to the bottom of the stack plus this much space, measured
8813 in bytes. */
8814
8815 #define SPLIT_STACK_AVAILABLE 256
8816
8817 /* Fill the ix86_frame structure describing the frame of the current function. */
8818
8819 static void
8820 ix86_compute_frame_layout (struct ix86_frame *frame)
8821 {
8822 unsigned int stack_alignment_needed;
8823 HOST_WIDE_INT offset;
8824 unsigned int preferred_alignment;
8825 HOST_WIDE_INT size = get_frame_size ();
8826 HOST_WIDE_INT to_allocate;
8827
8828 frame->nregs = ix86_nsaved_regs ();
8829 frame->nsseregs = ix86_nsaved_sseregs ();
8830
8831 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8832 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8833
8834 /* The 64-bit MS ABI seems to require stack alignment to always be 16, except
8835 for function prologues and leaf functions. */
8836 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8837 && (!current_function_is_leaf || cfun->calls_alloca != 0
8838 || ix86_current_function_calls_tls_descriptor))
8839 {
8840 preferred_alignment = 16;
8841 stack_alignment_needed = 16;
8842 crtl->preferred_stack_boundary = 128;
8843 crtl->stack_alignment_needed = 128;
8844 }
8845
8846 gcc_assert (!size || stack_alignment_needed);
8847 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8848 gcc_assert (preferred_alignment <= stack_alignment_needed);
8849
8850 /* For SEH we have to limit the amount of code movement into the prologue.
8851 At present we do this via a BLOCKAGE, at which point there's very little
8852 scheduling that can be done, which means that there's very little point
8853 in doing anything except PUSHs. */
8854 if (TARGET_SEH)
8855 cfun->machine->use_fast_prologue_epilogue = false;
8856
8857 /* During reload iterations the number of registers saved can change.
8858 Recompute the value as needed. Do not recompute when the number of registers
8859 didn't change, as reload does multiple calls to the function and does not
8860 expect the decision to change within a single iteration. */
8861 else if (!optimize_function_for_size_p (cfun)
8862 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8863 {
8864 int count = frame->nregs;
8865 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8866
8867 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8868
8869 /* The fast prologue uses move instead of push to save registers. This
8870 is significantly longer, but also executes faster as modern hardware
8871 can execute the moves in parallel, but can't do that for push/pop.
8872
8873 Be careful about choosing which prologue to emit: when the function takes
8874 many instructions to execute we may as well use the slow version, likewise
8875 when the function is known to be outside a hot spot (this is known with
8876 profile feedback only). Weight the size of the function by the number of
8877 registers to save, as it is cheap to use one or two push instructions but
8878 very slow to use many of them. */
8879 if (count)
8880 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8881 if (node->frequency < NODE_FREQUENCY_NORMAL
8882 || (flag_branch_probabilities
8883 && node->frequency < NODE_FREQUENCY_HOT))
8884 cfun->machine->use_fast_prologue_epilogue = false;
8885 else
8886 cfun->machine->use_fast_prologue_epilogue
8887 = !expensive_function_p (count);
8888 }
8889
8890 frame->save_regs_using_mov
8891 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8892 /* If static stack checking is enabled and done with probes,
8893 the registers need to be saved before allocating the frame. */
8894 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8895
8896 /* Skip return address. */
8897 offset = UNITS_PER_WORD;
8898
8899 /* Skip pushed static chain. */
8900 if (ix86_static_chain_on_stack)
8901 offset += UNITS_PER_WORD;
8902
8903 /* Skip saved base pointer. */
8904 if (frame_pointer_needed)
8905 offset += UNITS_PER_WORD;
8906 frame->hfp_save_offset = offset;
8907
8908 /* The traditional frame pointer location is at the top of the frame. */
8909 frame->hard_frame_pointer_offset = offset;
8910
8911 /* Register save area */
8912 offset += frame->nregs * UNITS_PER_WORD;
8913 frame->reg_save_offset = offset;
8914
8915 /* Align and set SSE register save area. */
8916 if (frame->nsseregs)
8917 {
8918 /* The only ABI that has saved SSE registers (Win64) also has a
8919 16-byte aligned default stack, and thus we don't need to be
8920 within the re-aligned local stack frame to save them. */
8921 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
8922 offset = (offset + 16 - 1) & -16;
8923 offset += frame->nsseregs * 16;
8924 }
8925 frame->sse_reg_save_offset = offset;
8926
8927 /* The re-aligned stack starts here. Values before this point are not
8928 directly comparable with values below this point. In order to make
8929 sure that no value happens to be the same before and after, force
8930 the alignment computation below to add a non-zero value. */
8931 if (stack_realign_fp)
8932 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8933
8934 /* Va-arg area */
8935 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8936 offset += frame->va_arg_size;
8937
8938 /* Align start of frame for local function. */
8939 if (stack_realign_fp
8940 || offset != frame->sse_reg_save_offset
8941 || size != 0
8942 || !current_function_is_leaf
8943 || cfun->calls_alloca
8944 || ix86_current_function_calls_tls_descriptor)
8945 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
8946
8947 /* Frame pointer points here. */
8948 frame->frame_pointer_offset = offset;
8949
8950 offset += size;
8951
8952 /* Add the outgoing arguments area. It can be skipped if we eliminated
8953 all the function calls as dead code.
8954 Skipping is however impossible when the function calls alloca. The alloca
8955 expander assumes that the last crtl->outgoing_args_size bytes
8956 of the stack frame are unused. */
8957 if (ACCUMULATE_OUTGOING_ARGS
8958 && (!current_function_is_leaf || cfun->calls_alloca
8959 || ix86_current_function_calls_tls_descriptor))
8960 {
8961 offset += crtl->outgoing_args_size;
8962 frame->outgoing_arguments_size = crtl->outgoing_args_size;
8963 }
8964 else
8965 frame->outgoing_arguments_size = 0;
8966
8967 /* Align stack boundary. Only needed if we're calling another function
8968 or using alloca. */
8969 if (!current_function_is_leaf || cfun->calls_alloca
8970 || ix86_current_function_calls_tls_descriptor)
8971 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
8972
8973 /* We've reached end of stack frame. */
8974 frame->stack_pointer_offset = offset;
8975
8976 /* Size prologue needs to allocate. */
8977 to_allocate = offset - frame->sse_reg_save_offset;
8978
8979 if ((!to_allocate && frame->nregs <= 1)
8980 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
8981 frame->save_regs_using_mov = false;
8982
8983 if (ix86_using_red_zone ()
8984 && current_function_sp_is_unchanging
8985 && current_function_is_leaf
8986 && !ix86_current_function_calls_tls_descriptor)
8987 {
8988 frame->red_zone_size = to_allocate;
8989 if (frame->save_regs_using_mov)
8990 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
8991 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
8992 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
8993 }
8994 else
8995 frame->red_zone_size = 0;
8996 frame->stack_pointer_offset -= frame->red_zone_size;
8997
8998 /* The SEH frame pointer location is near the bottom of the frame.
8999 This is enforced by the fact that the difference between the
9000 stack pointer and the frame pointer is limited to 240 bytes in
9001 the unwind data structure. */
9002 if (TARGET_SEH)
9003 {
9004 HOST_WIDE_INT diff;
9005
9006 /* If we can leave the frame pointer where it is, do so. */
9007 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9008 if (diff > 240 || (diff & 15) != 0)
9009 {
9010 /* Ideally we'd determine what portion of the local stack frame
9011 (within the constraint of the lowest 240) is most heavily used.
9012 But without that complication, simply bias the frame pointer
9013 by 128 bytes so as to maximize the amount of the local stack
9014 frame that is addressable with 8-bit offsets. */
9015 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9016 }
9017 }
9018 }
9019
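/* A worked example (illustrative; assuming a 64-bit SysV target with
   UNITS_PER_WORD == 8 and no ACCUMULATE_OUTGOING_ARGS): a non-leaf
   function that needs a frame pointer, saves two general registers, has
   40 bytes of locals and a stack_alignment_needed of 16 gets

     return address                      offset  8
     saved frame pointer                 offset 16  (hard_frame_pointer_offset)
     two saved registers                 offset 32  (reg_save_offset, sse_reg_save_offset)
     locals (40 bytes, start aligned)    offset 72  (frame_pointer_offset == 32)
     rounded to preferred alignment      offset 80  (stack_pointer_offset)

   so the prologue allocates 80 - 32 = 48 bytes beyond the three pushes.  */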
9020 /* This is semi-inlined memory_address_length, but simplified
9021 since we know that we're always dealing with reg+offset, and
9022 to avoid having to create and discard all that rtl. */
9023
9024 static inline int
9025 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9026 {
9027 int len = 4;
9028
9029 if (offset == 0)
9030 {
9031 /* EBP and R13 cannot be encoded without an offset. */
9032 len = (regno == BP_REG || regno == R13_REG);
9033 }
9034 else if (IN_RANGE (offset, -128, 127))
9035 len = 1;
9036
9037 /* ESP and R12 must be encoded with a SIB byte. */
9038 if (regno == SP_REG || regno == R12_REG)
9039 len++;
9040
9041 return len;
9042 }
9043
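/* For illustration, the value computed above counts the displacement and
   any SIB byte needed beyond the ModRM byte, e.g.:

     0(%eax)    -> 0        0(%ebp)     -> 1  (disp8 of 0 required)
     8(%ebp)    -> 1        0(%esp)     -> 1  (SIB byte)
     8(%esp)    -> 2        4096(%ebx)  -> 4  (disp32)
     4096(%r12) -> 5  (disp32 + SIB)  */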
9044 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9045 The valid base registers are taken from CFUN->MACHINE->FS. */
9046
9047 static rtx
9048 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9049 {
9050 const struct machine_function *m = cfun->machine;
9051 rtx base_reg = NULL;
9052 HOST_WIDE_INT base_offset = 0;
9053
9054 if (m->use_fast_prologue_epilogue)
9055 {
9056 /* Choose the base register most likely to allow the most scheduling
9057 opportunities. Generally FP is valid throughout the function,
9058 while DRAP must be reloaded within the epilogue. But choose either
9059 over the SP due to increased encoding size. */
9060
9061 if (m->fs.fp_valid)
9062 {
9063 base_reg = hard_frame_pointer_rtx;
9064 base_offset = m->fs.fp_offset - cfa_offset;
9065 }
9066 else if (m->fs.drap_valid)
9067 {
9068 base_reg = crtl->drap_reg;
9069 base_offset = 0 - cfa_offset;
9070 }
9071 else if (m->fs.sp_valid)
9072 {
9073 base_reg = stack_pointer_rtx;
9074 base_offset = m->fs.sp_offset - cfa_offset;
9075 }
9076 }
9077 else
9078 {
9079 HOST_WIDE_INT toffset;
9080 int len = 16, tlen;
9081
9082 /* Choose the base register with the smallest address encoding.
9083 With a tie, choose FP > DRAP > SP. */
9084 if (m->fs.sp_valid)
9085 {
9086 base_reg = stack_pointer_rtx;
9087 base_offset = m->fs.sp_offset - cfa_offset;
9088 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9089 }
9090 if (m->fs.drap_valid)
9091 {
9092 toffset = 0 - cfa_offset;
9093 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9094 if (tlen <= len)
9095 {
9096 base_reg = crtl->drap_reg;
9097 base_offset = toffset;
9098 len = tlen;
9099 }
9100 }
9101 if (m->fs.fp_valid)
9102 {
9103 toffset = m->fs.fp_offset - cfa_offset;
9104 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9105 if (tlen <= len)
9106 {
9107 base_reg = hard_frame_pointer_rtx;
9108 base_offset = toffset;
9109 len = tlen;
9110 }
9111 }
9112 }
9113 gcc_assert (base_reg != NULL);
9114
9115 return plus_constant (base_reg, base_offset);
9116 }
9117
9118 /* Emit code to save registers in the prologue. */
9119
9120 static void
9121 ix86_emit_save_regs (void)
9122 {
9123 unsigned int regno;
9124 rtx insn;
9125
9126 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9127 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9128 {
9129 insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
9130 RTX_FRAME_RELATED_P (insn) = 1;
9131 }
9132 }
9133
9134 /* Emit a single register save at CFA - CFA_OFFSET. */
9135
9136 static void
9137 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9138 HOST_WIDE_INT cfa_offset)
9139 {
9140 struct machine_function *m = cfun->machine;
9141 rtx reg = gen_rtx_REG (mode, regno);
9142 rtx mem, addr, base, insn;
9143
9144 addr = choose_baseaddr (cfa_offset);
9145 mem = gen_frame_mem (mode, addr);
9146
9147 /* For SSE saves, we need to indicate the 128-bit alignment. */
9148 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9149
9150 insn = emit_move_insn (mem, reg);
9151 RTX_FRAME_RELATED_P (insn) = 1;
9152
9153 base = addr;
9154 if (GET_CODE (base) == PLUS)
9155 base = XEXP (base, 0);
9156 gcc_checking_assert (REG_P (base));
9157
9158 /* When saving registers into a re-aligned local stack frame, avoid
9159 any tricky guessing by dwarf2out. */
9160 if (m->fs.realigned)
9161 {
9162 gcc_checking_assert (stack_realign_drap);
9163
9164 if (regno == REGNO (crtl->drap_reg))
9165 {
9166 /* A bit of a hack. We force the DRAP register to be saved in
9167 the re-aligned stack frame, which provides us with a copy
9168 of the CFA that will last past the prologue. Install it. */
9169 gcc_checking_assert (cfun->machine->fs.fp_valid);
9170 addr = plus_constant (hard_frame_pointer_rtx,
9171 cfun->machine->fs.fp_offset - cfa_offset);
9172 mem = gen_rtx_MEM (mode, addr);
9173 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9174 }
9175 else
9176 {
9177 /* The frame pointer is a stable reference within the
9178 aligned frame. Use it. */
9179 gcc_checking_assert (cfun->machine->fs.fp_valid);
9180 addr = plus_constant (hard_frame_pointer_rtx,
9181 cfun->machine->fs.fp_offset - cfa_offset);
9182 mem = gen_rtx_MEM (mode, addr);
9183 add_reg_note (insn, REG_CFA_EXPRESSION,
9184 gen_rtx_SET (VOIDmode, mem, reg));
9185 }
9186 }
9187
9188 /* The memory may not be relative to the current CFA register,
9189 which means that we may need to generate a new pattern for
9190 use by the unwind info. */
9191 else if (base != m->fs.cfa_reg)
9192 {
9193 addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9194 mem = gen_rtx_MEM (mode, addr);
9195 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9196 }
9197 }
9198
9199 /* Emit code to save registers using MOV insns.
9200 First register is stored at CFA - CFA_OFFSET. */
9201 static void
9202 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9203 {
9204 unsigned int regno;
9205
9206 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9207 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9208 {
9209 ix86_emit_save_reg_using_mov (Pmode, regno, cfa_offset);
9210 cfa_offset -= UNITS_PER_WORD;
9211 }
9212 }
9213
9214 /* Emit code to save SSE registers using MOV insns.
9215 First register is stored at CFA - CFA_OFFSET. */
9216 static void
9217 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9218 {
9219 unsigned int regno;
9220
9221 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9222 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9223 {
9224 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9225 cfa_offset -= 16;
9226 }
9227 }
9228
9229 static GTY(()) rtx queued_cfa_restores;
9230
9231 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9232 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9233 Don't add the note if the previously saved value will be left untouched
9234 within the stack red zone till return, as unwinders can find the same value
9235 in the register and on the stack. */
9236
9237 static void
9238 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9239 {
9240 if (!crtl->shrink_wrapped
9241 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9242 return;
9243
9244 if (insn)
9245 {
9246 add_reg_note (insn, REG_CFA_RESTORE, reg);
9247 RTX_FRAME_RELATED_P (insn) = 1;
9248 }
9249 else
9250 queued_cfa_restores
9251 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9252 }
9253
9254 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9255
9256 static void
9257 ix86_add_queued_cfa_restore_notes (rtx insn)
9258 {
9259 rtx last;
9260 if (!queued_cfa_restores)
9261 return;
9262 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9263 ;
9264 XEXP (last, 1) = REG_NOTES (insn);
9265 REG_NOTES (insn) = queued_cfa_restores;
9266 queued_cfa_restores = NULL_RTX;
9267 RTX_FRAME_RELATED_P (insn) = 1;
9268 }
9269
9270 /* Expand prologue or epilogue stack adjustment.
9271 The pattern exists to put a dependency on all ebp-based memory accesses.
9272 STYLE should be negative if instructions should be marked as frame related,
9273 zero if %r11 register is live and cannot be freely used and positive
9274 otherwise. */
9275
9276 static void
9277 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9278 int style, bool set_cfa)
9279 {
9280 struct machine_function *m = cfun->machine;
9281 rtx insn;
9282 bool add_frame_related_expr = false;
9283
9284 if (! TARGET_64BIT)
9285 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9286 else if (x86_64_immediate_operand (offset, DImode))
9287 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9288 else
9289 {
9290 rtx tmp;
9291 /* r11 is used by indirect sibcall return as well, set before the
9292 epilogue and used after the epilogue. */
9293 if (style)
9294 tmp = gen_rtx_REG (DImode, R11_REG);
9295 else
9296 {
9297 gcc_assert (src != hard_frame_pointer_rtx
9298 && dest != hard_frame_pointer_rtx);
9299 tmp = hard_frame_pointer_rtx;
9300 }
9301 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9302 if (style < 0)
9303 add_frame_related_expr = true;
9304
9305 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9306 }
9307
9308 insn = emit_insn (insn);
9309 if (style >= 0)
9310 ix86_add_queued_cfa_restore_notes (insn);
9311
9312 if (set_cfa)
9313 {
9314 rtx r;
9315
9316 gcc_assert (m->fs.cfa_reg == src);
9317 m->fs.cfa_offset += INTVAL (offset);
9318 m->fs.cfa_reg = dest;
9319
9320 r = gen_rtx_PLUS (Pmode, src, offset);
9321 r = gen_rtx_SET (VOIDmode, dest, r);
9322 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9323 RTX_FRAME_RELATED_P (insn) = 1;
9324 }
9325 else if (style < 0)
9326 {
9327 RTX_FRAME_RELATED_P (insn) = 1;
9328 if (add_frame_related_expr)
9329 {
9330 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9331 r = gen_rtx_SET (VOIDmode, dest, r);
9332 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9333 }
9334 }
9335
9336 if (dest == stack_pointer_rtx)
9337 {
9338 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9339 bool valid = m->fs.sp_valid;
9340
9341 if (src == hard_frame_pointer_rtx)
9342 {
9343 valid = m->fs.fp_valid;
9344 ooffset = m->fs.fp_offset;
9345 }
9346 else if (src == crtl->drap_reg)
9347 {
9348 valid = m->fs.drap_valid;
9349 ooffset = 0;
9350 }
9351 else
9352 {
9353 /* Else there are two possibilities: SP itself, which we set
9354 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9355 taken care of by hand along the eh_return path. */
9356 gcc_checking_assert (src == stack_pointer_rtx
9357 || offset == const0_rtx);
9358 }
9359
9360 m->fs.sp_offset = ooffset - INTVAL (offset);
9361 m->fs.sp_valid = valid;
9362 }
9363 }
9364
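/* For example, the prologue below calls

     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
                                GEN_INT (-allocate), -1,
                                m->fs.cfa_reg == stack_pointer_rtx);

   which adjusts the stack pointer downward in a single instruction (via
   the *_add pattern with a negative constant), marks it frame-related,
   and attaches a REG_CFA_ADJUST_CFA note while the stack pointer is
   still the CFA register.  */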
9365 /* Find an available register to be used as the dynamic realign argument
9366 pointer register. Such a register will be written in the prologue and
9367 used at the beginning of the body, so it must not be
9368 1. parameter passing register.
9369 2. GOT pointer.
9370 We reuse static-chain register if it is available. Otherwise, we
9371 use DI for i386 and R13 for x86-64. We chose R13 since it has
9372 longer encoding.
9373
9374 Return: the regno of chosen register. */
9375
9376 static unsigned int
9377 find_drap_reg (void)
9378 {
9379 tree decl = cfun->decl;
9380
9381 if (TARGET_64BIT)
9382 {
9383 /* Use R13 for a nested function or a function that needs a static chain.
9384 Since a function with a tail call may use any caller-saved
9385 registers in the epilogue, DRAP must not use a caller-saved
9386 register in such a case. */
9387 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9388 return R13_REG;
9389
9390 return R10_REG;
9391 }
9392 else
9393 {
9394 /* Use DI for a nested function or a function that needs a static chain.
9395 Since a function with a tail call may use any caller-saved
9396 registers in the epilogue, DRAP must not use a caller-saved
9397 register in such a case. */
9398 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9399 return DI_REG;
9400
9401 /* Reuse static chain register if it isn't used for parameter
9402 passing. */
9403 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9404 {
9405 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9406 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9407 return CX_REG;
9408 }
9409 return DI_REG;
9410 }
9411 }
9412
9413 /* Return minimum incoming stack alignment. */
9414
9415 static unsigned int
9416 ix86_minimum_incoming_stack_boundary (bool sibcall)
9417 {
9418 unsigned int incoming_stack_boundary;
9419
9420 /* Prefer the one specified at command line. */
9421 if (ix86_user_incoming_stack_boundary)
9422 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9423 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9424 if -mstackrealign is used, this isn't a sibcall check, and the
9425 estimated stack alignment is 128 bits. */
9426 else if (!sibcall
9427 && !TARGET_64BIT
9428 && ix86_force_align_arg_pointer
9429 && crtl->stack_alignment_estimated == 128)
9430 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9431 else
9432 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9433
9434 /* Incoming stack alignment can be changed on individual functions
9435 via force_align_arg_pointer attribute. We use the smallest
9436 incoming stack boundary. */
9437 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9438 && lookup_attribute (ix86_force_align_arg_pointer_string,
9439 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9440 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9441
9442 /* The incoming stack frame has to be aligned at least at
9443 parm_stack_boundary. */
9444 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9445 incoming_stack_boundary = crtl->parm_stack_boundary;
9446
9447 /* Stack at entrance of main is aligned by runtime. We use the
9448 smallest incoming stack boundary. */
9449 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9450 && DECL_NAME (current_function_decl)
9451 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9452 && DECL_FILE_SCOPE_P (current_function_decl))
9453 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9454
9455 return incoming_stack_boundary;
9456 }
9457
9458 /* Update incoming stack boundary and estimated stack alignment. */
9459
9460 static void
9461 ix86_update_stack_boundary (void)
9462 {
9463 ix86_incoming_stack_boundary
9464 = ix86_minimum_incoming_stack_boundary (false);
9465
9466 /* x86_64 varargs functions need 16-byte stack alignment for the register
9467 save area. */
9468 if (TARGET_64BIT
9469 && cfun->stdarg
9470 && crtl->stack_alignment_estimated < 128)
9471 crtl->stack_alignment_estimated = 128;
9472 }
9473
9474 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9475 needed or an rtx for DRAP otherwise. */
9476
9477 static rtx
9478 ix86_get_drap_rtx (void)
9479 {
9480 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9481 crtl->need_drap = true;
9482
9483 if (stack_realign_drap)
9484 {
9485 /* Assign DRAP to vDRAP and return vDRAP. */
9486 unsigned int regno = find_drap_reg ();
9487 rtx drap_vreg;
9488 rtx arg_ptr;
9489 rtx seq, insn;
9490
9491 arg_ptr = gen_rtx_REG (Pmode, regno);
9492 crtl->drap_reg = arg_ptr;
9493
9494 start_sequence ();
9495 drap_vreg = copy_to_reg (arg_ptr);
9496 seq = get_insns ();
9497 end_sequence ();
9498
9499 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9500 if (!optimize)
9501 {
9502 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9503 RTX_FRAME_RELATED_P (insn) = 1;
9504 }
9505 return drap_vreg;
9506 }
9507 else
9508 return NULL;
9509 }
9510
9511 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9512
9513 static rtx
9514 ix86_internal_arg_pointer (void)
9515 {
9516 return virtual_incoming_args_rtx;
9517 }
9518
9519 struct scratch_reg {
9520 rtx reg;
9521 bool saved;
9522 };
9523
9524 /* Return a short-lived scratch register for use on function entry.
9525 In 32-bit mode, it is valid only after the registers are saved
9526 in the prologue. This register must be released by means of
9527 release_scratch_register_on_entry once it is dead. */
9528
9529 static void
9530 get_scratch_register_on_entry (struct scratch_reg *sr)
9531 {
9532 int regno;
9533
9534 sr->saved = false;
9535
9536 if (TARGET_64BIT)
9537 {
9538 /* We always use R11 in 64-bit mode. */
9539 regno = R11_REG;
9540 }
9541 else
9542 {
9543 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9544 bool fastcall_p
9545 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9546 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9547 int regparm = ix86_function_regparm (fntype, decl);
9548 int drap_regno
9549 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9550
9551 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9552 for the static chain register. */
9553 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9554 && drap_regno != AX_REG)
9555 regno = AX_REG;
9556 else if (regparm < 2 && drap_regno != DX_REG)
9557 regno = DX_REG;
9558 /* ecx is the static chain register. */
9559 else if (regparm < 3 && !fastcall_p && !static_chain_p
9560 && drap_regno != CX_REG)
9561 regno = CX_REG;
9562 else if (ix86_save_reg (BX_REG, true))
9563 regno = BX_REG;
9564 /* esi is the static chain register. */
9565 else if (!(regparm == 3 && static_chain_p)
9566 && ix86_save_reg (SI_REG, true))
9567 regno = SI_REG;
9568 else if (ix86_save_reg (DI_REG, true))
9569 regno = DI_REG;
9570 else
9571 {
9572 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9573 sr->saved = true;
9574 }
9575 }
9576
9577 sr->reg = gen_rtx_REG (Pmode, regno);
9578 if (sr->saved)
9579 {
9580 rtx insn = emit_insn (gen_push (sr->reg));
9581 RTX_FRAME_RELATED_P (insn) = 1;
9582 }
9583 }
9584
9585 /* Release a scratch register obtained from the preceding function. */
9586
9587 static void
9588 release_scratch_register_on_entry (struct scratch_reg *sr)
9589 {
9590 if (sr->saved)
9591 {
9592 rtx x, insn = emit_insn (gen_pop (sr->reg));
9593
9594 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9595 RTX_FRAME_RELATED_P (insn) = 1;
9596 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9597 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9598 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9599 }
9600 }
9601
9602 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9603
9604 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9605
9606 static void
9607 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9608 {
9609 /* We skip the probe for the first interval + a small dope of 4 words and
9610 probe that many bytes past the specified size to maintain a protection
9611 area at the bottom of the stack. */
9612 const int dope = 4 * UNITS_PER_WORD;
9613 rtx size_rtx = GEN_INT (size), last;
9614
9615 /* See if we have a constant small number of probes to generate. If so,
9616 that's the easy case. The run-time loop is made up of 11 insns in the
9617 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9618 for n # of intervals. */
9619 if (size <= 5 * PROBE_INTERVAL)
9620 {
9621 HOST_WIDE_INT i, adjust;
9622 bool first_probe = true;
9623
9624 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9625 values of N from 1 until it exceeds SIZE. If only one probe is
9626 needed, this will not generate any code. Then adjust and probe
9627 to PROBE_INTERVAL + SIZE. */
9628 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9629 {
9630 if (first_probe)
9631 {
9632 adjust = 2 * PROBE_INTERVAL + dope;
9633 first_probe = false;
9634 }
9635 else
9636 adjust = PROBE_INTERVAL;
9637
9638 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9639 plus_constant (stack_pointer_rtx, -adjust)));
9640 emit_stack_probe (stack_pointer_rtx);
9641 }
9642
9643 if (first_probe)
9644 adjust = size + PROBE_INTERVAL + dope;
9645 else
9646 adjust = size + PROBE_INTERVAL - i;
9647
9648 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9649 plus_constant (stack_pointer_rtx, -adjust)));
9650 emit_stack_probe (stack_pointer_rtx);
9651
9652 /* Adjust back to account for the additional first interval. */
9653 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9654 plus_constant (stack_pointer_rtx,
9655 PROBE_INTERVAL + dope)));
9656 }
9657
9658 /* Otherwise, do the same as above, but in a loop. Note that we must be
9659 extra careful with variables wrapping around because we might be at
9660 the very top (or the very bottom) of the address space and we have
9661 to be able to handle this case properly; in particular, we use an
9662 equality test for the loop condition. */
9663 else
9664 {
9665 HOST_WIDE_INT rounded_size;
9666 struct scratch_reg sr;
9667
9668 get_scratch_register_on_entry (&sr);
9669
9670
9671 /* Step 1: round SIZE to the previous multiple of the interval. */
9672
9673 rounded_size = size & -PROBE_INTERVAL;
9674
9675
9676 /* Step 2: compute initial and final value of the loop counter. */
9677
9678 /* SP = SP_0 + PROBE_INTERVAL. */
9679 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9680 plus_constant (stack_pointer_rtx,
9681 - (PROBE_INTERVAL + dope))));
9682
9683 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9684 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9685 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9686 gen_rtx_PLUS (Pmode, sr.reg,
9687 stack_pointer_rtx)));
9688
9689
9690 /* Step 3: the loop
9691
9692 while (SP != LAST_ADDR)
9693 {
9694 SP = SP + PROBE_INTERVAL
9695 probe at SP
9696 }
9697
9698 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9699 values of N from 1 until it is equal to ROUNDED_SIZE. */
9700
9701 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9702
9703
9704 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9705 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9706
9707 if (size != rounded_size)
9708 {
9709 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9710 plus_constant (stack_pointer_rtx,
9711 rounded_size - size)));
9712 emit_stack_probe (stack_pointer_rtx);
9713 }
9714
9715 /* Adjust back to account for the additional first interval. */
9716 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9717 plus_constant (stack_pointer_rtx,
9718 PROBE_INTERVAL + dope)));
9719
9720 release_scratch_register_on_entry (&sr);
9721 }
9722
9723 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9724
9725 /* Even if the stack pointer isn't the CFA register, we need to correctly
9726 describe the adjustments made to it, in particular differentiate the
9727 frame-related ones from the frame-unrelated ones. */
9728 if (size > 0)
9729 {
9730 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9731 XVECEXP (expr, 0, 0)
9732 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9733 plus_constant (stack_pointer_rtx, -size));
9734 XVECEXP (expr, 0, 1)
9735 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9736 plus_constant (stack_pointer_rtx,
9737 PROBE_INTERVAL + dope + size));
9738 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9739 RTX_FRAME_RELATED_P (last) = 1;
9740
9741 cfun->machine->fs.sp_offset += size;
9742 }
9743
9744 /* Make sure nothing is scheduled before we are done. */
9745 emit_insn (gen_blockage ());
9746 }
9747
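/* A worked example of the constant path above, assuming PROBE_INTERVAL of
   4096 and a 64-bit target (dope == 32), for SIZE == 10000:

     sub  $8224, %rsp      (2*PROBE_INTERVAL + dope)        probe (%rsp)
     sub  $4096, %rsp      (PROBE_INTERVAL)                  probe (%rsp)
     sub  $1808, %rsp      (SIZE + PROBE_INTERVAL - 12288)   probe (%rsp)
     add  $4128, %rsp      (PROBE_INTERVAL + dope)

   for a net adjustment of exactly -SIZE; consecutive probes stay within
   PROBE_INTERVAL of each other, relying on the caller-maintained
   protection area for the first PROBE_INTERVAL + dope bytes.  */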
9748 /* Adjust the stack pointer up to REG while probing it. */
9749
9750 const char *
9751 output_adjust_stack_and_probe (rtx reg)
9752 {
9753 static int labelno = 0;
9754 char loop_lab[32], end_lab[32];
9755 rtx xops[2];
9756
9757 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9758 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9759
9760 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9761
9762 /* Jump to END_LAB if SP == LAST_ADDR. */
9763 xops[0] = stack_pointer_rtx;
9764 xops[1] = reg;
9765 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9766 fputs ("\tje\t", asm_out_file);
9767 assemble_name_raw (asm_out_file, end_lab);
9768 fputc ('\n', asm_out_file);
9769
9770 /* SP = SP + PROBE_INTERVAL. */
9771 xops[1] = GEN_INT (PROBE_INTERVAL);
9772 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9773
9774 /* Probe at SP. */
9775 xops[1] = const0_rtx;
9776 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9777
9778 fprintf (asm_out_file, "\tjmp\t");
9779 assemble_name_raw (asm_out_file, loop_lab);
9780 fputc ('\n', asm_out_file);
9781
9782 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9783
9784 return "";
9785 }
9786
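/* The loop emitted above looks roughly like this in 64-bit AT&T syntax,
   assuming PROBE_INTERVAL of 4096 and %r11 as the scratch register
   (label names are illustrative):

   .LPSRL0:
        cmpq    %r11, %rsp
        je      .LPSRE0
        subq    $4096, %rsp
        orq     $0, (%rsp)
        jmp     .LPSRL0
   .LPSRE0:  */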
9787 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9788 inclusive. These are offsets from the current stack pointer. */
9789
9790 static void
9791 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9792 {
9793 /* See if we have a constant small number of probes to generate. If so,
9794 that's the easy case. The run-time loop is made up of 7 insns in the
9795 generic case while the compile-time loop is made up of n insns for n #
9796 of intervals. */
9797 if (size <= 7 * PROBE_INTERVAL)
9798 {
9799 HOST_WIDE_INT i;
9800
9801 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9802 it exceeds SIZE. If only one probe is needed, this will not
9803 generate any code. Then probe at FIRST + SIZE. */
9804 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9805 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9806
9807 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9808 }
9809
9810 /* Otherwise, do the same as above, but in a loop. Note that we must be
9811 extra careful with variables wrapping around because we might be at
9812 the very top (or the very bottom) of the address space and we have
9813 to be able to handle this case properly; in particular, we use an
9814 equality test for the loop condition. */
9815 else
9816 {
9817 HOST_WIDE_INT rounded_size, last;
9818 struct scratch_reg sr;
9819
9820 get_scratch_register_on_entry (&sr);
9821
9822
9823 /* Step 1: round SIZE to the previous multiple of the interval. */
9824
9825 rounded_size = size & -PROBE_INTERVAL;
9826
9827
9828 /* Step 2: compute initial and final value of the loop counter. */
9829
9830 /* TEST_OFFSET = FIRST. */
9831 emit_move_insn (sr.reg, GEN_INT (-first));
9832
9833 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9834 last = first + rounded_size;
9835
9836
9837 /* Step 3: the loop
9838
9839 while (TEST_ADDR != LAST_ADDR)
9840 {
9841 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9842 probe at TEST_ADDR
9843 }
9844
9845 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9846 until it is equal to ROUNDED_SIZE. */
9847
9848 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9849
9850
9851 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9852 that SIZE is equal to ROUNDED_SIZE. */
9853
9854 if (size != rounded_size)
9855 emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
9856 stack_pointer_rtx,
9857 sr.reg),
9858 rounded_size - size));
9859
9860 release_scratch_register_on_entry (&sr);
9861 }
9862
9863 /* Make sure nothing is scheduled before we are done. */
9864 emit_insn (gen_blockage ());
9865 }
9866
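/* A worked example of the constant path above, with illustrative values
   FIRST == 4096, SIZE == 10000 and PROBE_INTERVAL of 4096: probes are
   emitted at -8192(%rsp), -12288(%rsp) and finally -14096(%rsp), i.e. at
   FIRST + N * PROBE_INTERVAL below the stack pointer and then at
   FIRST + SIZE, without moving the stack pointer itself.  */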
9867 /* Probe a range of stack addresses from REG to END, inclusive. These are
9868 offsets from the current stack pointer. */
9869
9870 const char *
9871 output_probe_stack_range (rtx reg, rtx end)
9872 {
9873 static int labelno = 0;
9874 char loop_lab[32], end_lab[32];
9875 rtx xops[3];
9876
9877 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9878 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9879
9880 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9881
9882 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
9883 xops[0] = reg;
9884 xops[1] = end;
9885 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9886 fputs ("\tje\t", asm_out_file);
9887 assemble_name_raw (asm_out_file, end_lab);
9888 fputc ('\n', asm_out_file);
9889
9890 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
9891 xops[1] = GEN_INT (PROBE_INTERVAL);
9892 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9893
9894 /* Probe at TEST_ADDR. */
9895 xops[0] = stack_pointer_rtx;
9896 xops[1] = reg;
9897 xops[2] = const0_rtx;
9898 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9899
9900 fprintf (asm_out_file, "\tjmp\t");
9901 assemble_name_raw (asm_out_file, loop_lab);
9902 fputc ('\n', asm_out_file);
9903
9904 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9905
9906 return "";
9907 }
9908
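/* Roughly, in 64-bit AT&T syntax with %r11 holding the (negative) test
   offset, PROBE_INTERVAL of 4096 and an illustrative LAST constant, the
   loop emitted above is

   .LPSRL1:
        cmpq    $-LAST, %r11         # reached LAST_OFFSET yet?
        je      .LPSRE1
        subq    $4096, %r11          # advance the negative test offset
        orq     $0, (%rsp,%r11)      # probe at sp + test offset
        jmp     .LPSRL1
   .LPSRE1:

   Unlike output_adjust_stack_and_probe, the stack pointer never moves;
   only the scratch register walks down the probed range.  */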
9909 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
9910 to be generated in correct form. */
9911 static void
9912 ix86_finalize_stack_realign_flags (void)
9913 {
9914 /* Check if stack realignment is really needed after reload, and
9915 store the result in cfun. */
9916 unsigned int incoming_stack_boundary
9917 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9918 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9919 unsigned int stack_realign = (incoming_stack_boundary
9920 < (current_function_is_leaf
9921 ? crtl->max_used_stack_slot_alignment
9922 : crtl->stack_alignment_needed));
9923
9924 if (crtl->stack_realign_finalized)
9925 {
9926 /* After stack_realign_needed is finalized, we can no longer
9927 change it. */
9928 gcc_assert (crtl->stack_realign_needed == stack_realign);
9929 return;
9930 }
9931
9932 /* If the only reason for frame_pointer_needed is that we conservatively
9933 assumed stack realignment might be needed, but in the end nothing that
9934 needed the stack alignment had been spilled, clear frame_pointer_needed
9935 and say we don't need stack realignment. */
9936 if (stack_realign
9937 && !crtl->need_drap
9938 && frame_pointer_needed
9939 && current_function_is_leaf
9940 && flag_omit_frame_pointer
9941 && current_function_sp_is_unchanging
9942 && !ix86_current_function_calls_tls_descriptor
9943 && !crtl->accesses_prior_frames
9944 && !cfun->calls_alloca
9945 && !crtl->calls_eh_return
9946 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
9947 && !ix86_frame_pointer_required ()
9948 && get_frame_size () == 0
9949 && ix86_nsaved_sseregs () == 0
9950 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
9951 {
9952 HARD_REG_SET set_up_by_prologue, prologue_used;
9953 basic_block bb;
9954
9955 CLEAR_HARD_REG_SET (prologue_used);
9956 CLEAR_HARD_REG_SET (set_up_by_prologue);
9957 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
9958 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
9959 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
9960 HARD_FRAME_POINTER_REGNUM);
9961 FOR_EACH_BB (bb)
9962 {
9963 rtx insn;
9964 FOR_BB_INSNS (bb, insn)
9965 if (NONDEBUG_INSN_P (insn)
9966 && requires_stack_frame_p (insn, prologue_used,
9967 set_up_by_prologue))
9968 {
9969 crtl->stack_realign_needed = stack_realign;
9970 crtl->stack_realign_finalized = true;
9971 return;
9972 }
9973 }
9974
9975 frame_pointer_needed = false;
9976 stack_realign = false;
9977 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
9978 crtl->stack_alignment_needed = incoming_stack_boundary;
9979 crtl->stack_alignment_estimated = incoming_stack_boundary;
9980 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
9981 crtl->preferred_stack_boundary = incoming_stack_boundary;
9982 df_finish_pass (true);
9983 df_scan_alloc (NULL);
9984 df_scan_blocks ();
9985 df_compute_regs_ever_live (true);
9986 df_analyze ();
9987 }
9988
9989 crtl->stack_realign_needed = stack_realign;
9990 crtl->stack_realign_finalized = true;
9991 }
9992
9993 /* Expand the prologue into a bunch of separate insns. */
9994
9995 void
9996 ix86_expand_prologue (void)
9997 {
9998 struct machine_function *m = cfun->machine;
9999 rtx insn, t;
10000 bool pic_reg_used;
10001 struct ix86_frame frame;
10002 HOST_WIDE_INT allocate;
10003 bool int_registers_saved;
10004
10005 ix86_finalize_stack_realign_flags ();
10006
10007 /* DRAP should not coexist with stack_realign_fp */
10008 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10009
10010 memset (&m->fs, 0, sizeof (m->fs));
10011
10012 /* Initialize CFA state for before the prologue. */
10013 m->fs.cfa_reg = stack_pointer_rtx;
10014 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10015
10016 /* Track SP offset to the CFA. We continue tracking this after we've
10017 swapped the CFA register away from SP. In the case of re-alignment
10018 this is fudged; we're interested in offsets within the local frame. */
10019 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10020 m->fs.sp_valid = true;
10021
10022 ix86_compute_frame_layout (&frame);
10023
10024 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10025 {
10026 /* We should have already generated an error for any use of
10027 ms_hook on a nested function. */
10028 gcc_checking_assert (!ix86_static_chain_on_stack);
10029
10030 /* Check whether profiling is active and we shall use the
10031 profiling-before-prologue variant. If so, sorry. */
10032 if (crtl->profile && flag_fentry != 0)
10033 sorry ("ms_hook_prologue attribute isn%'t compatible "
10034 "with -mfentry for 32-bit");
10035
10036 /* In ix86_asm_output_function_label we emitted:
10037 8b ff movl.s %edi,%edi
10038 55 push %ebp
10039 8b ec movl.s %esp,%ebp
10040
10041 This matches the hookable function prologue in Win32 API
10042 functions in Microsoft Windows XP Service Pack 2 and newer.
10043 Wine uses this to enable Windows apps to hook the Win32 API
10044 functions provided by Wine.
10045
10046 What that means is that we've already set up the frame pointer. */
10047
10048 if (frame_pointer_needed
10049 && !(crtl->drap_reg && crtl->stack_realign_needed))
10050 {
10051 rtx push, mov;
10052
10053 /* We've decided to use the frame pointer already set up.
10054 Describe this to the unwinder by pretending that both
10055 push and mov insns happen right here.
10056
10057 Putting the unwind info here at the end of the ms_hook
10058 is done so that we can make absolutely certain we get
10059 the required byte sequence at the start of the function,
10060 rather than relying on an assembler that can produce
10061 the exact encoding required.
10062
10063 However it does mean (in the unpatched case) that we have
10064 a 1 insn window where the asynchronous unwind info is
10065 incorrect. However, if we placed the unwind info at
10066 its correct location we would have incorrect unwind info
10067 in the patched case. That is probably all moot since
10068 I don't expect Wine to generate dwarf2 unwind info for the
10069 system libraries that use this feature. */
10070
10071 insn = emit_insn (gen_blockage ());
10072
10073 push = gen_push (hard_frame_pointer_rtx);
10074 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10075 stack_pointer_rtx);
10076 RTX_FRAME_RELATED_P (push) = 1;
10077 RTX_FRAME_RELATED_P (mov) = 1;
10078
10079 RTX_FRAME_RELATED_P (insn) = 1;
10080 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10081 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10082
10083 /* Note that gen_push incremented m->fs.cfa_offset, even
10084 though we didn't emit the push insn here. */
10085 m->fs.cfa_reg = hard_frame_pointer_rtx;
10086 m->fs.fp_offset = m->fs.cfa_offset;
10087 m->fs.fp_valid = true;
10088 }
10089 else
10090 {
10091 /* The frame pointer is not needed so pop %ebp again.
10092 This leaves us with a pristine state. */
10093 emit_insn (gen_pop (hard_frame_pointer_rtx));
10094 }
10095 }
10096
10097 /* The first insn of a function that accepts its static chain on the
10098 stack is to push the register that would be filled in by a direct
10099 call. This insn will be skipped by the trampoline. */
10100 else if (ix86_static_chain_on_stack)
10101 {
10102 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10103 emit_insn (gen_blockage ());
10104
10105 /* We don't want to interpret this push insn as a register save,
10106 only as a stack adjustment. The real copy of the register as
10107 a save will be done later, if needed. */
10108 t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
10109 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10110 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10111 RTX_FRAME_RELATED_P (insn) = 1;
10112 }
10113
10114 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10115 DRAP is needed and stack realignment is really needed after reload. */
10116 if (stack_realign_drap)
10117 {
10118 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10119
10120 /* Only need to push parameter pointer reg if it is caller saved. */
10121 if (!call_used_regs[REGNO (crtl->drap_reg)])
10122 {
10123 /* Push arg pointer reg */
10124 insn = emit_insn (gen_push (crtl->drap_reg));
10125 RTX_FRAME_RELATED_P (insn) = 1;
10126 }
10127
10128 /* Grab the argument pointer. */
10129 t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
10130 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10131 RTX_FRAME_RELATED_P (insn) = 1;
10132 m->fs.cfa_reg = crtl->drap_reg;
10133 m->fs.cfa_offset = 0;
10134
10135 /* Align the stack. */
10136 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10137 stack_pointer_rtx,
10138 GEN_INT (-align_bytes)));
10139 RTX_FRAME_RELATED_P (insn) = 1;
10140
10141 /* Replicate the return address on the stack so that return
10142 address can be reached via (argp - 1) slot. This is needed
10143 to implement macro RETURN_ADDR_RTX and intrinsic function
10144 expand_builtin_return_addr etc. */
10145 t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
10146 t = gen_frame_mem (Pmode, t);
10147 insn = emit_insn (gen_push (t));
10148 RTX_FRAME_RELATED_P (insn) = 1;
10149
10150 /* For the purposes of frame and register save area addressing,
10151 we've started over with a new frame. */
10152 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10153 m->fs.realigned = true;
10154 }
10155
10156 if (frame_pointer_needed && !m->fs.fp_valid)
10157 {
10158 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10159 slower on all targets. Also sdb doesn't like it. */
10160 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10161 RTX_FRAME_RELATED_P (insn) = 1;
10162
10163 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10164 {
10165 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10166 RTX_FRAME_RELATED_P (insn) = 1;
10167
10168 if (m->fs.cfa_reg == stack_pointer_rtx)
10169 m->fs.cfa_reg = hard_frame_pointer_rtx;
10170 m->fs.fp_offset = m->fs.sp_offset;
10171 m->fs.fp_valid = true;
10172 }
10173 }
10174
10175 int_registers_saved = (frame.nregs == 0);
10176
10177 if (!int_registers_saved)
10178 {
10179 /* If saving registers via PUSH, do so now. */
10180 if (!frame.save_regs_using_mov)
10181 {
10182 ix86_emit_save_regs ();
10183 int_registers_saved = true;
10184 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10185 }
10186
10187 /* When using the red zone we may start saving registers before allocating
10188 the stack frame, saving one cycle of the prologue. However, avoid
10189 doing this if we have to probe the stack; at least on x86_64 the
10190 stack probe can turn into a call that clobbers a red zone location. */
10191 else if (ix86_using_red_zone ()
10192 && (! TARGET_STACK_PROBE
10193 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10194 {
10195 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10196 int_registers_saved = true;
10197 }
10198 }
10199
10200 if (stack_realign_fp)
10201 {
10202 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10203 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10204
10205 /* The computation of the size of the re-aligned stack frame means
10206 that we must allocate the size of the register save area before
10207 performing the actual alignment. Otherwise we cannot guarantee
10208 that there's enough storage above the realignment point. */
10209 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10210 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10211 GEN_INT (m->fs.sp_offset
10212 - frame.sse_reg_save_offset),
10213 -1, false);
10214
10215 /* Align the stack. */
10216 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10217 stack_pointer_rtx,
10218 GEN_INT (-align_bytes)));
10219
10220 /* For the purposes of register save area addressing, the stack
10221 pointer is no longer valid. As for the value of sp_offset,
10222 see ix86_compute_frame_layout, which we need to match in order
10223 to pass verification of stack_pointer_offset at the end. */
10224 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10225 m->fs.sp_valid = false;
10226 }
10227
10228 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10229
10230 if (flag_stack_usage_info)
10231 {
10232 /* We start to count from ARG_POINTER. */
10233 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10234
10235 /* If it was realigned, take into account the fake frame. */
10236 if (stack_realign_drap)
10237 {
10238 if (ix86_static_chain_on_stack)
10239 stack_size += UNITS_PER_WORD;
10240
10241 if (!call_used_regs[REGNO (crtl->drap_reg)])
10242 stack_size += UNITS_PER_WORD;
10243
10244 /* This over-estimates by 1 minimal-stack-alignment-unit but
10245 mitigates that by counting in the new return address slot. */
10246 current_function_dynamic_stack_size
10247 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10248 }
10249
10250 current_function_static_stack_size = stack_size;
10251 }
10252
10253 /* The stack has already been decremented by the instruction calling us
10254 so probe if the size is non-negative to preserve the protection area. */
10255 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10256 {
10257 /* We expect the registers to be saved when probes are used. */
10258 gcc_assert (int_registers_saved);
10259
10260 if (STACK_CHECK_MOVING_SP)
10261 {
10262 ix86_adjust_stack_and_probe (allocate);
10263 allocate = 0;
10264 }
10265 else
10266 {
10267 HOST_WIDE_INT size = allocate;
10268
10269 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10270 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10271
10272 if (TARGET_STACK_PROBE)
10273 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10274 else
10275 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10276 }
10277 }
10278
10279 if (allocate == 0)
10280 ;
10281 else if (!ix86_target_stack_probe ()
10282 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10283 {
10284 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10285 GEN_INT (-allocate), -1,
10286 m->fs.cfa_reg == stack_pointer_rtx);
10287 }
10288 else
10289 {
10290 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10291 rtx r10 = NULL;
10292 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10293
10294 bool eax_live = false;
10295 bool r10_live = false;
10296
10297 if (TARGET_64BIT)
10298 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10299 if (!TARGET_64BIT_MS_ABI)
10300 eax_live = ix86_eax_live_at_start_p ();
10301
10302 if (eax_live)
10303 {
10304 emit_insn (gen_push (eax));
10305 allocate -= UNITS_PER_WORD;
10306 }
10307 if (r10_live)
10308 {
10309 r10 = gen_rtx_REG (Pmode, R10_REG);
10310 emit_insn (gen_push (r10));
10311 allocate -= UNITS_PER_WORD;
10312 }
10313
10314 emit_move_insn (eax, GEN_INT (allocate));
10315 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10316
10317 /* Use the fact that AX still contains ALLOCATE. */
10318 adjust_stack_insn = (TARGET_64BIT
10319 ? gen_pro_epilogue_adjust_stack_di_sub
10320 : gen_pro_epilogue_adjust_stack_si_sub);
10321
10322 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10323 stack_pointer_rtx, eax));
10324
10325 /* Note that SEH directives need to continue tracking the stack
10326 pointer even after the frame pointer has been set up. */
10327 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10328 {
10329 if (m->fs.cfa_reg == stack_pointer_rtx)
10330 m->fs.cfa_offset += allocate;
10331
10332 RTX_FRAME_RELATED_P (insn) = 1;
10333 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10334 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10335 plus_constant (stack_pointer_rtx,
10336 -allocate)));
10337 }
10338 m->fs.sp_offset += allocate;
10339
10340 if (r10_live && eax_live)
10341 {
10342 t = choose_baseaddr (m->fs.sp_offset - allocate);
10343 emit_move_insn (r10, gen_frame_mem (Pmode, t));
10344 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10345 emit_move_insn (eax, gen_frame_mem (Pmode, t));
10346 }
10347 else if (eax_live || r10_live)
10348 {
10349 t = choose_baseaddr (m->fs.sp_offset - allocate);
10350 emit_move_insn ((eax_live ? eax : r10), gen_frame_mem (Pmode, t));
10351 }
10352 }
10353 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10354
10355 /* If we haven't already set up the frame pointer, do so now. */
10356 if (frame_pointer_needed && !m->fs.fp_valid)
10357 {
10358 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10359 GEN_INT (frame.stack_pointer_offset
10360 - frame.hard_frame_pointer_offset));
10361 insn = emit_insn (insn);
10362 RTX_FRAME_RELATED_P (insn) = 1;
10363 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10364
10365 if (m->fs.cfa_reg == stack_pointer_rtx)
10366 m->fs.cfa_reg = hard_frame_pointer_rtx;
10367 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10368 m->fs.fp_valid = true;
10369 }
10370
10371 if (!int_registers_saved)
10372 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10373 if (frame.nsseregs)
10374 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10375
10376 pic_reg_used = false;
10377 if (pic_offset_table_rtx
10378 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10379 || crtl->profile))
10380 {
10381 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10382
10383 if (alt_pic_reg_used != INVALID_REGNUM)
10384 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10385
10386 pic_reg_used = true;
10387 }
10388
10389 if (pic_reg_used)
10390 {
10391 if (TARGET_64BIT)
10392 {
10393 if (ix86_cmodel == CM_LARGE_PIC)
10394 {
10395 rtx tmp_reg = gen_rtx_REG (DImode, R11_REG);
10396 rtx label = gen_label_rtx ();
10397 emit_label (label);
10398 LABEL_PRESERVE_P (label) = 1;
10399 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10400 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label));
10401 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10402 insn = emit_insn (gen_adddi3 (pic_offset_table_rtx,
10403 pic_offset_table_rtx, tmp_reg));
10404 }
10405 else
10406 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10407 }
10408 else
10409 {
10410 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10411 RTX_FRAME_RELATED_P (insn) = 1;
10412 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10413 }
10414 }
10415
 10416 /* In the pic_reg_used case, make sure that the GOT load isn't deleted
 10417 when mcount needs it. A blockage to avoid call movement across the
 10418 mcount call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
 10419 note. */
10420 if (crtl->profile && !flag_fentry && pic_reg_used)
10421 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10422
10423 if (crtl->drap_reg && !crtl->stack_realign_needed)
10424 {
 10425 /* The vDRAP register has been set up, but after reload it turns out that
 10426 stack realignment isn't necessary; emit the prologue to set up the DRAP
 10427 without the stack realignment adjustment. */
10428 t = choose_baseaddr (0);
10429 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10430 }
10431
 10432 /* Prevent instructions from being scheduled into the register save push
 10433 sequence when access to the red-zone area is done through the frame pointer.
 10434 The offset between the frame pointer and the stack pointer is calculated
 10435 relative to the value of the stack pointer at the end of the function
 10436 prologue, and moving instructions that access the red-zone area via the
 10437 frame pointer into the push sequence violates this assumption. */
10438 if (frame_pointer_needed && frame.red_zone_size)
10439 emit_insn (gen_memory_blockage ());
10440
10441 /* Emit cld instruction if stringops are used in the function. */
10442 if (TARGET_CLD && ix86_current_function_needs_cld)
10443 emit_insn (gen_cld ());
10444
10445 /* SEH requires that the prologue end within 256 bytes of the start of
10446 the function. Prevent instruction schedules that would extend that.
10447 Further, prevent alloca modifications to the stack pointer from being
10448 combined with prologue modifications. */
10449 if (TARGET_SEH)
10450 emit_insn (gen_prologue_use (stack_pointer_rtx));
10451 }
10452
10453 /* Emit code to restore REG using a POP insn. */
10454
10455 static void
10456 ix86_emit_restore_reg_using_pop (rtx reg)
10457 {
10458 struct machine_function *m = cfun->machine;
10459 rtx insn = emit_insn (gen_pop (reg));
10460
10461 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10462 m->fs.sp_offset -= UNITS_PER_WORD;
10463
10464 if (m->fs.cfa_reg == crtl->drap_reg
10465 && REGNO (reg) == REGNO (crtl->drap_reg))
10466 {
10467 /* Previously we'd represented the CFA as an expression
10468 like *(%ebp - 8). We've just popped that value from
10469 the stack, which means we need to reset the CFA to
10470 the drap register. This will remain until we restore
10471 the stack pointer. */
10472 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10473 RTX_FRAME_RELATED_P (insn) = 1;
10474
10475 /* This means that the DRAP register is valid for addressing too. */
10476 m->fs.drap_valid = true;
10477 return;
10478 }
10479
10480 if (m->fs.cfa_reg == stack_pointer_rtx)
10481 {
10482 rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10483 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10484 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10485 RTX_FRAME_RELATED_P (insn) = 1;
10486
10487 m->fs.cfa_offset -= UNITS_PER_WORD;
10488 }
10489
10490 /* When the frame pointer is the CFA, and we pop it, we are
10491 swapping back to the stack pointer as the CFA. This happens
10492 for stack frames that don't allocate other data, so we assume
10493 the stack pointer is now pointing at the return address, i.e.
 10494 the function entry state, which makes the offset one word. */
10495 if (reg == hard_frame_pointer_rtx)
10496 {
10497 m->fs.fp_valid = false;
10498 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10499 {
10500 m->fs.cfa_reg = stack_pointer_rtx;
10501 m->fs.cfa_offset -= UNITS_PER_WORD;
10502
10503 add_reg_note (insn, REG_CFA_DEF_CFA,
10504 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10505 GEN_INT (m->fs.cfa_offset)));
10506 RTX_FRAME_RELATED_P (insn) = 1;
10507 }
10508 }
10509 }
10510
10511 /* Emit code to restore saved registers using POP insns. */
10512
10513 static void
10514 ix86_emit_restore_regs_using_pop (void)
10515 {
10516 unsigned int regno;
10517
10518 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10519 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10520 ix86_emit_restore_reg_using_pop (gen_rtx_REG (Pmode, regno));
10521 }
10522
10523 /* Emit code and notes for the LEAVE instruction. */
10524
10525 static void
10526 ix86_emit_leave (void)
10527 {
10528 struct machine_function *m = cfun->machine;
10529 rtx insn = emit_insn (ix86_gen_leave ());
10530
10531 ix86_add_queued_cfa_restore_notes (insn);
10532
10533 gcc_assert (m->fs.fp_valid);
10534 m->fs.sp_valid = true;
10535 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10536 m->fs.fp_valid = false;
10537
10538 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10539 {
10540 m->fs.cfa_reg = stack_pointer_rtx;
10541 m->fs.cfa_offset = m->fs.sp_offset;
10542
10543 add_reg_note (insn, REG_CFA_DEF_CFA,
10544 plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10545 RTX_FRAME_RELATED_P (insn) = 1;
10546 }
10547 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10548 m->fs.fp_offset);
10549 }
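 /* For reference: LEAVE is equivalent to "mov %ebp, %esp" followed by
 "pop %ebp" (or the %rbp/%rsp forms in 64-bit mode), which is why the
 bookkeeping above marks the stack pointer valid at fp_offset minus one
 word and invalidates the frame pointer. */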
10550
10551 /* Emit code to restore saved registers using MOV insns.
10552 First register is restored from CFA - CFA_OFFSET. */
10553 static void
10554 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10555 bool maybe_eh_return)
10556 {
10557 struct machine_function *m = cfun->machine;
10558 unsigned int regno;
10559
10560 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10561 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10562 {
10563 rtx reg = gen_rtx_REG (Pmode, regno);
10564 rtx insn, mem;
10565
10566 mem = choose_baseaddr (cfa_offset);
10567 mem = gen_frame_mem (Pmode, mem);
10568 insn = emit_move_insn (reg, mem);
10569
10570 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10571 {
10572 /* Previously we'd represented the CFA as an expression
 10573 like *(%ebp - 8). We've just loaded that value from
 10574 the stack, which means we need to reset the CFA to
10575 the drap register. This will remain until we restore
10576 the stack pointer. */
10577 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10578 RTX_FRAME_RELATED_P (insn) = 1;
10579
10580 /* This means that the DRAP register is valid for addressing. */
10581 m->fs.drap_valid = true;
10582 }
10583 else
10584 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10585
10586 cfa_offset -= UNITS_PER_WORD;
10587 }
10588 }
10589
 10590 /* Emit code to restore saved SSE registers using MOV insns.
 10591 First register is restored from CFA - CFA_OFFSET. */
10592 static void
10593 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10594 bool maybe_eh_return)
10595 {
10596 unsigned int regno;
10597
10598 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10599 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10600 {
10601 rtx reg = gen_rtx_REG (V4SFmode, regno);
10602 rtx mem;
10603
10604 mem = choose_baseaddr (cfa_offset);
10605 mem = gen_rtx_MEM (V4SFmode, mem);
10606 set_mem_align (mem, 128);
10607 emit_move_insn (reg, mem);
10608
10609 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10610
10611 cfa_offset -= 16;
10612 }
10613 }
10614
10615 /* Emit vzeroupper if needed. */
10616
10617 void
10618 ix86_maybe_emit_epilogue_vzeroupper (void)
10619 {
10620 if (TARGET_VZEROUPPER
10621 && !TREE_THIS_VOLATILE (cfun->decl)
10622 && !cfun->machine->caller_return_avx256_p)
10623 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10624 }
10625
10626 /* Restore function stack, frame, and registers. */
10627
10628 void
10629 ix86_expand_epilogue (int style)
10630 {
10631 struct machine_function *m = cfun->machine;
10632 struct machine_frame_state frame_state_save = m->fs;
10633 struct ix86_frame frame;
10634 bool restore_regs_via_mov;
10635 bool using_drap;
10636
10637 ix86_finalize_stack_realign_flags ();
10638 ix86_compute_frame_layout (&frame);
10639
10640 m->fs.sp_valid = (!frame_pointer_needed
10641 || (current_function_sp_is_unchanging
10642 && !stack_realign_fp));
10643 gcc_assert (!m->fs.sp_valid
10644 || m->fs.sp_offset == frame.stack_pointer_offset);
10645
10646 /* The FP must be valid if the frame pointer is present. */
10647 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10648 gcc_assert (!m->fs.fp_valid
10649 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10650
10651 /* We must have *some* valid pointer to the stack frame. */
10652 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10653
10654 /* The DRAP is never valid at this point. */
10655 gcc_assert (!m->fs.drap_valid);
10656
10657 /* See the comment about red zone and frame
10658 pointer usage in ix86_expand_prologue. */
10659 if (frame_pointer_needed && frame.red_zone_size)
10660 emit_insn (gen_memory_blockage ());
10661
10662 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10663 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10664
10665 /* Determine the CFA offset of the end of the red-zone. */
10666 m->fs.red_zone_offset = 0;
10667 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10668 {
10669 /* The red-zone begins below the return address. */
10670 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10671
10672 /* When the register save area is in the aligned portion of
10673 the stack, determine the maximum runtime displacement that
10674 matches up with the aligned frame. */
10675 if (stack_realign_drap)
10676 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10677 + UNITS_PER_WORD);
10678 }
10679
10680 /* Special care must be taken for the normal return case of a function
10681 using eh_return: the eax and edx registers are marked as saved, but
10682 not restored along this path. Adjust the save location to match. */
10683 if (crtl->calls_eh_return && style != 2)
10684 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10685
10686 /* EH_RETURN requires the use of moves to function properly. */
10687 if (crtl->calls_eh_return)
10688 restore_regs_via_mov = true;
10689 /* SEH requires the use of pops to identify the epilogue. */
10690 else if (TARGET_SEH)
10691 restore_regs_via_mov = false;
 10692 /* If we're only restoring one register and sp is not valid then
 10693 use a move instruction to restore the register, since it's
 10694 less work than reloading sp and popping the register. */
10695 else if (!m->fs.sp_valid && frame.nregs <= 1)
10696 restore_regs_via_mov = true;
10697 else if (TARGET_EPILOGUE_USING_MOVE
10698 && cfun->machine->use_fast_prologue_epilogue
10699 && (frame.nregs > 1
10700 || m->fs.sp_offset != frame.reg_save_offset))
10701 restore_regs_via_mov = true;
10702 else if (frame_pointer_needed
10703 && !frame.nregs
10704 && m->fs.sp_offset != frame.reg_save_offset)
10705 restore_regs_via_mov = true;
10706 else if (frame_pointer_needed
10707 && TARGET_USE_LEAVE
10708 && cfun->machine->use_fast_prologue_epilogue
10709 && frame.nregs == 1)
10710 restore_regs_via_mov = true;
10711 else
10712 restore_regs_via_mov = false;
10713
10714 if (restore_regs_via_mov || frame.nsseregs)
10715 {
10716 /* Ensure that the entire register save area is addressable via
10717 the stack pointer, if we will restore via sp. */
10718 if (TARGET_64BIT
10719 && m->fs.sp_offset > 0x7fffffff
10720 && !(m->fs.fp_valid || m->fs.drap_valid)
10721 && (frame.nsseregs + frame.nregs) != 0)
10722 {
10723 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10724 GEN_INT (m->fs.sp_offset
10725 - frame.sse_reg_save_offset),
10726 style,
10727 m->fs.cfa_reg == stack_pointer_rtx);
10728 }
10729 }
10730
10731 /* If there are any SSE registers to restore, then we have to do it
10732 via moves, since there's obviously no pop for SSE regs. */
10733 if (frame.nsseregs)
10734 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10735 style == 2);
10736
10737 if (restore_regs_via_mov)
10738 {
10739 rtx t;
10740
10741 if (frame.nregs)
10742 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10743
10744 /* eh_return epilogues need %ecx added to the stack pointer. */
10745 if (style == 2)
10746 {
10747 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10748
10749 /* Stack align doesn't work with eh_return. */
10750 gcc_assert (!stack_realign_drap);
 10751 /* Neither do regparm nested functions. */
10752 gcc_assert (!ix86_static_chain_on_stack);
10753
10754 if (frame_pointer_needed)
10755 {
10756 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10757 t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10758 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10759
10760 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10761 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10762
10763 /* Note that we use SA as a temporary CFA, as the return
10764 address is at the proper place relative to it. We
10765 pretend this happens at the FP restore insn because
10766 prior to this insn the FP would be stored at the wrong
10767 offset relative to SA, and after this insn we have no
10768 other reasonable register to use for the CFA. We don't
10769 bother resetting the CFA to the SP for the duration of
10770 the return insn. */
10771 add_reg_note (insn, REG_CFA_DEF_CFA,
10772 plus_constant (sa, UNITS_PER_WORD));
10773 ix86_add_queued_cfa_restore_notes (insn);
10774 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10775 RTX_FRAME_RELATED_P (insn) = 1;
10776
10777 m->fs.cfa_reg = sa;
10778 m->fs.cfa_offset = UNITS_PER_WORD;
10779 m->fs.fp_valid = false;
10780
10781 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10782 const0_rtx, style, false);
10783 }
10784 else
10785 {
10786 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10787 t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10788 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10789 ix86_add_queued_cfa_restore_notes (insn);
10790
10791 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10792 if (m->fs.cfa_offset != UNITS_PER_WORD)
10793 {
10794 m->fs.cfa_offset = UNITS_PER_WORD;
10795 add_reg_note (insn, REG_CFA_DEF_CFA,
10796 plus_constant (stack_pointer_rtx,
10797 UNITS_PER_WORD));
10798 RTX_FRAME_RELATED_P (insn) = 1;
10799 }
10800 }
10801 m->fs.sp_offset = UNITS_PER_WORD;
10802 m->fs.sp_valid = true;
10803 }
10804 }
10805 else
10806 {
10807 /* SEH requires that the function end with (1) a stack adjustment
10808 if necessary, (2) a sequence of pops, and (3) a return or
10809 jump instruction. Prevent insns from the function body from
10810 being scheduled into this sequence. */
10811 if (TARGET_SEH)
10812 {
 10813 /* Prevent a catch region from being adjacent to the standard
 10814 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
 10815 several other flags that would be interesting to test are
 10816 set up yet. */
10817 if (flag_non_call_exceptions)
10818 emit_insn (gen_nops (const1_rtx));
10819 else
10820 emit_insn (gen_blockage ());
10821 }
10822
10823 /* First step is to deallocate the stack frame so that we can
10824 pop the registers. */
10825 if (!m->fs.sp_valid)
10826 {
10827 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10828 GEN_INT (m->fs.fp_offset
10829 - frame.reg_save_offset),
10830 style, false);
10831 }
10832 else if (m->fs.sp_offset != frame.reg_save_offset)
10833 {
10834 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10835 GEN_INT (m->fs.sp_offset
10836 - frame.reg_save_offset),
10837 style,
10838 m->fs.cfa_reg == stack_pointer_rtx);
10839 }
10840
10841 ix86_emit_restore_regs_using_pop ();
10842 }
10843
 10844 /* If we used a frame pointer and haven't already got rid of it,
 10845 then do so now. */
10846 if (m->fs.fp_valid)
10847 {
10848 /* If the stack pointer is valid and pointing at the frame
10849 pointer store address, then we only need a pop. */
10850 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10851 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10852 /* Leave results in shorter dependency chains on CPUs that are
10853 able to grok it fast. */
10854 else if (TARGET_USE_LEAVE
10855 || optimize_function_for_size_p (cfun)
10856 || !cfun->machine->use_fast_prologue_epilogue)
10857 ix86_emit_leave ();
10858 else
10859 {
10860 pro_epilogue_adjust_stack (stack_pointer_rtx,
10861 hard_frame_pointer_rtx,
10862 const0_rtx, style, !using_drap);
10863 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10864 }
10865 }
10866
10867 if (using_drap)
10868 {
10869 int param_ptr_offset = UNITS_PER_WORD;
10870 rtx insn;
10871
10872 gcc_assert (stack_realign_drap);
10873
10874 if (ix86_static_chain_on_stack)
10875 param_ptr_offset += UNITS_PER_WORD;
10876 if (!call_used_regs[REGNO (crtl->drap_reg)])
10877 param_ptr_offset += UNITS_PER_WORD;
10878
10879 insn = emit_insn (gen_rtx_SET
10880 (VOIDmode, stack_pointer_rtx,
10881 gen_rtx_PLUS (Pmode,
10882 crtl->drap_reg,
10883 GEN_INT (-param_ptr_offset))));
10884 m->fs.cfa_reg = stack_pointer_rtx;
10885 m->fs.cfa_offset = param_ptr_offset;
10886 m->fs.sp_offset = param_ptr_offset;
10887 m->fs.realigned = false;
10888
10889 add_reg_note (insn, REG_CFA_DEF_CFA,
10890 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10891 GEN_INT (param_ptr_offset)));
10892 RTX_FRAME_RELATED_P (insn) = 1;
10893
10894 if (!call_used_regs[REGNO (crtl->drap_reg)])
10895 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10896 }
10897
10898 /* At this point the stack pointer must be valid, and we must have
10899 restored all of the registers. We may not have deallocated the
10900 entire stack frame. We've delayed this until now because it may
10901 be possible to merge the local stack deallocation with the
10902 deallocation forced by ix86_static_chain_on_stack. */
10903 gcc_assert (m->fs.sp_valid);
10904 gcc_assert (!m->fs.fp_valid);
10905 gcc_assert (!m->fs.realigned);
10906 if (m->fs.sp_offset != UNITS_PER_WORD)
10907 {
10908 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10909 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10910 style, true);
10911 }
10912 else
10913 ix86_add_queued_cfa_restore_notes (get_last_insn ());
10914
10915 /* Sibcall epilogues don't want a return instruction. */
10916 if (style == 0)
10917 {
10918 m->fs = frame_state_save;
10919 return;
10920 }
10921
10922 /* Emit vzeroupper if needed. */
10923 ix86_maybe_emit_epilogue_vzeroupper ();
10924
10925 if (crtl->args.pops_args && crtl->args.size)
10926 {
10927 rtx popc = GEN_INT (crtl->args.pops_args);
10928
 10929 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
 10930 address, do an explicit add, and jump indirectly to the caller. */
10931
10932 if (crtl->args.pops_args >= 65536)
10933 {
10934 rtx ecx = gen_rtx_REG (SImode, CX_REG);
10935 rtx insn;
10936
10937 /* There is no "pascal" calling convention in any 64bit ABI. */
10938 gcc_assert (!TARGET_64BIT);
10939
10940 insn = emit_insn (gen_pop (ecx));
10941 m->fs.cfa_offset -= UNITS_PER_WORD;
10942 m->fs.sp_offset -= UNITS_PER_WORD;
10943
10944 add_reg_note (insn, REG_CFA_ADJUST_CFA,
10945 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
10946 add_reg_note (insn, REG_CFA_REGISTER,
10947 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
10948 RTX_FRAME_RELATED_P (insn) = 1;
10949
10950 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10951 popc, -1, true);
10952 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
10953 }
10954 else
10955 emit_jump_insn (gen_simple_return_pop_internal (popc));
10956 }
10957 else
10958 emit_jump_insn (gen_simple_return_internal ());
10959
10960 /* Restore the state back to the state from the prologue,
10961 so that it's correct for the next epilogue. */
10962 m->fs = frame_state_save;
10963 }
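 /* A minimal sketch of a typical epilogue produced by the code above for a
 frame-pointer function that saved one call-saved register with a push
 (an assumed common shape; the details vary with the options checked above):

	pop	%ebx		; ix86_emit_restore_regs_using_pop
	leave			; or "mov %ebp, %esp" + "pop %ebp"
	ret			; gen_simple_return_internal  */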
10964
10965 /* Reset from the function's potential modifications. */
10966
10967 static void
10968 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
10969 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
10970 {
10971 if (pic_offset_table_rtx)
10972 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
10973 #if TARGET_MACHO
10974 /* Mach-O doesn't support labels at the end of objects, so if
10975 it looks like we might want one, insert a NOP. */
10976 {
10977 rtx insn = get_last_insn ();
10978 rtx deleted_debug_label = NULL_RTX;
10979 while (insn
10980 && NOTE_P (insn)
10981 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
10982 {
 10983 /* For NOTE_INSN_DELETED_DEBUG_LABEL notes only, don't insert a nop;
 10984 instead set their CODE_LABEL_NUMBER to -1, otherwise there
 10985 would be code generation differences
 10986 between -g and -g0. */
10987 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
10988 deleted_debug_label = insn;
10989 insn = PREV_INSN (insn);
10990 }
10991 if (insn
10992 && (LABEL_P (insn)
10993 || (NOTE_P (insn)
10994 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
10995 fputs ("\tnop\n", file);
10996 else if (deleted_debug_label)
10997 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
10998 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
10999 CODE_LABEL_NUMBER (insn) = -1;
11000 }
11001 #endif
11002
11003 }
11004
11005 /* Return a scratch register to use in the split stack prologue. The
 11006 split stack prologue is used for -fsplit-stack. These are the first
 11007 instructions in the function, even before the regular prologue.
11008 The scratch register can be any caller-saved register which is not
11009 used for parameters or for the static chain. */
11010
11011 static unsigned int
11012 split_stack_prologue_scratch_regno (void)
11013 {
11014 if (TARGET_64BIT)
11015 return R11_REG;
11016 else
11017 {
11018 bool is_fastcall;
11019 int regparm;
11020
11021 is_fastcall = (lookup_attribute ("fastcall",
11022 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11023 != NULL);
11024 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11025
11026 if (is_fastcall)
11027 {
11028 if (DECL_STATIC_CHAIN (cfun->decl))
11029 {
11030 sorry ("-fsplit-stack does not support fastcall with "
11031 "nested function");
11032 return INVALID_REGNUM;
11033 }
11034 return AX_REG;
11035 }
11036 else if (regparm < 3)
11037 {
11038 if (!DECL_STATIC_CHAIN (cfun->decl))
11039 return CX_REG;
11040 else
11041 {
11042 if (regparm >= 2)
11043 {
 11044 sorry ("-fsplit-stack does not support 2 register "
 11045 "parameters for a nested function");
11046 return INVALID_REGNUM;
11047 }
11048 return DX_REG;
11049 }
11050 }
11051 else
11052 {
11053 /* FIXME: We could make this work by pushing a register
11054 around the addition and comparison. */
11055 sorry ("-fsplit-stack does not support 3 register parameters");
11056 return INVALID_REGNUM;
11057 }
11058 }
11059 }
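 /* A quick reference, derived from the logic above (not an independent
 specification): 64-bit uses %r11; 32-bit fastcall uses %eax and rejects a
 static chain; otherwise, with at most two register parameters and no static
 chain, %ecx is used; with at most one register parameter and a static chain,
 %edx; anything else is rejected with sorry (). */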
11060
 11061 /* A SYMBOL_REF for the function which allocates new stack space for
 11062 -fsplit-stack. */
11063
11064 static GTY(()) rtx split_stack_fn;
11065
 11066 /* A SYMBOL_REF for the more-stack function used with the large
 11067 model. */
11068
11069 static GTY(()) rtx split_stack_fn_large;
11070
11071 /* Handle -fsplit-stack. These are the first instructions in the
11072 function, even before the regular prologue. */
11073
11074 void
11075 ix86_expand_split_stack_prologue (void)
11076 {
11077 struct ix86_frame frame;
11078 HOST_WIDE_INT allocate;
11079 unsigned HOST_WIDE_INT args_size;
11080 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11081 rtx scratch_reg = NULL_RTX;
11082 rtx varargs_label = NULL_RTX;
11083 rtx fn;
11084
11085 gcc_assert (flag_split_stack && reload_completed);
11086
11087 ix86_finalize_stack_realign_flags ();
11088 ix86_compute_frame_layout (&frame);
11089 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11090
11091 /* This is the label we will branch to if we have enough stack
11092 space. We expect the basic block reordering pass to reverse this
11093 branch if optimizing, so that we branch in the unlikely case. */
11094 label = gen_label_rtx ();
11095
11096 /* We need to compare the stack pointer minus the frame size with
11097 the stack boundary in the TCB. The stack boundary always gives
11098 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11099 can compare directly. Otherwise we need to do an addition. */
11100
11101 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11102 UNSPEC_STACK_CHECK);
11103 limit = gen_rtx_CONST (Pmode, limit);
11104 limit = gen_rtx_MEM (Pmode, limit);
11105 if (allocate < SPLIT_STACK_AVAILABLE)
11106 current = stack_pointer_rtx;
11107 else
11108 {
11109 unsigned int scratch_regno;
11110 rtx offset;
11111
11112 /* We need a scratch register to hold the stack pointer minus
11113 the required frame size. Since this is the very start of the
11114 function, the scratch register can be any caller-saved
11115 register which is not used for parameters. */
11116 offset = GEN_INT (- allocate);
11117 scratch_regno = split_stack_prologue_scratch_regno ();
11118 if (scratch_regno == INVALID_REGNUM)
11119 return;
11120 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11121 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11122 {
11123 /* We don't use ix86_gen_add3 in this case because it will
11124 want to split to lea, but when not optimizing the insn
11125 will not be split after this point. */
11126 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11127 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11128 offset)));
11129 }
11130 else
11131 {
11132 emit_move_insn (scratch_reg, offset);
11133 emit_insn (gen_adddi3 (scratch_reg, scratch_reg,
11134 stack_pointer_rtx));
11135 }
11136 current = scratch_reg;
11137 }
11138
11139 ix86_expand_branch (GEU, current, limit, label);
11140 jump_insn = get_last_insn ();
11141 JUMP_LABEL (jump_insn) = label;
11142
11143 /* Mark the jump as very likely to be taken. */
11144 add_reg_note (jump_insn, REG_BR_PROB,
11145 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11146
11147 if (split_stack_fn == NULL_RTX)
11148 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11149 fn = split_stack_fn;
11150
11151 /* Get more stack space. We pass in the desired stack space and the
11152 size of the arguments to copy to the new stack. In 32-bit mode
11153 we push the parameters; __morestack will return on a new stack
11154 anyhow. In 64-bit mode we pass the parameters in r10 and
11155 r11. */
11156 allocate_rtx = GEN_INT (allocate);
11157 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11158 call_fusage = NULL_RTX;
11159 if (TARGET_64BIT)
11160 {
11161 rtx reg10, reg11;
11162
11163 reg10 = gen_rtx_REG (Pmode, R10_REG);
11164 reg11 = gen_rtx_REG (Pmode, R11_REG);
11165
11166 /* If this function uses a static chain, it will be in %r10.
11167 Preserve it across the call to __morestack. */
11168 if (DECL_STATIC_CHAIN (cfun->decl))
11169 {
11170 rtx rax;
11171
11172 rax = gen_rtx_REG (Pmode, AX_REG);
11173 emit_move_insn (rax, reg10);
11174 use_reg (&call_fusage, rax);
11175 }
11176
11177 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11178 {
11179 HOST_WIDE_INT argval;
11180
11181 /* When using the large model we need to load the address
11182 into a register, and we've run out of registers. So we
11183 switch to a different calling convention, and we call a
11184 different function: __morestack_large. We pass the
11185 argument size in the upper 32 bits of r10 and pass the
11186 frame size in the lower 32 bits. */
11187 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11188 gcc_assert ((args_size & 0xffffffff) == args_size);
11189
11190 if (split_stack_fn_large == NULL_RTX)
11191 split_stack_fn_large =
11192 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11193
11194 if (ix86_cmodel == CM_LARGE_PIC)
11195 {
11196 rtx label, x;
11197
11198 label = gen_label_rtx ();
11199 emit_label (label);
11200 LABEL_PRESERVE_P (label) = 1;
11201 emit_insn (gen_set_rip_rex64 (reg10, label));
11202 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11203 emit_insn (gen_adddi3 (reg10, reg10, reg11));
11204 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11205 UNSPEC_GOT);
11206 x = gen_rtx_CONST (Pmode, x);
11207 emit_move_insn (reg11, x);
11208 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11209 x = gen_const_mem (Pmode, x);
11210 emit_move_insn (reg11, x);
11211 }
11212 else
11213 emit_move_insn (reg11, split_stack_fn_large);
11214
11215 fn = reg11;
11216
11217 argval = ((args_size << 16) << 16) + allocate;
11218 emit_move_insn (reg10, GEN_INT (argval));
11219 }
11220 else
11221 {
11222 emit_move_insn (reg10, allocate_rtx);
11223 emit_move_insn (reg11, GEN_INT (args_size));
11224 use_reg (&call_fusage, reg11);
11225 }
11226
11227 use_reg (&call_fusage, reg10);
11228 }
11229 else
11230 {
11231 emit_insn (gen_push (GEN_INT (args_size)));
11232 emit_insn (gen_push (allocate_rtx));
11233 }
11234 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11235 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11236 NULL_RTX, false);
11237 add_function_usage_to (call_insn, call_fusage);
11238
11239 /* In order to make call/return prediction work right, we now need
11240 to execute a return instruction. See
11241 libgcc/config/i386/morestack.S for the details on how this works.
11242
11243 For flow purposes gcc must not see this as a return
11244 instruction--we need control flow to continue at the subsequent
11245 label. Therefore, we use an unspec. */
11246 gcc_assert (crtl->args.pops_args < 65536);
11247 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11248
11249 /* If we are in 64-bit mode and this function uses a static chain,
 11250 we saved %r10 in %rax before calling __morestack. */
11251 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11252 emit_move_insn (gen_rtx_REG (Pmode, R10_REG),
11253 gen_rtx_REG (Pmode, AX_REG));
11254
11255 /* If this function calls va_start, we need to store a pointer to
11256 the arguments on the old stack, because they may not have been
11257 all copied to the new stack. At this point the old stack can be
11258 found at the frame pointer value used by __morestack, because
11259 __morestack has set that up before calling back to us. Here we
11260 store that pointer in a scratch register, and in
11261 ix86_expand_prologue we store the scratch register in a stack
11262 slot. */
11263 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11264 {
11265 unsigned int scratch_regno;
11266 rtx frame_reg;
11267 int words;
11268
11269 scratch_regno = split_stack_prologue_scratch_regno ();
11270 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11271 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11272
11273 /* 64-bit:
11274 fp -> old fp value
11275 return address within this function
11276 return address of caller of this function
11277 stack arguments
11278 So we add three words to get to the stack arguments.
11279
11280 32-bit:
11281 fp -> old fp value
11282 return address within this function
11283 first argument to __morestack
11284 second argument to __morestack
11285 return address of caller of this function
11286 stack arguments
11287 So we add five words to get to the stack arguments.
11288 */
11289 words = TARGET_64BIT ? 3 : 5;
11290 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11291 gen_rtx_PLUS (Pmode, frame_reg,
11292 GEN_INT (words * UNITS_PER_WORD))));
11293
11294 varargs_label = gen_label_rtx ();
11295 emit_jump_insn (gen_jump (varargs_label));
11296 JUMP_LABEL (get_last_insn ()) = varargs_label;
11297
11298 emit_barrier ();
11299 }
11300
11301 emit_label (label);
11302 LABEL_NUSES (label) = 1;
11303
11304 /* If this function calls va_start, we now have to set the scratch
11305 register for the case where we do not call __morestack. In this
11306 case we need to set it based on the stack pointer. */
11307 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11308 {
11309 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11310 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11311 GEN_INT (UNITS_PER_WORD))));
11312
11313 emit_label (varargs_label);
11314 LABEL_NUSES (varargs_label) = 1;
11315 }
11316 }
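 /* A hedged sketch of the overall shape of the split-stack prologue this
 function emits for a small frame on x86_64; the TCB guard location is shown
 symbolically because it is whatever the UNSPEC_STACK_CHECK memory reference
 resolves to for the target:

	cmp	<guard in TCB>, %rsp	; enough stack already?
	jae	.Lenough		; marked very likely taken above
	mov	$FRAMESIZE, %r10	; bytes of new stack wanted
	mov	$ARGSIZE, %r11		; bytes of incoming stack arguments to copy
	call	__morestack
	ret				; split_stack_return: prediction fix-up, not a real return
 .Lenough:
	... the regular prologue follows ...  */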
11317
11318 /* We may have to tell the dataflow pass that the split stack prologue
11319 is initializing a scratch register. */
11320
11321 static void
11322 ix86_live_on_entry (bitmap regs)
11323 {
11324 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11325 {
11326 gcc_assert (flag_split_stack);
11327 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11328 }
11329 }
11330 \f
 11331 /* Determine if OP is a suitable SUBREG RTX for an address. */
11332
11333 static bool
11334 ix86_address_subreg_operand (rtx op)
11335 {
11336 enum machine_mode mode;
11337
11338 if (!REG_P (op))
11339 return false;
11340
11341 mode = GET_MODE (op);
11342
11343 if (GET_MODE_CLASS (mode) != MODE_INT)
11344 return false;
11345
11346 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11347 failures when the register is one word out of a two word structure. */
11348 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11349 return false;
11350
11351 /* Allow only SUBREGs of non-eliminable hard registers. */
11352 return register_no_elim_operand (op, mode);
11353 }
11354
11355 /* Extract the parts of an RTL expression that is a valid memory address
11356 for an instruction. Return 0 if the structure of the address is
11357 grossly off. Return -1 if the address contains ASHIFT, so it is not
11358 strictly valid, but still used for computing length of lea instruction. */
11359
11360 int
11361 ix86_decompose_address (rtx addr, struct ix86_address *out)
11362 {
11363 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11364 rtx base_reg, index_reg;
11365 HOST_WIDE_INT scale = 1;
11366 rtx scale_rtx = NULL_RTX;
11367 rtx tmp;
11368 int retval = 1;
11369 enum ix86_address_seg seg = SEG_DEFAULT;
11370
 11371 /* Allow zero-extended SImode addresses;
 11372 they will be emitted with the addr32 prefix. */
11373 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11374 {
11375 if (GET_CODE (addr) == ZERO_EXTEND
11376 && GET_MODE (XEXP (addr, 0)) == SImode)
11377 addr = XEXP (addr, 0);
11378 else if (GET_CODE (addr) == AND
11379 && const_32bit_mask (XEXP (addr, 1), DImode))
11380 {
11381 addr = XEXP (addr, 0);
11382
11383 /* Strip subreg. */
11384 if (GET_CODE (addr) == SUBREG
11385 && GET_MODE (SUBREG_REG (addr)) == SImode)
11386 addr = SUBREG_REG (addr);
11387 }
11388 }
11389
11390 if (REG_P (addr))
11391 base = addr;
11392 else if (GET_CODE (addr) == SUBREG)
11393 {
11394 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11395 base = addr;
11396 else
11397 return 0;
11398 }
11399 else if (GET_CODE (addr) == PLUS)
11400 {
11401 rtx addends[4], op;
11402 int n = 0, i;
11403
11404 op = addr;
11405 do
11406 {
11407 if (n >= 4)
11408 return 0;
11409 addends[n++] = XEXP (op, 1);
11410 op = XEXP (op, 0);
11411 }
11412 while (GET_CODE (op) == PLUS);
11413 if (n >= 4)
11414 return 0;
11415 addends[n] = op;
11416
11417 for (i = n; i >= 0; --i)
11418 {
11419 op = addends[i];
11420 switch (GET_CODE (op))
11421 {
11422 case MULT:
11423 if (index)
11424 return 0;
11425 index = XEXP (op, 0);
11426 scale_rtx = XEXP (op, 1);
11427 break;
11428
11429 case ASHIFT:
11430 if (index)
11431 return 0;
11432 index = XEXP (op, 0);
11433 tmp = XEXP (op, 1);
11434 if (!CONST_INT_P (tmp))
11435 return 0;
11436 scale = INTVAL (tmp);
11437 if ((unsigned HOST_WIDE_INT) scale > 3)
11438 return 0;
11439 scale = 1 << scale;
11440 break;
11441
11442 case UNSPEC:
11443 if (XINT (op, 1) == UNSPEC_TP
11444 && TARGET_TLS_DIRECT_SEG_REFS
11445 && seg == SEG_DEFAULT)
11446 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11447 else
11448 return 0;
11449 break;
11450
11451 case SUBREG:
11452 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11453 return 0;
11454 /* FALLTHRU */
11455
11456 case REG:
11457 if (!base)
11458 base = op;
11459 else if (!index)
11460 index = op;
11461 else
11462 return 0;
11463 break;
11464
11465 case CONST:
11466 case CONST_INT:
11467 case SYMBOL_REF:
11468 case LABEL_REF:
11469 if (disp)
11470 return 0;
11471 disp = op;
11472 break;
11473
11474 default:
11475 return 0;
11476 }
11477 }
11478 }
11479 else if (GET_CODE (addr) == MULT)
11480 {
11481 index = XEXP (addr, 0); /* index*scale */
11482 scale_rtx = XEXP (addr, 1);
11483 }
11484 else if (GET_CODE (addr) == ASHIFT)
11485 {
11486 /* We're called for lea too, which implements ashift on occasion. */
11487 index = XEXP (addr, 0);
11488 tmp = XEXP (addr, 1);
11489 if (!CONST_INT_P (tmp))
11490 return 0;
11491 scale = INTVAL (tmp);
11492 if ((unsigned HOST_WIDE_INT) scale > 3)
11493 return 0;
11494 scale = 1 << scale;
11495 retval = -1;
11496 }
11497 else
11498 disp = addr; /* displacement */
11499
11500 if (index)
11501 {
11502 if (REG_P (index))
11503 ;
11504 else if (GET_CODE (index) == SUBREG
11505 && ix86_address_subreg_operand (SUBREG_REG (index)))
11506 ;
11507 else
11508 return 0;
11509 }
11510
11511 /* Extract the integral value of scale. */
11512 if (scale_rtx)
11513 {
11514 if (!CONST_INT_P (scale_rtx))
11515 return 0;
11516 scale = INTVAL (scale_rtx);
11517 }
11518
11519 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11520 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11521
11522 /* Avoid useless 0 displacement. */
11523 if (disp == const0_rtx && (base || index))
11524 disp = NULL_RTX;
11525
 11526 /* Allow the arg pointer and the stack pointer as index if there is no scaling. */
11527 if (base_reg && index_reg && scale == 1
11528 && (index_reg == arg_pointer_rtx
11529 || index_reg == frame_pointer_rtx
11530 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11531 {
11532 rtx tmp;
11533 tmp = base, base = index, index = tmp;
11534 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11535 }
11536
11537 /* Special case: %ebp cannot be encoded as a base without a displacement.
11538 Similarly %r13. */
11539 if (!disp
11540 && base_reg
11541 && (base_reg == hard_frame_pointer_rtx
11542 || base_reg == frame_pointer_rtx
11543 || base_reg == arg_pointer_rtx
11544 || (REG_P (base_reg)
11545 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11546 || REGNO (base_reg) == R13_REG))))
11547 disp = const0_rtx;
11548
 11549 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
 11550 Avoid this by transforming it to [%esi+0].
 11551 Reload calls address legitimization without cfun defined, so we need
 11552 to test cfun for being non-NULL. */
11553 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11554 && base_reg && !index_reg && !disp
11555 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11556 disp = const0_rtx;
11557
11558 /* Special case: encode reg+reg instead of reg*2. */
11559 if (!base && index && scale == 2)
11560 base = index, base_reg = index_reg, scale = 1;
11561
11562 /* Special case: scaling cannot be encoded without base or displacement. */
11563 if (!base && !disp && index && scale != 1)
11564 disp = const0_rtx;
11565
11566 out->base = base;
11567 out->index = index;
11568 out->disp = disp;
11569 out->scale = scale;
11570 out->seg = seg;
11571
11572 return retval;
11573 }
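 /* An illustrative example (not part of the interface): the address for
 "12(%esi,%ebx,4)" typically arrives in the canonical form

	(plus:SI (plus:SI (mult:SI (reg:SI bx) (const_int 4))
			  (reg:SI si))
		 (const_int 12))

 and decomposes to base = %esi, index = %ebx, scale = 4, disp = 12,
 seg = SEG_DEFAULT, with a return value of 1. */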
11574 \f
 11575 /* Return the cost of the memory address X.
 11576 For i386, it is better to use a complex address than let gcc copy
 11577 the address into a reg and make a new pseudo. But not if the address
 11578 requires two regs - that would mean more pseudos with longer
 11579 lifetimes. */
11580 static int
11581 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11582 {
11583 struct ix86_address parts;
11584 int cost = 1;
11585 int ok = ix86_decompose_address (x, &parts);
11586
11587 gcc_assert (ok);
11588
11589 if (parts.base && GET_CODE (parts.base) == SUBREG)
11590 parts.base = SUBREG_REG (parts.base);
11591 if (parts.index && GET_CODE (parts.index) == SUBREG)
11592 parts.index = SUBREG_REG (parts.index);
11593
11594 /* Attempt to minimize number of registers in the address. */
11595 if ((parts.base
11596 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11597 || (parts.index
11598 && (!REG_P (parts.index)
11599 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11600 cost++;
11601
11602 if (parts.base
11603 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11604 && parts.index
11605 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11606 && parts.base != parts.index)
11607 cost++;
11608
 11609 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
 11610 since its predecode logic can't detect the length of instructions
 11611 and decoding degenerates to the vector decoder. Increase the cost of such
 11612 addresses here. The penalty is at least 2 cycles. It may be worthwhile
 11613 to split such addresses or even refuse such addresses at all.
 11614 
 11615 The following addressing modes are affected:
 11616 [base+scale*index]
 11617 [scale*index+disp]
 11618 [base+index]
 11619 
 11620 The first and last case may be avoidable by explicitly coding the zero in
 11621 the memory address, but I don't have an AMD-K6 machine handy to check this
 11622 theory. */
11623
11624 if (TARGET_K6
11625 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11626 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11627 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11628 cost += 10;
11629
11630 return cost;
11631 }
11632 \f
 11633 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
 11634 this is used to form addresses to local data when -fPIC is in
 11635 use. */
11636
11637 static bool
11638 darwin_local_data_pic (rtx disp)
11639 {
11640 return (GET_CODE (disp) == UNSPEC
11641 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11642 }
11643
11644 /* Determine if a given RTX is a valid constant. We already know this
11645 satisfies CONSTANT_P. */
11646
11647 static bool
11648 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11649 {
11650 switch (GET_CODE (x))
11651 {
11652 case CONST:
11653 x = XEXP (x, 0);
11654
11655 if (GET_CODE (x) == PLUS)
11656 {
11657 if (!CONST_INT_P (XEXP (x, 1)))
11658 return false;
11659 x = XEXP (x, 0);
11660 }
11661
11662 if (TARGET_MACHO && darwin_local_data_pic (x))
11663 return true;
11664
11665 /* Only some unspecs are valid as "constants". */
11666 if (GET_CODE (x) == UNSPEC)
11667 switch (XINT (x, 1))
11668 {
11669 case UNSPEC_GOT:
11670 case UNSPEC_GOTOFF:
11671 case UNSPEC_PLTOFF:
11672 return TARGET_64BIT;
11673 case UNSPEC_TPOFF:
11674 case UNSPEC_NTPOFF:
11675 x = XVECEXP (x, 0, 0);
11676 return (GET_CODE (x) == SYMBOL_REF
11677 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11678 case UNSPEC_DTPOFF:
11679 x = XVECEXP (x, 0, 0);
11680 return (GET_CODE (x) == SYMBOL_REF
11681 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11682 default:
11683 return false;
11684 }
11685
11686 /* We must have drilled down to a symbol. */
11687 if (GET_CODE (x) == LABEL_REF)
11688 return true;
11689 if (GET_CODE (x) != SYMBOL_REF)
11690 return false;
11691 /* FALLTHRU */
11692
11693 case SYMBOL_REF:
11694 /* TLS symbols are never valid. */
11695 if (SYMBOL_REF_TLS_MODEL (x))
11696 return false;
11697
11698 /* DLLIMPORT symbols are never valid. */
11699 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11700 && SYMBOL_REF_DLLIMPORT_P (x))
11701 return false;
11702
11703 #if TARGET_MACHO
11704 /* mdynamic-no-pic */
11705 if (MACHO_DYNAMIC_NO_PIC_P)
11706 return machopic_symbol_defined_p (x);
11707 #endif
11708 break;
11709
11710 case CONST_DOUBLE:
11711 if (GET_MODE (x) == TImode
11712 && x != CONST0_RTX (TImode)
11713 && !TARGET_64BIT)
11714 return false;
11715 break;
11716
11717 case CONST_VECTOR:
11718 if (!standard_sse_constant_p (x))
11719 return false;
11720
11721 default:
11722 break;
11723 }
11724
11725 /* Otherwise we handle everything else in the move patterns. */
11726 return true;
11727 }
11728
11729 /* Determine if it's legal to put X into the constant pool. This
11730 is not possible for the address of thread-local symbols, which
11731 is checked above. */
11732
11733 static bool
11734 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11735 {
11736 /* We can always put integral constants and vectors in memory. */
11737 switch (GET_CODE (x))
11738 {
11739 case CONST_INT:
11740 case CONST_DOUBLE:
11741 case CONST_VECTOR:
11742 return false;
11743
11744 default:
11745 break;
11746 }
11747 return !ix86_legitimate_constant_p (mode, x);
11748 }
11749
11750
11751 /* Nonzero if the constant value X is a legitimate general operand
11752 when generating PIC code. It is given that flag_pic is on and
11753 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11754
11755 bool
11756 legitimate_pic_operand_p (rtx x)
11757 {
11758 rtx inner;
11759
11760 switch (GET_CODE (x))
11761 {
11762 case CONST:
11763 inner = XEXP (x, 0);
11764 if (GET_CODE (inner) == PLUS
11765 && CONST_INT_P (XEXP (inner, 1)))
11766 inner = XEXP (inner, 0);
11767
11768 /* Only some unspecs are valid as "constants". */
11769 if (GET_CODE (inner) == UNSPEC)
11770 switch (XINT (inner, 1))
11771 {
11772 case UNSPEC_GOT:
11773 case UNSPEC_GOTOFF:
11774 case UNSPEC_PLTOFF:
11775 return TARGET_64BIT;
11776 case UNSPEC_TPOFF:
11777 x = XVECEXP (inner, 0, 0);
11778 return (GET_CODE (x) == SYMBOL_REF
11779 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11780 case UNSPEC_MACHOPIC_OFFSET:
11781 return legitimate_pic_address_disp_p (x);
11782 default:
11783 return false;
11784 }
11785 /* FALLTHRU */
11786
11787 case SYMBOL_REF:
11788 case LABEL_REF:
11789 return legitimate_pic_address_disp_p (x);
11790
11791 default:
11792 return true;
11793 }
11794 }
11795
11796 /* Determine if a given CONST RTX is a valid memory displacement
11797 in PIC mode. */
11798
11799 bool
11800 legitimate_pic_address_disp_p (rtx disp)
11801 {
11802 bool saw_plus;
11803
11804 /* In 64bit mode we can allow direct addresses of symbols and labels
11805 when they are not dynamic symbols. */
11806 if (TARGET_64BIT)
11807 {
11808 rtx op0 = disp, op1;
11809
11810 switch (GET_CODE (disp))
11811 {
11812 case LABEL_REF:
11813 return true;
11814
11815 case CONST:
11816 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11817 break;
11818 op0 = XEXP (XEXP (disp, 0), 0);
11819 op1 = XEXP (XEXP (disp, 0), 1);
11820 if (!CONST_INT_P (op1)
11821 || INTVAL (op1) >= 16*1024*1024
11822 || INTVAL (op1) < -16*1024*1024)
11823 break;
11824 if (GET_CODE (op0) == LABEL_REF)
11825 return true;
11826 if (GET_CODE (op0) == CONST
11827 && GET_CODE (XEXP (op0, 0)) == UNSPEC
11828 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
11829 return true;
11830 if (GET_CODE (op0) == UNSPEC
11831 && XINT (op0, 1) == UNSPEC_PCREL)
11832 return true;
11833 if (GET_CODE (op0) != SYMBOL_REF)
11834 break;
11835 /* FALLTHRU */
11836
11837 case SYMBOL_REF:
11838 /* TLS references should always be enclosed in UNSPEC. */
11839 if (SYMBOL_REF_TLS_MODEL (op0))
11840 return false;
11841 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11842 && ix86_cmodel != CM_LARGE_PIC)
11843 return true;
11844 break;
11845
11846 default:
11847 break;
11848 }
11849 }
11850 if (GET_CODE (disp) != CONST)
11851 return false;
11852 disp = XEXP (disp, 0);
11853
11854 if (TARGET_64BIT)
11855 {
 11856 /* It is not safe to allow PLUS expressions here; that would defeat the
 11857 limit on the allowed distance of GOT references. We should not need these anyway. */
11858 if (GET_CODE (disp) != UNSPEC
11859 || (XINT (disp, 1) != UNSPEC_GOTPCREL
11860 && XINT (disp, 1) != UNSPEC_GOTOFF
11861 && XINT (disp, 1) != UNSPEC_PCREL
11862 && XINT (disp, 1) != UNSPEC_PLTOFF))
11863 return false;
11864
11865 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11866 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11867 return false;
11868 return true;
11869 }
11870
11871 saw_plus = false;
11872 if (GET_CODE (disp) == PLUS)
11873 {
11874 if (!CONST_INT_P (XEXP (disp, 1)))
11875 return false;
11876 disp = XEXP (disp, 0);
11877 saw_plus = true;
11878 }
11879
11880 if (TARGET_MACHO && darwin_local_data_pic (disp))
11881 return true;
11882
11883 if (GET_CODE (disp) != UNSPEC)
11884 return false;
11885
11886 switch (XINT (disp, 1))
11887 {
11888 case UNSPEC_GOT:
11889 if (saw_plus)
11890 return false;
11891 /* We need to check for both symbols and labels because VxWorks loads
11892 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
11893 details. */
11894 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11895 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
11896 case UNSPEC_GOTOFF:
 11897 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
 11898 While the ABI also specifies a 32bit relocation, we don't produce it in
 11899 the small PIC model at all. */
11900 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11901 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
11902 && !TARGET_64BIT)
11903 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
11904 return false;
11905 case UNSPEC_GOTTPOFF:
11906 case UNSPEC_GOTNTPOFF:
11907 case UNSPEC_INDNTPOFF:
11908 if (saw_plus)
11909 return false;
11910 disp = XVECEXP (disp, 0, 0);
11911 return (GET_CODE (disp) == SYMBOL_REF
11912 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
11913 case UNSPEC_NTPOFF:
11914 disp = XVECEXP (disp, 0, 0);
11915 return (GET_CODE (disp) == SYMBOL_REF
11916 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
11917 case UNSPEC_DTPOFF:
11918 disp = XVECEXP (disp, 0, 0);
11919 return (GET_CODE (disp) == SYMBOL_REF
11920 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
11921 }
11922
11923 return false;
11924 }
11925
11926 /* Recognizes RTL expressions that are valid memory addresses for an
11927 instruction. The MODE argument is the machine mode for the MEM
11928 expression that wants to use this address.
11929
 11930 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
11931 convert common non-canonical forms to canonical form so that they will
11932 be recognized. */
11933
11934 static bool
11935 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
11936 rtx addr, bool strict)
11937 {
11938 struct ix86_address parts;
11939 rtx base, index, disp;
11940 HOST_WIDE_INT scale;
11941
 11942 /* Since a constant address in x32 is sign-extended to 64bit,
 11943 we have to prevent addresses from 0x80000000 to 0xffffffff. */
11944 if (TARGET_X32
11945 && CONST_INT_P (addr)
11946 && INTVAL (addr) < 0)
11947 return false;
11948
11949 if (ix86_decompose_address (addr, &parts) <= 0)
11950 /* Decomposition failed. */
11951 return false;
11952
11953 base = parts.base;
11954 index = parts.index;
11955 disp = parts.disp;
11956 scale = parts.scale;
11957
11958 /* Validate base register. */
11959 if (base)
11960 {
11961 rtx reg;
11962
11963 if (REG_P (base))
11964 reg = base;
11965 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
11966 reg = SUBREG_REG (base);
11967 else
11968 /* Base is not a register. */
11969 return false;
11970
11971 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
11972 return false;
11973
11974 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
11975 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
11976 /* Base is not valid. */
11977 return false;
11978 }
11979
11980 /* Validate index register. */
11981 if (index)
11982 {
11983 rtx reg;
11984
11985 if (REG_P (index))
11986 reg = index;
11987 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
11988 reg = SUBREG_REG (index);
11989 else
11990 /* Index is not a register. */
11991 return false;
11992
11993 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
11994 return false;
11995
11996 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
11997 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
11998 /* Index is not valid. */
11999 return false;
12000 }
12001
12002 /* Index and base should have the same mode. */
12003 if (base && index
12004 && GET_MODE (base) != GET_MODE (index))
12005 return false;
12006
12007 /* Validate scale factor. */
12008 if (scale != 1)
12009 {
12010 if (!index)
12011 /* Scale without index. */
12012 return false;
12013
12014 if (scale != 2 && scale != 4 && scale != 8)
12015 /* Scale is not a valid multiplier. */
12016 return false;
12017 }
12018
12019 /* Validate displacement. */
12020 if (disp)
12021 {
12022 if (GET_CODE (disp) == CONST
12023 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12024 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12025 switch (XINT (XEXP (disp, 0), 1))
12026 {
 12027 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
 12028 used. While the ABI also specifies 32bit relocations, we don't produce
 12029 them at all and use IP-relative addressing instead. */
12030 case UNSPEC_GOT:
12031 case UNSPEC_GOTOFF:
12032 gcc_assert (flag_pic);
12033 if (!TARGET_64BIT)
12034 goto is_legitimate_pic;
12035
12036 /* 64bit address unspec. */
12037 return false;
12038
12039 case UNSPEC_GOTPCREL:
12040 case UNSPEC_PCREL:
12041 gcc_assert (flag_pic);
12042 goto is_legitimate_pic;
12043
12044 case UNSPEC_GOTTPOFF:
12045 case UNSPEC_GOTNTPOFF:
12046 case UNSPEC_INDNTPOFF:
12047 case UNSPEC_NTPOFF:
12048 case UNSPEC_DTPOFF:
12049 break;
12050
12051 case UNSPEC_STACK_CHECK:
12052 gcc_assert (flag_split_stack);
12053 break;
12054
12055 default:
12056 /* Invalid address unspec. */
12057 return false;
12058 }
12059
12060 else if (SYMBOLIC_CONST (disp)
12061 && (flag_pic
12062 || (TARGET_MACHO
12063 #if TARGET_MACHO
12064 && MACHOPIC_INDIRECT
12065 && !machopic_operand_p (disp)
12066 #endif
12067 )))
12068 {
12069
12070 is_legitimate_pic:
12071 if (TARGET_64BIT && (index || base))
12072 {
12073 /* foo@dtpoff(%rX) is ok. */
12074 if (GET_CODE (disp) != CONST
12075 || GET_CODE (XEXP (disp, 0)) != PLUS
12076 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12077 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12078 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12079 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12080 /* Non-constant pic memory reference. */
12081 return false;
12082 }
12083 else if ((!TARGET_MACHO || flag_pic)
12084 && ! legitimate_pic_address_disp_p (disp))
12085 /* Displacement is an invalid pic construct. */
12086 return false;
12087 #if TARGET_MACHO
12088 else if (MACHO_DYNAMIC_NO_PIC_P
12089 && !ix86_legitimate_constant_p (Pmode, disp))
 12090 /* The displacement must be referenced via a non_lazy_pointer. */
12091 return false;
12092 #endif
12093
12094 /* This code used to verify that a symbolic pic displacement
12095 includes the pic_offset_table_rtx register.
12096
 12097 While this is a good idea, unfortunately these constructs may
12098 be created by "adds using lea" optimization for incorrect
12099 code like:
12100
12101 int a;
12102 int foo(int i)
12103 {
12104 return *(&a+i);
12105 }
12106
 12107 This code is nonsensical, but results in addressing the
 12108 GOT table with the pic_offset_table_rtx base. We can't
 12109 just refuse it easily, since it gets matched by the
 12110 "addsi3" pattern, which later gets split to an lea in case
 12111 the output register differs from the input. While this
 12112 could be handled by a separate addsi pattern for this case
 12113 that never results in an lea, disabling this test seems to be
 12114 the easier and correct fix for the crash. */
12115 }
12116 else if (GET_CODE (disp) != LABEL_REF
12117 && !CONST_INT_P (disp)
12118 && (GET_CODE (disp) != CONST
12119 || !ix86_legitimate_constant_p (Pmode, disp))
12120 && (GET_CODE (disp) != SYMBOL_REF
12121 || !ix86_legitimate_constant_p (Pmode, disp)))
12122 /* Displacement is not constant. */
12123 return false;
12124 else if (TARGET_64BIT
12125 && !x86_64_immediate_operand (disp, VOIDmode))
12126 /* Displacement is out of range. */
12127 return false;
12128 }
12129
12130 /* Everything looks valid. */
12131 return true;
12132 }
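 /* Illustrative examples of the checks above (assumed typical cases, not an
 exhaustive list):

	(plus:SI (reg:SI ax) (mult:SI (reg:SI bx) (const_int 4)))  -- valid
	(plus:SI (reg:SI ax) (mult:SI (reg:SI bx) (const_int 3)))  -- rejected,
		the scale must be 1, 2, 4 or 8
	(plus:SI (reg:SI ax) (mult:SI (reg:SI sp) (const_int 4)))  -- rejected,
		the stack pointer cannot be used as an index register. */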
12133
12134 /* Determine if a given RTX is a valid constant address. */
12135
12136 bool
12137 constant_address_p (rtx x)
12138 {
12139 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12140 }
12141 \f
12142 /* Return a unique alias set for the GOT. */
12143
12144 static alias_set_type
12145 ix86_GOT_alias_set (void)
12146 {
12147 static alias_set_type set = -1;
12148 if (set == -1)
12149 set = new_alias_set ();
12150 return set;
12151 }
12152
12153 /* Return a legitimate reference for ORIG (an address) using the
12154 register REG. If REG is 0, a new pseudo is generated.
12155
12156 There are two types of references that must be handled:
12157
12158 1. Global data references must load the address from the GOT, via
12159 the PIC reg. An insn is emitted to do this load, and the reg is
12160 returned.
12161
12162 2. Static data references, constant pool addresses, and code labels
12163 compute the address as an offset from the GOT, whose base is in
12164 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12165 differentiate them from global data objects. The returned
12166 address is the PIC reg + an unspec constant.
12167
12168 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12169 reg also appears in the address. */
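/* Illustrative only (not part of the original source): on ia32 with -fpic,
   case 1 typically becomes a load through the GOT, e.g.
       movl  global_var@GOT(%ebx), %eax
   while case 2 folds into a PIC-register-relative address, e.g.
       leal  local_var@GOTOFF(%ebx), %eax
   The exact sequences depend on the target, code model and options;
   global_var and local_var are hypothetical symbol names.  */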
12170
12171 static rtx
12172 legitimize_pic_address (rtx orig, rtx reg)
12173 {
12174 rtx addr = orig;
12175 rtx new_rtx = orig;
12176 rtx base;
12177
12178 #if TARGET_MACHO
12179 if (TARGET_MACHO && !TARGET_64BIT)
12180 {
12181 if (reg == 0)
12182 reg = gen_reg_rtx (Pmode);
12183 /* Use the generic Mach-O PIC machinery. */
12184 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12185 }
12186 #endif
12187
12188 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12189 new_rtx = addr;
12190 else if (TARGET_64BIT
12191 && ix86_cmodel != CM_SMALL_PIC
12192 && gotoff_operand (addr, Pmode))
12193 {
12194 rtx tmpreg;
12195 /* This symbol may be referenced via a displacement from the PIC
12196 base address (@GOTOFF). */
12197
12198 if (reload_in_progress)
12199 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12200 if (GET_CODE (addr) == CONST)
12201 addr = XEXP (addr, 0);
12202 if (GET_CODE (addr) == PLUS)
12203 {
12204 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12205 UNSPEC_GOTOFF);
12206 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12207 }
12208 else
12209 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12210 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12211 if (!reg)
12212 tmpreg = gen_reg_rtx (Pmode);
12213 else
12214 tmpreg = reg;
12215 emit_move_insn (tmpreg, new_rtx);
12216
12217 if (reg != 0)
12218 {
12219 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12220 tmpreg, 1, OPTAB_DIRECT);
12221 new_rtx = reg;
12222 }
12223 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12224 }
12225 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12226 {
12227 /* This symbol may be referenced via a displacement from the PIC
12228 base address (@GOTOFF). */
12229
12230 if (reload_in_progress)
12231 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12232 if (GET_CODE (addr) == CONST)
12233 addr = XEXP (addr, 0);
12234 if (GET_CODE (addr) == PLUS)
12235 {
12236 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12237 UNSPEC_GOTOFF);
12238 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12239 }
12240 else
12241 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12242 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12243 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12244
12245 if (reg != 0)
12246 {
12247 emit_move_insn (reg, new_rtx);
12248 new_rtx = reg;
12249 }
12250 }
12251 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12252 /* We can't use @GOTOFF for text labels on VxWorks;
12253 see gotoff_operand. */
12254 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12255 {
12256 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12257 {
12258 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12259 return legitimize_dllimport_symbol (addr, true);
12260 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12261 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12262 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12263 {
12264 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12265 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12266 }
12267 }
12268
12269 /* For x64 PE-COFF there is no GOT table, so we use the address
12270 directly. */
12271 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12272 {
12273 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12274 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12275
12276 if (reg == 0)
12277 reg = gen_reg_rtx (Pmode);
12278 emit_move_insn (reg, new_rtx);
12279 new_rtx = reg;
12280 }
12281 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12282 {
12283 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12284 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12285 new_rtx = gen_const_mem (Pmode, new_rtx);
12286 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12287
12288 if (reg == 0)
12289 reg = gen_reg_rtx (Pmode);
12290 /* Use gen_movsi directly; otherwise the address is loaded into
12291 a register for CSE. We don't want to CSE these addresses;
12292 instead we CSE addresses from the GOT table, so skip this. */
12293 emit_insn (gen_movsi (reg, new_rtx));
12294 new_rtx = reg;
12295 }
12296 else
12297 {
12298 /* This symbol must be referenced via a load from the
12299 Global Offset Table (@GOT). */
12300
12301 if (reload_in_progress)
12302 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12303 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12304 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12305 if (TARGET_64BIT)
12306 new_rtx = force_reg (Pmode, new_rtx);
12307 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12308 new_rtx = gen_const_mem (Pmode, new_rtx);
12309 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12310
12311 if (reg == 0)
12312 reg = gen_reg_rtx (Pmode);
12313 emit_move_insn (reg, new_rtx);
12314 new_rtx = reg;
12315 }
12316 }
12317 else
12318 {
12319 if (CONST_INT_P (addr)
12320 && !x86_64_immediate_operand (addr, VOIDmode))
12321 {
12322 if (reg)
12323 {
12324 emit_move_insn (reg, addr);
12325 new_rtx = reg;
12326 }
12327 else
12328 new_rtx = force_reg (Pmode, addr);
12329 }
12330 else if (GET_CODE (addr) == CONST)
12331 {
12332 addr = XEXP (addr, 0);
12333
12334 /* We must match the stuff we generated before. Assume the only
12335 unspecs that can get here are ours; not that we could do
12336 anything with them anyway.... */
12337 if (GET_CODE (addr) == UNSPEC
12338 || (GET_CODE (addr) == PLUS
12339 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12340 return orig;
12341 gcc_assert (GET_CODE (addr) == PLUS);
12342 }
12343 if (GET_CODE (addr) == PLUS)
12344 {
12345 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12346
12347 /* Check first to see if this is a constant offset from a @GOTOFF
12348 symbol reference. */
12349 if (gotoff_operand (op0, Pmode)
12350 && CONST_INT_P (op1))
12351 {
12352 if (!TARGET_64BIT)
12353 {
12354 if (reload_in_progress)
12355 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12356 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12357 UNSPEC_GOTOFF);
12358 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12359 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12360 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12361
12362 if (reg != 0)
12363 {
12364 emit_move_insn (reg, new_rtx);
12365 new_rtx = reg;
12366 }
12367 }
12368 else
12369 {
12370 if (INTVAL (op1) < -16*1024*1024
12371 || INTVAL (op1) >= 16*1024*1024)
12372 {
12373 if (!x86_64_immediate_operand (op1, Pmode))
12374 op1 = force_reg (Pmode, op1);
12375 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12376 }
12377 }
12378 }
12379 else
12380 {
12381 base = legitimize_pic_address (XEXP (addr, 0), reg);
12382 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12383 base == reg ? NULL_RTX : reg);
12384
12385 if (CONST_INT_P (new_rtx))
12386 new_rtx = plus_constant (base, INTVAL (new_rtx));
12387 else
12388 {
12389 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12390 {
12391 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12392 new_rtx = XEXP (new_rtx, 1);
12393 }
12394 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12395 }
12396 }
12397 }
12398 }
12399 return new_rtx;
12400 }
12401 \f
12402 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12403
12404 static rtx
12405 get_thread_pointer (bool to_reg)
12406 {
12407 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12408
12409 if (GET_MODE (tp) != Pmode)
12410 tp = convert_to_mode (Pmode, tp, 1);
12411
12412 if (to_reg)
12413 tp = copy_addr_to_reg (tp);
12414
12415 return tp;
12416 }
12417
12418 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12419
12420 static GTY(()) rtx ix86_tls_symbol;
12421
12422 static rtx
12423 ix86_tls_get_addr (void)
12424 {
12425 if (!ix86_tls_symbol)
12426 {
12427 const char *sym
12428 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12429 ? "___tls_get_addr" : "__tls_get_addr");
12430
12431 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12432 }
12433
12434 return ix86_tls_symbol;
12435 }
12436
12437 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12438
12439 static GTY(()) rtx ix86_tls_module_base_symbol;
12440
12441 rtx
12442 ix86_tls_module_base (void)
12443 {
12444 if (!ix86_tls_module_base_symbol)
12445 {
12446 ix86_tls_module_base_symbol
12447 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12448
12449 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12450 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12451 }
12452
12453 return ix86_tls_module_base_symbol;
12454 }
12455
12456 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12457 false if we expect this to be used for a memory address and true if
12458 we expect to load the address into a register. */
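/* A rough sketch of the asm these models correspond to on ia32 with GNU
   TLS (illustrative, not emitted verbatim by this function; "x" is a
   hypothetical TLS variable):
     global dynamic:  leal  x@tlsgd(,%ebx,1), %eax
                      call  ___tls_get_addr@PLT
     local exec:      movl  %gs:x@ntpoff, %eax
   The code below builds the RTL equivalents for each TLS model.  */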
12459
12460 static rtx
12461 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12462 {
12463 rtx dest, base, off;
12464 rtx pic = NULL_RTX, tp = NULL_RTX;
12465 int type;
12466
12467 switch (model)
12468 {
12469 case TLS_MODEL_GLOBAL_DYNAMIC:
12470 dest = gen_reg_rtx (Pmode);
12471
12472 if (!TARGET_64BIT)
12473 {
12474 if (flag_pic)
12475 pic = pic_offset_table_rtx;
12476 else
12477 {
12478 pic = gen_reg_rtx (Pmode);
12479 emit_insn (gen_set_got (pic));
12480 }
12481 }
12482
12483 if (TARGET_GNU2_TLS)
12484 {
12485 if (TARGET_64BIT)
12486 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12487 else
12488 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12489
12490 tp = get_thread_pointer (true);
12491 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12492
12493 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12494 }
12495 else
12496 {
12497 rtx caddr = ix86_tls_get_addr ();
12498
12499 if (TARGET_64BIT)
12500 {
12501 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12502
12503 start_sequence ();
12504 emit_call_insn (gen_tls_global_dynamic_64 (rax, x, caddr));
12505 insns = get_insns ();
12506 end_sequence ();
12507
12508 RTL_CONST_CALL_P (insns) = 1;
12509 emit_libcall_block (insns, dest, rax, x);
12510 }
12511 else
12512 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12513 }
12514 break;
12515
12516 case TLS_MODEL_LOCAL_DYNAMIC:
12517 base = gen_reg_rtx (Pmode);
12518
12519 if (!TARGET_64BIT)
12520 {
12521 if (flag_pic)
12522 pic = pic_offset_table_rtx;
12523 else
12524 {
12525 pic = gen_reg_rtx (Pmode);
12526 emit_insn (gen_set_got (pic));
12527 }
12528 }
12529
12530 if (TARGET_GNU2_TLS)
12531 {
12532 rtx tmp = ix86_tls_module_base ();
12533
12534 if (TARGET_64BIT)
12535 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12536 else
12537 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12538
12539 tp = get_thread_pointer (true);
12540 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12541 gen_rtx_MINUS (Pmode, tmp, tp));
12542 }
12543 else
12544 {
12545 rtx caddr = ix86_tls_get_addr ();
12546
12547 if (TARGET_64BIT)
12548 {
12549 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12550
12551 start_sequence ();
12552 emit_call_insn (gen_tls_local_dynamic_base_64 (rax, caddr));
12553 insns = get_insns ();
12554 end_sequence ();
12555
12556 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12557 share the LD_BASE result with other LD model accesses. */
12558 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12559 UNSPEC_TLS_LD_BASE);
12560
12561 RTL_CONST_CALL_P (insns) = 1;
12562 emit_libcall_block (insns, base, rax, eqv);
12563 }
12564 else
12565 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12566 }
12567
12568 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12569 off = gen_rtx_CONST (Pmode, off);
12570
12571 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12572
12573 if (TARGET_GNU2_TLS)
12574 {
12575 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12576
12577 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12578 }
12579 break;
12580
12581 case TLS_MODEL_INITIAL_EXEC:
12582 if (TARGET_64BIT)
12583 {
12584 if (TARGET_SUN_TLS)
12585 {
12586 /* The Sun linker took the AMD64 TLS spec literally
12587 and can only handle %rax as the destination of the
12588 initial-exec code sequence. */
12589
12590 dest = gen_reg_rtx (Pmode);
12591 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12592 return dest;
12593 }
12594
12595 pic = NULL;
12596 type = UNSPEC_GOTNTPOFF;
12597 }
12598 else if (flag_pic)
12599 {
12600 if (reload_in_progress)
12601 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12602 pic = pic_offset_table_rtx;
12603 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12604 }
12605 else if (!TARGET_ANY_GNU_TLS)
12606 {
12607 pic = gen_reg_rtx (Pmode);
12608 emit_insn (gen_set_got (pic));
12609 type = UNSPEC_GOTTPOFF;
12610 }
12611 else
12612 {
12613 pic = NULL;
12614 type = UNSPEC_INDNTPOFF;
12615 }
12616
12617 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
12618 off = gen_rtx_CONST (Pmode, off);
12619 if (pic)
12620 off = gen_rtx_PLUS (Pmode, pic, off);
12621 off = gen_const_mem (Pmode, off);
12622 set_mem_alias_set (off, ix86_GOT_alias_set ());
12623
12624 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12625 {
12626 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12627 off = force_reg (Pmode, off);
12628 return gen_rtx_PLUS (Pmode, base, off);
12629 }
12630 else
12631 {
12632 base = get_thread_pointer (true);
12633 dest = gen_reg_rtx (Pmode);
12634 emit_insn (gen_subsi3 (dest, base, off));
12635 }
12636 break;
12637
12638 case TLS_MODEL_LOCAL_EXEC:
12639 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12640 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12641 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12642 off = gen_rtx_CONST (Pmode, off);
12643
12644 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12645 {
12646 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12647 return gen_rtx_PLUS (Pmode, base, off);
12648 }
12649 else
12650 {
12651 base = get_thread_pointer (true);
12652 dest = gen_reg_rtx (Pmode);
12653 emit_insn (gen_subsi3 (dest, base, off));
12654 }
12655 break;
12656
12657 default:
12658 gcc_unreachable ();
12659 }
12660
12661 return dest;
12662 }
12663
12664 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12665 to symbol DECL. */
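/* Hypothetical example (not from the original source): for
     extern int foo __attribute__ ((dllimport));
   a reference to foo is redirected through the import-table pointer named
   "__imp_foo" (or "__imp__foo" when a user label prefix is in use), i.e.
   the address is loaded indirectly from that slot; see the prefix choice
   below.  */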
12666
12667 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12668 htab_t dllimport_map;
12669
12670 static tree
12671 get_dllimport_decl (tree decl)
12672 {
12673 struct tree_map *h, in;
12674 void **loc;
12675 const char *name;
12676 const char *prefix;
12677 size_t namelen, prefixlen;
12678 char *imp_name;
12679 tree to;
12680 rtx rtl;
12681
12682 if (!dllimport_map)
12683 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12684
12685 in.hash = htab_hash_pointer (decl);
12686 in.base.from = decl;
12687 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12688 h = (struct tree_map *) *loc;
12689 if (h)
12690 return h->to;
12691
12692 *loc = h = ggc_alloc_tree_map ();
12693 h->hash = in.hash;
12694 h->base.from = decl;
12695 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12696 VAR_DECL, NULL, ptr_type_node);
12697 DECL_ARTIFICIAL (to) = 1;
12698 DECL_IGNORED_P (to) = 1;
12699 DECL_EXTERNAL (to) = 1;
12700 TREE_READONLY (to) = 1;
12701
12702 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12703 name = targetm.strip_name_encoding (name);
12704 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12705 ? "*__imp_" : "*__imp__";
12706 namelen = strlen (name);
12707 prefixlen = strlen (prefix);
12708 imp_name = (char *) alloca (namelen + prefixlen + 1);
12709 memcpy (imp_name, prefix, prefixlen);
12710 memcpy (imp_name + prefixlen, name, namelen + 1);
12711
12712 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12713 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12714 SET_SYMBOL_REF_DECL (rtl, to);
12715 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12716
12717 rtl = gen_const_mem (Pmode, rtl);
12718 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12719
12720 SET_DECL_RTL (to, rtl);
12721 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12722
12723 return to;
12724 }
12725
12726 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12727 true if we require the result be a register. */
12728
12729 static rtx
12730 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12731 {
12732 tree imp_decl;
12733 rtx x;
12734
12735 gcc_assert (SYMBOL_REF_DECL (symbol));
12736 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12737
12738 x = DECL_RTL (imp_decl);
12739 if (want_reg)
12740 x = force_reg (Pmode, x);
12741 return x;
12742 }
12743
12744 /* Try machine-dependent ways of modifying an illegitimate address
12745 to be legitimate. If we find one, return the new, valid address.
12746 This macro is used in only one place: `memory_address' in explow.c.
12747
12748 OLDX is the address as it was before break_out_memory_refs was called.
12749 In some cases it is useful to look at this to decide what needs to be done.
12750
12751 It is always safe for this macro to do nothing. It exists to recognize
12752 opportunities to optimize the output.
12753
12754 For the 80386, we handle X+REG by loading X into a register R and
12755 using R+REG. R will go in a general reg and indexing will be used.
12756 However, if REG is a broken-out memory address or multiplication,
12757 nothing needs to be done because REG can certainly go in a general reg.
12758
12759 When -fpic is used, special handling is needed for symbolic references.
12760 See comments by legitimize_pic_address in i386.c for details. */
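/* Illustrative only: given something like (plus (reg R) (mem M)), the code
   below forces the non-register operand into a fresh pseudo T and returns
   (plus (reg R) (reg T)), which is a valid base+index form; small shifts
   are first rewritten as multiplies so they can act as the scaled index.  */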
12761
12762 static rtx
12763 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12764 enum machine_mode mode)
12765 {
12766 int changed = 0;
12767 unsigned log;
12768
12769 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12770 if (log)
12771 return legitimize_tls_address (x, (enum tls_model) log, false);
12772 if (GET_CODE (x) == CONST
12773 && GET_CODE (XEXP (x, 0)) == PLUS
12774 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12775 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12776 {
12777 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12778 (enum tls_model) log, false);
12779 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12780 }
12781
12782 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12783 {
12784 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12785 return legitimize_dllimport_symbol (x, true);
12786 if (GET_CODE (x) == CONST
12787 && GET_CODE (XEXP (x, 0)) == PLUS
12788 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12789 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12790 {
12791 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12792 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12793 }
12794 }
12795
12796 if (flag_pic && SYMBOLIC_CONST (x))
12797 return legitimize_pic_address (x, 0);
12798
12799 #if TARGET_MACHO
12800 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12801 return machopic_indirect_data_reference (x, 0);
12802 #endif
12803
12804 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12805 if (GET_CODE (x) == ASHIFT
12806 && CONST_INT_P (XEXP (x, 1))
12807 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
12808 {
12809 changed = 1;
12810 log = INTVAL (XEXP (x, 1));
12811 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
12812 GEN_INT (1 << log));
12813 }
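/* For example (illustration only): (ashift (reg A) (const_int 3)) is
   rewritten above into (mult (reg A) (const_int 8)), which can then serve
   as the index*scale part of an x86 effective address.  */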
12814
12815 if (GET_CODE (x) == PLUS)
12816 {
12817 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12818
12819 if (GET_CODE (XEXP (x, 0)) == ASHIFT
12820 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12821 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
12822 {
12823 changed = 1;
12824 log = INTVAL (XEXP (XEXP (x, 0), 1));
12825 XEXP (x, 0) = gen_rtx_MULT (Pmode,
12826 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
12827 GEN_INT (1 << log));
12828 }
12829
12830 if (GET_CODE (XEXP (x, 1)) == ASHIFT
12831 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
12832 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
12833 {
12834 changed = 1;
12835 log = INTVAL (XEXP (XEXP (x, 1), 1));
12836 XEXP (x, 1) = gen_rtx_MULT (Pmode,
12837 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
12838 GEN_INT (1 << log));
12839 }
12840
12841 /* Put multiply first if it isn't already. */
12842 if (GET_CODE (XEXP (x, 1)) == MULT)
12843 {
12844 rtx tmp = XEXP (x, 0);
12845 XEXP (x, 0) = XEXP (x, 1);
12846 XEXP (x, 1) = tmp;
12847 changed = 1;
12848 }
12849
12850 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
12851 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
12852 created by virtual register instantiation, register elimination, and
12853 similar optimizations. */
12854 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
12855 {
12856 changed = 1;
12857 x = gen_rtx_PLUS (Pmode,
12858 gen_rtx_PLUS (Pmode, XEXP (x, 0),
12859 XEXP (XEXP (x, 1), 0)),
12860 XEXP (XEXP (x, 1), 1));
12861 }
12862
12863 /* Canonicalize
12864 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
12865 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
12866 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
12867 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12868 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
12869 && CONSTANT_P (XEXP (x, 1)))
12870 {
12871 rtx constant;
12872 rtx other = NULL_RTX;
12873
12874 if (CONST_INT_P (XEXP (x, 1)))
12875 {
12876 constant = XEXP (x, 1);
12877 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
12878 }
12879 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
12880 {
12881 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
12882 other = XEXP (x, 1);
12883 }
12884 else
12885 constant = 0;
12886
12887 if (constant)
12888 {
12889 changed = 1;
12890 x = gen_rtx_PLUS (Pmode,
12891 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
12892 XEXP (XEXP (XEXP (x, 0), 1), 0)),
12893 plus_constant (other, INTVAL (constant)));
12894 }
12895 }
12896
12897 if (changed && ix86_legitimate_address_p (mode, x, false))
12898 return x;
12899
12900 if (GET_CODE (XEXP (x, 0)) == MULT)
12901 {
12902 changed = 1;
12903 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
12904 }
12905
12906 if (GET_CODE (XEXP (x, 1)) == MULT)
12907 {
12908 changed = 1;
12909 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
12910 }
12911
12912 if (changed
12913 && REG_P (XEXP (x, 1))
12914 && REG_P (XEXP (x, 0)))
12915 return x;
12916
12917 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
12918 {
12919 changed = 1;
12920 x = legitimize_pic_address (x, 0);
12921 }
12922
12923 if (changed && ix86_legitimate_address_p (mode, x, false))
12924 return x;
12925
12926 if (REG_P (XEXP (x, 0)))
12927 {
12928 rtx temp = gen_reg_rtx (Pmode);
12929 rtx val = force_operand (XEXP (x, 1), temp);
12930 if (val != temp)
12931 {
12932 if (GET_MODE (val) != Pmode)
12933 val = convert_to_mode (Pmode, val, 1);
12934 emit_move_insn (temp, val);
12935 }
12936
12937 XEXP (x, 1) = temp;
12938 return x;
12939 }
12940
12941 else if (REG_P (XEXP (x, 1)))
12942 {
12943 rtx temp = gen_reg_rtx (Pmode);
12944 rtx val = force_operand (XEXP (x, 0), temp);
12945 if (val != temp)
12946 {
12947 if (GET_MODE (val) != Pmode)
12948 val = convert_to_mode (Pmode, val, 1);
12949 emit_move_insn (temp, val);
12950 }
12951
12952 XEXP (x, 0) = temp;
12953 return x;
12954 }
12955 }
12956
12957 return x;
12958 }
12959 \f
12960 /* Print an integer constant expression in assembler syntax. Addition
12961 and subtraction are the only arithmetic that may appear in these
12962 expressions. FILE is the stdio stream to write to, X is the rtx, and
12963 CODE is the operand print code from the output string. */
12964
12965 static void
12966 output_pic_addr_const (FILE *file, rtx x, int code)
12967 {
12968 char buf[256];
12969
12970 switch (GET_CODE (x))
12971 {
12972 case PC:
12973 gcc_assert (flag_pic);
12974 putc ('.', file);
12975 break;
12976
12977 case SYMBOL_REF:
12978 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
12979 output_addr_const (file, x);
12980 else
12981 {
12982 const char *name = XSTR (x, 0);
12983
12984 /* Mark the decl as referenced so that cgraph will
12985 output the function. */
12986 if (SYMBOL_REF_DECL (x))
12987 mark_decl_referenced (SYMBOL_REF_DECL (x));
12988
12989 #if TARGET_MACHO
12990 if (MACHOPIC_INDIRECT
12991 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
12992 name = machopic_indirection_name (x, /*stub_p=*/true);
12993 #endif
12994 assemble_name (file, name);
12995 }
12996 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12997 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
12998 fputs ("@PLT", file);
12999 break;
13000
13001 case LABEL_REF:
13002 x = XEXP (x, 0);
13003 /* FALLTHRU */
13004 case CODE_LABEL:
13005 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13006 assemble_name (asm_out_file, buf);
13007 break;
13008
13009 case CONST_INT:
13010 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13011 break;
13012
13013 case CONST:
13014 /* This used to output parentheses around the expression,
13015 but that does not work on the 386 (either ATT or BSD assembler). */
13016 output_pic_addr_const (file, XEXP (x, 0), code);
13017 break;
13018
13019 case CONST_DOUBLE:
13020 if (GET_MODE (x) == VOIDmode)
13021 {
13022 /* We can use %d if the number is <32 bits and positive. */
13023 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13024 fprintf (file, "0x%lx%08lx",
13025 (unsigned long) CONST_DOUBLE_HIGH (x),
13026 (unsigned long) CONST_DOUBLE_LOW (x));
13027 else
13028 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13029 }
13030 else
13031 /* We can't handle floating point constants;
13032 TARGET_PRINT_OPERAND must handle them. */
13033 output_operand_lossage ("floating constant misused");
13034 break;
13035
13036 case PLUS:
13037 /* Some assemblers need integer constants to appear first. */
13038 if (CONST_INT_P (XEXP (x, 0)))
13039 {
13040 output_pic_addr_const (file, XEXP (x, 0), code);
13041 putc ('+', file);
13042 output_pic_addr_const (file, XEXP (x, 1), code);
13043 }
13044 else
13045 {
13046 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13047 output_pic_addr_const (file, XEXP (x, 1), code);
13048 putc ('+', file);
13049 output_pic_addr_const (file, XEXP (x, 0), code);
13050 }
13051 break;
13052
13053 case MINUS:
13054 if (!TARGET_MACHO)
13055 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13056 output_pic_addr_const (file, XEXP (x, 0), code);
13057 putc ('-', file);
13058 output_pic_addr_const (file, XEXP (x, 1), code);
13059 if (!TARGET_MACHO)
13060 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13061 break;
13062
13063 case UNSPEC:
13064 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13065 {
13066 bool f = i386_asm_output_addr_const_extra (file, x);
13067 gcc_assert (f);
13068 break;
13069 }
13070
13071 gcc_assert (XVECLEN (x, 0) == 1);
13072 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13073 switch (XINT (x, 1))
13074 {
13075 case UNSPEC_GOT:
13076 fputs ("@GOT", file);
13077 break;
13078 case UNSPEC_GOTOFF:
13079 fputs ("@GOTOFF", file);
13080 break;
13081 case UNSPEC_PLTOFF:
13082 fputs ("@PLTOFF", file);
13083 break;
13084 case UNSPEC_PCREL:
13085 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13086 "(%rip)" : "[rip]", file);
13087 break;
13088 case UNSPEC_GOTPCREL:
13089 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13090 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13091 break;
13092 case UNSPEC_GOTTPOFF:
13093 /* FIXME: This might be @TPOFF in Sun ld too. */
13094 fputs ("@gottpoff", file);
13095 break;
13096 case UNSPEC_TPOFF:
13097 fputs ("@tpoff", file);
13098 break;
13099 case UNSPEC_NTPOFF:
13100 if (TARGET_64BIT)
13101 fputs ("@tpoff", file);
13102 else
13103 fputs ("@ntpoff", file);
13104 break;
13105 case UNSPEC_DTPOFF:
13106 fputs ("@dtpoff", file);
13107 break;
13108 case UNSPEC_GOTNTPOFF:
13109 if (TARGET_64BIT)
13110 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13111 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13112 else
13113 fputs ("@gotntpoff", file);
13114 break;
13115 case UNSPEC_INDNTPOFF:
13116 fputs ("@indntpoff", file);
13117 break;
13118 #if TARGET_MACHO
13119 case UNSPEC_MACHOPIC_OFFSET:
13120 putc ('-', file);
13121 machopic_output_function_base_name (file);
13122 break;
13123 #endif
13124 default:
13125 output_operand_lossage ("invalid UNSPEC as operand");
13126 break;
13127 }
13128 break;
13129
13130 default:
13131 output_operand_lossage ("invalid expression as operand");
13132 }
13133 }
13134
13135 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13136 We need to emit DTP-relative relocations. */
13137
13138 static void ATTRIBUTE_UNUSED
13139 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13140 {
13141 fputs (ASM_LONG, file);
13142 output_addr_const (file, x);
13143 fputs ("@dtpoff", file);
13144 switch (size)
13145 {
13146 case 4:
13147 break;
13148 case 8:
13149 fputs (", 0", file);
13150 break;
13151 default:
13152 gcc_unreachable ();
13153 }
13154 }
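/* Example of the resulting output (derived from the code above, assuming
   ASM_LONG is ".long" as on typical ELF configurations): a 4-byte entry
   becomes ".long sym@dtpoff" and an 8-byte entry ".long sym@dtpoff, 0".  */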
13155
13156 /* Return true if X is a representation of the PIC register. This copes
13157 with calls from ix86_find_base_term, where the register might have
13158 been replaced by a cselib value. */
13159
13160 static bool
13161 ix86_pic_register_p (rtx x)
13162 {
13163 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13164 return (pic_offset_table_rtx
13165 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13166 else
13167 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13168 }
13169
13170 /* Helper function for ix86_delegitimize_address.
13171 Attempt to delegitimize TLS local-exec accesses. */
13172
13173 static rtx
13174 ix86_delegitimize_tls_address (rtx orig_x)
13175 {
13176 rtx x = orig_x, unspec;
13177 struct ix86_address addr;
13178
13179 if (!TARGET_TLS_DIRECT_SEG_REFS)
13180 return orig_x;
13181 if (MEM_P (x))
13182 x = XEXP (x, 0);
13183 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13184 return orig_x;
13185 if (ix86_decompose_address (x, &addr) == 0
13186 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13187 || addr.disp == NULL_RTX
13188 || GET_CODE (addr.disp) != CONST)
13189 return orig_x;
13190 unspec = XEXP (addr.disp, 0);
13191 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13192 unspec = XEXP (unspec, 0);
13193 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13194 return orig_x;
13195 x = XVECEXP (unspec, 0, 0);
13196 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13197 if (unspec != XEXP (addr.disp, 0))
13198 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13199 if (addr.index)
13200 {
13201 rtx idx = addr.index;
13202 if (addr.scale != 1)
13203 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13204 x = gen_rtx_PLUS (Pmode, idx, x);
13205 }
13206 if (addr.base)
13207 x = gen_rtx_PLUS (Pmode, addr.base, x);
13208 if (MEM_P (orig_x))
13209 x = replace_equiv_address_nv (orig_x, x);
13210 return x;
13211 }
13212
13213 /* In the name of slightly smaller debug output, and to cater to
13214 general assembler lossage, recognize PIC+GOTOFF and turn it back
13215 into a direct symbol reference.
13216
13217 On Darwin, this is necessary to avoid a crash, because Darwin
13218 has a different PIC label for each routine but the DWARF debugging
13219 information is not associated with any particular routine, so it's
13220 necessary to remove references to the PIC label from RTL stored by
13221 the DWARF output code. */
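/* Illustrative only: (plus (reg:SI %ebx) (const (unspec [(symbol_ref "foo")]
   UNSPEC_GOTOFF))) is turned back into (symbol_ref "foo") by the code
   below; "foo" is a hypothetical symbol.  */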
13222
13223 static rtx
13224 ix86_delegitimize_address (rtx x)
13225 {
13226 rtx orig_x = delegitimize_mem_from_attrs (x);
13227 /* addend is NULL or some rtx if x is something+GOTOFF where
13228 something doesn't include the PIC register. */
13229 rtx addend = NULL_RTX;
13230 /* reg_addend is NULL or a multiple of some register. */
13231 rtx reg_addend = NULL_RTX;
13232 /* const_addend is NULL or a const_int. */
13233 rtx const_addend = NULL_RTX;
13234 /* This is the result, or NULL. */
13235 rtx result = NULL_RTX;
13236
13237 x = orig_x;
13238
13239 if (MEM_P (x))
13240 x = XEXP (x, 0);
13241
13242 if (TARGET_64BIT)
13243 {
13244 if (GET_CODE (x) == CONST
13245 && GET_CODE (XEXP (x, 0)) == PLUS
13246 && GET_MODE (XEXP (x, 0)) == Pmode
13247 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13248 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13249 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13250 {
13251 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13252 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13253 if (MEM_P (orig_x))
13254 x = replace_equiv_address_nv (orig_x, x);
13255 return x;
13256 }
13257 if (GET_CODE (x) != CONST
13258 || GET_CODE (XEXP (x, 0)) != UNSPEC
13259 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13260 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13261 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13262 return ix86_delegitimize_tls_address (orig_x);
13263 x = XVECEXP (XEXP (x, 0), 0, 0);
13264 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13265 {
13266 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13267 GET_MODE (x), 0);
13268 if (x == NULL_RTX)
13269 return orig_x;
13270 }
13271 return x;
13272 }
13273
13274 if (GET_CODE (x) != PLUS
13275 || GET_CODE (XEXP (x, 1)) != CONST)
13276 return ix86_delegitimize_tls_address (orig_x);
13277
13278 if (ix86_pic_register_p (XEXP (x, 0)))
13279 /* %ebx + GOT/GOTOFF */
13280 ;
13281 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13282 {
13283 /* %ebx + %reg * scale + GOT/GOTOFF */
13284 reg_addend = XEXP (x, 0);
13285 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13286 reg_addend = XEXP (reg_addend, 1);
13287 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13288 reg_addend = XEXP (reg_addend, 0);
13289 else
13290 {
13291 reg_addend = NULL_RTX;
13292 addend = XEXP (x, 0);
13293 }
13294 }
13295 else
13296 addend = XEXP (x, 0);
13297
13298 x = XEXP (XEXP (x, 1), 0);
13299 if (GET_CODE (x) == PLUS
13300 && CONST_INT_P (XEXP (x, 1)))
13301 {
13302 const_addend = XEXP (x, 1);
13303 x = XEXP (x, 0);
13304 }
13305
13306 if (GET_CODE (x) == UNSPEC
13307 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13308 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13309 result = XVECEXP (x, 0, 0);
13310
13311 if (TARGET_MACHO && darwin_local_data_pic (x)
13312 && !MEM_P (orig_x))
13313 result = XVECEXP (x, 0, 0);
13314
13315 if (! result)
13316 return ix86_delegitimize_tls_address (orig_x);
13317
13318 if (const_addend)
13319 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13320 if (reg_addend)
13321 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13322 if (addend)
13323 {
13324 /* If the rest of original X doesn't involve the PIC register, add
13325 addend and subtract pic_offset_table_rtx. This can happen e.g.
13326 for code like:
13327 leal (%ebx, %ecx, 4), %ecx
13328 ...
13329 movl foo@GOTOFF(%ecx), %edx
13330 in which case we return (%ecx - %ebx) + foo. */
13331 if (pic_offset_table_rtx)
13332 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13333 pic_offset_table_rtx),
13334 result);
13335 else
13336 return orig_x;
13337 }
13338 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13339 {
13340 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13341 if (result == NULL_RTX)
13342 return orig_x;
13343 }
13344 return result;
13345 }
13346
13347 /* If X is a machine specific address (i.e. a symbol or label being
13348 referenced as a displacement from the GOT implemented using an
13349 UNSPEC), then return the base term. Otherwise return X. */
13350
13351 rtx
13352 ix86_find_base_term (rtx x)
13353 {
13354 rtx term;
13355
13356 if (TARGET_64BIT)
13357 {
13358 if (GET_CODE (x) != CONST)
13359 return x;
13360 term = XEXP (x, 0);
13361 if (GET_CODE (term) == PLUS
13362 && (CONST_INT_P (XEXP (term, 1))
13363 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13364 term = XEXP (term, 0);
13365 if (GET_CODE (term) != UNSPEC
13366 || (XINT (term, 1) != UNSPEC_GOTPCREL
13367 && XINT (term, 1) != UNSPEC_PCREL))
13368 return x;
13369
13370 return XVECEXP (term, 0, 0);
13371 }
13372
13373 return ix86_delegitimize_address (x);
13374 }
13375 \f
13376 static void
13377 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13378 int fp, FILE *file)
13379 {
13380 const char *suffix;
13381
13382 if (mode == CCFPmode || mode == CCFPUmode)
13383 {
13384 code = ix86_fp_compare_code_to_integer (code);
13385 mode = CCmode;
13386 }
13387 if (reverse)
13388 code = reverse_condition (code);
13389
13390 switch (code)
13391 {
13392 case EQ:
13393 switch (mode)
13394 {
13395 case CCAmode:
13396 suffix = "a";
13397 break;
13398
13399 case CCCmode:
13400 suffix = "c";
13401 break;
13402
13403 case CCOmode:
13404 suffix = "o";
13405 break;
13406
13407 case CCSmode:
13408 suffix = "s";
13409 break;
13410
13411 default:
13412 suffix = "e";
13413 }
13414 break;
13415 case NE:
13416 switch (mode)
13417 {
13418 case CCAmode:
13419 suffix = "na";
13420 break;
13421
13422 case CCCmode:
13423 suffix = "nc";
13424 break;
13425
13426 case CCOmode:
13427 suffix = "no";
13428 break;
13429
13430 case CCSmode:
13431 suffix = "ns";
13432 break;
13433
13434 default:
13435 suffix = "ne";
13436 }
13437 break;
13438 case GT:
13439 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13440 suffix = "g";
13441 break;
13442 case GTU:
13443 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13444 Those same assemblers have the same but opposite lossage on cmov. */
13445 if (mode == CCmode)
13446 suffix = fp ? "nbe" : "a";
13447 else if (mode == CCCmode)
13448 suffix = "b";
13449 else
13450 gcc_unreachable ();
13451 break;
13452 case LT:
13453 switch (mode)
13454 {
13455 case CCNOmode:
13456 case CCGOCmode:
13457 suffix = "s";
13458 break;
13459
13460 case CCmode:
13461 case CCGCmode:
13462 suffix = "l";
13463 break;
13464
13465 default:
13466 gcc_unreachable ();
13467 }
13468 break;
13469 case LTU:
13470 gcc_assert (mode == CCmode || mode == CCCmode);
13471 suffix = "b";
13472 break;
13473 case GE:
13474 switch (mode)
13475 {
13476 case CCNOmode:
13477 case CCGOCmode:
13478 suffix = "ns";
13479 break;
13480
13481 case CCmode:
13482 case CCGCmode:
13483 suffix = "ge";
13484 break;
13485
13486 default:
13487 gcc_unreachable ();
13488 }
13489 break;
13490 case GEU:
13491 /* ??? As above. */
13492 gcc_assert (mode == CCmode || mode == CCCmode);
13493 suffix = fp ? "nb" : "ae";
13494 break;
13495 case LE:
13496 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13497 suffix = "le";
13498 break;
13499 case LEU:
13500 /* ??? As above. */
13501 if (mode == CCmode)
13502 suffix = "be";
13503 else if (mode == CCCmode)
13504 suffix = fp ? "nb" : "ae";
13505 else
13506 gcc_unreachable ();
13507 break;
13508 case UNORDERED:
13509 suffix = fp ? "u" : "p";
13510 break;
13511 case ORDERED:
13512 suffix = fp ? "nu" : "np";
13513 break;
13514 default:
13515 gcc_unreachable ();
13516 }
13517 fputs (suffix, file);
13518 }
13519
13520 /* Print the name of register X to FILE based on its machine mode and number.
13521 If CODE is 'w', pretend the mode is HImode.
13522 If CODE is 'b', pretend the mode is QImode.
13523 If CODE is 'k', pretend the mode is SImode.
13524 If CODE is 'q', pretend the mode is DImode.
13525 If CODE is 'x', pretend the mode is V4SFmode.
13526 If CODE is 't', pretend the mode is V8SFmode.
13527 If CODE is 'h', pretend the reg is the 'high' byte register.
13528 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
13529 If CODE is 'd', duplicate the operand for an AVX instruction.
13530 */
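/* A few illustrative cases (not from the original source): with code 'q'
   in 64-bit mode, register 0 is printed as %rax; with code 'h' it is
   printed as %ah; with code 'd' under AVX the operand is emitted twice,
   e.g. "%xmm0, %xmm0" in AT&T syntax.  */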
13531
13532 void
13533 print_reg (rtx x, int code, FILE *file)
13534 {
13535 const char *reg;
13536 bool duplicated = code == 'd' && TARGET_AVX;
13537
13538 gcc_assert (x == pc_rtx
13539 || (REGNO (x) != ARG_POINTER_REGNUM
13540 && REGNO (x) != FRAME_POINTER_REGNUM
13541 && REGNO (x) != FLAGS_REG
13542 && REGNO (x) != FPSR_REG
13543 && REGNO (x) != FPCR_REG));
13544
13545 if (ASSEMBLER_DIALECT == ASM_ATT)
13546 putc ('%', file);
13547
13548 if (x == pc_rtx)
13549 {
13550 gcc_assert (TARGET_64BIT);
13551 fputs ("rip", file);
13552 return;
13553 }
13554
13555 if (code == 'w' || MMX_REG_P (x))
13556 code = 2;
13557 else if (code == 'b')
13558 code = 1;
13559 else if (code == 'k')
13560 code = 4;
13561 else if (code == 'q')
13562 code = 8;
13563 else if (code == 'y')
13564 code = 3;
13565 else if (code == 'h')
13566 code = 0;
13567 else if (code == 'x')
13568 code = 16;
13569 else if (code == 't')
13570 code = 32;
13571 else
13572 code = GET_MODE_SIZE (GET_MODE (x));
13573
13574 /* Irritatingly, AMD extended registers use a different naming
13575 convention from the normal registers: "r%d[bwd]". */
13576 if (REX_INT_REG_P (x))
13577 {
13578 gcc_assert (TARGET_64BIT);
13579 putc ('r', file);
13580 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
13581 switch (code)
13582 {
13583 case 0:
13584 error ("extended registers have no high halves");
13585 break;
13586 case 1:
13587 putc ('b', file);
13588 break;
13589 case 2:
13590 putc ('w', file);
13591 break;
13592 case 4:
13593 putc ('d', file);
13594 break;
13595 case 8:
13596 /* no suffix */
13597 break;
13598 default:
13599 error ("unsupported operand size for extended register");
13600 break;
13601 }
13602 return;
13603 }
13604
13605 reg = NULL;
13606 switch (code)
13607 {
13608 case 3:
13609 if (STACK_TOP_P (x))
13610 {
13611 reg = "st(0)";
13612 break;
13613 }
13614 /* FALLTHRU */
13615 case 8:
13616 case 4:
13617 case 12:
13618 if (! ANY_FP_REG_P (x))
13619 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13620 /* FALLTHRU */
13621 case 16:
13622 case 2:
13623 normal:
13624 reg = hi_reg_name[REGNO (x)];
13625 break;
13626 case 1:
13627 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13628 goto normal;
13629 reg = qi_reg_name[REGNO (x)];
13630 break;
13631 case 0:
13632 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13633 goto normal;
13634 reg = qi_high_reg_name[REGNO (x)];
13635 break;
13636 case 32:
13637 if (SSE_REG_P (x))
13638 {
13639 gcc_assert (!duplicated);
13640 putc ('y', file);
13641 fputs (hi_reg_name[REGNO (x)] + 1, file);
13642 return;
13643 }
13644 break;
13645 default:
13646 gcc_unreachable ();
13647 }
13648
13649 fputs (reg, file);
13650 if (duplicated)
13651 {
13652 if (ASSEMBLER_DIALECT == ASM_ATT)
13653 fprintf (file, ", %%%s", reg);
13654 else
13655 fprintf (file, ", %s", reg);
13656 }
13657 }
13658
13659 /* Locate some local-dynamic symbol still in use by this function
13660 so that we can print its name in some tls_local_dynamic_base
13661 pattern. */
13662
13663 static int
13664 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13665 {
13666 rtx x = *px;
13667
13668 if (GET_CODE (x) == SYMBOL_REF
13669 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13670 {
13671 cfun->machine->some_ld_name = XSTR (x, 0);
13672 return 1;
13673 }
13674
13675 return 0;
13676 }
13677
13678 static const char *
13679 get_some_local_dynamic_name (void)
13680 {
13681 rtx insn;
13682
13683 if (cfun->machine->some_ld_name)
13684 return cfun->machine->some_ld_name;
13685
13686 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13687 if (NONDEBUG_INSN_P (insn)
13688 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13689 return cfun->machine->some_ld_name;
13690
13691 return NULL;
13692 }
13693
13694 /* Meaning of CODE:
13695 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13696 C -- print opcode suffix for set/cmov insn.
13697 c -- like C, but print reversed condition
13698 F,f -- likewise, but for floating-point.
13699 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13700 otherwise nothing
13701 R -- print the prefix for register names.
13702 z -- print the opcode suffix for the size of the current operand.
13703 Z -- likewise, with special suffixes for x87 instructions.
13704 * -- print a star (in certain assembler syntax)
13705 A -- print an absolute memory reference.
13706 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13707 s -- print a shift double count, followed by the assembler's
13708 argument delimiter.
13709 b -- print the QImode name of the register for the indicated operand.
13710 %b0 would print %al if operands[0] is reg 0.
13711 w -- likewise, print the HImode name of the register.
13712 k -- likewise, print the SImode name of the register.
13713 q -- likewise, print the DImode name of the register.
13714 x -- likewise, print the V4SFmode name of the register.
13715 t -- likewise, print the V8SFmode name of the register.
13716 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13717 y -- print "st(0)" instead of "st" as a register.
13718 d -- print duplicated register operand for AVX instruction.
13719 D -- print condition for SSE cmp instruction.
13720 P -- if PIC, print an @PLT suffix.
13721 p -- print raw symbol name.
13722 X -- don't print any sort of PIC '@' suffix for a symbol.
13723 & -- print some in-use local-dynamic symbol name.
13724 H -- print a memory address offset by 8; used for sse high-parts
13725 Y -- print condition for XOP pcom* instruction.
13726 + -- print a branch hint as 'cs' or 'ds' prefix
13727 ; -- print a semicolon (after prefixes, due to a bug in older gas).
13728 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13729 @ -- print a segment register of thread base pointer load
13730 */
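/* Hypothetical usage (the template below is an assumption, not a quote from
   the md files): in an output template such as "add%z0\t{%1, %0|%0, %1}",
   %z0 expands to the size suffix of operand 0, so an SImode operand yields
   "addl" in AT&T syntax and no suffix in Intel syntax.  */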
13731
13732 void
13733 ix86_print_operand (FILE *file, rtx x, int code)
13734 {
13735 if (code)
13736 {
13737 switch (code)
13738 {
13739 case '*':
13740 if (ASSEMBLER_DIALECT == ASM_ATT)
13741 putc ('*', file);
13742 return;
13743
13744 case '&':
13745 {
13746 const char *name = get_some_local_dynamic_name ();
13747 if (name == NULL)
13748 output_operand_lossage ("'%%&' used without any "
13749 "local dynamic TLS references");
13750 else
13751 assemble_name (file, name);
13752 return;
13753 }
13754
13755 case 'A':
13756 switch (ASSEMBLER_DIALECT)
13757 {
13758 case ASM_ATT:
13759 putc ('*', file);
13760 break;
13761
13762 case ASM_INTEL:
13763 /* Intel syntax. For absolute addresses, registers should not
13764 be surrounded by brackets. */
13765 if (!REG_P (x))
13766 {
13767 putc ('[', file);
13768 ix86_print_operand (file, x, 0);
13769 putc (']', file);
13770 return;
13771 }
13772 break;
13773
13774 default:
13775 gcc_unreachable ();
13776 }
13777
13778 ix86_print_operand (file, x, 0);
13779 return;
13780
13781
13782 case 'L':
13783 if (ASSEMBLER_DIALECT == ASM_ATT)
13784 putc ('l', file);
13785 return;
13786
13787 case 'W':
13788 if (ASSEMBLER_DIALECT == ASM_ATT)
13789 putc ('w', file);
13790 return;
13791
13792 case 'B':
13793 if (ASSEMBLER_DIALECT == ASM_ATT)
13794 putc ('b', file);
13795 return;
13796
13797 case 'Q':
13798 if (ASSEMBLER_DIALECT == ASM_ATT)
13799 putc ('l', file);
13800 return;
13801
13802 case 'S':
13803 if (ASSEMBLER_DIALECT == ASM_ATT)
13804 putc ('s', file);
13805 return;
13806
13807 case 'T':
13808 if (ASSEMBLER_DIALECT == ASM_ATT)
13809 putc ('t', file);
13810 return;
13811
13812 case 'z':
13813 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13814 {
13815 /* Opcodes don't get size suffixes if using Intel opcodes. */
13816 if (ASSEMBLER_DIALECT == ASM_INTEL)
13817 return;
13818
13819 switch (GET_MODE_SIZE (GET_MODE (x)))
13820 {
13821 case 1:
13822 putc ('b', file);
13823 return;
13824
13825 case 2:
13826 putc ('w', file);
13827 return;
13828
13829 case 4:
13830 putc ('l', file);
13831 return;
13832
13833 case 8:
13834 putc ('q', file);
13835 return;
13836
13837 default:
13838 output_operand_lossage
13839 ("invalid operand size for operand code '%c'", code);
13840 return;
13841 }
13842 }
13843
13844 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13845 warning
13846 (0, "non-integer operand used with operand code '%c'", code);
13847 /* FALLTHRU */
13848
13849 case 'Z':
13850 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
13851 if (ASSEMBLER_DIALECT == ASM_INTEL)
13852 return;
13853
13854 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13855 {
13856 switch (GET_MODE_SIZE (GET_MODE (x)))
13857 {
13858 case 2:
13859 #ifdef HAVE_AS_IX86_FILDS
13860 putc ('s', file);
13861 #endif
13862 return;
13863
13864 case 4:
13865 putc ('l', file);
13866 return;
13867
13868 case 8:
13869 #ifdef HAVE_AS_IX86_FILDQ
13870 putc ('q', file);
13871 #else
13872 fputs ("ll", file);
13873 #endif
13874 return;
13875
13876 default:
13877 break;
13878 }
13879 }
13880 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13881 {
13882 /* 387 opcodes don't get size suffixes
13883 if the operands are registers. */
13884 if (STACK_REG_P (x))
13885 return;
13886
13887 switch (GET_MODE_SIZE (GET_MODE (x)))
13888 {
13889 case 4:
13890 putc ('s', file);
13891 return;
13892
13893 case 8:
13894 putc ('l', file);
13895 return;
13896
13897 case 12:
13898 case 16:
13899 putc ('t', file);
13900 return;
13901
13902 default:
13903 break;
13904 }
13905 }
13906 else
13907 {
13908 output_operand_lossage
13909 ("invalid operand type used with operand code '%c'", code);
13910 return;
13911 }
13912
13913 output_operand_lossage
13914 ("invalid operand size for operand code '%c'", code);
13915 return;
13916
13917 case 'd':
13918 case 'b':
13919 case 'w':
13920 case 'k':
13921 case 'q':
13922 case 'h':
13923 case 't':
13924 case 'y':
13925 case 'x':
13926 case 'X':
13927 case 'P':
13928 case 'p':
13929 break;
13930
13931 case 's':
13932 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
13933 {
13934 ix86_print_operand (file, x, 0);
13935 fputs (", ", file);
13936 }
13937 return;
13938
13939 case 'D':
13940 /* Little bit of braindamage here. The SSE compare instructions
13941 use completely different names for the comparisons than the
13942 fp conditional moves do. */
13943 if (TARGET_AVX)
13944 {
13945 switch (GET_CODE (x))
13946 {
13947 case EQ:
13948 fputs ("eq", file);
13949 break;
13950 case UNEQ:
13951 fputs ("eq_us", file);
13952 break;
13953 case LT:
13954 fputs ("lt", file);
13955 break;
13956 case UNLT:
13957 fputs ("nge", file);
13958 break;
13959 case LE:
13960 fputs ("le", file);
13961 break;
13962 case UNLE:
13963 fputs ("ngt", file);
13964 break;
13965 case UNORDERED:
13966 fputs ("unord", file);
13967 break;
13968 case NE:
13969 fputs ("neq", file);
13970 break;
13971 case LTGT:
13972 fputs ("neq_oq", file);
13973 break;
13974 case GE:
13975 fputs ("ge", file);
13976 break;
13977 case UNGE:
13978 fputs ("nlt", file);
13979 break;
13980 case GT:
13981 fputs ("gt", file);
13982 break;
13983 case UNGT:
13984 fputs ("nle", file);
13985 break;
13986 case ORDERED:
13987 fputs ("ord", file);
13988 break;
13989 default:
13990 output_operand_lossage ("operand is not a condition code, "
13991 "invalid operand code 'D'");
13992 return;
13993 }
13994 }
13995 else
13996 {
13997 switch (GET_CODE (x))
13998 {
13999 case EQ:
14000 case UNEQ:
14001 fputs ("eq", file);
14002 break;
14003 case LT:
14004 case UNLT:
14005 fputs ("lt", file);
14006 break;
14007 case LE:
14008 case UNLE:
14009 fputs ("le", file);
14010 break;
14011 case UNORDERED:
14012 fputs ("unord", file);
14013 break;
14014 case NE:
14015 case LTGT:
14016 fputs ("neq", file);
14017 break;
14018 case UNGE:
14019 case GE:
14020 fputs ("nlt", file);
14021 break;
14022 case UNGT:
14023 case GT:
14024 fputs ("nle", file);
14025 break;
14026 case ORDERED:
14027 fputs ("ord", file);
14028 break;
14029 default:
14030 output_operand_lossage ("operand is not a condition code, "
14031 "invalid operand code 'D'");
14032 return;
14033 }
14034 }
14035 return;
14036 case 'O':
14037 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14038 if (ASSEMBLER_DIALECT == ASM_ATT)
14039 {
14040 switch (GET_MODE (x))
14041 {
14042 case HImode: putc ('w', file); break;
14043 case SImode:
14044 case SFmode: putc ('l', file); break;
14045 case DImode:
14046 case DFmode: putc ('q', file); break;
14047 default: gcc_unreachable ();
14048 }
14049 putc ('.', file);
14050 }
14051 #endif
14052 return;
14053 case 'C':
14054 if (!COMPARISON_P (x))
14055 {
14056 output_operand_lossage ("operand is neither a constant nor a "
14057 "condition code, invalid operand code "
14058 "'C'");
14059 return;
14060 }
14061 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
14062 return;
14063 case 'F':
14064 if (!COMPARISON_P (x))
14065 {
14066 output_operand_lossage ("operand is neither a constant nor a "
14067 "condition code, invalid operand code "
14068 "'F'");
14069 return;
14070 }
14071 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14072 if (ASSEMBLER_DIALECT == ASM_ATT)
14073 putc ('.', file);
14074 #endif
14075 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
14076 return;
14077
14078 /* Like above, but reverse condition */
14079 case 'c':
14080 /* Check to see if argument to %c is really a constant
14081 and not a condition code which needs to be reversed. */
14082 if (!COMPARISON_P (x))
14083 {
14084 output_operand_lossage ("operand is neither a constant nor a "
14085 "condition code, invalid operand "
14086 "code 'c'");
14087 return;
14088 }
14089 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
14090 return;
14091 case 'f':
14092 if (!COMPARISON_P (x))
14093 {
14094 output_operand_lossage ("operand is neither a constant nor a "
14095 "condition code, invalid operand "
14096 "code 'f'");
14097 return;
14098 }
14099 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14100 if (ASSEMBLER_DIALECT == ASM_ATT)
14101 putc ('.', file);
14102 #endif
14103 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
14104 return;
14105
14106 case 'H':
14107 if (!offsettable_memref_p (x))
14108 {
14109 output_operand_lossage ("operand is not an offsettable memory "
14110 "reference, invalid operand "
14111 "code 'H'");
14112 return;
14113 }
14114 /* It doesn't actually matter what mode we use here, as we're
14115 only going to use this for printing. */
14116 x = adjust_address_nv (x, DImode, 8);
14117 break;
14118
14119 case '+':
14120 {
14121 rtx x;
14122
14123 if (!optimize
14124 || optimize_function_for_size_p (cfun) || !TARGET_BRANCH_PREDICTION_HINTS)
14125 return;
14126
14127 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14128 if (x)
14129 {
14130 int pred_val = INTVAL (XEXP (x, 0));
14131
14132 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14133 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14134 {
14135 int taken = pred_val > REG_BR_PROB_BASE / 2;
14136 int cputaken = final_forward_branch_p (current_output_insn) == 0;
14137
14138 /* Emit hints only in the case the default branch prediction
14139 heuristics would fail. */
14140 if (taken != cputaken)
14141 {
14142 /* We use 3e (DS) prefix for taken branches and
14143 2e (CS) prefix for not taken branches. */
14144 if (taken)
14145 fputs ("ds ; ", file);
14146 else
14147 fputs ("cs ; ", file);
14148 }
14149 }
14150 }
14151 return;
14152 }
14153
14154 case 'Y':
14155 switch (GET_CODE (x))
14156 {
14157 case NE:
14158 fputs ("neq", file);
14159 break;
14160 case EQ:
14161 fputs ("eq", file);
14162 break;
14163 case GE:
14164 case GEU:
14165 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14166 break;
14167 case GT:
14168 case GTU:
14169 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14170 break;
14171 case LE:
14172 case LEU:
14173 fputs ("le", file);
14174 break;
14175 case LT:
14176 case LTU:
14177 fputs ("lt", file);
14178 break;
14179 case UNORDERED:
14180 fputs ("unord", file);
14181 break;
14182 case ORDERED:
14183 fputs ("ord", file);
14184 break;
14185 case UNEQ:
14186 fputs ("ueq", file);
14187 break;
14188 case UNGE:
14189 fputs ("nlt", file);
14190 break;
14191 case UNGT:
14192 fputs ("nle", file);
14193 break;
14194 case UNLE:
14195 fputs ("ule", file);
14196 break;
14197 case UNLT:
14198 fputs ("ult", file);
14199 break;
14200 case LTGT:
14201 fputs ("une", file);
14202 break;
14203 default:
14204 output_operand_lossage ("operand is not a condition code, "
14205 "invalid operand code 'Y'");
14206 return;
14207 }
14208 return;
14209
14210 case ';':
14211 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14212 putc (';', file);
14213 #endif
14214 return;
14215
14216 case '@':
14217 if (ASSEMBLER_DIALECT == ASM_ATT)
14218 putc ('%', file);
14219
14220 /* The kernel uses a different segment register for performance
14221 reasons; that way a system call does not have to trash the
14222 userspace segment register, which would be expensive. */
14223 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14224 fputs ("fs", file);
14225 else
14226 fputs ("gs", file);
14227 return;
14228
14229 case '~':
14230 putc (TARGET_AVX2 ? 'i' : 'f', file);
14231 return;
14232
14233 default:
14234 output_operand_lossage ("invalid operand code '%c'", code);
14235 }
14236 }
14237
14238 if (REG_P (x))
14239 print_reg (x, code, file);
14240
14241 else if (MEM_P (x))
14242 {
14243 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14244 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14245 && GET_MODE (x) != BLKmode)
14246 {
14247 const char * size;
14248 switch (GET_MODE_SIZE (GET_MODE (x)))
14249 {
14250 case 1: size = "BYTE"; break;
14251 case 2: size = "WORD"; break;
14252 case 4: size = "DWORD"; break;
14253 case 8: size = "QWORD"; break;
14254 case 12: size = "TBYTE"; break;
14255 case 16:
14256 if (GET_MODE (x) == XFmode)
14257 size = "TBYTE";
14258 else
14259 size = "XMMWORD";
14260 break;
14261 case 32: size = "YMMWORD"; break;
14262 default:
14263 gcc_unreachable ();
14264 }
14265
14266 /* Check for explicit size override (codes 'b', 'w', 'k',
14267 'q' and 'x') */
14268 if (code == 'b')
14269 size = "BYTE";
14270 else if (code == 'w')
14271 size = "WORD";
14272 else if (code == 'k')
14273 size = "DWORD";
14274 else if (code == 'q')
14275 size = "QWORD";
14276 else if (code == 'x')
14277 size = "XMMWORD";
14278
14279 fputs (size, file);
14280 fputs (" PTR ", file);
14281 }
14282
14283 x = XEXP (x, 0);
14284 /* Avoid (%rip) for call operands. */
14285 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14286 && !CONST_INT_P (x))
14287 output_addr_const (file, x);
14288 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14289 output_operand_lossage ("invalid constraints for operand");
14290 else
14291 output_address (x);
14292 }
14293
14294 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14295 {
14296 REAL_VALUE_TYPE r;
14297 long l;
14298
14299 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14300 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14301
14302 if (ASSEMBLER_DIALECT == ASM_ATT)
14303 putc ('$', file);
14304 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14305 if (code == 'q')
14306 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14307 else
14308 fprintf (file, "0x%08x", (unsigned int) l);
14309 }
14310
14311 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14312 {
14313 REAL_VALUE_TYPE r;
14314 long l[2];
14315
14316 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14317 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14318
14319 if (ASSEMBLER_DIALECT == ASM_ATT)
14320 putc ('$', file);
14321 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14322 }
14323
14324 /* These float cases don't actually occur as immediate operands. */
14325 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14326 {
14327 char dstr[30];
14328
14329 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14330 fputs (dstr, file);
14331 }
14332
14333 else
14334 {
14335 /* We have patterns that allow zero sets of memory, for instance.
14336 In 64-bit mode, we should probably support all 8-byte vectors,
14337 since we can in fact encode that into an immediate. */
14338 if (GET_CODE (x) == CONST_VECTOR)
14339 {
14340 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14341 x = const0_rtx;
14342 }
14343
14344 if (code != 'P' && code != 'p')
14345 {
14346 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14347 {
14348 if (ASSEMBLER_DIALECT == ASM_ATT)
14349 putc ('$', file);
14350 }
14351 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14352 || GET_CODE (x) == LABEL_REF)
14353 {
14354 if (ASSEMBLER_DIALECT == ASM_ATT)
14355 putc ('$', file);
14356 else
14357 fputs ("OFFSET FLAT:", file);
14358 }
14359 }
14360 if (CONST_INT_P (x))
14361 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14362 else if (flag_pic || MACHOPIC_INDIRECT)
14363 output_pic_addr_const (file, x, code);
14364 else
14365 output_addr_const (file, x);
14366 }
14367 }
14368
14369 static bool
14370 ix86_print_operand_punct_valid_p (unsigned char code)
14371 {
14372 return (code == '@' || code == '*' || code == '+'
14373 || code == '&' || code == ';' || code == '~');
14374 }
14375 \f
14376 /* Print a memory operand whose address is ADDR. */
14377
14378 static void
14379 ix86_print_operand_address (FILE *file, rtx addr)
14380 {
14381 struct ix86_address parts;
14382 rtx base, index, disp;
14383 int scale;
14384 int ok;
14385 bool vsib = false;
14386
14387 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14388 {
14389 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14390 gcc_assert (parts.index == NULL_RTX);
14391 parts.index = XVECEXP (addr, 0, 1);
14392 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14393 addr = XVECEXP (addr, 0, 0);
14394 vsib = true;
14395 }
14396 else
14397 ok = ix86_decompose_address (addr, &parts);
14398
14399 gcc_assert (ok);
14400
14401 if (parts.base && GET_CODE (parts.base) == SUBREG)
14402 {
14403 rtx tmp = SUBREG_REG (parts.base);
14404 parts.base = simplify_subreg (GET_MODE (parts.base),
14405 tmp, GET_MODE (tmp), 0);
14406 }
14407
14408 if (parts.index && GET_CODE (parts.index) == SUBREG)
14409 {
14410 rtx tmp = SUBREG_REG (parts.index);
14411 parts.index = simplify_subreg (GET_MODE (parts.index),
14412 tmp, GET_MODE (tmp), 0);
14413 }
14414
14415 base = parts.base;
14416 index = parts.index;
14417 disp = parts.disp;
14418 scale = parts.scale;
14419
14420 switch (parts.seg)
14421 {
14422 case SEG_DEFAULT:
14423 break;
14424 case SEG_FS:
14425 case SEG_GS:
14426 if (ASSEMBLER_DIALECT == ASM_ATT)
14427 putc ('%', file);
14428 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14429 break;
14430 default:
14431 gcc_unreachable ();
14432 }
14433
14434 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14435 if (TARGET_64BIT && !base && !index)
14436 {
14437 rtx symbol = disp;
14438
14439 if (GET_CODE (disp) == CONST
14440 && GET_CODE (XEXP (disp, 0)) == PLUS
14441 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14442 symbol = XEXP (XEXP (disp, 0), 0);
14443
14444 if (GET_CODE (symbol) == LABEL_REF
14445 || (GET_CODE (symbol) == SYMBOL_REF
14446 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14447 base = pc_rtx;
14448 }
14449 if (!base && !index)
14450 {
14451 /* A displacement-only address requires special attention. */
14452
14453 if (CONST_INT_P (disp))
14454 {
14455 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14456 fputs ("ds:", file);
14457 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14458 }
14459 else if (flag_pic)
14460 output_pic_addr_const (file, disp, 0);
14461 else
14462 output_addr_const (file, disp);
14463 }
14464 else
14465 {
14466 int code = 0;
14467
14468 /* Print SImode registers for zero-extended addresses to force
14469 addr32 prefix. Otherwise print DImode registers to avoid it. */
14470 if (TARGET_64BIT)
14471 code = ((GET_CODE (addr) == ZERO_EXTEND
14472 || GET_CODE (addr) == AND)
14473 ? 'l'
14474 : 'q');
14475
14476 if (ASSEMBLER_DIALECT == ASM_ATT)
14477 {
14478 if (disp)
14479 {
14480 if (flag_pic)
14481 output_pic_addr_const (file, disp, 0);
14482 else if (GET_CODE (disp) == LABEL_REF)
14483 output_asm_label (disp);
14484 else
14485 output_addr_const (file, disp);
14486 }
14487
14488 putc ('(', file);
14489 if (base)
14490 print_reg (base, code, file);
14491 if (index)
14492 {
14493 putc (',', file);
14494 print_reg (index, vsib ? 0 : code, file);
14495 if (scale != 1 || vsib)
14496 fprintf (file, ",%d", scale);
14497 }
14498 putc (')', file);
14499 }
14500 else
14501 {
14502 rtx offset = NULL_RTX;
14503
14504 if (disp)
14505 {
14506 /* Pull out the offset of a symbol; print any symbol itself. */
14507 if (GET_CODE (disp) == CONST
14508 && GET_CODE (XEXP (disp, 0)) == PLUS
14509 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14510 {
14511 offset = XEXP (XEXP (disp, 0), 1);
14512 disp = gen_rtx_CONST (VOIDmode,
14513 XEXP (XEXP (disp, 0), 0));
14514 }
14515
14516 if (flag_pic)
14517 output_pic_addr_const (file, disp, 0);
14518 else if (GET_CODE (disp) == LABEL_REF)
14519 output_asm_label (disp);
14520 else if (CONST_INT_P (disp))
14521 offset = disp;
14522 else
14523 output_addr_const (file, disp);
14524 }
14525
14526 putc ('[', file);
14527 if (base)
14528 {
14529 print_reg (base, code, file);
14530 if (offset)
14531 {
14532 if (INTVAL (offset) >= 0)
14533 putc ('+', file);
14534 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14535 }
14536 }
14537 else if (offset)
14538 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14539 else
14540 putc ('0', file);
14541
14542 if (index)
14543 {
14544 putc ('+', file);
14545 print_reg (index, vsib ? 0 : code, file);
14546 if (scale != 1 || vsib)
14547 fprintf (file, "*%d", scale);
14548 }
14549 putc (']', file);
14550 }
14551 }
14552 }
14553
14554 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14555
14556 static bool
14557 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14558 {
14559 rtx op;
14560
14561 if (GET_CODE (x) != UNSPEC)
14562 return false;
14563
14564 op = XVECEXP (x, 0, 0);
14565 switch (XINT (x, 1))
14566 {
14567 case UNSPEC_GOTTPOFF:
14568 output_addr_const (file, op);
14569 /* FIXME: This might be @TPOFF in Sun ld. */
14570 fputs ("@gottpoff", file);
14571 break;
14572 case UNSPEC_TPOFF:
14573 output_addr_const (file, op);
14574 fputs ("@tpoff", file);
14575 break;
14576 case UNSPEC_NTPOFF:
14577 output_addr_const (file, op);
14578 if (TARGET_64BIT)
14579 fputs ("@tpoff", file);
14580 else
14581 fputs ("@ntpoff", file);
14582 break;
14583 case UNSPEC_DTPOFF:
14584 output_addr_const (file, op);
14585 fputs ("@dtpoff", file);
14586 break;
14587 case UNSPEC_GOTNTPOFF:
14588 output_addr_const (file, op);
14589 if (TARGET_64BIT)
14590 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14591 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14592 else
14593 fputs ("@gotntpoff", file);
14594 break;
14595 case UNSPEC_INDNTPOFF:
14596 output_addr_const (file, op);
14597 fputs ("@indntpoff", file);
14598 break;
14599 #if TARGET_MACHO
14600 case UNSPEC_MACHOPIC_OFFSET:
14601 output_addr_const (file, op);
14602 putc ('-', file);
14603 machopic_output_function_base_name (file);
14604 break;
14605 #endif
14606
14607 case UNSPEC_STACK_CHECK:
14608 {
14609 int offset;
14610
14611 gcc_assert (flag_split_stack);
14612
14613 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14614 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14615 #else
14616 gcc_unreachable ();
14617 #endif
14618
14619 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14620 }
14621 break;
14622
14623 default:
14624 return false;
14625 }
14626
14627 return true;
14628 }
14629 \f
14630 /* Split one or more double-mode RTL references into pairs of half-mode
14631 references. The RTL can be REG, offsettable MEM, integer constant, or
14632 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14633 split and "num" is its length. lo_half and hi_half are output arrays
14634 that parallel "operands". */
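/* Illustrative note (added, not from the original sources): on a 32-bit
   target, splitting a DImode memory operand at address A yields an SImode
   lo_half at A and an SImode hi_half at A+4; for TImode the halves are
   DImode and the high half sits at offset 8, i.e. GET_MODE_SIZE (half_mode). */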
14635
14636 void
14637 split_double_mode (enum machine_mode mode, rtx operands[],
14638 int num, rtx lo_half[], rtx hi_half[])
14639 {
14640 enum machine_mode half_mode;
14641 unsigned int byte;
14642
14643 switch (mode)
14644 {
14645 case TImode:
14646 half_mode = DImode;
14647 break;
14648 case DImode:
14649 half_mode = SImode;
14650 break;
14651 default:
14652 gcc_unreachable ();
14653 }
14654
14655 byte = GET_MODE_SIZE (half_mode);
14656
14657 while (num--)
14658 {
14659 rtx op = operands[num];
14660
14661 /* simplify_subreg refuses to split volatile memory addresses,
14662 but we still have to handle them. */
14663 if (MEM_P (op))
14664 {
14665 lo_half[num] = adjust_address (op, half_mode, 0);
14666 hi_half[num] = adjust_address (op, half_mode, byte);
14667 }
14668 else
14669 {
14670 lo_half[num] = simplify_gen_subreg (half_mode, op,
14671 GET_MODE (op) == VOIDmode
14672 ? mode : GET_MODE (op), 0);
14673 hi_half[num] = simplify_gen_subreg (half_mode, op,
14674 GET_MODE (op) == VOIDmode
14675 ? mode : GET_MODE (op), byte);
14676 }
14677 }
14678 }
14679 \f
14680 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14681 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14682 is the expression of the binary operation. The output may either be
14683 emitted here, or returned to the caller, like all output_* functions.
14684
14685 There is no guarantee that the operands are the same mode, as they
14686 might be within FLOAT or FLOAT_EXTEND expressions. */
14687
14688 #ifndef SYSV386_COMPAT
14689 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14690 wants to fix the assemblers because that causes incompatibility
14691 with gcc. No-one wants to fix gcc because that causes
14692 incompatibility with assemblers... You can use the option of
14693 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14694 #define SYSV386_COMPAT 1
14695 #endif
14696
14697 const char *
14698 output_387_binary_op (rtx insn, rtx *operands)
14699 {
14700 static char buf[40];
14701 const char *p;
14702 const char *ssep;
14703 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14704
14705 #ifdef ENABLE_CHECKING
14706 /* Even if we do not want to check the inputs, this documents input
14707 constraints. Which helps in understanding the following code. */
14708 if (STACK_REG_P (operands[0])
14709 && ((REG_P (operands[1])
14710 && REGNO (operands[0]) == REGNO (operands[1])
14711 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14712 || (REG_P (operands[2])
14713 && REGNO (operands[0]) == REGNO (operands[2])
14714 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14715 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14716 ; /* ok */
14717 else
14718 gcc_assert (is_sse);
14719 #endif
14720
14721 switch (GET_CODE (operands[3]))
14722 {
14723 case PLUS:
14724 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14725 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14726 p = "fiadd";
14727 else
14728 p = "fadd";
14729 ssep = "vadd";
14730 break;
14731
14732 case MINUS:
14733 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14734 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14735 p = "fisub";
14736 else
14737 p = "fsub";
14738 ssep = "vsub";
14739 break;
14740
14741 case MULT:
14742 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14743 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14744 p = "fimul";
14745 else
14746 p = "fmul";
14747 ssep = "vmul";
14748 break;
14749
14750 case DIV:
14751 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14752 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14753 p = "fidiv";
14754 else
14755 p = "fdiv";
14756 ssep = "vdiv";
14757 break;
14758
14759 default:
14760 gcc_unreachable ();
14761 }
14762
14763 if (is_sse)
14764 {
14765 if (TARGET_AVX)
14766 {
14767 strcpy (buf, ssep);
14768 if (GET_MODE (operands[0]) == SFmode)
14769 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14770 else
14771 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14772 }
14773 else
14774 {
14775 strcpy (buf, ssep + 1);
14776 if (GET_MODE (operands[0]) == SFmode)
14777 strcat (buf, "ss\t{%2, %0|%0, %2}");
14778 else
14779 strcat (buf, "sd\t{%2, %0|%0, %2}");
14780 }
14781 return buf;
14782 }
14783 strcpy (buf, p);
14784
14785 switch (GET_CODE (operands[3]))
14786 {
14787 case MULT:
14788 case PLUS:
14789 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
14790 {
14791 rtx temp = operands[2];
14792 operands[2] = operands[1];
14793 operands[1] = temp;
14794 }
14795
14796 /* We know operands[0] == operands[1]. */
14797
14798 if (MEM_P (operands[2]))
14799 {
14800 p = "%Z2\t%2";
14801 break;
14802 }
14803
14804 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14805 {
14806 if (STACK_TOP_P (operands[0]))
14807 /* How is it that we are storing to a dead operand[2]?
14808 Well, presumably operands[1] is dead too. We can't
14809 store the result to st(0) as st(0) gets popped on this
14810 instruction. Instead store to operands[2] (which I
14811 think has to be st(1)). st(1) will be popped later.
14812 gcc <= 2.8.1 didn't have this check and generated
14813 assembly code that the Unixware assembler rejected. */
14814 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14815 else
14816 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14817 break;
14818 }
14819
14820 if (STACK_TOP_P (operands[0]))
14821 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14822 else
14823 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14824 break;
14825
14826 case MINUS:
14827 case DIV:
14828 if (MEM_P (operands[1]))
14829 {
14830 p = "r%Z1\t%1";
14831 break;
14832 }
14833
14834 if (MEM_P (operands[2]))
14835 {
14836 p = "%Z2\t%2";
14837 break;
14838 }
14839
14840 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14841 {
14842 #if SYSV386_COMPAT
14843 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
14844 derived assemblers, confusingly reverse the direction of
14845 the operation for fsub{r} and fdiv{r} when the
14846 destination register is not st(0). The Intel assembler
14847 doesn't have this brain damage. Read !SYSV386_COMPAT to
14848 figure out what the hardware really does. */
14849 if (STACK_TOP_P (operands[0]))
14850 p = "{p\t%0, %2|rp\t%2, %0}";
14851 else
14852 p = "{rp\t%2, %0|p\t%0, %2}";
14853 #else
14854 if (STACK_TOP_P (operands[0]))
14855 /* As above for fmul/fadd, we can't store to st(0). */
14856 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14857 else
14858 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14859 #endif
14860 break;
14861 }
14862
14863 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
14864 {
14865 #if SYSV386_COMPAT
14866 if (STACK_TOP_P (operands[0]))
14867 p = "{rp\t%0, %1|p\t%1, %0}";
14868 else
14869 p = "{p\t%1, %0|rp\t%0, %1}";
14870 #else
14871 if (STACK_TOP_P (operands[0]))
14872 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
14873 else
14874 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
14875 #endif
14876 break;
14877 }
14878
14879 if (STACK_TOP_P (operands[0]))
14880 {
14881 if (STACK_TOP_P (operands[1]))
14882 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14883 else
14884 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
14885 break;
14886 }
14887 else if (STACK_TOP_P (operands[1]))
14888 {
14889 #if SYSV386_COMPAT
14890 p = "{\t%1, %0|r\t%0, %1}";
14891 #else
14892 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
14893 #endif
14894 }
14895 else
14896 {
14897 #if SYSV386_COMPAT
14898 p = "{r\t%2, %0|\t%0, %2}";
14899 #else
14900 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14901 #endif
14902 }
14903 break;
14904
14905 default:
14906 gcc_unreachable ();
14907 }
14908
14909 strcat (buf, p);
14910 return buf;
14911 }
14912
14913 /* Return needed mode for entity in optimize_mode_switching pass. */
14914
14915 int
14916 ix86_mode_needed (int entity, rtx insn)
14917 {
14918 enum attr_i387_cw mode;
14919
14920 /* The mode UNINITIALIZED is used to store the control word after a
14921 function call or ASM pattern. The mode ANY specifies that the function
14922 has no requirements on the control word and makes no changes in the
14923 bits we are interested in. */
14924
14925 if (CALL_P (insn)
14926 || (NONJUMP_INSN_P (insn)
14927 && (asm_noperands (PATTERN (insn)) >= 0
14928 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
14929 return I387_CW_UNINITIALIZED;
14930
14931 if (recog_memoized (insn) < 0)
14932 return I387_CW_ANY;
14933
14934 mode = get_attr_i387_cw (insn);
14935
14936 switch (entity)
14937 {
14938 case I387_TRUNC:
14939 if (mode == I387_CW_TRUNC)
14940 return mode;
14941 break;
14942
14943 case I387_FLOOR:
14944 if (mode == I387_CW_FLOOR)
14945 return mode;
14946 break;
14947
14948 case I387_CEIL:
14949 if (mode == I387_CW_CEIL)
14950 return mode;
14951 break;
14952
14953 case I387_MASK_PM:
14954 if (mode == I387_CW_MASK_PM)
14955 return mode;
14956 break;
14957
14958 default:
14959 gcc_unreachable ();
14960 }
14961
14962 return I387_CW_ANY;
14963 }
14964
14965 /* Output code to initialize control word copies used by trunc?f?i and
14966 rounding patterns. CURRENT_MODE is set to current control word,
14967 while NEW_MODE is set to new control word. */
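/* Background note (added for illustration): in the x87 control word the
   rounding-control field occupies bits 11:10 -- 00 = nearest, 01 = down,
   10 = up, 11 = truncate -- and bit 5 is the precision-exception mask.
   This is why the code below ORs in 0x0c00, 0x0400, 0x0800 or 0x0020. */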
14968
14969 void
14970 emit_i387_cw_initialization (int mode)
14971 {
14972 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
14973 rtx new_mode;
14974
14975 enum ix86_stack_slot slot;
14976
14977 rtx reg = gen_reg_rtx (HImode);
14978
14979 emit_insn (gen_x86_fnstcw_1 (stored_mode));
14980 emit_move_insn (reg, copy_rtx (stored_mode));
14981
14982 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
14983 || optimize_function_for_size_p (cfun))
14984 {
14985 switch (mode)
14986 {
14987 case I387_CW_TRUNC:
14988 /* round toward zero (truncate) */
14989 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
14990 slot = SLOT_CW_TRUNC;
14991 break;
14992
14993 case I387_CW_FLOOR:
14994 /* round down toward -oo */
14995 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
14996 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
14997 slot = SLOT_CW_FLOOR;
14998 break;
14999
15000 case I387_CW_CEIL:
15001 /* round up toward +oo */
15002 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15003 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15004 slot = SLOT_CW_CEIL;
15005 break;
15006
15007 case I387_CW_MASK_PM:
15008 /* mask precision exception for nearbyint() */
15009 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15010 slot = SLOT_CW_MASK_PM;
15011 break;
15012
15013 default:
15014 gcc_unreachable ();
15015 }
15016 }
15017 else
15018 {
15019 switch (mode)
15020 {
15021 case I387_CW_TRUNC:
15022 /* round toward zero (truncate) */
15023 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15024 slot = SLOT_CW_TRUNC;
15025 break;
15026
15027 case I387_CW_FLOOR:
15028 /* round down toward -oo */
15029 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15030 slot = SLOT_CW_FLOOR;
15031 break;
15032
15033 case I387_CW_CEIL:
15034 /* round up toward +oo */
15035 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15036 slot = SLOT_CW_CEIL;
15037 break;
15038
15039 case I387_CW_MASK_PM:
15040 /* mask precision exception for nearbyint() */
15041 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15042 slot = SLOT_CW_MASK_PM;
15043 break;
15044
15045 default:
15046 gcc_unreachable ();
15047 }
15048 }
15049
15050 gcc_assert (slot < MAX_386_STACK_LOCALS);
15051
15052 new_mode = assign_386_stack_local (HImode, slot);
15053 emit_move_insn (new_mode, reg);
15054 }
15055
15056 /* Output code for INSN to convert a float to a signed int. OPERANDS
15057 are the insn operands. The output may be [HSD]Imode and the input
15058 operand may be [SDX]Fmode. */
15059
15060 const char *
15061 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15062 {
15063 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15064 int dimode_p = GET_MODE (operands[0]) == DImode;
15065 int round_mode = get_attr_i387_cw (insn);
15066
15067 /* Jump through a hoop or two for DImode, since the hardware has no
15068 non-popping instruction. We used to do this a different way, but
15069 that was somewhat fragile and broke with post-reload splitters. */
15070 if ((dimode_p || fisttp) && !stack_top_dies)
15071 output_asm_insn ("fld\t%y1", operands);
15072
15073 gcc_assert (STACK_TOP_P (operands[1]));
15074 gcc_assert (MEM_P (operands[0]));
15075 gcc_assert (GET_MODE (operands[1]) != TFmode);
15076
15077 if (fisttp)
15078 output_asm_insn ("fisttp%Z0\t%0", operands);
15079 else
15080 {
15081 if (round_mode != I387_CW_ANY)
15082 output_asm_insn ("fldcw\t%3", operands);
15083 if (stack_top_dies || dimode_p)
15084 output_asm_insn ("fistp%Z0\t%0", operands);
15085 else
15086 output_asm_insn ("fist%Z0\t%0", operands);
15087 if (round_mode != I387_CW_ANY)
15088 output_asm_insn ("fldcw\t%2", operands);
15089 }
15090
15091 return "";
15092 }
15093
15094 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15095 have the values zero or one, indicates the ffreep insn's operand
15096 from the OPERANDS array. */
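/* Illustrative example (added): when the assembler lacks ffreep support,
   the fallback below emits the raw opcode as data.  ffreep %st(i) encodes
   as the two bytes 0xDF 0xC0+i, so for %st(1) the emitted directive is
   roughly ASM_SHORT "0xc1df", which assembles to the bytes df c1. */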
15097
15098 static const char *
15099 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15100 {
15101 if (TARGET_USE_FFREEP)
15102 #ifdef HAVE_AS_IX86_FFREEP
15103 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15104 #else
15105 {
15106 static char retval[32];
15107 int regno = REGNO (operands[opno]);
15108
15109 gcc_assert (FP_REGNO_P (regno));
15110
15111 regno -= FIRST_STACK_REG;
15112
15113 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15114 return retval;
15115 }
15116 #endif
15117
15118 return opno ? "fstp\t%y1" : "fstp\t%y0";
15119 }
15120
15121
15122 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15123 should be used. UNORDERED_P is true when fucom should be used. */
15124
15125 const char *
15126 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15127 {
15128 int stack_top_dies;
15129 rtx cmp_op0, cmp_op1;
15130 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15131
15132 if (eflags_p)
15133 {
15134 cmp_op0 = operands[0];
15135 cmp_op1 = operands[1];
15136 }
15137 else
15138 {
15139 cmp_op0 = operands[1];
15140 cmp_op1 = operands[2];
15141 }
15142
15143 if (is_sse)
15144 {
15145 if (GET_MODE (operands[0]) == SFmode)
15146 if (unordered_p)
15147 return "%vucomiss\t{%1, %0|%0, %1}";
15148 else
15149 return "%vcomiss\t{%1, %0|%0, %1}";
15150 else
15151 if (unordered_p)
15152 return "%vucomisd\t{%1, %0|%0, %1}";
15153 else
15154 return "%vcomisd\t{%1, %0|%0, %1}";
15155 }
15156
15157 gcc_assert (STACK_TOP_P (cmp_op0));
15158
15159 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15160
15161 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15162 {
15163 if (stack_top_dies)
15164 {
15165 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15166 return output_387_ffreep (operands, 1);
15167 }
15168 else
15169 return "ftst\n\tfnstsw\t%0";
15170 }
15171
15172 if (STACK_REG_P (cmp_op1)
15173 && stack_top_dies
15174 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15175 && REGNO (cmp_op1) != FIRST_STACK_REG)
15176 {
15177 /* If both the top of the 387 stack and the other operand die, and
15178 the other operand is also a stack register, then this must be a
15179 `fcompp' float compare. */
15180
15181 if (eflags_p)
15182 {
15183 /* There is no double popping fcomi variant. Fortunately,
15184 eflags is immune from the fstp's cc clobbering. */
15185 if (unordered_p)
15186 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15187 else
15188 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15189 return output_387_ffreep (operands, 0);
15190 }
15191 else
15192 {
15193 if (unordered_p)
15194 return "fucompp\n\tfnstsw\t%0";
15195 else
15196 return "fcompp\n\tfnstsw\t%0";
15197 }
15198 }
15199 else
15200 {
15201 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
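/* Worked example (added for clarity): eflags_p = 1, integer mode = 0,
   unordered_p = 0, stack_top_dies = 1 gives mask = 9, selecting
   "fcomip" below -- an ordered compare into EFLAGS that also pops the
   dying stack top. */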
15202
15203 static const char * const alt[16] =
15204 {
15205 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15206 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15207 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15208 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15209
15210 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15211 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15212 NULL,
15213 NULL,
15214
15215 "fcomi\t{%y1, %0|%0, %y1}",
15216 "fcomip\t{%y1, %0|%0, %y1}",
15217 "fucomi\t{%y1, %0|%0, %y1}",
15218 "fucomip\t{%y1, %0|%0, %y1}",
15219
15220 NULL,
15221 NULL,
15222 NULL,
15223 NULL
15224 };
15225
15226 int mask;
15227 const char *ret;
15228
15229 mask = eflags_p << 3;
15230 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15231 mask |= unordered_p << 1;
15232 mask |= stack_top_dies;
15233
15234 gcc_assert (mask < 16);
15235 ret = alt[mask];
15236 gcc_assert (ret);
15237
15238 return ret;
15239 }
15240 }
15241
15242 void
15243 ix86_output_addr_vec_elt (FILE *file, int value)
15244 {
15245 const char *directive = ASM_LONG;
15246
15247 #ifdef ASM_QUAD
15248 if (TARGET_LP64)
15249 directive = ASM_QUAD;
15250 #else
15251 gcc_assert (!TARGET_64BIT);
15252 #endif
15253
15254 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15255 }
15256
15257 void
15258 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15259 {
15260 const char *directive = ASM_LONG;
15261
15262 #ifdef ASM_QUAD
15263 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15264 directive = ASM_QUAD;
15265 #else
15266 gcc_assert (!TARGET_64BIT);
15267 #endif
15268 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15269 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15270 fprintf (file, "%s%s%d-%s%d\n",
15271 directive, LPREFIX, value, LPREFIX, rel);
15272 else if (HAVE_AS_GOTOFF_IN_DATA)
15273 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15274 #if TARGET_MACHO
15275 else if (TARGET_MACHO)
15276 {
15277 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15278 machopic_output_function_base_name (file);
15279 putc ('\n', file);
15280 }
15281 #endif
15282 else
15283 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15284 GOT_SYMBOL_NAME, LPREFIX, value);
15285 }
15286 \f
15287 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15288 for the target. */
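/* Note (added): "xor reg, reg" is shorter than "mov $0, reg" and is a
   recognized zeroing idiom, but it clobbers the flags; that is why the
   xor form built below carries an explicit clobber of FLAGS_REG. */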
15289
15290 void
15291 ix86_expand_clear (rtx dest)
15292 {
15293 rtx tmp;
15294
15295 /* We play register width games, which are only valid after reload. */
15296 gcc_assert (reload_completed);
15297
15298 /* Avoid HImode and its attendant prefix byte. */
15299 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15300 dest = gen_rtx_REG (SImode, REGNO (dest));
15301 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15302
15303 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15304 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15305 {
15306 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15307 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15308 }
15309
15310 emit_insn (tmp);
15311 }
15312
15313 /* X is an unchanging MEM. If it is a constant pool reference, return
15314 the constant pool rtx, else NULL. */
15315
15316 rtx
15317 maybe_get_pool_constant (rtx x)
15318 {
15319 x = ix86_delegitimize_address (XEXP (x, 0));
15320
15321 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15322 return get_pool_constant (x);
15323
15324 return NULL_RTX;
15325 }
15326
15327 void
15328 ix86_expand_move (enum machine_mode mode, rtx operands[])
15329 {
15330 rtx op0, op1;
15331 enum tls_model model;
15332
15333 op0 = operands[0];
15334 op1 = operands[1];
15335
15336 if (GET_CODE (op1) == SYMBOL_REF)
15337 {
15338 model = SYMBOL_REF_TLS_MODEL (op1);
15339 if (model)
15340 {
15341 op1 = legitimize_tls_address (op1, model, true);
15342 op1 = force_operand (op1, op0);
15343 if (op1 == op0)
15344 return;
15345 if (GET_MODE (op1) != mode)
15346 op1 = convert_to_mode (mode, op1, 1);
15347 }
15348 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15349 && SYMBOL_REF_DLLIMPORT_P (op1))
15350 op1 = legitimize_dllimport_symbol (op1, false);
15351 }
15352 else if (GET_CODE (op1) == CONST
15353 && GET_CODE (XEXP (op1, 0)) == PLUS
15354 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15355 {
15356 rtx addend = XEXP (XEXP (op1, 0), 1);
15357 rtx symbol = XEXP (XEXP (op1, 0), 0);
15358 rtx tmp = NULL;
15359
15360 model = SYMBOL_REF_TLS_MODEL (symbol);
15361 if (model)
15362 tmp = legitimize_tls_address (symbol, model, true);
15363 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15364 && SYMBOL_REF_DLLIMPORT_P (symbol))
15365 tmp = legitimize_dllimport_symbol (symbol, true);
15366
15367 if (tmp)
15368 {
15369 tmp = force_operand (tmp, NULL);
15370 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15371 op0, 1, OPTAB_DIRECT);
15372 if (tmp == op0)
15373 return;
15374 if (GET_MODE (tmp) != mode)
15375 op1 = convert_to_mode (mode, tmp, 1);
15376 }
15377 }
15378
15379 if ((flag_pic || MACHOPIC_INDIRECT)
15380 && symbolic_operand (op1, mode))
15381 {
15382 if (TARGET_MACHO && !TARGET_64BIT)
15383 {
15384 #if TARGET_MACHO
15385 /* dynamic-no-pic */
15386 if (MACHOPIC_INDIRECT)
15387 {
15388 rtx temp = ((reload_in_progress
15389 || ((op0 && REG_P (op0))
15390 && mode == Pmode))
15391 ? op0 : gen_reg_rtx (Pmode));
15392 op1 = machopic_indirect_data_reference (op1, temp);
15393 if (MACHOPIC_PURE)
15394 op1 = machopic_legitimize_pic_address (op1, mode,
15395 temp == op1 ? 0 : temp);
15396 }
15397 if (op0 != op1 && GET_CODE (op0) != MEM)
15398 {
15399 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15400 emit_insn (insn);
15401 return;
15402 }
15403 if (GET_CODE (op0) == MEM)
15404 op1 = force_reg (Pmode, op1);
15405 else
15406 {
15407 rtx temp = op0;
15408 if (GET_CODE (temp) != REG)
15409 temp = gen_reg_rtx (Pmode);
15410 temp = legitimize_pic_address (op1, temp);
15411 if (temp == op0)
15412 return;
15413 op1 = temp;
15414 }
15415 /* dynamic-no-pic */
15416 #endif
15417 }
15418 else
15419 {
15420 if (MEM_P (op0))
15421 op1 = force_reg (mode, op1);
15422 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15423 {
15424 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15425 op1 = legitimize_pic_address (op1, reg);
15426 if (op0 == op1)
15427 return;
15428 if (GET_MODE (op1) != mode)
15429 op1 = convert_to_mode (mode, op1, 1);
15430 }
15431 }
15432 }
15433 else
15434 {
15435 if (MEM_P (op0)
15436 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15437 || !push_operand (op0, mode))
15438 && MEM_P (op1))
15439 op1 = force_reg (mode, op1);
15440
15441 if (push_operand (op0, mode)
15442 && ! general_no_elim_operand (op1, mode))
15443 op1 = copy_to_mode_reg (mode, op1);
15444
15445 /* Force large constants in 64bit compilation into register
15446 to get them CSEed. */
15447 if (can_create_pseudo_p ()
15448 && (mode == DImode) && TARGET_64BIT
15449 && immediate_operand (op1, mode)
15450 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15451 && !register_operand (op0, mode)
15452 && optimize)
15453 op1 = copy_to_mode_reg (mode, op1);
15454
15455 if (can_create_pseudo_p ()
15456 && FLOAT_MODE_P (mode)
15457 && GET_CODE (op1) == CONST_DOUBLE)
15458 {
15459 /* If we are loading a floating point constant to a register,
15460 force the value to memory now, since we'll get better code
15461 out the back end. */
15462
15463 op1 = validize_mem (force_const_mem (mode, op1));
15464 if (!register_operand (op0, mode))
15465 {
15466 rtx temp = gen_reg_rtx (mode);
15467 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15468 emit_move_insn (op0, temp);
15469 return;
15470 }
15471 }
15472 }
15473
15474 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15475 }
15476
15477 void
15478 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15479 {
15480 rtx op0 = operands[0], op1 = operands[1];
15481 unsigned int align = GET_MODE_ALIGNMENT (mode);
15482
15483 /* Force constants other than zero into memory. We do not know how
15484 the instructions used to build constants modify the upper 64 bits
15485 of the register; once we have that information we may be able
15486 to handle some of them more efficiently. */
15487 if (can_create_pseudo_p ()
15488 && register_operand (op0, mode)
15489 && (CONSTANT_P (op1)
15490 || (GET_CODE (op1) == SUBREG
15491 && CONSTANT_P (SUBREG_REG (op1))))
15492 && !standard_sse_constant_p (op1))
15493 op1 = validize_mem (force_const_mem (mode, op1));
15494
15495 /* We need to check memory alignment for SSE mode since attribute
15496 can make operands unaligned. */
15497 if (can_create_pseudo_p ()
15498 && SSE_REG_MODE_P (mode)
15499 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15500 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15501 {
15502 rtx tmp[2];
15503
15504 /* ix86_expand_vector_move_misalign() does not like constants ... */
15505 if (CONSTANT_P (op1)
15506 || (GET_CODE (op1) == SUBREG
15507 && CONSTANT_P (SUBREG_REG (op1))))
15508 op1 = validize_mem (force_const_mem (mode, op1));
15509
15510 /* ... nor both arguments in memory. */
15511 if (!register_operand (op0, mode)
15512 && !register_operand (op1, mode))
15513 op1 = force_reg (mode, op1);
15514
15515 tmp[0] = op0; tmp[1] = op1;
15516 ix86_expand_vector_move_misalign (mode, tmp);
15517 return;
15518 }
15519
15520 /* If neither operand is a register, force operand 1 into one. */
15521 if (can_create_pseudo_p ()
15522 && !register_operand (op0, mode)
15523 && !register_operand (op1, mode))
15524 {
15525 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15526 return;
15527 }
15528
15529 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15530 }
15531
15532 /* Split 32-byte AVX unaligned load and store if needed. */
15533
15534 static void
15535 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15536 {
15537 rtx m;
15538 rtx (*extract) (rtx, rtx, rtx);
15539 rtx (*move_unaligned) (rtx, rtx);
15540 enum machine_mode mode;
15541
15542 switch (GET_MODE (op0))
15543 {
15544 default:
15545 gcc_unreachable ();
15546 case V32QImode:
15547 extract = gen_avx_vextractf128v32qi;
15548 move_unaligned = gen_avx_movdqu256;
15549 mode = V16QImode;
15550 break;
15551 case V8SFmode:
15552 extract = gen_avx_vextractf128v8sf;
15553 move_unaligned = gen_avx_movups256;
15554 mode = V4SFmode;
15555 break;
15556 case V4DFmode:
15557 extract = gen_avx_vextractf128v4df;
15558 move_unaligned = gen_avx_movupd256;
15559 mode = V2DFmode;
15560 break;
15561 }
15562
15563 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15564 {
15565 rtx r = gen_reg_rtx (mode);
15566 m = adjust_address (op1, mode, 0);
15567 emit_move_insn (r, m);
15568 m = adjust_address (op1, mode, 16);
15569 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15570 emit_move_insn (op0, r);
15571 }
15572 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15573 {
15574 m = adjust_address (op0, mode, 0);
15575 emit_insn (extract (m, op1, const0_rtx));
15576 m = adjust_address (op0, mode, 16);
15577 emit_insn (extract (m, op1, const1_rtx));
15578 }
15579 else
15580 emit_insn (move_unaligned (op0, op1));
15581 }
15582
15583 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15584 straight to ix86_expand_vector_move. */
15585 /* Code generation for scalar reg-reg moves of single and double precision data:
15586 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15587 movaps reg, reg
15588 else
15589 movss reg, reg
15590 if (x86_sse_partial_reg_dependency == true)
15591 movapd reg, reg
15592 else
15593 movsd reg, reg
15594
15595 Code generation for scalar loads of double precision data:
15596 if (x86_sse_split_regs == true)
15597 movlpd mem, reg (gas syntax)
15598 else
15599 movsd mem, reg
15600
15601 Code generation for unaligned packed loads of single precision data
15602 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15603 if (x86_sse_unaligned_move_optimal)
15604 movups mem, reg
15605
15606 if (x86_sse_partial_reg_dependency == true)
15607 {
15608 xorps reg, reg
15609 movlps mem, reg
15610 movhps mem+8, reg
15611 }
15612 else
15613 {
15614 movlps mem, reg
15615 movhps mem+8, reg
15616 }
15617
15618 Code generation for unaligned packed loads of double precision data
15619 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15620 if (x86_sse_unaligned_move_optimal)
15621 movupd mem, reg
15622
15623 if (x86_sse_split_regs == true)
15624 {
15625 movlpd mem, reg
15626 movhpd mem+8, reg
15627 }
15628 else
15629 {
15630 movsd mem, reg
15631 movhpd mem+8, reg
15632 }
15633 */
15634
15635 void
15636 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15637 {
15638 rtx op0, op1, m;
15639
15640 op0 = operands[0];
15641 op1 = operands[1];
15642
15643 if (TARGET_AVX)
15644 {
15645 switch (GET_MODE_CLASS (mode))
15646 {
15647 case MODE_VECTOR_INT:
15648 case MODE_INT:
15649 switch (GET_MODE_SIZE (mode))
15650 {
15651 case 16:
15652 /* If we're optimizing for size, movups is the smallest. */
15653 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15654 {
15655 op0 = gen_lowpart (V4SFmode, op0);
15656 op1 = gen_lowpart (V4SFmode, op1);
15657 emit_insn (gen_sse_movups (op0, op1));
15658 return;
15659 }
15660 op0 = gen_lowpart (V16QImode, op0);
15661 op1 = gen_lowpart (V16QImode, op1);
15662 emit_insn (gen_sse2_movdqu (op0, op1));
15663 break;
15664 case 32:
15665 op0 = gen_lowpart (V32QImode, op0);
15666 op1 = gen_lowpart (V32QImode, op1);
15667 ix86_avx256_split_vector_move_misalign (op0, op1);
15668 break;
15669 default:
15670 gcc_unreachable ();
15671 }
15672 break;
15673 case MODE_VECTOR_FLOAT:
15674 op0 = gen_lowpart (mode, op0);
15675 op1 = gen_lowpart (mode, op1);
15676
15677 switch (mode)
15678 {
15679 case V4SFmode:
15680 emit_insn (gen_sse_movups (op0, op1));
15681 break;
15682 case V8SFmode:
15683 ix86_avx256_split_vector_move_misalign (op0, op1);
15684 break;
15685 case V2DFmode:
15686 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15687 {
15688 op0 = gen_lowpart (V4SFmode, op0);
15689 op1 = gen_lowpart (V4SFmode, op1);
15690 emit_insn (gen_sse_movups (op0, op1));
15691 return;
15692 }
15693 emit_insn (gen_sse2_movupd (op0, op1));
15694 break;
15695 case V4DFmode:
15696 ix86_avx256_split_vector_move_misalign (op0, op1);
15697 break;
15698 default:
15699 gcc_unreachable ();
15700 }
15701 break;
15702
15703 default:
15704 gcc_unreachable ();
15705 }
15706
15707 return;
15708 }
15709
15710 if (MEM_P (op1))
15711 {
15712 /* If we're optimizing for size, movups is the smallest. */
15713 if (optimize_insn_for_size_p ()
15714 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15715 {
15716 op0 = gen_lowpart (V4SFmode, op0);
15717 op1 = gen_lowpart (V4SFmode, op1);
15718 emit_insn (gen_sse_movups (op0, op1));
15719 return;
15720 }
15721
15722 /* ??? If we have typed data, then it would appear that using
15723 movdqu is the only way to get unaligned data loaded with
15724 integer type. */
15725 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15726 {
15727 op0 = gen_lowpart (V16QImode, op0);
15728 op1 = gen_lowpart (V16QImode, op1);
15729 emit_insn (gen_sse2_movdqu (op0, op1));
15730 return;
15731 }
15732
15733 if (TARGET_SSE2 && mode == V2DFmode)
15734 {
15735 rtx zero;
15736
15737 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15738 {
15739 op0 = gen_lowpart (V2DFmode, op0);
15740 op1 = gen_lowpart (V2DFmode, op1);
15741 emit_insn (gen_sse2_movupd (op0, op1));
15742 return;
15743 }
15744
15745 /* When SSE registers are split into halves, we can avoid
15746 writing to the top half twice. */
15747 if (TARGET_SSE_SPLIT_REGS)
15748 {
15749 emit_clobber (op0);
15750 zero = op0;
15751 }
15752 else
15753 {
15754 /* ??? Not sure about the best option for the Intel chips.
15755 The following would seem to satisfy; the register is
15756 entirely cleared, breaking the dependency chain. We
15757 then store to the upper half, with a dependency depth
15758 of one. A rumor has it that Intel recommends two movsd
15759 followed by an unpacklpd, but this is unconfirmed. And
15760 given that the dependency depth of the unpacklpd would
15761 still be one, I'm not sure why this would be better. */
15762 zero = CONST0_RTX (V2DFmode);
15763 }
15764
15765 m = adjust_address (op1, DFmode, 0);
15766 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15767 m = adjust_address (op1, DFmode, 8);
15768 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15769 }
15770 else
15771 {
15772 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15773 {
15774 op0 = gen_lowpart (V4SFmode, op0);
15775 op1 = gen_lowpart (V4SFmode, op1);
15776 emit_insn (gen_sse_movups (op0, op1));
15777 return;
15778 }
15779
15780 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15781 emit_move_insn (op0, CONST0_RTX (mode));
15782 else
15783 emit_clobber (op0);
15784
15785 if (mode != V4SFmode)
15786 op0 = gen_lowpart (V4SFmode, op0);
15787 m = adjust_address (op1, V2SFmode, 0);
15788 emit_insn (gen_sse_loadlps (op0, op0, m));
15789 m = adjust_address (op1, V2SFmode, 8);
15790 emit_insn (gen_sse_loadhps (op0, op0, m));
15791 }
15792 }
15793 else if (MEM_P (op0))
15794 {
15795 /* If we're optimizing for size, movups is the smallest. */
15796 if (optimize_insn_for_size_p ()
15797 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15798 {
15799 op0 = gen_lowpart (V4SFmode, op0);
15800 op1 = gen_lowpart (V4SFmode, op1);
15801 emit_insn (gen_sse_movups (op0, op1));
15802 return;
15803 }
15804
15805 /* ??? Similar to above, only less clear because of quote
15806 typeless stores unquote. */
15807 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
15808 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15809 {
15810 op0 = gen_lowpart (V16QImode, op0);
15811 op1 = gen_lowpart (V16QImode, op1);
15812 emit_insn (gen_sse2_movdqu (op0, op1));
15813 return;
15814 }
15815
15816 if (TARGET_SSE2 && mode == V2DFmode)
15817 {
15818 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15819 {
15820 op0 = gen_lowpart (V2DFmode, op0);
15821 op1 = gen_lowpart (V2DFmode, op1);
15822 emit_insn (gen_sse2_movupd (op0, op1));
15823 }
15824 else
15825 {
15826 m = adjust_address (op0, DFmode, 0);
15827 emit_insn (gen_sse2_storelpd (m, op1));
15828 m = adjust_address (op0, DFmode, 8);
15829 emit_insn (gen_sse2_storehpd (m, op1));
15830 }
15831 }
15832 else
15833 {
15834 if (mode != V4SFmode)
15835 op1 = gen_lowpart (V4SFmode, op1);
15836
15837 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15838 {
15839 op0 = gen_lowpart (V4SFmode, op0);
15840 emit_insn (gen_sse_movups (op0, op1));
15841 }
15842 else
15843 {
15844 m = adjust_address (op0, V2SFmode, 0);
15845 emit_insn (gen_sse_storelps (m, op1));
15846 m = adjust_address (op0, V2SFmode, 8);
15847 emit_insn (gen_sse_storehps (m, op1));
15848 }
15849 }
15850 }
15851 else
15852 gcc_unreachable ();
15853 }
15854
15855 /* Expand a push in MODE. This is some mode for which we do not support
15856 proper push instructions, at least from the registers that we expect
15857 the value to live in. */
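/* Sketch of the expansion (added for illustration, assuming a TImode push
   on x86-64): the code below emits roughly
       sub  $16, %rsp
       mov  x, (%rsp)
   i.e. an explicit stack-pointer adjustment followed by a store. */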
15858
15859 void
15860 ix86_expand_push (enum machine_mode mode, rtx x)
15861 {
15862 rtx tmp;
15863
15864 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
15865 GEN_INT (-GET_MODE_SIZE (mode)),
15866 stack_pointer_rtx, 1, OPTAB_DIRECT);
15867 if (tmp != stack_pointer_rtx)
15868 emit_move_insn (stack_pointer_rtx, tmp);
15869
15870 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
15871
15872 /* When we push an operand onto stack, it has to be aligned at least
15873 at the function argument boundary. However, since we don't have
15874 the argument type, we can't determine the actual argument
15875 boundary. */
15876 emit_move_insn (tmp, x);
15877 }
15878
15879 /* Helper function of ix86_fixup_binary_operands to canonicalize
15880 operand order. Returns true if the operands should be swapped. */
15881
15882 static bool
15883 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
15884 rtx operands[])
15885 {
15886 rtx dst = operands[0];
15887 rtx src1 = operands[1];
15888 rtx src2 = operands[2];
15889
15890 /* If the operation is not commutative, we can't do anything. */
15891 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
15892 return false;
15893
15894 /* Highest priority is that src1 should match dst. */
15895 if (rtx_equal_p (dst, src1))
15896 return false;
15897 if (rtx_equal_p (dst, src2))
15898 return true;
15899
15900 /* Next highest priority is that immediate constants come second. */
15901 if (immediate_operand (src2, mode))
15902 return false;
15903 if (immediate_operand (src1, mode))
15904 return true;
15905
15906 /* Lowest priority is that memory references should come second. */
15907 if (MEM_P (src2))
15908 return false;
15909 if (MEM_P (src1))
15910 return true;
15911
15912 return false;
15913 }
15914
15915
15916 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
15917 destination to use for the operation. If different from the true
15918 destination in operands[0], a copy operation will be required. */
15919
15920 rtx
15921 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
15922 rtx operands[])
15923 {
15924 rtx dst = operands[0];
15925 rtx src1 = operands[1];
15926 rtx src2 = operands[2];
15927
15928 /* Canonicalize operand order. */
15929 if (ix86_swap_binary_operands_p (code, mode, operands))
15930 {
15931 rtx temp;
15932
15933 /* It is invalid to swap operands of different modes. */
15934 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
15935
15936 temp = src1;
15937 src1 = src2;
15938 src2 = temp;
15939 }
15940
15941 /* Both source operands cannot be in memory. */
15942 if (MEM_P (src1) && MEM_P (src2))
15943 {
15944 /* Optimization: Only read from memory once. */
15945 if (rtx_equal_p (src1, src2))
15946 {
15947 src2 = force_reg (mode, src2);
15948 src1 = src2;
15949 }
15950 else
15951 src2 = force_reg (mode, src2);
15952 }
15953
15954 /* If the destination is memory, and we do not have matching source
15955 operands, do things in registers. */
15956 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
15957 dst = gen_reg_rtx (mode);
15958
15959 /* Source 1 cannot be a constant. */
15960 if (CONSTANT_P (src1))
15961 src1 = force_reg (mode, src1);
15962
15963 /* Source 1 cannot be a non-matching memory. */
15964 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
15965 src1 = force_reg (mode, src1);
15966
15967 /* Improve address combine. */
15968 if (code == PLUS
15969 && GET_MODE_CLASS (mode) == MODE_INT
15970 && MEM_P (src2))
15971 src2 = force_reg (mode, src2);
15972
15973 operands[1] = src1;
15974 operands[2] = src2;
15975 return dst;
15976 }
15977
15978 /* Similarly, but assume that the destination has already been
15979 set up properly. */
15980
15981 void
15982 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
15983 enum machine_mode mode, rtx operands[])
15984 {
15985 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
15986 gcc_assert (dst == operands[0]);
15987 }
15988
15989 /* Attempt to expand a binary operator. Make the expansion closer to the
15990 actual machine, than just general_operand, which will allow 3 separate
15991 memory references (one output, two input) in a single insn. */
15992
15993 void
15994 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
15995 rtx operands[])
15996 {
15997 rtx src1, src2, dst, op, clob;
15998
15999 dst = ix86_fixup_binary_operands (code, mode, operands);
16000 src1 = operands[1];
16001 src2 = operands[2];
16002
16003 /* Emit the instruction. */
16004
16005 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16006 if (reload_in_progress)
16007 {
16008 /* Reload doesn't know about the flags register, and doesn't know that
16009 it doesn't want to clobber it. We can only do this with PLUS. */
16010 gcc_assert (code == PLUS);
16011 emit_insn (op);
16012 }
16013 else if (reload_completed
16014 && code == PLUS
16015 && !rtx_equal_p (dst, src1))
16016 {
16017 /* This is going to be an LEA; avoid splitting it later. */
16018 emit_insn (op);
16019 }
16020 else
16021 {
16022 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16023 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16024 }
16025
16026 /* Fix up the destination if needed. */
16027 if (dst != operands[0])
16028 emit_move_insn (operands[0], dst);
16029 }
16030
16031 /* Return TRUE or FALSE depending on whether the binary operator meets the
16032 appropriate constraints. */
16033
16034 bool
16035 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16036 rtx operands[3])
16037 {
16038 rtx dst = operands[0];
16039 rtx src1 = operands[1];
16040 rtx src2 = operands[2];
16041
16042 /* Both source operands cannot be in memory. */
16043 if (MEM_P (src1) && MEM_P (src2))
16044 return false;
16045
16046 /* Canonicalize operand order for commutative operators. */
16047 if (ix86_swap_binary_operands_p (code, mode, operands))
16048 {
16049 rtx temp = src1;
16050 src1 = src2;
16051 src2 = temp;
16052 }
16053
16054 /* If the destination is memory, we must have a matching source operand. */
16055 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16056 return false;
16057
16058 /* Source 1 cannot be a constant. */
16059 if (CONSTANT_P (src1))
16060 return false;
16061
16062 /* Source 1 cannot be a non-matching memory. */
16063 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16064 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16065 return (code == AND
16066 && (mode == HImode
16067 || mode == SImode
16068 || (TARGET_64BIT && mode == DImode))
16069 && satisfies_constraint_L (src2));
16070
16071 return true;
16072 }
16073
16074 /* Attempt to expand a unary operator. Make the expansion closer to the
16075 actual machine, than just general_operand, which will allow 2 separate
16076 memory references (one output, one input) in a single insn. */
16077
16078 void
16079 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16080 rtx operands[])
16081 {
16082 int matching_memory;
16083 rtx src, dst, op, clob;
16084
16085 dst = operands[0];
16086 src = operands[1];
16087
16088 /* If the destination is memory, and we do not have matching source
16089 operands, do things in registers. */
16090 matching_memory = 0;
16091 if (MEM_P (dst))
16092 {
16093 if (rtx_equal_p (dst, src))
16094 matching_memory = 1;
16095 else
16096 dst = gen_reg_rtx (mode);
16097 }
16098
16099 /* When source operand is memory, destination must match. */
16100 if (MEM_P (src) && !matching_memory)
16101 src = force_reg (mode, src);
16102
16103 /* Emit the instruction. */
16104
16105 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16106 if (reload_in_progress || code == NOT)
16107 {
16108 /* Reload doesn't know about the flags register, and doesn't know that
16109 it doesn't want to clobber it. */
16110 gcc_assert (code == NOT);
16111 emit_insn (op);
16112 }
16113 else
16114 {
16115 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16116 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16117 }
16118
16119 /* Fix up the destination if needed. */
16120 if (dst != operands[0])
16121 emit_move_insn (operands[0], dst);
16122 }
16123
16124 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16125 divisor are within the range [0-255]. */
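/* Overview (added): the expansion below ORs the dividend and divisor into
   a scratch register and tests it against -0x100; if no bits above bit 7
   are set it branches to a QImode path that divides with divb and pulls
   the quotient from AL and the remainder from AH, otherwise it falls
   through to the full-width division. */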
16126
16127 void
16128 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16129 bool signed_p)
16130 {
16131 rtx end_label, qimode_label;
16132 rtx insn, div, mod;
16133 rtx scratch, tmp0, tmp1, tmp2;
16134 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16135 rtx (*gen_zero_extend) (rtx, rtx);
16136 rtx (*gen_test_ccno_1) (rtx, rtx);
16137
16138 switch (mode)
16139 {
16140 case SImode:
16141 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16142 gen_test_ccno_1 = gen_testsi_ccno_1;
16143 gen_zero_extend = gen_zero_extendqisi2;
16144 break;
16145 case DImode:
16146 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16147 gen_test_ccno_1 = gen_testdi_ccno_1;
16148 gen_zero_extend = gen_zero_extendqidi2;
16149 break;
16150 default:
16151 gcc_unreachable ();
16152 }
16153
16154 end_label = gen_label_rtx ();
16155 qimode_label = gen_label_rtx ();
16156
16157 scratch = gen_reg_rtx (mode);
16158
16159 /* Use 8bit unsigned divmod if dividend and divisor are within
16160 the range [0-255]. */
16161 emit_move_insn (scratch, operands[2]);
16162 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16163 scratch, 1, OPTAB_DIRECT);
16164 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16165 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16166 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16167 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16168 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16169 pc_rtx);
16170 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16171 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16172 JUMP_LABEL (insn) = qimode_label;
16173
16174 /* Generate original signed/unsigned divmod. */
16175 div = gen_divmod4_1 (operands[0], operands[1],
16176 operands[2], operands[3]);
16177 emit_insn (div);
16178
16179 /* Branch to the end. */
16180 emit_jump_insn (gen_jump (end_label));
16181 emit_barrier ();
16182
16183 /* Generate 8bit unsigned divide. */
16184 emit_label (qimode_label);
16185 /* Don't use operands[0] for result of 8bit divide since not all
16186 registers support QImode ZERO_EXTRACT. */
16187 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16188 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16189 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16190 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16191
16192 if (signed_p)
16193 {
16194 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16195 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16196 }
16197 else
16198 {
16199 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16200 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16201 }
16202
16203 /* Extract remainder from AH. */
16204 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16205 if (REG_P (operands[1]))
16206 insn = emit_move_insn (operands[1], tmp1);
16207 else
16208 {
16209 /* Need a new scratch register since the old one has result
16210 of 8bit divide. */
16211 scratch = gen_reg_rtx (mode);
16212 emit_move_insn (scratch, tmp1);
16213 insn = emit_move_insn (operands[1], scratch);
16214 }
16215 set_unique_reg_note (insn, REG_EQUAL, mod);
16216
16217 /* Zero extend quotient from AL. */
16218 tmp1 = gen_lowpart (QImode, tmp0);
16219 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16220 set_unique_reg_note (insn, REG_EQUAL, div);
16221
16222 emit_label (end_label);
16223 }
16224
16225 #define LEA_MAX_STALL (3)
16226 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16227
16228 /* Increase given DISTANCE in half-cycles according to
16229 dependencies between PREV and NEXT instructions.
16230 Add 1 half-cycle if there is no dependency and
16231 go to the next cycle if there is some dependency. */
16232
16233 static unsigned int
16234 increase_distance (rtx prev, rtx next, unsigned int distance)
16235 {
16236 df_ref *use_rec;
16237 df_ref *def_rec;
16238
16239 if (!prev || !next)
16240 return distance + (distance & 1) + 2;
16241
16242 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16243 return distance + 1;
16244
16245 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16246 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16247 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16248 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16249 return distance + (distance & 1) + 2;
16250
16251 return distance + 1;
16252 }
16253
16254 /* Return true if instruction INSN defines register number
16255 REGNO1 or REGNO2. */
16256
16257 static bool
16258 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16259 rtx insn)
16260 {
16261 df_ref *def_rec;
16262
16263 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16264 if (DF_REF_REG_DEF_P (*def_rec)
16265 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16266 && (regno1 == DF_REF_REGNO (*def_rec)
16267 || regno2 == DF_REF_REGNO (*def_rec)))
16268 {
16269 return true;
16270 }
16271
16272 return false;
16273 }
16274
16275 /* Return true if instruction INSN uses register number
16276 REGNO as a part of an address expression. */
16277
16278 static bool
16279 insn_uses_reg_mem (unsigned int regno, rtx insn)
16280 {
16281 df_ref *use_rec;
16282
16283 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16284 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16285 return true;
16286
16287 return false;
16288 }
16289
16290 /* Search backward for non-agu definition of register number REGNO1
16291 or register number REGNO2 in basic block starting from instruction
16292 START up to head of basic block or instruction INSN.
16293
16294 Put true into *FOUND if a definition was found and false
16295 otherwise.
16296
16297 Distance in half-cycles between START and found instruction or head
16298 of BB is added to DISTANCE and returned. */
16299
16300 static int
16301 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16302 rtx insn, int distance,
16303 rtx start, bool *found)
16304 {
16305 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16306 rtx prev = start;
16307 rtx next = NULL;
16308
16309 *found = false;
16310
16311 while (prev
16312 && prev != insn
16313 && distance < LEA_SEARCH_THRESHOLD)
16314 {
16315 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16316 {
16317 distance = increase_distance (prev, next, distance);
16318 if (insn_defines_reg (regno1, regno2, prev))
16319 {
16320 if (recog_memoized (prev) < 0
16321 || get_attr_type (prev) != TYPE_LEA)
16322 {
16323 *found = true;
16324 return distance;
16325 }
16326 }
16327
16328 next = prev;
16329 }
16330 if (prev == BB_HEAD (bb))
16331 break;
16332
16333 prev = PREV_INSN (prev);
16334 }
16335
16336 return distance;
16337 }
16338
16339 /* Search backward for non-agu definition of register number REGNO1
16340 or register number REGNO2 in INSN's basic block until
16341 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16342 2. Reach neighbour BBs boundary, or
16343 3. Reach agu definition.
16344 Returns the distance between the non-agu definition point and INSN.
16345 If no definition point, returns -1. */
16346
16347 static int
16348 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16349 rtx insn)
16350 {
16351 basic_block bb = BLOCK_FOR_INSN (insn);
16352 int distance = 0;
16353 bool found = false;
16354
16355 if (insn != BB_HEAD (bb))
16356 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16357 distance, PREV_INSN (insn),
16358 &found);
16359
16360 if (!found && distance < LEA_SEARCH_THRESHOLD)
16361 {
16362 edge e;
16363 edge_iterator ei;
16364 bool simple_loop = false;
16365
16366 FOR_EACH_EDGE (e, ei, bb->preds)
16367 if (e->src == bb)
16368 {
16369 simple_loop = true;
16370 break;
16371 }
16372
16373 if (simple_loop)
16374 distance = distance_non_agu_define_in_bb (regno1, regno2,
16375 insn, distance,
16376 BB_END (bb), &found);
16377 else
16378 {
16379 int shortest_dist = -1;
16380 bool found_in_bb = false;
16381
16382 FOR_EACH_EDGE (e, ei, bb->preds)
16383 {
16384 int bb_dist
16385 = distance_non_agu_define_in_bb (regno1, regno2,
16386 insn, distance,
16387 BB_END (e->src),
16388 &found_in_bb);
16389 if (found_in_bb)
16390 {
16391 if (shortest_dist < 0)
16392 shortest_dist = bb_dist;
16393 else if (bb_dist > 0)
16394 shortest_dist = MIN (bb_dist, shortest_dist);
16395
16396 found = true;
16397 }
16398 }
16399
16400 distance = shortest_dist;
16401 }
16402 }
16403
16404 /* get_attr_type may modify recog data. We want to make sure
16405 that recog data is valid for instruction INSN, on which
16406 distance_non_agu_define is called. INSN is unchanged here. */
16407 extract_insn_cached (insn);
16408
16409 if (!found)
16410 return -1;
16411
16412 return distance >> 1;
16413 }
16414
16415 /* Return the distance in half-cycles between INSN and the next
16416 insn that uses register number REGNO in memory address added
16417 to DISTANCE. Return -1 if REGNO is set.
16418
16419 Put true value into *FOUND if register usage was found and
16420 false otherwise.
16421 Put true value into *REDEFINED if register redefinition was
16422 found and false otherwise. */
16423
16424 static int
16425 distance_agu_use_in_bb (unsigned int regno,
16426 rtx insn, int distance, rtx start,
16427 bool *found, bool *redefined)
16428 {
16429 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16430 rtx next = start;
16431 rtx prev = NULL;
16432
16433 *found = false;
16434 *redefined = false;
16435
16436 while (next
16437 && next != insn
16438 && distance < LEA_SEARCH_THRESHOLD)
16439 {
16440 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16441 {
16442 distance = increase_distance (prev, next, distance);
16443 if (insn_uses_reg_mem (regno, next))
16444 {
16445 /* Return DISTANCE if OP0 is used in memory
16446 address in NEXT. */
16447 *found = true;
16448 return distance;
16449 }
16450
16451 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16452 {
16453 /* Return -1 if OP0 is set in NEXT. */
16454 *redefined = true;
16455 return -1;
16456 }
16457
16458 prev = next;
16459 }
16460
16461 if (next == BB_END (bb))
16462 break;
16463
16464 next = NEXT_INSN (next);
16465 }
16466
16467 return distance;
16468 }
16469
16470 /* Return the distance between INSN and the next insn that uses
16471 register number REGNO0 in a memory address. Return -1 if no such
16472 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
16473
16474 static int
16475 distance_agu_use (unsigned int regno0, rtx insn)
16476 {
16477 basic_block bb = BLOCK_FOR_INSN (insn);
16478 int distance = 0;
16479 bool found = false;
16480 bool redefined = false;
16481
16482 if (insn != BB_END (bb))
16483 distance = distance_agu_use_in_bb (regno0, insn, distance,
16484 NEXT_INSN (insn),
16485 &found, &redefined);
16486
16487 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16488 {
16489 edge e;
16490 edge_iterator ei;
16491 bool simple_loop = false;
16492
16493 FOR_EACH_EDGE (e, ei, bb->succs)
16494 if (e->dest == bb)
16495 {
16496 simple_loop = true;
16497 break;
16498 }
16499
16500 if (simple_loop)
16501 distance = distance_agu_use_in_bb (regno0, insn,
16502 distance, BB_HEAD (bb),
16503 &found, &redefined);
16504 else
16505 {
16506 int shortest_dist = -1;
16507 bool found_in_bb = false;
16508 bool redefined_in_bb = false;
16509
16510 FOR_EACH_EDGE (e, ei, bb->succs)
16511 {
16512 int bb_dist
16513 = distance_agu_use_in_bb (regno0, insn,
16514 distance, BB_HEAD (e->dest),
16515 &found_in_bb, &redefined_in_bb);
16516 if (found_in_bb)
16517 {
16518 if (shortest_dist < 0)
16519 shortest_dist = bb_dist;
16520 else if (bb_dist > 0)
16521 shortest_dist = MIN (bb_dist, shortest_dist);
16522
16523 found = true;
16524 }
16525 }
16526
16527 distance = shortest_dist;
16528 }
16529 }
16530
16531 if (!found || redefined)
16532 return -1;
16533
16534 return distance >> 1;
16535 }
16536
16537 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16538 there is a dilemma of choosing LEA or ADD:
16539 Negative value: ADD is preferred over LEA
16540 Zero: Neutral
16541 Positive value: LEA is preferred over ADD. */
16542 #define IX86_LEA_PRIORITY 0
16543
16544 /* Return true if using lea INSN has a performance advantage over the
16545 equivalent sequence of instructions. The instruction sequence has
16546 SPLIT_COST cycles higher latency than the lea. */
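/* Illustrative example (not from any particular workload): with
   LEA_MAX_STALL == 3, if the address register is produced by a non-AGU
   instruction 1 half-cycle before the lea (dist_define == 1) and the lea
   result feeds a memory address 4 half-cycles later (dist_use == 4), then
   with split_cost == 1 we compare 1 + 1 + IX86_LEA_PRIORITY against 4,
   so the split sequence is preferred over the lea.  */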
16547
16548 bool
16549 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16550 unsigned int regno2, unsigned int split_cost)
16551 {
16552 int dist_define, dist_use;
16553
16554 dist_define = distance_non_agu_define (regno1, regno2, insn);
16555 dist_use = distance_agu_use (regno0, insn);
16556
16557 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16558 {
16559 /* If there is no non-AGU operand definition, no AGU
16560 operand usage and the split cost is 0, then both the lea
16561 and non-lea variants have the same priority. Currently
16562 we prefer lea for 64-bit code and non-lea for 32-bit
16563 code. */
16564 if (dist_use < 0 && split_cost == 0)
16565 return TARGET_64BIT || IX86_LEA_PRIORITY;
16566 else
16567 return true;
16568 }
16569
16570 /* The longer the definition distance, the more preferable lea is.
16571 Here we adjust it to take into account the splitting cost and
16572 lea priority. */
16573 dist_define += split_cost + IX86_LEA_PRIORITY;
16574
16575 /* If there is no use in a memory address then we just check
16576 that the split cost does not exceed the AGU stall. */
16577 if (dist_use < 0)
16578 return dist_define >= LEA_MAX_STALL;
16579
16580 /* If this insn has both backward non-agu dependence and forward
16581 agu dependence, the one with the shorter distance takes effect. */
16582 return dist_define >= dist_use;
16583 }
16584
16585 /* Return true if it is legal for INSN to clobber the flags register,
16586 and false otherwise. */
16587
16588 static bool
16589 ix86_ok_to_clobber_flags (rtx insn)
16590 {
16591 basic_block bb = BLOCK_FOR_INSN (insn);
16592 df_ref *use;
16593 bitmap live;
16594
16595 while (insn)
16596 {
16597 if (NONDEBUG_INSN_P (insn))
16598 {
16599 for (use = DF_INSN_USES (insn); *use; use++)
16600 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16601 return false;
16602
16603 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16604 return true;
16605 }
16606
16607 if (insn == BB_END (bb))
16608 break;
16609
16610 insn = NEXT_INSN (insn);
16611 }
16612
16613 live = df_get_live_out (bb);
16614 return !REGNO_REG_SET_P (live, FLAGS_REG);
16615 }
16616
16617 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16618 move and add to avoid AGU stalls. */
16619
16620 bool
16621 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16622 {
16623 unsigned int regno0 = true_regnum (operands[0]);
16624 unsigned int regno1 = true_regnum (operands[1]);
16625 unsigned int regno2 = true_regnum (operands[2]);
16626
16627 /* Check if we need to optimize. */
16628 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16629 return false;
16630
16631 /* Check it is correct to split here. */
16632 if (!ix86_ok_to_clobber_flags (insn))
16633 return false;
16634
16635 /* We only need to split adds with a non-destructive
16636 destination operand. */
16637 if (regno0 == regno1 || regno0 == regno2)
16638 return false;
16639 else
16640 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16641 }
16642
16643 /* Return true if we should emit lea instruction instead of mov
16644 instruction. */
16645
16646 bool
16647 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16648 {
16649 unsigned int regno0;
16650 unsigned int regno1;
16651
16652 /* Check if we need to optimize. */
16653 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16654 return false;
16655
16656 /* Use lea for reg to reg moves only. */
16657 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16658 return false;
16659
16660 regno0 = true_regnum (operands[0]);
16661 regno1 = true_regnum (operands[1]);
16662
16663 return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
16664 }
16665
16666 /* Return true if we need to split lea into a sequence of
16667 instructions to avoid AGU stalls. */
16668
16669 bool
16670 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16671 {
16672 unsigned int regno0 = true_regnum (operands[0]);
16673 unsigned int regno1 = -1;
16674 unsigned int regno2 = -1;
16675 unsigned int split_cost = 0;
16676 struct ix86_address parts;
16677 int ok;
16678
16679 /* Check we need to optimize. */
16680 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16681 return false;
16682
16683 /* Check it is correct to split here. */
16684 if (!ix86_ok_to_clobber_flags (insn))
16685 return false;
16686
16687 ok = ix86_decompose_address (operands[1], &parts);
16688 gcc_assert (ok);
16689
16690 /* We should not split into add if a non-legitimate PIC
16691 operand is used as the displacement. */
16692 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16693 return false;
16694
16695 if (parts.base)
16696 regno1 = true_regnum (parts.base);
16697 if (parts.index)
16698 regno2 = true_regnum (parts.index);
16699
16700 /* Compute how many cycles we will add to the execution time
16701 if we split the lea into a sequence of instructions. */
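/* Illustrative example: splitting "lea 0x8(%rbx,%rcx,2), %rax" needs a
   mov of the index into %rax, a shift for the scale, an add of the base
   and an add of the displacement; with the lea itself subtracted, the
   accounting below yields a split_cost of 3.  */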
16702 if (parts.base || parts.index)
16703 {
16704 /* Have to use a mov instruction if the non-destructive
16705 destination form is used. */
16706 if (regno1 != regno0 && regno2 != regno0)
16707 split_cost += 1;
16708
16709 /* Have to add index to base if both exist. */
16710 if (parts.base && parts.index)
16711 split_cost += 1;
16712
16713 /* Have to use shift and adds if scale is 2 or greater. */
16714 if (parts.scale > 1)
16715 {
16716 if (regno0 != regno1)
16717 split_cost += 1;
16718 else if (regno2 == regno0)
16719 split_cost += 4;
16720 else
16721 split_cost += parts.scale;
16722 }
16723
16724 /* Have to use an add instruction with an immediate if
16725 disp is nonzero. */
16726 if (parts.disp && parts.disp != const0_rtx)
16727 split_cost += 1;
16728
16729 /* Subtract the price of lea. */
16730 split_cost -= 1;
16731 }
16732
16733 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16734 }
16735
16736 /* Emit x86 binary operator CODE in mode MODE, where the first operand
16737 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
16738
16739 static void
16740 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16741 rtx dst, rtx src)
16742 {
16743 rtx op, clob;
16744
16745 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16746 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16747
16748 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16749 }
16750
16751 /* Split lea instructions into a sequence of instructions
16752 which are executed on the ALU to avoid AGU stalls.
16753 It is assumed that the flags register may be clobbered
16754 at the lea position. */
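/* For instance, "lea 0x8(%rbx,%rcx,2), %rax" may become the ALU sequence
   "mov %rcx, %rax; shl $1, %rax; add %rbx, %rax; add $0x8, %rax"
   (illustrative only; the exact sequence depends on which registers
   overlap the destination).  */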
16755
16756 extern void
16757 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16758 {
16759 unsigned int regno0 = true_regnum (operands[0]);
16760 unsigned int regno1 = INVALID_REGNUM;
16761 unsigned int regno2 = INVALID_REGNUM;
16762 struct ix86_address parts;
16763 rtx tmp;
16764 int ok, adds;
16765
16766 ok = ix86_decompose_address (operands[1], &parts);
16767 gcc_assert (ok);
16768
16769 if (parts.base)
16770 {
16771 if (GET_MODE (parts.base) != mode)
16772 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16773 regno1 = true_regnum (parts.base);
16774 }
16775
16776 if (parts.index)
16777 {
16778 if (GET_MODE (parts.index) != mode)
16779 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16780 regno2 = true_regnum (parts.index);
16781 }
16782
16783 if (parts.scale > 1)
16784 {
16785 /* Case r1 = r1 + ... */
16786 if (regno1 == regno0)
16787 {
16788 /* If we have the case r1 = r1 + C * r1 then we
16789 would have to use multiplication, which is very
16790 expensive. Assume the cost model is wrong if we
16791 get such a case here. */
16792 gcc_assert (regno2 != regno0);
16793
16794 for (adds = parts.scale; adds > 0; adds--)
16795 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16796 }
16797 else
16798 {
16799 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
16800 if (regno0 != regno2)
16801 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16802
16803 /* Use shift for scaling. */
16804 ix86_emit_binop (ASHIFT, mode, operands[0],
16805 GEN_INT (exact_log2 (parts.scale)));
16806
16807 if (parts.base)
16808 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
16809
16810 if (parts.disp && parts.disp != const0_rtx)
16811 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16812 }
16813 }
16814 else if (!parts.base && !parts.index)
16815 {
16816 gcc_assert (parts.disp);
16817 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
16818 }
16819 else
16820 {
16821 if (!parts.base)
16822 {
16823 if (regno0 != regno2)
16824 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16825 }
16826 else if (!parts.index)
16827 {
16828 if (regno0 != regno1)
16829 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16830 }
16831 else
16832 {
16833 if (regno0 == regno1)
16834 tmp = parts.index;
16835 else if (regno0 == regno2)
16836 tmp = parts.base;
16837 else
16838 {
16839 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16840 tmp = parts.index;
16841 }
16842
16843 ix86_emit_binop (PLUS, mode, operands[0], tmp);
16844 }
16845
16846 if (parts.disp && parts.disp != const0_rtx)
16847 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16848 }
16849 }
16850
16851 /* Return true if it is ok to optimize an ADD operation to an LEA
16852 operation to avoid flag register consumption. For most processors,
16853 ADD is faster than LEA. For processors like Atom, if the
16854 destination register of the LEA holds an actual address which will be
16855 used soon, LEA is better; otherwise ADD is better. */
16856
16857 bool
16858 ix86_lea_for_add_ok (rtx insn, rtx operands[])
16859 {
16860 unsigned int regno0 = true_regnum (operands[0]);
16861 unsigned int regno1 = true_regnum (operands[1]);
16862 unsigned int regno2 = true_regnum (operands[2]);
16863
16864 /* If a = b + c, (a!=b && a!=c), must use lea form. */
16865 if (regno0 != regno1 && regno0 != regno2)
16866 return true;
16867
16868 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16869 return false;
16870
16871 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
16872 }
16873
16874 /* Return true if destination reg of SET_BODY is shift count of
16875 USE_BODY. */
16876
16877 static bool
16878 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
16879 {
16880 rtx set_dest;
16881 rtx shift_rtx;
16882 int i;
16883
16884 /* Retrieve destination of SET_BODY. */
16885 switch (GET_CODE (set_body))
16886 {
16887 case SET:
16888 set_dest = SET_DEST (set_body);
16889 if (!set_dest || !REG_P (set_dest))
16890 return false;
16891 break;
16892 case PARALLEL:
16893 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
16894 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
16895 use_body))
16896 return true;
16897 default:
16898 return false;
16899 break;
16900 }
16901
16902 /* Retrieve shift count of USE_BODY. */
16903 switch (GET_CODE (use_body))
16904 {
16905 case SET:
16906 shift_rtx = XEXP (use_body, 1);
16907 break;
16908 case PARALLEL:
16909 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
16910 if (ix86_dep_by_shift_count_body (set_body,
16911 XVECEXP (use_body, 0, i)))
16912 return true;
16913 default:
16914 return false;
16915 break;
16916 }
16917
16918 if (shift_rtx
16919 && (GET_CODE (shift_rtx) == ASHIFT
16920 || GET_CODE (shift_rtx) == LSHIFTRT
16921 || GET_CODE (shift_rtx) == ASHIFTRT
16922 || GET_CODE (shift_rtx) == ROTATE
16923 || GET_CODE (shift_rtx) == ROTATERT))
16924 {
16925 rtx shift_count = XEXP (shift_rtx, 1);
16926
16927 /* Return true if shift count is dest of SET_BODY. */
16928 if (REG_P (shift_count)
16929 && true_regnum (set_dest) == true_regnum (shift_count))
16930 return true;
16931 }
16932
16933 return false;
16934 }
16935
16936 /* Return true if destination reg of SET_INSN is shift count of
16937 USE_INSN. */
16938
16939 bool
16940 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
16941 {
16942 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
16943 PATTERN (use_insn));
16944 }
16945
16946 /* Return TRUE or FALSE depending on whether the unary operator meets the
16947 appropriate constraints. */
16948
16949 bool
16950 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
16951 enum machine_mode mode ATTRIBUTE_UNUSED,
16952 rtx operands[2] ATTRIBUTE_UNUSED)
16953 {
16954 /* If one of operands is memory, source and destination must match. */
16955 if ((MEM_P (operands[0])
16956 || MEM_P (operands[1]))
16957 && ! rtx_equal_p (operands[0], operands[1]))
16958 return false;
16959 return true;
16960 }
16961
16962 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
16963 are ok, keeping in mind the possible movddup alternative. */
16964
16965 bool
16966 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
16967 {
16968 if (MEM_P (operands[0]))
16969 return rtx_equal_p (operands[0], operands[1 + high]);
16970 if (MEM_P (operands[1]) && MEM_P (operands[2]))
16971 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
16972 return true;
16973 }
16974
16975 /* Post-reload splitter for converting an SF or DFmode value in an
16976 SSE register into an unsigned SImode. */
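/* There is no unsigned SSE conversion, so the split works as follows:
   values below 2^31 go through the signed cvtt directly; for values at or
   above 2^31 the LE mask selects 2^31 to subtract before the conversion,
   and the 0x80000000 bit is xor-ed back into the integer result.  */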
16977
16978 void
16979 ix86_split_convert_uns_si_sse (rtx operands[])
16980 {
16981 enum machine_mode vecmode;
16982 rtx value, large, zero_or_two31, input, two31, x;
16983
16984 large = operands[1];
16985 zero_or_two31 = operands[2];
16986 input = operands[3];
16987 two31 = operands[4];
16988 vecmode = GET_MODE (large);
16989 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
16990
16991 /* Load up the value into the low element. We must ensure that the other
16992 elements are valid floats -- zero is the easiest such value. */
16993 if (MEM_P (input))
16994 {
16995 if (vecmode == V4SFmode)
16996 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
16997 else
16998 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
16999 }
17000 else
17001 {
17002 input = gen_rtx_REG (vecmode, REGNO (input));
17003 emit_move_insn (value, CONST0_RTX (vecmode));
17004 if (vecmode == V4SFmode)
17005 emit_insn (gen_sse_movss (value, value, input));
17006 else
17007 emit_insn (gen_sse2_movsd (value, value, input));
17008 }
17009
17010 emit_move_insn (large, two31);
17011 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17012
17013 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17014 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17015
17016 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17017 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17018
17019 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17020 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17021
17022 large = gen_rtx_REG (V4SImode, REGNO (large));
17023 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17024
17025 x = gen_rtx_REG (V4SImode, REGNO (value));
17026 if (vecmode == V4SFmode)
17027 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17028 else
17029 emit_insn (gen_sse2_cvttpd2dq (x, value));
17030 value = x;
17031
17032 emit_insn (gen_xorv4si3 (value, value, large));
17033 }
17034
17035 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17036 Expects the 64-bit DImode to be supplied in a pair of integral
17037 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17038 -mfpmath=sse, !optimize_size only. */
17039
17040 void
17041 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17042 {
17043 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17044 rtx int_xmm, fp_xmm;
17045 rtx biases, exponents;
17046 rtx x;
17047
17048 int_xmm = gen_reg_rtx (V4SImode);
17049 if (TARGET_INTER_UNIT_MOVES)
17050 emit_insn (gen_movdi_to_sse (int_xmm, input));
17051 else if (TARGET_SSE_SPLIT_REGS)
17052 {
17053 emit_clobber (int_xmm);
17054 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17055 }
17056 else
17057 {
17058 x = gen_reg_rtx (V2DImode);
17059 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17060 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17061 }
17062
17063 x = gen_rtx_CONST_VECTOR (V4SImode,
17064 gen_rtvec (4, GEN_INT (0x43300000UL),
17065 GEN_INT (0x45300000UL),
17066 const0_rtx, const0_rtx));
17067 exponents = validize_mem (force_const_mem (V4SImode, x));
17068
17069 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17070 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17071
17072 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17073 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17074 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17075 (0x1.0p84 + double(fp_value_hi_xmm)).
17076 Note these exponents differ by 32. */
17077
17078 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17079
17080 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17081 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17082 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17083 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17084 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17085 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17086 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17087 biases = validize_mem (force_const_mem (V2DFmode, biases));
17088 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17089
17090 /* Add the upper and lower DFmode values together. */
17091 if (TARGET_SSE3)
17092 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17093 else
17094 {
17095 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17096 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17097 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17098 }
17099
17100 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17101 }
17102
17103 /* Not used, but eases macroization of patterns. */
17104 void
17105 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17106 rtx input ATTRIBUTE_UNUSED)
17107 {
17108 gcc_unreachable ();
17109 }
17110
17111 /* Convert an unsigned SImode value into a DFmode. Only currently used
17112 for SSE, but applicable anywhere. */
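/* The idea: adding -2^31 flips the sign bit, so the unsigned input U is
   reinterpreted as the signed value U - 2^31; after the signed int->double
   conversion, adding 2^31.0 recovers U exactly, since DFmode represents
   every 32-bit integer exactly.  */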
17113
17114 void
17115 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17116 {
17117 REAL_VALUE_TYPE TWO31r;
17118 rtx x, fp;
17119
17120 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17121 NULL, 1, OPTAB_DIRECT);
17122
17123 fp = gen_reg_rtx (DFmode);
17124 emit_insn (gen_floatsidf2 (fp, x));
17125
17126 real_ldexp (&TWO31r, &dconst1, 31);
17127 x = const_double_from_real_value (TWO31r, DFmode);
17128
17129 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17130 if (x != target)
17131 emit_move_insn (target, x);
17132 }
17133
17134 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17135 32-bit mode; otherwise we have a direct convert instruction. */
17136
17137 void
17138 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17139 {
17140 REAL_VALUE_TYPE TWO32r;
17141 rtx fp_lo, fp_hi, x;
17142
17143 fp_lo = gen_reg_rtx (DFmode);
17144 fp_hi = gen_reg_rtx (DFmode);
17145
17146 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17147
17148 real_ldexp (&TWO32r, &dconst1, 32);
17149 x = const_double_from_real_value (TWO32r, DFmode);
17150 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17151
17152 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17153
17154 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17155 0, OPTAB_DIRECT);
17156 if (x != target)
17157 emit_move_insn (target, x);
17158 }
17159
17160 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17161 For x86_32, -mfpmath=sse, !optimize_size only. */
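/* The input is split as input = hi * 2^16 + lo with hi, lo < 2^16; both
   halves and hi * 2^16 are exactly representable in SFmode, so the final
   sum incurs only a single rounding.  */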
17162 void
17163 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17164 {
17165 REAL_VALUE_TYPE ONE16r;
17166 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17167
17168 real_ldexp (&ONE16r, &dconst1, 16);
17169 x = const_double_from_real_value (ONE16r, SFmode);
17170 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
17171 NULL, 0, OPTAB_DIRECT);
17172 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
17173 NULL, 0, OPTAB_DIRECT);
17174 fp_hi = gen_reg_rtx (SFmode);
17175 fp_lo = gen_reg_rtx (SFmode);
17176 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17177 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17178 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17179 0, OPTAB_DIRECT);
17180 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17181 0, OPTAB_DIRECT);
17182 if (!rtx_equal_p (target, fp_hi))
17183 emit_move_insn (target, fp_hi);
17184 }
17185
17186 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17187 a vector of unsigned ints VAL to vector of floats TARGET. */
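/* This is the vector analogue of ix86_expand_convert_uns_sisf_sse above:
   each element is split into 16bit halves, both halves are converted
   exactly, and the result is reassembled as hi * 2^16 + lo.  */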
17188
17189 void
17190 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17191 {
17192 rtx tmp[8];
17193 REAL_VALUE_TYPE TWO16r;
17194 enum machine_mode intmode = GET_MODE (val);
17195 enum machine_mode fltmode = GET_MODE (target);
17196 rtx (*cvt) (rtx, rtx);
17197
17198 if (intmode == V4SImode)
17199 cvt = gen_floatv4siv4sf2;
17200 else
17201 cvt = gen_floatv8siv8sf2;
17202 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17203 tmp[0] = force_reg (intmode, tmp[0]);
17204 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17205 OPTAB_DIRECT);
17206 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17207 NULL_RTX, 1, OPTAB_DIRECT);
17208 tmp[3] = gen_reg_rtx (fltmode);
17209 emit_insn (cvt (tmp[3], tmp[1]));
17210 tmp[4] = gen_reg_rtx (fltmode);
17211 emit_insn (cvt (tmp[4], tmp[2]));
17212 real_ldexp (&TWO16r, &dconst1, 16);
17213 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17214 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17215 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17216 OPTAB_DIRECT);
17217 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17218 OPTAB_DIRECT);
17219 if (tmp[7] != target)
17220 emit_move_insn (target, tmp[7]);
17221 }
17222
17223 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17224 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17225 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17226 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
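/* Concretely: the mask produced by the comparison selects, per element,
   0x1p31 to subtract before the signed conversion and 0x80000000 to xor
   into the integer result via *XORP; elements below 0x1p31 get zero for
   both and convert unchanged.  */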
17227
17228 rtx
17229 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17230 {
17231 REAL_VALUE_TYPE TWO31r;
17232 rtx two31r, tmp[4];
17233 enum machine_mode mode = GET_MODE (val);
17234 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17235 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17236 rtx (*cmp) (rtx, rtx, rtx, rtx);
17237 int i;
17238
17239 for (i = 0; i < 3; i++)
17240 tmp[i] = gen_reg_rtx (mode);
17241 real_ldexp (&TWO31r, &dconst1, 31);
17242 two31r = const_double_from_real_value (TWO31r, scalarmode);
17243 two31r = ix86_build_const_vector (mode, 1, two31r);
17244 two31r = force_reg (mode, two31r);
17245 switch (mode)
17246 {
17247 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17248 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17249 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17250 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17251 default: gcc_unreachable ();
17252 }
17253 tmp[3] = gen_rtx_LE (mode, two31r, val);
17254 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17255 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17256 0, OPTAB_DIRECT);
17257 if (intmode == V4SImode || TARGET_AVX2)
17258 *xorp = expand_simple_binop (intmode, ASHIFT,
17259 gen_lowpart (intmode, tmp[0]),
17260 GEN_INT (31), NULL_RTX, 0,
17261 OPTAB_DIRECT);
17262 else
17263 {
17264 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17265 two31 = ix86_build_const_vector (intmode, 1, two31);
17266 *xorp = expand_simple_binop (intmode, AND,
17267 gen_lowpart (intmode, tmp[0]),
17268 two31, NULL_RTX, 0,
17269 OPTAB_DIRECT);
17270 }
17271 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17272 0, OPTAB_DIRECT);
17273 }
17274
17275 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17276 then replicate the value for all elements of the vector
17277 register. */
17278
17279 rtx
17280 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17281 {
17282 int i, n_elt;
17283 rtvec v;
17284 enum machine_mode scalar_mode;
17285
17286 switch (mode)
17287 {
17288 case V32QImode:
17289 case V16QImode:
17290 case V16HImode:
17291 case V8HImode:
17292 case V8SImode:
17293 case V4SImode:
17294 case V4DImode:
17295 case V2DImode:
17296 gcc_assert (vect);
17297 case V8SFmode:
17298 case V4SFmode:
17299 case V4DFmode:
17300 case V2DFmode:
17301 n_elt = GET_MODE_NUNITS (mode);
17302 v = rtvec_alloc (n_elt);
17303 scalar_mode = GET_MODE_INNER (mode);
17304
17305 RTVEC_ELT (v, 0) = value;
17306
17307 for (i = 1; i < n_elt; ++i)
17308 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17309
17310 return gen_rtx_CONST_VECTOR (mode, v);
17311
17312 default:
17313 gcc_unreachable ();
17314 }
17315 }
17316
17317 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17318 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17319 for an SSE register. If VECT is true, then replicate the mask for
17320 all elements of the vector register. If INVERT is true, then create
17321 a mask excluding the sign bit. */
17322
17323 rtx
17324 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17325 {
17326 enum machine_mode vec_mode, imode;
17327 HOST_WIDE_INT hi, lo;
17328 int shift = 63;
17329 rtx v;
17330 rtx mask;
17331
17332 /* Find the sign bit, sign extended to 2*HWI. */
17333 switch (mode)
17334 {
17335 case V8SImode:
17336 case V4SImode:
17337 case V8SFmode:
17338 case V4SFmode:
17339 vec_mode = mode;
17340 mode = GET_MODE_INNER (mode);
17341 imode = SImode;
17342 lo = 0x80000000, hi = lo < 0;
17343 break;
17344
17345 case V4DImode:
17346 case V2DImode:
17347 case V4DFmode:
17348 case V2DFmode:
17349 vec_mode = mode;
17350 mode = GET_MODE_INNER (mode);
17351 imode = DImode;
17352 if (HOST_BITS_PER_WIDE_INT >= 64)
17353 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17354 else
17355 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17356 break;
17357
17358 case TImode:
17359 case TFmode:
17360 vec_mode = VOIDmode;
17361 if (HOST_BITS_PER_WIDE_INT >= 64)
17362 {
17363 imode = TImode;
17364 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17365 }
17366 else
17367 {
17368 rtvec vec;
17369
17370 imode = DImode;
17371 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17372
17373 if (invert)
17374 {
17375 lo = ~lo, hi = ~hi;
17376 v = constm1_rtx;
17377 }
17378 else
17379 v = const0_rtx;
17380
17381 mask = immed_double_const (lo, hi, imode);
17382
17383 vec = gen_rtvec (2, v, mask);
17384 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17385 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17386
17387 return v;
17388 }
17389 break;
17390
17391 default:
17392 gcc_unreachable ();
17393 }
17394
17395 if (invert)
17396 lo = ~lo, hi = ~hi;
17397
17398 /* Force this value into the low part of a fp vector constant. */
17399 mask = immed_double_const (lo, hi, imode);
17400 mask = gen_lowpart (mode, mask);
17401
17402 if (vec_mode == VOIDmode)
17403 return force_reg (mode, mask);
17404
17405 v = ix86_build_const_vector (vec_mode, vect, mask);
17406 return force_reg (vec_mode, v);
17407 }
17408
17409 /* Generate code for floating point ABS or NEG. */
17410
17411 void
17412 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17413 rtx operands[])
17414 {
17415 rtx mask, set, dst, src;
17416 bool use_sse = false;
17417 bool vector_mode = VECTOR_MODE_P (mode);
17418 enum machine_mode vmode = mode;
17419
17420 if (vector_mode)
17421 use_sse = true;
17422 else if (mode == TFmode)
17423 use_sse = true;
17424 else if (TARGET_SSE_MATH)
17425 {
17426 use_sse = SSE_FLOAT_MODE_P (mode);
17427 if (mode == SFmode)
17428 vmode = V4SFmode;
17429 else if (mode == DFmode)
17430 vmode = V2DFmode;
17431 }
17432
17433 /* NEG and ABS performed with SSE use bitwise mask operations.
17434 Create the appropriate mask now. */
17435 if (use_sse)
17436 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17437 else
17438 mask = NULL_RTX;
17439
17440 dst = operands[0];
17441 src = operands[1];
17442
17443 set = gen_rtx_fmt_e (code, mode, src);
17444 set = gen_rtx_SET (VOIDmode, dst, set);
17445
17446 if (mask)
17447 {
17448 rtx use, clob;
17449 rtvec par;
17450
17451 use = gen_rtx_USE (VOIDmode, mask);
17452 if (vector_mode)
17453 par = gen_rtvec (2, set, use);
17454 else
17455 {
17456 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17457 par = gen_rtvec (3, set, use, clob);
17458 }
17459 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17460 }
17461 else
17462 emit_insn (set);
17463 }
17464
17465 /* Expand a copysign operation. Special case operand 0 being a constant. */
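/* copysign is implemented with bit masks: the result is
   (op0 & ~SIGNMASK) | (op1 & SIGNMASK), where SIGNMASK has only the sign
   bit of each element set.  When op0 is a constant its absolute value is
   taken up front, so only an AND and an OR remain after the split.  */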
17466
17467 void
17468 ix86_expand_copysign (rtx operands[])
17469 {
17470 enum machine_mode mode, vmode;
17471 rtx dest, op0, op1, mask, nmask;
17472
17473 dest = operands[0];
17474 op0 = operands[1];
17475 op1 = operands[2];
17476
17477 mode = GET_MODE (dest);
17478
17479 if (mode == SFmode)
17480 vmode = V4SFmode;
17481 else if (mode == DFmode)
17482 vmode = V2DFmode;
17483 else
17484 vmode = mode;
17485
17486 if (GET_CODE (op0) == CONST_DOUBLE)
17487 {
17488 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17489
17490 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17491 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17492
17493 if (mode == SFmode || mode == DFmode)
17494 {
17495 if (op0 == CONST0_RTX (mode))
17496 op0 = CONST0_RTX (vmode);
17497 else
17498 {
17499 rtx v = ix86_build_const_vector (vmode, false, op0);
17500
17501 op0 = force_reg (vmode, v);
17502 }
17503 }
17504 else if (op0 != CONST0_RTX (mode))
17505 op0 = force_reg (mode, op0);
17506
17507 mask = ix86_build_signbit_mask (vmode, 0, 0);
17508
17509 if (mode == SFmode)
17510 copysign_insn = gen_copysignsf3_const;
17511 else if (mode == DFmode)
17512 copysign_insn = gen_copysigndf3_const;
17513 else
17514 copysign_insn = gen_copysigntf3_const;
17515
17516 emit_insn (copysign_insn (dest, op0, op1, mask));
17517 }
17518 else
17519 {
17520 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17521
17522 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17523 mask = ix86_build_signbit_mask (vmode, 0, 0);
17524
17525 if (mode == SFmode)
17526 copysign_insn = gen_copysignsf3_var;
17527 else if (mode == DFmode)
17528 copysign_insn = gen_copysigndf3_var;
17529 else
17530 copysign_insn = gen_copysigntf3_var;
17531
17532 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17533 }
17534 }
17535
17536 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17537 be a constant, and so has already been expanded into a vector constant. */
17538
17539 void
17540 ix86_split_copysign_const (rtx operands[])
17541 {
17542 enum machine_mode mode, vmode;
17543 rtx dest, op0, mask, x;
17544
17545 dest = operands[0];
17546 op0 = operands[1];
17547 mask = operands[3];
17548
17549 mode = GET_MODE (dest);
17550 vmode = GET_MODE (mask);
17551
17552 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17553 x = gen_rtx_AND (vmode, dest, mask);
17554 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17555
17556 if (op0 != CONST0_RTX (vmode))
17557 {
17558 x = gen_rtx_IOR (vmode, dest, op0);
17559 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17560 }
17561 }
17562
17563 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17564 so we have to do two masks. */
17565
17566 void
17567 ix86_split_copysign_var (rtx operands[])
17568 {
17569 enum machine_mode mode, vmode;
17570 rtx dest, scratch, op0, op1, mask, nmask, x;
17571
17572 dest = operands[0];
17573 scratch = operands[1];
17574 op0 = operands[2];
17575 op1 = operands[3];
17576 nmask = operands[4];
17577 mask = operands[5];
17578
17579 mode = GET_MODE (dest);
17580 vmode = GET_MODE (mask);
17581
17582 if (rtx_equal_p (op0, op1))
17583 {
17584 /* Shouldn't happen often (it's useless, obviously), but when it does
17585 we'd generate incorrect code if we continue below. */
17586 emit_move_insn (dest, op0);
17587 return;
17588 }
17589
17590 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17591 {
17592 gcc_assert (REGNO (op1) == REGNO (scratch));
17593
17594 x = gen_rtx_AND (vmode, scratch, mask);
17595 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17596
17597 dest = mask;
17598 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17599 x = gen_rtx_NOT (vmode, dest);
17600 x = gen_rtx_AND (vmode, x, op0);
17601 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17602 }
17603 else
17604 {
17605 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17606 {
17607 x = gen_rtx_AND (vmode, scratch, mask);
17608 }
17609 else /* alternative 2,4 */
17610 {
17611 gcc_assert (REGNO (mask) == REGNO (scratch));
17612 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17613 x = gen_rtx_AND (vmode, scratch, op1);
17614 }
17615 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17616
17617 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17618 {
17619 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17620 x = gen_rtx_AND (vmode, dest, nmask);
17621 }
17622 else /* alternative 3,4 */
17623 {
17624 gcc_assert (REGNO (nmask) == REGNO (dest));
17625 dest = nmask;
17626 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17627 x = gen_rtx_AND (vmode, dest, op0);
17628 }
17629 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17630 }
17631
17632 x = gen_rtx_IOR (vmode, dest, scratch);
17633 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17634 }
17635
17636 /* Return TRUE or FALSE depending on whether the first SET in INSN
17637 has source and destination with matching CC modes, and that the
17638 CC mode is at least as constrained as REQ_MODE. */
17639
17640 bool
17641 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17642 {
17643 rtx set;
17644 enum machine_mode set_mode;
17645
17646 set = PATTERN (insn);
17647 if (GET_CODE (set) == PARALLEL)
17648 set = XVECEXP (set, 0, 0);
17649 gcc_assert (GET_CODE (set) == SET);
17650 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17651
17652 set_mode = GET_MODE (SET_DEST (set));
17653 switch (set_mode)
17654 {
17655 case CCNOmode:
17656 if (req_mode != CCNOmode
17657 && (req_mode != CCmode
17658 || XEXP (SET_SRC (set), 1) != const0_rtx))
17659 return false;
17660 break;
17661 case CCmode:
17662 if (req_mode == CCGCmode)
17663 return false;
17664 /* FALLTHRU */
17665 case CCGCmode:
17666 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17667 return false;
17668 /* FALLTHRU */
17669 case CCGOCmode:
17670 if (req_mode == CCZmode)
17671 return false;
17672 /* FALLTHRU */
17673 case CCZmode:
17674 break;
17675
17676 case CCAmode:
17677 case CCCmode:
17678 case CCOmode:
17679 case CCSmode:
17680 if (set_mode != req_mode)
17681 return false;
17682 break;
17683
17684 default:
17685 gcc_unreachable ();
17686 }
17687
17688 return GET_MODE (SET_SRC (set)) == set_mode;
17689 }
17690
17691 /* Generate insn patterns to do an integer compare of OPERANDS. */
17692
17693 static rtx
17694 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17695 {
17696 enum machine_mode cmpmode;
17697 rtx tmp, flags;
17698
17699 cmpmode = SELECT_CC_MODE (code, op0, op1);
17700 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17701
17702 /* This is very simple, but making the interface the same as in the
17703 FP case makes the rest of the code easier. */
17704 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17705 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17706
17707 /* Return the test that should be put into the flags user, i.e.
17708 the bcc, scc, or cmov instruction. */
17709 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17710 }
17711
17712 /* Figure out whether to use ordered or unordered fp comparisons.
17713 Return the appropriate mode to use. */
17714
17715 enum machine_mode
17716 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17717 {
17718 /* ??? In order to make all comparisons reversible, we do all comparisons
17719 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17720 all forms of trapping and nontrapping comparisons, we can make inequality
17721 comparisons trapping again, since it results in better code when using
17722 FCOM based compares. */
17723 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17724 }
17725
17726 enum machine_mode
17727 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17728 {
17729 enum machine_mode mode = GET_MODE (op0);
17730
17731 if (SCALAR_FLOAT_MODE_P (mode))
17732 {
17733 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17734 return ix86_fp_compare_mode (code);
17735 }
17736
17737 switch (code)
17738 {
17739 /* Only zero flag is needed. */
17740 case EQ: /* ZF=0 */
17741 case NE: /* ZF!=0 */
17742 return CCZmode;
17743 /* Codes needing carry flag. */
17744 case GEU: /* CF=0 */
17745 case LTU: /* CF=1 */
17746 /* Detect overflow checks. They need just the carry flag. */
17747 if (GET_CODE (op0) == PLUS
17748 && rtx_equal_p (op1, XEXP (op0, 0)))
17749 return CCCmode;
17750 else
17751 return CCmode;
17752 case GTU: /* CF=0 & ZF=0 */
17753 case LEU: /* CF=1 | ZF=1 */
17754 /* Detect overflow checks. They need just the carry flag. */
17755 if (GET_CODE (op0) == MINUS
17756 && rtx_equal_p (op1, XEXP (op0, 0)))
17757 return CCCmode;
17758 else
17759 return CCmode;
17760 /* Codes possibly doable only with sign flag when
17761 comparing against zero. */
17762 case GE: /* SF=OF or SF=0 */
17763 case LT: /* SF<>OF or SF=1 */
17764 if (op1 == const0_rtx)
17765 return CCGOCmode;
17766 else
17767 /* For other cases Carry flag is not required. */
17768 return CCGCmode;
17769 /* Codes doable only with sign flag when comparing
17770 against zero, but we miss the jump instruction for it,
17771 so we need to use relational tests against the overflow
17772 flag, which thus needs to be zero. */
17773 case GT: /* ZF=0 & SF=OF */
17774 case LE: /* ZF=1 | SF<>OF */
17775 if (op1 == const0_rtx)
17776 return CCNOmode;
17777 else
17778 return CCGCmode;
17779 /* The strcmp pattern does (use flags) and combine may ask us for the
17780 proper mode. */
17781 case USE:
17782 return CCmode;
17783 default:
17784 gcc_unreachable ();
17785 }
17786 }
17787
17788 /* Return the fixed registers used for condition codes. */
17789
17790 static bool
17791 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17792 {
17793 *p1 = FLAGS_REG;
17794 *p2 = FPSR_REG;
17795 return true;
17796 }
17797
17798 /* If two condition code modes are compatible, return a condition code
17799 mode which is compatible with both. Otherwise, return
17800 VOIDmode. */
17801
17802 static enum machine_mode
17803 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
17804 {
17805 if (m1 == m2)
17806 return m1;
17807
17808 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
17809 return VOIDmode;
17810
17811 if ((m1 == CCGCmode && m2 == CCGOCmode)
17812 || (m1 == CCGOCmode && m2 == CCGCmode))
17813 return CCGCmode;
17814
17815 switch (m1)
17816 {
17817 default:
17818 gcc_unreachable ();
17819
17820 case CCmode:
17821 case CCGCmode:
17822 case CCGOCmode:
17823 case CCNOmode:
17824 case CCAmode:
17825 case CCCmode:
17826 case CCOmode:
17827 case CCSmode:
17828 case CCZmode:
17829 switch (m2)
17830 {
17831 default:
17832 return VOIDmode;
17833
17834 case CCmode:
17835 case CCGCmode:
17836 case CCGOCmode:
17837 case CCNOmode:
17838 case CCAmode:
17839 case CCCmode:
17840 case CCOmode:
17841 case CCSmode:
17842 case CCZmode:
17843 return CCmode;
17844 }
17845
17846 case CCFPmode:
17847 case CCFPUmode:
17848 /* These are only compatible with themselves, which we already
17849 checked above. */
17850 return VOIDmode;
17851 }
17852 }
17853
17854
17855 /* Return a comparison we can do that is equivalent to
17856 swap_condition (code), apart possibly from orderedness.
17857 But never change orderedness if TARGET_IEEE_FP, returning
17858 UNKNOWN in that case if necessary. */
17859
17860 static enum rtx_code
17861 ix86_fp_swap_condition (enum rtx_code code)
17862 {
17863 switch (code)
17864 {
17865 case GT: /* GTU - CF=0 & ZF=0 */
17866 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
17867 case GE: /* GEU - CF=0 */
17868 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
17869 case UNLT: /* LTU - CF=1 */
17870 return TARGET_IEEE_FP ? UNKNOWN : GT;
17871 case UNLE: /* LEU - CF=1 | ZF=1 */
17872 return TARGET_IEEE_FP ? UNKNOWN : GE;
17873 default:
17874 return swap_condition (code);
17875 }
17876 }
17877
17878 /* Return the cost of comparison CODE using the best strategy for performance.
17879 All the following functions use the number of instructions as the cost metric.
17880 In the future this should be tweaked to compute bytes for optimize_size and
17881 take into account the performance of various instructions on various CPUs. */
17882
17883 static int
17884 ix86_fp_comparison_cost (enum rtx_code code)
17885 {
17886 int arith_cost;
17887
17888 /* The cost of code using bit-twiddling on %ah. */
17889 switch (code)
17890 {
17891 case UNLE:
17892 case UNLT:
17893 case LTGT:
17894 case GT:
17895 case GE:
17896 case UNORDERED:
17897 case ORDERED:
17898 case UNEQ:
17899 arith_cost = 4;
17900 break;
17901 case LT:
17902 case NE:
17903 case EQ:
17904 case UNGE:
17905 arith_cost = TARGET_IEEE_FP ? 5 : 4;
17906 break;
17907 case LE:
17908 case UNGT:
17909 arith_cost = TARGET_IEEE_FP ? 6 : 4;
17910 break;
17911 default:
17912 gcc_unreachable ();
17913 }
17914
17915 switch (ix86_fp_comparison_strategy (code))
17916 {
17917 case IX86_FPCMP_COMI:
17918 return arith_cost > 4 ? 3 : 2;
17919 case IX86_FPCMP_SAHF:
17920 return arith_cost > 4 ? 4 : 3;
17921 default:
17922 return arith_cost;
17923 }
17924 }
17925
17926 /* Return the strategy to use for floating-point comparisons. We assume that
17927 fcomi is always preferable where available, since that is also true when
17928 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
17929
17930 enum ix86_fpcmp_strategy
17931 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
17932 {
17933 /* Do fcomi/sahf based test when profitable. */
17934
17935 if (TARGET_CMOVE)
17936 return IX86_FPCMP_COMI;
17937
17938 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
17939 return IX86_FPCMP_SAHF;
17940
17941 return IX86_FPCMP_ARITH;
17942 }
17943
17944 /* Swap, force into registers, or otherwise massage the two operands
17945 to a fp comparison. The operands are updated in place; the new
17946 comparison code is returned. */
17947
17948 static enum rtx_code
17949 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
17950 {
17951 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
17952 rtx op0 = *pop0, op1 = *pop1;
17953 enum machine_mode op_mode = GET_MODE (op0);
17954 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
17955
17956 /* All of the unordered compare instructions only work on registers.
17957 The same is true of the fcomi compare instructions. The XFmode
17958 compare instructions require registers except when comparing
17959 against zero or when converting operand 1 from fixed point to
17960 floating point. */
17961
17962 if (!is_sse
17963 && (fpcmp_mode == CCFPUmode
17964 || (op_mode == XFmode
17965 && ! (standard_80387_constant_p (op0) == 1
17966 || standard_80387_constant_p (op1) == 1)
17967 && GET_CODE (op1) != FLOAT)
17968 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
17969 {
17970 op0 = force_reg (op_mode, op0);
17971 op1 = force_reg (op_mode, op1);
17972 }
17973 else
17974 {
17975 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
17976 things around if they appear profitable, otherwise force op0
17977 into a register. */
17978
17979 if (standard_80387_constant_p (op0) == 0
17980 || (MEM_P (op0)
17981 && ! (standard_80387_constant_p (op1) == 0
17982 || MEM_P (op1))))
17983 {
17984 enum rtx_code new_code = ix86_fp_swap_condition (code);
17985 if (new_code != UNKNOWN)
17986 {
17987 rtx tmp;
17988 tmp = op0, op0 = op1, op1 = tmp;
17989 code = new_code;
17990 }
17991 }
17992
17993 if (!REG_P (op0))
17994 op0 = force_reg (op_mode, op0);
17995
17996 if (CONSTANT_P (op1))
17997 {
17998 int tmp = standard_80387_constant_p (op1);
17999 if (tmp == 0)
18000 op1 = validize_mem (force_const_mem (op_mode, op1));
18001 else if (tmp == 1)
18002 {
18003 if (TARGET_CMOVE)
18004 op1 = force_reg (op_mode, op1);
18005 }
18006 else
18007 op1 = force_reg (op_mode, op1);
18008 }
18009 }
18010
18011 /* Try to rearrange the comparison to make it cheaper. */
18012 if (ix86_fp_comparison_cost (code)
18013 > ix86_fp_comparison_cost (swap_condition (code))
18014 && (REG_P (op1) || can_create_pseudo_p ()))
18015 {
18016 rtx tmp;
18017 tmp = op0, op0 = op1, op1 = tmp;
18018 code = swap_condition (code);
18019 if (!REG_P (op0))
18020 op0 = force_reg (op_mode, op0);
18021 }
18022
18023 *pop0 = op0;
18024 *pop1 = op1;
18025 return code;
18026 }
18027
18028 /* Convert the comparison codes we use to represent FP comparisons to the
18029 integer code that will result in a proper branch. Return UNKNOWN if no such code
18030 is available. */
18031
18032 enum rtx_code
18033 ix86_fp_compare_code_to_integer (enum rtx_code code)
18034 {
18035 switch (code)
18036 {
18037 case GT:
18038 return GTU;
18039 case GE:
18040 return GEU;
18041 case ORDERED:
18042 case UNORDERED:
18043 return code;
18044 break;
18045 case UNEQ:
18046 return EQ;
18047 break;
18048 case UNLT:
18049 return LTU;
18050 break;
18051 case UNLE:
18052 return LEU;
18053 break;
18054 case LTGT:
18055 return NE;
18056 break;
18057 default:
18058 return UNKNOWN;
18059 }
18060 }
18061
18062 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18063
18064 static rtx
18065 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18066 {
18067 enum machine_mode fpcmp_mode, intcmp_mode;
18068 rtx tmp, tmp2;
18069
18070 fpcmp_mode = ix86_fp_compare_mode (code);
18071 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18072
18073 /* Do fcomi/sahf based test when profitable. */
18074 switch (ix86_fp_comparison_strategy (code))
18075 {
18076 case IX86_FPCMP_COMI:
18077 intcmp_mode = fpcmp_mode;
18078 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18079 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18080 tmp);
18081 emit_insn (tmp);
18082 break;
18083
18084 case IX86_FPCMP_SAHF:
18085 intcmp_mode = fpcmp_mode;
18086 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18087 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18088 tmp);
18089
18090 if (!scratch)
18091 scratch = gen_reg_rtx (HImode);
18092 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18093 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18094 break;
18095
18096 case IX86_FPCMP_ARITH:
18097 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18098 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18099 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18100 if (!scratch)
18101 scratch = gen_reg_rtx (HImode);
18102 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18103
18104 /* In the unordered case, we have to check C2 for NaN's, which
18105 doesn't happen to work out to anything nice combination-wise.
18106 So do some bit twiddling on the value we've got in AH to come
18107 up with an appropriate set of condition codes. */
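/* After fnstsw, AH holds the x87 condition bits: C0 in bit 0 (0x01),
   C2 in bit 2 (0x04) and C3 in bit 6 (0x40).  The masks 0x45, 0x44,
   0x40, 0x05, 0x04 and 0x01 used below are combinations of these.  */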
18108
18109 intcmp_mode = CCNOmode;
18110 switch (code)
18111 {
18112 case GT:
18113 case UNGT:
18114 if (code == GT || !TARGET_IEEE_FP)
18115 {
18116 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18117 code = EQ;
18118 }
18119 else
18120 {
18121 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18122 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18123 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18124 intcmp_mode = CCmode;
18125 code = GEU;
18126 }
18127 break;
18128 case LT:
18129 case UNLT:
18130 if (code == LT && TARGET_IEEE_FP)
18131 {
18132 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18133 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18134 intcmp_mode = CCmode;
18135 code = EQ;
18136 }
18137 else
18138 {
18139 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18140 code = NE;
18141 }
18142 break;
18143 case GE:
18144 case UNGE:
18145 if (code == GE || !TARGET_IEEE_FP)
18146 {
18147 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18148 code = EQ;
18149 }
18150 else
18151 {
18152 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18153 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18154 code = NE;
18155 }
18156 break;
18157 case LE:
18158 case UNLE:
18159 if (code == LE && TARGET_IEEE_FP)
18160 {
18161 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18162 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18163 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18164 intcmp_mode = CCmode;
18165 code = LTU;
18166 }
18167 else
18168 {
18169 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18170 code = NE;
18171 }
18172 break;
18173 case EQ:
18174 case UNEQ:
18175 if (code == EQ && TARGET_IEEE_FP)
18176 {
18177 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18178 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18179 intcmp_mode = CCmode;
18180 code = EQ;
18181 }
18182 else
18183 {
18184 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18185 code = NE;
18186 }
18187 break;
18188 case NE:
18189 case LTGT:
18190 if (code == NE && TARGET_IEEE_FP)
18191 {
18192 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18193 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18194 GEN_INT (0x40)));
18195 code = NE;
18196 }
18197 else
18198 {
18199 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18200 code = EQ;
18201 }
18202 break;
18203
18204 case UNORDERED:
18205 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18206 code = NE;
18207 break;
18208 case ORDERED:
18209 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18210 code = EQ;
18211 break;
18212
18213 default:
18214 gcc_unreachable ();
18215 }
18216 break;
18217
18218 default:
18219       gcc_unreachable ();
18220 }
18221
18222 /* Return the test that should be put into the flags user, i.e.
18223 the bcc, scc, or cmov instruction. */
18224 return gen_rtx_fmt_ee (code, VOIDmode,
18225 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18226 const0_rtx);
18227 }
18228
18229 static rtx
18230 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18231 {
18232 rtx ret;
18233
18234 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18235 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18236
18237 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18238 {
18239 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18240 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18241 }
18242 else
18243 ret = ix86_expand_int_compare (code, op0, op1);
18244
18245 return ret;
18246 }
18247
18248 void
18249 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18250 {
18251 enum machine_mode mode = GET_MODE (op0);
18252 rtx tmp;
18253
18254 switch (mode)
18255 {
18256 case SFmode:
18257 case DFmode:
18258 case XFmode:
18259 case QImode:
18260 case HImode:
18261 case SImode:
18262 simple:
18263 tmp = ix86_expand_compare (code, op0, op1);
18264 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18265 gen_rtx_LABEL_REF (VOIDmode, label),
18266 pc_rtx);
18267 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18268 return;
18269
18270 case DImode:
18271 if (TARGET_64BIT)
18272 goto simple;
18273 case TImode:
18274 /* Expand DImode branch into multiple compare+branch. */
18275 {
18276 rtx lo[2], hi[2], label2;
18277 enum rtx_code code1, code2, code3;
18278 enum machine_mode submode;
18279
18280 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18281 {
18282 tmp = op0, op0 = op1, op1 = tmp;
18283 code = swap_condition (code);
18284 }
18285
18286 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18287 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18288
18289 submode = mode == DImode ? SImode : DImode;
18290
18291 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18292 avoid two branches. This costs one extra insn, so disable when
18293 optimizing for size. */
18294
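	    /* A scalar sketch of the transformation: for 64-bit A and B on
	       a 32-bit target,
		  A == B  <==>  ((A.hi ^ B.hi) | (A.lo ^ B.lo)) == 0
	       so one XOR per half plus an OR feeds a single compare against
	       zero instead of two compare+branch pairs.  */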
18295 if ((code == EQ || code == NE)
18296 && (!optimize_insn_for_size_p ()
18297 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18298 {
18299 rtx xor0, xor1;
18300
18301 xor1 = hi[0];
18302 if (hi[1] != const0_rtx)
18303 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18304 NULL_RTX, 0, OPTAB_WIDEN);
18305
18306 xor0 = lo[0];
18307 if (lo[1] != const0_rtx)
18308 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18309 NULL_RTX, 0, OPTAB_WIDEN);
18310
18311 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18312 NULL_RTX, 0, OPTAB_WIDEN);
18313
18314 ix86_expand_branch (code, tmp, const0_rtx, label);
18315 return;
18316 }
18317
18318 /* Otherwise, if we are doing less-than or greater-or-equal-than,
18319 op1 is a constant and the low word is zero, then we can just
18320 examine the high word. Similarly for low word -1 and
18321 less-or-equal-than or greater-than. */
18322
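	    /* Example (a sketch with unsigned 64-bit A on a 32-bit target):
	       A < 0x500000000 needs only the high words, since the low word
	       of the constant is zero: A < (5:0) <==> A.hi < 5.  Likewise
	       A <= (5:0xffffffff) <==> A.hi <= 5 when the low word is -1.  */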
18323 if (CONST_INT_P (hi[1]))
18324 switch (code)
18325 {
18326 case LT: case LTU: case GE: case GEU:
18327 if (lo[1] == const0_rtx)
18328 {
18329 ix86_expand_branch (code, hi[0], hi[1], label);
18330 return;
18331 }
18332 break;
18333 case LE: case LEU: case GT: case GTU:
18334 if (lo[1] == constm1_rtx)
18335 {
18336 ix86_expand_branch (code, hi[0], hi[1], label);
18337 return;
18338 }
18339 break;
18340 default:
18341 break;
18342 }
18343
18344 /* Otherwise, we need two or three jumps. */
18345
18346 label2 = gen_label_rtx ();
18347
18348 code1 = code;
18349 code2 = swap_condition (code);
18350 code3 = unsigned_condition (code);
18351
18352 switch (code)
18353 {
18354 case LT: case GT: case LTU: case GTU:
18355 break;
18356
18357 case LE: code1 = LT; code2 = GT; break;
18358 case GE: code1 = GT; code2 = LT; break;
18359 case LEU: code1 = LTU; code2 = GTU; break;
18360 case GEU: code1 = GTU; code2 = LTU; break;
18361
18362 case EQ: code1 = UNKNOWN; code2 = NE; break;
18363 case NE: code2 = UNKNOWN; break;
18364
18365 default:
18366 gcc_unreachable ();
18367 }
18368
18369 /*
18370 * a < b =>
18371 * if (hi(a) < hi(b)) goto true;
18372 * if (hi(a) > hi(b)) goto false;
18373 * if (lo(a) < lo(b)) goto true;
18374 * false:
18375 */
18376
18377 if (code1 != UNKNOWN)
18378 ix86_expand_branch (code1, hi[0], hi[1], label);
18379 if (code2 != UNKNOWN)
18380 ix86_expand_branch (code2, hi[0], hi[1], label2);
18381
18382 ix86_expand_branch (code3, lo[0], lo[1], label);
18383
18384 if (code2 != UNKNOWN)
18385 emit_label (label2);
18386 return;
18387 }
18388
18389 default:
18390 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18391 goto simple;
18392 }
18393 }
18394
18395 /* Split branch based on floating point condition. */
18396 void
18397 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18398 rtx target1, rtx target2, rtx tmp, rtx pushed)
18399 {
18400 rtx condition;
18401 rtx i;
18402
18403 if (target2 != pc_rtx)
18404 {
18405 rtx tmp = target2;
18406 code = reverse_condition_maybe_unordered (code);
18407 target2 = target1;
18408 target1 = tmp;
18409 }
18410
18411 condition = ix86_expand_fp_compare (code, op1, op2,
18412 tmp);
18413
18414 /* Remove pushed operand from stack. */
18415 if (pushed)
18416 ix86_free_from_memory (GET_MODE (pushed));
18417
18418 i = emit_jump_insn (gen_rtx_SET
18419 (VOIDmode, pc_rtx,
18420 gen_rtx_IF_THEN_ELSE (VOIDmode,
18421 condition, target1, target2)));
18422 if (split_branch_probability >= 0)
18423 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18424 }
18425
18426 void
18427 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18428 {
18429 rtx ret;
18430
18431 gcc_assert (GET_MODE (dest) == QImode);
18432
18433 ret = ix86_expand_compare (code, op0, op1);
18434 PUT_MODE (ret, QImode);
18435 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18436 }
18437
18438 /* Expand comparison setting or clearing carry flag. Return true when
18439 successful and set pop for the operation. */
18440 static bool
18441 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18442 {
18443 enum machine_mode mode =
18444 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18445
18446 /* Do not handle double-mode compares that go through special path. */
18447 if (mode == (TARGET_64BIT ? TImode : DImode))
18448 return false;
18449
18450 if (SCALAR_FLOAT_MODE_P (mode))
18451 {
18452 rtx compare_op, compare_seq;
18453
18454 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18455
18456 /* Shortcut: following common codes never translate
18457 into carry flag compares. */
18458 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18459 || code == ORDERED || code == UNORDERED)
18460 return false;
18461
18462 /* These comparisons require zero flag; swap operands so they won't. */
18463 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18464 && !TARGET_IEEE_FP)
18465 {
18466 rtx tmp = op0;
18467 op0 = op1;
18468 op1 = tmp;
18469 code = swap_condition (code);
18470 }
18471
18472       /* Try to expand the comparison and verify that we end up with
18473 	 a carry flag based comparison.  This fails only when we decide
18474 	 to expand the comparison using arithmetic, which is not a
18475 	 common scenario.  */
18476 start_sequence ();
18477 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18478 compare_seq = get_insns ();
18479 end_sequence ();
18480
18481 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18482 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18483 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18484 else
18485 code = GET_CODE (compare_op);
18486
18487 if (code != LTU && code != GEU)
18488 return false;
18489
18490 emit_insn (compare_seq);
18491 *pop = compare_op;
18492 return true;
18493 }
18494
18495 if (!INTEGRAL_MODE_P (mode))
18496 return false;
18497
18498 switch (code)
18499 {
18500 case LTU:
18501 case GEU:
18502 break;
18503
18504 /* Convert a==0 into (unsigned)a<1. */
18505 case EQ:
18506 case NE:
18507 if (op1 != const0_rtx)
18508 return false;
18509 op1 = const1_rtx;
18510 code = (code == EQ ? LTU : GEU);
18511 break;
18512
18513       /* Convert a>b into b<a or a>=b+1.  */
18514 case GTU:
18515 case LEU:
18516 if (CONST_INT_P (op1))
18517 {
18518 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18519 /* Bail out on overflow. We still can swap operands but that
18520 would force loading of the constant into register. */
18521 if (op1 == const0_rtx
18522 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18523 return false;
18524 code = (code == GTU ? GEU : LTU);
18525 }
18526 else
18527 {
18528 rtx tmp = op1;
18529 op1 = op0;
18530 op0 = tmp;
18531 code = (code == GTU ? LTU : GEU);
18532 }
18533 break;
18534
18535 /* Convert a>=0 into (unsigned)a<0x80000000. */
18536 case LT:
18537 case GE:
18538 if (mode == DImode || op1 != const0_rtx)
18539 return false;
18540 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18541 code = (code == LT ? GEU : LTU);
18542 break;
18543 case LE:
18544 case GT:
18545 if (mode == DImode || op1 != constm1_rtx)
18546 return false;
18547 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18548 code = (code == LE ? GEU : LTU);
18549 break;
18550
18551 default:
18552 return false;
18553 }
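  /* A few concrete instances of the rewrites above (a sketch, using
     32-bit values):
	a == 0   <==>  (unsigned) a < 1	       (carry set exactly when a == 0)
	a >u 7   <==>  (unsigned) a >= 8       (constant bumped by one)
	a >= 0   <==>  (unsigned) a < 0x80000000
	a <= -1  <==>  (unsigned) a >= 0x80000000  */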
18554 /* Swapping operands may cause constant to appear as first operand. */
18555 if (!nonimmediate_operand (op0, VOIDmode))
18556 {
18557 if (!can_create_pseudo_p ())
18558 return false;
18559 op0 = force_reg (mode, op0);
18560 }
18561 *pop = ix86_expand_compare (code, op0, op1);
18562 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18563 return true;
18564 }
18565
18566 bool
18567 ix86_expand_int_movcc (rtx operands[])
18568 {
18569 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18570 rtx compare_seq, compare_op;
18571 enum machine_mode mode = GET_MODE (operands[0]);
18572 bool sign_bit_compare_p = false;
18573 rtx op0 = XEXP (operands[1], 0);
18574 rtx op1 = XEXP (operands[1], 1);
18575
18576 start_sequence ();
18577 compare_op = ix86_expand_compare (code, op0, op1);
18578 compare_seq = get_insns ();
18579 end_sequence ();
18580
18581 compare_code = GET_CODE (compare_op);
18582
18583 if ((op1 == const0_rtx && (code == GE || code == LT))
18584 || (op1 == constm1_rtx && (code == GT || code == LE)))
18585 sign_bit_compare_p = true;
18586
18587 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18588 HImode insns, we'd be swallowed in word prefix ops. */
18589
18590 if ((mode != HImode || TARGET_FAST_PREFIX)
18591 && (mode != (TARGET_64BIT ? TImode : DImode))
18592 && CONST_INT_P (operands[2])
18593 && CONST_INT_P (operands[3]))
18594 {
18595 rtx out = operands[0];
18596 HOST_WIDE_INT ct = INTVAL (operands[2]);
18597 HOST_WIDE_INT cf = INTVAL (operands[3]);
18598 HOST_WIDE_INT diff;
18599
18600 diff = ct - cf;
18601       /* Sign bit compares are better done using shifts than by using
18602 	 sbb.  */
18603 if (sign_bit_compare_p
18604 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18605 {
18606 /* Detect overlap between destination and compare sources. */
18607 rtx tmp = out;
18608
18609 if (!sign_bit_compare_p)
18610 {
18611 rtx flags;
18612 bool fpcmp = false;
18613
18614 compare_code = GET_CODE (compare_op);
18615
18616 flags = XEXP (compare_op, 0);
18617
18618 if (GET_MODE (flags) == CCFPmode
18619 || GET_MODE (flags) == CCFPUmode)
18620 {
18621 fpcmp = true;
18622 compare_code
18623 = ix86_fp_compare_code_to_integer (compare_code);
18624 }
18625
18626 /* To simplify rest of code, restrict to the GEU case. */
18627 if (compare_code == LTU)
18628 {
18629 HOST_WIDE_INT tmp = ct;
18630 ct = cf;
18631 cf = tmp;
18632 compare_code = reverse_condition (compare_code);
18633 code = reverse_condition (code);
18634 }
18635 else
18636 {
18637 if (fpcmp)
18638 PUT_CODE (compare_op,
18639 reverse_condition_maybe_unordered
18640 (GET_CODE (compare_op)));
18641 else
18642 PUT_CODE (compare_op,
18643 reverse_condition (GET_CODE (compare_op)));
18644 }
18645 diff = ct - cf;
18646
18647 if (reg_overlap_mentioned_p (out, op0)
18648 || reg_overlap_mentioned_p (out, op1))
18649 tmp = gen_reg_rtx (mode);
18650
18651 if (mode == DImode)
18652 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18653 else
18654 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18655 flags, compare_op));
18656 }
18657 else
18658 {
18659 if (code == GT || code == GE)
18660 code = reverse_condition (code);
18661 else
18662 {
18663 HOST_WIDE_INT tmp = ct;
18664 ct = cf;
18665 cf = tmp;
18666 diff = ct - cf;
18667 }
18668 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18669 }
18670
18671 if (diff == 1)
18672 {
18673 /*
18674 * cmpl op0,op1
18675 * sbbl dest,dest
18676 * [addl dest, ct]
18677 *
18678 * Size 5 - 8.
18679 */
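	      /* (The sbb idiom: subtracting a register from itself with
		 borrow yields 0 - 0 - CF, i.e. all-ones when the carry flag
		 is set and zero otherwise, so TMP is already -1 or 0 at
		 this point.)  */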
18680 if (ct)
18681 tmp = expand_simple_binop (mode, PLUS,
18682 tmp, GEN_INT (ct),
18683 copy_rtx (tmp), 1, OPTAB_DIRECT);
18684 }
18685 else if (cf == -1)
18686 {
18687 /*
18688 * cmpl op0,op1
18689 * sbbl dest,dest
18690 * orl $ct, dest
18691 *
18692 * Size 8.
18693 */
18694 tmp = expand_simple_binop (mode, IOR,
18695 tmp, GEN_INT (ct),
18696 copy_rtx (tmp), 1, OPTAB_DIRECT);
18697 }
18698 else if (diff == -1 && ct)
18699 {
18700 /*
18701 * cmpl op0,op1
18702 * sbbl dest,dest
18703 * notl dest
18704 * [addl dest, cf]
18705 *
18706 * Size 8 - 11.
18707 */
18708 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18709 if (cf)
18710 tmp = expand_simple_binop (mode, PLUS,
18711 copy_rtx (tmp), GEN_INT (cf),
18712 copy_rtx (tmp), 1, OPTAB_DIRECT);
18713 }
18714 else
18715 {
18716 /*
18717 * cmpl op0,op1
18718 * sbbl dest,dest
18719 * [notl dest]
18720 * andl cf - ct, dest
18721 * [addl dest, ct]
18722 *
18723 * Size 8 - 11.
18724 */
18725
18726 if (cf == 0)
18727 {
18728 cf = ct;
18729 ct = 0;
18730 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18731 }
18732
18733 tmp = expand_simple_binop (mode, AND,
18734 copy_rtx (tmp),
18735 gen_int_mode (cf - ct, mode),
18736 copy_rtx (tmp), 1, OPTAB_DIRECT);
18737 if (ct)
18738 tmp = expand_simple_binop (mode, PLUS,
18739 copy_rtx (tmp), GEN_INT (ct),
18740 copy_rtx (tmp), 1, OPTAB_DIRECT);
18741 }
18742
18743 if (!rtx_equal_p (tmp, out))
18744 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18745
18746 return true;
18747 }
18748
18749 if (diff < 0)
18750 {
18751 enum machine_mode cmp_mode = GET_MODE (op0);
18752
18753 HOST_WIDE_INT tmp;
18754 tmp = ct, ct = cf, cf = tmp;
18755 diff = -diff;
18756
18757 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18758 {
18759 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18760
18761 	      /* We may be reversing an unordered compare to a normal compare,
18762 		 which is not valid in general (we may convert a non-trapping
18763 		 condition to a trapping one); however, on i386 we currently
18764 		 emit all comparisons unordered.  */
18765 compare_code = reverse_condition_maybe_unordered (compare_code);
18766 code = reverse_condition_maybe_unordered (code);
18767 }
18768 else
18769 {
18770 compare_code = reverse_condition (compare_code);
18771 code = reverse_condition (code);
18772 }
18773 }
18774
18775 compare_code = UNKNOWN;
18776 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18777 && CONST_INT_P (op1))
18778 {
18779 if (op1 == const0_rtx
18780 && (code == LT || code == GE))
18781 compare_code = code;
18782 else if (op1 == constm1_rtx)
18783 {
18784 if (code == LE)
18785 compare_code = LT;
18786 else if (code == GT)
18787 compare_code = GE;
18788 }
18789 }
18790
18791 /* Optimize dest = (op0 < 0) ? -1 : cf. */
18792 if (compare_code != UNKNOWN
18793 && GET_MODE (op0) == GET_MODE (out)
18794 && (cf == -1 || ct == -1))
18795 {
18796 /* If lea code below could be used, only optimize
18797 if it results in a 2 insn sequence. */
18798
18799 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
18800 || diff == 3 || diff == 5 || diff == 9)
18801 || (compare_code == LT && ct == -1)
18802 || (compare_code == GE && cf == -1))
18803 {
18804 /*
18805 * notl op1 (if necessary)
18806 * sarl $31, op1
18807 * orl cf, op1
18808 */
18809 if (ct != -1)
18810 {
18811 cf = ct;
18812 ct = -1;
18813 code = reverse_condition (code);
18814 }
18815
18816 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18817
18818 out = expand_simple_binop (mode, IOR,
18819 out, GEN_INT (cf),
18820 out, 1, OPTAB_DIRECT);
18821 if (out != operands[0])
18822 emit_move_insn (operands[0], out);
18823
18824 return true;
18825 }
18826 }
18827
18828
18829 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
18830 || diff == 3 || diff == 5 || diff == 9)
18831 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
18832 && (mode != DImode
18833 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
18834 {
18835 /*
18836 * xorl dest,dest
18837 * cmpl op1,op2
18838 * setcc dest
18839 * lea cf(dest*(ct-cf)),dest
18840 *
18841 * Size 14.
18842 *
18843 * This also catches the degenerate setcc-only case.
18844 */
18845
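	  /* A sketch with ct = 5, cf = 3 (so diff = 2): setcc leaves 0 or 1
	     in DEST, and "lea 3(,dest,2), dest" then produces 3 or 5 in a
	     single address calculation; any diff of 1, 2, 3, 4, 5, 8 or 9
	     fits the base + index*scale + displacement form.  */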
18846 rtx tmp;
18847 int nops;
18848
18849 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18850
18851 nops = 0;
18852 	  /* On x86_64 the lea instruction operates on Pmode, so we need
18853 	     to get the arithmetic done in the proper mode to match.  */
18854 if (diff == 1)
18855 tmp = copy_rtx (out);
18856 else
18857 {
18858 rtx out1;
18859 out1 = copy_rtx (out);
18860 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
18861 nops++;
18862 if (diff & 1)
18863 {
18864 tmp = gen_rtx_PLUS (mode, tmp, out1);
18865 nops++;
18866 }
18867 }
18868 if (cf != 0)
18869 {
18870 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
18871 nops++;
18872 }
18873 if (!rtx_equal_p (tmp, out))
18874 {
18875 if (nops == 1)
18876 out = force_operand (tmp, copy_rtx (out));
18877 else
18878 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
18879 }
18880 if (!rtx_equal_p (out, operands[0]))
18881 emit_move_insn (operands[0], copy_rtx (out));
18882
18883 return true;
18884 }
18885
18886 /*
18887 * General case: Jumpful:
18888 * xorl dest,dest cmpl op1, op2
18889 * cmpl op1, op2 movl ct, dest
18890 * setcc dest jcc 1f
18891 * decl dest movl cf, dest
18892 * andl (cf-ct),dest 1:
18893 * addl ct,dest
18894 *
18895 * Size 20. Size 14.
18896 *
18897 * This is reasonably steep, but branch mispredict costs are
18898 * high on modern cpus, so consider failing only if optimizing
18899 * for space.
18900 */
18901
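      /* Worked example for the branchless sequence (a sketch with ct = 7,
	 cf = 2): when the condition is true, setcc/decl/andl/addl compute
	 1 -> 0 -> 0 -> 7; when false they compute 0 -> -1 -> (cf - ct) = -5
	 -> 2.  Either way DEST ends up as ct or cf without a branch.  */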
18902 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18903 && BRANCH_COST (optimize_insn_for_speed_p (),
18904 false) >= 2)
18905 {
18906 if (cf == 0)
18907 {
18908 enum machine_mode cmp_mode = GET_MODE (op0);
18909
18910 cf = ct;
18911 ct = 0;
18912
18913 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18914 {
18915 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18916
18917 	      /* We may be reversing an unordered compare to a normal compare,
18918 		 which is not valid in general (we may convert a non-trapping
18919 		 condition to a trapping one); however, on i386 we currently
18920 		 emit all comparisons unordered.  */
18921 code = reverse_condition_maybe_unordered (code);
18922 }
18923 else
18924 {
18925 code = reverse_condition (code);
18926 if (compare_code != UNKNOWN)
18927 compare_code = reverse_condition (compare_code);
18928 }
18929 }
18930
18931 if (compare_code != UNKNOWN)
18932 {
18933 /* notl op1 (if needed)
18934 sarl $31, op1
18935 andl (cf-ct), op1
18936 addl ct, op1
18937
18938 For x < 0 (resp. x <= -1) there will be no notl,
18939 so if possible swap the constants to get rid of the
18940 complement.
18941 True/false will be -1/0 while code below (store flag
18942 followed by decrement) is 0/-1, so the constants need
18943 to be exchanged once more. */
18944
18945 if (compare_code == GE || !cf)
18946 {
18947 code = reverse_condition (code);
18948 compare_code = LT;
18949 }
18950 else
18951 {
18952 HOST_WIDE_INT tmp = cf;
18953 cf = ct;
18954 ct = tmp;
18955 }
18956
18957 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18958 }
18959 else
18960 {
18961 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18962
18963 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
18964 constm1_rtx,
18965 copy_rtx (out), 1, OPTAB_DIRECT);
18966 }
18967
18968 out = expand_simple_binop (mode, AND, copy_rtx (out),
18969 gen_int_mode (cf - ct, mode),
18970 copy_rtx (out), 1, OPTAB_DIRECT);
18971 if (ct)
18972 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
18973 copy_rtx (out), 1, OPTAB_DIRECT);
18974 if (!rtx_equal_p (out, operands[0]))
18975 emit_move_insn (operands[0], copy_rtx (out));
18976
18977 return true;
18978 }
18979 }
18980
18981 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18982 {
18983 /* Try a few things more with specific constants and a variable. */
18984
18985 optab op;
18986 rtx var, orig_out, out, tmp;
18987
18988 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
18989 return false;
18990
18991 /* If one of the two operands is an interesting constant, load a
18992 constant with the above and mask it in with a logical operation. */
18993
18994 if (CONST_INT_P (operands[2]))
18995 {
18996 var = operands[3];
18997 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
18998 operands[3] = constm1_rtx, op = and_optab;
18999 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19000 operands[3] = const0_rtx, op = ior_optab;
19001 else
19002 return false;
19003 }
19004 else if (CONST_INT_P (operands[3]))
19005 {
19006 var = operands[2];
19007 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19008 operands[2] = constm1_rtx, op = and_optab;
19009 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
19010 operands[2] = const0_rtx, op = ior_optab;
19011 else
19012 return false;
19013 }
19014 else
19015 return false;
19016
19017 orig_out = operands[0];
19018 tmp = gen_reg_rtx (mode);
19019 operands[0] = tmp;
19020
19021 /* Recurse to get the constant loaded. */
19022 if (ix86_expand_int_movcc (operands) == 0)
19023 return false;
19024
19025 /* Mask in the interesting variable. */
19026 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19027 OPTAB_WIDEN);
19028 if (!rtx_equal_p (out, orig_out))
19029 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19030
19031 return true;
19032 }
19033
19034 /*
19035 * For comparison with above,
19036 *
19037 * movl cf,dest
19038 * movl ct,tmp
19039 * cmpl op1,op2
19040 * cmovcc tmp,dest
19041 *
19042 * Size 15.
19043 */
19044
19045 if (! nonimmediate_operand (operands[2], mode))
19046 operands[2] = force_reg (mode, operands[2]);
19047 if (! nonimmediate_operand (operands[3], mode))
19048 operands[3] = force_reg (mode, operands[3]);
19049
19050 if (! register_operand (operands[2], VOIDmode)
19051 && (mode == QImode
19052 || ! register_operand (operands[3], VOIDmode)))
19053 operands[2] = force_reg (mode, operands[2]);
19054
19055 if (mode == QImode
19056 && ! register_operand (operands[3], VOIDmode))
19057 operands[3] = force_reg (mode, operands[3]);
19058
19059 emit_insn (compare_seq);
19060 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19061 gen_rtx_IF_THEN_ELSE (mode,
19062 compare_op, operands[2],
19063 operands[3])));
19064 return true;
19065 }
19066
19067 /* Swap, force into registers, or otherwise massage the two operands
19068 to an sse comparison with a mask result. Thus we differ a bit from
19069 ix86_prepare_fp_compare_args which expects to produce a flags result.
19070
19071 The DEST operand exists to help determine whether to commute commutative
19072 operators. The POP0/POP1 operands are updated in place. The new
19073 comparison code is returned, or UNKNOWN if not implementable. */
19074
19075 static enum rtx_code
19076 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19077 rtx *pop0, rtx *pop1)
19078 {
19079 rtx tmp;
19080
19081 switch (code)
19082 {
19083 case LTGT:
19084 case UNEQ:
19085 /* AVX supports all the needed comparisons. */
19086 if (TARGET_AVX)
19087 break;
19088 /* We have no LTGT as an operator. We could implement it with
19089 NE & ORDERED, but this requires an extra temporary. It's
19090 not clear that it's worth it. */
19091 return UNKNOWN;
19092
19093 case LT:
19094 case LE:
19095 case UNGT:
19096 case UNGE:
19097 /* These are supported directly. */
19098 break;
19099
19100 case EQ:
19101 case NE:
19102 case UNORDERED:
19103 case ORDERED:
19104 /* AVX has 3 operand comparisons, no need to swap anything. */
19105 if (TARGET_AVX)
19106 break;
19107 /* For commutative operators, try to canonicalize the destination
19108 operand to be first in the comparison - this helps reload to
19109 avoid extra moves. */
19110 if (!dest || !rtx_equal_p (dest, *pop1))
19111 break;
19112 /* FALLTHRU */
19113
19114 case GE:
19115 case GT:
19116 case UNLE:
19117 case UNLT:
19118 /* These are not supported directly before AVX, and furthermore
19119 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19120 comparison operands to transform into something that is
19121 supported. */
19122 tmp = *pop0;
19123 *pop0 = *pop1;
19124 *pop1 = tmp;
19125 code = swap_condition (code);
19126 break;
19127
19128 default:
19129 gcc_unreachable ();
19130 }
19131
19132 return code;
19133 }
19134
19135 /* Detect conditional moves that exactly match min/max operational
19136 semantics. Note that this is IEEE safe, as long as we don't
19137 interchange the operands.
19138
19139 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19140 and TRUE if the operation is successful and instructions are emitted. */
19141
19142 static bool
19143 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19144 rtx cmp_op1, rtx if_true, rtx if_false)
19145 {
19146 enum machine_mode mode;
19147 bool is_min;
19148 rtx tmp;
19149
19150 if (code == LT)
19151 ;
19152 else if (code == UNGE)
19153 {
19154 tmp = if_true;
19155 if_true = if_false;
19156 if_false = tmp;
19157 }
19158 else
19159 return false;
19160
19161 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19162 is_min = true;
19163 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19164 is_min = false;
19165 else
19166 return false;
19167
19168 mode = GET_MODE (dest);
19169
19170 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19171 but MODE may be a vector mode and thus not appropriate. */
19172 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19173 {
19174 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19175 rtvec v;
19176
19177 if_true = force_reg (mode, if_true);
19178 v = gen_rtvec (2, if_true, if_false);
19179 tmp = gen_rtx_UNSPEC (mode, v, u);
19180 }
19181 else
19182 {
19183 code = is_min ? SMIN : SMAX;
19184 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19185 }
19186
19187 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19188 return true;
19189 }
19190
19191 /* Expand an sse vector comparison. Return the register with the result. */
19192
19193 static rtx
19194 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19195 rtx op_true, rtx op_false)
19196 {
19197 enum machine_mode mode = GET_MODE (dest);
19198 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19199 rtx x;
19200
19201 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19202 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19203 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19204
19205 if (optimize
19206 || reg_overlap_mentioned_p (dest, op_true)
19207 || reg_overlap_mentioned_p (dest, op_false))
19208 dest = gen_reg_rtx (mode);
19209
19210 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19211 if (cmp_mode != mode)
19212 {
19213 x = force_reg (cmp_mode, x);
19214 convert_move (dest, x, false);
19215 }
19216 else
19217 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19218
19219 return dest;
19220 }
19221
19222 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19223 operations. This is used for both scalar and vector conditional moves. */
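
/* In the general fallback below the select is built, roughly, as
     dest = (cmp & op_true) | (~cmp & op_false)
   relying on CMP being an all-ones/all-zeros per-element mask; the earlier
   special cases simply drop whichever AND/ANDN half is known to be zero or
   all-ones, and SSE4.1/AVX blend instructions or XOP's conditional move
   replace the three-instruction sequence when available.  */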
19224
19225 static void
19226 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19227 {
19228 enum machine_mode mode = GET_MODE (dest);
19229 rtx t2, t3, x;
19230
19231 if (vector_all_ones_operand (op_true, mode)
19232 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19233 {
19234 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19235 }
19236 else if (op_false == CONST0_RTX (mode))
19237 {
19238 op_true = force_reg (mode, op_true);
19239 x = gen_rtx_AND (mode, cmp, op_true);
19240 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19241 }
19242 else if (op_true == CONST0_RTX (mode))
19243 {
19244 op_false = force_reg (mode, op_false);
19245 x = gen_rtx_NOT (mode, cmp);
19246 x = gen_rtx_AND (mode, x, op_false);
19247 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19248 }
19249 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19250 {
19251 op_false = force_reg (mode, op_false);
19252 x = gen_rtx_IOR (mode, cmp, op_false);
19253 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19254 }
19255 else if (TARGET_XOP)
19256 {
19257 op_true = force_reg (mode, op_true);
19258
19259 if (!nonimmediate_operand (op_false, mode))
19260 op_false = force_reg (mode, op_false);
19261
19262 emit_insn (gen_rtx_SET (mode, dest,
19263 gen_rtx_IF_THEN_ELSE (mode, cmp,
19264 op_true,
19265 op_false)));
19266 }
19267 else
19268 {
19269 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19270
19271 if (!nonimmediate_operand (op_true, mode))
19272 op_true = force_reg (mode, op_true);
19273
19274 op_false = force_reg (mode, op_false);
19275
19276 switch (mode)
19277 {
19278 case V4SFmode:
19279 if (TARGET_SSE4_1)
19280 gen = gen_sse4_1_blendvps;
19281 break;
19282 case V2DFmode:
19283 if (TARGET_SSE4_1)
19284 gen = gen_sse4_1_blendvpd;
19285 break;
19286 case V16QImode:
19287 case V8HImode:
19288 case V4SImode:
19289 case V2DImode:
19290 if (TARGET_SSE4_1)
19291 {
19292 gen = gen_sse4_1_pblendvb;
19293 dest = gen_lowpart (V16QImode, dest);
19294 op_false = gen_lowpart (V16QImode, op_false);
19295 op_true = gen_lowpart (V16QImode, op_true);
19296 cmp = gen_lowpart (V16QImode, cmp);
19297 }
19298 break;
19299 case V8SFmode:
19300 if (TARGET_AVX)
19301 gen = gen_avx_blendvps256;
19302 break;
19303 case V4DFmode:
19304 if (TARGET_AVX)
19305 gen = gen_avx_blendvpd256;
19306 break;
19307 case V32QImode:
19308 case V16HImode:
19309 case V8SImode:
19310 case V4DImode:
19311 if (TARGET_AVX2)
19312 {
19313 gen = gen_avx2_pblendvb;
19314 dest = gen_lowpart (V32QImode, dest);
19315 op_false = gen_lowpart (V32QImode, op_false);
19316 op_true = gen_lowpart (V32QImode, op_true);
19317 cmp = gen_lowpart (V32QImode, cmp);
19318 }
19319 break;
19320 default:
19321 break;
19322 }
19323
19324 if (gen != NULL)
19325 emit_insn (gen (dest, op_false, op_true, cmp));
19326 else
19327 {
19328 op_true = force_reg (mode, op_true);
19329
19330 t2 = gen_reg_rtx (mode);
19331 if (optimize)
19332 t3 = gen_reg_rtx (mode);
19333 else
19334 t3 = dest;
19335
19336 x = gen_rtx_AND (mode, op_true, cmp);
19337 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19338
19339 x = gen_rtx_NOT (mode, cmp);
19340 x = gen_rtx_AND (mode, x, op_false);
19341 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19342
19343 x = gen_rtx_IOR (mode, t3, t2);
19344 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19345 }
19346 }
19347 }
19348
19349 /* Expand a floating-point conditional move. Return true if successful. */
19350
19351 bool
19352 ix86_expand_fp_movcc (rtx operands[])
19353 {
19354 enum machine_mode mode = GET_MODE (operands[0]);
19355 enum rtx_code code = GET_CODE (operands[1]);
19356 rtx tmp, compare_op;
19357 rtx op0 = XEXP (operands[1], 0);
19358 rtx op1 = XEXP (operands[1], 1);
19359
19360 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19361 {
19362 enum machine_mode cmode;
19363
19364 /* Since we've no cmove for sse registers, don't force bad register
19365 allocation just to gain access to it. Deny movcc when the
19366 comparison mode doesn't match the move mode. */
19367 cmode = GET_MODE (op0);
19368 if (cmode == VOIDmode)
19369 cmode = GET_MODE (op1);
19370 if (cmode != mode)
19371 return false;
19372
19373 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19374 if (code == UNKNOWN)
19375 return false;
19376
19377 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19378 operands[2], operands[3]))
19379 return true;
19380
19381 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19382 operands[2], operands[3]);
19383 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19384 return true;
19385 }
19386
19387 /* The floating point conditional move instructions don't directly
19388 support conditions resulting from a signed integer comparison. */
19389
19390 compare_op = ix86_expand_compare (code, op0, op1);
19391 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19392 {
19393 tmp = gen_reg_rtx (QImode);
19394 ix86_expand_setcc (tmp, code, op0, op1);
19395
19396 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19397 }
19398
19399 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19400 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19401 operands[2], operands[3])));
19402
19403 return true;
19404 }
19405
19406 /* Expand a floating-point vector conditional move; a vcond operation
19407 rather than a movcc operation. */
19408
19409 bool
19410 ix86_expand_fp_vcond (rtx operands[])
19411 {
19412 enum rtx_code code = GET_CODE (operands[3]);
19413 rtx cmp;
19414
19415 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19416 &operands[4], &operands[5]);
19417 if (code == UNKNOWN)
19418 {
19419 rtx temp;
19420 switch (GET_CODE (operands[3]))
19421 {
19422 case LTGT:
19423 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19424 operands[5], operands[0], operands[0]);
19425 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19426 operands[5], operands[1], operands[2]);
19427 code = AND;
19428 break;
19429 case UNEQ:
19430 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19431 operands[5], operands[0], operands[0]);
19432 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19433 operands[5], operands[1], operands[2]);
19434 code = IOR;
19435 break;
19436 default:
19437 gcc_unreachable ();
19438 }
19439 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19440 OPTAB_DIRECT);
19441 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19442 return true;
19443 }
19444
19445 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19446 operands[5], operands[1], operands[2]))
19447 return true;
19448
19449 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19450 operands[1], operands[2]);
19451 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19452 return true;
19453 }
19454
19455 /* Expand a signed/unsigned integral vector conditional move. */
19456
19457 bool
19458 ix86_expand_int_vcond (rtx operands[])
19459 {
19460 enum machine_mode data_mode = GET_MODE (operands[0]);
19461 enum machine_mode mode = GET_MODE (operands[4]);
19462 enum rtx_code code = GET_CODE (operands[3]);
19463 bool negate = false;
19464 rtx x, cop0, cop1;
19465
19466 cop0 = operands[4];
19467 cop1 = operands[5];
19468
19469 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19470 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
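  /* (An arithmetic right shift by the element width minus one replicates
     the sign bit across the element, yielding -1 for negative values and
     0 otherwise, while the logical shift keeps only that bit, yielding
     1 or 0.)  */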
19471 if ((code == LT || code == GE)
19472 && data_mode == mode
19473 && cop1 == CONST0_RTX (mode)
19474 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19475 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19476 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19477 && (GET_MODE_SIZE (data_mode) == 16
19478 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19479 {
19480 rtx negop = operands[2 - (code == LT)];
19481 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19482 if (negop == CONST1_RTX (data_mode))
19483 {
19484 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19485 operands[0], 1, OPTAB_DIRECT);
19486 if (res != operands[0])
19487 emit_move_insn (operands[0], res);
19488 return true;
19489 }
19490 else if (GET_MODE_INNER (data_mode) != DImode
19491 && vector_all_ones_operand (negop, data_mode))
19492 {
19493 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19494 operands[0], 0, OPTAB_DIRECT);
19495 if (res != operands[0])
19496 emit_move_insn (operands[0], res);
19497 return true;
19498 }
19499 }
19500
19501 if (!nonimmediate_operand (cop1, mode))
19502 cop1 = force_reg (mode, cop1);
19503 if (!general_operand (operands[1], data_mode))
19504 operands[1] = force_reg (data_mode, operands[1]);
19505 if (!general_operand (operands[2], data_mode))
19506 operands[2] = force_reg (data_mode, operands[2]);
19507
19508 /* XOP supports all of the comparisons on all 128-bit vector int types. */
19509 if (TARGET_XOP
19510 && (mode == V16QImode || mode == V8HImode
19511 || mode == V4SImode || mode == V2DImode))
19512 ;
19513 else
19514 {
19515 /* Canonicalize the comparison to EQ, GT, GTU. */
19516 switch (code)
19517 {
19518 case EQ:
19519 case GT:
19520 case GTU:
19521 break;
19522
19523 case NE:
19524 case LE:
19525 case LEU:
19526 code = reverse_condition (code);
19527 negate = true;
19528 break;
19529
19530 case GE:
19531 case GEU:
19532 code = reverse_condition (code);
19533 negate = true;
19534 /* FALLTHRU */
19535
19536 case LT:
19537 case LTU:
19538 code = swap_condition (code);
19539 x = cop0, cop0 = cop1, cop1 = x;
19540 break;
19541
19542 default:
19543 gcc_unreachable ();
19544 }
19545
19546 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19547 if (mode == V2DImode)
19548 {
19549 switch (code)
19550 {
19551 case EQ:
19552 /* SSE4.1 supports EQ. */
19553 if (!TARGET_SSE4_1)
19554 return false;
19555 break;
19556
19557 case GT:
19558 case GTU:
19559 /* SSE4.2 supports GT/GTU. */
19560 if (!TARGET_SSE4_2)
19561 return false;
19562 break;
19563
19564 default:
19565 gcc_unreachable ();
19566 }
19567 }
19568
19569 /* Unsigned parallel compare is not supported by the hardware.
19570 Play some tricks to turn this into a signed comparison
19571 against 0. */
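      /* Sketch of the two tricks used below: flipping the sign bit of both
	 operands (subtracting INT_MIN element-wise) turns unsigned order
	 into signed order, e.g. 0xffffffff >u 1 becomes
	 0x7fffffff >s 0x80000001; and for the narrow modes,
	 x >u y <==> (x -us y) != 0, where -us is unsigned saturating
	 subtraction, so an EQ against zero plus the final negation
	 yields GTU.  */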
19572 if (code == GTU)
19573 {
19574 cop0 = force_reg (mode, cop0);
19575
19576 switch (mode)
19577 {
19578 case V8SImode:
19579 case V4DImode:
19580 case V4SImode:
19581 case V2DImode:
19582 {
19583 rtx t1, t2, mask;
19584 rtx (*gen_sub3) (rtx, rtx, rtx);
19585
19586 switch (mode)
19587 {
19588 case V8SImode: gen_sub3 = gen_subv8si3; break;
19589 case V4DImode: gen_sub3 = gen_subv4di3; break;
19590 case V4SImode: gen_sub3 = gen_subv4si3; break;
19591 case V2DImode: gen_sub3 = gen_subv2di3; break;
19592 default:
19593 gcc_unreachable ();
19594 }
19595 /* Subtract (-(INT MAX) - 1) from both operands to make
19596 them signed. */
19597 mask = ix86_build_signbit_mask (mode, true, false);
19598 t1 = gen_reg_rtx (mode);
19599 emit_insn (gen_sub3 (t1, cop0, mask));
19600
19601 t2 = gen_reg_rtx (mode);
19602 emit_insn (gen_sub3 (t2, cop1, mask));
19603
19604 cop0 = t1;
19605 cop1 = t2;
19606 code = GT;
19607 }
19608 break;
19609
19610 case V32QImode:
19611 case V16HImode:
19612 case V16QImode:
19613 case V8HImode:
19614 /* Perform a parallel unsigned saturating subtraction. */
19615 x = gen_reg_rtx (mode);
19616 emit_insn (gen_rtx_SET (VOIDmode, x,
19617 gen_rtx_US_MINUS (mode, cop0, cop1)));
19618
19619 cop0 = x;
19620 cop1 = CONST0_RTX (mode);
19621 code = EQ;
19622 negate = !negate;
19623 break;
19624
19625 default:
19626 gcc_unreachable ();
19627 }
19628 }
19629 }
19630
19631 /* Allow the comparison to be done in one mode, but the movcc to
19632 happen in another mode. */
19633 if (data_mode == mode)
19634 {
19635 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19636 operands[1+negate], operands[2-negate]);
19637 }
19638 else
19639 {
19640 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19641 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19642 code, cop0, cop1,
19643 operands[1+negate], operands[2-negate]);
19644 x = gen_lowpart (data_mode, x);
19645 }
19646
19647 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19648 operands[2-negate]);
19649 return true;
19650 }
19651
19652 /* Expand a variable vector permutation. */
19653
19654 void
19655 ix86_expand_vec_perm (rtx operands[])
19656 {
19657 rtx target = operands[0];
19658 rtx op0 = operands[1];
19659 rtx op1 = operands[2];
19660 rtx mask = operands[3];
19661 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19662 enum machine_mode mode = GET_MODE (op0);
19663 enum machine_mode maskmode = GET_MODE (mask);
19664 int w, e, i;
19665 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19666
19667 /* Number of elements in the vector. */
19668 w = GET_MODE_NUNITS (mode);
19669 e = GET_MODE_UNIT_SIZE (mode);
19670 gcc_assert (w <= 32);
19671
19672 if (TARGET_AVX2)
19673 {
19674 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19675 {
19676 	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19677 	     a constant shuffle operand.  With a tiny bit of effort we can
19678 	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
19679 	     unfortunate but there's no avoiding it.
19680 	     Similarly, V16HImode has no instructions for variable shuffling,
19681 	     while for V32QImode we can, after preparing suitable masks, use
19682 	     vpshufb; vpshufb; vpermq; vpor.  */
19683
19684 if (mode == V16HImode)
19685 {
19686 maskmode = mode = V32QImode;
19687 w = 32;
19688 e = 1;
19689 }
19690 else
19691 {
19692 maskmode = mode = V8SImode;
19693 w = 8;
19694 e = 4;
19695 }
19696 t1 = gen_reg_rtx (maskmode);
19697
19698 /* Replicate the low bits of the V4DImode mask into V8SImode:
19699 mask = { A B C D }
19700 t1 = { A A B B C C D D }. */
19701 for (i = 0; i < w / 2; ++i)
19702 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19703 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19704 vt = force_reg (maskmode, vt);
19705 mask = gen_lowpart (maskmode, mask);
19706 if (maskmode == V8SImode)
19707 emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
19708 else
19709 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19710
19711 	  /* Multiply the shuffle indices by two.  */
19712 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19713 OPTAB_DIRECT);
19714
19715 	  /* Add one to the odd shuffle indices:
19716 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19717 for (i = 0; i < w / 2; ++i)
19718 {
19719 vec[i * 2] = const0_rtx;
19720 vec[i * 2 + 1] = const1_rtx;
19721 }
19722 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19723 vt = force_const_mem (maskmode, vt);
19724 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19725 OPTAB_DIRECT);
19726
19727 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19728 operands[3] = mask = t1;
19729 target = gen_lowpart (mode, target);
19730 op0 = gen_lowpart (mode, op0);
19731 op1 = gen_lowpart (mode, op1);
19732 }
19733
19734 switch (mode)
19735 {
19736 case V8SImode:
19737 /* The VPERMD and VPERMPS instructions already properly ignore
19738 the high bits of the shuffle elements. No need for us to
19739 perform an AND ourselves. */
19740 if (one_operand_shuffle)
19741 emit_insn (gen_avx2_permvarv8si (target, mask, op0));
19742 else
19743 {
19744 t1 = gen_reg_rtx (V8SImode);
19745 t2 = gen_reg_rtx (V8SImode);
19746 emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
19747 emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
19748 goto merge_two;
19749 }
19750 return;
19751
19752 case V8SFmode:
19753 mask = gen_lowpart (V8SFmode, mask);
19754 if (one_operand_shuffle)
19755 emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
19756 else
19757 {
19758 t1 = gen_reg_rtx (V8SFmode);
19759 t2 = gen_reg_rtx (V8SFmode);
19760 emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
19761 emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
19762 goto merge_two;
19763 }
19764 return;
19765
19766 case V4SImode:
19767 /* By combining the two 128-bit input vectors into one 256-bit
19768 input vector, we can use VPERMD and VPERMPS for the full
19769 two-operand shuffle. */
19770 t1 = gen_reg_rtx (V8SImode);
19771 t2 = gen_reg_rtx (V8SImode);
19772 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19773 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19774 emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
19775 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19776 return;
19777
19778 case V4SFmode:
19779 t1 = gen_reg_rtx (V8SFmode);
19780 t2 = gen_reg_rtx (V8SFmode);
19781 mask = gen_lowpart (V4SFmode, mask);
19782 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19783 emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
19784 emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
19785 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19786 return;
19787
19788 case V32QImode:
19789 t1 = gen_reg_rtx (V32QImode);
19790 t2 = gen_reg_rtx (V32QImode);
19791 t3 = gen_reg_rtx (V32QImode);
19792 vt2 = GEN_INT (128);
19793 for (i = 0; i < 32; i++)
19794 vec[i] = vt2;
19795 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19796 vt = force_reg (V32QImode, vt);
19797 for (i = 0; i < 32; i++)
19798 vec[i] = i < 16 ? vt2 : const0_rtx;
19799 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19800 vt2 = force_reg (V32QImode, vt2);
19801 /* From mask create two adjusted masks, which contain the same
19802 bits as mask in the low 7 bits of each vector element.
19803 The first mask will have the most significant bit clear
19804 if it requests element from the same 128-bit lane
19805 and MSB set if it requests element from the other 128-bit lane.
19806 The second mask will have the opposite values of the MSB,
19807 and additionally will have its 128-bit lanes swapped.
19808 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
19809 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
19810 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
19811 stands for other 12 bytes. */
19812 	  /* The bit that says whether an element is from the same lane or
19813 	     the other lane is bit 4, so shift it up by 3 to the MSB position.  */
19814 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
19815 gen_lowpart (V4DImode, mask),
19816 GEN_INT (3)));
19817 /* Clear MSB bits from the mask just in case it had them set. */
19818 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
19819 /* After this t1 will have MSB set for elements from other lane. */
19820 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
19821 /* Clear bits other than MSB. */
19822 emit_insn (gen_andv32qi3 (t1, t1, vt));
19823 /* Or in the lower bits from mask into t3. */
19824 emit_insn (gen_iorv32qi3 (t3, t1, t2));
19825 /* And invert MSB bits in t1, so MSB is set for elements from the same
19826 lane. */
19827 emit_insn (gen_xorv32qi3 (t1, t1, vt));
19828 /* Swap 128-bit lanes in t3. */
19829 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19830 gen_lowpart (V4DImode, t3),
19831 const2_rtx, GEN_INT (3),
19832 const0_rtx, const1_rtx));
19833 /* And or in the lower bits from mask into t1. */
19834 emit_insn (gen_iorv32qi3 (t1, t1, t2));
19835 if (one_operand_shuffle)
19836 {
19837 /* Each of these shuffles will put 0s in places where
19838 element from the other 128-bit lane is needed, otherwise
19839 will shuffle in the requested value. */
19840 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
19841 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
19842 /* For t3 the 128-bit lanes are swapped again. */
19843 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19844 gen_lowpart (V4DImode, t3),
19845 const2_rtx, GEN_INT (3),
19846 const0_rtx, const1_rtx));
19847 /* And oring both together leads to the result. */
19848 emit_insn (gen_iorv32qi3 (target, t1, t3));
19849 return;
19850 }
19851
19852 t4 = gen_reg_rtx (V32QImode);
19853 	  /* Similar to the one_operand_shuffle code above, just repeated
19854 	     twice, once for each operand.  The merge_two: code below will
19855 	     merge the two results together.  */
19856 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
19857 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
19858 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
19859 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
19860 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
19861 gen_lowpart (V4DImode, t4),
19862 const2_rtx, GEN_INT (3),
19863 const0_rtx, const1_rtx));
19864 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19865 gen_lowpart (V4DImode, t3),
19866 const2_rtx, GEN_INT (3),
19867 const0_rtx, const1_rtx));
19868 emit_insn (gen_iorv32qi3 (t4, t2, t4));
19869 emit_insn (gen_iorv32qi3 (t3, t1, t3));
19870 t1 = t4;
19871 t2 = t3;
19872 goto merge_two;
19873
19874 default:
19875 gcc_assert (GET_MODE_SIZE (mode) <= 16);
19876 break;
19877 }
19878 }
19879
19880 if (TARGET_XOP)
19881 {
19882 /* The XOP VPPERM insn supports three inputs. By ignoring the
19883 one_operand_shuffle special case, we avoid creating another
19884 set of constant vectors in memory. */
19885 one_operand_shuffle = false;
19886
19887 /* mask = mask & {2*w-1, ...} */
19888 vt = GEN_INT (2*w - 1);
19889 }
19890 else
19891 {
19892 /* mask = mask & {w-1, ...} */
19893 vt = GEN_INT (w - 1);
19894 }
19895
19896 for (i = 0; i < w; i++)
19897 vec[i] = vt;
19898 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19899 mask = expand_simple_binop (maskmode, AND, mask, vt,
19900 NULL_RTX, 0, OPTAB_DIRECT);
19901
19902 /* For non-QImode operations, convert the word permutation control
19903 into a byte permutation control. */
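  /* Sketch for V4SImode (e == 4): a word index of 3 first becomes the byte
     offset 12 after the shift, is replicated to {12,12,12,12} by the
     pshufb/pperm step, and finally becomes the byte indices {12,13,14,15}
     after adding {0,1,2,3}.  */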
19904 if (mode != V16QImode)
19905 {
19906 mask = expand_simple_binop (maskmode, ASHIFT, mask,
19907 GEN_INT (exact_log2 (e)),
19908 NULL_RTX, 0, OPTAB_DIRECT);
19909
19910 /* Convert mask to vector of chars. */
19911 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
19912
19913 /* Replicate each of the input bytes into byte positions:
19914 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
19915 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
19916 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
19917 for (i = 0; i < 16; ++i)
19918 vec[i] = GEN_INT (i/e * e);
19919 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
19920 vt = force_const_mem (V16QImode, vt);
19921 if (TARGET_XOP)
19922 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
19923 else
19924 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
19925
19926       /* Convert it into the byte positions by doing
19927 	 mask = mask + {0,1,..,e-1, 0,1,..,e-1, ...} where e == 16/w.  */
19928 for (i = 0; i < 16; ++i)
19929 vec[i] = GEN_INT (i % e);
19930 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
19931 vt = force_const_mem (V16QImode, vt);
19932 emit_insn (gen_addv16qi3 (mask, mask, vt));
19933 }
19934
19935 /* The actual shuffle operations all operate on V16QImode. */
19936 op0 = gen_lowpart (V16QImode, op0);
19937 op1 = gen_lowpart (V16QImode, op1);
19938 target = gen_lowpart (V16QImode, target);
19939
19940 if (TARGET_XOP)
19941 {
19942 emit_insn (gen_xop_pperm (target, op0, op1, mask));
19943 }
19944 else if (one_operand_shuffle)
19945 {
19946 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
19947 }
19948 else
19949 {
19950 rtx xops[6];
19951 bool ok;
19952
19953 /* Shuffle the two input vectors independently. */
19954 t1 = gen_reg_rtx (V16QImode);
19955 t2 = gen_reg_rtx (V16QImode);
19956 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
19957 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
19958
19959 merge_two:
19960 /* Then merge them together. The key is whether any given control
19961 element contained a bit set that indicates the second word. */
19962 mask = operands[3];
19963 vt = GEN_INT (w);
19964 if (maskmode == V2DImode && !TARGET_SSE4_1)
19965 {
19966 	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
19967 	     more shuffle to convert the V2DI input mask into a V4SI
19968 	     input mask.  At that point the masking done by
19969 	     ix86_expand_int_vcond will work as desired.  */
19970 rtx t3 = gen_reg_rtx (V4SImode);
19971 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
19972 const0_rtx, const0_rtx,
19973 const2_rtx, const2_rtx));
19974 mask = t3;
19975 maskmode = V4SImode;
19976 e = w = 4;
19977 }
19978
19979 for (i = 0; i < w; i++)
19980 vec[i] = vt;
19981 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19982 vt = force_reg (maskmode, vt);
19983 mask = expand_simple_binop (maskmode, AND, mask, vt,
19984 NULL_RTX, 0, OPTAB_DIRECT);
19985
19986 xops[0] = gen_lowpart (mode, operands[0]);
19987 xops[1] = gen_lowpart (mode, t2);
19988 xops[2] = gen_lowpart (mode, t1);
19989 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
19990 xops[4] = mask;
19991 xops[5] = vt;
19992 ok = ix86_expand_int_vcond (xops);
19993 gcc_assert (ok);
19994 }
19995 }
19996
19997 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
19998 true if we should do zero extension, else sign extension. HIGH_P is
19999 true if we want the N/2 high elements, else the low elements. */
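
/* Two strategies are used below (a sketch): with SSE4.1/AVX2 the pmovzx /
   pmovsx family extends the low (or shifted-down high) half directly,
   while the pre-SSE4.1 fallback interleaves the source with either a zero
   vector (zero extension) or with a "greater than zero" comparison mask of
   the source, which is exactly the sign bit replicated across each element
   (sign extension).  */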
20000
20001 void
20002 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
20003 {
20004 enum machine_mode imode = GET_MODE (operands[1]);
20005 rtx tmp, dest;
20006
20007 if (TARGET_SSE4_1)
20008 {
20009 rtx (*unpack)(rtx, rtx);
20010 rtx (*extract)(rtx, rtx) = NULL;
20011 enum machine_mode halfmode = BLKmode;
20012
20013 switch (imode)
20014 {
20015 case V32QImode:
20016 if (unsigned_p)
20017 unpack = gen_avx2_zero_extendv16qiv16hi2;
20018 else
20019 unpack = gen_avx2_sign_extendv16qiv16hi2;
20020 halfmode = V16QImode;
20021 extract
20022 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20023 break;
20024 case V16HImode:
20025 if (unsigned_p)
20026 unpack = gen_avx2_zero_extendv8hiv8si2;
20027 else
20028 unpack = gen_avx2_sign_extendv8hiv8si2;
20029 halfmode = V8HImode;
20030 extract
20031 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20032 break;
20033 case V8SImode:
20034 if (unsigned_p)
20035 unpack = gen_avx2_zero_extendv4siv4di2;
20036 else
20037 unpack = gen_avx2_sign_extendv4siv4di2;
20038 halfmode = V4SImode;
20039 extract
20040 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20041 break;
20042 case V16QImode:
20043 if (unsigned_p)
20044 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20045 else
20046 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20047 break;
20048 case V8HImode:
20049 if (unsigned_p)
20050 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20051 else
20052 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20053 break;
20054 case V4SImode:
20055 if (unsigned_p)
20056 unpack = gen_sse4_1_zero_extendv2siv2di2;
20057 else
20058 unpack = gen_sse4_1_sign_extendv2siv2di2;
20059 break;
20060 default:
20061 gcc_unreachable ();
20062 }
20063
20064 if (GET_MODE_SIZE (imode) == 32)
20065 {
20066 tmp = gen_reg_rtx (halfmode);
20067 emit_insn (extract (tmp, operands[1]));
20068 }
20069 else if (high_p)
20070 {
20071 /* Shift higher 8 bytes to lower 8 bytes. */
20072 tmp = gen_reg_rtx (imode);
20073 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20074 gen_lowpart (V1TImode, operands[1]),
20075 GEN_INT (64)));
20076 }
20077 else
20078 tmp = operands[1];
20079
20080 emit_insn (unpack (operands[0], tmp));
20081 }
20082 else
20083 {
20084 rtx (*unpack)(rtx, rtx, rtx);
20085
20086 switch (imode)
20087 {
20088 case V16QImode:
20089 if (high_p)
20090 unpack = gen_vec_interleave_highv16qi;
20091 else
20092 unpack = gen_vec_interleave_lowv16qi;
20093 break;
20094 case V8HImode:
20095 if (high_p)
20096 unpack = gen_vec_interleave_highv8hi;
20097 else
20098 unpack = gen_vec_interleave_lowv8hi;
20099 break;
20100 case V4SImode:
20101 if (high_p)
20102 unpack = gen_vec_interleave_highv4si;
20103 else
20104 unpack = gen_vec_interleave_lowv4si;
20105 break;
20106 default:
20107 gcc_unreachable ();
20108 }
20109
20110 dest = gen_lowpart (imode, operands[0]);
20111
20112 if (unsigned_p)
20113 tmp = force_reg (imode, CONST0_RTX (imode));
20114 else
20115 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20116 operands[1], pc_rtx, pc_rtx);
20117
20118 emit_insn (unpack (dest, operands[1], tmp));
20119 }
20120 }
20121
20122 /* Expand conditional increment or decrement using adc/sbb instructions.
20123 The default case using setcc followed by a conditional move can be
20124 done by generic code. */
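/* For instance (an illustrative sketch, not from the sources): to expand
   "x = x + (a < b)" with an unsigned comparison, the compare sets the carry
   flag exactly when a < b, and a single adc with a zero immediate then adds
   that carry into x, avoiding the setcc and cmov sequence.  */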
20125 bool
20126 ix86_expand_int_addcc (rtx operands[])
20127 {
20128 enum rtx_code code = GET_CODE (operands[1]);
20129 rtx flags;
20130 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20131 rtx compare_op;
20132 rtx val = const0_rtx;
20133 bool fpcmp = false;
20134 enum machine_mode mode;
20135 rtx op0 = XEXP (operands[1], 0);
20136 rtx op1 = XEXP (operands[1], 1);
20137
20138 if (operands[3] != const1_rtx
20139 && operands[3] != constm1_rtx)
20140 return false;
20141 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20142 return false;
20143 code = GET_CODE (compare_op);
20144
20145 flags = XEXP (compare_op, 0);
20146
20147 if (GET_MODE (flags) == CCFPmode
20148 || GET_MODE (flags) == CCFPUmode)
20149 {
20150 fpcmp = true;
20151 code = ix86_fp_compare_code_to_integer (code);
20152 }
20153
20154 if (code != LTU)
20155 {
20156 val = constm1_rtx;
20157 if (fpcmp)
20158 PUT_CODE (compare_op,
20159 reverse_condition_maybe_unordered
20160 (GET_CODE (compare_op)));
20161 else
20162 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20163 }
20164
20165 mode = GET_MODE (operands[0]);
20166
20167 /* Construct either adc or sbb insn. */
20168 if ((code == LTU) == (operands[3] == constm1_rtx))
20169 {
20170 switch (mode)
20171 {
20172 case QImode:
20173 insn = gen_subqi3_carry;
20174 break;
20175 case HImode:
20176 insn = gen_subhi3_carry;
20177 break;
20178 case SImode:
20179 insn = gen_subsi3_carry;
20180 break;
20181 case DImode:
20182 insn = gen_subdi3_carry;
20183 break;
20184 default:
20185 gcc_unreachable ();
20186 }
20187 }
20188 else
20189 {
20190 switch (mode)
20191 {
20192 case QImode:
20193 insn = gen_addqi3_carry;
20194 break;
20195 case HImode:
20196 insn = gen_addhi3_carry;
20197 break;
20198 case SImode:
20199 insn = gen_addsi3_carry;
20200 break;
20201 case DImode:
20202 insn = gen_adddi3_carry;
20203 break;
20204 default:
20205 gcc_unreachable ();
20206 }
20207 }
20208 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20209
20210 return true;
20211 }
20212
20213
20214 /* Split OPERAND into half-mode PARTS. Similar to split_double_mode,
20215 but works for floating point parameters and nonoffsettable memories.
20216 For pushes, it returns just stack offsets; the values will be saved
20217 in the right order. At most four parts are generated. */
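/* As a rough example (illustrative only): on a 32-bit target a DFmode value
   held in registers splits into two SImode registers, an XFmode value yields
   three SImode parts, and a CONST_DOUBLE is decomposed into the equivalent
   integer immediates.  */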
20218
20219 static int
20220 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20221 {
20222 int size;
20223
20224 if (!TARGET_64BIT)
20225 size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20226 else
20227 size = (GET_MODE_SIZE (mode) + 4) / 8;
20228
20229 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20230 gcc_assert (size >= 2 && size <= 4);
20231
20232 /* Optimize constant pool references into immediates. This is used by fp
20233 moves, which force all constants to memory to allow combining. */
20234 if (MEM_P (operand) && MEM_READONLY_P (operand))
20235 {
20236 rtx tmp = maybe_get_pool_constant (operand);
20237 if (tmp)
20238 operand = tmp;
20239 }
20240
20241 if (MEM_P (operand) && !offsettable_memref_p (operand))
20242 {
20243 /* The only non-offsettable memories we handle are pushes. */
20244 int ok = push_operand (operand, VOIDmode);
20245
20246 gcc_assert (ok);
20247
20248 operand = copy_rtx (operand);
20249 PUT_MODE (operand, Pmode);
20250 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20251 return size;
20252 }
20253
20254 if (GET_CODE (operand) == CONST_VECTOR)
20255 {
20256 enum machine_mode imode = int_mode_for_mode (mode);
20257 /* Caution: if we looked through a constant pool memory above,
20258 the operand may actually have a different mode now. That's
20259 ok, since we want to pun this all the way back to an integer. */
20260 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20261 gcc_assert (operand != NULL);
20262 mode = imode;
20263 }
20264
20265 if (!TARGET_64BIT)
20266 {
20267 if (mode == DImode)
20268 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20269 else
20270 {
20271 int i;
20272
20273 if (REG_P (operand))
20274 {
20275 gcc_assert (reload_completed);
20276 for (i = 0; i < size; i++)
20277 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20278 }
20279 else if (offsettable_memref_p (operand))
20280 {
20281 operand = adjust_address (operand, SImode, 0);
20282 parts[0] = operand;
20283 for (i = 1; i < size; i++)
20284 parts[i] = adjust_address (operand, SImode, 4 * i);
20285 }
20286 else if (GET_CODE (operand) == CONST_DOUBLE)
20287 {
20288 REAL_VALUE_TYPE r;
20289 long l[4];
20290
20291 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20292 switch (mode)
20293 {
20294 case TFmode:
20295 real_to_target (l, &r, mode);
20296 parts[3] = gen_int_mode (l[3], SImode);
20297 parts[2] = gen_int_mode (l[2], SImode);
20298 break;
20299 case XFmode:
20300 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20301 parts[2] = gen_int_mode (l[2], SImode);
20302 break;
20303 case DFmode:
20304 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20305 break;
20306 default:
20307 gcc_unreachable ();
20308 }
20309 parts[1] = gen_int_mode (l[1], SImode);
20310 parts[0] = gen_int_mode (l[0], SImode);
20311 }
20312 else
20313 gcc_unreachable ();
20314 }
20315 }
20316 else
20317 {
20318 if (mode == TImode)
20319 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20320 if (mode == XFmode || mode == TFmode)
20321 {
20322 enum machine_mode upper_mode = mode == XFmode ? SImode : DImode;
20323 if (REG_P (operand))
20324 {
20325 gcc_assert (reload_completed);
20326 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20327 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20328 }
20329 else if (offsettable_memref_p (operand))
20330 {
20331 operand = adjust_address (operand, DImode, 0);
20332 parts[0] = operand;
20333 parts[1] = adjust_address (operand, upper_mode, 8);
20334 }
20335 else if (GET_CODE (operand) == CONST_DOUBLE)
20336 {
20337 REAL_VALUE_TYPE r;
20338 long l[4];
20339
20340 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20341 real_to_target (l, &r, mode);
20342
20343 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20344 if (HOST_BITS_PER_WIDE_INT >= 64)
20345 parts[0]
20346 = gen_int_mode
20347 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20348 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20349 DImode);
20350 else
20351 parts[0] = immed_double_const (l[0], l[1], DImode);
20352
20353 if (upper_mode == SImode)
20354 parts[1] = gen_int_mode (l[2], SImode);
20355 else if (HOST_BITS_PER_WIDE_INT >= 64)
20356 parts[1]
20357 = gen_int_mode
20358 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20359 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20360 DImode);
20361 else
20362 parts[1] = immed_double_const (l[2], l[3], DImode);
20363 }
20364 else
20365 gcc_unreachable ();
20366 }
20367 }
20368
20369 return size;
20370 }
20371
20372 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20373 Operands 0 and 1 are the destination and source; they are split into
20374 half-mode parts and the moves of the individual parts are emitted
20375 here, in an order that avoids clobbering a source part too early. */
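/* A minimal illustration (not from the sources): a DImode register move on
   a 32-bit target is split here into two SImode moves of the low and high
   halves, ordered so that an overlapping destination register does not
   clobber a source half before it has been read.  */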
20376
20377 void
20378 ix86_split_long_move (rtx operands[])
20379 {
20380 rtx part[2][4];
20381 int nparts, i, j;
20382 int push = 0;
20383 int collisions = 0;
20384 enum machine_mode mode = GET_MODE (operands[0]);
20385 bool collisionparts[4];
20386
20387 /* The DFmode expanders may ask us to move a double.
20388 For a 64-bit target this is a single move. By hiding that fact
20389 here we simplify the i386.md splitters. */
20390 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20391 {
20392 /* Optimize constant pool references into immediates. This is used by
20393 fp moves, which force all constants to memory to allow combining. */
20394
20395 if (MEM_P (operands[1])
20396 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20397 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20398 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20399 if (push_operand (operands[0], VOIDmode))
20400 {
20401 operands[0] = copy_rtx (operands[0]);
20402 PUT_MODE (operands[0], Pmode);
20403 }
20404 else
20405 operands[0] = gen_lowpart (DImode, operands[0]);
20406 operands[1] = gen_lowpart (DImode, operands[1]);
20407 emit_move_insn (operands[0], operands[1]);
20408 return;
20409 }
20410
20411 /* The only non-offsettable memory we handle is push. */
20412 if (push_operand (operands[0], VOIDmode))
20413 push = 1;
20414 else
20415 gcc_assert (!MEM_P (operands[0])
20416 || offsettable_memref_p (operands[0]));
20417
20418 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20419 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20420
20421 /* When emitting a push, take care of source operands on the stack. */
20422 if (push && MEM_P (operands[1])
20423 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20424 {
20425 rtx src_base = XEXP (part[1][nparts - 1], 0);
20426
20427 /* Compensate for the stack decrement by 4. */
20428 if (!TARGET_64BIT && nparts == 3
20429 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20430 src_base = plus_constant (src_base, 4);
20431
20432 /* src_base refers to the stack pointer and is
20433 automatically decreased by emitted push. */
20434 for (i = 0; i < nparts; i++)
20435 part[1][i] = change_address (part[1][i],
20436 GET_MODE (part[1][i]), src_base);
20437 }
20438
20439 /* We need to do the copy in the right order in case an address register
20440 of the source overlaps the destination. */
20441 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20442 {
20443 rtx tmp;
20444
20445 for (i = 0; i < nparts; i++)
20446 {
20447 collisionparts[i]
20448 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20449 if (collisionparts[i])
20450 collisions++;
20451 }
20452
20453 /* Collision in the middle part can be handled by reordering. */
20454 if (collisions == 1 && nparts == 3 && collisionparts [1])
20455 {
20456 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20457 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20458 }
20459 else if (collisions == 1
20460 && nparts == 4
20461 && (collisionparts [1] || collisionparts [2]))
20462 {
20463 if (collisionparts [1])
20464 {
20465 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20466 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20467 }
20468 else
20469 {
20470 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20471 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20472 }
20473 }
20474
20475 /* If there are more collisions, we can't handle them by reordering.
20476 Do an lea into the last part and use only one colliding move. */
20477 else if (collisions > 1)
20478 {
20479 rtx base;
20480
20481 collisions = 1;
20482
20483 base = part[0][nparts - 1];
20484
20485 /* Handle the case when the last part isn't valid for lea.
20486 Happens in 64-bit mode storing the 12-byte XFmode. */
20487 if (GET_MODE (base) != Pmode)
20488 base = gen_rtx_REG (Pmode, REGNO (base));
20489
20490 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20491 part[1][0] = replace_equiv_address (part[1][0], base);
20492 for (i = 1; i < nparts; i++)
20493 {
20494 tmp = plus_constant (base, UNITS_PER_WORD * i);
20495 part[1][i] = replace_equiv_address (part[1][i], tmp);
20496 }
20497 }
20498 }
20499
20500 if (push)
20501 {
20502 if (!TARGET_64BIT)
20503 {
20504 if (nparts == 3)
20505 {
20506 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20507 emit_insn (gen_addsi3 (stack_pointer_rtx,
20508 stack_pointer_rtx, GEN_INT (-4)));
20509 emit_move_insn (part[0][2], part[1][2]);
20510 }
20511 else if (nparts == 4)
20512 {
20513 emit_move_insn (part[0][3], part[1][3]);
20514 emit_move_insn (part[0][2], part[1][2]);
20515 }
20516 }
20517 else
20518 {
20519 /* In 64-bit mode we don't have a 32-bit push available. If this is a
20520 register, that is OK - we will just use the larger counterpart. We also
20521 retype memory - this comes from an attempt to avoid the REX prefix
20522 when moving the second half of a TFmode value. */
20523 if (GET_MODE (part[1][1]) == SImode)
20524 {
20525 switch (GET_CODE (part[1][1]))
20526 {
20527 case MEM:
20528 part[1][1] = adjust_address (part[1][1], DImode, 0);
20529 break;
20530
20531 case REG:
20532 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20533 break;
20534
20535 default:
20536 gcc_unreachable ();
20537 }
20538
20539 if (GET_MODE (part[1][0]) == SImode)
20540 part[1][0] = part[1][1];
20541 }
20542 }
20543 emit_move_insn (part[0][1], part[1][1]);
20544 emit_move_insn (part[0][0], part[1][0]);
20545 return;
20546 }
20547
20548 /* Choose the correct order so we do not overwrite the source before it is copied. */
20549 if ((REG_P (part[0][0])
20550 && REG_P (part[1][1])
20551 && (REGNO (part[0][0]) == REGNO (part[1][1])
20552 || (nparts == 3
20553 && REGNO (part[0][0]) == REGNO (part[1][2]))
20554 || (nparts == 4
20555 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20556 || (collisions > 0
20557 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20558 {
20559 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20560 {
20561 operands[2 + i] = part[0][j];
20562 operands[6 + i] = part[1][j];
20563 }
20564 }
20565 else
20566 {
20567 for (i = 0; i < nparts; i++)
20568 {
20569 operands[2 + i] = part[0][i];
20570 operands[6 + i] = part[1][i];
20571 }
20572 }
20573
20574 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20575 if (optimize_insn_for_size_p ())
20576 {
20577 for (j = 0; j < nparts - 1; j++)
20578 if (CONST_INT_P (operands[6 + j])
20579 && operands[6 + j] != const0_rtx
20580 && REG_P (operands[2 + j]))
20581 for (i = j; i < nparts - 1; i++)
20582 if (CONST_INT_P (operands[7 + i])
20583 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20584 operands[7 + i] = operands[2 + j];
20585 }
20586
20587 for (i = 0; i < nparts; i++)
20588 emit_move_insn (operands[2 + i], operands[6 + i]);
20589
20590 return;
20591 }
20592
20593 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20594 left shift of a half-word part by a constant, either using a single
20595 shift or a sequence of add instructions. */
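/* For example (illustrative): a left shift by 2 may be emitted as two
   self-additions of the register, since "x + x" doubles the value, when
   two adds are cheaper than one shift by the constant 2 and we are not
   optimizing for size.  */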
20596
20597 static void
20598 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20599 {
20600 rtx (*insn)(rtx, rtx, rtx);
20601
20602 if (count == 1
20603 || (count * ix86_cost->add <= ix86_cost->shift_const
20604 && !optimize_insn_for_size_p ()))
20605 {
20606 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20607 while (count-- > 0)
20608 emit_insn (insn (operand, operand, operand));
20609 }
20610 else
20611 {
20612 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20613 emit_insn (insn (operand, operand, GEN_INT (count)));
20614 }
20615 }
20616
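/* Split a double-word left shift into operations on the two half-words.
   For instance (illustrative): shifting a DImode value left by a constant
   COUNT >= 32 on a 32-bit target becomes "high = low << (COUNT - 32);
   low = 0", which is what the constant-count path below emits.  */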
20617 void
20618 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20619 {
20620 rtx (*gen_ashl3)(rtx, rtx, rtx);
20621 rtx (*gen_shld)(rtx, rtx, rtx);
20622 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20623
20624 rtx low[2], high[2];
20625 int count;
20626
20627 if (CONST_INT_P (operands[2]))
20628 {
20629 split_double_mode (mode, operands, 2, low, high);
20630 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20631
20632 if (count >= half_width)
20633 {
20634 emit_move_insn (high[0], low[1]);
20635 emit_move_insn (low[0], const0_rtx);
20636
20637 if (count > half_width)
20638 ix86_expand_ashl_const (high[0], count - half_width, mode);
20639 }
20640 else
20641 {
20642 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20643
20644 if (!rtx_equal_p (operands[0], operands[1]))
20645 emit_move_insn (operands[0], operands[1]);
20646
20647 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20648 ix86_expand_ashl_const (low[0], count, mode);
20649 }
20650 return;
20651 }
20652
20653 split_double_mode (mode, operands, 1, low, high);
20654
20655 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20656
20657 if (operands[1] == const1_rtx)
20658 {
20659 /* Assuming we've chosen QImode-capable registers, 1 << N
20660 can be done with two 32/64-bit shifts, no branches, no cmoves. */
20661 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20662 {
20663 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20664
20665 ix86_expand_clear (low[0]);
20666 ix86_expand_clear (high[0]);
20667 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20668
20669 d = gen_lowpart (QImode, low[0]);
20670 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20671 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20672 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20673
20674 d = gen_lowpart (QImode, high[0]);
20675 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20676 s = gen_rtx_NE (QImode, flags, const0_rtx);
20677 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20678 }
20679
20680 /* Otherwise, we can get the same results by manually performing
20681 a bit extract operation on bit 5/6, and then performing the two
20682 shifts. The two methods of getting 0/1 into low/high are exactly
20683 the same size. Avoiding the shift in the bit extract case helps
20684 pentium4 a bit; no one else seems to care much either way. */
20685 else
20686 {
20687 enum machine_mode half_mode;
20688 rtx (*gen_lshr3)(rtx, rtx, rtx);
20689 rtx (*gen_and3)(rtx, rtx, rtx);
20690 rtx (*gen_xor3)(rtx, rtx, rtx);
20691 HOST_WIDE_INT bits;
20692 rtx x;
20693
20694 if (mode == DImode)
20695 {
20696 half_mode = SImode;
20697 gen_lshr3 = gen_lshrsi3;
20698 gen_and3 = gen_andsi3;
20699 gen_xor3 = gen_xorsi3;
20700 bits = 5;
20701 }
20702 else
20703 {
20704 half_mode = DImode;
20705 gen_lshr3 = gen_lshrdi3;
20706 gen_and3 = gen_anddi3;
20707 gen_xor3 = gen_xordi3;
20708 bits = 6;
20709 }
20710
20711 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20712 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20713 else
20714 x = gen_lowpart (half_mode, operands[2]);
20715 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20716
20717 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20718 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20719 emit_move_insn (low[0], high[0]);
20720 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20721 }
20722
20723 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20724 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20725 return;
20726 }
20727
20728 if (operands[1] == constm1_rtx)
20729 {
20730 /* For -1 << N, we can avoid the shld instruction, because we
20731 know that we're shifting 0...31/63 ones into a -1. */
20732 emit_move_insn (low[0], constm1_rtx);
20733 if (optimize_insn_for_size_p ())
20734 emit_move_insn (high[0], low[0]);
20735 else
20736 emit_move_insn (high[0], constm1_rtx);
20737 }
20738 else
20739 {
20740 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20741
20742 if (!rtx_equal_p (operands[0], operands[1]))
20743 emit_move_insn (operands[0], operands[1]);
20744
20745 split_double_mode (mode, operands, 1, low, high);
20746 emit_insn (gen_shld (high[0], low[0], operands[2]));
20747 }
20748
20749 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20750
20751 if (TARGET_CMOVE && scratch)
20752 {
20753 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20754 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20755
20756 ix86_expand_clear (scratch);
20757 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20758 }
20759 else
20760 {
20761 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20762 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20763
20764 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20765 }
20766 }
20767
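/* Split a double-word arithmetic right shift into operations on the two
   half-words.  For example (illustrative): a DImode shift right by the full
   width minus one on a 32-bit target replicates the sign of the high word
   into both halves, as the first constant-count case below does.  */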
20768 void
20769 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20770 {
20771 rtx (*gen_ashr3)(rtx, rtx, rtx)
20772 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20773 rtx (*gen_shrd)(rtx, rtx, rtx);
20774 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20775
20776 rtx low[2], high[2];
20777 int count;
20778
20779 if (CONST_INT_P (operands[2]))
20780 {
20781 split_double_mode (mode, operands, 2, low, high);
20782 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20783
20784 if (count == GET_MODE_BITSIZE (mode) - 1)
20785 {
20786 emit_move_insn (high[0], high[1]);
20787 emit_insn (gen_ashr3 (high[0], high[0],
20788 GEN_INT (half_width - 1)));
20789 emit_move_insn (low[0], high[0]);
20790
20791 }
20792 else if (count >= half_width)
20793 {
20794 emit_move_insn (low[0], high[1]);
20795 emit_move_insn (high[0], low[0]);
20796 emit_insn (gen_ashr3 (high[0], high[0],
20797 GEN_INT (half_width - 1)));
20798
20799 if (count > half_width)
20800 emit_insn (gen_ashr3 (low[0], low[0],
20801 GEN_INT (count - half_width)));
20802 }
20803 else
20804 {
20805 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20806
20807 if (!rtx_equal_p (operands[0], operands[1]))
20808 emit_move_insn (operands[0], operands[1]);
20809
20810 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20811 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
20812 }
20813 }
20814 else
20815 {
20816 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20817
20818 if (!rtx_equal_p (operands[0], operands[1]))
20819 emit_move_insn (operands[0], operands[1]);
20820
20821 split_double_mode (mode, operands, 1, low, high);
20822
20823 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20824 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
20825
20826 if (TARGET_CMOVE && scratch)
20827 {
20828 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20829 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20830
20831 emit_move_insn (scratch, high[0]);
20832 emit_insn (gen_ashr3 (scratch, scratch,
20833 GEN_INT (half_width - 1)));
20834 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20835 scratch));
20836 }
20837 else
20838 {
20839 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
20840 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
20841
20842 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
20843 }
20844 }
20845 }
20846
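/* Split a double-word logical right shift into operations on the two
   half-words.  For example (illustrative): a DImode shift right by a
   constant COUNT >= 32 on a 32-bit target becomes
   "low = high >> (COUNT - 32); high = 0", matching the constant-count
   path below.  */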
20847 void
20848 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
20849 {
20850 rtx (*gen_lshr3)(rtx, rtx, rtx)
20851 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
20852 rtx (*gen_shrd)(rtx, rtx, rtx);
20853 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20854
20855 rtx low[2], high[2];
20856 int count;
20857
20858 if (CONST_INT_P (operands[2]))
20859 {
20860 split_double_mode (mode, operands, 2, low, high);
20861 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20862
20863 if (count >= half_width)
20864 {
20865 emit_move_insn (low[0], high[1]);
20866 ix86_expand_clear (high[0]);
20867
20868 if (count > half_width)
20869 emit_insn (gen_lshr3 (low[0], low[0],
20870 GEN_INT (count - half_width)));
20871 }
20872 else
20873 {
20874 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20875
20876 if (!rtx_equal_p (operands[0], operands[1]))
20877 emit_move_insn (operands[0], operands[1]);
20878
20879 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20880 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
20881 }
20882 }
20883 else
20884 {
20885 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20886
20887 if (!rtx_equal_p (operands[0], operands[1]))
20888 emit_move_insn (operands[0], operands[1]);
20889
20890 split_double_mode (mode, operands, 1, low, high);
20891
20892 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20893 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
20894
20895 if (TARGET_CMOVE && scratch)
20896 {
20897 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20898 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20899
20900 ix86_expand_clear (scratch);
20901 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20902 scratch));
20903 }
20904 else
20905 {
20906 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20907 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20908
20909 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
20910 }
20911 }
20912 }
20913
20914 /* Predict that the just-emitted jump instruction will be taken with probability PROB. */
20915 static void
20916 predict_jump (int prob)
20917 {
20918 rtx insn = get_last_insn ();
20919 gcc_assert (JUMP_P (insn));
20920 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
20921 }
20922
20923 /* Helper function for the string operations below. Test whether the bits
20924 of VARIABLE selected by VALUE are clear; if so, jump to the returned label. */
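/* For example (illustrative): ix86_expand_aligntest (ptr, 2, false) emits
   the equivalent of "if ((ptr & 2) == 0) goto label;" and returns the label,
   which the caller places after the code guarded by the test.  */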
20925 static rtx
20926 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
20927 {
20928 rtx label = gen_label_rtx ();
20929 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
20930 if (GET_MODE (variable) == DImode)
20931 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
20932 else
20933 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
20934 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
20935 1, label);
20936 if (epilogue)
20937 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20938 else
20939 predict_jump (REG_BR_PROB_BASE * 90 / 100);
20940 return label;
20941 }
20942
20943 /* Decrease COUNTREG by VALUE. */
20944 static void
20945 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
20946 {
20947 rtx (*gen_add)(rtx, rtx, rtx)
20948 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
20949
20950 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
20951 }
20952
20953 /* Zero extend possibly SImode EXP to Pmode register. */
20954 rtx
20955 ix86_zero_extend_to_Pmode (rtx exp)
20956 {
20957 rtx r;
20958 if (GET_MODE (exp) == VOIDmode)
20959 return force_reg (Pmode, exp);
20960 if (GET_MODE (exp) == Pmode)
20961 return copy_to_mode_reg (Pmode, exp);
20962 r = gen_reg_rtx (Pmode);
20963 emit_insn (gen_zero_extendsidi2 (r, exp));
20964 return r;
20965 }
20966
20967 /* Divide COUNTREG by SCALE. */
20968 static rtx
20969 scale_counter (rtx countreg, int scale)
20970 {
20971 rtx sc;
20972
20973 if (scale == 1)
20974 return countreg;
20975 if (CONST_INT_P (countreg))
20976 return GEN_INT (INTVAL (countreg) / scale);
20977 gcc_assert (REG_P (countreg));
20978
20979 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
20980 GEN_INT (exact_log2 (scale)),
20981 NULL, 1, OPTAB_DIRECT);
20982 return sc;
20983 }
20984
20985 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
20986 DImode for constant loop counts. */
20987
20988 static enum machine_mode
20989 counter_mode (rtx count_exp)
20990 {
20991 if (GET_MODE (count_exp) != VOIDmode)
20992 return GET_MODE (count_exp);
20993 if (!CONST_INT_P (count_exp))
20994 return Pmode;
20995 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
20996 return DImode;
20997 return SImode;
20998 }
20999
21000 /* When SRCPTR is non-NULL, output a simple loop that copies memory
21001 from SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21002 the overall size is COUNT, specified in bytes. When SRCPTR is NULL,
21003 output the equivalent loop that sets memory to VALUE (assumed to be in MODE).
21004 
21005 The size is rounded down to a whole number of chunks moved at once.
21006 SRCMEM and DESTMEM provide the MEM rtxes to feed proper aliasing info. */
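/* A rough sketch of the emitted loop (illustrative only):

     size = count & ~(chunk * unroll - 1);
     iter = 0;
     do {
       copy (or store VALUE to) UNROLL chunks at dest + iter [from src + iter];
       iter += chunk * unroll;
     } while (iter < size);
     dest += iter;  src += iter;  */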
21007
21008
21009 static void
21010 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21011 rtx destptr, rtx srcptr, rtx value,
21012 rtx count, enum machine_mode mode, int unroll,
21013 int expected_size)
21014 {
21015 rtx out_label, top_label, iter, tmp;
21016 enum machine_mode iter_mode = counter_mode (count);
21017 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21018 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21019 rtx size;
21020 rtx x_addr;
21021 rtx y_addr;
21022 int i;
21023
21024 top_label = gen_label_rtx ();
21025 out_label = gen_label_rtx ();
21026 iter = gen_reg_rtx (iter_mode);
21027
21028 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21029 NULL, 1, OPTAB_DIRECT);
21030 /* Those two should combine. */
21031 if (piece_size == const1_rtx)
21032 {
21033 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21034 true, out_label);
21035 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21036 }
21037 emit_move_insn (iter, const0_rtx);
21038
21039 emit_label (top_label);
21040
21041 tmp = convert_modes (Pmode, iter_mode, iter, true);
21042 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21043 destmem = change_address (destmem, mode, x_addr);
21044
21045 if (srcmem)
21046 {
21047 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21048 srcmem = change_address (srcmem, mode, y_addr);
21049
21050 /* When unrolling for chips that reorder memory reads and writes,
21051 we can save registers by using a single temporary.
21052 Also, using 4 temporaries is overkill in 32-bit mode. */
21053 if (!TARGET_64BIT && 0)
21054 {
21055 for (i = 0; i < unroll; i++)
21056 {
21057 if (i)
21058 {
21059 destmem =
21060 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21061 srcmem =
21062 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21063 }
21064 emit_move_insn (destmem, srcmem);
21065 }
21066 }
21067 else
21068 {
21069 rtx tmpreg[4];
21070 gcc_assert (unroll <= 4);
21071 for (i = 0; i < unroll; i++)
21072 {
21073 tmpreg[i] = gen_reg_rtx (mode);
21074 if (i)
21075 {
21076 srcmem =
21077 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21078 }
21079 emit_move_insn (tmpreg[i], srcmem);
21080 }
21081 for (i = 0; i < unroll; i++)
21082 {
21083 if (i)
21084 {
21085 destmem =
21086 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21087 }
21088 emit_move_insn (destmem, tmpreg[i]);
21089 }
21090 }
21091 }
21092 else
21093 for (i = 0; i < unroll; i++)
21094 {
21095 if (i)
21096 destmem =
21097 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21098 emit_move_insn (destmem, value);
21099 }
21100
21101 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21102 true, OPTAB_LIB_WIDEN);
21103 if (tmp != iter)
21104 emit_move_insn (iter, tmp);
21105
21106 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21107 true, top_label);
21108 if (expected_size != -1)
21109 {
21110 expected_size /= GET_MODE_SIZE (mode) * unroll;
21111 if (expected_size == 0)
21112 predict_jump (0);
21113 else if (expected_size > REG_BR_PROB_BASE)
21114 predict_jump (REG_BR_PROB_BASE - 1);
21115 else
21116 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21117 }
21118 else
21119 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21120 iter = ix86_zero_extend_to_Pmode (iter);
21121 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21122 true, OPTAB_LIB_WIDEN);
21123 if (tmp != destptr)
21124 emit_move_insn (destptr, tmp);
21125 if (srcptr)
21126 {
21127 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21128 true, OPTAB_LIB_WIDEN);
21129 if (tmp != srcptr)
21130 emit_move_insn (srcptr, tmp);
21131 }
21132 emit_label (out_label);
21133 }
21134
21135 /* Output a "rep; mov" instruction.
21136 The arguments have the same meaning as for the previous function. */
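/* For example (illustrative): with MODE == SImode and a known COUNT of 64
   bytes, the count register is loaded with 16 and a single "rep movs" of
   4-byte elements performs the whole copy.  */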
21137 static void
21138 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21139 rtx destptr, rtx srcptr,
21140 rtx count,
21141 enum machine_mode mode)
21142 {
21143 rtx destexp;
21144 rtx srcexp;
21145 rtx countreg;
21146 HOST_WIDE_INT rounded_count;
21147
21148 /* If the size is known and a multiple of 4, it is shorter to use 4-byte rep movs. */
21149 if (mode == QImode && CONST_INT_P (count)
21150 && !(INTVAL (count) & 3))
21151 mode = SImode;
21152
21153 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21154 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21155 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21156 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21157 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21158 if (mode != QImode)
21159 {
21160 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21161 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21162 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21163 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21164 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21165 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21166 }
21167 else
21168 {
21169 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21170 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21171 }
21172 if (CONST_INT_P (count))
21173 {
21174 rounded_count = (INTVAL (count)
21175 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21176 destmem = shallow_copy_rtx (destmem);
21177 srcmem = shallow_copy_rtx (srcmem);
21178 set_mem_size (destmem, rounded_count);
21179 set_mem_size (srcmem, rounded_count);
21180 }
21181 else
21182 {
21183 if (MEM_SIZE_KNOWN_P (destmem))
21184 clear_mem_size (destmem);
21185 if (MEM_SIZE_KNOWN_P (srcmem))
21186 clear_mem_size (srcmem);
21187 }
21188 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21189 destexp, srcexp));
21190 }
21191
21192 /* Output a "rep; stos" instruction.
21193 The arguments have the same meaning as for the previous function. */
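/* For example (illustrative, assuming VALUE has already been replicated
   into a 4-byte pattern by the caller): with MODE == SImode and a known
   COUNT of 64 bytes, this emits a single "rep stos" of 16 4-byte stores.  */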
21194 static void
21195 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21196 rtx count, enum machine_mode mode,
21197 rtx orig_value)
21198 {
21199 rtx destexp;
21200 rtx countreg;
21201 HOST_WIDE_INT rounded_count;
21202
21203 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21204 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21205 value = force_reg (mode, gen_lowpart (mode, value));
21206 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21207 if (mode != QImode)
21208 {
21209 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21210 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21211 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21212 }
21213 else
21214 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21215 if (orig_value == const0_rtx && CONST_INT_P (count))
21216 {
21217 rounded_count = (INTVAL (count)
21218 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21219 destmem = shallow_copy_rtx (destmem);
21220 set_mem_size (destmem, rounded_count);
21221 }
21222 else if (MEM_SIZE_KNOWN_P (destmem))
21223 clear_mem_size (destmem);
21224 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21225 }
21226
21227 static void
21228 emit_strmov (rtx destmem, rtx srcmem,
21229 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21230 {
21231 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21232 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21233 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21234 }
21235
21236 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
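/* For example (illustrative): with a known COUNT of 7 and MAX_SIZE of 8,
   the constant path below emits a 4-byte, a 2-byte and a 1-byte move at
   offsets 0, 4 and 6 respectively.  */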
21237 static void
21238 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21239 rtx destptr, rtx srcptr, rtx count, int max_size)
21240 {
21241 rtx src, dest;
21242 if (CONST_INT_P (count))
21243 {
21244 HOST_WIDE_INT countval = INTVAL (count);
21245 int offset = 0;
21246
21247 if ((countval & 0x10) && max_size > 16)
21248 {
21249 if (TARGET_64BIT)
21250 {
21251 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21252 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21253 }
21254 else
21255 gcc_unreachable ();
21256 offset += 16;
21257 }
21258 if ((countval & 0x08) && max_size > 8)
21259 {
21260 if (TARGET_64BIT)
21261 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21262 else
21263 {
21264 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21265 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21266 }
21267 offset += 8;
21268 }
21269 if ((countval & 0x04) && max_size > 4)
21270 {
21271 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21272 offset += 4;
21273 }
21274 if ((countval & 0x02) && max_size > 2)
21275 {
21276 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21277 offset += 2;
21278 }
21279 if ((countval & 0x01) && max_size > 1)
21280 {
21281 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21282 offset += 1;
21283 }
21284 return;
21285 }
21286 if (max_size > 8)
21287 {
21288 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21289 count, 1, OPTAB_DIRECT);
21290 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21291 count, QImode, 1, 4);
21292 return;
21293 }
21294
21295 /* When single-insn stringops are available, we can cheaply advance the
21296 dest and src pointers. Otherwise we save code size by maintaining an
21297 offset (zero is readily available from the preceding rep operation)
21298 and using x86 addressing modes. */
21299 if (TARGET_SINGLE_STRINGOP)
21300 {
21301 if (max_size > 4)
21302 {
21303 rtx label = ix86_expand_aligntest (count, 4, true);
21304 src = change_address (srcmem, SImode, srcptr);
21305 dest = change_address (destmem, SImode, destptr);
21306 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21307 emit_label (label);
21308 LABEL_NUSES (label) = 1;
21309 }
21310 if (max_size > 2)
21311 {
21312 rtx label = ix86_expand_aligntest (count, 2, true);
21313 src = change_address (srcmem, HImode, srcptr);
21314 dest = change_address (destmem, HImode, destptr);
21315 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21316 emit_label (label);
21317 LABEL_NUSES (label) = 1;
21318 }
21319 if (max_size > 1)
21320 {
21321 rtx label = ix86_expand_aligntest (count, 1, true);
21322 src = change_address (srcmem, QImode, srcptr);
21323 dest = change_address (destmem, QImode, destptr);
21324 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21325 emit_label (label);
21326 LABEL_NUSES (label) = 1;
21327 }
21328 }
21329 else
21330 {
21331 rtx offset = force_reg (Pmode, const0_rtx);
21332 rtx tmp;
21333
21334 if (max_size > 4)
21335 {
21336 rtx label = ix86_expand_aligntest (count, 4, true);
21337 src = change_address (srcmem, SImode, srcptr);
21338 dest = change_address (destmem, SImode, destptr);
21339 emit_move_insn (dest, src);
21340 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21341 true, OPTAB_LIB_WIDEN);
21342 if (tmp != offset)
21343 emit_move_insn (offset, tmp);
21344 emit_label (label);
21345 LABEL_NUSES (label) = 1;
21346 }
21347 if (max_size > 2)
21348 {
21349 rtx label = ix86_expand_aligntest (count, 2, true);
21350 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21351 src = change_address (srcmem, HImode, tmp);
21352 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21353 dest = change_address (destmem, HImode, tmp);
21354 emit_move_insn (dest, src);
21355 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21356 true, OPTAB_LIB_WIDEN);
21357 if (tmp != offset)
21358 emit_move_insn (offset, tmp);
21359 emit_label (label);
21360 LABEL_NUSES (label) = 1;
21361 }
21362 if (max_size > 1)
21363 {
21364 rtx label = ix86_expand_aligntest (count, 1, true);
21365 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21366 src = change_address (srcmem, QImode, tmp);
21367 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21368 dest = change_address (destmem, QImode, tmp);
21369 emit_move_insn (dest, src);
21370 emit_label (label);
21371 LABEL_NUSES (label) = 1;
21372 }
21373 }
21374 }
21375
21376 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21377 static void
21378 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21379 rtx count, int max_size)
21380 {
21381 count =
21382 expand_simple_binop (counter_mode (count), AND, count,
21383 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21384 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21385 gen_lowpart (QImode, value), count, QImode,
21386 1, max_size / 2);
21387 }
21388
21389 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21390 static void
21391 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21392 {
21393 rtx dest;
21394
21395 if (CONST_INT_P (count))
21396 {
21397 HOST_WIDE_INT countval = INTVAL (count);
21398 int offset = 0;
21399
21400 if ((countval & 0x10) && max_size > 16)
21401 {
21402 if (TARGET_64BIT)
21403 {
21404 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21405 emit_insn (gen_strset (destptr, dest, value));
21406 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21407 emit_insn (gen_strset (destptr, dest, value));
21408 }
21409 else
21410 gcc_unreachable ();
21411 offset += 16;
21412 }
21413 if ((countval & 0x08) && max_size > 8)
21414 {
21415 if (TARGET_64BIT)
21416 {
21417 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21418 emit_insn (gen_strset (destptr, dest, value));
21419 }
21420 else
21421 {
21422 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21423 emit_insn (gen_strset (destptr, dest, value));
21424 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21425 emit_insn (gen_strset (destptr, dest, value));
21426 }
21427 offset += 8;
21428 }
21429 if ((countval & 0x04) && max_size > 4)
21430 {
21431 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21432 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21433 offset += 4;
21434 }
21435 if ((countval & 0x02) && max_size > 2)
21436 {
21437 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21438 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21439 offset += 2;
21440 }
21441 if ((countval & 0x01) && max_size > 1)
21442 {
21443 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21444 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21445 offset += 1;
21446 }
21447 return;
21448 }
21449 if (max_size > 32)
21450 {
21451 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21452 return;
21453 }
21454 if (max_size > 16)
21455 {
21456 rtx label = ix86_expand_aligntest (count, 16, true);
21457 if (TARGET_64BIT)
21458 {
21459 dest = change_address (destmem, DImode, destptr);
21460 emit_insn (gen_strset (destptr, dest, value));
21461 emit_insn (gen_strset (destptr, dest, value));
21462 }
21463 else
21464 {
21465 dest = change_address (destmem, SImode, destptr);
21466 emit_insn (gen_strset (destptr, dest, value));
21467 emit_insn (gen_strset (destptr, dest, value));
21468 emit_insn (gen_strset (destptr, dest, value));
21469 emit_insn (gen_strset (destptr, dest, value));
21470 }
21471 emit_label (label);
21472 LABEL_NUSES (label) = 1;
21473 }
21474 if (max_size > 8)
21475 {
21476 rtx label = ix86_expand_aligntest (count, 8, true);
21477 if (TARGET_64BIT)
21478 {
21479 dest = change_address (destmem, DImode, destptr);
21480 emit_insn (gen_strset (destptr, dest, value));
21481 }
21482 else
21483 {
21484 dest = change_address (destmem, SImode, destptr);
21485 emit_insn (gen_strset (destptr, dest, value));
21486 emit_insn (gen_strset (destptr, dest, value));
21487 }
21488 emit_label (label);
21489 LABEL_NUSES (label) = 1;
21490 }
21491 if (max_size > 4)
21492 {
21493 rtx label = ix86_expand_aligntest (count, 4, true);
21494 dest = change_address (destmem, SImode, destptr);
21495 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21496 emit_label (label);
21497 LABEL_NUSES (label) = 1;
21498 }
21499 if (max_size > 2)
21500 {
21501 rtx label = ix86_expand_aligntest (count, 2, true);
21502 dest = change_address (destmem, HImode, destptr);
21503 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21504 emit_label (label);
21505 LABEL_NUSES (label) = 1;
21506 }
21507 if (max_size > 1)
21508 {
21509 rtx label = ix86_expand_aligntest (count, 1, true);
21510 dest = change_address (destmem, QImode, destptr);
21511 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21512 emit_label (label);
21513 LABEL_NUSES (label) = 1;
21514 }
21515 }
21516
21517 /* Copy enough from SRC to DEST to align DEST, which is known to be
21518 aligned to ALIGN, up to DESIRED_ALIGNMENT. */
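/* For example (illustrative): with ALIGN == 1 and DESIRED_ALIGNMENT == 4,
   this conditionally copies one byte if the destination is odd and then two
   bytes if it is still not 4-byte aligned, adjusting COUNT as it goes.  */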
21519 static void
21520 expand_movmem_prologue (rtx destmem, rtx srcmem,
21521 rtx destptr, rtx srcptr, rtx count,
21522 int align, int desired_alignment)
21523 {
21524 if (align <= 1 && desired_alignment > 1)
21525 {
21526 rtx label = ix86_expand_aligntest (destptr, 1, false);
21527 srcmem = change_address (srcmem, QImode, srcptr);
21528 destmem = change_address (destmem, QImode, destptr);
21529 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21530 ix86_adjust_counter (count, 1);
21531 emit_label (label);
21532 LABEL_NUSES (label) = 1;
21533 }
21534 if (align <= 2 && desired_alignment > 2)
21535 {
21536 rtx label = ix86_expand_aligntest (destptr, 2, false);
21537 srcmem = change_address (srcmem, HImode, srcptr);
21538 destmem = change_address (destmem, HImode, destptr);
21539 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21540 ix86_adjust_counter (count, 2);
21541 emit_label (label);
21542 LABEL_NUSES (label) = 1;
21543 }
21544 if (align <= 4 && desired_alignment > 4)
21545 {
21546 rtx label = ix86_expand_aligntest (destptr, 4, false);
21547 srcmem = change_address (srcmem, SImode, srcptr);
21548 destmem = change_address (destmem, SImode, destptr);
21549 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21550 ix86_adjust_counter (count, 4);
21551 emit_label (label);
21552 LABEL_NUSES (label) = 1;
21553 }
21554 gcc_assert (desired_alignment <= 8);
21555 }
21556
21557 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
21558 ALIGN_BYTES is how many bytes need to be copied. */
21559 static rtx
21560 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21561 int desired_align, int align_bytes)
21562 {
21563 rtx src = *srcp;
21564 rtx orig_dst = dst;
21565 rtx orig_src = src;
21566 int off = 0;
21567 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21568 if (src_align_bytes >= 0)
21569 src_align_bytes = desired_align - src_align_bytes;
21570 if (align_bytes & 1)
21571 {
21572 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21573 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21574 off = 1;
21575 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21576 }
21577 if (align_bytes & 2)
21578 {
21579 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21580 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21581 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21582 set_mem_align (dst, 2 * BITS_PER_UNIT);
21583 if (src_align_bytes >= 0
21584 && (src_align_bytes & 1) == (align_bytes & 1)
21585 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21586 set_mem_align (src, 2 * BITS_PER_UNIT);
21587 off = 2;
21588 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21589 }
21590 if (align_bytes & 4)
21591 {
21592 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21593 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21594 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21595 set_mem_align (dst, 4 * BITS_PER_UNIT);
21596 if (src_align_bytes >= 0)
21597 {
21598 unsigned int src_align = 0;
21599 if ((src_align_bytes & 3) == (align_bytes & 3))
21600 src_align = 4;
21601 else if ((src_align_bytes & 1) == (align_bytes & 1))
21602 src_align = 2;
21603 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21604 set_mem_align (src, src_align * BITS_PER_UNIT);
21605 }
21606 off = 4;
21607 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21608 }
21609 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21610 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21611 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21612 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21613 if (src_align_bytes >= 0)
21614 {
21615 unsigned int src_align = 0;
21616 if ((src_align_bytes & 7) == (align_bytes & 7))
21617 src_align = 8;
21618 else if ((src_align_bytes & 3) == (align_bytes & 3))
21619 src_align = 4;
21620 else if ((src_align_bytes & 1) == (align_bytes & 1))
21621 src_align = 2;
21622 if (src_align > (unsigned int) desired_align)
21623 src_align = desired_align;
21624 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21625 set_mem_align (src, src_align * BITS_PER_UNIT);
21626 }
21627 if (MEM_SIZE_KNOWN_P (orig_dst))
21628 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21629 if (MEM_SIZE_KNOWN_P (orig_src))
21630 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21631 *srcp = src;
21632 return dst;
21633 }
21634
21635 /* Store enough at DEST to align DEST, which is known to be aligned
21636 to ALIGN, up to DESIRED_ALIGNMENT. */
21637 static void
21638 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21639 int align, int desired_alignment)
21640 {
21641 if (align <= 1 && desired_alignment > 1)
21642 {
21643 rtx label = ix86_expand_aligntest (destptr, 1, false);
21644 destmem = change_address (destmem, QImode, destptr);
21645 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21646 ix86_adjust_counter (count, 1);
21647 emit_label (label);
21648 LABEL_NUSES (label) = 1;
21649 }
21650 if (align <= 2 && desired_alignment > 2)
21651 {
21652 rtx label = ix86_expand_aligntest (destptr, 2, false);
21653 destmem = change_address (destmem, HImode, destptr);
21654 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21655 ix86_adjust_counter (count, 2);
21656 emit_label (label);
21657 LABEL_NUSES (label) = 1;
21658 }
21659 if (align <= 4 && desired_alignment > 4)
21660 {
21661 rtx label = ix86_expand_aligntest (destptr, 4, false);
21662 destmem = change_address (destmem, SImode, destptr);
21663 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21664 ix86_adjust_counter (count, 4);
21665 emit_label (label);
21666 LABEL_NUSES (label) = 1;
21667 }
21668 gcc_assert (desired_alignment <= 8);
21669 }
21670
21671 /* Store enough at DST to align DST, which is known to be aligned to
21672 ALIGN, up to DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
21673 static rtx
21674 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21675 int desired_align, int align_bytes)
21676 {
21677 int off = 0;
21678 rtx orig_dst = dst;
21679 if (align_bytes & 1)
21680 {
21681 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21682 off = 1;
21683 emit_insn (gen_strset (destreg, dst,
21684 gen_lowpart (QImode, value)));
21685 }
21686 if (align_bytes & 2)
21687 {
21688 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21689 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21690 set_mem_align (dst, 2 * BITS_PER_UNIT);
21691 off = 2;
21692 emit_insn (gen_strset (destreg, dst,
21693 gen_lowpart (HImode, value)));
21694 }
21695 if (align_bytes & 4)
21696 {
21697 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21698 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21699 set_mem_align (dst, 4 * BITS_PER_UNIT);
21700 off = 4;
21701 emit_insn (gen_strset (destreg, dst,
21702 gen_lowpart (SImode, value)));
21703 }
21704 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21705 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21706 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21707 if (MEM_SIZE_KNOWN_P (orig_dst))
21708 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21709 return dst;
21710 }
21711
21712 /* Given COUNT and EXPECTED_SIZE, decide how to codegen the string operation. */
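/* For example (illustrative): when optimizing for size with a byte count
   that is not a multiple of four, the code below returns rep_prefix_1_byte
   if the rep prefix is usable and loop_1_byte otherwise.  */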
21713 static enum stringop_alg
21714 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21715 int *dynamic_check)
21716 {
21717 const struct stringop_algs * algs;
21718 bool optimize_for_speed;
21719 /* Algorithms using the rep prefix want at least edi and ecx;
21720 additionally, memset wants eax and memcpy wants esi. Don't
21721 consider such algorithms if the user has appropriated those
21722 registers for their own purposes. */
21723 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21724 || (memset
21725 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21726
21727 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21728 || (alg != rep_prefix_1_byte \
21729 && alg != rep_prefix_4_byte \
21730 && alg != rep_prefix_8_byte))
21731 const struct processor_costs *cost;
21732
21733 /* Even if the string operation call is cold, we still might spend a lot
21734 of time processing large blocks. */
21735 if (optimize_function_for_size_p (cfun)
21736 || (optimize_insn_for_size_p ()
21737 && expected_size != -1 && expected_size < 256))
21738 optimize_for_speed = false;
21739 else
21740 optimize_for_speed = true;
21741
21742 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21743
21744 *dynamic_check = -1;
21745 if (memset)
21746 algs = &cost->memset[TARGET_64BIT != 0];
21747 else
21748 algs = &cost->memcpy[TARGET_64BIT != 0];
21749 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21750 return ix86_stringop_alg;
21751 /* rep; movq or rep; movl is the smallest variant. */
21752 else if (!optimize_for_speed)
21753 {
21754 if (!count || (count & 3))
21755 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21756 else
21757 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21758 }
21759 /* Very tiny blocks are best handled via the loop; REP is expensive to
21760 set up. */
21761 else if (expected_size != -1 && expected_size < 4)
21762 return loop_1_byte;
21763 else if (expected_size != -1)
21764 {
21765 unsigned int i;
21766 enum stringop_alg alg = libcall;
21767 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21768 {
21769 /* We get here if the algorithms that were not libcall-based
21770 were rep-prefix based and we are unable to use rep prefixes
21771 based on global register usage. Break out of the loop and
21772 use the heuristic below. */
21773 if (algs->size[i].max == 0)
21774 break;
21775 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
21776 {
21777 enum stringop_alg candidate = algs->size[i].alg;
21778
21779 if (candidate != libcall && ALG_USABLE_P (candidate))
21780 alg = candidate;
21781 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking the
21782 last non-libcall inline algorithm. */
21783 if (TARGET_INLINE_ALL_STRINGOPS)
21784 {
21785 /* When the current size is best copied by a libcall, but we
21786 are still forced to inline, run the heuristic below that
21787 picks code for medium-sized blocks. */
21788 if (alg != libcall)
21789 return alg;
21790 break;
21791 }
21792 else if (ALG_USABLE_P (candidate))
21793 return candidate;
21794 }
21795 }
21796 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
21797 }
21798 /* When asked to inline the call anyway, try to pick a meaningful choice.
21799 We look for the maximal size of block that is faster to copy by hand
21800 and take blocks of at most that size, guessing that the average size
21801 will be roughly half of the maximum.
21802
21803 If this turns out to be bad, we might simply specify the preferred
21804 choice in ix86_costs. */
21805 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21806 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
21807 {
21808 int max = -1;
21809 enum stringop_alg alg;
21810 int i;
21811 bool any_alg_usable_p = true;
21812
21813 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21814 {
21815 enum stringop_alg candidate = algs->size[i].alg;
21816 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
21817
21818 if (candidate != libcall && candidate
21819 && ALG_USABLE_P (candidate))
21820 max = algs->size[i].max;
21821 }
21822 /* If there aren't any usable algorithms, then recursing on
21823 smaller sizes isn't going to find anything. Just return the
21824 simple byte-at-a-time copy loop. */
21825 if (!any_alg_usable_p)
21826 {
21827 /* Pick something reasonable. */
21828 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21829 *dynamic_check = 128;
21830 return loop_1_byte;
21831 }
21832 if (max == -1)
21833 max = 4096;
21834 alg = decide_alg (count, max / 2, memset, dynamic_check);
21835 gcc_assert (*dynamic_check == -1);
21836 gcc_assert (alg != libcall);
21837 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21838 *dynamic_check = max;
21839 return alg;
21840 }
21841 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
21842 #undef ALG_USABLE_P
21843 }
21844
21845 /* Decide on alignment. We know that the operand is already aligned to ALIGN
21846 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
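/* For instance, the loop and unrolled_loop algorithms below ask for word
   alignment (4 bytes on ia32, 8 bytes on x86-64), while loop_1_byte is
   content with no extra alignment at all.  */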
21847 static int
21848 decide_alignment (int align,
21849 enum stringop_alg alg,
21850 int expected_size)
21851 {
21852 int desired_align = 0;
21853 switch (alg)
21854 {
21855 case no_stringop:
21856 gcc_unreachable ();
21857 case loop:
21858 case unrolled_loop:
21859 desired_align = GET_MODE_SIZE (Pmode);
21860 break;
21861 case rep_prefix_8_byte:
21862 desired_align = 8;
21863 break;
21864 case rep_prefix_4_byte:
21865 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
21866 copying a whole cache line at once. */
21867 if (TARGET_PENTIUMPRO)
21868 desired_align = 8;
21869 else
21870 desired_align = 4;
21871 break;
21872 case rep_prefix_1_byte:
21873 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
21874 copying a whole cache line at once. */
21875 if (TARGET_PENTIUMPRO)
21876 desired_align = 8;
21877 else
21878 desired_align = 1;
21879 break;
21880 case loop_1_byte:
21881 desired_align = 1;
21882 break;
21883 case libcall:
21884 return 0;
21885 }
21886
21887 if (optimize_size)
21888 desired_align = 1;
21889 if (desired_align < align)
21890 desired_align = align;
21891 if (expected_size != -1 && expected_size < 4)
21892 desired_align = align;
21893 return desired_align;
21894 }
21895
21896 /* Return the smallest power of 2 greater than VAL. */
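/* For instance, smallest_pow2_greater_than (4) == 8 and
   smallest_pow2_greater_than (7) == 8; the result is strictly greater
   than VAL even when VAL is already a power of 2.  */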
21897 static int
21898 smallest_pow2_greater_than (int val)
21899 {
21900 int ret = 1;
21901 while (ret <= val)
21902 ret <<= 1;
21903 return ret;
21904 }
21905
21906 /* Expand string move (memcpy) operation. Use i386 string operations
21907 when profitable. expand_setmem contains similar code. The code
21908 depends upon architecture, block size and alignment, but always has
21909 the same overall structure:
21910
21911 1) Prologue guard: Conditional that jumps up to epilogues for small
21912 blocks that can be handled by epilogue alone. This is faster
21913 but also needed for correctness, since the prologue assumes the block
21914 is larger than the desired alignment.
21915
21916 Optional dynamic check for size and libcall for large
21917 blocks is emitted here too, with -minline-stringops-dynamically.
21918
21919 2) Prologue: copy first few bytes in order to get destination
21920 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
21921 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
21922 copied. We emit either a jump tree on power of two sized
21923 blocks, or a byte loop.
21924
21925 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
21926 with specified algorithm.
21927
21928 4) Epilogue: code copying tail of the block that is too small to be
21929 handled by main body (or up to size guarded by prologue guard). */
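/* A rough illustration only (not the exact emitted RTL): for a
   non-constant count and the rep_prefix_4_byte algorithm, the code
   generated below behaves roughly like

     if (count < epilogue_size_needed) goto epilogue;    step 1, guard
     copy bytes until dst reaches desired_align;         step 2, prologue
     rep movsl;                                          step 3, main body
   epilogue:
     copy the remaining tail bytes.                      step 4, epilogue  */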
21930
21931 bool
21932 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
21933 rtx expected_align_exp, rtx expected_size_exp)
21934 {
21935 rtx destreg;
21936 rtx srcreg;
21937 rtx label = NULL;
21938 rtx tmp;
21939 rtx jump_around_label = NULL;
21940 HOST_WIDE_INT align = 1;
21941 unsigned HOST_WIDE_INT count = 0;
21942 HOST_WIDE_INT expected_size = -1;
21943 int size_needed = 0, epilogue_size_needed;
21944 int desired_align = 0, align_bytes = 0;
21945 enum stringop_alg alg;
21946 int dynamic_check;
21947 bool need_zero_guard = false;
21948
21949 if (CONST_INT_P (align_exp))
21950 align = INTVAL (align_exp);
21951 /* i386 can do misaligned access at a reasonably increased cost. */
21952 if (CONST_INT_P (expected_align_exp)
21953 && INTVAL (expected_align_exp) > align)
21954 align = INTVAL (expected_align_exp);
21955 /* ALIGN is the minimum of destination and source alignment, but we care here
21956 just about destination alignment. */
21957 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
21958 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
21959
21960 if (CONST_INT_P (count_exp))
21961 count = expected_size = INTVAL (count_exp);
21962 if (CONST_INT_P (expected_size_exp) && count == 0)
21963 expected_size = INTVAL (expected_size_exp);
21964
21965 /* Make sure we don't need to care about overflow later on. */
21966 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
21967 return false;
21968
21969 /* Step 0: Decide on preferred algorithm, desired alignment and
21970 size of chunks to be copied by main loop. */
21971
21972 alg = decide_alg (count, expected_size, false, &dynamic_check);
21973 desired_align = decide_alignment (align, alg, expected_size);
21974
21975 if (!TARGET_ALIGN_STRINGOPS)
21976 align = desired_align;
21977
21978 if (alg == libcall)
21979 return false;
21980 gcc_assert (alg != no_stringop);
21981 if (!count)
21982 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
21983 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21984 srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
21985 switch (alg)
21986 {
21987 case libcall:
21988 case no_stringop:
21989 gcc_unreachable ();
21990 case loop:
21991 need_zero_guard = true;
21992 size_needed = GET_MODE_SIZE (Pmode);
21993 break;
21994 case unrolled_loop:
21995 need_zero_guard = true;
21996 size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
21997 break;
21998 case rep_prefix_8_byte:
21999 size_needed = 8;
22000 break;
22001 case rep_prefix_4_byte:
22002 size_needed = 4;
22003 break;
22004 case rep_prefix_1_byte:
22005 size_needed = 1;
22006 break;
22007 case loop_1_byte:
22008 need_zero_guard = true;
22009 size_needed = 1;
22010 break;
22011 }
22012
22013 epilogue_size_needed = size_needed;
22014
22015 /* Step 1: Prologue guard. */
22016
22017 /* Alignment code needs count to be in register. */
22018 if (CONST_INT_P (count_exp) && desired_align > align)
22019 {
22020 if (INTVAL (count_exp) > desired_align
22021 && INTVAL (count_exp) > size_needed)
22022 {
22023 align_bytes
22024 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22025 if (align_bytes <= 0)
22026 align_bytes = 0;
22027 else
22028 align_bytes = desired_align - align_bytes;
22029 }
22030 if (align_bytes == 0)
22031 count_exp = force_reg (counter_mode (count_exp), count_exp);
22032 }
22033 gcc_assert (desired_align >= 1 && align >= 1);
22034
22035 /* Ensure that alignment prologue won't copy past end of block. */
22036 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22037 {
22038 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22039 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22040 Make sure it is a power of 2. */
22041 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22042
22043 if (count)
22044 {
22045 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22046 {
22047 /* If main algorithm works on QImode, no epilogue is needed.
22048 For small sizes just don't align anything. */
22049 if (size_needed == 1)
22050 desired_align = align;
22051 else
22052 goto epilogue;
22053 }
22054 }
22055 else
22056 {
22057 label = gen_label_rtx ();
22058 emit_cmp_and_jump_insns (count_exp,
22059 GEN_INT (epilogue_size_needed),
22060 LTU, 0, counter_mode (count_exp), 1, label);
22061 if (expected_size == -1 || expected_size < epilogue_size_needed)
22062 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22063 else
22064 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22065 }
22066 }
22067
22068 /* Emit code to decide at runtime whether a library call or inline code
22069 should be used. */
22070 if (dynamic_check != -1)
22071 {
22072 if (CONST_INT_P (count_exp))
22073 {
22074 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22075 {
22076 emit_block_move_via_libcall (dst, src, count_exp, false);
22077 count_exp = const0_rtx;
22078 goto epilogue;
22079 }
22080 }
22081 else
22082 {
22083 rtx hot_label = gen_label_rtx ();
22084 jump_around_label = gen_label_rtx ();
22085 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22086 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22087 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22088 emit_block_move_via_libcall (dst, src, count_exp, false);
22089 emit_jump (jump_around_label);
22090 emit_label (hot_label);
22091 }
22092 }
22093
22094 /* Step 2: Alignment prologue. */
22095
22096 if (desired_align > align)
22097 {
22098 if (align_bytes == 0)
22099 {
22100 /* Except for the first move in the epilogue, we no longer know
22101 the constant offset in aliasing info. It does not seem worth
22102 the pain to maintain it for the first move, so throw away
22103 the info early. */
22104 src = change_address (src, BLKmode, srcreg);
22105 dst = change_address (dst, BLKmode, destreg);
22106 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22107 desired_align);
22108 }
22109 else
22110 {
22111 /* If we know how many bytes need to be stored before dst is
22112 sufficiently aligned, maintain aliasing info accurately. */
22113 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22114 desired_align, align_bytes);
22115 count_exp = plus_constant (count_exp, -align_bytes);
22116 count -= align_bytes;
22117 }
22118 if (need_zero_guard
22119 && (count < (unsigned HOST_WIDE_INT) size_needed
22120 || (align_bytes == 0
22121 && count < ((unsigned HOST_WIDE_INT) size_needed
22122 + desired_align - align))))
22123 {
22124 /* It is possible that we copied enough so the main loop will not
22125 execute. */
22126 gcc_assert (size_needed > 1);
22127 if (label == NULL_RTX)
22128 label = gen_label_rtx ();
22129 emit_cmp_and_jump_insns (count_exp,
22130 GEN_INT (size_needed),
22131 LTU, 0, counter_mode (count_exp), 1, label);
22132 if (expected_size == -1
22133 || expected_size < (desired_align - align) / 2 + size_needed)
22134 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22135 else
22136 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22137 }
22138 }
22139 if (label && size_needed == 1)
22140 {
22141 emit_label (label);
22142 LABEL_NUSES (label) = 1;
22143 label = NULL;
22144 epilogue_size_needed = 1;
22145 }
22146 else if (label == NULL_RTX)
22147 epilogue_size_needed = size_needed;
22148
22149 /* Step 3: Main loop. */
22150
22151 switch (alg)
22152 {
22153 case libcall:
22154 case no_stringop:
22155 gcc_unreachable ();
22156 case loop_1_byte:
22157 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22158 count_exp, QImode, 1, expected_size);
22159 break;
22160 case loop:
22161 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22162 count_exp, Pmode, 1, expected_size);
22163 break;
22164 case unrolled_loop:
22165 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22166 registers for 4 temporaries anyway. */
22167 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22168 count_exp, Pmode, TARGET_64BIT ? 4 : 2,
22169 expected_size);
22170 break;
22171 case rep_prefix_8_byte:
22172 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22173 DImode);
22174 break;
22175 case rep_prefix_4_byte:
22176 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22177 SImode);
22178 break;
22179 case rep_prefix_1_byte:
22180 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22181 QImode);
22182 break;
22183 }
22184 /* Properly adjust the offset of src and dest memory for aliasing. */
22185 if (CONST_INT_P (count_exp))
22186 {
22187 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22188 (count / size_needed) * size_needed);
22189 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22190 (count / size_needed) * size_needed);
22191 }
22192 else
22193 {
22194 src = change_address (src, BLKmode, srcreg);
22195 dst = change_address (dst, BLKmode, destreg);
22196 }
22197
22198 /* Step 4: Epilogue to copy the remaining bytes. */
22199 epilogue:
22200 if (label)
22201 {
22202 /* When the main loop is done, COUNT_EXP might hold the original count,
22203 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22204 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22205 bytes. Compensate if needed. */
22206
22207 if (size_needed < epilogue_size_needed)
22208 {
22209 tmp =
22210 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22211 GEN_INT (size_needed - 1), count_exp, 1,
22212 OPTAB_DIRECT);
22213 if (tmp != count_exp)
22214 emit_move_insn (count_exp, tmp);
22215 }
22216 emit_label (label);
22217 LABEL_NUSES (label) = 1;
22218 }
22219
22220 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22221 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22222 epilogue_size_needed);
22223 if (jump_around_label)
22224 emit_label (jump_around_label);
22225 return true;
22226 }
22227
22228 /* Helper function for memset. For a QImode value 0xXY produce
22229 0xXYXYXYXY of the width specified by MODE. This is essentially
22230 a * 0x10101010, but we can do slightly better than
22231 synth_mult by unwinding the sequence by hand on CPUs with
22232 slow multiply. */
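/* For example, with MODE == SImode and a constant VAL of 0xAB, the
   constant path below computes 0xAB -> 0xABAB -> 0xABABABAB; for DImode
   one more doubling extends this to 0xABABABABABABABAB.  */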
22233 static rtx
22234 promote_duplicated_reg (enum machine_mode mode, rtx val)
22235 {
22236 enum machine_mode valmode = GET_MODE (val);
22237 rtx tmp;
22238 int nops = mode == DImode ? 3 : 2;
22239
22240 gcc_assert (mode == SImode || mode == DImode);
22241 if (val == const0_rtx)
22242 return copy_to_mode_reg (mode, const0_rtx);
22243 if (CONST_INT_P (val))
22244 {
22245 HOST_WIDE_INT v = INTVAL (val) & 255;
22246
22247 v |= v << 8;
22248 v |= v << 16;
22249 if (mode == DImode)
22250 v |= (v << 16) << 16;
22251 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22252 }
22253
22254 if (valmode == VOIDmode)
22255 valmode = QImode;
22256 if (valmode != QImode)
22257 val = gen_lowpart (QImode, val);
22258 if (mode == QImode)
22259 return val;
22260 if (!TARGET_PARTIAL_REG_STALL)
22261 nops--;
22262 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22263 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22264 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22265 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22266 {
22267 rtx reg = convert_modes (mode, QImode, val, true);
22268 tmp = promote_duplicated_reg (mode, const1_rtx);
22269 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22270 OPTAB_DIRECT);
22271 }
22272 else
22273 {
22274 rtx reg = convert_modes (mode, QImode, val, true);
22275
22276 if (!TARGET_PARTIAL_REG_STALL)
22277 if (mode == SImode)
22278 emit_insn (gen_movsi_insv_1 (reg, reg));
22279 else
22280 emit_insn (gen_movdi_insv_1 (reg, reg));
22281 else
22282 {
22283 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22284 NULL, 1, OPTAB_DIRECT);
22285 reg =
22286 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22287 }
22288 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22289 NULL, 1, OPTAB_DIRECT);
22290 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22291 if (mode == SImode)
22292 return reg;
22293 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22294 NULL, 1, OPTAB_DIRECT);
22295 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22296 return reg;
22297 }
22298 }
22299
22300 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
22301 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
22302 getting the alignment from ALIGN to DESIRED_ALIGN. */
22303 static rtx
22304 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22305 {
22306 rtx promoted_val;
22307
22308 if (TARGET_64BIT
22309 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22310 promoted_val = promote_duplicated_reg (DImode, val);
22311 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22312 promoted_val = promote_duplicated_reg (SImode, val);
22313 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22314 promoted_val = promote_duplicated_reg (HImode, val);
22315 else
22316 promoted_val = val;
22317
22318 return promoted_val;
22319 }
22320
22321 /* Expand string set operation (memset). Use i386 string operations when
22322 profitable. See expand_movmem comment for explanation of individual
22323 steps performed. */
22324 bool
22325 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22326 rtx expected_align_exp, rtx expected_size_exp)
22327 {
22328 rtx destreg;
22329 rtx label = NULL;
22330 rtx tmp;
22331 rtx jump_around_label = NULL;
22332 HOST_WIDE_INT align = 1;
22333 unsigned HOST_WIDE_INT count = 0;
22334 HOST_WIDE_INT expected_size = -1;
22335 int size_needed = 0, epilogue_size_needed;
22336 int desired_align = 0, align_bytes = 0;
22337 enum stringop_alg alg;
22338 rtx promoted_val = NULL;
22339 bool force_loopy_epilogue = false;
22340 int dynamic_check;
22341 bool need_zero_guard = false;
22342
22343 if (CONST_INT_P (align_exp))
22344 align = INTVAL (align_exp);
22345 /* i386 can do misaligned access at a reasonably increased cost. */
22346 if (CONST_INT_P (expected_align_exp)
22347 && INTVAL (expected_align_exp) > align)
22348 align = INTVAL (expected_align_exp);
22349 if (CONST_INT_P (count_exp))
22350 count = expected_size = INTVAL (count_exp);
22351 if (CONST_INT_P (expected_size_exp) && count == 0)
22352 expected_size = INTVAL (expected_size_exp);
22353
22354 /* Make sure we don't need to care about overflow later on. */
22355 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22356 return false;
22357
22358 /* Step 0: Decide on preferred algorithm, desired alignment and
22359 size of chunks to be copied by main loop. */
22360
22361 alg = decide_alg (count, expected_size, true, &dynamic_check);
22362 desired_align = decide_alignment (align, alg, expected_size);
22363
22364 if (!TARGET_ALIGN_STRINGOPS)
22365 align = desired_align;
22366
22367 if (alg == libcall)
22368 return false;
22369 gcc_assert (alg != no_stringop);
22370 if (!count)
22371 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22372 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22373 switch (alg)
22374 {
22375 case libcall:
22376 case no_stringop:
22377 gcc_unreachable ();
22378 case loop:
22379 need_zero_guard = true;
22380 size_needed = GET_MODE_SIZE (Pmode);
22381 break;
22382 case unrolled_loop:
22383 need_zero_guard = true;
22384 size_needed = GET_MODE_SIZE (Pmode) * 4;
22385 break;
22386 case rep_prefix_8_byte:
22387 size_needed = 8;
22388 break;
22389 case rep_prefix_4_byte:
22390 size_needed = 4;
22391 break;
22392 case rep_prefix_1_byte:
22393 size_needed = 1;
22394 break;
22395 case loop_1_byte:
22396 need_zero_guard = true;
22397 size_needed = 1;
22398 break;
22399 }
22400 epilogue_size_needed = size_needed;
22401
22402 /* Step 1: Prologue guard. */
22403
22404 /* Alignment code needs count to be in register. */
22405 if (CONST_INT_P (count_exp) && desired_align > align)
22406 {
22407 if (INTVAL (count_exp) > desired_align
22408 && INTVAL (count_exp) > size_needed)
22409 {
22410 align_bytes
22411 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22412 if (align_bytes <= 0)
22413 align_bytes = 0;
22414 else
22415 align_bytes = desired_align - align_bytes;
22416 }
22417 if (align_bytes == 0)
22418 {
22419 enum machine_mode mode = SImode;
22420 if (TARGET_64BIT && (count & ~0xffffffff))
22421 mode = DImode;
22422 count_exp = force_reg (mode, count_exp);
22423 }
22424 }
22425 /* Do the cheap promotion to allow better CSE across the
22426 main loop and epilogue (i.e. one load of the big constant in
22427 front of all code). */
22428 if (CONST_INT_P (val_exp))
22429 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22430 desired_align, align);
22431 /* Ensure that alignment prologue won't copy past end of block. */
22432 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22433 {
22434 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22435 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22436 Make sure it is a power of 2. */
22437 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22438
22439 /* To improve performance of small blocks, we jump around the VAL
22440 promoting code. This means that if the promoted VAL is not constant,
22441 we might not use it in the epilogue and have to use the byte
22442 loop variant. */
22443 if (epilogue_size_needed > 2 && !promoted_val)
22444 force_loopy_epilogue = true;
22445 if (count)
22446 {
22447 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22448 {
22449 /* If main algorithm works on QImode, no epilogue is needed.
22450 For small sizes just don't align anything. */
22451 if (size_needed == 1)
22452 desired_align = align;
22453 else
22454 goto epilogue;
22455 }
22456 }
22457 else
22458 {
22459 label = gen_label_rtx ();
22460 emit_cmp_and_jump_insns (count_exp,
22461 GEN_INT (epilogue_size_needed),
22462 LTU, 0, counter_mode (count_exp), 1, label);
22463 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22464 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22465 else
22466 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22467 }
22468 }
22469 if (dynamic_check != -1)
22470 {
22471 rtx hot_label = gen_label_rtx ();
22472 jump_around_label = gen_label_rtx ();
22473 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22474 LEU, 0, counter_mode (count_exp), 1, hot_label);
22475 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22476 set_storage_via_libcall (dst, count_exp, val_exp, false);
22477 emit_jump (jump_around_label);
22478 emit_label (hot_label);
22479 }
22480
22481 /* Step 2: Alignment prologue. */
22482
22483 /* Do the expensive promotion once we branched off the small blocks. */
22484 if (!promoted_val)
22485 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22486 desired_align, align);
22487 gcc_assert (desired_align >= 1 && align >= 1);
22488
22489 if (desired_align > align)
22490 {
22491 if (align_bytes == 0)
22492 {
22493 /* Except for the first move in the epilogue, we no longer know
22494 the constant offset in aliasing info. It does not seem worth
22495 the pain to maintain it for the first move, so throw away
22496 the info early. */
22497 dst = change_address (dst, BLKmode, destreg);
22498 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22499 desired_align);
22500 }
22501 else
22502 {
22503 /* If we know how many bytes need to be stored before dst is
22504 sufficiently aligned, maintain aliasing info accurately. */
22505 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22506 desired_align, align_bytes);
22507 count_exp = plus_constant (count_exp, -align_bytes);
22508 count -= align_bytes;
22509 }
22510 if (need_zero_guard
22511 && (count < (unsigned HOST_WIDE_INT) size_needed
22512 || (align_bytes == 0
22513 && count < ((unsigned HOST_WIDE_INT) size_needed
22514 + desired_align - align))))
22515 {
22516 /* It is possible that we copied enough so the main loop will not
22517 execute. */
22518 gcc_assert (size_needed > 1);
22519 if (label == NULL_RTX)
22520 label = gen_label_rtx ();
22521 emit_cmp_and_jump_insns (count_exp,
22522 GEN_INT (size_needed),
22523 LTU, 0, counter_mode (count_exp), 1, label);
22524 if (expected_size == -1
22525 || expected_size < (desired_align - align) / 2 + size_needed)
22526 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22527 else
22528 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22529 }
22530 }
22531 if (label && size_needed == 1)
22532 {
22533 emit_label (label);
22534 LABEL_NUSES (label) = 1;
22535 label = NULL;
22536 promoted_val = val_exp;
22537 epilogue_size_needed = 1;
22538 }
22539 else if (label == NULL_RTX)
22540 epilogue_size_needed = size_needed;
22541
22542 /* Step 3: Main loop. */
22543
22544 switch (alg)
22545 {
22546 case libcall:
22547 case no_stringop:
22548 gcc_unreachable ();
22549 case loop_1_byte:
22550 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22551 count_exp, QImode, 1, expected_size);
22552 break;
22553 case loop:
22554 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22555 count_exp, Pmode, 1, expected_size);
22556 break;
22557 case unrolled_loop:
22558 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22559 count_exp, Pmode, 4, expected_size);
22560 break;
22561 case rep_prefix_8_byte:
22562 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22563 DImode, val_exp);
22564 break;
22565 case rep_prefix_4_byte:
22566 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22567 SImode, val_exp);
22568 break;
22569 case rep_prefix_1_byte:
22570 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22571 QImode, val_exp);
22572 break;
22573 }
22574 /* Properly adjust the offset of the destination memory for aliasing. */
22575 if (CONST_INT_P (count_exp))
22576 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22577 (count / size_needed) * size_needed);
22578 else
22579 dst = change_address (dst, BLKmode, destreg);
22580
22581 /* Step 4: Epilogue to copy the remaining bytes. */
22582
22583 if (label)
22584 {
22585 /* When the main loop is done, COUNT_EXP might hold the original count,
22586 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22587 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22588 bytes. Compensate if needed. */
22589
22590 if (size_needed < epilogue_size_needed)
22591 {
22592 tmp =
22593 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22594 GEN_INT (size_needed - 1), count_exp, 1,
22595 OPTAB_DIRECT);
22596 if (tmp != count_exp)
22597 emit_move_insn (count_exp, tmp);
22598 }
22599 emit_label (label);
22600 LABEL_NUSES (label) = 1;
22601 }
22602 epilogue:
22603 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22604 {
22605 if (force_loopy_epilogue)
22606 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22607 epilogue_size_needed);
22608 else
22609 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22610 epilogue_size_needed);
22611 }
22612 if (jump_around_label)
22613 emit_label (jump_around_label);
22614 return true;
22615 }
22616
22617 /* Expand the appropriate insns for doing strlen if not just doing
22618 repnz; scasb
22619
22620 out = result, initialized with the start address
22621 align_rtx = alignment of the address.
22622 scratch = scratch register, initialized with the start address when
22623 not aligned, otherwise undefined
22624
22625 This is just the body. It needs the initializations mentioned above and
22626 some address computing at the end. These things are done in i386.md. */
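/* A rough C sketch of the logic below (illustrative only, ignoring the
   cmov/branch details used to locate the zero byte):

     while ((uintptr_t) out & 3)
       { if (*out == 0) goto done; out++; }
     do
       { w = *(unsigned int *) out; out += 4; }
     while (((w - 0x01010101) & ~w & 0x80808080) == 0);
     adjust OUT backwards to the zero byte found inside W;
   done:
     OUT now holds the address of the terminating zero.  */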
22627
22628 static void
22629 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22630 {
22631 int align;
22632 rtx tmp;
22633 rtx align_2_label = NULL_RTX;
22634 rtx align_3_label = NULL_RTX;
22635 rtx align_4_label = gen_label_rtx ();
22636 rtx end_0_label = gen_label_rtx ();
22637 rtx mem;
22638 rtx tmpreg = gen_reg_rtx (SImode);
22639 rtx scratch = gen_reg_rtx (SImode);
22640 rtx cmp;
22641
22642 align = 0;
22643 if (CONST_INT_P (align_rtx))
22644 align = INTVAL (align_rtx);
22645
22646 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22647
22648 /* Is there a known alignment and is it less than 4? */
22649 if (align < 4)
22650 {
22651 rtx scratch1 = gen_reg_rtx (Pmode);
22652 emit_move_insn (scratch1, out);
22653 /* Is there a known alignment and is it not 2? */
22654 if (align != 2)
22655 {
22656 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22657 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22658
22659 /* Leave just the 3 lower bits. */
22660 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22661 NULL_RTX, 0, OPTAB_WIDEN);
22662
22663 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22664 Pmode, 1, align_4_label);
22665 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22666 Pmode, 1, align_2_label);
22667 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22668 Pmode, 1, align_3_label);
22669 }
22670 else
22671 {
22672 /* Since the alignment is 2, we have to check 2 or 0 bytes;
22673 check whether it is aligned to a 4-byte boundary. */
22674
22675 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22676 NULL_RTX, 0, OPTAB_WIDEN);
22677
22678 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22679 Pmode, 1, align_4_label);
22680 }
22681
22682 mem = change_address (src, QImode, out);
22683
22684 /* Now compare the bytes. */
22685
22686 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
22687 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22688 QImode, 1, end_0_label);
22689
22690 /* Increment the address. */
22691 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22692
22693 /* Not needed with an alignment of 2. */
22694 if (align != 2)
22695 {
22696 emit_label (align_2_label);
22697
22698 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22699 end_0_label);
22700
22701 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22702
22703 emit_label (align_3_label);
22704 }
22705
22706 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22707 end_0_label);
22708
22709 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22710 }
22711
22712 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
22713 align this loop; doing so only enlarges the program and does not
22714 help to speed it up. */
22715 emit_label (align_4_label);
22716
22717 mem = change_address (src, SImode, out);
22718 emit_move_insn (scratch, mem);
22719 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22720
22721 /* This formula yields a nonzero result iff one of the bytes is zero.
22722 This saves three branches inside the loop and many cycles. */
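/* Worked example of the trick (illustrative values): for
   x = 0x12003456, which contains a zero byte, (x - 0x01010101) is
   0x10FF3355, ~x is 0xEDFFCBA9, and ANDing them with 0x80808080 leaves
   0x00800000, i.e. nonzero; for x = 0x12345678 the same computation
   yields 0 and the loop continues.  */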
22723
22724 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22725 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22726 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22727 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22728 gen_int_mode (0x80808080, SImode)));
22729 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22730 align_4_label);
22731
22732 if (TARGET_CMOVE)
22733 {
22734 rtx reg = gen_reg_rtx (SImode);
22735 rtx reg2 = gen_reg_rtx (Pmode);
22736 emit_move_insn (reg, tmpreg);
22737 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22738
22739 /* If zero is not in the first two bytes, move two bytes forward. */
22740 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22741 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22742 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22743 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22744 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22745 reg,
22746 tmpreg)));
22747 /* Emit lea manually to avoid clobbering of flags. */
22748 emit_insn (gen_rtx_SET (SImode, reg2,
22749 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22750
22751 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22752 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22753 emit_insn (gen_rtx_SET (VOIDmode, out,
22754 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22755 reg2,
22756 out)));
22757 }
22758 else
22759 {
22760 rtx end_2_label = gen_label_rtx ();
22761 /* Is zero in the first two bytes? */
22762
22763 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22764 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22765 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
22766 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22767 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
22768 pc_rtx);
22769 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
22770 JUMP_LABEL (tmp) = end_2_label;
22771
22772 /* Not in the first two. Move two bytes forward. */
22773 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
22774 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
22775
22776 emit_label (end_2_label);
22777
22778 }
22779
22780 /* Avoid branch in fixing the byte. */
22781 tmpreg = gen_lowpart (QImode, tmpreg);
22782 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
22783 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
22784 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
22785 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
22786
22787 emit_label (end_0_label);
22788 }
22789
22790 /* Expand strlen. */
22791
22792 bool
22793 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
22794 {
22795 rtx addr, scratch1, scratch2, scratch3, scratch4;
22796
22797 /* The generic case of the strlen expander is long. Avoid expanding it
22798 unless TARGET_INLINE_ALL_STRINGOPS. */
22799
22800 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22801 && !TARGET_INLINE_ALL_STRINGOPS
22802 && !optimize_insn_for_size_p ()
22803 && (!CONST_INT_P (align) || INTVAL (align) < 4))
22804 return false;
22805
22806 addr = force_reg (Pmode, XEXP (src, 0));
22807 scratch1 = gen_reg_rtx (Pmode);
22808
22809 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22810 && !optimize_insn_for_size_p ())
22811 {
22812 /* Well it seems that some optimizer does not combine a call like
22813 foo(strlen(bar), strlen(bar));
22814 when the move and the subtraction are done here. It does calculate
22815 the length just once when these instructions are done inside of
22816 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
22817 often used and I use one fewer register for the lifetime of
22818 output_strlen_unroll() this is better. */
22819
22820 emit_move_insn (out, addr);
22821
22822 ix86_expand_strlensi_unroll_1 (out, src, align);
22823
22824 /* strlensi_unroll_1 returns the address of the zero at the end of
22825 the string, like memchr(), so compute the length by subtracting
22826 the start address. */
22827 emit_insn (ix86_gen_sub3 (out, out, addr));
22828 }
22829 else
22830 {
22831 rtx unspec;
22832
22833 /* Can't use this if the user has appropriated eax, ecx, or edi. */
22834 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
22835 return false;
22836
22837 scratch2 = gen_reg_rtx (Pmode);
22838 scratch3 = gen_reg_rtx (Pmode);
22839 scratch4 = force_reg (Pmode, constm1_rtx);
22840
22841 emit_move_insn (scratch3, addr);
22842 eoschar = force_reg (QImode, eoschar);
22843
22844 src = replace_equiv_address_nv (src, scratch3);
22845
22846 /* If .md starts supporting :P, this can be done in .md. */
22847 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
22848 scratch4), UNSPEC_SCAS);
22849 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
22850 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
22851 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
22852 }
22853 return true;
22854 }
22855
22856 /* For a given symbol (function), construct code to compute the address of its
22857 PLT entry in the large x86-64 PIC model. */
22858 rtx
22859 construct_plt_address (rtx symbol)
22860 {
22861 rtx tmp = gen_reg_rtx (Pmode);
22862 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
22863
22864 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
22865 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
22866
22867 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
22868 emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
22869 return tmp;
22870 }
22871
22872 rtx
22873 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
22874 rtx callarg2,
22875 rtx pop, bool sibcall)
22876 {
22877 /* We need to represent that SI and DI registers are clobbered
22878 by SYSV calls. */
22879 static int clobbered_registers[] = {
22880 XMM6_REG, XMM7_REG, XMM8_REG,
22881 XMM9_REG, XMM10_REG, XMM11_REG,
22882 XMM12_REG, XMM13_REG, XMM14_REG,
22883 XMM15_REG, SI_REG, DI_REG
22884 };
22885 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
22886 rtx use = NULL, call;
22887 unsigned int vec_len;
22888
22889 if (pop == const0_rtx)
22890 pop = NULL;
22891 gcc_assert (!TARGET_64BIT || !pop);
22892
22893 if (TARGET_MACHO && !TARGET_64BIT)
22894 {
22895 #if TARGET_MACHO
22896 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
22897 fnaddr = machopic_indirect_call_target (fnaddr);
22898 #endif
22899 }
22900 else
22901 {
22902 /* Static functions and indirect calls don't need the pic register. */
22903 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
22904 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22905 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
22906 use_reg (&use, pic_offset_table_rtx);
22907 }
22908
22909 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
22910 {
22911 rtx al = gen_rtx_REG (QImode, AX_REG);
22912 emit_move_insn (al, callarg2);
22913 use_reg (&use, al);
22914 }
22915
22916 if (ix86_cmodel == CM_LARGE_PIC
22917 && MEM_P (fnaddr)
22918 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22919 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
22920 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
22921 else if (sibcall
22922 ? !sibcall_insn_operand (XEXP (fnaddr, 0), Pmode)
22923 : !call_insn_operand (XEXP (fnaddr, 0), Pmode))
22924 {
22925 fnaddr = XEXP (fnaddr, 0);
22926 if (GET_MODE (fnaddr) != Pmode)
22927 fnaddr = convert_to_mode (Pmode, fnaddr, 1);
22928 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (Pmode, fnaddr));
22929 }
22930
22931 vec_len = 0;
22932 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
22933 if (retval)
22934 call = gen_rtx_SET (VOIDmode, retval, call);
22935 vec[vec_len++] = call;
22936
22937 if (pop)
22938 {
22939 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
22940 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
22941 vec[vec_len++] = pop;
22942 }
22943
22944 if (TARGET_64BIT_MS_ABI
22945 && (!callarg2 || INTVAL (callarg2) != -2))
22946 {
22947 unsigned i;
22948
22949 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
22950 UNSPEC_MS_TO_SYSV_CALL);
22951
22952 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
22953 vec[vec_len++]
22954 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
22955 ? TImode : DImode,
22956 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
22957 ? TImode : DImode,
22958 clobbered_registers[i]));
22959 }
22960
22961 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
22962 if (TARGET_VZEROUPPER)
22963 {
22964 int avx256;
22965 if (cfun->machine->callee_pass_avx256_p)
22966 {
22967 if (cfun->machine->callee_return_avx256_p)
22968 avx256 = callee_return_pass_avx256;
22969 else
22970 avx256 = callee_pass_avx256;
22971 }
22972 else if (cfun->machine->callee_return_avx256_p)
22973 avx256 = callee_return_avx256;
22974 else
22975 avx256 = call_no_avx256;
22976
22977 if (reload_completed)
22978 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
22979 else
22980 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
22981 gen_rtvec (1, GEN_INT (avx256)),
22982 UNSPEC_CALL_NEEDS_VZEROUPPER);
22983 }
22984
22985 if (vec_len > 1)
22986 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
22987 call = emit_call_insn (call);
22988 if (use)
22989 CALL_INSN_FUNCTION_USAGE (call) = use;
22990
22991 return call;
22992 }
22993
22994 void
22995 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
22996 {
22997 rtx pat = PATTERN (insn);
22998 rtvec vec = XVEC (pat, 0);
22999 int len = GET_NUM_ELEM (vec) - 1;
23000
23001 /* Strip off the last entry of the parallel. */
23002 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23003 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23004 if (len == 1)
23005 pat = RTVEC_ELT (vec, 0);
23006 else
23007 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23008
23009 emit_insn (gen_avx_vzeroupper (vzeroupper));
23010 emit_call_insn (pat);
23011 }
23012
23013 /* Output the assembly for a call instruction. */
23014
23015 const char *
23016 ix86_output_call_insn (rtx insn, rtx call_op)
23017 {
23018 bool direct_p = constant_call_address_operand (call_op, Pmode);
23019 bool seh_nop_p = false;
23020 const char *xasm;
23021
23022 if (SIBLING_CALL_P (insn))
23023 {
23024 if (direct_p)
23025 xasm = "jmp\t%P0";
23026 /* SEH epilogue detection requires the indirect branch case
23027 to include REX.W. */
23028 else if (TARGET_SEH)
23029 xasm = "rex.W jmp %A0";
23030 else
23031 xasm = "jmp\t%A0";
23032
23033 output_asm_insn (xasm, &call_op);
23034 return "";
23035 }
23036
23037 /* SEH unwinding can require an extra nop to be emitted in several
23038 circumstances. Determine if we have one of those. */
23039 if (TARGET_SEH)
23040 {
23041 rtx i;
23042
23043 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23044 {
23045 /* If we get to another real insn, we don't need the nop. */
23046 if (INSN_P (i))
23047 break;
23048
23049 /* If we get to the epilogue note, prevent a catch region from
23050 being adjacent to the standard epilogue sequence. If non-
23051 call-exceptions, we'll have done this during epilogue emission. */
23052 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23053 && !flag_non_call_exceptions
23054 && !can_throw_internal (insn))
23055 {
23056 seh_nop_p = true;
23057 break;
23058 }
23059 }
23060
23061 /* If we didn't find a real insn following the call, prevent the
23062 unwinder from looking into the next function. */
23063 if (i == NULL)
23064 seh_nop_p = true;
23065 }
23066
23067 if (direct_p)
23068 xasm = "call\t%P0";
23069 else
23070 xasm = "call\t%A0";
23071
23072 output_asm_insn (xasm, &call_op);
23073
23074 if (seh_nop_p)
23075 return "nop";
23076
23077 return "";
23078 }
23079 \f
23080 /* Clear stack slot assignments remembered from previous functions.
23081 This is called from INIT_EXPANDERS once before RTL is emitted for each
23082 function. */
23083
23084 static struct machine_function *
23085 ix86_init_machine_status (void)
23086 {
23087 struct machine_function *f;
23088
23089 f = ggc_alloc_cleared_machine_function ();
23090 f->use_fast_prologue_epilogue_nregs = -1;
23091 f->tls_descriptor_call_expanded_p = 0;
23092 f->call_abi = ix86_abi;
23093
23094 return f;
23095 }
23096
23097 /* Return a MEM corresponding to a stack slot with mode MODE.
23098 Allocate a new slot if necessary.
23099
23100 The RTL for a function can have several slots available: N is
23101 which slot to use. */
23102
23103 rtx
23104 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23105 {
23106 struct stack_local_entry *s;
23107
23108 gcc_assert (n < MAX_386_STACK_LOCALS);
23109
23110 /* Virtual slot is valid only before vregs are instantiated. */
23111 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23112
23113 for (s = ix86_stack_locals; s; s = s->next)
23114 if (s->mode == mode && s->n == n)
23115 return validize_mem (copy_rtx (s->rtl));
23116
23117 s = ggc_alloc_stack_local_entry ();
23118 s->n = n;
23119 s->mode = mode;
23120 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23121
23122 s->next = ix86_stack_locals;
23123 ix86_stack_locals = s;
23124 return validize_mem (s->rtl);
23125 }
23126 \f
23127 /* Calculate the length of the memory address in the instruction encoding.
23128 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23129 or other prefixes. */
23130
23131 int
23132 memory_address_length (rtx addr)
23133 {
23134 struct ix86_address parts;
23135 rtx base, index, disp;
23136 int len;
23137 int ok;
23138
23139 if (GET_CODE (addr) == PRE_DEC
23140 || GET_CODE (addr) == POST_INC
23141 || GET_CODE (addr) == PRE_MODIFY
23142 || GET_CODE (addr) == POST_MODIFY)
23143 return 0;
23144
23145 ok = ix86_decompose_address (addr, &parts);
23146 gcc_assert (ok);
23147
23148 if (parts.base && GET_CODE (parts.base) == SUBREG)
23149 parts.base = SUBREG_REG (parts.base);
23150 if (parts.index && GET_CODE (parts.index) == SUBREG)
23151 parts.index = SUBREG_REG (parts.index);
23152
23153 base = parts.base;
23154 index = parts.index;
23155 disp = parts.disp;
23156
23157 /* Add length of addr32 prefix. */
23158 len = (GET_CODE (addr) == ZERO_EXTEND
23159 || GET_CODE (addr) == AND);
23160
23161 /* Rule of thumb:
23162 - esp as the base always wants an index,
23163 - ebp as the base always wants a displacement,
23164 - r12 as the base always wants an index,
23165 - r13 as the base always wants a displacement. */
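/* Concrete examples of the above (x86 encoding facts): (%eax) needs
   neither a displacement nor a SIB byte, (%esp) always needs a SIB byte,
   and (%ebp) cannot be encoded without a displacement, so a one-byte
   zero displacement is used; r12 and r13 behave like esp and ebp in
   64-bit code because they share the same low three bits of the register
   number.  */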
23166
23167 /* Register Indirect. */
23168 if (base && !index && !disp)
23169 {
23170 /* esp (for its index) and ebp (for its displacement) need
23171 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23172 code. */
23173 if (REG_P (addr)
23174 && (addr == arg_pointer_rtx
23175 || addr == frame_pointer_rtx
23176 || REGNO (addr) == SP_REG
23177 || REGNO (addr) == BP_REG
23178 || REGNO (addr) == R12_REG
23179 || REGNO (addr) == R13_REG))
23180 len = 1;
23181 }
23182
23183 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23184 is not disp32, but disp32(%rip), so for disp32
23185 SIB byte is needed, unless print_operand_address
23186 optimizes it into disp32(%rip) or (%rip) is implied
23187 by UNSPEC. */
23188 else if (disp && !base && !index)
23189 {
23190 len = 4;
23191 if (TARGET_64BIT)
23192 {
23193 rtx symbol = disp;
23194
23195 if (GET_CODE (disp) == CONST)
23196 symbol = XEXP (disp, 0);
23197 if (GET_CODE (symbol) == PLUS
23198 && CONST_INT_P (XEXP (symbol, 1)))
23199 symbol = XEXP (symbol, 0);
23200
23201 if (GET_CODE (symbol) != LABEL_REF
23202 && (GET_CODE (symbol) != SYMBOL_REF
23203 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23204 && (GET_CODE (symbol) != UNSPEC
23205 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23206 && XINT (symbol, 1) != UNSPEC_PCREL
23207 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23208 len += 1;
23209 }
23210 }
23211
23212 else
23213 {
23214 /* Find the length of the displacement constant. */
23215 if (disp)
23216 {
23217 if (base && satisfies_constraint_K (disp))
23218 len = 1;
23219 else
23220 len = 4;
23221 }
23222 /* ebp always wants a displacement. Similarly r13. */
23223 else if (base && REG_P (base)
23224 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23225 len = 1;
23226
23227 /* An index requires the two-byte modrm form.... */
23228 if (index
23229 /* ...like esp (or r12), which always wants an index. */
23230 || base == arg_pointer_rtx
23231 || base == frame_pointer_rtx
23232 || (base && REG_P (base)
23233 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23234 len += 1;
23235 }
23236
23237 switch (parts.seg)
23238 {
23239 case SEG_FS:
23240 case SEG_GS:
23241 len += 1;
23242 break;
23243 default:
23244 break;
23245 }
23246
23247 return len;
23248 }
23249
23250 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23251 is set, expect that the insn has an 8-bit immediate alternative. */
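/* For example, "add $3, %eax" can use the sign-extended 8-bit immediate
   form, giving length_immediate 1, while "add $300, %eax" needs a full
   32-bit immediate, giving length_immediate 4.  */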
23252 int
23253 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23254 {
23255 int len = 0;
23256 int i;
23257 extract_insn_cached (insn);
23258 for (i = recog_data.n_operands - 1; i >= 0; --i)
23259 if (CONSTANT_P (recog_data.operand[i]))
23260 {
23261 enum attr_mode mode = get_attr_mode (insn);
23262
23263 gcc_assert (!len);
23264 if (shortform && CONST_INT_P (recog_data.operand[i]))
23265 {
23266 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23267 switch (mode)
23268 {
23269 case MODE_QI:
23270 len = 1;
23271 continue;
23272 case MODE_HI:
23273 ival = trunc_int_for_mode (ival, HImode);
23274 break;
23275 case MODE_SI:
23276 ival = trunc_int_for_mode (ival, SImode);
23277 break;
23278 default:
23279 break;
23280 }
23281 if (IN_RANGE (ival, -128, 127))
23282 {
23283 len = 1;
23284 continue;
23285 }
23286 }
23287 switch (mode)
23288 {
23289 case MODE_QI:
23290 len = 1;
23291 break;
23292 case MODE_HI:
23293 len = 2;
23294 break;
23295 case MODE_SI:
23296 len = 4;
23297 break;
23298 /* Immediates for DImode instructions are encoded as 32bit sign extended values. */
23299 case MODE_DI:
23300 len = 4;
23301 break;
23302 default:
23303 fatal_insn ("unknown insn mode", insn);
23304 }
23305 }
23306 return len;
23307 }
23308 /* Compute default value for "length_address" attribute. */
23309 int
23310 ix86_attr_length_address_default (rtx insn)
23311 {
23312 int i;
23313
23314 if (get_attr_type (insn) == TYPE_LEA)
23315 {
23316 rtx set = PATTERN (insn), addr;
23317
23318 if (GET_CODE (set) == PARALLEL)
23319 set = XVECEXP (set, 0, 0);
23320
23321 gcc_assert (GET_CODE (set) == SET);
23322
23323 addr = SET_SRC (set);
23324 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23325 {
23326 if (GET_CODE (addr) == ZERO_EXTEND)
23327 addr = XEXP (addr, 0);
23328 if (GET_CODE (addr) == SUBREG)
23329 addr = SUBREG_REG (addr);
23330 }
23331
23332 return memory_address_length (addr);
23333 }
23334
23335 extract_insn_cached (insn);
23336 for (i = recog_data.n_operands - 1; i >= 0; --i)
23337 if (MEM_P (recog_data.operand[i]))
23338 {
23339 constrain_operands_cached (reload_completed);
23340 if (which_alternative != -1)
23341 {
23342 const char *constraints = recog_data.constraints[i];
23343 int alt = which_alternative;
23344
23345 while (*constraints == '=' || *constraints == '+')
23346 constraints++;
23347 while (alt-- > 0)
23348 while (*constraints++ != ',')
23349 ;
23350 /* Skip ignored operands. */
23351 if (*constraints == 'X')
23352 continue;
23353 }
23354 return memory_address_length (XEXP (recog_data.operand[i], 0));
23355 }
23356 return 0;
23357 }
23358
23359 /* Compute default value for "length_vex" attribute. It includes
23360 2 or 3 byte VEX prefix and 1 opcode byte. */
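/* For instance, "vaddps %xmm1, %xmm2, %xmm3" fits the 2-byte (C5) VEX
   prefix, so its length_vex is 2 + 1; any use of VEX.W or of registers
   needing REX.X/REX.B (e.g. an %r8-based memory operand in 64-bit code)
   forces the 3-byte (C4) form and a length_vex of 3 + 1.  */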
23361
23362 int
23363 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23364 {
23365 int i;
23366
23367 /* Only the 0f opcode map can use the 2 byte VEX prefix, and the VEX.W
23368 bit requires the 3 byte VEX prefix. */
23369 if (!has_0f_opcode || has_vex_w)
23370 return 3 + 1;
23371
23372 /* We can always use 2 byte VEX prefix in 32bit. */
23373 if (!TARGET_64BIT)
23374 return 2 + 1;
23375
23376 extract_insn_cached (insn);
23377
23378 for (i = recog_data.n_operands - 1; i >= 0; --i)
23379 if (REG_P (recog_data.operand[i]))
23380 {
23381 /* REX.W bit uses 3 byte VEX prefix. */
23382 if (GET_MODE (recog_data.operand[i]) == DImode
23383 && GENERAL_REG_P (recog_data.operand[i]))
23384 return 3 + 1;
23385 }
23386 else
23387 {
23388 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23389 if (MEM_P (recog_data.operand[i])
23390 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23391 return 3 + 1;
23392 }
23393
23394 return 2 + 1;
23395 }
23396 \f
23397 /* Return the maximum number of instructions a cpu can issue. */
23398
23399 static int
23400 ix86_issue_rate (void)
23401 {
23402 switch (ix86_tune)
23403 {
23404 case PROCESSOR_PENTIUM:
23405 case PROCESSOR_ATOM:
23406 case PROCESSOR_K6:
23407 return 2;
23408
23409 case PROCESSOR_PENTIUMPRO:
23410 case PROCESSOR_PENTIUM4:
23411 case PROCESSOR_CORE2_32:
23412 case PROCESSOR_CORE2_64:
23413 case PROCESSOR_COREI7_32:
23414 case PROCESSOR_COREI7_64:
23415 case PROCESSOR_ATHLON:
23416 case PROCESSOR_K8:
23417 case PROCESSOR_AMDFAM10:
23418 case PROCESSOR_NOCONA:
23419 case PROCESSOR_GENERIC32:
23420 case PROCESSOR_GENERIC64:
23421 case PROCESSOR_BDVER1:
23422 case PROCESSOR_BDVER2:
23423 case PROCESSOR_BTVER1:
23424 return 3;
23425
23426 default:
23427 return 1;
23428 }
23429 }
23430
23431 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
23432 by DEP_INSN and nothing else set by DEP_INSN. */
23433
23434 static bool
23435 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23436 {
23437 rtx set, set2;
23438
23439 /* Simplify the test for uninteresting insns. */
23440 if (insn_type != TYPE_SETCC
23441 && insn_type != TYPE_ICMOV
23442 && insn_type != TYPE_FCMOV
23443 && insn_type != TYPE_IBR)
23444 return false;
23445
23446 if ((set = single_set (dep_insn)) != 0)
23447 {
23448 set = SET_DEST (set);
23449 set2 = NULL_RTX;
23450 }
23451 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23452 && XVECLEN (PATTERN (dep_insn), 0) == 2
23453 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23454 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23455 {
23456 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23457 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23458 }
23459 else
23460 return false;
23461
23462 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23463 return false;
23464
23465 /* This test is true if the dependent insn reads the flags but
23466 not any other potentially set register. */
23467 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23468 return false;
23469
23470 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23471 return false;
23472
23473 return true;
23474 }
23475
23476 /* Return true iff USE_INSN has a memory address with operands set by
23477 SET_INSN. */
23478
23479 bool
23480 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23481 {
23482 int i;
23483 extract_insn_cached (use_insn);
23484 for (i = recog_data.n_operands - 1; i >= 0; --i)
23485 if (MEM_P (recog_data.operand[i]))
23486 {
23487 rtx addr = XEXP (recog_data.operand[i], 0);
23488 return modified_in_p (addr, set_insn) != 0;
23489 }
23490 return false;
23491 }
23492
23493 static int
23494 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23495 {
23496 enum attr_type insn_type, dep_insn_type;
23497 enum attr_memory memory;
23498 rtx set, set2;
23499 int dep_insn_code_number;
23500
23501 /* Anti and output dependencies have zero cost on all CPUs. */
23502 if (REG_NOTE_KIND (link) != 0)
23503 return 0;
23504
23505 dep_insn_code_number = recog_memoized (dep_insn);
23506
23507 /* If we can't recognize the insns, we can't really do anything. */
23508 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23509 return cost;
23510
23511 insn_type = get_attr_type (insn);
23512 dep_insn_type = get_attr_type (dep_insn);
23513
23514 switch (ix86_tune)
23515 {
23516 case PROCESSOR_PENTIUM:
23517 /* Address Generation Interlock adds a cycle of latency. */
23518 if (insn_type == TYPE_LEA)
23519 {
23520 rtx addr = PATTERN (insn);
23521
23522 if (GET_CODE (addr) == PARALLEL)
23523 addr = XVECEXP (addr, 0, 0);
23524
23525 gcc_assert (GET_CODE (addr) == SET);
23526
23527 addr = SET_SRC (addr);
23528 if (modified_in_p (addr, dep_insn))
23529 cost += 1;
23530 }
23531 else if (ix86_agi_dependent (dep_insn, insn))
23532 cost += 1;
23533
23534 /* ??? Compares pair with jump/setcc. */
23535 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23536 cost = 0;
23537
23538 /* Floating point stores require value to be ready one cycle earlier. */
23539 if (insn_type == TYPE_FMOV
23540 && get_attr_memory (insn) == MEMORY_STORE
23541 && !ix86_agi_dependent (dep_insn, insn))
23542 cost += 1;
23543 break;
23544
23545 case PROCESSOR_PENTIUMPRO:
23546 memory = get_attr_memory (insn);
23547
23548 /* INT->FP conversion is expensive. */
23549 if (get_attr_fp_int_src (dep_insn))
23550 cost += 5;
23551
23552 /* There is one cycle extra latency between an FP op and a store. */
23553 if (insn_type == TYPE_FMOV
23554 && (set = single_set (dep_insn)) != NULL_RTX
23555 && (set2 = single_set (insn)) != NULL_RTX
23556 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23557 && MEM_P (SET_DEST (set2)))
23558 cost += 1;
23559
23560 /* Show ability of reorder buffer to hide latency of load by executing
23561 in parallel with previous instruction in case
23562 previous instruction is not needed to compute the address. */
23563 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23564 && !ix86_agi_dependent (dep_insn, insn))
23565 {
23566 /* Claim moves to take one cycle, as the core can issue one load
23567 at a time and the next load can start a cycle later. */
23568 if (dep_insn_type == TYPE_IMOV
23569 || dep_insn_type == TYPE_FMOV)
23570 cost = 1;
23571 else if (cost > 1)
23572 cost--;
23573 }
23574 break;
23575
23576 case PROCESSOR_K6:
23577 memory = get_attr_memory (insn);
23578
23579 /* The esp dependency is resolved before the instruction is really
23580 finished. */
23581 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23582 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23583 return 1;
23584
23585 /* INT->FP conversion is expensive. */
23586 if (get_attr_fp_int_src (dep_insn))
23587 cost += 5;
23588
23589 /* Model the ability of the reorder buffer to hide the latency of a load
23590 by executing it in parallel with the previous instruction, when the
23591 previous instruction is not needed to compute the address. */
23592 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23593 && !ix86_agi_dependent (dep_insn, insn))
23594 {
23595 /* Claim moves to take one cycle, as the core can issue one load
23596 at a time and the next load can start a cycle later. */
23597 if (dep_insn_type == TYPE_IMOV
23598 || dep_insn_type == TYPE_FMOV)
23599 cost = 1;
23600 else if (cost > 2)
23601 cost -= 2;
23602 else
23603 cost = 1;
23604 }
23605 break;
23606
23607 case PROCESSOR_ATHLON:
23608 case PROCESSOR_K8:
23609 case PROCESSOR_AMDFAM10:
23610 case PROCESSOR_BDVER1:
23611 case PROCESSOR_BDVER2:
23612 case PROCESSOR_BTVER1:
23613 case PROCESSOR_ATOM:
23614 case PROCESSOR_GENERIC32:
23615 case PROCESSOR_GENERIC64:
23616 memory = get_attr_memory (insn);
23617
23618 /* Model the ability of the reorder buffer to hide the latency of a load
23619 by executing it in parallel with the previous instruction, when the
23620 previous instruction is not needed to compute the address. */
23621 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23622 && !ix86_agi_dependent (dep_insn, insn))
23623 {
23624 enum attr_unit unit = get_attr_unit (insn);
23625 int loadcost = 3;
23626
23627 /* Because of the difference between the length of integer and
23628 floating unit pipeline preparation stages, the memory operands
23629 for floating point are cheaper.
23630
23631 ??? For Athlon the difference is most probably 2. */
23632 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23633 loadcost = 3;
23634 else
23635 loadcost = TARGET_ATHLON ? 2 : 0;
23636
23637 if (cost >= loadcost)
23638 cost -= loadcost;
23639 else
23640 cost = 0;
23641 }
23642
23643 default:
23644 break;
23645 }
23646
23647 return cost;
23648 }
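/* An illustrative example of the adjustment above (the operands are
   hypothetical): if DEP_INSN sets %eax and INSN is a load such as
   "movl (%eax), %edx", then on PROCESSOR_PENTIUM ix86_agi_dependent
   detects the address-generation interlock and one cycle is added to
   the dependence cost; on the out-of-order targets (Athlon/K8 and
   later) a load whose address does not depend on DEP_INSN instead has
   up to 3 cycles (2 for Athlon's floating point unit) subtracted,
   modeling the load latency that the reorder buffer can hide.  */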
23649
23650 /* How many alternative schedules to try. This should be as wide as the
23651 scheduling freedom in the DFA, but no wider. Making this value too
23652 large results in extra work for the scheduler. */
23653
23654 static int
23655 ia32_multipass_dfa_lookahead (void)
23656 {
23657 switch (ix86_tune)
23658 {
23659 case PROCESSOR_PENTIUM:
23660 return 2;
23661
23662 case PROCESSOR_PENTIUMPRO:
23663 case PROCESSOR_K6:
23664 return 1;
23665
23666 case PROCESSOR_CORE2_32:
23667 case PROCESSOR_CORE2_64:
23668 case PROCESSOR_COREI7_32:
23669 case PROCESSOR_COREI7_64:
23670 /* Generally, we want haifa-sched:max_issue() to look ahead as far
23671 as the number of instructions that can be executed in one cycle, i.e.,
23672 issue_rate. I wonder why tuning for many CPUs does not do this. */
23673 return ix86_issue_rate ();
23674
23675 default:
23676 return 0;
23677 }
23678 }
23679
23680 \f
23681
23682 /* Model the decoder of Core 2/i7.
23683 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
23684 track the instruction fetch block boundaries and make sure that long
23685 (9+ byte) instructions are assigned to decoder D0. */
23686
23687 /* Maximum length of an insn that can be handled by
23688 a secondary decoder unit. '8' for Core 2/i7. */
23689 static int core2i7_secondary_decoder_max_insn_size;
23690
23691 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
23692 '16' for Core 2/i7. */
23693 static int core2i7_ifetch_block_size;
23694
23695 /* Maximum number of instructions decoder can handle per cycle.
23696 '6' for Core 2/i7. */
23697 static int core2i7_ifetch_block_max_insns;
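/* Taken together, these parameters mean that on any one cycle the
   multipass scheduler may pick at most core2i7_ifetch_block_max_insns
   insns whose encoded lengths sum to no more than
   core2i7_ifetch_block_size bytes, and only the first insn chosen on a
   cycle may be longer than core2i7_secondary_decoder_max_insn_size
   bytes, since long insns must go to decoder D0 (see the filter
   function below).  */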
23698
23699 typedef struct ix86_first_cycle_multipass_data_ *
23700 ix86_first_cycle_multipass_data_t;
23701 typedef const struct ix86_first_cycle_multipass_data_ *
23702 const_ix86_first_cycle_multipass_data_t;
23703
23704 /* A variable to store target state across calls to max_issue within
23705 one cycle. */
23706 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
23707 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
23708
23709 /* Initialize DATA. */
23710 static void
23711 core2i7_first_cycle_multipass_init (void *_data)
23712 {
23713 ix86_first_cycle_multipass_data_t data
23714 = (ix86_first_cycle_multipass_data_t) _data;
23715
23716 data->ifetch_block_len = 0;
23717 data->ifetch_block_n_insns = 0;
23718 data->ready_try_change = NULL;
23719 data->ready_try_change_size = 0;
23720 }
23721
23722 /* Advancing the cycle; reset ifetch block counts. */
23723 static void
23724 core2i7_dfa_post_advance_cycle (void)
23725 {
23726 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
23727
23728 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23729
23730 data->ifetch_block_len = 0;
23731 data->ifetch_block_n_insns = 0;
23732 }
23733
23734 static int min_insn_size (rtx);
23735
23736 /* Filter out insns from ready_try that the core will not be able to issue
23737 on the current cycle due to decoder restrictions. */
23738 static void
23739 core2i7_first_cycle_multipass_filter_ready_try
23740 (const_ix86_first_cycle_multipass_data_t data,
23741 char *ready_try, int n_ready, bool first_cycle_insn_p)
23742 {
23743 while (n_ready--)
23744 {
23745 rtx insn;
23746 int insn_size;
23747
23748 if (ready_try[n_ready])
23749 continue;
23750
23751 insn = get_ready_element (n_ready);
23752 insn_size = min_insn_size (insn);
23753
23754 if (/* If this is too long an insn for a secondary decoder ... */
23755 (!first_cycle_insn_p
23756 && insn_size > core2i7_secondary_decoder_max_insn_size)
23757 /* ... or it would not fit into the ifetch block ... */
23758 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
23759 /* ... or the decoder is full already ... */
23760 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
23761 /* ... mask the insn out. */
23762 {
23763 ready_try[n_ready] = 1;
23764
23765 if (data->ready_try_change)
23766 SET_BIT (data->ready_try_change, n_ready);
23767 }
23768 }
23769 }
23770
23771 /* Prepare for a new round of multipass lookahead scheduling. */
23772 static void
23773 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
23774 bool first_cycle_insn_p)
23775 {
23776 ix86_first_cycle_multipass_data_t data
23777 = (ix86_first_cycle_multipass_data_t) _data;
23778 const_ix86_first_cycle_multipass_data_t prev_data
23779 = ix86_first_cycle_multipass_data;
23780
23781 /* Restore the state from the end of the previous round. */
23782 data->ifetch_block_len = prev_data->ifetch_block_len;
23783 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
23784
23785 /* Filter instructions that cannot be issued on the current cycle due to
23786 decoder restrictions. */
23787 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23788 first_cycle_insn_p);
23789 }
23790
23791 /* INSN is being issued in the current solution. Account for its impact on
23792 the decoder model. */
23793 static void
23794 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
23795 rtx insn, const void *_prev_data)
23796 {
23797 ix86_first_cycle_multipass_data_t data
23798 = (ix86_first_cycle_multipass_data_t) _data;
23799 const_ix86_first_cycle_multipass_data_t prev_data
23800 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
23801
23802 int insn_size = min_insn_size (insn);
23803
23804 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
23805 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
23806 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
23807 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23808
23809 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
23810 if (!data->ready_try_change)
23811 {
23812 data->ready_try_change = sbitmap_alloc (n_ready);
23813 data->ready_try_change_size = n_ready;
23814 }
23815 else if (data->ready_try_change_size < n_ready)
23816 {
23817 data->ready_try_change = sbitmap_resize (data->ready_try_change,
23818 n_ready, 0);
23819 data->ready_try_change_size = n_ready;
23820 }
23821 sbitmap_zero (data->ready_try_change);
23822
23823 /* Filter out insns from ready_try that the core will not be able to issue
23824 on the current cycle due to decoder restrictions. */
23825 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23826 false);
23827 }
23828
23829 /* Revert the effect on ready_try. */
23830 static void
23831 core2i7_first_cycle_multipass_backtrack (const void *_data,
23832 char *ready_try,
23833 int n_ready ATTRIBUTE_UNUSED)
23834 {
23835 const_ix86_first_cycle_multipass_data_t data
23836 = (const_ix86_first_cycle_multipass_data_t) _data;
23837 unsigned int i = 0;
23838 sbitmap_iterator sbi;
23839
23840 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
23841 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
23842 {
23843 ready_try[i] = 0;
23844 }
23845 }
23846
23847 /* Save the result of multipass lookahead scheduling for the next round. */
23848 static void
23849 core2i7_first_cycle_multipass_end (const void *_data)
23850 {
23851 const_ix86_first_cycle_multipass_data_t data
23852 = (const_ix86_first_cycle_multipass_data_t) _data;
23853 ix86_first_cycle_multipass_data_t next_data
23854 = ix86_first_cycle_multipass_data;
23855
23856 if (data != NULL)
23857 {
23858 next_data->ifetch_block_len = data->ifetch_block_len;
23859 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
23860 }
23861 }
23862
23863 /* Deallocate target data. */
23864 static void
23865 core2i7_first_cycle_multipass_fini (void *_data)
23866 {
23867 ix86_first_cycle_multipass_data_t data
23868 = (ix86_first_cycle_multipass_data_t) _data;
23869
23870 if (data->ready_try_change)
23871 {
23872 sbitmap_free (data->ready_try_change);
23873 data->ready_try_change = NULL;
23874 data->ready_try_change_size = 0;
23875 }
23876 }
23877
23878 /* Prepare for scheduling pass. */
23879 static void
23880 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
23881 int verbose ATTRIBUTE_UNUSED,
23882 int max_uid ATTRIBUTE_UNUSED)
23883 {
23884 /* Install scheduling hooks for the current CPU. Some of these hooks are used
23885 in time-critical parts of the scheduler, so we only set them up when
23886 they are actually used. */
23887 switch (ix86_tune)
23888 {
23889 case PROCESSOR_CORE2_32:
23890 case PROCESSOR_CORE2_64:
23891 case PROCESSOR_COREI7_32:
23892 case PROCESSOR_COREI7_64:
23893 targetm.sched.dfa_post_advance_cycle
23894 = core2i7_dfa_post_advance_cycle;
23895 targetm.sched.first_cycle_multipass_init
23896 = core2i7_first_cycle_multipass_init;
23897 targetm.sched.first_cycle_multipass_begin
23898 = core2i7_first_cycle_multipass_begin;
23899 targetm.sched.first_cycle_multipass_issue
23900 = core2i7_first_cycle_multipass_issue;
23901 targetm.sched.first_cycle_multipass_backtrack
23902 = core2i7_first_cycle_multipass_backtrack;
23903 targetm.sched.first_cycle_multipass_end
23904 = core2i7_first_cycle_multipass_end;
23905 targetm.sched.first_cycle_multipass_fini
23906 = core2i7_first_cycle_multipass_fini;
23907
23908 /* Set decoder parameters. */
23909 core2i7_secondary_decoder_max_insn_size = 8;
23910 core2i7_ifetch_block_size = 16;
23911 core2i7_ifetch_block_max_insns = 6;
23912 break;
23913
23914 default:
23915 targetm.sched.dfa_post_advance_cycle = NULL;
23916 targetm.sched.first_cycle_multipass_init = NULL;
23917 targetm.sched.first_cycle_multipass_begin = NULL;
23918 targetm.sched.first_cycle_multipass_issue = NULL;
23919 targetm.sched.first_cycle_multipass_backtrack = NULL;
23920 targetm.sched.first_cycle_multipass_end = NULL;
23921 targetm.sched.first_cycle_multipass_fini = NULL;
23922 break;
23923 }
23924 }
23925
23926 \f
23927 /* Compute the alignment given to a constant that is being placed in memory.
23928 EXP is the constant and ALIGN is the alignment that the object would
23929 ordinarily have.
23930 The value of this function is used instead of that alignment to align
23931 the object. */
23932
23933 int
23934 ix86_constant_alignment (tree exp, int align)
23935 {
23936 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
23937 || TREE_CODE (exp) == INTEGER_CST)
23938 {
23939 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
23940 return 64;
23941 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
23942 return 128;
23943 }
23944 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
23945 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
23946 return BITS_PER_WORD;
23947
23948 return align;
23949 }
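/* For example: a DFmode (double) constant placed in memory is given at
   least 64-bit alignment by the function above, an XFmode or SSE
   vector constant at least 128-bit alignment, and, unless optimizing
   for size, a string constant whose TREE_STRING_LENGTH is at least 31
   is given word alignment.  */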
23950
23951 /* Compute the alignment for a static variable.
23952 TYPE is the data type, and ALIGN is the alignment that
23953 the object would ordinarily have. The value of this function is used
23954 instead of that alignment to align the object. */
23955
23956 int
23957 ix86_data_alignment (tree type, int align)
23958 {
23959 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
23960
23961 if (AGGREGATE_TYPE_P (type)
23962 && TYPE_SIZE (type)
23963 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23964 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
23965 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
23966 && align < max_align)
23967 align = max_align;
23968
23969 /* The x86-64 ABI requires arrays of at least 16 bytes to be aligned
23970 to a 16-byte boundary. */
23971 if (TARGET_64BIT)
23972 {
23973 if (AGGREGATE_TYPE_P (type)
23974 && TYPE_SIZE (type)
23975 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23976 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
23977 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
23978 return 128;
23979 }
23980
23981 if (TREE_CODE (type) == ARRAY_TYPE)
23982 {
23983 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
23984 return 64;
23985 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
23986 return 128;
23987 }
23988 else if (TREE_CODE (type) == COMPLEX_TYPE)
23989 {
23990
23991 if (TYPE_MODE (type) == DCmode && align < 64)
23992 return 64;
23993 if ((TYPE_MODE (type) == XCmode
23994 || TYPE_MODE (type) == TCmode) && align < 128)
23995 return 128;
23996 }
23997 else if ((TREE_CODE (type) == RECORD_TYPE
23998 || TREE_CODE (type) == UNION_TYPE
23999 || TREE_CODE (type) == QUAL_UNION_TYPE)
24000 && TYPE_FIELDS (type))
24001 {
24002 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24003 return 64;
24004 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24005 return 128;
24006 }
24007 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24008 || TREE_CODE (type) == INTEGER_TYPE)
24009 {
24010 if (TYPE_MODE (type) == DFmode && align < 64)
24011 return 64;
24012 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24013 return 128;
24014 }
24015
24016 return align;
24017 }
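/* For example (an illustrative case): on x86-64 a file-scope
   "static char buf[16];" satisfies the 128-bit aggregate rule above and
   is emitted with 16-byte alignment, so aligned SSE accesses can be
   used on it, while aggregates of 32 bytes or more are normally raised
   to the 256-bit max_align when not optimizing for size.  */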
24018
24019 /* Compute the alignment for a local variable or a stack slot. EXP is
24020 the data type or decl itself, MODE is the widest mode available and
24021 ALIGN is the alignment that the object would ordinarily have. The
24022 value of this macro is used instead of that alignment to align the
24023 object. */
24024
24025 unsigned int
24026 ix86_local_alignment (tree exp, enum machine_mode mode,
24027 unsigned int align)
24028 {
24029 tree type, decl;
24030
24031 if (exp && DECL_P (exp))
24032 {
24033 type = TREE_TYPE (exp);
24034 decl = exp;
24035 }
24036 else
24037 {
24038 type = exp;
24039 decl = NULL;
24040 }
24041
24042 /* Don't do dynamic stack realignment for long long objects with
24043 -mpreferred-stack-boundary=2. */
24044 if (!TARGET_64BIT
24045 && align == 64
24046 && ix86_preferred_stack_boundary < 64
24047 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24048 && (!type || !TYPE_USER_ALIGN (type))
24049 && (!decl || !DECL_USER_ALIGN (decl)))
24050 align = 32;
24051
24052 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
24053 register in MODE. We will return the larger of the XFmode and
24054 DFmode alignments. */
24055 if (!type)
24056 {
24057 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24058 align = GET_MODE_ALIGNMENT (DFmode);
24059 return align;
24060 }
24061
24062 /* The x86-64 ABI requires arrays of at least 16 bytes to be aligned
24063 to a 16-byte boundary. The exact wording is:
24064
24065 An array uses the same alignment as its elements, except that a local or
24066 global array variable of length at least 16 bytes or
24067 a C99 variable-length array variable always has alignment of at least 16 bytes.
24068
24069 This rule was added to allow the use of aligned SSE instructions on
24070 arrays. It is meant for static storage (where the compiler cannot do
24071 the analysis by itself). We follow it for automatic variables only
24072 when convenient: we fully control everything in the function being
24073 compiled, and functions from other units cannot rely on the alignment.
24074
24075 Exclude the va_list type. It is the common case of a local array for
24076 which we cannot benefit from the alignment. */
24077 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24078 && TARGET_SSE)
24079 {
24080 if (AGGREGATE_TYPE_P (type)
24081 && (va_list_type_node == NULL_TREE
24082 || (TYPE_MAIN_VARIANT (type)
24083 != TYPE_MAIN_VARIANT (va_list_type_node)))
24084 && TYPE_SIZE (type)
24085 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24086 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24087 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24088 return 128;
24089 }
24090 if (TREE_CODE (type) == ARRAY_TYPE)
24091 {
24092 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24093 return 64;
24094 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24095 return 128;
24096 }
24097 else if (TREE_CODE (type) == COMPLEX_TYPE)
24098 {
24099 if (TYPE_MODE (type) == DCmode && align < 64)
24100 return 64;
24101 if ((TYPE_MODE (type) == XCmode
24102 || TYPE_MODE (type) == TCmode) && align < 128)
24103 return 128;
24104 }
24105 else if ((TREE_CODE (type) == RECORD_TYPE
24106 || TREE_CODE (type) == UNION_TYPE
24107 || TREE_CODE (type) == QUAL_UNION_TYPE)
24108 && TYPE_FIELDS (type))
24109 {
24110 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24111 return 64;
24112 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24113 return 128;
24114 }
24115 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24116 || TREE_CODE (type) == INTEGER_TYPE)
24117 {
24118
24119 if (TYPE_MODE (type) == DFmode && align < 64)
24120 return 64;
24121 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24122 return 128;
24123 }
24124 return align;
24125 }
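/* For example: with -m32 -mpreferred-stack-boundary=2 a "long long"
   local without user-specified alignment is dropped from 64-bit to
   32-bit alignment above, so it alone does not force dynamic stack
   realignment; conversely, on x86-64 with SSE enabled and when
   optimizing for speed, an automatic aggregate other than a va_list is
   raised to 16-byte alignment so that aligned SSE accesses can be
   used.  */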
24126
24127 /* Compute the minimum required alignment for dynamic stack realignment
24128 purposes for a local variable, parameter or a stack slot. EXP is
24129 the data type or decl itself, MODE is its mode and ALIGN is the
24130 alignment that the object would ordinarily have. */
24131
24132 unsigned int
24133 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24134 unsigned int align)
24135 {
24136 tree type, decl;
24137
24138 if (exp && DECL_P (exp))
24139 {
24140 type = TREE_TYPE (exp);
24141 decl = exp;
24142 }
24143 else
24144 {
24145 type = exp;
24146 decl = NULL;
24147 }
24148
24149 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24150 return align;
24151
24152 /* Don't do dynamic stack realignment for long long objects with
24153 -mpreferred-stack-boundary=2. */
24154 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24155 && (!type || !TYPE_USER_ALIGN (type))
24156 && (!decl || !DECL_USER_ALIGN (decl)))
24157 return 32;
24158
24159 return align;
24160 }
24161 \f
24162 /* Find a location for the static chain incoming to a nested function.
24163 This is a register, unless all free registers are used by arguments. */
24164
24165 static rtx
24166 ix86_static_chain (const_tree fndecl, bool incoming_p)
24167 {
24168 unsigned regno;
24169
24170 if (!DECL_STATIC_CHAIN (fndecl))
24171 return NULL;
24172
24173 if (TARGET_64BIT)
24174 {
24175 /* We always use R10 in 64-bit mode. */
24176 regno = R10_REG;
24177 }
24178 else
24179 {
24180 tree fntype;
24181 unsigned int ccvt;
24182
24183 /* By default in 32-bit mode we use ECX to pass the static chain. */
24184 regno = CX_REG;
24185
24186 fntype = TREE_TYPE (fndecl);
24187 ccvt = ix86_get_callcvt (fntype);
24188 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24189 {
24190 /* Fastcall functions use ecx/edx for arguments, which leaves
24191 us with EAX for the static chain.
24192 Thiscall functions use ecx for arguments, which also
24193 leaves us with EAX for the static chain. */
24194 regno = AX_REG;
24195 }
24196 else if (ix86_function_regparm (fntype, fndecl) == 3)
24197 {
24198 /* For regparm 3, we have no free call-clobbered registers in
24199 which to store the static chain. In order to implement this,
24200 we have the trampoline push the static chain to the stack.
24201 However, we can't push a value below the return address when
24202 we call the nested function directly, so we have to use an
24203 alternate entry point. For this we use ESI, and have the
24204 alternate entry point push ESI, so that things appear the
24205 same once we're executing the nested function. */
24206 if (incoming_p)
24207 {
24208 if (fndecl == current_function_decl)
24209 ix86_static_chain_on_stack = true;
24210 return gen_frame_mem (SImode,
24211 plus_constant (arg_pointer_rtx, -8));
24212 }
24213 regno = SI_REG;
24214 }
24215 }
24216
24217 return gen_rtx_REG (Pmode, regno);
24218 }
24219
24220 /* Emit RTL insns to initialize the variable parts of a trampoline.
24221 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24222 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24223 to be passed to the target function. */
24224
24225 static void
24226 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24227 {
24228 rtx mem, fnaddr;
24229 int opcode;
24230 int offset = 0;
24231
24232 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24233
24234 if (TARGET_64BIT)
24235 {
24236 int size;
24237
24238 /* Load the function address into r11. Try to load the address using
24239 the shorter movl instead of movabs. We may want to support
24240 movq for kernel mode, but the kernel does not use trampolines at
24241 the moment. */
24242 if (x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24243 {
24244 fnaddr = copy_to_mode_reg (DImode, fnaddr);
24245
24246 mem = adjust_address (m_tramp, HImode, offset);
24247 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24248
24249 mem = adjust_address (m_tramp, SImode, offset + 2);
24250 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24251 offset += 6;
24252 }
24253 else
24254 {
24255 mem = adjust_address (m_tramp, HImode, offset);
24256 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24257
24258 mem = adjust_address (m_tramp, DImode, offset + 2);
24259 emit_move_insn (mem, fnaddr);
24260 offset += 10;
24261 }
24262
24263 /* Load the static chain into r10 using movabs. Use the shorter
24264 movl instead of movabs for x32. */
24265 if (TARGET_X32)
24266 {
24267 opcode = 0xba41;
24268 size = 6;
24269 }
24270 else
24271 {
24272 opcode = 0xba49;
24273 size = 10;
24274 }
24275
24276 mem = adjust_address (m_tramp, HImode, offset);
24277 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24278
24279 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24280 emit_move_insn (mem, chain_value);
24281 offset += size;
24282
24283 /* Jump to r11; the last (unused) byte is a nop, only there to
24284 pad the write out to a single 32-bit store. */
24285 mem = adjust_address (m_tramp, SImode, offset);
24286 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24287 offset += 4;
24288 }
24289 else
24290 {
24291 rtx disp, chain;
24292
24293 /* Depending on the static chain location, either load a register
24294 with a constant, or push the constant to the stack. All of the
24295 instructions are the same size. */
24296 chain = ix86_static_chain (fndecl, true);
24297 if (REG_P (chain))
24298 {
24299 switch (REGNO (chain))
24300 {
24301 case AX_REG:
24302 opcode = 0xb8; break;
24303 case CX_REG:
24304 opcode = 0xb9; break;
24305 default:
24306 gcc_unreachable ();
24307 }
24308 }
24309 else
24310 opcode = 0x68;
24311
24312 mem = adjust_address (m_tramp, QImode, offset);
24313 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24314
24315 mem = adjust_address (m_tramp, SImode, offset + 1);
24316 emit_move_insn (mem, chain_value);
24317 offset += 5;
24318
24319 mem = adjust_address (m_tramp, QImode, offset);
24320 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24321
24322 mem = adjust_address (m_tramp, SImode, offset + 1);
24323
24324 /* Compute the offset from the end of the jmp to the target function.
24325 When the trampoline stores the static chain on the stack, we need
24326 to skip the first insn, which pushes the (call-saved) static chain
24327 register; this push is 1 byte. */
24328 offset += 5;
24329 disp = expand_binop (SImode, sub_optab, fnaddr,
24330 plus_constant (XEXP (m_tramp, 0),
24331 offset - (MEM_P (chain) ? 1 : 0)),
24332 NULL_RTX, 1, OPTAB_DIRECT);
24333 emit_move_insn (mem, disp);
24334 }
24335
24336 gcc_assert (offset <= TRAMPOLINE_SIZE);
24337
24338 #ifdef HAVE_ENABLE_EXECUTE_STACK
24339 #ifdef CHECK_EXECUTE_STACK_ENABLED
24340 if (CHECK_EXECUTE_STACK_ENABLED)
24341 #endif
24342 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24343 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24344 #endif
24345 }
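/* A rough sketch of the 64-bit trampoline emitted above (bytes as they
   end up in memory, offsets for the movabs variant):

      0:  49 bb <8-byte fnaddr>    movabs $fnaddr, %r11
     10:  49 ba <8-byte chain>     movabs $chain_value, %r10
     20:  49 ff e3                 jmp    *%r11
     23:  90                       nop (pads the final 32-bit store)

   When the function address fits in 32 bits, the first insn is instead
   the 6-byte "41 bb <imm32>" (movl $fnaddr, %r11d), and on x32 the
   static chain load likewise uses the 6-byte "41 ba <imm32>" form.
   The 32-bit trampoline is simply a mov or push of the chain value
   (b8/b9/68 imm32) followed by a relative jmp (e9 rel32).  */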
24346 \f
24347 /* The following file contains several enumerations and data structures
24348 built from the definitions in i386-builtin-types.def. */
24349
24350 #include "i386-builtin-types.inc"
24351
24352 /* Table for the ix86 builtin non-function types. */
24353 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24354
24355 /* Retrieve an element from the above table, building some of
24356 the types lazily. */
24357
24358 static tree
24359 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24360 {
24361 unsigned int index;
24362 tree type, itype;
24363
24364 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24365
24366 type = ix86_builtin_type_tab[(int) tcode];
24367 if (type != NULL)
24368 return type;
24369
24370 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24371 if (tcode <= IX86_BT_LAST_VECT)
24372 {
24373 enum machine_mode mode;
24374
24375 index = tcode - IX86_BT_LAST_PRIM - 1;
24376 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24377 mode = ix86_builtin_type_vect_mode[index];
24378
24379 type = build_vector_type_for_mode (itype, mode);
24380 }
24381 else
24382 {
24383 int quals;
24384
24385 index = tcode - IX86_BT_LAST_VECT - 1;
24386 if (tcode <= IX86_BT_LAST_PTR)
24387 quals = TYPE_UNQUALIFIED;
24388 else
24389 quals = TYPE_QUAL_CONST;
24390
24391 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24392 if (quals != TYPE_UNQUALIFIED)
24393 itype = build_qualified_type (itype, quals);
24394
24395 type = build_pointer_type (itype);
24396 }
24397
24398 ix86_builtin_type_tab[(int) tcode] = type;
24399 return type;
24400 }
24401
24402 /* Table for the ix86 builtin function types. */
24403 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24404
24405 /* Retrieve an element from the above table, building some of
24406 the types lazily. */
24407
24408 static tree
24409 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24410 {
24411 tree type;
24412
24413 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24414
24415 type = ix86_builtin_func_type_tab[(int) tcode];
24416 if (type != NULL)
24417 return type;
24418
24419 if (tcode <= IX86_BT_LAST_FUNC)
24420 {
24421 unsigned start = ix86_builtin_func_start[(int) tcode];
24422 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24423 tree rtype, atype, args = void_list_node;
24424 unsigned i;
24425
24426 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24427 for (i = after - 1; i > start; --i)
24428 {
24429 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24430 args = tree_cons (NULL, atype, args);
24431 }
24432
24433 type = build_function_type (rtype, args);
24434 }
24435 else
24436 {
24437 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24438 enum ix86_builtin_func_type icode;
24439
24440 icode = ix86_builtin_func_alias_base[index];
24441 type = ix86_get_builtin_func_type (icode);
24442 }
24443
24444 ix86_builtin_func_type_tab[(int) tcode] = type;
24445 return type;
24446 }
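/* A sketch of the table walk above: for a function type whose slots in
   ix86_builtin_func_args are { return, arg1, ..., argN }, RTYPE is
   built from the first slot and the remaining slots are walked
   backwards; because tree_cons prepends, the resulting argument list
   comes out in source order and is terminated by void_list_node.  */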
24447
24448
24449 /* Codes for all the SSE/MMX builtins. */
24450 enum ix86_builtins
24451 {
24452 IX86_BUILTIN_ADDPS,
24453 IX86_BUILTIN_ADDSS,
24454 IX86_BUILTIN_DIVPS,
24455 IX86_BUILTIN_DIVSS,
24456 IX86_BUILTIN_MULPS,
24457 IX86_BUILTIN_MULSS,
24458 IX86_BUILTIN_SUBPS,
24459 IX86_BUILTIN_SUBSS,
24460
24461 IX86_BUILTIN_CMPEQPS,
24462 IX86_BUILTIN_CMPLTPS,
24463 IX86_BUILTIN_CMPLEPS,
24464 IX86_BUILTIN_CMPGTPS,
24465 IX86_BUILTIN_CMPGEPS,
24466 IX86_BUILTIN_CMPNEQPS,
24467 IX86_BUILTIN_CMPNLTPS,
24468 IX86_BUILTIN_CMPNLEPS,
24469 IX86_BUILTIN_CMPNGTPS,
24470 IX86_BUILTIN_CMPNGEPS,
24471 IX86_BUILTIN_CMPORDPS,
24472 IX86_BUILTIN_CMPUNORDPS,
24473 IX86_BUILTIN_CMPEQSS,
24474 IX86_BUILTIN_CMPLTSS,
24475 IX86_BUILTIN_CMPLESS,
24476 IX86_BUILTIN_CMPNEQSS,
24477 IX86_BUILTIN_CMPNLTSS,
24478 IX86_BUILTIN_CMPNLESS,
24479 IX86_BUILTIN_CMPNGTSS,
24480 IX86_BUILTIN_CMPNGESS,
24481 IX86_BUILTIN_CMPORDSS,
24482 IX86_BUILTIN_CMPUNORDSS,
24483
24484 IX86_BUILTIN_COMIEQSS,
24485 IX86_BUILTIN_COMILTSS,
24486 IX86_BUILTIN_COMILESS,
24487 IX86_BUILTIN_COMIGTSS,
24488 IX86_BUILTIN_COMIGESS,
24489 IX86_BUILTIN_COMINEQSS,
24490 IX86_BUILTIN_UCOMIEQSS,
24491 IX86_BUILTIN_UCOMILTSS,
24492 IX86_BUILTIN_UCOMILESS,
24493 IX86_BUILTIN_UCOMIGTSS,
24494 IX86_BUILTIN_UCOMIGESS,
24495 IX86_BUILTIN_UCOMINEQSS,
24496
24497 IX86_BUILTIN_CVTPI2PS,
24498 IX86_BUILTIN_CVTPS2PI,
24499 IX86_BUILTIN_CVTSI2SS,
24500 IX86_BUILTIN_CVTSI642SS,
24501 IX86_BUILTIN_CVTSS2SI,
24502 IX86_BUILTIN_CVTSS2SI64,
24503 IX86_BUILTIN_CVTTPS2PI,
24504 IX86_BUILTIN_CVTTSS2SI,
24505 IX86_BUILTIN_CVTTSS2SI64,
24506
24507 IX86_BUILTIN_MAXPS,
24508 IX86_BUILTIN_MAXSS,
24509 IX86_BUILTIN_MINPS,
24510 IX86_BUILTIN_MINSS,
24511
24512 IX86_BUILTIN_LOADUPS,
24513 IX86_BUILTIN_STOREUPS,
24514 IX86_BUILTIN_MOVSS,
24515
24516 IX86_BUILTIN_MOVHLPS,
24517 IX86_BUILTIN_MOVLHPS,
24518 IX86_BUILTIN_LOADHPS,
24519 IX86_BUILTIN_LOADLPS,
24520 IX86_BUILTIN_STOREHPS,
24521 IX86_BUILTIN_STORELPS,
24522
24523 IX86_BUILTIN_MASKMOVQ,
24524 IX86_BUILTIN_MOVMSKPS,
24525 IX86_BUILTIN_PMOVMSKB,
24526
24527 IX86_BUILTIN_MOVNTPS,
24528 IX86_BUILTIN_MOVNTQ,
24529
24530 IX86_BUILTIN_LOADDQU,
24531 IX86_BUILTIN_STOREDQU,
24532
24533 IX86_BUILTIN_PACKSSWB,
24534 IX86_BUILTIN_PACKSSDW,
24535 IX86_BUILTIN_PACKUSWB,
24536
24537 IX86_BUILTIN_PADDB,
24538 IX86_BUILTIN_PADDW,
24539 IX86_BUILTIN_PADDD,
24540 IX86_BUILTIN_PADDQ,
24541 IX86_BUILTIN_PADDSB,
24542 IX86_BUILTIN_PADDSW,
24543 IX86_BUILTIN_PADDUSB,
24544 IX86_BUILTIN_PADDUSW,
24545 IX86_BUILTIN_PSUBB,
24546 IX86_BUILTIN_PSUBW,
24547 IX86_BUILTIN_PSUBD,
24548 IX86_BUILTIN_PSUBQ,
24549 IX86_BUILTIN_PSUBSB,
24550 IX86_BUILTIN_PSUBSW,
24551 IX86_BUILTIN_PSUBUSB,
24552 IX86_BUILTIN_PSUBUSW,
24553
24554 IX86_BUILTIN_PAND,
24555 IX86_BUILTIN_PANDN,
24556 IX86_BUILTIN_POR,
24557 IX86_BUILTIN_PXOR,
24558
24559 IX86_BUILTIN_PAVGB,
24560 IX86_BUILTIN_PAVGW,
24561
24562 IX86_BUILTIN_PCMPEQB,
24563 IX86_BUILTIN_PCMPEQW,
24564 IX86_BUILTIN_PCMPEQD,
24565 IX86_BUILTIN_PCMPGTB,
24566 IX86_BUILTIN_PCMPGTW,
24567 IX86_BUILTIN_PCMPGTD,
24568
24569 IX86_BUILTIN_PMADDWD,
24570
24571 IX86_BUILTIN_PMAXSW,
24572 IX86_BUILTIN_PMAXUB,
24573 IX86_BUILTIN_PMINSW,
24574 IX86_BUILTIN_PMINUB,
24575
24576 IX86_BUILTIN_PMULHUW,
24577 IX86_BUILTIN_PMULHW,
24578 IX86_BUILTIN_PMULLW,
24579
24580 IX86_BUILTIN_PSADBW,
24581 IX86_BUILTIN_PSHUFW,
24582
24583 IX86_BUILTIN_PSLLW,
24584 IX86_BUILTIN_PSLLD,
24585 IX86_BUILTIN_PSLLQ,
24586 IX86_BUILTIN_PSRAW,
24587 IX86_BUILTIN_PSRAD,
24588 IX86_BUILTIN_PSRLW,
24589 IX86_BUILTIN_PSRLD,
24590 IX86_BUILTIN_PSRLQ,
24591 IX86_BUILTIN_PSLLWI,
24592 IX86_BUILTIN_PSLLDI,
24593 IX86_BUILTIN_PSLLQI,
24594 IX86_BUILTIN_PSRAWI,
24595 IX86_BUILTIN_PSRADI,
24596 IX86_BUILTIN_PSRLWI,
24597 IX86_BUILTIN_PSRLDI,
24598 IX86_BUILTIN_PSRLQI,
24599
24600 IX86_BUILTIN_PUNPCKHBW,
24601 IX86_BUILTIN_PUNPCKHWD,
24602 IX86_BUILTIN_PUNPCKHDQ,
24603 IX86_BUILTIN_PUNPCKLBW,
24604 IX86_BUILTIN_PUNPCKLWD,
24605 IX86_BUILTIN_PUNPCKLDQ,
24606
24607 IX86_BUILTIN_SHUFPS,
24608
24609 IX86_BUILTIN_RCPPS,
24610 IX86_BUILTIN_RCPSS,
24611 IX86_BUILTIN_RSQRTPS,
24612 IX86_BUILTIN_RSQRTPS_NR,
24613 IX86_BUILTIN_RSQRTSS,
24614 IX86_BUILTIN_RSQRTF,
24615 IX86_BUILTIN_SQRTPS,
24616 IX86_BUILTIN_SQRTPS_NR,
24617 IX86_BUILTIN_SQRTSS,
24618
24619 IX86_BUILTIN_UNPCKHPS,
24620 IX86_BUILTIN_UNPCKLPS,
24621
24622 IX86_BUILTIN_ANDPS,
24623 IX86_BUILTIN_ANDNPS,
24624 IX86_BUILTIN_ORPS,
24625 IX86_BUILTIN_XORPS,
24626
24627 IX86_BUILTIN_EMMS,
24628 IX86_BUILTIN_LDMXCSR,
24629 IX86_BUILTIN_STMXCSR,
24630 IX86_BUILTIN_SFENCE,
24631
24632 /* 3DNow! Original */
24633 IX86_BUILTIN_FEMMS,
24634 IX86_BUILTIN_PAVGUSB,
24635 IX86_BUILTIN_PF2ID,
24636 IX86_BUILTIN_PFACC,
24637 IX86_BUILTIN_PFADD,
24638 IX86_BUILTIN_PFCMPEQ,
24639 IX86_BUILTIN_PFCMPGE,
24640 IX86_BUILTIN_PFCMPGT,
24641 IX86_BUILTIN_PFMAX,
24642 IX86_BUILTIN_PFMIN,
24643 IX86_BUILTIN_PFMUL,
24644 IX86_BUILTIN_PFRCP,
24645 IX86_BUILTIN_PFRCPIT1,
24646 IX86_BUILTIN_PFRCPIT2,
24647 IX86_BUILTIN_PFRSQIT1,
24648 IX86_BUILTIN_PFRSQRT,
24649 IX86_BUILTIN_PFSUB,
24650 IX86_BUILTIN_PFSUBR,
24651 IX86_BUILTIN_PI2FD,
24652 IX86_BUILTIN_PMULHRW,
24653
24654 /* 3DNow! Athlon Extensions */
24655 IX86_BUILTIN_PF2IW,
24656 IX86_BUILTIN_PFNACC,
24657 IX86_BUILTIN_PFPNACC,
24658 IX86_BUILTIN_PI2FW,
24659 IX86_BUILTIN_PSWAPDSI,
24660 IX86_BUILTIN_PSWAPDSF,
24661
24662 /* SSE2 */
24663 IX86_BUILTIN_ADDPD,
24664 IX86_BUILTIN_ADDSD,
24665 IX86_BUILTIN_DIVPD,
24666 IX86_BUILTIN_DIVSD,
24667 IX86_BUILTIN_MULPD,
24668 IX86_BUILTIN_MULSD,
24669 IX86_BUILTIN_SUBPD,
24670 IX86_BUILTIN_SUBSD,
24671
24672 IX86_BUILTIN_CMPEQPD,
24673 IX86_BUILTIN_CMPLTPD,
24674 IX86_BUILTIN_CMPLEPD,
24675 IX86_BUILTIN_CMPGTPD,
24676 IX86_BUILTIN_CMPGEPD,
24677 IX86_BUILTIN_CMPNEQPD,
24678 IX86_BUILTIN_CMPNLTPD,
24679 IX86_BUILTIN_CMPNLEPD,
24680 IX86_BUILTIN_CMPNGTPD,
24681 IX86_BUILTIN_CMPNGEPD,
24682 IX86_BUILTIN_CMPORDPD,
24683 IX86_BUILTIN_CMPUNORDPD,
24684 IX86_BUILTIN_CMPEQSD,
24685 IX86_BUILTIN_CMPLTSD,
24686 IX86_BUILTIN_CMPLESD,
24687 IX86_BUILTIN_CMPNEQSD,
24688 IX86_BUILTIN_CMPNLTSD,
24689 IX86_BUILTIN_CMPNLESD,
24690 IX86_BUILTIN_CMPORDSD,
24691 IX86_BUILTIN_CMPUNORDSD,
24692
24693 IX86_BUILTIN_COMIEQSD,
24694 IX86_BUILTIN_COMILTSD,
24695 IX86_BUILTIN_COMILESD,
24696 IX86_BUILTIN_COMIGTSD,
24697 IX86_BUILTIN_COMIGESD,
24698 IX86_BUILTIN_COMINEQSD,
24699 IX86_BUILTIN_UCOMIEQSD,
24700 IX86_BUILTIN_UCOMILTSD,
24701 IX86_BUILTIN_UCOMILESD,
24702 IX86_BUILTIN_UCOMIGTSD,
24703 IX86_BUILTIN_UCOMIGESD,
24704 IX86_BUILTIN_UCOMINEQSD,
24705
24706 IX86_BUILTIN_MAXPD,
24707 IX86_BUILTIN_MAXSD,
24708 IX86_BUILTIN_MINPD,
24709 IX86_BUILTIN_MINSD,
24710
24711 IX86_BUILTIN_ANDPD,
24712 IX86_BUILTIN_ANDNPD,
24713 IX86_BUILTIN_ORPD,
24714 IX86_BUILTIN_XORPD,
24715
24716 IX86_BUILTIN_SQRTPD,
24717 IX86_BUILTIN_SQRTSD,
24718
24719 IX86_BUILTIN_UNPCKHPD,
24720 IX86_BUILTIN_UNPCKLPD,
24721
24722 IX86_BUILTIN_SHUFPD,
24723
24724 IX86_BUILTIN_LOADUPD,
24725 IX86_BUILTIN_STOREUPD,
24726 IX86_BUILTIN_MOVSD,
24727
24728 IX86_BUILTIN_LOADHPD,
24729 IX86_BUILTIN_LOADLPD,
24730
24731 IX86_BUILTIN_CVTDQ2PD,
24732 IX86_BUILTIN_CVTDQ2PS,
24733
24734 IX86_BUILTIN_CVTPD2DQ,
24735 IX86_BUILTIN_CVTPD2PI,
24736 IX86_BUILTIN_CVTPD2PS,
24737 IX86_BUILTIN_CVTTPD2DQ,
24738 IX86_BUILTIN_CVTTPD2PI,
24739
24740 IX86_BUILTIN_CVTPI2PD,
24741 IX86_BUILTIN_CVTSI2SD,
24742 IX86_BUILTIN_CVTSI642SD,
24743
24744 IX86_BUILTIN_CVTSD2SI,
24745 IX86_BUILTIN_CVTSD2SI64,
24746 IX86_BUILTIN_CVTSD2SS,
24747 IX86_BUILTIN_CVTSS2SD,
24748 IX86_BUILTIN_CVTTSD2SI,
24749 IX86_BUILTIN_CVTTSD2SI64,
24750
24751 IX86_BUILTIN_CVTPS2DQ,
24752 IX86_BUILTIN_CVTPS2PD,
24753 IX86_BUILTIN_CVTTPS2DQ,
24754
24755 IX86_BUILTIN_MOVNTI,
24756 IX86_BUILTIN_MOVNTI64,
24757 IX86_BUILTIN_MOVNTPD,
24758 IX86_BUILTIN_MOVNTDQ,
24759
24760 IX86_BUILTIN_MOVQ128,
24761
24762 /* SSE2 MMX */
24763 IX86_BUILTIN_MASKMOVDQU,
24764 IX86_BUILTIN_MOVMSKPD,
24765 IX86_BUILTIN_PMOVMSKB128,
24766
24767 IX86_BUILTIN_PACKSSWB128,
24768 IX86_BUILTIN_PACKSSDW128,
24769 IX86_BUILTIN_PACKUSWB128,
24770
24771 IX86_BUILTIN_PADDB128,
24772 IX86_BUILTIN_PADDW128,
24773 IX86_BUILTIN_PADDD128,
24774 IX86_BUILTIN_PADDQ128,
24775 IX86_BUILTIN_PADDSB128,
24776 IX86_BUILTIN_PADDSW128,
24777 IX86_BUILTIN_PADDUSB128,
24778 IX86_BUILTIN_PADDUSW128,
24779 IX86_BUILTIN_PSUBB128,
24780 IX86_BUILTIN_PSUBW128,
24781 IX86_BUILTIN_PSUBD128,
24782 IX86_BUILTIN_PSUBQ128,
24783 IX86_BUILTIN_PSUBSB128,
24784 IX86_BUILTIN_PSUBSW128,
24785 IX86_BUILTIN_PSUBUSB128,
24786 IX86_BUILTIN_PSUBUSW128,
24787
24788 IX86_BUILTIN_PAND128,
24789 IX86_BUILTIN_PANDN128,
24790 IX86_BUILTIN_POR128,
24791 IX86_BUILTIN_PXOR128,
24792
24793 IX86_BUILTIN_PAVGB128,
24794 IX86_BUILTIN_PAVGW128,
24795
24796 IX86_BUILTIN_PCMPEQB128,
24797 IX86_BUILTIN_PCMPEQW128,
24798 IX86_BUILTIN_PCMPEQD128,
24799 IX86_BUILTIN_PCMPGTB128,
24800 IX86_BUILTIN_PCMPGTW128,
24801 IX86_BUILTIN_PCMPGTD128,
24802
24803 IX86_BUILTIN_PMADDWD128,
24804
24805 IX86_BUILTIN_PMAXSW128,
24806 IX86_BUILTIN_PMAXUB128,
24807 IX86_BUILTIN_PMINSW128,
24808 IX86_BUILTIN_PMINUB128,
24809
24810 IX86_BUILTIN_PMULUDQ,
24811 IX86_BUILTIN_PMULUDQ128,
24812 IX86_BUILTIN_PMULHUW128,
24813 IX86_BUILTIN_PMULHW128,
24814 IX86_BUILTIN_PMULLW128,
24815
24816 IX86_BUILTIN_PSADBW128,
24817 IX86_BUILTIN_PSHUFHW,
24818 IX86_BUILTIN_PSHUFLW,
24819 IX86_BUILTIN_PSHUFD,
24820
24821 IX86_BUILTIN_PSLLDQI128,
24822 IX86_BUILTIN_PSLLWI128,
24823 IX86_BUILTIN_PSLLDI128,
24824 IX86_BUILTIN_PSLLQI128,
24825 IX86_BUILTIN_PSRAWI128,
24826 IX86_BUILTIN_PSRADI128,
24827 IX86_BUILTIN_PSRLDQI128,
24828 IX86_BUILTIN_PSRLWI128,
24829 IX86_BUILTIN_PSRLDI128,
24830 IX86_BUILTIN_PSRLQI128,
24831
24832 IX86_BUILTIN_PSLLDQ128,
24833 IX86_BUILTIN_PSLLW128,
24834 IX86_BUILTIN_PSLLD128,
24835 IX86_BUILTIN_PSLLQ128,
24836 IX86_BUILTIN_PSRAW128,
24837 IX86_BUILTIN_PSRAD128,
24838 IX86_BUILTIN_PSRLW128,
24839 IX86_BUILTIN_PSRLD128,
24840 IX86_BUILTIN_PSRLQ128,
24841
24842 IX86_BUILTIN_PUNPCKHBW128,
24843 IX86_BUILTIN_PUNPCKHWD128,
24844 IX86_BUILTIN_PUNPCKHDQ128,
24845 IX86_BUILTIN_PUNPCKHQDQ128,
24846 IX86_BUILTIN_PUNPCKLBW128,
24847 IX86_BUILTIN_PUNPCKLWD128,
24848 IX86_BUILTIN_PUNPCKLDQ128,
24849 IX86_BUILTIN_PUNPCKLQDQ128,
24850
24851 IX86_BUILTIN_CLFLUSH,
24852 IX86_BUILTIN_MFENCE,
24853 IX86_BUILTIN_LFENCE,
24854 IX86_BUILTIN_PAUSE,
24855
24856 IX86_BUILTIN_BSRSI,
24857 IX86_BUILTIN_BSRDI,
24858 IX86_BUILTIN_RDPMC,
24859 IX86_BUILTIN_RDTSC,
24860 IX86_BUILTIN_RDTSCP,
24861 IX86_BUILTIN_ROLQI,
24862 IX86_BUILTIN_ROLHI,
24863 IX86_BUILTIN_RORQI,
24864 IX86_BUILTIN_RORHI,
24865
24866 /* SSE3. */
24867 IX86_BUILTIN_ADDSUBPS,
24868 IX86_BUILTIN_HADDPS,
24869 IX86_BUILTIN_HSUBPS,
24870 IX86_BUILTIN_MOVSHDUP,
24871 IX86_BUILTIN_MOVSLDUP,
24872 IX86_BUILTIN_ADDSUBPD,
24873 IX86_BUILTIN_HADDPD,
24874 IX86_BUILTIN_HSUBPD,
24875 IX86_BUILTIN_LDDQU,
24876
24877 IX86_BUILTIN_MONITOR,
24878 IX86_BUILTIN_MWAIT,
24879
24880 /* SSSE3. */
24881 IX86_BUILTIN_PHADDW,
24882 IX86_BUILTIN_PHADDD,
24883 IX86_BUILTIN_PHADDSW,
24884 IX86_BUILTIN_PHSUBW,
24885 IX86_BUILTIN_PHSUBD,
24886 IX86_BUILTIN_PHSUBSW,
24887 IX86_BUILTIN_PMADDUBSW,
24888 IX86_BUILTIN_PMULHRSW,
24889 IX86_BUILTIN_PSHUFB,
24890 IX86_BUILTIN_PSIGNB,
24891 IX86_BUILTIN_PSIGNW,
24892 IX86_BUILTIN_PSIGND,
24893 IX86_BUILTIN_PALIGNR,
24894 IX86_BUILTIN_PABSB,
24895 IX86_BUILTIN_PABSW,
24896 IX86_BUILTIN_PABSD,
24897
24898 IX86_BUILTIN_PHADDW128,
24899 IX86_BUILTIN_PHADDD128,
24900 IX86_BUILTIN_PHADDSW128,
24901 IX86_BUILTIN_PHSUBW128,
24902 IX86_BUILTIN_PHSUBD128,
24903 IX86_BUILTIN_PHSUBSW128,
24904 IX86_BUILTIN_PMADDUBSW128,
24905 IX86_BUILTIN_PMULHRSW128,
24906 IX86_BUILTIN_PSHUFB128,
24907 IX86_BUILTIN_PSIGNB128,
24908 IX86_BUILTIN_PSIGNW128,
24909 IX86_BUILTIN_PSIGND128,
24910 IX86_BUILTIN_PALIGNR128,
24911 IX86_BUILTIN_PABSB128,
24912 IX86_BUILTIN_PABSW128,
24913 IX86_BUILTIN_PABSD128,
24914
24915 /* AMDFAM10 - SSE4A New Instructions. */
24916 IX86_BUILTIN_MOVNTSD,
24917 IX86_BUILTIN_MOVNTSS,
24918 IX86_BUILTIN_EXTRQI,
24919 IX86_BUILTIN_EXTRQ,
24920 IX86_BUILTIN_INSERTQI,
24921 IX86_BUILTIN_INSERTQ,
24922
24923 /* SSE4.1. */
24924 IX86_BUILTIN_BLENDPD,
24925 IX86_BUILTIN_BLENDPS,
24926 IX86_BUILTIN_BLENDVPD,
24927 IX86_BUILTIN_BLENDVPS,
24928 IX86_BUILTIN_PBLENDVB128,
24929 IX86_BUILTIN_PBLENDW128,
24930
24931 IX86_BUILTIN_DPPD,
24932 IX86_BUILTIN_DPPS,
24933
24934 IX86_BUILTIN_INSERTPS128,
24935
24936 IX86_BUILTIN_MOVNTDQA,
24937 IX86_BUILTIN_MPSADBW128,
24938 IX86_BUILTIN_PACKUSDW128,
24939 IX86_BUILTIN_PCMPEQQ,
24940 IX86_BUILTIN_PHMINPOSUW128,
24941
24942 IX86_BUILTIN_PMAXSB128,
24943 IX86_BUILTIN_PMAXSD128,
24944 IX86_BUILTIN_PMAXUD128,
24945 IX86_BUILTIN_PMAXUW128,
24946
24947 IX86_BUILTIN_PMINSB128,
24948 IX86_BUILTIN_PMINSD128,
24949 IX86_BUILTIN_PMINUD128,
24950 IX86_BUILTIN_PMINUW128,
24951
24952 IX86_BUILTIN_PMOVSXBW128,
24953 IX86_BUILTIN_PMOVSXBD128,
24954 IX86_BUILTIN_PMOVSXBQ128,
24955 IX86_BUILTIN_PMOVSXWD128,
24956 IX86_BUILTIN_PMOVSXWQ128,
24957 IX86_BUILTIN_PMOVSXDQ128,
24958
24959 IX86_BUILTIN_PMOVZXBW128,
24960 IX86_BUILTIN_PMOVZXBD128,
24961 IX86_BUILTIN_PMOVZXBQ128,
24962 IX86_BUILTIN_PMOVZXWD128,
24963 IX86_BUILTIN_PMOVZXWQ128,
24964 IX86_BUILTIN_PMOVZXDQ128,
24965
24966 IX86_BUILTIN_PMULDQ128,
24967 IX86_BUILTIN_PMULLD128,
24968
24969 IX86_BUILTIN_ROUNDSD,
24970 IX86_BUILTIN_ROUNDSS,
24971
24972 IX86_BUILTIN_ROUNDPD,
24973 IX86_BUILTIN_ROUNDPS,
24974
24975 IX86_BUILTIN_FLOORPD,
24976 IX86_BUILTIN_CEILPD,
24977 IX86_BUILTIN_TRUNCPD,
24978 IX86_BUILTIN_RINTPD,
24979 IX86_BUILTIN_ROUNDPD_AZ,
24980
24981 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
24982 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
24983 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
24984
24985 IX86_BUILTIN_FLOORPS,
24986 IX86_BUILTIN_CEILPS,
24987 IX86_BUILTIN_TRUNCPS,
24988 IX86_BUILTIN_RINTPS,
24989 IX86_BUILTIN_ROUNDPS_AZ,
24990
24991 IX86_BUILTIN_FLOORPS_SFIX,
24992 IX86_BUILTIN_CEILPS_SFIX,
24993 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
24994
24995 IX86_BUILTIN_PTESTZ,
24996 IX86_BUILTIN_PTESTC,
24997 IX86_BUILTIN_PTESTNZC,
24998
24999 IX86_BUILTIN_VEC_INIT_V2SI,
25000 IX86_BUILTIN_VEC_INIT_V4HI,
25001 IX86_BUILTIN_VEC_INIT_V8QI,
25002 IX86_BUILTIN_VEC_EXT_V2DF,
25003 IX86_BUILTIN_VEC_EXT_V2DI,
25004 IX86_BUILTIN_VEC_EXT_V4SF,
25005 IX86_BUILTIN_VEC_EXT_V4SI,
25006 IX86_BUILTIN_VEC_EXT_V8HI,
25007 IX86_BUILTIN_VEC_EXT_V2SI,
25008 IX86_BUILTIN_VEC_EXT_V4HI,
25009 IX86_BUILTIN_VEC_EXT_V16QI,
25010 IX86_BUILTIN_VEC_SET_V2DI,
25011 IX86_BUILTIN_VEC_SET_V4SF,
25012 IX86_BUILTIN_VEC_SET_V4SI,
25013 IX86_BUILTIN_VEC_SET_V8HI,
25014 IX86_BUILTIN_VEC_SET_V4HI,
25015 IX86_BUILTIN_VEC_SET_V16QI,
25016
25017 IX86_BUILTIN_VEC_PACK_SFIX,
25018 IX86_BUILTIN_VEC_PACK_SFIX256,
25019
25020 /* SSE4.2. */
25021 IX86_BUILTIN_CRC32QI,
25022 IX86_BUILTIN_CRC32HI,
25023 IX86_BUILTIN_CRC32SI,
25024 IX86_BUILTIN_CRC32DI,
25025
25026 IX86_BUILTIN_PCMPESTRI128,
25027 IX86_BUILTIN_PCMPESTRM128,
25028 IX86_BUILTIN_PCMPESTRA128,
25029 IX86_BUILTIN_PCMPESTRC128,
25030 IX86_BUILTIN_PCMPESTRO128,
25031 IX86_BUILTIN_PCMPESTRS128,
25032 IX86_BUILTIN_PCMPESTRZ128,
25033 IX86_BUILTIN_PCMPISTRI128,
25034 IX86_BUILTIN_PCMPISTRM128,
25035 IX86_BUILTIN_PCMPISTRA128,
25036 IX86_BUILTIN_PCMPISTRC128,
25037 IX86_BUILTIN_PCMPISTRO128,
25038 IX86_BUILTIN_PCMPISTRS128,
25039 IX86_BUILTIN_PCMPISTRZ128,
25040
25041 IX86_BUILTIN_PCMPGTQ,
25042
25043 /* AES instructions */
25044 IX86_BUILTIN_AESENC128,
25045 IX86_BUILTIN_AESENCLAST128,
25046 IX86_BUILTIN_AESDEC128,
25047 IX86_BUILTIN_AESDECLAST128,
25048 IX86_BUILTIN_AESIMC128,
25049 IX86_BUILTIN_AESKEYGENASSIST128,
25050
25051 /* PCLMUL instruction */
25052 IX86_BUILTIN_PCLMULQDQ128,
25053
25054 /* AVX */
25055 IX86_BUILTIN_ADDPD256,
25056 IX86_BUILTIN_ADDPS256,
25057 IX86_BUILTIN_ADDSUBPD256,
25058 IX86_BUILTIN_ADDSUBPS256,
25059 IX86_BUILTIN_ANDPD256,
25060 IX86_BUILTIN_ANDPS256,
25061 IX86_BUILTIN_ANDNPD256,
25062 IX86_BUILTIN_ANDNPS256,
25063 IX86_BUILTIN_BLENDPD256,
25064 IX86_BUILTIN_BLENDPS256,
25065 IX86_BUILTIN_BLENDVPD256,
25066 IX86_BUILTIN_BLENDVPS256,
25067 IX86_BUILTIN_DIVPD256,
25068 IX86_BUILTIN_DIVPS256,
25069 IX86_BUILTIN_DPPS256,
25070 IX86_BUILTIN_HADDPD256,
25071 IX86_BUILTIN_HADDPS256,
25072 IX86_BUILTIN_HSUBPD256,
25073 IX86_BUILTIN_HSUBPS256,
25074 IX86_BUILTIN_MAXPD256,
25075 IX86_BUILTIN_MAXPS256,
25076 IX86_BUILTIN_MINPD256,
25077 IX86_BUILTIN_MINPS256,
25078 IX86_BUILTIN_MULPD256,
25079 IX86_BUILTIN_MULPS256,
25080 IX86_BUILTIN_ORPD256,
25081 IX86_BUILTIN_ORPS256,
25082 IX86_BUILTIN_SHUFPD256,
25083 IX86_BUILTIN_SHUFPS256,
25084 IX86_BUILTIN_SUBPD256,
25085 IX86_BUILTIN_SUBPS256,
25086 IX86_BUILTIN_XORPD256,
25087 IX86_BUILTIN_XORPS256,
25088 IX86_BUILTIN_CMPSD,
25089 IX86_BUILTIN_CMPSS,
25090 IX86_BUILTIN_CMPPD,
25091 IX86_BUILTIN_CMPPS,
25092 IX86_BUILTIN_CMPPD256,
25093 IX86_BUILTIN_CMPPS256,
25094 IX86_BUILTIN_CVTDQ2PD256,
25095 IX86_BUILTIN_CVTDQ2PS256,
25096 IX86_BUILTIN_CVTPD2PS256,
25097 IX86_BUILTIN_CVTPS2DQ256,
25098 IX86_BUILTIN_CVTPS2PD256,
25099 IX86_BUILTIN_CVTTPD2DQ256,
25100 IX86_BUILTIN_CVTPD2DQ256,
25101 IX86_BUILTIN_CVTTPS2DQ256,
25102 IX86_BUILTIN_EXTRACTF128PD256,
25103 IX86_BUILTIN_EXTRACTF128PS256,
25104 IX86_BUILTIN_EXTRACTF128SI256,
25105 IX86_BUILTIN_VZEROALL,
25106 IX86_BUILTIN_VZEROUPPER,
25107 IX86_BUILTIN_VPERMILVARPD,
25108 IX86_BUILTIN_VPERMILVARPS,
25109 IX86_BUILTIN_VPERMILVARPD256,
25110 IX86_BUILTIN_VPERMILVARPS256,
25111 IX86_BUILTIN_VPERMILPD,
25112 IX86_BUILTIN_VPERMILPS,
25113 IX86_BUILTIN_VPERMILPD256,
25114 IX86_BUILTIN_VPERMILPS256,
25115 IX86_BUILTIN_VPERMIL2PD,
25116 IX86_BUILTIN_VPERMIL2PS,
25117 IX86_BUILTIN_VPERMIL2PD256,
25118 IX86_BUILTIN_VPERMIL2PS256,
25119 IX86_BUILTIN_VPERM2F128PD256,
25120 IX86_BUILTIN_VPERM2F128PS256,
25121 IX86_BUILTIN_VPERM2F128SI256,
25122 IX86_BUILTIN_VBROADCASTSS,
25123 IX86_BUILTIN_VBROADCASTSD256,
25124 IX86_BUILTIN_VBROADCASTSS256,
25125 IX86_BUILTIN_VBROADCASTPD256,
25126 IX86_BUILTIN_VBROADCASTPS256,
25127 IX86_BUILTIN_VINSERTF128PD256,
25128 IX86_BUILTIN_VINSERTF128PS256,
25129 IX86_BUILTIN_VINSERTF128SI256,
25130 IX86_BUILTIN_LOADUPD256,
25131 IX86_BUILTIN_LOADUPS256,
25132 IX86_BUILTIN_STOREUPD256,
25133 IX86_BUILTIN_STOREUPS256,
25134 IX86_BUILTIN_LDDQU256,
25135 IX86_BUILTIN_MOVNTDQ256,
25136 IX86_BUILTIN_MOVNTPD256,
25137 IX86_BUILTIN_MOVNTPS256,
25138 IX86_BUILTIN_LOADDQU256,
25139 IX86_BUILTIN_STOREDQU256,
25140 IX86_BUILTIN_MASKLOADPD,
25141 IX86_BUILTIN_MASKLOADPS,
25142 IX86_BUILTIN_MASKSTOREPD,
25143 IX86_BUILTIN_MASKSTOREPS,
25144 IX86_BUILTIN_MASKLOADPD256,
25145 IX86_BUILTIN_MASKLOADPS256,
25146 IX86_BUILTIN_MASKSTOREPD256,
25147 IX86_BUILTIN_MASKSTOREPS256,
25148 IX86_BUILTIN_MOVSHDUP256,
25149 IX86_BUILTIN_MOVSLDUP256,
25150 IX86_BUILTIN_MOVDDUP256,
25151
25152 IX86_BUILTIN_SQRTPD256,
25153 IX86_BUILTIN_SQRTPS256,
25154 IX86_BUILTIN_SQRTPS_NR256,
25155 IX86_BUILTIN_RSQRTPS256,
25156 IX86_BUILTIN_RSQRTPS_NR256,
25157
25158 IX86_BUILTIN_RCPPS256,
25159
25160 IX86_BUILTIN_ROUNDPD256,
25161 IX86_BUILTIN_ROUNDPS256,
25162
25163 IX86_BUILTIN_FLOORPD256,
25164 IX86_BUILTIN_CEILPD256,
25165 IX86_BUILTIN_TRUNCPD256,
25166 IX86_BUILTIN_RINTPD256,
25167 IX86_BUILTIN_ROUNDPD_AZ256,
25168
25169 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25170 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25171 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25172
25173 IX86_BUILTIN_FLOORPS256,
25174 IX86_BUILTIN_CEILPS256,
25175 IX86_BUILTIN_TRUNCPS256,
25176 IX86_BUILTIN_RINTPS256,
25177 IX86_BUILTIN_ROUNDPS_AZ256,
25178
25179 IX86_BUILTIN_FLOORPS_SFIX256,
25180 IX86_BUILTIN_CEILPS_SFIX256,
25181 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25182
25183 IX86_BUILTIN_UNPCKHPD256,
25184 IX86_BUILTIN_UNPCKLPD256,
25185 IX86_BUILTIN_UNPCKHPS256,
25186 IX86_BUILTIN_UNPCKLPS256,
25187
25188 IX86_BUILTIN_SI256_SI,
25189 IX86_BUILTIN_PS256_PS,
25190 IX86_BUILTIN_PD256_PD,
25191 IX86_BUILTIN_SI_SI256,
25192 IX86_BUILTIN_PS_PS256,
25193 IX86_BUILTIN_PD_PD256,
25194
25195 IX86_BUILTIN_VTESTZPD,
25196 IX86_BUILTIN_VTESTCPD,
25197 IX86_BUILTIN_VTESTNZCPD,
25198 IX86_BUILTIN_VTESTZPS,
25199 IX86_BUILTIN_VTESTCPS,
25200 IX86_BUILTIN_VTESTNZCPS,
25201 IX86_BUILTIN_VTESTZPD256,
25202 IX86_BUILTIN_VTESTCPD256,
25203 IX86_BUILTIN_VTESTNZCPD256,
25204 IX86_BUILTIN_VTESTZPS256,
25205 IX86_BUILTIN_VTESTCPS256,
25206 IX86_BUILTIN_VTESTNZCPS256,
25207 IX86_BUILTIN_PTESTZ256,
25208 IX86_BUILTIN_PTESTC256,
25209 IX86_BUILTIN_PTESTNZC256,
25210
25211 IX86_BUILTIN_MOVMSKPD256,
25212 IX86_BUILTIN_MOVMSKPS256,
25213
25214 /* AVX2 */
25215 IX86_BUILTIN_MPSADBW256,
25216 IX86_BUILTIN_PABSB256,
25217 IX86_BUILTIN_PABSW256,
25218 IX86_BUILTIN_PABSD256,
25219 IX86_BUILTIN_PACKSSDW256,
25220 IX86_BUILTIN_PACKSSWB256,
25221 IX86_BUILTIN_PACKUSDW256,
25222 IX86_BUILTIN_PACKUSWB256,
25223 IX86_BUILTIN_PADDB256,
25224 IX86_BUILTIN_PADDW256,
25225 IX86_BUILTIN_PADDD256,
25226 IX86_BUILTIN_PADDQ256,
25227 IX86_BUILTIN_PADDSB256,
25228 IX86_BUILTIN_PADDSW256,
25229 IX86_BUILTIN_PADDUSB256,
25230 IX86_BUILTIN_PADDUSW256,
25231 IX86_BUILTIN_PALIGNR256,
25232 IX86_BUILTIN_AND256I,
25233 IX86_BUILTIN_ANDNOT256I,
25234 IX86_BUILTIN_PAVGB256,
25235 IX86_BUILTIN_PAVGW256,
25236 IX86_BUILTIN_PBLENDVB256,
25237 IX86_BUILTIN_PBLENDVW256,
25238 IX86_BUILTIN_PCMPEQB256,
25239 IX86_BUILTIN_PCMPEQW256,
25240 IX86_BUILTIN_PCMPEQD256,
25241 IX86_BUILTIN_PCMPEQQ256,
25242 IX86_BUILTIN_PCMPGTB256,
25243 IX86_BUILTIN_PCMPGTW256,
25244 IX86_BUILTIN_PCMPGTD256,
25245 IX86_BUILTIN_PCMPGTQ256,
25246 IX86_BUILTIN_PHADDW256,
25247 IX86_BUILTIN_PHADDD256,
25248 IX86_BUILTIN_PHADDSW256,
25249 IX86_BUILTIN_PHSUBW256,
25250 IX86_BUILTIN_PHSUBD256,
25251 IX86_BUILTIN_PHSUBSW256,
25252 IX86_BUILTIN_PMADDUBSW256,
25253 IX86_BUILTIN_PMADDWD256,
25254 IX86_BUILTIN_PMAXSB256,
25255 IX86_BUILTIN_PMAXSW256,
25256 IX86_BUILTIN_PMAXSD256,
25257 IX86_BUILTIN_PMAXUB256,
25258 IX86_BUILTIN_PMAXUW256,
25259 IX86_BUILTIN_PMAXUD256,
25260 IX86_BUILTIN_PMINSB256,
25261 IX86_BUILTIN_PMINSW256,
25262 IX86_BUILTIN_PMINSD256,
25263 IX86_BUILTIN_PMINUB256,
25264 IX86_BUILTIN_PMINUW256,
25265 IX86_BUILTIN_PMINUD256,
25266 IX86_BUILTIN_PMOVMSKB256,
25267 IX86_BUILTIN_PMOVSXBW256,
25268 IX86_BUILTIN_PMOVSXBD256,
25269 IX86_BUILTIN_PMOVSXBQ256,
25270 IX86_BUILTIN_PMOVSXWD256,
25271 IX86_BUILTIN_PMOVSXWQ256,
25272 IX86_BUILTIN_PMOVSXDQ256,
25273 IX86_BUILTIN_PMOVZXBW256,
25274 IX86_BUILTIN_PMOVZXBD256,
25275 IX86_BUILTIN_PMOVZXBQ256,
25276 IX86_BUILTIN_PMOVZXWD256,
25277 IX86_BUILTIN_PMOVZXWQ256,
25278 IX86_BUILTIN_PMOVZXDQ256,
25279 IX86_BUILTIN_PMULDQ256,
25280 IX86_BUILTIN_PMULHRSW256,
25281 IX86_BUILTIN_PMULHUW256,
25282 IX86_BUILTIN_PMULHW256,
25283 IX86_BUILTIN_PMULLW256,
25284 IX86_BUILTIN_PMULLD256,
25285 IX86_BUILTIN_PMULUDQ256,
25286 IX86_BUILTIN_POR256,
25287 IX86_BUILTIN_PSADBW256,
25288 IX86_BUILTIN_PSHUFB256,
25289 IX86_BUILTIN_PSHUFD256,
25290 IX86_BUILTIN_PSHUFHW256,
25291 IX86_BUILTIN_PSHUFLW256,
25292 IX86_BUILTIN_PSIGNB256,
25293 IX86_BUILTIN_PSIGNW256,
25294 IX86_BUILTIN_PSIGND256,
25295 IX86_BUILTIN_PSLLDQI256,
25296 IX86_BUILTIN_PSLLWI256,
25297 IX86_BUILTIN_PSLLW256,
25298 IX86_BUILTIN_PSLLDI256,
25299 IX86_BUILTIN_PSLLD256,
25300 IX86_BUILTIN_PSLLQI256,
25301 IX86_BUILTIN_PSLLQ256,
25302 IX86_BUILTIN_PSRAWI256,
25303 IX86_BUILTIN_PSRAW256,
25304 IX86_BUILTIN_PSRADI256,
25305 IX86_BUILTIN_PSRAD256,
25306 IX86_BUILTIN_PSRLDQI256,
25307 IX86_BUILTIN_PSRLWI256,
25308 IX86_BUILTIN_PSRLW256,
25309 IX86_BUILTIN_PSRLDI256,
25310 IX86_BUILTIN_PSRLD256,
25311 IX86_BUILTIN_PSRLQI256,
25312 IX86_BUILTIN_PSRLQ256,
25313 IX86_BUILTIN_PSUBB256,
25314 IX86_BUILTIN_PSUBW256,
25315 IX86_BUILTIN_PSUBD256,
25316 IX86_BUILTIN_PSUBQ256,
25317 IX86_BUILTIN_PSUBSB256,
25318 IX86_BUILTIN_PSUBSW256,
25319 IX86_BUILTIN_PSUBUSB256,
25320 IX86_BUILTIN_PSUBUSW256,
25321 IX86_BUILTIN_PUNPCKHBW256,
25322 IX86_BUILTIN_PUNPCKHWD256,
25323 IX86_BUILTIN_PUNPCKHDQ256,
25324 IX86_BUILTIN_PUNPCKHQDQ256,
25325 IX86_BUILTIN_PUNPCKLBW256,
25326 IX86_BUILTIN_PUNPCKLWD256,
25327 IX86_BUILTIN_PUNPCKLDQ256,
25328 IX86_BUILTIN_PUNPCKLQDQ256,
25329 IX86_BUILTIN_PXOR256,
25330 IX86_BUILTIN_MOVNTDQA256,
25331 IX86_BUILTIN_VBROADCASTSS_PS,
25332 IX86_BUILTIN_VBROADCASTSS_PS256,
25333 IX86_BUILTIN_VBROADCASTSD_PD256,
25334 IX86_BUILTIN_VBROADCASTSI256,
25335 IX86_BUILTIN_PBLENDD256,
25336 IX86_BUILTIN_PBLENDD128,
25337 IX86_BUILTIN_PBROADCASTB256,
25338 IX86_BUILTIN_PBROADCASTW256,
25339 IX86_BUILTIN_PBROADCASTD256,
25340 IX86_BUILTIN_PBROADCASTQ256,
25341 IX86_BUILTIN_PBROADCASTB128,
25342 IX86_BUILTIN_PBROADCASTW128,
25343 IX86_BUILTIN_PBROADCASTD128,
25344 IX86_BUILTIN_PBROADCASTQ128,
25345 IX86_BUILTIN_VPERMVARSI256,
25346 IX86_BUILTIN_VPERMDF256,
25347 IX86_BUILTIN_VPERMVARSF256,
25348 IX86_BUILTIN_VPERMDI256,
25349 IX86_BUILTIN_VPERMTI256,
25350 IX86_BUILTIN_VEXTRACT128I256,
25351 IX86_BUILTIN_VINSERT128I256,
25352 IX86_BUILTIN_MASKLOADD,
25353 IX86_BUILTIN_MASKLOADQ,
25354 IX86_BUILTIN_MASKLOADD256,
25355 IX86_BUILTIN_MASKLOADQ256,
25356 IX86_BUILTIN_MASKSTORED,
25357 IX86_BUILTIN_MASKSTOREQ,
25358 IX86_BUILTIN_MASKSTORED256,
25359 IX86_BUILTIN_MASKSTOREQ256,
25360 IX86_BUILTIN_PSLLVV4DI,
25361 IX86_BUILTIN_PSLLVV2DI,
25362 IX86_BUILTIN_PSLLVV8SI,
25363 IX86_BUILTIN_PSLLVV4SI,
25364 IX86_BUILTIN_PSRAVV8SI,
25365 IX86_BUILTIN_PSRAVV4SI,
25366 IX86_BUILTIN_PSRLVV4DI,
25367 IX86_BUILTIN_PSRLVV2DI,
25368 IX86_BUILTIN_PSRLVV8SI,
25369 IX86_BUILTIN_PSRLVV4SI,
25370
25371 IX86_BUILTIN_GATHERSIV2DF,
25372 IX86_BUILTIN_GATHERSIV4DF,
25373 IX86_BUILTIN_GATHERDIV2DF,
25374 IX86_BUILTIN_GATHERDIV4DF,
25375 IX86_BUILTIN_GATHERSIV4SF,
25376 IX86_BUILTIN_GATHERSIV8SF,
25377 IX86_BUILTIN_GATHERDIV4SF,
25378 IX86_BUILTIN_GATHERDIV8SF,
25379 IX86_BUILTIN_GATHERSIV2DI,
25380 IX86_BUILTIN_GATHERSIV4DI,
25381 IX86_BUILTIN_GATHERDIV2DI,
25382 IX86_BUILTIN_GATHERDIV4DI,
25383 IX86_BUILTIN_GATHERSIV4SI,
25384 IX86_BUILTIN_GATHERSIV8SI,
25385 IX86_BUILTIN_GATHERDIV4SI,
25386 IX86_BUILTIN_GATHERDIV8SI,
25387
25388 /* Alternate 4-element gather for the vectorizer where
25389 all operands are 32-byte wide. */
25390 IX86_BUILTIN_GATHERALTSIV4DF,
25391 IX86_BUILTIN_GATHERALTDIV8SF,
25392 IX86_BUILTIN_GATHERALTSIV4DI,
25393 IX86_BUILTIN_GATHERALTDIV8SI,
25394
25395 /* TFmode support builtins. */
25396 IX86_BUILTIN_INFQ,
25397 IX86_BUILTIN_HUGE_VALQ,
25398 IX86_BUILTIN_FABSQ,
25399 IX86_BUILTIN_COPYSIGNQ,
25400
25401 /* Vectorizer support builtins. */
25402 IX86_BUILTIN_CPYSGNPS,
25403 IX86_BUILTIN_CPYSGNPD,
25404 IX86_BUILTIN_CPYSGNPS256,
25405 IX86_BUILTIN_CPYSGNPD256,
25406
25407 /* FMA4 instructions. */
25408 IX86_BUILTIN_VFMADDSS,
25409 IX86_BUILTIN_VFMADDSD,
25410 IX86_BUILTIN_VFMADDPS,
25411 IX86_BUILTIN_VFMADDPD,
25412 IX86_BUILTIN_VFMADDPS256,
25413 IX86_BUILTIN_VFMADDPD256,
25414 IX86_BUILTIN_VFMADDSUBPS,
25415 IX86_BUILTIN_VFMADDSUBPD,
25416 IX86_BUILTIN_VFMADDSUBPS256,
25417 IX86_BUILTIN_VFMADDSUBPD256,
25418
25419 /* FMA3 instructions. */
25420 IX86_BUILTIN_VFMADDSS3,
25421 IX86_BUILTIN_VFMADDSD3,
25422
25423 /* XOP instructions. */
25424 IX86_BUILTIN_VPCMOV,
25425 IX86_BUILTIN_VPCMOV_V2DI,
25426 IX86_BUILTIN_VPCMOV_V4SI,
25427 IX86_BUILTIN_VPCMOV_V8HI,
25428 IX86_BUILTIN_VPCMOV_V16QI,
25429 IX86_BUILTIN_VPCMOV_V4SF,
25430 IX86_BUILTIN_VPCMOV_V2DF,
25431 IX86_BUILTIN_VPCMOV256,
25432 IX86_BUILTIN_VPCMOV_V4DI256,
25433 IX86_BUILTIN_VPCMOV_V8SI256,
25434 IX86_BUILTIN_VPCMOV_V16HI256,
25435 IX86_BUILTIN_VPCMOV_V32QI256,
25436 IX86_BUILTIN_VPCMOV_V8SF256,
25437 IX86_BUILTIN_VPCMOV_V4DF256,
25438
25439 IX86_BUILTIN_VPPERM,
25440
25441 IX86_BUILTIN_VPMACSSWW,
25442 IX86_BUILTIN_VPMACSWW,
25443 IX86_BUILTIN_VPMACSSWD,
25444 IX86_BUILTIN_VPMACSWD,
25445 IX86_BUILTIN_VPMACSSDD,
25446 IX86_BUILTIN_VPMACSDD,
25447 IX86_BUILTIN_VPMACSSDQL,
25448 IX86_BUILTIN_VPMACSSDQH,
25449 IX86_BUILTIN_VPMACSDQL,
25450 IX86_BUILTIN_VPMACSDQH,
25451 IX86_BUILTIN_VPMADCSSWD,
25452 IX86_BUILTIN_VPMADCSWD,
25453
25454 IX86_BUILTIN_VPHADDBW,
25455 IX86_BUILTIN_VPHADDBD,
25456 IX86_BUILTIN_VPHADDBQ,
25457 IX86_BUILTIN_VPHADDWD,
25458 IX86_BUILTIN_VPHADDWQ,
25459 IX86_BUILTIN_VPHADDDQ,
25460 IX86_BUILTIN_VPHADDUBW,
25461 IX86_BUILTIN_VPHADDUBD,
25462 IX86_BUILTIN_VPHADDUBQ,
25463 IX86_BUILTIN_VPHADDUWD,
25464 IX86_BUILTIN_VPHADDUWQ,
25465 IX86_BUILTIN_VPHADDUDQ,
25466 IX86_BUILTIN_VPHSUBBW,
25467 IX86_BUILTIN_VPHSUBWD,
25468 IX86_BUILTIN_VPHSUBDQ,
25469
25470 IX86_BUILTIN_VPROTB,
25471 IX86_BUILTIN_VPROTW,
25472 IX86_BUILTIN_VPROTD,
25473 IX86_BUILTIN_VPROTQ,
25474 IX86_BUILTIN_VPROTB_IMM,
25475 IX86_BUILTIN_VPROTW_IMM,
25476 IX86_BUILTIN_VPROTD_IMM,
25477 IX86_BUILTIN_VPROTQ_IMM,
25478
25479 IX86_BUILTIN_VPSHLB,
25480 IX86_BUILTIN_VPSHLW,
25481 IX86_BUILTIN_VPSHLD,
25482 IX86_BUILTIN_VPSHLQ,
25483 IX86_BUILTIN_VPSHAB,
25484 IX86_BUILTIN_VPSHAW,
25485 IX86_BUILTIN_VPSHAD,
25486 IX86_BUILTIN_VPSHAQ,
25487
25488 IX86_BUILTIN_VFRCZSS,
25489 IX86_BUILTIN_VFRCZSD,
25490 IX86_BUILTIN_VFRCZPS,
25491 IX86_BUILTIN_VFRCZPD,
25492 IX86_BUILTIN_VFRCZPS256,
25493 IX86_BUILTIN_VFRCZPD256,
25494
25495 IX86_BUILTIN_VPCOMEQUB,
25496 IX86_BUILTIN_VPCOMNEUB,
25497 IX86_BUILTIN_VPCOMLTUB,
25498 IX86_BUILTIN_VPCOMLEUB,
25499 IX86_BUILTIN_VPCOMGTUB,
25500 IX86_BUILTIN_VPCOMGEUB,
25501 IX86_BUILTIN_VPCOMFALSEUB,
25502 IX86_BUILTIN_VPCOMTRUEUB,
25503
25504 IX86_BUILTIN_VPCOMEQUW,
25505 IX86_BUILTIN_VPCOMNEUW,
25506 IX86_BUILTIN_VPCOMLTUW,
25507 IX86_BUILTIN_VPCOMLEUW,
25508 IX86_BUILTIN_VPCOMGTUW,
25509 IX86_BUILTIN_VPCOMGEUW,
25510 IX86_BUILTIN_VPCOMFALSEUW,
25511 IX86_BUILTIN_VPCOMTRUEUW,
25512
25513 IX86_BUILTIN_VPCOMEQUD,
25514 IX86_BUILTIN_VPCOMNEUD,
25515 IX86_BUILTIN_VPCOMLTUD,
25516 IX86_BUILTIN_VPCOMLEUD,
25517 IX86_BUILTIN_VPCOMGTUD,
25518 IX86_BUILTIN_VPCOMGEUD,
25519 IX86_BUILTIN_VPCOMFALSEUD,
25520 IX86_BUILTIN_VPCOMTRUEUD,
25521
25522 IX86_BUILTIN_VPCOMEQUQ,
25523 IX86_BUILTIN_VPCOMNEUQ,
25524 IX86_BUILTIN_VPCOMLTUQ,
25525 IX86_BUILTIN_VPCOMLEUQ,
25526 IX86_BUILTIN_VPCOMGTUQ,
25527 IX86_BUILTIN_VPCOMGEUQ,
25528 IX86_BUILTIN_VPCOMFALSEUQ,
25529 IX86_BUILTIN_VPCOMTRUEUQ,
25530
25531 IX86_BUILTIN_VPCOMEQB,
25532 IX86_BUILTIN_VPCOMNEB,
25533 IX86_BUILTIN_VPCOMLTB,
25534 IX86_BUILTIN_VPCOMLEB,
25535 IX86_BUILTIN_VPCOMGTB,
25536 IX86_BUILTIN_VPCOMGEB,
25537 IX86_BUILTIN_VPCOMFALSEB,
25538 IX86_BUILTIN_VPCOMTRUEB,
25539
25540 IX86_BUILTIN_VPCOMEQW,
25541 IX86_BUILTIN_VPCOMNEW,
25542 IX86_BUILTIN_VPCOMLTW,
25543 IX86_BUILTIN_VPCOMLEW,
25544 IX86_BUILTIN_VPCOMGTW,
25545 IX86_BUILTIN_VPCOMGEW,
25546 IX86_BUILTIN_VPCOMFALSEW,
25547 IX86_BUILTIN_VPCOMTRUEW,
25548
25549 IX86_BUILTIN_VPCOMEQD,
25550 IX86_BUILTIN_VPCOMNED,
25551 IX86_BUILTIN_VPCOMLTD,
25552 IX86_BUILTIN_VPCOMLED,
25553 IX86_BUILTIN_VPCOMGTD,
25554 IX86_BUILTIN_VPCOMGED,
25555 IX86_BUILTIN_VPCOMFALSED,
25556 IX86_BUILTIN_VPCOMTRUED,
25557
25558 IX86_BUILTIN_VPCOMEQQ,
25559 IX86_BUILTIN_VPCOMNEQ,
25560 IX86_BUILTIN_VPCOMLTQ,
25561 IX86_BUILTIN_VPCOMLEQ,
25562 IX86_BUILTIN_VPCOMGTQ,
25563 IX86_BUILTIN_VPCOMGEQ,
25564 IX86_BUILTIN_VPCOMFALSEQ,
25565 IX86_BUILTIN_VPCOMTRUEQ,
25566
25567 /* LWP instructions. */
25568 IX86_BUILTIN_LLWPCB,
25569 IX86_BUILTIN_SLWPCB,
25570 IX86_BUILTIN_LWPVAL32,
25571 IX86_BUILTIN_LWPVAL64,
25572 IX86_BUILTIN_LWPINS32,
25573 IX86_BUILTIN_LWPINS64,
25574
25575 IX86_BUILTIN_CLZS,
25576
25577 /* BMI instructions. */
25578 IX86_BUILTIN_BEXTR32,
25579 IX86_BUILTIN_BEXTR64,
25580 IX86_BUILTIN_CTZS,
25581
25582 /* TBM instructions. */
25583 IX86_BUILTIN_BEXTRI32,
25584 IX86_BUILTIN_BEXTRI64,
25585
25586 /* BMI2 instructions. */
25587 IX86_BUILTIN_BZHI32,
25588 IX86_BUILTIN_BZHI64,
25589 IX86_BUILTIN_PDEP32,
25590 IX86_BUILTIN_PDEP64,
25591 IX86_BUILTIN_PEXT32,
25592 IX86_BUILTIN_PEXT64,
25593
25594 /* FSGSBASE instructions. */
25595 IX86_BUILTIN_RDFSBASE32,
25596 IX86_BUILTIN_RDFSBASE64,
25597 IX86_BUILTIN_RDGSBASE32,
25598 IX86_BUILTIN_RDGSBASE64,
25599 IX86_BUILTIN_WRFSBASE32,
25600 IX86_BUILTIN_WRFSBASE64,
25601 IX86_BUILTIN_WRGSBASE32,
25602 IX86_BUILTIN_WRGSBASE64,
25603
25604 /* RDRND instructions. */
25605 IX86_BUILTIN_RDRAND16_STEP,
25606 IX86_BUILTIN_RDRAND32_STEP,
25607 IX86_BUILTIN_RDRAND64_STEP,
25608
25609 /* F16C instructions. */
25610 IX86_BUILTIN_CVTPH2PS,
25611 IX86_BUILTIN_CVTPH2PS256,
25612 IX86_BUILTIN_CVTPS2PH,
25613 IX86_BUILTIN_CVTPS2PH256,
25614
25615 /* CFString built-in for Darwin. */
25616 IX86_BUILTIN_CFSTRING,
25617
25618 IX86_BUILTIN_MAX
25619 };
25620
25621 /* Table for the ix86 builtin decls. */
25622 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25623
25624 /* Table of all of the builtin functions that are possible with different ISAs
25625 but are waiting to be built until a function is declared to use that
25626 ISA. */
25627 struct builtin_isa {
25628 const char *name; /* function name */
25629 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25630 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25631 bool const_p; /* true if the declaration is constant */
25632 bool set_and_not_built_p; /* true if the decl has been deferred and not yet built */
25633 };
25634
25635 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
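/* Hedged sketch, not part of the original sources: both tables above are
   indexed by the enum ix86_builtins code.  add_builtin_function records that
   code on the decl, so the machine-specific expander can recover it with
   something like

     enum ix86_builtins fcode
       = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
     tree decl = ix86_builtins[(int) fcode];

   assuming fndecl is a BUILT_IN_MD builtin handed to the target's
   expand_builtin hook later in this file.  */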
25636
25637
25638 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
25639 of which isa_flags to use in the ix86_builtins_isa array. Store the
25640 function decl in the ix86_builtins array. Return the function decl or
25641 NULL_TREE if the builtin was not added.
25642
25643 If the front end has a special hook for builtin functions, delay adding
25644 builtin functions that aren't in the current ISA until the ISA is changed
25645 with function-specific optimization. Doing so can save about 300K for the
25646 default compiler. When the builtin is expanded, check at that time whether
25647 it is valid.
25648
25649 If the front end doesn't have a special hook, record all builtins, even
25650 those that aren't in the current ISA, in case the user uses function-specific
25651 options for a different ISA; this way we don't get scope errors if a builtin
25652 is added in the middle of a function scope. */
25653
25654 static inline tree
25655 def_builtin (HOST_WIDE_INT mask, const char *name,
25656 enum ix86_builtin_func_type tcode,
25657 enum ix86_builtins code)
25658 {
25659 tree decl = NULL_TREE;
25660
25661 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25662 {
25663 ix86_builtins_isa[(int) code].isa = mask;
25664
25665 mask &= ~OPTION_MASK_ISA_64BIT;
25666 if (mask == 0
25667 || (mask & ix86_isa_flags) != 0
25668 || (lang_hooks.builtin_function
25669 == lang_hooks.builtin_function_ext_scope))
25670
25671 {
25672 tree type = ix86_get_builtin_func_type (tcode);
25673 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25674 NULL, NULL_TREE);
25675 ix86_builtins[(int) code] = decl;
25676 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25677 }
25678 else
25679 {
25680 ix86_builtins[(int) code] = NULL_TREE;
25681 ix86_builtins_isa[(int) code].tcode = tcode;
25682 ix86_builtins_isa[(int) code].name = name;
25683 ix86_builtins_isa[(int) code].const_p = false;
25684 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25685 }
25686 }
25687
25688 return decl;
25689 }
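/* Hedged usage sketch, not part of the original sources: a typical
   registration pairs an ISA mask with the builtin name, its function type
   and its enum code, for example

     def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_loadups",
                  V4SF_FTYPE_PCFLOAT, IX86_BUILTIN_LOADUPS);

   If OPTION_MASK_ISA_SSE is not in ix86_isa_flags and the front end has an
   extended-scope hook, the call only records the entry in ix86_builtins_isa;
   ix86_add_new_builtins below materializes the decl once the ISA becomes
   available.  */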
25690
25691 /* Like def_builtin, but also marks the function decl "const". */
25692
25693 static inline tree
25694 def_builtin_const (HOST_WIDE_INT mask, const char *name,
25695 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
25696 {
25697 tree decl = def_builtin (mask, name, tcode, code);
25698 if (decl)
25699 TREE_READONLY (decl) = 1;
25700 else
25701 ix86_builtins_isa[(int) code].const_p = true;
25702
25703 return decl;
25704 }
25705
25706 /* Add any new builtin functions for a given ISA that may not have been
25707 declared. This saves a bit of space compared to adding all of the
25708 declarations to the tree, even if they end up unused. */
25709
25710 static void
25711 ix86_add_new_builtins (HOST_WIDE_INT isa)
25712 {
25713 int i;
25714
25715 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
25716 {
25717 if ((ix86_builtins_isa[i].isa & isa) != 0
25718 && ix86_builtins_isa[i].set_and_not_built_p)
25719 {
25720 tree decl, type;
25721
25722 /* Don't define the builtin again. */
25723 ix86_builtins_isa[i].set_and_not_built_p = false;
25724
25725 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
25726 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
25727 type, i, BUILT_IN_MD, NULL,
25728 NULL_TREE);
25729
25730 ix86_builtins[i] = decl;
25731 if (ix86_builtins_isa[i].const_p)
25732 TREE_READONLY (decl) = 1;
25733 }
25734 }
25735 }
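/* Hedged note, not part of the original sources: this is typically reached
   when the effective ISA flags grow, e.g. while processing a target
   attribute such as

     __attribute__((target ("avx2")))
     __m256i add8 (__m256i x, __m256i y)
     {
       return _mm256_add_epi32 (x, y);
     }

   so that builtins deferred by def_builtin become declared before the
   function body that needs them is expanded.  */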
25736
25737 /* Bits for builtin_description.flag. */
25738
25739 /* Set when we don't support the comparison natively, and should
25740 swap the comparison operands in order to support it. */
25741 #define BUILTIN_DESC_SWAP_OPERANDS 1
25742
25743 struct builtin_description
25744 {
25745 const HOST_WIDE_INT mask;
25746 const enum insn_code icode;
25747 const char *const name;
25748 const enum ix86_builtins code;
25749 const enum rtx_code comparison;
25750 const int flag;
25751 };
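/* Hedged reading aid, not part of the original sources: each entry in the
   bdesc_* tables below binds one insn pattern to one builtin.  The
   initialization code later in this file walks the tables roughly like

     const struct builtin_description *d;
     size_t i;
     for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
       if (d->name)
         def_builtin_const (d->mask, d->name, INT_FTYPE_V4SF_V4SF, d->code);

   picking a function type appropriate to each table; the flag field holds
   BUILTIN_DESC_SWAP_OPERANDS here, and a function-type or mode code cast to
   int in the other tables.  */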
25752
25753 static const struct builtin_description bdesc_comi[] =
25754 {
25755 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
25756 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
25757 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
25758 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
25759 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
25760 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
25761 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
25762 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
25763 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
25764 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
25765 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
25766 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
25767 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
25768 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
25769 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
25770 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
25771 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
25772 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
25773 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
25774 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
25775 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
25776 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
25777 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
25778 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
25779 };
25780
25781 static const struct builtin_description bdesc_pcmpestr[] =
25782 {
25783 /* SSE4.2 */
25784 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
25785 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
25786 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
25787 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
25788 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
25789 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
25790 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
25791 };
25792
25793 static const struct builtin_description bdesc_pcmpistr[] =
25794 {
25795 /* SSE4.2 */
25796 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
25797 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
25798 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
25799 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
25800 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
25801 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
25802 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
25803 };
25804
25805 /* Special builtins with variable number of arguments. */
25806 static const struct builtin_description bdesc_special_args[] =
25807 {
25808 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
25809 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
25810 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
25811
25812 /* MMX */
25813 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25814
25815 /* 3DNow! */
25816 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25817
25818 /* SSE */
25819 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25820 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25821 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25822
25823 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25824 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25825 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25826 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25827
25828 /* SSE or 3DNow!A */
25829 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25830 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
25831
25832 /* SSE2 */
25833 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25834 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25835 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25836 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
25837 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25838 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
25839 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
25840 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
25841 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
25842 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25843
25844 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25845 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25846
25847 /* SSE3 */
25848 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25849
25850 /* SSE4.1 */
25851 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
25852
25853 /* SSE4A */
25854 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25855 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25856
25857 /* AVX */
25858 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
25859 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
25860
25861 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25862 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25863 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25864 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
25865 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
25866
25867 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25868 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25869 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25870 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25871 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25872 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
25873 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25874
25875 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
25876 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25877 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25878
25879 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
25880 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
25881 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
25882 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
25883 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
25884 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
25885 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
25886 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
25887
25888 /* AVX2 */
25889 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
25890 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
25891 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
25892 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
25893 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
25894 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
25895 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
25896 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
25897 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
25898
25899 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
25900 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
25901 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
25902 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
25903 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
25904 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
25905
25906 /* FSGSBASE */
25907 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25908 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25909 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25910 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25911 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25912 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25913 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25914 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25915 };
25916
25917 /* Builtins with variable number of arguments. */
25918 static const struct builtin_description bdesc_args[] =
25919 {
25920 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
25921 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
25922 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
25923 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
25924 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
25925 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
25926 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
25927
25928 /* MMX */
25929 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25930 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25931 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25932 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25933 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25934 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25935
25936 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25937 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25938 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25939 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25940 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25941 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25942 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25943 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25944
25945 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25946 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25947
25948 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25949 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25950 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25951 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25952
25953 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25954 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25955 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25956 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25957 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25958 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25959
25960 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25961 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25962 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25963 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25964 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25965 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25966
25967 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
25968 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
25969 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
25970
25971 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
25972
25973 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25974 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25975 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
25976 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25977 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25978 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
25979
25980 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25981 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25982 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
25983 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25984 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25985 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
25986
25987 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25988 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25989 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25990 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
25991
25992 /* 3DNow! */
25993 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
25994 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
25995 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25996 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
25997
25998 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25999 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26000 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26001 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26002 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26003 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26004 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26005 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26006 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26007 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26008 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26009 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26010 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26011 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26012 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26013
26014 /* 3DNow!A */
26015 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26016 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26017 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26018 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26019 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26020 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26021
26022 /* SSE */
26023 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26024 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26025 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26026 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26027 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26028 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26029 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26030 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26031 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26032 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26033 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26034 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26035
26036 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26037
26038 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26039 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26040 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26041 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26042 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26043 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26044 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26045 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26046
26047 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26048 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26049 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26050 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26051 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26052 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26053 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26054 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26055 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26056 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26057 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26058 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26059 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26060 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26061 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26062 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26063 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26064 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26065 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26066 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26067 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26068 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26069
26070 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26071 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26072 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26073 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26074
26075 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26076 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26077 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26078 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26079
26080 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26081
26082 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26083 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26084 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26085 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26086 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26087
26088 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26089 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26090 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26091
26092 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26093
26094 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26095 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26096 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26097
26098 /* SSE MMX or 3DNow!A */
26099 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26100 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26101 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26102
26103 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26104 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26105 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26106 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26107
26108 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26109 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26110
26111 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26112
26113 /* SSE2 */
26114 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26115
26116 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26117 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26118 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26119 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26120 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26121
26122 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26123 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26124 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26125 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26126 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26127
26128 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26129
26130 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26131 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26132 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26133 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26134
26135 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26136 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26137 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26138
26139 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26140 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26141 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26142 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26143 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26144 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26145 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26146 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26147
26148 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26149 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26150 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26151 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26152 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26153 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26154 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26155 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26156 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26157 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26158 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26159 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26160 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26161 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26162 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26163 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26164 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26165 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26166 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26167 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26168
26169 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26170 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26171 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26172 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26173
26174 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26175 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26176 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26177 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26178
26179 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26180
26181 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26182 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26183 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26184
26185 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26186
26187 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26188 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26189 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26190 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26191 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26192 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26193 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26194 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26195
26196 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26197 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26198 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26199 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26200 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26201 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26202 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26203 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26204
26205 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26206 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26207
26208 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26209 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26210 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26211 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26212
26213 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26214 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26215
26216 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26217 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26218 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26219 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26220 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26221 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26222
26223 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26224 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26225 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26226 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26227
26228 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26229 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26230 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26231 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26232 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26233 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26234 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26235 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26236
26237 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26238 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26239 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26240
26241 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26242 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26243
26244 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26245 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26246
26247 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26248
26249 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26250 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26251 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26252 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26253
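  /* Shift entries: the *_SI_COUNT types take the shift count as a scalar,
     the *_V*_COUNT types take it in a vector operand, and *_INT_CONVERT
     marks entries whose operands must be converted to the insn's native
     mode (here the V1TI patterns used for the byte shifts).  */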
26254 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26255 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26256 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26257 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26258 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26259 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26260 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26261
26262 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26263 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26264 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26265 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26266 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26267 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26268 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26269
26270 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26271 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26272 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26273 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26274
26275 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26276 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26277 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26278
26279 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26280
26281 { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26282 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26283
26284 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26285
26286 /* SSE2 MMX */
26287 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26288 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26289
26290 /* SSE3 */
26291 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF},
26292 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26293
26294 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26295 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26296 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26297 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26298 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26299 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26300
26301 /* SSSE3 */
26302 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26303 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26304 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26305 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26306 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26307 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26308
26309 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26310 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26311 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26312 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26313 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26314 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26315 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26316 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26317 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26318 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26319 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26320 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26321 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26322 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26323 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26324 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26325 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26326 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26327 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26328 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26329 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26330 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26331 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26332 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26333
26334 /* SSSE3 palignr */
26335 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26336 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26337
26338 /* SSE4.1 */
26339 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26340 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26341 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26342 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26343 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26344 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26345 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26346 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26347 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26348 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26349
26350 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26351 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26352 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26353 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26354 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26355 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26356 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26357 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26358 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26359 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26360 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26361 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26362 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26363
26364 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26365 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26366 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26367 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26368 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26369 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26370 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26371 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26372 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26373 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26374 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26375 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26376
26377 /* SSE4.1 round and ptest patterns (OPTION_MASK_ISA_ROUND) */
26378 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26379 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26380 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26381 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26382
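  /* In the floor/ceil/trunc/rint entries the comparison field is reused
     to carry the rounding-mode immediate (ROUND_FLOOR, ROUND_CEIL, ...),
     hence the enum rtx_code casts.  */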
26383 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26384 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26385 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26386 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26387
26388 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26389 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26390
26391 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26392 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26393
26394 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26395 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26396 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26397 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26398
26399 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26400 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26401
26402 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26403 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26404
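  /* ptest: the comparison field selects which condition of the PTEST
     flags result is returned (EQ for ptestz, LTU for ptestc, GTU for
     ptestnzc).  */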
26405 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26406 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26407 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26408
26409 /* SSE4.2 */
26410 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26411 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26412 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26413 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26414 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26415
26416 /* SSE4A */
26417 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26418 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26419 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26420 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26421
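  /* Entries with a zero name field (the AES/PCLMUL ones below, and
     FABSQ/COPYSIGNQ above) only describe how to expand the builtin;
     the user-visible builtin declarations are made elsewhere in this
     file.  */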
26422 /* AES */
26423 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26424 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26425
26426 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26427 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26428 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26429 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26430
26431 /* PCLMUL */
26432 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26433
26434 /* AVX */
26435 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26436 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26437 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26438 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26439 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26440 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26441 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26442 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26443 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26444 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26445 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26446 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26447 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26448 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26449 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26450 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26451 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26452 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26453 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26454 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26455 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26456 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26457 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26458 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26459 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26460 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26461
26462 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26463 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26464 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26465 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26466
26467 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26468 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26469 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26470 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26471 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26472 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26473 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26474 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26475 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26476 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26477 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26478 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26479 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26480 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26481 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26482 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26483 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26484 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26485 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26486 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26487 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26488 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26489 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26490 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26491 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26492 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26493 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26494 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26495 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26496 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26497 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26498 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26499 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26500 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26501
26502 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26503 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26504 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26505
26506 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26507 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26508 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26509 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26510 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26511
26512 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26513
26514 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26515 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26516
26517 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26518 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26519 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26520 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26521
26522 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26523 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26524
26525 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26526 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26527
26528 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26529 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26530 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26531 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26532
26533 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
26534 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
26535
26536 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26537 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26538
26539 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26540 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26541 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26542 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26543
26544 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26545 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26546 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26547 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26548 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26549 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26550
26551 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26552 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26553 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26554 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26555 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26556 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26557 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26558 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26559 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26560 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26561 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26562 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26563 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26564 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26565 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26566
26567 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26568 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26569
26570 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26571 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26572
26573 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26574
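  /* The AVX2 entries below are largely the 256-bit counterparts of the
     SSE2/SSSE3/SSE4.1 integer entries above.  */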
26575 /* AVX2 */
26576 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26577 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26578 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26579 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26580 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26581 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26582 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26583 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26584 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26585 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26586 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26587 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26588 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26589 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26590 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26591 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26592 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26593 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26594 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26595 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26596 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26597 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26598 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26599 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26600 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26601 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26602 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26603 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26604 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26605 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26606 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26607 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26608 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26609 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26610 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26611 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26612 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26613 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26614 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26615 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26616 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26617 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3, "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26618 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26619 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26620 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3, "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26621 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26622 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26623 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3, "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26624 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26625 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26626 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3, "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26627 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26628 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26629 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2, "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26630 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2, "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26631 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2, "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26632 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2, "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26633 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2, "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26634 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26635 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2, "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26636 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2, "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26637 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2, "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26638 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2, "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26639 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2, "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26640 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26641 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3, "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26642 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256", IX86_BUILTIN_PMULHUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26643 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256", IX86_BUILTIN_PMULHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26644 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256", IX86_BUILTIN_PMULLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26645 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256", IX86_BUILTIN_PMULLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26646 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26647 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26648 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26649 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26650 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26651 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26652 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26653 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26654 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26655 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26656 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26657 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26658 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26659 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26660 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26661 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26662 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26663 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26664 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26665 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26666 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26667 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26668 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26669 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26670 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26671 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26672 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26673 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26674 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26675 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26676 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26677 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26678 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26679 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26680 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26681 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26682 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26683 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26684 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26685 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26686 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26687 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26688 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26689 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26690 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26691 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26692 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26693 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26694 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26695 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
26696 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26697 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
26698 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
26699 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26700 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26701 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26702 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26703 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26704 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26705 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26706 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26707 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26708 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
26709 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
26710 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
26711 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
26712 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26713 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26714 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26715 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26716 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26717 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26718 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26719 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26720 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26721 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26722
26723 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26724
26725 /* BMI */
26726 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26727 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26728 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26729
26730 /* TBM */
26731 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26732 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26733
26734 /* F16C */
26735 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
26736 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
26737 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
26738 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
26739
26740 /* BMI2 */
26741 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26742 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26743 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26744 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26745 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26746 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26747 };
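
/* For illustration: each entry above just binds an insn pattern and a
   function type to a __builtin_ia32_* name; the user-visible intrinsics in
   the ia32 headers are thin wrappers over those names.  A minimal sketch,
   assuming avx2intrin.h's usual mapping of _mm256_max_epu8 onto
   __builtin_ia32_pmaxub256 (compile with -mavx2):

     #include <immintrin.h>

     __m256i
     max_bytes (__m256i a, __m256i b)
     {
       return _mm256_max_epu8 (a, b);
     }
*/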
26748
26749 /* FMA4 and XOP. */
26750 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
26751 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
26752 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
26753 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
26754 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
26755 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
26756 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
26757 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
26758 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
26759 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
26760 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
26761 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
26762 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
26763 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
26764 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
26765 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
26766 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
26767 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
26768 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
26769 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
26770 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
26771 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
26772 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
26773 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
26774 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
26775 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
26776 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
26777 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
26778 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
26779 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
26780 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
26781 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
26782 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
26783 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
26784 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
26785 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
26786 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
26787 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
26788 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
26789 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
26790 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
26791 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
26792 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
26793 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
26794 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
26795 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
26796 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
26797 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
26798 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
26799 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
26800 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
26801 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
26802
26803 static const struct builtin_description bdesc_multi_arg[] =
26804 {
26805 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
26806 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
26807 UNKNOWN, (int)MULTI_ARG_3_SF },
26808 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
26809 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
26810 UNKNOWN, (int)MULTI_ARG_3_DF },
26811
26812 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
26813 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
26814 UNKNOWN, (int)MULTI_ARG_3_SF },
26815 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
26816 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
26817 UNKNOWN, (int)MULTI_ARG_3_DF },
26818
26819 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
26820 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
26821 UNKNOWN, (int)MULTI_ARG_3_SF },
26822 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
26823 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
26824 UNKNOWN, (int)MULTI_ARG_3_DF },
26825 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
26826 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
26827 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26828 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
26829 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
26830 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26831
26832 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
26833 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
26834 UNKNOWN, (int)MULTI_ARG_3_SF },
26835 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
26836 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
26837 UNKNOWN, (int)MULTI_ARG_3_DF },
26838 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
26839 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
26840 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26841 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
26842 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
26843 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26844
26845 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
26846 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
26847 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
26848 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
26849 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
26850 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
26851 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
26852
26853 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26854 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26855 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
26856 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
26857 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
26858 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
26859 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
26860
26861 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
26862
26863 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26864 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26865 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26866 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26867 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26868 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26869 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26870 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26871 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26872 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26873 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26874 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26875
26876 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26877 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
26878 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
26879 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
26880 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
26881 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
26882 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
26883 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
26884 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26885 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
26886 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
26887 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
26888 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26889 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
26890 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
26891 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
26892
26893 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
26894 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
26895 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
26896 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
26897 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
26898 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
26899
26900 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26901 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26902 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26903 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26904 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26905 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26906 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26907 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26908 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26909 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26910 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26911 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26912 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26913 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26914 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26915
26916 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
26917 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26918 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26919 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
26920 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
26921 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
26922 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
26923
26924 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
26925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
26926 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
26927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
26928 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
26929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
26930 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
26931
26932 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
26933 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
26934 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
26935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
26936 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
26937 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
26938 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
26939
26940 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
26941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
26942 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
26943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
26944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
26945 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
26946 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
26947
26948 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
26949 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
26950 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
26951 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
26952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
26953 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
26954 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
26955
26956 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
26957 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
26958 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
26959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
26960 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
26961 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
26962 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
26963
26964 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
26965 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
26966 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
26967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
26968 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
26969 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
26970 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
26971
26972 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
26973 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
26974 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
26975 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
26976 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
26977 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
26978 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
26979
26980 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
26981 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
26982 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
26983 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
26984 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
26985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
26986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
26987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
26988
26989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
26990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
26991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
26992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
26993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
26994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
26995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
26996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
26997
26998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
26999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27002
27003 };
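
/* For illustration: this table is walked by ix86_init_mmx_sse_builtins
   below, so e.g. the CODE_FOR_fma4i_fmadd_v4sf entry becomes
   __builtin_ia32_vfmaddps, which fma4intrin.h is assumed to wrap roughly
   as follows (a sketch; compile with -mfma4):

     #include <x86intrin.h>

     __m128
     madd (__m128 a, __m128 b, __m128 c)
     {
       return _mm_macc_ps (a, b, c);    // a * b + c via vfmaddps
     }
*/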
27004 \f
27005 /* TM vector builtins. */
27006
27007 /* Reuse the existing x86-specific `struct builtin_description' because
27008 it is convenient here.  Add casts to make the TM builtin codes fit. */
27009 static const struct builtin_description bdesc_tm[] =
27010 {
27011 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27012 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27013 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27014 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27015 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27016 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27017 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27018
27019 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27020 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27021 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27022 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27023 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27024 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27025 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27026
27027 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27028 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27029 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27030 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27031 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27032 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27033 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27034
27035 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27036 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27037 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27038 };
27039
27040 /* TM callbacks. */
27041
27042 /* Return the builtin decl needed to load a vector of TYPE. */
27043
27044 static tree
27045 ix86_builtin_tm_load (tree type)
27046 {
27047 if (TREE_CODE (type) == VECTOR_TYPE)
27048 {
27049 switch (tree_low_cst (TYPE_SIZE (type), 1))
27050 {
27051 case 64:
27052 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27053 case 128:
27054 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27055 case 256:
27056 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27057 }
27058 }
27059 return NULL_TREE;
27060 }
27061
27062 /* Return the builtin decl needed to store a vector of TYPE. */
27063
27064 static tree
27065 ix86_builtin_tm_store (tree type)
27066 {
27067 if (TREE_CODE (type) == VECTOR_TYPE)
27068 {
27069 switch (tree_low_cst (TYPE_SIZE (type), 1))
27070 {
27071 case 64:
27072 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27073 case 128:
27074 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27075 case 256:
27076 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27077 }
27078 }
27079 return NULL_TREE;
27080 }
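
/* For illustration: these two hooks let the trans-mem lowering pass replace
   vector memory accesses inside a transaction with the _ITM_* builtins
   registered by ix86_init_tm_builtins below.  A minimal sketch, assuming
   -fgnu-tm and SSE enabled (the V4SF case resolves to BUILT_IN_TM_STORE_M128):

     #include <xmmintrin.h>

     __m128 shared;

     void
     publish (__m128 v)
     {
       __transaction_atomic { shared = v; }   // store becomes _ITM_WM128
     }
*/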
27081 \f
27082 /* Initialize the transactional memory vector load/store builtins. */
27083
27084 static void
27085 ix86_init_tm_builtins (void)
27086 {
27087 enum ix86_builtin_func_type ftype;
27088 const struct builtin_description *d;
27089 size_t i;
27090 tree decl;
27091 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27092 tree attrs_log, attrs_type_log;
27093
27094 if (!flag_tm)
27095 return;
27096
27097 /* If there are no builtins defined, we must be compiling in a
27098 language without trans-mem support. */
27099 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27100 return;
27101
27102 /* Use whatever attributes a normal TM load has. */
27103 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27104 attrs_load = DECL_ATTRIBUTES (decl);
27105 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27106 /* Use whatever attributes a normal TM store has. */
27107 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27108 attrs_store = DECL_ATTRIBUTES (decl);
27109 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27110 /* Use whatever attributes a normal TM log has. */
27111 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27112 attrs_log = DECL_ATTRIBUTES (decl);
27113 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27114
27115 for (i = 0, d = bdesc_tm;
27116 i < ARRAY_SIZE (bdesc_tm);
27117 i++, d++)
27118 {
27119 if ((d->mask & ix86_isa_flags) != 0
27120 || (lang_hooks.builtin_function
27121 == lang_hooks.builtin_function_ext_scope))
27122 {
27123 tree type, attrs, attrs_type;
27124 enum built_in_function code = (enum built_in_function) d->code;
27125
27126 ftype = (enum ix86_builtin_func_type) d->flag;
27127 type = ix86_get_builtin_func_type (ftype);
27128
27129 if (BUILTIN_TM_LOAD_P (code))
27130 {
27131 attrs = attrs_load;
27132 attrs_type = attrs_type_load;
27133 }
27134 else if (BUILTIN_TM_STORE_P (code))
27135 {
27136 attrs = attrs_store;
27137 attrs_type = attrs_type_store;
27138 }
27139 else
27140 {
27141 attrs = attrs_log;
27142 attrs_type = attrs_type_log;
27143 }
27144 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27145 /* The builtin name without the "__builtin_" prefix, for
27146 calling it directly. */
27147 d->name + strlen ("__builtin_"),
27148 attrs);
27149 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
27150 set the TYPE_ATTRIBUTES. */
27151 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27152
27153 set_builtin_decl (code, decl, false);
27154 }
27155 }
27156 }
27157
27158 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
27159 not in the current target ISA, so that the user can compile particular
27160 modules with target-specific options that differ from the command-line
27161 options. */
27162 static void
27163 ix86_init_mmx_sse_builtins (void)
27164 {
27165 const struct builtin_description * d;
27166 enum ix86_builtin_func_type ftype;
27167 size_t i;
27168
27169 /* Add all special builtins with variable number of operands. */
27170 for (i = 0, d = bdesc_special_args;
27171 i < ARRAY_SIZE (bdesc_special_args);
27172 i++, d++)
27173 {
27174 if (d->name == 0)
27175 continue;
27176
27177 ftype = (enum ix86_builtin_func_type) d->flag;
27178 def_builtin (d->mask, d->name, ftype, d->code);
27179 }
27180
27181 /* Add all builtins with variable number of operands. */
27182 for (i = 0, d = bdesc_args;
27183 i < ARRAY_SIZE (bdesc_args);
27184 i++, d++)
27185 {
27186 if (d->name == 0)
27187 continue;
27188
27189 ftype = (enum ix86_builtin_func_type) d->flag;
27190 def_builtin_const (d->mask, d->name, ftype, d->code);
27191 }
27192
27193 /* pcmpestr[im] insns. */
27194 for (i = 0, d = bdesc_pcmpestr;
27195 i < ARRAY_SIZE (bdesc_pcmpestr);
27196 i++, d++)
27197 {
27198 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27199 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27200 else
27201 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27202 def_builtin_const (d->mask, d->name, ftype, d->code);
27203 }
27204
27205 /* pcmpistr[im] insns. */
27206 for (i = 0, d = bdesc_pcmpistr;
27207 i < ARRAY_SIZE (bdesc_pcmpistr);
27208 i++, d++)
27209 {
27210 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27211 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27212 else
27213 ftype = INT_FTYPE_V16QI_V16QI_INT;
27214 def_builtin_const (d->mask, d->name, ftype, d->code);
27215 }
27216
27217 /* comi/ucomi insns. */
27218 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27219 {
27220 if (d->mask == OPTION_MASK_ISA_SSE2)
27221 ftype = INT_FTYPE_V2DF_V2DF;
27222 else
27223 ftype = INT_FTYPE_V4SF_V4SF;
27224 def_builtin_const (d->mask, d->name, ftype, d->code);
27225 }
27226
27227 /* SSE */
27228 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27229 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27230 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27231 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27232
27233 /* SSE or 3DNow!A */
27234 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27235 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27236 IX86_BUILTIN_MASKMOVQ);
27237
27238 /* SSE2 */
27239 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27240 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27241
27242 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27243 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27244 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27245 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27246
27247 /* SSE3. */
27248 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27249 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27250 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27251 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27252
27253 /* AES */
27254 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27255 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27256 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27257 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27258 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27259 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27260 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27261 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27262 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27263 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27264 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27265 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
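
/* For illustration: wmmintrin.h is assumed to wrap these directly, e.g.
   (a sketch; compile with -maes):

     #include <wmmintrin.h>

     __m128i
     aes_round (__m128i state, __m128i key)
     {
       return _mm_aesenc_si128 (state, key);   // __builtin_ia32_aesenc128
     }
*/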
27266
27267 /* PCLMUL */
27268 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27269 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27270
27271 /* RDRND */
27272 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27273 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27274 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27275 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27276 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27277 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27278 IX86_BUILTIN_RDRAND64_STEP);
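
/* For illustration: the *_step builtins return nonzero on success and store
   the random value through the pointer, mirroring the RDRAND carry-flag
   protocol.  A sketch (compile with -mrdrnd):

     unsigned int r;
     while (!__builtin_ia32_rdrand32_step (&r))
       ;   // retry until the hardware delivers a value
*/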
27279
27280 /* AVX2 */
27281 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27282 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27283 IX86_BUILTIN_GATHERSIV2DF);
27284
27285 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27286 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27287 IX86_BUILTIN_GATHERSIV4DF);
27288
27289 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27290 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27291 IX86_BUILTIN_GATHERDIV2DF);
27292
27293 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27294 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27295 IX86_BUILTIN_GATHERDIV4DF);
27296
27297 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27298 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27299 IX86_BUILTIN_GATHERSIV4SF);
27300
27301 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27302 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27303 IX86_BUILTIN_GATHERSIV8SF);
27304
27305 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27306 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27307 IX86_BUILTIN_GATHERDIV4SF);
27308
27309 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27310 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27311 IX86_BUILTIN_GATHERDIV8SF);
27312
27313 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27314 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27315 IX86_BUILTIN_GATHERSIV2DI);
27316
27317 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27318 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27319 IX86_BUILTIN_GATHERSIV4DI);
27320
27321 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27322 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27323 IX86_BUILTIN_GATHERDIV2DI);
27324
27325 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27326 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27327 IX86_BUILTIN_GATHERDIV4DI);
27328
27329 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27330 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27331 IX86_BUILTIN_GATHERSIV4SI);
27332
27333 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27334 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27335 IX86_BUILTIN_GATHERSIV8SI);
27336
27337 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27338 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27339 IX86_BUILTIN_GATHERDIV4SI);
27340
27341 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27342 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27343 IX86_BUILTIN_GATHERDIV8SI);
27344
27345 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
27346 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27347 IX86_BUILTIN_GATHERALTSIV4DF);
27348
27349 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
27350 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27351 IX86_BUILTIN_GATHERALTDIV8SF);
27352
27353 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
27354 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27355 IX86_BUILTIN_GATHERALTSIV4DI);
27356
27357 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
27358 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27359 IX86_BUILTIN_GATHERALTDIV8SI);
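
/* For illustration: the gather builtins take (src, base, index, mask, scale);
   avx2intrin.h is assumed to pass an all-ones mask for the unmasked
   intrinsics.  A sketch (compile with -mavx2):

     #include <immintrin.h>

     __m256
     gather8 (const float *base, __m256i idx)
     {
       return _mm256_i32gather_ps (base, idx, 4);   // __builtin_ia32_gathersiv8sf
     }
*/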
27360
27361 /* MMX access to the vec_init patterns. */
27362 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27363 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27364
27365 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27366 V4HI_FTYPE_HI_HI_HI_HI,
27367 IX86_BUILTIN_VEC_INIT_V4HI);
27368
27369 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27370 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27371 IX86_BUILTIN_VEC_INIT_V8QI);
27372
27373 /* Access to the vec_extract patterns. */
27374 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27375 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27376 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27377 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27378 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27379 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27380 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27381 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27382 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27383 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27384
27385 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27386 "__builtin_ia32_vec_ext_v4hi",
27387 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27388
27389 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27390 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27391
27392 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27393 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
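
/* For illustration: the element-extract intrinsics are built on these, e.g.
   (a sketch; compile with -msse2):

     #include <emmintrin.h>

     int
     low_word (__m128i v)
     {
       return _mm_extract_epi16 (v, 0);   // __builtin_ia32_vec_ext_v8hi
     }
*/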
27394
27395 /* Access to the vec_set patterns. */
27396 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27397 "__builtin_ia32_vec_set_v2di",
27398 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27399
27400 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27401 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27402
27403 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27404 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27405
27406 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27407 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27408
27409 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27410 "__builtin_ia32_vec_set_v4hi",
27411 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27412
27413 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27414 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
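
  /* Illustration: a sketch of how a header such as smmintrin.h can use the
     vec_set builtin registered above to replace one element of a V4SI
     vector.  my_insert_epi32 is a hypothetical wrapper; assumes -msse4.1
     and a constant index in the range 0..3.

	typedef int __v4si __attribute__ ((__vector_size__ (16)));
	typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));

	static __inline __m128i
	my_insert_epi32 (__m128i __D, int __S, const int __N)
	{
	  return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si) __D, __S, __N);
	}
  */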
27415
27416 /* Add FMA4 multi-arg instructions.  */
27417 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27418 {
27419 if (d->name == 0)
27420 continue;
27421
27422 ftype = (enum ix86_builtin_func_type) d->flag;
27423 def_builtin_const (d->mask, d->name, ftype, d->code);
27424 }
27425 }
27426
27427 /* Internal method for ix86_init_builtins. */
27428
27429 static void
27430 ix86_init_builtins_va_builtins_abi (void)
27431 {
27432 tree ms_va_ref, sysv_va_ref;
27433 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
27434 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
27435 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
27436 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
27437
27438 if (!TARGET_64BIT)
27439 return;
27440 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
27441 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
27442 ms_va_ref = build_reference_type (ms_va_list_type_node);
27443 sysv_va_ref =
27444 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
27445
27446 fnvoid_va_end_ms =
27447 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27448 fnvoid_va_start_ms =
27449 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27450 fnvoid_va_end_sysv =
27451 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
27452 fnvoid_va_start_sysv =
27453 build_varargs_function_type_list (void_type_node, sysv_va_ref,
27454 NULL_TREE);
27455 fnvoid_va_copy_ms =
27456 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
27457 NULL_TREE);
27458 fnvoid_va_copy_sysv =
27459 build_function_type_list (void_type_node, sysv_va_ref,
27460 sysv_va_ref, NULL_TREE);
27461
27462 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
27463 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
27464 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
27465 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
27466 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
27467 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
27468 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
27469 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27470 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
27471 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27472 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
27473 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27474 }
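
/* Illustration: a sketch of how the va builtins registered above are meant
   to be used from 64-bit user code.  my_ms_varargs is a hypothetical
   function; only __builtin_ms_va_start/_end/_copy are registered here, and
   the actual argument fetch is assumed to go through the generic
   __builtin_va_arg.

	void __attribute__ ((ms_abi))
	my_ms_varargs (int last, ...)
	{
	  __builtin_ms_va_list ap, ap2;
	  int first;

	  __builtin_ms_va_start (ap, last);
	  __builtin_ms_va_copy (ap2, ap);
	  first = __builtin_va_arg (ap2, int);
	  (void) first;
	  __builtin_ms_va_end (ap2);
	  __builtin_ms_va_end (ap);
	}
*/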
27475
27476 static void
27477 ix86_init_builtin_types (void)
27478 {
27479 tree float128_type_node, float80_type_node;
27480
27481 /* The __float80 type. */
27482 float80_type_node = long_double_type_node;
27483 if (TYPE_MODE (float80_type_node) != XFmode)
27484 {
27485 /* The __float80 type. */
27486 float80_type_node = make_node (REAL_TYPE);
27487
27488 TYPE_PRECISION (float80_type_node) = 80;
27489 layout_type (float80_type_node);
27490 }
27491 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
27492
27493 /* The __float128 type. */
27494 float128_type_node = make_node (REAL_TYPE);
27495 TYPE_PRECISION (float128_type_node) = 128;
27496 layout_type (float128_type_node);
27497 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
27498
27499 /* This macro is built by i386-builtin-types.awk. */
27500 DEFINE_BUILTIN_PRIMITIVE_TYPES;
27501 }
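
/* Illustration: once registered, the extended float types are directly
   usable from C on x86 targets.  my_square is a hypothetical function (a
   sketch; quad-precision arithmetic is assumed to be handled by libgcc's
   TFmode support routines).

	static __float128
	my_square (double x)
	{
	  __float80 ext = x;
	  __float128 quad = ext;
	  return quad * quad;
	}
*/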
27502
27503 static void
27504 ix86_init_builtins (void)
27505 {
27506 tree t;
27507
27508 ix86_init_builtin_types ();
27509
27510 /* TFmode support builtins. */
27511 def_builtin_const (0, "__builtin_infq",
27512 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
27513 def_builtin_const (0, "__builtin_huge_valq",
27514 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
27515
27516 /* We will expand them to a normal call if SSE2 isn't available, since
27517 they are used by libgcc.  */
27518 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
27519 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
27520 BUILT_IN_MD, "__fabstf2", NULL_TREE);
27521 TREE_READONLY (t) = 1;
27522 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
27523
27524 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
27525 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
27526 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
27527 TREE_READONLY (t) = 1;
27528 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
27529
27530 ix86_init_tm_builtins ();
27531 ix86_init_mmx_sse_builtins ();
27532
27533 if (TARGET_LP64)
27534 ix86_init_builtins_va_builtins_abi ();
27535
27536 #ifdef SUBTARGET_INIT_BUILTINS
27537 SUBTARGET_INIT_BUILTINS;
27538 #endif
27539 }
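
/* Illustration: the TFmode builtins registered above in use.
   my_negative_abs is a hypothetical function; without SSE2 the fabsq and
   copysignq calls are assumed to become ordinary libgcc calls as noted in
   the comment above.

	static __float128
	my_negative_abs (__float128 x)
	{
	  __float128 limit = __builtin_infq ();
	  if (__builtin_fabsq (x) == limit)
	    return -__builtin_huge_valq ();
	  return __builtin_copysignq (x, (__float128) -1.0);
	}
*/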
27540
27541 /* Return the ix86 builtin for CODE. */
27542
27543 static tree
27544 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
27545 {
27546 if (code >= IX86_BUILTIN_MAX)
27547 return error_mark_node;
27548
27549 return ix86_builtins[code];
27550 }
27551
27552 /* Errors in the source file can cause expand_expr to return const0_rtx
27553 where we expect a vector. To avoid crashing, use one of the vector
27554 clear instructions. */
27555 static rtx
27556 safe_vector_operand (rtx x, enum machine_mode mode)
27557 {
27558 if (x == const0_rtx)
27559 x = CONST0_RTX (mode);
27560 return x;
27561 }
27562
27563 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
27564
27565 static rtx
27566 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
27567 {
27568 rtx pat;
27569 tree arg0 = CALL_EXPR_ARG (exp, 0);
27570 tree arg1 = CALL_EXPR_ARG (exp, 1);
27571 rtx op0 = expand_normal (arg0);
27572 rtx op1 = expand_normal (arg1);
27573 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27574 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27575 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
27576
27577 if (VECTOR_MODE_P (mode0))
27578 op0 = safe_vector_operand (op0, mode0);
27579 if (VECTOR_MODE_P (mode1))
27580 op1 = safe_vector_operand (op1, mode1);
27581
27582 if (optimize || !target
27583 || GET_MODE (target) != tmode
27584 || !insn_data[icode].operand[0].predicate (target, tmode))
27585 target = gen_reg_rtx (tmode);
27586
27587 if (GET_MODE (op1) == SImode && mode1 == TImode)
27588 {
27589 rtx x = gen_reg_rtx (V4SImode);
27590 emit_insn (gen_sse2_loadd (x, op1));
27591 op1 = gen_lowpart (TImode, x);
27592 }
27593
27594 if (!insn_data[icode].operand[1].predicate (op0, mode0))
27595 op0 = copy_to_mode_reg (mode0, op0);
27596 if (!insn_data[icode].operand[2].predicate (op1, mode1))
27597 op1 = copy_to_mode_reg (mode1, op1);
27598
27599 pat = GEN_FCN (icode) (target, op0, op1);
27600 if (! pat)
27601 return 0;
27602
27603 emit_insn (pat);
27604
27605 return target;
27606 }
27607
27608 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
27609
27610 static rtx
27611 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
27612 enum ix86_builtin_func_type m_type,
27613 enum rtx_code sub_code)
27614 {
27615 rtx pat;
27616 int i;
27617 int nargs;
27618 bool comparison_p = false;
27619 bool tf_p = false;
27620 bool last_arg_constant = false;
27621 int num_memory = 0;
27622 struct {
27623 rtx op;
27624 enum machine_mode mode;
27625 } args[4];
27626
27627 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27628
27629 switch (m_type)
27630 {
27631 case MULTI_ARG_4_DF2_DI_I:
27632 case MULTI_ARG_4_DF2_DI_I1:
27633 case MULTI_ARG_4_SF2_SI_I:
27634 case MULTI_ARG_4_SF2_SI_I1:
27635 nargs = 4;
27636 last_arg_constant = true;
27637 break;
27638
27639 case MULTI_ARG_3_SF:
27640 case MULTI_ARG_3_DF:
27641 case MULTI_ARG_3_SF2:
27642 case MULTI_ARG_3_DF2:
27643 case MULTI_ARG_3_DI:
27644 case MULTI_ARG_3_SI:
27645 case MULTI_ARG_3_SI_DI:
27646 case MULTI_ARG_3_HI:
27647 case MULTI_ARG_3_HI_SI:
27648 case MULTI_ARG_3_QI:
27649 case MULTI_ARG_3_DI2:
27650 case MULTI_ARG_3_SI2:
27651 case MULTI_ARG_3_HI2:
27652 case MULTI_ARG_3_QI2:
27653 nargs = 3;
27654 break;
27655
27656 case MULTI_ARG_2_SF:
27657 case MULTI_ARG_2_DF:
27658 case MULTI_ARG_2_DI:
27659 case MULTI_ARG_2_SI:
27660 case MULTI_ARG_2_HI:
27661 case MULTI_ARG_2_QI:
27662 nargs = 2;
27663 break;
27664
27665 case MULTI_ARG_2_DI_IMM:
27666 case MULTI_ARG_2_SI_IMM:
27667 case MULTI_ARG_2_HI_IMM:
27668 case MULTI_ARG_2_QI_IMM:
27669 nargs = 2;
27670 last_arg_constant = true;
27671 break;
27672
27673 case MULTI_ARG_1_SF:
27674 case MULTI_ARG_1_DF:
27675 case MULTI_ARG_1_SF2:
27676 case MULTI_ARG_1_DF2:
27677 case MULTI_ARG_1_DI:
27678 case MULTI_ARG_1_SI:
27679 case MULTI_ARG_1_HI:
27680 case MULTI_ARG_1_QI:
27681 case MULTI_ARG_1_SI_DI:
27682 case MULTI_ARG_1_HI_DI:
27683 case MULTI_ARG_1_HI_SI:
27684 case MULTI_ARG_1_QI_DI:
27685 case MULTI_ARG_1_QI_SI:
27686 case MULTI_ARG_1_QI_HI:
27687 nargs = 1;
27688 break;
27689
27690 case MULTI_ARG_2_DI_CMP:
27691 case MULTI_ARG_2_SI_CMP:
27692 case MULTI_ARG_2_HI_CMP:
27693 case MULTI_ARG_2_QI_CMP:
27694 nargs = 2;
27695 comparison_p = true;
27696 break;
27697
27698 case MULTI_ARG_2_SF_TF:
27699 case MULTI_ARG_2_DF_TF:
27700 case MULTI_ARG_2_DI_TF:
27701 case MULTI_ARG_2_SI_TF:
27702 case MULTI_ARG_2_HI_TF:
27703 case MULTI_ARG_2_QI_TF:
27704 nargs = 2;
27705 tf_p = true;
27706 break;
27707
27708 default:
27709 gcc_unreachable ();
27710 }
27711
27712 if (optimize || !target
27713 || GET_MODE (target) != tmode
27714 || !insn_data[icode].operand[0].predicate (target, tmode))
27715 target = gen_reg_rtx (tmode);
27716
27717 gcc_assert (nargs <= 4);
27718
27719 for (i = 0; i < nargs; i++)
27720 {
27721 tree arg = CALL_EXPR_ARG (exp, i);
27722 rtx op = expand_normal (arg);
27723 int adjust = (comparison_p) ? 1 : 0;
27724 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
27725
27726 if (last_arg_constant && i == nargs - 1)
27727 {
27728 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
27729 {
27730 enum insn_code new_icode = icode;
27731 switch (icode)
27732 {
27733 case CODE_FOR_xop_vpermil2v2df3:
27734 case CODE_FOR_xop_vpermil2v4sf3:
27735 case CODE_FOR_xop_vpermil2v4df3:
27736 case CODE_FOR_xop_vpermil2v8sf3:
27737 error ("the last argument must be a 2-bit immediate");
27738 return gen_reg_rtx (tmode);
27739 case CODE_FOR_xop_rotlv2di3:
27740 new_icode = CODE_FOR_rotlv2di3;
27741 goto xop_rotl;
27742 case CODE_FOR_xop_rotlv4si3:
27743 new_icode = CODE_FOR_rotlv4si3;
27744 goto xop_rotl;
27745 case CODE_FOR_xop_rotlv8hi3:
27746 new_icode = CODE_FOR_rotlv8hi3;
27747 goto xop_rotl;
27748 case CODE_FOR_xop_rotlv16qi3:
27749 new_icode = CODE_FOR_rotlv16qi3;
27750 xop_rotl:
27751 if (CONST_INT_P (op))
27752 {
27753 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
27754 op = GEN_INT (INTVAL (op) & mask);
27755 gcc_checking_assert
27756 (insn_data[icode].operand[i + 1].predicate (op, mode));
27757 }
27758 else
27759 {
27760 gcc_checking_assert
27761 (nargs == 2
27762 && insn_data[new_icode].operand[0].mode == tmode
27763 && insn_data[new_icode].operand[1].mode == tmode
27764 && insn_data[new_icode].operand[2].mode == mode
27765 && insn_data[new_icode].operand[0].predicate
27766 == insn_data[icode].operand[0].predicate
27767 && insn_data[new_icode].operand[1].predicate
27768 == insn_data[icode].operand[1].predicate);
27769 icode = new_icode;
27770 goto non_constant;
27771 }
27772 break;
27773 default:
27774 gcc_unreachable ();
27775 }
27776 }
27777 }
27778 else
27779 {
27780 non_constant:
27781 if (VECTOR_MODE_P (mode))
27782 op = safe_vector_operand (op, mode);
27783
27784 /* If we aren't optimizing, only allow one memory operand to be
27785 generated. */
27786 if (memory_operand (op, mode))
27787 num_memory++;
27788
27789 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
27790
27791 if (optimize
27792 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
27793 || num_memory > 1)
27794 op = force_reg (mode, op);
27795 }
27796
27797 args[i].op = op;
27798 args[i].mode = mode;
27799 }
27800
27801 switch (nargs)
27802 {
27803 case 1:
27804 pat = GEN_FCN (icode) (target, args[0].op);
27805 break;
27806
27807 case 2:
27808 if (tf_p)
27809 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
27810 GEN_INT ((int)sub_code));
27811 else if (! comparison_p)
27812 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
27813 else
27814 {
27815 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
27816 args[0].op,
27817 args[1].op);
27818
27819 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
27820 }
27821 break;
27822
27823 case 3:
27824 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
27825 break;
27826
27827 case 4:
27828 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
27829 break;
27830
27831 default:
27832 gcc_unreachable ();
27833 }
27834
27835 if (! pat)
27836 return 0;
27837
27838 emit_insn (pat);
27839 return target;
27840 }
27841
27842 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
27843 insns with vec_merge. */
27844
27845 static rtx
27846 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
27847 rtx target)
27848 {
27849 rtx pat;
27850 tree arg0 = CALL_EXPR_ARG (exp, 0);
27851 rtx op1, op0 = expand_normal (arg0);
27852 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27853 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27854
27855 if (optimize || !target
27856 || GET_MODE (target) != tmode
27857 || !insn_data[icode].operand[0].predicate (target, tmode))
27858 target = gen_reg_rtx (tmode);
27859
27860 if (VECTOR_MODE_P (mode0))
27861 op0 = safe_vector_operand (op0, mode0);
27862
27863 if ((optimize && !register_operand (op0, mode0))
27864 || !insn_data[icode].operand[1].predicate (op0, mode0))
27865 op0 = copy_to_mode_reg (mode0, op0);
27866
27867 op1 = op0;
27868 if (!insn_data[icode].operand[2].predicate (op1, mode0))
27869 op1 = copy_to_mode_reg (mode0, op1);
27870
27871 pat = GEN_FCN (icode) (target, op0, op1);
27872 if (! pat)
27873 return 0;
27874 emit_insn (pat);
27875 return target;
27876 }
27877
27878 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
27879
27880 static rtx
27881 ix86_expand_sse_compare (const struct builtin_description *d,
27882 tree exp, rtx target, bool swap)
27883 {
27884 rtx pat;
27885 tree arg0 = CALL_EXPR_ARG (exp, 0);
27886 tree arg1 = CALL_EXPR_ARG (exp, 1);
27887 rtx op0 = expand_normal (arg0);
27888 rtx op1 = expand_normal (arg1);
27889 rtx op2;
27890 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
27891 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
27892 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
27893 enum rtx_code comparison = d->comparison;
27894
27895 if (VECTOR_MODE_P (mode0))
27896 op0 = safe_vector_operand (op0, mode0);
27897 if (VECTOR_MODE_P (mode1))
27898 op1 = safe_vector_operand (op1, mode1);
27899
27900 /* Swap operands if we have a comparison that isn't available in
27901 hardware. */
27902 if (swap)
27903 {
27904 rtx tmp = gen_reg_rtx (mode1);
27905 emit_move_insn (tmp, op1);
27906 op1 = op0;
27907 op0 = tmp;
27908 }
27909
27910 if (optimize || !target
27911 || GET_MODE (target) != tmode
27912 || !insn_data[d->icode].operand[0].predicate (target, tmode))
27913 target = gen_reg_rtx (tmode);
27914
27915 if ((optimize && !register_operand (op0, mode0))
27916 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
27917 op0 = copy_to_mode_reg (mode0, op0);
27918 if ((optimize && !register_operand (op1, mode1))
27919 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
27920 op1 = copy_to_mode_reg (mode1, op1);
27921
27922 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
27923 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
27924 if (! pat)
27925 return 0;
27926 emit_insn (pat);
27927 return target;
27928 }
27929
27930 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
27931
27932 static rtx
27933 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
27934 rtx target)
27935 {
27936 rtx pat;
27937 tree arg0 = CALL_EXPR_ARG (exp, 0);
27938 tree arg1 = CALL_EXPR_ARG (exp, 1);
27939 rtx op0 = expand_normal (arg0);
27940 rtx op1 = expand_normal (arg1);
27941 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
27942 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
27943 enum rtx_code comparison = d->comparison;
27944
27945 if (VECTOR_MODE_P (mode0))
27946 op0 = safe_vector_operand (op0, mode0);
27947 if (VECTOR_MODE_P (mode1))
27948 op1 = safe_vector_operand (op1, mode1);
27949
27950 /* Swap operands if we have a comparison that isn't available in
27951 hardware. */
27952 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
27953 {
27954 rtx tmp = op1;
27955 op1 = op0;
27956 op0 = tmp;
27957 }
27958
27959 target = gen_reg_rtx (SImode);
27960 emit_move_insn (target, const0_rtx);
27961 target = gen_rtx_SUBREG (QImode, target, 0);
27962
27963 if ((optimize && !register_operand (op0, mode0))
27964 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
27965 op0 = copy_to_mode_reg (mode0, op0);
27966 if ((optimize && !register_operand (op1, mode1))
27967 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
27968 op1 = copy_to_mode_reg (mode1, op1);
27969
27970 pat = GEN_FCN (d->icode) (op0, op1);
27971 if (! pat)
27972 return 0;
27973 emit_insn (pat);
27974 emit_insn (gen_rtx_SET (VOIDmode,
27975 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27976 gen_rtx_fmt_ee (comparison, QImode,
27977 SET_DEST (pat),
27978 const0_rtx)));
27979
27980 return SUBREG_REG (target);
27981 }
27982
27983 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
27984
27985 static rtx
27986 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
27987 rtx target)
27988 {
27989 rtx pat;
27990 tree arg0 = CALL_EXPR_ARG (exp, 0);
27991 rtx op1, op0 = expand_normal (arg0);
27992 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
27993 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
27994
27995 if (optimize || target == 0
27996 || GET_MODE (target) != tmode
27997 || !insn_data[d->icode].operand[0].predicate (target, tmode))
27998 target = gen_reg_rtx (tmode);
27999
28000 if (VECTOR_MODE_P (mode0))
28001 op0 = safe_vector_operand (op0, mode0);
28002
28003 if ((optimize && !register_operand (op0, mode0))
28004 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28005 op0 = copy_to_mode_reg (mode0, op0);
28006
28007 op1 = GEN_INT (d->comparison);
28008
28009 pat = GEN_FCN (d->icode) (target, op0, op1);
28010 if (! pat)
28011 return 0;
28012 emit_insn (pat);
28013 return target;
28014 }
28015
28016 static rtx
28017 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28018 tree exp, rtx target)
28019 {
28020 rtx pat;
28021 tree arg0 = CALL_EXPR_ARG (exp, 0);
28022 tree arg1 = CALL_EXPR_ARG (exp, 1);
28023 rtx op0 = expand_normal (arg0);
28024 rtx op1 = expand_normal (arg1);
28025 rtx op2;
28026 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28027 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28028 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28029
28030 if (optimize || target == 0
28031 || GET_MODE (target) != tmode
28032 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28033 target = gen_reg_rtx (tmode);
28034
28035 op0 = safe_vector_operand (op0, mode0);
28036 op1 = safe_vector_operand (op1, mode1);
28037
28038 if ((optimize && !register_operand (op0, mode0))
28039 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28040 op0 = copy_to_mode_reg (mode0, op0);
28041 if ((optimize && !register_operand (op1, mode1))
28042 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28043 op1 = copy_to_mode_reg (mode1, op1);
28044
28045 op2 = GEN_INT (d->comparison);
28046
28047 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28048 if (! pat)
28049 return 0;
28050 emit_insn (pat);
28051 return target;
28052 }
28053
28054 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
28055
28056 static rtx
28057 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28058 rtx target)
28059 {
28060 rtx pat;
28061 tree arg0 = CALL_EXPR_ARG (exp, 0);
28062 tree arg1 = CALL_EXPR_ARG (exp, 1);
28063 rtx op0 = expand_normal (arg0);
28064 rtx op1 = expand_normal (arg1);
28065 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28066 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28067 enum rtx_code comparison = d->comparison;
28068
28069 if (VECTOR_MODE_P (mode0))
28070 op0 = safe_vector_operand (op0, mode0);
28071 if (VECTOR_MODE_P (mode1))
28072 op1 = safe_vector_operand (op1, mode1);
28073
28074 target = gen_reg_rtx (SImode);
28075 emit_move_insn (target, const0_rtx);
28076 target = gen_rtx_SUBREG (QImode, target, 0);
28077
28078 if ((optimize && !register_operand (op0, mode0))
28079 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28080 op0 = copy_to_mode_reg (mode0, op0);
28081 if ((optimize && !register_operand (op1, mode1))
28082 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28083 op1 = copy_to_mode_reg (mode1, op1);
28084
28085 pat = GEN_FCN (d->icode) (op0, op1);
28086 if (! pat)
28087 return 0;
28088 emit_insn (pat);
28089 emit_insn (gen_rtx_SET (VOIDmode,
28090 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28091 gen_rtx_fmt_ee (comparison, QImode,
28092 SET_DEST (pat),
28093 const0_rtx)));
28094
28095 return SUBREG_REG (target);
28096 }
28097
28098 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
28099
28100 static rtx
28101 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28102 tree exp, rtx target)
28103 {
28104 rtx pat;
28105 tree arg0 = CALL_EXPR_ARG (exp, 0);
28106 tree arg1 = CALL_EXPR_ARG (exp, 1);
28107 tree arg2 = CALL_EXPR_ARG (exp, 2);
28108 tree arg3 = CALL_EXPR_ARG (exp, 3);
28109 tree arg4 = CALL_EXPR_ARG (exp, 4);
28110 rtx scratch0, scratch1;
28111 rtx op0 = expand_normal (arg0);
28112 rtx op1 = expand_normal (arg1);
28113 rtx op2 = expand_normal (arg2);
28114 rtx op3 = expand_normal (arg3);
28115 rtx op4 = expand_normal (arg4);
28116 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28117
28118 tmode0 = insn_data[d->icode].operand[0].mode;
28119 tmode1 = insn_data[d->icode].operand[1].mode;
28120 modev2 = insn_data[d->icode].operand[2].mode;
28121 modei3 = insn_data[d->icode].operand[3].mode;
28122 modev4 = insn_data[d->icode].operand[4].mode;
28123 modei5 = insn_data[d->icode].operand[5].mode;
28124 modeimm = insn_data[d->icode].operand[6].mode;
28125
28126 if (VECTOR_MODE_P (modev2))
28127 op0 = safe_vector_operand (op0, modev2);
28128 if (VECTOR_MODE_P (modev4))
28129 op2 = safe_vector_operand (op2, modev4);
28130
28131 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28132 op0 = copy_to_mode_reg (modev2, op0);
28133 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28134 op1 = copy_to_mode_reg (modei3, op1);
28135 if ((optimize && !register_operand (op2, modev4))
28136 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28137 op2 = copy_to_mode_reg (modev4, op2);
28138 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28139 op3 = copy_to_mode_reg (modei5, op3);
28140
28141 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28142 {
28143 error ("the fifth argument must be an 8-bit immediate");
28144 return const0_rtx;
28145 }
28146
28147 if (d->code == IX86_BUILTIN_PCMPESTRI128)
28148 {
28149 if (optimize || !target
28150 || GET_MODE (target) != tmode0
28151 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28152 target = gen_reg_rtx (tmode0);
28153
28154 scratch1 = gen_reg_rtx (tmode1);
28155
28156 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
28157 }
28158 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
28159 {
28160 if (optimize || !target
28161 || GET_MODE (target) != tmode1
28162 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28163 target = gen_reg_rtx (tmode1);
28164
28165 scratch0 = gen_reg_rtx (tmode0);
28166
28167 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28168 }
28169 else
28170 {
28171 gcc_assert (d->flag);
28172
28173 scratch0 = gen_reg_rtx (tmode0);
28174 scratch1 = gen_reg_rtx (tmode1);
28175
28176 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28177 }
28178
28179 if (! pat)
28180 return 0;
28181
28182 emit_insn (pat);
28183
28184 if (d->flag)
28185 {
28186 target = gen_reg_rtx (SImode);
28187 emit_move_insn (target, const0_rtx);
28188 target = gen_rtx_SUBREG (QImode, target, 0);
28189
28190 emit_insn
28191 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28192 gen_rtx_fmt_ee (EQ, QImode,
28193 gen_rtx_REG ((enum machine_mode) d->flag,
28194 FLAGS_REG),
28195 const0_rtx)));
28196 return SUBREG_REG (target);
28197 }
28198 else
28199 return target;
28200 }
28201
28202
28203 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
28204
28205 static rtx
28206 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28207 tree exp, rtx target)
28208 {
28209 rtx pat;
28210 tree arg0 = CALL_EXPR_ARG (exp, 0);
28211 tree arg1 = CALL_EXPR_ARG (exp, 1);
28212 tree arg2 = CALL_EXPR_ARG (exp, 2);
28213 rtx scratch0, scratch1;
28214 rtx op0 = expand_normal (arg0);
28215 rtx op1 = expand_normal (arg1);
28216 rtx op2 = expand_normal (arg2);
28217 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28218
28219 tmode0 = insn_data[d->icode].operand[0].mode;
28220 tmode1 = insn_data[d->icode].operand[1].mode;
28221 modev2 = insn_data[d->icode].operand[2].mode;
28222 modev3 = insn_data[d->icode].operand[3].mode;
28223 modeimm = insn_data[d->icode].operand[4].mode;
28224
28225 if (VECTOR_MODE_P (modev2))
28226 op0 = safe_vector_operand (op0, modev2);
28227 if (VECTOR_MODE_P (modev3))
28228 op1 = safe_vector_operand (op1, modev3);
28229
28230 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28231 op0 = copy_to_mode_reg (modev2, op0);
28232 if ((optimize && !register_operand (op1, modev3))
28233 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28234 op1 = copy_to_mode_reg (modev3, op1);
28235
28236 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28237 {
28238 error ("the third argument must be an 8-bit immediate");
28239 return const0_rtx;
28240 }
28241
28242 if (d->code == IX86_BUILTIN_PCMPISTRI128)
28243 {
28244 if (optimize || !target
28245 || GET_MODE (target) != tmode0
28246 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28247 target = gen_reg_rtx (tmode0);
28248
28249 scratch1 = gen_reg_rtx (tmode1);
28250
28251 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28252 }
28253 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28254 {
28255 if (optimize || !target
28256 || GET_MODE (target) != tmode1
28257 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28258 target = gen_reg_rtx (tmode1);
28259
28260 scratch0 = gen_reg_rtx (tmode0);
28261
28262 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28263 }
28264 else
28265 {
28266 gcc_assert (d->flag);
28267
28268 scratch0 = gen_reg_rtx (tmode0);
28269 scratch1 = gen_reg_rtx (tmode1);
28270
28271 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28272 }
28273
28274 if (! pat)
28275 return 0;
28276
28277 emit_insn (pat);
28278
28279 if (d->flag)
28280 {
28281 target = gen_reg_rtx (SImode);
28282 emit_move_insn (target, const0_rtx);
28283 target = gen_rtx_SUBREG (QImode, target, 0);
28284
28285 emit_insn
28286 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28287 gen_rtx_fmt_ee (EQ, QImode,
28288 gen_rtx_REG ((enum machine_mode) d->flag,
28289 FLAGS_REG),
28290 const0_rtx)));
28291 return SUBREG_REG (target);
28292 }
28293 else
28294 return target;
28295 }
28296
28297 /* Subroutine of ix86_expand_builtin to take care of insns with
28298 variable number of operands. */
28299
28300 static rtx
28301 ix86_expand_args_builtin (const struct builtin_description *d,
28302 tree exp, rtx target)
28303 {
28304 rtx pat, real_target;
28305 unsigned int i, nargs;
28306 unsigned int nargs_constant = 0;
28307 int num_memory = 0;
28308 struct
28309 {
28310 rtx op;
28311 enum machine_mode mode;
28312 } args[4];
28313 bool last_arg_count = false;
28314 enum insn_code icode = d->icode;
28315 const struct insn_data_d *insn_p = &insn_data[icode];
28316 enum machine_mode tmode = insn_p->operand[0].mode;
28317 enum machine_mode rmode = VOIDmode;
28318 bool swap = false;
28319 enum rtx_code comparison = d->comparison;
28320
28321 switch ((enum ix86_builtin_func_type) d->flag)
28322 {
28323 case V2DF_FTYPE_V2DF_ROUND:
28324 case V4DF_FTYPE_V4DF_ROUND:
28325 case V4SF_FTYPE_V4SF_ROUND:
28326 case V8SF_FTYPE_V8SF_ROUND:
28327 case V4SI_FTYPE_V4SF_ROUND:
28328 case V8SI_FTYPE_V8SF_ROUND:
28329 return ix86_expand_sse_round (d, exp, target);
28330 case V4SI_FTYPE_V2DF_V2DF_ROUND:
28331 case V8SI_FTYPE_V4DF_V4DF_ROUND:
28332 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
28333 case INT_FTYPE_V8SF_V8SF_PTEST:
28334 case INT_FTYPE_V4DI_V4DI_PTEST:
28335 case INT_FTYPE_V4DF_V4DF_PTEST:
28336 case INT_FTYPE_V4SF_V4SF_PTEST:
28337 case INT_FTYPE_V2DI_V2DI_PTEST:
28338 case INT_FTYPE_V2DF_V2DF_PTEST:
28339 return ix86_expand_sse_ptest (d, exp, target);
28340 case FLOAT128_FTYPE_FLOAT128:
28341 case FLOAT_FTYPE_FLOAT:
28342 case INT_FTYPE_INT:
28343 case UINT64_FTYPE_INT:
28344 case UINT16_FTYPE_UINT16:
28345 case INT64_FTYPE_INT64:
28346 case INT64_FTYPE_V4SF:
28347 case INT64_FTYPE_V2DF:
28348 case INT_FTYPE_V16QI:
28349 case INT_FTYPE_V8QI:
28350 case INT_FTYPE_V8SF:
28351 case INT_FTYPE_V4DF:
28352 case INT_FTYPE_V4SF:
28353 case INT_FTYPE_V2DF:
28354 case INT_FTYPE_V32QI:
28355 case V16QI_FTYPE_V16QI:
28356 case V8SI_FTYPE_V8SF:
28357 case V8SI_FTYPE_V4SI:
28358 case V8HI_FTYPE_V8HI:
28359 case V8HI_FTYPE_V16QI:
28360 case V8QI_FTYPE_V8QI:
28361 case V8SF_FTYPE_V8SF:
28362 case V8SF_FTYPE_V8SI:
28363 case V8SF_FTYPE_V4SF:
28364 case V8SF_FTYPE_V8HI:
28365 case V4SI_FTYPE_V4SI:
28366 case V4SI_FTYPE_V16QI:
28367 case V4SI_FTYPE_V4SF:
28368 case V4SI_FTYPE_V8SI:
28369 case V4SI_FTYPE_V8HI:
28370 case V4SI_FTYPE_V4DF:
28371 case V4SI_FTYPE_V2DF:
28372 case V4HI_FTYPE_V4HI:
28373 case V4DF_FTYPE_V4DF:
28374 case V4DF_FTYPE_V4SI:
28375 case V4DF_FTYPE_V4SF:
28376 case V4DF_FTYPE_V2DF:
28377 case V4SF_FTYPE_V4SF:
28378 case V4SF_FTYPE_V4SI:
28379 case V4SF_FTYPE_V8SF:
28380 case V4SF_FTYPE_V4DF:
28381 case V4SF_FTYPE_V8HI:
28382 case V4SF_FTYPE_V2DF:
28383 case V2DI_FTYPE_V2DI:
28384 case V2DI_FTYPE_V16QI:
28385 case V2DI_FTYPE_V8HI:
28386 case V2DI_FTYPE_V4SI:
28387 case V2DF_FTYPE_V2DF:
28388 case V2DF_FTYPE_V4SI:
28389 case V2DF_FTYPE_V4DF:
28390 case V2DF_FTYPE_V4SF:
28391 case V2DF_FTYPE_V2SI:
28392 case V2SI_FTYPE_V2SI:
28393 case V2SI_FTYPE_V4SF:
28394 case V2SI_FTYPE_V2SF:
28395 case V2SI_FTYPE_V2DF:
28396 case V2SF_FTYPE_V2SF:
28397 case V2SF_FTYPE_V2SI:
28398 case V32QI_FTYPE_V32QI:
28399 case V32QI_FTYPE_V16QI:
28400 case V16HI_FTYPE_V16HI:
28401 case V16HI_FTYPE_V8HI:
28402 case V8SI_FTYPE_V8SI:
28403 case V16HI_FTYPE_V16QI:
28404 case V8SI_FTYPE_V16QI:
28405 case V4DI_FTYPE_V16QI:
28406 case V8SI_FTYPE_V8HI:
28407 case V4DI_FTYPE_V8HI:
28408 case V4DI_FTYPE_V4SI:
28409 case V4DI_FTYPE_V2DI:
28410 nargs = 1;
28411 break;
28412 case V4SF_FTYPE_V4SF_VEC_MERGE:
28413 case V2DF_FTYPE_V2DF_VEC_MERGE:
28414 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
28415 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
28416 case V16QI_FTYPE_V16QI_V16QI:
28417 case V16QI_FTYPE_V8HI_V8HI:
28418 case V8QI_FTYPE_V8QI_V8QI:
28419 case V8QI_FTYPE_V4HI_V4HI:
28420 case V8HI_FTYPE_V8HI_V8HI:
28421 case V8HI_FTYPE_V16QI_V16QI:
28422 case V8HI_FTYPE_V4SI_V4SI:
28423 case V8SF_FTYPE_V8SF_V8SF:
28424 case V8SF_FTYPE_V8SF_V8SI:
28425 case V4SI_FTYPE_V4SI_V4SI:
28426 case V4SI_FTYPE_V8HI_V8HI:
28427 case V4SI_FTYPE_V4SF_V4SF:
28428 case V4SI_FTYPE_V2DF_V2DF:
28429 case V4HI_FTYPE_V4HI_V4HI:
28430 case V4HI_FTYPE_V8QI_V8QI:
28431 case V4HI_FTYPE_V2SI_V2SI:
28432 case V4DF_FTYPE_V4DF_V4DF:
28433 case V4DF_FTYPE_V4DF_V4DI:
28434 case V4SF_FTYPE_V4SF_V4SF:
28435 case V4SF_FTYPE_V4SF_V4SI:
28436 case V4SF_FTYPE_V4SF_V2SI:
28437 case V4SF_FTYPE_V4SF_V2DF:
28438 case V4SF_FTYPE_V4SF_DI:
28439 case V4SF_FTYPE_V4SF_SI:
28440 case V2DI_FTYPE_V2DI_V2DI:
28441 case V2DI_FTYPE_V16QI_V16QI:
28442 case V2DI_FTYPE_V4SI_V4SI:
28443 case V2DI_FTYPE_V2DI_V16QI:
28444 case V2DI_FTYPE_V2DF_V2DF:
28445 case V2SI_FTYPE_V2SI_V2SI:
28446 case V2SI_FTYPE_V4HI_V4HI:
28447 case V2SI_FTYPE_V2SF_V2SF:
28448 case V2DF_FTYPE_V2DF_V2DF:
28449 case V2DF_FTYPE_V2DF_V4SF:
28450 case V2DF_FTYPE_V2DF_V2DI:
28451 case V2DF_FTYPE_V2DF_DI:
28452 case V2DF_FTYPE_V2DF_SI:
28453 case V2SF_FTYPE_V2SF_V2SF:
28454 case V1DI_FTYPE_V1DI_V1DI:
28455 case V1DI_FTYPE_V8QI_V8QI:
28456 case V1DI_FTYPE_V2SI_V2SI:
28457 case V32QI_FTYPE_V16HI_V16HI:
28458 case V16HI_FTYPE_V8SI_V8SI:
28459 case V32QI_FTYPE_V32QI_V32QI:
28460 case V16HI_FTYPE_V32QI_V32QI:
28461 case V16HI_FTYPE_V16HI_V16HI:
28462 case V8SI_FTYPE_V4DF_V4DF:
28463 case V8SI_FTYPE_V8SI_V8SI:
28464 case V8SI_FTYPE_V16HI_V16HI:
28465 case V4DI_FTYPE_V4DI_V4DI:
28466 case V4DI_FTYPE_V8SI_V8SI:
28467 if (comparison == UNKNOWN)
28468 return ix86_expand_binop_builtin (icode, exp, target);
28469 nargs = 2;
28470 break;
28471 case V4SF_FTYPE_V4SF_V4SF_SWAP:
28472 case V2DF_FTYPE_V2DF_V2DF_SWAP:
28473 gcc_assert (comparison != UNKNOWN);
28474 nargs = 2;
28475 swap = true;
28476 break;
28477 case V16HI_FTYPE_V16HI_V8HI_COUNT:
28478 case V16HI_FTYPE_V16HI_SI_COUNT:
28479 case V8SI_FTYPE_V8SI_V4SI_COUNT:
28480 case V8SI_FTYPE_V8SI_SI_COUNT:
28481 case V4DI_FTYPE_V4DI_V2DI_COUNT:
28482 case V4DI_FTYPE_V4DI_INT_COUNT:
28483 case V8HI_FTYPE_V8HI_V8HI_COUNT:
28484 case V8HI_FTYPE_V8HI_SI_COUNT:
28485 case V4SI_FTYPE_V4SI_V4SI_COUNT:
28486 case V4SI_FTYPE_V4SI_SI_COUNT:
28487 case V4HI_FTYPE_V4HI_V4HI_COUNT:
28488 case V4HI_FTYPE_V4HI_SI_COUNT:
28489 case V2DI_FTYPE_V2DI_V2DI_COUNT:
28490 case V2DI_FTYPE_V2DI_SI_COUNT:
28491 case V2SI_FTYPE_V2SI_V2SI_COUNT:
28492 case V2SI_FTYPE_V2SI_SI_COUNT:
28493 case V1DI_FTYPE_V1DI_V1DI_COUNT:
28494 case V1DI_FTYPE_V1DI_SI_COUNT:
28495 nargs = 2;
28496 last_arg_count = true;
28497 break;
28498 case UINT64_FTYPE_UINT64_UINT64:
28499 case UINT_FTYPE_UINT_UINT:
28500 case UINT_FTYPE_UINT_USHORT:
28501 case UINT_FTYPE_UINT_UCHAR:
28502 case UINT16_FTYPE_UINT16_INT:
28503 case UINT8_FTYPE_UINT8_INT:
28504 nargs = 2;
28505 break;
28506 case V2DI_FTYPE_V2DI_INT_CONVERT:
28507 nargs = 2;
28508 rmode = V1TImode;
28509 nargs_constant = 1;
28510 break;
28511 case V4DI_FTYPE_V4DI_INT_CONVERT:
28512 nargs = 2;
28513 rmode = V2TImode;
28514 nargs_constant = 1;
28515 break;
28516 case V8HI_FTYPE_V8HI_INT:
28517 case V8HI_FTYPE_V8SF_INT:
28518 case V8HI_FTYPE_V4SF_INT:
28519 case V8SF_FTYPE_V8SF_INT:
28520 case V4SI_FTYPE_V4SI_INT:
28521 case V4SI_FTYPE_V8SI_INT:
28522 case V4HI_FTYPE_V4HI_INT:
28523 case V4DF_FTYPE_V4DF_INT:
28524 case V4SF_FTYPE_V4SF_INT:
28525 case V4SF_FTYPE_V8SF_INT:
28526 case V2DI_FTYPE_V2DI_INT:
28527 case V2DF_FTYPE_V2DF_INT:
28528 case V2DF_FTYPE_V4DF_INT:
28529 case V16HI_FTYPE_V16HI_INT:
28530 case V8SI_FTYPE_V8SI_INT:
28531 case V4DI_FTYPE_V4DI_INT:
28532 case V2DI_FTYPE_V4DI_INT:
28533 nargs = 2;
28534 nargs_constant = 1;
28535 break;
28536 case V16QI_FTYPE_V16QI_V16QI_V16QI:
28537 case V8SF_FTYPE_V8SF_V8SF_V8SF:
28538 case V4DF_FTYPE_V4DF_V4DF_V4DF:
28539 case V4SF_FTYPE_V4SF_V4SF_V4SF:
28540 case V2DF_FTYPE_V2DF_V2DF_V2DF:
28541 case V32QI_FTYPE_V32QI_V32QI_V32QI:
28542 nargs = 3;
28543 break;
28544 case V32QI_FTYPE_V32QI_V32QI_INT:
28545 case V16HI_FTYPE_V16HI_V16HI_INT:
28546 case V16QI_FTYPE_V16QI_V16QI_INT:
28547 case V4DI_FTYPE_V4DI_V4DI_INT:
28548 case V8HI_FTYPE_V8HI_V8HI_INT:
28549 case V8SI_FTYPE_V8SI_V8SI_INT:
28550 case V8SI_FTYPE_V8SI_V4SI_INT:
28551 case V8SF_FTYPE_V8SF_V8SF_INT:
28552 case V8SF_FTYPE_V8SF_V4SF_INT:
28553 case V4SI_FTYPE_V4SI_V4SI_INT:
28554 case V4DF_FTYPE_V4DF_V4DF_INT:
28555 case V4DF_FTYPE_V4DF_V2DF_INT:
28556 case V4SF_FTYPE_V4SF_V4SF_INT:
28557 case V2DI_FTYPE_V2DI_V2DI_INT:
28558 case V4DI_FTYPE_V4DI_V2DI_INT:
28559 case V2DF_FTYPE_V2DF_V2DF_INT:
28560 nargs = 3;
28561 nargs_constant = 1;
28562 break;
28563 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
28564 nargs = 3;
28565 rmode = V4DImode;
28566 nargs_constant = 1;
28567 break;
28568 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
28569 nargs = 3;
28570 rmode = V2DImode;
28571 nargs_constant = 1;
28572 break;
28573 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
28574 nargs = 3;
28575 rmode = DImode;
28576 nargs_constant = 1;
28577 break;
28578 case V2DI_FTYPE_V2DI_UINT_UINT:
28579 nargs = 3;
28580 nargs_constant = 2;
28581 break;
28582 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
28583 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
28584 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
28585 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
28586 nargs = 4;
28587 nargs_constant = 1;
28588 break;
28589 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
28590 nargs = 4;
28591 nargs_constant = 2;
28592 break;
28593 default:
28594 gcc_unreachable ();
28595 }
28596
28597 gcc_assert (nargs <= ARRAY_SIZE (args));
28598
28599 if (comparison != UNKNOWN)
28600 {
28601 gcc_assert (nargs == 2);
28602 return ix86_expand_sse_compare (d, exp, target, swap);
28603 }
28604
28605 if (rmode == VOIDmode || rmode == tmode)
28606 {
28607 if (optimize
28608 || target == 0
28609 || GET_MODE (target) != tmode
28610 || !insn_p->operand[0].predicate (target, tmode))
28611 target = gen_reg_rtx (tmode);
28612 real_target = target;
28613 }
28614 else
28615 {
28616 target = gen_reg_rtx (rmode);
28617 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
28618 }
28619
28620 for (i = 0; i < nargs; i++)
28621 {
28622 tree arg = CALL_EXPR_ARG (exp, i);
28623 rtx op = expand_normal (arg);
28624 enum machine_mode mode = insn_p->operand[i + 1].mode;
28625 bool match = insn_p->operand[i + 1].predicate (op, mode);
28626
28627 if (last_arg_count && (i + 1) == nargs)
28628 {
28629 /* SIMD shift insns take either an 8-bit immediate or a
28630 register as count.  But builtin functions take an int as
28631 count.  If count doesn't match, we put it in a register.  */
28632 if (!match)
28633 {
28634 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
28635 if (!insn_p->operand[i + 1].predicate (op, mode))
28636 op = copy_to_reg (op);
28637 }
28638 }
28639 else if ((nargs - i) <= nargs_constant)
28640 {
28641 if (!match)
28642 switch (icode)
28643 {
28644 case CODE_FOR_avx2_inserti128:
28645 case CODE_FOR_avx2_extracti128:
28646 error ("the last argument must be a 1-bit immediate");
28647 return const0_rtx;
28648
28649 case CODE_FOR_sse4_1_roundsd:
28650 case CODE_FOR_sse4_1_roundss:
28651
28652 case CODE_FOR_sse4_1_roundpd:
28653 case CODE_FOR_sse4_1_roundps:
28654 case CODE_FOR_avx_roundpd256:
28655 case CODE_FOR_avx_roundps256:
28656
28657 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
28658 case CODE_FOR_sse4_1_roundps_sfix:
28659 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
28660 case CODE_FOR_avx_roundps_sfix256:
28661
28662 case CODE_FOR_sse4_1_blendps:
28663 case CODE_FOR_avx_blendpd256:
28664 case CODE_FOR_avx_vpermilv4df:
28665 error ("the last argument must be a 4-bit immediate");
28666 return const0_rtx;
28667
28668 case CODE_FOR_sse4_1_blendpd:
28669 case CODE_FOR_avx_vpermilv2df:
28670 case CODE_FOR_xop_vpermil2v2df3:
28671 case CODE_FOR_xop_vpermil2v4sf3:
28672 case CODE_FOR_xop_vpermil2v4df3:
28673 case CODE_FOR_xop_vpermil2v8sf3:
28674 error ("the last argument must be a 2-bit immediate");
28675 return const0_rtx;
28676
28677 case CODE_FOR_avx_vextractf128v4df:
28678 case CODE_FOR_avx_vextractf128v8sf:
28679 case CODE_FOR_avx_vextractf128v8si:
28680 case CODE_FOR_avx_vinsertf128v4df:
28681 case CODE_FOR_avx_vinsertf128v8sf:
28682 case CODE_FOR_avx_vinsertf128v8si:
28683 error ("the last argument must be a 1-bit immediate");
28684 return const0_rtx;
28685
28686 case CODE_FOR_avx_vmcmpv2df3:
28687 case CODE_FOR_avx_vmcmpv4sf3:
28688 case CODE_FOR_avx_cmpv2df3:
28689 case CODE_FOR_avx_cmpv4sf3:
28690 case CODE_FOR_avx_cmpv4df3:
28691 case CODE_FOR_avx_cmpv8sf3:
28692 error ("the last argument must be a 5-bit immediate");
28693 return const0_rtx;
28694
28695 default:
28696 switch (nargs_constant)
28697 {
28698 case 2:
28699 if ((nargs - i) == nargs_constant)
28700 {
28701 error ("the next to last argument must be an 8-bit immediate");
28702 break;
28703 }
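		  /* FALLTHRU */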
28704 case 1:
28705 error ("the last argument must be an 8-bit immediate");
28706 break;
28707 default:
28708 gcc_unreachable ();
28709 }
28710 return const0_rtx;
28711 }
28712 }
28713 else
28714 {
28715 if (VECTOR_MODE_P (mode))
28716 op = safe_vector_operand (op, mode);
28717
28718 /* If we aren't optimizing, only allow one memory operand to
28719 be generated. */
28720 if (memory_operand (op, mode))
28721 num_memory++;
28722
28723 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
28724 {
28725 if (optimize || !match || num_memory > 1)
28726 op = copy_to_mode_reg (mode, op);
28727 }
28728 else
28729 {
28730 op = copy_to_reg (op);
28731 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
28732 }
28733 }
28734
28735 args[i].op = op;
28736 args[i].mode = mode;
28737 }
28738
28739 switch (nargs)
28740 {
28741 case 1:
28742 pat = GEN_FCN (icode) (real_target, args[0].op);
28743 break;
28744 case 2:
28745 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
28746 break;
28747 case 3:
28748 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28749 args[2].op);
28750 break;
28751 case 4:
28752 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28753 args[2].op, args[3].op);
28754 break;
28755 default:
28756 gcc_unreachable ();
28757 }
28758
28759 if (! pat)
28760 return 0;
28761
28762 emit_insn (pat);
28763 return target;
28764 }
28765
28766 /* Subroutine of ix86_expand_builtin to take care of special insns
28767 with variable number of operands. */
28768
28769 static rtx
28770 ix86_expand_special_args_builtin (const struct builtin_description *d,
28771 tree exp, rtx target)
28772 {
28773 tree arg;
28774 rtx pat, op;
28775 unsigned int i, nargs, arg_adjust, memory;
28776 struct
28777 {
28778 rtx op;
28779 enum machine_mode mode;
28780 } args[3];
28781 enum insn_code icode = d->icode;
28782 bool last_arg_constant = false;
28783 const struct insn_data_d *insn_p = &insn_data[icode];
28784 enum machine_mode tmode = insn_p->operand[0].mode;
28785 enum { load, store } klass;
28786
28787 switch ((enum ix86_builtin_func_type) d->flag)
28788 {
28789 case VOID_FTYPE_VOID:
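      /* The vzeroupper expander takes a constant marker operand; passing
	 vzeroupper_intrinsic is understood to flag this as an explicit use
	 of the intrinsic rather than a compiler-inserted vzeroupper.  */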
28790 if (icode == CODE_FOR_avx_vzeroupper)
28791 target = GEN_INT (vzeroupper_intrinsic);
28792 emit_insn (GEN_FCN (icode) (target));
28793 return 0;
28794 case VOID_FTYPE_UINT64:
28795 case VOID_FTYPE_UNSIGNED:
28796 nargs = 0;
28797 klass = store;
28798 memory = 0;
28799 break;
28800 case UINT64_FTYPE_VOID:
28801 case UNSIGNED_FTYPE_VOID:
28802 nargs = 0;
28803 klass = load;
28804 memory = 0;
28805 break;
28806 case UINT64_FTYPE_PUNSIGNED:
28807 case V2DI_FTYPE_PV2DI:
28808 case V4DI_FTYPE_PV4DI:
28809 case V32QI_FTYPE_PCCHAR:
28810 case V16QI_FTYPE_PCCHAR:
28811 case V8SF_FTYPE_PCV4SF:
28812 case V8SF_FTYPE_PCFLOAT:
28813 case V4SF_FTYPE_PCFLOAT:
28814 case V4DF_FTYPE_PCV2DF:
28815 case V4DF_FTYPE_PCDOUBLE:
28816 case V2DF_FTYPE_PCDOUBLE:
28817 case VOID_FTYPE_PVOID:
28818 nargs = 1;
28819 klass = load;
28820 memory = 0;
28821 break;
28822 case VOID_FTYPE_PV2SF_V4SF:
28823 case VOID_FTYPE_PV4DI_V4DI:
28824 case VOID_FTYPE_PV2DI_V2DI:
28825 case VOID_FTYPE_PCHAR_V32QI:
28826 case VOID_FTYPE_PCHAR_V16QI:
28827 case VOID_FTYPE_PFLOAT_V8SF:
28828 case VOID_FTYPE_PFLOAT_V4SF:
28829 case VOID_FTYPE_PDOUBLE_V4DF:
28830 case VOID_FTYPE_PDOUBLE_V2DF:
28831 case VOID_FTYPE_PLONGLONG_LONGLONG:
28832 case VOID_FTYPE_PULONGLONG_ULONGLONG:
28833 case VOID_FTYPE_PINT_INT:
28834 nargs = 1;
28835 klass = store;
28836 /* Reserve memory operand for target. */
28837 memory = ARRAY_SIZE (args);
28838 break;
28839 case V4SF_FTYPE_V4SF_PCV2SF:
28840 case V2DF_FTYPE_V2DF_PCDOUBLE:
28841 nargs = 2;
28842 klass = load;
28843 memory = 1;
28844 break;
28845 case V8SF_FTYPE_PCV8SF_V8SI:
28846 case V4DF_FTYPE_PCV4DF_V4DI:
28847 case V4SF_FTYPE_PCV4SF_V4SI:
28848 case V2DF_FTYPE_PCV2DF_V2DI:
28849 case V8SI_FTYPE_PCV8SI_V8SI:
28850 case V4DI_FTYPE_PCV4DI_V4DI:
28851 case V4SI_FTYPE_PCV4SI_V4SI:
28852 case V2DI_FTYPE_PCV2DI_V2DI:
28853 nargs = 2;
28854 klass = load;
28855 memory = 0;
28856 break;
28857 case VOID_FTYPE_PV8SF_V8SI_V8SF:
28858 case VOID_FTYPE_PV4DF_V4DI_V4DF:
28859 case VOID_FTYPE_PV4SF_V4SI_V4SF:
28860 case VOID_FTYPE_PV2DF_V2DI_V2DF:
28861 case VOID_FTYPE_PV8SI_V8SI_V8SI:
28862 case VOID_FTYPE_PV4DI_V4DI_V4DI:
28863 case VOID_FTYPE_PV4SI_V4SI_V4SI:
28864 case VOID_FTYPE_PV2DI_V2DI_V2DI:
28865 nargs = 2;
28866 klass = store;
28867 /* Reserve memory operand for target. */
28868 memory = ARRAY_SIZE (args);
28869 break;
28870 case VOID_FTYPE_UINT_UINT_UINT:
28871 case VOID_FTYPE_UINT64_UINT_UINT:
28872 case UCHAR_FTYPE_UINT_UINT_UINT:
28873 case UCHAR_FTYPE_UINT64_UINT_UINT:
28874 nargs = 3;
28875 klass = load;
28876 memory = ARRAY_SIZE (args);
28877 last_arg_constant = true;
28878 break;
28879 default:
28880 gcc_unreachable ();
28881 }
28882
28883 gcc_assert (nargs <= ARRAY_SIZE (args));
28884
28885 if (klass == store)
28886 {
28887 arg = CALL_EXPR_ARG (exp, 0);
28888 op = expand_normal (arg);
28889 gcc_assert (target == 0);
28890 if (memory)
28891 {
28892 if (GET_MODE (op) != Pmode)
28893 op = convert_to_mode (Pmode, op, 1);
28894 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
28895 }
28896 else
28897 target = force_reg (tmode, op);
28898 arg_adjust = 1;
28899 }
28900 else
28901 {
28902 arg_adjust = 0;
28903 if (optimize
28904 || target == 0
28905 || GET_MODE (target) != tmode
28906 || !insn_p->operand[0].predicate (target, tmode))
28907 target = gen_reg_rtx (tmode);
28908 }
28909
28910 for (i = 0; i < nargs; i++)
28911 {
28912 enum machine_mode mode = insn_p->operand[i + 1].mode;
28913 bool match;
28914
28915 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
28916 op = expand_normal (arg);
28917 match = insn_p->operand[i + 1].predicate (op, mode);
28918
28919 if (last_arg_constant && (i + 1) == nargs)
28920 {
28921 if (!match)
28922 {
28923 if (icode == CODE_FOR_lwp_lwpvalsi3
28924 || icode == CODE_FOR_lwp_lwpinssi3
28925 || icode == CODE_FOR_lwp_lwpvaldi3
28926 || icode == CODE_FOR_lwp_lwpinsdi3)
28927 error ("the last argument must be a 32-bit immediate");
28928 else
28929 error ("the last argument must be an 8-bit immediate");
28930 return const0_rtx;
28931 }
28932 }
28933 else
28934 {
28935 if (i == memory)
28936 {
28937 /* This must be the memory operand. */
28938 if (GET_MODE (op) != Pmode)
28939 op = convert_to_mode (Pmode, op, 1);
28940 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
28941 gcc_assert (GET_MODE (op) == mode
28942 || GET_MODE (op) == VOIDmode);
28943 }
28944 else
28945 {
28946 /* This must be a register.  */
28947 if (VECTOR_MODE_P (mode))
28948 op = safe_vector_operand (op, mode);
28949
28950 gcc_assert (GET_MODE (op) == mode
28951 || GET_MODE (op) == VOIDmode);
28952 op = copy_to_mode_reg (mode, op);
28953 }
28954 }
28955
28956 args[i].op = op;
28957 args[i].mode = mode;
28958 }
28959
28960 switch (nargs)
28961 {
28962 case 0:
28963 pat = GEN_FCN (icode) (target);
28964 break;
28965 case 1:
28966 pat = GEN_FCN (icode) (target, args[0].op);
28967 break;
28968 case 2:
28969 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28970 break;
28971 case 3:
28972 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28973 break;
28974 default:
28975 gcc_unreachable ();
28976 }
28977
28978 if (! pat)
28979 return 0;
28980 emit_insn (pat);
28981 return klass == store ? 0 : target;
28982 }
28983
28984 /* Return the integer constant in ARG. Constrain it to be in the range
28985 of the subparts of VEC_TYPE; issue an error if not. */
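   For example, with a V4SF argument TYPE_VECTOR_SUBPARTS is 4, so a call
   such as __builtin_ia32_vec_ext_v4sf (x, 4) is rejected with the error
   below, while selectors 0..3 are accepted.  */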
28986
28987 static int
28988 get_element_number (tree vec_type, tree arg)
28989 {
28990 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
28991
28992 if (!host_integerp (arg, 1)
28993 || (elt = tree_low_cst (arg, 1), elt > max))
28994 {
28995 error ("selector must be an integer constant in the range 0..%wi", max);
28996 return 0;
28997 }
28998
28999 return elt;
29000 }
29001
29002 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29003 ix86_expand_vector_init. We DO have language-level syntax for this, in
29004 the form of (type){ init-list }. Except that since we can't place emms
29005 instructions from inside the compiler, we can't allow the use of MMX
29006 registers unless the user explicitly asks for it. So we do *not* define
29007 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
29008 we have builtins invoked by mmintrin.h that give us license to emit
29009 these sorts of instructions. */
29010
29011 static rtx
29012 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29013 {
29014 enum machine_mode tmode = TYPE_MODE (type);
29015 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29016 int i, n_elt = GET_MODE_NUNITS (tmode);
29017 rtvec v = rtvec_alloc (n_elt);
29018
29019 gcc_assert (VECTOR_MODE_P (tmode));
29020 gcc_assert (call_expr_nargs (exp) == n_elt);
29021
29022 for (i = 0; i < n_elt; ++i)
29023 {
29024 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29025 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29026 }
29027
29028 if (!target || !register_operand (target, tmode))
29029 target = gen_reg_rtx (tmode);
29030
29031 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29032 return target;
29033 }
29034
29035 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29036 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
29037 had a language-level syntax for referencing vector elements. */
29038
29039 static rtx
29040 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29041 {
29042 enum machine_mode tmode, mode0;
29043 tree arg0, arg1;
29044 int elt;
29045 rtx op0;
29046
29047 arg0 = CALL_EXPR_ARG (exp, 0);
29048 arg1 = CALL_EXPR_ARG (exp, 1);
29049
29050 op0 = expand_normal (arg0);
29051 elt = get_element_number (TREE_TYPE (arg0), arg1);
29052
29053 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29054 mode0 = TYPE_MODE (TREE_TYPE (arg0));
29055 gcc_assert (VECTOR_MODE_P (mode0));
29056
29057 op0 = force_reg (mode0, op0);
29058
29059 if (optimize || !target || !register_operand (target, tmode))
29060 target = gen_reg_rtx (tmode);
29061
29062 ix86_expand_vector_extract (true, target, op0, elt);
29063
29064 return target;
29065 }
29066
29067 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29068 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
29069 a language-level syntax for referencing vector elements. */
29070
29071 static rtx
29072 ix86_expand_vec_set_builtin (tree exp)
29073 {
29074 enum machine_mode tmode, mode1;
29075 tree arg0, arg1, arg2;
29076 int elt;
29077 rtx op0, op1, target;
29078
29079 arg0 = CALL_EXPR_ARG (exp, 0);
29080 arg1 = CALL_EXPR_ARG (exp, 1);
29081 arg2 = CALL_EXPR_ARG (exp, 2);
29082
29083 tmode = TYPE_MODE (TREE_TYPE (arg0));
29084 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29085 gcc_assert (VECTOR_MODE_P (tmode));
29086
29087 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29088 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29089 elt = get_element_number (TREE_TYPE (arg0), arg2);
29090
29091 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29092 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29093
29094 op0 = force_reg (tmode, op0);
29095 op1 = force_reg (mode1, op1);
29096
29097 /* OP0 is the source of these builtin functions and shouldn't be
29098 modified. Create a copy, use it and return it as target. */
29099 target = gen_reg_rtx (tmode);
29100 emit_move_insn (target, op0);
29101 ix86_expand_vector_set (true, target, op1, elt);
29102
29103 return target;
29104 }
29105
29106 /* Expand an expression EXP that calls a built-in function,
29107 with result going to TARGET if that's convenient
29108 (and in mode MODE if that's convenient).
29109 SUBTARGET may be used as the target for computing one of EXP's operands.
29110 IGNORE is nonzero if the value is to be ignored. */
29111
29112 static rtx
29113 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29114 enum machine_mode mode ATTRIBUTE_UNUSED,
29115 int ignore ATTRIBUTE_UNUSED)
29116 {
29117 const struct builtin_description *d;
29118 size_t i;
29119 enum insn_code icode;
29120 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29121 tree arg0, arg1, arg2, arg3, arg4;
29122 rtx op0, op1, op2, op3, op4, pat;
29123 enum machine_mode mode0, mode1, mode2, mode3, mode4;
29124 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29125
29126 /* Determine whether the builtin function is available under the current ISA.
29127 Originally the builtin was not created if it wasn't applicable to the
29128 current ISA based on the command line switches. With function specific
29129 options, we need to check in the context of the function making the call
29130 whether it is supported. */
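  /* For example, calling __builtin_ia32_pabsb256 from a function compiled
     without AVX2 enabled (no -mavx2 and no matching target attribute) is
     diagnosed below as needing the corresponding isa option.  */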
29131 if (ix86_builtins_isa[fcode].isa
29132 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
29133 {
29134 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
29135 NULL, (enum fpmath_unit) 0, false);
29136
29137 if (!opts)
29138 error ("%qE needs unknown isa option", fndecl);
29139 else
29140 {
29141 gcc_assert (opts != NULL);
29142 error ("%qE needs isa option %s", fndecl, opts);
29143 free (opts);
29144 }
29145 return const0_rtx;
29146 }
29147
29148 switch (fcode)
29149 {
29150 case IX86_BUILTIN_MASKMOVQ:
29151 case IX86_BUILTIN_MASKMOVDQU:
29152 icode = (fcode == IX86_BUILTIN_MASKMOVQ
29153 ? CODE_FOR_mmx_maskmovq
29154 : CODE_FOR_sse2_maskmovdqu);
29155 /* Note the arg order is different from the operand order. */
29156 arg1 = CALL_EXPR_ARG (exp, 0);
29157 arg2 = CALL_EXPR_ARG (exp, 1);
29158 arg0 = CALL_EXPR_ARG (exp, 2);
29159 op0 = expand_normal (arg0);
29160 op1 = expand_normal (arg1);
29161 op2 = expand_normal (arg2);
29162 mode0 = insn_data[icode].operand[0].mode;
29163 mode1 = insn_data[icode].operand[1].mode;
29164 mode2 = insn_data[icode].operand[2].mode;
29165
29166 if (GET_MODE (op0) != Pmode)
29167 op0 = convert_to_mode (Pmode, op0, 1);
29168 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
29169
29170 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29171 op0 = copy_to_mode_reg (mode0, op0);
29172 if (!insn_data[icode].operand[1].predicate (op1, mode1))
29173 op1 = copy_to_mode_reg (mode1, op1);
29174 if (!insn_data[icode].operand[2].predicate (op2, mode2))
29175 op2 = copy_to_mode_reg (mode2, op2);
29176 pat = GEN_FCN (icode) (op0, op1, op2);
29177 if (! pat)
29178 return 0;
29179 emit_insn (pat);
29180 return 0;
29181
29182 case IX86_BUILTIN_LDMXCSR:
29183 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29184 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29185 emit_move_insn (target, op0);
29186 emit_insn (gen_sse_ldmxcsr (target));
29187 return 0;
29188
29189 case IX86_BUILTIN_STMXCSR:
29190 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29191 emit_insn (gen_sse_stmxcsr (target));
29192 return copy_to_mode_reg (SImode, target);
29193
29194 case IX86_BUILTIN_CLFLUSH:
29195 arg0 = CALL_EXPR_ARG (exp, 0);
29196 op0 = expand_normal (arg0);
29197 icode = CODE_FOR_sse2_clflush;
29198 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29199 {
29200 if (GET_MODE (op0) != Pmode)
29201 op0 = convert_to_mode (Pmode, op0, 1);
29202 op0 = force_reg (Pmode, op0);
29203 }
29204
29205 emit_insn (gen_sse2_clflush (op0));
29206 return 0;
29207
29208 case IX86_BUILTIN_MONITOR:
29209 arg0 = CALL_EXPR_ARG (exp, 0);
29210 arg1 = CALL_EXPR_ARG (exp, 1);
29211 arg2 = CALL_EXPR_ARG (exp, 2);
29212 op0 = expand_normal (arg0);
29213 op1 = expand_normal (arg1);
29214 op2 = expand_normal (arg2);
29215 if (!REG_P (op0))
29216 {
29217 if (GET_MODE (op0) != Pmode)
29218 op0 = convert_to_mode (Pmode, op0, 1);
29219 op0 = force_reg (Pmode, op0);
29220 }
29221 if (!REG_P (op1))
29222 op1 = copy_to_mode_reg (SImode, op1);
29223 if (!REG_P (op2))
29224 op2 = copy_to_mode_reg (SImode, op2);
29225 emit_insn (ix86_gen_monitor (op0, op1, op2));
29226 return 0;
29227
29228 case IX86_BUILTIN_MWAIT:
29229 arg0 = CALL_EXPR_ARG (exp, 0);
29230 arg1 = CALL_EXPR_ARG (exp, 1);
29231 op0 = expand_normal (arg0);
29232 op1 = expand_normal (arg1);
29233 if (!REG_P (op0))
29234 op0 = copy_to_mode_reg (SImode, op0);
29235 if (!REG_P (op1))
29236 op1 = copy_to_mode_reg (SImode, op1);
29237 emit_insn (gen_sse3_mwait (op0, op1));
29238 return 0;
29239
29240 case IX86_BUILTIN_VEC_INIT_V2SI:
29241 case IX86_BUILTIN_VEC_INIT_V4HI:
29242 case IX86_BUILTIN_VEC_INIT_V8QI:
29243 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29244
29245 case IX86_BUILTIN_VEC_EXT_V2DF:
29246 case IX86_BUILTIN_VEC_EXT_V2DI:
29247 case IX86_BUILTIN_VEC_EXT_V4SF:
29248 case IX86_BUILTIN_VEC_EXT_V4SI:
29249 case IX86_BUILTIN_VEC_EXT_V8HI:
29250 case IX86_BUILTIN_VEC_EXT_V2SI:
29251 case IX86_BUILTIN_VEC_EXT_V4HI:
29252 case IX86_BUILTIN_VEC_EXT_V16QI:
29253 return ix86_expand_vec_ext_builtin (exp, target);
29254
29255 case IX86_BUILTIN_VEC_SET_V2DI:
29256 case IX86_BUILTIN_VEC_SET_V4SF:
29257 case IX86_BUILTIN_VEC_SET_V4SI:
29258 case IX86_BUILTIN_VEC_SET_V8HI:
29259 case IX86_BUILTIN_VEC_SET_V4HI:
29260 case IX86_BUILTIN_VEC_SET_V16QI:
29261 return ix86_expand_vec_set_builtin (exp);
29262
29263 case IX86_BUILTIN_INFQ:
29264 case IX86_BUILTIN_HUGE_VALQ:
29265 {
29266 REAL_VALUE_TYPE inf;
29267 rtx tmp;
29268
29269 real_inf (&inf);
29270 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29271
29272 tmp = validize_mem (force_const_mem (mode, tmp));
29273
29274 if (target == 0)
29275 target = gen_reg_rtx (mode);
29276
29277 emit_move_insn (target, tmp);
29278 return target;
29279 }
29280
29281 case IX86_BUILTIN_LLWPCB:
29282 arg0 = CALL_EXPR_ARG (exp, 0);
29283 op0 = expand_normal (arg0);
29284 icode = CODE_FOR_lwp_llwpcb;
29285 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29286 {
29287 if (GET_MODE (op0) != Pmode)
29288 op0 = convert_to_mode (Pmode, op0, 1);
29289 op0 = force_reg (Pmode, op0);
29290 }
29291 emit_insn (gen_lwp_llwpcb (op0));
29292 return 0;
29293
29294 case IX86_BUILTIN_SLWPCB:
29295 icode = CODE_FOR_lwp_slwpcb;
29296 if (!target
29297 || !insn_data[icode].operand[0].predicate (target, Pmode))
29298 target = gen_reg_rtx (Pmode);
29299 emit_insn (gen_lwp_slwpcb (target));
29300 return target;
29301
29302 case IX86_BUILTIN_BEXTRI32:
29303 case IX86_BUILTIN_BEXTRI64:
29304 arg0 = CALL_EXPR_ARG (exp, 0);
29305 arg1 = CALL_EXPR_ARG (exp, 1);
29306 op0 = expand_normal (arg0);
29307 op1 = expand_normal (arg1);
29308 icode = (fcode == IX86_BUILTIN_BEXTRI32
29309 ? CODE_FOR_tbm_bextri_si
29310 : CODE_FOR_tbm_bextri_di);
29311 if (!CONST_INT_P (op1))
29312 {
29313 error ("last argument must be an immediate");
29314 return const0_rtx;
29315 }
29316 else
29317 {
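	  /* The controlling immediate packs the starting bit position in
	     bits [7:0] and the field length in bits [15:8]; e.g. an
	     immediate of 0x0504 would extract a 5-bit field starting at
	     bit 4 (illustrative values, derived from the decoding below). */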
29318 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29319 unsigned char lsb_index = INTVAL (op1) & 0xFF;
29320 op1 = GEN_INT (length);
29321 op2 = GEN_INT (lsb_index);
29322 pat = GEN_FCN (icode) (target, op0, op1, op2);
29323 if (pat)
29324 emit_insn (pat);
29325 return target;
29326 }
29327
29328 case IX86_BUILTIN_RDRAND16_STEP:
29329 icode = CODE_FOR_rdrandhi_1;
29330 mode0 = HImode;
29331 goto rdrand_step;
29332
29333 case IX86_BUILTIN_RDRAND32_STEP:
29334 icode = CODE_FOR_rdrandsi_1;
29335 mode0 = SImode;
29336 goto rdrand_step;
29337
29338 case IX86_BUILTIN_RDRAND64_STEP:
29339 icode = CODE_FOR_rdranddi_1;
29340 mode0 = DImode;
29341
29342 rdrand_step:
29343 op0 = gen_reg_rtx (mode0);
29344 emit_insn (GEN_FCN (icode) (op0));
29345
29346 arg0 = CALL_EXPR_ARG (exp, 0);
29347 op1 = expand_normal (arg0);
29348 if (!address_operand (op1, VOIDmode))
29349 {
29350 op1 = convert_memory_address (Pmode, op1);
29351 op1 = copy_addr_to_reg (op1);
29352 }
29353 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
29354
29355 op1 = gen_reg_rtx (SImode);
29356 emit_move_insn (op1, CONST1_RTX (SImode));
29357
29358 /* Emit SImode conditional move. */
29359 if (mode0 == HImode)
29360 {
29361 op2 = gen_reg_rtx (SImode);
29362 emit_insn (gen_zero_extendhisi2 (op2, op0));
29363 }
29364 else if (mode0 == SImode)
29365 op2 = op0;
29366 else
29367 op2 = gen_rtx_SUBREG (SImode, op0, 0);
29368
29369 if (target == 0)
29370 target = gen_reg_rtx (SImode);
29371
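      /* rdrand signals success by setting the carry flag and, per its
	 documented behaviour, leaves zero in the destination register on
	 failure.  Selecting the zero-extended result when the carry is
	 clear and the constant 1 otherwise therefore yields the 0/1
	 status without a branch (the GEU test on the CCCmode flags
	 register is taken here to mean "carry clear").  */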
29372 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
29373 const0_rtx);
29374 emit_insn (gen_rtx_SET (VOIDmode, target,
29375 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
29376 return target;
29377
29378 case IX86_BUILTIN_GATHERSIV2DF:
29379 icode = CODE_FOR_avx2_gathersiv2df;
29380 goto gather_gen;
29381 case IX86_BUILTIN_GATHERSIV4DF:
29382 icode = CODE_FOR_avx2_gathersiv4df;
29383 goto gather_gen;
29384 case IX86_BUILTIN_GATHERDIV2DF:
29385 icode = CODE_FOR_avx2_gatherdiv2df;
29386 goto gather_gen;
29387 case IX86_BUILTIN_GATHERDIV4DF:
29388 icode = CODE_FOR_avx2_gatherdiv4df;
29389 goto gather_gen;
29390 case IX86_BUILTIN_GATHERSIV4SF:
29391 icode = CODE_FOR_avx2_gathersiv4sf;
29392 goto gather_gen;
29393 case IX86_BUILTIN_GATHERSIV8SF:
29394 icode = CODE_FOR_avx2_gathersiv8sf;
29395 goto gather_gen;
29396 case IX86_BUILTIN_GATHERDIV4SF:
29397 icode = CODE_FOR_avx2_gatherdiv4sf;
29398 goto gather_gen;
29399 case IX86_BUILTIN_GATHERDIV8SF:
29400 icode = CODE_FOR_avx2_gatherdiv8sf;
29401 goto gather_gen;
29402 case IX86_BUILTIN_GATHERSIV2DI:
29403 icode = CODE_FOR_avx2_gathersiv2di;
29404 goto gather_gen;
29405 case IX86_BUILTIN_GATHERSIV4DI:
29406 icode = CODE_FOR_avx2_gathersiv4di;
29407 goto gather_gen;
29408 case IX86_BUILTIN_GATHERDIV2DI:
29409 icode = CODE_FOR_avx2_gatherdiv2di;
29410 goto gather_gen;
29411 case IX86_BUILTIN_GATHERDIV4DI:
29412 icode = CODE_FOR_avx2_gatherdiv4di;
29413 goto gather_gen;
29414 case IX86_BUILTIN_GATHERSIV4SI:
29415 icode = CODE_FOR_avx2_gathersiv4si;
29416 goto gather_gen;
29417 case IX86_BUILTIN_GATHERSIV8SI:
29418 icode = CODE_FOR_avx2_gathersiv8si;
29419 goto gather_gen;
29420 case IX86_BUILTIN_GATHERDIV4SI:
29421 icode = CODE_FOR_avx2_gatherdiv4si;
29422 goto gather_gen;
29423 case IX86_BUILTIN_GATHERDIV8SI:
29424 icode = CODE_FOR_avx2_gatherdiv8si;
29425 goto gather_gen;
29426 case IX86_BUILTIN_GATHERALTSIV4DF:
29427 icode = CODE_FOR_avx2_gathersiv4df;
29428 goto gather_gen;
29429 case IX86_BUILTIN_GATHERALTDIV8SF:
29430 icode = CODE_FOR_avx2_gatherdiv8sf;
29431 goto gather_gen;
29432 case IX86_BUILTIN_GATHERALTSIV4DI:
29433 icode = CODE_FOR_avx2_gathersiv4di;
29434 goto gather_gen;
29435 case IX86_BUILTIN_GATHERALTDIV8SI:
29436 icode = CODE_FOR_avx2_gatherdiv8si;
29437 goto gather_gen;
29438
29439 gather_gen:
29440 arg0 = CALL_EXPR_ARG (exp, 0);
29441 arg1 = CALL_EXPR_ARG (exp, 1);
29442 arg2 = CALL_EXPR_ARG (exp, 2);
29443 arg3 = CALL_EXPR_ARG (exp, 3);
29444 arg4 = CALL_EXPR_ARG (exp, 4);
29445 op0 = expand_normal (arg0);
29446 op1 = expand_normal (arg1);
29447 op2 = expand_normal (arg2);
29448 op3 = expand_normal (arg3);
29449 op4 = expand_normal (arg4);
29450 /* Note the arg order is different from the operand order. */
29451 mode0 = insn_data[icode].operand[1].mode;
29452 mode2 = insn_data[icode].operand[3].mode;
29453 mode3 = insn_data[icode].operand[4].mode;
29454 mode4 = insn_data[icode].operand[5].mode;
29455
29456 if (target == NULL_RTX
29457 || GET_MODE (target) != insn_data[icode].operand[0].mode)
29458 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
29459 else
29460 subtarget = target;
29461
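      /* The GATHERALT* variants exist for the vectorizer and mix 128-bit
	 and 256-bit vector shapes: as the code below is written, the
	 SIV4DF/SIV4DI forms take a V8SI index of which only the low four
	 elements are used, while the DIV8SF/DIV8SI forms take 256-bit
	 source and mask operands of which only the low halves are used,
	 so those low halves are extracted into fresh registers first.  */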
29462 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
29463 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
29464 {
29465 rtx half = gen_reg_rtx (V4SImode);
29466 if (!nonimmediate_operand (op2, V8SImode))
29467 op2 = copy_to_mode_reg (V8SImode, op2);
29468 emit_insn (gen_vec_extract_lo_v8si (half, op2));
29469 op2 = half;
29470 }
29471 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
29472 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
29473 {
29474 rtx (*gen) (rtx, rtx);
29475 rtx half = gen_reg_rtx (mode0);
29476 if (mode0 == V4SFmode)
29477 gen = gen_vec_extract_lo_v8sf;
29478 else
29479 gen = gen_vec_extract_lo_v8si;
29480 if (!nonimmediate_operand (op0, GET_MODE (op0)))
29481 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
29482 emit_insn (gen (half, op0));
29483 op0 = half;
29484 if (!nonimmediate_operand (op3, GET_MODE (op3)))
29485 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
29486 emit_insn (gen (half, op3));
29487 op3 = half;
29488 }
29489
29490 	  /* Force the memory operand to use only a base register here; we
29491 	     don't want to do this for the memory operands of other builtin
29492 	     functions. */
29493 if (GET_MODE (op1) != Pmode)
29494 op1 = convert_to_mode (Pmode, op1, 1);
29495 op1 = force_reg (Pmode, op1);
29496
29497 if (!insn_data[icode].operand[1].predicate (op0, mode0))
29498 op0 = copy_to_mode_reg (mode0, op0);
29499 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
29500 op1 = copy_to_mode_reg (Pmode, op1);
29501 if (!insn_data[icode].operand[3].predicate (op2, mode2))
29502 op2 = copy_to_mode_reg (mode2, op2);
29503 if (!insn_data[icode].operand[4].predicate (op3, mode3))
29504 op3 = copy_to_mode_reg (mode3, op3);
29505 if (!insn_data[icode].operand[5].predicate (op4, mode4))
29506 {
29507 error ("last argument must be scale 1, 2, 4, 8");
29508 return const0_rtx;
29509 }
29510
29511 /* Optimize. If mask is known to have all high bits set,
29512 replace op0 with pc_rtx to signal that the instruction
29513 overwrites the whole destination and doesn't use its
29514 previous contents. */
29515 if (optimize)
29516 {
29517 if (TREE_CODE (arg3) == VECTOR_CST)
29518 {
29519 tree elt;
29520 unsigned int negative = 0;
29521 for (elt = TREE_VECTOR_CST_ELTS (arg3);
29522 elt; elt = TREE_CHAIN (elt))
29523 {
29524 tree cst = TREE_VALUE (elt);
29525 if (TREE_CODE (cst) == INTEGER_CST
29526 && tree_int_cst_sign_bit (cst))
29527 negative++;
29528 else if (TREE_CODE (cst) == REAL_CST
29529 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
29530 negative++;
29531 }
29532 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
29533 op0 = pc_rtx;
29534 }
29535 else if (TREE_CODE (arg3) == SSA_NAME)
29536 {
29537 	      /* Also recognize when the mask is like:
29538 __v2df src = _mm_setzero_pd ();
29539 __v2df mask = _mm_cmpeq_pd (src, src);
29540 or
29541 __v8sf src = _mm256_setzero_ps ();
29542 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
29543 as that is a cheaper way to load all ones into
29544 a register than having to load a constant from
29545 memory. */
29546 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
29547 if (is_gimple_call (def_stmt))
29548 {
29549 tree fndecl = gimple_call_fndecl (def_stmt);
29550 if (fndecl
29551 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29552 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
29553 {
29554 case IX86_BUILTIN_CMPPD:
29555 case IX86_BUILTIN_CMPPS:
29556 case IX86_BUILTIN_CMPPD256:
29557 case IX86_BUILTIN_CMPPS256:
29558 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
29559 break;
29560 /* FALLTHRU */
29561 case IX86_BUILTIN_CMPEQPD:
29562 case IX86_BUILTIN_CMPEQPS:
29563 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
29564 && initializer_zerop (gimple_call_arg (def_stmt,
29565 1)))
29566 op0 = pc_rtx;
29567 break;
29568 default:
29569 break;
29570 }
29571 }
29572 }
29573 }
29574
29575 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
29576 if (! pat)
29577 return const0_rtx;
29578 emit_insn (pat);
29579
29580 if (fcode == IX86_BUILTIN_GATHERDIV8SF
29581 || fcode == IX86_BUILTIN_GATHERDIV8SI)
29582 {
29583 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
29584 ? V4SFmode : V4SImode;
29585 if (target == NULL_RTX)
29586 target = gen_reg_rtx (tmode);
29587 if (tmode == V4SFmode)
29588 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
29589 else
29590 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
29591 }
29592 else
29593 target = subtarget;
29594
29595 return target;
29596
29597 default:
29598 break;
29599 }
29600
29601 for (i = 0, d = bdesc_special_args;
29602 i < ARRAY_SIZE (bdesc_special_args);
29603 i++, d++)
29604 if (d->code == fcode)
29605 return ix86_expand_special_args_builtin (d, exp, target);
29606
29607 for (i = 0, d = bdesc_args;
29608 i < ARRAY_SIZE (bdesc_args);
29609 i++, d++)
29610 if (d->code == fcode)
29611 switch (fcode)
29612 {
29613 case IX86_BUILTIN_FABSQ:
29614 case IX86_BUILTIN_COPYSIGNQ:
29615 if (!TARGET_SSE2)
29616 /* Emit a normal call if SSE2 isn't available. */
29617 return expand_call (exp, target, ignore);
29618 default:
29619 return ix86_expand_args_builtin (d, exp, target);
29620 }
29621
29622 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29623 if (d->code == fcode)
29624 return ix86_expand_sse_comi (d, exp, target);
29625
29626 for (i = 0, d = bdesc_pcmpestr;
29627 i < ARRAY_SIZE (bdesc_pcmpestr);
29628 i++, d++)
29629 if (d->code == fcode)
29630 return ix86_expand_sse_pcmpestr (d, exp, target);
29631
29632 for (i = 0, d = bdesc_pcmpistr;
29633 i < ARRAY_SIZE (bdesc_pcmpistr);
29634 i++, d++)
29635 if (d->code == fcode)
29636 return ix86_expand_sse_pcmpistr (d, exp, target);
29637
29638 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29639 if (d->code == fcode)
29640 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
29641 (enum ix86_builtin_func_type)
29642 d->flag, d->comparison);
29643
29644 gcc_unreachable ();
29645 }
29646
29647 /* Returns a function decl for a vectorized version of the builtin function
29648    FNDECL with output vector type TYPE_OUT and input vector type TYPE_IN,
29649    or NULL_TREE if it is not available. */
29650
29651 static tree
29652 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
29653 tree type_in)
29654 {
29655 enum machine_mode in_mode, out_mode;
29656 int in_n, out_n;
29657 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
29658
29659 if (TREE_CODE (type_out) != VECTOR_TYPE
29660 || TREE_CODE (type_in) != VECTOR_TYPE
29661 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
29662 return NULL_TREE;
29663
29664 out_mode = TYPE_MODE (TREE_TYPE (type_out));
29665 out_n = TYPE_VECTOR_SUBPARTS (type_out);
29666 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29667 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29668
29669 switch (fn)
29670 {
29671 case BUILT_IN_SQRT:
29672 if (out_mode == DFmode && in_mode == DFmode)
29673 {
29674 if (out_n == 2 && in_n == 2)
29675 return ix86_builtins[IX86_BUILTIN_SQRTPD];
29676 else if (out_n == 4 && in_n == 4)
29677 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
29678 }
29679 break;
29680
29681 case BUILT_IN_SQRTF:
29682 if (out_mode == SFmode && in_mode == SFmode)
29683 {
29684 if (out_n == 4 && in_n == 4)
29685 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
29686 else if (out_n == 8 && in_n == 8)
29687 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
29688 }
29689 break;
29690
29691 case BUILT_IN_IFLOOR:
29692 case BUILT_IN_LFLOOR:
29693 case BUILT_IN_LLFLOOR:
29694 /* The round insn does not trap on denormals. */
29695 if (flag_trapping_math || !TARGET_ROUND)
29696 break;
29697
29698 if (out_mode == SImode && in_mode == DFmode)
29699 {
29700 if (out_n == 4 && in_n == 2)
29701 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
29702 else if (out_n == 8 && in_n == 4)
29703 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
29704 }
29705 break;
29706
29707 case BUILT_IN_IFLOORF:
29708 case BUILT_IN_LFLOORF:
29709 case BUILT_IN_LLFLOORF:
29710 /* The round insn does not trap on denormals. */
29711 if (flag_trapping_math || !TARGET_ROUND)
29712 break;
29713
29714 if (out_mode == SImode && in_mode == SFmode)
29715 {
29716 if (out_n == 4 && in_n == 4)
29717 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
29718 else if (out_n == 8 && in_n == 8)
29719 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
29720 }
29721 break;
29722
29723 case BUILT_IN_ICEIL:
29724 case BUILT_IN_LCEIL:
29725 case BUILT_IN_LLCEIL:
29726 /* The round insn does not trap on denormals. */
29727 if (flag_trapping_math || !TARGET_ROUND)
29728 break;
29729
29730 if (out_mode == SImode && in_mode == DFmode)
29731 {
29732 if (out_n == 4 && in_n == 2)
29733 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
29734 else if (out_n == 8 && in_n == 4)
29735 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
29736 }
29737 break;
29738
29739 case BUILT_IN_ICEILF:
29740 case BUILT_IN_LCEILF:
29741 case BUILT_IN_LLCEILF:
29742 /* The round insn does not trap on denormals. */
29743 if (flag_trapping_math || !TARGET_ROUND)
29744 break;
29745
29746 if (out_mode == SImode && in_mode == SFmode)
29747 {
29748 if (out_n == 4 && in_n == 4)
29749 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
29750 else if (out_n == 8 && in_n == 8)
29751 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
29752 }
29753 break;
29754
29755 case BUILT_IN_IRINT:
29756 case BUILT_IN_LRINT:
29757 case BUILT_IN_LLRINT:
29758 if (out_mode == SImode && in_mode == DFmode)
29759 {
29760 if (out_n == 4 && in_n == 2)
29761 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
29762 else if (out_n == 8 && in_n == 4)
29763 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
29764 }
29765 break;
29766
29767 case BUILT_IN_IRINTF:
29768 case BUILT_IN_LRINTF:
29769 case BUILT_IN_LLRINTF:
29770 if (out_mode == SImode && in_mode == SFmode)
29771 {
29772 if (out_n == 4 && in_n == 4)
29773 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
29774 else if (out_n == 8 && in_n == 8)
29775 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
29776 }
29777 break;
29778
29779 case BUILT_IN_IROUND:
29780 case BUILT_IN_LROUND:
29781 case BUILT_IN_LLROUND:
29782 /* The round insn does not trap on denormals. */
29783 if (flag_trapping_math || !TARGET_ROUND)
29784 break;
29785
29786 if (out_mode == SImode && in_mode == DFmode)
29787 {
29788 if (out_n == 4 && in_n == 2)
29789 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
29790 else if (out_n == 8 && in_n == 4)
29791 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
29792 }
29793 break;
29794
29795 case BUILT_IN_IROUNDF:
29796 case BUILT_IN_LROUNDF:
29797 case BUILT_IN_LLROUNDF:
29798 /* The round insn does not trap on denormals. */
29799 if (flag_trapping_math || !TARGET_ROUND)
29800 break;
29801
29802 if (out_mode == SImode && in_mode == SFmode)
29803 {
29804 if (out_n == 4 && in_n == 4)
29805 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
29806 else if (out_n == 8 && in_n == 8)
29807 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
29808 }
29809 break;
29810
29811 case BUILT_IN_COPYSIGN:
29812 if (out_mode == DFmode && in_mode == DFmode)
29813 {
29814 if (out_n == 2 && in_n == 2)
29815 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
29816 else if (out_n == 4 && in_n == 4)
29817 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
29818 }
29819 break;
29820
29821 case BUILT_IN_COPYSIGNF:
29822 if (out_mode == SFmode && in_mode == SFmode)
29823 {
29824 if (out_n == 4 && in_n == 4)
29825 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
29826 else if (out_n == 8 && in_n == 8)
29827 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
29828 }
29829 break;
29830
29831 case BUILT_IN_FLOOR:
29832 /* The round insn does not trap on denormals. */
29833 if (flag_trapping_math || !TARGET_ROUND)
29834 break;
29835
29836 if (out_mode == DFmode && in_mode == DFmode)
29837 {
29838 if (out_n == 2 && in_n == 2)
29839 return ix86_builtins[IX86_BUILTIN_FLOORPD];
29840 else if (out_n == 4 && in_n == 4)
29841 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
29842 }
29843 break;
29844
29845 case BUILT_IN_FLOORF:
29846 /* The round insn does not trap on denormals. */
29847 if (flag_trapping_math || !TARGET_ROUND)
29848 break;
29849
29850 if (out_mode == SFmode && in_mode == SFmode)
29851 {
29852 if (out_n == 4 && in_n == 4)
29853 return ix86_builtins[IX86_BUILTIN_FLOORPS];
29854 else if (out_n == 8 && in_n == 8)
29855 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
29856 }
29857 break;
29858
29859 case BUILT_IN_CEIL:
29860 /* The round insn does not trap on denormals. */
29861 if (flag_trapping_math || !TARGET_ROUND)
29862 break;
29863
29864 if (out_mode == DFmode && in_mode == DFmode)
29865 {
29866 if (out_n == 2 && in_n == 2)
29867 return ix86_builtins[IX86_BUILTIN_CEILPD];
29868 else if (out_n == 4 && in_n == 4)
29869 return ix86_builtins[IX86_BUILTIN_CEILPD256];
29870 }
29871 break;
29872
29873 case BUILT_IN_CEILF:
29874 /* The round insn does not trap on denormals. */
29875 if (flag_trapping_math || !TARGET_ROUND)
29876 break;
29877
29878 if (out_mode == SFmode && in_mode == SFmode)
29879 {
29880 if (out_n == 4 && in_n == 4)
29881 return ix86_builtins[IX86_BUILTIN_CEILPS];
29882 else if (out_n == 8 && in_n == 8)
29883 return ix86_builtins[IX86_BUILTIN_CEILPS256];
29884 }
29885 break;
29886
29887 case BUILT_IN_TRUNC:
29888 /* The round insn does not trap on denormals. */
29889 if (flag_trapping_math || !TARGET_ROUND)
29890 break;
29891
29892 if (out_mode == DFmode && in_mode == DFmode)
29893 {
29894 if (out_n == 2 && in_n == 2)
29895 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
29896 else if (out_n == 4 && in_n == 4)
29897 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
29898 }
29899 break;
29900
29901 case BUILT_IN_TRUNCF:
29902 /* The round insn does not trap on denormals. */
29903 if (flag_trapping_math || !TARGET_ROUND)
29904 break;
29905
29906 if (out_mode == SFmode && in_mode == SFmode)
29907 {
29908 if (out_n == 4 && in_n == 4)
29909 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
29910 else if (out_n == 8 && in_n == 8)
29911 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
29912 }
29913 break;
29914
29915 case BUILT_IN_RINT:
29916 /* The round insn does not trap on denormals. */
29917 if (flag_trapping_math || !TARGET_ROUND)
29918 break;
29919
29920 if (out_mode == DFmode && in_mode == DFmode)
29921 {
29922 if (out_n == 2 && in_n == 2)
29923 return ix86_builtins[IX86_BUILTIN_RINTPD];
29924 else if (out_n == 4 && in_n == 4)
29925 return ix86_builtins[IX86_BUILTIN_RINTPD256];
29926 }
29927 break;
29928
29929 case BUILT_IN_RINTF:
29930 /* The round insn does not trap on denormals. */
29931 if (flag_trapping_math || !TARGET_ROUND)
29932 break;
29933
29934 if (out_mode == SFmode && in_mode == SFmode)
29935 {
29936 if (out_n == 4 && in_n == 4)
29937 return ix86_builtins[IX86_BUILTIN_RINTPS];
29938 else if (out_n == 8 && in_n == 8)
29939 return ix86_builtins[IX86_BUILTIN_RINTPS256];
29940 }
29941 break;
29942
29943 case BUILT_IN_ROUND:
29944 /* The round insn does not trap on denormals. */
29945 if (flag_trapping_math || !TARGET_ROUND)
29946 break;
29947
29948 if (out_mode == DFmode && in_mode == DFmode)
29949 {
29950 if (out_n == 2 && in_n == 2)
29951 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
29952 else if (out_n == 4 && in_n == 4)
29953 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
29954 }
29955 break;
29956
29957 case BUILT_IN_ROUNDF:
29958 /* The round insn does not trap on denormals. */
29959 if (flag_trapping_math || !TARGET_ROUND)
29960 break;
29961
29962 if (out_mode == SFmode && in_mode == SFmode)
29963 {
29964 if (out_n == 4 && in_n == 4)
29965 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
29966 else if (out_n == 8 && in_n == 8)
29967 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
29968 }
29969 break;
29970
29971 case BUILT_IN_FMA:
29972 if (out_mode == DFmode && in_mode == DFmode)
29973 {
29974 if (out_n == 2 && in_n == 2)
29975 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
29976 if (out_n == 4 && in_n == 4)
29977 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
29978 }
29979 break;
29980
29981 case BUILT_IN_FMAF:
29982 if (out_mode == SFmode && in_mode == SFmode)
29983 {
29984 if (out_n == 4 && in_n == 4)
29985 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
29986 if (out_n == 8 && in_n == 8)
29987 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
29988 }
29989 break;
29990
29991 default:
29992 break;
29993 }
29994
29995 /* Dispatch to a handler for a vectorization library. */
29996 if (ix86_veclib_handler)
29997 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
29998 type_in);
29999
30000 return NULL_TREE;
30001 }
30002
30003 /* Handler for an SVML-style interface to
30004 a library with vectorized intrinsics. */
30005
30006 static tree
30007 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
30008 {
30009 char name[20];
30010 tree fntype, new_fndecl, args;
30011 unsigned arity;
30012 const char *bname;
30013 enum machine_mode el_mode, in_mode;
30014 int n, in_n;
30015
30016   /* SVML is suitable for unsafe math only. */
30017 if (!flag_unsafe_math_optimizations)
30018 return NULL_TREE;
30019
30020 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30021 n = TYPE_VECTOR_SUBPARTS (type_out);
30022 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30023 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30024 if (el_mode != in_mode
30025 || n != in_n)
30026 return NULL_TREE;
30027
30028 switch (fn)
30029 {
30030 case BUILT_IN_EXP:
30031 case BUILT_IN_LOG:
30032 case BUILT_IN_LOG10:
30033 case BUILT_IN_POW:
30034 case BUILT_IN_TANH:
30035 case BUILT_IN_TAN:
30036 case BUILT_IN_ATAN:
30037 case BUILT_IN_ATAN2:
30038 case BUILT_IN_ATANH:
30039 case BUILT_IN_CBRT:
30040 case BUILT_IN_SINH:
30041 case BUILT_IN_SIN:
30042 case BUILT_IN_ASINH:
30043 case BUILT_IN_ASIN:
30044 case BUILT_IN_COSH:
30045 case BUILT_IN_COS:
30046 case BUILT_IN_ACOSH:
30047 case BUILT_IN_ACOS:
30048 if (el_mode != DFmode || n != 2)
30049 return NULL_TREE;
30050 break;
30051
30052 case BUILT_IN_EXPF:
30053 case BUILT_IN_LOGF:
30054 case BUILT_IN_LOG10F:
30055 case BUILT_IN_POWF:
30056 case BUILT_IN_TANHF:
30057 case BUILT_IN_TANF:
30058 case BUILT_IN_ATANF:
30059 case BUILT_IN_ATAN2F:
30060 case BUILT_IN_ATANHF:
30061 case BUILT_IN_CBRTF:
30062 case BUILT_IN_SINHF:
30063 case BUILT_IN_SINF:
30064 case BUILT_IN_ASINHF:
30065 case BUILT_IN_ASINF:
30066 case BUILT_IN_COSHF:
30067 case BUILT_IN_COSF:
30068 case BUILT_IN_ACOSHF:
30069 case BUILT_IN_ACOSF:
30070 if (el_mode != SFmode || n != 4)
30071 return NULL_TREE;
30072 break;
30073
30074 default:
30075 return NULL_TREE;
30076 }
30077
30078 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30079
30080 if (fn == BUILT_IN_LOGF)
30081 strcpy (name, "vmlsLn4");
30082 else if (fn == BUILT_IN_LOG)
30083 strcpy (name, "vmldLn2");
30084 else if (n == 4)
30085 {
30086 sprintf (name, "vmls%s", bname+10);
30087 name[strlen (name)-1] = '4';
30088 }
30089 else
30090 sprintf (name, "vmld%s2", bname+10);
30091
30092 /* Convert to uppercase. */
30093 name[4] &= ~0x20;
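  /* Worked example of the mangling above: for BUILT_IN_SINF (decl name
     "__builtin_sinf", so bname+10 is "sinf") with n == 4 this produces
     "vmlssinf", the trailing character is overwritten with '4' and the
     fifth character is uppercased, giving "vmlsSin4"; the double variant
     BUILT_IN_SIN with n == 2 gives "vmldSin2".  (Illustrative trace of
     the code above, not an exhaustive list of SVML names.)  */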
30094
30095 arity = 0;
30096 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30097 args;
30098 args = TREE_CHAIN (args))
30099 arity++;
30100
30101 if (arity == 1)
30102 fntype = build_function_type_list (type_out, type_in, NULL);
30103 else
30104 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30105
30106 /* Build a function declaration for the vectorized function. */
30107 new_fndecl = build_decl (BUILTINS_LOCATION,
30108 FUNCTION_DECL, get_identifier (name), fntype);
30109 TREE_PUBLIC (new_fndecl) = 1;
30110 DECL_EXTERNAL (new_fndecl) = 1;
30111 DECL_IS_NOVOPS (new_fndecl) = 1;
30112 TREE_READONLY (new_fndecl) = 1;
30113
30114 return new_fndecl;
30115 }
30116
30117 /* Handler for an ACML-style interface to
30118 a library with vectorized intrinsics. */
30119
30120 static tree
30121 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
30122 {
30123 char name[20] = "__vr.._";
30124 tree fntype, new_fndecl, args;
30125 unsigned arity;
30126 const char *bname;
30127 enum machine_mode el_mode, in_mode;
30128 int n, in_n;
30129
30130   /* ACML is 64-bit only and suitable for unsafe math only, as it does
30131      not correctly support parts of IEEE arithmetic with the required
30132      precision, such as denormals.  */
30133 if (!TARGET_64BIT
30134 || !flag_unsafe_math_optimizations)
30135 return NULL_TREE;
30136
30137 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30138 n = TYPE_VECTOR_SUBPARTS (type_out);
30139 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30140 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30141 if (el_mode != in_mode
30142 || n != in_n)
30143 return NULL_TREE;
30144
30145 switch (fn)
30146 {
30147 case BUILT_IN_SIN:
30148 case BUILT_IN_COS:
30149 case BUILT_IN_EXP:
30150 case BUILT_IN_LOG:
30151 case BUILT_IN_LOG2:
30152 case BUILT_IN_LOG10:
30153 name[4] = 'd';
30154 name[5] = '2';
30155 if (el_mode != DFmode
30156 || n != 2)
30157 return NULL_TREE;
30158 break;
30159
30160 case BUILT_IN_SINF:
30161 case BUILT_IN_COSF:
30162 case BUILT_IN_EXPF:
30163 case BUILT_IN_POWF:
30164 case BUILT_IN_LOGF:
30165 case BUILT_IN_LOG2F:
30166 case BUILT_IN_LOG10F:
30167 name[4] = 's';
30168 name[5] = '4';
30169 if (el_mode != SFmode
30170 || n != 4)
30171 return NULL_TREE;
30172 break;
30173
30174 default:
30175 return NULL_TREE;
30176 }
30177
30178 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30179 sprintf (name + 7, "%s", bname+10);
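  /* Worked example of the mangling above: for BUILT_IN_SIN the prefix
     becomes "__vrd2_" and bname+10 is "sin", giving "__vrd2_sin"; for
     BUILT_IN_SINF it becomes "__vrs4_" plus "sinf", i.e. "__vrs4_sinf".
     (Illustrative trace of the code above.)  */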
30180
30181 arity = 0;
30182 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30183 args;
30184 args = TREE_CHAIN (args))
30185 arity++;
30186
30187 if (arity == 1)
30188 fntype = build_function_type_list (type_out, type_in, NULL);
30189 else
30190 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30191
30192 /* Build a function declaration for the vectorized function. */
30193 new_fndecl = build_decl (BUILTINS_LOCATION,
30194 FUNCTION_DECL, get_identifier (name), fntype);
30195 TREE_PUBLIC (new_fndecl) = 1;
30196 DECL_EXTERNAL (new_fndecl) = 1;
30197 DECL_IS_NOVOPS (new_fndecl) = 1;
30198 TREE_READONLY (new_fndecl) = 1;
30199
30200 return new_fndecl;
30201 }
30202
30203 /* Returns a decl of a function that implements gather load with
30204 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
30205 Return NULL_TREE if it is not available. */
30206
30207 static tree
30208 ix86_vectorize_builtin_gather (const_tree mem_vectype,
30209 const_tree index_type, int scale)
30210 {
30211 bool si;
30212 enum ix86_builtins code;
30213
30214 if (! TARGET_AVX2)
30215 return NULL_TREE;
30216
30217 if ((TREE_CODE (index_type) != INTEGER_TYPE
30218 && !POINTER_TYPE_P (index_type))
30219 || (TYPE_MODE (index_type) != SImode
30220 && TYPE_MODE (index_type) != DImode))
30221 return NULL_TREE;
30222
30223 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
30224 return NULL_TREE;
30225
30226   /* The v*gather* insns sign-extend the index to pointer mode.  */
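  /* For instance, an unsigned SImode index on a 64-bit target is rejected
     here because the hardware would sign-extend it, so an index value with
     its top bit set would address the wrong location.  */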
30227 if (TYPE_PRECISION (index_type) < POINTER_SIZE
30228 && TYPE_UNSIGNED (index_type))
30229 return NULL_TREE;
30230
30231 if (scale <= 0
30232 || scale > 8
30233 || (scale & (scale - 1)) != 0)
30234 return NULL_TREE;
30235
30236 si = TYPE_MODE (index_type) == SImode;
30237 switch (TYPE_MODE (mem_vectype))
30238 {
30239 case V2DFmode:
30240 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
30241 break;
30242 case V4DFmode:
30243 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
30244 break;
30245 case V2DImode:
30246 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
30247 break;
30248 case V4DImode:
30249 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
30250 break;
30251 case V4SFmode:
30252 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
30253 break;
30254 case V8SFmode:
30255 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
30256 break;
30257 case V4SImode:
30258 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
30259 break;
30260 case V8SImode:
30261 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
30262 break;
30263 default:
30264 return NULL_TREE;
30265 }
30266
30267 return ix86_builtins[code];
30268 }
30269
30270 /* Returns a decl of a target-specific builtin that implements the
30271    reciprocal of the function FN, or NULL_TREE if not available.  */
30272
30273 static tree
30274 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
30275 bool sqrt ATTRIBUTE_UNUSED)
30276 {
30277 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
30278 && flag_finite_math_only && !flag_trapping_math
30279 && flag_unsafe_math_optimizations))
30280 return NULL_TREE;
30281
30282 if (md_fn)
30283 /* Machine dependent builtins. */
30284 switch (fn)
30285 {
30286 /* Vectorized version of sqrt to rsqrt conversion. */
30287 case IX86_BUILTIN_SQRTPS_NR:
30288 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
30289
30290 case IX86_BUILTIN_SQRTPS_NR256:
30291 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
30292
30293 default:
30294 return NULL_TREE;
30295 }
30296 else
30297 /* Normal builtins. */
30298 switch (fn)
30299 {
30300 /* Sqrt to rsqrt conversion. */
30301 case BUILT_IN_SQRTF:
30302 return ix86_builtins[IX86_BUILTIN_RSQRTF];
30303
30304 default:
30305 return NULL_TREE;
30306 }
30307 }
30308 \f
30309 /* Helper for avx_vpermilps256_operand et al. This is also used by
30310 the expansion functions to turn the parallel back into a mask.
30311 The return value is 0 for no match and the imm8+1 for a match. */
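/* For example (tracing the encoding below), with MODE V4SFmode each
   selector occupies two bits at position 2*i, so a parallel selecting
   elements (3 1 0 2) yields imm8 = 3 | (1<<2) | (0<<4) | (2<<6) = 0x87
   and the function returns 0x88 (imm8 + 1).  V4DFmode instead uses one
   selector bit per element within each 128-bit lane.  */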
30312
30313 int
30314 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
30315 {
30316 unsigned i, nelt = GET_MODE_NUNITS (mode);
30317 unsigned mask = 0;
30318 unsigned char ipar[8];
30319
30320 if (XVECLEN (par, 0) != (int) nelt)
30321 return 0;
30322
30323 /* Validate that all of the elements are constants, and not totally
30324 out of range. Copy the data into an integral array to make the
30325 subsequent checks easier. */
30326 for (i = 0; i < nelt; ++i)
30327 {
30328 rtx er = XVECEXP (par, 0, i);
30329 unsigned HOST_WIDE_INT ei;
30330
30331 if (!CONST_INT_P (er))
30332 return 0;
30333 ei = INTVAL (er);
30334 if (ei >= nelt)
30335 return 0;
30336 ipar[i] = ei;
30337 }
30338
30339 switch (mode)
30340 {
30341 case V4DFmode:
30342 /* In the 256-bit DFmode case, we can only move elements within
30343 a 128-bit lane. */
30344 for (i = 0; i < 2; ++i)
30345 {
30346 if (ipar[i] >= 2)
30347 return 0;
30348 mask |= ipar[i] << i;
30349 }
30350 for (i = 2; i < 4; ++i)
30351 {
30352 if (ipar[i] < 2)
30353 return 0;
30354 mask |= (ipar[i] - 2) << i;
30355 }
30356 break;
30357
30358 case V8SFmode:
30359 /* In the 256-bit SFmode case, we have full freedom of movement
30360 within the low 128-bit lane, but the high 128-bit lane must
30361 mirror the exact same pattern. */
30362 for (i = 0; i < 4; ++i)
30363 if (ipar[i] + 4 != ipar[i + 4])
30364 return 0;
30365 nelt = 4;
30366 /* FALLTHRU */
30367
30368 case V2DFmode:
30369 case V4SFmode:
30370       /* In the 128-bit case, we have full freedom in the placement of
30371 the elements from the source operand. */
30372 for (i = 0; i < nelt; ++i)
30373 mask |= ipar[i] << (i * (nelt / 2));
30374 break;
30375
30376 default:
30377 gcc_unreachable ();
30378 }
30379
30380 /* Make sure success has a non-zero value by adding one. */
30381 return mask + 1;
30382 }
30383
30384 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
30385 the expansion functions to turn the parallel back into a mask.
30386 The return value is 0 for no match and the imm8+1 for a match. */
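/* For example (tracing the reconstruction below), with MODE V4DFmode a
   parallel (2 3 4 5) selects lane 1 of the first operand for the low
   half and lane 0 of the second operand for the high half, giving
   imm8 = 0x21 and a return value of 0x22.  */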
30387
30388 int
30389 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
30390 {
30391 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
30392 unsigned mask = 0;
30393 unsigned char ipar[8];
30394
30395 if (XVECLEN (par, 0) != (int) nelt)
30396 return 0;
30397
30398 /* Validate that all of the elements are constants, and not totally
30399 out of range. Copy the data into an integral array to make the
30400 subsequent checks easier. */
30401 for (i = 0; i < nelt; ++i)
30402 {
30403 rtx er = XVECEXP (par, 0, i);
30404 unsigned HOST_WIDE_INT ei;
30405
30406 if (!CONST_INT_P (er))
30407 return 0;
30408 ei = INTVAL (er);
30409 if (ei >= 2 * nelt)
30410 return 0;
30411 ipar[i] = ei;
30412 }
30413
30414   /* Validate that each half of the permute consists of consecutive elements.  */
30415 for (i = 0; i < nelt2 - 1; ++i)
30416 if (ipar[i] + 1 != ipar[i + 1])
30417 return 0;
30418 for (i = nelt2; i < nelt - 1; ++i)
30419 if (ipar[i] + 1 != ipar[i + 1])
30420 return 0;
30421
30422 /* Reconstruct the mask. */
30423 for (i = 0; i < 2; ++i)
30424 {
30425 unsigned e = ipar[i * nelt2];
30426 if (e % nelt2)
30427 return 0;
30428 e /= nelt2;
30429 mask |= e << (i * 4);
30430 }
30431
30432 /* Make sure success has a non-zero value by adding one. */
30433 return mask + 1;
30434 }
30435 \f
30436 /* Store OPERAND to memory after reload is completed.  This means
30437 that we can't easily use assign_stack_local. */
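/* A brief note on the strategy below: when the 64-bit red zone is
   available, the operand is simply stored just below the stack pointer
   without adjusting it; otherwise it is pushed with a pre-decrement of
   the stack pointer, and ix86_free_from_memory is expected to release
   the slot again afterwards.  */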
30438 rtx
30439 ix86_force_to_memory (enum machine_mode mode, rtx operand)
30440 {
30441 rtx result;
30442
30443 gcc_assert (reload_completed);
30444 if (ix86_using_red_zone ())
30445 {
30446 result = gen_rtx_MEM (mode,
30447 gen_rtx_PLUS (Pmode,
30448 stack_pointer_rtx,
30449 GEN_INT (-RED_ZONE_SIZE)));
30450 emit_move_insn (result, operand);
30451 }
30452 else if (TARGET_64BIT)
30453 {
30454 switch (mode)
30455 {
30456 case HImode:
30457 case SImode:
30458 operand = gen_lowpart (DImode, operand);
30459 /* FALLTHRU */
30460 case DImode:
30461 emit_insn (
30462 gen_rtx_SET (VOIDmode,
30463 gen_rtx_MEM (DImode,
30464 gen_rtx_PRE_DEC (DImode,
30465 stack_pointer_rtx)),
30466 operand));
30467 break;
30468 default:
30469 gcc_unreachable ();
30470 }
30471 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30472 }
30473 else
30474 {
30475 switch (mode)
30476 {
30477 case DImode:
30478 {
30479 rtx operands[2];
30480 split_double_mode (mode, &operand, 1, operands, operands + 1);
30481 emit_insn (
30482 gen_rtx_SET (VOIDmode,
30483 gen_rtx_MEM (SImode,
30484 gen_rtx_PRE_DEC (Pmode,
30485 stack_pointer_rtx)),
30486 operands[1]));
30487 emit_insn (
30488 gen_rtx_SET (VOIDmode,
30489 gen_rtx_MEM (SImode,
30490 gen_rtx_PRE_DEC (Pmode,
30491 stack_pointer_rtx)),
30492 operands[0]));
30493 }
30494 break;
30495 case HImode:
30496 /* Store HImodes as SImodes. */
30497 operand = gen_lowpart (SImode, operand);
30498 /* FALLTHRU */
30499 case SImode:
30500 emit_insn (
30501 gen_rtx_SET (VOIDmode,
30502 gen_rtx_MEM (GET_MODE (operand),
30503 gen_rtx_PRE_DEC (SImode,
30504 stack_pointer_rtx)),
30505 operand));
30506 break;
30507 default:
30508 gcc_unreachable ();
30509 }
30510 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30511 }
30512 return result;
30513 }
30514
30515 /* Free the operand from memory.  */
30516 void
30517 ix86_free_from_memory (enum machine_mode mode)
30518 {
30519 if (!ix86_using_red_zone ())
30520 {
30521 int size;
30522
30523 if (mode == DImode || TARGET_64BIT)
30524 size = 8;
30525 else
30526 size = 4;
30527       /* Use LEA to deallocate stack space.  In peephole2 it will be converted
30528 	 to a pop or add instruction if registers are available.  */
30529 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
30530 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
30531 GEN_INT (size))));
30532 }
30533 }
30534
30535 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
30536
30537 Put float CONST_DOUBLE in the constant pool instead of fp regs.
30538 QImode must go into class Q_REGS.
30539    Narrow ALL_REGS to GENERAL_REGS.  This allows movsf and
30540    movdf to do mem-to-mem moves through integer regs. */
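/* For example (as the checks below are arranged), an arbitrary nonzero
   constant requested in an SSE or MMX class yields NO_REGS so that it is
   pushed to the constant pool, while a CONST_DOUBLE such as 1.0 requested
   in an x87 class keeps a float class because standard_80387_constant_p
   recognizes it.  */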
30541
30542 static reg_class_t
30543 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
30544 {
30545 enum machine_mode mode = GET_MODE (x);
30546
30547 /* We're only allowed to return a subclass of CLASS. Many of the
30548 following checks fail for NO_REGS, so eliminate that early. */
30549 if (regclass == NO_REGS)
30550 return NO_REGS;
30551
30552 /* All classes can load zeros. */
30553 if (x == CONST0_RTX (mode))
30554 return regclass;
30555
30556 /* Force constants into memory if we are loading a (nonzero) constant into
30557 an MMX or SSE register. This is because there are no MMX/SSE instructions
30558 to load from a constant. */
30559 if (CONSTANT_P (x)
30560 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
30561 return NO_REGS;
30562
30563 /* Prefer SSE regs only, if we can use them for math. */
30564 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
30565 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
30566
30567 /* Floating-point constants need more complex checks. */
30568 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
30569 {
30570 /* General regs can load everything. */
30571 if (reg_class_subset_p (regclass, GENERAL_REGS))
30572 return regclass;
30573
30574 /* Floats can load 0 and 1 plus some others. Note that we eliminated
30575 zero above. We only want to wind up preferring 80387 registers if
30576 we plan on doing computation with them. */
30577 if (TARGET_80387
30578 && standard_80387_constant_p (x) > 0)
30579 {
30580 /* Limit class to non-sse. */
30581 if (regclass == FLOAT_SSE_REGS)
30582 return FLOAT_REGS;
30583 if (regclass == FP_TOP_SSE_REGS)
30584 return FP_TOP_REG;
30585 if (regclass == FP_SECOND_SSE_REGS)
30586 return FP_SECOND_REG;
30587 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
30588 return regclass;
30589 }
30590
30591 return NO_REGS;
30592 }
30593
30594   /* Generally when we see PLUS here, it's the function invariant
30595      (plus soft-fp const_int), which can only be computed into general
30596      regs.  */
30597 if (GET_CODE (x) == PLUS)
30598 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
30599
30600 /* QImode constants are easy to load, but non-constant QImode data
30601 must go into Q_REGS. */
30602 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
30603 {
30604 if (reg_class_subset_p (regclass, Q_REGS))
30605 return regclass;
30606 if (reg_class_subset_p (Q_REGS, regclass))
30607 return Q_REGS;
30608 return NO_REGS;
30609 }
30610
30611 return regclass;
30612 }
30613
30614 /* Discourage putting floating-point values in SSE registers unless
30615 SSE math is being used, and likewise for the 387 registers. */
30616 static reg_class_t
30617 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
30618 {
30619 enum machine_mode mode = GET_MODE (x);
30620
30621 /* Restrict the output reload class to the register bank that we are doing
30622 math on. If we would like not to return a subset of CLASS, reject this
30623 alternative: if reload cannot do this, it will still use its choice. */
30624 mode = GET_MODE (x);
30625 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
30626 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
30627
30628 if (X87_FLOAT_MODE_P (mode))
30629 {
30630 if (regclass == FP_TOP_SSE_REGS)
30631 return FP_TOP_REG;
30632 else if (regclass == FP_SECOND_SSE_REGS)
30633 return FP_SECOND_REG;
30634 else
30635 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
30636 }
30637
30638 return regclass;
30639 }
30640
30641 static reg_class_t
30642 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
30643 enum machine_mode mode, secondary_reload_info *sri)
30644 {
30645 /* Double-word spills from general registers to non-offsettable memory
30646 references (zero-extended addresses) require special handling. */
30647 if (TARGET_64BIT
30648 && MEM_P (x)
30649 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
30650 && rclass == GENERAL_REGS
30651 && !offsettable_memref_p (x))
30652 {
30653 sri->icode = (in_p
30654 ? CODE_FOR_reload_noff_load
30655 : CODE_FOR_reload_noff_store);
30656 /* Add the cost of moving address to a temporary. */
30657 sri->extra_cost = 1;
30658
30659 return NO_REGS;
30660 }
30661
30662   /* QImode spills from non-QI registers require an
30663      intermediate register on 32-bit targets.  */
30664 if (!TARGET_64BIT
30665 && !in_p && mode == QImode
30666 && (rclass == GENERAL_REGS
30667 || rclass == LEGACY_REGS
30668 || rclass == INDEX_REGS))
30669 {
30670 int regno;
30671
30672 if (REG_P (x))
30673 regno = REGNO (x);
30674 else
30675 regno = -1;
30676
30677 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
30678 regno = true_regnum (x);
30679
30680 /* Return Q_REGS if the operand is in memory. */
30681 if (regno == -1)
30682 return Q_REGS;
30683 }
30684
30685 /* This condition handles corner case where an expression involving
30686 pointers gets vectorized. We're trying to use the address of a
30687 stack slot as a vector initializer.
30688
30689 (set (reg:V2DI 74 [ vect_cst_.2 ])
30690 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
30691
30692 Eventually frame gets turned into sp+offset like this:
30693
30694 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30695 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30696 (const_int 392 [0x188]))))
30697
30698 That later gets turned into:
30699
30700 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30701 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30702 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
30703
30704 We'll have the following reload recorded:
30705
30706 Reload 0: reload_in (DI) =
30707 (plus:DI (reg/f:DI 7 sp)
30708 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
30709 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30710 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
30711 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
30712 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30713 reload_reg_rtx: (reg:V2DI 22 xmm1)
30714
30715 Which isn't going to work since SSE instructions can't handle scalar
30716 additions. Returning GENERAL_REGS forces the addition into integer
30717 register and reload can handle subsequent reloads without problems. */
30718
30719 if (in_p && GET_CODE (x) == PLUS
30720 && SSE_CLASS_P (rclass)
30721 && SCALAR_INT_MODE_P (mode))
30722 return GENERAL_REGS;
30723
30724 return NO_REGS;
30725 }
30726
30727 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
30728
30729 static bool
30730 ix86_class_likely_spilled_p (reg_class_t rclass)
30731 {
30732 switch (rclass)
30733 {
30734 case AREG:
30735 case DREG:
30736 case CREG:
30737 case BREG:
30738 case AD_REGS:
30739 case SIREG:
30740 case DIREG:
30741 case SSE_FIRST_REG:
30742 case FP_TOP_REG:
30743 case FP_SECOND_REG:
30744 return true;
30745
30746 default:
30747 break;
30748 }
30749
30750 return false;
30751 }
30752
30753 /* If we are copying between general and FP registers, we need a memory
30754 location. The same is true for SSE and MMX registers.
30755
30756 To optimize register_move_cost performance, allow inline variant.
30757
30758    The macro can't work reliably when one of the CLASSES is a class containing
30759    registers from multiple units (SSE, MMX, integer).  We avoid this by never
30760    combining those units in a single alternative in the machine description.
30761    Ensure that this constraint holds to avoid unexpected surprises.
30762
30763 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
30764 enforce these sanity checks. */
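/* For example, on a 32-bit target with SSE2 and inter-unit moves enabled,
   a DImode move between GENERAL_REGS and SSE_REGS is still reported as
   needing secondary memory, because the 8-byte mode exceeds UNITS_PER_WORD
   (an illustrative reading of the checks below).  */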
30765
30766 static inline bool
30767 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30768 enum machine_mode mode, int strict)
30769 {
30770 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
30771 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
30772 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
30773 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
30774 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
30775 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
30776 {
30777 gcc_assert (!strict);
30778 return true;
30779 }
30780
30781 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
30782 return true;
30783
30784   /* ??? This is a lie.  We do have moves between mmx/general and between
30785      mmx/sse2.  But by saying we need secondary memory we discourage the
30786      register allocator from using the mmx registers unless needed.  */
30787 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
30788 return true;
30789
30790 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30791 {
30792 /* SSE1 doesn't have any direct moves from other classes. */
30793 if (!TARGET_SSE2)
30794 return true;
30795
30796 /* If the target says that inter-unit moves are more expensive
30797 than moving through memory, then don't generate them. */
30798 if (!TARGET_INTER_UNIT_MOVES)
30799 return true;
30800
30801 /* Between SSE and general, we have moves no larger than word size. */
30802 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
30803 return true;
30804 }
30805
30806 return false;
30807 }
30808
30809 bool
30810 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30811 enum machine_mode mode, int strict)
30812 {
30813 return inline_secondary_memory_needed (class1, class2, mode, strict);
30814 }
30815
30816 /* Implement the TARGET_CLASS_MAX_NREGS hook.
30817
30818 On the 80386, this is the size of MODE in words,
30819 except in the FP regs, where a single reg is always enough. */
30820
30821 static unsigned char
30822 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
30823 {
30824 if (MAYBE_INTEGER_CLASS_P (rclass))
30825 {
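      /* XFmode is the 80-bit extended type padded to 12 bytes with 32-bit
	 words and to 16 bytes in 64-bit mode, hence 3 or 2 integer
	 registers; XCmode is the corresponding complex pair (a rough
	 explanatory note; the exact padding follows the psABI).  */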
30826 if (mode == XFmode)
30827 return (TARGET_64BIT ? 2 : 3);
30828 else if (mode == XCmode)
30829 return (TARGET_64BIT ? 4 : 6);
30830 else
30831 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
30832 }
30833 else
30834 {
30835 if (COMPLEX_MODE_P (mode))
30836 return 2;
30837 else
30838 return 1;
30839 }
30840 }
30841
30842 /* Return true if the registers in CLASS cannot represent the change from
30843 modes FROM to TO. */
30844
30845 bool
30846 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
30847 enum reg_class regclass)
30848 {
30849 if (from == to)
30850 return false;
30851
30852 /* x87 registers can't do subreg at all, as all values are reformatted
30853 to extended precision. */
30854 if (MAYBE_FLOAT_CLASS_P (regclass))
30855 return true;
30856
30857 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
30858 {
30859 /* Vector registers do not support QI or HImode loads. If we don't
30860 disallow a change to these modes, reload will assume it's ok to
30861 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
30862 the vec_dupv4hi pattern. */
30863 if (GET_MODE_SIZE (from) < 4)
30864 return true;
30865
30866 /* Vector registers do not support subreg with nonzero offsets, which
30867 are otherwise valid for integer registers. Since we can't see
30868 whether we have a nonzero offset from here, prohibit all
30869 nonparadoxical subregs changing size. */
30870 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
30871 return true;
30872 }
30873
30874 return false;
30875 }
30876
30877 /* Return the cost of moving data of mode M between a
30878 register and memory. A value of 2 is the default; this cost is
30879 relative to those in `REGISTER_MOVE_COST'.
30880
30881 This function is used extensively by register_move_cost that is used to
30882 build tables at startup. Make it inline in this case.
30883    When IN is 2, return the maximum of the in and out move costs.
30884
30885 If moving between registers and memory is more expensive than
30886 between two registers, you should define this macro to express the
30887 relative cost.
30888
30889    Also model the increased cost of moving QImode registers in
30890    non-Q_REGS classes.
30891 */
30892 static inline int
30893 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
30894 int in)
30895 {
30896 int cost;
30897 if (FLOAT_CLASS_P (regclass))
30898 {
30899 int index;
30900 switch (mode)
30901 {
30902 case SFmode:
30903 index = 0;
30904 break;
30905 case DFmode:
30906 index = 1;
30907 break;
30908 case XFmode:
30909 index = 2;
30910 break;
30911 default:
30912 return 100;
30913 }
30914 if (in == 2)
30915 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
30916 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
30917 }
30918 if (SSE_CLASS_P (regclass))
30919 {
30920 int index;
30921 switch (GET_MODE_SIZE (mode))
30922 {
30923 case 4:
30924 index = 0;
30925 break;
30926 case 8:
30927 index = 1;
30928 break;
30929 case 16:
30930 index = 2;
30931 break;
30932 default:
30933 return 100;
30934 }
30935 if (in == 2)
30936 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
30937 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
30938 }
30939 if (MMX_CLASS_P (regclass))
30940 {
30941 int index;
30942 switch (GET_MODE_SIZE (mode))
30943 {
30944 case 4:
30945 index = 0;
30946 break;
30947 case 8:
30948 index = 1;
30949 break;
30950 default:
30951 return 100;
30952 }
30953       if (in == 2)
30954 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
30955 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
30956 }
30957 switch (GET_MODE_SIZE (mode))
30958 {
30959 case 1:
30960 if (Q_CLASS_P (regclass) || TARGET_64BIT)
30961 {
30962 if (!in)
30963 return ix86_cost->int_store[0];
30964 if (TARGET_PARTIAL_REG_DEPENDENCY
30965 && optimize_function_for_speed_p (cfun))
30966 cost = ix86_cost->movzbl_load;
30967 else
30968 cost = ix86_cost->int_load[0];
30969 if (in == 2)
30970 return MAX (cost, ix86_cost->int_store[0]);
30971 return cost;
30972 }
30973 else
30974 {
30975 if (in == 2)
30976 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
30977 if (in)
30978 return ix86_cost->movzbl_load;
30979 else
30980 return ix86_cost->int_store[0] + 4;
30981 }
30982 break;
30983 case 2:
30984 if (in == 2)
30985 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
30986 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
30987 default:
30988 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
30989 if (mode == TFmode)
30990 mode = XFmode;
30991 if (in == 2)
30992 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
30993 else if (in)
30994 cost = ix86_cost->int_load[2];
30995 else
30996 cost = ix86_cost->int_store[2];
30997 return (cost * (((int) GET_MODE_SIZE (mode)
30998 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
30999 }
31000 }
31001
31002 static int
31003 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
31004 bool in)
31005 {
31006 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
31007 }
31008
31009
31010 /* Return the cost of moving data from a register in class CLASS1 to
31011 one in class CLASS2.
31012
31013 It is not required that the cost always equal 2 when FROM is the same as TO;
31014 on some machines it is expensive to move between registers if they are not
31015 general registers. */
31016
31017 static int
31018 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
31019 reg_class_t class2_i)
31020 {
31021 enum reg_class class1 = (enum reg_class) class1_i;
31022 enum reg_class class2 = (enum reg_class) class2_i;
31023
31024 /* In case we require secondary memory, compute cost of the store followed
31025 by load. In order to avoid bad register allocation choices, we need
31026 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
31027
31028 if (inline_secondary_memory_needed (class1, class2, mode, 0))
31029 {
31030 int cost = 1;
31031
31032 cost += inline_memory_move_cost (mode, class1, 2);
31033 cost += inline_memory_move_cost (mode, class2, 2);
31034
31035       /* In the case of copying from a general purpose register we may emit
31036 	 multiple stores followed by a single load, causing a memory size
31037 	 mismatch stall.  Count this as an arbitrarily high cost of 20.  */
31038 if (targetm.class_max_nregs (class1, mode)
31039 > targetm.class_max_nregs (class2, mode))
31040 cost += 20;
31041
31042 /* In the case of FP/MMX moves, the registers actually overlap, and we
31043 have to switch modes in order to treat them differently. */
31044 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31045 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31046 cost += 20;
31047
31048 return cost;
31049 }
31050
31051 /* Moves between SSE/MMX and integer unit are expensive. */
31052 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31053 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31054
31055     /* ??? By keeping the returned value relatively high, we limit the number
31056        of moves between integer and MMX/SSE registers for all targets.
31057        Additionally, a high value prevents a problem with x86_modes_tieable_p(),
31058        where integer modes in MMX/SSE registers are not tieable
31059        because of missing QImode and HImode moves to, from or between
31060        MMX/SSE registers.  */
31061 return MAX (8, ix86_cost->mmxsse_to_integer);
31062
31063 if (MAYBE_FLOAT_CLASS_P (class1))
31064 return ix86_cost->fp_move;
31065 if (MAYBE_SSE_CLASS_P (class1))
31066 return ix86_cost->sse_move;
31067 if (MAYBE_MMX_CLASS_P (class1))
31068 return ix86_cost->mmx_move;
31069 return 2;
31070 }
31071
31072 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31073 MODE. */
31074
31075 bool
31076 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31077 {
31078   /* Flags, and only flags, can hold CCmode values.  */
31079 if (CC_REGNO_P (regno))
31080 return GET_MODE_CLASS (mode) == MODE_CC;
31081 if (GET_MODE_CLASS (mode) == MODE_CC
31082 || GET_MODE_CLASS (mode) == MODE_RANDOM
31083 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31084 return false;
31085 if (FP_REGNO_P (regno))
31086 return VALID_FP_MODE_P (mode);
31087 if (SSE_REGNO_P (regno))
31088 {
31089 /* We implement the move patterns for all vector modes into and
31090 out of SSE registers, even when no operation instructions
31091 are available. OImode move is available only when AVX is
31092 enabled. */
31093 return ((TARGET_AVX && mode == OImode)
31094 || VALID_AVX256_REG_MODE (mode)
31095 || VALID_SSE_REG_MODE (mode)
31096 || VALID_SSE2_REG_MODE (mode)
31097 || VALID_MMX_REG_MODE (mode)
31098 || VALID_MMX_REG_MODE_3DNOW (mode));
31099 }
31100 if (MMX_REGNO_P (regno))
31101 {
31102 /* We implement the move patterns for 3DNOW modes even in MMX mode,
31103 so if the register is available at all, then we can move data of
31104 the given mode into or out of it. */
31105 return (VALID_MMX_REG_MODE (mode)
31106 || VALID_MMX_REG_MODE_3DNOW (mode));
31107 }
31108
31109 if (mode == QImode)
31110 {
31111 /* Take care of QImode values - they can be in non-QI regs,
31112 but then they do cause partial register stalls. */
31113 if (regno <= BX_REG || TARGET_64BIT)
31114 return true;
31115 if (!TARGET_PARTIAL_REG_STALL)
31116 return true;
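      /* After reload no new pseudos can be created, so we have to
         accept QImode in whatever register it ended up in.  */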
31117 return !can_create_pseudo_p ();
31118 }
31119 /* We handle both integers and floats in the general purpose registers. */
31120 else if (VALID_INT_MODE_P (mode))
31121 return true;
31122 else if (VALID_FP_MODE_P (mode))
31123 return true;
31124 else if (VALID_DFP_MODE_P (mode))
31125 return true;
31126 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
31127 on to use that value in smaller contexts, this can easily force a
31128 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
31129 supporting DImode, allow it. */
31130 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
31131 return true;
31132
31133 return false;
31134 }
31135
31136 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
31137 tieable integer mode. */
31138
31139 static bool
31140 ix86_tieable_integer_mode_p (enum machine_mode mode)
31141 {
31142 switch (mode)
31143 {
31144 case HImode:
31145 case SImode:
31146 return true;
31147
31148 case QImode:
31149 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
31150
31151 case DImode:
31152 return TARGET_64BIT;
31153
31154 default:
31155 return false;
31156 }
31157 }
31158
31159 /* Return true if MODE1 is accessible in a register that can hold MODE2
31160 without copying. That is, all register classes that can hold MODE2
31161 can also hold MODE1. */
31162
31163 bool
31164 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
31165 {
31166 if (mode1 == mode2)
31167 return true;
31168
31169 if (ix86_tieable_integer_mode_p (mode1)
31170 && ix86_tieable_integer_mode_p (mode2))
31171 return true;
31172
31173 /* MODE2 being XFmode implies fp stack or general regs, which means we
31174 can tie any smaller floating point modes to it. Note that we do not
31175 tie this with TFmode. */
31176 if (mode2 == XFmode)
31177 return mode1 == SFmode || mode1 == DFmode;
31178
31179 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
31180 that we can tie it with SFmode. */
31181 if (mode2 == DFmode)
31182 return mode1 == SFmode;
31183
31184 /* If MODE2 is only appropriate for an SSE register, then tie with
31185 any other mode acceptable to SSE registers. */
31186 if (GET_MODE_SIZE (mode2) == 16
31187 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31188 return (GET_MODE_SIZE (mode1) == 16
31189 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31190
31191 /* If MODE2 is appropriate for an MMX register, then tie
31192 with any other mode acceptable to MMX registers. */
31193 if (GET_MODE_SIZE (mode2) == 8
31194 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
31195 return (GET_MODE_SIZE (mode1) == 8
31196 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
31197
31198 return false;
31199 }
31200
31201 /* Compute a (partial) cost for rtx X. Return true if the complete
31202 cost has been computed, and false if subexpressions should be
31203 scanned. In either case, *TOTAL contains the cost result. */
31204
31205 static bool
31206 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
31207 bool speed)
31208 {
31209 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
31210 enum machine_mode mode = GET_MODE (x);
31211 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
31212
31213 switch (code)
31214 {
31215 case CONST_INT:
31216 case CONST:
31217 case LABEL_REF:
31218 case SYMBOL_REF:
31219 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
31220 *total = 3;
31221 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
31222 *total = 2;
31223 else if (flag_pic && SYMBOLIC_CONST (x)
31224 && (!TARGET_64BIT
31225 || (GET_CODE (x) != LABEL_REF
31226 && (GET_CODE (x) != SYMBOL_REF
31227 || !SYMBOL_REF_LOCAL_P (x)))))
31228 *total = 1;
31229 else
31230 *total = 0;
31231 return true;
31232
31233 case CONST_DOUBLE:
31234 if (mode == VOIDmode)
31235 *total = 0;
31236 else
31237 switch (standard_80387_constant_p (x))
31238 {
31239 case 1: /* 0.0 */
31240 *total = 1;
31241 break;
31242 default: /* Other constants */
31243 *total = 2;
31244 break;
31245 case 0:
31246 case -1:
31247 /* Start with (MEM (SYMBOL_REF)), since that's where
31248 it'll probably end up. Add a penalty for size. */
31249 *total = (COSTS_N_INSNS (1)
31250 + (flag_pic != 0 && !TARGET_64BIT)
31251 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
31252 break;
31253 }
31254 return true;
31255
31256 case ZERO_EXTEND:
31257 /* The zero extension is often completely free on x86_64, so make
31258 it as cheap as possible. */
31259 if (TARGET_64BIT && mode == DImode
31260 && GET_MODE (XEXP (x, 0)) == SImode)
31261 *total = 1;
31262 else if (TARGET_ZERO_EXTEND_WITH_AND)
31263 *total = cost->add;
31264 else
31265 *total = cost->movzx;
31266 return false;
31267
31268 case SIGN_EXTEND:
31269 *total = cost->movsx;
31270 return false;
31271
31272 case ASHIFT:
31273 if (CONST_INT_P (XEXP (x, 1))
31274 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
31275 {
31276 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31277 if (value == 1)
31278 {
31279 *total = cost->add;
31280 return false;
31281 }
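          /* A left shift by 2 or 3 can be done with an lea using a scale
             of 4 or 8 when that is no more expensive than a constant
             shift.  */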
31282 if ((value == 2 || value == 3)
31283 && cost->lea <= cost->shift_const)
31284 {
31285 *total = cost->lea;
31286 return false;
31287 }
31288 }
31289 /* FALLTHRU */
31290
31291 case ROTATE:
31292 case ASHIFTRT:
31293 case LSHIFTRT:
31294 case ROTATERT:
31295 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
31296 {
31297 if (CONST_INT_P (XEXP (x, 1)))
31298 {
31299 if (INTVAL (XEXP (x, 1)) > 32)
31300 *total = cost->shift_const + COSTS_N_INSNS (2);
31301 else
31302 *total = cost->shift_const * 2;
31303 }
31304 else
31305 {
31306 if (GET_CODE (XEXP (x, 1)) == AND)
31307 *total = cost->shift_var * 2;
31308 else
31309 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
31310 }
31311 }
31312 else
31313 {
31314 if (CONST_INT_P (XEXP (x, 1)))
31315 *total = cost->shift_const;
31316 else
31317 *total = cost->shift_var;
31318 }
31319 return false;
31320
31321 case FMA:
31322 {
31323 rtx sub;
31324
31325 gcc_assert (FLOAT_MODE_P (mode));
31326 gcc_assert (TARGET_FMA || TARGET_FMA4);
31327
31328 /* ??? SSE scalar/vector cost should be used here. */
31329 /* ??? Bald assumption that fma has the same cost as fmul. */
31330 *total = cost->fmul;
31331 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
31332
31333 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
31334 sub = XEXP (x, 0);
31335 if (GET_CODE (sub) == NEG)
31336 sub = XEXP (sub, 0);
31337 *total += rtx_cost (sub, FMA, 0, speed);
31338
31339 sub = XEXP (x, 2);
31340 if (GET_CODE (sub) == NEG)
31341 sub = XEXP (sub, 0);
31342 *total += rtx_cost (sub, FMA, 2, speed);
31343 return true;
31344 }
31345
31346 case MULT:
31347 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31348 {
31349 /* ??? SSE scalar cost should be used here. */
31350 *total = cost->fmul;
31351 return false;
31352 }
31353 else if (X87_FLOAT_MODE_P (mode))
31354 {
31355 *total = cost->fmul;
31356 return false;
31357 }
31358 else if (FLOAT_MODE_P (mode))
31359 {
31360 /* ??? SSE vector cost should be used here. */
31361 *total = cost->fmul;
31362 return false;
31363 }
31364 else
31365 {
31366 rtx op0 = XEXP (x, 0);
31367 rtx op1 = XEXP (x, 1);
31368 int nbits;
31369 if (CONST_INT_P (XEXP (x, 1)))
31370 {
31371 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31372 for (nbits = 0; value != 0; value &= value - 1)
31373 nbits++;
31374 }
31375 else
31376 /* This is arbitrary. */
31377 nbits = 7;
31378
31379 /* Compute costs correctly for widening multiplication. */
31380 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
31381 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
31382 == GET_MODE_SIZE (mode))
31383 {
31384 int is_mulwiden = 0;
31385 enum machine_mode inner_mode = GET_MODE (op0);
31386
31387 if (GET_CODE (op0) == GET_CODE (op1))
31388 is_mulwiden = 1, op1 = XEXP (op1, 0);
31389 else if (CONST_INT_P (op1))
31390 {
31391 if (GET_CODE (op0) == SIGN_EXTEND)
31392 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
31393 == INTVAL (op1);
31394 else
31395 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
31396 }
31397
31398 if (is_mulwiden)
31399 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
31400 }
31401
31402 *total = (cost->mult_init[MODE_INDEX (mode)]
31403 + nbits * cost->mult_bit
31404 + rtx_cost (op0, outer_code, opno, speed)
31405 + rtx_cost (op1, outer_code, opno, speed));
31406
31407 return true;
31408 }
31409
31410 case DIV:
31411 case UDIV:
31412 case MOD:
31413 case UMOD:
31414 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31415 /* ??? SSE cost should be used here. */
31416 *total = cost->fdiv;
31417 else if (X87_FLOAT_MODE_P (mode))
31418 *total = cost->fdiv;
31419 else if (FLOAT_MODE_P (mode))
31420 /* ??? SSE vector cost should be used here. */
31421 *total = cost->fdiv;
31422 else
31423 *total = cost->divide[MODE_INDEX (mode)];
31424 return false;
31425
31426 case PLUS:
31427 if (GET_MODE_CLASS (mode) == MODE_INT
31428 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
31429 {
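          /* Recognize the addressing forms an lea can encode:
             base + index * {2,4,8} + displacement.  */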
31430 if (GET_CODE (XEXP (x, 0)) == PLUS
31431 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
31432 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
31433 && CONSTANT_P (XEXP (x, 1)))
31434 {
31435 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
31436 if (val == 2 || val == 4 || val == 8)
31437 {
31438 *total = cost->lea;
31439 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31440 outer_code, opno, speed);
31441 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
31442 outer_code, opno, speed);
31443 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31444 return true;
31445 }
31446 }
31447 else if (GET_CODE (XEXP (x, 0)) == MULT
31448 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
31449 {
31450 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
31451 if (val == 2 || val == 4 || val == 8)
31452 {
31453 *total = cost->lea;
31454 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31455 outer_code, opno, speed);
31456 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31457 return true;
31458 }
31459 }
31460 else if (GET_CODE (XEXP (x, 0)) == PLUS)
31461 {
31462 *total = cost->lea;
31463 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31464 outer_code, opno, speed);
31465 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31466 outer_code, opno, speed);
31467 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31468 return true;
31469 }
31470 }
31471 /* FALLTHRU */
31472
31473 case MINUS:
31474 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31475 {
31476 /* ??? SSE cost should be used here. */
31477 *total = cost->fadd;
31478 return false;
31479 }
31480 else if (X87_FLOAT_MODE_P (mode))
31481 {
31482 *total = cost->fadd;
31483 return false;
31484 }
31485 else if (FLOAT_MODE_P (mode))
31486 {
31487 /* ??? SSE vector cost should be used here. */
31488 *total = cost->fadd;
31489 return false;
31490 }
31491 /* FALLTHRU */
31492
31493 case AND:
31494 case IOR:
31495 case XOR:
31496 if (!TARGET_64BIT && mode == DImode)
31497 {
31498 *total = (cost->add * 2
31499 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
31500 << (GET_MODE (XEXP (x, 0)) != DImode))
31501 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
31502 << (GET_MODE (XEXP (x, 1)) != DImode)));
31503 return true;
31504 }
31505 /* FALLTHRU */
31506
31507 case NEG:
31508 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31509 {
31510 /* ??? SSE cost should be used here. */
31511 *total = cost->fchs;
31512 return false;
31513 }
31514 else if (X87_FLOAT_MODE_P (mode))
31515 {
31516 *total = cost->fchs;
31517 return false;
31518 }
31519 else if (FLOAT_MODE_P (mode))
31520 {
31521 /* ??? SSE vector cost should be used here. */
31522 *total = cost->fchs;
31523 return false;
31524 }
31525 /* FALLTHRU */
31526
31527 case NOT:
31528 if (!TARGET_64BIT && mode == DImode)
31529 *total = cost->add * 2;
31530 else
31531 *total = cost->add;
31532 return false;
31533
31534 case COMPARE:
31535 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
31536 && XEXP (XEXP (x, 0), 1) == const1_rtx
31537 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
31538 && XEXP (x, 1) == const0_rtx)
31539 {
31540 /* This kind of construct is implemented using test[bwl].
31541 Treat it as if we had an AND. */
31542 *total = (cost->add
31543 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
31544 + rtx_cost (const1_rtx, outer_code, opno, speed));
31545 return true;
31546 }
31547 return false;
31548
31549 case FLOAT_EXTEND:
31550 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
31551 *total = 0;
31552 return false;
31553
31554 case ABS:
31555 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31556 /* ??? SSE cost should be used here. */
31557 *total = cost->fabs;
31558 else if (X87_FLOAT_MODE_P (mode))
31559 *total = cost->fabs;
31560 else if (FLOAT_MODE_P (mode))
31561 /* ??? SSE vector cost should be used here. */
31562 *total = cost->fabs;
31563 return false;
31564
31565 case SQRT:
31566 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31567 /* ??? SSE cost should be used here. */
31568 *total = cost->fsqrt;
31569 else if (X87_FLOAT_MODE_P (mode))
31570 *total = cost->fsqrt;
31571 else if (FLOAT_MODE_P (mode))
31572 /* ??? SSE vector cost should be used here. */
31573 *total = cost->fsqrt;
31574 return false;
31575
31576 case UNSPEC:
31577 if (XINT (x, 1) == UNSPEC_TP)
31578 *total = 0;
31579 return false;
31580
31581 case VEC_SELECT:
31582 case VEC_CONCAT:
31583 case VEC_MERGE:
31584 case VEC_DUPLICATE:
31585 /* ??? Assume all of these vector manipulation patterns are
31586 recognizable, in which case they all pretty much have the
31587 same cost. */
31588 *total = COSTS_N_INSNS (1);
31589 return true;
31590
31591 default:
31592 return false;
31593 }
31594 }
31595
31596 #if TARGET_MACHO
31597
31598 static int current_machopic_label_num;
31599
31600 /* Given a symbol name and its associated stub, write out the
31601 definition of the stub. */
31602
31603 void
31604 machopic_output_stub (FILE *file, const char *symb, const char *stub)
31605 {
31606 unsigned int length;
31607 char *binder_name, *symbol_name, lazy_ptr_name[32];
31608 int label = ++current_machopic_label_num;
31609
31610 /* For 64-bit we shouldn't get here. */
31611 gcc_assert (!TARGET_64BIT);
31612
31613 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
31614 symb = targetm.strip_name_encoding (symb);
31615
31616 length = strlen (stub);
31617 binder_name = XALLOCAVEC (char, length + 32);
31618 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
31619
31620 length = strlen (symb);
31621 symbol_name = XALLOCAVEC (char, length + 32);
31622 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
31623
31624 sprintf (lazy_ptr_name, "L%d$lz", label);
31625
31626 if (MACHOPIC_ATT_STUB)
31627 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
31628 else if (MACHOPIC_PURE)
31629 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
31630 else
31631 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
31632
31633 fprintf (file, "%s:\n", stub);
31634 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31635
31636 if (MACHOPIC_ATT_STUB)
31637 {
31638 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
31639 }
31640 else if (MACHOPIC_PURE)
31641 {
31642 /* PIC stub. */
31643 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31644 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
31645 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
31646 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
31647 label, lazy_ptr_name, label);
31648 fprintf (file, "\tjmp\t*%%ecx\n");
31649 }
31650 else
31651 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
31652
31653 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
31654 it needs no stub-binding-helper. */
31655 if (MACHOPIC_ATT_STUB)
31656 return;
31657
31658 fprintf (file, "%s:\n", binder_name);
31659
31660 if (MACHOPIC_PURE)
31661 {
31662 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
31663 fprintf (file, "\tpushl\t%%ecx\n");
31664 }
31665 else
31666 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
31667
31668 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
31669
31670 /* N.B. Keep the correspondence of these
31671 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
31672 old-pic/new-pic/non-pic stubs; altering this will break
31673 compatibility with existing dylibs. */
31674 if (MACHOPIC_PURE)
31675 {
31676 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31677 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
31678 }
31679 else
31680 /* 16-byte -mdynamic-no-pic stub. */
31681 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
31682
31683 fprintf (file, "%s:\n", lazy_ptr_name);
31684 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31685 fprintf (file, ASM_LONG "%s\n", binder_name);
31686 }
31687 #endif /* TARGET_MACHO */
31688
31689 /* Order the registers for register allocator. */
31690
31691 void
31692 x86_order_regs_for_local_alloc (void)
31693 {
31694 int pos = 0;
31695 int i;
31696
31697 /* First allocate the local general purpose registers. */
31698 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31699 if (GENERAL_REGNO_P (i) && call_used_regs[i])
31700 reg_alloc_order [pos++] = i;
31701
31702 /* Global general purpose registers. */
31703 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31704 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
31705 reg_alloc_order [pos++] = i;
31706
31707 /* x87 registers come first in case we are doing FP math
31708 using them. */
31709 if (!TARGET_SSE_MATH)
31710 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31711 reg_alloc_order [pos++] = i;
31712
31713 /* SSE registers. */
31714 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
31715 reg_alloc_order [pos++] = i;
31716 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
31717 reg_alloc_order [pos++] = i;
31718
31719 /* x87 registers. */
31720 if (TARGET_SSE_MATH)
31721 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31722 reg_alloc_order [pos++] = i;
31723
31724 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
31725 reg_alloc_order [pos++] = i;
31726
31727 /* Initialize the rest of the array, as we do not allocate some
31728 registers at all. */
31729 while (pos < FIRST_PSEUDO_REGISTER)
31730 reg_alloc_order [pos++] = 0;
31731 }
31732
31733 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
31734 in struct attribute_spec.handler. */
31735 static tree
31736 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
31737 tree args,
31738 int flags ATTRIBUTE_UNUSED,
31739 bool *no_add_attrs)
31740 {
31741 if (TREE_CODE (*node) != FUNCTION_TYPE
31742 && TREE_CODE (*node) != METHOD_TYPE
31743 && TREE_CODE (*node) != FIELD_DECL
31744 && TREE_CODE (*node) != TYPE_DECL)
31745 {
31746 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31747 name);
31748 *no_add_attrs = true;
31749 return NULL_TREE;
31750 }
31751 if (TARGET_64BIT)
31752 {
31753 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
31754 name);
31755 *no_add_attrs = true;
31756 return NULL_TREE;
31757 }
31758 if (is_attribute_p ("callee_pop_aggregate_return", name))
31759 {
31760 tree cst;
31761
31762 cst = TREE_VALUE (args);
31763 if (TREE_CODE (cst) != INTEGER_CST)
31764 {
31765 warning (OPT_Wattributes,
31766 "%qE attribute requires an integer constant argument",
31767 name);
31768 *no_add_attrs = true;
31769 }
31770 else if (compare_tree_int (cst, 0) != 0
31771 && compare_tree_int (cst, 1) != 0)
31772 {
31773 warning (OPT_Wattributes,
31774 "argument to %qE attribute is neither zero, nor one",
31775 name);
31776 *no_add_attrs = true;
31777 }
31778
31779 return NULL_TREE;
31780 }
31781
31782 return NULL_TREE;
31783 }
31784
31785 /* Handle an "ms_abi" or "sysv_abi" attribute; arguments as in
31786 struct attribute_spec.handler. */
31787 static tree
31788 ix86_handle_abi_attribute (tree *node, tree name,
31789 tree args ATTRIBUTE_UNUSED,
31790 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31791 {
31792 if (TREE_CODE (*node) != FUNCTION_TYPE
31793 && TREE_CODE (*node) != METHOD_TYPE
31794 && TREE_CODE (*node) != FIELD_DECL
31795 && TREE_CODE (*node) != TYPE_DECL)
31796 {
31797 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31798 name);
31799 *no_add_attrs = true;
31800 return NULL_TREE;
31801 }
31802
31803 /* Can combine regparm with all attributes but fastcall. */
31804 if (is_attribute_p ("ms_abi", name))
31805 {
31806 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
31807 {
31808 error ("ms_abi and sysv_abi attributes are not compatible");
31809 }
31810
31811 return NULL_TREE;
31812 }
31813 else if (is_attribute_p ("sysv_abi", name))
31814 {
31815 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
31816 {
31817 error ("ms_abi and sysv_abi attributes are not compatible");
31818 }
31819
31820 return NULL_TREE;
31821 }
31822
31823 return NULL_TREE;
31824 }
31825
31826 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
31827 struct attribute_spec.handler. */
31828 static tree
31829 ix86_handle_struct_attribute (tree *node, tree name,
31830 tree args ATTRIBUTE_UNUSED,
31831 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31832 {
31833 tree *type = NULL;
31834 if (DECL_P (*node))
31835 {
31836 if (TREE_CODE (*node) == TYPE_DECL)
31837 type = &TREE_TYPE (*node);
31838 }
31839 else
31840 type = node;
31841
31842 if (!(type && (TREE_CODE (*type) == RECORD_TYPE
31843 || TREE_CODE (*type) == UNION_TYPE)))
31844 {
31845 warning (OPT_Wattributes, "%qE attribute ignored",
31846 name);
31847 *no_add_attrs = true;
31848 }
31849
31850 else if ((is_attribute_p ("ms_struct", name)
31851 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
31852 || ((is_attribute_p ("gcc_struct", name)
31853 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
31854 {
31855 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
31856 name);
31857 *no_add_attrs = true;
31858 }
31859
31860 return NULL_TREE;
31861 }
31862
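/* Handle an attribute that is valid only on function declarations;
   arguments as in struct attribute_spec.handler.  */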
31863 static tree
31864 ix86_handle_fndecl_attribute (tree *node, tree name,
31865 tree args ATTRIBUTE_UNUSED,
31866 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31867 {
31868 if (TREE_CODE (*node) != FUNCTION_DECL)
31869 {
31870 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31871 name);
31872 *no_add_attrs = true;
31873 }
31874 return NULL_TREE;
31875 }
31876
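/* Return true if fields of RECORD_TYPE should be laid out using the
   Microsoft bitfield layout rules.  */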
31877 static bool
31878 ix86_ms_bitfield_layout_p (const_tree record_type)
31879 {
31880 return ((TARGET_MS_BITFIELD_LAYOUT
31881 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
31882 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
31883 }
31884
31885 /* Returns an expression indicating where the this parameter is
31886 located on entry to the FUNCTION. */
31887
31888 static rtx
31889 x86_this_parameter (tree function)
31890 {
31891 tree type = TREE_TYPE (function);
31892 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
31893 int nregs;
31894
31895 if (TARGET_64BIT)
31896 {
31897 const int *parm_regs;
31898
31899 if (ix86_function_type_abi (type) == MS_ABI)
31900 parm_regs = x86_64_ms_abi_int_parameter_registers;
31901 else
31902 parm_regs = x86_64_int_parameter_registers;
31903 return gen_rtx_REG (DImode, parm_regs[aggr]);
31904 }
31905
31906 nregs = ix86_function_regparm (type, function);
31907
31908 if (nregs > 0 && !stdarg_p (type))
31909 {
31910 int regno;
31911 unsigned int ccvt = ix86_get_callcvt (type);
31912
31913 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31914 regno = aggr ? DX_REG : CX_REG;
31915 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31916 {
31917 regno = CX_REG;
31918 if (aggr)
31919 return gen_rtx_MEM (SImode,
31920 plus_constant (stack_pointer_rtx, 4));
31921 }
31922 else
31923 {
31924 regno = AX_REG;
31925 if (aggr)
31926 {
31927 regno = DX_REG;
31928 if (nregs == 1)
31929 return gen_rtx_MEM (SImode,
31930 plus_constant (stack_pointer_rtx, 4));
31931 }
31932 }
31933 return gen_rtx_REG (SImode, regno);
31934 }
31935
31936 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
31937 }
31938
31939 /* Determine whether x86_output_mi_thunk can succeed. */
31940
31941 static bool
31942 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
31943 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
31944 HOST_WIDE_INT vcall_offset, const_tree function)
31945 {
31946 /* 64-bit can handle anything. */
31947 if (TARGET_64BIT)
31948 return true;
31949
31950 /* For 32-bit, everything's fine if we have one free register. */
31951 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
31952 return true;
31953
31954 /* Need a free register for vcall_offset. */
31955 if (vcall_offset)
31956 return false;
31957
31958 /* Need a free register for GOT references. */
31959 if (flag_pic && !targetm.binds_local_p (function))
31960 return false;
31961
31962 /* Otherwise ok. */
31963 return true;
31964 }
31965
31966 /* Output the assembler code for a thunk function. THUNK_DECL is the
31967 declaration for the thunk function itself, FUNCTION is the decl for
31968 the target function. DELTA is an immediate constant offset to be
31969 added to THIS. If VCALL_OFFSET is nonzero, the word at
31970 *(*this + vcall_offset) should be added to THIS. */
31971
31972 static void
31973 x86_output_mi_thunk (FILE *file,
31974 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
31975 HOST_WIDE_INT vcall_offset, tree function)
31976 {
31977 rtx this_param = x86_this_parameter (function);
31978 rtx this_reg, tmp, fnaddr;
31979
31980 emit_note (NOTE_INSN_PROLOGUE_END);
31981
31982 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
31983 pull it in now and let DELTA benefit. */
31984 if (REG_P (this_param))
31985 this_reg = this_param;
31986 else if (vcall_offset)
31987 {
31988 /* Put the this parameter into %eax. */
31989 this_reg = gen_rtx_REG (Pmode, AX_REG);
31990 emit_move_insn (this_reg, this_param);
31991 }
31992 else
31993 this_reg = NULL_RTX;
31994
31995 /* Adjust the this parameter by a fixed constant. */
31996 if (delta)
31997 {
31998 rtx delta_rtx = GEN_INT (delta);
31999 rtx delta_dst = this_reg ? this_reg : this_param;
32000
32001 if (TARGET_64BIT)
32002 {
32003 if (!x86_64_general_operand (delta_rtx, Pmode))
32004 {
32005 tmp = gen_rtx_REG (Pmode, R10_REG);
32006 emit_move_insn (tmp, delta_rtx);
32007 delta_rtx = tmp;
32008 }
32009 }
32010
32011 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
32012 }
32013
32014 /* Adjust the this parameter by a value stored in the vtable. */
32015 if (vcall_offset)
32016 {
32017 rtx vcall_addr, vcall_mem, this_mem;
32018 unsigned int tmp_regno;
32019
32020 if (TARGET_64BIT)
32021 tmp_regno = R10_REG;
32022 else
32023 {
32024 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
32025 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
32026 tmp_regno = AX_REG;
32027 else
32028 tmp_regno = CX_REG;
32029 }
32030 tmp = gen_rtx_REG (Pmode, tmp_regno);
32031
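      /* Load the vtable pointer (*THIS) into TMP.  */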
32032 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
32033 if (Pmode != ptr_mode)
32034 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
32035 emit_move_insn (tmp, this_mem);
32036
32037 /* Adjust the this parameter. */
32038 vcall_addr = plus_constant (tmp, vcall_offset);
32039 if (TARGET_64BIT
32040 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
32041 {
32042 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
32043 emit_move_insn (tmp2, GEN_INT (vcall_offset));
32044 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
32045 }
32046
32047 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
32048 if (Pmode != ptr_mode)
32049 emit_insn (gen_addsi_1_zext (this_reg,
32050 gen_rtx_REG (ptr_mode,
32051 REGNO (this_reg)),
32052 vcall_mem));
32053 else
32054 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
32055 }
32056
32057 /* If necessary, drop THIS back to its stack slot. */
32058 if (this_reg && this_reg != this_param)
32059 emit_move_insn (this_param, this_reg);
32060
32061 fnaddr = XEXP (DECL_RTL (function), 0);
32062 if (TARGET_64BIT)
32063 {
32064 if (!flag_pic || targetm.binds_local_p (function)
32065 || cfun->machine->call_abi == MS_ABI)
32066 ;
32067 else
32068 {
32069 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
32070 tmp = gen_rtx_CONST (Pmode, tmp);
32071 fnaddr = gen_rtx_MEM (Pmode, tmp);
32072 }
32073 }
32074 else
32075 {
32076 if (!flag_pic || targetm.binds_local_p (function))
32077 ;
32078 #if TARGET_MACHO
32079 else if (TARGET_MACHO)
32080 {
32081 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
32082 fnaddr = XEXP (fnaddr, 0);
32083 }
32084 #endif /* TARGET_MACHO */
32085 else
32086 {
32087 tmp = gen_rtx_REG (Pmode, CX_REG);
32088 output_set_got (tmp, NULL_RTX);
32089
32090 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
32091 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
32092 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
32093 }
32094 }
32095
32096 /* Our sibling call patterns do not allow memories, because we have no
32097 predicate that can distinguish between frame and non-frame memory.
32098 For our purposes here, we can get away with (ab)using a jump pattern,
32099 because we're going to do no optimization. */
32100 if (MEM_P (fnaddr))
32101 emit_jump_insn (gen_indirect_jump (fnaddr));
32102 else
32103 {
32104 tmp = gen_rtx_MEM (QImode, fnaddr);
32105 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
32106 tmp = emit_call_insn (tmp);
32107 SIBLING_CALL_P (tmp) = 1;
32108 }
32109 emit_barrier ();
32110
32111 /* Emit just enough of rest_of_compilation to get the insns emitted.
32112 Note that use_thunk calls assemble_start_function et al. */
32113 tmp = get_insns ();
32114 insn_locators_alloc ();
32115 shorten_branches (tmp);
32116 final_start_function (tmp, file, 1);
32117 final (tmp, file, 1);
32118 final_end_function ();
32119 }
32120
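/* Output assembler directives at the start of the file.  */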
32121 static void
32122 x86_file_start (void)
32123 {
32124 default_file_start ();
32125 #if TARGET_MACHO
32126 darwin_file_start ();
32127 #endif
32128 if (X86_FILE_START_VERSION_DIRECTIVE)
32129 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
32130 if (X86_FILE_START_FLTUSED)
32131 fputs ("\t.global\t__fltused\n", asm_out_file);
32132 if (ix86_asm_dialect == ASM_INTEL)
32133 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
32134 }
32135
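/* Return the alignment to give FIELD, whose natural alignment so far is
   COMPUTED.  Without -malign-double on 32-bit targets, cap the alignment
   of double and integer fields (including their complex variants) at
   32 bits.  */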
32136 int
32137 x86_field_alignment (tree field, int computed)
32138 {
32139 enum machine_mode mode;
32140 tree type = TREE_TYPE (field);
32141
32142 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
32143 return computed;
32144 mode = TYPE_MODE (strip_array_types (type));
32145 if (mode == DFmode || mode == DCmode
32146 || GET_MODE_CLASS (mode) == MODE_INT
32147 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
32148 return MIN (32, computed);
32149 return computed;
32150 }
32151
32152 /* Output assembler code to FILE to increment profiler label # LABELNO
32153 for profiling a function entry. */
32154 void
32155 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
32156 {
32157 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
32158 : MCOUNT_NAME);
32159
32160 if (TARGET_64BIT)
32161 {
32162 #ifndef NO_PROFILE_COUNTERS
32163 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
32164 #endif
32165
32166 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
32167 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
32168 else
32169 fprintf (file, "\tcall\t%s\n", mcount_name);
32170 }
32171 else if (flag_pic)
32172 {
32173 #ifndef NO_PROFILE_COUNTERS
32174 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
32175 LPREFIX, labelno);
32176 #endif
32177 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
32178 }
32179 else
32180 {
32181 #ifndef NO_PROFILE_COUNTERS
32182 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
32183 LPREFIX, labelno);
32184 #endif
32185 fprintf (file, "\tcall\t%s\n", mcount_name);
32186 }
32187 }
32188
32189 /* We don't have exact information about the insn sizes, but we may quite
32190 safely assume that we are informed about all 1 byte insns and memory
32191 address sizes. This is enough to eliminate unnecessary padding in
32192 99% of cases. */
32193
32194 static int
32195 min_insn_size (rtx insn)
32196 {
32197 int l = 0, len;
32198
32199 if (!INSN_P (insn) || !active_insn_p (insn))
32200 return 0;
32201
32202 /* Discard the alignments we have emitted and jump table data. */
32203 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
32204 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
32205 return 0;
32206 if (JUMP_TABLE_DATA_P (insn))
32207 return 0;
32208
32209 /* Important case - calls are always 5 bytes.
32210 It is common to have many calls in a row. */
32211 if (CALL_P (insn)
32212 && symbolic_reference_mentioned_p (PATTERN (insn))
32213 && !SIBLING_CALL_P (insn))
32214 return 5;
32215 len = get_attr_length (insn);
32216 if (len <= 1)
32217 return 1;
32218
32219 /* For normal instructions we rely on get_attr_length being exact,
32220 with a few exceptions. */
32221 if (!JUMP_P (insn))
32222 {
32223 enum attr_type type = get_attr_type (insn);
32224
32225 switch (type)
32226 {
32227 case TYPE_MULTI:
32228 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
32229 || asm_noperands (PATTERN (insn)) >= 0)
32230 return 0;
32231 break;
32232 case TYPE_OTHER:
32233 case TYPE_FCMP:
32234 break;
32235 default:
32236 /* Otherwise trust get_attr_length. */
32237 return len;
32238 }
32239
32240 l = get_attr_length_address (insn);
32241 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
32242 l = 4;
32243 }
32244 if (l)
32245 return 1+l;
32246 else
32247 return 2;
32248 }
32249
32250 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32251
32252 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
32253 window. */
32254
32255 static void
32256 ix86_avoid_jump_mispredicts (void)
32257 {
32258 rtx insn, start = get_insns ();
32259 int nbytes = 0, njumps = 0;
32260 int isjump = 0;
32261
32262 /* Look for all minimal intervals of instructions containing 4 jumps.
32263 The intervals are bounded by START and INSN. NBYTES is the total
32264 size of instructions in the interval including INSN and not including
32265 START. When the NBYTES is smaller than 16 bytes, it is possible
32266 that the end of START and INSN ends up in the same 16byte page.
32267
32268 The smallest offset in the page at which INSN can start is the case where
32269 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
32270 We add a p2align to the 16-byte window with max skip 15 - NBYTES + sizeof (INSN).
32271 */
32272 for (insn = start; insn; insn = NEXT_INSN (insn))
32273 {
32274 int min_size;
32275
32276 if (LABEL_P (insn))
32277 {
32278 int align = label_to_alignment (insn);
32279 int max_skip = label_to_max_skip (insn);
32280
32281 if (max_skip > 15)
32282 max_skip = 15;
32283 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
32284 already in the current 16 byte page, because otherwise
32285 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
32286 bytes to reach 16 byte boundary. */
32287 if (align <= 0
32288 || (align <= 3 && max_skip != (1 << align) - 1))
32289 max_skip = 0;
32290 if (dump_file)
32291 fprintf (dump_file, "Label %i with max_skip %i\n",
32292 INSN_UID (insn), max_skip);
32293 if (max_skip)
32294 {
32295 while (nbytes + max_skip >= 16)
32296 {
32297 start = NEXT_INSN (start);
32298 if ((JUMP_P (start)
32299 && GET_CODE (PATTERN (start)) != ADDR_VEC
32300 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32301 || CALL_P (start))
32302 njumps--, isjump = 1;
32303 else
32304 isjump = 0;
32305 nbytes -= min_insn_size (start);
32306 }
32307 }
32308 continue;
32309 }
32310
32311 min_size = min_insn_size (insn);
32312 nbytes += min_size;
32313 if (dump_file)
32314 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
32315 INSN_UID (insn), min_size);
32316 if ((JUMP_P (insn)
32317 && GET_CODE (PATTERN (insn)) != ADDR_VEC
32318 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
32319 || CALL_P (insn))
32320 njumps++;
32321 else
32322 continue;
32323
32324 while (njumps > 3)
32325 {
32326 start = NEXT_INSN (start);
32327 if ((JUMP_P (start)
32328 && GET_CODE (PATTERN (start)) != ADDR_VEC
32329 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32330 || CALL_P (start))
32331 njumps--, isjump = 1;
32332 else
32333 isjump = 0;
32334 nbytes -= min_insn_size (start);
32335 }
32336 gcc_assert (njumps >= 0);
32337 if (dump_file)
32338 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
32339 INSN_UID (start), INSN_UID (insn), nbytes);
32340
32341 if (njumps == 3 && isjump && nbytes < 16)
32342 {
32343 int padsize = 15 - nbytes + min_insn_size (insn);
32344
32345 if (dump_file)
32346 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
32347 INSN_UID (insn), padsize);
32348 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
32349 }
32350 }
32351 }
32352 #endif
32353
32354 /* AMD Athlon works faster
32355 when RET is not the destination of a conditional jump and is not directly
32356 preceded by another jump instruction. We avoid the penalty by inserting a
32357 NOP just before the RET instructions in such cases. */
32358 static void
32359 ix86_pad_returns (void)
32360 {
32361 edge e;
32362 edge_iterator ei;
32363
32364 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32365 {
32366 basic_block bb = e->src;
32367 rtx ret = BB_END (bb);
32368 rtx prev;
32369 bool replace = false;
32370
32371 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
32372 || optimize_bb_for_size_p (bb))
32373 continue;
32374 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
32375 if (active_insn_p (prev) || LABEL_P (prev))
32376 break;
32377 if (prev && LABEL_P (prev))
32378 {
32379 edge e;
32380 edge_iterator ei;
32381
32382 FOR_EACH_EDGE (e, ei, bb->preds)
32383 if (EDGE_FREQUENCY (e) && e->src->index >= 0
32384 && !(e->flags & EDGE_FALLTHRU))
32385 replace = true;
32386 }
32387 if (!replace)
32388 {
32389 prev = prev_active_insn (ret);
32390 if (prev
32391 && ((JUMP_P (prev) && any_condjump_p (prev))
32392 || CALL_P (prev)))
32393 replace = true;
32394 /* Empty functions get branch mispredict even when
32395 the jump destination is not visible to us. */
32396 if (!prev && !optimize_function_for_size_p (cfun))
32397 replace = true;
32398 }
32399 if (replace)
32400 {
32401 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
32402 delete_insn (ret);
32403 }
32404 }
32405 }
32406
32407 /* Count the minimum number of instructions in BB. Return 4 if the
32408 number of instructions >= 4. */
32409
32410 static int
32411 ix86_count_insn_bb (basic_block bb)
32412 {
32413 rtx insn;
32414 int insn_count = 0;
32415
32416 /* Count number of instructions in this block. Return 4 if the number
32417 of instructions >= 4. */
32418 FOR_BB_INSNS (bb, insn)
32419 {
32420 /* This only happens in exit blocks. */
32421 if (JUMP_P (insn)
32422 && ANY_RETURN_P (PATTERN (insn)))
32423 break;
32424
32425 if (NONDEBUG_INSN_P (insn)
32426 && GET_CODE (PATTERN (insn)) != USE
32427 && GET_CODE (PATTERN (insn)) != CLOBBER)
32428 {
32429 insn_count++;
32430 if (insn_count >= 4)
32431 return insn_count;
32432 }
32433 }
32434
32435 return insn_count;
32436 }
32437
32438
32439 /* Count the minimum number of instructions in code path in BB.
32440 Return 4 if the number of instructions >= 4. */
32441
32442 static int
32443 ix86_count_insn (basic_block bb)
32444 {
32445 edge e;
32446 edge_iterator ei;
32447 int min_prev_count;
32448
32449 /* Only bother counting instructions along paths with no
32450 more than 2 basic blocks between entry and exit. Given
32451 that BB has an edge to exit, determine if a predecessor
32452 of BB has an edge from entry. If so, compute the number
32453 of instructions in the predecessor block. If there
32454 happen to be multiple such blocks, compute the minimum. */
32455 min_prev_count = 4;
32456 FOR_EACH_EDGE (e, ei, bb->preds)
32457 {
32458 edge prev_e;
32459 edge_iterator prev_ei;
32460
32461 if (e->src == ENTRY_BLOCK_PTR)
32462 {
32463 min_prev_count = 0;
32464 break;
32465 }
32466 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
32467 {
32468 if (prev_e->src == ENTRY_BLOCK_PTR)
32469 {
32470 int count = ix86_count_insn_bb (e->src);
32471 if (count < min_prev_count)
32472 min_prev_count = count;
32473 break;
32474 }
32475 }
32476 }
32477
32478 if (min_prev_count < 4)
32479 min_prev_count += ix86_count_insn_bb (bb);
32480
32481 return min_prev_count;
32482 }
32483
32484 /* Pad short functions to 4 instructions. */
32485
32486 static void
32487 ix86_pad_short_function (void)
32488 {
32489 edge e;
32490 edge_iterator ei;
32491
32492 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32493 {
32494 rtx ret = BB_END (e->src);
32495 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
32496 {
32497 int insn_count = ix86_count_insn (e->src);
32498
32499 /* Pad short function. */
32500 if (insn_count < 4)
32501 {
32502 rtx insn = ret;
32503
32504 /* Find epilogue. */
32505 while (insn
32506 && (!NOTE_P (insn)
32507 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
32508 insn = PREV_INSN (insn);
32509
32510 if (!insn)
32511 insn = ret;
32512
32513 /* Two NOPs count as one instruction. */
32514 insn_count = 2 * (4 - insn_count);
32515 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
32516 }
32517 }
32518 }
32519 }
32520
32521 /* Implement machine specific optimizations. We implement padding of returns
32522 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
32523 static void
32524 ix86_reorg (void)
32525 {
32526 /* We are freeing block_for_insn in the toplev to keep compatibility
32527 with old MDEP_REORGS that are not CFG based. Recompute it now. */
32528 compute_bb_for_insn ();
32529
32530 /* Run the vzeroupper optimization if needed. */
32531 if (TARGET_VZEROUPPER)
32532 move_or_delete_vzeroupper ();
32533
32534 if (optimize && optimize_function_for_speed_p (cfun))
32535 {
32536 if (TARGET_PAD_SHORT_FUNCTION)
32537 ix86_pad_short_function ();
32538 else if (TARGET_PAD_RETURNS)
32539 ix86_pad_returns ();
32540 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32541 if (TARGET_FOUR_JUMP_LIMIT)
32542 ix86_avoid_jump_mispredicts ();
32543 #endif
32544 }
32545 }
32546
32547 /* Return nonzero when a QImode register that must be represented via a REX
32548 prefix is used. */
32549 bool
32550 x86_extended_QIreg_mentioned_p (rtx insn)
32551 {
32552 int i;
32553 extract_insn_cached (insn);
32554 for (i = 0; i < recog_data.n_operands; i++)
32555 if (REG_P (recog_data.operand[i])
32556 && REGNO (recog_data.operand[i]) > BX_REG)
32557 return true;
32558 return false;
32559 }
32560
32561 /* Return nonzero when P points to a register encoded via a REX prefix.
32562 Called via for_each_rtx. */
32563 static int
32564 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
32565 {
32566 unsigned int regno;
32567 if (!REG_P (*p))
32568 return 0;
32569 regno = REGNO (*p);
32570 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
32571 }
32572
32573 /* Return true when INSN mentions a register that must be encoded using a
32574 REX prefix. */
32575 bool
32576 x86_extended_reg_mentioned_p (rtx insn)
32577 {
32578 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
32579 extended_reg_mentioned_1, NULL);
32580 }
32581
32582 /* If profitable, negate (without causing overflow) integer constant
32583 of mode MODE at location LOC. Return true in this case. */
32584 bool
32585 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
32586 {
32587 HOST_WIDE_INT val;
32588
32589 if (!CONST_INT_P (*loc))
32590 return false;
32591
32592 switch (mode)
32593 {
32594 case DImode:
32595 /* DImode x86_64 constants must fit in 32 bits. */
32596 gcc_assert (x86_64_immediate_operand (*loc, mode));
32597
32598 mode = SImode;
32599 break;
32600
32601 case SImode:
32602 case HImode:
32603 case QImode:
32604 break;
32605
32606 default:
32607 gcc_unreachable ();
32608 }
32609
32610 /* Avoid overflows. */
32611 if (mode_signbit_p (mode, *loc))
32612 return false;
32613
32614 val = INTVAL (*loc);
32615
32616 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
32617 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
32618 if ((val < 0 && val != -128)
32619 || val == 128)
32620 {
32621 *loc = GEN_INT (-val);
32622 return true;
32623 }
32624
32625 return false;
32626 }
32627
32628 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
32629 optabs would emit if we didn't have TFmode patterns. */
32630
32631 void
32632 x86_emit_floatuns (rtx operands[2])
32633 {
32634 rtx neglab, donelab, i0, i1, f0, in, out;
32635 enum machine_mode mode, inmode;
32636
32637 inmode = GET_MODE (operands[1]);
32638 gcc_assert (inmode == SImode || inmode == DImode);
32639
32640 out = operands[0];
32641 in = force_reg (inmode, operands[1]);
32642 mode = GET_MODE (out);
32643 neglab = gen_label_rtx ();
32644 donelab = gen_label_rtx ();
32645 f0 = gen_reg_rtx (mode);
32646
32647 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
32648
32649 expand_float (out, in, 0);
32650
32651 emit_jump_insn (gen_jump (donelab));
32652 emit_barrier ();
32653
32654 emit_label (neglab);
32655
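  /* The input has its sign bit set.  Compute (in >> 1) | (in & 1),
     convert that to floating point, and double the result; OR-ing in the
     low bit keeps the rounding correct.  */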
32656 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
32657 1, OPTAB_DIRECT);
32658 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
32659 1, OPTAB_DIRECT);
32660 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
32661
32662 expand_float (f0, i0, 0);
32663
32664 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
32665
32666 emit_label (donelab);
32667 }
32668 \f
32669 /* AVX2 supports 32-byte integer vector operations,
32670 so the longest vector we are faced with is V32QImode. */
32671 #define MAX_VECT_LEN 32
32672
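/* Describes a vector permutation to expand: TARGET receives the NELT
   elements of OP0/OP1 selected by PERM, all in mode VMODE.  When
   TESTING_P, only check whether the permutation can be expanded.  */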
32673 struct expand_vec_perm_d
32674 {
32675 rtx target, op0, op1;
32676 unsigned char perm[MAX_VECT_LEN];
32677 enum machine_mode vmode;
32678 unsigned char nelt;
32679 bool testing_p;
32680 };
32681
32682 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
32683 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
32684
32685 /* Get a vector mode of the same size as the original but with elements
32686 twice as wide. This is only guaranteed to apply to integral vectors. */
32687
32688 static inline enum machine_mode
32689 get_mode_wider_vector (enum machine_mode o)
32690 {
32691 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
32692 enum machine_mode n = GET_MODE_WIDER_MODE (o);
32693 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
32694 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
32695 return n;
32696 }
32697
32698 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32699 with all elements equal to VAL. Return true if successful. */
32700
32701 static bool
32702 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
32703 rtx target, rtx val)
32704 {
32705 bool ok;
32706
32707 switch (mode)
32708 {
32709 case V2SImode:
32710 case V2SFmode:
32711 if (!mmx_ok)
32712 return false;
32713 /* FALLTHRU */
32714
32715 case V4DFmode:
32716 case V4DImode:
32717 case V8SFmode:
32718 case V8SImode:
32719 case V2DFmode:
32720 case V2DImode:
32721 case V4SFmode:
32722 case V4SImode:
32723 {
32724 rtx insn, dup;
32725
32726 /* First attempt to recognize VAL as-is. */
32727 dup = gen_rtx_VEC_DUPLICATE (mode, val);
32728 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
32729 if (recog_memoized (insn) < 0)
32730 {
32731 rtx seq;
32732 /* If that fails, force VAL into a register. */
32733
32734 start_sequence ();
32735 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
32736 seq = get_insns ();
32737 end_sequence ();
32738 if (seq)
32739 emit_insn_before (seq, insn);
32740
32741 ok = recog_memoized (insn) >= 0;
32742 gcc_assert (ok);
32743 }
32744 }
32745 return true;
32746
32747 case V4HImode:
32748 if (!mmx_ok)
32749 return false;
32750 if (TARGET_SSE || TARGET_3DNOW_A)
32751 {
32752 rtx x;
32753
32754 val = gen_lowpart (SImode, val);
32755 x = gen_rtx_TRUNCATE (HImode, val);
32756 x = gen_rtx_VEC_DUPLICATE (mode, x);
32757 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32758 return true;
32759 }
32760 goto widen;
32761
32762 case V8QImode:
32763 if (!mmx_ok)
32764 return false;
32765 goto widen;
32766
32767 case V8HImode:
32768 if (TARGET_SSE2)
32769 {
32770 struct expand_vec_perm_d dperm;
32771 rtx tmp1, tmp2;
32772
32773 permute:
32774 memset (&dperm, 0, sizeof (dperm));
32775 dperm.target = target;
32776 dperm.vmode = mode;
32777 dperm.nelt = GET_MODE_NUNITS (mode);
32778 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
32779
32780 /* Extend to SImode using a paradoxical SUBREG. */
32781 tmp1 = gen_reg_rtx (SImode);
32782 emit_move_insn (tmp1, gen_lowpart (SImode, val));
32783
32784 /* Insert the SImode value as low element of a V4SImode vector. */
32785 tmp2 = gen_lowpart (V4SImode, dperm.op0);
32786 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
32787
32788 ok = (expand_vec_perm_1 (&dperm)
32789 || expand_vec_perm_broadcast_1 (&dperm));
32790 gcc_assert (ok);
32791 return ok;
32792 }
32793 goto widen;
32794
32795 case V16QImode:
32796 if (TARGET_SSE2)
32797 goto permute;
32798 goto widen;
32799
32800 widen:
32801 /* Replicate the value once into the next wider mode and recurse. */
32802 {
32803 enum machine_mode smode, wsmode, wvmode;
32804 rtx x;
32805
32806 smode = GET_MODE_INNER (mode);
32807 wvmode = get_mode_wider_vector (mode);
32808 wsmode = GET_MODE_INNER (wvmode);
32809
32810 val = convert_modes (wsmode, smode, val, true);
32811 x = expand_simple_binop (wsmode, ASHIFT, val,
32812 GEN_INT (GET_MODE_BITSIZE (smode)),
32813 NULL_RTX, 1, OPTAB_LIB_WIDEN);
32814 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
32815
32816 x = gen_lowpart (wvmode, target);
32817 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
32818 gcc_assert (ok);
32819 return ok;
32820 }
32821
32822 case V16HImode:
32823 case V32QImode:
32824 {
32825 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
32826 rtx x = gen_reg_rtx (hvmode);
32827
32828 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
32829 gcc_assert (ok);
32830
32831 x = gen_rtx_VEC_CONCAT (mode, x, x);
32832 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32833 }
32834 return true;
32835
32836 default:
32837 return false;
32838 }
32839 }
32840
32841 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32842 whose ONE_VAR element is VAR, and other elements are zero. Return true
32843 if successful. */
32844
32845 static bool
32846 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
32847 rtx target, rtx var, int one_var)
32848 {
32849 enum machine_mode vsimode;
32850 rtx new_target;
32851 rtx x, tmp;
32852 bool use_vector_set = false;
32853
32854 switch (mode)
32855 {
32856 case V2DImode:
32857 /* For SSE4.1, we normally use vector set. But if the second
32858 element is zero and inter-unit moves are OK, we use movq
32859 instead. */
32860 use_vector_set = (TARGET_64BIT
32861 && TARGET_SSE4_1
32862 && !(TARGET_INTER_UNIT_MOVES
32863 && one_var == 0));
32864 break;
32865 case V16QImode:
32866 case V4SImode:
32867 case V4SFmode:
32868 use_vector_set = TARGET_SSE4_1;
32869 break;
32870 case V8HImode:
32871 use_vector_set = TARGET_SSE2;
32872 break;
32873 case V4HImode:
32874 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
32875 break;
32876 case V32QImode:
32877 case V16HImode:
32878 case V8SImode:
32879 case V8SFmode:
32880 case V4DFmode:
32881 use_vector_set = TARGET_AVX;
32882 break;
32883 case V4DImode:
32884 /* Use ix86_expand_vector_set in 64bit mode only. */
32885 use_vector_set = TARGET_AVX && TARGET_64BIT;
32886 break;
32887 default:
32888 break;
32889 }
32890
32891 if (use_vector_set)
32892 {
32893 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
32894 var = force_reg (GET_MODE_INNER (mode), var);
32895 ix86_expand_vector_set (mmx_ok, target, var, one_var);
32896 return true;
32897 }
32898
32899 switch (mode)
32900 {
32901 case V2SFmode:
32902 case V2SImode:
32903 if (!mmx_ok)
32904 return false;
32905 /* FALLTHRU */
32906
32907 case V2DFmode:
32908 case V2DImode:
32909 if (one_var != 0)
32910 return false;
32911 var = force_reg (GET_MODE_INNER (mode), var);
32912 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
32913 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32914 return true;
32915
32916 case V4SFmode:
32917 case V4SImode:
32918 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
32919 new_target = gen_reg_rtx (mode);
32920 else
32921 new_target = target;
32922 var = force_reg (GET_MODE_INNER (mode), var);
32923 x = gen_rtx_VEC_DUPLICATE (mode, var);
32924 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
32925 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
32926 if (one_var != 0)
32927 {
32928 /* We need to shuffle the value to the correct position, so
32929 create a new pseudo to store the intermediate result. */
32930
32931 /* With SSE2, we can use the integer shuffle insns. */
32932 if (mode != V4SFmode && TARGET_SSE2)
32933 {
32934 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
32935 const1_rtx,
32936 GEN_INT (one_var == 1 ? 0 : 1),
32937 GEN_INT (one_var == 2 ? 0 : 1),
32938 GEN_INT (one_var == 3 ? 0 : 1)));
32939 if (target != new_target)
32940 emit_move_insn (target, new_target);
32941 return true;
32942 }
32943
32944 /* Otherwise convert the intermediate result to V4SFmode and
32945 use the SSE1 shuffle instructions. */
32946 if (mode != V4SFmode)
32947 {
32948 tmp = gen_reg_rtx (V4SFmode);
32949 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
32950 }
32951 else
32952 tmp = new_target;
32953
32954 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
32955 const1_rtx,
32956 GEN_INT (one_var == 1 ? 0 : 1),
32957 GEN_INT (one_var == 2 ? 0+4 : 1+4),
32958 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
32959
32960 if (mode != V4SFmode)
32961 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
32962 else if (tmp != target)
32963 emit_move_insn (target, tmp);
32964 }
32965 else if (target != new_target)
32966 emit_move_insn (target, new_target);
32967 return true;
32968
32969 case V8HImode:
32970 case V16QImode:
32971 vsimode = V4SImode;
32972 goto widen;
32973 case V4HImode:
32974 case V8QImode:
32975 if (!mmx_ok)
32976 return false;
32977 vsimode = V2SImode;
32978 goto widen;
32979 widen:
32980 if (one_var != 0)
32981 return false;
32982
32983 /* Zero extend the variable element to SImode and recurse. */
32984 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
32985
32986 x = gen_reg_rtx (vsimode);
32987 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
32988 var, one_var))
32989 gcc_unreachable ();
32990
32991 emit_move_insn (target, gen_lowpart (mode, x));
32992 return true;
32993
32994 default:
32995 return false;
32996 }
32997 }
32998
32999 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33000 consisting of the values in VALS. It is known that all elements
33001 except ONE_VAR are constants. Return true if successful. */
33002
33003 static bool
33004 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
33005 rtx target, rtx vals, int one_var)
33006 {
33007 rtx var = XVECEXP (vals, 0, one_var);
33008 enum machine_mode wmode;
33009 rtx const_vec, x;
33010
33011 const_vec = copy_rtx (vals);
33012 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
33013 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
33014
33015 switch (mode)
33016 {
33017 case V2DFmode:
33018 case V2DImode:
33019 case V2SFmode:
33020 case V2SImode:
33021 /* For the two element vectors, it's just as easy to use
33022 the general case. */
33023 return false;
33024
33025 case V4DImode:
33026 /* Use ix86_expand_vector_set in 64bit mode only. */
33027 if (!TARGET_64BIT)
33028 return false;
33029 case V4DFmode:
33030 case V8SFmode:
33031 case V8SImode:
33032 case V16HImode:
33033 case V32QImode:
33034 case V4SFmode:
33035 case V4SImode:
33036 case V8HImode:
33037 case V4HImode:
33038 break;
33039
33040 case V16QImode:
33041 if (TARGET_SSE4_1)
33042 break;
33043 wmode = V8HImode;
33044 goto widen;
33045 case V8QImode:
33046 wmode = V4HImode;
33047 goto widen;
33048 widen:
33049 /* There's no way to set one QImode entry easily. Combine
33050 the variable value with its adjacent constant value, and
33051 promote to an HImode set. */
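	 /* For example, if ONE_VAR is 5 the variable byte is shifted into
	    the high half of an HImode value, the constant byte from
	    element 4 is kept in the low half, the two are IORed together,
	    and the combined value is stored as HImode element 2 (= 5 >> 1)
	    of the widened vector.  */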
33052 x = XVECEXP (vals, 0, one_var ^ 1);
33053 if (one_var & 1)
33054 {
33055 var = convert_modes (HImode, QImode, var, true);
33056 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
33057 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33058 x = GEN_INT (INTVAL (x) & 0xff);
33059 }
33060 else
33061 {
33062 var = convert_modes (HImode, QImode, var, true);
33063 x = gen_int_mode (INTVAL (x) << 8, HImode);
33064 }
33065 if (x != const0_rtx)
33066 var = expand_simple_binop (HImode, IOR, var, x, var,
33067 1, OPTAB_LIB_WIDEN);
33068
33069 x = gen_reg_rtx (wmode);
33070 emit_move_insn (x, gen_lowpart (wmode, const_vec));
33071 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
33072
33073 emit_move_insn (target, gen_lowpart (mode, x));
33074 return true;
33075
33076 default:
33077 return false;
33078 }
33079
33080 emit_move_insn (target, const_vec);
33081 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33082 return true;
33083 }
33084
33085 /* A subroutine of ix86_expand_vector_init_general. Use vector
33086 concatenate to handle the most general case: all values variable,
33087 and none identical. */
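/* For example, building a V8SFmode vector from eight scalar operands
   first packs them pairwise into four V2SFmode registers, concatenates
   those into two V4SFmode halves, and finally concatenates the two
   halves into the V8SFmode target.  */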
33088
33089 static void
33090 ix86_expand_vector_init_concat (enum machine_mode mode,
33091 rtx target, rtx *ops, int n)
33092 {
33093 enum machine_mode cmode, hmode = VOIDmode;
33094 rtx first[8], second[4];
33095 rtvec v;
33096 int i, j;
33097
33098 switch (n)
33099 {
33100 case 2:
33101 switch (mode)
33102 {
33103 case V8SImode:
33104 cmode = V4SImode;
33105 break;
33106 case V8SFmode:
33107 cmode = V4SFmode;
33108 break;
33109 case V4DImode:
33110 cmode = V2DImode;
33111 break;
33112 case V4DFmode:
33113 cmode = V2DFmode;
33114 break;
33115 case V4SImode:
33116 cmode = V2SImode;
33117 break;
33118 case V4SFmode:
33119 cmode = V2SFmode;
33120 break;
33121 case V2DImode:
33122 cmode = DImode;
33123 break;
33124 case V2SImode:
33125 cmode = SImode;
33126 break;
33127 case V2DFmode:
33128 cmode = DFmode;
33129 break;
33130 case V2SFmode:
33131 cmode = SFmode;
33132 break;
33133 default:
33134 gcc_unreachable ();
33135 }
33136
33137 if (!register_operand (ops[1], cmode))
33138 ops[1] = force_reg (cmode, ops[1]);
33139 if (!register_operand (ops[0], cmode))
33140 ops[0] = force_reg (cmode, ops[0]);
33141 emit_insn (gen_rtx_SET (VOIDmode, target,
33142 gen_rtx_VEC_CONCAT (mode, ops[0],
33143 ops[1])));
33144 break;
33145
33146 case 4:
33147 switch (mode)
33148 {
33149 case V4DImode:
33150 cmode = V2DImode;
33151 break;
33152 case V4DFmode:
33153 cmode = V2DFmode;
33154 break;
33155 case V4SImode:
33156 cmode = V2SImode;
33157 break;
33158 case V4SFmode:
33159 cmode = V2SFmode;
33160 break;
33161 default:
33162 gcc_unreachable ();
33163 }
33164 goto half;
33165
33166 case 8:
33167 switch (mode)
33168 {
33169 case V8SImode:
33170 cmode = V2SImode;
33171 hmode = V4SImode;
33172 break;
33173 case V8SFmode:
33174 cmode = V2SFmode;
33175 hmode = V4SFmode;
33176 break;
33177 default:
33178 gcc_unreachable ();
33179 }
33180 goto half;
33181
33182 half:
33183 /* FIXME: We process inputs backward to help RA. PR 36222. */
33184 i = n - 1;
33185 j = (n >> 1) - 1;
33186 for (; i > 0; i -= 2, j--)
33187 {
33188 first[j] = gen_reg_rtx (cmode);
33189 v = gen_rtvec (2, ops[i - 1], ops[i]);
33190 ix86_expand_vector_init (false, first[j],
33191 gen_rtx_PARALLEL (cmode, v));
33192 }
33193
33194 n >>= 1;
33195 if (n > 2)
33196 {
33197 gcc_assert (hmode != VOIDmode);
33198 for (i = j = 0; i < n; i += 2, j++)
33199 {
33200 second[j] = gen_reg_rtx (hmode);
33201 ix86_expand_vector_init_concat (hmode, second [j],
33202 &first [i], 2);
33203 }
33204 n >>= 1;
33205 ix86_expand_vector_init_concat (mode, target, second, n);
33206 }
33207 else
33208 ix86_expand_vector_init_concat (mode, target, first, n);
33209 break;
33210
33211 default:
33212 gcc_unreachable ();
33213 }
33214 }
33215
33216 /* A subroutine of ix86_expand_vector_init_general. Use vector
33217 interleave to handle the most general case: all values variable,
33218 and none identical. */
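/* Each pair of scalar operands is first packed into the low part of a
   fresh vector (the even element via a plain move, the odd element via
   a vec_set), and the intermediate vectors are then merged with
   successively wider interleave-low (punpckl*) operations until a
   single full-width vector remains.  */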
33219
33220 static void
33221 ix86_expand_vector_init_interleave (enum machine_mode mode,
33222 rtx target, rtx *ops, int n)
33223 {
33224 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
33225 int i, j;
33226 rtx op0, op1;
33227 rtx (*gen_load_even) (rtx, rtx, rtx);
33228 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
33229 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
33230
33231 switch (mode)
33232 {
33233 case V8HImode:
33234 gen_load_even = gen_vec_setv8hi;
33235 gen_interleave_first_low = gen_vec_interleave_lowv4si;
33236 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33237 inner_mode = HImode;
33238 first_imode = V4SImode;
33239 second_imode = V2DImode;
33240 third_imode = VOIDmode;
33241 break;
33242 case V16QImode:
33243 gen_load_even = gen_vec_setv16qi;
33244 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
33245 gen_interleave_second_low = gen_vec_interleave_lowv4si;
33246 inner_mode = QImode;
33247 first_imode = V8HImode;
33248 second_imode = V4SImode;
33249 third_imode = V2DImode;
33250 break;
33251 default:
33252 gcc_unreachable ();
33253 }
33254
33255 for (i = 0; i < n; i++)
33256 {
33257 /* Extend the odd element to SImode using a paradoxical SUBREG. */
33258 op0 = gen_reg_rtx (SImode);
33259 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
33260
33261 /* Insert the SImode value as low element of V4SImode vector. */
33262 op1 = gen_reg_rtx (V4SImode);
33263 op0 = gen_rtx_VEC_MERGE (V4SImode,
33264 gen_rtx_VEC_DUPLICATE (V4SImode,
33265 op0),
33266 CONST0_RTX (V4SImode),
33267 const1_rtx);
33268 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
33269
33270 /* Cast the V4SImode vector back to a vector in the original mode. */
33271 op0 = gen_reg_rtx (mode);
33272 emit_move_insn (op0, gen_lowpart (mode, op1));
33273
33274 /* Load even elements into the second position. */
33275 emit_insn (gen_load_even (op0,
33276 force_reg (inner_mode,
33277 ops [i + i + 1]),
33278 const1_rtx));
33279
33280 /* Cast vector to FIRST_IMODE vector. */
33281 ops[i] = gen_reg_rtx (first_imode);
33282 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
33283 }
33284
33285 /* Interleave low FIRST_IMODE vectors. */
33286 for (i = j = 0; i < n; i += 2, j++)
33287 {
33288 op0 = gen_reg_rtx (first_imode);
33289 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
33290
33291 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
33292 ops[j] = gen_reg_rtx (second_imode);
33293 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
33294 }
33295
33296 /* Interleave low SECOND_IMODE vectors. */
33297 switch (second_imode)
33298 {
33299 case V4SImode:
33300 for (i = j = 0; i < n / 2; i += 2, j++)
33301 {
33302 op0 = gen_reg_rtx (second_imode);
33303 emit_insn (gen_interleave_second_low (op0, ops[i],
33304 ops[i + 1]));
33305
33306 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
33307 vector. */
33308 ops[j] = gen_reg_rtx (third_imode);
33309 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
33310 }
33311 second_imode = V2DImode;
33312 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33313 /* FALLTHRU */
33314
33315 case V2DImode:
33316 op0 = gen_reg_rtx (second_imode);
33317 emit_insn (gen_interleave_second_low (op0, ops[0],
33318 ops[1]));
33319
33320 /* Cast the SECOND_IMODE vector back to a vector in the original
33321 mode. */
33322 emit_insn (gen_rtx_SET (VOIDmode, target,
33323 gen_lowpart (mode, op0)));
33324 break;
33325
33326 default:
33327 gcc_unreachable ();
33328 }
33329 }
33330
33331 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
33332 all values variable, and none identical. */
33333
33334 static void
33335 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
33336 rtx target, rtx vals)
33337 {
33338 rtx ops[32], op0, op1;
33339 enum machine_mode half_mode = VOIDmode;
33340 int n, i;
33341
33342 switch (mode)
33343 {
33344 case V2SFmode:
33345 case V2SImode:
33346 if (!mmx_ok && !TARGET_SSE)
33347 break;
33348 /* FALLTHRU */
33349
33350 case V8SFmode:
33351 case V8SImode:
33352 case V4DFmode:
33353 case V4DImode:
33354 case V4SFmode:
33355 case V4SImode:
33356 case V2DFmode:
33357 case V2DImode:
33358 n = GET_MODE_NUNITS (mode);
33359 for (i = 0; i < n; i++)
33360 ops[i] = XVECEXP (vals, 0, i);
33361 ix86_expand_vector_init_concat (mode, target, ops, n);
33362 return;
33363
33364 case V32QImode:
33365 half_mode = V16QImode;
33366 goto half;
33367
33368 case V16HImode:
33369 half_mode = V8HImode;
33370 goto half;
33371
33372 half:
33373 n = GET_MODE_NUNITS (mode);
33374 for (i = 0; i < n; i++)
33375 ops[i] = XVECEXP (vals, 0, i);
33376 op0 = gen_reg_rtx (half_mode);
33377 op1 = gen_reg_rtx (half_mode);
33378 ix86_expand_vector_init_interleave (half_mode, op0, ops,
33379 n >> 2);
33380 ix86_expand_vector_init_interleave (half_mode, op1,
33381 &ops [n >> 1], n >> 2);
33382 emit_insn (gen_rtx_SET (VOIDmode, target,
33383 gen_rtx_VEC_CONCAT (mode, op0, op1)));
33384 return;
33385
33386 case V16QImode:
33387 if (!TARGET_SSE4_1)
33388 break;
33389 /* FALLTHRU */
33390
33391 case V8HImode:
33392 if (!TARGET_SSE2)
33393 break;
33394
33395 /* Don't use ix86_expand_vector_init_interleave if we can't
33396 move from GPR to SSE register directly. */
33397 if (!TARGET_INTER_UNIT_MOVES)
33398 break;
33399
33400 n = GET_MODE_NUNITS (mode);
33401 for (i = 0; i < n; i++)
33402 ops[i] = XVECEXP (vals, 0, i);
33403 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
33404 return;
33405
33406 case V4HImode:
33407 case V8QImode:
33408 break;
33409
33410 default:
33411 gcc_unreachable ();
33412 }
33413
33414 {
33415 int i, j, n_elts, n_words, n_elt_per_word;
33416 enum machine_mode inner_mode;
33417 rtx words[4], shift;
33418
33419 inner_mode = GET_MODE_INNER (mode);
33420 n_elts = GET_MODE_NUNITS (mode);
33421 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
33422 n_elt_per_word = n_elts / n_words;
33423 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
33424
33425 for (i = 0; i < n_words; ++i)
33426 {
33427 rtx word = NULL_RTX;
33428
33429 for (j = 0; j < n_elt_per_word; ++j)
33430 {
33431 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
33432 elt = convert_modes (word_mode, inner_mode, elt, true);
33433
33434 if (j == 0)
33435 word = elt;
33436 else
33437 {
33438 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
33439 word, 1, OPTAB_LIB_WIDEN);
33440 word = expand_simple_binop (word_mode, IOR, word, elt,
33441 word, 1, OPTAB_LIB_WIDEN);
33442 }
33443 }
33444
33445 words[i] = word;
33446 }
33447
33448 if (n_words == 1)
33449 emit_move_insn (target, gen_lowpart (mode, words[0]));
33450 else if (n_words == 2)
33451 {
33452 rtx tmp = gen_reg_rtx (mode);
33453 emit_clobber (tmp);
33454 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
33455 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
33456 emit_move_insn (target, tmp);
33457 }
33458 else if (n_words == 4)
33459 {
33460 rtx tmp = gen_reg_rtx (V4SImode);
33461 gcc_assert (word_mode == SImode);
33462 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
33463 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
33464 emit_move_insn (target, gen_lowpart (mode, tmp));
33465 }
33466 else
33467 gcc_unreachable ();
33468 }
33469 }
33470
33471 /* Initialize vector TARGET via VALS. Suppress the use of MMX
33472 instructions unless MMX_OK is true. */
33473
33474 void
33475 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
33476 {
33477 enum machine_mode mode = GET_MODE (target);
33478 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33479 int n_elts = GET_MODE_NUNITS (mode);
33480 int n_var = 0, one_var = -1;
33481 bool all_same = true, all_const_zero = true;
33482 int i;
33483 rtx x;
33484
33485 for (i = 0; i < n_elts; ++i)
33486 {
33487 x = XVECEXP (vals, 0, i);
33488 if (!(CONST_INT_P (x)
33489 || GET_CODE (x) == CONST_DOUBLE
33490 || GET_CODE (x) == CONST_FIXED))
33491 n_var++, one_var = i;
33492 else if (x != CONST0_RTX (inner_mode))
33493 all_const_zero = false;
33494 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
33495 all_same = false;
33496 }
33497
33498 /* Constants are best loaded from the constant pool. */
33499 if (n_var == 0)
33500 {
33501 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
33502 return;
33503 }
33504
33505 /* If all values are identical, broadcast the value. */
33506 if (all_same
33507 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
33508 XVECEXP (vals, 0, 0)))
33509 return;
33510
33511 /* Values where only one field is non-constant are best loaded from
33512 the pool and overwritten via move later. */
33513 if (n_var == 1)
33514 {
33515 if (all_const_zero
33516 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
33517 XVECEXP (vals, 0, one_var),
33518 one_var))
33519 return;
33520
33521 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
33522 return;
33523 }
33524
33525 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
33526 }
33527
33528 void
33529 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
33530 {
33531 enum machine_mode mode = GET_MODE (target);
33532 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33533 enum machine_mode half_mode;
33534 bool use_vec_merge = false;
33535 rtx tmp;
33536 static rtx (*gen_extract[6][2]) (rtx, rtx)
33537 = {
33538 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
33539 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
33540 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
33541 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
33542 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
33543 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
33544 };
33545 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
33546 = {
33547 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
33548 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
33549 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
33550 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
33551 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
33552 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
33553 };
33554 int i, j, n;
33555
33556 switch (mode)
33557 {
33558 case V2SFmode:
33559 case V2SImode:
33560 if (mmx_ok)
33561 {
33562 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33563 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
33564 if (elt == 0)
33565 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33566 else
33567 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33568 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33569 return;
33570 }
33571 break;
33572
33573 case V2DImode:
33574 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
33575 if (use_vec_merge)
33576 break;
33577
33578 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33579 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
33580 if (elt == 0)
33581 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33582 else
33583 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33584 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33585 return;
33586
33587 case V2DFmode:
33588 {
33589 rtx op0, op1;
33590
33591 /* For the two element vectors, we implement a VEC_CONCAT with
33592 the extraction of the other element. */
33593
33594 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
33595 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
33596
33597 if (elt == 0)
33598 op0 = val, op1 = tmp;
33599 else
33600 op0 = tmp, op1 = val;
33601
33602 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
33603 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33604 }
33605 return;
33606
33607 case V4SFmode:
33608 use_vec_merge = TARGET_SSE4_1;
33609 if (use_vec_merge)
33610 break;
33611
33612 switch (elt)
33613 {
33614 case 0:
33615 use_vec_merge = true;
33616 break;
33617
33618 case 1:
33619 /* tmp = target = A B C D */
33620 tmp = copy_to_reg (target);
33621 /* target = A A B B */
33622 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
33623 /* target = X A B B */
33624 ix86_expand_vector_set (false, target, val, 0);
33625 /* target = A X C D */
33626 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33627 const1_rtx, const0_rtx,
33628 GEN_INT (2+4), GEN_INT (3+4)));
33629 return;
33630
33631 case 2:
33632 /* tmp = target = A B C D */
33633 tmp = copy_to_reg (target);
33634 /* tmp = X B C D */
33635 ix86_expand_vector_set (false, tmp, val, 0);
33636 /* target = A B X D */
33637 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33638 const0_rtx, const1_rtx,
33639 GEN_INT (0+4), GEN_INT (3+4)));
33640 return;
33641
33642 case 3:
33643 /* tmp = target = A B C D */
33644 tmp = copy_to_reg (target);
33645 /* tmp = X B C D */
33646 ix86_expand_vector_set (false, tmp, val, 0);
33647 /* target = A B C X */
33648 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33649 const0_rtx, const1_rtx,
33650 GEN_INT (2+4), GEN_INT (0+4)));
33651 return;
33652
33653 default:
33654 gcc_unreachable ();
33655 }
33656 break;
33657
33658 case V4SImode:
33659 use_vec_merge = TARGET_SSE4_1;
33660 if (use_vec_merge)
33661 break;
33662
33663 /* Element 0 handled by vec_merge below. */
33664 if (elt == 0)
33665 {
33666 use_vec_merge = true;
33667 break;
33668 }
33669
33670 if (TARGET_SSE2)
33671 {
33672 /* With SSE2, use integer shuffles to swap element 0 and ELT,
33673 store into element 0, then shuffle them back. */
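	  /* For example, with ELT == 2 the selector below becomes
	     { 2, 1, 0, 3 }, which exchanges elements 0 and 2; the value is
	     then written into element 0, and applying the same shuffle
	     again swaps the two elements back into place.  */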
33674
33675 rtx order[4];
33676
33677 order[0] = GEN_INT (elt);
33678 order[1] = const1_rtx;
33679 order[2] = const2_rtx;
33680 order[3] = GEN_INT (3);
33681 order[elt] = const0_rtx;
33682
33683 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33684 order[1], order[2], order[3]));
33685
33686 ix86_expand_vector_set (false, target, val, 0);
33687
33688 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33689 order[1], order[2], order[3]));
33690 }
33691 else
33692 {
33693 /* For SSE1, we have to reuse the V4SF code. */
33694 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
33695 gen_lowpart (SFmode, val), elt);
33696 }
33697 return;
33698
33699 case V8HImode:
33700 use_vec_merge = TARGET_SSE2;
33701 break;
33702 case V4HImode:
33703 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33704 break;
33705
33706 case V16QImode:
33707 use_vec_merge = TARGET_SSE4_1;
33708 break;
33709
33710 case V8QImode:
33711 break;
33712
33713 case V32QImode:
33714 half_mode = V16QImode;
33715 j = 0;
33716 n = 16;
33717 goto half;
33718
33719 case V16HImode:
33720 half_mode = V8HImode;
33721 j = 1;
33722 n = 8;
33723 goto half;
33724
33725 case V8SImode:
33726 half_mode = V4SImode;
33727 j = 2;
33728 n = 4;
33729 goto half;
33730
33731 case V4DImode:
33732 half_mode = V2DImode;
33733 j = 3;
33734 n = 2;
33735 goto half;
33736
33737 case V8SFmode:
33738 half_mode = V4SFmode;
33739 j = 4;
33740 n = 4;
33741 goto half;
33742
33743 case V4DFmode:
33744 half_mode = V2DFmode;
33745 j = 5;
33746 n = 2;
33747 goto half;
33748
33749 half:
33750 /* Compute offset. */
33751 i = elt / n;
33752 elt %= n;
33753
33754 gcc_assert (i <= 1);
33755
33756 /* Extract the half. */
33757 tmp = gen_reg_rtx (half_mode);
33758 emit_insn (gen_extract[j][i] (tmp, target));
33759
33760 /* Put val in tmp at elt. */
33761 ix86_expand_vector_set (false, tmp, val, elt);
33762
33763 /* Put it back. */
33764 emit_insn (gen_insert[j][i] (target, target, tmp));
33765 return;
33766
33767 default:
33768 break;
33769 }
33770
33771 if (use_vec_merge)
33772 {
33773 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
33774 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
33775 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33776 }
33777 else
33778 {
33779 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33780
33781 emit_move_insn (mem, target);
33782
33783 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33784 emit_move_insn (tmp, val);
33785
33786 emit_move_insn (target, mem);
33787 }
33788 }
33789
33790 void
33791 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
33792 {
33793 enum machine_mode mode = GET_MODE (vec);
33794 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33795 bool use_vec_extr = false;
33796 rtx tmp;
33797
33798 switch (mode)
33799 {
33800 case V2SImode:
33801 case V2SFmode:
33802 if (!mmx_ok)
33803 break;
33804 /* FALLTHRU */
33805
33806 case V2DFmode:
33807 case V2DImode:
33808 use_vec_extr = true;
33809 break;
33810
33811 case V4SFmode:
33812 use_vec_extr = TARGET_SSE4_1;
33813 if (use_vec_extr)
33814 break;
33815
33816 switch (elt)
33817 {
33818 case 0:
33819 tmp = vec;
33820 break;
33821
33822 case 1:
33823 case 3:
33824 tmp = gen_reg_rtx (mode);
33825 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
33826 GEN_INT (elt), GEN_INT (elt),
33827 GEN_INT (elt+4), GEN_INT (elt+4)));
33828 break;
33829
33830 case 2:
33831 tmp = gen_reg_rtx (mode);
33832 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
33833 break;
33834
33835 default:
33836 gcc_unreachable ();
33837 }
33838 vec = tmp;
33839 use_vec_extr = true;
33840 elt = 0;
33841 break;
33842
33843 case V4SImode:
33844 use_vec_extr = TARGET_SSE4_1;
33845 if (use_vec_extr)
33846 break;
33847
33848 if (TARGET_SSE2)
33849 {
33850 switch (elt)
33851 {
33852 case 0:
33853 tmp = vec;
33854 break;
33855
33856 case 1:
33857 case 3:
33858 tmp = gen_reg_rtx (mode);
33859 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
33860 GEN_INT (elt), GEN_INT (elt),
33861 GEN_INT (elt), GEN_INT (elt)));
33862 break;
33863
33864 case 2:
33865 tmp = gen_reg_rtx (mode);
33866 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
33867 break;
33868
33869 default:
33870 gcc_unreachable ();
33871 }
33872 vec = tmp;
33873 use_vec_extr = true;
33874 elt = 0;
33875 }
33876 else
33877 {
33878 /* For SSE1, we have to reuse the V4SF code. */
33879 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
33880 gen_lowpart (V4SFmode, vec), elt);
33881 return;
33882 }
33883 break;
33884
33885 case V8HImode:
33886 use_vec_extr = TARGET_SSE2;
33887 break;
33888 case V4HImode:
33889 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33890 break;
33891
33892 case V16QImode:
33893 use_vec_extr = TARGET_SSE4_1;
33894 break;
33895
33896 case V8SFmode:
33897 if (TARGET_AVX)
33898 {
33899 tmp = gen_reg_rtx (V4SFmode);
33900 if (elt < 4)
33901 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
33902 else
33903 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
33904 ix86_expand_vector_extract (false, target, tmp, elt & 3);
33905 return;
33906 }
33907 break;
33908
33909 case V4DFmode:
33910 if (TARGET_AVX)
33911 {
33912 tmp = gen_reg_rtx (V2DFmode);
33913 if (elt < 2)
33914 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
33915 else
33916 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
33917 ix86_expand_vector_extract (false, target, tmp, elt & 1);
33918 return;
33919 }
33920 break;
33921
33922 case V32QImode:
33923 if (TARGET_AVX)
33924 {
33925 tmp = gen_reg_rtx (V16QImode);
33926 if (elt < 16)
33927 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
33928 else
33929 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
33930 ix86_expand_vector_extract (false, target, tmp, elt & 15);
33931 return;
33932 }
33933 break;
33934
33935 case V16HImode:
33936 if (TARGET_AVX)
33937 {
33938 tmp = gen_reg_rtx (V8HImode);
33939 if (elt < 8)
33940 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
33941 else
33942 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
33943 ix86_expand_vector_extract (false, target, tmp, elt & 7);
33944 return;
33945 }
33946 break;
33947
33948 case V8SImode:
33949 if (TARGET_AVX)
33950 {
33951 tmp = gen_reg_rtx (V4SImode);
33952 if (elt < 4)
33953 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
33954 else
33955 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
33956 ix86_expand_vector_extract (false, target, tmp, elt & 3);
33957 return;
33958 }
33959 break;
33960
33961 case V4DImode:
33962 if (TARGET_AVX)
33963 {
33964 tmp = gen_reg_rtx (V2DImode);
33965 if (elt < 2)
33966 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
33967 else
33968 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
33969 ix86_expand_vector_extract (false, target, tmp, elt & 1);
33970 return;
33971 }
33972 break;
33973
33974 case V8QImode:
33975 /* ??? Could extract the appropriate HImode element and shift. */
33976 default:
33977 break;
33978 }
33979
33980 if (use_vec_extr)
33981 {
33982 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
33983 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
33984
33985 /* Let the rtl optimizers know about the zero extension performed. */
33986 if (inner_mode == QImode || inner_mode == HImode)
33987 {
33988 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
33989 target = gen_lowpart (SImode, target);
33990 }
33991
33992 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33993 }
33994 else
33995 {
33996 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33997
33998 emit_move_insn (mem, vec);
33999
34000 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34001 emit_move_insn (target, tmp);
34002 }
34003 }
34004
34005 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
34006 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
34007 The upper bits of DEST are undefined, though they shouldn't cause
34008 exceptions (some bits from src or all zeros are ok). */
34009
34010 static void
34011 emit_reduc_half (rtx dest, rtx src, int i)
34012 {
34013 rtx tem;
34014 switch (GET_MODE (src))
34015 {
34016 case V4SFmode:
34017 if (i == 128)
34018 tem = gen_sse_movhlps (dest, src, src);
34019 else
34020 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
34021 GEN_INT (1 + 4), GEN_INT (1 + 4));
34022 break;
34023 case V2DFmode:
34024 tem = gen_vec_interleave_highv2df (dest, src, src);
34025 break;
34026 case V16QImode:
34027 case V8HImode:
34028 case V4SImode:
34029 case V2DImode:
34030 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
34031 gen_lowpart (V1TImode, src),
34032 GEN_INT (i / 2));
34033 break;
34034 case V8SFmode:
34035 if (i == 256)
34036 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
34037 else
34038 tem = gen_avx_shufps256 (dest, src, src,
34039 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
34040 break;
34041 case V4DFmode:
34042 if (i == 256)
34043 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
34044 else
34045 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
34046 break;
34047 case V32QImode:
34048 case V16HImode:
34049 case V8SImode:
34050 case V4DImode:
34051 if (i == 256)
34052 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
34053 gen_lowpart (V4DImode, src),
34054 gen_lowpart (V4DImode, src),
34055 const1_rtx);
34056 else
34057 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
34058 gen_lowpart (V2TImode, src),
34059 GEN_INT (i / 2));
34060 break;
34061 default:
34062 gcc_unreachable ();
34063 }
34064 emit_insn (tem);
34065 }
34066
34067 /* Expand a vector reduction. FN is the binary pattern to reduce;
34068 DEST is the destination; IN is the input vector. */
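/* For example, reducing a V4SImode vector [a b c d] with a max pattern
   takes two iterations: the first shifts the upper 64 bits down and
   combines, giving [max(a,c) max(b,d) ...]; the second shifts the upper
   32 bits down and combines again, leaving the desired result in
   element 0 of DEST, while the other elements hold don't-care values.  */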
34069
34070 void
34071 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
34072 {
34073 rtx half, dst, vec = in;
34074 enum machine_mode mode = GET_MODE (in);
34075 int i;
34076
34077 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
34078 if (TARGET_SSE4_1
34079 && mode == V8HImode
34080 && fn == gen_uminv8hi3)
34081 {
34082 emit_insn (gen_sse4_1_phminposuw (dest, in));
34083 return;
34084 }
34085
34086 for (i = GET_MODE_BITSIZE (mode);
34087 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
34088 i >>= 1)
34089 {
34090 half = gen_reg_rtx (mode);
34091 emit_reduc_half (half, vec, i);
34092 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
34093 dst = dest;
34094 else
34095 dst = gen_reg_rtx (mode);
34096 emit_insn (fn (dst, half, vec));
34097 vec = dst;
34098 }
34099 }
34100 \f
34101 /* Target hook for scalar_mode_supported_p. */
34102 static bool
34103 ix86_scalar_mode_supported_p (enum machine_mode mode)
34104 {
34105 if (DECIMAL_FLOAT_MODE_P (mode))
34106 return default_decimal_float_supported_p ();
34107 else if (mode == TFmode)
34108 return true;
34109 else
34110 return default_scalar_mode_supported_p (mode);
34111 }
34112
34113 /* Implements target hook vector_mode_supported_p. */
34114 static bool
34115 ix86_vector_mode_supported_p (enum machine_mode mode)
34116 {
34117 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34118 return true;
34119 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34120 return true;
34121 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34122 return true;
34123 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
34124 return true;
34125 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
34126 return true;
34127 return false;
34128 }
34129
34130 /* Target hook for c_mode_for_suffix. */
34131 static enum machine_mode
34132 ix86_c_mode_for_suffix (char suffix)
34133 {
34134 if (suffix == 'q')
34135 return TFmode;
34136 if (suffix == 'w')
34137 return XFmode;
34138
34139 return VOIDmode;
34140 }
34141
34142 /* Worker function for TARGET_MD_ASM_CLOBBERS.
34143
34144 We do this in the new i386 backend to maintain source compatibility
34145 with the old cc0-based compiler. */
34146
34147 static tree
34148 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
34149 tree inputs ATTRIBUTE_UNUSED,
34150 tree clobbers)
34151 {
34152 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
34153 clobbers);
34154 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
34155 clobbers);
34156 return clobbers;
34157 }
34158
34159 /* Implements the target hook targetm.encode_section_info. */
34160
34161 static void ATTRIBUTE_UNUSED
34162 ix86_encode_section_info (tree decl, rtx rtl, int first)
34163 {
34164 default_encode_section_info (decl, rtl, first);
34165
34166 if (TREE_CODE (decl) == VAR_DECL
34167 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
34168 && ix86_in_large_data_p (decl))
34169 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
34170 }
34171
34172 /* Worker function for REVERSE_CONDITION. */
34173
34174 enum rtx_code
34175 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
34176 {
34177 return (mode != CCFPmode && mode != CCFPUmode
34178 ? reverse_condition (code)
34179 : reverse_condition_maybe_unordered (code));
34180 }
34181
34182 /* Output code to perform an x87 FP register move, from OPERANDS[1]
34183 to OPERANDS[0]. */
34184
34185 const char *
34186 output_387_reg_move (rtx insn, rtx *operands)
34187 {
34188 if (REG_P (operands[0]))
34189 {
34190 if (REG_P (operands[1])
34191 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34192 {
34193 if (REGNO (operands[0]) == FIRST_STACK_REG)
34194 return output_387_ffreep (operands, 0);
34195 return "fstp\t%y0";
34196 }
34197 if (STACK_TOP_P (operands[0]))
34198 return "fld%Z1\t%y1";
34199 return "fst\t%y0";
34200 }
34201 else if (MEM_P (operands[0]))
34202 {
34203 gcc_assert (REG_P (operands[1]));
34204 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34205 return "fstp%Z0\t%y0";
34206 else
34207 {
34208 /* There is no non-popping store to memory for XFmode.
34209 So if we need one, follow the store with a load. */
34210 if (GET_MODE (operands[0]) == XFmode)
34211 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
34212 else
34213 return "fst%Z0\t%y0";
34214 }
34215 }
34216 else
34217 gcc_unreachable ();
34218 }
34219
34220 /* Output code to perform a conditional jump to LABEL, if the C2 flag in
34221 the FP status register is set. */
34222
34223 void
34224 ix86_emit_fp_unordered_jump (rtx label)
34225 {
34226 rtx reg = gen_reg_rtx (HImode);
34227 rtx temp;
34228
34229 emit_insn (gen_x86_fnstsw_1 (reg));
34230
34231 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
34232 {
34233 emit_insn (gen_x86_sahf_1 (reg));
34234
34235 temp = gen_rtx_REG (CCmode, FLAGS_REG);
34236 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
34237 }
34238 else
34239 {
34240 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
34241
34242 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
34243 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
34244 }
34245
34246 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
34247 gen_rtx_LABEL_REF (VOIDmode, label),
34248 pc_rtx);
34249 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
34250
34251 emit_jump_insn (temp);
34252 predict_jump (REG_BR_PROB_BASE * 10 / 100);
34253 }
34254
34255 /* Output code to perform a log1p XFmode calculation. */
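/* The i387 fyl2xp1 instruction computes y * log2 (x + 1) but is only
   specified for |x| < 1 - sqrt(2)/2 (about 0.29289), hence the
   comparison below: small arguments use fyl2xp1 directly, while larger
   ones fall back to fyl2x applied to 1 + x.  In both cases the fldln2
   constant supplies y = ln(2), turning the base-2 logarithm into a
   natural logarithm.  */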
34256
34257 void ix86_emit_i387_log1p (rtx op0, rtx op1)
34258 {
34259 rtx label1 = gen_label_rtx ();
34260 rtx label2 = gen_label_rtx ();
34261
34262 rtx tmp = gen_reg_rtx (XFmode);
34263 rtx tmp2 = gen_reg_rtx (XFmode);
34264 rtx test;
34265
34266 emit_insn (gen_absxf2 (tmp, op1));
34267 test = gen_rtx_GE (VOIDmode, tmp,
34268 CONST_DOUBLE_FROM_REAL_VALUE (
34269 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
34270 XFmode));
34271 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
34272
34273 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34274 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
34275 emit_jump (label2);
34276
34277 emit_label (label1);
34278 emit_move_insn (tmp, CONST1_RTX (XFmode));
34279 emit_insn (gen_addxf3 (tmp, op1, tmp));
34280 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34281 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
34282
34283 emit_label (label2);
34284 }
34285
34286 /* Emit code for round calculation. */
34287 void ix86_emit_i387_round (rtx op0, rtx op1)
34288 {
34289 enum machine_mode inmode = GET_MODE (op1);
34290 enum machine_mode outmode = GET_MODE (op0);
34291 rtx e1, e2, res, tmp, tmp1, half;
34292 rtx scratch = gen_reg_rtx (HImode);
34293 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
34294 rtx jump_label = gen_label_rtx ();
34295 rtx insn;
34296 rtx (*gen_abs) (rtx, rtx);
34297 rtx (*gen_neg) (rtx, rtx);
34298
34299 switch (inmode)
34300 {
34301 case SFmode:
34302 gen_abs = gen_abssf2;
34303 break;
34304 case DFmode:
34305 gen_abs = gen_absdf2;
34306 break;
34307 case XFmode:
34308 gen_abs = gen_absxf2;
34309 break;
34310 default:
34311 gcc_unreachable ();
34312 }
34313
34314 switch (outmode)
34315 {
34316 case SFmode:
34317 gen_neg = gen_negsf2;
34318 break;
34319 case DFmode:
34320 gen_neg = gen_negdf2;
34321 break;
34322 case XFmode:
34323 gen_neg = gen_negxf2;
34324 break;
34325 case HImode:
34326 gen_neg = gen_neghi2;
34327 break;
34328 case SImode:
34329 gen_neg = gen_negsi2;
34330 break;
34331 case DImode:
34332 gen_neg = gen_negdi2;
34333 break;
34334 default:
34335 gcc_unreachable ();
34336 }
34337
34338 e1 = gen_reg_rtx (inmode);
34339 e2 = gen_reg_rtx (inmode);
34340 res = gen_reg_rtx (outmode);
34341
34342 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
34343
34344 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
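  /* For example, rounding -2.3: fabs gives 2.3, adding 0.5 gives 2.8,
     floor gives 2.0, and the sign bit recorded by fxam restores the
     sign, yielding -2.0.  Halfway cases round away from zero, so 2.5
     becomes 3.0.  */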
34345
34346 /* scratch = fxam(op1) */
34347 emit_insn (gen_rtx_SET (VOIDmode, scratch,
34348 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
34349 UNSPEC_FXAM)));
34350 /* e1 = fabs(op1) */
34351 emit_insn (gen_abs (e1, op1));
34352
34353 /* e2 = e1 + 0.5 */
34354 half = force_reg (inmode, half);
34355 emit_insn (gen_rtx_SET (VOIDmode, e2,
34356 gen_rtx_PLUS (inmode, e1, half)));
34357
34358 /* res = floor(e2) */
34359 if (inmode != XFmode)
34360 {
34361 tmp1 = gen_reg_rtx (XFmode);
34362
34363 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
34364 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
34365 }
34366 else
34367 tmp1 = e2;
34368
34369 switch (outmode)
34370 {
34371 case SFmode:
34372 case DFmode:
34373 {
34374 rtx tmp0 = gen_reg_rtx (XFmode);
34375
34376 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
34377
34378 emit_insn (gen_rtx_SET (VOIDmode, res,
34379 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
34380 UNSPEC_TRUNC_NOOP)));
34381 }
34382 break;
34383 case XFmode:
34384 emit_insn (gen_frndintxf2_floor (res, tmp1));
34385 break;
34386 case HImode:
34387 emit_insn (gen_lfloorxfhi2 (res, tmp1));
34388 break;
34389 case SImode:
34390 emit_insn (gen_lfloorxfsi2 (res, tmp1));
34391 break;
34392 case DImode:
34393 emit_insn (gen_lfloorxfdi2 (res, tmp1));
34394 break;
34395 default:
34396 gcc_unreachable ();
34397 }
34398
34399 /* flags = signbit(a) */
34400 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
34401
34402 /* if (flags) then res = -res */
34403 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
34404 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
34405 gen_rtx_LABEL_REF (VOIDmode, jump_label),
34406 pc_rtx);
34407 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34408 predict_jump (REG_BR_PROB_BASE * 50 / 100);
34409 JUMP_LABEL (insn) = jump_label;
34410
34411 emit_insn (gen_neg (res, res));
34412
34413 emit_label (jump_label);
34414 LABEL_NUSES (jump_label) = 1;
34415
34416 emit_move_insn (op0, res);
34417 }
34418
34419 /* Output code to perform a Newton-Raphson approximation of a single precision
34420 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
34421
34422 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
34423 {
34424 rtx x0, x1, e0, e1;
34425
34426 x0 = gen_reg_rtx (mode);
34427 e0 = gen_reg_rtx (mode);
34428 e1 = gen_reg_rtx (mode);
34429 x1 = gen_reg_rtx (mode);
34430
34431 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
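  /* This is one Newton-Raphson refinement of the rcp estimate: with
     x0 ~ 1/b, x1 = x0 * (2 - b * x0) = 2*x0 - b*x0*x0, which is exactly
     the e1 - e0 computed below.  The final multiply by a then gives the
     quotient to roughly twice the precision of the initial estimate.  */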
34432
34433 b = force_reg (mode, b);
34434
34435 /* x0 = rcp(b) estimate */
34436 emit_insn (gen_rtx_SET (VOIDmode, x0,
34437 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
34438 UNSPEC_RCP)));
34439 /* e0 = x0 * b */
34440 emit_insn (gen_rtx_SET (VOIDmode, e0,
34441 gen_rtx_MULT (mode, x0, b)));
34442
34443 /* e0 = x0 * e0 */
34444 emit_insn (gen_rtx_SET (VOIDmode, e0,
34445 gen_rtx_MULT (mode, x0, e0)));
34446
34447 /* e1 = x0 + x0 */
34448 emit_insn (gen_rtx_SET (VOIDmode, e1,
34449 gen_rtx_PLUS (mode, x0, x0)));
34450
34451 /* x1 = e1 - e0 */
34452 emit_insn (gen_rtx_SET (VOIDmode, x1,
34453 gen_rtx_MINUS (mode, e1, e0)));
34454
34455 /* res = a * x1 */
34456 emit_insn (gen_rtx_SET (VOIDmode, res,
34457 gen_rtx_MULT (mode, a, x1)));
34458 }
34459
34460 /* Output code to perform a Newton-Raphson approximation of a
34461 single precision floating point [reciprocal] square root. */
34462
34463 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
34464 bool recip)
34465 {
34466 rtx x0, e0, e1, e2, e3, mthree, mhalf;
34467 REAL_VALUE_TYPE r;
34468
34469 x0 = gen_reg_rtx (mode);
34470 e0 = gen_reg_rtx (mode);
34471 e1 = gen_reg_rtx (mode);
34472 e2 = gen_reg_rtx (mode);
34473 e3 = gen_reg_rtx (mode);
34474
34475 real_from_integer (&r, VOIDmode, -3, -1, 0);
34476 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34477
34478 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
34479 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34480
34481 if (VECTOR_MODE_P (mode))
34482 {
34483 mthree = ix86_build_const_vector (mode, true, mthree);
34484 mhalf = ix86_build_const_vector (mode, true, mhalf);
34485 }
34486
34487 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
34488 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
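  /* Both forms are one Newton-Raphson step for 1/sqrt(a): with
     x0 ~ 1/sqrt(a), the refined value is x0 * (3 - a*x0*x0) / 2.
     Folding the negations into the -3.0 and -0.5 constants lets the
     step use only multiplies and a single add; multiplying by a * x0
     instead of x0 at the end turns the refined rsqrt into sqrt.  */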
34489
34490 a = force_reg (mode, a);
34491
34492 /* x0 = rsqrt(a) estimate */
34493 emit_insn (gen_rtx_SET (VOIDmode, x0,
34494 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
34495 UNSPEC_RSQRT)));
34496
34497 /* If a == 0.0, zero the rsqrt estimate (which would be infinity) to avoid producing a NaN for sqrt (0.0). */
34498 if (!recip)
34499 {
34500 rtx zero, mask;
34501
34502 zero = gen_reg_rtx (mode);
34503 mask = gen_reg_rtx (mode);
34504
34505 zero = force_reg (mode, CONST0_RTX(mode));
34506 emit_insn (gen_rtx_SET (VOIDmode, mask,
34507 gen_rtx_NE (mode, zero, a)));
34508
34509 emit_insn (gen_rtx_SET (VOIDmode, x0,
34510 gen_rtx_AND (mode, x0, mask)));
34511 }
34512
34513 /* e0 = x0 * a */
34514 emit_insn (gen_rtx_SET (VOIDmode, e0,
34515 gen_rtx_MULT (mode, x0, a)));
34516 /* e1 = e0 * x0 */
34517 emit_insn (gen_rtx_SET (VOIDmode, e1,
34518 gen_rtx_MULT (mode, e0, x0)));
34519
34520 /* e2 = e1 - 3. */
34521 mthree = force_reg (mode, mthree);
34522 emit_insn (gen_rtx_SET (VOIDmode, e2,
34523 gen_rtx_PLUS (mode, e1, mthree)));
34524
34525 mhalf = force_reg (mode, mhalf);
34526 if (recip)
34527 /* e3 = -.5 * x0 */
34528 emit_insn (gen_rtx_SET (VOIDmode, e3,
34529 gen_rtx_MULT (mode, x0, mhalf)));
34530 else
34531 /* e3 = -.5 * e0 */
34532 emit_insn (gen_rtx_SET (VOIDmode, e3,
34533 gen_rtx_MULT (mode, e0, mhalf)));
34534 /* ret = e2 * e3 */
34535 emit_insn (gen_rtx_SET (VOIDmode, res,
34536 gen_rtx_MULT (mode, e2, e3)));
34537 }
34538
34539 #ifdef TARGET_SOLARIS
34540 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
34541
34542 static void
34543 i386_solaris_elf_named_section (const char *name, unsigned int flags,
34544 tree decl)
34545 {
34546 /* With Binutils 2.15, the "@unwind" marker must be specified on
34547 every occurrence of the ".eh_frame" section, not just the first
34548 one. */
34549 if (TARGET_64BIT
34550 && strcmp (name, ".eh_frame") == 0)
34551 {
34552 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
34553 flags & SECTION_WRITE ? "aw" : "a");
34554 return;
34555 }
34556
34557 #ifndef USE_GAS
34558 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
34559 {
34560 solaris_elf_asm_comdat_section (name, flags, decl);
34561 return;
34562 }
34563 #endif
34564
34565 default_elf_asm_named_section (name, flags, decl);
34566 }
34567 #endif /* TARGET_SOLARIS */
34568
34569 /* Return the mangling of TYPE if it is an extended fundamental type. */
34570
34571 static const char *
34572 ix86_mangle_type (const_tree type)
34573 {
34574 type = TYPE_MAIN_VARIANT (type);
34575
34576 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
34577 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
34578 return NULL;
34579
34580 switch (TYPE_MODE (type))
34581 {
34582 case TFmode:
34583 /* __float128 is "g". */
34584 return "g";
34585 case XFmode:
34586 /* "long double" or __float80 is "e". */
34587 return "e";
34588 default:
34589 return NULL;
34590 }
34591 }
34592
34593 /* For 32-bit code we can save PIC register setup by using
34594 __stack_chk_fail_local hidden function instead of calling
34595 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
34596 register, so it is better to call __stack_chk_fail directly. */
34597
34598 static tree ATTRIBUTE_UNUSED
34599 ix86_stack_protect_fail (void)
34600 {
34601 return TARGET_64BIT
34602 ? default_external_stack_protect_fail ()
34603 : default_hidden_stack_protect_fail ();
34604 }
34605
34606 /* Select a format to encode pointers in exception handling data. CODE
34607 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
34608 true if the symbol may be affected by dynamic relocations.
34609
34610 ??? All x86 object file formats are capable of representing this.
34611 After all, the relocation needed is the same as for the call insn.
34612 Whether or not a particular assembler allows us to enter such, I
34613 guess we'll have to see. */
34614 int
34615 asm_preferred_eh_data_format (int code, int global)
34616 {
34617 if (flag_pic)
34618 {
34619 int type = DW_EH_PE_sdata8;
34620 if (!TARGET_64BIT
34621 || ix86_cmodel == CM_SMALL_PIC
34622 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
34623 type = DW_EH_PE_sdata4;
34624 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
34625 }
34626 if (ix86_cmodel == CM_SMALL
34627 || (ix86_cmodel == CM_MEDIUM && code))
34628 return DW_EH_PE_udata4;
34629 return DW_EH_PE_absptr;
34630 }
34631 \f
34632 /* Expand copysign from SIGN to the positive value ABS_VALUE
34633 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
34634 the sign-bit. */
34635 static void
34636 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
34637 {
34638 enum machine_mode mode = GET_MODE (sign);
34639 rtx sgn = gen_reg_rtx (mode);
34640 if (mask == NULL_RTX)
34641 {
34642 enum machine_mode vmode;
34643
34644 if (mode == SFmode)
34645 vmode = V4SFmode;
34646 else if (mode == DFmode)
34647 vmode = V2DFmode;
34648 else
34649 vmode = mode;
34650
34651 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
34652 if (!VECTOR_MODE_P (mode))
34653 {
34654 /* We need to generate a scalar mode mask in this case. */
34655 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34656 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34657 mask = gen_reg_rtx (mode);
34658 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34659 }
34660 }
34661 else
34662 mask = gen_rtx_NOT (mode, mask);
34663 emit_insn (gen_rtx_SET (VOIDmode, sgn,
34664 gen_rtx_AND (mode, mask, sign)));
34665 emit_insn (gen_rtx_SET (VOIDmode, result,
34666 gen_rtx_IOR (mode, abs_value, sgn)));
34667 }
34668
34669 /* Expand fabs (OP0) and return a new rtx that holds the result. The
34670 mask for masking out the sign-bit is stored in *SMASK, if that is
34671 non-null. */
34672 static rtx
34673 ix86_expand_sse_fabs (rtx op0, rtx *smask)
34674 {
34675 enum machine_mode vmode, mode = GET_MODE (op0);
34676 rtx xa, mask;
34677
34678 xa = gen_reg_rtx (mode);
34679 if (mode == SFmode)
34680 vmode = V4SFmode;
34681 else if (mode == DFmode)
34682 vmode = V2DFmode;
34683 else
34684 vmode = mode;
34685 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
34686 if (!VECTOR_MODE_P (mode))
34687 {
34688 /* We need to generate a scalar mode mask in this case. */
34689 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34690 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34691 mask = gen_reg_rtx (mode);
34692 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34693 }
34694 emit_insn (gen_rtx_SET (VOIDmode, xa,
34695 gen_rtx_AND (mode, op0, mask)));
34696
34697 if (smask)
34698 *smask = mask;
34699
34700 return xa;
34701 }
34702
34703 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
34704 swapping the operands if SWAP_OPERANDS is true. The expanded
34705 code is a forward jump to a newly created label in case the
34706 comparison is true. The generated label rtx is returned. */
34707 static rtx
34708 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
34709 bool swap_operands)
34710 {
34711 rtx label, tmp;
34712
34713 if (swap_operands)
34714 {
34715 tmp = op0;
34716 op0 = op1;
34717 op1 = tmp;
34718 }
34719
34720 label = gen_label_rtx ();
34721 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
34722 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34723 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
34724 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
34725 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
34726 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
34727 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34728 JUMP_LABEL (tmp) = label;
34729
34730 return label;
34731 }
34732
34733 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
34734 using comparison code CODE. Operands are swapped for the comparison if
34735 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
34736 static rtx
34737 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
34738 bool swap_operands)
34739 {
34740 rtx (*insn)(rtx, rtx, rtx, rtx);
34741 enum machine_mode mode = GET_MODE (op0);
34742 rtx mask = gen_reg_rtx (mode);
34743
34744 if (swap_operands)
34745 {
34746 rtx tmp = op0;
34747 op0 = op1;
34748 op1 = tmp;
34749 }
34750
34751 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
34752
34753 emit_insn (insn (mask, op0, op1,
34754 gen_rtx_fmt_ee (code, mode, op0, op1)));
34755 return mask;
34756 }
34757
34758 /* Generate and return a rtx of mode MODE for 2**n where n is the number
34759 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
34760 static rtx
34761 ix86_gen_TWO52 (enum machine_mode mode)
34762 {
34763 REAL_VALUE_TYPE TWO52r;
34764 rtx TWO52;
34765
34766 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
34767 TWO52 = const_double_from_real_value (TWO52r, mode);
34768 TWO52 = force_reg (mode, TWO52);
34769
34770 return TWO52;
34771 }
34772
34773 /* Expand SSE sequence for computing lround from OP1 storing
34774 into OP0. */
34775 void
34776 ix86_expand_lround (rtx op0, rtx op1)
34777 {
34778 /* C code for the stuff we're doing below:
34779 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
34780 return (long)tmp;
34781 */
34782 enum machine_mode mode = GET_MODE (op1);
34783 const struct real_format *fmt;
34784 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34785 rtx adj;
34786
34787 /* load nextafter (0.5, 0.0) */
34788 fmt = REAL_MODE_FORMAT (mode);
34789 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34790 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34791
34792 /* adj = copysign (0.5, op1) */
34793 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
34794 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
34795
34796 /* adj = op1 + adj */
34797 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
34798
34799 /* op0 = (imode)adj */
34800 expand_fix (op0, adj, 0);
34801 }
34802
34803 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
34804 into OPERAND0. */
34805 void
34806 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
34807 {
34808 /* C code for the stuff we're doing below (for do_floor):
34809 xi = (long)op1;
34810 xi -= (double)xi > op1 ? 1 : 0;
34811 return xi;
34812 */
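  /* For example, lfloor (-2.3): xi = (long) -2.3 = -2, and since
     (double) -2 > -2.3, one is subtracted, giving -3.  For the ceiling
     variant the comparison operands are swapped and 1 is added
     instead.  */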
34813 enum machine_mode fmode = GET_MODE (op1);
34814 enum machine_mode imode = GET_MODE (op0);
34815 rtx ireg, freg, label, tmp;
34816
34817 /* reg = (long)op1 */
34818 ireg = gen_reg_rtx (imode);
34819 expand_fix (ireg, op1, 0);
34820
34821 /* freg = (double)reg */
34822 freg = gen_reg_rtx (fmode);
34823 expand_float (freg, ireg, 0);
34824
34825 /* ireg = (freg > op1) ? ireg - 1 : ireg */
34826 label = ix86_expand_sse_compare_and_jump (UNLE,
34827 freg, op1, !do_floor);
34828 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
34829 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
34830 emit_move_insn (ireg, tmp);
34831
34832 emit_label (label);
34833 LABEL_NUSES (label) = 1;
34834
34835 emit_move_insn (op0, ireg);
34836 }
34837
34838 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
34839 result in OPERAND0. */
34840 void
34841 ix86_expand_rint (rtx operand0, rtx operand1)
34842 {
34843 /* C code for the stuff we're doing below:
34844 xa = fabs (operand1);
34845 if (!isless (xa, 2**52))
34846 return operand1;
34847 xa = xa + 2**52 - 2**52;
34848 return copysign (xa, operand1);
34849 */
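  /* Adding and then subtracting 2**52 (2**23 for SFmode) makes the FPU
     do the rounding: at magnitude 2**52 the spacing between adjacent
     doubles is exactly 1.0, so the addition discards the fractional
     bits according to the current rounding mode and the subtraction
     recovers the rounded integer.  For example, 2.7 + 2**52 rounds to
     2**52 + 3, and subtracting 2**52 leaves 3.0.  */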
34850 enum machine_mode mode = GET_MODE (operand0);
34851 rtx res, xa, label, TWO52, mask;
34852
34853 res = gen_reg_rtx (mode);
34854 emit_move_insn (res, operand1);
34855
34856 /* xa = abs (operand1) */
34857 xa = ix86_expand_sse_fabs (res, &mask);
34858
34859 /* if (!isless (xa, TWO52)) goto label; */
34860 TWO52 = ix86_gen_TWO52 (mode);
34861 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34862
34863 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34864 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34865
34866 ix86_sse_copysign_to_positive (res, xa, res, mask);
34867
34868 emit_label (label);
34869 LABEL_NUSES (label) = 1;
34870
34871 emit_move_insn (operand0, res);
34872 }
34873
34874 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
34875 into OPERAND0. */
34876 void
34877 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
34878 {
34879 /* C code for the stuff we expand below.
34880 double xa = fabs (x), x2;
34881 if (!isless (xa, TWO52))
34882 return x;
34883 xa = xa + TWO52 - TWO52;
34884 x2 = copysign (xa, x);
34885 Compensate. Floor:
34886 if (x2 > x)
34887 x2 -= 1;
34888 Compensate. Ceil:
34889 if (x2 < x)
34890 x2 -= -1;
34891 return x2;
34892 */
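  /* For example, floor (2.7): xa = 2.7, xa + TWO52 - TWO52 rounds to
     3.0, and since 3.0 > 2.7 the compensation subtracts 1.0, giving
     2.0.  The ceiling variant subtracts -1.0 when the rounded value is
     too small, so ceil (2.3) turns 2.0 into 3.0.  */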
34893 enum machine_mode mode = GET_MODE (operand0);
34894 rtx xa, TWO52, tmp, label, one, res, mask;
34895
34896 TWO52 = ix86_gen_TWO52 (mode);
34897
34898 /* Temporary for holding the result, initialized to the input
34899 operand to ease control flow. */
34900 res = gen_reg_rtx (mode);
34901 emit_move_insn (res, operand1);
34902
34903 /* xa = abs (operand1) */
34904 xa = ix86_expand_sse_fabs (res, &mask);
34905
34906 /* if (!isless (xa, TWO52)) goto label; */
34907 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34908
34909 /* xa = xa + TWO52 - TWO52; */
34910 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34911 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34912
34913 /* xa = copysign (xa, operand1) */
34914 ix86_sse_copysign_to_positive (xa, xa, res, mask);
34915
34916 /* generate 1.0 or -1.0 */
34917 one = force_reg (mode,
34918 const_double_from_real_value (do_floor
34919 ? dconst1 : dconstm1, mode));
34920
34921 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
34922 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
34923 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34924 gen_rtx_AND (mode, one, tmp)));
34925 /* We always need to subtract here to preserve signed zero. */
34926 tmp = expand_simple_binop (mode, MINUS,
34927 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34928 emit_move_insn (res, tmp);
34929
34930 emit_label (label);
34931 LABEL_NUSES (label) = 1;
34932
34933 emit_move_insn (operand0, res);
34934 }
34935
34936 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
34937 into OPERAND0. */
34938 void
34939 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
34940 {
34941 /* C code for the stuff we expand below.
34942 double xa = fabs (x), x2;
34943 if (!isless (xa, TWO52))
34944 return x;
34945 x2 = (double)(long)x;
34946 Compensate. Floor:
34947 if (x2 > x)
34948 x2 -= 1;
34949 Compensate. Ceil:
34950 if (x2 < x)
34951 x2 += 1;
34952 if (HONOR_SIGNED_ZEROS (mode))
34953 return copysign (x2, x);
34954 return x2;
34955 */
34956 enum machine_mode mode = GET_MODE (operand0);
34957 rtx xa, xi, TWO52, tmp, label, one, res, mask;
34958
34959 TWO52 = ix86_gen_TWO52 (mode);
34960
34961 /* Temporary for holding the result, initialized to the input
34962 operand to ease control flow. */
34963 res = gen_reg_rtx (mode);
34964 emit_move_insn (res, operand1);
34965
34966 /* xa = abs (operand1) */
34967 xa = ix86_expand_sse_fabs (res, &mask);
34968
34969 /* if (!isless (xa, TWO52)) goto label; */
34970 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34971
34972 /* xa = (double)(long)x */
34973 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
34974 expand_fix (xi, res, 0);
34975 expand_float (xa, xi, 0);
34976
34977 /* generate 1.0 */
34978 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
34979
34980 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
34981 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
34982 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34983 gen_rtx_AND (mode, one, tmp)));
34984 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
34985 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34986 emit_move_insn (res, tmp);
34987
34988 if (HONOR_SIGNED_ZEROS (mode))
34989 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
34990
34991 emit_label (label);
34992 LABEL_NUSES (label) = 1;
34993
34994 emit_move_insn (operand0, res);
34995 }
34996
34997 /* Expand SSE sequence for computing round from OPERAND1 storing
34998 into OPERAND0. Sequence that works without relying on DImode truncation
34999 via cvttsd2siq that is only available on 64bit targets. */
35000 void
35001 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
35002 {
35003 /* C code for the stuff we expand below.
35004 double xa = fabs (x), xa2, x2;
35005 if (!isless (xa, TWO52))
35006 return x;
35007 Using the absolute value and copying back sign makes
35008 -0.0 -> -0.0 correct.
35009 xa2 = xa + TWO52 - TWO52;
35010 Compensate.
35011 dxa = xa2 - xa;
35012 if (dxa <= -0.5)
35013 xa2 += 1;
35014 else if (dxa > 0.5)
35015 xa2 -= 1;
35016 x2 = copysign (xa2, x);
35017 return x2;
35018 */
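  /* The dxa correction compensates for the round-to-nearest-even
     behaviour of the TWO52 trick.  For example, round (2.5): xa2 =
     2.5 + TWO52 - TWO52 = 2.0 (ties to even), dxa = -0.5, and since
     dxa <= -0.5 one is added back, giving the expected 3.0.  By
     contrast round (1.5) yields xa2 = 2.0 with dxa = 0.5, which is not
     greater than 0.5, so no adjustment is made.  */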
35019 enum machine_mode mode = GET_MODE (operand0);
35020 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
35021
35022 TWO52 = ix86_gen_TWO52 (mode);
35023
35024 /* Temporary for holding the result, initialized to the input
35025 operand to ease control flow. */
35026 res = gen_reg_rtx (mode);
35027 emit_move_insn (res, operand1);
35028
35029 /* xa = abs (operand1) */
35030 xa = ix86_expand_sse_fabs (res, &mask);
35031
35032 /* if (!isless (xa, TWO52)) goto label; */
35033 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35034
35035 /* xa2 = xa + TWO52 - TWO52; */
35036 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35037 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
35038
35039 /* dxa = xa2 - xa; */
35040 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
35041
35042 /* generate 0.5, 1.0 and -0.5 */
35043 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
35044 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
35045 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
35046 0, OPTAB_DIRECT);
35047
35048 /* Compensate. */
35049 tmp = gen_reg_rtx (mode);
35050 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
35051 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
35052 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35053 gen_rtx_AND (mode, one, tmp)));
35054 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35055 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
35056 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
35057 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35058 gen_rtx_AND (mode, one, tmp)));
35059 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35060
35061 /* res = copysign (xa2, operand1) */
35062 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
35063
35064 emit_label (label);
35065 LABEL_NUSES (label) = 1;
35066
35067 emit_move_insn (operand0, res);
35068 }
35069
35070 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35071 into OPERAND0. */
35072 void
35073 ix86_expand_trunc (rtx operand0, rtx operand1)
35074 {
35075 /* C code for SSE variant we expand below.
35076 double xa = fabs (x), x2;
35077 if (!isless (xa, TWO52))
35078 return x;
35079 x2 = (double)(long)x;
35080 if (HONOR_SIGNED_ZEROS (mode))
35081 return copysign (x2, x);
35082 return x2;
35083 */
35084 enum machine_mode mode = GET_MODE (operand0);
35085 rtx xa, xi, TWO52, label, res, mask;
35086
35087 TWO52 = ix86_gen_TWO52 (mode);
35088
35089 /* Temporary for holding the result, initialized to the input
35090 operand to ease control flow. */
35091 res = gen_reg_rtx (mode);
35092 emit_move_insn (res, operand1);
35093
35094 /* xa = abs (operand1) */
35095 xa = ix86_expand_sse_fabs (res, &mask);
35096
35097 /* if (!isless (xa, TWO52)) goto label; */
35098 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35099
35100 /* x = (double)(long)x */
35101 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35102 expand_fix (xi, res, 0);
35103 expand_float (res, xi, 0);
35104
35105 if (HONOR_SIGNED_ZEROS (mode))
35106 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35107
35108 emit_label (label);
35109 LABEL_NUSES (label) = 1;
35110
35111 emit_move_insn (operand0, res);
35112 }
35113
35114 /* Expand SSE sequence for computing trunc from OPERAND1 storing into OPERAND0
35115 without relying on DImode truncation via cvttsd2siq (only available on 64bit targets).  */
35116 void
35117 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
35118 {
35119 enum machine_mode mode = GET_MODE (operand0);
35120 rtx xa, mask, TWO52, label, one, res, smask, tmp;
35121
35122 /* C code for SSE variant we expand below.
35123 double xa = fabs (x), xa2, x2;
35124 if (!isless (xa, TWO52))
35125 return x;
35126 xa2 = xa + TWO52 - TWO52;
35127 Compensate:
35128 if (xa2 > xa)
35129 xa2 -= 1.0;
35130 x2 = copysign (xa2, x);
35131 return x2;
35132 */
35133
35134 TWO52 = ix86_gen_TWO52 (mode);
35135
35136 /* Temporary for holding the result, initialized to the input
35137 operand to ease control flow. */
35138 res = gen_reg_rtx (mode);
35139 emit_move_insn (res, operand1);
35140
35141 /* xa = abs (operand1) */
35142 xa = ix86_expand_sse_fabs (res, &smask);
35143
35144 /* if (!isless (xa, TWO52)) goto label; */
35145 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35146
35147 /* res = xa + TWO52 - TWO52; */
35148 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35149 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
35150 emit_move_insn (res, tmp);
35151
35152 /* generate 1.0 */
35153 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35154
35155 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
35156 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
35157 emit_insn (gen_rtx_SET (VOIDmode, mask,
35158 gen_rtx_AND (mode, mask, one)));
35159 tmp = expand_simple_binop (mode, MINUS,
35160 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
35161 emit_move_insn (res, tmp);
35162
35163 /* res = copysign (res, operand1) */
35164 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
35165
35166 emit_label (label);
35167 LABEL_NUSES (label) = 1;
35168
35169 emit_move_insn (operand0, res);
35170 }
35171
35172 /* Expand SSE sequence for computing round from OPERAND1 storing
35173 into OPERAND0. */
35174 void
35175 ix86_expand_round (rtx operand0, rtx operand1)
35176 {
35177 /* C code for the stuff we're doing below:
35178 double xa = fabs (x);
35179 if (!isless (xa, TWO52))
35180 return x;
35181 xa = (double)(long)(xa + nextafter (0.5, 0.0));
35182 return copysign (xa, x);
35183 */
35184 enum machine_mode mode = GET_MODE (operand0);
35185 rtx res, TWO52, xa, label, xi, half, mask;
35186 const struct real_format *fmt;
35187 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35188
35189 /* Temporary for holding the result, initialized to the input
35190 operand to ease control flow. */
35191 res = gen_reg_rtx (mode);
35192 emit_move_insn (res, operand1);
35193
35194 TWO52 = ix86_gen_TWO52 (mode);
35195 xa = ix86_expand_sse_fabs (res, &mask);
35196 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35197
35198 /* load nextafter (0.5, 0.0) */
35199 fmt = REAL_MODE_FORMAT (mode);
35200 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35201 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35202
35203 /* xa = xa + 0.5 */
35204 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
35205 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
35206
35207 /* xa = (double)(int64_t)xa */
35208 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35209 expand_fix (xi, xa, 0);
35210 expand_float (xa, xi, 0);
35211
35212 /* res = copysign (xa, operand1) */
35213 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
35214
35215 emit_label (label);
35216 LABEL_NUSES (label) = 1;
35217
35218 emit_move_insn (operand0, res);
35219 }
35220
35221 /* Expand SSE sequence for computing round
35222 from OP1 storing into OP0 using sse4 round insn. */
35223 void
35224 ix86_expand_round_sse4 (rtx op0, rtx op1)
35225 {
35226 enum machine_mode mode = GET_MODE (op0);
35227 rtx e1, e2, res, half;
35228 const struct real_format *fmt;
35229 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35230 rtx (*gen_copysign) (rtx, rtx, rtx);
35231 rtx (*gen_round) (rtx, rtx, rtx);
35232
35233 switch (mode)
35234 {
35235 case SFmode:
35236 gen_copysign = gen_copysignsf3;
35237 gen_round = gen_sse4_1_roundsf2;
35238 break;
35239 case DFmode:
35240 gen_copysign = gen_copysigndf3;
35241 gen_round = gen_sse4_1_rounddf2;
35242 break;
35243 default:
35244 gcc_unreachable ();
35245 }
35246
35247 /* round (a) = trunc (a + copysign (0.5, a)) */
35248
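
/* Illustrative note: pred_half below is nextafter (0.5, 0.0), i.e. the
   largest value below 0.5, rather than 0.5 itself.  Adding plain 0.5 would
   round some just-below-halfway inputs up: e.g. for x == 0.49999999999999994
   (the largest double below 0.5), x + 0.5 rounds to exactly 1.0 under
   round-to-nearest-even, so the truncation would yield 1.0 instead of the
   correct 0.0, whereas x + pred_half stays below 1.0.  */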
35249 /* load nextafter (0.5, 0.0) */
35250 fmt = REAL_MODE_FORMAT (mode);
35251 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35252 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35253 half = const_double_from_real_value (pred_half, mode);
35254
35255 /* e1 = copysign (0.5, op1) */
35256 e1 = gen_reg_rtx (mode);
35257 emit_insn (gen_copysign (e1, half, op1));
35258
35259 /* e2 = op1 + e1 */
35260 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
35261
35262 /* res = trunc (e2) */
35263 res = gen_reg_rtx (mode);
35264 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
35265
35266 emit_move_insn (op0, res);
35267 }
35268 \f
35269
35270 /* Table of valid machine attributes. */
35271 static const struct attribute_spec ix86_attribute_table[] =
35272 {
35273 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
35274 affects_type_identity } */
35275 /* Stdcall attribute says callee is responsible for popping arguments
35276 if they are not variable. */
35277 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35278 true },
35279 /* Fastcall attribute says callee is responsible for popping arguments
35280 if they are not variable. */
35281 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35282 true },
35283 /* Thiscall attribute says callee is responsible for popping arguments
35284 if they are not variable. */
35285 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35286 true },
35287 /* Cdecl attribute says the callee is a normal C declaration */
35288 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35289 true },
35290 /* Regparm attribute specifies how many integer arguments are to be
35291 passed in registers. */
35292 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
35293 true },
35294 /* Sseregparm attribute says we are using x86_64 calling conventions
35295 for FP arguments. */
35296 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35297 true },
35298 /* The transactional memory builtins are implicitly regparm or fastcall
35299 depending on the ABI. Override the generic do-nothing attribute that
35300 these builtins were declared with. */
35301 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
35302 true },
35303 /* force_align_arg_pointer says this function realigns the stack at entry. */
35304 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
35305 false, true, true, ix86_handle_cconv_attribute, false },
35306 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
35307 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
35308 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
35309 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
35310 false },
35311 #endif
35312 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35313 false },
35314 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35315 false },
35316 #ifdef SUBTARGET_ATTRIBUTE_TABLE
35317 SUBTARGET_ATTRIBUTE_TABLE,
35318 #endif
35319 /* ms_abi and sysv_abi calling convention function attributes. */
35320 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35321 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35322 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
35323 false },
35324 { "callee_pop_aggregate_return", 1, 1, false, true, true,
35325 ix86_handle_callee_pop_aggregate_return, true },
35326 /* End element. */
35327 { NULL, 0, 0, false, false, false, NULL, false }
35328 };
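
/* For reference, and purely as an illustration of how the table above is
   reached from user code, a few of these attributes in use (hypothetical
   declarations, not part of GCC itself):

     int  __attribute__((regparm(3)))  sum3 (int a, int b, int c);
     void __attribute__((fastcall))    callback (void *data);
     struct __attribute__((ms_struct)) msvc_layout { char c; int i; };

   Each use ends up in the corresponding handler listed above, e.g.
   ix86_handle_cconv_attribute for regparm and fastcall.  */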
35329
35330 /* Implement targetm.vectorize.builtin_vectorization_cost. */
35331 static int
35332 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
35333 tree vectype ATTRIBUTE_UNUSED,
35334 int misalign ATTRIBUTE_UNUSED)
35335 {
35336 switch (type_of_cost)
35337 {
35338 case scalar_stmt:
35339 return ix86_cost->scalar_stmt_cost;
35340
35341 case scalar_load:
35342 return ix86_cost->scalar_load_cost;
35343
35344 case scalar_store:
35345 return ix86_cost->scalar_store_cost;
35346
35347 case vector_stmt:
35348 return ix86_cost->vec_stmt_cost;
35349
35350 case vector_load:
35351 return ix86_cost->vec_align_load_cost;
35352
35353 case vector_store:
35354 return ix86_cost->vec_store_cost;
35355
35356 case vec_to_scalar:
35357 return ix86_cost->vec_to_scalar_cost;
35358
35359 case scalar_to_vec:
35360 return ix86_cost->scalar_to_vec_cost;
35361
35362 case unaligned_load:
35363 case unaligned_store:
35364 return ix86_cost->vec_unalign_load_cost;
35365
35366 case cond_branch_taken:
35367 return ix86_cost->cond_taken_branch_cost;
35368
35369 case cond_branch_not_taken:
35370 return ix86_cost->cond_not_taken_branch_cost;
35371
35372 case vec_perm:
35373 case vec_promote_demote:
35374 return ix86_cost->vec_stmt_cost;
35375
35376 default:
35377 gcc_unreachable ();
35378 }
35379 }
35380
35381 /* Construct (set target (vec_select op0 (parallel perm))) and
35382 return true if that's a valid instruction in the active ISA. */
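
/* As an illustration (hypothetical operands), a { 2, 3, 0, 1 } shuffle of
   a V4SImode register would be emitted as

     (set (reg:V4SI tgt)
          (vec_select:V4SI (reg:V4SI src)
                           (parallel [(const_int 2) (const_int 3)
                                      (const_int 0) (const_int 1)])))

   and is kept only if recog_memoized recognizes it, e.g. as a pshufd.  */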
35383
35384 static bool
35385 expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
35386 {
35387 rtx rperm[MAX_VECT_LEN], x;
35388 unsigned i;
35389
35390 for (i = 0; i < nelt; ++i)
35391 rperm[i] = GEN_INT (perm[i]);
35392
35393 x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
35394 x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
35395 x = gen_rtx_SET (VOIDmode, target, x);
35396
35397 x = emit_insn (x);
35398 if (recog_memoized (x) < 0)
35399 {
35400 remove_insn (x);
35401 return false;
35402 }
35403 return true;
35404 }
35405
35406 /* Similar, but generate a vec_concat from op0 and op1 as well. */
35407
35408 static bool
35409 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
35410 const unsigned char *perm, unsigned nelt)
35411 {
35412 enum machine_mode v2mode;
35413 rtx x;
35414
35415 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
35416 x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
35417 return expand_vselect (target, x, perm, nelt);
35418 }
35419
35420 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35421 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
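
/* For example, a V4SFmode permutation { 0, 5, 2, 7 } keeps elements 0 and 2
   of op0 and takes elements 1 and 3 from op1; every element stays in its
   own position, so it can be emitted as a blendps with immediate mask 0xa
   (bits set for the positions taken from op1).  */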
35422
35423 static bool
35424 expand_vec_perm_blend (struct expand_vec_perm_d *d)
35425 {
35426 enum machine_mode vmode = d->vmode;
35427 unsigned i, mask, nelt = d->nelt;
35428 rtx target, op0, op1, x;
35429 rtx rperm[32], vperm;
35430
35431 if (d->op0 == d->op1)
35432 return false;
35433 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
35434 ;
35435 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
35436 ;
35437 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
35438 ;
35439 else
35440 return false;
35441
35442 /* This is a blend, not a permute. Elements must stay in their
35443 respective lanes. */
35444 for (i = 0; i < nelt; ++i)
35445 {
35446 unsigned e = d->perm[i];
35447 if (!(e == i || e == i + nelt))
35448 return false;
35449 }
35450
35451 if (d->testing_p)
35452 return true;
35453
35454 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
35455 decision should be extracted elsewhere, so that we only try that
35456 sequence once all budget==3 options have been tried. */
35457 target = d->target;
35458 op0 = d->op0;
35459 op1 = d->op1;
35460 mask = 0;
35461
35462 switch (vmode)
35463 {
35464 case V4DFmode:
35465 case V8SFmode:
35466 case V2DFmode:
35467 case V4SFmode:
35468 case V8HImode:
35469 case V8SImode:
35470 for (i = 0; i < nelt; ++i)
35471 mask |= (d->perm[i] >= nelt) << i;
35472 break;
35473
35474 case V2DImode:
35475 for (i = 0; i < 2; ++i)
35476 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
35477 vmode = V8HImode;
35478 goto do_subreg;
35479
35480 case V4SImode:
35481 for (i = 0; i < 4; ++i)
35482 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35483 vmode = V8HImode;
35484 goto do_subreg;
35485
35486 case V16QImode:
35487 /* See if bytes move in pairs so we can use pblendw with
35488 an immediate argument, rather than pblendvb with a vector
35489 argument. */
35490 for (i = 0; i < 16; i += 2)
35491 if (d->perm[i] + 1 != d->perm[i + 1])
35492 {
35493 use_pblendvb:
35494 for (i = 0; i < nelt; ++i)
35495 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
35496
35497 finish_pblendvb:
35498 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
35499 vperm = force_reg (vmode, vperm);
35500
35501 if (GET_MODE_SIZE (vmode) == 16)
35502 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
35503 else
35504 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
35505 return true;
35506 }
35507
35508 for (i = 0; i < 8; ++i)
35509 mask |= (d->perm[i * 2] >= 16) << i;
35510 vmode = V8HImode;
35511 /* FALLTHRU */
35512
35513 do_subreg:
35514 target = gen_lowpart (vmode, target);
35515 op0 = gen_lowpart (vmode, op0);
35516 op1 = gen_lowpart (vmode, op1);
35517 break;
35518
35519 case V32QImode:
35520 /* See if bytes move in pairs. If not, vpblendvb must be used. */
35521 for (i = 0; i < 32; i += 2)
35522 if (d->perm[i] + 1 != d->perm[i + 1])
35523 goto use_pblendvb;
35524 /* See if bytes move in quadruplets. If yes, vpblendd
35525 with immediate can be used. */
35526 for (i = 0; i < 32; i += 4)
35527 if (d->perm[i] + 2 != d->perm[i + 2])
35528 break;
35529 if (i < 32)
35530 {
35531 /* See if bytes move the same in both lanes. If yes,
35532 vpblendw with immediate can be used. */
35533 for (i = 0; i < 16; i += 2)
35534 if (d->perm[i] + 16 != d->perm[i + 16])
35535 goto use_pblendvb;
35536
35537 /* Use vpblendw. */
35538 for (i = 0; i < 16; ++i)
35539 mask |= (d->perm[i * 2] >= 32) << i;
35540 vmode = V16HImode;
35541 goto do_subreg;
35542 }
35543
35544 /* Use vpblendd. */
35545 for (i = 0; i < 8; ++i)
35546 mask |= (d->perm[i * 4] >= 32) << i;
35547 vmode = V8SImode;
35548 goto do_subreg;
35549
35550 case V16HImode:
35551 /* See if words move in pairs. If yes, vpblendd can be used. */
35552 for (i = 0; i < 16; i += 2)
35553 if (d->perm[i] + 1 != d->perm[i + 1])
35554 break;
35555 if (i < 16)
35556 {
35557 /* See if words move the same in both lanes. If not,
35558 vpblendvb must be used. */
35559 for (i = 0; i < 8; i++)
35560 if (d->perm[i] + 8 != d->perm[i + 8])
35561 {
35562 /* Use vpblendvb. */
35563 for (i = 0; i < 32; ++i)
35564 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
35565
35566 vmode = V32QImode;
35567 nelt = 32;
35568 target = gen_lowpart (vmode, target);
35569 op0 = gen_lowpart (vmode, op0);
35570 op1 = gen_lowpart (vmode, op1);
35571 goto finish_pblendvb;
35572 }
35573
35574 /* Use vpblendw. */
35575 for (i = 0; i < 16; ++i)
35576 mask |= (d->perm[i] >= 16) << i;
35577 break;
35578 }
35579
35580 /* Use vpblendd. */
35581 for (i = 0; i < 8; ++i)
35582 mask |= (d->perm[i * 2] >= 16) << i;
35583 vmode = V8SImode;
35584 goto do_subreg;
35585
35586 case V4DImode:
35587 /* Use vpblendd. */
35588 for (i = 0; i < 4; ++i)
35589 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35590 vmode = V8SImode;
35591 goto do_subreg;
35592
35593 default:
35594 gcc_unreachable ();
35595 }
35596
35597 /* This matches five different patterns with the different modes. */
35598 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
35599 x = gen_rtx_SET (VOIDmode, target, x);
35600 emit_insn (x);
35601
35602 return true;
35603 }
35604
35605 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35606 in terms of the variable form of vpermilps.
35607
35608 Note that we will have already failed the immediate input vpermilps,
35609 which requires that the high and low part shuffle be identical; the
35610 variable form doesn't require that. */
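
/* For example, the V8SFmode permutation { 1, 0, 3, 2, 6, 7, 4, 5 } shuffles
   the low 128-bit lane as { 1, 0, 3, 2 } but the high lane as { 2, 3, 0, 1 },
   so the immediate form cannot express it; the variable form below can,
   using the control vector { 1, 0, 3, 2, 2, 3, 0, 1 }.  */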
35611
35612 static bool
35613 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
35614 {
35615 rtx rperm[8], vperm;
35616 unsigned i;
35617
35618 if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
35619 return false;
35620
35621 /* We can only permute within the 128-bit lane. */
35622 for (i = 0; i < 8; ++i)
35623 {
35624 unsigned e = d->perm[i];
35625 if (i < 4 ? e >= 4 : e < 4)
35626 return false;
35627 }
35628
35629 if (d->testing_p)
35630 return true;
35631
35632 for (i = 0; i < 8; ++i)
35633 {
35634 unsigned e = d->perm[i];
35635
35636 /* Within each 128-bit lane, the elements of op0 are numbered
35637 from 0 and the elements of op1 are numbered from 4. */
35638 if (e >= 8 + 4)
35639 e -= 8;
35640 else if (e >= 4)
35641 e -= 4;
35642
35643 rperm[i] = GEN_INT (e);
35644 }
35645
35646 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
35647 vperm = force_reg (V8SImode, vperm);
35648 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
35649
35650 return true;
35651 }
35652
35653 /* Return true if permutation D can be performed as VMODE permutation
35654 instead. */
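
/* For example, the V16QImode permutation
     { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 }
   moves aligned groups of four bytes, so it is also valid as the V4SImode
   permutation { 1, 0, 3, 2 }.  */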
35655
35656 static bool
35657 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
35658 {
35659 unsigned int i, j, chunk;
35660
35661 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
35662 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
35663 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
35664 return false;
35665
35666 if (GET_MODE_NUNITS (vmode) >= d->nelt)
35667 return true;
35668
35669 chunk = d->nelt / GET_MODE_NUNITS (vmode);
35670 for (i = 0; i < d->nelt; i += chunk)
35671 if (d->perm[i] & (chunk - 1))
35672 return false;
35673 else
35674 for (j = 1; j < chunk; ++j)
35675 if (d->perm[i] + j != d->perm[i + j])
35676 return false;
35677
35678 return true;
35679 }
35680
35681 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35682 in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128. */
35683
35684 static bool
35685 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
35686 {
35687 unsigned i, nelt, eltsz, mask;
35688 unsigned char perm[32];
35689 enum machine_mode vmode = V16QImode;
35690 rtx rperm[32], vperm, target, op0, op1;
35691
35692 nelt = d->nelt;
35693
35694 if (d->op0 != d->op1)
35695 {
35696 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
35697 {
35698 if (TARGET_AVX2
35699 && valid_perm_using_mode_p (V2TImode, d))
35700 {
35701 if (d->testing_p)
35702 return true;
35703
35704 /* Use vperm2i128 insn. The pattern uses
35705 V4DImode instead of V2TImode. */
35706 target = gen_lowpart (V4DImode, d->target);
35707 op0 = gen_lowpart (V4DImode, d->op0);
35708 op1 = gen_lowpart (V4DImode, d->op1);
35709 rperm[0]
35710 = GEN_INT ((d->perm[0] / (nelt / 2))
35711 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
35712 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
35713 return true;
35714 }
35715 return false;
35716 }
35717 }
35718 else
35719 {
35720 if (GET_MODE_SIZE (d->vmode) == 16)
35721 {
35722 if (!TARGET_SSSE3)
35723 return false;
35724 }
35725 else if (GET_MODE_SIZE (d->vmode) == 32)
35726 {
35727 if (!TARGET_AVX2)
35728 return false;
35729
35730 /* V4DImode should be already handled through
35731 expand_vselect by vpermq instruction. */
35732 gcc_assert (d->vmode != V4DImode);
35733
35734 vmode = V32QImode;
35735 if (d->vmode == V8SImode
35736 || d->vmode == V16HImode
35737 || d->vmode == V32QImode)
35738 {
35739 /* First see if vpermq can be used for
35740 V8SImode/V16HImode/V32QImode. */
35741 if (valid_perm_using_mode_p (V4DImode, d))
35742 {
35743 for (i = 0; i < 4; i++)
35744 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
35745 if (d->testing_p)
35746 return true;
35747 return expand_vselect (gen_lowpart (V4DImode, d->target),
35748 gen_lowpart (V4DImode, d->op0),
35749 perm, 4);
35750 }
35751
35752 /* Next see if vpermd can be used. */
35753 if (valid_perm_using_mode_p (V8SImode, d))
35754 vmode = V8SImode;
35755 }
35756
35757 if (vmode == V32QImode)
35758 {
35759 /* vpshufb only works intra lanes; it is not
35760 possible to shuffle bytes between the lanes.  */
35761 for (i = 0; i < nelt; ++i)
35762 if ((d->perm[i] ^ i) & (nelt / 2))
35763 return false;
35764 }
35765 }
35766 else
35767 return false;
35768 }
35769
35770 if (d->testing_p)
35771 return true;
35772
35773 if (vmode == V8SImode)
35774 for (i = 0; i < 8; ++i)
35775 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
35776 else
35777 {
35778 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35779 if (d->op0 != d->op1)
35780 mask = 2 * nelt - 1;
35781 else if (vmode == V16QImode)
35782 mask = nelt - 1;
35783 else
35784 mask = nelt / 2 - 1;
35785
35786 for (i = 0; i < nelt; ++i)
35787 {
35788 unsigned j, e = d->perm[i] & mask;
35789 for (j = 0; j < eltsz; ++j)
35790 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
35791 }
35792 }
35793
35794 vperm = gen_rtx_CONST_VECTOR (vmode,
35795 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
35796 vperm = force_reg (vmode, vperm);
35797
35798 target = gen_lowpart (vmode, d->target);
35799 op0 = gen_lowpart (vmode, d->op0);
35800 if (d->op0 == d->op1)
35801 {
35802 if (vmode == V16QImode)
35803 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
35804 else if (vmode == V32QImode)
35805 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
35806 else
35807 emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
35808 }
35809 else
35810 {
35811 op1 = gen_lowpart (vmode, d->op1);
35812 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
35813 }
35814
35815 return true;
35816 }
35817
35818 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
35819 in a single instruction. */
35820
35821 static bool
35822 expand_vec_perm_1 (struct expand_vec_perm_d *d)
35823 {
35824 unsigned i, nelt = d->nelt;
35825 unsigned char perm2[MAX_VECT_LEN];
35826
35827 /* Check plain VEC_SELECT first, because AVX has instructions that could
35828 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
35829 input where SEL+CONCAT may not. */
35830 if (d->op0 == d->op1)
35831 {
35832 int mask = nelt - 1;
35833 bool identity_perm = true;
35834 bool broadcast_perm = true;
35835
35836 for (i = 0; i < nelt; i++)
35837 {
35838 perm2[i] = d->perm[i] & mask;
35839 if (perm2[i] != i)
35840 identity_perm = false;
35841 if (perm2[i])
35842 broadcast_perm = false;
35843 }
35844
35845 if (identity_perm)
35846 {
35847 if (!d->testing_p)
35848 emit_move_insn (d->target, d->op0);
35849 return true;
35850 }
35851 else if (broadcast_perm && TARGET_AVX2)
35852 {
35853 /* Use vpbroadcast{b,w,d}. */
35854 rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
35855 switch (d->vmode)
35856 {
35857 case V32QImode:
35858 op = gen_lowpart (V16QImode, op);
35859 gen = gen_avx2_pbroadcastv32qi;
35860 break;
35861 case V16HImode:
35862 op = gen_lowpart (V8HImode, op);
35863 gen = gen_avx2_pbroadcastv16hi;
35864 break;
35865 case V8SImode:
35866 op = gen_lowpart (V4SImode, op);
35867 gen = gen_avx2_pbroadcastv8si;
35868 break;
35869 case V16QImode:
35870 gen = gen_avx2_pbroadcastv16qi;
35871 break;
35872 case V8HImode:
35873 gen = gen_avx2_pbroadcastv8hi;
35874 break;
35875 /* For other modes prefer other shuffles this function creates. */
35876 default: break;
35877 }
35878 if (gen != NULL)
35879 {
35880 if (!d->testing_p)
35881 emit_insn (gen (d->target, op));
35882 return true;
35883 }
35884 }
35885
35886 if (expand_vselect (d->target, d->op0, perm2, nelt))
35887 return true;
35888
35889 /* There are plenty of patterns in sse.md that are written for
35890 SEL+CONCAT and are not replicated for a single op. Perhaps
35891 that should be changed, to avoid the nastiness here. */
35892
35893 /* Recognize interleave style patterns, which means incrementing
35894 every other permutation operand. */
35895 for (i = 0; i < nelt; i += 2)
35896 {
35897 perm2[i] = d->perm[i] & mask;
35898 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
35899 }
35900 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35901 return true;
35902
35903 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
35904 if (nelt >= 4)
35905 {
35906 for (i = 0; i < nelt; i += 4)
35907 {
35908 perm2[i + 0] = d->perm[i + 0] & mask;
35909 perm2[i + 1] = d->perm[i + 1] & mask;
35910 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
35911 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
35912 }
35913
35914 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35915 return true;
35916 }
35917 }
35918
35919 /* Finally, try the fully general two operand permute. */
35920 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
35921 return true;
35922
35923 /* Recognize interleave style patterns with reversed operands. */
35924 if (d->op0 != d->op1)
35925 {
35926 for (i = 0; i < nelt; ++i)
35927 {
35928 unsigned e = d->perm[i];
35929 if (e >= nelt)
35930 e -= nelt;
35931 else
35932 e += nelt;
35933 perm2[i] = e;
35934 }
35935
35936 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
35937 return true;
35938 }
35939
35940 /* Try the SSE4.1 blend variable merge instructions. */
35941 if (expand_vec_perm_blend (d))
35942 return true;
35943
35944 /* Try one of the AVX vpermil variable permutations. */
35945 if (expand_vec_perm_vpermil (d))
35946 return true;
35947
35948 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
35949 vpshufb, vpermd or vpermq variable permutation. */
35950 if (expand_vec_perm_pshufb (d))
35951 return true;
35952
35953 return false;
35954 }
35955
35956 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35957 in terms of a pair of pshuflw + pshufhw instructions. */
35958
35959 static bool
35960 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
35961 {
35962 unsigned char perm2[MAX_VECT_LEN];
35963 unsigned i;
35964 bool ok;
35965
35966 if (d->vmode != V8HImode || d->op0 != d->op1)
35967 return false;
35968
35969 /* The two permutations only operate in 64-bit lanes. */
35970 for (i = 0; i < 4; ++i)
35971 if (d->perm[i] >= 4)
35972 return false;
35973 for (i = 4; i < 8; ++i)
35974 if (d->perm[i] < 4)
35975 return false;
35976
35977 if (d->testing_p)
35978 return true;
35979
35980 /* Emit the pshuflw. */
35981 memcpy (perm2, d->perm, 4);
35982 for (i = 4; i < 8; ++i)
35983 perm2[i] = i;
35984 ok = expand_vselect (d->target, d->op0, perm2, 8);
35985 gcc_assert (ok);
35986
35987 /* Emit the pshufhw. */
35988 memcpy (perm2 + 4, d->perm + 4, 4);
35989 for (i = 0; i < 4; ++i)
35990 perm2[i] = i;
35991 ok = expand_vselect (d->target, d->target, perm2, 8);
35992 gcc_assert (ok);
35993
35994 return true;
35995 }
35996
35997 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
35998 the permutation using the SSSE3 palignr instruction. This succeeds
35999 when all of the elements in PERM fit within one vector and we merely
36000 need to shift them down so that a single vector permutation has a
36001 chance to succeed. */
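
/* For example, with V4SImode and d->perm == { 3, 4, 5, 6 } (min == 3), a
   palignr that shifts the concatenation of the two inputs down by three
   elements leaves those four elements at the bottom of the destination,
   and the remaining permutation becomes the identity { 0, 1, 2, 3 }.  */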
36002
36003 static bool
36004 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
36005 {
36006 unsigned i, nelt = d->nelt;
36007 unsigned min, max;
36008 bool in_order, ok;
36009 rtx shift;
36010
36011 /* Even with AVX, palignr only operates on 128-bit vectors. */
36012 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36013 return false;
36014
36015 min = nelt, max = 0;
36016 for (i = 0; i < nelt; ++i)
36017 {
36018 unsigned e = d->perm[i];
36019 if (e < min)
36020 min = e;
36021 if (e > max)
36022 max = e;
36023 }
36024 if (min == 0 || max - min >= nelt)
36025 return false;
36026
36027 /* Given that we have SSSE3, we know we'll be able to implement the
36028 single operand permutation after the palignr with pshufb. */
36029 if (d->testing_p)
36030 return true;
36031
36032 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
36033 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
36034 gen_lowpart (TImode, d->op1),
36035 gen_lowpart (TImode, d->op0), shift));
36036
36037 d->op0 = d->op1 = d->target;
36038
36039 in_order = true;
36040 for (i = 0; i < nelt; ++i)
36041 {
36042 unsigned e = d->perm[i] - min;
36043 if (e != i)
36044 in_order = false;
36045 d->perm[i] = e;
36046 }
36047
36048 /* Test for the degenerate case where the alignment by itself
36049 produces the desired permutation. */
36050 if (in_order)
36051 return true;
36052
36053 ok = expand_vec_perm_1 (d);
36054 gcc_assert (ok);
36055
36056 return ok;
36057 }
36058
36059 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
36060
36061 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36062 a two vector permutation into a single vector permutation by using
36063 an interleave operation to merge the vectors. */
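
/* For instance, the two operand V4SImode permutation { 1, 5, 0, 4 } draws
   only from the low halves of both inputs, so it can be rewritten as a
   punpckldq (giving { 0, 4, 1, 5 } in a temporary) followed by a single
   operand { 2, 3, 0, 1 } shuffle such as pshufd.  */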
36064
36065 static bool
36066 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
36067 {
36068 struct expand_vec_perm_d dremap, dfinal;
36069 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
36070 unsigned HOST_WIDE_INT contents;
36071 unsigned char remap[2 * MAX_VECT_LEN];
36072 rtx seq;
36073 bool ok, same_halves = false;
36074
36075 if (GET_MODE_SIZE (d->vmode) == 16)
36076 {
36077 if (d->op0 == d->op1)
36078 return false;
36079 }
36080 else if (GET_MODE_SIZE (d->vmode) == 32)
36081 {
36082 if (!TARGET_AVX)
36083 return false;
36084 /* For 32-byte modes allow even d->op0 == d->op1.
36085 The lack of cross-lane shuffling in some instructions
36086 might prevent a single insn shuffle. */
36087 dfinal = *d;
36088 dfinal.testing_p = true;
36089 /* If expand_vec_perm_interleave3 can expand this into
36090 a 3 insn sequence, give up and let it be expanded as
36091 a 3 insn sequence.  While that is one insn longer,
36092 it doesn't need a memory operand, and in the common
36093 case where the interleave low and interleave high permutations
36094 with the same operands are adjacent, only 4 insns are
36095 needed for both after CSE.  */
36096 if (expand_vec_perm_interleave3 (&dfinal))
36097 return false;
36098 }
36099 else
36100 return false;
36101
36102 /* Examine from whence the elements come. */
36103 contents = 0;
36104 for (i = 0; i < nelt; ++i)
36105 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
36106
36107 memset (remap, 0xff, sizeof (remap));
36108 dremap = *d;
36109
36110 if (GET_MODE_SIZE (d->vmode) == 16)
36111 {
36112 unsigned HOST_WIDE_INT h1, h2, h3, h4;
36113
36114 /* Split the two input vectors into 4 halves. */
36115 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
36116 h2 = h1 << nelt2;
36117 h3 = h2 << nelt2;
36118 h4 = h3 << nelt2;
36119
36120 /* If the elements are all from the low halves, use interleave low; similarly,
36121 use interleave high for the high halves.  If the elements are from mis-matched
36122 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
36123 if ((contents & (h1 | h3)) == contents)
36124 {
36125 /* punpckl* */
36126 for (i = 0; i < nelt2; ++i)
36127 {
36128 remap[i] = i * 2;
36129 remap[i + nelt] = i * 2 + 1;
36130 dremap.perm[i * 2] = i;
36131 dremap.perm[i * 2 + 1] = i + nelt;
36132 }
36133 if (!TARGET_SSE2 && d->vmode == V4SImode)
36134 dremap.vmode = V4SFmode;
36135 }
36136 else if ((contents & (h2 | h4)) == contents)
36137 {
36138 /* punpckh* */
36139 for (i = 0; i < nelt2; ++i)
36140 {
36141 remap[i + nelt2] = i * 2;
36142 remap[i + nelt + nelt2] = i * 2 + 1;
36143 dremap.perm[i * 2] = i + nelt2;
36144 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
36145 }
36146 if (!TARGET_SSE2 && d->vmode == V4SImode)
36147 dremap.vmode = V4SFmode;
36148 }
36149 else if ((contents & (h1 | h4)) == contents)
36150 {
36151 /* shufps */
36152 for (i = 0; i < nelt2; ++i)
36153 {
36154 remap[i] = i;
36155 remap[i + nelt + nelt2] = i + nelt2;
36156 dremap.perm[i] = i;
36157 dremap.perm[i + nelt2] = i + nelt + nelt2;
36158 }
36159 if (nelt != 4)
36160 {
36161 /* shufpd */
36162 dremap.vmode = V2DImode;
36163 dremap.nelt = 2;
36164 dremap.perm[0] = 0;
36165 dremap.perm[1] = 3;
36166 }
36167 }
36168 else if ((contents & (h2 | h3)) == contents)
36169 {
36170 /* shufps */
36171 for (i = 0; i < nelt2; ++i)
36172 {
36173 remap[i + nelt2] = i;
36174 remap[i + nelt] = i + nelt2;
36175 dremap.perm[i] = i + nelt2;
36176 dremap.perm[i + nelt2] = i + nelt;
36177 }
36178 if (nelt != 4)
36179 {
36180 /* shufpd */
36181 dremap.vmode = V2DImode;
36182 dremap.nelt = 2;
36183 dremap.perm[0] = 1;
36184 dremap.perm[1] = 2;
36185 }
36186 }
36187 else
36188 return false;
36189 }
36190 else
36191 {
36192 unsigned int nelt4 = nelt / 4, nzcnt = 0;
36193 unsigned HOST_WIDE_INT q[8];
36194 unsigned int nonzero_halves[4];
36195
36196 /* Split the two input vectors into 8 quarters. */
36197 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
36198 for (i = 1; i < 8; ++i)
36199 q[i] = q[0] << (nelt4 * i);
36200 for (i = 0; i < 4; ++i)
36201 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
36202 {
36203 nonzero_halves[nzcnt] = i;
36204 ++nzcnt;
36205 }
36206
36207 if (nzcnt == 1)
36208 {
36209 gcc_assert (d->op0 == d->op1);
36210 nonzero_halves[1] = nonzero_halves[0];
36211 same_halves = true;
36212 }
36213 else if (d->op0 == d->op1)
36214 {
36215 gcc_assert (nonzero_halves[0] == 0);
36216 gcc_assert (nonzero_halves[1] == 1);
36217 }
36218
36219 if (nzcnt <= 2)
36220 {
36221 if (d->perm[0] / nelt2 == nonzero_halves[1])
36222 {
36223 /* Attempt to increase the likelihood that the dfinal
36224 shuffle will be intra-lane.  */
36225 char tmph = nonzero_halves[0];
36226 nonzero_halves[0] = nonzero_halves[1];
36227 nonzero_halves[1] = tmph;
36228 }
36229
36230 /* vperm2f128 or vperm2i128. */
36231 for (i = 0; i < nelt2; ++i)
36232 {
36233 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
36234 remap[i + nonzero_halves[0] * nelt2] = i;
36235 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
36236 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
36237 }
36238
36239 if (d->vmode != V8SFmode
36240 && d->vmode != V4DFmode
36241 && d->vmode != V8SImode)
36242 {
36243 dremap.vmode = V8SImode;
36244 dremap.nelt = 8;
36245 for (i = 0; i < 4; ++i)
36246 {
36247 dremap.perm[i] = i + nonzero_halves[0] * 4;
36248 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
36249 }
36250 }
36251 }
36252 else if (d->op0 == d->op1)
36253 return false;
36254 else if (TARGET_AVX2
36255 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
36256 {
36257 /* vpunpckl* */
36258 for (i = 0; i < nelt4; ++i)
36259 {
36260 remap[i] = i * 2;
36261 remap[i + nelt] = i * 2 + 1;
36262 remap[i + nelt2] = i * 2 + nelt2;
36263 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
36264 dremap.perm[i * 2] = i;
36265 dremap.perm[i * 2 + 1] = i + nelt;
36266 dremap.perm[i * 2 + nelt2] = i + nelt2;
36267 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
36268 }
36269 }
36270 else if (TARGET_AVX2
36271 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
36272 {
36273 /* vpunpckh* */
36274 for (i = 0; i < nelt4; ++i)
36275 {
36276 remap[i + nelt4] = i * 2;
36277 remap[i + nelt + nelt4] = i * 2 + 1;
36278 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
36279 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
36280 dremap.perm[i * 2] = i + nelt4;
36281 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
36282 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
36283 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
36284 }
36285 }
36286 else
36287 return false;
36288 }
36289
36290 /* Use the remapping array set up above to move the elements from their
36291 swizzled locations into their final destinations. */
36292 dfinal = *d;
36293 for (i = 0; i < nelt; ++i)
36294 {
36295 unsigned e = remap[d->perm[i]];
36296 gcc_assert (e < nelt);
36297 /* If same_halves is true, both halves of the remapped vector are the
36298 same. Avoid cross-lane accesses if possible. */
36299 if (same_halves && i >= nelt2)
36300 {
36301 gcc_assert (e < nelt2);
36302 dfinal.perm[i] = e + nelt2;
36303 }
36304 else
36305 dfinal.perm[i] = e;
36306 }
36307 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
36308 dfinal.op1 = dfinal.op0;
36309 dremap.target = dfinal.op0;
36310
36311 /* Test if the final remap can be done with a single insn. For V4SFmode or
36312 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
36313 start_sequence ();
36314 ok = expand_vec_perm_1 (&dfinal);
36315 seq = get_insns ();
36316 end_sequence ();
36317
36318 if (!ok)
36319 return false;
36320
36321 if (d->testing_p)
36322 return true;
36323
36324 if (dremap.vmode != dfinal.vmode)
36325 {
36326 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
36327 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
36328 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
36329 }
36330
36331 ok = expand_vec_perm_1 (&dremap);
36332 gcc_assert (ok);
36333
36334 emit_insn (seq);
36335 return true;
36336 }
36337
36338 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36339 a single vector cross-lane permutation into vpermq followed
36340 by any of the single insn permutations. */
36341
36342 static bool
36343 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
36344 {
36345 struct expand_vec_perm_d dremap, dfinal;
36346 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
36347 unsigned contents[2];
36348 bool ok;
36349
36350 if (!(TARGET_AVX2
36351 && (d->vmode == V32QImode || d->vmode == V16HImode)
36352 && d->op0 == d->op1))
36353 return false;
36354
36355 contents[0] = 0;
36356 contents[1] = 0;
36357 for (i = 0; i < nelt2; ++i)
36358 {
36359 contents[0] |= 1u << (d->perm[i] / nelt4);
36360 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
36361 }
36362
36363 for (i = 0; i < 2; ++i)
36364 {
36365 unsigned int cnt = 0;
36366 for (j = 0; j < 4; ++j)
36367 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
36368 return false;
36369 }
36370
36371 if (d->testing_p)
36372 return true;
36373
36374 dremap = *d;
36375 dremap.vmode = V4DImode;
36376 dremap.nelt = 4;
36377 dremap.target = gen_reg_rtx (V4DImode);
36378 dremap.op0 = gen_lowpart (V4DImode, d->op0);
36379 dremap.op1 = dremap.op0;
36380 for (i = 0; i < 2; ++i)
36381 {
36382 unsigned int cnt = 0;
36383 for (j = 0; j < 4; ++j)
36384 if ((contents[i] & (1u << j)) != 0)
36385 dremap.perm[2 * i + cnt++] = j;
36386 for (; cnt < 2; ++cnt)
36387 dremap.perm[2 * i + cnt] = 0;
36388 }
36389
36390 dfinal = *d;
36391 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
36392 dfinal.op1 = dfinal.op0;
36393 for (i = 0, j = 0; i < nelt; ++i)
36394 {
36395 if (i == nelt2)
36396 j = 2;
36397 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
36398 if ((d->perm[i] / nelt4) == dremap.perm[j])
36399 ;
36400 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
36401 dfinal.perm[i] |= nelt4;
36402 else
36403 gcc_unreachable ();
36404 }
36405
36406 ok = expand_vec_perm_1 (&dremap);
36407 gcc_assert (ok);
36408
36409 ok = expand_vec_perm_1 (&dfinal);
36410 gcc_assert (ok);
36411
36412 return true;
36413 }
36414
36415 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36416 a two vector permutation using 2 intra-lane interleave insns
36417 and cross-lane shuffle for 32-byte vectors. */
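
/* The permutations handled here are whole-vector interleaves, e.g. for
   V8SImode the interleave-low { 0, 8, 1, 9, 2, 10, 3, 11 } and the
   interleave-high { 4, 12, 5, 13, 6, 14, 7, 15 }.  */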
36418
36419 static bool
36420 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
36421 {
36422 unsigned i, nelt;
36423 rtx (*gen) (rtx, rtx, rtx);
36424
36425 if (d->op0 == d->op1)
36426 return false;
36427 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
36428 ;
36429 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
36430 ;
36431 else
36432 return false;
36433
36434 nelt = d->nelt;
36435 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
36436 return false;
36437 for (i = 0; i < nelt; i += 2)
36438 if (d->perm[i] != d->perm[0] + i / 2
36439 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
36440 return false;
36441
36442 if (d->testing_p)
36443 return true;
36444
36445 switch (d->vmode)
36446 {
36447 case V32QImode:
36448 if (d->perm[0])
36449 gen = gen_vec_interleave_highv32qi;
36450 else
36451 gen = gen_vec_interleave_lowv32qi;
36452 break;
36453 case V16HImode:
36454 if (d->perm[0])
36455 gen = gen_vec_interleave_highv16hi;
36456 else
36457 gen = gen_vec_interleave_lowv16hi;
36458 break;
36459 case V8SImode:
36460 if (d->perm[0])
36461 gen = gen_vec_interleave_highv8si;
36462 else
36463 gen = gen_vec_interleave_lowv8si;
36464 break;
36465 case V4DImode:
36466 if (d->perm[0])
36467 gen = gen_vec_interleave_highv4di;
36468 else
36469 gen = gen_vec_interleave_lowv4di;
36470 break;
36471 case V8SFmode:
36472 if (d->perm[0])
36473 gen = gen_vec_interleave_highv8sf;
36474 else
36475 gen = gen_vec_interleave_lowv8sf;
36476 break;
36477 case V4DFmode:
36478 if (d->perm[0])
36479 gen = gen_vec_interleave_highv4df;
36480 else
36481 gen = gen_vec_interleave_lowv4df;
36482 break;
36483 default:
36484 gcc_unreachable ();
36485 }
36486
36487 emit_insn (gen (d->target, d->op0, d->op1));
36488 return true;
36489 }
36490
36491 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
36492 permutation with two pshufb insns and an ior. We should have already
36493 failed all two instruction sequences. */
36494
36495 static bool
36496 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
36497 {
36498 rtx rperm[2][16], vperm, l, h, op, m128;
36499 unsigned int i, nelt, eltsz;
36500
36501 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36502 return false;
36503 gcc_assert (d->op0 != d->op1);
36504
36505 nelt = d->nelt;
36506 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36507
36508 /* Generate two permutation masks. If the required element is within
36509 the given vector it is shuffled into the proper lane. If the required
36510 element is in the other vector, force a zero into the lane by setting
36511 bit 7 in the permutation mask. */
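
/* E.g. for an even-element extract from two V8HImode operands
   (d->perm == { 0, 2, 4, 6, 8, 10, 12, 14 }), the mask applied to op0 is
     { 0, 1, 4, 5, 8, 9, 12, 13, -128, ..., -128 }
   and the mask applied to op1 is
     { -128, ..., -128, 0, 1, 4, 5, 8, 9, 12, 13 },
   so the two pshufb results can simply be or-ed together.  */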
36512 m128 = GEN_INT (-128);
36513 for (i = 0; i < nelt; ++i)
36514 {
36515 unsigned j, e = d->perm[i];
36516 unsigned which = (e >= nelt);
36517 if (e >= nelt)
36518 e -= nelt;
36519
36520 for (j = 0; j < eltsz; ++j)
36521 {
36522 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
36523 rperm[1-which][i*eltsz + j] = m128;
36524 }
36525 }
36526
36527 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
36528 vperm = force_reg (V16QImode, vperm);
36529
36530 l = gen_reg_rtx (V16QImode);
36531 op = gen_lowpart (V16QImode, d->op0);
36532 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
36533
36534 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
36535 vperm = force_reg (V16QImode, vperm);
36536
36537 h = gen_reg_rtx (V16QImode);
36538 op = gen_lowpart (V16QImode, d->op1);
36539 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
36540
36541 op = gen_lowpart (V16QImode, d->target);
36542 emit_insn (gen_iorv16qi3 (op, l, h));
36543
36544 return true;
36545 }
36546
36547 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
36548 with two vpshufb insns, vpermq and vpor. We should have already failed
36549 all two or three instruction sequences. */
36550
36551 static bool
36552 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
36553 {
36554 rtx rperm[2][32], vperm, l, h, hp, op, m128;
36555 unsigned int i, nelt, eltsz;
36556
36557 if (!TARGET_AVX2
36558 || d->op0 != d->op1
36559 || (d->vmode != V32QImode && d->vmode != V16HImode))
36560 return false;
36561
36562 if (d->testing_p)
36563 return true;
36564
36565 nelt = d->nelt;
36566 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36567
36568 /* Generate two permutation masks.  If the required element is within
36569 the same lane, it is shuffled in.  If the required element is from the
36570 other lane, force a zero by setting bit 7 in the permutation mask.
36571 The other mask has non-negative elements where an element
36572 is requested from the other lane, but it is also moved to the other lane,
36573 so that the result of vpshufb can have the two V2TImode halves
36574 swapped.  */
36575 m128 = GEN_INT (-128);
36576 for (i = 0; i < nelt; ++i)
36577 {
36578 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36579 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36580
36581 for (j = 0; j < eltsz; ++j)
36582 {
36583 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
36584 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
36585 }
36586 }
36587
36588 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36589 vperm = force_reg (V32QImode, vperm);
36590
36591 h = gen_reg_rtx (V32QImode);
36592 op = gen_lowpart (V32QImode, d->op0);
36593 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36594
36595 /* Swap the 128-bit lanes of h into hp.  */
36596 hp = gen_reg_rtx (V4DImode);
36597 op = gen_lowpart (V4DImode, h);
36598 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
36599 const1_rtx));
36600
36601 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36602 vperm = force_reg (V32QImode, vperm);
36603
36604 l = gen_reg_rtx (V32QImode);
36605 op = gen_lowpart (V32QImode, d->op0);
36606 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36607
36608 op = gen_lowpart (V32QImode, d->target);
36609 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
36610
36611 return true;
36612 }
36613
36614 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
36615 and extract-odd permutations of two V32QImode or V16HImode operands
36616 with two vpshufb insns, vpor and vpermq. We should have already
36617 failed all two or three instruction sequences. */
36618
36619 static bool
36620 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
36621 {
36622 rtx rperm[2][32], vperm, l, h, ior, op, m128;
36623 unsigned int i, nelt, eltsz;
36624
36625 if (!TARGET_AVX2
36626 || d->op0 == d->op1
36627 || (d->vmode != V32QImode && d->vmode != V16HImode))
36628 return false;
36629
36630 for (i = 0; i < d->nelt; ++i)
36631 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
36632 return false;
36633
36634 if (d->testing_p)
36635 return true;
36636
36637 nelt = d->nelt;
36638 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36639
36640 /* Generate two permutation masks. In the first permutation mask
36641 the first quarter will contain indexes for the first half
36642 of the op0, the second quarter will contain bit 7 set, third quarter
36643 will contain indexes for the second half of the op0 and the
36644 last quarter bit 7 set. In the second permutation mask
36645 the first quarter will contain bit 7 set, the second quarter
36646 indexes for the first half of the op1, the third quarter bit 7 set
36647 and last quarter indexes for the second half of the op1.
36648 I.e. the first mask e.g. for V32QImode extract even will be:
36649 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
36650 (all values masked with 0xf except for -128) and second mask
36651 for extract even will be
36652 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
36653 m128 = GEN_INT (-128);
36654 for (i = 0; i < nelt; ++i)
36655 {
36656 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36657 unsigned which = d->perm[i] >= nelt;
36658 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
36659
36660 for (j = 0; j < eltsz; ++j)
36661 {
36662 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
36663 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
36664 }
36665 }
36666
36667 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36668 vperm = force_reg (V32QImode, vperm);
36669
36670 l = gen_reg_rtx (V32QImode);
36671 op = gen_lowpart (V32QImode, d->op0);
36672 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36673
36674 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36675 vperm = force_reg (V32QImode, vperm);
36676
36677 h = gen_reg_rtx (V32QImode);
36678 op = gen_lowpart (V32QImode, d->op1);
36679 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36680
36681 ior = gen_reg_rtx (V32QImode);
36682 emit_insn (gen_iorv32qi3 (ior, l, h));
36683
36684 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
36685 op = gen_lowpart (V4DImode, d->target);
36686 ior = gen_lowpart (V4DImode, ior);
36687 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
36688 const1_rtx, GEN_INT (3)));
36689
36690 return true;
36691 }
36692
36693 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
36694 and extract-odd permutations. */
36695
36696 static bool
36697 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
36698 {
36699 rtx t1, t2, t3;
36700
36701 switch (d->vmode)
36702 {
36703 case V4DFmode:
36704 t1 = gen_reg_rtx (V4DFmode);
36705 t2 = gen_reg_rtx (V4DFmode);
36706
36707 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36708 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
36709 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
36710
36711 /* Now an unpck[lh]pd will produce the result required. */
36712 if (odd)
36713 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
36714 else
36715 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
36716 emit_insn (t3);
36717 break;
36718
36719 case V8SFmode:
36720 {
36721 int mask = odd ? 0xdd : 0x88;
36722
36723 t1 = gen_reg_rtx (V8SFmode);
36724 t2 = gen_reg_rtx (V8SFmode);
36725 t3 = gen_reg_rtx (V8SFmode);
36726
36727 /* Shuffle within the 128-bit lanes to produce:
36728 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
36729 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
36730 GEN_INT (mask)));
36731
36732 /* Shuffle the lanes around to produce:
36733 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
36734 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
36735 GEN_INT (0x3)));
36736
36737 /* Shuffle within the 128-bit lanes to produce:
36738 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
36739 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
36740
36741 /* Shuffle within the 128-bit lanes to produce:
36742 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
36743 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
36744
36745 /* Shuffle the lanes around to produce:
36746 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
36747 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
36748 GEN_INT (0x20)));
36749 }
36750 break;
36751
36752 case V2DFmode:
36753 case V4SFmode:
36754 case V2DImode:
36755 case V4SImode:
36756 /* These are always directly implementable by expand_vec_perm_1. */
36757 gcc_unreachable ();
36758
36759 case V8HImode:
36760 if (TARGET_SSSE3)
36761 return expand_vec_perm_pshufb2 (d);
36762 else
36763 {
36764 /* We need 2*log2(N)-1 operations to achieve odd/even
36765 with interleave. */
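/* For V8HImode that is 2*3 - 1 = 5 interleave insns, emitted below.  */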
36766 t1 = gen_reg_rtx (V8HImode);
36767 t2 = gen_reg_rtx (V8HImode);
36768 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
36769 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
36770 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
36771 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
36772 if (odd)
36773 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
36774 else
36775 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
36776 emit_insn (t3);
36777 }
36778 break;
36779
36780 case V16QImode:
36781 if (TARGET_SSSE3)
36782 return expand_vec_perm_pshufb2 (d);
36783 else
36784 {
36785 t1 = gen_reg_rtx (V16QImode);
36786 t2 = gen_reg_rtx (V16QImode);
36787 t3 = gen_reg_rtx (V16QImode);
36788 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
36789 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
36790 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
36791 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
36792 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
36793 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
36794 if (odd)
36795 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
36796 else
36797 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
36798 emit_insn (t3);
36799 }
36800 break;
36801
36802 case V16HImode:
36803 case V32QImode:
36804 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
36805
36806 case V4DImode:
36807 if (!TARGET_AVX2)
36808 {
36809 struct expand_vec_perm_d d_copy = *d;
36810 d_copy.vmode = V4DFmode;
36811 d_copy.target = gen_lowpart (V4DFmode, d->target);
36812 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
36813 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
36814 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36815 }
36816
36817 t1 = gen_reg_rtx (V4DImode);
36818 t2 = gen_reg_rtx (V4DImode);
36819
36820 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36821 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
36822 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
36823
36824 /* Now a vpunpck[lh]qdq will produce the result required.  */
36825 if (odd)
36826 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
36827 else
36828 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
36829 emit_insn (t3);
36830 break;
36831
36832 case V8SImode:
36833 if (!TARGET_AVX2)
36834 {
36835 struct expand_vec_perm_d d_copy = *d;
36836 d_copy.vmode = V8SFmode;
36837 d_copy.target = gen_lowpart (V8SFmode, d->target);
36838 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
36839 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
36840 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36841 }
36842
36843 t1 = gen_reg_rtx (V8SImode);
36844 t2 = gen_reg_rtx (V8SImode);
36845
36846 /* Shuffle the lanes around into
36847 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
36848 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
36849 gen_lowpart (V4DImode, d->op0),
36850 gen_lowpart (V4DImode, d->op1),
36851 GEN_INT (0x20)));
36852 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
36853 gen_lowpart (V4DImode, d->op0),
36854 gen_lowpart (V4DImode, d->op1),
36855 GEN_INT (0x31)));
36856
36857 /* Swap the 2nd and 3rd position in each lane into
36858 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
36859 emit_insn (gen_avx2_pshufdv3 (t1, t1,
36860 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36861 emit_insn (gen_avx2_pshufdv3 (t2, t2,
36862 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36863
36864 /* Now a vpunpck[lh]qdq will produce
36865 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
36866 if (odd)
36867 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
36868 gen_lowpart (V4DImode, t1),
36869 gen_lowpart (V4DImode, t2));
36870 else
36871 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
36872 gen_lowpart (V4DImode, t1),
36873 gen_lowpart (V4DImode, t2));
36874 emit_insn (t3);
36875 break;
36876
36877 default:
36878 gcc_unreachable ();
36879 }
36880
36881 return true;
36882 }
36883
36884 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
36885 extract-even and extract-odd permutations. */
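   /* For example, with a 4-element vector the selectors matched here are
      { 0 2 4 6 } (extract-even) and { 1 3 5 7 } (extract-odd), indexing
      into the concatenation of the two input operands.  */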
36886
36887 static bool
36888 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
36889 {
36890 unsigned i, odd, nelt = d->nelt;
36891
36892 odd = d->perm[0];
36893 if (odd != 0 && odd != 1)
36894 return false;
36895
36896 for (i = 1; i < nelt; ++i)
36897 if (d->perm[i] != 2 * i + odd)
36898 return false;
36899
36900 return expand_vec_perm_even_odd_1 (d, odd);
36901 }
36902
36903 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
36904 permutations. We assume that expand_vec_perm_1 has already failed. */
36905
36906 static bool
36907 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
36908 {
36909 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
36910 enum machine_mode vmode = d->vmode;
36911 unsigned char perm2[4];
36912 rtx op0 = d->op0;
36913 bool ok;
36914
36915 switch (vmode)
36916 {
36917 case V4DFmode:
36918 case V8SFmode:
36919 /* These are special-cased in sse.md so that we can optionally
36920 use the vbroadcast instruction. They expand to two insns
36921 if the input happens to be in a register. */
36922 gcc_unreachable ();
36923
36924 case V2DFmode:
36925 case V2DImode:
36926 case V4SFmode:
36927 case V4SImode:
36928 /* These are always implementable using standard shuffle patterns. */
36929 gcc_unreachable ();
36930
36931 case V8HImode:
36932 case V16QImode:
36933 /* These can be implemented via interleave. We save one insn by
36934 stopping once we have promoted to V4SImode and then use pshufd. */
36935 do
36936 {
36937 rtx dest;
36938 rtx (*gen) (rtx, rtx, rtx)
36939 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
36940 : gen_vec_interleave_lowv8hi;
36941
36942 if (elt >= nelt2)
36943 {
36944 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
36945 : gen_vec_interleave_highv8hi;
36946 elt -= nelt2;
36947 }
36948 nelt2 /= 2;
36949
36950 dest = gen_reg_rtx (vmode);
36951 emit_insn (gen (dest, op0, op0));
36952 vmode = get_mode_wider_vector (vmode);
36953 op0 = gen_lowpart (vmode, dest);
36954 }
36955 while (vmode != V4SImode);
36956
36957 memset (perm2, elt, 4);
36958 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
36959 gcc_assert (ok);
36960 return true;
36961
36962 case V32QImode:
36963 case V16HImode:
36964 case V8SImode:
36965 case V4DImode:
36966 /* For AVX2, broadcasts of the first element should have been
36967 handled by expand_vec_perm_1 via vpbroadcast* or vpermq. */
36968 gcc_assert (!TARGET_AVX2 || d->perm[0]);
36969 return false;
36970
36971 default:
36972 gcc_unreachable ();
36973 }
36974 }
36975
36976 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
36977 broadcast permutations. */
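   /* For example, the selector { 3 3 3 3 } on a 4-element vector
      replicates element 3 of the single input operand into every
      element of the result.  */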
36978
36979 static bool
36980 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
36981 {
36982 unsigned i, elt, nelt = d->nelt;
36983
36984 if (d->op0 != d->op1)
36985 return false;
36986
36987 elt = d->perm[0];
36988 for (i = 1; i < nelt; ++i)
36989 if (d->perm[i] != elt)
36990 return false;
36991
36992 return expand_vec_perm_broadcast_1 (d);
36993 }
36994
36995 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
36996 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
36997 all the shorter instruction sequences. */
36998
36999 static bool
37000 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
37001 {
37002 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
37003 unsigned int i, nelt, eltsz;
37004 bool used[4];
37005
37006 if (!TARGET_AVX2
37007 || d->op0 == d->op1
37008 || (d->vmode != V32QImode && d->vmode != V16HImode))
37009 return false;
37010
37011 if (d->testing_p)
37012 return true;
37013
37014 nelt = d->nelt;
37015 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37016
37017 /* Generate 4 permutation masks. If the required element is within
37018 the same lane, it is shuffled in. If the required element is from the
37019 other lane, force a zero by setting bit 7 in the permutation mask.
37020 In the other mask, the elements are non-negative whenever an element
37021 is requested from the other lane, and they are also moved to the other
37022 lane, so that the result of vpshufb can have the two V2TImode halves
37023 swapped. */
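     /* Illustration for V32QImode (eltsz == 1): if d->perm[i] selects byte
	E of an operand from the opposite 128-bit lane, the index E is placed
	at position i ^ 16 of the cross-lane mask, so the vpshufb result ends
	up in the wrong half and the vpermq lane swap below moves it into
	place; unused mask bytes stay at -128, which makes vpshufb write
	zero there.  */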
37024 m128 = GEN_INT (-128);
37025 for (i = 0; i < 32; ++i)
37026 {
37027 rperm[0][i] = m128;
37028 rperm[1][i] = m128;
37029 rperm[2][i] = m128;
37030 rperm[3][i] = m128;
37031 }
37032 used[0] = false;
37033 used[1] = false;
37034 used[2] = false;
37035 used[3] = false;
37036 for (i = 0; i < nelt; ++i)
37037 {
37038 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37039 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37040 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
37041
37042 for (j = 0; j < eltsz; ++j)
37043 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
37044 used[which] = true;
37045 }
37046
37047 for (i = 0; i < 2; ++i)
37048 {
37049 if (!used[2 * i + 1])
37050 {
37051 h[i] = NULL_RTX;
37052 continue;
37053 }
37054 vperm = gen_rtx_CONST_VECTOR (V32QImode,
37055 gen_rtvec_v (32, rperm[2 * i + 1]));
37056 vperm = force_reg (V32QImode, vperm);
37057 h[i] = gen_reg_rtx (V32QImode);
37058 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37059 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
37060 }
37061
37062 /* Swap the 128-bit lanes of h[X]. */
37063 for (i = 0; i < 2; ++i)
37064 {
37065 if (h[i] == NULL_RTX)
37066 continue;
37067 op = gen_reg_rtx (V4DImode);
37068 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
37069 const2_rtx, GEN_INT (3), const0_rtx,
37070 const1_rtx));
37071 h[i] = gen_lowpart (V32QImode, op);
37072 }
37073
37074 for (i = 0; i < 2; ++i)
37075 {
37076 if (!used[2 * i])
37077 {
37078 l[i] = NULL_RTX;
37079 continue;
37080 }
37081 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
37082 vperm = force_reg (V32QImode, vperm);
37083 l[i] = gen_reg_rtx (V32QImode);
37084 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37085 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
37086 }
37087
37088 for (i = 0; i < 2; ++i)
37089 {
37090 if (h[i] && l[i])
37091 {
37092 op = gen_reg_rtx (V32QImode);
37093 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
37094 l[i] = op;
37095 }
37096 else if (h[i])
37097 l[i] = h[i];
37098 }
37099
37100 gcc_assert (l[0] && l[1]);
37101 op = gen_lowpart (V32QImode, d->target);
37102 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
37103 return true;
37104 }
37105
37106 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
37107 With all of the interface bits taken care of, perform the expansion
37108 in D and return true on success. */
37109
37110 static bool
37111 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
37112 {
37113 /* Try a single instruction expansion. */
37114 if (expand_vec_perm_1 (d))
37115 return true;
37116
37117 /* Try sequences of two instructions. */
37118
37119 if (expand_vec_perm_pshuflw_pshufhw (d))
37120 return true;
37121
37122 if (expand_vec_perm_palignr (d))
37123 return true;
37124
37125 if (expand_vec_perm_interleave2 (d))
37126 return true;
37127
37128 if (expand_vec_perm_broadcast (d))
37129 return true;
37130
37131 if (expand_vec_perm_vpermq_perm_1 (d))
37132 return true;
37133
37134 /* Try sequences of three instructions. */
37135
37136 if (expand_vec_perm_pshufb2 (d))
37137 return true;
37138
37139 if (expand_vec_perm_interleave3 (d))
37140 return true;
37141
37142 /* Try sequences of four instructions. */
37143
37144 if (expand_vec_perm_vpshufb2_vpermq (d))
37145 return true;
37146
37147 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
37148 return true;
37149
37150 /* ??? Look for narrow permutations whose element orderings would
37151 allow the promotion to a wider mode. */
37152
37153 /* ??? Look for sequences of interleave or a wider permute that place
37154 the data into the correct lanes for a half-vector shuffle like
37155 pshuf[lh]w or vpermilps. */
37156
37157 /* ??? Look for sequences of interleave that produce the desired results.
37158 The combinatorics of punpck[lh] get pretty ugly... */
37159
37160 if (expand_vec_perm_even_odd (d))
37161 return true;
37162
37163 /* Even longer sequences. */
37164 if (expand_vec_perm_vpshufb4_vpermq2 (d))
37165 return true;
37166
37167 return false;
37168 }
37169
37170 bool
37171 ix86_expand_vec_perm_const (rtx operands[4])
37172 {
37173 struct expand_vec_perm_d d;
37174 unsigned char perm[MAX_VECT_LEN];
37175 int i, nelt, which;
37176 rtx sel;
37177
37178 d.target = operands[0];
37179 d.op0 = operands[1];
37180 d.op1 = operands[2];
37181 sel = operands[3];
37182
37183 d.vmode = GET_MODE (d.target);
37184 gcc_assert (VECTOR_MODE_P (d.vmode));
37185 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37186 d.testing_p = false;
37187
37188 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
37189 gcc_assert (XVECLEN (sel, 0) == nelt);
37190 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
37191
37192 for (i = which = 0; i < nelt; ++i)
37193 {
37194 rtx e = XVECEXP (sel, 0, i);
37195 int ei = INTVAL (e) & (2 * nelt - 1);
37196
37197 which |= (ei < nelt ? 1 : 2);
37198 d.perm[i] = ei;
37199 perm[i] = ei;
37200 }
37201
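  /* WHICH is a bit mask: bit 0 is set if any selector element refers to
     the first input vector, bit 1 if any element refers to the second;
     e.g. which == 3 means elements of both operands are used.  */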
37202 switch (which)
37203 {
37204 default:
37205 gcc_unreachable();
37206
37207 case 3:
37208 if (!rtx_equal_p (d.op0, d.op1))
37209 break;
37210
37211 /* The elements of PERM do not suggest that only the first operand
37212 is used, but both operands are identical. Allow easier matching
37213 of the permutation by folding the permutation into the single
37214 input vector. */
37215 for (i = 0; i < nelt; ++i)
37216 if (d.perm[i] >= nelt)
37217 d.perm[i] -= nelt;
37218 /* FALLTHRU */
37219
37220 case 1:
37221 d.op1 = d.op0;
37222 break;
37223
37224 case 2:
37225 for (i = 0; i < nelt; ++i)
37226 d.perm[i] -= nelt;
37227 d.op0 = d.op1;
37228 break;
37229 }
37230
37231 if (ix86_expand_vec_perm_const_1 (&d))
37232 return true;
37233
37234 /* If the mask says both arguments are needed, but they are the same,
37235 the above tried to expand with d.op0 == d.op1. If that didn't work,
37236 retry with d.op0 != d.op1 as that is what testing has been done with. */
37237 if (which == 3 && d.op0 == d.op1)
37238 {
37239 rtx seq;
37240 bool ok;
37241
37242 memcpy (d.perm, perm, sizeof (perm));
37243 d.op1 = gen_reg_rtx (d.vmode);
37244 start_sequence ();
37245 ok = ix86_expand_vec_perm_const_1 (&d);
37246 seq = get_insns ();
37247 end_sequence ();
37248 if (ok)
37249 {
37250 emit_move_insn (d.op1, d.op0);
37251 emit_insn (seq);
37252 return true;
37253 }
37254 }
37255
37256 return false;
37257 }
37258
37259 /* Implement targetm.vectorize.vec_perm_const_ok. */
37260
37261 static bool
37262 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
37263 const unsigned char *sel)
37264 {
37265 struct expand_vec_perm_d d;
37266 unsigned int i, nelt, which;
37267 bool ret, one_vec;
37268
37269 d.vmode = vmode;
37270 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37271 d.testing_p = true;
37272
37273 /* Given sufficient ISA support we can just return true here
37274 for selected vector modes. */
37275 if (GET_MODE_SIZE (d.vmode) == 16)
37276 {
37277 /* All implementable with a single vpperm insn. */
37278 if (TARGET_XOP)
37279 return true;
37280 /* All implementable with 2 pshufb + 1 ior. */
37281 if (TARGET_SSSE3)
37282 return true;
37283 /* All implementable with shufpd or unpck[lh]pd. */
37284 if (d.nelt == 2)
37285 return true;
37286 }
37287
37288 /* Extract the values from the vector CST into the permutation
37289 array in D. */
37290 memcpy (d.perm, sel, nelt);
37291 for (i = which = 0; i < nelt; ++i)
37292 {
37293 unsigned char e = d.perm[i];
37294 gcc_assert (e < 2 * nelt);
37295 which |= (e < nelt ? 1 : 2);
37296 }
37297
37298 /* If all elements are from the second vector, fold them to the first one. */
37299 if (which == 2)
37300 for (i = 0; i < nelt; ++i)
37301 d.perm[i] -= nelt;
37302
37303 /* Check whether the mask can be applied to the vector type. */
37304 one_vec = (which != 3);
37305
37306 /* Implementable with shufps or pshufd. */
37307 if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
37308 return true;
37309
37310 /* Otherwise we have to go through the motions and see if we can
37311 figure out how to generate the requested permutation. */
37312 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
37313 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
37314 if (!one_vec)
37315 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
37316
37317 start_sequence ();
37318 ret = ix86_expand_vec_perm_const_1 (&d);
37319 end_sequence ();
37320
37321 return ret;
37322 }
37323
37324 void
37325 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
37326 {
37327 struct expand_vec_perm_d d;
37328 unsigned i, nelt;
37329
37330 d.target = targ;
37331 d.op0 = op0;
37332 d.op1 = op1;
37333 d.vmode = GET_MODE (targ);
37334 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37335 d.testing_p = false;
37336
37337 for (i = 0; i < nelt; ++i)
37338 d.perm[i] = i * 2 + odd;
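  /* E.g. for a V4SImode target with ODD == 1 this builds the selector
     { 1 3 5 7 }, i.e. the odd elements of the OP0:OP1 concatenation.  */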
37339
37340 /* We'll either be able to implement the permutation directly... */
37341 if (expand_vec_perm_1 (&d))
37342 return;
37343
37344 /* ... or we use the special-case patterns. */
37345 expand_vec_perm_even_odd_1 (&d, odd);
37346 }
37347
37348 /* Expand an insert into a vector register through pinsr insn.
37349 Return true if successful. */
37350
37351 bool
37352 ix86_expand_pinsr (rtx *operands)
37353 {
37354 rtx dst = operands[0];
37355 rtx src = operands[3];
37356
37357 unsigned int size = INTVAL (operands[1]);
37358 unsigned int pos = INTVAL (operands[2]);
37359
37360 if (GET_CODE (dst) == SUBREG)
37361 {
37362 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
37363 dst = SUBREG_REG (dst);
37364 }
37365
37366 if (GET_CODE (src) == SUBREG)
37367 src = SUBREG_REG (src);
37368
37369 switch (GET_MODE (dst))
37370 {
37371 case V16QImode:
37372 case V8HImode:
37373 case V4SImode:
37374 case V2DImode:
37375 {
37376 enum machine_mode srcmode, dstmode;
37377 rtx (*pinsr)(rtx, rtx, rtx, rtx);
37378
37379 srcmode = mode_for_size (size, MODE_INT, 0);
37380
37381 switch (srcmode)
37382 {
37383 case QImode:
37384 if (!TARGET_SSE4_1)
37385 return false;
37386 dstmode = V16QImode;
37387 pinsr = gen_sse4_1_pinsrb;
37388 break;
37389
37390 case HImode:
37391 if (!TARGET_SSE2)
37392 return false;
37393 dstmode = V8HImode;
37394 pinsr = gen_sse2_pinsrw;
37395 break;
37396
37397 case SImode:
37398 if (!TARGET_SSE4_1)
37399 return false;
37400 dstmode = V4SImode;
37401 pinsr = gen_sse4_1_pinsrd;
37402 break;
37403
37404 case DImode:
37405 gcc_assert (TARGET_64BIT);
37406 if (!TARGET_SSE4_1)
37407 return false;
37408 dstmode = V2DImode;
37409 pinsr = gen_sse4_1_pinsrq;
37410 break;
37411
37412 default:
37413 return false;
37414 }
37415
37416 dst = gen_lowpart (dstmode, dst);
37417 src = gen_lowpart (srcmode, src);
37418
37419 pos /= size;
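	/* E.g. inserting a 16-bit value at bit position 32 of a V8HImode
	   destination uses srcmode == HImode, element index 32 / 16 == 2,
	   and hence pinsrw with GEN_INT (1 << 2).  */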
37420
37421 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
37422 return true;
37423 }
37424
37425 default:
37426 return false;
37427 }
37428 }
37429 \f
37430 /* This function returns the calling-ABI specific va_list type node,
37431 i.e. the va_list type for FNDECL's calling ABI. */
37432
37433 static tree
37434 ix86_fn_abi_va_list (tree fndecl)
37435 {
37436 if (!TARGET_64BIT)
37437 return va_list_type_node;
37438 gcc_assert (fndecl != NULL_TREE);
37439
37440 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
37441 return ms_va_list_type_node;
37442 else
37443 return sysv_va_list_type_node;
37444 }
37445
37446 /* Returns the canonical va_list type specified by TYPE. If there
37447 is no valid TYPE provided, it returns NULL_TREE. */
37448
37449 static tree
37450 ix86_canonical_va_list_type (tree type)
37451 {
37452 tree wtype, htype;
37453
37454 /* Resolve references and pointers to va_list type. */
37455 if (TREE_CODE (type) == MEM_REF)
37456 type = TREE_TYPE (type);
37457 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
37458 type = TREE_TYPE (type);
37459 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
37460 type = TREE_TYPE (type);
37461
37462 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
37463 {
37464 wtype = va_list_type_node;
37465 gcc_assert (wtype != NULL_TREE);
37466 htype = type;
37467 if (TREE_CODE (wtype) == ARRAY_TYPE)
37468 {
37469 /* If va_list is an array type, the argument may have decayed
37470 to a pointer type, e.g. by being passed to another function.
37471 In that case, unwrap both types so that we can compare the
37472 underlying records. */
37473 if (TREE_CODE (htype) == ARRAY_TYPE
37474 || POINTER_TYPE_P (htype))
37475 {
37476 wtype = TREE_TYPE (wtype);
37477 htype = TREE_TYPE (htype);
37478 }
37479 }
37480 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37481 return va_list_type_node;
37482 wtype = sysv_va_list_type_node;
37483 gcc_assert (wtype != NULL_TREE);
37484 htype = type;
37485 if (TREE_CODE (wtype) == ARRAY_TYPE)
37486 {
37487 /* If va_list is an array type, the argument may have decayed
37488 to a pointer type, e.g. by being passed to another function.
37489 In that case, unwrap both types so that we can compare the
37490 underlying records. */
37491 if (TREE_CODE (htype) == ARRAY_TYPE
37492 || POINTER_TYPE_P (htype))
37493 {
37494 wtype = TREE_TYPE (wtype);
37495 htype = TREE_TYPE (htype);
37496 }
37497 }
37498 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37499 return sysv_va_list_type_node;
37500 wtype = ms_va_list_type_node;
37501 gcc_assert (wtype != NULL_TREE);
37502 htype = type;
37503 if (TREE_CODE (wtype) == ARRAY_TYPE)
37504 {
37505 /* If va_list is an array type, the argument may have decayed
37506 to a pointer type, e.g. by being passed to another function.
37507 In that case, unwrap both types so that we can compare the
37508 underlying records. */
37509 if (TREE_CODE (htype) == ARRAY_TYPE
37510 || POINTER_TYPE_P (htype))
37511 {
37512 wtype = TREE_TYPE (wtype);
37513 htype = TREE_TYPE (htype);
37514 }
37515 }
37516 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37517 return ms_va_list_type_node;
37518 return NULL_TREE;
37519 }
37520 return std_canonical_va_list_type (type);
37521 }
37522
37523 /* Iterate through the target-specific builtin types for va_list.
37524 IDX denotes the iterator, *PTREE is set to the result type of
37525 the va_list builtin, and *PNAME to its internal type.
37526 Returns zero if there is no element for this index, otherwise
37527 IDX should be increased upon the next call.
37528 Note, do not iterate a base builtin's name like __builtin_va_list.
37529 Used from c_common_nodes_and_builtins. */
37530
37531 static int
37532 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
37533 {
37534 if (TARGET_64BIT)
37535 {
37536 switch (idx)
37537 {
37538 default:
37539 break;
37540
37541 case 0:
37542 *ptree = ms_va_list_type_node;
37543 *pname = "__builtin_ms_va_list";
37544 return 1;
37545
37546 case 1:
37547 *ptree = sysv_va_list_type_node;
37548 *pname = "__builtin_sysv_va_list";
37549 return 1;
37550 }
37551 }
37552
37553 return 0;
37554 }
37555
37556 #undef TARGET_SCHED_DISPATCH
37557 #define TARGET_SCHED_DISPATCH has_dispatch
37558 #undef TARGET_SCHED_DISPATCH_DO
37559 #define TARGET_SCHED_DISPATCH_DO do_dispatch
37560 #undef TARGET_SCHED_REASSOCIATION_WIDTH
37561 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
37562
37563 /* The size of the dispatch window is the total number of bytes of
37564 object code allowed in a window. */
37565 #define DISPATCH_WINDOW_SIZE 16
37566
37567 /* Number of dispatch windows considered for scheduling. */
37568 #define MAX_DISPATCH_WINDOWS 3
37569
37570 /* Maximum number of instructions in a window. */
37571 #define MAX_INSN 4
37572
37573 /* Maximum number of immediate operands in a window. */
37574 #define MAX_IMM 4
37575
37576 /* Maximum number of immediate bits allowed in a window. */
37577 #define MAX_IMM_SIZE 128
37578
37579 /* Maximum number of 32 bit immediates allowed in a window. */
37580 #define MAX_IMM_32 4
37581
37582 /* Maximum number of 64 bit immediates allowed in a window. */
37583 #define MAX_IMM_64 2
37584
37585 /* Maximum total of loads or prefetches allowed in a window. */
37586 #define MAX_LOAD 2
37587
37588 /* Maximum total of stores allowed in a window. */
37589 #define MAX_STORE 1
37590
37591 #undef BIG
37592 #define BIG 100
37593
37594
37595 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
37596 enum dispatch_group {
37597 disp_no_group = 0,
37598 disp_load,
37599 disp_store,
37600 disp_load_store,
37601 disp_prefetch,
37602 disp_imm,
37603 disp_imm_32,
37604 disp_imm_64,
37605 disp_branch,
37606 disp_cmp,
37607 disp_jcc,
37608 disp_last
37609 };
37610
37611 /* Number of allowable groups in a dispatch window. It is an array
37612 indexed by the dispatch_group enum. 100 is used as a big number
37613 because the number of this kind of operation does not have any
37614 effect on the dispatch window, but we still need entries for them
37615 in the table. */
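/* E.g. a window may contain at most MAX_LOAD (2) loads, MAX_STORE (1)
   store, MAX_IMM (4) immediate operands and one branch; disp_cmp and
   disp_jcc use BIG because they do not by themselves limit the mix of
   a window.  */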
37616 static unsigned int num_allowable_groups[disp_last] = {
37617 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
37618 };
37619
37620 char group_name[disp_last + 1][16] = {
37621 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
37622 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
37623 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
37624 };
37625
37626 /* Instruction path. */
37627 enum insn_path {
37628 no_path = 0,
37629 path_single, /* Single micro op. */
37630 path_double, /* Double micro op. */
37631 path_multi, /* Instructions with more than 2 micro ops. */
37632 last_path
37633 };
37634
37635 /* sched_insn_info defines a window to the instructions scheduled in
37636 the basic block. It contains a pointer to the insn_info table and
37637 the instruction scheduled.
37638
37639 Windows are allocated for each basic block and are linked
37640 together. */
37641 typedef struct sched_insn_info_s {
37642 rtx insn;
37643 enum dispatch_group group;
37644 enum insn_path path;
37645 int byte_len;
37646 int imm_bytes;
37647 } sched_insn_info;
37648
37649 /* Linked list of dispatch windows. This is a two way list of
37650 dispatch windows of a basic block. It contains information about
37651 the number of uops in the window and the total number of
37652 instructions and of bytes in the object code for this dispatch
37653 window. */
37654 typedef struct dispatch_windows_s {
37655 int num_insn; /* Number of insn in the window. */
37656 int num_uops; /* Number of uops in the window. */
37657 int window_size; /* Number of bytes in the window. */
37658 int window_num; /* Window number, either 0 or 1. */
37659 int num_imm; /* Number of immediates in an insn. */
37660 int num_imm_32; /* Number of 32 bit immediates in an insn. */
37661 int num_imm_64; /* Number of 64 bit immediates in an insn. */
37662 int imm_size; /* Total immediates in the window. */
37663 int num_loads; /* Total memory loads in the window. */
37664 int num_stores; /* Total memory stores in the window. */
37665 int violation; /* Violation exists in window. */
37666 sched_insn_info *window; /* Pointer to the window. */
37667 struct dispatch_windows_s *next;
37668 struct dispatch_windows_s *prev;
37669 } dispatch_windows;
37670
37671 /* Immediate values used in an insn. */
37672 typedef struct imm_info_s
37673 {
37674 int imm;
37675 int imm32;
37676 int imm64;
37677 } imm_info;
37678
37679 static dispatch_windows *dispatch_window_list;
37680 static dispatch_windows *dispatch_window_list1;
37681
37682 /* Get dispatch group of insn. */
37683
37684 static enum dispatch_group
37685 get_mem_group (rtx insn)
37686 {
37687 enum attr_memory memory;
37688
37689 if (INSN_CODE (insn) < 0)
37690 return disp_no_group;
37691 memory = get_attr_memory (insn);
37692 if (memory == MEMORY_STORE)
37693 return disp_store;
37694
37695 if (memory == MEMORY_LOAD)
37696 return disp_load;
37697
37698 if (memory == MEMORY_BOTH)
37699 return disp_load_store;
37700
37701 return disp_no_group;
37702 }
37703
37704 /* Return true if insn is a compare instruction. */
37705
37706 static bool
37707 is_cmp (rtx insn)
37708 {
37709 enum attr_type type;
37710
37711 type = get_attr_type (insn);
37712 return (type == TYPE_TEST
37713 || type == TYPE_ICMP
37714 || type == TYPE_FCMP
37715 || GET_CODE (PATTERN (insn)) == COMPARE);
37716 }
37717
37718 /* Return true if a dispatch violation was encountered. */
37719
37720 static bool
37721 dispatch_violation (void)
37722 {
37723 if (dispatch_window_list->next)
37724 return dispatch_window_list->next->violation;
37725 return dispatch_window_list->violation;
37726 }
37727
37728 /* Return true if insn is a branch instruction. */
37729
37730 static bool
37731 is_branch (rtx insn)
37732 {
37733 return (CALL_P (insn) || JUMP_P (insn));
37734 }
37735
37736 /* Return true if insn is a prefetch instruction. */
37737
37738 static bool
37739 is_prefetch (rtx insn)
37740 {
37741 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
37742 }
37743
37744 /* This function initializes a dispatch window and the list container holding a
37745 pointer to the window. */
37746
37747 static void
37748 init_window (int window_num)
37749 {
37750 int i;
37751 dispatch_windows *new_list;
37752
37753 if (window_num == 0)
37754 new_list = dispatch_window_list;
37755 else
37756 new_list = dispatch_window_list1;
37757
37758 new_list->num_insn = 0;
37759 new_list->num_uops = 0;
37760 new_list->window_size = 0;
37761 new_list->next = NULL;
37762 new_list->prev = NULL;
37763 new_list->window_num = window_num;
37764 new_list->num_imm = 0;
37765 new_list->num_imm_32 = 0;
37766 new_list->num_imm_64 = 0;
37767 new_list->imm_size = 0;
37768 new_list->num_loads = 0;
37769 new_list->num_stores = 0;
37770 new_list->violation = false;
37771
37772 for (i = 0; i < MAX_INSN; i++)
37773 {
37774 new_list->window[i].insn = NULL;
37775 new_list->window[i].group = disp_no_group;
37776 new_list->window[i].path = no_path;
37777 new_list->window[i].byte_len = 0;
37778 new_list->window[i].imm_bytes = 0;
37779 }
37780 return;
37781 }
37782
37783 /* This function allocates and initializes a dispatch window and the
37784 list container holding a pointer to the window. */
37785
37786 static dispatch_windows *
37787 allocate_window (void)
37788 {
37789 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
37790 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
37791
37792 return new_list;
37793 }
37794
37795 /* This routine initializes the dispatch scheduling information. It
37796 initiates building dispatch scheduler tables and constructs the
37797 first dispatch window. */
37798
37799 static void
37800 init_dispatch_sched (void)
37801 {
37802 /* Allocate a dispatch list and a window. */
37803 dispatch_window_list = allocate_window ();
37804 dispatch_window_list1 = allocate_window ();
37805 init_window (0);
37806 init_window (1);
37807 }
37808
37809 /* This function returns true if a branch is detected. The end of a basic
37810 block does not have to be a branch, but here we assume that only branches
37811 end a window. */
37812
37813 static bool
37814 is_end_basic_block (enum dispatch_group group)
37815 {
37816 return group == disp_branch;
37817 }
37818
37819 /* This function is called when the end of a window processing is reached. */
37820
37821 static void
37822 process_end_window (void)
37823 {
37824 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
37825 if (dispatch_window_list->next)
37826 {
37827 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
37828 gcc_assert (dispatch_window_list->window_size
37829 + dispatch_window_list1->window_size <= 48);
37830 init_window (1);
37831 }
37832 init_window (0);
37833 }
37834
37835 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
37836 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
37837 for 48 bytes of instructions. Note that these windows are not dispatch
37838 windows whose size is DISPATCH_WINDOW_SIZE. */
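/* The 48-byte limit presumably corresponds to MAX_DISPATCH_WINDOWS (3)
   times DISPATCH_WINDOW_SIZE (16) bytes of object code.  */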
37839
37840 static dispatch_windows *
37841 allocate_next_window (int window_num)
37842 {
37843 if (window_num == 0)
37844 {
37845 if (dispatch_window_list->next)
37846 init_window (1);
37847 init_window (0);
37848 return dispatch_window_list;
37849 }
37850
37851 dispatch_window_list->next = dispatch_window_list1;
37852 dispatch_window_list1->prev = dispatch_window_list;
37853
37854 return dispatch_window_list1;
37855 }
37856
37857 /* Count an immediate operand of an instruction; called via for_each_rtx
from find_constant. */
37858
37859 static int
37860 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
37861 {
37862 if (*in_rtx == 0)
37863 return 0;
37864
37865 switch (GET_CODE (*in_rtx))
37866 {
37867 case CONST:
37868 case SYMBOL_REF:
37869 case CONST_INT:
37870 (imm_values->imm)++;
37871 if (x86_64_immediate_operand (*in_rtx, SImode))
37872 (imm_values->imm32)++;
37873 else
37874 (imm_values->imm64)++;
37875 break;
37876
37877 case CONST_DOUBLE:
37878 (imm_values->imm)++;
37879 (imm_values->imm64)++;
37880 break;
37881
37882 case CODE_LABEL:
37883 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
37884 {
37885 (imm_values->imm)++;
37886 (imm_values->imm32)++;
37887 }
37888 break;
37889
37890 default:
37891 break;
37892 }
37893
37894 return 0;
37895 }
37896
37897 /* Compute number of immediate operands of an instruction. */
37898
37899 static void
37900 find_constant (rtx in_rtx, imm_info *imm_values)
37901 {
37902 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
37903 (rtx_function) find_constant_1, (void *) imm_values);
37904 }
37905
37906 /* Return the total size of the immediate operands of an instruction along
37907 with the number of corresponding immediate operands. It initializes its
37908 parameters to zero before calling FIND_CONSTANT.
37909 INSN is the input instruction. IMM is the total of immediates.
37910 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
37911 bit immediates. */
37912
37913 static int
37914 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
37915 {
37916 imm_info imm_values = {0, 0, 0};
37917
37918 find_constant (insn, &imm_values);
37919 *imm = imm_values.imm;
37920 *imm32 = imm_values.imm32;
37921 *imm64 = imm_values.imm64;
37922 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
37923 }
37924
37925 /* Return true if INSN has any immediate operands. */
37927
37928 static bool
37929 has_immediate (rtx insn)
37930 {
37931 int num_imm_operand;
37932 int num_imm32_operand;
37933 int num_imm64_operand;
37934
37935 if (insn)
37936 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37937 &num_imm64_operand);
37938 return false;
37939 }
37940
37941 /* Return single or double path for instructions. */
37942
37943 static enum insn_path
37944 get_insn_path (rtx insn)
37945 {
37946 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
37947
37948 if ((int)path == 0)
37949 return path_single;
37950
37951 if ((int)path == 1)
37952 return path_double;
37953
37954 return path_multi;
37955 }
37956
37957 /* Return insn dispatch group. */
37958
37959 static enum dispatch_group
37960 get_insn_group (rtx insn)
37961 {
37962 enum dispatch_group group = get_mem_group (insn);
37963 if (group)
37964 return group;
37965
37966 if (is_branch (insn))
37967 return disp_branch;
37968
37969 if (is_cmp (insn))
37970 return disp_cmp;
37971
37972 if (has_immediate (insn))
37973 return disp_imm;
37974
37975 if (is_prefetch (insn))
37976 return disp_prefetch;
37977
37978 return disp_no_group;
37979 }
37980
37981 /* Count number of GROUP restricted instructions in a dispatch
37982 window WINDOW_LIST. */
37983
37984 static int
37985 count_num_restricted (rtx insn, dispatch_windows *window_list)
37986 {
37987 enum dispatch_group group = get_insn_group (insn);
37988 int imm_size;
37989 int num_imm_operand;
37990 int num_imm32_operand;
37991 int num_imm64_operand;
37992
37993 if (group == disp_no_group)
37994 return 0;
37995
37996 if (group == disp_imm)
37997 {
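      /* The checks below encode the per-window immediate limits: at most
	 MAX_IMM immediates totalling MAX_IMM_SIZE, at most MAX_IMM_32
	 32-bit and MAX_IMM_64 64-bit immediates, where each 64-bit
	 immediate also counts as two 32-bit slots.  */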
37998 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37999 &num_imm64_operand);
38000 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
38001 || num_imm_operand + window_list->num_imm > MAX_IMM
38002 || (num_imm32_operand > 0
38003 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
38004 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
38005 || (num_imm64_operand > 0
38006 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
38007 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
38008 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
38009 && num_imm64_operand > 0
38010 && ((window_list->num_imm_64 > 0
38011 && window_list->num_insn >= 2)
38012 || window_list->num_insn >= 3)))
38013 return BIG;
38014
38015 return 1;
38016 }
38017
38018 if ((group == disp_load_store
38019 && (window_list->num_loads >= MAX_LOAD
38020 || window_list->num_stores >= MAX_STORE))
38021 || ((group == disp_load
38022 || group == disp_prefetch)
38023 && window_list->num_loads >= MAX_LOAD)
38024 || (group == disp_store
38025 && window_list->num_stores >= MAX_STORE))
38026 return BIG;
38027
38028 return 1;
38029 }
38030
38031 /* This function returns true if insn satisfies dispatch rules on the
38032 last window scheduled. */
38033
38034 static bool
38035 fits_dispatch_window (rtx insn)
38036 {
38037 dispatch_windows *window_list = dispatch_window_list;
38038 dispatch_windows *window_list_next = dispatch_window_list->next;
38039 unsigned int num_restrict;
38040 enum dispatch_group group = get_insn_group (insn);
38041 enum insn_path path = get_insn_path (insn);
38042 int sum;
38043
38044 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
38045 instructions should be given the lowest priority in the
38046 scheduling process in the Haifa scheduler to make sure they will be
38047 scheduled in the same dispatch window as the reference to them. */
38048 if (group == disp_jcc || group == disp_cmp)
38049 return false;
38050
38051 /* Check nonrestricted. */
38052 if (group == disp_no_group || group == disp_branch)
38053 return true;
38054
38055 /* Get last dispatch window. */
38056 if (window_list_next)
38057 window_list = window_list_next;
38058
38059 if (window_list->window_num == 1)
38060 {
38061 sum = window_list->prev->window_size + window_list->window_size;
38062
38063 if (sum == 32
38064 || (min_insn_size (insn) + sum) >= 48)
38065 /* Window 1 is full. Go for next window. */
38066 return true;
38067 }
38068
38069 num_restrict = count_num_restricted (insn, window_list);
38070
38071 if (num_restrict > num_allowable_groups[group])
38072 return false;
38073
38074 /* See if it fits in the first window. */
38075 if (window_list->window_num == 0)
38076 {
38077 /* The first window should have only single and double path
38078 uops. */
38079 if (path == path_double
38080 && (window_list->num_uops + 2) > MAX_INSN)
38081 return false;
38082 else if (path != path_single)
38083 return false;
38084 }
38085 return true;
38086 }
38087
38088 /* Add an instruction INSN with NUM_UOPS micro-operations to the
38089 dispatch window WINDOW_LIST. */
38090
38091 static void
38092 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
38093 {
38094 int byte_len = min_insn_size (insn);
38095 int num_insn = window_list->num_insn;
38096 int imm_size;
38097 sched_insn_info *window = window_list->window;
38098 enum dispatch_group group = get_insn_group (insn);
38099 enum insn_path path = get_insn_path (insn);
38100 int num_imm_operand;
38101 int num_imm32_operand;
38102 int num_imm64_operand;
38103
38104 if (!window_list->violation && group != disp_cmp
38105 && !fits_dispatch_window (insn))
38106 window_list->violation = true;
38107
38108 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38109 &num_imm64_operand);
38110
38111 /* Initialize window with new instruction. */
38112 window[num_insn].insn = insn;
38113 window[num_insn].byte_len = byte_len;
38114 window[num_insn].group = group;
38115 window[num_insn].path = path;
38116 window[num_insn].imm_bytes = imm_size;
38117
38118 window_list->window_size += byte_len;
38119 window_list->num_insn = num_insn + 1;
38120 window_list->num_uops = window_list->num_uops + num_uops;
38121 window_list->imm_size += imm_size;
38122 window_list->num_imm += num_imm_operand;
38123 window_list->num_imm_32 += num_imm32_operand;
38124 window_list->num_imm_64 += num_imm64_operand;
38125
38126 if (group == disp_store)
38127 window_list->num_stores += 1;
38128 else if (group == disp_load
38129 || group == disp_prefetch)
38130 window_list->num_loads += 1;
38131 else if (group == disp_load_store)
38132 {
38133 window_list->num_stores += 1;
38134 window_list->num_loads += 1;
38135 }
38136 }
38137
38138 /* Adds a scheduled instruction, INSN, to the current dispatch window.
38139 If the total bytes of instructions or the number of instructions in
38140 the window exceed the allowable limits, it allocates a new window. */
38141
38142 static void
38143 add_to_dispatch_window (rtx insn)
38144 {
38145 int byte_len;
38146 dispatch_windows *window_list;
38147 dispatch_windows *next_list;
38148 dispatch_windows *window0_list;
38149 enum insn_path path;
38150 enum dispatch_group insn_group;
38151 bool insn_fits;
38152 int num_insn;
38153 int num_uops;
38154 int window_num;
38155 int insn_num_uops;
38156 int sum;
38157
38158 if (INSN_CODE (insn) < 0)
38159 return;
38160
38161 byte_len = min_insn_size (insn);
38162 window_list = dispatch_window_list;
38163 next_list = window_list->next;
38164 path = get_insn_path (insn);
38165 insn_group = get_insn_group (insn);
38166
38167 /* Get the last dispatch window. */
38168 if (next_list)
38169 window_list = dispatch_window_list->next;
38170
38171 if (path == path_single)
38172 insn_num_uops = 1;
38173 else if (path == path_double)
38174 insn_num_uops = 2;
38175 else
38176 insn_num_uops = (int) path;
38177
38178 /* If the current window is full, get a new window.
38179 Window number zero is full if MAX_INSN uops are scheduled in it.
38180 Window number one is full if window zero's bytes plus window
38181 one's bytes equal 32, or if adding the bytes of the new instruction
38182 to that total makes it greater than 48, or if it already has MAX_INSN
38183 instructions in it. */
38184 num_insn = window_list->num_insn;
38185 num_uops = window_list->num_uops;
38186 window_num = window_list->window_num;
38187 insn_fits = fits_dispatch_window (insn);
38188
38189 if (num_insn >= MAX_INSN
38190 || num_uops + insn_num_uops > MAX_INSN
38191 || !(insn_fits))
38192 {
38193 window_num = ~window_num & 1;
38194 window_list = allocate_next_window (window_num);
38195 }
38196
38197 if (window_num == 0)
38198 {
38199 add_insn_window (insn, window_list, insn_num_uops);
38200 if (window_list->num_insn >= MAX_INSN
38201 && insn_group == disp_branch)
38202 {
38203 process_end_window ();
38204 return;
38205 }
38206 }
38207 else if (window_num == 1)
38208 {
38209 window0_list = window_list->prev;
38210 sum = window0_list->window_size + window_list->window_size;
38211 if (sum == 32
38212 || (byte_len + sum) >= 48)
38213 {
38214 process_end_window ();
38215 window_list = dispatch_window_list;
38216 }
38217
38218 add_insn_window (insn, window_list, insn_num_uops);
38219 }
38220 else
38221 gcc_unreachable ();
38222
38223 if (is_end_basic_block (insn_group))
38224 {
38225 /* The end of the basic block has been reached; do end-of-basic-block processing. */
38226 process_end_window ();
38227 return;
38228 }
38229 }
38230
38231 /* Print the dispatch window, WINDOW_NUM, to FILE. */
38232
38233 DEBUG_FUNCTION static void
38234 debug_dispatch_window_file (FILE *file, int window_num)
38235 {
38236 dispatch_windows *list;
38237 int i;
38238
38239 if (window_num == 0)
38240 list = dispatch_window_list;
38241 else
38242 list = dispatch_window_list1;
38243
38244 fprintf (file, "Window #%d:\n", list->window_num);
38245 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
38246 list->num_insn, list->num_uops, list->window_size);
38247 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38248 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
38249
38250 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
38251 list->num_stores);
38252 fprintf (file, " insn info:\n");
38253
38254 for (i = 0; i < MAX_INSN; i++)
38255 {
38256 if (!list->window[i].insn)
38257 break;
38258 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
38259 i, group_name[list->window[i].group],
38260 i, (void *)list->window[i].insn,
38261 i, list->window[i].path,
38262 i, list->window[i].byte_len,
38263 i, list->window[i].imm_bytes);
38264 }
38265 }
38266
38267 /* Print to stdout a dispatch window. */
38268
38269 DEBUG_FUNCTION void
38270 debug_dispatch_window (int window_num)
38271 {
38272 debug_dispatch_window_file (stdout, window_num);
38273 }
38274
38275 /* Print INSN dispatch information to FILE. */
38276
38277 DEBUG_FUNCTION static void
38278 debug_insn_dispatch_info_file (FILE *file, rtx insn)
38279 {
38280 int byte_len;
38281 enum insn_path path;
38282 enum dispatch_group group;
38283 int imm_size;
38284 int num_imm_operand;
38285 int num_imm32_operand;
38286 int num_imm64_operand;
38287
38288 if (INSN_CODE (insn) < 0)
38289 return;
38290
38291 byte_len = min_insn_size (insn);
38292 path = get_insn_path (insn);
38293 group = get_insn_group (insn);
38294 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38295 &num_imm64_operand);
38296
38297 fprintf (file, " insn info:\n");
38298 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
38299 group_name[group], path, byte_len);
38300 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38301 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
38302 }
38303
38304 /* Print to STDERR the status of the ready list with respect to
38305 dispatch windows. */
38306
38307 DEBUG_FUNCTION void
38308 debug_ready_dispatch (void)
38309 {
38310 int i;
38311 int no_ready = number_in_ready ();
38312
38313 fprintf (stdout, "Number of ready: %d\n", no_ready);
38314
38315 for (i = 0; i < no_ready; i++)
38316 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
38317 }
38318
38319 /* This routine is the driver of the dispatch scheduler. */
38320
38321 static void
38322 do_dispatch (rtx insn, int mode)
38323 {
38324 if (mode == DISPATCH_INIT)
38325 init_dispatch_sched ();
38326 else if (mode == ADD_TO_DISPATCH_WINDOW)
38327 add_to_dispatch_window (insn);
38328 }
38329
38330 /* Return TRUE if Dispatch Scheduling is supported. */
38331
38332 static bool
38333 has_dispatch (rtx insn, int action)
38334 {
38335 if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
38336 && flag_dispatch_scheduler)
38337 switch (action)
38338 {
38339 default:
38340 return false;
38341
38342 case IS_DISPATCH_ON:
38343 return true;
38344 break;
38345
38346 case IS_CMP:
38347 return is_cmp (insn);
38348
38349 case DISPATCH_VIOLATION:
38350 return dispatch_violation ();
38351
38352 case FITS_DISPATCH_WINDOW:
38353 return fits_dispatch_window (insn);
38354 }
38355
38356 return false;
38357 }
38358
38359 /* Implementation of reassociation_width target hook used by
38360 reassoc phase to identify parallelism level in reassociated
38361 tree. Statements tree_code is passed in OPC. Arguments type
38362 is passed in MODE.
38363
38364 Currently parallel reassociation is enabled for Atom
38365 processors only and we set reassociation width to be 2
38366 because Atom may issue up to 2 instructions per cycle.
38367
38368 Return value should be fixed if parallel reassociation is
38369 enabled for other processors. */
38370
38371 static int
38372 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
38373 enum machine_mode mode)
38374 {
38375 int res = 1;
38376
38377 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
38378 res = 2;
38379 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
38380 res = 2;
38381
38382 return res;
38383 }
38384
38385 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
38386 place emms and femms instructions. */
38387
38388 static enum machine_mode
38389 ix86_preferred_simd_mode (enum machine_mode mode)
38390 {
38391 if (!TARGET_SSE)
38392 return word_mode;
38393
38394 switch (mode)
38395 {
38396 case QImode:
38397 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
38398 case HImode:
38399 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
38400 case SImode:
38401 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
38402 case DImode:
38403 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
38404
38405 case SFmode:
38406 if (TARGET_AVX && !TARGET_PREFER_AVX128)
38407 return V8SFmode;
38408 else
38409 return V4SFmode;
38410
38411 case DFmode:
38412 if (!TARGET_VECTORIZE_DOUBLE)
38413 return word_mode;
38414 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
38415 return V4DFmode;
38416 else if (TARGET_SSE2)
38417 return V2DFmode;
38418 /* FALLTHRU */
38419
38420 default:
38421 return word_mode;
38422 }
38423 }
38424
38425 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
38426 vectors. */
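/* The returned value is a bit mask of vector sizes in bytes, so 32 | 16
   requests trying both 32-byte (256-bit) and 16-byte (128-bit) vectors;
   returning 0 leaves only the preferred SIMD mode to be tried.  */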
38427
38428 static unsigned int
38429 ix86_autovectorize_vector_sizes (void)
38430 {
38431 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
38432 }
38433
38434 /* Initialize the GCC target structure. */
38435 #undef TARGET_RETURN_IN_MEMORY
38436 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
38437
38438 #undef TARGET_LEGITIMIZE_ADDRESS
38439 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
38440
38441 #undef TARGET_ATTRIBUTE_TABLE
38442 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
38443 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38444 # undef TARGET_MERGE_DECL_ATTRIBUTES
38445 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
38446 #endif
38447
38448 #undef TARGET_COMP_TYPE_ATTRIBUTES
38449 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
38450
38451 #undef TARGET_INIT_BUILTINS
38452 #define TARGET_INIT_BUILTINS ix86_init_builtins
38453 #undef TARGET_BUILTIN_DECL
38454 #define TARGET_BUILTIN_DECL ix86_builtin_decl
38455 #undef TARGET_EXPAND_BUILTIN
38456 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
38457
38458 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
38459 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
38460 ix86_builtin_vectorized_function
38461
38462 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
38463 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
38464
38465 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
38466 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
38467
38468 #undef TARGET_VECTORIZE_BUILTIN_GATHER
38469 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
38470
38471 #undef TARGET_BUILTIN_RECIPROCAL
38472 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
38473
38474 #undef TARGET_ASM_FUNCTION_EPILOGUE
38475 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
38476
38477 #undef TARGET_ENCODE_SECTION_INFO
38478 #ifndef SUBTARGET_ENCODE_SECTION_INFO
38479 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
38480 #else
38481 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
38482 #endif
38483
38484 #undef TARGET_ASM_OPEN_PAREN
38485 #define TARGET_ASM_OPEN_PAREN ""
38486 #undef TARGET_ASM_CLOSE_PAREN
38487 #define TARGET_ASM_CLOSE_PAREN ""
38488
38489 #undef TARGET_ASM_BYTE_OP
38490 #define TARGET_ASM_BYTE_OP ASM_BYTE
38491
38492 #undef TARGET_ASM_ALIGNED_HI_OP
38493 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
38494 #undef TARGET_ASM_ALIGNED_SI_OP
38495 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
38496 #ifdef ASM_QUAD
38497 #undef TARGET_ASM_ALIGNED_DI_OP
38498 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
38499 #endif
38500
38501 #undef TARGET_PROFILE_BEFORE_PROLOGUE
38502 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
38503
38504 #undef TARGET_ASM_UNALIGNED_HI_OP
38505 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
38506 #undef TARGET_ASM_UNALIGNED_SI_OP
38507 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
38508 #undef TARGET_ASM_UNALIGNED_DI_OP
38509 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
38510
38511 #undef TARGET_PRINT_OPERAND
38512 #define TARGET_PRINT_OPERAND ix86_print_operand
38513 #undef TARGET_PRINT_OPERAND_ADDRESS
38514 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
38515 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
38516 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
38517 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
38518 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
38519
38520 #undef TARGET_SCHED_INIT_GLOBAL
38521 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
38522 #undef TARGET_SCHED_ADJUST_COST
38523 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
38524 #undef TARGET_SCHED_ISSUE_RATE
38525 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
38526 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
38527 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
38528 ia32_multipass_dfa_lookahead
38529
38530 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
38531 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
38532
38533 #ifdef HAVE_AS_TLS
38534 #undef TARGET_HAVE_TLS
38535 #define TARGET_HAVE_TLS true
38536 #endif
38537 #undef TARGET_CANNOT_FORCE_CONST_MEM
38538 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
38539 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
38540 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
38541
38542 #undef TARGET_DELEGITIMIZE_ADDRESS
38543 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
38544
38545 #undef TARGET_MS_BITFIELD_LAYOUT_P
38546 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
38547
38548 #if TARGET_MACHO
38549 #undef TARGET_BINDS_LOCAL_P
38550 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
38551 #endif
38552 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38553 #undef TARGET_BINDS_LOCAL_P
38554 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
38555 #endif
38556
38557 #undef TARGET_ASM_OUTPUT_MI_THUNK
38558 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
38559 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
38560 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
38561
38562 #undef TARGET_ASM_FILE_START
38563 #define TARGET_ASM_FILE_START x86_file_start
38564
38565 #undef TARGET_OPTION_OVERRIDE
38566 #define TARGET_OPTION_OVERRIDE ix86_option_override
38567
38568 #undef TARGET_REGISTER_MOVE_COST
38569 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
38570 #undef TARGET_MEMORY_MOVE_COST
38571 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
38572 #undef TARGET_RTX_COSTS
38573 #define TARGET_RTX_COSTS ix86_rtx_costs
38574 #undef TARGET_ADDRESS_COST
38575 #define TARGET_ADDRESS_COST ix86_address_cost
38576
38577 #undef TARGET_FIXED_CONDITION_CODE_REGS
38578 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
38579 #undef TARGET_CC_MODES_COMPATIBLE
38580 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
38581
38582 #undef TARGET_MACHINE_DEPENDENT_REORG
38583 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
38584
38585 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
38586 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
38587
38588 #undef TARGET_BUILD_BUILTIN_VA_LIST
38589 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
38590
38591 #undef TARGET_ENUM_VA_LIST_P
38592 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
38593
38594 #undef TARGET_FN_ABI_VA_LIST
38595 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
38596
38597 #undef TARGET_CANONICAL_VA_LIST_TYPE
38598 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
38599
38600 #undef TARGET_EXPAND_BUILTIN_VA_START
38601 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
38602
38603 #undef TARGET_MD_ASM_CLOBBERS
38604 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
38605
38606 #undef TARGET_PROMOTE_PROTOTYPES
38607 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
38608 #undef TARGET_STRUCT_VALUE_RTX
38609 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
38610 #undef TARGET_SETUP_INCOMING_VARARGS
38611 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
38612 #undef TARGET_MUST_PASS_IN_STACK
38613 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
38614 #undef TARGET_FUNCTION_ARG_ADVANCE
38615 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
38616 #undef TARGET_FUNCTION_ARG
38617 #define TARGET_FUNCTION_ARG ix86_function_arg
38618 #undef TARGET_FUNCTION_ARG_BOUNDARY
38619 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
38620 #undef TARGET_PASS_BY_REFERENCE
38621 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
38622 #undef TARGET_INTERNAL_ARG_POINTER
38623 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
38624 #undef TARGET_UPDATE_STACK_BOUNDARY
38625 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
38626 #undef TARGET_GET_DRAP_RTX
38627 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
38628 #undef TARGET_STRICT_ARGUMENT_NAMING
38629 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
38630 #undef TARGET_STATIC_CHAIN
38631 #define TARGET_STATIC_CHAIN ix86_static_chain
38632 #undef TARGET_TRAMPOLINE_INIT
38633 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
38634 #undef TARGET_RETURN_POPS_ARGS
38635 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
38636
38637 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
38638 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
38639
38640 #undef TARGET_SCALAR_MODE_SUPPORTED_P
38641 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
38642
38643 #undef TARGET_VECTOR_MODE_SUPPORTED_P
38644 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
38645
38646 #undef TARGET_C_MODE_FOR_SUFFIX
38647 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
38648
38649 #ifdef HAVE_AS_TLS
38650 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
38651 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
38652 #endif
38653
38654 #ifdef SUBTARGET_INSERT_ATTRIBUTES
38655 #undef TARGET_INSERT_ATTRIBUTES
38656 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
38657 #endif
38658
38659 #undef TARGET_MANGLE_TYPE
38660 #define TARGET_MANGLE_TYPE ix86_mangle_type
38661
38662 #if !TARGET_MACHO
38663 #undef TARGET_STACK_PROTECT_FAIL
38664 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
38665 #endif
38666
38667 #undef TARGET_FUNCTION_VALUE
38668 #define TARGET_FUNCTION_VALUE ix86_function_value
38669
38670 #undef TARGET_FUNCTION_VALUE_REGNO_P
38671 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
38672
38673 #undef TARGET_PROMOTE_FUNCTION_MODE
38674 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
38675
38676 #undef TARGET_SECONDARY_RELOAD
38677 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
38678
38679 #undef TARGET_CLASS_MAX_NREGS
38680 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
38681
38682 #undef TARGET_PREFERRED_RELOAD_CLASS
38683 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
38684 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
38685 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
38686 #undef TARGET_CLASS_LIKELY_SPILLED_P
38687 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
38688
38689 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
38690 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
38691 ix86_builtin_vectorization_cost
38692 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
38693 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
38694 ix86_vectorize_vec_perm_const_ok
38695 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
38696 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
38697 ix86_preferred_simd_mode
38698 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
38699 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
38700 ix86_autovectorize_vector_sizes
38701
38702 #undef TARGET_SET_CURRENT_FUNCTION
38703 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
38704
38705 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
38706 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
38707
38708 #undef TARGET_OPTION_SAVE
38709 #define TARGET_OPTION_SAVE ix86_function_specific_save
38710
38711 #undef TARGET_OPTION_RESTORE
38712 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
38713
38714 #undef TARGET_OPTION_PRINT
38715 #define TARGET_OPTION_PRINT ix86_function_specific_print
38716
38717 #undef TARGET_CAN_INLINE_P
38718 #define TARGET_CAN_INLINE_P ix86_can_inline_p
38719
38720 #undef TARGET_EXPAND_TO_RTL_HOOK
38721 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
38722
38723 #undef TARGET_LEGITIMATE_ADDRESS_P
38724 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
38725
38726 #undef TARGET_LEGITIMATE_CONSTANT_P
38727 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
38728
38729 #undef TARGET_FRAME_POINTER_REQUIRED
38730 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
38731
38732 #undef TARGET_CAN_ELIMINATE
38733 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
38734
38735 #undef TARGET_EXTRA_LIVE_ON_ENTRY
38736 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
38737
38738 #undef TARGET_ASM_CODE_END
38739 #define TARGET_ASM_CODE_END ix86_code_end
38740
38741 #undef TARGET_CONDITIONAL_REGISTER_USAGE
38742 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
38743
38744 #if TARGET_MACHO
38745 #undef TARGET_INIT_LIBFUNCS
38746 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
38747 #endif
38748
38749 struct gcc_target targetm = TARGET_INITIALIZER;
38750 \f
38751 #include "gt-i386.h"