gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "cgraph.h"
51 #include "gimple.h"
52 #include "dwarf2.h"
53 #include "df.h"
54 #include "tm-constrs.h"
55 #include "params.h"
56 #include "cselib.h"
57 #include "debug.h"
58 #include "sched-int.h"
59 #include "sbitmap.h"
60 #include "fibheap.h"
61 #include "opts.h"
62 #include "diagnostic.h"
63
64 enum upper_128bits_state
65 {
66 unknown = 0,
67 unused,
68 used
69 };
70
71 typedef struct block_info_def
72 {
73 /* State of the upper 128bits of AVX registers at exit. */
74 enum upper_128bits_state state;
75 /* TRUE if state of the upper 128bits of AVX registers is unchanged
76 in this block. */
77 bool unchanged;
78 /* TRUE if block has been processed. */
79 bool processed;
80 /* TRUE if block has been scanned. */
81 bool scanned;
82 /* Previous state of the upper 128bits of AVX registers at entry. */
83 enum upper_128bits_state prev;
84 } *block_info;
85
86 #define BLOCK_INFO(B) ((block_info) (B)->aux)
87
88 enum call_avx256_state
89 {
90 /* Callee returns 256bit AVX register. */
91 callee_return_avx256 = -1,
92 /* Callee returns and passes 256bit AVX register. */
93 callee_return_pass_avx256,
94 /* Callee passes 256bit AVX register. */
95 callee_pass_avx256,
  96      /* Callee neither returns nor passes a 256bit AVX register, or there
  97         is no 256bit AVX register in the function return.  */
98 call_no_avx256,
99 /* vzeroupper intrinsic. */
100 vzeroupper_intrinsic
101 };
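
/* These values are carried as operand 0 of the vzeroupper UNSPEC_VOLATILE
   pattern and read back in the scan below, roughly as

     avx256 = INTVAL (XVECEXP (PATTERN (insn), 0, 0));

   (illustrative only; see move_or_delete_vzeroupper_2).  */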
102
103 /* Check if a 256bit AVX register is referenced in stores. */
104
105 static void
106 check_avx256_stores (rtx dest, const_rtx set, void *data)
107 {
108 if ((REG_P (dest)
109 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
110 || (GET_CODE (set) == SET
111 && REG_P (SET_SRC (set))
112 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
113 {
114 enum upper_128bits_state *state
115 = (enum upper_128bits_state *) data;
116 *state = used;
117 }
118 }
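
/* Usage sketch (the real call site is in move_or_delete_vzeroupper_2
   below): this function is a note_stores callback, invoked for every
   store in an insn's pattern, e.g.

     enum upper_128bits_state state = unused;
     note_stores (PATTERN (insn), check_avx256_stores, &state);

   so STATE flips to `used' as soon as any store involves a 256bit AVX
   register.  */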
119
120 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
121 in basic block BB. Delete it if upper 128bit AVX registers are
 122    unused.  If it isn't deleted, move it to just before a jump or call insn.
123
 124    STATE is the state of the upper 128bits of AVX registers at entry.  */
125
126 static void
127 move_or_delete_vzeroupper_2 (basic_block bb,
128 enum upper_128bits_state state)
129 {
130 rtx insn, bb_end;
131 rtx vzeroupper_insn = NULL_RTX;
132 rtx pat;
133 int avx256;
134 bool unchanged;
135
136 if (BLOCK_INFO (bb)->unchanged)
137 {
138 if (dump_file)
139 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
140 bb->index, state);
141
142 BLOCK_INFO (bb)->state = state;
143 return;
144 }
145
146 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
147 {
148 if (dump_file)
149 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
150 bb->index, BLOCK_INFO (bb)->state);
151 return;
152 }
153
154 BLOCK_INFO (bb)->prev = state;
155
156 if (dump_file)
157 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
158 bb->index, state);
159
160 unchanged = true;
161
162 /* BB_END changes when it is deleted. */
163 bb_end = BB_END (bb);
164 insn = BB_HEAD (bb);
165 while (insn != bb_end)
166 {
167 insn = NEXT_INSN (insn);
168
169 if (!NONDEBUG_INSN_P (insn))
170 continue;
171
172 /* Move vzeroupper before jump/call. */
173 if (JUMP_P (insn) || CALL_P (insn))
174 {
175 if (!vzeroupper_insn)
176 continue;
177
178 if (PREV_INSN (insn) != vzeroupper_insn)
179 {
180 if (dump_file)
181 {
182 fprintf (dump_file, "Move vzeroupper after:\n");
183 print_rtl_single (dump_file, PREV_INSN (insn));
184 fprintf (dump_file, "before:\n");
185 print_rtl_single (dump_file, insn);
186 }
187 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
188 PREV_INSN (insn));
189 }
190 vzeroupper_insn = NULL_RTX;
191 continue;
192 }
193
194 pat = PATTERN (insn);
195
196 /* Check insn for vzeroupper intrinsic. */
197 if (GET_CODE (pat) == UNSPEC_VOLATILE
198 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
199 {
200 if (dump_file)
201 {
202 /* Found vzeroupper intrinsic. */
203 fprintf (dump_file, "Found vzeroupper:\n");
204 print_rtl_single (dump_file, insn);
205 }
206 }
207 else
208 {
209 /* Check insn for vzeroall intrinsic. */
210 if (GET_CODE (pat) == PARALLEL
211 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
212 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
213 {
214 state = unused;
215 unchanged = false;
216
217 /* Delete pending vzeroupper insertion. */
218 if (vzeroupper_insn)
219 {
220 delete_insn (vzeroupper_insn);
221 vzeroupper_insn = NULL_RTX;
222 }
223 }
224 else if (state != used)
225 {
226 note_stores (pat, check_avx256_stores, &state);
227 if (state == used)
228 unchanged = false;
229 }
230 continue;
231 }
232
233 /* Process vzeroupper intrinsic. */
234 avx256 = INTVAL (XVECEXP (pat, 0, 0));
235
236 if (state == unused)
237 {
238 /* Since the upper 128bits are cleared, callee must not pass
239 256bit AVX register. We only need to check if callee
240 returns 256bit AVX register. */
241 if (avx256 == callee_return_avx256)
242 {
243 state = used;
244 unchanged = false;
245 }
246
247 /* Remove unnecessary vzeroupper since upper 128bits are
248 cleared. */
249 if (dump_file)
250 {
251 fprintf (dump_file, "Delete redundant vzeroupper:\n");
252 print_rtl_single (dump_file, insn);
253 }
254 delete_insn (insn);
255 }
256 else
257 {
258 /* Set state to UNUSED if callee doesn't return 256bit AVX
259 register. */
260 if (avx256 != callee_return_pass_avx256)
261 state = unused;
262
263 if (avx256 == callee_return_pass_avx256
264 || avx256 == callee_pass_avx256)
265 {
266 /* Must remove vzeroupper since callee passes in 256bit
267 AVX register. */
268 if (dump_file)
269 {
270 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
271 print_rtl_single (dump_file, insn);
272 }
273 delete_insn (insn);
274 }
275 else
276 {
277 vzeroupper_insn = insn;
278 unchanged = false;
279 }
280 }
281 }
282
283 BLOCK_INFO (bb)->state = state;
284 BLOCK_INFO (bb)->unchanged = unchanged;
285 BLOCK_INFO (bb)->scanned = true;
286
287 if (dump_file)
288 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
289 bb->index, unchanged ? "unchanged" : "changed",
290 state);
291 }
292
293 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
294 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
 295    as UNUSED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
296 state is changed. */
297
298 static bool
299 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
300 {
301 edge e;
302 edge_iterator ei;
303 enum upper_128bits_state state, old_state, new_state;
304 bool seen_unknown;
305
306 if (dump_file)
307 fprintf (dump_file, " Process [bb %i]: status: %d\n",
308 block->index, BLOCK_INFO (block)->processed);
309
310 if (BLOCK_INFO (block)->processed)
311 return false;
312
313 state = unused;
314
315 /* Check all predecessor edges of this block. */
316 seen_unknown = false;
317 FOR_EACH_EDGE (e, ei, block->preds)
318 {
319 if (e->src == block)
320 continue;
321 switch (BLOCK_INFO (e->src)->state)
322 {
323 case unknown:
324 if (!unknown_is_unused)
 325 	    seen_unknown = true;  /* FALLTHRU */
326 case unused:
327 break;
328 case used:
329 state = used;
330 goto done;
331 }
332 }
333
334 if (seen_unknown)
335 state = unknown;
336
337 done:
338 old_state = BLOCK_INFO (block)->state;
339 move_or_delete_vzeroupper_2 (block, state);
340 new_state = BLOCK_INFO (block)->state;
341
342 if (state != unknown || new_state == used)
343 BLOCK_INFO (block)->processed = true;
344
345 /* Need to rescan if the upper 128bits of AVX registers are changed
346 to USED at exit. */
347 if (new_state != old_state)
348 {
349 if (new_state == used)
350 cfun->machine->rescan_vzeroupper_p = 1;
351 return true;
352 }
353 else
354 return false;
355 }
356
357 /* Go through the instruction stream looking for vzeroupper. Delete
358 it if upper 128bit AVX registers are unused. If it isn't deleted,
 359    move it to just before a jump or call insn.  */
360
361 static void
362 move_or_delete_vzeroupper (void)
363 {
364 edge e;
365 edge_iterator ei;
366 basic_block bb;
367 fibheap_t worklist, pending, fibheap_swap;
368 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
369 int *bb_order;
370 int *rc_order;
371 int i;
372
373 /* Set up block info for each basic block. */
374 alloc_aux_for_blocks (sizeof (struct block_info_def));
375
376 /* Process outgoing edges of entry point. */
377 if (dump_file)
378 fprintf (dump_file, "Process outgoing edges of entry point\n");
379
380 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
381 {
382 move_or_delete_vzeroupper_2 (e->dest,
383 cfun->machine->caller_pass_avx256_p
384 ? used : unused);
385 BLOCK_INFO (e->dest)->processed = true;
386 }
387
388 /* Compute reverse completion order of depth first search of the CFG
389 so that the data-flow runs faster. */
390 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
391 bb_order = XNEWVEC (int, last_basic_block);
392 pre_and_rev_post_order_compute (NULL, rc_order, false);
393 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
394 bb_order[rc_order[i]] = i;
395 free (rc_order);
396
397 worklist = fibheap_new ();
398 pending = fibheap_new ();
399 visited = sbitmap_alloc (last_basic_block);
400 in_worklist = sbitmap_alloc (last_basic_block);
401 in_pending = sbitmap_alloc (last_basic_block);
402 sbitmap_zero (in_worklist);
403
404 /* Don't check outgoing edges of entry point. */
405 sbitmap_ones (in_pending);
406 FOR_EACH_BB (bb)
407 if (BLOCK_INFO (bb)->processed)
408 RESET_BIT (in_pending, bb->index);
409 else
410 {
411 move_or_delete_vzeroupper_1 (bb, false);
412 fibheap_insert (pending, bb_order[bb->index], bb);
413 }
414
415 if (dump_file)
416 fprintf (dump_file, "Check remaining basic blocks\n");
417
418 while (!fibheap_empty (pending))
419 {
420 fibheap_swap = pending;
421 pending = worklist;
422 worklist = fibheap_swap;
423 sbitmap_swap = in_pending;
424 in_pending = in_worklist;
425 in_worklist = sbitmap_swap;
426
427 sbitmap_zero (visited);
428
429 cfun->machine->rescan_vzeroupper_p = 0;
430
431 while (!fibheap_empty (worklist))
432 {
433 bb = (basic_block) fibheap_extract_min (worklist);
434 RESET_BIT (in_worklist, bb->index);
435 gcc_assert (!TEST_BIT (visited, bb->index));
436 if (!TEST_BIT (visited, bb->index))
437 {
438 edge_iterator ei;
439
440 SET_BIT (visited, bb->index);
441
442 if (move_or_delete_vzeroupper_1 (bb, false))
443 FOR_EACH_EDGE (e, ei, bb->succs)
444 {
445 if (e->dest == EXIT_BLOCK_PTR
446 || BLOCK_INFO (e->dest)->processed)
447 continue;
448
449 if (TEST_BIT (visited, e->dest->index))
450 {
451 if (!TEST_BIT (in_pending, e->dest->index))
452 {
453 /* Send E->DEST to next round. */
454 SET_BIT (in_pending, e->dest->index);
455 fibheap_insert (pending,
456 bb_order[e->dest->index],
457 e->dest);
458 }
459 }
460 else if (!TEST_BIT (in_worklist, e->dest->index))
461 {
462 /* Add E->DEST to current round. */
463 SET_BIT (in_worklist, e->dest->index);
464 fibheap_insert (worklist, bb_order[e->dest->index],
465 e->dest);
466 }
467 }
468 }
469 }
470
471 if (!cfun->machine->rescan_vzeroupper_p)
472 break;
473 }
474
475 free (bb_order);
476 fibheap_delete (worklist);
477 fibheap_delete (pending);
478 sbitmap_free (visited);
479 sbitmap_free (in_worklist);
480 sbitmap_free (in_pending);
481
482 if (dump_file)
483 fprintf (dump_file, "Process remaining basic blocks\n");
484
485 FOR_EACH_BB (bb)
486 move_or_delete_vzeroupper_1 (bb, true);
487
488 free_aux_for_blocks ();
489 }
490
491 static rtx legitimize_dllimport_symbol (rtx, bool);
492
493 #ifndef CHECK_STACK_LIMIT
494 #define CHECK_STACK_LIMIT (-1)
495 #endif
496
497 /* Return index of given mode in mult and division cost tables. */
498 #define MODE_INDEX(mode) \
499 ((mode) == QImode ? 0 \
500 : (mode) == HImode ? 1 \
501 : (mode) == SImode ? 2 \
502 : (mode) == DImode ? 3 \
503 : 4)
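
/* For example, the cost of starting an SImode multiply would typically be
   looked up as something like

     ix86_cost->mult_init[MODE_INDEX (SImode)]

   (an illustrative sketch, assuming the usual mult_init field of
   struct processor_costs; the real lookups are in the rtx cost hooks).  */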
504
505 /* Processor costs (relative to an add) */
506 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
507 #define COSTS_N_BYTES(N) ((N) * 2)
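
/* Under that assumption the two scales line up: a 2-byte add costs
   COSTS_N_BYTES (2) == 4, the same value as COSTS_N_INSNS (1) in the
   speed-oriented tables below.  */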
508
509 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
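
/* Reading the memcpy/memset descriptors below (a sketch, assuming the
   usual stringop_algs layout): the first field gives the algorithm for
   blocks whose size is unknown at compile time, followed by
   {max_size, algorithm} pairs for known sizes, terminated by a -1 entry.
   For instance

     {libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}

   means: unknown size -> libcall, up to 256 bytes -> rep movs with 4-byte
   chunks, anything larger -> libcall.  */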
510
511 const
512 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
513 COSTS_N_BYTES (2), /* cost of an add instruction */
514 COSTS_N_BYTES (3), /* cost of a lea instruction */
515 COSTS_N_BYTES (2), /* variable shift costs */
516 COSTS_N_BYTES (3), /* constant shift costs */
517 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
518 COSTS_N_BYTES (3), /* HI */
519 COSTS_N_BYTES (3), /* SI */
520 COSTS_N_BYTES (3), /* DI */
521 COSTS_N_BYTES (5)}, /* other */
522 0, /* cost of multiply per each bit set */
523 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
524 COSTS_N_BYTES (3), /* HI */
525 COSTS_N_BYTES (3), /* SI */
526 COSTS_N_BYTES (3), /* DI */
527 COSTS_N_BYTES (5)}, /* other */
528 COSTS_N_BYTES (3), /* cost of movsx */
529 COSTS_N_BYTES (3), /* cost of movzx */
530 0, /* "large" insn */
531 2, /* MOVE_RATIO */
532 2, /* cost for loading QImode using movzbl */
533 {2, 2, 2}, /* cost of loading integer registers
534 in QImode, HImode and SImode.
535 Relative to reg-reg move (2). */
536 {2, 2, 2}, /* cost of storing integer registers */
537 2, /* cost of reg,reg fld/fst */
538 {2, 2, 2}, /* cost of loading fp registers
539 in SFmode, DFmode and XFmode */
540 {2, 2, 2}, /* cost of storing fp registers
541 in SFmode, DFmode and XFmode */
542 3, /* cost of moving MMX register */
543 {3, 3}, /* cost of loading MMX registers
544 in SImode and DImode */
545 {3, 3}, /* cost of storing MMX registers
546 in SImode and DImode */
547 3, /* cost of moving SSE register */
548 {3, 3, 3}, /* cost of loading SSE registers
549 in SImode, DImode and TImode */
550 {3, 3, 3}, /* cost of storing SSE registers
551 in SImode, DImode and TImode */
552 3, /* MMX or SSE register to integer */
553 0, /* size of l1 cache */
554 0, /* size of l2 cache */
555 0, /* size of prefetch block */
556 0, /* number of parallel prefetches */
557 2, /* Branch cost */
558 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
559 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
560 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
561 COSTS_N_BYTES (2), /* cost of FABS instruction. */
562 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
563 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
564 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
565 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 1, /* scalar_stmt_cost. */
569 1, /* scalar load_cost. */
570 1, /* scalar_store_cost. */
571 1, /* vec_stmt_cost. */
572 1, /* vec_to_scalar_cost. */
573 1, /* scalar_to_vec_cost. */
574 1, /* vec_align_load_cost. */
575 1, /* vec_unalign_load_cost. */
576 1, /* vec_store_cost. */
577 1, /* cond_taken_branch_cost. */
578 1, /* cond_not_taken_branch_cost. */
579 };
580
581 /* Processor costs (relative to an add) */
582 static const
583 struct processor_costs i386_cost = { /* 386 specific costs */
584 COSTS_N_INSNS (1), /* cost of an add instruction */
585 COSTS_N_INSNS (1), /* cost of a lea instruction */
586 COSTS_N_INSNS (3), /* variable shift costs */
587 COSTS_N_INSNS (2), /* constant shift costs */
588 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
589 COSTS_N_INSNS (6), /* HI */
590 COSTS_N_INSNS (6), /* SI */
591 COSTS_N_INSNS (6), /* DI */
592 COSTS_N_INSNS (6)}, /* other */
593 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
594 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
595 COSTS_N_INSNS (23), /* HI */
596 COSTS_N_INSNS (23), /* SI */
597 COSTS_N_INSNS (23), /* DI */
598 COSTS_N_INSNS (23)}, /* other */
599 COSTS_N_INSNS (3), /* cost of movsx */
600 COSTS_N_INSNS (2), /* cost of movzx */
601 15, /* "large" insn */
602 3, /* MOVE_RATIO */
603 4, /* cost for loading QImode using movzbl */
604 {2, 4, 2}, /* cost of loading integer registers
605 in QImode, HImode and SImode.
606 Relative to reg-reg move (2). */
607 {2, 4, 2}, /* cost of storing integer registers */
608 2, /* cost of reg,reg fld/fst */
609 {8, 8, 8}, /* cost of loading fp registers
610 in SFmode, DFmode and XFmode */
611 {8, 8, 8}, /* cost of storing fp registers
612 in SFmode, DFmode and XFmode */
613 2, /* cost of moving MMX register */
614 {4, 8}, /* cost of loading MMX registers
615 in SImode and DImode */
616 {4, 8}, /* cost of storing MMX registers
617 in SImode and DImode */
618 2, /* cost of moving SSE register */
619 {4, 8, 16}, /* cost of loading SSE registers
620 in SImode, DImode and TImode */
621 {4, 8, 16}, /* cost of storing SSE registers
622 in SImode, DImode and TImode */
623 3, /* MMX or SSE register to integer */
624 0, /* size of l1 cache */
625 0, /* size of l2 cache */
626 0, /* size of prefetch block */
627 0, /* number of parallel prefetches */
628 1, /* Branch cost */
629 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
630 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
631 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
632 COSTS_N_INSNS (22), /* cost of FABS instruction. */
633 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
634 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
635 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
636 DUMMY_STRINGOP_ALGS},
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 1, /* scalar_stmt_cost. */
640 1, /* scalar load_cost. */
641 1, /* scalar_store_cost. */
642 1, /* vec_stmt_cost. */
643 1, /* vec_to_scalar_cost. */
644 1, /* scalar_to_vec_cost. */
645 1, /* vec_align_load_cost. */
646 2, /* vec_unalign_load_cost. */
647 1, /* vec_store_cost. */
648 3, /* cond_taken_branch_cost. */
649 1, /* cond_not_taken_branch_cost. */
650 };
651
652 static const
653 struct processor_costs i486_cost = { /* 486 specific costs */
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (1), /* cost of a lea instruction */
656 COSTS_N_INSNS (3), /* variable shift costs */
657 COSTS_N_INSNS (2), /* constant shift costs */
658 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (12), /* HI */
660 COSTS_N_INSNS (12), /* SI */
661 COSTS_N_INSNS (12), /* DI */
662 COSTS_N_INSNS (12)}, /* other */
663 1, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (40), /* HI */
666 COSTS_N_INSNS (40), /* SI */
667 COSTS_N_INSNS (40), /* DI */
668 COSTS_N_INSNS (40)}, /* other */
669 COSTS_N_INSNS (3), /* cost of movsx */
670 COSTS_N_INSNS (2), /* cost of movzx */
671 15, /* "large" insn */
672 3, /* MOVE_RATIO */
673 4, /* cost for loading QImode using movzbl */
674 {2, 4, 2}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {2, 4, 2}, /* cost of storing integer registers */
678 2, /* cost of reg,reg fld/fst */
679 {8, 8, 8}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {8, 8, 8}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {4, 8}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {4, 8}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {4, 8, 16}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {4, 8, 16}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 3, /* MMX or SSE register to integer */
694 4, /* size of l1 cache. 486 has 8kB cache
695 shared for code and data, so 4kB is
696 not really precise. */
697 4, /* size of l2 cache */
698 0, /* size of prefetch block */
699 0, /* number of parallel prefetches */
700 1, /* Branch cost */
701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
702 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
703 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
704 COSTS_N_INSNS (3), /* cost of FABS instruction. */
705 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
706 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
707 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
708 DUMMY_STRINGOP_ALGS},
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 1, /* scalar_stmt_cost. */
712 1, /* scalar load_cost. */
713 1, /* scalar_store_cost. */
714 1, /* vec_stmt_cost. */
715 1, /* vec_to_scalar_cost. */
716 1, /* scalar_to_vec_cost. */
717 1, /* vec_align_load_cost. */
718 2, /* vec_unalign_load_cost. */
719 1, /* vec_store_cost. */
720 3, /* cond_taken_branch_cost. */
721 1, /* cond_not_taken_branch_cost. */
722 };
723
724 static const
725 struct processor_costs pentium_cost = {
726 COSTS_N_INSNS (1), /* cost of an add instruction */
727 COSTS_N_INSNS (1), /* cost of a lea instruction */
728 COSTS_N_INSNS (4), /* variable shift costs */
729 COSTS_N_INSNS (1), /* constant shift costs */
730 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
731 COSTS_N_INSNS (11), /* HI */
732 COSTS_N_INSNS (11), /* SI */
733 COSTS_N_INSNS (11), /* DI */
734 COSTS_N_INSNS (11)}, /* other */
735 0, /* cost of multiply per each bit set */
736 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
737 COSTS_N_INSNS (25), /* HI */
738 COSTS_N_INSNS (25), /* SI */
739 COSTS_N_INSNS (25), /* DI */
740 COSTS_N_INSNS (25)}, /* other */
741 COSTS_N_INSNS (3), /* cost of movsx */
742 COSTS_N_INSNS (2), /* cost of movzx */
743 8, /* "large" insn */
744 6, /* MOVE_RATIO */
745 6, /* cost for loading QImode using movzbl */
746 {2, 4, 2}, /* cost of loading integer registers
747 in QImode, HImode and SImode.
748 Relative to reg-reg move (2). */
749 {2, 4, 2}, /* cost of storing integer registers */
750 2, /* cost of reg,reg fld/fst */
751 {2, 2, 6}, /* cost of loading fp registers
752 in SFmode, DFmode and XFmode */
753 {4, 4, 6}, /* cost of storing fp registers
754 in SFmode, DFmode and XFmode */
755 8, /* cost of moving MMX register */
756 {8, 8}, /* cost of loading MMX registers
757 in SImode and DImode */
758 {8, 8}, /* cost of storing MMX registers
759 in SImode and DImode */
760 2, /* cost of moving SSE register */
761 {4, 8, 16}, /* cost of loading SSE registers
762 in SImode, DImode and TImode */
763 {4, 8, 16}, /* cost of storing SSE registers
764 in SImode, DImode and TImode */
765 3, /* MMX or SSE register to integer */
766 8, /* size of l1 cache. */
767 8, /* size of l2 cache */
768 0, /* size of prefetch block */
769 0, /* number of parallel prefetches */
770 2, /* Branch cost */
771 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
772 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
773 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
774 COSTS_N_INSNS (1), /* cost of FABS instruction. */
775 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
776 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
777 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
778 DUMMY_STRINGOP_ALGS},
779 {{libcall, {{-1, rep_prefix_4_byte}}},
780 DUMMY_STRINGOP_ALGS},
781 1, /* scalar_stmt_cost. */
782 1, /* scalar load_cost. */
783 1, /* scalar_store_cost. */
784 1, /* vec_stmt_cost. */
785 1, /* vec_to_scalar_cost. */
786 1, /* scalar_to_vec_cost. */
787 1, /* vec_align_load_cost. */
788 2, /* vec_unalign_load_cost. */
789 1, /* vec_store_cost. */
790 3, /* cond_taken_branch_cost. */
791 1, /* cond_not_taken_branch_cost. */
792 };
793
794 static const
795 struct processor_costs pentiumpro_cost = {
796 COSTS_N_INSNS (1), /* cost of an add instruction */
797 COSTS_N_INSNS (1), /* cost of a lea instruction */
798 COSTS_N_INSNS (1), /* variable shift costs */
799 COSTS_N_INSNS (1), /* constant shift costs */
800 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
801 COSTS_N_INSNS (4), /* HI */
802 COSTS_N_INSNS (4), /* SI */
803 COSTS_N_INSNS (4), /* DI */
804 COSTS_N_INSNS (4)}, /* other */
805 0, /* cost of multiply per each bit set */
806 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
807 COSTS_N_INSNS (17), /* HI */
808 COSTS_N_INSNS (17), /* SI */
809 COSTS_N_INSNS (17), /* DI */
810 COSTS_N_INSNS (17)}, /* other */
811 COSTS_N_INSNS (1), /* cost of movsx */
812 COSTS_N_INSNS (1), /* cost of movzx */
813 8, /* "large" insn */
814 6, /* MOVE_RATIO */
815 2, /* cost for loading QImode using movzbl */
816 {4, 4, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 {2, 2, 2}, /* cost of storing integer registers */
820 2, /* cost of reg,reg fld/fst */
821 {2, 2, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 {4, 4, 6}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 2, /* cost of moving MMX register */
826 {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 2, /* cost of moving SSE register */
831 {2, 2, 8}, /* cost of loading SSE registers
832 in SImode, DImode and TImode */
833 {2, 2, 8}, /* cost of storing SSE registers
834 in SImode, DImode and TImode */
835 3, /* MMX or SSE register to integer */
836 8, /* size of l1 cache. */
837 256, /* size of l2 cache */
838 32, /* size of prefetch block */
839 6, /* number of parallel prefetches */
840 2, /* Branch cost */
841 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
842 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
843 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
844 COSTS_N_INSNS (2), /* cost of FABS instruction. */
845 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
846 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
847 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 848      (we ensure the alignment).  For small blocks an inline loop is still a
 849      noticeable win; for bigger blocks either rep movsl or rep movsb is the
 850      way to go.  Rep movsb apparently has a more expensive startup time in the CPU,
851 but after 4K the difference is down in the noise. */
852 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
853 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
854 DUMMY_STRINGOP_ALGS},
855 {{rep_prefix_4_byte, {{1024, unrolled_loop},
856 {8192, rep_prefix_4_byte}, {-1, libcall}}},
857 DUMMY_STRINGOP_ALGS},
858 1, /* scalar_stmt_cost. */
859 1, /* scalar load_cost. */
860 1, /* scalar_store_cost. */
861 1, /* vec_stmt_cost. */
862 1, /* vec_to_scalar_cost. */
863 1, /* scalar_to_vec_cost. */
864 1, /* vec_align_load_cost. */
865 2, /* vec_unalign_load_cost. */
866 1, /* vec_store_cost. */
867 3, /* cond_taken_branch_cost. */
868 1, /* cond_not_taken_branch_cost. */
869 };
870
871 static const
872 struct processor_costs geode_cost = {
873 COSTS_N_INSNS (1), /* cost of an add instruction */
874 COSTS_N_INSNS (1), /* cost of a lea instruction */
875 COSTS_N_INSNS (2), /* variable shift costs */
876 COSTS_N_INSNS (1), /* constant shift costs */
877 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
878 COSTS_N_INSNS (4), /* HI */
879 COSTS_N_INSNS (7), /* SI */
880 COSTS_N_INSNS (7), /* DI */
881 COSTS_N_INSNS (7)}, /* other */
882 0, /* cost of multiply per each bit set */
883 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
884 COSTS_N_INSNS (23), /* HI */
885 COSTS_N_INSNS (39), /* SI */
886 COSTS_N_INSNS (39), /* DI */
887 COSTS_N_INSNS (39)}, /* other */
888 COSTS_N_INSNS (1), /* cost of movsx */
889 COSTS_N_INSNS (1), /* cost of movzx */
890 8, /* "large" insn */
891 4, /* MOVE_RATIO */
892 1, /* cost for loading QImode using movzbl */
893 {1, 1, 1}, /* cost of loading integer registers
894 in QImode, HImode and SImode.
895 Relative to reg-reg move (2). */
896 {1, 1, 1}, /* cost of storing integer registers */
897 1, /* cost of reg,reg fld/fst */
898 {1, 1, 1}, /* cost of loading fp registers
899 in SFmode, DFmode and XFmode */
900 {4, 6, 6}, /* cost of storing fp registers
901 in SFmode, DFmode and XFmode */
902
903 1, /* cost of moving MMX register */
904 {1, 1}, /* cost of loading MMX registers
905 in SImode and DImode */
906 {1, 1}, /* cost of storing MMX registers
907 in SImode and DImode */
908 1, /* cost of moving SSE register */
909 {1, 1, 1}, /* cost of loading SSE registers
910 in SImode, DImode and TImode */
911 {1, 1, 1}, /* cost of storing SSE registers
912 in SImode, DImode and TImode */
913 1, /* MMX or SSE register to integer */
914 64, /* size of l1 cache. */
915 128, /* size of l2 cache. */
916 32, /* size of prefetch block */
917 1, /* number of parallel prefetches */
918 1, /* Branch cost */
919 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
920 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
921 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
922 COSTS_N_INSNS (1), /* cost of FABS instruction. */
923 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
924 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
925 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
926 DUMMY_STRINGOP_ALGS},
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 1, /* scalar_stmt_cost. */
930 1, /* scalar load_cost. */
931 1, /* scalar_store_cost. */
932 1, /* vec_stmt_cost. */
933 1, /* vec_to_scalar_cost. */
934 1, /* scalar_to_vec_cost. */
935 1, /* vec_align_load_cost. */
936 2, /* vec_unalign_load_cost. */
937 1, /* vec_store_cost. */
938 3, /* cond_taken_branch_cost. */
939 1, /* cond_not_taken_branch_cost. */
940 };
941
942 static const
943 struct processor_costs k6_cost = {
944 COSTS_N_INSNS (1), /* cost of an add instruction */
945 COSTS_N_INSNS (2), /* cost of a lea instruction */
946 COSTS_N_INSNS (1), /* variable shift costs */
947 COSTS_N_INSNS (1), /* constant shift costs */
948 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
949 COSTS_N_INSNS (3), /* HI */
950 COSTS_N_INSNS (3), /* SI */
951 COSTS_N_INSNS (3), /* DI */
952 COSTS_N_INSNS (3)}, /* other */
953 0, /* cost of multiply per each bit set */
954 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
955 COSTS_N_INSNS (18), /* HI */
956 COSTS_N_INSNS (18), /* SI */
957 COSTS_N_INSNS (18), /* DI */
958 COSTS_N_INSNS (18)}, /* other */
959 COSTS_N_INSNS (2), /* cost of movsx */
960 COSTS_N_INSNS (2), /* cost of movzx */
961 8, /* "large" insn */
962 4, /* MOVE_RATIO */
963 3, /* cost for loading QImode using movzbl */
964 {4, 5, 4}, /* cost of loading integer registers
965 in QImode, HImode and SImode.
966 Relative to reg-reg move (2). */
967 {2, 3, 2}, /* cost of storing integer registers */
968 4, /* cost of reg,reg fld/fst */
969 {6, 6, 6}, /* cost of loading fp registers
970 in SFmode, DFmode and XFmode */
971 {4, 4, 4}, /* cost of storing fp registers
972 in SFmode, DFmode and XFmode */
973 2, /* cost of moving MMX register */
974 {2, 2}, /* cost of loading MMX registers
975 in SImode and DImode */
976 {2, 2}, /* cost of storing MMX registers
977 in SImode and DImode */
978 2, /* cost of moving SSE register */
979 {2, 2, 8}, /* cost of loading SSE registers
980 in SImode, DImode and TImode */
981 {2, 2, 8}, /* cost of storing SSE registers
982 in SImode, DImode and TImode */
983 6, /* MMX or SSE register to integer */
984 32, /* size of l1 cache. */
985 32, /* size of l2 cache. Some models
986 have integrated l2 cache, but
987 optimizing for k6 is not important
988 enough to worry about that. */
989 32, /* size of prefetch block */
990 1, /* number of parallel prefetches */
991 1, /* Branch cost */
992 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
993 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
994 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
995 COSTS_N_INSNS (2), /* cost of FABS instruction. */
996 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
997 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
998 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
999 DUMMY_STRINGOP_ALGS},
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 1, /* scalar_stmt_cost. */
1003 1, /* scalar load_cost. */
1004 1, /* scalar_store_cost. */
1005 1, /* vec_stmt_cost. */
1006 1, /* vec_to_scalar_cost. */
1007 1, /* scalar_to_vec_cost. */
1008 1, /* vec_align_load_cost. */
1009 2, /* vec_unalign_load_cost. */
1010 1, /* vec_store_cost. */
1011 3, /* cond_taken_branch_cost. */
1012 1, /* cond_not_taken_branch_cost. */
1013 };
1014
1015 static const
1016 struct processor_costs athlon_cost = {
1017 COSTS_N_INSNS (1), /* cost of an add instruction */
1018 COSTS_N_INSNS (2), /* cost of a lea instruction */
1019 COSTS_N_INSNS (1), /* variable shift costs */
1020 COSTS_N_INSNS (1), /* constant shift costs */
1021 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1022 COSTS_N_INSNS (5), /* HI */
1023 COSTS_N_INSNS (5), /* SI */
1024 COSTS_N_INSNS (5), /* DI */
1025 COSTS_N_INSNS (5)}, /* other */
1026 0, /* cost of multiply per each bit set */
1027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1028 COSTS_N_INSNS (26), /* HI */
1029 COSTS_N_INSNS (42), /* SI */
1030 COSTS_N_INSNS (74), /* DI */
1031 COSTS_N_INSNS (74)}, /* other */
1032 COSTS_N_INSNS (1), /* cost of movsx */
1033 COSTS_N_INSNS (1), /* cost of movzx */
1034 8, /* "large" insn */
1035 9, /* MOVE_RATIO */
1036 4, /* cost for loading QImode using movzbl */
1037 {3, 4, 3}, /* cost of loading integer registers
1038 in QImode, HImode and SImode.
1039 Relative to reg-reg move (2). */
1040 {3, 4, 3}, /* cost of storing integer registers */
1041 4, /* cost of reg,reg fld/fst */
1042 {4, 4, 12}, /* cost of loading fp registers
1043 in SFmode, DFmode and XFmode */
1044 {6, 6, 8}, /* cost of storing fp registers
1045 in SFmode, DFmode and XFmode */
1046 2, /* cost of moving MMX register */
1047 {4, 4}, /* cost of loading MMX registers
1048 in SImode and DImode */
1049 {4, 4}, /* cost of storing MMX registers
1050 in SImode and DImode */
1051 2, /* cost of moving SSE register */
1052 {4, 4, 6}, /* cost of loading SSE registers
1053 in SImode, DImode and TImode */
1054 {4, 4, 5}, /* cost of storing SSE registers
1055 in SImode, DImode and TImode */
1056 5, /* MMX or SSE register to integer */
1057 64, /* size of l1 cache. */
1058 256, /* size of l2 cache. */
1059 64, /* size of prefetch block */
1060 6, /* number of parallel prefetches */
1061 5, /* Branch cost */
1062 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1063 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1064 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1065 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1066 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1067 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1068 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1069 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1070 128 bytes for memset. */
1071 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1072 DUMMY_STRINGOP_ALGS},
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 1, /* scalar_stmt_cost. */
1076 1, /* scalar load_cost. */
1077 1, /* scalar_store_cost. */
1078 1, /* vec_stmt_cost. */
1079 1, /* vec_to_scalar_cost. */
1080 1, /* scalar_to_vec_cost. */
1081 1, /* vec_align_load_cost. */
1082 2, /* vec_unalign_load_cost. */
1083 1, /* vec_store_cost. */
1084 3, /* cond_taken_branch_cost. */
1085 1, /* cond_not_taken_branch_cost. */
1086 };
1087
1088 static const
1089 struct processor_costs k8_cost = {
1090 COSTS_N_INSNS (1), /* cost of an add instruction */
1091 COSTS_N_INSNS (2), /* cost of a lea instruction */
1092 COSTS_N_INSNS (1), /* variable shift costs */
1093 COSTS_N_INSNS (1), /* constant shift costs */
1094 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1095 COSTS_N_INSNS (4), /* HI */
1096 COSTS_N_INSNS (3), /* SI */
1097 COSTS_N_INSNS (4), /* DI */
1098 COSTS_N_INSNS (5)}, /* other */
1099 0, /* cost of multiply per each bit set */
1100 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1101 COSTS_N_INSNS (26), /* HI */
1102 COSTS_N_INSNS (42), /* SI */
1103 COSTS_N_INSNS (74), /* DI */
1104 COSTS_N_INSNS (74)}, /* other */
1105 COSTS_N_INSNS (1), /* cost of movsx */
1106 COSTS_N_INSNS (1), /* cost of movzx */
1107 8, /* "large" insn */
1108 9, /* MOVE_RATIO */
1109 4, /* cost for loading QImode using movzbl */
1110 {3, 4, 3}, /* cost of loading integer registers
1111 in QImode, HImode and SImode.
1112 Relative to reg-reg move (2). */
1113 {3, 4, 3}, /* cost of storing integer registers */
1114 4, /* cost of reg,reg fld/fst */
1115 {4, 4, 12}, /* cost of loading fp registers
1116 in SFmode, DFmode and XFmode */
1117 {6, 6, 8}, /* cost of storing fp registers
1118 in SFmode, DFmode and XFmode */
1119 2, /* cost of moving MMX register */
1120 {3, 3}, /* cost of loading MMX registers
1121 in SImode and DImode */
1122 {4, 4}, /* cost of storing MMX registers
1123 in SImode and DImode */
1124 2, /* cost of moving SSE register */
1125 {4, 3, 6}, /* cost of loading SSE registers
1126 in SImode, DImode and TImode */
1127 {4, 4, 5}, /* cost of storing SSE registers
1128 in SImode, DImode and TImode */
1129 5, /* MMX or SSE register to integer */
1130 64, /* size of l1 cache. */
1131 512, /* size of l2 cache. */
1132 64, /* size of prefetch block */
1133 /* New AMD processors never drop prefetches; if they cannot be performed
1134 immediately, they are queued. We set number of simultaneous prefetches
1135 to a large constant to reflect this (it probably is not a good idea not
1136 to limit number of prefetches at all, as their execution also takes some
1137 time). */
1138 100, /* number of parallel prefetches */
1139 3, /* Branch cost */
1140 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1141 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1142 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1143 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1144 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1145 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1146 /* K8 has optimized REP instruction for medium sized blocks, but for very
1147 small blocks it is better to use loop. For large blocks, libcall can
 1148      do nontemporal accesses and beat inline considerably.  */
1149 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1150 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1151 {{libcall, {{8, loop}, {24, unrolled_loop},
1152 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1153 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1154 4, /* scalar_stmt_cost. */
1155 2, /* scalar load_cost. */
1156 2, /* scalar_store_cost. */
1157 5, /* vec_stmt_cost. */
1158 0, /* vec_to_scalar_cost. */
1159 2, /* scalar_to_vec_cost. */
1160 2, /* vec_align_load_cost. */
1161 3, /* vec_unalign_load_cost. */
1162 3, /* vec_store_cost. */
1163 3, /* cond_taken_branch_cost. */
1164 2, /* cond_not_taken_branch_cost. */
1165 };
1166
1167 struct processor_costs amdfam10_cost = {
1168 COSTS_N_INSNS (1), /* cost of an add instruction */
1169 COSTS_N_INSNS (2), /* cost of a lea instruction */
1170 COSTS_N_INSNS (1), /* variable shift costs */
1171 COSTS_N_INSNS (1), /* constant shift costs */
1172 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1173 COSTS_N_INSNS (4), /* HI */
1174 COSTS_N_INSNS (3), /* SI */
1175 COSTS_N_INSNS (4), /* DI */
1176 COSTS_N_INSNS (5)}, /* other */
1177 0, /* cost of multiply per each bit set */
1178 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1179 COSTS_N_INSNS (35), /* HI */
1180 COSTS_N_INSNS (51), /* SI */
1181 COSTS_N_INSNS (83), /* DI */
1182 COSTS_N_INSNS (83)}, /* other */
1183 COSTS_N_INSNS (1), /* cost of movsx */
1184 COSTS_N_INSNS (1), /* cost of movzx */
1185 8, /* "large" insn */
1186 9, /* MOVE_RATIO */
1187 4, /* cost for loading QImode using movzbl */
1188 {3, 4, 3}, /* cost of loading integer registers
1189 in QImode, HImode and SImode.
1190 Relative to reg-reg move (2). */
1191 {3, 4, 3}, /* cost of storing integer registers */
1192 4, /* cost of reg,reg fld/fst */
1193 {4, 4, 12}, /* cost of loading fp registers
1194 in SFmode, DFmode and XFmode */
1195 {6, 6, 8}, /* cost of storing fp registers
1196 in SFmode, DFmode and XFmode */
1197 2, /* cost of moving MMX register */
1198 {3, 3}, /* cost of loading MMX registers
1199 in SImode and DImode */
1200 {4, 4}, /* cost of storing MMX registers
1201 in SImode and DImode */
1202 2, /* cost of moving SSE register */
1203 {4, 4, 3}, /* cost of loading SSE registers
1204 in SImode, DImode and TImode */
1205 {4, 4, 5}, /* cost of storing SSE registers
1206 in SImode, DImode and TImode */
1207 3, /* MMX or SSE register to integer */
1208 /* On K8:
1209 MOVD reg64, xmmreg Double FSTORE 4
1210 MOVD reg32, xmmreg Double FSTORE 4
1211 On AMDFAM10:
1212 MOVD reg64, xmmreg Double FADD 3
1213 1/1 1/1
1214 MOVD reg32, xmmreg Double FADD 3
1215 1/1 1/1 */
1216 64, /* size of l1 cache. */
1217 512, /* size of l2 cache. */
1218 64, /* size of prefetch block */
1219 /* New AMD processors never drop prefetches; if they cannot be performed
1220 immediately, they are queued. We set number of simultaneous prefetches
1221 to a large constant to reflect this (it probably is not a good idea not
1222 to limit number of prefetches at all, as their execution also takes some
1223 time). */
1224 100, /* number of parallel prefetches */
1225 2, /* Branch cost */
1226 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1227 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1228 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1229 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1230 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1231 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1232
1233 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
1234 very small blocks it is better to use loop. For large blocks, libcall can
 1235      do nontemporal accesses and beat inline considerably.  */
1236 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1237 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1238 {{libcall, {{8, loop}, {24, unrolled_loop},
1239 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1240 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1241 4, /* scalar_stmt_cost. */
1242 2, /* scalar load_cost. */
1243 2, /* scalar_store_cost. */
1244 6, /* vec_stmt_cost. */
1245 0, /* vec_to_scalar_cost. */
1246 2, /* scalar_to_vec_cost. */
1247 2, /* vec_align_load_cost. */
1248 2, /* vec_unalign_load_cost. */
1249 2, /* vec_store_cost. */
1250 2, /* cond_taken_branch_cost. */
1251 1, /* cond_not_taken_branch_cost. */
1252 };
1253
1254 struct processor_costs bdver1_cost = {
1255 COSTS_N_INSNS (1), /* cost of an add instruction */
1256 COSTS_N_INSNS (1), /* cost of a lea instruction */
1257 COSTS_N_INSNS (1), /* variable shift costs */
1258 COSTS_N_INSNS (1), /* constant shift costs */
1259 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1260 COSTS_N_INSNS (4), /* HI */
1261 COSTS_N_INSNS (4), /* SI */
1262 COSTS_N_INSNS (6), /* DI */
1263 COSTS_N_INSNS (6)}, /* other */
1264 0, /* cost of multiply per each bit set */
1265 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1266 COSTS_N_INSNS (35), /* HI */
1267 COSTS_N_INSNS (51), /* SI */
1268 COSTS_N_INSNS (83), /* DI */
1269 COSTS_N_INSNS (83)}, /* other */
1270 COSTS_N_INSNS (1), /* cost of movsx */
1271 COSTS_N_INSNS (1), /* cost of movzx */
1272 8, /* "large" insn */
1273 9, /* MOVE_RATIO */
1274 4, /* cost for loading QImode using movzbl */
1275 {5, 5, 4}, /* cost of loading integer registers
1276 in QImode, HImode and SImode.
1277 Relative to reg-reg move (2). */
1278 {4, 4, 4}, /* cost of storing integer registers */
1279 2, /* cost of reg,reg fld/fst */
1280 {5, 5, 12}, /* cost of loading fp registers
1281 in SFmode, DFmode and XFmode */
1282 {4, 4, 8}, /* cost of storing fp registers
1283 in SFmode, DFmode and XFmode */
1284 2, /* cost of moving MMX register */
1285 {4, 4}, /* cost of loading MMX registers
1286 in SImode and DImode */
1287 {4, 4}, /* cost of storing MMX registers
1288 in SImode and DImode */
1289 2, /* cost of moving SSE register */
1290 {4, 4, 4}, /* cost of loading SSE registers
1291 in SImode, DImode and TImode */
1292 {4, 4, 4}, /* cost of storing SSE registers
1293 in SImode, DImode and TImode */
1294 2, /* MMX or SSE register to integer */
1295 /* On K8:
1296 MOVD reg64, xmmreg Double FSTORE 4
1297 MOVD reg32, xmmreg Double FSTORE 4
1298 On AMDFAM10:
1299 MOVD reg64, xmmreg Double FADD 3
1300 1/1 1/1
1301 MOVD reg32, xmmreg Double FADD 3
1302 1/1 1/1 */
1303 16, /* size of l1 cache. */
1304 2048, /* size of l2 cache. */
1305 64, /* size of prefetch block */
1306 /* New AMD processors never drop prefetches; if they cannot be performed
1307 immediately, they are queued. We set number of simultaneous prefetches
1308 to a large constant to reflect this (it probably is not a good idea not
1309 to limit number of prefetches at all, as their execution also takes some
1310 time). */
1311 100, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1319
1320 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
1321 very small blocks it is better to use loop. For large blocks, libcall
 1322      can do nontemporal accesses and beat inline considerably.  */
1323 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1324 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1325 {{libcall, {{8, loop}, {24, unrolled_loop},
1326 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1327 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1328 6, /* scalar_stmt_cost. */
1329 4, /* scalar load_cost. */
1330 4, /* scalar_store_cost. */
1331 6, /* vec_stmt_cost. */
1332 0, /* vec_to_scalar_cost. */
1333 2, /* scalar_to_vec_cost. */
1334 4, /* vec_align_load_cost. */
1335 4, /* vec_unalign_load_cost. */
1336 4, /* vec_store_cost. */
1337 2, /* cond_taken_branch_cost. */
1338 1, /* cond_not_taken_branch_cost. */
1339 };
1340
1341 struct processor_costs bdver2_cost = {
1342 COSTS_N_INSNS (1), /* cost of an add instruction */
1343 COSTS_N_INSNS (1), /* cost of a lea instruction */
1344 COSTS_N_INSNS (1), /* variable shift costs */
1345 COSTS_N_INSNS (1), /* constant shift costs */
1346 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1347 COSTS_N_INSNS (4), /* HI */
1348 COSTS_N_INSNS (4), /* SI */
1349 COSTS_N_INSNS (6), /* DI */
1350 COSTS_N_INSNS (6)}, /* other */
1351 0, /* cost of multiply per each bit set */
1352 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1353 COSTS_N_INSNS (35), /* HI */
1354 COSTS_N_INSNS (51), /* SI */
1355 COSTS_N_INSNS (83), /* DI */
1356 COSTS_N_INSNS (83)}, /* other */
1357 COSTS_N_INSNS (1), /* cost of movsx */
1358 COSTS_N_INSNS (1), /* cost of movzx */
1359 8, /* "large" insn */
1360 9, /* MOVE_RATIO */
1361 4, /* cost for loading QImode using movzbl */
1362 {5, 5, 4}, /* cost of loading integer registers
1363 in QImode, HImode and SImode.
1364 Relative to reg-reg move (2). */
1365 {4, 4, 4}, /* cost of storing integer registers */
1366 2, /* cost of reg,reg fld/fst */
1367 {5, 5, 12}, /* cost of loading fp registers
1368 in SFmode, DFmode and XFmode */
1369 {4, 4, 8}, /* cost of storing fp registers
1370 in SFmode, DFmode and XFmode */
1371 2, /* cost of moving MMX register */
1372 {4, 4}, /* cost of loading MMX registers
1373 in SImode and DImode */
1374 {4, 4}, /* cost of storing MMX registers
1375 in SImode and DImode */
1376 2, /* cost of moving SSE register */
1377 {4, 4, 4}, /* cost of loading SSE registers
1378 in SImode, DImode and TImode */
1379 {4, 4, 4}, /* cost of storing SSE registers
1380 in SImode, DImode and TImode */
1381 2, /* MMX or SSE register to integer */
1382 /* On K8:
1383 MOVD reg64, xmmreg Double FSTORE 4
1384 MOVD reg32, xmmreg Double FSTORE 4
1385 On AMDFAM10:
1386 MOVD reg64, xmmreg Double FADD 3
1387 1/1 1/1
1388 MOVD reg32, xmmreg Double FADD 3
1389 1/1 1/1 */
1390 16, /* size of l1 cache. */
1391 2048, /* size of l2 cache. */
1392 64, /* size of prefetch block */
1393 /* New AMD processors never drop prefetches; if they cannot be performed
1394 immediately, they are queued. We set number of simultaneous prefetches
1395 to a large constant to reflect this (it probably is not a good idea not
1396 to limit number of prefetches at all, as their execution also takes some
1397 time). */
1398 100, /* number of parallel prefetches */
1399 2, /* Branch cost */
1400 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1401 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1402 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1403 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1404 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1405 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1406
1407 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
1408 very small blocks it is better to use loop. For large blocks, libcall
 1409      can do nontemporal accesses and beat inline considerably.  */
1410 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1411 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1412 {{libcall, {{8, loop}, {24, unrolled_loop},
1413 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1414 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1415 6, /* scalar_stmt_cost. */
1416 4, /* scalar load_cost. */
1417 4, /* scalar_store_cost. */
1418 6, /* vec_stmt_cost. */
1419 0, /* vec_to_scalar_cost. */
1420 2, /* scalar_to_vec_cost. */
1421 4, /* vec_align_load_cost. */
1422 4, /* vec_unalign_load_cost. */
1423 4, /* vec_store_cost. */
1424 2, /* cond_taken_branch_cost. */
1425 1, /* cond_not_taken_branch_cost. */
1426 };
1427
1428 struct processor_costs btver1_cost = {
1429 COSTS_N_INSNS (1), /* cost of an add instruction */
1430 COSTS_N_INSNS (2), /* cost of a lea instruction */
1431 COSTS_N_INSNS (1), /* variable shift costs */
1432 COSTS_N_INSNS (1), /* constant shift costs */
1433 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1434 COSTS_N_INSNS (4), /* HI */
1435 COSTS_N_INSNS (3), /* SI */
1436 COSTS_N_INSNS (4), /* DI */
1437 COSTS_N_INSNS (5)}, /* other */
1438 0, /* cost of multiply per each bit set */
1439 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1440 COSTS_N_INSNS (35), /* HI */
1441 COSTS_N_INSNS (51), /* SI */
1442 COSTS_N_INSNS (83), /* DI */
1443 COSTS_N_INSNS (83)}, /* other */
1444 COSTS_N_INSNS (1), /* cost of movsx */
1445 COSTS_N_INSNS (1), /* cost of movzx */
1446 8, /* "large" insn */
1447 9, /* MOVE_RATIO */
1448 4, /* cost for loading QImode using movzbl */
1449 {3, 4, 3}, /* cost of loading integer registers
1450 in QImode, HImode and SImode.
1451 Relative to reg-reg move (2). */
1452 {3, 4, 3}, /* cost of storing integer registers */
1453 4, /* cost of reg,reg fld/fst */
1454 {4, 4, 12}, /* cost of loading fp registers
1455 in SFmode, DFmode and XFmode */
1456 {6, 6, 8}, /* cost of storing fp registers
1457 in SFmode, DFmode and XFmode */
1458 2, /* cost of moving MMX register */
1459 {3, 3}, /* cost of loading MMX registers
1460 in SImode and DImode */
1461 {4, 4}, /* cost of storing MMX registers
1462 in SImode and DImode */
1463 2, /* cost of moving SSE register */
1464 {4, 4, 3}, /* cost of loading SSE registers
1465 in SImode, DImode and TImode */
1466 {4, 4, 5}, /* cost of storing SSE registers
1467 in SImode, DImode and TImode */
1468 3, /* MMX or SSE register to integer */
1469 /* On K8:
1470 MOVD reg64, xmmreg Double FSTORE 4
1471 MOVD reg32, xmmreg Double FSTORE 4
1472 On AMDFAM10:
1473 MOVD reg64, xmmreg Double FADD 3
1474 1/1 1/1
1475 MOVD reg32, xmmreg Double FADD 3
1476 1/1 1/1 */
1477 32, /* size of l1 cache. */
1478 512, /* size of l2 cache. */
1479 64, /* size of prefetch block */
1480 100, /* number of parallel prefetches */
1481 2, /* Branch cost */
1482 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1483 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1484 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1485 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1486 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1487 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1488
1489 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1490 very small blocks it is better to use loop. For large blocks, libcall can
 1491      do nontemporal accesses and beat inline considerably.  */
1492 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1493 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1494 {{libcall, {{8, loop}, {24, unrolled_loop},
1495 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1496 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1497 4, /* scalar_stmt_cost. */
1498 2, /* scalar load_cost. */
1499 2, /* scalar_store_cost. */
1500 6, /* vec_stmt_cost. */
1501 0, /* vec_to_scalar_cost. */
1502 2, /* scalar_to_vec_cost. */
1503 2, /* vec_align_load_cost. */
1504 2, /* vec_unalign_load_cost. */
1505 2, /* vec_store_cost. */
1506 2, /* cond_taken_branch_cost. */
1507 1, /* cond_not_taken_branch_cost. */
1508 };
1509
1510 static const
1511 struct processor_costs pentium4_cost = {
1512 COSTS_N_INSNS (1), /* cost of an add instruction */
1513 COSTS_N_INSNS (3), /* cost of a lea instruction */
1514 COSTS_N_INSNS (4), /* variable shift costs */
1515 COSTS_N_INSNS (4), /* constant shift costs */
1516 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1517 COSTS_N_INSNS (15), /* HI */
1518 COSTS_N_INSNS (15), /* SI */
1519 COSTS_N_INSNS (15), /* DI */
1520 COSTS_N_INSNS (15)}, /* other */
1521 0, /* cost of multiply per each bit set */
1522 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1523 COSTS_N_INSNS (56), /* HI */
1524 COSTS_N_INSNS (56), /* SI */
1525 COSTS_N_INSNS (56), /* DI */
1526 COSTS_N_INSNS (56)}, /* other */
1527 COSTS_N_INSNS (1), /* cost of movsx */
1528 COSTS_N_INSNS (1), /* cost of movzx */
1529 16, /* "large" insn */
1530 6, /* MOVE_RATIO */
1531 2, /* cost for loading QImode using movzbl */
1532 {4, 5, 4}, /* cost of loading integer registers
1533 in QImode, HImode and SImode.
1534 Relative to reg-reg move (2). */
1535 {2, 3, 2}, /* cost of storing integer registers */
1536 2, /* cost of reg,reg fld/fst */
1537 {2, 2, 6}, /* cost of loading fp registers
1538 in SFmode, DFmode and XFmode */
1539 {4, 4, 6}, /* cost of storing fp registers
1540 in SFmode, DFmode and XFmode */
1541 2, /* cost of moving MMX register */
1542 {2, 2}, /* cost of loading MMX registers
1543 in SImode and DImode */
1544 {2, 2}, /* cost of storing MMX registers
1545 in SImode and DImode */
1546 12, /* cost of moving SSE register */
1547 {12, 12, 12}, /* cost of loading SSE registers
1548 in SImode, DImode and TImode */
1549 {2, 2, 8}, /* cost of storing SSE registers
1550 in SImode, DImode and TImode */
1551 10, /* MMX or SSE register to integer */
1552 8, /* size of l1 cache. */
1553 256, /* size of l2 cache. */
1554 64, /* size of prefetch block */
1555 6, /* number of parallel prefetches */
1556 2, /* Branch cost */
1557 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1558 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1559 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1560 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1561 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1562 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1563 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1564 DUMMY_STRINGOP_ALGS},
1565 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1566 {-1, libcall}}},
1567 DUMMY_STRINGOP_ALGS},
1568 1, /* scalar_stmt_cost. */
1569 1, /* scalar load_cost. */
1570 1, /* scalar_store_cost. */
1571 1, /* vec_stmt_cost. */
1572 1, /* vec_to_scalar_cost. */
1573 1, /* scalar_to_vec_cost. */
1574 1, /* vec_align_load_cost. */
1575 2, /* vec_unalign_load_cost. */
1576 1, /* vec_store_cost. */
1577 3, /* cond_taken_branch_cost. */
1578 1, /* cond_not_taken_branch_cost. */
1579 };
1580
1581 static const
1582 struct processor_costs nocona_cost = {
1583 COSTS_N_INSNS (1), /* cost of an add instruction */
1584 COSTS_N_INSNS (1), /* cost of a lea instruction */
1585 COSTS_N_INSNS (1), /* variable shift costs */
1586 COSTS_N_INSNS (1), /* constant shift costs */
1587 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1588 COSTS_N_INSNS (10), /* HI */
1589 COSTS_N_INSNS (10), /* SI */
1590 COSTS_N_INSNS (10), /* DI */
1591 COSTS_N_INSNS (10)}, /* other */
1592 0, /* cost of multiply per each bit set */
1593 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1594 COSTS_N_INSNS (66), /* HI */
1595 COSTS_N_INSNS (66), /* SI */
1596 COSTS_N_INSNS (66), /* DI */
1597 COSTS_N_INSNS (66)}, /* other */
1598 COSTS_N_INSNS (1), /* cost of movsx */
1599 COSTS_N_INSNS (1), /* cost of movzx */
1600 16, /* "large" insn */
1601 17, /* MOVE_RATIO */
1602 4, /* cost for loading QImode using movzbl */
1603 {4, 4, 4}, /* cost of loading integer registers
1604 in QImode, HImode and SImode.
1605 Relative to reg-reg move (2). */
1606 {4, 4, 4}, /* cost of storing integer registers */
1607 3, /* cost of reg,reg fld/fst */
1608 {12, 12, 12}, /* cost of loading fp registers
1609 in SFmode, DFmode and XFmode */
1610 {4, 4, 4}, /* cost of storing fp registers
1611 in SFmode, DFmode and XFmode */
1612 6, /* cost of moving MMX register */
1613 {12, 12}, /* cost of loading MMX registers
1614 in SImode and DImode */
1615 {12, 12}, /* cost of storing MMX registers
1616 in SImode and DImode */
1617 6, /* cost of moving SSE register */
1618 {12, 12, 12}, /* cost of loading SSE registers
1619 in SImode, DImode and TImode */
1620 {12, 12, 12}, /* cost of storing SSE registers
1621 in SImode, DImode and TImode */
1622 8, /* MMX or SSE register to integer */
1623 8, /* size of l1 cache. */
1624 1024, /* size of l2 cache. */
1625 128, /* size of prefetch block */
1626 8, /* number of parallel prefetches */
1627 1, /* Branch cost */
1628 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1629 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1630 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1631 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1632 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1633 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1634 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1635 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1636 {100000, unrolled_loop}, {-1, libcall}}}},
1637 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1638 {-1, libcall}}},
1639 {libcall, {{24, loop}, {64, unrolled_loop},
1640 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1641 1, /* scalar_stmt_cost. */
1642 1, /* scalar load_cost. */
1643 1, /* scalar_store_cost. */
1644 1, /* vec_stmt_cost. */
1645 1, /* vec_to_scalar_cost. */
1646 1, /* scalar_to_vec_cost. */
1647 1, /* vec_align_load_cost. */
1648 2, /* vec_unalign_load_cost. */
1649 1, /* vec_store_cost. */
1650 3, /* cond_taken_branch_cost. */
1651 1, /* cond_not_taken_branch_cost. */
1652 };
1653
1654 static const
1655 struct processor_costs atom_cost = {
1656 COSTS_N_INSNS (1), /* cost of an add instruction */
1657 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1658 COSTS_N_INSNS (1), /* variable shift costs */
1659 COSTS_N_INSNS (1), /* constant shift costs */
1660 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1661 COSTS_N_INSNS (4), /* HI */
1662 COSTS_N_INSNS (3), /* SI */
1663 COSTS_N_INSNS (4), /* DI */
1664 COSTS_N_INSNS (2)}, /* other */
1665 0, /* cost of multiply per each bit set */
1666 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1667 COSTS_N_INSNS (26), /* HI */
1668 COSTS_N_INSNS (42), /* SI */
1669 COSTS_N_INSNS (74), /* DI */
1670 COSTS_N_INSNS (74)}, /* other */
1671 COSTS_N_INSNS (1), /* cost of movsx */
1672 COSTS_N_INSNS (1), /* cost of movzx */
1673 8, /* "large" insn */
1674 17, /* MOVE_RATIO */
1675 4, /* cost for loading QImode using movzbl */
1676 {4, 4, 4}, /* cost of loading integer registers
1677 in QImode, HImode and SImode.
1678 Relative to reg-reg move (2). */
1679 {4, 4, 4}, /* cost of storing integer registers */
1680 4, /* cost of reg,reg fld/fst */
1681 {12, 12, 12}, /* cost of loading fp registers
1682 in SFmode, DFmode and XFmode */
1683 {6, 6, 8}, /* cost of storing fp registers
1684 in SFmode, DFmode and XFmode */
1685 2, /* cost of moving MMX register */
1686 {8, 8}, /* cost of loading MMX registers
1687 in SImode and DImode */
1688 {8, 8}, /* cost of storing MMX registers
1689 in SImode and DImode */
1690 2, /* cost of moving SSE register */
1691 {8, 8, 8}, /* cost of loading SSE registers
1692 in SImode, DImode and TImode */
1693 {8, 8, 8}, /* cost of storing SSE registers
1694 in SImode, DImode and TImode */
1695 5, /* MMX or SSE register to integer */
1696 32, /* size of l1 cache. */
1697 256, /* size of l2 cache. */
1698 64, /* size of prefetch block */
1699 6, /* number of parallel prefetches */
1700 3, /* Branch cost */
1701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1702 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1703 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1704 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1705 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1706 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1707 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1708 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1709 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1710 {{libcall, {{8, loop}, {15, unrolled_loop},
1711 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1712 {libcall, {{24, loop}, {32, unrolled_loop},
1713 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1714 1, /* scalar_stmt_cost. */
1715 1, /* scalar load_cost. */
1716 1, /* scalar_store_cost. */
1717 1, /* vec_stmt_cost. */
1718 1, /* vec_to_scalar_cost. */
1719 1, /* scalar_to_vec_cost. */
1720 1, /* vec_align_load_cost. */
1721 2, /* vec_unalign_load_cost. */
1722 1, /* vec_store_cost. */
1723 3, /* cond_taken_branch_cost. */
1724 1, /* cond_not_taken_branch_cost. */
1725 };
1726
1727 /* Generic64 should produce code tuned for Nocona and K8. */
1728 static const
1729 struct processor_costs generic64_cost = {
1730 COSTS_N_INSNS (1), /* cost of an add instruction */
1731 /* On all chips taken into consideration, lea takes 2 cycles or more. With
1732 this cost, however, our current implementation of synth_mult results in
1733 the use of unnecessary temporary registers, causing regressions on several
1734 SPECfp benchmarks. */
1735 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1736 COSTS_N_INSNS (1), /* variable shift costs */
1737 COSTS_N_INSNS (1), /* constant shift costs */
1738 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1739 COSTS_N_INSNS (4), /* HI */
1740 COSTS_N_INSNS (3), /* SI */
1741 COSTS_N_INSNS (4), /* DI */
1742 COSTS_N_INSNS (2)}, /* other */
1743 0, /* cost of multiply per each bit set */
1744 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1745 COSTS_N_INSNS (26), /* HI */
1746 COSTS_N_INSNS (42), /* SI */
1747 COSTS_N_INSNS (74), /* DI */
1748 COSTS_N_INSNS (74)}, /* other */
1749 COSTS_N_INSNS (1), /* cost of movsx */
1750 COSTS_N_INSNS (1), /* cost of movzx */
1751 8, /* "large" insn */
1752 17, /* MOVE_RATIO */
1753 4, /* cost for loading QImode using movzbl */
1754 {4, 4, 4}, /* cost of loading integer registers
1755 in QImode, HImode and SImode.
1756 Relative to reg-reg move (2). */
1757 {4, 4, 4}, /* cost of storing integer registers */
1758 4, /* cost of reg,reg fld/fst */
1759 {12, 12, 12}, /* cost of loading fp registers
1760 in SFmode, DFmode and XFmode */
1761 {6, 6, 8}, /* cost of storing fp registers
1762 in SFmode, DFmode and XFmode */
1763 2, /* cost of moving MMX register */
1764 {8, 8}, /* cost of loading MMX registers
1765 in SImode and DImode */
1766 {8, 8}, /* cost of storing MMX registers
1767 in SImode and DImode */
1768 2, /* cost of moving SSE register */
1769 {8, 8, 8}, /* cost of loading SSE registers
1770 in SImode, DImode and TImode */
1771 {8, 8, 8}, /* cost of storing SSE registers
1772 in SImode, DImode and TImode */
1773 5, /* MMX or SSE register to integer */
1774 32, /* size of l1 cache. */
1775 512, /* size of l2 cache. */
1776 64, /* size of prefetch block */
1777 6, /* number of parallel prefetches */
1778 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1779 value is increased to the perhaps more appropriate value of 5. */
1780 3, /* Branch cost */
1781 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1782 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1783 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1784 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1785 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1786 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1787 {DUMMY_STRINGOP_ALGS,
1788 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1789 {DUMMY_STRINGOP_ALGS,
1790 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 1, /* scalar_stmt_cost. */
1792 1, /* scalar load_cost. */
1793 1, /* scalar_store_cost. */
1794 1, /* vec_stmt_cost. */
1795 1, /* vec_to_scalar_cost. */
1796 1, /* scalar_to_vec_cost. */
1797 1, /* vec_align_load_cost. */
1798 2, /* vec_unalign_load_cost. */
1799 1, /* vec_store_cost. */
1800 3, /* cond_taken_branch_cost. */
1801 1, /* cond_not_taken_branch_cost. */
1802 };
1803
1804 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1805 Athlon and K8. */
1806 static const
1807 struct processor_costs generic32_cost = {
1808 COSTS_N_INSNS (1), /* cost of an add instruction */
1809 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1810 COSTS_N_INSNS (1), /* variable shift costs */
1811 COSTS_N_INSNS (1), /* constant shift costs */
1812 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1813 COSTS_N_INSNS (4), /* HI */
1814 COSTS_N_INSNS (3), /* SI */
1815 COSTS_N_INSNS (4), /* DI */
1816 COSTS_N_INSNS (2)}, /* other */
1817 0, /* cost of multiply per each bit set */
1818 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1819 COSTS_N_INSNS (26), /* HI */
1820 COSTS_N_INSNS (42), /* SI */
1821 COSTS_N_INSNS (74), /* DI */
1822 COSTS_N_INSNS (74)}, /* other */
1823 COSTS_N_INSNS (1), /* cost of movsx */
1824 COSTS_N_INSNS (1), /* cost of movzx */
1825 8, /* "large" insn */
1826 17, /* MOVE_RATIO */
1827 4, /* cost for loading QImode using movzbl */
1828 {4, 4, 4}, /* cost of loading integer registers
1829 in QImode, HImode and SImode.
1830 Relative to reg-reg move (2). */
1831 {4, 4, 4}, /* cost of storing integer registers */
1832 4, /* cost of reg,reg fld/fst */
1833 {12, 12, 12}, /* cost of loading fp registers
1834 in SFmode, DFmode and XFmode */
1835 {6, 6, 8}, /* cost of storing fp registers
1836 in SFmode, DFmode and XFmode */
1837 2, /* cost of moving MMX register */
1838 {8, 8}, /* cost of loading MMX registers
1839 in SImode and DImode */
1840 {8, 8}, /* cost of storing MMX registers
1841 in SImode and DImode */
1842 2, /* cost of moving SSE register */
1843 {8, 8, 8}, /* cost of loading SSE registers
1844 in SImode, DImode and TImode */
1845 {8, 8, 8}, /* cost of storing SSE registers
1846 in SImode, DImode and TImode */
1847 5, /* MMX or SSE register to integer */
1848 32, /* size of l1 cache. */
1849 256, /* size of l2 cache. */
1850 64, /* size of prefetch block */
1851 6, /* number of parallel prefetches */
1852 3, /* Branch cost */
1853 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1854 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1855 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1856 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1857 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1858 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1859 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1860 DUMMY_STRINGOP_ALGS},
1861 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1862 DUMMY_STRINGOP_ALGS},
1863 1, /* scalar_stmt_cost. */
1864 1, /* scalar load_cost. */
1865 1, /* scalar_store_cost. */
1866 1, /* vec_stmt_cost. */
1867 1, /* vec_to_scalar_cost. */
1868 1, /* scalar_to_vec_cost. */
1869 1, /* vec_align_load_cost. */
1870 2, /* vec_unalign_load_cost. */
1871 1, /* vec_store_cost. */
1872 3, /* cond_taken_branch_cost. */
1873 1, /* cond_not_taken_branch_cost. */
1874 };
1875
1876 const struct processor_costs *ix86_cost = &pentium_cost;
1877
1878 /* Processor feature/optimization bitmasks. */
1879 #define m_386 (1<<PROCESSOR_I386)
1880 #define m_486 (1<<PROCESSOR_I486)
1881 #define m_PENT (1<<PROCESSOR_PENTIUM)
1882 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1883 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1884 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1885 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1886 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1887 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1888 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1889 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1890 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1891 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1892 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1893 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1894 #define m_ATOM (1<<PROCESSOR_ATOM)
1895
1896 #define m_GEODE (1<<PROCESSOR_GEODE)
1897 #define m_K6 (1<<PROCESSOR_K6)
1898 #define m_K6_GEODE (m_K6 | m_GEODE)
1899 #define m_K8 (1<<PROCESSOR_K8)
1900 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1901 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1902 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1903 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1904 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1905 #define m_BDVER (m_BDVER1 | m_BDVER2)
1906 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1907 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1908
1909 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1910 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1911
1912 /* Generic instruction choice should be a common subset of the supported CPUs
1913 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1914 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1915
1916 /* Feature tests against the various tunings. */
1917 unsigned char ix86_tune_features[X86_TUNE_LAST];
1918
1919 /* Feature tests against the various tunings used to create ix86_tune_features
1920 based on the processor mask. */
1921 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1922 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1923 negatively, so enabling it for Generic64 seems like a good code size
1924 tradeoff. We can't enable it for 32-bit generic because it does not
1925 work well with PPro-based chips. */
1926 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1927
1928 /* X86_TUNE_PUSH_MEMORY */
1929 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1930
1931 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1932 m_486 | m_PENT,
1933
1934 /* X86_TUNE_UNROLL_STRLEN */
1935 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1936
1937 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1938 on simulation results. But after P4 was made, no performance benefit
1939 was observed with branch hints; they also increase code size.
1940 As a result, icc never generates branch hints. */
1941 0,
1942
1943 /* X86_TUNE_DOUBLE_WITH_ADD */
1944 ~m_386,
1945
1946 /* X86_TUNE_USE_SAHF */
1947 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1948
1949 /* X86_TUNE_MOVX: Enable zero extension of integer registers to avoid
1950 partial dependencies. */
1951 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1952
1953 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1954 register stalls on the Generic32 compilation setting as well. However,
1955 in the current implementation partial register stalls are not eliminated
1956 very well - they can be introduced via subregs synthesized by combine
1957 and can happen in caller/callee saving sequences. Because this option
1958 pays back little on PPro-based chips and conflicts with the partial reg
1959 dependencies used by Athlon/P4-based chips, it is better to leave it off
1960 for generic32 for now. */
1961 m_PPRO,
1962
1963 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1964 m_CORE2I7 | m_GENERIC,
1965
1966 /* X86_TUNE_USE_HIMODE_FIOP */
1967 m_386 | m_486 | m_K6_GEODE,
1968
1969 /* X86_TUNE_USE_SIMODE_FIOP */
1970 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1971
1972 /* X86_TUNE_USE_MOV0 */
1973 m_K6,
1974
1975 /* X86_TUNE_USE_CLTD */
1976 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1977
1978 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1979 m_PENT4,
1980
1981 /* X86_TUNE_SPLIT_LONG_MOVES */
1982 m_PPRO,
1983
1984 /* X86_TUNE_READ_MODIFY_WRITE */
1985 ~m_PENT,
1986
1987 /* X86_TUNE_READ_MODIFY */
1988 ~(m_PENT | m_PPRO),
1989
1990 /* X86_TUNE_PROMOTE_QIMODE */
1991 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1992
1993 /* X86_TUNE_FAST_PREFIX */
1994 ~(m_386 | m_486 | m_PENT),
1995
1996 /* X86_TUNE_SINGLE_STRINGOP */
1997 m_386 | m_P4_NOCONA,
1998
1999 /* X86_TUNE_QIMODE_MATH */
2000 ~0,
2001
2002 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2003 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2004 might be considered for Generic32 if our scheme for avoiding partial
2005 stalls was more effective. */
2006 ~m_PPRO,
2007
2008 /* X86_TUNE_PROMOTE_QI_REGS */
2009 0,
2010
2011 /* X86_TUNE_PROMOTE_HI_REGS */
2012 m_PPRO,
2013
2014 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2015 over esp addition. */
2016 m_386 | m_486 | m_PENT | m_PPRO,
2017
2018 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2019 over esp addition. */
2020 m_PENT,
2021
2022 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2023 over esp subtraction. */
2024 m_386 | m_486 | m_PENT | m_K6_GEODE,
2025
2026 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
2027 over esp subtraction. */
2028 m_PENT | m_K6_GEODE,
2029
2030 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2031 for DFmode copies */
2032 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2033
2034 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2035 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2036
2037 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2038 conflict between PPro/Pentium4-based chips that treat 128-bit
2039 SSE registers as single units and K8-based chips that divide SSE
2040 registers into two 64-bit halves. This knob promotes all store destinations
2041 to 128 bits to allow register renaming on 128-bit SSE units, but usually
2042 results in one extra microop on 64-bit SSE units. Experimental results
2043 show that disabling this option on P4 brings over a 20% SPECfp regression,
2044 while enabling it on K8 brings roughly a 2.4% regression that can be partly
2045 masked by careful scheduling of moves. */
2046 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2047
2048 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2049 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2050
2051 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2052 m_COREI7 | m_BDVER,
2053
2054 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2055 m_BDVER,
2056
2057 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where types and dependencies
2058 are resolved on SSE register parts instead of whole registers, so we may
2059 maintain just the lower part of scalar values in the proper format, leaving
2060 the upper part undefined. */
2061 m_ATHLON_K8,
2062
2063 /* X86_TUNE_SSE_TYPELESS_STORES */
2064 m_AMD_MULTIPLE,
2065
2066 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2067 m_PPRO | m_P4_NOCONA,
2068
2069 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2070 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2071
2072 /* X86_TUNE_PROLOGUE_USING_MOVE */
2073 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2074
2075 /* X86_TUNE_EPILOGUE_USING_MOVE */
2076 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2077
2078 /* X86_TUNE_SHIFT1 */
2079 ~m_486,
2080
2081 /* X86_TUNE_USE_FFREEP */
2082 m_AMD_MULTIPLE,
2083
2084 /* X86_TUNE_INTER_UNIT_MOVES */
2085 ~(m_AMD_MULTIPLE | m_GENERIC),
2086
2087 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2088 ~(m_AMDFAM10 | m_BDVER),
2089
2090 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2091 than 4 branch instructions in the 16 byte window. */
2092 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2093
2094 /* X86_TUNE_SCHEDULE */
2095 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2096
2097 /* X86_TUNE_USE_BT */
2098 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2099
2100 /* X86_TUNE_USE_INCDEC */
2101 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2102
2103 /* X86_TUNE_PAD_RETURNS */
2104 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2105
2106 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2107 m_ATOM,
2108
2109 /* X86_TUNE_EXT_80387_CONSTANTS */
2110 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2111
2112 /* X86_TUNE_SHORTEN_X87_SSE */
2113 ~m_K8,
2114
2115 /* X86_TUNE_AVOID_VECTOR_DECODE */
2116 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2117
2118 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2119 and SImode multiplies, but the 386 and 486 do HImode multiplies faster. */
2120 ~(m_386 | m_486),
2121
2122 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2123 vector path on AMD machines. */
2124 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2125
2126 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2127 machines. */
2128 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2129
2130 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2131 than a MOV. */
2132 m_PENT,
2133
2134 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2135 but one byte longer. */
2136 m_PENT,
2137
2138 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2139 operand that cannot be represented using a modRM byte. The XOR
2140 replacement is long decoded, so this split helps here as well. */
2141 m_K6,
2142
2143 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2144 from FP to FP. */
2145 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2146
2147 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2148 from integer to FP. */
2149 m_AMDFAM10,
2150
2151 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2152 with a subsequent conditional jump instruction into a single
2153 compare-and-branch uop. */
2154 m_BDVER,
2155
2156 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2157 will impact LEA instruction selection. */
2158 m_ATOM,
2159
2160 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2161 instructions. */
2162 ~m_ATOM,
2163
2164 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2165 at -O3. For the moment, the prefetching seems badly tuned for Intel
2166 chips. */
2167 m_K6_GEODE | m_AMD_MULTIPLE,
2168
2169 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2170 the auto-vectorizer. */
2171 m_BDVER,
2172
2173 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2174 during reassociation of integer computation. */
2175 m_ATOM,
2176
2177 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2178 during reassociation of fp computation. */
2179 m_ATOM
2180 };
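
/* Illustrative sketch, not the actual code: the masks above become the
   per-feature booleans in ix86_tune_features by testing each entry against
   the bit of the selected tuning target, roughly as below. The real loop
   is in ix86_option_override_internal later in this file.  */
#if 0
{
  unsigned int tune_mask = 1u << ix86_tune;
  unsigned int f;

  for (f = 0; f < X86_TUNE_LAST; f++)
    ix86_tune_features[f] = !!(initial_ix86_tune_features[f] & tune_mask);
}
#endif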
2181
2182 /* Feature tests against the various architecture variations. */
2183 unsigned char ix86_arch_features[X86_ARCH_LAST];
2184
2185 /* Feature tests against the various architecture variations, used to create
2186 ix86_arch_features based on the processor mask. */
2187 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2188 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
2189 ~(m_386 | m_486 | m_PENT | m_K6),
2190
2191 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2192 ~m_386,
2193
2194 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2195 ~(m_386 | m_486),
2196
2197 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2198 ~m_386,
2199
2200 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2201 ~m_386,
2202 };
2203
2204 static const unsigned int x86_accumulate_outgoing_args
2205 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2206
2207 static const unsigned int x86_arch_always_fancy_math_387
2208 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2209
2210 static const unsigned int x86_avx256_split_unaligned_load
2211 = m_COREI7 | m_GENERIC;
2212
2213 static const unsigned int x86_avx256_split_unaligned_store
2214 = m_COREI7 | m_BDVER | m_GENERIC;
2215
2216 /* In case the average insn count for a single function invocation is
2217 lower than this constant, emit fast (but longer) prologue and
2218 epilogue code. */
2219 #define FAST_PROLOGUE_INSN_COUNT 20
2220
2221 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2222 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2223 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2224 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2225
2226 /* Array of the smallest class containing reg number REGNO, indexed by
2227 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2228
2229 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2230 {
2231 /* ax, dx, cx, bx */
2232 AREG, DREG, CREG, BREG,
2233 /* si, di, bp, sp */
2234 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2235 /* FP registers */
2236 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2237 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2238 /* arg pointer */
2239 NON_Q_REGS,
2240 /* flags, fpsr, fpcr, frame */
2241 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2242 /* SSE registers */
2243 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2244 SSE_REGS, SSE_REGS,
2245 /* MMX registers */
2246 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2247 MMX_REGS, MMX_REGS,
2248 /* REX registers */
2249 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2250 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251 /* SSE REX registers */
2252 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2253 SSE_REGS, SSE_REGS,
2254 };
2255
2256 /* The "default" register map used in 32bit mode. */
2257
2258 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2259 {
2260 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2261 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2262 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2263 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2264 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2265 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2266 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2267 };
2268
2269 /* The "default" register map used in 64bit mode. */
2270
2271 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2272 {
2273 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2274 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2275 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2276 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2277 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2278 8,9,10,11,12,13,14,15, /* extended integer registers */
2279 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2280 };
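
/* Illustrative note: the two maps above are assumed to be selected by the
   DBX_REGISTER_NUMBER macro in i386.h, roughly
     #define DBX_REGISTER_NUMBER(n) \
       (TARGET_64BIT ? dbx64_register_map[n] : dbx_register_map[n])
   so, for example, gcc regno 0 (%eax/%rax) maps to debug register number 0
   in both modes, while the FP stack registers map differently.  */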
2281
2282 /* Define the register numbers to be used in Dwarf debugging information.
2283 The SVR4 reference port C compiler uses the following register numbers
2284 in its Dwarf output code:
2285 0 for %eax (gcc regno = 0)
2286 1 for %ecx (gcc regno = 2)
2287 2 for %edx (gcc regno = 1)
2288 3 for %ebx (gcc regno = 3)
2289 4 for %esp (gcc regno = 7)
2290 5 for %ebp (gcc regno = 6)
2291 6 for %esi (gcc regno = 4)
2292 7 for %edi (gcc regno = 5)
2293 The following three DWARF register numbers are never generated by
2294 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2295 believes these numbers have these meanings.
2296 8 for %eip (no gcc equivalent)
2297 9 for %eflags (gcc regno = 17)
2298 10 for %trapno (no gcc equivalent)
2299 It is not at all clear how we should number the FP stack registers
2300 for the x86 architecture. If the version of SDB on x86/svr4 were
2301 a bit less brain dead with respect to floating-point then we would
2302 have a precedent to follow with respect to DWARF register numbers
2303 for x86 FP registers, but the SDB on x86/svr4 is so completely
2304 broken with respect to FP registers that it is hardly worth thinking
2305 of it as something to strive for compatibility with.
2306 The version of x86/svr4 SDB I have at the moment does (partially)
2307 seem to believe that DWARF register number 11 is associated with
2308 the x86 register %st(0), but that's about all. Higher DWARF
2309 register numbers don't seem to be associated with anything in
2310 particular, and even for DWARF regno 11, SDB only seems to under-
2311 stand that it should say that a variable lives in %st(0) (when
2312 asked via an `=' command) if we said it was in DWARF regno 11,
2313 but SDB still prints garbage when asked for the value of the
2314 variable in question (via a `/' command).
2315 (Also note that the labels SDB prints for various FP stack regs
2316 when doing an `x' command are all wrong.)
2317 Note that these problems generally don't affect the native SVR4
2318 C compiler because it doesn't allow the use of -O with -g and
2319 because when it is *not* optimizing, it allocates a memory
2320 location for each floating-point variable, and the memory
2321 location is what gets described in the DWARF AT_location
2322 attribute for the variable in question.
2323 Regardless of the severe mental illness of the x86/svr4 SDB, we
2324 do something sensible here and we use the following DWARF
2325 register numbers. Note that these are all stack-top-relative
2326 numbers.
2327 11 for %st(0) (gcc regno = 8)
2328 12 for %st(1) (gcc regno = 9)
2329 13 for %st(2) (gcc regno = 10)
2330 14 for %st(3) (gcc regno = 11)
2331 15 for %st(4) (gcc regno = 12)
2332 16 for %st(5) (gcc regno = 13)
2333 17 for %st(6) (gcc regno = 14)
2334 18 for %st(7) (gcc regno = 15)
2335 */
2336 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2337 {
2338 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2339 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2340 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2341 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2342 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2343 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2344 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2345 };
2346
2347 /* Define parameter passing and return registers. */
2348
2349 static int const x86_64_int_parameter_registers[6] =
2350 {
2351 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2352 };
2353
2354 static int const x86_64_ms_abi_int_parameter_registers[4] =
2355 {
2356 CX_REG, DX_REG, R8_REG, R9_REG
2357 };
2358
2359 static int const x86_64_int_return_registers[4] =
2360 {
2361 AX_REG, DX_REG, DI_REG, SI_REG
2362 };
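
/* Worked example: for a SysV x86-64 call f (a, b, c) with three integer
   arguments, the tables above place them in %rdi, %rsi and %rdx; under the
   MS ABI the same call uses %rcx, %rdx and %r8, and only four integer
   registers are available for parameter passing.  */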
2363
2364 /* Define the structure for the machine field in struct function. */
2365
2366 struct GTY(()) stack_local_entry {
2367 unsigned short mode;
2368 unsigned short n;
2369 rtx rtl;
2370 struct stack_local_entry *next;
2371 };
2372
2373 /* Structure describing stack frame layout.
2374 Stack grows downward:
2375
2376 [arguments]
2377 <- ARG_POINTER
2378 saved pc
2379
2380 saved static chain if ix86_static_chain_on_stack
2381
2382 saved frame pointer if frame_pointer_needed
2383 <- HARD_FRAME_POINTER
2384 [saved regs]
2385 <- regs_save_offset
2386 [padding0]
2387
2388 [saved SSE regs]
2389 <- sse_regs_save_offset
2390 [padding1] |
2391 | <- FRAME_POINTER
2392 [va_arg registers] |
2393 |
2394 [frame] |
2395 |
2396 [padding2] | = to_allocate
2397 <- STACK_POINTER
2398 */
2399 struct ix86_frame
2400 {
2401 int nsseregs;
2402 int nregs;
2403 int va_arg_size;
2404 int red_zone_size;
2405 int outgoing_arguments_size;
2406 HOST_WIDE_INT frame;
2407
2408 /* The offsets relative to ARG_POINTER. */
2409 HOST_WIDE_INT frame_pointer_offset;
2410 HOST_WIDE_INT hard_frame_pointer_offset;
2411 HOST_WIDE_INT stack_pointer_offset;
2412 HOST_WIDE_INT hfp_save_offset;
2413 HOST_WIDE_INT reg_save_offset;
2414 HOST_WIDE_INT sse_reg_save_offset;
2415
2416 /* When save_regs_using_mov is set, emit prologue using
2417 move instead of push instructions. */
2418 bool save_regs_using_mov;
2419 };
2420
2421 /* Which cpu are we scheduling for. */
2422 enum attr_cpu ix86_schedule;
2423
2424 /* Which cpu are we optimizing for. */
2425 enum processor_type ix86_tune;
2426
2427 /* Which instruction set architecture to use. */
2428 enum processor_type ix86_arch;
2429
2430 /* True if the SSE prefetch instruction is not a NOP. */
2431 int x86_prefetch_sse;
2432
2433 /* -mstackrealign option */
2434 static const char ix86_force_align_arg_pointer_string[]
2435 = "force_align_arg_pointer";
2436
2437 static rtx (*ix86_gen_leave) (void);
2438 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2439 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2440 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2441 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2442 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2443 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2445 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2446 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2447 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2448 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2449
2450 /* Preferred alignment for stack boundary in bits. */
2451 unsigned int ix86_preferred_stack_boundary;
2452
2453 /* Alignment for incoming stack boundary in bits specified at
2454 command line. */
2455 static unsigned int ix86_user_incoming_stack_boundary;
2456
2457 /* Default alignment for incoming stack boundary in bits. */
2458 static unsigned int ix86_default_incoming_stack_boundary;
2459
2460 /* Alignment for incoming stack boundary in bits. */
2461 unsigned int ix86_incoming_stack_boundary;
2462
2463 /* Calling abi specific va_list type nodes. */
2464 static GTY(()) tree sysv_va_list_type_node;
2465 static GTY(()) tree ms_va_list_type_node;
2466
2467 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2468 char internal_label_prefix[16];
2469 int internal_label_prefix_len;
2470
2471 /* Fence to use after loop using movnt. */
2472 tree x86_mfence;
2473
2474 /* Register class used for passing a given 64-bit part of an argument.
2475 These represent classes as documented by the psABI, with the exception
2476 of the SSESF and SSEDF classes, which are basically the SSE class; gcc
2477 uses an SFmode or DFmode move instead of DImode to avoid reformatting penalties.
2478
2479 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2480 whenever possible (the upper half then contains only padding). */
2481 enum x86_64_reg_class
2482 {
2483 X86_64_NO_CLASS,
2484 X86_64_INTEGER_CLASS,
2485 X86_64_INTEGERSI_CLASS,
2486 X86_64_SSE_CLASS,
2487 X86_64_SSESF_CLASS,
2488 X86_64_SSEDF_CLASS,
2489 X86_64_SSEUP_CLASS,
2490 X86_64_X87_CLASS,
2491 X86_64_X87UP_CLASS,
2492 X86_64_COMPLEX_X87_CLASS,
2493 X86_64_MEMORY_CLASS
2494 };
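
/* Worked example of this classification: a structure such as
     struct { double d; int i; }
   occupies two eightbytes. The first (the double) classifies as
   X86_64_SSEDF_CLASS and is passed in an SSE register; the second (the int
   plus padding) classifies as X86_64_INTEGERSI_CLASS and is passed in a
   general-purpose register.  */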
2495
2496 #define MAX_CLASSES 4
2497
2498 /* Table of constants used by fldpi, fldln2, etc.... */
2499 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2500 static bool ext_80387_constants_init = 0;
2501
2502 \f
2503 static struct machine_function * ix86_init_machine_status (void);
2504 static rtx ix86_function_value (const_tree, const_tree, bool);
2505 static bool ix86_function_value_regno_p (const unsigned int);
2506 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2507 const_tree);
2508 static rtx ix86_static_chain (const_tree, bool);
2509 static int ix86_function_regparm (const_tree, const_tree);
2510 static void ix86_compute_frame_layout (struct ix86_frame *);
2511 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2512 rtx, rtx, int);
2513 static void ix86_add_new_builtins (HOST_WIDE_INT);
2514 static tree ix86_canonical_va_list_type (tree);
2515 static void predict_jump (int);
2516 static unsigned int split_stack_prologue_scratch_regno (void);
2517 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2518
2519 enum ix86_function_specific_strings
2520 {
2521 IX86_FUNCTION_SPECIFIC_ARCH,
2522 IX86_FUNCTION_SPECIFIC_TUNE,
2523 IX86_FUNCTION_SPECIFIC_MAX
2524 };
2525
2526 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2527 const char *, enum fpmath_unit, bool);
2528 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2529 static void ix86_function_specific_save (struct cl_target_option *);
2530 static void ix86_function_specific_restore (struct cl_target_option *);
2531 static void ix86_function_specific_print (FILE *, int,
2532 struct cl_target_option *);
2533 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2534 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2535 struct gcc_options *);
2536 static bool ix86_can_inline_p (tree, tree);
2537 static void ix86_set_current_function (tree);
2538 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2539
2540 static enum calling_abi ix86_function_abi (const_tree);
2541
2542 \f
2543 #ifndef SUBTARGET32_DEFAULT_CPU
2544 #define SUBTARGET32_DEFAULT_CPU "i386"
2545 #endif
2546
2547 /* The svr4 ABI for the i386 says that records and unions are returned
2548 in memory. */
2549 #ifndef DEFAULT_PCC_STRUCT_RETURN
2550 #define DEFAULT_PCC_STRUCT_RETURN 1
2551 #endif
2552
2553 /* Whether -mtune= or -march= were specified */
2554 static int ix86_tune_defaulted;
2555 static int ix86_arch_specified;
2556
2557 /* Vectorization library interface and handlers. */
2558 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2559
2560 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2561 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2562
2563 /* Processor target table, indexed by processor number */
2564 struct ptt
2565 {
2566 const struct processor_costs *cost; /* Processor costs */
2567 const int align_loop; /* Default alignments. */
2568 const int align_loop_max_skip;
2569 const int align_jump;
2570 const int align_jump_max_skip;
2571 const int align_func;
2572 };
2573
2574 static const struct ptt processor_target_table[PROCESSOR_max] =
2575 {
2576 {&i386_cost, 4, 3, 4, 3, 4},
2577 {&i486_cost, 16, 15, 16, 15, 16},
2578 {&pentium_cost, 16, 7, 16, 7, 16},
2579 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2580 {&geode_cost, 0, 0, 0, 0, 0},
2581 {&k6_cost, 32, 7, 32, 7, 32},
2582 {&athlon_cost, 16, 7, 16, 7, 16},
2583 {&pentium4_cost, 0, 0, 0, 0, 0},
2584 {&k8_cost, 16, 7, 16, 7, 16},
2585 {&nocona_cost, 0, 0, 0, 0, 0},
2586 /* Core 2 32-bit. */
2587 {&generic32_cost, 16, 10, 16, 10, 16},
2588 /* Core 2 64-bit. */
2589 {&generic64_cost, 16, 10, 16, 10, 16},
2590 /* Core i7 32-bit. */
2591 {&generic32_cost, 16, 10, 16, 10, 16},
2592 /* Core i7 64-bit. */
2593 {&generic64_cost, 16, 10, 16, 10, 16},
2594 {&generic32_cost, 16, 7, 16, 7, 16},
2595 {&generic64_cost, 16, 10, 16, 10, 16},
2596 {&amdfam10_cost, 32, 24, 32, 7, 32},
2597 {&bdver1_cost, 32, 24, 32, 7, 32},
2598 {&bdver2_cost, 32, 24, 32, 7, 32},
2599 {&btver1_cost, 32, 24, 32, 7, 32},
2600 {&atom_cost, 16, 15, 16, 7, 16}
2601 };
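
/* Illustrative sketch, not the actual code: the alignment columns above
   supply defaults when the user did not request specific alignments,
   roughly as below; the real code is in ix86_option_override_internal.  */
#if 0
if (align_loops == 0)
  {
    align_loops = processor_target_table[ix86_tune].align_loop;
    align_loops_max_skip
      = processor_target_table[ix86_tune].align_loop_max_skip;
  }
#endif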
2602
2603 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2604 {
2605 "generic",
2606 "i386",
2607 "i486",
2608 "pentium",
2609 "pentium-mmx",
2610 "pentiumpro",
2611 "pentium2",
2612 "pentium3",
2613 "pentium4",
2614 "pentium-m",
2615 "prescott",
2616 "nocona",
2617 "core2",
2618 "corei7",
2619 "atom",
2620 "geode",
2621 "k6",
2622 "k6-2",
2623 "k6-3",
2624 "athlon",
2625 "athlon-4",
2626 "k8",
2627 "amdfam10",
2628 "bdver1",
2629 "bdver2",
2630 "btver1"
2631 };
2632 \f
2633 /* Return true if a red-zone is in use. */
2634
2635 static inline bool
2636 ix86_using_red_zone (void)
2637 {
2638 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2639 }
2640 \f
2641 /* Return a string that documents the current -m options. The caller is
2642 responsible for freeing the string. */
2643
2644 static char *
2645 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2646 const char *tune, enum fpmath_unit fpmath,
2647 bool add_nl_p)
2648 {
2649 struct ix86_target_opts
2650 {
2651 const char *option; /* option string */
2652 HOST_WIDE_INT mask; /* isa mask options */
2653 };
2654
2655 /* This table is ordered so that options like -msse4.2, which imply
2656 the preceding options, are matched first. */
2657 static struct ix86_target_opts isa_opts[] =
2658 {
2659 { "-m64", OPTION_MASK_ISA_64BIT },
2660 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2661 { "-mfma", OPTION_MASK_ISA_FMA },
2662 { "-mxop", OPTION_MASK_ISA_XOP },
2663 { "-mlwp", OPTION_MASK_ISA_LWP },
2664 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2665 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2666 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2667 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2668 { "-msse3", OPTION_MASK_ISA_SSE3 },
2669 { "-msse2", OPTION_MASK_ISA_SSE2 },
2670 { "-msse", OPTION_MASK_ISA_SSE },
2671 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2672 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2673 { "-mmmx", OPTION_MASK_ISA_MMX },
2674 { "-mabm", OPTION_MASK_ISA_ABM },
2675 { "-mbmi", OPTION_MASK_ISA_BMI },
2676 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2677 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2678 { "-mtbm", OPTION_MASK_ISA_TBM },
2679 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2680 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2681 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2682 { "-maes", OPTION_MASK_ISA_AES },
2683 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2684 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2685 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2686 { "-mf16c", OPTION_MASK_ISA_F16C },
2687 { "-mrtm", OPTION_MASK_ISA_RTM },
2688 };
2689
2690 /* Flag options. */
2691 static struct ix86_target_opts flag_opts[] =
2692 {
2693 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2694 { "-m80387", MASK_80387 },
2695 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2696 { "-malign-double", MASK_ALIGN_DOUBLE },
2697 { "-mcld", MASK_CLD },
2698 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2699 { "-mieee-fp", MASK_IEEE_FP },
2700 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2701 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2702 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2703 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2704 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2705 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2706 { "-mno-red-zone", MASK_NO_RED_ZONE },
2707 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2708 { "-mrecip", MASK_RECIP },
2709 { "-mrtd", MASK_RTD },
2710 { "-msseregparm", MASK_SSEREGPARM },
2711 { "-mstack-arg-probe", MASK_STACK_PROBE },
2712 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2713 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2714 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2715 { "-mvzeroupper", MASK_VZEROUPPER },
2716 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2717 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2718 { "-mprefer-avx128", MASK_PREFER_AVX128},
2719 };
2720
2721 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2722
2723 char isa_other[40];
2724 char target_other[40];
2725 unsigned num = 0;
2726 unsigned i, j;
2727 char *ret;
2728 char *ptr;
2729 size_t len;
2730 size_t line_len;
2731 size_t sep_len;
2732
2733 memset (opts, '\0', sizeof (opts));
2734
2735 /* Add -march= option. */
2736 if (arch)
2737 {
2738 opts[num][0] = "-march=";
2739 opts[num++][1] = arch;
2740 }
2741
2742 /* Add -mtune= option. */
2743 if (tune)
2744 {
2745 opts[num][0] = "-mtune=";
2746 opts[num++][1] = tune;
2747 }
2748
2749 /* Pick out the options in isa options. */
2750 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2751 {
2752 if ((isa & isa_opts[i].mask) != 0)
2753 {
2754 opts[num++][0] = isa_opts[i].option;
2755 isa &= ~ isa_opts[i].mask;
2756 }
2757 }
2758
2759 if (isa && add_nl_p)
2760 {
2761 opts[num++][0] = isa_other;
2762 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2763 isa);
2764 }
2765
2766 /* Add flag options. */
2767 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2768 {
2769 if ((flags & flag_opts[i].mask) != 0)
2770 {
2771 opts[num++][0] = flag_opts[i].option;
2772 flags &= ~ flag_opts[i].mask;
2773 }
2774 }
2775
2776 if (flags && add_nl_p)
2777 {
2778 opts[num++][0] = target_other;
2779 sprintf (target_other, "(other flags: %#x)", flags);
2780 }
2781
2782 /* Add -mfpmath= option. */
2783 if (fpmath)
2784 {
2785 opts[num][0] = "-mfpmath=";
2786 switch ((int) fpmath)
2787 {
2788 case FPMATH_387:
2789 opts[num++][1] = "387";
2790 break;
2791
2792 case FPMATH_SSE:
2793 opts[num++][1] = "sse";
2794 break;
2795
2796 case FPMATH_387 | FPMATH_SSE:
2797 opts[num++][1] = "sse+387";
2798 break;
2799
2800 default:
2801 gcc_unreachable ();
2802 }
2803 }
2804
2805 /* Any options? */
2806 if (num == 0)
2807 return NULL;
2808
2809 gcc_assert (num < ARRAY_SIZE (opts));
2810
2811 /* Size the string. */
2812 len = 0;
2813 sep_len = (add_nl_p) ? 3 : 1;
2814 for (i = 0; i < num; i++)
2815 {
2816 len += sep_len;
2817 for (j = 0; j < 2; j++)
2818 if (opts[i][j])
2819 len += strlen (opts[i][j]);
2820 }
2821
2822 /* Build the string. */
2823 ret = ptr = (char *) xmalloc (len);
2824 line_len = 0;
2825
2826 for (i = 0; i < num; i++)
2827 {
2828 size_t len2[2];
2829
2830 for (j = 0; j < 2; j++)
2831 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2832
2833 if (i != 0)
2834 {
2835 *ptr++ = ' ';
2836 line_len++;
2837
2838 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2839 {
2840 *ptr++ = '\\';
2841 *ptr++ = '\n';
2842 line_len = 0;
2843 }
2844 }
2845
2846 for (j = 0; j < 2; j++)
2847 if (opts[i][j])
2848 {
2849 memcpy (ptr, opts[i][j], len2[j]);
2850 ptr += len2[j];
2851 line_len += len2[j];
2852 }
2853 }
2854
2855 *ptr = '\0';
2856 gcc_assert (ret + len >= ptr);
2857
2858 return ret;
2859 }
2860
2861 /* Return true if profiling code should be emitted before the
2862 prologue, and false otherwise.
2863 Note: for x86 with "hotfix" this is unfortunate. */
2864 static bool
2865 ix86_profile_before_prologue (void)
2866 {
2867 return flag_fentry != 0;
2868 }
2869
2870 /* Function that is callable from the debugger to print the current
2871 options. */
2872 void
2873 ix86_debug_options (void)
2874 {
2875 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2876 ix86_arch_string, ix86_tune_string,
2877 ix86_fpmath, true);
2878
2879 if (opts)
2880 {
2881 fprintf (stderr, "%s\n\n", opts);
2882 free (opts);
2883 }
2884 else
2885 fputs ("<no options>\n\n", stderr);
2886
2887 return;
2888 }
2889 \f
2890 /* Override various settings based on options. If MAIN_ARGS_P, the
2891 options are from the command line, otherwise they are from
2892 attributes. */
2893
2894 static void
2895 ix86_option_override_internal (bool main_args_p)
2896 {
2897 int i;
2898 unsigned int ix86_arch_mask, ix86_tune_mask;
2899 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2900 const char *prefix;
2901 const char *suffix;
2902 const char *sw;
2903
2904 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2905 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2906 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2907 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2908 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2909 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2910 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2911 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2912 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2913 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2914 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2915 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2916 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2917 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2918 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2919 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2920 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2921 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2922 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2923 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2924 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2925 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2926 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2927 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2928 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2929 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2930 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2931 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2932 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2933 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2934 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2935 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2936 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2937 /* if this reaches 64, need to widen struct pta flags below */
2938
2939 static struct pta
2940 {
2941 const char *const name; /* processor name or nickname. */
2942 const enum processor_type processor;
2943 const enum attr_cpu schedule;
2944 const unsigned HOST_WIDE_INT flags;
2945 }
2946 const processor_alias_table[] =
2947 {
2948 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2949 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2950 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2951 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2952 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2953 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2954 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2955 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2956 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2957 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2958 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2959 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2960 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2961 PTA_MMX | PTA_SSE},
2962 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2963 PTA_MMX | PTA_SSE},
2964 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2965 PTA_MMX | PTA_SSE | PTA_SSE2},
2966 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2967 PTA_MMX |PTA_SSE | PTA_SSE2},
2968 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2969 PTA_MMX | PTA_SSE | PTA_SSE2},
2970 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2971 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2972 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2973 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2974 | PTA_CX16 | PTA_NO_SAHF},
2975 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2976 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2977 | PTA_SSSE3 | PTA_CX16},
2978 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2979 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2980 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
2981 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2982 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2983 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2984 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2985 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2986 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2987 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2988 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2989 | PTA_RDRND | PTA_F16C},
2990 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2991 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2992 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2993 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2994 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2995 | PTA_FMA | PTA_MOVBE | PTA_RTM},
2996 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2997 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2998 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
2999 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3000 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A |PTA_PREFETCH_SSE},
3001 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3002 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3003 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3004 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3005 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3006 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3007 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3008 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3009 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3010 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3011 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3012 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3013 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3014 {"x86-64", PROCESSOR_K8, CPU_K8,
3015 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3016 {"k8", PROCESSOR_K8, CPU_K8,
3017 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3018 | PTA_SSE2 | PTA_NO_SAHF},
3019 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3020 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3021 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3022 {"opteron", PROCESSOR_K8, CPU_K8,
3023 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3024 | PTA_SSE2 | PTA_NO_SAHF},
3025 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3026 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3027 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3028 {"athlon64", PROCESSOR_K8, CPU_K8,
3029 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3030 | PTA_SSE2 | PTA_NO_SAHF},
3031 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3032 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3033 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3034 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3035 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3036 | PTA_SSE2 | PTA_NO_SAHF},
3037 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3038 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3039 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3040 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3041 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3042 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3043 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3044 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3045 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3046 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3047 | PTA_XOP | PTA_LWP},
3048 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3049 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3050 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3051 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3052 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3053 | PTA_FMA},
3054 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3055 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3056 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16},
3057 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3058 0 /* flags are only used for -march switch. */ },
3059 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3060 PTA_64BIT /* flags are only used for -march switch. */ },
3061 };
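
  /* Illustrative sketch, not the actual code: -march=NAME is resolved by a
     simple name lookup in the alias table above, and each PTA_* flag then
     enables the matching OPTION_MASK_ISA_* bit unless the user explicitly
     overrode it. The real loop appears later in this function.  */
#if 0
  for (i = 0; i < pta_size; i++)
    if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
      {
        ix86_schedule = processor_alias_table[i].schedule;
        ix86_arch = processor_alias_table[i].processor;
        if (processor_alias_table[i].flags & PTA_MMX
            && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
          ix86_isa_flags |= OPTION_MASK_ISA_MMX;
        /* ...and likewise for the remaining PTA_* bits.  */
        break;
      }
#endif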
3062
3063 /* -mrecip options. */
3064 static struct
3065 {
3066 const char *string; /* option name */
3067 unsigned int mask; /* mask bits to set */
3068 }
3069 const recip_options[] =
3070 {
3071 { "all", RECIP_MASK_ALL },
3072 { "none", RECIP_MASK_NONE },
3073 { "div", RECIP_MASK_DIV },
3074 { "sqrt", RECIP_MASK_SQRT },
3075 { "vec-div", RECIP_MASK_VEC_DIV },
3076 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3077 };
3078
3079 int const pta_size = ARRAY_SIZE (processor_alias_table);
3080
3081 /* Set up prefix/suffix so the error messages refer to either the command
3082 line argument, or the attribute(target). */
3083 if (main_args_p)
3084 {
3085 prefix = "-m";
3086 suffix = "";
3087 sw = "switch";
3088 }
3089 else
3090 {
3091 prefix = "option(\"";
3092 suffix = "\")";
3093 sw = "attribute";
3094 }
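  /* For illustration: with these settings a bad tune value is reported as
     "bad value (foo) for -mtune= switch" on the command line, and as
     "bad value (foo) for option("tune=") attribute" when it comes from
     attribute(target(...)).  */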
3095
3096 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3097 SUBTARGET_OVERRIDE_OPTIONS;
3098 #endif
3099
3100 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3101 SUBSUBTARGET_OVERRIDE_OPTIONS;
3102 #endif
3103
3104 if (TARGET_X32)
3105 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3106
3107 	  /* PIC is the default for 64-bit Darwin.  */
3108 if (TARGET_MACHO && TARGET_64BIT)
3109 flag_pic = 2;
3110
3111 /* Need to check -mtune=generic first. */
3112 if (ix86_tune_string)
3113 {
3114 if (!strcmp (ix86_tune_string, "generic")
3115 || !strcmp (ix86_tune_string, "i686")
3116 /* As special support for cross compilers we read -mtune=native
3117 as -mtune=generic. With native compilers we won't see the
3118 -mtune=native, as it was changed by the driver. */
3119 || !strcmp (ix86_tune_string, "native"))
3120 {
3121 if (TARGET_64BIT)
3122 ix86_tune_string = "generic64";
3123 else
3124 ix86_tune_string = "generic32";
3125 }
3126 /* If this call is for setting the option attribute, allow the
3127 generic32/generic64 that was previously set. */
3128 else if (!main_args_p
3129 && (!strcmp (ix86_tune_string, "generic32")
3130 || !strcmp (ix86_tune_string, "generic64")))
3131 ;
3132 else if (!strncmp (ix86_tune_string, "generic", 7))
3133 error ("bad value (%s) for %stune=%s %s",
3134 ix86_tune_string, prefix, suffix, sw);
3135 else if (!strcmp (ix86_tune_string, "x86-64"))
3136 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3137 "%stune=k8%s or %stune=generic%s instead as appropriate",
3138 prefix, suffix, prefix, suffix, prefix, suffix);
3139 }
3140 else
3141 {
3142 if (ix86_arch_string)
3143 ix86_tune_string = ix86_arch_string;
3144 if (!ix86_tune_string)
3145 {
3146 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3147 ix86_tune_defaulted = 1;
3148 }
3149
3150 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3151 need to use a sensible tune option. */
3152 if (!strcmp (ix86_tune_string, "generic")
3153 || !strcmp (ix86_tune_string, "x86-64")
3154 || !strcmp (ix86_tune_string, "i686"))
3155 {
3156 if (TARGET_64BIT)
3157 ix86_tune_string = "generic64";
3158 else
3159 ix86_tune_string = "generic32";
3160 }
3161 }
3162
3163 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3164 {
3165 /* rep; movq isn't available in 32-bit code. */
3166 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3167 ix86_stringop_alg = no_stringop;
3168 }
3169
3170 if (!ix86_arch_string)
3171 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3172 else
3173 ix86_arch_specified = 1;
3174
3175 if (global_options_set.x_ix86_pmode)
3176 {
3177 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3178 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3179 error ("address mode %qs not supported in the %s bit mode",
3180 TARGET_64BIT ? "short" : "long",
3181 TARGET_64BIT ? "64" : "32");
3182 }
3183 else
3184 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3185
3186 if (!global_options_set.x_ix86_abi)
3187 ix86_abi = DEFAULT_ABI;
3188
3189 if (global_options_set.x_ix86_cmodel)
3190 {
3191 switch (ix86_cmodel)
3192 {
3193 case CM_SMALL:
3194 case CM_SMALL_PIC:
3195 if (flag_pic)
3196 ix86_cmodel = CM_SMALL_PIC;
3197 if (!TARGET_64BIT)
3198 error ("code model %qs not supported in the %s bit mode",
3199 "small", "32");
3200 break;
3201
3202 case CM_MEDIUM:
3203 case CM_MEDIUM_PIC:
3204 if (flag_pic)
3205 ix86_cmodel = CM_MEDIUM_PIC;
3206 if (!TARGET_64BIT)
3207 error ("code model %qs not supported in the %s bit mode",
3208 "medium", "32");
3209 else if (TARGET_X32)
3210 error ("code model %qs not supported in x32 mode",
3211 "medium");
3212 break;
3213
3214 case CM_LARGE:
3215 case CM_LARGE_PIC:
3216 if (flag_pic)
3217 ix86_cmodel = CM_LARGE_PIC;
3218 if (!TARGET_64BIT)
3219 error ("code model %qs not supported in the %s bit mode",
3220 "large", "32");
3221 else if (TARGET_X32)
3222 error ("code model %qs not supported in x32 mode",
3223 	             "large");
3224 break;
3225
3226 case CM_32:
3227 if (flag_pic)
3228 error ("code model %s does not support PIC mode", "32");
3229 if (TARGET_64BIT)
3230 error ("code model %qs not supported in the %s bit mode",
3231 "32", "64");
3232 break;
3233
3234 case CM_KERNEL:
3235 if (flag_pic)
3236 {
3237 error ("code model %s does not support PIC mode", "kernel");
3238 ix86_cmodel = CM_32;
3239 }
3240 if (!TARGET_64BIT)
3241 error ("code model %qs not supported in the %s bit mode",
3242 "kernel", "32");
3243 break;
3244
3245 default:
3246 gcc_unreachable ();
3247 }
3248 }
3249 else
3250 {
3251 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3252 use of rip-relative addressing. This eliminates fixups that
3253 would otherwise be needed if this object is to be placed in a
3254 DLL, and is essentially just as efficient as direct addressing. */
3255 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3256 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3257 else if (TARGET_64BIT)
3258 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3259 else
3260 ix86_cmodel = CM_32;
3261 }
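  /* Illustrative defaults from the block above: a plain 64-bit compile with
     -fpic ends up with CM_SMALL_PIC, and any 32-bit compile gets CM_32.  */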
3262 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3263 {
3264 error ("-masm=intel not supported in this configuration");
3265 ix86_asm_dialect = ASM_ATT;
3266 }
3267 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3268 sorry ("%i-bit mode not compiled in",
3269 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3270
3271 for (i = 0; i < pta_size; i++)
3272 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3273 {
3274 ix86_schedule = processor_alias_table[i].schedule;
3275 ix86_arch = processor_alias_table[i].processor;
3276 /* Default cpu tuning to the architecture. */
3277 ix86_tune = ix86_arch;
3278
3279 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3280 error ("CPU you selected does not support x86-64 "
3281 "instruction set");
3282
3283 if (processor_alias_table[i].flags & PTA_MMX
3284 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3285 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3286 if (processor_alias_table[i].flags & PTA_3DNOW
3287 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3288 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3289 if (processor_alias_table[i].flags & PTA_3DNOW_A
3290 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3291 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3292 if (processor_alias_table[i].flags & PTA_SSE
3293 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3294 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3295 if (processor_alias_table[i].flags & PTA_SSE2
3296 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3297 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3298 if (processor_alias_table[i].flags & PTA_SSE3
3299 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3300 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3301 if (processor_alias_table[i].flags & PTA_SSSE3
3302 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3303 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3304 if (processor_alias_table[i].flags & PTA_SSE4_1
3305 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3306 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3307 if (processor_alias_table[i].flags & PTA_SSE4_2
3308 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3309 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3310 if (processor_alias_table[i].flags & PTA_AVX
3311 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3312 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3313 if (processor_alias_table[i].flags & PTA_AVX2
3314 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3315 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3316 if (processor_alias_table[i].flags & PTA_FMA
3317 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3318 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3319 if (processor_alias_table[i].flags & PTA_SSE4A
3320 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3321 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3322 if (processor_alias_table[i].flags & PTA_FMA4
3323 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3324 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3325 if (processor_alias_table[i].flags & PTA_XOP
3326 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3327 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3328 if (processor_alias_table[i].flags & PTA_LWP
3329 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3330 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3331 if (processor_alias_table[i].flags & PTA_ABM
3332 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3333 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3334 if (processor_alias_table[i].flags & PTA_BMI
3335 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3336 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3337 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3338 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3339 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3340 if (processor_alias_table[i].flags & PTA_TBM
3341 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3342 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3343 if (processor_alias_table[i].flags & PTA_BMI2
3344 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3345 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3346 if (processor_alias_table[i].flags & PTA_CX16
3347 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3348 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3349 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3350 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3351 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3352 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3353 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3354 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3355 if (processor_alias_table[i].flags & PTA_MOVBE
3356 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3357 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3358 if (processor_alias_table[i].flags & PTA_AES
3359 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3360 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3361 if (processor_alias_table[i].flags & PTA_PCLMUL
3362 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3363 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3364 if (processor_alias_table[i].flags & PTA_FSGSBASE
3365 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3366 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3367 if (processor_alias_table[i].flags & PTA_RDRND
3368 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3369 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3370 if (processor_alias_table[i].flags & PTA_F16C
3371 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3372 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3373 if (processor_alias_table[i].flags & PTA_RTM
3374 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3375 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3376 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3377 x86_prefetch_sse = true;
3378
3379 break;
3380 }
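  /* For example, "-march=bdver1" enables everything from MMX and SSE up to
     AVX, FMA4, XOP and LWP via its PTA_ flags in the alias table above,
     unless the user explicitly disabled an ISA on the command line.  */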
3381
3382 if (!strcmp (ix86_arch_string, "generic"))
3383 error ("generic CPU can be used only for %stune=%s %s",
3384 prefix, suffix, sw);
3385 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3386 error ("bad value (%s) for %sarch=%s %s",
3387 ix86_arch_string, prefix, suffix, sw);
3388
3389 ix86_arch_mask = 1u << ix86_arch;
3390 for (i = 0; i < X86_ARCH_LAST; ++i)
3391 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3392
3393 for (i = 0; i < pta_size; i++)
3394 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3395 {
3396 ix86_schedule = processor_alias_table[i].schedule;
3397 ix86_tune = processor_alias_table[i].processor;
3398 if (TARGET_64BIT)
3399 {
3400 if (!(processor_alias_table[i].flags & PTA_64BIT))
3401 {
3402 if (ix86_tune_defaulted)
3403 {
3404 ix86_tune_string = "x86-64";
3405 for (i = 0; i < pta_size; i++)
3406 if (! strcmp (ix86_tune_string,
3407 processor_alias_table[i].name))
3408 break;
3409 ix86_schedule = processor_alias_table[i].schedule;
3410 ix86_tune = processor_alias_table[i].processor;
3411 }
3412 else
3413 error ("CPU you selected does not support x86-64 "
3414 "instruction set");
3415 }
3416 }
3417 else
3418 {
3419 /* Adjust tuning when compiling for 32-bit ABI. */
3420 switch (ix86_tune)
3421 {
3422 case PROCESSOR_GENERIC64:
3423 ix86_tune = PROCESSOR_GENERIC32;
3424 ix86_schedule = CPU_PENTIUMPRO;
3425 break;
3426
3427 case PROCESSOR_CORE2_64:
3428 ix86_tune = PROCESSOR_CORE2_32;
3429 break;
3430
3431 case PROCESSOR_COREI7_64:
3432 ix86_tune = PROCESSOR_COREI7_32;
3433 break;
3434
3435 default:
3436 break;
3437 }
3438 }
3439 /* Intel CPUs have always interpreted SSE prefetch instructions as
3440 NOPs; so, we can enable SSE prefetch instructions even when
3441 -mtune (rather than -march) points us to a processor that has them.
3442 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3443 higher processors. */
3444 if (TARGET_CMOVE
3445 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3446 x86_prefetch_sse = true;
3447 break;
3448 }
3449
3450 if (ix86_tune_specified && i == pta_size)
3451 error ("bad value (%s) for %stune=%s %s",
3452 ix86_tune_string, prefix, suffix, sw);
3453
3454 ix86_tune_mask = 1u << ix86_tune;
3455 for (i = 0; i < X86_TUNE_LAST; ++i)
3456 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3457
3458 #ifndef USE_IX86_FRAME_POINTER
3459 #define USE_IX86_FRAME_POINTER 0
3460 #endif
3461
3462 #ifndef USE_X86_64_FRAME_POINTER
3463 #define USE_X86_64_FRAME_POINTER 0
3464 #endif
3465
3466 /* Set the default values for switches whose default depends on TARGET_64BIT
3467 in case they weren't overwritten by command line options. */
3468 if (TARGET_64BIT)
3469 {
3470 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3471 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3472 if (flag_asynchronous_unwind_tables == 2)
3473 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3474 if (flag_pcc_struct_return == 2)
3475 flag_pcc_struct_return = 0;
3476 }
3477 else
3478 {
3479 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3480 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3481 if (flag_asynchronous_unwind_tables == 2)
3482 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3483 if (flag_pcc_struct_return == 2)
3484 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3485 }
3486
3487 if (optimize_size)
3488 ix86_cost = &ix86_size_cost;
3489 else
3490 ix86_cost = processor_target_table[ix86_tune].cost;
3491
3492 /* Arrange to set up i386_stack_locals for all functions. */
3493 init_machine_status = ix86_init_machine_status;
3494
3495 /* Validate -mregparm= value. */
3496 if (global_options_set.x_ix86_regparm)
3497 {
3498 if (TARGET_64BIT)
3499 warning (0, "-mregparm is ignored in 64-bit mode");
3500 if (ix86_regparm > REGPARM_MAX)
3501 {
3502 error ("-mregparm=%d is not between 0 and %d",
3503 ix86_regparm, REGPARM_MAX);
3504 ix86_regparm = 0;
3505 }
3506 }
3507 if (TARGET_64BIT)
3508 ix86_regparm = REGPARM_MAX;
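  /* Example: "-mregparm=3" makes 32-bit code pass the first three integral
     arguments in registers (conventionally %eax, %edx and %ecx); larger
     values are rejected above, and 64-bit code always uses REGPARM_MAX.  */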
3509
3510 /* Default align_* from the processor table. */
3511 if (align_loops == 0)
3512 {
3513 align_loops = processor_target_table[ix86_tune].align_loop;
3514 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3515 }
3516 if (align_jumps == 0)
3517 {
3518 align_jumps = processor_target_table[ix86_tune].align_jump;
3519 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3520 }
3521 if (align_functions == 0)
3522 {
3523 align_functions = processor_target_table[ix86_tune].align_func;
3524 }
3525
3526 /* Provide default for -mbranch-cost= value. */
3527 if (!global_options_set.x_ix86_branch_cost)
3528 ix86_branch_cost = ix86_cost->branch_cost;
3529
3530 if (TARGET_64BIT)
3531 {
3532 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3533
3534 /* Enable by default the SSE and MMX builtins. Do allow the user to
3535 explicitly disable any of these. In particular, disabling SSE and
3536 MMX for kernel code is extremely useful. */
3537 if (!ix86_arch_specified)
3538 ix86_isa_flags
3539 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3540 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3541
3542 if (TARGET_RTD)
3543 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3544 }
3545 else
3546 {
3547 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3548
3549 if (!ix86_arch_specified)
3550 ix86_isa_flags
3551 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3552
3553 	      /* The i386 ABI does not specify a red zone.  It still makes sense to use it
3554 	         when the programmer takes care to keep the stack from being destroyed.  */
3555 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3556 target_flags |= MASK_NO_RED_ZONE;
3557 }
3558
3559 /* Keep nonleaf frame pointers. */
3560 if (flag_omit_frame_pointer)
3561 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3562 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3563 flag_omit_frame_pointer = 1;
3564
3565 /* If we're doing fast math, we don't care about comparison order
3566 wrt NaNs. This lets us use a shorter comparison sequence. */
3567 if (flag_finite_math_only)
3568 target_flags &= ~MASK_IEEE_FP;
3569
3570 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3571 since the insns won't need emulation. */
3572 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3573 target_flags &= ~MASK_NO_FANCY_MATH_387;
3574
3575 /* Likewise, if the target doesn't have a 387, or we've specified
3576 software floating point, don't use 387 inline intrinsics. */
3577 if (!TARGET_80387)
3578 target_flags |= MASK_NO_FANCY_MATH_387;
3579
3580 /* Turn on MMX builtins for -msse. */
3581 if (TARGET_SSE)
3582 {
3583 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3584 x86_prefetch_sse = true;
3585 }
3586
3587 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3588 if (TARGET_SSE4_2 || TARGET_ABM)
3589 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3590
3591 /* Turn on lzcnt instruction for -mabm. */
3592 if (TARGET_ABM)
3593 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3594
3595 /* Validate -mpreferred-stack-boundary= value or default it to
3596 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3597 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3598 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3599 {
3600 int min = (TARGET_64BIT ? 4 : 2);
3601 int max = (TARGET_SEH ? 4 : 12);
3602
3603 if (ix86_preferred_stack_boundary_arg < min
3604 || ix86_preferred_stack_boundary_arg > max)
3605 {
3606 if (min == max)
3607 error ("-mpreferred-stack-boundary is not supported "
3608 "for this target");
3609 else
3610 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3611 ix86_preferred_stack_boundary_arg, min, max);
3612 }
3613 else
3614 ix86_preferred_stack_boundary
3615 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3616 }
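  /* Worked example: "-mpreferred-stack-boundary=4" yields
     (1 << 4) * BITS_PER_UNIT = 128 bits, i.e. a 16-byte boundary, which is
     also the minimum the 64-bit ABI allows here (min == 4).  */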
3617
3618 /* Set the default value for -mstackrealign. */
3619 if (ix86_force_align_arg_pointer == -1)
3620 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3621
3622 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3623
3624 /* Validate -mincoming-stack-boundary= value or default it to
3625 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3626 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3627 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3628 {
3629 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3630 || ix86_incoming_stack_boundary_arg > 12)
3631 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3632 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3633 else
3634 {
3635 ix86_user_incoming_stack_boundary
3636 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3637 ix86_incoming_stack_boundary
3638 = ix86_user_incoming_stack_boundary;
3639 }
3640 }
3641
3642 /* Accept -msseregparm only if at least SSE support is enabled. */
3643 if (TARGET_SSEREGPARM
3644 && ! TARGET_SSE)
3645 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3646
3647 if (global_options_set.x_ix86_fpmath)
3648 {
3649 if (ix86_fpmath & FPMATH_SSE)
3650 {
3651 if (!TARGET_SSE)
3652 {
3653 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3654 ix86_fpmath = FPMATH_387;
3655 }
3656 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3657 {
3658 warning (0, "387 instruction set disabled, using SSE arithmetics");
3659 ix86_fpmath = FPMATH_SSE;
3660 }
3661 }
3662 }
3663 else
3664 ix86_fpmath = TARGET_FPMATH_DEFAULT;
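  /* Example of the fallback above: "-mfpmath=sse" together with "-mno-sse"
     triggers the warning and reverts to FPMATH_387; likewise "-mfpmath=both"
     without an 80387 falls back to FPMATH_SSE.  */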
3665
3666 /* If the i387 is disabled, then do not return values in it. */
3667 if (!TARGET_80387)
3668 target_flags &= ~MASK_FLOAT_RETURNS;
3669
3670 /* Use external vectorized library in vectorizing intrinsics. */
3671 if (global_options_set.x_ix86_veclibabi_type)
3672 switch (ix86_veclibabi_type)
3673 {
3674 case ix86_veclibabi_type_svml:
3675 ix86_veclib_handler = ix86_veclibabi_svml;
3676 break;
3677
3678 case ix86_veclibabi_type_acml:
3679 ix86_veclib_handler = ix86_veclibabi_acml;
3680 break;
3681
3682 default:
3683 gcc_unreachable ();
3684 }
3685
3686 if ((!USE_IX86_FRAME_POINTER
3687 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3688 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3689 && !optimize_size)
3690 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3691
3692 /* ??? Unwind info is not correct around the CFG unless either a frame
3693 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3694 unwind info generation to be aware of the CFG and propagating states
3695 around edges. */
3696 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3697 || flag_exceptions || flag_non_call_exceptions)
3698 && flag_omit_frame_pointer
3699 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3700 {
3701 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3702 warning (0, "unwind tables currently require either a frame pointer "
3703 "or %saccumulate-outgoing-args%s for correctness",
3704 prefix, suffix);
3705 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3706 }
3707
3708 /* If stack probes are required, the space used for large function
3709 arguments on the stack must also be probed, so enable
3710 -maccumulate-outgoing-args so this happens in the prologue. */
3711 if (TARGET_STACK_PROBE
3712 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3713 {
3714 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3715 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3716 "for correctness", prefix, suffix);
3717 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3718 }
3719
3720 	  /* For sane SSE instruction set generation we need the fcomi instruction.
3721 	     It is safe to enable all CMOVE instructions.  Also, the RDRAND intrinsic
3722 	     expands to a sequence that includes a conditional move.  */
3723 if (TARGET_SSE || TARGET_RDRND)
3724 TARGET_CMOVE = 1;
3725
3726 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3727 {
3728 char *p;
3729 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3730 p = strchr (internal_label_prefix, 'X');
3731 internal_label_prefix_len = p - internal_label_prefix;
3732 *p = '\0';
3733 }
3734
3735 	  /* When the scheduling description is not available, disable the scheduler
3736 	     pass so it won't slow down compilation and make x87 code slower.  */
3737 if (!TARGET_SCHEDULE)
3738 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3739
3740 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3741 ix86_cost->simultaneous_prefetches,
3742 global_options.x_param_values,
3743 global_options_set.x_param_values);
3744 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3745 global_options.x_param_values,
3746 global_options_set.x_param_values);
3747 maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3748 global_options.x_param_values,
3749 global_options_set.x_param_values);
3750 maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3751 global_options.x_param_values,
3752 global_options_set.x_param_values);
3753
3754 	  /* Enable software prefetching at -O3 for CPUs where prefetching is helpful.  */
3755 if (flag_prefetch_loop_arrays < 0
3756 && HAVE_prefetch
3757 && optimize >= 3
3758 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3759 flag_prefetch_loop_arrays = 1;
3760
3761 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3762 can be optimized to ap = __builtin_next_arg (0). */
3763 if (!TARGET_64BIT && !flag_split_stack)
3764 targetm.expand_builtin_va_start = NULL;
3765
3766 if (TARGET_64BIT)
3767 {
3768 ix86_gen_leave = gen_leave_rex64;
3769 if (Pmode == DImode)
3770 {
3771 ix86_gen_monitor = gen_sse3_monitor64_di;
3772 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3773 ix86_gen_tls_local_dynamic_base_64
3774 = gen_tls_local_dynamic_base_64_di;
3775 }
3776 else
3777 {
3778 ix86_gen_monitor = gen_sse3_monitor64_si;
3779 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3780 ix86_gen_tls_local_dynamic_base_64
3781 = gen_tls_local_dynamic_base_64_si;
3782 }
3783 }
3784 else
3785 {
3786 ix86_gen_leave = gen_leave;
3787 ix86_gen_monitor = gen_sse3_monitor;
3788 }
3789
3790 if (Pmode == DImode)
3791 {
3792 ix86_gen_add3 = gen_adddi3;
3793 ix86_gen_sub3 = gen_subdi3;
3794 ix86_gen_sub3_carry = gen_subdi3_carry;
3795 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3796 ix86_gen_andsp = gen_anddi3;
3797 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3798 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3799 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3800 }
3801 else
3802 {
3803 ix86_gen_add3 = gen_addsi3;
3804 ix86_gen_sub3 = gen_subsi3;
3805 ix86_gen_sub3_carry = gen_subsi3_carry;
3806 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3807 ix86_gen_andsp = gen_andsi3;
3808 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3809 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3810 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3811 }
3812
3813 #ifdef USE_IX86_CLD
3814 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3815 if (!TARGET_64BIT)
3816 target_flags |= MASK_CLD & ~target_flags_explicit;
3817 #endif
3818
3819 if (!TARGET_64BIT && flag_pic)
3820 {
3821 if (flag_fentry > 0)
3822 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3823 "with -fpic");
3824 flag_fentry = 0;
3825 }
3826 else if (TARGET_SEH)
3827 {
3828 if (flag_fentry == 0)
3829 sorry ("-mno-fentry isn%'t compatible with SEH");
3830 flag_fentry = 1;
3831 }
3832 else if (flag_fentry < 0)
3833 {
3834 #if defined(PROFILE_BEFORE_PROLOGUE)
3835 flag_fentry = 1;
3836 #else
3837 flag_fentry = 0;
3838 #endif
3839 }
3840
3841 if (TARGET_AVX)
3842 {
3843 	      /* When not optimizing for size, enable the vzeroupper optimization for
3844 	         TARGET_AVX with -fexpensive-optimizations and split 32-byte
3845 	         AVX unaligned loads/stores.  */
3846 if (!optimize_size)
3847 {
3848 if (flag_expensive_optimizations
3849 && !(target_flags_explicit & MASK_VZEROUPPER))
3850 target_flags |= MASK_VZEROUPPER;
3851 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3852 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3853 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3854 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3855 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3856 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3857 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3858 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3859 target_flags |= MASK_PREFER_AVX128;
3860 }
3861 }
3862 else
3863 {
3864 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3865 target_flags &= ~MASK_VZEROUPPER;
3866 }
3867
3868 if (ix86_recip_name)
3869 {
3870 char *p = ASTRDUP (ix86_recip_name);
3871 char *q;
3872 unsigned int mask, i;
3873 bool invert;
3874
3875 while ((q = strtok (p, ",")) != NULL)
3876 {
3877 p = NULL;
3878 if (*q == '!')
3879 {
3880 invert = true;
3881 q++;
3882 }
3883 else
3884 invert = false;
3885
3886 if (!strcmp (q, "default"))
3887 mask = RECIP_MASK_ALL;
3888 else
3889 {
3890 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3891 if (!strcmp (q, recip_options[i].string))
3892 {
3893 mask = recip_options[i].mask;
3894 break;
3895 }
3896
3897 if (i == ARRAY_SIZE (recip_options))
3898 {
3899 error ("unknown option for -mrecip=%s", q);
3900 invert = false;
3901 mask = RECIP_MASK_NONE;
3902 }
3903 }
3904
3905 recip_mask_explicit |= mask;
3906 if (invert)
3907 recip_mask &= ~mask;
3908 else
3909 recip_mask |= mask;
3910 }
3911 }
3912
3913 if (TARGET_RECIP)
3914 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3915 else if (target_flags_explicit & MASK_RECIP)
3916 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3917
3918 	  /* Save the initial options in case the user uses function-specific
3919 	     options.  */
3920 if (main_args_p)
3921 target_option_default_node = target_option_current_node
3922 = build_target_option_node ();
3923 }
3924
3925 /* Return TRUE if VAL is passed in register with 256bit AVX modes. */
3926
3927 static bool
3928 function_pass_avx256_p (const_rtx val)
3929 {
3930 if (!val)
3931 return false;
3932
3933 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3934 return true;
3935
3936 if (GET_CODE (val) == PARALLEL)
3937 {
3938 int i;
3939 rtx r;
3940
3941 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3942 {
3943 r = XVECEXP (val, 0, i);
3944 if (GET_CODE (r) == EXPR_LIST
3945 && XEXP (r, 0)
3946 && REG_P (XEXP (r, 0))
3947 && (GET_MODE (XEXP (r, 0)) == OImode
3948 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
3949 return true;
3950 }
3951 }
3952
3953 return false;
3954 }
3955
3956 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3957
3958 static void
3959 ix86_option_override (void)
3960 {
3961 ix86_option_override_internal (true);
3962 }
3963
3964 /* Update register usage after having seen the compiler flags. */
3965
3966 static void
3967 ix86_conditional_register_usage (void)
3968 {
3969 int i;
3970 unsigned int j;
3971
3972 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3973 {
3974 if (fixed_regs[i] > 1)
3975 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
3976 if (call_used_regs[i] > 1)
3977 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
3978 }
3979
3980 /* The PIC register, if it exists, is fixed. */
3981 j = PIC_OFFSET_TABLE_REGNUM;
3982 if (j != INVALID_REGNUM)
3983 fixed_regs[j] = call_used_regs[j] = 1;
3984
3985 /* The 64-bit MS_ABI changes the set of call-used registers. */
3986 if (TARGET_64BIT_MS_ABI)
3987 {
3988 call_used_regs[SI_REG] = 0;
3989 call_used_regs[DI_REG] = 0;
3990 call_used_regs[XMM6_REG] = 0;
3991 call_used_regs[XMM7_REG] = 0;
3992 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3993 call_used_regs[i] = 0;
3994 }
3995
3996 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
3997 other call-clobbered regs for 64-bit. */
3998 if (TARGET_64BIT)
3999 {
4000 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4001
4002 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4003 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4004 && call_used_regs[i])
4005 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4006 }
4007
4008 /* If MMX is disabled, squash the registers. */
4009 if (! TARGET_MMX)
4010 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4011 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4012 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4013
4014 /* If SSE is disabled, squash the registers. */
4015 if (! TARGET_SSE)
4016 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4017 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4018 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4019
4020 /* If the FPU is disabled, squash the registers. */
4021 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4022 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4023 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4024 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4025
4026 /* If 32-bit, squash the 64-bit registers. */
4027 if (! TARGET_64BIT)
4028 {
4029 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4030 reg_names[i] = "";
4031 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4032 reg_names[i] = "";
4033 }
4034 }
4035
4036 \f
4037 /* Save the current options */
4038
4039 static void
4040 ix86_function_specific_save (struct cl_target_option *ptr)
4041 {
4042 ptr->arch = ix86_arch;
4043 ptr->schedule = ix86_schedule;
4044 ptr->tune = ix86_tune;
4045 ptr->branch_cost = ix86_branch_cost;
4046 ptr->tune_defaulted = ix86_tune_defaulted;
4047 ptr->arch_specified = ix86_arch_specified;
4048 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4049 ptr->ix86_target_flags_explicit = target_flags_explicit;
4050 ptr->x_recip_mask_explicit = recip_mask_explicit;
4051
4052 /* The fields are char but the variables are not; make sure the
4053 values fit in the fields. */
4054 gcc_assert (ptr->arch == ix86_arch);
4055 gcc_assert (ptr->schedule == ix86_schedule);
4056 gcc_assert (ptr->tune == ix86_tune);
4057 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4058 }
4059
4060 /* Restore the current options */
4061
4062 static void
4063 ix86_function_specific_restore (struct cl_target_option *ptr)
4064 {
4065 enum processor_type old_tune = ix86_tune;
4066 enum processor_type old_arch = ix86_arch;
4067 unsigned int ix86_arch_mask, ix86_tune_mask;
4068 int i;
4069
4070 ix86_arch = (enum processor_type) ptr->arch;
4071 ix86_schedule = (enum attr_cpu) ptr->schedule;
4072 ix86_tune = (enum processor_type) ptr->tune;
4073 ix86_branch_cost = ptr->branch_cost;
4074 ix86_tune_defaulted = ptr->tune_defaulted;
4075 ix86_arch_specified = ptr->arch_specified;
4076 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4077 target_flags_explicit = ptr->ix86_target_flags_explicit;
4078 recip_mask_explicit = ptr->x_recip_mask_explicit;
4079
4080 /* Recreate the arch feature tests if the arch changed */
4081 if (old_arch != ix86_arch)
4082 {
4083 ix86_arch_mask = 1u << ix86_arch;
4084 for (i = 0; i < X86_ARCH_LAST; ++i)
4085 ix86_arch_features[i]
4086 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4087 }
4088
4089 /* Recreate the tune optimization tests */
4090 if (old_tune != ix86_tune)
4091 {
4092 ix86_tune_mask = 1u << ix86_tune;
4093 for (i = 0; i < X86_TUNE_LAST; ++i)
4094 ix86_tune_features[i]
4095 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4096 }
4097 }
4098
4099 /* Print the current options */
4100
4101 static void
4102 ix86_function_specific_print (FILE *file, int indent,
4103 struct cl_target_option *ptr)
4104 {
4105 char *target_string
4106 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4107 NULL, NULL, ptr->x_ix86_fpmath, false);
4108
4109 fprintf (file, "%*sarch = %d (%s)\n",
4110 indent, "",
4111 ptr->arch,
4112 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4113 ? cpu_names[ptr->arch]
4114 : "<unknown>"));
4115
4116 fprintf (file, "%*stune = %d (%s)\n",
4117 indent, "",
4118 ptr->tune,
4119 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4120 ? cpu_names[ptr->tune]
4121 : "<unknown>"));
4122
4123 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4124
4125 if (target_string)
4126 {
4127 fprintf (file, "%*s%s\n", indent, "", target_string);
4128 free (target_string);
4129 }
4130 }
4131
4132 \f
4133 /* Inner function to process the attribute((target(...))), take an argument and
4134 set the current options from the argument. If we have a list, recursively go
4135 over the list. */
4136
4137 static bool
4138 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4139 struct gcc_options *enum_opts_set)
4140 {
4141 char *next_optstr;
4142 bool ret = true;
4143
4144 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4145 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4146 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4147 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4148 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4149
4150 enum ix86_opt_type
4151 {
4152 ix86_opt_unknown,
4153 ix86_opt_yes,
4154 ix86_opt_no,
4155 ix86_opt_str,
4156 ix86_opt_enum,
4157 ix86_opt_isa
4158 };
4159
4160 static const struct
4161 {
4162 const char *string;
4163 size_t len;
4164 enum ix86_opt_type type;
4165 int opt;
4166 int mask;
4167 } attrs[] = {
4168 /* isa options */
4169 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4170 IX86_ATTR_ISA ("abm", OPT_mabm),
4171 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4172 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4173 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4174 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4175 IX86_ATTR_ISA ("aes", OPT_maes),
4176 IX86_ATTR_ISA ("avx", OPT_mavx),
4177 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4178 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4179 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4180 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4181 IX86_ATTR_ISA ("sse", OPT_msse),
4182 IX86_ATTR_ISA ("sse2", OPT_msse2),
4183 IX86_ATTR_ISA ("sse3", OPT_msse3),
4184 IX86_ATTR_ISA ("sse4", OPT_msse4),
4185 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4186 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4187 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4188 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4189 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4190 IX86_ATTR_ISA ("fma", OPT_mfma),
4191 IX86_ATTR_ISA ("xop", OPT_mxop),
4192 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4193 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4194 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4195 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4196 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4197
4198 /* enum options */
4199 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4200
4201 /* string options */
4202 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4203 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4204
4205 /* flag options */
4206 IX86_ATTR_YES ("cld",
4207 OPT_mcld,
4208 MASK_CLD),
4209
4210 IX86_ATTR_NO ("fancy-math-387",
4211 OPT_mfancy_math_387,
4212 MASK_NO_FANCY_MATH_387),
4213
4214 IX86_ATTR_YES ("ieee-fp",
4215 OPT_mieee_fp,
4216 MASK_IEEE_FP),
4217
4218 IX86_ATTR_YES ("inline-all-stringops",
4219 OPT_minline_all_stringops,
4220 MASK_INLINE_ALL_STRINGOPS),
4221
4222 IX86_ATTR_YES ("inline-stringops-dynamically",
4223 OPT_minline_stringops_dynamically,
4224 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4225
4226 IX86_ATTR_NO ("align-stringops",
4227 OPT_mno_align_stringops,
4228 MASK_NO_ALIGN_STRINGOPS),
4229
4230 IX86_ATTR_YES ("recip",
4231 OPT_mrecip,
4232 MASK_RECIP),
4233
4234 };
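  /* Hypothetical example of the table in use:
     __attribute__((target("sse4.2,arch=k8,no-fancy-math-387"))) turns on the
     SSE4.2 ISA flag, records "k8" as the function-specific arch= string and
     sets MASK_NO_FANCY_MATH_387 through the "no-" prefix handling below.  */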
4235
4236 /* If this is a list, recurse to get the options. */
4237 if (TREE_CODE (args) == TREE_LIST)
4238 {
4239 bool ret = true;
4240
4241 for (; args; args = TREE_CHAIN (args))
4242 if (TREE_VALUE (args)
4243 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4244 p_strings, enum_opts_set))
4245 ret = false;
4246
4247 return ret;
4248 }
4249
4250 else if (TREE_CODE (args) != STRING_CST)
4251 gcc_unreachable ();
4252
4253 /* Handle multiple arguments separated by commas. */
4254 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4255
4256 while (next_optstr && *next_optstr != '\0')
4257 {
4258 char *p = next_optstr;
4259 char *orig_p = p;
4260 char *comma = strchr (next_optstr, ',');
4261 const char *opt_string;
4262 size_t len, opt_len;
4263 int opt;
4264 bool opt_set_p;
4265 char ch;
4266 unsigned i;
4267 enum ix86_opt_type type = ix86_opt_unknown;
4268 int mask = 0;
4269
4270 if (comma)
4271 {
4272 *comma = '\0';
4273 len = comma - next_optstr;
4274 next_optstr = comma + 1;
4275 }
4276 else
4277 {
4278 len = strlen (p);
4279 next_optstr = NULL;
4280 }
4281
4282 /* Recognize no-xxx. */
4283 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4284 {
4285 opt_set_p = false;
4286 p += 3;
4287 len -= 3;
4288 }
4289 else
4290 opt_set_p = true;
4291
4292 /* Find the option. */
4293 ch = *p;
4294 opt = N_OPTS;
4295 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4296 {
4297 type = attrs[i].type;
4298 opt_len = attrs[i].len;
4299 if (ch == attrs[i].string[0]
4300 && ((type != ix86_opt_str && type != ix86_opt_enum)
4301 ? len == opt_len
4302 : len > opt_len)
4303 && memcmp (p, attrs[i].string, opt_len) == 0)
4304 {
4305 opt = attrs[i].opt;
4306 mask = attrs[i].mask;
4307 opt_string = attrs[i].string;
4308 break;
4309 }
4310 }
4311
4312 /* Process the option. */
4313 if (opt == N_OPTS)
4314 {
4315 error ("attribute(target(\"%s\")) is unknown", orig_p);
4316 ret = false;
4317 }
4318
4319 else if (type == ix86_opt_isa)
4320 {
4321 struct cl_decoded_option decoded;
4322
4323 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4324 ix86_handle_option (&global_options, &global_options_set,
4325 &decoded, input_location);
4326 }
4327
4328 else if (type == ix86_opt_yes || type == ix86_opt_no)
4329 {
4330 if (type == ix86_opt_no)
4331 opt_set_p = !opt_set_p;
4332
4333 if (opt_set_p)
4334 target_flags |= mask;
4335 else
4336 target_flags &= ~mask;
4337 }
4338
4339 else if (type == ix86_opt_str)
4340 {
4341 if (p_strings[opt])
4342 {
4343 error ("option(\"%s\") was already specified", opt_string);
4344 ret = false;
4345 }
4346 else
4347 p_strings[opt] = xstrdup (p + opt_len);
4348 }
4349
4350 else if (type == ix86_opt_enum)
4351 {
4352 bool arg_ok;
4353 int value;
4354
4355 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4356 if (arg_ok)
4357 set_option (&global_options, enum_opts_set, opt, value,
4358 p + opt_len, DK_UNSPECIFIED, input_location,
4359 global_dc);
4360 else
4361 {
4362 error ("attribute(target(\"%s\")) is unknown", orig_p);
4363 ret = false;
4364 }
4365 }
4366
4367 else
4368 gcc_unreachable ();
4369 }
4370
4371 return ret;
4372 }
4373
4374 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4375
4376 tree
4377 ix86_valid_target_attribute_tree (tree args)
4378 {
4379 const char *orig_arch_string = ix86_arch_string;
4380 const char *orig_tune_string = ix86_tune_string;
4381 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4382 int orig_tune_defaulted = ix86_tune_defaulted;
4383 int orig_arch_specified = ix86_arch_specified;
4384 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4385 tree t = NULL_TREE;
4386 int i;
4387 struct cl_target_option *def
4388 = TREE_TARGET_OPTION (target_option_default_node);
4389 struct gcc_options enum_opts_set;
4390
4391 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4392
4393 /* Process each of the options on the chain. */
4394 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4395 &enum_opts_set))
4396 return NULL_TREE;
4397
4398 /* If the changed options are different from the default, rerun
4399 ix86_option_override_internal, and then save the options away.
4400 	     The string options are attribute options, and will be undone
4401 when we copy the save structure. */
4402 if (ix86_isa_flags != def->x_ix86_isa_flags
4403 || target_flags != def->x_target_flags
4404 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4405 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4406 || enum_opts_set.x_ix86_fpmath)
4407 {
4408 /* If we are using the default tune= or arch=, undo the string assigned,
4409 and use the default. */
4410 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4411 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4412 else if (!orig_arch_specified)
4413 ix86_arch_string = NULL;
4414
4415 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4416 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4417 else if (orig_tune_defaulted)
4418 ix86_tune_string = NULL;
4419
4420 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4421 if (enum_opts_set.x_ix86_fpmath)
4422 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4423 else if (!TARGET_64BIT && TARGET_SSE)
4424 {
4425 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4426 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4427 }
4428
4429 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4430 ix86_option_override_internal (false);
4431
4432 /* Add any builtin functions with the new isa if any. */
4433 ix86_add_new_builtins (ix86_isa_flags);
4434
4435 /* Save the current options unless we are validating options for
4436 #pragma. */
4437 t = build_target_option_node ();
4438
4439 ix86_arch_string = orig_arch_string;
4440 ix86_tune_string = orig_tune_string;
4441 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4442
4443 /* Free up memory allocated to hold the strings */
4444 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4445 free (option_strings[i]);
4446 }
4447
4448 return t;
4449 }
4450
4451 /* Hook to validate attribute((target("string"))). */
4452
4453 static bool
4454 ix86_valid_target_attribute_p (tree fndecl,
4455 tree ARG_UNUSED (name),
4456 tree args,
4457 int ARG_UNUSED (flags))
4458 {
4459 struct cl_target_option cur_target;
4460 bool ret = true;
4461 tree old_optimize = build_optimization_node ();
4462 tree new_target, new_optimize;
4463 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4464
4465 /* If the function changed the optimization levels as well as setting target
4466 options, start with the optimizations specified. */
4467 if (func_optimize && func_optimize != old_optimize)
4468 cl_optimization_restore (&global_options,
4469 TREE_OPTIMIZATION (func_optimize));
4470
4471 /* The target attributes may also change some optimization flags, so update
4472 the optimization options if necessary. */
4473 cl_target_option_save (&cur_target, &global_options);
4474 new_target = ix86_valid_target_attribute_tree (args);
4475 new_optimize = build_optimization_node ();
4476
4477 if (!new_target)
4478 ret = false;
4479
4480 else if (fndecl)
4481 {
4482 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4483
4484 if (old_optimize != new_optimize)
4485 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4486 }
4487
4488 cl_target_option_restore (&global_options, &cur_target);
4489
4490 if (old_optimize != new_optimize)
4491 cl_optimization_restore (&global_options,
4492 TREE_OPTIMIZATION (old_optimize));
4493
4494 return ret;
4495 }
4496
4497 \f
4498 /* Hook to determine if one function can safely inline another. */
4499
4500 static bool
4501 ix86_can_inline_p (tree caller, tree callee)
4502 {
4503 bool ret = false;
4504 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4505 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4506
4507 /* If callee has no option attributes, then it is ok to inline. */
4508 if (!callee_tree)
4509 ret = true;
4510
4511 /* If caller has no option attributes, but callee does then it is not ok to
4512 inline. */
4513 else if (!caller_tree)
4514 ret = false;
4515
4516 else
4517 {
4518 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4519 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4520
4521 	      /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
4522 	         function can inline an SSE2 function but an SSE2 function can't inline
4523 	         an SSE4 function.  */
4524 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4525 != callee_opts->x_ix86_isa_flags)
4526 ret = false;
4527
4528 /* See if we have the same non-isa options. */
4529 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4530 ret = false;
4531
4532 /* See if arch, tune, etc. are the same. */
4533 else if (caller_opts->arch != callee_opts->arch)
4534 ret = false;
4535
4536 else if (caller_opts->tune != callee_opts->tune)
4537 ret = false;
4538
4539 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4540 ret = false;
4541
4542 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4543 ret = false;
4544
4545 else
4546 ret = true;
4547 }
4548
4549 return ret;
4550 }
4551
4552 \f
4553 /* Remember the last target of ix86_set_current_function. */
4554 static GTY(()) tree ix86_previous_fndecl;
4555
4556 /* Establish appropriate back-end context for processing the function
4557 FNDECL. The argument might be NULL to indicate processing at top
4558 level, outside of any function scope. */
4559 static void
4560 ix86_set_current_function (tree fndecl)
4561 {
4562 /* Only change the context if the function changes. This hook is called
4563 several times in the course of compiling a function, and we don't want to
4564 slow things down too much or call target_reinit when it isn't safe. */
4565 if (fndecl && fndecl != ix86_previous_fndecl)
4566 {
4567 tree old_tree = (ix86_previous_fndecl
4568 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4569 : NULL_TREE);
4570
4571 tree new_tree = (fndecl
4572 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4573 : NULL_TREE);
4574
4575 ix86_previous_fndecl = fndecl;
4576 if (old_tree == new_tree)
4577 ;
4578
4579 else if (new_tree)
4580 {
4581 cl_target_option_restore (&global_options,
4582 TREE_TARGET_OPTION (new_tree));
4583 target_reinit ();
4584 }
4585
4586 else if (old_tree)
4587 {
4588 struct cl_target_option *def
4589 = TREE_TARGET_OPTION (target_option_current_node);
4590
4591 cl_target_option_restore (&global_options, def);
4592 target_reinit ();
4593 }
4594 }
4595 }
4596
4597 \f
4598 /* Return true if this goes in large data/bss. */
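/* For example, with -mcmodel=medium a variable bigger than the
   -mlarge-data-threshold value (ix86_section_threshold) counts as large
   data, as does anything explicitly placed in the .ldata or .lbss
   sections.  */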
4599
4600 static bool
4601 ix86_in_large_data_p (tree exp)
4602 {
4603 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4604 return false;
4605
4606 /* Functions are never large data. */
4607 if (TREE_CODE (exp) == FUNCTION_DECL)
4608 return false;
4609
4610 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4611 {
4612 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4613 if (strcmp (section, ".ldata") == 0
4614 || strcmp (section, ".lbss") == 0)
4615 return true;
4616 return false;
4617 }
4618 else
4619 {
4620 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4621
4622 /* If this is an incomplete type with size 0, then we can't put it
4623 in data because it might be too big when completed. */
4624 if (!size || size > ix86_section_threshold)
4625 return true;
4626 }
4627
4628 return false;
4629 }
4630
4631 /* Switch to the appropriate section for output of DECL.
4632 DECL is either a `VAR_DECL' node or a constant of some sort.
4633 RELOC indicates whether forming the initial value of DECL requires
4634 link-time relocations. */
4635
4636 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4637 ATTRIBUTE_UNUSED;
4638
4639 static section *
4640 x86_64_elf_select_section (tree decl, int reloc,
4641 unsigned HOST_WIDE_INT align)
4642 {
4643 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4644 && ix86_in_large_data_p (decl))
4645 {
4646 const char *sname = NULL;
4647 unsigned int flags = SECTION_WRITE;
4648 switch (categorize_decl_for_section (decl, reloc))
4649 {
4650 case SECCAT_DATA:
4651 sname = ".ldata";
4652 break;
4653 case SECCAT_DATA_REL:
4654 sname = ".ldata.rel";
4655 break;
4656 case SECCAT_DATA_REL_LOCAL:
4657 sname = ".ldata.rel.local";
4658 break;
4659 case SECCAT_DATA_REL_RO:
4660 sname = ".ldata.rel.ro";
4661 break;
4662 case SECCAT_DATA_REL_RO_LOCAL:
4663 sname = ".ldata.rel.ro.local";
4664 break;
4665 case SECCAT_BSS:
4666 sname = ".lbss";
4667 flags |= SECTION_BSS;
4668 break;
4669 case SECCAT_RODATA:
4670 case SECCAT_RODATA_MERGE_STR:
4671 case SECCAT_RODATA_MERGE_STR_INIT:
4672 case SECCAT_RODATA_MERGE_CONST:
4673 sname = ".lrodata";
4674 flags = 0;
4675 break;
4676 case SECCAT_SRODATA:
4677 case SECCAT_SDATA:
4678 case SECCAT_SBSS:
4679 gcc_unreachable ();
4680 case SECCAT_TEXT:
4681 case SECCAT_TDATA:
4682 case SECCAT_TBSS:
4683 	          /* We don't split these for the medium model.  Place them into
4684 	             default sections and hope for the best.  */
4685 break;
4686 }
4687 if (sname)
4688 {
4689 /* We might get called with string constants, but get_named_section
4690 doesn't like them as they are not DECLs. Also, we need to set
4691 flags in that case. */
4692 if (!DECL_P (decl))
4693 return get_section (sname, flags, NULL);
4694 return get_named_section (decl, sname, reloc);
4695 }
4696 }
4697 return default_elf_select_section (decl, reloc, align);
4698 }
4699
4700 /* Build up a unique section name, expressed as a
4701 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4702 RELOC indicates whether the initial value of EXP requires
4703 link-time relocations. */
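/* For instance, a one-only large-BSS object named, say, "foo" (with no
   COMDAT group support) gets the section name ".gnu.linkonce.lb.foo",
   assembled from the linkonce prefix, the ".lb" category prefix and the
   decl's assembler name below.  */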
4704
4705 static void ATTRIBUTE_UNUSED
4706 x86_64_elf_unique_section (tree decl, int reloc)
4707 {
4708 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4709 && ix86_in_large_data_p (decl))
4710 {
4711 const char *prefix = NULL;
4712 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4713 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4714
4715 switch (categorize_decl_for_section (decl, reloc))
4716 {
4717 case SECCAT_DATA:
4718 case SECCAT_DATA_REL:
4719 case SECCAT_DATA_REL_LOCAL:
4720 case SECCAT_DATA_REL_RO:
4721 case SECCAT_DATA_REL_RO_LOCAL:
4722 prefix = one_only ? ".ld" : ".ldata";
4723 break;
4724 case SECCAT_BSS:
4725 prefix = one_only ? ".lb" : ".lbss";
4726 break;
4727 case SECCAT_RODATA:
4728 case SECCAT_RODATA_MERGE_STR:
4729 case SECCAT_RODATA_MERGE_STR_INIT:
4730 case SECCAT_RODATA_MERGE_CONST:
4731 prefix = one_only ? ".lr" : ".lrodata";
4732 break;
4733 case SECCAT_SRODATA:
4734 case SECCAT_SDATA:
4735 case SECCAT_SBSS:
4736 gcc_unreachable ();
4737 case SECCAT_TEXT:
4738 case SECCAT_TDATA:
4739 case SECCAT_TBSS:
4740 	      /* We don't split these for the medium model.  Place them into
4741 	         default sections and hope for the best.  */
4742 break;
4743 }
4744 if (prefix)
4745 {
4746 const char *name, *linkonce;
4747 char *string;
4748
4749 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4750 name = targetm.strip_name_encoding (name);
4751
4752 /* If we're using one_only, then there needs to be a .gnu.linkonce
4753 prefix to the section name. */
4754 linkonce = one_only ? ".gnu.linkonce" : "";
4755
4756 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4757
4758 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4759 return;
4760 }
4761 }
4762 default_unique_section (decl, reloc);
4763 }
4764
4765 #ifdef COMMON_ASM_OP
4766 /* This says how to output assembler code to declare an
4767 uninitialized external linkage data object.
4768
4769 	   For medium-model x86-64 we need to use the .largecomm directive for
4770 	   large objects.  */
4771 void
4772 x86_elf_aligned_common (FILE *file,
4773 const char *name, unsigned HOST_WIDE_INT size,
4774 int align)
4775 {
4776 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4777 && size > (unsigned int)ix86_section_threshold)
4778 fputs (".largecomm\t", file);
4779 else
4780 fputs (COMMON_ASM_OP, file);
4781 assemble_name (file, name);
4782 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4783 size, align / BITS_PER_UNIT);
4784 }
4785 #endif
4786
4787 /* Utility function for targets to use in implementing
4788 ASM_OUTPUT_ALIGNED_BSS. */
4789
4790 void
4791 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4792 const char *name, unsigned HOST_WIDE_INT size,
4793 int align)
4794 {
4795 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4796 && size > (unsigned int)ix86_section_threshold)
4797 switch_to_section (get_named_section (decl, ".lbss", 0));
4798 else
4799 switch_to_section (bss_section);
4800 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4801 #ifdef ASM_DECLARE_OBJECT_NAME
4802 last_assemble_variable_decl = decl;
4803 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4804 #else
4805 	  /* The standard thing is just to output a label for the object.  */
4806 ASM_OUTPUT_LABEL (file, name);
4807 #endif /* ASM_DECLARE_OBJECT_NAME */
4808 ASM_OUTPUT_SKIP (file, size ? size : 1);
4809 }
4810 \f
4811 /* Decide whether we must probe the stack before any space allocation
4812 on this target. It's essentially TARGET_STACK_PROBE except when
4813 -fstack-check causes the stack to be already probed differently. */
4814
4815 bool
4816 ix86_target_stack_probe (void)
4817 {
4818 /* Do not probe the stack twice if static stack checking is enabled. */
4819 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4820 return false;
4821
4822 return TARGET_STACK_PROBE;
4823 }
4824 \f
4825 /* Decide whether we can make a sibling call to a function. DECL is the
4826 declaration of the function being targeted by the call and EXP is the
4827 CALL_EXPR representing the call. */
4828
4829 static bool
4830 ix86_function_ok_for_sibcall (tree decl, tree exp)
4831 {
4832 tree type, decl_or_type;
4833 rtx a, b;
4834
4835 /* If we are generating position-independent code, we cannot sibcall
4836 optimize any indirect call, or a direct call to a global function,
4837 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4838 if (!TARGET_MACHO
4839 && !TARGET_64BIT
4840 && flag_pic
4841 && (!decl || !targetm.binds_local_p (decl)))
4842 return false;
4843
4844 /* If we need to align the outgoing stack, then sibcalling would
4845 unalign the stack, which may break the called function. */
4846 if (ix86_minimum_incoming_stack_boundary (true)
4847 < PREFERRED_STACK_BOUNDARY)
4848 return false;
4849
4850 if (decl)
4851 {
4852 decl_or_type = decl;
4853 type = TREE_TYPE (decl);
4854 }
4855 else
4856 {
4857 /* We're looking at the CALL_EXPR, we need the type of the function. */
4858 type = CALL_EXPR_FN (exp); /* pointer expression */
4859 type = TREE_TYPE (type); /* pointer type */
4860 type = TREE_TYPE (type); /* function type */
4861 decl_or_type = type;
4862 }
4863
4864 /* Check that the return value locations are the same.  For example,
4865 if we are returning floats on the 80387 register stack, we cannot
4866 make a sibcall from a function that doesn't return a float to a
4867 function that does or, conversely, from a function that does return
4868 a float to a function that doesn't; the necessary stack adjustment
4869 would not be executed. This is also the place we notice
4870 differences in the return value ABI. Note that it is ok for one
4871 of the functions to have void return type as long as the return
4872 value of the other is passed in a register. */
4873 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4874 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4875 cfun->decl, false);
4876 if (STACK_REG_P (a) || STACK_REG_P (b))
4877 {
4878 if (!rtx_equal_p (a, b))
4879 return false;
4880 }
4881 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4882 {
4883 /* Disable sibcall if we need to generate vzeroupper after
4884 callee returns. */
4885 if (TARGET_VZEROUPPER
4886 && cfun->machine->callee_return_avx256_p
4887 && !cfun->machine->caller_return_avx256_p)
4888 return false;
4889 }
4890 else if (!rtx_equal_p (a, b))
4891 return false;
4892
4893 if (TARGET_64BIT)
4894 {
4895 /* The SYSV ABI has more call-clobbered registers;
4896 disallow sibcalls from MS to SYSV. */
4897 if (cfun->machine->call_abi == MS_ABI
4898 && ix86_function_type_abi (type) == SYSV_ABI)
4899 return false;
4900 }
4901 else
4902 {
4903 /* If this call is indirect, we'll need to be able to use a
4904 call-clobbered register for the address of the target function.
4905 Make sure that all such registers are not used for passing
4906 parameters. Note that DLLIMPORT functions are indirect. */
4907 if (!decl
4908 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4909 {
4910 if (ix86_function_regparm (type, NULL) >= 3)
4911 {
4912 /* ??? Need to count the actual number of registers to be used,
4913 not the possible number of registers. Fix later. */
4914 return false;
4915 }
4916 }
4917 }
4918
4919 /* Otherwise okay. That also includes certain types of indirect calls. */
4920 return true;
4921 }
4922
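/* Illustrative example (the function names are hypothetical): with -m32,
   where values are returned on the 80387 register stack,

     double g (void);
     void f (void) { g (); }

   the call to G in F is rejected as a sibcall by the function above: G
   leaves its result in st(0) while F returns nothing, so the instruction
   that pops the x87 stack after the call would never execute.  */
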
4923 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4924 and "sseregparm" calling convention attributes;
4925 arguments as in struct attribute_spec.handler. */
4926
4927 static tree
4928 ix86_handle_cconv_attribute (tree *node, tree name,
4929 tree args,
4930 int flags ATTRIBUTE_UNUSED,
4931 bool *no_add_attrs)
4932 {
4933 if (TREE_CODE (*node) != FUNCTION_TYPE
4934 && TREE_CODE (*node) != METHOD_TYPE
4935 && TREE_CODE (*node) != FIELD_DECL
4936 && TREE_CODE (*node) != TYPE_DECL)
4937 {
4938 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4939 name);
4940 *no_add_attrs = true;
4941 return NULL_TREE;
4942 }
4943
4944 /* Can combine regparm with all attributes but fastcall and thiscall. */
4945 if (is_attribute_p ("regparm", name))
4946 {
4947 tree cst;
4948
4949 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4950 {
4951 error ("fastcall and regparm attributes are not compatible");
4952 }
4953
4954 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4955 {
4956 error ("regparm and thiscall attributes are not compatible");
4957 }
4958
4959 cst = TREE_VALUE (args);
4960 if (TREE_CODE (cst) != INTEGER_CST)
4961 {
4962 warning (OPT_Wattributes,
4963 "%qE attribute requires an integer constant argument",
4964 name);
4965 *no_add_attrs = true;
4966 }
4967 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4968 {
4969 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4970 name, REGPARM_MAX);
4971 *no_add_attrs = true;
4972 }
4973
4974 return NULL_TREE;
4975 }
4976
4977 if (TARGET_64BIT)
4978 {
4979 /* Do not warn when emulating the MS ABI. */
4980 if ((TREE_CODE (*node) != FUNCTION_TYPE
4981 && TREE_CODE (*node) != METHOD_TYPE)
4982 || ix86_function_type_abi (*node) != MS_ABI)
4983 warning (OPT_Wattributes, "%qE attribute ignored",
4984 name);
4985 *no_add_attrs = true;
4986 return NULL_TREE;
4987 }
4988
4989 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4990 if (is_attribute_p ("fastcall", name))
4991 {
4992 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4993 {
4994 error ("fastcall and cdecl attributes are not compatible");
4995 }
4996 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4997 {
4998 error ("fastcall and stdcall attributes are not compatible");
4999 }
5000 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5001 {
5002 error ("fastcall and regparm attributes are not compatible");
5003 }
5004 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5005 {
5006 error ("fastcall and thiscall attributes are not compatible");
5007 }
5008 }
5009
5010 /* Can combine stdcall with fastcall (redundant), regparm and
5011 sseregparm. */
5012 else if (is_attribute_p ("stdcall", name))
5013 {
5014 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5015 {
5016 error ("stdcall and cdecl attributes are not compatible");
5017 }
5018 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5019 {
5020 error ("stdcall and fastcall attributes are not compatible");
5021 }
5022 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5023 {
5024 error ("stdcall and thiscall attributes are not compatible");
5025 }
5026 }
5027
5028 /* Can combine cdecl with regparm and sseregparm. */
5029 else if (is_attribute_p ("cdecl", name))
5030 {
5031 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5032 {
5033 error ("stdcall and cdecl attributes are not compatible");
5034 }
5035 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5036 {
5037 error ("fastcall and cdecl attributes are not compatible");
5038 }
5039 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5040 {
5041 error ("cdecl and thiscall attributes are not compatible");
5042 }
5043 }
5044 else if (is_attribute_p ("thiscall", name))
5045 {
5046 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5047 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5048 name);
5049 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5050 {
5051 error ("stdcall and thiscall attributes are not compatible");
5052 }
5053 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5054 {
5055 error ("fastcall and thiscall attributes are not compatible");
5056 }
5057 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5058 {
5059 error ("cdecl and thiscall attributes are not compatible");
5060 }
5061 }
5062
5063 /* Can combine sseregparm with all attributes. */
5064
5065 return NULL_TREE;
5066 }
5067
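/* Illustrative 32-bit declarations for the attributes handled above (the
   function names are hypothetical):

     int __attribute__((regparm (3))) f (int, int, int);
     int __attribute__((fastcall)) g (int, int);

   F receives its three arguments in EAX, EDX and ECX; G receives its first
   two arguments in ECX and EDX.  Combining, say, fastcall and regparm on
   one declaration is diagnosed above as incompatible.  */
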
5068 /* The transactional memory builtins are implicitly regparm or fastcall
5069 depending on the ABI. Override the generic do-nothing attribute that
5070 these builtins were declared with, and replace it with one of the two
5071 attributes that we expect elsewhere. */
5072
5073 static tree
5074 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5075 tree args ATTRIBUTE_UNUSED,
5076 int flags ATTRIBUTE_UNUSED,
5077 bool *no_add_attrs)
5078 {
5079 tree alt;
5080
5081 /* In no case do we want to add the placeholder attribute. */
5082 *no_add_attrs = true;
5083
5084 /* The 64-bit ABI is unchanged for transactional memory. */
5085 if (TARGET_64BIT)
5086 return NULL_TREE;
5087
5088 /* ??? Is there a better way to validate 32-bit windows? We have
5089 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5090 if (CHECK_STACK_LIMIT > 0)
5091 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5092 else
5093 {
5094 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5095 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5096 }
5097 decl_attributes (node, alt, flags);
5098
5099 return NULL_TREE;
5100 }
5101
5102 /* This function determines from TYPE the calling-convention. */
5103
5104 unsigned int
5105 ix86_get_callcvt (const_tree type)
5106 {
5107 unsigned int ret = 0;
5108 bool is_stdarg;
5109 tree attrs;
5110
5111 if (TARGET_64BIT)
5112 return IX86_CALLCVT_CDECL;
5113
5114 attrs = TYPE_ATTRIBUTES (type);
5115 if (attrs != NULL_TREE)
5116 {
5117 if (lookup_attribute ("cdecl", attrs))
5118 ret |= IX86_CALLCVT_CDECL;
5119 else if (lookup_attribute ("stdcall", attrs))
5120 ret |= IX86_CALLCVT_STDCALL;
5121 else if (lookup_attribute ("fastcall", attrs))
5122 ret |= IX86_CALLCVT_FASTCALL;
5123 else if (lookup_attribute ("thiscall", attrs))
5124 ret |= IX86_CALLCVT_THISCALL;
5125
5126 /* Regparm is not allowed for thiscall or fastcall. */
5127 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5128 {
5129 if (lookup_attribute ("regparm", attrs))
5130 ret |= IX86_CALLCVT_REGPARM;
5131 if (lookup_attribute ("sseregparm", attrs))
5132 ret |= IX86_CALLCVT_SSEREGPARM;
5133 }
5134
5135 if (IX86_BASE_CALLCVT(ret) != 0)
5136 return ret;
5137 }
5138
5139 is_stdarg = stdarg_p (type);
5140 if (TARGET_RTD && !is_stdarg)
5141 return IX86_CALLCVT_STDCALL | ret;
5142
5143 if (ret != 0
5144 || is_stdarg
5145 || TREE_CODE (type) != METHOD_TYPE
5146 || ix86_function_type_abi (type) != MS_ABI)
5147 return IX86_CALLCVT_CDECL | ret;
5148
5149 return IX86_CALLCVT_THISCALL;
5150 }
5151
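/* For instance (illustrative): on a 32-bit target a prototype carrying
   __attribute__((stdcall)) yields IX86_CALLCVT_STDCALL; under -mrtd a
   non-variadic prototype with no explicit convention also becomes
   IX86_CALLCVT_STDCALL; a variadic prototype always ends up as
   IX86_CALLCVT_CDECL, since stdarg functions must be caller-popped.  */
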
5152 /* Return 0 if the attributes for two types are incompatible, 1 if they
5153 are compatible, and 2 if they are nearly compatible (which causes a
5154 warning to be generated). */
5155
5156 static int
5157 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5158 {
5159 unsigned int ccvt1, ccvt2;
5160
5161 if (TREE_CODE (type1) != FUNCTION_TYPE
5162 && TREE_CODE (type1) != METHOD_TYPE)
5163 return 1;
5164
5165 ccvt1 = ix86_get_callcvt (type1);
5166 ccvt2 = ix86_get_callcvt (type2);
5167 if (ccvt1 != ccvt2)
5168 return 0;
5169 if (ix86_function_regparm (type1, NULL)
5170 != ix86_function_regparm (type2, NULL))
5171 return 0;
5172
5173 return 1;
5174 }
5175 \f
5176 /* Return the regparm value for a function with the indicated TYPE and DECL.
5177 DECL may be NULL when calling function indirectly
5178 or considering a libcall. */
5179
5180 static int
5181 ix86_function_regparm (const_tree type, const_tree decl)
5182 {
5183 tree attr;
5184 int regparm;
5185 unsigned int ccvt;
5186
5187 if (TARGET_64BIT)
5188 return (ix86_function_type_abi (type) == SYSV_ABI
5189 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5190 ccvt = ix86_get_callcvt (type);
5191 regparm = ix86_regparm;
5192
5193 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5194 {
5195 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5196 if (attr)
5197 {
5198 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5199 return regparm;
5200 }
5201 }
5202 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5203 return 2;
5204 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5205 return 1;
5206
5207 /* Use register calling convention for local functions when possible. */
5208 if (decl
5209 && TREE_CODE (decl) == FUNCTION_DECL
5210 && optimize
5211 && !(profile_flag && !flag_fentry))
5212 {
5213 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5214 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5215 if (i && i->local && i->can_change_signature)
5216 {
5217 int local_regparm, globals = 0, regno;
5218
5219 /* Make sure no regparm register is taken by a
5220 fixed register variable. */
5221 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5222 if (fixed_regs[local_regparm])
5223 break;
5224
5225 /* We don't want to use regparm(3) for nested functions as
5226 these use a static chain pointer in the third argument. */
5227 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5228 local_regparm = 2;
5229
5230 /* In 32-bit mode save a register for the split stack. */
5231 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5232 local_regparm = 2;
5233
5234 /* Each fixed register usage increases register pressure,
5235 so fewer registers should be used for argument passing.
5236 This functionality can be overridden by an explicit
5237 regparm value. */
5238 for (regno = 0; regno <= DI_REG; regno++)
5239 if (fixed_regs[regno])
5240 globals++;
5241
5242 local_regparm
5243 = globals < local_regparm ? local_regparm - globals : 0;
5244
5245 if (local_regparm > regparm)
5246 regparm = local_regparm;
5247 }
5248 }
5249
5250 return regparm;
5251 }
5252
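/* Example (illustrative): a file-local function compiled with optimization,
   visible to the compiler in full (i->local above), and with no regparm
   registers made fixed, can be promoted to regparm (3) here; a nested
   function is capped at regparm (2) because the static chain occupies the
   third register, as is a 32-bit function under -fsplit-stack.  */
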
5253 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5254 DFmode (2) arguments in SSE registers for a function with the
5255 indicated TYPE and DECL. DECL may be NULL when calling function
5256 indirectly or considering a libcall. Otherwise return 0. */
5257
5258 static int
5259 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5260 {
5261 gcc_assert (!TARGET_64BIT);
5262
5263 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5264 by the sseregparm attribute. */
5265 if (TARGET_SSEREGPARM
5266 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5267 {
5268 if (!TARGET_SSE)
5269 {
5270 if (warn)
5271 {
5272 if (decl)
5273 error ("calling %qD with attribute sseregparm without "
5274 "SSE/SSE2 enabled", decl);
5275 else
5276 error ("calling %qT with attribute sseregparm without "
5277 "SSE/SSE2 enabled", type);
5278 }
5279 return 0;
5280 }
5281
5282 return 2;
5283 }
5284
5285 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5286 (and DFmode for SSE2) arguments in SSE registers. */
5287 if (decl && TARGET_SSE_MATH && optimize
5288 && !(profile_flag && !flag_fentry))
5289 {
5290 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5291 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5292 if (i && i->local && i->can_change_signature)
5293 return TARGET_SSE2 ? 2 : 1;
5294 }
5295
5296 return 0;
5297 }
5298
5299 /* Return true if EAX is live at the start of the function. Used by
5300 ix86_expand_prologue to determine if we need special help before
5301 calling allocate_stack_worker. */
5302
5303 static bool
5304 ix86_eax_live_at_start_p (void)
5305 {
5306 /* Cheat. Don't bother working forward from ix86_function_regparm
5307 to the function type to whether an actual argument is located in
5308 eax. Instead just look at cfg info, which is still close enough
5309 to correct at this point. This gives false positives for broken
5310 functions that might use uninitialized data that happens to be
5311 allocated in eax, but who cares? */
5312 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5313 }
5314
5315 static bool
5316 ix86_keep_aggregate_return_pointer (tree fntype)
5317 {
5318 tree attr;
5319
5320 if (!TARGET_64BIT)
5321 {
5322 attr = lookup_attribute ("callee_pop_aggregate_return",
5323 TYPE_ATTRIBUTES (fntype));
5324 if (attr)
5325 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5326
5327 /* For 32-bit MS-ABI the default is to keep aggregate
5328 return pointer. */
5329 if (ix86_function_type_abi (fntype) == MS_ABI)
5330 return true;
5331 }
5332 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5333 }
5334
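/* Illustrative usage of the attribute checked above (the type and function
   names are hypothetical):

     struct big { char c[64]; };
     struct big __attribute__((callee_pop_aggregate_return (1))) f (void);

   An argument of 1 asks the callee to pop the hidden return-slot pointer;
   an argument of 0 leaves it to the caller, which is also the 32-bit MS-ABI
   default.  */
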
5335 /* Value is the number of bytes of arguments automatically
5336 popped when returning from a subroutine call.
5337 FUNDECL is the declaration node of the function (as a tree),
5338 FUNTYPE is the data type of the function (as a tree),
5339 or for a library call it is an identifier node for the subroutine name.
5340 SIZE is the number of bytes of arguments passed on the stack.
5341
5342 On the 80386, the RTD insn may be used to pop them if the number
5343 of args is fixed, but if the number is variable then the caller
5344 must pop them all. RTD can't be used for library calls now
5345 because the library is compiled with the Unix compiler.
5346 Use of RTD is a selectable option, since it is incompatible with
5347 standard Unix calling sequences. If the option is not selected,
5348 the caller must always pop the args.
5349
5350 The attribute stdcall is equivalent to RTD on a per module basis. */
5351
5352 static int
5353 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5354 {
5355 unsigned int ccvt;
5356
5357 /* None of the 64-bit ABIs pop arguments. */
5358 if (TARGET_64BIT)
5359 return 0;
5360
5361 ccvt = ix86_get_callcvt (funtype);
5362
5363 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5364 | IX86_CALLCVT_THISCALL)) != 0
5365 && ! stdarg_p (funtype))
5366 return size;
5367
5368 /* Lose any fake structure return argument if it is passed on the stack. */
5369 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5370 && !ix86_keep_aggregate_return_pointer (funtype))
5371 {
5372 int nregs = ix86_function_regparm (funtype, fundecl);
5373 if (nregs == 0)
5374 return GET_MODE_SIZE (Pmode);
5375 }
5376
5377 return 0;
5378 }
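
/* Example (illustrative): a 32-bit __attribute__((stdcall)) function taking
   three int arguments returns with "ret $12", popping its own 12 bytes of
   stack arguments, whereas its cdecl counterpart returns with a plain "ret"
   and leaves the %esp adjustment to the caller.  */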
5379 \f
5380 /* Argument support functions. */
5381
5382 /* Return true when register may be used to pass function parameters. */
5383 bool
5384 ix86_function_arg_regno_p (int regno)
5385 {
5386 int i;
5387 const int *parm_regs;
5388
5389 if (!TARGET_64BIT)
5390 {
5391 if (TARGET_MACHO)
5392 return (regno < REGPARM_MAX
5393 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5394 else
5395 return (regno < REGPARM_MAX
5396 || (TARGET_MMX && MMX_REGNO_P (regno)
5397 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5398 || (TARGET_SSE && SSE_REGNO_P (regno)
5399 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5400 }
5401
5402 if (TARGET_MACHO)
5403 {
5404 if (SSE_REGNO_P (regno) && TARGET_SSE)
5405 return true;
5406 }
5407 else
5408 {
5409 if (TARGET_SSE && SSE_REGNO_P (regno)
5410 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5411 return true;
5412 }
5413
5414 /* TODO: The function should depend on current function ABI but
5415 builtins.c would need updating then. Therefore we use the
5416 default ABI. */
5417
5418 /* RAX is used as hidden argument to va_arg functions. */
5419 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5420 return true;
5421
5422 if (ix86_abi == MS_ABI)
5423 parm_regs = x86_64_ms_abi_int_parameter_registers;
5424 else
5425 parm_regs = x86_64_int_parameter_registers;
5426 for (i = 0; i < (ix86_abi == MS_ABI
5427 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5428 if (regno == parm_regs[i])
5429 return true;
5430 return false;
5431 }
5432
5433 /* Return true if we do not know how to pass TYPE solely in registers. */
5434
5435 static bool
5436 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5437 {
5438 if (must_pass_in_stack_var_size_or_pad (mode, type))
5439 return true;
5440
5441 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5442 The layout_type routine is crafty and tries to trick us into passing
5443 currently unsupported vector types on the stack by using TImode. */
5444 return (!TARGET_64BIT && mode == TImode
5445 && type && TREE_CODE (type) != VECTOR_TYPE);
5446 }
5447
5448 /* Return the size, in bytes, of the area reserved for arguments passed
5449 in registers by the function represented by FNDECL, depending on the
5450 ABI it uses. */
5451 int
5452 ix86_reg_parm_stack_space (const_tree fndecl)
5453 {
5454 enum calling_abi call_abi = SYSV_ABI;
5455 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5456 call_abi = ix86_function_abi (fndecl);
5457 else
5458 call_abi = ix86_function_type_abi (fndecl);
5459 if (TARGET_64BIT && call_abi == MS_ABI)
5460 return 32;
5461 return 0;
5462 }
5463
5464 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5465 call abi used. */
5466 enum calling_abi
5467 ix86_function_type_abi (const_tree fntype)
5468 {
5469 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5470 {
5471 enum calling_abi abi = ix86_abi;
5472 if (abi == SYSV_ABI)
5473 {
5474 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5475 abi = MS_ABI;
5476 }
5477 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5478 abi = SYSV_ABI;
5479 return abi;
5480 }
5481 return ix86_abi;
5482 }
5483
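/* Illustrative usage (the function name is hypothetical): when the default
   ABI is SYSV, a declaration such as

     void __attribute__((ms_abi)) f (long a, long b);

   makes calls to F follow the Microsoft convention (integer arguments in
   RCX/RDX/R8/R9 plus a 32-byte register parameter area on the stack), while
   __attribute__((sysv_abi)) requests the converse under -mabi=ms.  */
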
5484 static bool
5485 ix86_function_ms_hook_prologue (const_tree fn)
5486 {
5487 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5488 {
5489 if (decl_function_context (fn) != NULL_TREE)
5490 error_at (DECL_SOURCE_LOCATION (fn),
5491 "ms_hook_prologue is not compatible with nested function");
5492 else
5493 return true;
5494 }
5495 return false;
5496 }
5497
5498 static enum calling_abi
5499 ix86_function_abi (const_tree fndecl)
5500 {
5501 if (! fndecl)
5502 return ix86_abi;
5503 return ix86_function_type_abi (TREE_TYPE (fndecl));
5504 }
5505
5506 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5507 call abi used. */
5508 enum calling_abi
5509 ix86_cfun_abi (void)
5510 {
5511 if (! cfun)
5512 return ix86_abi;
5513 return cfun->machine->call_abi;
5514 }
5515
5516 /* Write the extra assembler code needed to declare a function properly. */
5517
5518 void
5519 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5520 tree decl)
5521 {
5522 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5523
5524 if (is_ms_hook)
5525 {
5526 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5527 unsigned int filler_cc = 0xcccccccc;
5528
5529 for (i = 0; i < filler_count; i += 4)
5530 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5531 }
5532
5533 #ifdef SUBTARGET_ASM_UNWIND_INIT
5534 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5535 #endif
5536
5537 ASM_OUTPUT_LABEL (asm_out_file, fname);
5538
5539 /* Output magic byte marker, if hot-patch attribute is set. */
5540 if (is_ms_hook)
5541 {
5542 if (TARGET_64BIT)
5543 {
5544 /* leaq [%rsp + 0], %rsp */
5545 asm_fprintf (asm_out_file, ASM_BYTE
5546 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5547 }
5548 else
5549 {
5550 /* movl.s %edi, %edi
5551 push %ebp
5552 movl.s %esp, %ebp */
5553 asm_fprintf (asm_out_file, ASM_BYTE
5554 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5555 }
5556 }
5557 }
5558
5559 /* regclass.c */
5560 extern void init_regs (void);
5561
5562 /* Implementation of the call ABI switching target hook.  The call register
5563 sets specific to FNDECL are selected here.  See also
5564 ix86_conditional_register_usage for more details. */
5565 void
5566 ix86_call_abi_override (const_tree fndecl)
5567 {
5568 if (fndecl == NULL_TREE)
5569 cfun->machine->call_abi = ix86_abi;
5570 else
5571 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5572 }
5573
5574 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5575 Avoid expensive re-initialization of init_regs each time we switch function
5576 context, since this is needed only during RTL expansion. */
5577 static void
5578 ix86_maybe_switch_abi (void)
5579 {
5580 if (TARGET_64BIT &&
5581 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5582 reinit_regs ();
5583 }
5584
5585 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5586 for a call to a function whose data type is FNTYPE.
5587 For a library call, FNTYPE is 0. */
5588
5589 void
5590 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5591 tree fntype, /* tree ptr for function decl */
5592 rtx libname, /* SYMBOL_REF of library name or 0 */
5593 tree fndecl,
5594 int caller)
5595 {
5596 struct cgraph_local_info *i;
5597 tree fnret_type;
5598
5599 memset (cum, 0, sizeof (*cum));
5600
5601 /* Initialize for the current callee. */
5602 if (caller)
5603 {
5604 cfun->machine->callee_pass_avx256_p = false;
5605 cfun->machine->callee_return_avx256_p = false;
5606 }
5607
5608 if (fndecl)
5609 {
5610 i = cgraph_local_info (fndecl);
5611 cum->call_abi = ix86_function_abi (fndecl);
5612 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5613 }
5614 else
5615 {
5616 i = NULL;
5617 cum->call_abi = ix86_function_type_abi (fntype);
5618 if (fntype)
5619 fnret_type = TREE_TYPE (fntype);
5620 else
5621 fnret_type = NULL;
5622 }
5623
5624 if (TARGET_VZEROUPPER && fnret_type)
5625 {
5626 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5627 false);
5628 if (function_pass_avx256_p (fnret_value))
5629 {
5630 /* The return value of this function uses 256bit AVX modes. */
5631 if (caller)
5632 cfun->machine->callee_return_avx256_p = true;
5633 else
5634 cfun->machine->caller_return_avx256_p = true;
5635 }
5636 }
5637
5638 cum->caller = caller;
5639
5640 /* Set up the number of registers to use for passing arguments. */
5641
5642 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5643 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5644 "or subtarget optimization implying it");
5645 cum->nregs = ix86_regparm;
5646 if (TARGET_64BIT)
5647 {
5648 cum->nregs = (cum->call_abi == SYSV_ABI
5649 ? X86_64_REGPARM_MAX
5650 : X86_64_MS_REGPARM_MAX);
5651 }
5652 if (TARGET_SSE)
5653 {
5654 cum->sse_nregs = SSE_REGPARM_MAX;
5655 if (TARGET_64BIT)
5656 {
5657 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5658 ? X86_64_SSE_REGPARM_MAX
5659 : X86_64_MS_SSE_REGPARM_MAX);
5660 }
5661 }
5662 if (TARGET_MMX)
5663 cum->mmx_nregs = MMX_REGPARM_MAX;
5664 cum->warn_avx = true;
5665 cum->warn_sse = true;
5666 cum->warn_mmx = true;
5667
5668 /* Because the type might mismatch between caller and callee, we need to
5669 use the actual type of the function for local calls.
5670 FIXME: cgraph_analyze can be told to actually record if a function uses
5671 va_start, so for local functions maybe_vaarg can be made more aggressive,
5672 helping K&R code.
5673 FIXME: once the type system is fixed, we won't need this code anymore. */
5674 if (i && i->local && i->can_change_signature)
5675 fntype = TREE_TYPE (fndecl);
5676 cum->maybe_vaarg = (fntype
5677 ? (!prototype_p (fntype) || stdarg_p (fntype))
5678 : !libname);
5679
5680 if (!TARGET_64BIT)
5681 {
5682 /* If there are variable arguments, then we won't pass anything
5683 in registers in 32-bit mode. */
5684 if (stdarg_p (fntype))
5685 {
5686 cum->nregs = 0;
5687 cum->sse_nregs = 0;
5688 cum->mmx_nregs = 0;
5689 cum->warn_avx = 0;
5690 cum->warn_sse = 0;
5691 cum->warn_mmx = 0;
5692 return;
5693 }
5694
5695 /* Use ecx and edx registers if function has fastcall attribute,
5696 else look for regparm information. */
5697 if (fntype)
5698 {
5699 unsigned int ccvt = ix86_get_callcvt (fntype);
5700 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5701 {
5702 cum->nregs = 1;
5703 cum->fastcall = 1; /* Same first register as in fastcall. */
5704 }
5705 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5706 {
5707 cum->nregs = 2;
5708 cum->fastcall = 1;
5709 }
5710 else
5711 cum->nregs = ix86_function_regparm (fntype, fndecl);
5712 }
5713
5714 /* Set up the number of SSE registers used for passing SFmode
5715 and DFmode arguments. Warn for mismatching ABI. */
5716 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5717 }
5718 }
5719
5720 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5721 But in the case of vector types, it is some vector mode.
5722
5723 When we have only some of our vector isa extensions enabled, then there
5724 are some modes for which vector_mode_supported_p is false. For these
5725 modes, the generic vector support in gcc will choose some non-vector mode
5726 in order to implement the type. By computing the natural mode, we'll
5727 select the proper ABI location for the operand and not depend on whatever
5728 the middle-end decides to do with these vector types.
5729
5730 The middle-end can't deal with vector types larger than 16 bytes.  In
5731 that case we return the original mode and warn about the ABI change if
5732 CUM isn't NULL. */
5733
5734 static enum machine_mode
5735 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5736 {
5737 enum machine_mode mode = TYPE_MODE (type);
5738
5739 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5740 {
5741 HOST_WIDE_INT size = int_size_in_bytes (type);
5742 if ((size == 8 || size == 16 || size == 32)
5743 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5744 && TYPE_VECTOR_SUBPARTS (type) > 1)
5745 {
5746 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5747
5748 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5749 mode = MIN_MODE_VECTOR_FLOAT;
5750 else
5751 mode = MIN_MODE_VECTOR_INT;
5752
5753 /* Get the mode which has this inner mode and number of units. */
5754 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5755 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5756 && GET_MODE_INNER (mode) == innermode)
5757 {
5758 if (size == 32 && !TARGET_AVX)
5759 {
5760 static bool warnedavx;
5761
5762 if (cum
5763 && !warnedavx
5764 && cum->warn_avx)
5765 {
5766 warnedavx = true;
5767 warning (0, "AVX vector argument without AVX "
5768 "enabled changes the ABI");
5769 }
5770 return TYPE_MODE (type);
5771 }
5772 else
5773 return mode;
5774 }
5775
5776 gcc_unreachable ();
5777 }
5778 }
5779
5780 return mode;
5781 }
5782
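/* Example (illustrative, the typedef is hypothetical):

     typedef int v8si __attribute__((vector_size (32)));

   has the natural mode V8SImode when AVX is enabled.  Without AVX the
   function above warns once that the ABI changes and falls back to the
   type's original mode, so such an argument normally ends up in memory
   rather than in a 256bit register.  */
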
5783 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5784 this may not agree with the mode that the type system has chosen for the
5785 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5786 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5787
5788 static rtx
5789 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5790 unsigned int regno)
5791 {
5792 rtx tmp;
5793
5794 if (orig_mode != BLKmode)
5795 tmp = gen_rtx_REG (orig_mode, regno);
5796 else
5797 {
5798 tmp = gen_rtx_REG (mode, regno);
5799 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5800 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5801 }
5802
5803 return tmp;
5804 }
5805
5806 /* x86-64 register passing implementation.  See the x86-64 ABI for details.
5807 The goal of this code is to classify each eightbyte of the incoming argument
5808 by register class and assign registers accordingly. */
5809
5810 /* Return the union class of CLASS1 and CLASS2.
5811 See the x86-64 PS ABI for details. */
5812
5813 static enum x86_64_reg_class
5814 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5815 {
5816 /* Rule #1: If both classes are equal, this is the resulting class. */
5817 if (class1 == class2)
5818 return class1;
5819
5820 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5821 the other class. */
5822 if (class1 == X86_64_NO_CLASS)
5823 return class2;
5824 if (class2 == X86_64_NO_CLASS)
5825 return class1;
5826
5827 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5828 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5829 return X86_64_MEMORY_CLASS;
5830
5831 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5832 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5833 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5834 return X86_64_INTEGERSI_CLASS;
5835 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5836 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5837 return X86_64_INTEGER_CLASS;
5838
5839 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5840 MEMORY is used. */
5841 if (class1 == X86_64_X87_CLASS
5842 || class1 == X86_64_X87UP_CLASS
5843 || class1 == X86_64_COMPLEX_X87_CLASS
5844 || class2 == X86_64_X87_CLASS
5845 || class2 == X86_64_X87UP_CLASS
5846 || class2 == X86_64_COMPLEX_X87_CLASS)
5847 return X86_64_MEMORY_CLASS;
5848
5849 /* Rule #6: Otherwise class SSE is used. */
5850 return X86_64_SSE_CLASS;
5851 }
5852
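/* For example (illustrative): classifying union { float f; int i; } merges
   X86_64_SSESF_CLASS with X86_64_INTEGERSI_CLASS, which by rule #4 gives
   X86_64_INTEGERSI_CLASS, so the union is passed in an integer register.  */
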
5853 /* Classify the argument of type TYPE and mode MODE.
5854 CLASSES will be filled by the register class used to pass each word
5855 of the operand. The number of words is returned. In case the parameter
5856 should be passed in memory, 0 is returned. As a special case for zero
5857 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5858
5859 BIT_OFFSET is used internally for handling records and specifies the
5860 offset in bits modulo 256 to avoid overflow cases.
5861
5862 See the x86-64 PS ABI for details.
5863 */
5864
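/* Worked example (illustrative): struct { double d; int i; } occupies two
   eightbytes.  The first is classified X86_64_SSEDF_CLASS and the second
   X86_64_INTEGERSI_CLASS, so the struct is split between one SSE register
   and one integer register.  */
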
5865 static int
5866 classify_argument (enum machine_mode mode, const_tree type,
5867 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5868 {
5869 HOST_WIDE_INT bytes =
5870 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5871 int words
5872 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5873
5874 /* Variable sized entities are always passed/returned in memory. */
5875 if (bytes < 0)
5876 return 0;
5877
5878 if (mode != VOIDmode
5879 && targetm.calls.must_pass_in_stack (mode, type))
5880 return 0;
5881
5882 if (type && AGGREGATE_TYPE_P (type))
5883 {
5884 int i;
5885 tree field;
5886 enum x86_64_reg_class subclasses[MAX_CLASSES];
5887
5888 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5889 if (bytes > 32)
5890 return 0;
5891
5892 for (i = 0; i < words; i++)
5893 classes[i] = X86_64_NO_CLASS;
5894
5895 /* Zero-sized arrays or structures are NO_CLASS.  We return 0 to
5896 signal the memory class, so handle them as a special case. */
5897 if (!words)
5898 {
5899 classes[0] = X86_64_NO_CLASS;
5900 return 1;
5901 }
5902
5903 /* Classify each field of record and merge classes. */
5904 switch (TREE_CODE (type))
5905 {
5906 case RECORD_TYPE:
5907 /* And now merge the fields of structure. */
5908 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5909 {
5910 if (TREE_CODE (field) == FIELD_DECL)
5911 {
5912 int num;
5913
5914 if (TREE_TYPE (field) == error_mark_node)
5915 continue;
5916
5917 /* Bitfields are always classified as integer. Handle them
5918 early, since later code would consider them to be
5919 misaligned integers. */
5920 if (DECL_BIT_FIELD (field))
5921 {
5922 for (i = (int_bit_position (field)
5923 + (bit_offset % 64)) / 8 / 8;
5924 i < ((int_bit_position (field) + (bit_offset % 64))
5925 + tree_low_cst (DECL_SIZE (field), 0)
5926 + 63) / 8 / 8; i++)
5927 classes[i] =
5928 merge_classes (X86_64_INTEGER_CLASS,
5929 classes[i]);
5930 }
5931 else
5932 {
5933 int pos;
5934
5935 type = TREE_TYPE (field);
5936
5937 /* Flexible array member is ignored. */
5938 if (TYPE_MODE (type) == BLKmode
5939 && TREE_CODE (type) == ARRAY_TYPE
5940 && TYPE_SIZE (type) == NULL_TREE
5941 && TYPE_DOMAIN (type) != NULL_TREE
5942 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5943 == NULL_TREE))
5944 {
5945 static bool warned;
5946
5947 if (!warned && warn_psabi)
5948 {
5949 warned = true;
5950 inform (input_location,
5951 "the ABI of passing struct with"
5952 " a flexible array member has"
5953 " changed in GCC 4.4");
5954 }
5955 continue;
5956 }
5957 num = classify_argument (TYPE_MODE (type), type,
5958 subclasses,
5959 (int_bit_position (field)
5960 + bit_offset) % 256);
5961 if (!num)
5962 return 0;
5963 pos = (int_bit_position (field)
5964 + (bit_offset % 64)) / 8 / 8;
5965 for (i = 0; i < num && (i + pos) < words; i++)
5966 classes[i + pos] =
5967 merge_classes (subclasses[i], classes[i + pos]);
5968 }
5969 }
5970 }
5971 break;
5972
5973 case ARRAY_TYPE:
5974 /* Arrays are handled as small records. */
5975 {
5976 int num;
5977 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5978 TREE_TYPE (type), subclasses, bit_offset);
5979 if (!num)
5980 return 0;
5981
5982 /* The partial classes are now full classes. */
5983 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5984 subclasses[0] = X86_64_SSE_CLASS;
5985 if (subclasses[0] == X86_64_INTEGERSI_CLASS
5986 && !((bit_offset % 64) == 0 && bytes == 4))
5987 subclasses[0] = X86_64_INTEGER_CLASS;
5988
5989 for (i = 0; i < words; i++)
5990 classes[i] = subclasses[i % num];
5991
5992 break;
5993 }
5994 case UNION_TYPE:
5995 case QUAL_UNION_TYPE:
5996 /* Unions are similar to RECORD_TYPE but offset is always 0.
5997 */
5998 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5999 {
6000 if (TREE_CODE (field) == FIELD_DECL)
6001 {
6002 int num;
6003
6004 if (TREE_TYPE (field) == error_mark_node)
6005 continue;
6006
6007 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6008 TREE_TYPE (field), subclasses,
6009 bit_offset);
6010 if (!num)
6011 return 0;
6012 for (i = 0; i < num; i++)
6013 classes[i] = merge_classes (subclasses[i], classes[i]);
6014 }
6015 }
6016 break;
6017
6018 default:
6019 gcc_unreachable ();
6020 }
6021
6022 if (words > 2)
6023 {
6024 /* When the size is larger than 16 bytes, if the first eightbyte
6025 isn't X86_64_SSE_CLASS or any of the others isn't
6026 X86_64_SSEUP_CLASS, everything should be passed in
6027 memory. */
6028 if (classes[0] != X86_64_SSE_CLASS)
6029 return 0;
6030
6031 for (i = 1; i < words; i++)
6032 if (classes[i] != X86_64_SSEUP_CLASS)
6033 return 0;
6034 }
6035
6036 /* Final merger cleanup. */
6037 for (i = 0; i < words; i++)
6038 {
6039 /* If one class is MEMORY, everything should be passed in
6040 memory. */
6041 if (classes[i] == X86_64_MEMORY_CLASS)
6042 return 0;
6043
6044 /* The X86_64_SSEUP_CLASS should always be preceded by
6045 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6046 if (classes[i] == X86_64_SSEUP_CLASS
6047 && classes[i - 1] != X86_64_SSE_CLASS
6048 && classes[i - 1] != X86_64_SSEUP_CLASS)
6049 {
6050 /* The first one should never be X86_64_SSEUP_CLASS. */
6051 gcc_assert (i != 0);
6052 classes[i] = X86_64_SSE_CLASS;
6053 }
6054
6055 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6056 everything should be passed in memory. */
6057 if (classes[i] == X86_64_X87UP_CLASS
6058 && (classes[i - 1] != X86_64_X87_CLASS))
6059 {
6060 static bool warned;
6061
6062 /* The first one should never be X86_64_X87UP_CLASS. */
6063 gcc_assert (i != 0);
6064 if (!warned && warn_psabi)
6065 {
6066 warned = true;
6067 inform (input_location,
6068 "the ABI of passing union with long double"
6069 " has changed in GCC 4.4");
6070 }
6071 return 0;
6072 }
6073 }
6074 return words;
6075 }
6076
6077 /* Compute the alignment needed.  We align all types to natural boundaries
6078 with the exception of XFmode, which is aligned to 64 bits. */
6079 if (mode != VOIDmode && mode != BLKmode)
6080 {
6081 int mode_alignment = GET_MODE_BITSIZE (mode);
6082
6083 if (mode == XFmode)
6084 mode_alignment = 128;
6085 else if (mode == XCmode)
6086 mode_alignment = 256;
6087 if (COMPLEX_MODE_P (mode))
6088 mode_alignment /= 2;
6089 /* Misaligned fields are always returned in memory. */
6090 if (bit_offset % mode_alignment)
6091 return 0;
6092 }
6093
6094 /* for V1xx modes, just use the base mode */
6095 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6096 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6097 mode = GET_MODE_INNER (mode);
6098
6099 /* Classification of atomic types. */
6100 switch (mode)
6101 {
6102 case SDmode:
6103 case DDmode:
6104 classes[0] = X86_64_SSE_CLASS;
6105 return 1;
6106 case TDmode:
6107 classes[0] = X86_64_SSE_CLASS;
6108 classes[1] = X86_64_SSEUP_CLASS;
6109 return 2;
6110 case DImode:
6111 case SImode:
6112 case HImode:
6113 case QImode:
6114 case CSImode:
6115 case CHImode:
6116 case CQImode:
6117 {
6118 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6119
6120 if (size <= 32)
6121 {
6122 classes[0] = X86_64_INTEGERSI_CLASS;
6123 return 1;
6124 }
6125 else if (size <= 64)
6126 {
6127 classes[0] = X86_64_INTEGER_CLASS;
6128 return 1;
6129 }
6130 else if (size <= 64+32)
6131 {
6132 classes[0] = X86_64_INTEGER_CLASS;
6133 classes[1] = X86_64_INTEGERSI_CLASS;
6134 return 2;
6135 }
6136 else if (size <= 64+64)
6137 {
6138 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6139 return 2;
6140 }
6141 else
6142 gcc_unreachable ();
6143 }
6144 case CDImode:
6145 case TImode:
6146 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6147 return 2;
6148 case COImode:
6149 case OImode:
6150 /* OImode shouldn't be used directly. */
6151 gcc_unreachable ();
6152 case CTImode:
6153 return 0;
6154 case SFmode:
6155 if (!(bit_offset % 64))
6156 classes[0] = X86_64_SSESF_CLASS;
6157 else
6158 classes[0] = X86_64_SSE_CLASS;
6159 return 1;
6160 case DFmode:
6161 classes[0] = X86_64_SSEDF_CLASS;
6162 return 1;
6163 case XFmode:
6164 classes[0] = X86_64_X87_CLASS;
6165 classes[1] = X86_64_X87UP_CLASS;
6166 return 2;
6167 case TFmode:
6168 classes[0] = X86_64_SSE_CLASS;
6169 classes[1] = X86_64_SSEUP_CLASS;
6170 return 2;
6171 case SCmode:
6172 classes[0] = X86_64_SSE_CLASS;
6173 if (!(bit_offset % 64))
6174 return 1;
6175 else
6176 {
6177 static bool warned;
6178
6179 if (!warned && warn_psabi)
6180 {
6181 warned = true;
6182 inform (input_location,
6183 "the ABI of passing structure with complex float"
6184 " member has changed in GCC 4.4");
6185 }
6186 classes[1] = X86_64_SSESF_CLASS;
6187 return 2;
6188 }
6189 case DCmode:
6190 classes[0] = X86_64_SSEDF_CLASS;
6191 classes[1] = X86_64_SSEDF_CLASS;
6192 return 2;
6193 case XCmode:
6194 classes[0] = X86_64_COMPLEX_X87_CLASS;
6195 return 1;
6196 case TCmode:
6197 /* This mode is larger than 16 bytes. */
6198 return 0;
6199 case V8SFmode:
6200 case V8SImode:
6201 case V32QImode:
6202 case V16HImode:
6203 case V4DFmode:
6204 case V4DImode:
6205 classes[0] = X86_64_SSE_CLASS;
6206 classes[1] = X86_64_SSEUP_CLASS;
6207 classes[2] = X86_64_SSEUP_CLASS;
6208 classes[3] = X86_64_SSEUP_CLASS;
6209 return 4;
6210 case V4SFmode:
6211 case V4SImode:
6212 case V16QImode:
6213 case V8HImode:
6214 case V2DFmode:
6215 case V2DImode:
6216 classes[0] = X86_64_SSE_CLASS;
6217 classes[1] = X86_64_SSEUP_CLASS;
6218 return 2;
6219 case V1TImode:
6220 case V1DImode:
6221 case V2SFmode:
6222 case V2SImode:
6223 case V4HImode:
6224 case V8QImode:
6225 classes[0] = X86_64_SSE_CLASS;
6226 return 1;
6227 case BLKmode:
6228 case VOIDmode:
6229 return 0;
6230 default:
6231 gcc_assert (VECTOR_MODE_P (mode));
6232
6233 if (bytes > 16)
6234 return 0;
6235
6236 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6237
6238 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6239 classes[0] = X86_64_INTEGERSI_CLASS;
6240 else
6241 classes[0] = X86_64_INTEGER_CLASS;
6242 classes[1] = X86_64_INTEGER_CLASS;
6243 return 1 + (bytes > 8);
6244 }
6245 }
6246
6247 /* Examine the argument and set the number of registers required in each
6248 class.  Return 0 iff the parameter should be passed in memory. */
6249 static int
6250 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6251 int *int_nregs, int *sse_nregs)
6252 {
6253 enum x86_64_reg_class regclass[MAX_CLASSES];
6254 int n = classify_argument (mode, type, regclass, 0);
6255
6256 *int_nregs = 0;
6257 *sse_nregs = 0;
6258 if (!n)
6259 return 0;
6260 for (n--; n >= 0; n--)
6261 switch (regclass[n])
6262 {
6263 case X86_64_INTEGER_CLASS:
6264 case X86_64_INTEGERSI_CLASS:
6265 (*int_nregs)++;
6266 break;
6267 case X86_64_SSE_CLASS:
6268 case X86_64_SSESF_CLASS:
6269 case X86_64_SSEDF_CLASS:
6270 (*sse_nregs)++;
6271 break;
6272 case X86_64_NO_CLASS:
6273 case X86_64_SSEUP_CLASS:
6274 break;
6275 case X86_64_X87_CLASS:
6276 case X86_64_X87UP_CLASS:
6277 if (!in_return)
6278 return 0;
6279 break;
6280 case X86_64_COMPLEX_X87_CLASS:
6281 return in_return ? 2 : 0;
6282 case X86_64_MEMORY_CLASS:
6283 gcc_unreachable ();
6284 }
6285 return 1;
6286 }
6287
6288 /* Construct container for the argument used by GCC interface. See
6289 FUNCTION_ARG for the detailed description. */
6290
6291 static rtx
6292 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6293 const_tree type, int in_return, int nintregs, int nsseregs,
6294 const int *intreg, int sse_regno)
6295 {
6296 /* The following variables hold the static issued_error state. */
6297 static bool issued_sse_arg_error;
6298 static bool issued_sse_ret_error;
6299 static bool issued_x87_ret_error;
6300
6301 enum machine_mode tmpmode;
6302 int bytes =
6303 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6304 enum x86_64_reg_class regclass[MAX_CLASSES];
6305 int n;
6306 int i;
6307 int nexps = 0;
6308 int needed_sseregs, needed_intregs;
6309 rtx exp[MAX_CLASSES];
6310 rtx ret;
6311
6312 n = classify_argument (mode, type, regclass, 0);
6313 if (!n)
6314 return NULL;
6315 if (!examine_argument (mode, type, in_return, &needed_intregs,
6316 &needed_sseregs))
6317 return NULL;
6318 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6319 return NULL;
6320
6321 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6322 some less clueful developer tries to use floating-point anyway. */
6323 if (needed_sseregs && !TARGET_SSE)
6324 {
6325 if (in_return)
6326 {
6327 if (!issued_sse_ret_error)
6328 {
6329 error ("SSE register return with SSE disabled");
6330 issued_sse_ret_error = true;
6331 }
6332 }
6333 else if (!issued_sse_arg_error)
6334 {
6335 error ("SSE register argument with SSE disabled");
6336 issued_sse_arg_error = true;
6337 }
6338 return NULL;
6339 }
6340
6341 /* Likewise, error if the ABI requires us to return values in the
6342 x87 registers and the user specified -mno-80387. */
6343 if (!TARGET_80387 && in_return)
6344 for (i = 0; i < n; i++)
6345 if (regclass[i] == X86_64_X87_CLASS
6346 || regclass[i] == X86_64_X87UP_CLASS
6347 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6348 {
6349 if (!issued_x87_ret_error)
6350 {
6351 error ("x87 register return with x87 disabled");
6352 issued_x87_ret_error = true;
6353 }
6354 return NULL;
6355 }
6356
6357 /* First construct simple cases.  Avoid SCmode, since we want to use
6358 a single register to pass this type. */
6359 if (n == 1 && mode != SCmode)
6360 switch (regclass[0])
6361 {
6362 case X86_64_INTEGER_CLASS:
6363 case X86_64_INTEGERSI_CLASS:
6364 return gen_rtx_REG (mode, intreg[0]);
6365 case X86_64_SSE_CLASS:
6366 case X86_64_SSESF_CLASS:
6367 case X86_64_SSEDF_CLASS:
6368 if (mode != BLKmode)
6369 return gen_reg_or_parallel (mode, orig_mode,
6370 SSE_REGNO (sse_regno));
6371 break;
6372 case X86_64_X87_CLASS:
6373 case X86_64_COMPLEX_X87_CLASS:
6374 return gen_rtx_REG (mode, FIRST_STACK_REG);
6375 case X86_64_NO_CLASS:
6376 /* Zero sized array, struct or class. */
6377 return NULL;
6378 default:
6379 gcc_unreachable ();
6380 }
6381 if (n == 2
6382 && regclass[0] == X86_64_SSE_CLASS
6383 && regclass[1] == X86_64_SSEUP_CLASS
6384 && mode != BLKmode)
6385 return gen_reg_or_parallel (mode, orig_mode,
6386 SSE_REGNO (sse_regno));
6387 if (n == 4
6388 && regclass[0] == X86_64_SSE_CLASS
6389 && regclass[1] == X86_64_SSEUP_CLASS
6390 && regclass[2] == X86_64_SSEUP_CLASS
6391 && regclass[3] == X86_64_SSEUP_CLASS
6392 && mode != BLKmode)
6393 return gen_reg_or_parallel (mode, orig_mode,
6394 SSE_REGNO (sse_regno));
6395 if (n == 2
6396 && regclass[0] == X86_64_X87_CLASS
6397 && regclass[1] == X86_64_X87UP_CLASS)
6398 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6399
6400 if (n == 2
6401 && regclass[0] == X86_64_INTEGER_CLASS
6402 && regclass[1] == X86_64_INTEGER_CLASS
6403 && (mode == CDImode || mode == TImode || mode == TFmode)
6404 && intreg[0] + 1 == intreg[1])
6405 return gen_rtx_REG (mode, intreg[0]);
6406
6407 /* Otherwise figure out the entries of the PARALLEL. */
6408 for (i = 0; i < n; i++)
6409 {
6410 int pos;
6411
6412 switch (regclass[i])
6413 {
6414 case X86_64_NO_CLASS:
6415 break;
6416 case X86_64_INTEGER_CLASS:
6417 case X86_64_INTEGERSI_CLASS:
6418 /* Merge TImodes on aligned occasions here too. */
6419 if (i * 8 + 8 > bytes)
6420 tmpmode
6421 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6422 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6423 tmpmode = SImode;
6424 else
6425 tmpmode = DImode;
6426 /* We've requested 24 bytes, for which there is
6427 no suitable mode.  Use DImode instead. */
6428 if (tmpmode == BLKmode)
6429 tmpmode = DImode;
6430 exp [nexps++]
6431 = gen_rtx_EXPR_LIST (VOIDmode,
6432 gen_rtx_REG (tmpmode, *intreg),
6433 GEN_INT (i*8));
6434 intreg++;
6435 break;
6436 case X86_64_SSESF_CLASS:
6437 exp [nexps++]
6438 = gen_rtx_EXPR_LIST (VOIDmode,
6439 gen_rtx_REG (SFmode,
6440 SSE_REGNO (sse_regno)),
6441 GEN_INT (i*8));
6442 sse_regno++;
6443 break;
6444 case X86_64_SSEDF_CLASS:
6445 exp [nexps++]
6446 = gen_rtx_EXPR_LIST (VOIDmode,
6447 gen_rtx_REG (DFmode,
6448 SSE_REGNO (sse_regno)),
6449 GEN_INT (i*8));
6450 sse_regno++;
6451 break;
6452 case X86_64_SSE_CLASS:
6453 pos = i;
6454 switch (n)
6455 {
6456 case 1:
6457 tmpmode = DImode;
6458 break;
6459 case 2:
6460 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6461 {
6462 tmpmode = TImode;
6463 i++;
6464 }
6465 else
6466 tmpmode = DImode;
6467 break;
6468 case 4:
6469 gcc_assert (i == 0
6470 && regclass[1] == X86_64_SSEUP_CLASS
6471 && regclass[2] == X86_64_SSEUP_CLASS
6472 && regclass[3] == X86_64_SSEUP_CLASS);
6473 tmpmode = OImode;
6474 i += 3;
6475 break;
6476 default:
6477 gcc_unreachable ();
6478 }
6479 exp [nexps++]
6480 = gen_rtx_EXPR_LIST (VOIDmode,
6481 gen_rtx_REG (tmpmode,
6482 SSE_REGNO (sse_regno)),
6483 GEN_INT (pos*8));
6484 sse_regno++;
6485 break;
6486 default:
6487 gcc_unreachable ();
6488 }
6489 }
6490
6491 /* Empty aligned struct, union or class. */
6492 if (nexps == 0)
6493 return NULL;
6494
6495 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6496 for (i = 0; i < nexps; i++)
6497 XVECEXP (ret, 0, i) = exp [i];
6498 return ret;
6499 }
6500
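/* Continuing the worked example from classify_argument (illustrative): for
   a first argument of type struct { double d; int i; } the container built
   above is roughly

     (parallel [(expr_list (reg:DF xmm0) (const_int 0))
		(expr_list (reg:SI di) (const_int 8))])

   i.e. the double travels in %xmm0 and the int in %edi.  */
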
6501 /* Update the data in CUM to advance over an argument of mode MODE
6502 and data type TYPE. (TYPE is null for libcalls where that information
6503 may not be available.) */
6504
6505 static void
6506 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6507 const_tree type, HOST_WIDE_INT bytes,
6508 HOST_WIDE_INT words)
6509 {
6510 switch (mode)
6511 {
6512 default:
6513 break;
6514
6515 case BLKmode:
6516 if (bytes < 0)
6517 break;
6518 /* FALLTHRU */
6519
6520 case DImode:
6521 case SImode:
6522 case HImode:
6523 case QImode:
6524 cum->words += words;
6525 cum->nregs -= words;
6526 cum->regno += words;
6527
6528 if (cum->nregs <= 0)
6529 {
6530 cum->nregs = 0;
6531 cum->regno = 0;
6532 }
6533 break;
6534
6535 case OImode:
6536 /* OImode shouldn't be used directly. */
6537 gcc_unreachable ();
6538
6539 case DFmode:
6540 if (cum->float_in_sse < 2)
6541 break;
6542 case SFmode:
6543 if (cum->float_in_sse < 1)
6544 break;
6545 /* FALLTHRU */
6546
6547 case V8SFmode:
6548 case V8SImode:
6549 case V32QImode:
6550 case V16HImode:
6551 case V4DFmode:
6552 case V4DImode:
6553 case TImode:
6554 case V16QImode:
6555 case V8HImode:
6556 case V4SImode:
6557 case V2DImode:
6558 case V4SFmode:
6559 case V2DFmode:
6560 if (!type || !AGGREGATE_TYPE_P (type))
6561 {
6562 cum->sse_words += words;
6563 cum->sse_nregs -= 1;
6564 cum->sse_regno += 1;
6565 if (cum->sse_nregs <= 0)
6566 {
6567 cum->sse_nregs = 0;
6568 cum->sse_regno = 0;
6569 }
6570 }
6571 break;
6572
6573 case V8QImode:
6574 case V4HImode:
6575 case V2SImode:
6576 case V2SFmode:
6577 case V1TImode:
6578 case V1DImode:
6579 if (!type || !AGGREGATE_TYPE_P (type))
6580 {
6581 cum->mmx_words += words;
6582 cum->mmx_nregs -= 1;
6583 cum->mmx_regno += 1;
6584 if (cum->mmx_nregs <= 0)
6585 {
6586 cum->mmx_nregs = 0;
6587 cum->mmx_regno = 0;
6588 }
6589 }
6590 break;
6591 }
6592 }
6593
6594 static void
6595 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6596 const_tree type, HOST_WIDE_INT words, bool named)
6597 {
6598 int int_nregs, sse_nregs;
6599
6600 /* Unnamed 256bit vector mode parameters are passed on stack. */
6601 if (!named && VALID_AVX256_REG_MODE (mode))
6602 return;
6603
6604 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6605 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6606 {
6607 cum->nregs -= int_nregs;
6608 cum->sse_nregs -= sse_nregs;
6609 cum->regno += int_nregs;
6610 cum->sse_regno += sse_nregs;
6611 }
6612 else
6613 {
6614 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6615 cum->words = (cum->words + align - 1) & ~(align - 1);
6616 cum->words += words;
6617 }
6618 }
6619
6620 static void
6621 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6622 HOST_WIDE_INT words)
6623 {
6624 /* Otherwise, this should be passed indirect. */
6625 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6626
6627 cum->words += words;
6628 if (cum->nregs > 0)
6629 {
6630 cum->nregs -= 1;
6631 cum->regno += 1;
6632 }
6633 }
6634
6635 /* Update the data in CUM to advance over an argument of mode MODE and
6636 data type TYPE. (TYPE is null for libcalls where that information
6637 may not be available.) */
6638
6639 static void
6640 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6641 const_tree type, bool named)
6642 {
6643 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6644 HOST_WIDE_INT bytes, words;
6645
6646 if (mode == BLKmode)
6647 bytes = int_size_in_bytes (type);
6648 else
6649 bytes = GET_MODE_SIZE (mode);
6650 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6651
6652 if (type)
6653 mode = type_natural_mode (type, NULL);
6654
6655 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6656 function_arg_advance_ms_64 (cum, bytes, words);
6657 else if (TARGET_64BIT)
6658 function_arg_advance_64 (cum, mode, type, words, named);
6659 else
6660 function_arg_advance_32 (cum, mode, type, bytes, words);
6661 }
6662
6663 /* Define where to put the arguments to a function.
6664 Value is zero to push the argument on the stack,
6665 or a hard register in which to store the argument.
6666
6667 MODE is the argument's machine mode.
6668 TYPE is the data type of the argument (as a tree).
6669 This is null for libcalls where that information may
6670 not be available.
6671 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6672 the preceding args and about the function being called.
6673 NAMED is nonzero if this argument is a named parameter
6674 (otherwise it is an extra parameter matching an ellipsis). */
6675
6676 static rtx
6677 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6678 enum machine_mode orig_mode, const_tree type,
6679 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6680 {
6681 static bool warnedsse, warnedmmx;
6682
6683 /* Avoid the AL settings for the Unix64 ABI. */
6684 if (mode == VOIDmode)
6685 return constm1_rtx;
6686
6687 switch (mode)
6688 {
6689 default:
6690 break;
6691
6692 case BLKmode:
6693 if (bytes < 0)
6694 break;
6695 /* FALLTHRU */
6696 case DImode:
6697 case SImode:
6698 case HImode:
6699 case QImode:
6700 if (words <= cum->nregs)
6701 {
6702 int regno = cum->regno;
6703
6704 /* Fastcall allocates the first two DWORD (SImode) or
6705 smaller arguments to ECX and EDX if they aren't of
6706 aggregate type. */
6707 if (cum->fastcall)
6708 {
6709 if (mode == BLKmode
6710 || mode == DImode
6711 || (type && AGGREGATE_TYPE_P (type)))
6712 break;
6713
6714 /* ECX not EAX is the first allocated register. */
6715 if (regno == AX_REG)
6716 regno = CX_REG;
6717 }
6718 return gen_rtx_REG (mode, regno);
6719 }
6720 break;
6721
6722 case DFmode:
6723 if (cum->float_in_sse < 2)
6724 break;
6725 case SFmode:
6726 if (cum->float_in_sse < 1)
6727 break;
6728 /* FALLTHRU */
6729 case TImode:
6730 /* In 32bit, we pass TImode in xmm registers. */
6731 case V16QImode:
6732 case V8HImode:
6733 case V4SImode:
6734 case V2DImode:
6735 case V4SFmode:
6736 case V2DFmode:
6737 if (!type || !AGGREGATE_TYPE_P (type))
6738 {
6739 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6740 {
6741 warnedsse = true;
6742 warning (0, "SSE vector argument without SSE enabled "
6743 "changes the ABI");
6744 }
6745 if (cum->sse_nregs)
6746 return gen_reg_or_parallel (mode, orig_mode,
6747 cum->sse_regno + FIRST_SSE_REG);
6748 }
6749 break;
6750
6751 case OImode:
6752 /* OImode shouldn't be used directly. */
6753 gcc_unreachable ();
6754
6755 case V8SFmode:
6756 case V8SImode:
6757 case V32QImode:
6758 case V16HImode:
6759 case V4DFmode:
6760 case V4DImode:
6761 if (!type || !AGGREGATE_TYPE_P (type))
6762 {
6763 if (cum->sse_nregs)
6764 return gen_reg_or_parallel (mode, orig_mode,
6765 cum->sse_regno + FIRST_SSE_REG);
6766 }
6767 break;
6768
6769 case V8QImode:
6770 case V4HImode:
6771 case V2SImode:
6772 case V2SFmode:
6773 case V1TImode:
6774 case V1DImode:
6775 if (!type || !AGGREGATE_TYPE_P (type))
6776 {
6777 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6778 {
6779 warnedmmx = true;
6780 warning (0, "MMX vector argument without MMX enabled "
6781 "changes the ABI");
6782 }
6783 if (cum->mmx_nregs)
6784 return gen_reg_or_parallel (mode, orig_mode,
6785 cum->mmx_regno + FIRST_MMX_REG);
6786 }
6787 break;
6788 }
6789
6790 return NULL_RTX;
6791 }
6792
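/* Illustration, not part of the compiler proper: a hedged sketch of what
   the fastcall handling in function_arg_32 above amounts to.  The function
   name below is hypothetical.

     __attribute__((fastcall)) int
     sum3 (int a, int b, int c);

   A is passed in %ecx, B in %edx, and C on the stack; aggregates and
   DImode arguments always go on the stack, as the checks above show.  */
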
6793 static rtx
6794 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6795 enum machine_mode orig_mode, const_tree type, bool named)
6796 {
6797 /* Handle a hidden AL argument containing number of registers
6798 for varargs x86-64 functions. */
6799 if (mode == VOIDmode)
6800 return GEN_INT (cum->maybe_vaarg
6801 ? (cum->sse_nregs < 0
6802 ? X86_64_SSE_REGPARM_MAX
6803 : cum->sse_regno)
6804 : -1);
6805
6806 switch (mode)
6807 {
6808 default:
6809 break;
6810
6811 case V8SFmode:
6812 case V8SImode:
6813 case V32QImode:
6814 case V16HImode:
6815 case V4DFmode:
6816 case V4DImode:
6817 /* Unnamed 256bit vector mode parameters are passed on the stack.  */
6818 if (!named)
6819 return NULL;
6820 break;
6821 }
6822
6823 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6824 cum->sse_nregs,
6825 &x86_64_int_parameter_registers [cum->regno],
6826 cum->sse_regno);
6827 }
6828
6829 static rtx
6830 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6831 enum machine_mode orig_mode, bool named,
6832 HOST_WIDE_INT bytes)
6833 {
6834 unsigned int regno;
6835
6836 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6837 We use the value -2 to specify that the current function call is MS ABI.  */
6838 if (mode == VOIDmode)
6839 return GEN_INT (-2);
6840
6841 /* If we've run out of registers, it goes on the stack. */
6842 if (cum->nregs == 0)
6843 return NULL_RTX;
6844
6845 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6846
6847 /* Only floating point modes are passed in anything but integer regs. */
6848 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6849 {
6850 if (named)
6851 regno = cum->regno + FIRST_SSE_REG;
6852 else
6853 {
6854 rtx t1, t2;
6855
6856 /* Unnamed floating parameters are passed in both the
6857 SSE and integer registers. */
6858 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6859 t2 = gen_rtx_REG (mode, regno);
6860 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6861 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6862 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6863 }
6864 }
6865 /* Handle aggregate types passed in registers.  */
6866 if (orig_mode == BLKmode)
6867 {
6868 if (bytes > 0 && bytes <= 8)
6869 mode = (bytes > 4 ? DImode : SImode);
6870 if (mode == BLKmode)
6871 mode = DImode;
6872 }
6873
6874 return gen_reg_or_parallel (mode, orig_mode, regno);
6875 }
6876
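/* Illustration, a hedged sketch of the rules function_arg_ms_64 implements:
   under the MS x64 convention each of the first four arguments owns a fixed
   slot, integer arguments going in RCX, RDX, R8 and R9 and floating-point
   arguments in the XMM register of the same position.  For a hypothetical

     double f (int a, double b, void *c);

   A is passed in ECX, B in XMM1 and C in R8.  Unnamed floating-point
   arguments are passed in both register files, which is what the PARALLEL
   built above expresses.  */
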
6877 /* Return where to put the arguments to a function.
6878 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6879
6880 MODE is the argument's machine mode. TYPE is the data type of the
6881 argument. It is null for libcalls where that information may not be
6882 available. CUM gives information about the preceding args and about
6883 the function being called. NAMED is nonzero if this argument is a
6884 named parameter (otherwise it is an extra parameter matching an
6885 ellipsis). */
6886
6887 static rtx
6888 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6889 const_tree type, bool named)
6890 {
6891 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6892 enum machine_mode mode = omode;
6893 HOST_WIDE_INT bytes, words;
6894 rtx arg;
6895
6896 if (mode == BLKmode)
6897 bytes = int_size_in_bytes (type);
6898 else
6899 bytes = GET_MODE_SIZE (mode);
6900 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6901
6902 /* To simplify the code below, represent vector types with a vector mode
6903 even if MMX/SSE are not active. */
6904 if (type && TREE_CODE (type) == VECTOR_TYPE)
6905 mode = type_natural_mode (type, cum);
6906
6907 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6908 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6909 else if (TARGET_64BIT)
6910 arg = function_arg_64 (cum, mode, omode, type, named);
6911 else
6912 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6913
6914 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6915 {
6916 /* This argument uses 256bit AVX modes. */
6917 if (cum->caller)
6918 cfun->machine->callee_pass_avx256_p = true;
6919 else
6920 cfun->machine->caller_pass_avx256_p = true;
6921 }
6922
6923 return arg;
6924 }
6925
6926 /* Decide whether an argument must be passed by reference.  If true
6927 for an argument, a copy of that argument is
6928 made in memory and a pointer to the argument is passed instead of
6929 the argument itself. The pointer is passed in whatever way is
6930 appropriate for passing a pointer to that type. */
6931
6932 static bool
6933 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6934 enum machine_mode mode ATTRIBUTE_UNUSED,
6935 const_tree type, bool named ATTRIBUTE_UNUSED)
6936 {
6937 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6938
6939 /* See Windows x64 Software Convention. */
6940 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6941 {
6942 int msize = (int) GET_MODE_SIZE (mode);
6943 if (type)
6944 {
6945 /* Arrays are passed by reference. */
6946 if (TREE_CODE (type) == ARRAY_TYPE)
6947 return true;
6948
6949 if (AGGREGATE_TYPE_P (type))
6950 {
6951 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6952 are passed by reference. */
6953 msize = int_size_in_bytes (type);
6954 }
6955 }
6956
6957 /* __m128 is passed by reference. */
6958 switch (msize) {
6959 case 1: case 2: case 4: case 8:
6960 break;
6961 default:
6962 return true;
6963 }
6964 }
6965 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6966 return true;
6967 
6968 return false;
6969 }
6970
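/* Illustration, a hedged sketch of ix86_pass_by_reference for the MS ABI:
   only values whose size is exactly 1, 2, 4 or 8 bytes are passed directly;
   arrays and everything else are passed by reference, i.e. the caller makes
   a copy and passes its address.  The struct names below are hypothetical.

     struct s3 { char c[3]; };    three bytes, passed by reference
     struct s8 { double d; };     eight bytes, passed directly
     __m128                       sixteen bytes, passed by reference  */
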
6971 /* Return true when TYPE should be 128bit aligned for 32bit argument
6972 passing ABI. XXX: This function is obsolete and is only used for
6973 checking psABI compatibility with previous versions of GCC. */
6974
6975 static bool
6976 ix86_compat_aligned_value_p (const_tree type)
6977 {
6978 enum machine_mode mode = TYPE_MODE (type);
6979 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6980 || mode == TDmode
6981 || mode == TFmode
6982 || mode == TCmode)
6983 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6984 return true;
6985 if (TYPE_ALIGN (type) < 128)
6986 return false;
6987
6988 if (AGGREGATE_TYPE_P (type))
6989 {
6990 /* Walk the aggregates recursively. */
6991 switch (TREE_CODE (type))
6992 {
6993 case RECORD_TYPE:
6994 case UNION_TYPE:
6995 case QUAL_UNION_TYPE:
6996 {
6997 tree field;
6998
6999 /* Walk all the structure fields. */
7000 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7001 {
7002 if (TREE_CODE (field) == FIELD_DECL
7003 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7004 return true;
7005 }
7006 break;
7007 }
7008
7009 case ARRAY_TYPE:
7010 /* Just in case some language passes arrays by value.  */
7011 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7012 return true;
7013 break;
7014
7015 default:
7016 gcc_unreachable ();
7017 }
7018 }
7019 return false;
7020 }
7021
7022 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7023 XXX: This function is obsolete and is only used for checking psABI
7024 compatibility with previous versions of GCC. */
7025
7026 static unsigned int
7027 ix86_compat_function_arg_boundary (enum machine_mode mode,
7028 const_tree type, unsigned int align)
7029 {
7030 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7031 natural boundaries. */
7032 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7033 {
7034 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7035 make an exception for SSE modes since these require 128bit
7036 alignment.
7037
7038 The handling here differs from field_alignment. ICC aligns MMX
7039 arguments to 4 byte boundaries, while structure fields are aligned
7040 to 8 byte boundaries. */
7041 if (!type)
7042 {
7043 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7044 align = PARM_BOUNDARY;
7045 }
7046 else
7047 {
7048 if (!ix86_compat_aligned_value_p (type))
7049 align = PARM_BOUNDARY;
7050 }
7051 }
7052 if (align > BIGGEST_ALIGNMENT)
7053 align = BIGGEST_ALIGNMENT;
7054 return align;
7055 }
7056
7057 /* Return true when TYPE should be 128bit aligned for 32bit argument
7058 passing ABI. */
7059
7060 static bool
7061 ix86_contains_aligned_value_p (const_tree type)
7062 {
7063 enum machine_mode mode = TYPE_MODE (type);
7064
7065 if (mode == XFmode || mode == XCmode)
7066 return false;
7067
7068 if (TYPE_ALIGN (type) < 128)
7069 return false;
7070
7071 if (AGGREGATE_TYPE_P (type))
7072 {
7073 /* Walk the aggregates recursively. */
7074 switch (TREE_CODE (type))
7075 {
7076 case RECORD_TYPE:
7077 case UNION_TYPE:
7078 case QUAL_UNION_TYPE:
7079 {
7080 tree field;
7081
7082 /* Walk all the structure fields. */
7083 for (field = TYPE_FIELDS (type);
7084 field;
7085 field = DECL_CHAIN (field))
7086 {
7087 if (TREE_CODE (field) == FIELD_DECL
7088 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7089 return true;
7090 }
7091 break;
7092 }
7093
7094 case ARRAY_TYPE:
7095 /* Just in case some language passes arrays by value.  */
7096 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7097 return true;
7098 break;
7099
7100 default:
7101 gcc_unreachable ();
7102 }
7103 }
7104 else
7105 return TYPE_ALIGN (type) >= 128;
7106
7107 return false;
7108 }
7109
7110 /* Gives the alignment boundary, in bits, of an argument with the
7111 specified mode and type. */
7112
7113 static unsigned int
7114 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7115 {
7116 unsigned int align;
7117 if (type)
7118 {
7119 /* Since the main variant type is used for the call, convert the
7120 type to its main variant.  */
7121 type = TYPE_MAIN_VARIANT (type);
7122 align = TYPE_ALIGN (type);
7123 }
7124 else
7125 align = GET_MODE_ALIGNMENT (mode);
7126 if (align < PARM_BOUNDARY)
7127 align = PARM_BOUNDARY;
7128 else
7129 {
7130 static bool warned;
7131 unsigned int saved_align = align;
7132
7133 if (!TARGET_64BIT)
7134 {
7135 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7136 if (!type)
7137 {
7138 if (mode == XFmode || mode == XCmode)
7139 align = PARM_BOUNDARY;
7140 }
7141 else if (!ix86_contains_aligned_value_p (type))
7142 align = PARM_BOUNDARY;
7143
7144 if (align < 128)
7145 align = PARM_BOUNDARY;
7146 }
7147
7148 if (warn_psabi
7149 && !warned
7150 && align != ix86_compat_function_arg_boundary (mode, type,
7151 saved_align))
7152 {
7153 warned = true;
7154 inform (input_location,
7155 "The ABI for passing parameters with %d-byte"
7156 " alignment has changed in GCC 4.6",
7157 align / BITS_PER_UNIT);
7158 }
7159 }
7160
7161 return align;
7162 }
7163
7164 /* Return true if N is a possible register number of function value. */
7165
7166 static bool
7167 ix86_function_value_regno_p (const unsigned int regno)
7168 {
7169 switch (regno)
7170 {
7171 case AX_REG:
7172 return true;
7173
7174 case FIRST_FLOAT_REG:
7175 /* TODO: The function should depend on current function ABI but
7176 builtins.c would need updating then. Therefore we use the
7177 default ABI. */
7178 if (TARGET_64BIT && ix86_abi == MS_ABI)
7179 return false;
7180 return TARGET_FLOAT_RETURNS_IN_80387;
7181
7182 case FIRST_SSE_REG:
7183 return TARGET_SSE;
7184
7185 case FIRST_MMX_REG:
7186 if (TARGET_MACHO || TARGET_64BIT)
7187 return false;
7188 return TARGET_MMX;
7189 }
7190
7191 return false;
7192 }
7193
7194 /* Define how to find the value returned by a function.
7195 VALTYPE is the data type of the value (as a tree).
7196 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7197 otherwise, FUNC is 0. */
7198
7199 static rtx
7200 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7201 const_tree fntype, const_tree fn)
7202 {
7203 unsigned int regno;
7204
7205 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7206 we normally prevent this case when mmx is not available. However
7207 some ABIs may require the result to be returned like DImode. */
7208 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7209 regno = FIRST_MMX_REG;
7210
7211 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7212 we prevent this case when sse is not available. However some ABIs
7213 may require the result to be returned like integer TImode. */
7214 else if (mode == TImode
7215 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7216 regno = FIRST_SSE_REG;
7217
7218 /* 32-byte vector modes in %ymm0. */
7219 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7220 regno = FIRST_SSE_REG;
7221
7222 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7223 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7224 regno = FIRST_FLOAT_REG;
7225 else
7226 /* Most things go in %eax. */
7227 regno = AX_REG;
7228
7229 /* Override FP return register with %xmm0 for local functions when
7230 SSE math is enabled or for functions with sseregparm attribute. */
7231 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7232 {
7233 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7234 if ((sse_level >= 1 && mode == SFmode)
7235 || (sse_level == 2 && mode == DFmode))
7236 regno = FIRST_SSE_REG;
7237 }
7238
7239 /* OImode shouldn't be used directly. */
7240 gcc_assert (mode != OImode);
7241
7242 return gen_rtx_REG (orig_mode, regno);
7243 }
7244
7245 static rtx
7246 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7247 const_tree valtype)
7248 {
7249 rtx ret;
7250
7251 /* Handle libcalls, which don't provide a type node. */
7252 if (valtype == NULL)
7253 {
7254 unsigned int regno;
7255
7256 switch (mode)
7257 {
7258 case SFmode:
7259 case SCmode:
7260 case DFmode:
7261 case DCmode:
7262 case TFmode:
7263 case SDmode:
7264 case DDmode:
7265 case TDmode:
7266 regno = FIRST_SSE_REG;
7267 break;
7268 case XFmode:
7269 case XCmode:
7270 regno = FIRST_FLOAT_REG;
7271 break;
7272 case TCmode:
7273 return NULL;
7274 default:
7275 regno = AX_REG;
7276 }
7277
7278 return gen_rtx_REG (mode, regno);
7279 }
7280 else if (POINTER_TYPE_P (valtype))
7281 {
7282 /* Pointers are always returned in word_mode. */
7283 mode = word_mode;
7284 }
7285
7286 ret = construct_container (mode, orig_mode, valtype, 1,
7287 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7288 x86_64_int_return_registers, 0);
7289
7290 /* For zero-sized structures, construct_container returns NULL, but we
7291 need to keep the rest of the compiler happy by returning a meaningful value.  */
7292 if (!ret)
7293 ret = gen_rtx_REG (orig_mode, AX_REG);
7294
7295 return ret;
7296 }
7297
7298 static rtx
7299 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7300 {
7301 unsigned int regno = AX_REG;
7302
7303 if (TARGET_SSE)
7304 {
7305 switch (GET_MODE_SIZE (mode))
7306 {
7307 case 16:
7308 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7309 && !COMPLEX_MODE_P (mode))
7310 regno = FIRST_SSE_REG;
7311 break;
7312 case 8:
7313 case 4:
7314 if (mode == SFmode || mode == DFmode)
7315 regno = FIRST_SSE_REG;
7316 break;
7317 default:
7318 break;
7319 }
7320 }
7321 return gen_rtx_REG (orig_mode, regno);
7322 }
7323
7324 static rtx
7325 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7326 enum machine_mode orig_mode, enum machine_mode mode)
7327 {
7328 const_tree fn, fntype;
7329
7330 fn = NULL_TREE;
7331 if (fntype_or_decl && DECL_P (fntype_or_decl))
7332 fn = fntype_or_decl;
7333 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7334
7335 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7336 return function_value_ms_64 (orig_mode, mode);
7337 else if (TARGET_64BIT)
7338 return function_value_64 (orig_mode, mode, valtype);
7339 else
7340 return function_value_32 (orig_mode, mode, fntype, fn);
7341 }
7342
7343 static rtx
7344 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7345 bool outgoing ATTRIBUTE_UNUSED)
7346 {
7347 enum machine_mode mode, orig_mode;
7348
7349 orig_mode = TYPE_MODE (valtype);
7350 mode = type_natural_mode (valtype, NULL);
7351 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7352 }
7353
7354 /* Pointer function arguments and return values are promoted to
7355 word_mode. */
7356
7357 static enum machine_mode
7358 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7359 int *punsignedp, const_tree fntype,
7360 int for_return)
7361 {
7362 if (type != NULL_TREE && POINTER_TYPE_P (type))
7363 {
7364 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7365 return word_mode;
7366 }
7367 return default_promote_function_mode (type, mode, punsignedp, fntype,
7368 for_return);
7369 }
7370
7371 rtx
7372 ix86_libcall_value (enum machine_mode mode)
7373 {
7374 return ix86_function_value_1 (NULL, NULL, mode, mode);
7375 }
7376
7377 /* Return true iff type is returned in memory. */
7378
7379 static bool ATTRIBUTE_UNUSED
7380 return_in_memory_32 (const_tree type, enum machine_mode mode)
7381 {
7382 HOST_WIDE_INT size;
7383
7384 if (mode == BLKmode)
7385 return true;
7386
7387 size = int_size_in_bytes (type);
7388
7389 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7390 return false;
7391
7392 if (VECTOR_MODE_P (mode) || mode == TImode)
7393 {
7394 /* User-created vectors small enough to fit in EAX. */
7395 if (size < 8)
7396 return false;
7397
7398 /* MMX/3dNow values are returned in MM0,
7399 except when it doesn't exist or the ABI prescribes otherwise.  */
7400 if (size == 8)
7401 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7402
7403 /* SSE values are returned in XMM0, except when it doesn't exist. */
7404 if (size == 16)
7405 return !TARGET_SSE;
7406
7407 /* AVX values are returned in YMM0, except when it doesn't exist. */
7408 if (size == 32)
7409 return !TARGET_AVX;
7410 }
7411
7412 if (mode == XFmode)
7413 return false;
7414
7415 if (size > 12)
7416 return true;
7417
7418 /* OImode shouldn't be used directly. */
7419 gcc_assert (mode != OImode);
7420
7421 return false;
7422 }
7423
7424 static bool ATTRIBUTE_UNUSED
7425 return_in_memory_64 (const_tree type, enum machine_mode mode)
7426 {
7427 int needed_intregs, needed_sseregs;
7428 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7429 }
7430
7431 static bool ATTRIBUTE_UNUSED
7432 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7433 {
7434 HOST_WIDE_INT size = int_size_in_bytes (type);
7435
7436 /* __m128 is returned in xmm0. */
7437 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7438 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7439 return false;
7440
7441 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes.  */
7442 return size != 1 && size != 2 && size != 4 && size != 8;
7443 }
7444
7445 static bool
7446 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7447 {
7448 #ifdef SUBTARGET_RETURN_IN_MEMORY
7449 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7450 #else
7451 const enum machine_mode mode = type_natural_mode (type, NULL);
7452
7453 if (TARGET_64BIT)
7454 {
7455 if (ix86_function_type_abi (fntype) == MS_ABI)
7456 return return_in_memory_ms_64 (type, mode);
7457 else
7458 return return_in_memory_64 (type, mode);
7459 }
7460 else
7461 return return_in_memory_32 (type, mode);
7462 #endif
7463 }
7464
7465 /* When returning SSE vector types, we have a choice of either
7466 (1) being abi incompatible with a -march switch, or
7467 (2) generating an error.
7468 Given no good solution, I think the safest thing is one warning.
7469 The user won't be able to use -Werror, but....
7470
7471 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7472 called in response to actually generating a caller or callee that
7473 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7474 via aggregate_value_p for general type probing from tree-ssa. */
7475
7476 static rtx
7477 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7478 {
7479 static bool warnedsse, warnedmmx;
7480
7481 if (!TARGET_64BIT && type)
7482 {
7483 /* Look at the return type of the function, not the function type. */
7484 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7485
7486 if (!TARGET_SSE && !warnedsse)
7487 {
7488 if (mode == TImode
7489 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7490 {
7491 warnedsse = true;
7492 warning (0, "SSE vector return without SSE enabled "
7493 "changes the ABI");
7494 }
7495 }
7496
7497 if (!TARGET_MMX && !warnedmmx)
7498 {
7499 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7500 {
7501 warnedmmx = true;
7502 warning (0, "MMX vector return without MMX enabled "
7503 "changes the ABI");
7504 }
7505 }
7506 }
7507
7508 return NULL;
7509 }
7510
7511 \f
7512 /* Create the va_list data type. */
7513
7514 /* Returns the calling-convention-specific va_list data type.
7515 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI.  */
7516
7517 static tree
7518 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7519 {
7520 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7521
7522 /* For i386 we use a plain pointer to the argument area.  */
7523 if (!TARGET_64BIT || abi == MS_ABI)
7524 return build_pointer_type (char_type_node);
7525
7526 record = lang_hooks.types.make_type (RECORD_TYPE);
7527 type_decl = build_decl (BUILTINS_LOCATION,
7528 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7529
7530 f_gpr = build_decl (BUILTINS_LOCATION,
7531 FIELD_DECL, get_identifier ("gp_offset"),
7532 unsigned_type_node);
7533 f_fpr = build_decl (BUILTINS_LOCATION,
7534 FIELD_DECL, get_identifier ("fp_offset"),
7535 unsigned_type_node);
7536 f_ovf = build_decl (BUILTINS_LOCATION,
7537 FIELD_DECL, get_identifier ("overflow_arg_area"),
7538 ptr_type_node);
7539 f_sav = build_decl (BUILTINS_LOCATION,
7540 FIELD_DECL, get_identifier ("reg_save_area"),
7541 ptr_type_node);
7542
7543 va_list_gpr_counter_field = f_gpr;
7544 va_list_fpr_counter_field = f_fpr;
7545
7546 DECL_FIELD_CONTEXT (f_gpr) = record;
7547 DECL_FIELD_CONTEXT (f_fpr) = record;
7548 DECL_FIELD_CONTEXT (f_ovf) = record;
7549 DECL_FIELD_CONTEXT (f_sav) = record;
7550
7551 TYPE_STUB_DECL (record) = type_decl;
7552 TYPE_NAME (record) = type_decl;
7553 TYPE_FIELDS (record) = f_gpr;
7554 DECL_CHAIN (f_gpr) = f_fpr;
7555 DECL_CHAIN (f_fpr) = f_ovf;
7556 DECL_CHAIN (f_ovf) = f_sav;
7557
7558 layout_type (record);
7559
7560 /* The correct type is an array type of one element. */
7561 return build_array_type (record, build_index_type (size_zero_node));
7562 }
7563
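/* For reference, the record built above corresponds to the familiar SysV
   x86-64 va_list declaration, an array of one element so that a va_list
   decays to a pointer when passed to another function:

     typedef struct __va_list_tag
     {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];
   */
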
7564 /* Set up the builtin va_list data type and, for 64-bit, the additional
7565 calling-convention-specific va_list data types.  */
7566
7567 static tree
7568 ix86_build_builtin_va_list (void)
7569 {
7570 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7571
7572 /* Initialize ABI-specific va_list builtin types.  */
7573 if (TARGET_64BIT)
7574 {
7575 tree t;
7576 if (ix86_abi == MS_ABI)
7577 {
7578 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7579 if (TREE_CODE (t) != RECORD_TYPE)
7580 t = build_variant_type_copy (t);
7581 sysv_va_list_type_node = t;
7582 }
7583 else
7584 {
7585 t = ret;
7586 if (TREE_CODE (t) != RECORD_TYPE)
7587 t = build_variant_type_copy (t);
7588 sysv_va_list_type_node = t;
7589 }
7590 if (ix86_abi != MS_ABI)
7591 {
7592 t = ix86_build_builtin_va_list_abi (MS_ABI);
7593 if (TREE_CODE (t) != RECORD_TYPE)
7594 t = build_variant_type_copy (t);
7595 ms_va_list_type_node = t;
7596 }
7597 else
7598 {
7599 t = ret;
7600 if (TREE_CODE (t) != RECORD_TYPE)
7601 t = build_variant_type_copy (t);
7602 ms_va_list_type_node = t;
7603 }
7604 }
7605
7606 return ret;
7607 }
7608
7609 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7610
7611 static void
7612 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7613 {
7614 rtx save_area, mem;
7615 alias_set_type set;
7616 int i, max;
7617
7618 /* GPR size of varargs save area. */
7619 if (cfun->va_list_gpr_size)
7620 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7621 else
7622 ix86_varargs_gpr_size = 0;
7623
7624 /* FPR size of varargs save area. We don't need it if we don't pass
7625 anything in SSE registers. */
7626 if (TARGET_SSE && cfun->va_list_fpr_size)
7627 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7628 else
7629 ix86_varargs_fpr_size = 0;
7630
7631 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7632 return;
7633
7634 save_area = frame_pointer_rtx;
7635 set = get_varargs_alias_set ();
7636
7637 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7638 if (max > X86_64_REGPARM_MAX)
7639 max = X86_64_REGPARM_MAX;
7640
7641 for (i = cum->regno; i < max; i++)
7642 {
7643 mem = gen_rtx_MEM (word_mode,
7644 plus_constant (save_area, i * UNITS_PER_WORD));
7645 MEM_NOTRAP_P (mem) = 1;
7646 set_mem_alias_set (mem, set);
7647 emit_move_insn (mem,
7648 gen_rtx_REG (word_mode,
7649 x86_64_int_parameter_registers[i]));
7650 }
7651
7652 if (ix86_varargs_fpr_size)
7653 {
7654 enum machine_mode smode;
7655 rtx label, test;
7656
7657 /* Now emit code to save SSE registers. The AX parameter contains number
7658 of SSE parameter registers used to call this function, though all we
7659 actually check here is the zero/non-zero status. */
7660
7661 label = gen_label_rtx ();
7662 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7663 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7664 label));
7665
7666 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7667 we used movdqa (i.e. TImode) instead? Perhaps even better would
7668 be if we could determine the real mode of the data, via a hook
7669 into pass_stdarg. Ignore all that for now. */
7670 smode = V4SFmode;
7671 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7672 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7673
7674 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7675 if (max > X86_64_SSE_REGPARM_MAX)
7676 max = X86_64_SSE_REGPARM_MAX;
7677
7678 for (i = cum->sse_regno; i < max; ++i)
7679 {
7680 mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7681 mem = gen_rtx_MEM (smode, mem);
7682 MEM_NOTRAP_P (mem) = 1;
7683 set_mem_alias_set (mem, set);
7684 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7685
7686 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7687 }
7688
7689 emit_label (label);
7690 }
7691 }
7692
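/* Illustration, a hedged sketch assuming the usual SysV x86-64 limits of
   six integer and eight SSE argument registers: the register save area
   filled in above is laid out as

     offset   0 ..  47   RDI, RSI, RDX, RCX, R8, R9   (8 bytes each)
     offset  48 .. 175   XMM0 .. XMM7                 (16 bytes each)

   and the SSE half is skipped entirely when AL is zero on entry.  */
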
7693 static void
7694 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7695 {
7696 alias_set_type set = get_varargs_alias_set ();
7697 int i;
7698
7699 /* Reset to zero, as there might be a SysV va_arg used
7700 before.  */
7701 ix86_varargs_gpr_size = 0;
7702 ix86_varargs_fpr_size = 0;
7703
7704 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7705 {
7706 rtx reg, mem;
7707
7708 mem = gen_rtx_MEM (Pmode,
7709 plus_constant (virtual_incoming_args_rtx,
7710 i * UNITS_PER_WORD));
7711 MEM_NOTRAP_P (mem) = 1;
7712 set_mem_alias_set (mem, set);
7713
7714 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7715 emit_move_insn (mem, reg);
7716 }
7717 }
7718
7719 static void
7720 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7721 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7722 int no_rtl)
7723 {
7724 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7725 CUMULATIVE_ARGS next_cum;
7726 tree fntype;
7727
7728 /* This argument doesn't appear to be used anymore, which is good,
7729 because the old code here didn't suppress rtl generation.  */
7730 gcc_assert (!no_rtl);
7731
7732 if (!TARGET_64BIT)
7733 return;
7734
7735 fntype = TREE_TYPE (current_function_decl);
7736
7737 /* For varargs, we do not want to skip the dummy va_dcl argument.
7738 For stdargs, we do want to skip the last named argument. */
7739 next_cum = *cum;
7740 if (stdarg_p (fntype))
7741 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7742 true);
7743
7744 if (cum->call_abi == MS_ABI)
7745 setup_incoming_varargs_ms_64 (&next_cum);
7746 else
7747 setup_incoming_varargs_64 (&next_cum);
7748 }
7749
7750 /* Check whether TYPE is a va_list type represented as a plain char pointer.  */
7751
7752 static bool
7753 is_va_list_char_pointer (tree type)
7754 {
7755 tree canonic;
7756
7757 /* For 32-bit it is always true. */
7758 if (!TARGET_64BIT)
7759 return true;
7760 canonic = ix86_canonical_va_list_type (type);
7761 return (canonic == ms_va_list_type_node
7762 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7763 }
7764
7765 /* Implement va_start. */
7766
7767 static void
7768 ix86_va_start (tree valist, rtx nextarg)
7769 {
7770 HOST_WIDE_INT words, n_gpr, n_fpr;
7771 tree f_gpr, f_fpr, f_ovf, f_sav;
7772 tree gpr, fpr, ovf, sav, t;
7773 tree type;
7774 rtx ovf_rtx;
7775
7776 if (flag_split_stack
7777 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7778 {
7779 unsigned int scratch_regno;
7780
7781 /* When we are splitting the stack, we can't refer to the stack
7782 arguments using internal_arg_pointer, because they may be on
7783 the old stack. The split stack prologue will arrange to
7784 leave a pointer to the old stack arguments in a scratch
7785 register, which we here copy to a pseudo-register. The split
7786 stack prologue can't set the pseudo-register directly because
7787 it (the prologue) runs before any registers have been saved. */
7788
7789 scratch_regno = split_stack_prologue_scratch_regno ();
7790 if (scratch_regno != INVALID_REGNUM)
7791 {
7792 rtx reg, seq;
7793
7794 reg = gen_reg_rtx (Pmode);
7795 cfun->machine->split_stack_varargs_pointer = reg;
7796
7797 start_sequence ();
7798 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7799 seq = get_insns ();
7800 end_sequence ();
7801
7802 push_topmost_sequence ();
7803 emit_insn_after (seq, entry_of_function ());
7804 pop_topmost_sequence ();
7805 }
7806 }
7807
7808 /* Only the 64-bit target needs something special.  */
7809 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7810 {
7811 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7812 std_expand_builtin_va_start (valist, nextarg);
7813 else
7814 {
7815 rtx va_r, next;
7816
7817 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7818 next = expand_binop (ptr_mode, add_optab,
7819 cfun->machine->split_stack_varargs_pointer,
7820 crtl->args.arg_offset_rtx,
7821 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7822 convert_move (va_r, next, 0);
7823 }
7824 return;
7825 }
7826
7827 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7828 f_fpr = DECL_CHAIN (f_gpr);
7829 f_ovf = DECL_CHAIN (f_fpr);
7830 f_sav = DECL_CHAIN (f_ovf);
7831
7832 valist = build_simple_mem_ref (valist);
7833 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7834 /* The following should be folded into the MEM_REF offset. */
7835 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7836 f_gpr, NULL_TREE);
7837 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7838 f_fpr, NULL_TREE);
7839 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7840 f_ovf, NULL_TREE);
7841 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7842 f_sav, NULL_TREE);
7843
7844 /* Count number of gp and fp argument registers used. */
7845 words = crtl->args.info.words;
7846 n_gpr = crtl->args.info.regno;
7847 n_fpr = crtl->args.info.sse_regno;
7848
7849 if (cfun->va_list_gpr_size)
7850 {
7851 type = TREE_TYPE (gpr);
7852 t = build2 (MODIFY_EXPR, type,
7853 gpr, build_int_cst (type, n_gpr * 8));
7854 TREE_SIDE_EFFECTS (t) = 1;
7855 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7856 }
7857
7858 if (TARGET_SSE && cfun->va_list_fpr_size)
7859 {
7860 type = TREE_TYPE (fpr);
7861 t = build2 (MODIFY_EXPR, type, fpr,
7862 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7863 TREE_SIDE_EFFECTS (t) = 1;
7864 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7865 }
7866
7867 /* Find the overflow area. */
7868 type = TREE_TYPE (ovf);
7869 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7870 ovf_rtx = crtl->args.internal_arg_pointer;
7871 else
7872 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7873 t = make_tree (type, ovf_rtx);
7874 if (words != 0)
7875 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7876 t = build2 (MODIFY_EXPR, type, ovf, t);
7877 TREE_SIDE_EFFECTS (t) = 1;
7878 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7879
7880 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7881 {
7882 /* Find the register save area.
7883 The prologue of the function saves it right above the stack frame.  */
7884 type = TREE_TYPE (sav);
7885 t = make_tree (type, frame_pointer_rtx);
7886 if (!ix86_varargs_gpr_size)
7887 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7888 t = build2 (MODIFY_EXPR, type, sav, t);
7889 TREE_SIDE_EFFECTS (t) = 1;
7890 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7891 }
7892 }
7893
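/* Illustration, a hedged sketch of what the expansion above amounts to for
   a function whose named arguments consumed N_GPR integer and N_FPR SSE
   registers.  AP is a hypothetical name for the va_list, and 48 assumes
   X86_64_REGPARM_MAX is 6:

     ap->gp_offset         = n_gpr * 8;
     ap->fp_offset         = 48 + n_fpr * 16;
     ap->overflow_arg_area = address of the first stack argument;
     ap->reg_save_area     = start of the register save area;  */
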
7894 /* Implement va_arg. */
7895
7896 static tree
7897 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7898 gimple_seq *post_p)
7899 {
7900 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7901 tree f_gpr, f_fpr, f_ovf, f_sav;
7902 tree gpr, fpr, ovf, sav, t;
7903 int size, rsize;
7904 tree lab_false, lab_over = NULL_TREE;
7905 tree addr, t2;
7906 rtx container;
7907 int indirect_p = 0;
7908 tree ptrtype;
7909 enum machine_mode nat_mode;
7910 unsigned int arg_boundary;
7911
7912 /* Only the 64-bit target needs something special.  */
7913 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7914 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7915
7916 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7917 f_fpr = DECL_CHAIN (f_gpr);
7918 f_ovf = DECL_CHAIN (f_fpr);
7919 f_sav = DECL_CHAIN (f_ovf);
7920
7921 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7922 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7923 valist = build_va_arg_indirect_ref (valist);
7924 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7925 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7926 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7927
7928 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7929 if (indirect_p)
7930 type = build_pointer_type (type);
7931 size = int_size_in_bytes (type);
7932 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7933
7934 nat_mode = type_natural_mode (type, NULL);
7935 switch (nat_mode)
7936 {
7937 case V8SFmode:
7938 case V8SImode:
7939 case V32QImode:
7940 case V16HImode:
7941 case V4DFmode:
7942 case V4DImode:
7943 /* Unnamed 256bit vector mode parameters are passed on the stack.  */
7944 if (!TARGET_64BIT_MS_ABI)
7945 {
7946 container = NULL;
7947 break;
7948 }
7949
7950 default:
7951 container = construct_container (nat_mode, TYPE_MODE (type),
7952 type, 0, X86_64_REGPARM_MAX,
7953 X86_64_SSE_REGPARM_MAX, intreg,
7954 0);
7955 break;
7956 }
7957
7958 /* Pull the value out of the saved registers. */
7959
7960 addr = create_tmp_var (ptr_type_node, "addr");
7961
7962 if (container)
7963 {
7964 int needed_intregs, needed_sseregs;
7965 bool need_temp;
7966 tree int_addr, sse_addr;
7967
7968 lab_false = create_artificial_label (UNKNOWN_LOCATION);
7969 lab_over = create_artificial_label (UNKNOWN_LOCATION);
7970
7971 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7972
7973 need_temp = (!REG_P (container)
7974 && ((needed_intregs && TYPE_ALIGN (type) > 64)
7975 || TYPE_ALIGN (type) > 128));
7976
7977 /* In case we are passing a structure, verify that it is a consecutive block
7978 on the register save area.  If not, we need to do moves.  */
7979 if (!need_temp && !REG_P (container))
7980 {
7981 /* Verify that all registers are strictly consecutive */
7982 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7983 {
7984 int i;
7985
7986 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7987 {
7988 rtx slot = XVECEXP (container, 0, i);
7989 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7990 || INTVAL (XEXP (slot, 1)) != i * 16)
7991 need_temp = 1;
7992 }
7993 }
7994 else
7995 {
7996 int i;
7997
7998 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7999 {
8000 rtx slot = XVECEXP (container, 0, i);
8001 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8002 || INTVAL (XEXP (slot, 1)) != i * 8)
8003 need_temp = 1;
8004 }
8005 }
8006 }
8007 if (!need_temp)
8008 {
8009 int_addr = addr;
8010 sse_addr = addr;
8011 }
8012 else
8013 {
8014 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8015 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8016 }
8017
8018 /* First ensure that we fit completely in registers. */
8019 if (needed_intregs)
8020 {
8021 t = build_int_cst (TREE_TYPE (gpr),
8022 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8023 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8024 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8025 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8026 gimplify_and_add (t, pre_p);
8027 }
8028 if (needed_sseregs)
8029 {
8030 t = build_int_cst (TREE_TYPE (fpr),
8031 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8032 + X86_64_REGPARM_MAX * 8);
8033 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8034 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8035 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8036 gimplify_and_add (t, pre_p);
8037 }
8038
8039 /* Compute index to start of area used for integer regs. */
8040 if (needed_intregs)
8041 {
8042 /* int_addr = gpr + sav; */
8043 t = fold_build_pointer_plus (sav, gpr);
8044 gimplify_assign (int_addr, t, pre_p);
8045 }
8046 if (needed_sseregs)
8047 {
8048 /* sse_addr = fpr + sav; */
8049 t = fold_build_pointer_plus (sav, fpr);
8050 gimplify_assign (sse_addr, t, pre_p);
8051 }
8052 if (need_temp)
8053 {
8054 int i, prev_size = 0;
8055 tree temp = create_tmp_var (type, "va_arg_tmp");
8056
8057 /* addr = &temp; */
8058 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8059 gimplify_assign (addr, t, pre_p);
8060
8061 for (i = 0; i < XVECLEN (container, 0); i++)
8062 {
8063 rtx slot = XVECEXP (container, 0, i);
8064 rtx reg = XEXP (slot, 0);
8065 enum machine_mode mode = GET_MODE (reg);
8066 tree piece_type;
8067 tree addr_type;
8068 tree daddr_type;
8069 tree src_addr, src;
8070 int src_offset;
8071 tree dest_addr, dest;
8072 int cur_size = GET_MODE_SIZE (mode);
8073
8074 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8075 prev_size = INTVAL (XEXP (slot, 1));
8076 if (prev_size + cur_size > size)
8077 {
8078 cur_size = size - prev_size;
8079 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8080 if (mode == BLKmode)
8081 mode = QImode;
8082 }
8083 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8084 if (mode == GET_MODE (reg))
8085 addr_type = build_pointer_type (piece_type);
8086 else
8087 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8088 true);
8089 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8090 true);
8091
8092 if (SSE_REGNO_P (REGNO (reg)))
8093 {
8094 src_addr = sse_addr;
8095 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8096 }
8097 else
8098 {
8099 src_addr = int_addr;
8100 src_offset = REGNO (reg) * 8;
8101 }
8102 src_addr = fold_convert (addr_type, src_addr);
8103 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8104
8105 dest_addr = fold_convert (daddr_type, addr);
8106 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8107 if (cur_size == GET_MODE_SIZE (mode))
8108 {
8109 src = build_va_arg_indirect_ref (src_addr);
8110 dest = build_va_arg_indirect_ref (dest_addr);
8111
8112 gimplify_assign (dest, src, pre_p);
8113 }
8114 else
8115 {
8116 tree copy
8117 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8118 3, dest_addr, src_addr,
8119 size_int (cur_size));
8120 gimplify_and_add (copy, pre_p);
8121 }
8122 prev_size += cur_size;
8123 }
8124 }
8125
8126 if (needed_intregs)
8127 {
8128 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8129 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8130 gimplify_assign (gpr, t, pre_p);
8131 }
8132
8133 if (needed_sseregs)
8134 {
8135 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8136 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8137 gimplify_assign (fpr, t, pre_p);
8138 }
8139
8140 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8141
8142 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8143 }
8144
8145 /* ... otherwise out of the overflow area. */
8146
8147 /* When we align a parameter on the stack for the caller, if the parameter
8148 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
8149 aligned at MAX_SUPPORTED_STACK_ALIGNMENT.  We match the callee
8150 here with the caller.  */
8151 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8152 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8153 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8154
8155 /* Care for on-stack alignment if needed. */
8156 if (arg_boundary <= 64 || size == 0)
8157 t = ovf;
8158 else
8159 {
8160 HOST_WIDE_INT align = arg_boundary / 8;
8161 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8162 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8163 build_int_cst (TREE_TYPE (t), -align));
8164 }
8165
8166 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8167 gimplify_assign (addr, t, pre_p);
8168
8169 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8170 gimplify_assign (unshare_expr (ovf), t, pre_p);
8171
8172 if (container)
8173 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8174
8175 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8176 addr = fold_convert (ptrtype, addr);
8177
8178 if (indirect_p)
8179 addr = build_va_arg_indirect_ref (addr);
8180 return build_va_arg_indirect_ref (addr);
8181 }
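
/* Illustration, a hedged sketch of the code generated above for a plain
   int argument, assuming the usual six integer argument registers.  AP is
   a hypothetical name for the va_list; the FP case is analogous, using
   fp_offset and 16-byte slots:

     if (ap->gp_offset < 48)
       {
         addr = ap->reg_save_area + ap->gp_offset;
         ap->gp_offset += 8;
       }
     else
       {
         addr = ap->overflow_arg_area;
         ap->overflow_arg_area += 8;
       }
     result = *(int *) addr;  */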
8182 \f
8183 /* Return true if OPNUM's MEM should be matched
8184 in movabs* patterns. */
8185
8186 bool
8187 ix86_check_movabs (rtx insn, int opnum)
8188 {
8189 rtx set, mem;
8190
8191 set = PATTERN (insn);
8192 if (GET_CODE (set) == PARALLEL)
8193 set = XVECEXP (set, 0, 0);
8194 gcc_assert (GET_CODE (set) == SET);
8195 mem = XEXP (set, opnum);
8196 while (GET_CODE (mem) == SUBREG)
8197 mem = SUBREG_REG (mem);
8198 gcc_assert (MEM_P (mem));
8199 return volatile_ok || !MEM_VOLATILE_P (mem);
8200 }
8201 \f
8202 /* Initialize the table of extra 80387 mathematical constants. */
8203
8204 static void
8205 init_ext_80387_constants (void)
8206 {
8207 static const char * cst[5] =
8208 {
8209 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8210 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8211 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8212 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8213 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8214 };
8215 int i;
8216
8217 for (i = 0; i < 5; i++)
8218 {
8219 real_from_string (&ext_80387_constants_table[i], cst[i]);
8220 /* Ensure each constant is rounded to XFmode precision. */
8221 real_convert (&ext_80387_constants_table[i],
8222 XFmode, &ext_80387_constants_table[i]);
8223 }
8224
8225 ext_80387_constants_init = 1;
8226 }
8227
8228 /* Return non-zero if the constant is something that
8229 can be loaded with a special instruction. */
8230
8231 int
8232 standard_80387_constant_p (rtx x)
8233 {
8234 enum machine_mode mode = GET_MODE (x);
8235
8236 REAL_VALUE_TYPE r;
8237
8238 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8239 return -1;
8240
8241 if (x == CONST0_RTX (mode))
8242 return 1;
8243 if (x == CONST1_RTX (mode))
8244 return 2;
8245
8246 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8247
8248 /* For XFmode constants, try to find a special 80387 instruction when
8249 optimizing for size or on those CPUs that benefit from them. */
8250 if (mode == XFmode
8251 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8252 {
8253 int i;
8254
8255 if (! ext_80387_constants_init)
8256 init_ext_80387_constants ();
8257
8258 for (i = 0; i < 5; i++)
8259 if (real_identical (&r, &ext_80387_constants_table[i]))
8260 return i + 3;
8261 }
8262
8263 /* A load of the constant -0.0 or -1.0 will be split into an
8264 fldz;fchs or fld1;fchs sequence.  */
8265 if (real_isnegzero (&r))
8266 return 8;
8267 if (real_identical (&r, &dconstm1))
8268 return 9;
8269
8270 return 0;
8271 }
8272
8273 /* Return the opcode of the special instruction to be used to load
8274 the constant X. */
8275
8276 const char *
8277 standard_80387_constant_opcode (rtx x)
8278 {
8279 switch (standard_80387_constant_p (x))
8280 {
8281 case 1:
8282 return "fldz";
8283 case 2:
8284 return "fld1";
8285 case 3:
8286 return "fldlg2";
8287 case 4:
8288 return "fldln2";
8289 case 5:
8290 return "fldl2e";
8291 case 6:
8292 return "fldl2t";
8293 case 7:
8294 return "fldpi";
8295 case 8:
8296 case 9:
8297 return "#";
8298 default:
8299 gcc_unreachable ();
8300 }
8301 }
8302
8303 /* Return the CONST_DOUBLE representing the 80387 constant that is
8304 loaded by the specified special instruction. The argument IDX
8305 matches the return value from standard_80387_constant_p. */
8306
8307 rtx
8308 standard_80387_constant_rtx (int idx)
8309 {
8310 int i;
8311
8312 if (! ext_80387_constants_init)
8313 init_ext_80387_constants ();
8314
8315 switch (idx)
8316 {
8317 case 3:
8318 case 4:
8319 case 5:
8320 case 6:
8321 case 7:
8322 i = idx - 3;
8323 break;
8324
8325 default:
8326 gcc_unreachable ();
8327 }
8328
8329 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8330 XFmode);
8331 }
8332
8333 /* Return 1 if X is all 0s and 2 if X is all 1s
8334 in a supported SSE/AVX vector mode.  */
8335
8336 int
8337 standard_sse_constant_p (rtx x)
8338 {
8339 enum machine_mode mode = GET_MODE (x);
8340
8341 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8342 return 1;
8343 if (vector_all_ones_operand (x, mode))
8344 switch (mode)
8345 {
8346 case V16QImode:
8347 case V8HImode:
8348 case V4SImode:
8349 case V2DImode:
8350 if (TARGET_SSE2)
8351 return 2;
8352 case V32QImode:
8353 case V16HImode:
8354 case V8SImode:
8355 case V4DImode:
8356 if (TARGET_AVX2)
8357 return 2;
8358 default:
8359 break;
8360 }
8361
8362 return 0;
8363 }
8364
8365 /* Return the opcode of the special instruction to be used to load
8366 the constant X. */
8367
8368 const char *
8369 standard_sse_constant_opcode (rtx insn, rtx x)
8370 {
8371 switch (standard_sse_constant_p (x))
8372 {
8373 case 1:
8374 switch (get_attr_mode (insn))
8375 {
8376 case MODE_TI:
8377 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8378 return "%vpxor\t%0, %d0";
8379 case MODE_V2DF:
8380 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8381 return "%vxorpd\t%0, %d0";
8382 case MODE_V4SF:
8383 return "%vxorps\t%0, %d0";
8384
8385 case MODE_OI:
8386 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8387 return "vpxor\t%x0, %x0, %x0";
8388 case MODE_V4DF:
8389 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8390 return "vxorpd\t%x0, %x0, %x0";
8391 case MODE_V8SF:
8392 return "vxorps\t%x0, %x0, %x0";
8393
8394 default:
8395 break;
8396 }
8397
8398 case 2:
8399 if (TARGET_AVX)
8400 return "vpcmpeqd\t%0, %0, %0";
8401 else
8402 return "pcmpeqd\t%0, %0";
8403
8404 default:
8405 break;
8406 }
8407 gcc_unreachable ();
8408 }
8409
8410 /* Returns true if OP contains a symbol reference.  */
8411
8412 bool
8413 symbolic_reference_mentioned_p (rtx op)
8414 {
8415 const char *fmt;
8416 int i;
8417
8418 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8419 return true;
8420
8421 fmt = GET_RTX_FORMAT (GET_CODE (op));
8422 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8423 {
8424 if (fmt[i] == 'E')
8425 {
8426 int j;
8427
8428 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8429 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8430 return true;
8431 }
8432
8433 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8434 return true;
8435 }
8436
8437 return false;
8438 }
8439
8440 /* Return true if it is appropriate to emit `ret' instructions in the
8441 body of a function. Do this only if the epilogue is simple, needing a
8442 couple of insns. Prior to reloading, we can't tell how many registers
8443 must be saved, so return false then. Return false if there is no frame
8444 marker to de-allocate. */
8445
8446 bool
8447 ix86_can_use_return_insn_p (void)
8448 {
8449 struct ix86_frame frame;
8450
8451 if (! reload_completed || frame_pointer_needed)
8452 return 0;
8453
8454 /* Don't allow more than 32k pop, since that's all we can do
8455 with one instruction. */
8456 if (crtl->args.pops_args && crtl->args.size >= 32768)
8457 return 0;
8458
8459 ix86_compute_frame_layout (&frame);
8460 return (frame.stack_pointer_offset == UNITS_PER_WORD
8461 && (frame.nregs + frame.nsseregs) == 0);
8462 }
8463 \f
8464 /* Value should be nonzero if functions must have frame pointers.
8465 Zero means the frame pointer need not be set up (and parms may
8466 be accessed via the stack pointer) in functions that seem suitable. */
8467
8468 static bool
8469 ix86_frame_pointer_required (void)
8470 {
8471 /* If we accessed previous frames, then the generated code expects
8472 to be able to access the saved ebp value in our frame. */
8473 if (cfun->machine->accesses_prev_frame)
8474 return true;
8475
8476 /* Several x86 OSes need a frame pointer for other reasons,
8477 usually pertaining to setjmp.  */
8478 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8479 return true;
8480
8481 /* For older 32-bit runtimes setjmp requires a valid frame pointer.  */
8482 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8483 return true;
8484
8485 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8486 turns off the frame pointer by default. Turn it back on now if
8487 we've not got a leaf function. */
8488 if (TARGET_OMIT_LEAF_FRAME_POINTER
8489 && (!current_function_is_leaf
8490 || ix86_current_function_calls_tls_descriptor))
8491 return true;
8492
8493 if (crtl->profile && !flag_fentry)
8494 return true;
8495
8496 return false;
8497 }
8498
8499 /* Record that the current function accesses previous call frames. */
8500
8501 void
8502 ix86_setup_frame_addresses (void)
8503 {
8504 cfun->machine->accesses_prev_frame = 1;
8505 }
8506 \f
8507 #ifndef USE_HIDDEN_LINKONCE
8508 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8509 # define USE_HIDDEN_LINKONCE 1
8510 # else
8511 # define USE_HIDDEN_LINKONCE 0
8512 # endif
8513 #endif
8514
8515 static int pic_labels_used;
8516
8517 /* Fills in the label name that should be used for a pc thunk for
8518 the given register. */
8519
8520 static void
8521 get_pc_thunk_name (char name[32], unsigned int regno)
8522 {
8523 gcc_assert (!TARGET_64BIT);
8524
8525 if (USE_HIDDEN_LINKONCE)
8526 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8527 else
8528 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8529 }
8530
8531
8532 /* This function generates the pc thunks used for -fpic; each thunk loads
8533 its register with the return address of the caller and then returns.  */
8534
8535 static void
8536 ix86_code_end (void)
8537 {
8538 rtx xops[2];
8539 int regno;
8540
8541 for (regno = AX_REG; regno <= SP_REG; regno++)
8542 {
8543 char name[32];
8544 tree decl;
8545
8546 if (!(pic_labels_used & (1 << regno)))
8547 continue;
8548
8549 get_pc_thunk_name (name, regno);
8550
8551 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8552 get_identifier (name),
8553 build_function_type_list (void_type_node, NULL_TREE));
8554 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8555 NULL_TREE, void_type_node);
8556 TREE_PUBLIC (decl) = 1;
8557 TREE_STATIC (decl) = 1;
8558
8559 #if TARGET_MACHO
8560 if (TARGET_MACHO)
8561 {
8562 switch_to_section (darwin_sections[text_coal_section]);
8563 fputs ("\t.weak_definition\t", asm_out_file);
8564 assemble_name (asm_out_file, name);
8565 fputs ("\n\t.private_extern\t", asm_out_file);
8566 assemble_name (asm_out_file, name);
8567 putc ('\n', asm_out_file);
8568 ASM_OUTPUT_LABEL (asm_out_file, name);
8569 DECL_WEAK (decl) = 1;
8570 }
8571 else
8572 #endif
8573 if (USE_HIDDEN_LINKONCE)
8574 {
8575 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8576
8577 targetm.asm_out.unique_section (decl, 0);
8578 switch_to_section (get_named_section (decl, NULL, 0));
8579
8580 targetm.asm_out.globalize_label (asm_out_file, name);
8581 fputs ("\t.hidden\t", asm_out_file);
8582 assemble_name (asm_out_file, name);
8583 putc ('\n', asm_out_file);
8584 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8585 }
8586 else
8587 {
8588 switch_to_section (text_section);
8589 ASM_OUTPUT_LABEL (asm_out_file, name);
8590 }
8591
8592 DECL_INITIAL (decl) = make_node (BLOCK);
8593 current_function_decl = decl;
8594 init_function_start (decl);
8595 first_function_block_is_cold = false;
8596 /* Make sure unwind info is emitted for the thunk if needed. */
8597 final_start_function (emit_barrier (), asm_out_file, 1);
8598
8599 /* Pad stack IP move with 4 instructions (two NOPs count
8600 as one instruction). */
8601 if (TARGET_PAD_SHORT_FUNCTION)
8602 {
8603 int i = 8;
8604
8605 while (i--)
8606 fputs ("\tnop\n", asm_out_file);
8607 }
8608
8609 xops[0] = gen_rtx_REG (Pmode, regno);
8610 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8611 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8612 fputs ("\tret\n", asm_out_file);
8613 final_end_function ();
8614 init_insn_lengths ();
8615 free_after_compilation (cfun);
8616 set_cfun (NULL);
8617 current_function_decl = NULL;
8618 }
8619
8620 if (flag_split_stack)
8621 file_end_indicate_split_stack ();
8622 }
8623
8624 /* Emit code for the SET_GOT patterns. */
8625
8626 const char *
8627 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8628 {
8629 rtx xops[3];
8630
8631 xops[0] = dest;
8632
8633 if (TARGET_VXWORKS_RTP && flag_pic)
8634 {
8635 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8636 xops[2] = gen_rtx_MEM (Pmode,
8637 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8638 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8639
8640 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8641 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8642 an unadorned address. */
8643 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8644 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8645 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8646 return "";
8647 }
8648
8649 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8650
8651 if (!flag_pic)
8652 {
8653 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8654
8655 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8656
8657 #if TARGET_MACHO
8658 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8659 is what will be referenced by the Mach-O PIC subsystem. */
8660 if (!label)
8661 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8662 #endif
8663
8664 targetm.asm_out.internal_label (asm_out_file, "L",
8665 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8666 }
8667 else
8668 {
8669 char name[32];
8670 get_pc_thunk_name (name, REGNO (dest));
8671 pic_labels_used |= 1 << REGNO (dest);
8672
8673 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8674 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8675 output_asm_insn ("call\t%X2", xops);
8676 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8677 is what will be referenced by the Mach-O PIC subsystem. */
8678 #if TARGET_MACHO
8679 if (!label)
8680 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8681 else
8682 targetm.asm_out.internal_label (asm_out_file, "L",
8683 CODE_LABEL_NUMBER (label));
8684 #endif
8685 }
8686
8687 if (!TARGET_MACHO)
8688 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8689
8690 return "";
8691 }
8692
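/* Illustration, a hedged sketch of the assembly these routines cooperate
   to produce for -fpic on 32-bit targets when %ebx is the PIC register:

       call  __x86.get_pc_thunk.bx
       addl  $_GLOBAL_OFFSET_TABLE_, %ebx

   where the thunk emitted by ix86_code_end is roughly

     __x86.get_pc_thunk.bx:
       movl  (%esp), %ebx
       ret
   */
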
8693 /* Generate a "push" pattern for input ARG. */
8694
8695 static rtx
8696 gen_push (rtx arg)
8697 {
8698 struct machine_function *m = cfun->machine;
8699
8700 if (m->fs.cfa_reg == stack_pointer_rtx)
8701 m->fs.cfa_offset += UNITS_PER_WORD;
8702 m->fs.sp_offset += UNITS_PER_WORD;
8703
8704 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8705 arg = gen_rtx_REG (word_mode, REGNO (arg));
8706
8707 return gen_rtx_SET (VOIDmode,
8708 gen_rtx_MEM (word_mode,
8709 gen_rtx_PRE_DEC (Pmode,
8710 stack_pointer_rtx)),
8711 arg);
8712 }
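
/* Illustration, a hedged sketch of the RTL gen_push above produces on a
   64-bit target, where word_mode and Pmode are DImode:

     (set (mem:DI (pre_dec:DI (reg:DI sp)))
          (reg:DI arg))

   gen_pop below is the mirror image, using post_inc.  */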
8713
8714 /* Generate a "pop" pattern for input ARG. */
8715
8716 static rtx
8717 gen_pop (rtx arg)
8718 {
8719 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8720 arg = gen_rtx_REG (word_mode, REGNO (arg));
8721
8722 return gen_rtx_SET (VOIDmode,
8723 arg,
8724 gen_rtx_MEM (word_mode,
8725 gen_rtx_POST_INC (Pmode,
8726 stack_pointer_rtx)));
8727 }
8728
8729 /* Return >= 0 if there is an unused call-clobbered register available
8730 for the entire function. */
8731
8732 static unsigned int
8733 ix86_select_alt_pic_regnum (void)
8734 {
8735 if (current_function_is_leaf
8736 && !crtl->profile
8737 && !ix86_current_function_calls_tls_descriptor)
8738 {
8739 int i, drap;
8740 /* Can't use the same register for both PIC and DRAP. */
8741 if (crtl->drap_reg)
8742 drap = REGNO (crtl->drap_reg);
8743 else
8744 drap = -1;
8745 for (i = 2; i >= 0; --i)
8746 if (i != drap && !df_regs_ever_live_p (i))
8747 return i;
8748 }
8749
8750 return INVALID_REGNUM;
8751 }
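
/* Note: in this port's hard register numbering the first three integer
   registers are %eax, %edx and %ecx, so the countdown loop above tries
   %ecx, then %edx, then %eax as an alternate PIC register. */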
8752
8753 /* Return TRUE if we need to save REGNO. */
8754
8755 static bool
8756 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8757 {
8758 if (pic_offset_table_rtx
8759 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8760 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8761 || crtl->profile
8762 || crtl->calls_eh_return
8763 || crtl->uses_const_pool))
8764 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8765
8766 if (crtl->calls_eh_return && maybe_eh_return)
8767 {
8768 unsigned i;
8769 for (i = 0; ; i++)
8770 {
8771 unsigned test = EH_RETURN_DATA_REGNO (i);
8772 if (test == INVALID_REGNUM)
8773 break;
8774 if (test == regno)
8775 return true;
8776 }
8777 }
8778
8779 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8780 return true;
8781
8782 return (df_regs_ever_live_p (regno)
8783 && !call_used_regs[regno]
8784 && !fixed_regs[regno]
8785 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8786 }
8787
8788 /* Return the number of saved general purpose registers. */
8789
8790 static int
8791 ix86_nsaved_regs (void)
8792 {
8793 int nregs = 0;
8794 int regno;
8795
8796 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8797 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8798 nregs ++;
8799 return nregs;
8800 }
8801
8802 /* Return the number of saved SSE registers. */
8803
8804 static int
8805 ix86_nsaved_sseregs (void)
8806 {
8807 int nregs = 0;
8808 int regno;
8809
8810 if (!TARGET_64BIT_MS_ABI)
8811 return 0;
8812 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8813 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8814 nregs ++;
8815 return nregs;
8816 }
8817
8818 /* Given FROM and TO register numbers, say whether this elimination is
8819 allowed. If stack alignment is needed, we can only replace argument
8820 pointer with hard frame pointer, or replace frame pointer with stack
8821 pointer. Otherwise, frame pointer elimination is automatically
8822 handled and all other eliminations are valid. */
8823
8824 static bool
8825 ix86_can_eliminate (const int from, const int to)
8826 {
8827 if (stack_realign_fp)
8828 return ((from == ARG_POINTER_REGNUM
8829 && to == HARD_FRAME_POINTER_REGNUM)
8830 || (from == FRAME_POINTER_REGNUM
8831 && to == STACK_POINTER_REGNUM));
8832 else
8833 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8834 }
8835
8836 /* Return the offset between two registers, one to be eliminated, and the other
8837 its replacement, at the start of a routine. */
8838
8839 HOST_WIDE_INT
8840 ix86_initial_elimination_offset (int from, int to)
8841 {
8842 struct ix86_frame frame;
8843 ix86_compute_frame_layout (&frame);
8844
8845 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8846 return frame.hard_frame_pointer_offset;
8847 else if (from == FRAME_POINTER_REGNUM
8848 && to == HARD_FRAME_POINTER_REGNUM)
8849 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8850 else
8851 {
8852 gcc_assert (to == STACK_POINTER_REGNUM);
8853
8854 if (from == ARG_POINTER_REGNUM)
8855 return frame.stack_pointer_offset;
8856
8857 gcc_assert (from == FRAME_POINTER_REGNUM);
8858 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8859 }
8860 }
8861
8862 /* In a dynamically-aligned function, we can't know the offset from
8863 stack pointer to frame pointer, so we must ensure that setjmp
8864 eliminates fp against the hard fp (%ebp) rather than trying to
8865 index from %esp up to the top of the frame across a gap that is
8866 of unknown (at compile-time) size. */
8867 static rtx
8868 ix86_builtin_setjmp_frame_value (void)
8869 {
8870 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8871 }
8872
8873 /* When using -fsplit-stack, the allocation routines set a field in
8874 the TCB to the bottom of the stack plus this much space, measured
8875 in bytes. */
8876
8877 #define SPLIT_STACK_AVAILABLE 256
8878
8879 /* Fill structure ix86_frame about frame of currently computed function. */
8880
8881 static void
8882 ix86_compute_frame_layout (struct ix86_frame *frame)
8883 {
8884 unsigned int stack_alignment_needed;
8885 HOST_WIDE_INT offset;
8886 unsigned int preferred_alignment;
8887 HOST_WIDE_INT size = get_frame_size ();
8888 HOST_WIDE_INT to_allocate;
8889
8890 frame->nregs = ix86_nsaved_regs ();
8891 frame->nsseregs = ix86_nsaved_sseregs ();
8892
8893 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8894 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8895
8896 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
8897 except in function prologues and leaf functions. */
8898 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8899 && (!current_function_is_leaf || cfun->calls_alloca != 0
8900 || ix86_current_function_calls_tls_descriptor))
8901 {
8902 preferred_alignment = 16;
8903 stack_alignment_needed = 16;
8904 crtl->preferred_stack_boundary = 128;
8905 crtl->stack_alignment_needed = 128;
8906 }
8907
8908 gcc_assert (!size || stack_alignment_needed);
8909 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8910 gcc_assert (preferred_alignment <= stack_alignment_needed);
8911
8912 /* For SEH we have to limit the amount of code movement into the prologue.
8913 At present we do this via a BLOCKAGE, at which point there's very little
8914 scheduling that can be done, which means that there's very little point
8915 in doing anything except PUSHs. */
8916 if (TARGET_SEH)
8917 cfun->machine->use_fast_prologue_epilogue = false;
8918
8919 /* During reload iteration the number of registers saved can change.
8920 Recompute the value as needed. Do not recompute when the number of
8921 registers didn't change, as reload makes multiple calls to the function
8922 and does not expect the decision to change within a single iteration. */
8923 else if (!optimize_function_for_size_p (cfun)
8924 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8925 {
8926 int count = frame->nregs;
8927 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8928
8929 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8930
8931 /* The fast prologue uses move instead of push to save registers. This
8932 is significantly longer, but also executes faster as modern hardware
8933 can execute the moves in parallel, but can't do that for push/pop.
8934
8935 Be careful about choosing which prologue to emit: when the function takes
8936 many instructions to execute we may use the slow version, as well as when
8937 the function is known to be outside a hot spot (this is known with profile
8938 feedback only). Weight the size of the function by the number of registers
8939 to save, as it is cheap to use one or two push instructions but very
8940 slow to use many of them. */
8941 if (count)
8942 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8943 if (node->frequency < NODE_FREQUENCY_NORMAL
8944 || (flag_branch_probabilities
8945 && node->frequency < NODE_FREQUENCY_HOT))
8946 cfun->machine->use_fast_prologue_epilogue = false;
8947 else
8948 cfun->machine->use_fast_prologue_epilogue
8949 = !expensive_function_p (count);
8950 }
8951
8952 frame->save_regs_using_mov
8953 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8954 /* If static stack checking is enabled and done with probes,
8955 the registers need to be saved before allocating the frame. */
8956 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8957
8958 /* Skip return address. */
8959 offset = UNITS_PER_WORD;
8960
8961 /* Skip pushed static chain. */
8962 if (ix86_static_chain_on_stack)
8963 offset += UNITS_PER_WORD;
8964
8965 /* Skip saved base pointer. */
8966 if (frame_pointer_needed)
8967 offset += UNITS_PER_WORD;
8968 frame->hfp_save_offset = offset;
8969
8970 /* The traditional frame pointer location is at the top of the frame. */
8971 frame->hard_frame_pointer_offset = offset;
8972
8973 /* Register save area */
8974 offset += frame->nregs * UNITS_PER_WORD;
8975 frame->reg_save_offset = offset;
8976
8977 /* Align and set SSE register save area. */
8978 if (frame->nsseregs)
8979 {
8980 /* The only ABI that has saved SSE registers (Win64) also has a
8981 16-byte aligned default stack, and thus we don't need to be
8982 within the re-aligned local stack frame to save them. */
8983 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
8984 offset = (offset + 16 - 1) & -16;
8985 offset += frame->nsseregs * 16;
8986 }
8987 frame->sse_reg_save_offset = offset;
8988
8989 /* The re-aligned stack starts here. Values before this point are not
8990 directly comparable with values below this point. In order to make
8991 sure that no value happens to be the same before and after, force
8992 the alignment computation below to add a non-zero value. */
8993 if (stack_realign_fp)
8994 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8995
8996 /* Va-arg area */
8997 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8998 offset += frame->va_arg_size;
8999
9000 /* Align start of frame for local function. */
9001 if (stack_realign_fp
9002 || offset != frame->sse_reg_save_offset
9003 || size != 0
9004 || !current_function_is_leaf
9005 || cfun->calls_alloca
9006 || ix86_current_function_calls_tls_descriptor)
9007 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9008
9009 /* Frame pointer points here. */
9010 frame->frame_pointer_offset = offset;
9011
9012 offset += size;
9013
9014 /* Add outgoing arguments area. Can be skipped if we eliminated
9015 all the function calls as dead code.
9016 Skipping is however impossible when the function calls alloca: the
9017 alloca expander assumes that the last crtl->outgoing_args_size bytes
9018 of the stack frame are unused. */
9019 if (ACCUMULATE_OUTGOING_ARGS
9020 && (!current_function_is_leaf || cfun->calls_alloca
9021 || ix86_current_function_calls_tls_descriptor))
9022 {
9023 offset += crtl->outgoing_args_size;
9024 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9025 }
9026 else
9027 frame->outgoing_arguments_size = 0;
9028
9029 /* Align stack boundary. Only needed if we're calling another function
9030 or using alloca. */
9031 if (!current_function_is_leaf || cfun->calls_alloca
9032 || ix86_current_function_calls_tls_descriptor)
9033 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9034
9035 /* We've reached end of stack frame. */
9036 frame->stack_pointer_offset = offset;
9037
9038 /* Size prologue needs to allocate. */
9039 to_allocate = offset - frame->sse_reg_save_offset;
9040
9041 if ((!to_allocate && frame->nregs <= 1)
9042 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9043 frame->save_regs_using_mov = false;
9044
9045 if (ix86_using_red_zone ()
9046 && current_function_sp_is_unchanging
9047 && current_function_is_leaf
9048 && !ix86_current_function_calls_tls_descriptor)
9049 {
9050 frame->red_zone_size = to_allocate;
9051 if (frame->save_regs_using_mov)
9052 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9053 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9054 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9055 }
9056 else
9057 frame->red_zone_size = 0;
9058 frame->stack_pointer_offset -= frame->red_zone_size;
9059
9060 /* The SEH frame pointer location is near the bottom of the frame.
9061 This is enforced by the fact that the difference between the
9062 stack pointer and the frame pointer is limited to 240 bytes in
9063 the unwind data structure. */
9064 if (TARGET_SEH)
9065 {
9066 HOST_WIDE_INT diff;
9067
9068 /* If we can leave the frame pointer where it is, do so. */
9069 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9070 if (diff > 240 || (diff & 15) != 0)
9071 {
9072 /* Ideally we'd determine what portion of the local stack frame
9073 (within the constraint of the lowest 240) is most heavily used.
9074 But without that complication, simply bias the frame pointer
9075 by 128 bytes so as to maximize the amount of the local stack
9076 frame that is addressable with 8-bit offsets. */
9077 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9078 }
9079 }
9080 }
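
/* For reference, an informal sketch of the layout computed above, going
   from the incoming CFA towards lower addresses (bracketed areas exist
   only when the corresponding feature is in use):

	return address
	[pushed static chain]
	[saved frame pointer]		<- hard_frame_pointer_offset
	GP register save area		<- reg_save_offset
	[SSE register save area]	<- sse_reg_save_offset (16-aligned)
	[va_arg register save area]
	local variables			<- frame_pointer_offset
	[outgoing arguments]
	end of frame			<- stack_pointer_offset
	[red zone, below the stack pointer]

   This is only a summary of the code above, not authoritative ABI
   documentation. */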
9081
9082 /* This is semi-inlined memory_address_length, but simplified
9083 since we know that we're always dealing with reg+offset, and
9084 to avoid having to create and discard all that rtl. */
9085
9086 static inline int
9087 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9088 {
9089 int len = 4;
9090
9091 if (offset == 0)
9092 {
9093 /* EBP and R13 cannot be encoded without an offset. */
9094 len = (regno == BP_REG || regno == R13_REG);
9095 }
9096 else if (IN_RANGE (offset, -128, 127))
9097 len = 1;
9098
9099 /* ESP and R12 must be encoded with a SIB byte. */
9100 if (regno == SP_REG || regno == R12_REG)
9101 len++;
9102
9103 return len;
9104 }
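
/* Worked examples of the length estimate above (it counts only the
   extra displacement/SIB bytes of the address, not the whole insn):

     (%eax,    0)  -> 0   no displacement needed
     (%ebp,    0)  -> 1   EBP/R13 require at least a disp8
     (%esp,    0)  -> 1   ESP/R12 require a SIB byte
     (%eax,   64)  -> 1   disp8
     (%esp,   64)  -> 2   disp8 + SIB
     (%eax, 1024)  -> 4   disp32
     (%r12, 1024)  -> 5   disp32 + SIB  */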
9105
9106 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9107 The valid base registers are taken from CFUN->MACHINE->FS. */
9108
9109 static rtx
9110 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9111 {
9112 const struct machine_function *m = cfun->machine;
9113 rtx base_reg = NULL;
9114 HOST_WIDE_INT base_offset = 0;
9115
9116 if (m->use_fast_prologue_epilogue)
9117 {
9118 /* Choose the base register most likely to allow the most scheduling
9119 opportunities. Generally FP is valid throughout the function,
9120 while DRAP must be reloaded within the epilogue. But choose either
9121 over the SP due to increased encoding size. */
9122
9123 if (m->fs.fp_valid)
9124 {
9125 base_reg = hard_frame_pointer_rtx;
9126 base_offset = m->fs.fp_offset - cfa_offset;
9127 }
9128 else if (m->fs.drap_valid)
9129 {
9130 base_reg = crtl->drap_reg;
9131 base_offset = 0 - cfa_offset;
9132 }
9133 else if (m->fs.sp_valid)
9134 {
9135 base_reg = stack_pointer_rtx;
9136 base_offset = m->fs.sp_offset - cfa_offset;
9137 }
9138 }
9139 else
9140 {
9141 HOST_WIDE_INT toffset;
9142 int len = 16, tlen;
9143
9144 /* Choose the base register with the smallest address encoding.
9145 With a tie, choose FP > DRAP > SP. */
9146 if (m->fs.sp_valid)
9147 {
9148 base_reg = stack_pointer_rtx;
9149 base_offset = m->fs.sp_offset - cfa_offset;
9150 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9151 }
9152 if (m->fs.drap_valid)
9153 {
9154 toffset = 0 - cfa_offset;
9155 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9156 if (tlen <= len)
9157 {
9158 base_reg = crtl->drap_reg;
9159 base_offset = toffset;
9160 len = tlen;
9161 }
9162 }
9163 if (m->fs.fp_valid)
9164 {
9165 toffset = m->fs.fp_offset - cfa_offset;
9166 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9167 if (tlen <= len)
9168 {
9169 base_reg = hard_frame_pointer_rtx;
9170 base_offset = toffset;
9171 len = tlen;
9172 }
9173 }
9174 }
9175 gcc_assert (base_reg != NULL);
9176
9177 return plus_constant (base_reg, base_offset);
9178 }
9179
9180 /* Emit code to save registers in the prologue. */
9181
9182 static void
9183 ix86_emit_save_regs (void)
9184 {
9185 unsigned int regno;
9186 rtx insn;
9187
9188 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9189 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9190 {
9191 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9192 RTX_FRAME_RELATED_P (insn) = 1;
9193 }
9194 }
9195
9196 /* Emit a single register save at CFA - CFA_OFFSET. */
9197
9198 static void
9199 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9200 HOST_WIDE_INT cfa_offset)
9201 {
9202 struct machine_function *m = cfun->machine;
9203 rtx reg = gen_rtx_REG (mode, regno);
9204 rtx mem, addr, base, insn;
9205
9206 addr = choose_baseaddr (cfa_offset);
9207 mem = gen_frame_mem (mode, addr);
9208
9209 /* For SSE saves, we need to indicate the 128-bit alignment. */
9210 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9211
9212 insn = emit_move_insn (mem, reg);
9213 RTX_FRAME_RELATED_P (insn) = 1;
9214
9215 base = addr;
9216 if (GET_CODE (base) == PLUS)
9217 base = XEXP (base, 0);
9218 gcc_checking_assert (REG_P (base));
9219
9220 /* When saving registers into a re-aligned local stack frame, avoid
9221 any tricky guessing by dwarf2out. */
9222 if (m->fs.realigned)
9223 {
9224 gcc_checking_assert (stack_realign_drap);
9225
9226 if (regno == REGNO (crtl->drap_reg))
9227 {
9228 /* A bit of a hack. We force the DRAP register to be saved in
9229 the re-aligned stack frame, which provides us with a copy
9230 of the CFA that will last past the prologue. Install it. */
9231 gcc_checking_assert (cfun->machine->fs.fp_valid);
9232 addr = plus_constant (hard_frame_pointer_rtx,
9233 cfun->machine->fs.fp_offset - cfa_offset);
9234 mem = gen_rtx_MEM (mode, addr);
9235 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9236 }
9237 else
9238 {
9239 /* The frame pointer is a stable reference within the
9240 aligned frame. Use it. */
9241 gcc_checking_assert (cfun->machine->fs.fp_valid);
9242 addr = plus_constant (hard_frame_pointer_rtx,
9243 cfun->machine->fs.fp_offset - cfa_offset);
9244 mem = gen_rtx_MEM (mode, addr);
9245 add_reg_note (insn, REG_CFA_EXPRESSION,
9246 gen_rtx_SET (VOIDmode, mem, reg));
9247 }
9248 }
9249
9250 /* The memory may not be relative to the current CFA register,
9251 which means that we may need to generate a new pattern for
9252 use by the unwind info. */
9253 else if (base != m->fs.cfa_reg)
9254 {
9255 addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9256 mem = gen_rtx_MEM (mode, addr);
9257 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9258 }
9259 }
9260
9261 /* Emit code to save registers using MOV insns.
9262 First register is stored at CFA - CFA_OFFSET. */
9263 static void
9264 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9265 {
9266 unsigned int regno;
9267
9268 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9269 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9270 {
9271 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9272 cfa_offset -= UNITS_PER_WORD;
9273 }
9274 }
9275
9276 /* Emit code to save SSE registers using MOV insns.
9277 First register is stored at CFA - CFA_OFFSET. */
9278 static void
9279 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9280 {
9281 unsigned int regno;
9282
9283 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9284 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9285 {
9286 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9287 cfa_offset -= 16;
9288 }
9289 }
9290
9291 static GTY(()) rtx queued_cfa_restores;
9292
9293 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
9294 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9295 Don't add the note if the previously saved value will be left untouched
9296 within stack red-zone till return, as unwinders can find the same value
9297 in the register and on the stack. */
9298
9299 static void
9300 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9301 {
9302 if (!crtl->shrink_wrapped
9303 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9304 return;
9305
9306 if (insn)
9307 {
9308 add_reg_note (insn, REG_CFA_RESTORE, reg);
9309 RTX_FRAME_RELATED_P (insn) = 1;
9310 }
9311 else
9312 queued_cfa_restores
9313 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9314 }
9315
9316 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9317
9318 static void
9319 ix86_add_queued_cfa_restore_notes (rtx insn)
9320 {
9321 rtx last;
9322 if (!queued_cfa_restores)
9323 return;
9324 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9325 ;
9326 XEXP (last, 1) = REG_NOTES (insn);
9327 REG_NOTES (insn) = queued_cfa_restores;
9328 queued_cfa_restores = NULL_RTX;
9329 RTX_FRAME_RELATED_P (insn) = 1;
9330 }
9331
9332 /* Expand prologue or epilogue stack adjustment.
9333 The pattern exists to put a dependency on all ebp-based memory accesses.
9334 STYLE should be negative if instructions should be marked as frame related,
9335 zero if %r11 register is live and cannot be freely used and positive
9336 otherwise. */
9337
9338 static void
9339 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9340 int style, bool set_cfa)
9341 {
9342 struct machine_function *m = cfun->machine;
9343 rtx insn;
9344 bool add_frame_related_expr = false;
9345
9346 if (Pmode == SImode)
9347 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9348 else if (x86_64_immediate_operand (offset, DImode))
9349 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9350 else
9351 {
9352 rtx tmp;
9353 /* r11 is used by indirect sibcall return as well; it is set before the
9354 epilogue and used after the epilogue. */
9355 if (style)
9356 tmp = gen_rtx_REG (DImode, R11_REG);
9357 else
9358 {
9359 gcc_assert (src != hard_frame_pointer_rtx
9360 && dest != hard_frame_pointer_rtx);
9361 tmp = hard_frame_pointer_rtx;
9362 }
9363 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9364 if (style < 0)
9365 add_frame_related_expr = true;
9366
9367 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9368 }
9369
9370 insn = emit_insn (insn);
9371 if (style >= 0)
9372 ix86_add_queued_cfa_restore_notes (insn);
9373
9374 if (set_cfa)
9375 {
9376 rtx r;
9377
9378 gcc_assert (m->fs.cfa_reg == src);
9379 m->fs.cfa_offset += INTVAL (offset);
9380 m->fs.cfa_reg = dest;
9381
9382 r = gen_rtx_PLUS (Pmode, src, offset);
9383 r = gen_rtx_SET (VOIDmode, dest, r);
9384 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9385 RTX_FRAME_RELATED_P (insn) = 1;
9386 }
9387 else if (style < 0)
9388 {
9389 RTX_FRAME_RELATED_P (insn) = 1;
9390 if (add_frame_related_expr)
9391 {
9392 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9393 r = gen_rtx_SET (VOIDmode, dest, r);
9394 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9395 }
9396 }
9397
9398 if (dest == stack_pointer_rtx)
9399 {
9400 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9401 bool valid = m->fs.sp_valid;
9402
9403 if (src == hard_frame_pointer_rtx)
9404 {
9405 valid = m->fs.fp_valid;
9406 ooffset = m->fs.fp_offset;
9407 }
9408 else if (src == crtl->drap_reg)
9409 {
9410 valid = m->fs.drap_valid;
9411 ooffset = 0;
9412 }
9413 else
9414 {
9415 /* Else there are two possibilities: SP itself, which we set
9416 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9417 taken care of by hand along the eh_return path. */
9418 gcc_checking_assert (src == stack_pointer_rtx
9419 || offset == const0_rtx);
9420 }
9421
9422 m->fs.sp_offset = ooffset - INTVAL (offset);
9423 m->fs.sp_valid = valid;
9424 }
9425 }
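
/* A typical prologue use of pro_epilogue_adjust_stack, taken from
   ix86_expand_prologue further below: allocate the frame by moving the
   stack pointer, with the CFA updated only while the stack pointer is
   still the CFA register:

     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
				GEN_INT (-allocate), -1,
				m->fs.cfa_reg == stack_pointer_rtx);
*/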
9426
9427 /* Find an available register to be used as the dynamic realign argument
9428 pointer register. Such a register will be written in the prologue and
9429 used at the beginning of the body, so it must not be
9430 1. a parameter passing register.
9431 2. the GOT pointer.
9432 We reuse the static-chain register if it is available. Otherwise, we
9433 use DI for i386 and R13 for x86-64. We chose R13 since it has
9434 shorter encoding.
9435
9436 Return: the regno of chosen register. */
9437
9438 static unsigned int
9439 find_drap_reg (void)
9440 {
9441 tree decl = cfun->decl;
9442
9443 if (TARGET_64BIT)
9444 {
9445 /* Use R13 for nested functions or functions that need a static chain.
9446 Since a function with a tail call may use any caller-saved
9447 register in the epilogue, DRAP must not use a caller-saved
9448 register in that case. */
9449 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9450 return R13_REG;
9451
9452 return R10_REG;
9453 }
9454 else
9455 {
9456 /* Use DI for nested functions or functions that need a static chain.
9457 Since a function with a tail call may use any caller-saved
9458 register in the epilogue, DRAP must not use a caller-saved
9459 register in that case. */
9460 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9461 return DI_REG;
9462
9463 /* Reuse static chain register if it isn't used for parameter
9464 passing. */
9465 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9466 {
9467 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9468 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9469 return CX_REG;
9470 }
9471 return DI_REG;
9472 }
9473 }
9474
9475 /* Return minimum incoming stack alignment. */
9476
9477 static unsigned int
9478 ix86_minimum_incoming_stack_boundary (bool sibcall)
9479 {
9480 unsigned int incoming_stack_boundary;
9481
9482 /* Prefer the one specified at command line. */
9483 if (ix86_user_incoming_stack_boundary)
9484 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9485 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9486 if -mstackrealign is used, this isn't a sibcall check, and the
9487 estimated stack alignment is 128 bits. */
9488 else if (!sibcall
9489 && !TARGET_64BIT
9490 && ix86_force_align_arg_pointer
9491 && crtl->stack_alignment_estimated == 128)
9492 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9493 else
9494 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9495
9496 /* Incoming stack alignment can be changed on individual functions
9497 via force_align_arg_pointer attribute. We use the smallest
9498 incoming stack boundary. */
9499 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9500 && lookup_attribute (ix86_force_align_arg_pointer_string,
9501 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9502 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9503
9504 /* The incoming stack frame has to be aligned at least at
9505 parm_stack_boundary. */
9506 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9507 incoming_stack_boundary = crtl->parm_stack_boundary;
9508
9509 /* Stack at entrance of main is aligned by runtime. We use the
9510 smallest incoming stack boundary. */
9511 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9512 && DECL_NAME (current_function_decl)
9513 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9514 && DECL_FILE_SCOPE_P (current_function_decl))
9515 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9516
9517 return incoming_stack_boundary;
9518 }
9519
9520 /* Update incoming stack boundary and estimated stack alignment. */
9521
9522 static void
9523 ix86_update_stack_boundary (void)
9524 {
9525 ix86_incoming_stack_boundary
9526 = ix86_minimum_incoming_stack_boundary (false);
9527
9528 /* x86_64 varargs need 16-byte stack alignment for the register save
9529 area. */
9530 if (TARGET_64BIT
9531 && cfun->stdarg
9532 && crtl->stack_alignment_estimated < 128)
9533 crtl->stack_alignment_estimated = 128;
9534 }
9535
9536 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9537 needed or an rtx for DRAP otherwise. */
9538
9539 static rtx
9540 ix86_get_drap_rtx (void)
9541 {
9542 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9543 crtl->need_drap = true;
9544
9545 if (stack_realign_drap)
9546 {
9547 /* Assign DRAP to vDRAP and return vDRAP. */
9548 unsigned int regno = find_drap_reg ();
9549 rtx drap_vreg;
9550 rtx arg_ptr;
9551 rtx seq, insn;
9552
9553 arg_ptr = gen_rtx_REG (Pmode, regno);
9554 crtl->drap_reg = arg_ptr;
9555
9556 start_sequence ();
9557 drap_vreg = copy_to_reg (arg_ptr);
9558 seq = get_insns ();
9559 end_sequence ();
9560
9561 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9562 if (!optimize)
9563 {
9564 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9565 RTX_FRAME_RELATED_P (insn) = 1;
9566 }
9567 return drap_vreg;
9568 }
9569 else
9570 return NULL;
9571 }
9572
9573 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9574
9575 static rtx
9576 ix86_internal_arg_pointer (void)
9577 {
9578 return virtual_incoming_args_rtx;
9579 }
9580
9581 struct scratch_reg {
9582 rtx reg;
9583 bool saved;
9584 };
9585
9586 /* Return a short-lived scratch register for use on function entry.
9587 In 32-bit mode, it is valid only after the registers are saved
9588 in the prologue. This register must be released by means of
9589 release_scratch_register_on_entry once it is dead. */
9590
9591 static void
9592 get_scratch_register_on_entry (struct scratch_reg *sr)
9593 {
9594 int regno;
9595
9596 sr->saved = false;
9597
9598 if (TARGET_64BIT)
9599 {
9600 /* We always use R11 in 64-bit mode. */
9601 regno = R11_REG;
9602 }
9603 else
9604 {
9605 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9606 bool fastcall_p
9607 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9608 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9609 int regparm = ix86_function_regparm (fntype, decl);
9610 int drap_regno
9611 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9612
9613 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9614 for the static chain register. */
9615 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9616 && drap_regno != AX_REG)
9617 regno = AX_REG;
9618 else if (regparm < 2 && drap_regno != DX_REG)
9619 regno = DX_REG;
9620 /* ecx is the static chain register. */
9621 else if (regparm < 3 && !fastcall_p && !static_chain_p
9622 && drap_regno != CX_REG)
9623 regno = CX_REG;
9624 else if (ix86_save_reg (BX_REG, true))
9625 regno = BX_REG;
9626 /* esi is the static chain register. */
9627 else if (!(regparm == 3 && static_chain_p)
9628 && ix86_save_reg (SI_REG, true))
9629 regno = SI_REG;
9630 else if (ix86_save_reg (DI_REG, true))
9631 regno = DI_REG;
9632 else
9633 {
9634 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9635 sr->saved = true;
9636 }
9637 }
9638
9639 sr->reg = gen_rtx_REG (Pmode, regno);
9640 if (sr->saved)
9641 {
9642 rtx insn = emit_insn (gen_push (sr->reg));
9643 RTX_FRAME_RELATED_P (insn) = 1;
9644 }
9645 }
9646
9647 /* Release a scratch register obtained from the preceding function. */
9648
9649 static void
9650 release_scratch_register_on_entry (struct scratch_reg *sr)
9651 {
9652 if (sr->saved)
9653 {
9654 rtx x, insn = emit_insn (gen_pop (sr->reg));
9655
9656 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9657 RTX_FRAME_RELATED_P (insn) = 1;
9658 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9659 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9660 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9661 }
9662 }
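
/* The two helpers above are used as a bracket around code needing a
   temporary register, as in ix86_adjust_stack_and_probe below:

     struct scratch_reg sr;
     get_scratch_register_on_entry (&sr);
     ... use sr.reg ...
     release_scratch_register_on_entry (&sr);
*/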
9663
9664 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
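
/* With the usual STACK_CHECK_PROBE_INTERVAL_EXP of 12 (the generic
   default; targets may override it), PROBE_INTERVAL is 4096 bytes,
   i.e. one probe per typical page. */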
9665
9666 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9667
9668 static void
9669 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9670 {
9671 /* We skip the probe for the first interval + a small dope of 4 words and
9672 probe that many bytes past the specified size to maintain a protection
9673 area at the bottom of the stack. */
9674 const int dope = 4 * UNITS_PER_WORD;
9675 rtx size_rtx = GEN_INT (size), last;
9676
9677 /* See if we have a constant small number of probes to generate. If so,
9678 that's the easy case. The run-time loop is made up of 11 insns in the
9679 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9680 for n # of intervals. */
9681 if (size <= 5 * PROBE_INTERVAL)
9682 {
9683 HOST_WIDE_INT i, adjust;
9684 bool first_probe = true;
9685
9686 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9687 values of N from 1 until it exceeds SIZE. If only one probe is
9688 needed, this will not generate any code. Then adjust and probe
9689 to PROBE_INTERVAL + SIZE. */
9690 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9691 {
9692 if (first_probe)
9693 {
9694 adjust = 2 * PROBE_INTERVAL + dope;
9695 first_probe = false;
9696 }
9697 else
9698 adjust = PROBE_INTERVAL;
9699
9700 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9701 plus_constant (stack_pointer_rtx, -adjust)));
9702 emit_stack_probe (stack_pointer_rtx);
9703 }
9704
9705 if (first_probe)
9706 adjust = size + PROBE_INTERVAL + dope;
9707 else
9708 adjust = size + PROBE_INTERVAL - i;
9709
9710 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9711 plus_constant (stack_pointer_rtx, -adjust)));
9712 emit_stack_probe (stack_pointer_rtx);
9713
9714 /* Adjust back to account for the additional first interval. */
9715 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9716 plus_constant (stack_pointer_rtx,
9717 PROBE_INTERVAL + dope)));
9718 }
9719
9720 /* Otherwise, do the same as above, but in a loop. Note that we must be
9721 extra careful with variables wrapping around because we might be at
9722 the very top (or the very bottom) of the address space and we have
9723 to be able to handle this case properly; in particular, we use an
9724 equality test for the loop condition. */
9725 else
9726 {
9727 HOST_WIDE_INT rounded_size;
9728 struct scratch_reg sr;
9729
9730 get_scratch_register_on_entry (&sr);
9731
9732
9733 /* Step 1: round SIZE to the previous multiple of the interval. */
9734
9735 rounded_size = size & -PROBE_INTERVAL;
9736
9737
9738 /* Step 2: compute initial and final value of the loop counter. */
9739
9740 /* SP = SP_0 + PROBE_INTERVAL. */
9741 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9742 plus_constant (stack_pointer_rtx,
9743 - (PROBE_INTERVAL + dope))));
9744
9745 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9746 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9747 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9748 gen_rtx_PLUS (Pmode, sr.reg,
9749 stack_pointer_rtx)));
9750
9751
9752 /* Step 3: the loop
9753
9754 while (SP != LAST_ADDR)
9755 {
9756 SP = SP + PROBE_INTERVAL
9757 probe at SP
9758 }
9759
9760 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9761 values of N from 1 until it is equal to ROUNDED_SIZE. */
9762
9763 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9764
9765
9766 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9767 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9768
9769 if (size != rounded_size)
9770 {
9771 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9772 plus_constant (stack_pointer_rtx,
9773 rounded_size - size)));
9774 emit_stack_probe (stack_pointer_rtx);
9775 }
9776
9777 /* Adjust back to account for the additional first interval. */
9778 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9779 plus_constant (stack_pointer_rtx,
9780 PROBE_INTERVAL + dope)));
9781
9782 release_scratch_register_on_entry (&sr);
9783 }
9784
9785 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9786
9787 /* Even if the stack pointer isn't the CFA register, we need to correctly
9788 describe the adjustments made to it, in particular differentiate the
9789 frame-related ones from the frame-unrelated ones. */
9790 if (size > 0)
9791 {
9792 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9793 XVECEXP (expr, 0, 0)
9794 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9795 plus_constant (stack_pointer_rtx, -size));
9796 XVECEXP (expr, 0, 1)
9797 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9798 plus_constant (stack_pointer_rtx,
9799 PROBE_INTERVAL + dope + size));
9800 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9801 RTX_FRAME_RELATED_P (last) = 1;
9802
9803 cfun->machine->fs.sp_offset += size;
9804 }
9805
9806 /* Make sure nothing is scheduled before we are done. */
9807 emit_insn (gen_blockage ());
9808 }
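
/* Worked example of the unrolled branch above, assuming PROBE_INTERVAL
   is 4096 and SIZE is 12288 (three intervals):

     sp -= 2*4096 + dope;   probe (sp)
     sp -= 4096;            probe (sp)
     sp -= 4096;            probe (sp)
     sp += 4096 + dope;

   a net adjustment of -12288, with the first interval plus the dope
   skipped as described in the comment at the top of the function. */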
9809
9810 /* Adjust the stack pointer up to REG while probing it. */
9811
9812 const char *
9813 output_adjust_stack_and_probe (rtx reg)
9814 {
9815 static int labelno = 0;
9816 char loop_lab[32], end_lab[32];
9817 rtx xops[2];
9818
9819 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9820 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9821
9822 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9823
9824 /* Jump to END_LAB if SP == LAST_ADDR. */
9825 xops[0] = stack_pointer_rtx;
9826 xops[1] = reg;
9827 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9828 fputs ("\tje\t", asm_out_file);
9829 assemble_name_raw (asm_out_file, end_lab);
9830 fputc ('\n', asm_out_file);
9831
9832 /* SP = SP + PROBE_INTERVAL. */
9833 xops[1] = GEN_INT (PROBE_INTERVAL);
9834 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9835
9836 /* Probe at SP. */
9837 xops[1] = const0_rtx;
9838 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9839
9840 fprintf (asm_out_file, "\tjmp\t");
9841 assemble_name_raw (asm_out_file, loop_lab);
9842 fputc ('\n', asm_out_file);
9843
9844 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9845
9846 return "";
9847 }
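
/* The routine above emits a probe loop of roughly this shape (AT&T
   syntax; the scratch register holding LAST_ADDR is shown as %ecx and
   the label spelling depends on ASM_GENERATE_INTERNAL_LABEL):

   LPSRL0:
	cmpl	%ecx, %esp
	je	LPSRE0
	subl	$PROBE_INTERVAL, %esp
	orl	$0, (%esp)
	jmp	LPSRL0
   LPSRE0:
*/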
9848
9849 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9850 inclusive. These are offsets from the current stack pointer. */
9851
9852 static void
9853 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9854 {
9855 /* See if we have a constant small number of probes to generate. If so,
9856 that's the easy case. The run-time loop is made up of 7 insns in the
9857 generic case while the compile-time loop is made up of n insns for n #
9858 of intervals. */
9859 if (size <= 7 * PROBE_INTERVAL)
9860 {
9861 HOST_WIDE_INT i;
9862
9863 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9864 it exceeds SIZE. If only one probe is needed, this will not
9865 generate any code. Then probe at FIRST + SIZE. */
9866 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9867 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9868
9869 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9870 }
9871
9872 /* Otherwise, do the same as above, but in a loop. Note that we must be
9873 extra careful with variables wrapping around because we might be at
9874 the very top (or the very bottom) of the address space and we have
9875 to be able to handle this case properly; in particular, we use an
9876 equality test for the loop condition. */
9877 else
9878 {
9879 HOST_WIDE_INT rounded_size, last;
9880 struct scratch_reg sr;
9881
9882 get_scratch_register_on_entry (&sr);
9883
9884
9885 /* Step 1: round SIZE to the previous multiple of the interval. */
9886
9887 rounded_size = size & -PROBE_INTERVAL;
9888
9889
9890 /* Step 2: compute initial and final value of the loop counter. */
9891
9892 /* TEST_OFFSET = FIRST. */
9893 emit_move_insn (sr.reg, GEN_INT (-first));
9894
9895 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9896 last = first + rounded_size;
9897
9898
9899 /* Step 3: the loop
9900
9901 while (TEST_ADDR != LAST_ADDR)
9902 {
9903 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9904 probe at TEST_ADDR
9905 }
9906
9907 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9908 until it is equal to ROUNDED_SIZE. */
9909
9910 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9911
9912
9913 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9914 that SIZE is equal to ROUNDED_SIZE. */
9915
9916 if (size != rounded_size)
9917 emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
9918 stack_pointer_rtx,
9919 sr.reg),
9920 rounded_size - size));
9921
9922 release_scratch_register_on_entry (&sr);
9923 }
9924
9925 /* Make sure nothing is scheduled before we are done. */
9926 emit_insn (gen_blockage ());
9927 }
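
/* For illustration: when SIZE <= 7 * PROBE_INTERVAL the code above
   leaves the stack pointer alone and simply emits probes at
   sp - (FIRST + PROBE_INTERVAL), sp - (FIRST + 2 * PROBE_INTERVAL), ...
   and finally at sp - (FIRST + SIZE). */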
9928
9929 /* Probe a range of stack addresses from REG to END, inclusive. These are
9930 offsets from the current stack pointer. */
9931
9932 const char *
9933 output_probe_stack_range (rtx reg, rtx end)
9934 {
9935 static int labelno = 0;
9936 char loop_lab[32], end_lab[32];
9937 rtx xops[3];
9938
9939 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9940 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9941
9942 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9943
9944 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
9945 xops[0] = reg;
9946 xops[1] = end;
9947 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9948 fputs ("\tje\t", asm_out_file);
9949 assemble_name_raw (asm_out_file, end_lab);
9950 fputc ('\n', asm_out_file);
9951
9952 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
9953 xops[1] = GEN_INT (PROBE_INTERVAL);
9954 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9955
9956 /* Probe at TEST_ADDR. */
9957 xops[0] = stack_pointer_rtx;
9958 xops[1] = reg;
9959 xops[2] = const0_rtx;
9960 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9961
9962 fprintf (asm_out_file, "\tjmp\t");
9963 assemble_name_raw (asm_out_file, loop_lab);
9964 fputc ('\n', asm_out_file);
9965
9966 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9967
9968 return "";
9969 }
9970
9971 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
9972 to be generated in correct form. */
9973 static void
9974 ix86_finalize_stack_realign_flags (void)
9975 {
9976 /* Check if stack realignment is really needed after reload, and
9977 store the result in cfun. */
9978 unsigned int incoming_stack_boundary
9979 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9980 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9981 unsigned int stack_realign = (incoming_stack_boundary
9982 < (current_function_is_leaf
9983 ? crtl->max_used_stack_slot_alignment
9984 : crtl->stack_alignment_needed));
9985
9986 if (crtl->stack_realign_finalized)
9987 {
9988 /* After stack_realign_needed is finalized, we can no longer
9989 change it. */
9990 gcc_assert (crtl->stack_realign_needed == stack_realign);
9991 return;
9992 }
9993
9994 /* If the only reason for frame_pointer_needed is that we conservatively
9995 assumed stack realignment might be needed, but in the end nothing that
9996 needed the stack alignment had been spilled, clear frame_pointer_needed
9997 and say we don't need stack realignment. */
9998 if (stack_realign
9999 && !crtl->need_drap
10000 && frame_pointer_needed
10001 && current_function_is_leaf
10002 && flag_omit_frame_pointer
10003 && current_function_sp_is_unchanging
10004 && !ix86_current_function_calls_tls_descriptor
10005 && !crtl->accesses_prior_frames
10006 && !cfun->calls_alloca
10007 && !crtl->calls_eh_return
10008 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10009 && !ix86_frame_pointer_required ()
10010 && get_frame_size () == 0
10011 && ix86_nsaved_sseregs () == 0
10012 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10013 {
10014 HARD_REG_SET set_up_by_prologue, prologue_used;
10015 basic_block bb;
10016
10017 CLEAR_HARD_REG_SET (prologue_used);
10018 CLEAR_HARD_REG_SET (set_up_by_prologue);
10019 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10020 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10021 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10022 HARD_FRAME_POINTER_REGNUM);
10023 FOR_EACH_BB (bb)
10024 {
10025 rtx insn;
10026 FOR_BB_INSNS (bb, insn)
10027 if (NONDEBUG_INSN_P (insn)
10028 && requires_stack_frame_p (insn, prologue_used,
10029 set_up_by_prologue))
10030 {
10031 crtl->stack_realign_needed = stack_realign;
10032 crtl->stack_realign_finalized = true;
10033 return;
10034 }
10035 }
10036
10037 frame_pointer_needed = false;
10038 stack_realign = false;
10039 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10040 crtl->stack_alignment_needed = incoming_stack_boundary;
10041 crtl->stack_alignment_estimated = incoming_stack_boundary;
10042 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10043 crtl->preferred_stack_boundary = incoming_stack_boundary;
10044 df_finish_pass (true);
10045 df_scan_alloc (NULL);
10046 df_scan_blocks ();
10047 df_compute_regs_ever_live (true);
10048 df_analyze ();
10049 }
10050
10051 crtl->stack_realign_needed = stack_realign;
10052 crtl->stack_realign_finalized = true;
10053 }
10054
10055 /* Expand the prologue into a bunch of separate insns. */
10056
10057 void
10058 ix86_expand_prologue (void)
10059 {
10060 struct machine_function *m = cfun->machine;
10061 rtx insn, t;
10062 bool pic_reg_used;
10063 struct ix86_frame frame;
10064 HOST_WIDE_INT allocate;
10065 bool int_registers_saved;
10066
10067 ix86_finalize_stack_realign_flags ();
10068
10069 /* DRAP should not coexist with stack_realign_fp */
10070 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10071
10072 memset (&m->fs, 0, sizeof (m->fs));
10073
10074 /* Initialize CFA state for before the prologue. */
10075 m->fs.cfa_reg = stack_pointer_rtx;
10076 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10077
10078 /* Track SP offset to the CFA. We continue tracking this after we've
10079 swapped the CFA register away from SP. In the case of re-alignment
10080 this is fudged; we're interested in offsets within the local frame. */
10081 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10082 m->fs.sp_valid = true;
10083
10084 ix86_compute_frame_layout (&frame);
10085
10086 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10087 {
10088 /* We should have already generated an error for any use of
10089 ms_hook on a nested function. */
10090 gcc_checking_assert (!ix86_static_chain_on_stack);
10091
10092 /* Check if profiling is active and we shall use the profiling-before-
10093 prologue variant. If so, sorry. */
10094 if (crtl->profile && flag_fentry != 0)
10095 sorry ("ms_hook_prologue attribute isn%'t compatible "
10096 "with -mfentry for 32-bit");
10097
10098 /* In ix86_asm_output_function_label we emitted:
10099 8b ff movl.s %edi,%edi
10100 55 push %ebp
10101 8b ec movl.s %esp,%ebp
10102
10103 This matches the hookable function prologue in Win32 API
10104 functions in Microsoft Windows XP Service Pack 2 and newer.
10105 Wine uses this to enable Windows apps to hook the Win32 API
10106 functions provided by Wine.
10107
10108 What that means is that we've already set up the frame pointer. */
10109
10110 if (frame_pointer_needed
10111 && !(crtl->drap_reg && crtl->stack_realign_needed))
10112 {
10113 rtx push, mov;
10114
10115 /* We've decided to use the frame pointer already set up.
10116 Describe this to the unwinder by pretending that both
10117 push and mov insns happen right here.
10118
10119 Putting the unwind info here at the end of the ms_hook
10120 is done so that we can make absolutely certain we get
10121 the required byte sequence at the start of the function,
10122 rather than relying on an assembler that can produce
10123 the exact encoding required.
10124
10125 However it does mean (in the unpatched case) that we have
10126 a 1 insn window where the asynchronous unwind info is
10127 incorrect. However, if we placed the unwind info at
10128 its correct location we would have incorrect unwind info
10129 in the patched case. Which is probably all moot since
10130 I don't expect Wine generates dwarf2 unwind info for the
10131 system libraries that use this feature. */
10132
10133 insn = emit_insn (gen_blockage ());
10134
10135 push = gen_push (hard_frame_pointer_rtx);
10136 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10137 stack_pointer_rtx);
10138 RTX_FRAME_RELATED_P (push) = 1;
10139 RTX_FRAME_RELATED_P (mov) = 1;
10140
10141 RTX_FRAME_RELATED_P (insn) = 1;
10142 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10143 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10144
10145 /* Note that gen_push incremented m->fs.cfa_offset, even
10146 though we didn't emit the push insn here. */
10147 m->fs.cfa_reg = hard_frame_pointer_rtx;
10148 m->fs.fp_offset = m->fs.cfa_offset;
10149 m->fs.fp_valid = true;
10150 }
10151 else
10152 {
10153 /* The frame pointer is not needed so pop %ebp again.
10154 This leaves us with a pristine state. */
10155 emit_insn (gen_pop (hard_frame_pointer_rtx));
10156 }
10157 }
10158
10159 /* The first insn of a function that accepts its static chain on the
10160 stack is to push the register that would be filled in by a direct
10161 call. This insn will be skipped by the trampoline. */
10162 else if (ix86_static_chain_on_stack)
10163 {
10164 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10165 emit_insn (gen_blockage ());
10166
10167 /* We don't want to interpret this push insn as a register save,
10168 only as a stack adjustment. The real copy of the register as
10169 a save will be done later, if needed. */
10170 t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
10171 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10172 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10173 RTX_FRAME_RELATED_P (insn) = 1;
10174 }
10175
10176 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10177 DRAP is needed and stack realignment is really needed after reload. */
10178 if (stack_realign_drap)
10179 {
10180 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10181
10182 /* Only need to push parameter pointer reg if it is caller saved. */
10183 if (!call_used_regs[REGNO (crtl->drap_reg)])
10184 {
10185 /* Push arg pointer reg */
10186 insn = emit_insn (gen_push (crtl->drap_reg));
10187 RTX_FRAME_RELATED_P (insn) = 1;
10188 }
10189
10190 /* Grab the argument pointer. */
10191 t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
10192 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10193 RTX_FRAME_RELATED_P (insn) = 1;
10194 m->fs.cfa_reg = crtl->drap_reg;
10195 m->fs.cfa_offset = 0;
10196
10197 /* Align the stack. */
10198 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10199 stack_pointer_rtx,
10200 GEN_INT (-align_bytes)));
10201 RTX_FRAME_RELATED_P (insn) = 1;
10202
10203 /* Replicate the return address on the stack so that the return
10204 address can be reached via the (argp - 1) slot. This is needed
10205 to implement the macro RETURN_ADDR_RTX and the intrinsic function
10206 expand_builtin_return_addr, etc. */
10207 t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
10208 t = gen_frame_mem (word_mode, t);
10209 insn = emit_insn (gen_push (t));
10210 RTX_FRAME_RELATED_P (insn) = 1;
10211
10212 /* For the purposes of frame and register save area addressing,
10213 we've started over with a new frame. */
10214 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10215 m->fs.realigned = true;
10216 }
10217
10218 if (frame_pointer_needed && !m->fs.fp_valid)
10219 {
10220 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10221 slower on all targets. Also sdb doesn't like it. */
10222 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10223 RTX_FRAME_RELATED_P (insn) = 1;
10224
10225 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10226 {
10227 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10228 RTX_FRAME_RELATED_P (insn) = 1;
10229
10230 if (m->fs.cfa_reg == stack_pointer_rtx)
10231 m->fs.cfa_reg = hard_frame_pointer_rtx;
10232 m->fs.fp_offset = m->fs.sp_offset;
10233 m->fs.fp_valid = true;
10234 }
10235 }
10236
10237 int_registers_saved = (frame.nregs == 0);
10238
10239 if (!int_registers_saved)
10240 {
10241 /* If saving registers via PUSH, do so now. */
10242 if (!frame.save_regs_using_mov)
10243 {
10244 ix86_emit_save_regs ();
10245 int_registers_saved = true;
10246 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10247 }
10248
10249 /* When using the red zone we may start saving registers before allocating
10250 the stack frame, saving one cycle of the prologue. However, avoid
10251 doing this if we have to probe the stack; at least on x86_64 the
10252 stack probe can turn into a call that clobbers a red zone location. */
10253 else if (ix86_using_red_zone ()
10254 && (! TARGET_STACK_PROBE
10255 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10256 {
10257 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10258 int_registers_saved = true;
10259 }
10260 }
10261
10262 if (stack_realign_fp)
10263 {
10264 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10265 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10266
10267 /* The computation of the size of the re-aligned stack frame means
10268 that we must allocate the size of the register save area before
10269 performing the actual alignment. Otherwise we cannot guarantee
10270 that there's enough storage above the realignment point. */
10271 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10272 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10273 GEN_INT (m->fs.sp_offset
10274 - frame.sse_reg_save_offset),
10275 -1, false);
10276
10277 /* Align the stack. */
10278 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10279 stack_pointer_rtx,
10280 GEN_INT (-align_bytes)));
10281
10282 /* For the purposes of register save area addressing, the stack
10283 pointer is no longer valid. As for the value of sp_offset,
10284 see ix86_compute_frame_layout, which we need to match in order
10285 to pass verification of stack_pointer_offset at the end. */
10286 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10287 m->fs.sp_valid = false;
10288 }
10289
10290 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10291
10292 if (flag_stack_usage_info)
10293 {
10294 /* We start to count from ARG_POINTER. */
10295 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10296
10297 /* If it was realigned, take into account the fake frame. */
10298 if (stack_realign_drap)
10299 {
10300 if (ix86_static_chain_on_stack)
10301 stack_size += UNITS_PER_WORD;
10302
10303 if (!call_used_regs[REGNO (crtl->drap_reg)])
10304 stack_size += UNITS_PER_WORD;
10305
10306 /* This over-estimates by 1 minimal-stack-alignment-unit but
10307 mitigates that by counting in the new return address slot. */
10308 current_function_dynamic_stack_size
10309 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10310 }
10311
10312 current_function_static_stack_size = stack_size;
10313 }
10314
10315 /* The stack has already been decremented by the instruction calling us
10316 so probe if the size is non-negative to preserve the protection area. */
10317 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10318 {
10319 /* We expect the registers to be saved when probes are used. */
10320 gcc_assert (int_registers_saved);
10321
10322 if (STACK_CHECK_MOVING_SP)
10323 {
10324 ix86_adjust_stack_and_probe (allocate);
10325 allocate = 0;
10326 }
10327 else
10328 {
10329 HOST_WIDE_INT size = allocate;
10330
10331 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10332 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10333
10334 if (TARGET_STACK_PROBE)
10335 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10336 else
10337 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10338 }
10339 }
10340
10341 if (allocate == 0)
10342 ;
10343 else if (!ix86_target_stack_probe ()
10344 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10345 {
10346 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10347 GEN_INT (-allocate), -1,
10348 m->fs.cfa_reg == stack_pointer_rtx);
10349 }
10350 else
10351 {
10352 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10353 rtx r10 = NULL;
10354 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10355
10356 bool eax_live = false;
10357 bool r10_live = false;
10358
10359 if (TARGET_64BIT)
10360 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10361 if (!TARGET_64BIT_MS_ABI)
10362 eax_live = ix86_eax_live_at_start_p ();
10363
10364 if (eax_live)
10365 {
10366 emit_insn (gen_push (eax));
10367 allocate -= UNITS_PER_WORD;
10368 }
10369 if (r10_live)
10370 {
10371 r10 = gen_rtx_REG (Pmode, R10_REG);
10372 emit_insn (gen_push (r10));
10373 allocate -= UNITS_PER_WORD;
10374 }
10375
10376 emit_move_insn (eax, GEN_INT (allocate));
10377 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10378
10379 /* Use the fact that AX still contains ALLOCATE. */
10380 adjust_stack_insn = (Pmode == DImode
10381 ? gen_pro_epilogue_adjust_stack_di_sub
10382 : gen_pro_epilogue_adjust_stack_si_sub);
10383
10384 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10385 stack_pointer_rtx, eax));
10386
10387 /* Note that SEH directives need to continue tracking the stack
10388 pointer even after the frame pointer has been set up. */
10389 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10390 {
10391 if (m->fs.cfa_reg == stack_pointer_rtx)
10392 m->fs.cfa_offset += allocate;
10393
10394 RTX_FRAME_RELATED_P (insn) = 1;
10395 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10396 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10397 plus_constant (stack_pointer_rtx,
10398 -allocate)));
10399 }
10400 m->fs.sp_offset += allocate;
10401
10402 if (r10_live && eax_live)
10403 {
10404 t = choose_baseaddr (m->fs.sp_offset - allocate);
10405 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10406 gen_frame_mem (word_mode, t));
10407 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10408 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10409 gen_frame_mem (word_mode, t));
10410 }
10411 else if (eax_live || r10_live)
10412 {
10413 t = choose_baseaddr (m->fs.sp_offset - allocate);
10414 emit_move_insn (gen_rtx_REG (word_mode,
10415 (eax_live ? AX_REG : R10_REG)),
10416 gen_frame_mem (word_mode, t));
10417 }
10418 }
10419 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10420
10421 /* If we haven't already set up the frame pointer, do so now. */
10422 if (frame_pointer_needed && !m->fs.fp_valid)
10423 {
10424 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10425 GEN_INT (frame.stack_pointer_offset
10426 - frame.hard_frame_pointer_offset));
10427 insn = emit_insn (insn);
10428 RTX_FRAME_RELATED_P (insn) = 1;
10429 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10430
10431 if (m->fs.cfa_reg == stack_pointer_rtx)
10432 m->fs.cfa_reg = hard_frame_pointer_rtx;
10433 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10434 m->fs.fp_valid = true;
10435 }
10436
10437 if (!int_registers_saved)
10438 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10439 if (frame.nsseregs)
10440 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10441
10442 pic_reg_used = false;
10443 if (pic_offset_table_rtx
10444 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10445 || crtl->profile))
10446 {
10447 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10448
10449 if (alt_pic_reg_used != INVALID_REGNUM)
10450 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10451
10452 pic_reg_used = true;
10453 }
10454
10455 if (pic_reg_used)
10456 {
10457 if (TARGET_64BIT)
10458 {
10459 if (ix86_cmodel == CM_LARGE_PIC)
10460 {
10461 rtx tmp_reg = gen_rtx_REG (DImode, R11_REG);
10462 rtx label = gen_label_rtx ();
10463 emit_label (label);
10464 LABEL_PRESERVE_P (label) = 1;
10465 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10466 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label));
10467 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10468 insn = emit_insn (gen_adddi3 (pic_offset_table_rtx,
10469 pic_offset_table_rtx, tmp_reg));
10470 }
10471 else
10472 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10473 }
10474 else
10475 {
10476 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10477 RTX_FRAME_RELATED_P (insn) = 1;
10478 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10479 }
10480 }
10481
10482 /* In the pic_reg_used case, make sure that the got load isn't deleted
10483 when mcount needs it. A blockage to avoid call movement across the mcount
10484 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10485 note. */
10486 if (crtl->profile && !flag_fentry && pic_reg_used)
10487 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10488
10489 if (crtl->drap_reg && !crtl->stack_realign_needed)
10490 {
10491 /* vDRAP is set up, but after reload it turns out that stack realignment
10492 isn't necessary; here we emit the prologue to set up DRAP
10493 without the stack realignment adjustment. */
10494 t = choose_baseaddr (0);
10495 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10496 }
10497
10498 /* Prevent instructions from being scheduled into register save push
10499 sequence when access to the redzone area is done through frame pointer.
10500 The offset between the frame pointer and the stack pointer is calculated
10501 relative to the value of the stack pointer at the end of the function
10502 prologue, and moving instructions that access redzone area via frame
10503 pointer inside push sequence violates this assumption. */
10504 if (frame_pointer_needed && frame.red_zone_size)
10505 emit_insn (gen_memory_blockage ());
10506
10507 /* Emit cld instruction if stringops are used in the function. */
10508 if (TARGET_CLD && ix86_current_function_needs_cld)
10509 emit_insn (gen_cld ());
10510
10511 /* SEH requires that the prologue end within 256 bytes of the start of
10512 the function. Prevent instruction schedules that would extend that.
10513 Further, prevent alloca modifications to the stack pointer from being
10514 combined with prologue modifications. */
10515 if (TARGET_SEH)
10516 emit_insn (gen_prologue_use (stack_pointer_rtx));
10517 }
10518
10519 /* Emit code to restore REG using a POP insn. */
10520
10521 static void
10522 ix86_emit_restore_reg_using_pop (rtx reg)
10523 {
10524 struct machine_function *m = cfun->machine;
10525 rtx insn = emit_insn (gen_pop (reg));
10526
10527 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10528 m->fs.sp_offset -= UNITS_PER_WORD;
10529
10530 if (m->fs.cfa_reg == crtl->drap_reg
10531 && REGNO (reg) == REGNO (crtl->drap_reg))
10532 {
10533 /* Previously we'd represented the CFA as an expression
10534 like *(%ebp - 8). We've just popped that value from
10535 the stack, which means we need to reset the CFA to
10536 the drap register. This will remain until we restore
10537 the stack pointer. */
10538 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10539 RTX_FRAME_RELATED_P (insn) = 1;
10540
10541 /* This means that the DRAP register is valid for addressing too. */
10542 m->fs.drap_valid = true;
10543 return;
10544 }
10545
10546 if (m->fs.cfa_reg == stack_pointer_rtx)
10547 {
10548 rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10549 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10550 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10551 RTX_FRAME_RELATED_P (insn) = 1;
10552
10553 m->fs.cfa_offset -= UNITS_PER_WORD;
10554 }
10555
10556 /* When the frame pointer is the CFA, and we pop it, we are
10557 swapping back to the stack pointer as the CFA. This happens
10558 for stack frames that don't allocate other data, so we assume
10559 the stack pointer is now pointing at the return address, i.e.
10560 the function entry state, which makes the offset 1 word. */
10561 if (reg == hard_frame_pointer_rtx)
10562 {
10563 m->fs.fp_valid = false;
10564 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10565 {
10566 m->fs.cfa_reg = stack_pointer_rtx;
10567 m->fs.cfa_offset -= UNITS_PER_WORD;
10568
10569 add_reg_note (insn, REG_CFA_DEF_CFA,
10570 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10571 GEN_INT (m->fs.cfa_offset)));
10572 RTX_FRAME_RELATED_P (insn) = 1;
10573 }
10574 }
10575 }
10576
10577 /* Emit code to restore saved registers using POP insns. */
10578
10579 static void
10580 ix86_emit_restore_regs_using_pop (void)
10581 {
10582 unsigned int regno;
10583
10584 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10585 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10586 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10587 }
10588
10589 /* Emit code and notes for the LEAVE instruction. */
10590
10591 static void
10592 ix86_emit_leave (void)
10593 {
10594 struct machine_function *m = cfun->machine;
10595 rtx insn = emit_insn (ix86_gen_leave ());
10596
10597 ix86_add_queued_cfa_restore_notes (insn);
10598
10599 gcc_assert (m->fs.fp_valid);
10600 m->fs.sp_valid = true;
10601 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10602 m->fs.fp_valid = false;
10603
10604 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10605 {
10606 m->fs.cfa_reg = stack_pointer_rtx;
10607 m->fs.cfa_offset = m->fs.sp_offset;
10608
10609 add_reg_note (insn, REG_CFA_DEF_CFA,
10610 plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10611 RTX_FRAME_RELATED_P (insn) = 1;
10612 }
10613 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10614 m->fs.fp_offset);
10615 }
10616
10617 /* Emit code to restore saved registers using MOV insns.
10618 First register is restored from CFA - CFA_OFFSET. */
10619 static void
10620 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10621 bool maybe_eh_return)
10622 {
10623 struct machine_function *m = cfun->machine;
10624 unsigned int regno;
10625
10626 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10627 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10628 {
10629 rtx reg = gen_rtx_REG (word_mode, regno);
10630 rtx insn, mem;
10631
10632 mem = choose_baseaddr (cfa_offset);
10633 mem = gen_frame_mem (word_mode, mem);
10634 insn = emit_move_insn (reg, mem);
10635
10636 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10637 {
10638 /* Previously we'd represented the CFA as an expression
10639 like *(%ebp - 8). We've just loaded that value from
10640 the stack, which means we need to reset the CFA to
10641 the drap register. This will remain until we restore
10642 the stack pointer. */
10643 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10644 RTX_FRAME_RELATED_P (insn) = 1;
10645
10646 /* This means that the DRAP register is valid for addressing. */
10647 m->fs.drap_valid = true;
10648 }
10649 else
10650 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10651
10652 cfa_offset -= UNITS_PER_WORD;
10653 }
10654 }
10655
10656 /* Emit code to restore saved SSE registers using MOV insns.
10657 First register is restored from CFA - CFA_OFFSET. */
10658 static void
10659 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10660 bool maybe_eh_return)
10661 {
10662 unsigned int regno;
10663
10664 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10665 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10666 {
10667 rtx reg = gen_rtx_REG (V4SFmode, regno);
10668 rtx mem;
10669
10670 mem = choose_baseaddr (cfa_offset);
10671 mem = gen_rtx_MEM (V4SFmode, mem);
10672 set_mem_align (mem, 128);
10673 emit_move_insn (reg, mem);
10674
10675 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10676
10677 cfa_offset -= 16;
10678 }
10679 }
10680
10681 /* Emit vzeroupper if needed. */
10682
10683 void
10684 ix86_maybe_emit_epilogue_vzeroupper (void)
10685 {
10686 if (TARGET_VZEROUPPER
10687 && !TREE_THIS_VOLATILE (cfun->decl)
10688 && !cfun->machine->caller_return_avx256_p)
10689 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10690 }
10691
10692 /* Restore function stack, frame, and registers. */
10693
10694 void
10695 ix86_expand_epilogue (int style)
10696 {
10697 struct machine_function *m = cfun->machine;
10698 struct machine_frame_state frame_state_save = m->fs;
10699 struct ix86_frame frame;
10700 bool restore_regs_via_mov;
10701 bool using_drap;
10702
10703 ix86_finalize_stack_realign_flags ();
10704 ix86_compute_frame_layout (&frame);
10705
10706 m->fs.sp_valid = (!frame_pointer_needed
10707 || (current_function_sp_is_unchanging
10708 && !stack_realign_fp));
10709 gcc_assert (!m->fs.sp_valid
10710 || m->fs.sp_offset == frame.stack_pointer_offset);
10711
10712 /* The FP must be valid if the frame pointer is present. */
10713 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10714 gcc_assert (!m->fs.fp_valid
10715 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10716
10717 /* We must have *some* valid pointer to the stack frame. */
10718 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10719
10720 /* The DRAP is never valid at this point. */
10721 gcc_assert (!m->fs.drap_valid);
10722
10723 /* See the comment about red zone and frame
10724 pointer usage in ix86_expand_prologue. */
10725 if (frame_pointer_needed && frame.red_zone_size)
10726 emit_insn (gen_memory_blockage ());
10727
10728 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10729 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10730
10731 /* Determine the CFA offset of the end of the red-zone. */
10732 m->fs.red_zone_offset = 0;
10733 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10734 {
10735 /* The red-zone begins below the return address. */
10736 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10737
10738 /* When the register save area is in the aligned portion of
10739 the stack, determine the maximum runtime displacement that
10740 matches up with the aligned frame. */
10741 if (stack_realign_drap)
10742 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10743 + UNITS_PER_WORD);
10744 }
10745
10746 /* Special care must be taken for the normal return case of a function
10747 using eh_return: the eax and edx registers are marked as saved, but
10748 not restored along this path. Adjust the save location to match. */
10749 if (crtl->calls_eh_return && style != 2)
10750 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10751
10752 /* EH_RETURN requires the use of moves to function properly. */
10753 if (crtl->calls_eh_return)
10754 restore_regs_via_mov = true;
10755 /* SEH requires the use of pops to identify the epilogue. */
10756 else if (TARGET_SEH)
10757 restore_regs_via_mov = false;
10758 /* If we're only restoring one register and sp is not valid, then
10759 use a move instruction to restore the register, since it's
10760 less work than reloading sp and popping the register. */
10761 else if (!m->fs.sp_valid && frame.nregs <= 1)
10762 restore_regs_via_mov = true;
10763 else if (TARGET_EPILOGUE_USING_MOVE
10764 && cfun->machine->use_fast_prologue_epilogue
10765 && (frame.nregs > 1
10766 || m->fs.sp_offset != frame.reg_save_offset))
10767 restore_regs_via_mov = true;
10768 else if (frame_pointer_needed
10769 && !frame.nregs
10770 && m->fs.sp_offset != frame.reg_save_offset)
10771 restore_regs_via_mov = true;
10772 else if (frame_pointer_needed
10773 && TARGET_USE_LEAVE
10774 && cfun->machine->use_fast_prologue_epilogue
10775 && frame.nregs == 1)
10776 restore_regs_via_mov = true;
10777 else
10778 restore_regs_via_mov = false;
10779
10780 if (restore_regs_via_mov || frame.nsseregs)
10781 {
10782 /* Ensure that the entire register save area is addressable via
10783 the stack pointer, if we will restore via sp. */
10784 if (TARGET_64BIT
10785 && m->fs.sp_offset > 0x7fffffff
10786 && !(m->fs.fp_valid || m->fs.drap_valid)
10787 && (frame.nsseregs + frame.nregs) != 0)
10788 {
10789 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10790 GEN_INT (m->fs.sp_offset
10791 - frame.sse_reg_save_offset),
10792 style,
10793 m->fs.cfa_reg == stack_pointer_rtx);
10794 }
10795 }
10796
10797 /* If there are any SSE registers to restore, then we have to do it
10798 via moves, since there's obviously no pop for SSE regs. */
10799 if (frame.nsseregs)
10800 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10801 style == 2);
10802
10803 if (restore_regs_via_mov)
10804 {
10805 rtx t;
10806
10807 if (frame.nregs)
10808 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10809
10810 /* eh_return epilogues need %ecx added to the stack pointer. */
10811 if (style == 2)
10812 {
10813 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10814
10815 /* Stack align doesn't work with eh_return. */
10816 gcc_assert (!stack_realign_drap);
10817 /* Neither do regparm nested functions. */
10818 gcc_assert (!ix86_static_chain_on_stack);
10819
10820 if (frame_pointer_needed)
10821 {
10822 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10823 t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10824 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10825
10826 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10827 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10828
10829 /* Note that we use SA as a temporary CFA, as the return
10830 address is at the proper place relative to it. We
10831 pretend this happens at the FP restore insn because
10832 prior to this insn the FP would be stored at the wrong
10833 offset relative to SA, and after this insn we have no
10834 other reasonable register to use for the CFA. We don't
10835 bother resetting the CFA to the SP for the duration of
10836 the return insn. */
10837 add_reg_note (insn, REG_CFA_DEF_CFA,
10838 plus_constant (sa, UNITS_PER_WORD));
10839 ix86_add_queued_cfa_restore_notes (insn);
10840 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10841 RTX_FRAME_RELATED_P (insn) = 1;
10842
10843 m->fs.cfa_reg = sa;
10844 m->fs.cfa_offset = UNITS_PER_WORD;
10845 m->fs.fp_valid = false;
10846
10847 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10848 const0_rtx, style, false);
10849 }
10850 else
10851 {
10852 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10853 t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10854 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10855 ix86_add_queued_cfa_restore_notes (insn);
10856
10857 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10858 if (m->fs.cfa_offset != UNITS_PER_WORD)
10859 {
10860 m->fs.cfa_offset = UNITS_PER_WORD;
10861 add_reg_note (insn, REG_CFA_DEF_CFA,
10862 plus_constant (stack_pointer_rtx,
10863 UNITS_PER_WORD));
10864 RTX_FRAME_RELATED_P (insn) = 1;
10865 }
10866 }
10867 m->fs.sp_offset = UNITS_PER_WORD;
10868 m->fs.sp_valid = true;
10869 }
10870 }
10871 else
10872 {
10873 /* SEH requires that the function end with (1) a stack adjustment
10874 if necessary, (2) a sequence of pops, and (3) a return or
10875 jump instruction. Prevent insns from the function body from
10876 being scheduled into this sequence. */
10877 if (TARGET_SEH)
10878 {
10879 /* Prevent a catch region from being adjacent to the standard
10880 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
10881 several other flags that would be interesting to test are
10882 set up yet. */
10883 if (flag_non_call_exceptions)
10884 emit_insn (gen_nops (const1_rtx));
10885 else
10886 emit_insn (gen_blockage ());
10887 }
10888
10889 /* First step is to deallocate the stack frame so that we can
10890 pop the registers. */
10891 if (!m->fs.sp_valid)
10892 {
10893 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10894 GEN_INT (m->fs.fp_offset
10895 - frame.reg_save_offset),
10896 style, false);
10897 }
10898 else if (m->fs.sp_offset != frame.reg_save_offset)
10899 {
10900 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10901 GEN_INT (m->fs.sp_offset
10902 - frame.reg_save_offset),
10903 style,
10904 m->fs.cfa_reg == stack_pointer_rtx);
10905 }
10906
10907 ix86_emit_restore_regs_using_pop ();
10908 }
10909
10910 /* If we used a frame pointer and haven't already got rid of it,
10911 then do so now. */
10912 if (m->fs.fp_valid)
10913 {
10914 /* If the stack pointer is valid and pointing at the frame
10915 pointer store address, then we only need a pop. */
10916 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10917 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10918 /* Leave results in shorter dependency chains on CPUs that are
10919 able to grok it fast. */
10920 else if (TARGET_USE_LEAVE
10921 || optimize_function_for_size_p (cfun)
10922 || !cfun->machine->use_fast_prologue_epilogue)
10923 ix86_emit_leave ();
10924 else
10925 {
10926 pro_epilogue_adjust_stack (stack_pointer_rtx,
10927 hard_frame_pointer_rtx,
10928 const0_rtx, style, !using_drap);
10929 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10930 }
10931 }
10932
10933 if (using_drap)
10934 {
10935 int param_ptr_offset = UNITS_PER_WORD;
10936 rtx insn;
10937
10938 gcc_assert (stack_realign_drap);
10939
10940 if (ix86_static_chain_on_stack)
10941 param_ptr_offset += UNITS_PER_WORD;
10942 if (!call_used_regs[REGNO (crtl->drap_reg)])
10943 param_ptr_offset += UNITS_PER_WORD;
10944
10945 insn = emit_insn (gen_rtx_SET
10946 (VOIDmode, stack_pointer_rtx,
10947 gen_rtx_PLUS (Pmode,
10948 crtl->drap_reg,
10949 GEN_INT (-param_ptr_offset))));
10950 m->fs.cfa_reg = stack_pointer_rtx;
10951 m->fs.cfa_offset = param_ptr_offset;
10952 m->fs.sp_offset = param_ptr_offset;
10953 m->fs.realigned = false;
10954
10955 add_reg_note (insn, REG_CFA_DEF_CFA,
10956 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10957 GEN_INT (param_ptr_offset)));
10958 RTX_FRAME_RELATED_P (insn) = 1;
10959
10960 if (!call_used_regs[REGNO (crtl->drap_reg)])
10961 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10962 }
10963
10964 /* At this point the stack pointer must be valid, and we must have
10965 restored all of the registers. We may not have deallocated the
10966 entire stack frame. We've delayed this until now because it may
10967 be possible to merge the local stack deallocation with the
10968 deallocation forced by ix86_static_chain_on_stack. */
10969 gcc_assert (m->fs.sp_valid);
10970 gcc_assert (!m->fs.fp_valid);
10971 gcc_assert (!m->fs.realigned);
10972 if (m->fs.sp_offset != UNITS_PER_WORD)
10973 {
10974 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10975 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10976 style, true);
10977 }
10978 else
10979 ix86_add_queued_cfa_restore_notes (get_last_insn ());
10980
10981 /* Sibcall epilogues don't want a return instruction. */
10982 if (style == 0)
10983 {
10984 m->fs = frame_state_save;
10985 return;
10986 }
10987
10988 /* Emit vzeroupper if needed. */
10989 ix86_maybe_emit_epilogue_vzeroupper ();
10990
10991 if (crtl->args.pops_args && crtl->args.size)
10992 {
10993 rtx popc = GEN_INT (crtl->args.pops_args);
10994
10995 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
10996 address, do an explicit add, and jump indirectly to the caller. */
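      /* For illustration only (not from the original sources): for a
	 function asked to pop 70000 argument bytes, the code below emits
	 roughly

	     popl  %ecx          ; pop the return address
	     addl  $70000, %esp  ; release the arguments explicitly
	     jmp   *%ecx         ; return to the caller

	 because the immediate of "ret $N" is limited to 16 bits.  */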
10997
10998 if (crtl->args.pops_args >= 65536)
10999 {
11000 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11001 rtx insn;
11002
11003 /* There is no "pascal" calling convention in any 64bit ABI. */
11004 gcc_assert (!TARGET_64BIT);
11005
11006 insn = emit_insn (gen_pop (ecx));
11007 m->fs.cfa_offset -= UNITS_PER_WORD;
11008 m->fs.sp_offset -= UNITS_PER_WORD;
11009
11010 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11011 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11012 add_reg_note (insn, REG_CFA_REGISTER,
11013 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11014 RTX_FRAME_RELATED_P (insn) = 1;
11015
11016 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11017 popc, -1, true);
11018 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11019 }
11020 else
11021 emit_jump_insn (gen_simple_return_pop_internal (popc));
11022 }
11023 else
11024 emit_jump_insn (gen_simple_return_internal ());
11025
11026 /* Restore the state back to the state from the prologue,
11027 so that it's correct for the next epilogue. */
11028 m->fs = frame_state_save;
11029 }
11030
11031 /* Reset global state that compiling the function may have modified (e.g. the PIC register number). */
11032
11033 static void
11034 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11035 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11036 {
11037 if (pic_offset_table_rtx)
11038 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11039 #if TARGET_MACHO
11040 /* Mach-O doesn't support labels at the end of objects, so if
11041 it looks like we might want one, insert a NOP. */
11042 {
11043 rtx insn = get_last_insn ();
11044 rtx deleted_debug_label = NULL_RTX;
11045 while (insn
11046 && NOTE_P (insn)
11047 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11048 {
11049 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11050 notes only; instead set their CODE_LABEL_NUMBER to -1,
11051 otherwise there would be code generation differences
11052 between -g and -g0. */
11053 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11054 deleted_debug_label = insn;
11055 insn = PREV_INSN (insn);
11056 }
11057 if (insn
11058 && (LABEL_P (insn)
11059 || (NOTE_P (insn)
11060 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11061 fputs ("\tnop\n", file);
11062 else if (deleted_debug_label)
11063 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11064 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11065 CODE_LABEL_NUMBER (insn) = -1;
11066 }
11067 #endif
11068
11069 }
11070
11071 /* Return a scratch register to use in the split stack prologue. The
11072 split stack prologue is used for -fsplit-stack. It is the first
11073 instructions in the function, even before the regular prologue.
11074 The scratch register can be any caller-saved register which is not
11075 used for parameters or for the static chain. */
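/* For illustration, the register chosen below can be summarized as follows
   (derived from the code that follows, not an independent specification):

     64-bit                                  -> %r11
     32-bit, fastcall, no static chain       -> %eax
     32-bit, regparm < 3, no static chain    -> %ecx
     32-bit, regparm < 2, with static chain  -> %edx
     anything else                           -> unsupported (sorry ()).  */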
11076
11077 static unsigned int
11078 split_stack_prologue_scratch_regno (void)
11079 {
11080 if (TARGET_64BIT)
11081 return R11_REG;
11082 else
11083 {
11084 bool is_fastcall;
11085 int regparm;
11086
11087 is_fastcall = (lookup_attribute ("fastcall",
11088 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11089 != NULL);
11090 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11091
11092 if (is_fastcall)
11093 {
11094 if (DECL_STATIC_CHAIN (cfun->decl))
11095 {
11096 sorry ("-fsplit-stack does not support fastcall with "
11097 "nested function");
11098 return INVALID_REGNUM;
11099 }
11100 return AX_REG;
11101 }
11102 else if (regparm < 3)
11103 {
11104 if (!DECL_STATIC_CHAIN (cfun->decl))
11105 return CX_REG;
11106 else
11107 {
11108 if (regparm >= 2)
11109 {
11110 sorry ("-fsplit-stack does not support 2 register "
11111 " parameters for a nested function");
11112 return INVALID_REGNUM;
11113 }
11114 return DX_REG;
11115 }
11116 }
11117 else
11118 {
11119 /* FIXME: We could make this work by pushing a register
11120 around the addition and comparison. */
11121 sorry ("-fsplit-stack does not support 3 register parameters");
11122 return INVALID_REGNUM;
11123 }
11124 }
11125 }
11126
11127 /* A SYMBOL_REF for the function which allocates new stack space for
11128 -fsplit-stack. */
11129
11130 static GTY(()) rtx split_stack_fn;
11131
11132 /* A SYMBOL_REF for the function which allocates new stack space when
11133 using the large model. */
11134
11135 static GTY(()) rtx split_stack_fn_large;
11136
11137 /* Handle -fsplit-stack. These are the first instructions in the
11138 function, even before the regular prologue. */
11139
11140 void
11141 ix86_expand_split_stack_prologue (void)
11142 {
11143 struct ix86_frame frame;
11144 HOST_WIDE_INT allocate;
11145 unsigned HOST_WIDE_INT args_size;
11146 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11147 rtx scratch_reg = NULL_RTX;
11148 rtx varargs_label = NULL_RTX;
11149 rtx fn;
11150
11151 gcc_assert (flag_split_stack && reload_completed);
11152
11153 ix86_finalize_stack_realign_flags ();
11154 ix86_compute_frame_layout (&frame);
11155 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11156
11157 /* This is the label we will branch to if we have enough stack
11158 space. We expect the basic block reordering pass to reverse this
11159 branch if optimizing, so that we branch in the unlikely case. */
11160 label = gen_label_rtx ();
11161
11162 /* We need to compare the stack pointer minus the frame size with
11163 the stack boundary in the TCB. The stack boundary always gives
11164 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11165 can compare directly. Otherwise we need to do an addition. */
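  /* Illustrative sketch only (the actual TCB slot is supplied via the
     UNSPEC_STACK_CHECK address built below, and the segment register and
     offset are ABI-specific): the comparison amounts to something like

	 cmp  <tcb-stack-guard>, %sp            ; small frame
     or
	 lea  -FRAME(%sp), <scratch>
	 cmp  <tcb-stack-guard>, <scratch>      ; large frame

     followed by a branch to LABEL when enough stack is available.  */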
11166
11167 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11168 UNSPEC_STACK_CHECK);
11169 limit = gen_rtx_CONST (Pmode, limit);
11170 limit = gen_rtx_MEM (Pmode, limit);
11171 if (allocate < SPLIT_STACK_AVAILABLE)
11172 current = stack_pointer_rtx;
11173 else
11174 {
11175 unsigned int scratch_regno;
11176 rtx offset;
11177
11178 /* We need a scratch register to hold the stack pointer minus
11179 the required frame size. Since this is the very start of the
11180 function, the scratch register can be any caller-saved
11181 register which is not used for parameters. */
11182 offset = GEN_INT (- allocate);
11183 scratch_regno = split_stack_prologue_scratch_regno ();
11184 if (scratch_regno == INVALID_REGNUM)
11185 return;
11186 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11187 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11188 {
11189 /* We don't use ix86_gen_add3 in this case because it will
11190 want to split to lea, but when not optimizing the insn
11191 will not be split after this point. */
11192 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11193 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11194 offset)));
11195 }
11196 else
11197 {
11198 emit_move_insn (scratch_reg, offset);
11199 emit_insn (gen_adddi3 (scratch_reg, scratch_reg,
11200 stack_pointer_rtx));
11201 }
11202 current = scratch_reg;
11203 }
11204
11205 ix86_expand_branch (GEU, current, limit, label);
11206 jump_insn = get_last_insn ();
11207 JUMP_LABEL (jump_insn) = label;
11208
11209 /* Mark the jump as very likely to be taken. */
11210 add_reg_note (jump_insn, REG_BR_PROB,
11211 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11212
11213 if (split_stack_fn == NULL_RTX)
11214 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11215 fn = split_stack_fn;
11216
11217 /* Get more stack space. We pass in the desired stack space and the
11218 size of the arguments to copy to the new stack. In 32-bit mode
11219 we push the parameters; __morestack will return on a new stack
11220 anyhow. In 64-bit mode we pass the parameters in r10 and
11221 r11. */
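  /* A rough sketch of the two conventions just described (illustrative
     only, not taken verbatim from libgcc):

	 32-bit:   pushl  $args_size
		   pushl  $frame_size
		   call   __morestack

	 64-bit:   movq   $frame_size, %r10
		   movq   $args_size, %r11
		   call   __morestack  */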
11222 allocate_rtx = GEN_INT (allocate);
11223 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11224 call_fusage = NULL_RTX;
11225 if (TARGET_64BIT)
11226 {
11227 rtx reg10, reg11;
11228
11229 reg10 = gen_rtx_REG (Pmode, R10_REG);
11230 reg11 = gen_rtx_REG (Pmode, R11_REG);
11231
11232 /* If this function uses a static chain, it will be in %r10.
11233 Preserve it across the call to __morestack. */
11234 if (DECL_STATIC_CHAIN (cfun->decl))
11235 {
11236 rtx rax;
11237
11238 rax = gen_rtx_REG (word_mode, AX_REG);
11239 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11240 use_reg (&call_fusage, rax);
11241 }
11242
11243 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11244 {
11245 HOST_WIDE_INT argval;
11246
11247 /* When using the large model we need to load the address
11248 into a register, and we've run out of registers. So we
11249 switch to a different calling convention, and we call a
11250 different function: __morestack_large_model. We pass the
11251 argument size in the upper 32 bits of r10 and pass the
11252 frame size in the lower 32 bits. */
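	  /* For example (illustrative values): with args_size == 0x20 and
	     allocate == 0x400, the value computed below is
	     (0x20 << 32) + 0x400 == 0x0000002000000400, so %r10 carries
	     both quantities at once.  */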
11253 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11254 gcc_assert ((args_size & 0xffffffff) == args_size);
11255
11256 if (split_stack_fn_large == NULL_RTX)
11257 split_stack_fn_large =
11258 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11259
11260 if (ix86_cmodel == CM_LARGE_PIC)
11261 {
11262 rtx label, x;
11263
11264 label = gen_label_rtx ();
11265 emit_label (label);
11266 LABEL_PRESERVE_P (label) = 1;
11267 emit_insn (gen_set_rip_rex64 (reg10, label));
11268 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11269 emit_insn (gen_adddi3 (reg10, reg10, reg11));
11270 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11271 UNSPEC_GOT);
11272 x = gen_rtx_CONST (Pmode, x);
11273 emit_move_insn (reg11, x);
11274 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11275 x = gen_const_mem (Pmode, x);
11276 emit_move_insn (reg11, x);
11277 }
11278 else
11279 emit_move_insn (reg11, split_stack_fn_large);
11280
11281 fn = reg11;
11282
11283 argval = ((args_size << 16) << 16) + allocate;
11284 emit_move_insn (reg10, GEN_INT (argval));
11285 }
11286 else
11287 {
11288 emit_move_insn (reg10, allocate_rtx);
11289 emit_move_insn (reg11, GEN_INT (args_size));
11290 use_reg (&call_fusage, reg11);
11291 }
11292
11293 use_reg (&call_fusage, reg10);
11294 }
11295 else
11296 {
11297 emit_insn (gen_push (GEN_INT (args_size)));
11298 emit_insn (gen_push (allocate_rtx));
11299 }
11300 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11301 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11302 NULL_RTX, false);
11303 add_function_usage_to (call_insn, call_fusage);
11304
11305 /* In order to make call/return prediction work right, we now need
11306 to execute a return instruction. See
11307 libgcc/config/i386/morestack.S for the details on how this works.
11308
11309 For flow purposes gcc must not see this as a return
11310 instruction--we need control flow to continue at the subsequent
11311 label. Therefore, we use an unspec. */
11312 gcc_assert (crtl->args.pops_args < 65536);
11313 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11314
11315 /* If we are in 64-bit mode and this function uses a static chain,
11316 we saved %r10 in %rax before calling __morestack. */
11317 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11318 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11319 gen_rtx_REG (word_mode, AX_REG));
11320
11321 /* If this function calls va_start, we need to store a pointer to
11322 the arguments on the old stack, because they may not have been
11323 all copied to the new stack. At this point the old stack can be
11324 found at the frame pointer value used by __morestack, because
11325 __morestack has set that up before calling back to us. Here we
11326 store that pointer in a scratch register, and in
11327 ix86_expand_prologue we store the scratch register in a stack
11328 slot. */
11329 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11330 {
11331 unsigned int scratch_regno;
11332 rtx frame_reg;
11333 int words;
11334
11335 scratch_regno = split_stack_prologue_scratch_regno ();
11336 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11337 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11338
11339 /* 64-bit:
11340 fp -> old fp value
11341 return address within this function
11342 return address of caller of this function
11343 stack arguments
11344 So we add three words to get to the stack arguments.
11345
11346 32-bit:
11347 fp -> old fp value
11348 return address within this function
11349 first argument to __morestack
11350 second argument to __morestack
11351 return address of caller of this function
11352 stack arguments
11353 So we add five words to get to the stack arguments.
11354 */
11355 words = TARGET_64BIT ? 3 : 5;
11356 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11357 gen_rtx_PLUS (Pmode, frame_reg,
11358 GEN_INT (words * UNITS_PER_WORD))));
11359
11360 varargs_label = gen_label_rtx ();
11361 emit_jump_insn (gen_jump (varargs_label));
11362 JUMP_LABEL (get_last_insn ()) = varargs_label;
11363
11364 emit_barrier ();
11365 }
11366
11367 emit_label (label);
11368 LABEL_NUSES (label) = 1;
11369
11370 /* If this function calls va_start, we now have to set the scratch
11371 register for the case where we do not call __morestack. In this
11372 case we need to set it based on the stack pointer. */
11373 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11374 {
11375 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11376 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11377 GEN_INT (UNITS_PER_WORD))));
11378
11379 emit_label (varargs_label);
11380 LABEL_NUSES (varargs_label) = 1;
11381 }
11382 }
11383
11384 /* We may have to tell the dataflow pass that the split stack prologue
11385 is initializing a scratch register. */
11386
11387 static void
11388 ix86_live_on_entry (bitmap regs)
11389 {
11390 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11391 {
11392 gcc_assert (flag_split_stack);
11393 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11394 }
11395 }
11396 \f
11397 /* Determine if OP is a suitable SUBREG RTX for an address. */
11398
11399 static bool
11400 ix86_address_subreg_operand (rtx op)
11401 {
11402 enum machine_mode mode;
11403
11404 if (!REG_P (op))
11405 return false;
11406
11407 mode = GET_MODE (op);
11408
11409 if (GET_MODE_CLASS (mode) != MODE_INT)
11410 return false;
11411
11412 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11413 failures when the register is one word out of a two word structure. */
11414 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11415 return false;
11416
11417 /* Allow only SUBREGs of non-eliminable hard registers. */
11418 return register_no_elim_operand (op, mode);
11419 }
11420
11421 /* Extract the parts of an RTL expression that is a valid memory address
11422 for an instruction. Return 0 if the structure of the address is
11423 grossly off. Return -1 if the address contains ASHIFT, so it is not
11424 strictly valid, but still used for computing length of lea instruction. */
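/* For illustration (an example, not an exhaustive description): an address
   such as

     (plus:SI (plus:SI (mult:SI (reg:SI %esi) (const_int 4))
		       (reg:SI %ebx))
	      (const_int 12))

   decomposes into base = %ebx, index = %esi, scale = 4, disp = 12, i.e. the
   operand of an instruction like "movl 12(%ebx,%esi,4), %eax".  */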
11425
11426 int
11427 ix86_decompose_address (rtx addr, struct ix86_address *out)
11428 {
11429 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11430 rtx base_reg, index_reg;
11431 HOST_WIDE_INT scale = 1;
11432 rtx scale_rtx = NULL_RTX;
11433 rtx tmp;
11434 int retval = 1;
11435 enum ix86_address_seg seg = SEG_DEFAULT;
11436
11437 /* Allow zero-extended SImode addresses,
11438 they will be emitted with addr32 prefix. */
11439 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11440 {
11441 if (GET_CODE (addr) == ZERO_EXTEND
11442 && GET_MODE (XEXP (addr, 0)) == SImode)
11443 addr = XEXP (addr, 0);
11444 else if (GET_CODE (addr) == AND
11445 && const_32bit_mask (XEXP (addr, 1), DImode))
11446 {
11447 addr = XEXP (addr, 0);
11448
11449 /* Adjust SUBREGs. */
11450 if (GET_CODE (addr) == SUBREG
11451 && GET_MODE (SUBREG_REG (addr)) == SImode)
11452 addr = SUBREG_REG (addr);
11453 else if (GET_MODE (addr) == DImode)
11454 addr = gen_rtx_SUBREG (SImode, addr, 0);
11455 else
11456 return 0;
11457 }
11458 }
11459
11460 if (REG_P (addr))
11461 base = addr;
11462 else if (GET_CODE (addr) == SUBREG)
11463 {
11464 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11465 base = addr;
11466 else
11467 return 0;
11468 }
11469 else if (GET_CODE (addr) == PLUS)
11470 {
11471 rtx addends[4], op;
11472 int n = 0, i;
11473
11474 op = addr;
11475 do
11476 {
11477 if (n >= 4)
11478 return 0;
11479 addends[n++] = XEXP (op, 1);
11480 op = XEXP (op, 0);
11481 }
11482 while (GET_CODE (op) == PLUS);
11483 if (n >= 4)
11484 return 0;
11485 addends[n] = op;
11486
11487 for (i = n; i >= 0; --i)
11488 {
11489 op = addends[i];
11490 switch (GET_CODE (op))
11491 {
11492 case MULT:
11493 if (index)
11494 return 0;
11495 index = XEXP (op, 0);
11496 scale_rtx = XEXP (op, 1);
11497 break;
11498
11499 case ASHIFT:
11500 if (index)
11501 return 0;
11502 index = XEXP (op, 0);
11503 tmp = XEXP (op, 1);
11504 if (!CONST_INT_P (tmp))
11505 return 0;
11506 scale = INTVAL (tmp);
11507 if ((unsigned HOST_WIDE_INT) scale > 3)
11508 return 0;
11509 scale = 1 << scale;
11510 break;
11511
11512 case UNSPEC:
11513 if (XINT (op, 1) == UNSPEC_TP
11514 && TARGET_TLS_DIRECT_SEG_REFS
11515 && seg == SEG_DEFAULT)
11516 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11517 else
11518 return 0;
11519 break;
11520
11521 case SUBREG:
11522 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11523 return 0;
11524 /* FALLTHRU */
11525
11526 case REG:
11527 if (!base)
11528 base = op;
11529 else if (!index)
11530 index = op;
11531 else
11532 return 0;
11533 break;
11534
11535 case CONST:
11536 case CONST_INT:
11537 case SYMBOL_REF:
11538 case LABEL_REF:
11539 if (disp)
11540 return 0;
11541 disp = op;
11542 break;
11543
11544 default:
11545 return 0;
11546 }
11547 }
11548 }
11549 else if (GET_CODE (addr) == MULT)
11550 {
11551 index = XEXP (addr, 0); /* index*scale */
11552 scale_rtx = XEXP (addr, 1);
11553 }
11554 else if (GET_CODE (addr) == ASHIFT)
11555 {
11556 /* We're called for lea too, which implements ashift on occasion. */
11557 index = XEXP (addr, 0);
11558 tmp = XEXP (addr, 1);
11559 if (!CONST_INT_P (tmp))
11560 return 0;
11561 scale = INTVAL (tmp);
11562 if ((unsigned HOST_WIDE_INT) scale > 3)
11563 return 0;
11564 scale = 1 << scale;
11565 retval = -1;
11566 }
11567 else
11568 disp = addr; /* displacement */
11569
11570 if (index)
11571 {
11572 if (REG_P (index))
11573 ;
11574 else if (GET_CODE (index) == SUBREG
11575 && ix86_address_subreg_operand (SUBREG_REG (index)))
11576 ;
11577 else
11578 return 0;
11579 }
11580
11581 /* Address override works only on the (%reg) part of %fs:(%reg). */
11582 if (seg != SEG_DEFAULT
11583 && ((base && GET_MODE (base) != word_mode)
11584 || (index && GET_MODE (index) != word_mode)))
11585 return 0;
11586
11587 /* Extract the integral value of scale. */
11588 if (scale_rtx)
11589 {
11590 if (!CONST_INT_P (scale_rtx))
11591 return 0;
11592 scale = INTVAL (scale_rtx);
11593 }
11594
11595 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11596 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11597
11598 /* Avoid useless 0 displacement. */
11599 if (disp == const0_rtx && (base || index))
11600 disp = NULL_RTX;
11601
11602 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11603 if (base_reg && index_reg && scale == 1
11604 && (index_reg == arg_pointer_rtx
11605 || index_reg == frame_pointer_rtx
11606 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11607 {
11608 rtx tmp;
11609 tmp = base, base = index, index = tmp;
11610 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11611 }
11612
11613 /* Special case: %ebp cannot be encoded as a base without a displacement.
11614 Similarly %r13. */
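  /* Background, for illustration: in the ModR/M / SIB encoding, mod = 00
     with a base field of 101 does not mean "(%ebp)" but "disp32 only"
     (RIP-relative in 64-bit mode), so "(%ebp)" and "(%r13)" have to be
     emitted as "0(%ebp)" / "0(%r13)" with an explicit zero displacement.  */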
11615 if (!disp
11616 && base_reg
11617 && (base_reg == hard_frame_pointer_rtx
11618 || base_reg == frame_pointer_rtx
11619 || base_reg == arg_pointer_rtx
11620 || (REG_P (base_reg)
11621 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11622 || REGNO (base_reg) == R13_REG))))
11623 disp = const0_rtx;
11624
11625 /* Special case: on K6, [%esi] forces the instruction to be vector decoded.
11626 Avoid this by transforming to [%esi+0].
11627 Reload calls address legitimization without cfun defined, so we need
11628 to test cfun for being non-NULL. */
11629 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11630 && base_reg && !index_reg && !disp
11631 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11632 disp = const0_rtx;
11633
11634 /* Special case: encode reg+reg instead of reg*2. */
11635 if (!base && index && scale == 2)
11636 base = index, base_reg = index_reg, scale = 1;
11637
11638 /* Special case: scaling cannot be encoded without base or displacement. */
11639 if (!base && !disp && index && scale != 1)
11640 disp = const0_rtx;
11641
11642 out->base = base;
11643 out->index = index;
11644 out->disp = disp;
11645 out->scale = scale;
11646 out->seg = seg;
11647
11648 return retval;
11649 }
11650 \f
11651 /* Return cost of the memory address x.
11652 For i386, it is better to use a complex address than let gcc copy
11653 the address into a reg and make a new pseudo. But not if the address
11654 requires two regs - that would mean more pseudos with longer
11655 lifetimes. */
11656 static int
11657 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11658 {
11659 struct ix86_address parts;
11660 int cost = 1;
11661 int ok = ix86_decompose_address (x, &parts);
11662
11663 gcc_assert (ok);
11664
11665 if (parts.base && GET_CODE (parts.base) == SUBREG)
11666 parts.base = SUBREG_REG (parts.base);
11667 if (parts.index && GET_CODE (parts.index) == SUBREG)
11668 parts.index = SUBREG_REG (parts.index);
11669
11670 /* Attempt to minimize number of registers in the address. */
11671 if ((parts.base
11672 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11673 || (parts.index
11674 && (!REG_P (parts.index)
11675 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11676 cost++;
11677
11678 if (parts.base
11679 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11680 && parts.index
11681 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11682 && parts.base != parts.index)
11683 cost++;
11684
11685 /* The AMD-K6 doesn't like addresses with the ModR/M byte set to 00_xxx_100b,
11686 since its predecode logic can't detect the length of such instructions
11687 and decoding degenerates to the vector decoder. Increase the cost of such
11688 addresses here. The penalty is at least 2 cycles. It may be worthwhile
11689 to split such addresses or even to refuse them entirely.
11690
11691 The following addressing modes are affected:
11692 [base+scale*index]
11693 [scale*index+disp]
11694 [base+index]
11695
11696 The first and last cases may be avoidable by explicitly coding a zero
11697 displacement into the memory address, but I don't have an AMD-K6 machine
11698 handy to check this theory. */
11699
11700 if (TARGET_K6
11701 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11702 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11703 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11704 cost += 10;
11705
11706 return cost;
11707 }
11708 \f
11709 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11710 this is used to form addresses to local data when -fPIC is in
11711 use. */
11712
11713 static bool
11714 darwin_local_data_pic (rtx disp)
11715 {
11716 return (GET_CODE (disp) == UNSPEC
11717 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11718 }
11719
11720 /* Determine if a given RTX is a valid constant. We already know this
11721 satisfies CONSTANT_P. */
11722
11723 static bool
11724 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11725 {
11726 switch (GET_CODE (x))
11727 {
11728 case CONST:
11729 x = XEXP (x, 0);
11730
11731 if (GET_CODE (x) == PLUS)
11732 {
11733 if (!CONST_INT_P (XEXP (x, 1)))
11734 return false;
11735 x = XEXP (x, 0);
11736 }
11737
11738 if (TARGET_MACHO && darwin_local_data_pic (x))
11739 return true;
11740
11741 /* Only some unspecs are valid as "constants". */
11742 if (GET_CODE (x) == UNSPEC)
11743 switch (XINT (x, 1))
11744 {
11745 case UNSPEC_GOT:
11746 case UNSPEC_GOTOFF:
11747 case UNSPEC_PLTOFF:
11748 return TARGET_64BIT;
11749 case UNSPEC_TPOFF:
11750 case UNSPEC_NTPOFF:
11751 x = XVECEXP (x, 0, 0);
11752 return (GET_CODE (x) == SYMBOL_REF
11753 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11754 case UNSPEC_DTPOFF:
11755 x = XVECEXP (x, 0, 0);
11756 return (GET_CODE (x) == SYMBOL_REF
11757 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11758 default:
11759 return false;
11760 }
11761
11762 /* We must have drilled down to a symbol. */
11763 if (GET_CODE (x) == LABEL_REF)
11764 return true;
11765 if (GET_CODE (x) != SYMBOL_REF)
11766 return false;
11767 /* FALLTHRU */
11768
11769 case SYMBOL_REF:
11770 /* TLS symbols are never valid. */
11771 if (SYMBOL_REF_TLS_MODEL (x))
11772 return false;
11773
11774 /* DLLIMPORT symbols are never valid. */
11775 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11776 && SYMBOL_REF_DLLIMPORT_P (x))
11777 return false;
11778
11779 #if TARGET_MACHO
11780 /* mdynamic-no-pic */
11781 if (MACHO_DYNAMIC_NO_PIC_P)
11782 return machopic_symbol_defined_p (x);
11783 #endif
11784 break;
11785
11786 case CONST_DOUBLE:
11787 if (GET_MODE (x) == TImode
11788 && x != CONST0_RTX (TImode)
11789 && !TARGET_64BIT)
11790 return false;
11791 break;
11792
11793 case CONST_VECTOR:
11794 if (!standard_sse_constant_p (x))
11795 return false;
11796
11797 default:
11798 break;
11799 }
11800
11801 /* Otherwise we handle everything else in the move patterns. */
11802 return true;
11803 }
11804
11805 /* Determine if it's legal to put X into the constant pool. This
11806 is not possible for the address of thread-local symbols, which
11807 is checked above. */
11808
11809 static bool
11810 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11811 {
11812 /* We can always put integral constants and vectors in memory. */
11813 switch (GET_CODE (x))
11814 {
11815 case CONST_INT:
11816 case CONST_DOUBLE:
11817 case CONST_VECTOR:
11818 return false;
11819
11820 default:
11821 break;
11822 }
11823 return !ix86_legitimate_constant_p (mode, x);
11824 }
11825
11826
11827 /* Nonzero if the constant value X is a legitimate general operand
11828 when generating PIC code. It is given that flag_pic is on and
11829 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11830
11831 bool
11832 legitimate_pic_operand_p (rtx x)
11833 {
11834 rtx inner;
11835
11836 switch (GET_CODE (x))
11837 {
11838 case CONST:
11839 inner = XEXP (x, 0);
11840 if (GET_CODE (inner) == PLUS
11841 && CONST_INT_P (XEXP (inner, 1)))
11842 inner = XEXP (inner, 0);
11843
11844 /* Only some unspecs are valid as "constants". */
11845 if (GET_CODE (inner) == UNSPEC)
11846 switch (XINT (inner, 1))
11847 {
11848 case UNSPEC_GOT:
11849 case UNSPEC_GOTOFF:
11850 case UNSPEC_PLTOFF:
11851 return TARGET_64BIT;
11852 case UNSPEC_TPOFF:
11853 x = XVECEXP (inner, 0, 0);
11854 return (GET_CODE (x) == SYMBOL_REF
11855 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11856 case UNSPEC_MACHOPIC_OFFSET:
11857 return legitimate_pic_address_disp_p (x);
11858 default:
11859 return false;
11860 }
11861 /* FALLTHRU */
11862
11863 case SYMBOL_REF:
11864 case LABEL_REF:
11865 return legitimate_pic_address_disp_p (x);
11866
11867 default:
11868 return true;
11869 }
11870 }
11871
11872 /* Determine if a given CONST RTX is a valid memory displacement
11873 in PIC mode. */
11874
11875 bool
11876 legitimate_pic_address_disp_p (rtx disp)
11877 {
11878 bool saw_plus;
11879
11880 /* In 64bit mode we can allow direct addresses of symbols and labels
11881 when they are not dynamic symbols. */
11882 if (TARGET_64BIT)
11883 {
11884 rtx op0 = disp, op1;
11885
11886 switch (GET_CODE (disp))
11887 {
11888 case LABEL_REF:
11889 return true;
11890
11891 case CONST:
11892 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11893 break;
11894 op0 = XEXP (XEXP (disp, 0), 0);
11895 op1 = XEXP (XEXP (disp, 0), 1);
11896 if (!CONST_INT_P (op1)
11897 || INTVAL (op1) >= 16*1024*1024
11898 || INTVAL (op1) < -16*1024*1024)
11899 break;
11900 if (GET_CODE (op0) == LABEL_REF)
11901 return true;
11902 if (GET_CODE (op0) == CONST
11903 && GET_CODE (XEXP (op0, 0)) == UNSPEC
11904 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
11905 return true;
11906 if (GET_CODE (op0) == UNSPEC
11907 && XINT (op0, 1) == UNSPEC_PCREL)
11908 return true;
11909 if (GET_CODE (op0) != SYMBOL_REF)
11910 break;
11911 /* FALLTHRU */
11912
11913 case SYMBOL_REF:
11914 /* TLS references should always be enclosed in UNSPEC. */
11915 if (SYMBOL_REF_TLS_MODEL (op0))
11916 return false;
11917 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11918 && ix86_cmodel != CM_LARGE_PIC)
11919 return true;
11920 break;
11921
11922 default:
11923 break;
11924 }
11925 }
11926 if (GET_CODE (disp) != CONST)
11927 return false;
11928 disp = XEXP (disp, 0);
11929
11930 if (TARGET_64BIT)
11931 {
11932 /* It is unsafe to allow PLUS expressions; this limits the allowed
11933 displacement distance into GOT tables. We should not need these anyway. */
11934 if (GET_CODE (disp) != UNSPEC
11935 || (XINT (disp, 1) != UNSPEC_GOTPCREL
11936 && XINT (disp, 1) != UNSPEC_GOTOFF
11937 && XINT (disp, 1) != UNSPEC_PCREL
11938 && XINT (disp, 1) != UNSPEC_PLTOFF))
11939 return false;
11940
11941 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11942 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11943 return false;
11944 return true;
11945 }
11946
11947 saw_plus = false;
11948 if (GET_CODE (disp) == PLUS)
11949 {
11950 if (!CONST_INT_P (XEXP (disp, 1)))
11951 return false;
11952 disp = XEXP (disp, 0);
11953 saw_plus = true;
11954 }
11955
11956 if (TARGET_MACHO && darwin_local_data_pic (disp))
11957 return true;
11958
11959 if (GET_CODE (disp) != UNSPEC)
11960 return false;
11961
11962 switch (XINT (disp, 1))
11963 {
11964 case UNSPEC_GOT:
11965 if (saw_plus)
11966 return false;
11967 /* We need to check for both symbols and labels because VxWorks loads
11968 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
11969 details. */
11970 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11971 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
11972 case UNSPEC_GOTOFF:
11973 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
11974 While the ABI also specifies a 32bit relocation, we don't produce it in
11975 the small PIC model at all. */
11976 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11977 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
11978 && !TARGET_64BIT)
11979 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
11980 return false;
11981 case UNSPEC_GOTTPOFF:
11982 case UNSPEC_GOTNTPOFF:
11983 case UNSPEC_INDNTPOFF:
11984 if (saw_plus)
11985 return false;
11986 disp = XVECEXP (disp, 0, 0);
11987 return (GET_CODE (disp) == SYMBOL_REF
11988 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
11989 case UNSPEC_NTPOFF:
11990 disp = XVECEXP (disp, 0, 0);
11991 return (GET_CODE (disp) == SYMBOL_REF
11992 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
11993 case UNSPEC_DTPOFF:
11994 disp = XVECEXP (disp, 0, 0);
11995 return (GET_CODE (disp) == SYMBOL_REF
11996 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
11997 }
11998
11999 return false;
12000 }
12001
12002 /* Recognizes RTL expressions that are valid memory addresses for an
12003 instruction. The MODE argument is the machine mode for the MEM
12004 expression that wants to use this address.
12005
12006 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12007 convert common non-canonical forms to canonical form so that they will
12008 be recognized. */
12009
12010 static bool
12011 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12012 rtx addr, bool strict)
12013 {
12014 struct ix86_address parts;
12015 rtx base, index, disp;
12016 HOST_WIDE_INT scale;
12017
12018 /* Since a constant address in x32 is sign-extended to 64bit,
12019 we have to reject addresses from 0x80000000 to 0xffffffff. */
12020 if (TARGET_X32
12021 && CONST_INT_P (addr)
12022 && INTVAL (addr) < 0)
12023 return false;
12024
12025 if (ix86_decompose_address (addr, &parts) <= 0)
12026 /* Decomposition failed. */
12027 return false;
12028
12029 base = parts.base;
12030 index = parts.index;
12031 disp = parts.disp;
12032 scale = parts.scale;
12033
12034 /* Validate base register. */
12035 if (base)
12036 {
12037 rtx reg;
12038
12039 if (REG_P (base))
12040 reg = base;
12041 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12042 reg = SUBREG_REG (base);
12043 else
12044 /* Base is not a register. */
12045 return false;
12046
12047 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12048 return false;
12049
12050 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12051 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12052 /* Base is not valid. */
12053 return false;
12054 }
12055
12056 /* Validate index register. */
12057 if (index)
12058 {
12059 rtx reg;
12060
12061 if (REG_P (index))
12062 reg = index;
12063 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12064 reg = SUBREG_REG (index);
12065 else
12066 /* Index is not a register. */
12067 return false;
12068
12069 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12070 return false;
12071
12072 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12073 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12074 /* Index is not valid. */
12075 return false;
12076 }
12077
12078 /* Index and base should have the same mode. */
12079 if (base && index
12080 && GET_MODE (base) != GET_MODE (index))
12081 return false;
12082
12083 /* Validate scale factor. */
12084 if (scale != 1)
12085 {
12086 if (!index)
12087 /* Scale without index. */
12088 return false;
12089
12090 if (scale != 2 && scale != 4 && scale != 8)
12091 /* Scale is not a valid multiplier. */
12092 return false;
12093 }
12094
12095 /* Validate displacement. */
12096 if (disp)
12097 {
12098 if (GET_CODE (disp) == CONST
12099 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12100 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12101 switch (XINT (XEXP (disp, 0), 1))
12102 {
12103 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12104 used. While the ABI also specifies 32bit relocations, we don't produce
12105 them at all and use IP-relative addressing instead. */
12106 case UNSPEC_GOT:
12107 case UNSPEC_GOTOFF:
12108 gcc_assert (flag_pic);
12109 if (!TARGET_64BIT)
12110 goto is_legitimate_pic;
12111
12112 /* 64bit address unspec. */
12113 return false;
12114
12115 case UNSPEC_GOTPCREL:
12116 case UNSPEC_PCREL:
12117 gcc_assert (flag_pic);
12118 goto is_legitimate_pic;
12119
12120 case UNSPEC_GOTTPOFF:
12121 case UNSPEC_GOTNTPOFF:
12122 case UNSPEC_INDNTPOFF:
12123 case UNSPEC_NTPOFF:
12124 case UNSPEC_DTPOFF:
12125 break;
12126
12127 case UNSPEC_STACK_CHECK:
12128 gcc_assert (flag_split_stack);
12129 break;
12130
12131 default:
12132 /* Invalid address unspec. */
12133 return false;
12134 }
12135
12136 else if (SYMBOLIC_CONST (disp)
12137 && (flag_pic
12138 || (TARGET_MACHO
12139 #if TARGET_MACHO
12140 && MACHOPIC_INDIRECT
12141 && !machopic_operand_p (disp)
12142 #endif
12143 )))
12144 {
12145
12146 is_legitimate_pic:
12147 if (TARGET_64BIT && (index || base))
12148 {
12149 /* foo@dtpoff(%rX) is ok. */
12150 if (GET_CODE (disp) != CONST
12151 || GET_CODE (XEXP (disp, 0)) != PLUS
12152 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12153 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12154 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12155 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12156 /* Non-constant pic memory reference. */
12157 return false;
12158 }
12159 else if ((!TARGET_MACHO || flag_pic)
12160 && ! legitimate_pic_address_disp_p (disp))
12161 /* Displacement is an invalid pic construct. */
12162 return false;
12163 #if TARGET_MACHO
12164 else if (MACHO_DYNAMIC_NO_PIC_P
12165 && !ix86_legitimate_constant_p (Pmode, disp))
12166 /* Displacement must be referenced via non_lazy_pointer. */
12167 return false;
12168 #endif
12169
12170 /* This code used to verify that a symbolic pic displacement
12171 includes the pic_offset_table_rtx register.
12172
12173 While this is a good idea, unfortunately these constructs may
12174 be created by "adds using lea" optimization for incorrect
12175 code like:
12176
12177 int a;
12178 int foo(int i)
12179 {
12180 return *(&a+i);
12181 }
12182
12183 This code is nonsensical, but results in addressing the
12184 GOT table with a pic_offset_table_rtx base. We can't
12185 just refuse it easily, since it gets matched by the
12186 "addsi3" pattern, which later gets split to lea in case
12187 the output register differs from the input. While this
12188 could be handled by a separate addsi pattern for this case
12189 that never results in lea, disabling this test seems to be
12190 the easier and correct fix for the crash. */
12191 }
12192 else if (GET_CODE (disp) != LABEL_REF
12193 && !CONST_INT_P (disp)
12194 && (GET_CODE (disp) != CONST
12195 || !ix86_legitimate_constant_p (Pmode, disp))
12196 && (GET_CODE (disp) != SYMBOL_REF
12197 || !ix86_legitimate_constant_p (Pmode, disp)))
12198 /* Displacement is not constant. */
12199 return false;
12200 else if (TARGET_64BIT
12201 && !x86_64_immediate_operand (disp, VOIDmode))
12202 /* Displacement is out of range. */
12203 return false;
12204 }
12205
12206 /* Everything looks valid. */
12207 return true;
12208 }
12209
12210 /* Determine if a given RTX is a valid constant address. */
12211
12212 bool
12213 constant_address_p (rtx x)
12214 {
12215 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12216 }
12217 \f
12218 /* Return a unique alias set for the GOT. */
12219
12220 static alias_set_type
12221 ix86_GOT_alias_set (void)
12222 {
12223 static alias_set_type set = -1;
12224 if (set == -1)
12225 set = new_alias_set ();
12226 return set;
12227 }
12228
12229 /* Return a legitimate reference for ORIG (an address) using the
12230 register REG. If REG is 0, a new pseudo is generated.
12231
12232 There are two types of references that must be handled:
12233
12234 1. Global data references must load the address from the GOT, via
12235 the PIC reg. An insn is emitted to do this load, and the reg is
12236 returned.
12237
12238 2. Static data references, constant pool addresses, and code labels
12239 compute the address as an offset from the GOT, whose base is in
12240 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12241 differentiate them from global data objects. The returned
12242 address is the PIC reg + an unspec constant.
12243
12244 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12245 reg also appears in the address. */
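/* Illustrative sketch added for clarity (symbol names "foo" and "bar"
   are hypothetical, not taken from this file): on ia32 with -fpic the
   two cases above roughly correspond to

     global data:  (mem (plus %ebx (const (unspec [foo] UNSPEC_GOT))))
                   -> movl foo@GOT(%ebx), %reg
     static data:  (plus %ebx (const (unspec [bar] UNSPEC_GOTOFF)))
                   -> leal bar@GOTOFF(%ebx), %reg

   where %ebx holds the PIC register.  This is a sketch of the shapes
   built below, not a complete enumeration.  */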
12246
12247 static rtx
12248 legitimize_pic_address (rtx orig, rtx reg)
12249 {
12250 rtx addr = orig;
12251 rtx new_rtx = orig;
12252 rtx base;
12253
12254 #if TARGET_MACHO
12255 if (TARGET_MACHO && !TARGET_64BIT)
12256 {
12257 if (reg == 0)
12258 reg = gen_reg_rtx (Pmode);
12259 /* Use the generic Mach-O PIC machinery. */
12260 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12261 }
12262 #endif
12263
12264 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12265 new_rtx = addr;
12266 else if (TARGET_64BIT
12267 && ix86_cmodel != CM_SMALL_PIC
12268 && gotoff_operand (addr, Pmode))
12269 {
12270 rtx tmpreg;
12271 /* This symbol may be referenced via a displacement from the PIC
12272 base address (@GOTOFF). */
12273
12274 if (reload_in_progress)
12275 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12276 if (GET_CODE (addr) == CONST)
12277 addr = XEXP (addr, 0);
12278 if (GET_CODE (addr) == PLUS)
12279 {
12280 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12281 UNSPEC_GOTOFF);
12282 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12283 }
12284 else
12285 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12286 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12287 if (!reg)
12288 tmpreg = gen_reg_rtx (Pmode);
12289 else
12290 tmpreg = reg;
12291 emit_move_insn (tmpreg, new_rtx);
12292
12293 if (reg != 0)
12294 {
12295 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12296 tmpreg, 1, OPTAB_DIRECT);
12297 new_rtx = reg;
12298 }
12299 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12300 }
12301 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12302 {
12303 /* This symbol may be referenced via a displacement from the PIC
12304 base address (@GOTOFF). */
12305
12306 if (reload_in_progress)
12307 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12308 if (GET_CODE (addr) == CONST)
12309 addr = XEXP (addr, 0);
12310 if (GET_CODE (addr) == PLUS)
12311 {
12312 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12313 UNSPEC_GOTOFF);
12314 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12315 }
12316 else
12317 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12318 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12319 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12320
12321 if (reg != 0)
12322 {
12323 emit_move_insn (reg, new_rtx);
12324 new_rtx = reg;
12325 }
12326 }
12327 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12328 /* We can't use @GOTOFF for text labels on VxWorks;
12329 see gotoff_operand. */
12330 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12331 {
12332 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12333 {
12334 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12335 return legitimize_dllimport_symbol (addr, true);
12336 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12337 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12338 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12339 {
12340 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12341 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12342 }
12343 }
12344
12345 /* For x64 PE-COFF there is no GOT table, so we use the address
12346 directly. */
12347 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12348 {
12349 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12350 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12351
12352 if (reg == 0)
12353 reg = gen_reg_rtx (Pmode);
12354 emit_move_insn (reg, new_rtx);
12355 new_rtx = reg;
12356 }
12357 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12358 {
12359 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12360 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12361 new_rtx = gen_const_mem (Pmode, new_rtx);
12362 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12363
12364 if (reg == 0)
12365 reg = gen_reg_rtx (Pmode);
12366 /* Use gen_movsi directly, otherwise the address is loaded into a
12367 register for CSE. We don't want to CSE these addresses; instead
12368 we CSE addresses from the GOT table, so skip this. */
12369 emit_insn (gen_movsi (reg, new_rtx));
12370 new_rtx = reg;
12371 }
12372 else
12373 {
12374 /* This symbol must be referenced via a load from the
12375 Global Offset Table (@GOT). */
12376
12377 if (reload_in_progress)
12378 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12379 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12380 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12381 if (TARGET_64BIT)
12382 new_rtx = force_reg (Pmode, new_rtx);
12383 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12384 new_rtx = gen_const_mem (Pmode, new_rtx);
12385 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12386
12387 if (reg == 0)
12388 reg = gen_reg_rtx (Pmode);
12389 emit_move_insn (reg, new_rtx);
12390 new_rtx = reg;
12391 }
12392 }
12393 else
12394 {
12395 if (CONST_INT_P (addr)
12396 && !x86_64_immediate_operand (addr, VOIDmode))
12397 {
12398 if (reg)
12399 {
12400 emit_move_insn (reg, addr);
12401 new_rtx = reg;
12402 }
12403 else
12404 new_rtx = force_reg (Pmode, addr);
12405 }
12406 else if (GET_CODE (addr) == CONST)
12407 {
12408 addr = XEXP (addr, 0);
12409
12410 /* We must match stuff we generate before. Assume the only
12411 unspecs that can get here are ours. Not that we could do
12412 anything with them anyway.... */
12413 if (GET_CODE (addr) == UNSPEC
12414 || (GET_CODE (addr) == PLUS
12415 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12416 return orig;
12417 gcc_assert (GET_CODE (addr) == PLUS);
12418 }
12419 if (GET_CODE (addr) == PLUS)
12420 {
12421 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12422
12423 /* Check first to see if this is a constant offset from a @GOTOFF
12424 symbol reference. */
12425 if (gotoff_operand (op0, Pmode)
12426 && CONST_INT_P (op1))
12427 {
12428 if (!TARGET_64BIT)
12429 {
12430 if (reload_in_progress)
12431 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12432 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12433 UNSPEC_GOTOFF);
12434 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12435 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12436 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12437
12438 if (reg != 0)
12439 {
12440 emit_move_insn (reg, new_rtx);
12441 new_rtx = reg;
12442 }
12443 }
12444 else
12445 {
12446 if (INTVAL (op1) < -16*1024*1024
12447 || INTVAL (op1) >= 16*1024*1024)
12448 {
12449 if (!x86_64_immediate_operand (op1, Pmode))
12450 op1 = force_reg (Pmode, op1);
12451 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12452 }
12453 }
12454 }
12455 else
12456 {
12457 base = legitimize_pic_address (XEXP (addr, 0), reg);
12458 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12459 base == reg ? NULL_RTX : reg);
12460
12461 if (CONST_INT_P (new_rtx))
12462 new_rtx = plus_constant (base, INTVAL (new_rtx));
12463 else
12464 {
12465 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12466 {
12467 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12468 new_rtx = XEXP (new_rtx, 1);
12469 }
12470 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12471 }
12472 }
12473 }
12474 }
12475 return new_rtx;
12476 }
12477 \f
12478 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12479
12480 static rtx
12481 get_thread_pointer (bool to_reg)
12482 {
12483 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12484
12485 if (GET_MODE (tp) != Pmode)
12486 tp = convert_to_mode (Pmode, tp, 1);
12487
12488 if (to_reg)
12489 tp = copy_addr_to_reg (tp);
12490
12491 return tp;
12492 }
12493
12494 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12495
12496 static GTY(()) rtx ix86_tls_symbol;
12497
12498 static rtx
12499 ix86_tls_get_addr (void)
12500 {
12501 if (!ix86_tls_symbol)
12502 {
12503 const char *sym
12504 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12505 ? "___tls_get_addr" : "__tls_get_addr");
12506
12507 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12508 }
12509
12510 return ix86_tls_symbol;
12511 }
12512
12513 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12514
12515 static GTY(()) rtx ix86_tls_module_base_symbol;
12516
12517 rtx
12518 ix86_tls_module_base (void)
12519 {
12520 if (!ix86_tls_module_base_symbol)
12521 {
12522 ix86_tls_module_base_symbol
12523 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12524
12525 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12526 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12527 }
12528
12529 return ix86_tls_module_base_symbol;
12530 }
12531
12532 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12533 false if we expect this to be used for a memory address and true if
12534 we expect to load the address into a register. */
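/* For orientation only (a sketch, not a specification; the exact
   sequences live in i386.md): with the classic non-GNU2 GNU TLS scheme
   on ia32, the global-dynamic case below expands via
   gen_tls_global_dynamic_32 to roughly

     leal  x@tlsgd(,%ebx,1), %eax
     call  ___tls_get_addr@PLT

   while the local-exec case reduces to a %gs-relative x@ntpoff access
   (or a subtraction from the thread pointer when !TARGET_ANY_GNU_TLS).  */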
12535
12536 static rtx
12537 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12538 {
12539 rtx dest, base, off;
12540 rtx pic = NULL_RTX, tp = NULL_RTX;
12541 int type;
12542
12543 switch (model)
12544 {
12545 case TLS_MODEL_GLOBAL_DYNAMIC:
12546 dest = gen_reg_rtx (Pmode);
12547
12548 if (!TARGET_64BIT)
12549 {
12550 if (flag_pic)
12551 pic = pic_offset_table_rtx;
12552 else
12553 {
12554 pic = gen_reg_rtx (Pmode);
12555 emit_insn (gen_set_got (pic));
12556 }
12557 }
12558
12559 if (TARGET_GNU2_TLS)
12560 {
12561 if (TARGET_64BIT)
12562 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12563 else
12564 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12565
12566 tp = get_thread_pointer (true);
12567 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12568
12569 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12570 }
12571 else
12572 {
12573 rtx caddr = ix86_tls_get_addr ();
12574
12575 if (TARGET_64BIT)
12576 {
12577 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12578
12579 start_sequence ();
12580 emit_call_insn (ix86_gen_tls_global_dynamic_64 (rax, x,
12581 caddr));
12582 insns = get_insns ();
12583 end_sequence ();
12584
12585 RTL_CONST_CALL_P (insns) = 1;
12586 emit_libcall_block (insns, dest, rax, x);
12587 }
12588 else
12589 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12590 }
12591 break;
12592
12593 case TLS_MODEL_LOCAL_DYNAMIC:
12594 base = gen_reg_rtx (Pmode);
12595
12596 if (!TARGET_64BIT)
12597 {
12598 if (flag_pic)
12599 pic = pic_offset_table_rtx;
12600 else
12601 {
12602 pic = gen_reg_rtx (Pmode);
12603 emit_insn (gen_set_got (pic));
12604 }
12605 }
12606
12607 if (TARGET_GNU2_TLS)
12608 {
12609 rtx tmp = ix86_tls_module_base ();
12610
12611 if (TARGET_64BIT)
12612 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12613 else
12614 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12615
12616 tp = get_thread_pointer (true);
12617 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12618 gen_rtx_MINUS (Pmode, tmp, tp));
12619 }
12620 else
12621 {
12622 rtx caddr = ix86_tls_get_addr ();
12623
12624 if (TARGET_64BIT)
12625 {
12626 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12627
12628 start_sequence ();
12629 emit_call_insn (ix86_gen_tls_local_dynamic_base_64 (rax,
12630 caddr));
12631 insns = get_insns ();
12632 end_sequence ();
12633
12634 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12635 share the LD_BASE result with other LD model accesses. */
12636 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12637 UNSPEC_TLS_LD_BASE);
12638
12639 RTL_CONST_CALL_P (insns) = 1;
12640 emit_libcall_block (insns, base, rax, eqv);
12641 }
12642 else
12643 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12644 }
12645
12646 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12647 off = gen_rtx_CONST (Pmode, off);
12648
12649 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12650
12651 if (TARGET_GNU2_TLS)
12652 {
12653 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12654
12655 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12656 }
12657 break;
12658
12659 case TLS_MODEL_INITIAL_EXEC:
12660 if (TARGET_64BIT)
12661 {
12662 if (TARGET_SUN_TLS)
12663 {
12664 /* The Sun linker took the AMD64 TLS spec literally
12665 and can only handle %rax as the destination of the
12666 initial-exec code sequence. */
12667
12668 dest = gen_reg_rtx (Pmode);
12669 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12670 return dest;
12671 }
12672 else if (Pmode == SImode)
12673 {
12674 /* Always generate
12675 movl %fs:0, %reg32
12676 addl xgottpoff(%rip), %reg32
12677 to support linker IE->LE optimization and avoid
12678 fs:(%reg32) as memory operand. */
12679 dest = gen_reg_rtx (Pmode);
12680 emit_insn (gen_tls_initial_exec_x32 (dest, x));
12681 return dest;
12682 }
12683
12684 pic = NULL;
12685 type = UNSPEC_GOTNTPOFF;
12686 }
12687 else if (flag_pic)
12688 {
12689 if (reload_in_progress)
12690 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12691 pic = pic_offset_table_rtx;
12692 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12693 }
12694 else if (!TARGET_ANY_GNU_TLS)
12695 {
12696 pic = gen_reg_rtx (Pmode);
12697 emit_insn (gen_set_got (pic));
12698 type = UNSPEC_GOTTPOFF;
12699 }
12700 else
12701 {
12702 pic = NULL;
12703 type = UNSPEC_INDNTPOFF;
12704 }
12705
12706 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
12707 off = gen_rtx_CONST (Pmode, off);
12708 if (pic)
12709 off = gen_rtx_PLUS (Pmode, pic, off);
12710 off = gen_const_mem (Pmode, off);
12711 set_mem_alias_set (off, ix86_GOT_alias_set ());
12712
12713 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12714 {
12715 base = get_thread_pointer (for_mov
12716 || !(TARGET_TLS_DIRECT_SEG_REFS
12717 && TARGET_TLS_INDIRECT_SEG_REFS));
12718 off = force_reg (Pmode, off);
12719 return gen_rtx_PLUS (Pmode, base, off);
12720 }
12721 else
12722 {
12723 base = get_thread_pointer (true);
12724 dest = gen_reg_rtx (Pmode);
12725 emit_insn (gen_subsi3 (dest, base, off));
12726 }
12727 break;
12728
12729 case TLS_MODEL_LOCAL_EXEC:
12730 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12731 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12732 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12733 off = gen_rtx_CONST (Pmode, off);
12734
12735 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12736 {
12737 base = get_thread_pointer (for_mov
12738 || !(TARGET_TLS_DIRECT_SEG_REFS
12739 && TARGET_TLS_INDIRECT_SEG_REFS));
12740 return gen_rtx_PLUS (Pmode, base, off);
12741 }
12742 else
12743 {
12744 base = get_thread_pointer (true);
12745 dest = gen_reg_rtx (Pmode);
12746 emit_insn (gen_subsi3 (dest, base, off));
12747 }
12748 break;
12749
12750 default:
12751 gcc_unreachable ();
12752 }
12753
12754 return dest;
12755 }
12756
12757 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12758 to symbol DECL. */
12759
12760 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12761 htab_t dllimport_map;
12762
12763 static tree
12764 get_dllimport_decl (tree decl)
12765 {
12766 struct tree_map *h, in;
12767 void **loc;
12768 const char *name;
12769 const char *prefix;
12770 size_t namelen, prefixlen;
12771 char *imp_name;
12772 tree to;
12773 rtx rtl;
12774
12775 if (!dllimport_map)
12776 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12777
12778 in.hash = htab_hash_pointer (decl);
12779 in.base.from = decl;
12780 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12781 h = (struct tree_map *) *loc;
12782 if (h)
12783 return h->to;
12784
12785 *loc = h = ggc_alloc_tree_map ();
12786 h->hash = in.hash;
12787 h->base.from = decl;
12788 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12789 VAR_DECL, NULL, ptr_type_node);
12790 DECL_ARTIFICIAL (to) = 1;
12791 DECL_IGNORED_P (to) = 1;
12792 DECL_EXTERNAL (to) = 1;
12793 TREE_READONLY (to) = 1;
12794
12795 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12796 name = targetm.strip_name_encoding (name);
12797 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12798 ? "*__imp_" : "*__imp__";
12799 namelen = strlen (name);
12800 prefixlen = strlen (prefix);
12801 imp_name = (char *) alloca (namelen + prefixlen + 1);
12802 memcpy (imp_name, prefix, prefixlen);
12803 memcpy (imp_name + prefixlen, name, namelen + 1);
12804
12805 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12806 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12807 SET_SYMBOL_REF_DECL (rtl, to);
12808 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12809
12810 rtl = gen_const_mem (Pmode, rtl);
12811 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12812
12813 SET_DECL_RTL (to, rtl);
12814 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12815
12816 return to;
12817 }
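/* Illustrative example (the variable name is hypothetical): for an
   external variable "bar" declared with dllimport, the decl built above
   gets the assembler name "__imp__bar" (or "__imp_bar" when there is no
   user label prefix or the name carries the fastcall prefix), and its
   DECL_RTL is a constant memory reference that loads the real address
   through that import pointer.  */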
12818
12819 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12820 true if we require the result be a register. */
12821
12822 static rtx
12823 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12824 {
12825 tree imp_decl;
12826 rtx x;
12827
12828 gcc_assert (SYMBOL_REF_DECL (symbol));
12829 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12830
12831 x = DECL_RTL (imp_decl);
12832 if (want_reg)
12833 x = force_reg (Pmode, x);
12834 return x;
12835 }
12836
12837 /* Try machine-dependent ways of modifying an illegitimate address
12838 to be legitimate. If we find one, return the new, valid address.
12839 This macro is used in only one place: `memory_address' in explow.c.
12840
12841 OLDX is the address as it was before break_out_memory_refs was called.
12842 In some cases it is useful to look at this to decide what needs to be done.
12843
12844 It is always safe for this macro to do nothing. It exists to recognize
12845 opportunities to optimize the output.
12846
12847 For the 80386, we handle X+REG by loading X into a register R and
12848 using R+REG. R will go in a general reg and indexing will be used.
12849 However, if REG is a broken-out memory address or multiplication,
12850 nothing needs to be done because REG can certainly go in a general reg.
12851
12852 When -fpic is used, special handling is needed for symbolic references.
12853 See comments by legitimize_pic_address in i386.c for details. */
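/* As a concrete, hypothetical example of the canonicalizations below:
   an address of the form (plus (ashift (reg) (const_int 2)) (reg)) is
   rewritten into (plus (mult (reg) (const_int 4)) (reg)), which matches
   the base + index*scale form that ix86_legitimate_address_p accepts.  */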
12854
12855 static rtx
12856 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12857 enum machine_mode mode)
12858 {
12859 int changed = 0;
12860 unsigned log;
12861
12862 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12863 if (log)
12864 return legitimize_tls_address (x, (enum tls_model) log, false);
12865 if (GET_CODE (x) == CONST
12866 && GET_CODE (XEXP (x, 0)) == PLUS
12867 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12868 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12869 {
12870 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12871 (enum tls_model) log, false);
12872 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12873 }
12874
12875 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12876 {
12877 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12878 return legitimize_dllimport_symbol (x, true);
12879 if (GET_CODE (x) == CONST
12880 && GET_CODE (XEXP (x, 0)) == PLUS
12881 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12882 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12883 {
12884 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12885 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12886 }
12887 }
12888
12889 if (flag_pic && SYMBOLIC_CONST (x))
12890 return legitimize_pic_address (x, 0);
12891
12892 #if TARGET_MACHO
12893 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12894 return machopic_indirect_data_reference (x, 0);
12895 #endif
12896
12897 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12898 if (GET_CODE (x) == ASHIFT
12899 && CONST_INT_P (XEXP (x, 1))
12900 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
12901 {
12902 changed = 1;
12903 log = INTVAL (XEXP (x, 1));
12904 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
12905 GEN_INT (1 << log));
12906 }
12907
12908 if (GET_CODE (x) == PLUS)
12909 {
12910 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12911
12912 if (GET_CODE (XEXP (x, 0)) == ASHIFT
12913 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12914 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
12915 {
12916 changed = 1;
12917 log = INTVAL (XEXP (XEXP (x, 0), 1));
12918 XEXP (x, 0) = gen_rtx_MULT (Pmode,
12919 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
12920 GEN_INT (1 << log));
12921 }
12922
12923 if (GET_CODE (XEXP (x, 1)) == ASHIFT
12924 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
12925 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
12926 {
12927 changed = 1;
12928 log = INTVAL (XEXP (XEXP (x, 1), 1));
12929 XEXP (x, 1) = gen_rtx_MULT (Pmode,
12930 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
12931 GEN_INT (1 << log));
12932 }
12933
12934 /* Put multiply first if it isn't already. */
12935 if (GET_CODE (XEXP (x, 1)) == MULT)
12936 {
12937 rtx tmp = XEXP (x, 0);
12938 XEXP (x, 0) = XEXP (x, 1);
12939 XEXP (x, 1) = tmp;
12940 changed = 1;
12941 }
12942
12943 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
12944 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
12945 created by virtual register instantiation, register elimination, and
12946 similar optimizations. */
12947 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
12948 {
12949 changed = 1;
12950 x = gen_rtx_PLUS (Pmode,
12951 gen_rtx_PLUS (Pmode, XEXP (x, 0),
12952 XEXP (XEXP (x, 1), 0)),
12953 XEXP (XEXP (x, 1), 1));
12954 }
12955
12956 /* Canonicalize
12957 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
12958 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
12959 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
12960 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12961 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
12962 && CONSTANT_P (XEXP (x, 1)))
12963 {
12964 rtx constant;
12965 rtx other = NULL_RTX;
12966
12967 if (CONST_INT_P (XEXP (x, 1)))
12968 {
12969 constant = XEXP (x, 1);
12970 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
12971 }
12972 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
12973 {
12974 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
12975 other = XEXP (x, 1);
12976 }
12977 else
12978 constant = 0;
12979
12980 if (constant)
12981 {
12982 changed = 1;
12983 x = gen_rtx_PLUS (Pmode,
12984 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
12985 XEXP (XEXP (XEXP (x, 0), 1), 0)),
12986 plus_constant (other, INTVAL (constant)));
12987 }
12988 }
12989
12990 if (changed && ix86_legitimate_address_p (mode, x, false))
12991 return x;
12992
12993 if (GET_CODE (XEXP (x, 0)) == MULT)
12994 {
12995 changed = 1;
12996 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
12997 }
12998
12999 if (GET_CODE (XEXP (x, 1)) == MULT)
13000 {
13001 changed = 1;
13002 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13003 }
13004
13005 if (changed
13006 && REG_P (XEXP (x, 1))
13007 && REG_P (XEXP (x, 0)))
13008 return x;
13009
13010 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13011 {
13012 changed = 1;
13013 x = legitimize_pic_address (x, 0);
13014 }
13015
13016 if (changed && ix86_legitimate_address_p (mode, x, false))
13017 return x;
13018
13019 if (REG_P (XEXP (x, 0)))
13020 {
13021 rtx temp = gen_reg_rtx (Pmode);
13022 rtx val = force_operand (XEXP (x, 1), temp);
13023 if (val != temp)
13024 {
13025 if (GET_MODE (val) != Pmode)
13026 val = convert_to_mode (Pmode, val, 1);
13027 emit_move_insn (temp, val);
13028 }
13029
13030 XEXP (x, 1) = temp;
13031 return x;
13032 }
13033
13034 else if (REG_P (XEXP (x, 1)))
13035 {
13036 rtx temp = gen_reg_rtx (Pmode);
13037 rtx val = force_operand (XEXP (x, 0), temp);
13038 if (val != temp)
13039 {
13040 if (GET_MODE (val) != Pmode)
13041 val = convert_to_mode (Pmode, val, 1);
13042 emit_move_insn (temp, val);
13043 }
13044
13045 XEXP (x, 0) = temp;
13046 return x;
13047 }
13048 }
13049
13050 return x;
13051 }
13052 \f
13053 /* Print an integer constant expression in assembler syntax. Addition
13054 and subtraction are the only arithmetic that may appear in these
13055 expressions. FILE is the stdio stream to write to, X is the rtx, and
13056 CODE is the operand print code from the output string. */
13057
13058 static void
13059 output_pic_addr_const (FILE *file, rtx x, int code)
13060 {
13061 char buf[256];
13062
13063 switch (GET_CODE (x))
13064 {
13065 case PC:
13066 gcc_assert (flag_pic);
13067 putc ('.', file);
13068 break;
13069
13070 case SYMBOL_REF:
13071 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13072 output_addr_const (file, x);
13073 else
13074 {
13075 const char *name = XSTR (x, 0);
13076
13077 /* Mark the decl as referenced so that cgraph will
13078 output the function. */
13079 if (SYMBOL_REF_DECL (x))
13080 mark_decl_referenced (SYMBOL_REF_DECL (x));
13081
13082 #if TARGET_MACHO
13083 if (MACHOPIC_INDIRECT
13084 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13085 name = machopic_indirection_name (x, /*stub_p=*/true);
13086 #endif
13087 assemble_name (file, name);
13088 }
13089 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13090 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13091 fputs ("@PLT", file);
13092 break;
13093
13094 case LABEL_REF:
13095 x = XEXP (x, 0);
13096 /* FALLTHRU */
13097 case CODE_LABEL:
13098 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13099 assemble_name (asm_out_file, buf);
13100 break;
13101
13102 case CONST_INT:
13103 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13104 break;
13105
13106 case CONST:
13107 /* This used to output parentheses around the expression,
13108 but that does not work on the 386 (either ATT or BSD assembler). */
13109 output_pic_addr_const (file, XEXP (x, 0), code);
13110 break;
13111
13112 case CONST_DOUBLE:
13113 if (GET_MODE (x) == VOIDmode)
13114 {
13115 /* We can use %d if the number is <32 bits and positive. */
13116 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13117 fprintf (file, "0x%lx%08lx",
13118 (unsigned long) CONST_DOUBLE_HIGH (x),
13119 (unsigned long) CONST_DOUBLE_LOW (x));
13120 else
13121 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13122 }
13123 else
13124 /* We can't handle floating point constants;
13125 TARGET_PRINT_OPERAND must handle them. */
13126 output_operand_lossage ("floating constant misused");
13127 break;
13128
13129 case PLUS:
13130 /* Some assemblers need integer constants to appear first. */
13131 if (CONST_INT_P (XEXP (x, 0)))
13132 {
13133 output_pic_addr_const (file, XEXP (x, 0), code);
13134 putc ('+', file);
13135 output_pic_addr_const (file, XEXP (x, 1), code);
13136 }
13137 else
13138 {
13139 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13140 output_pic_addr_const (file, XEXP (x, 1), code);
13141 putc ('+', file);
13142 output_pic_addr_const (file, XEXP (x, 0), code);
13143 }
13144 break;
13145
13146 case MINUS:
13147 if (!TARGET_MACHO)
13148 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13149 output_pic_addr_const (file, XEXP (x, 0), code);
13150 putc ('-', file);
13151 output_pic_addr_const (file, XEXP (x, 1), code);
13152 if (!TARGET_MACHO)
13153 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13154 break;
13155
13156 case UNSPEC:
13157 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13158 {
13159 bool f = i386_asm_output_addr_const_extra (file, x);
13160 gcc_assert (f);
13161 break;
13162 }
13163
13164 gcc_assert (XVECLEN (x, 0) == 1);
13165 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13166 switch (XINT (x, 1))
13167 {
13168 case UNSPEC_GOT:
13169 fputs ("@GOT", file);
13170 break;
13171 case UNSPEC_GOTOFF:
13172 fputs ("@GOTOFF", file);
13173 break;
13174 case UNSPEC_PLTOFF:
13175 fputs ("@PLTOFF", file);
13176 break;
13177 case UNSPEC_PCREL:
13178 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13179 "(%rip)" : "[rip]", file);
13180 break;
13181 case UNSPEC_GOTPCREL:
13182 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13183 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13184 break;
13185 case UNSPEC_GOTTPOFF:
13186 /* FIXME: This might be @TPOFF in Sun ld too. */
13187 fputs ("@gottpoff", file);
13188 break;
13189 case UNSPEC_TPOFF:
13190 fputs ("@tpoff", file);
13191 break;
13192 case UNSPEC_NTPOFF:
13193 if (TARGET_64BIT)
13194 fputs ("@tpoff", file);
13195 else
13196 fputs ("@ntpoff", file);
13197 break;
13198 case UNSPEC_DTPOFF:
13199 fputs ("@dtpoff", file);
13200 break;
13201 case UNSPEC_GOTNTPOFF:
13202 if (TARGET_64BIT)
13203 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13204 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13205 else
13206 fputs ("@gotntpoff", file);
13207 break;
13208 case UNSPEC_INDNTPOFF:
13209 fputs ("@indntpoff", file);
13210 break;
13211 #if TARGET_MACHO
13212 case UNSPEC_MACHOPIC_OFFSET:
13213 putc ('-', file);
13214 machopic_output_function_base_name (file);
13215 break;
13216 #endif
13217 default:
13218 output_operand_lossage ("invalid UNSPEC as operand");
13219 break;
13220 }
13221 break;
13222
13223 default:
13224 output_operand_lossage ("invalid expression as operand");
13225 }
13226 }
13227
13228 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13229 We need to emit DTP-relative relocations. */
13230
13231 static void ATTRIBUTE_UNUSED
13232 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13233 {
13234 fputs (ASM_LONG, file);
13235 output_addr_const (file, x);
13236 fputs ("@dtpoff", file);
13237 switch (size)
13238 {
13239 case 4:
13240 break;
13241 case 8:
13242 fputs (", 0", file);
13243 break;
13244 default:
13245 gcc_unreachable ();
13246 }
13247 }
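/* For example (assuming ASM_LONG expands to ".long" on the target at
   hand, and "foo" is a hypothetical symbol): SIZE == 4 emits
   ".long foo@dtpoff" and SIZE == 8 emits ".long foo@dtpoff, 0".  */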
13248
13249 /* Return true if X is a representation of the PIC register. This copes
13250 with calls from ix86_find_base_term, where the register might have
13251 been replaced by a cselib value. */
13252
13253 static bool
13254 ix86_pic_register_p (rtx x)
13255 {
13256 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13257 return (pic_offset_table_rtx
13258 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13259 else
13260 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13261 }
13262
13263 /* Helper function for ix86_delegitimize_address.
13264 Attempt to delegitimize TLS local-exec accesses. */
13265
13266 static rtx
13267 ix86_delegitimize_tls_address (rtx orig_x)
13268 {
13269 rtx x = orig_x, unspec;
13270 struct ix86_address addr;
13271
13272 if (!(TARGET_TLS_DIRECT_SEG_REFS
13273 && TARGET_TLS_INDIRECT_SEG_REFS))
13274 return orig_x;
13275 if (MEM_P (x))
13276 x = XEXP (x, 0);
13277 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13278 return orig_x;
13279 if (ix86_decompose_address (x, &addr) == 0
13280 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13281 || addr.disp == NULL_RTX
13282 || GET_CODE (addr.disp) != CONST)
13283 return orig_x;
13284 unspec = XEXP (addr.disp, 0);
13285 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13286 unspec = XEXP (unspec, 0);
13287 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13288 return orig_x;
13289 x = XVECEXP (unspec, 0, 0);
13290 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13291 if (unspec != XEXP (addr.disp, 0))
13292 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13293 if (addr.index)
13294 {
13295 rtx idx = addr.index;
13296 if (addr.scale != 1)
13297 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13298 x = gen_rtx_PLUS (Pmode, idx, x);
13299 }
13300 if (addr.base)
13301 x = gen_rtx_PLUS (Pmode, addr.base, x);
13302 if (MEM_P (orig_x))
13303 x = replace_equiv_address_nv (orig_x, x);
13304 return x;
13305 }
13306
13307 /* In the name of slightly smaller debug output, and to cater to
13308 general assembler lossage, recognize PIC+GOTOFF and turn it back
13309 into a direct symbol reference.
13310
13311 On Darwin, this is necessary to avoid a crash, because Darwin
13312 has a different PIC label for each routine but the DWARF debugging
13313 information is not associated with any particular routine, so it's
13314 necessary to remove references to the PIC label from RTL stored by
13315 the DWARF output code. */
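/* Sketch of the common ia32 case handled below ("foo" is just an
   illustrative symbol name): for an access such as
   "movl foo@GOTOFF(%ebx), %eax" the address is
   (plus %ebx (const (unspec [foo] UNSPEC_GOTOFF))), and delegitimizing
   recovers the plain SYMBOL_REF "foo" plus any constant addend.  */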
13316
13317 static rtx
13318 ix86_delegitimize_address (rtx x)
13319 {
13320 rtx orig_x = delegitimize_mem_from_attrs (x);
13321 /* addend is NULL or some rtx if x is something+GOTOFF where
13322 something doesn't include the PIC register. */
13323 rtx addend = NULL_RTX;
13324 /* reg_addend is NULL or a multiple of some register. */
13325 rtx reg_addend = NULL_RTX;
13326 /* const_addend is NULL or a const_int. */
13327 rtx const_addend = NULL_RTX;
13328 /* This is the result, or NULL. */
13329 rtx result = NULL_RTX;
13330
13331 x = orig_x;
13332
13333 if (MEM_P (x))
13334 x = XEXP (x, 0);
13335
13336 if (TARGET_64BIT)
13337 {
13338 if (GET_CODE (x) == CONST
13339 && GET_CODE (XEXP (x, 0)) == PLUS
13340 && GET_MODE (XEXP (x, 0)) == Pmode
13341 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13342 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13343 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13344 {
13345 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13346 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13347 if (MEM_P (orig_x))
13348 x = replace_equiv_address_nv (orig_x, x);
13349 return x;
13350 }
13351 if (GET_CODE (x) != CONST
13352 || GET_CODE (XEXP (x, 0)) != UNSPEC
13353 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13354 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13355 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13356 return ix86_delegitimize_tls_address (orig_x);
13357 x = XVECEXP (XEXP (x, 0), 0, 0);
13358 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13359 {
13360 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13361 GET_MODE (x), 0);
13362 if (x == NULL_RTX)
13363 return orig_x;
13364 }
13365 return x;
13366 }
13367
13368 if (GET_CODE (x) != PLUS
13369 || GET_CODE (XEXP (x, 1)) != CONST)
13370 return ix86_delegitimize_tls_address (orig_x);
13371
13372 if (ix86_pic_register_p (XEXP (x, 0)))
13373 /* %ebx + GOT/GOTOFF */
13374 ;
13375 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13376 {
13377 /* %ebx + %reg * scale + GOT/GOTOFF */
13378 reg_addend = XEXP (x, 0);
13379 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13380 reg_addend = XEXP (reg_addend, 1);
13381 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13382 reg_addend = XEXP (reg_addend, 0);
13383 else
13384 {
13385 reg_addend = NULL_RTX;
13386 addend = XEXP (x, 0);
13387 }
13388 }
13389 else
13390 addend = XEXP (x, 0);
13391
13392 x = XEXP (XEXP (x, 1), 0);
13393 if (GET_CODE (x) == PLUS
13394 && CONST_INT_P (XEXP (x, 1)))
13395 {
13396 const_addend = XEXP (x, 1);
13397 x = XEXP (x, 0);
13398 }
13399
13400 if (GET_CODE (x) == UNSPEC
13401 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13402 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13403 result = XVECEXP (x, 0, 0);
13404
13405 if (TARGET_MACHO && darwin_local_data_pic (x)
13406 && !MEM_P (orig_x))
13407 result = XVECEXP (x, 0, 0);
13408
13409 if (! result)
13410 return ix86_delegitimize_tls_address (orig_x);
13411
13412 if (const_addend)
13413 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13414 if (reg_addend)
13415 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13416 if (addend)
13417 {
13418 /* If the rest of original X doesn't involve the PIC register, add
13419 addend and subtract pic_offset_table_rtx. This can happen e.g.
13420 for code like:
13421 leal (%ebx, %ecx, 4), %ecx
13422 ...
13423 movl foo@GOTOFF(%ecx), %edx
13424 in which case we return (%ecx - %ebx) + foo. */
13425 if (pic_offset_table_rtx)
13426 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13427 pic_offset_table_rtx),
13428 result);
13429 else
13430 return orig_x;
13431 }
13432 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13433 {
13434 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13435 if (result == NULL_RTX)
13436 return orig_x;
13437 }
13438 return result;
13439 }
13440
13441 /* If X is a machine specific address (i.e. a symbol or label being
13442 referenced as a displacement from the GOT implemented using an
13443 UNSPEC), then return the base term. Otherwise return X. */
13444
13445 rtx
13446 ix86_find_base_term (rtx x)
13447 {
13448 rtx term;
13449
13450 if (TARGET_64BIT)
13451 {
13452 if (GET_CODE (x) != CONST)
13453 return x;
13454 term = XEXP (x, 0);
13455 if (GET_CODE (term) == PLUS
13456 && (CONST_INT_P (XEXP (term, 1))
13457 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13458 term = XEXP (term, 0);
13459 if (GET_CODE (term) != UNSPEC
13460 || (XINT (term, 1) != UNSPEC_GOTPCREL
13461 && XINT (term, 1) != UNSPEC_PCREL))
13462 return x;
13463
13464 return XVECEXP (term, 0, 0);
13465 }
13466
13467 return ix86_delegitimize_address (x);
13468 }
13469 \f
13470 static void
13471 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13472 int fp, FILE *file)
13473 {
13474 const char *suffix;
13475
13476 if (mode == CCFPmode || mode == CCFPUmode)
13477 {
13478 code = ix86_fp_compare_code_to_integer (code);
13479 mode = CCmode;
13480 }
13481 if (reverse)
13482 code = reverse_condition (code);
13483
13484 switch (code)
13485 {
13486 case EQ:
13487 switch (mode)
13488 {
13489 case CCAmode:
13490 suffix = "a";
13491 break;
13492
13493 case CCCmode:
13494 suffix = "c";
13495 break;
13496
13497 case CCOmode:
13498 suffix = "o";
13499 break;
13500
13501 case CCSmode:
13502 suffix = "s";
13503 break;
13504
13505 default:
13506 suffix = "e";
13507 }
13508 break;
13509 case NE:
13510 switch (mode)
13511 {
13512 case CCAmode:
13513 suffix = "na";
13514 break;
13515
13516 case CCCmode:
13517 suffix = "nc";
13518 break;
13519
13520 case CCOmode:
13521 suffix = "no";
13522 break;
13523
13524 case CCSmode:
13525 suffix = "ns";
13526 break;
13527
13528 default:
13529 suffix = "ne";
13530 }
13531 break;
13532 case GT:
13533 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13534 suffix = "g";
13535 break;
13536 case GTU:
13537 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13538 Those same assemblers have the same but opposite lossage on cmov. */
13539 if (mode == CCmode)
13540 suffix = fp ? "nbe" : "a";
13541 else if (mode == CCCmode)
13542 suffix = "b";
13543 else
13544 gcc_unreachable ();
13545 break;
13546 case LT:
13547 switch (mode)
13548 {
13549 case CCNOmode:
13550 case CCGOCmode:
13551 suffix = "s";
13552 break;
13553
13554 case CCmode:
13555 case CCGCmode:
13556 suffix = "l";
13557 break;
13558
13559 default:
13560 gcc_unreachable ();
13561 }
13562 break;
13563 case LTU:
13564 gcc_assert (mode == CCmode || mode == CCCmode);
13565 suffix = "b";
13566 break;
13567 case GE:
13568 switch (mode)
13569 {
13570 case CCNOmode:
13571 case CCGOCmode:
13572 suffix = "ns";
13573 break;
13574
13575 case CCmode:
13576 case CCGCmode:
13577 suffix = "ge";
13578 break;
13579
13580 default:
13581 gcc_unreachable ();
13582 }
13583 break;
13584 case GEU:
13585 /* ??? As above. */
13586 gcc_assert (mode == CCmode || mode == CCCmode);
13587 suffix = fp ? "nb" : "ae";
13588 break;
13589 case LE:
13590 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13591 suffix = "le";
13592 break;
13593 case LEU:
13594 /* ??? As above. */
13595 if (mode == CCmode)
13596 suffix = "be";
13597 else if (mode == CCCmode)
13598 suffix = fp ? "nb" : "ae";
13599 else
13600 gcc_unreachable ();
13601 break;
13602 case UNORDERED:
13603 suffix = fp ? "u" : "p";
13604 break;
13605 case ORDERED:
13606 suffix = fp ? "nu" : "np";
13607 break;
13608 default:
13609 gcc_unreachable ();
13610 }
13611 fputs (suffix, file);
13612 }
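/* For instance (illustrative only): put_condition_code (GT, CCGCmode,
   0, 0, file) prints "g", so an md template along the lines of
   "j%C1\t%l0" would emit "jg" for a signed greater-than branch; with
   REVERSE set it would print "le" instead.  */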
13613
13614 /* Print the name of register X to FILE based on its machine mode and number.
13615 If CODE is 'w', pretend the mode is HImode.
13616 If CODE is 'b', pretend the mode is QImode.
13617 If CODE is 'k', pretend the mode is SImode.
13618 If CODE is 'q', pretend the mode is DImode.
13619 If CODE is 'x', pretend the mode is V4SFmode.
13620 If CODE is 't', pretend the mode is V8SFmode.
13621 If CODE is 'h', pretend the reg is the 'high' byte register.
13622 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
13623 If CODE is 'd', duplicate the operand for AVX instruction.
13624 */
13625
13626 void
13627 print_reg (rtx x, int code, FILE *file)
13628 {
13629 const char *reg;
13630 bool duplicated = code == 'd' && TARGET_AVX;
13631
13632 gcc_assert (x == pc_rtx
13633 || (REGNO (x) != ARG_POINTER_REGNUM
13634 && REGNO (x) != FRAME_POINTER_REGNUM
13635 && REGNO (x) != FLAGS_REG
13636 && REGNO (x) != FPSR_REG
13637 && REGNO (x) != FPCR_REG));
13638
13639 if (ASSEMBLER_DIALECT == ASM_ATT)
13640 putc ('%', file);
13641
13642 if (x == pc_rtx)
13643 {
13644 gcc_assert (TARGET_64BIT);
13645 fputs ("rip", file);
13646 return;
13647 }
13648
13649 if (code == 'w' || MMX_REG_P (x))
13650 code = 2;
13651 else if (code == 'b')
13652 code = 1;
13653 else if (code == 'k')
13654 code = 4;
13655 else if (code == 'q')
13656 code = 8;
13657 else if (code == 'y')
13658 code = 3;
13659 else if (code == 'h')
13660 code = 0;
13661 else if (code == 'x')
13662 code = 16;
13663 else if (code == 't')
13664 code = 32;
13665 else
13666 code = GET_MODE_SIZE (GET_MODE (x));
13667
13668 /* Irritatingly, AMD extended registers use a different naming
13669 convention from the normal registers: "r%d[bwd]" */
13670 if (REX_INT_REG_P (x))
13671 {
13672 gcc_assert (TARGET_64BIT);
13673 putc ('r', file);
13674 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
13675 switch (code)
13676 {
13677 case 0:
13678 error ("extended registers have no high halves");
13679 break;
13680 case 1:
13681 putc ('b', file);
13682 break;
13683 case 2:
13684 putc ('w', file);
13685 break;
13686 case 4:
13687 putc ('d', file);
13688 break;
13689 case 8:
13690 /* no suffix */
13691 break;
13692 default:
13693 error ("unsupported operand size for extended register");
13694 break;
13695 }
13696 return;
13697 }
13698
13699 reg = NULL;
13700 switch (code)
13701 {
13702 case 3:
13703 if (STACK_TOP_P (x))
13704 {
13705 reg = "st(0)";
13706 break;
13707 }
13708 /* FALLTHRU */
13709 case 8:
13710 case 4:
13711 case 12:
13712 if (! ANY_FP_REG_P (x))
13713 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13714 /* FALLTHRU */
13715 case 16:
13716 case 2:
13717 normal:
13718 reg = hi_reg_name[REGNO (x)];
13719 break;
13720 case 1:
13721 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13722 goto normal;
13723 reg = qi_reg_name[REGNO (x)];
13724 break;
13725 case 0:
13726 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13727 goto normal;
13728 reg = qi_high_reg_name[REGNO (x)];
13729 break;
13730 case 32:
13731 if (SSE_REG_P (x))
13732 {
13733 gcc_assert (!duplicated);
13734 putc ('y', file);
13735 fputs (hi_reg_name[REGNO (x)] + 1, file);
13736 return;
13737 }
13738 break;
13739 default:
13740 gcc_unreachable ();
13741 }
13742
13743 fputs (reg, file);
13744 if (duplicated)
13745 {
13746 if (ASSEMBLER_DIALECT == ASM_ATT)
13747 fprintf (file, ", %%%s", reg);
13748 else
13749 fprintf (file, ", %s", reg);
13750 }
13751 }
13752
13753 /* Locate some local-dynamic symbol still in use by this function
13754 so that we can print its name in some tls_local_dynamic_base
13755 pattern. */
13756
13757 static int
13758 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13759 {
13760 rtx x = *px;
13761
13762 if (GET_CODE (x) == SYMBOL_REF
13763 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13764 {
13765 cfun->machine->some_ld_name = XSTR (x, 0);
13766 return 1;
13767 }
13768
13769 return 0;
13770 }
13771
13772 static const char *
13773 get_some_local_dynamic_name (void)
13774 {
13775 rtx insn;
13776
13777 if (cfun->machine->some_ld_name)
13778 return cfun->machine->some_ld_name;
13779
13780 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13781 if (NONDEBUG_INSN_P (insn)
13782 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13783 return cfun->machine->some_ld_name;
13784
13785 return NULL;
13786 }
13787
13788 /* Meaning of CODE:
13789 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13790 C -- print opcode suffix for set/cmov insn.
13791 c -- like C, but print reversed condition
13792 F,f -- likewise, but for floating-point.
13793 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13794 otherwise nothing
13795 R -- print the prefix for register names.
13796 z -- print the opcode suffix for the size of the current operand.
13797 Z -- likewise, with special suffixes for x87 instructions.
13798 * -- print a star (in certain assembler syntax)
13799 A -- print an absolute memory reference.
13800 E -- print address with DImode register names if TARGET_64BIT.
13801 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13802 s -- print a shift double count, followed by the assembler's argument
13803 delimiter.
13804 b -- print the QImode name of the register for the indicated operand.
13805 %b0 would print %al if operands[0] is reg 0.
13806 w -- likewise, print the HImode name of the register.
13807 k -- likewise, print the SImode name of the register.
13808 q -- likewise, print the DImode name of the register.
13809 x -- likewise, print the V4SFmode name of the register.
13810 t -- likewise, print the V8SFmode name of the register.
13811 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13812 y -- print "st(0)" instead of "st" as a register.
13813 d -- print duplicated register operand for AVX instruction.
13814 D -- print condition for SSE cmp instruction.
13815 P -- if PIC, print an @PLT suffix.
13816 p -- print raw symbol name.
13817 X -- don't print any sort of PIC '@' suffix for a symbol.
13818 & -- print some in-use local-dynamic symbol name.
13819 H -- print a memory address offset by 8; used for sse high-parts
13820 Y -- print condition for XOP pcom* instruction.
13821 + -- print a branch hint as 'cs' or 'ds' prefix
13822 ; -- print a semicolon (after prefixes due to bug in older gas).
13823 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13824 @ -- print a segment register of thread base pointer load
13825 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
13826 */
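/* A few illustrative uses of the codes above, as they might appear in
   i386.md templates (the operand numbers here are hypothetical):
   "%k1" prints the SImode name of operand 1 (e.g. "%eax" for the ax
   register), "%z0" appends the integer size suffix ("b", "w", "l" or
   "q") derived from operand 0's mode, and "%+" may emit a "ds ; " or
   "cs ; " branch-hint prefix when branch prediction hints are enabled.  */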
13827
13828 void
13829 ix86_print_operand (FILE *file, rtx x, int code)
13830 {
13831 if (code)
13832 {
13833 switch (code)
13834 {
13835 case '*':
13836 if (ASSEMBLER_DIALECT == ASM_ATT)
13837 putc ('*', file);
13838 return;
13839
13840 case '&':
13841 {
13842 const char *name = get_some_local_dynamic_name ();
13843 if (name == NULL)
13844 output_operand_lossage ("'%%&' used without any "
13845 "local dynamic TLS references");
13846 else
13847 assemble_name (file, name);
13848 return;
13849 }
13850
13851 case 'A':
13852 switch (ASSEMBLER_DIALECT)
13853 {
13854 case ASM_ATT:
13855 putc ('*', file);
13856 break;
13857
13858 case ASM_INTEL:
13859 /* Intel syntax. For absolute addresses, registers should not
13860 be surrounded by brackets. */
13861 if (!REG_P (x))
13862 {
13863 putc ('[', file);
13864 ix86_print_operand (file, x, 0);
13865 putc (']', file);
13866 return;
13867 }
13868 break;
13869
13870 default:
13871 gcc_unreachable ();
13872 }
13873
13874 ix86_print_operand (file, x, 0);
13875 return;
13876
13877 case 'E':
13878 /* Wrap address in an UNSPEC to declare special handling. */
13879 if (TARGET_64BIT)
13880 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
13881
13882 output_address (x);
13883 return;
13884
13885 case 'L':
13886 if (ASSEMBLER_DIALECT == ASM_ATT)
13887 putc ('l', file);
13888 return;
13889
13890 case 'W':
13891 if (ASSEMBLER_DIALECT == ASM_ATT)
13892 putc ('w', file);
13893 return;
13894
13895 case 'B':
13896 if (ASSEMBLER_DIALECT == ASM_ATT)
13897 putc ('b', file);
13898 return;
13899
13900 case 'Q':
13901 if (ASSEMBLER_DIALECT == ASM_ATT)
13902 putc ('l', file);
13903 return;
13904
13905 case 'S':
13906 if (ASSEMBLER_DIALECT == ASM_ATT)
13907 putc ('s', file);
13908 return;
13909
13910 case 'T':
13911 if (ASSEMBLER_DIALECT == ASM_ATT)
13912 putc ('t', file);
13913 return;
13914
13915 case 'z':
13916 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13917 {
13918 /* Opcodes don't get size suffixes if using Intel opcodes. */
13919 if (ASSEMBLER_DIALECT == ASM_INTEL)
13920 return;
13921
13922 switch (GET_MODE_SIZE (GET_MODE (x)))
13923 {
13924 case 1:
13925 putc ('b', file);
13926 return;
13927
13928 case 2:
13929 putc ('w', file);
13930 return;
13931
13932 case 4:
13933 putc ('l', file);
13934 return;
13935
13936 case 8:
13937 putc ('q', file);
13938 return;
13939
13940 default:
13941 output_operand_lossage
13942 ("invalid operand size for operand code '%c'", code);
13943 return;
13944 }
13945 }
13946
13947 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13948 warning
13949 (0, "non-integer operand used with operand code '%c'", code);
13950 /* FALLTHRU */
13951
13952 case 'Z':
13953 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
13954 if (ASSEMBLER_DIALECT == ASM_INTEL)
13955 return;
13956
13957 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13958 {
13959 switch (GET_MODE_SIZE (GET_MODE (x)))
13960 {
13961 case 2:
13962 #ifdef HAVE_AS_IX86_FILDS
13963 putc ('s', file);
13964 #endif
13965 return;
13966
13967 case 4:
13968 putc ('l', file);
13969 return;
13970
13971 case 8:
13972 #ifdef HAVE_AS_IX86_FILDQ
13973 putc ('q', file);
13974 #else
13975 fputs ("ll", file);
13976 #endif
13977 return;
13978
13979 default:
13980 break;
13981 }
13982 }
13983 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13984 {
13985 /* 387 opcodes don't get size suffixes
13986 if the operands are registers. */
13987 if (STACK_REG_P (x))
13988 return;
13989
13990 switch (GET_MODE_SIZE (GET_MODE (x)))
13991 {
13992 case 4:
13993 putc ('s', file);
13994 return;
13995
13996 case 8:
13997 putc ('l', file);
13998 return;
13999
14000 case 12:
14001 case 16:
14002 putc ('t', file);
14003 return;
14004
14005 default:
14006 break;
14007 }
14008 }
14009 else
14010 {
14011 output_operand_lossage
14012 ("invalid operand type used with operand code '%c'", code);
14013 return;
14014 }
14015
14016 output_operand_lossage
14017 ("invalid operand size for operand code '%c'", code);
14018 return;
14019
14020 case 'd':
14021 case 'b':
14022 case 'w':
14023 case 'k':
14024 case 'q':
14025 case 'h':
14026 case 't':
14027 case 'y':
14028 case 'x':
14029 case 'X':
14030 case 'P':
14031 case 'p':
14032 break;
14033
14034 case 's':
14035 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14036 {
14037 ix86_print_operand (file, x, 0);
14038 fputs (", ", file);
14039 }
14040 return;
14041
14042 case 'D':
14043 /* A little bit of brain damage here. The SSE compare instructions
14044 use completely different names for the comparisons than the
14045 fp conditional moves do. */
14046 if (TARGET_AVX)
14047 {
14048 switch (GET_CODE (x))
14049 {
14050 case EQ:
14051 fputs ("eq", file);
14052 break;
14053 case UNEQ:
14054 fputs ("eq_us", file);
14055 break;
14056 case LT:
14057 fputs ("lt", file);
14058 break;
14059 case UNLT:
14060 fputs ("nge", file);
14061 break;
14062 case LE:
14063 fputs ("le", file);
14064 break;
14065 case UNLE:
14066 fputs ("ngt", file);
14067 break;
14068 case UNORDERED:
14069 fputs ("unord", file);
14070 break;
14071 case NE:
14072 fputs ("neq", file);
14073 break;
14074 case LTGT:
14075 fputs ("neq_oq", file);
14076 break;
14077 case GE:
14078 fputs ("ge", file);
14079 break;
14080 case UNGE:
14081 fputs ("nlt", file);
14082 break;
14083 case GT:
14084 fputs ("gt", file);
14085 break;
14086 case UNGT:
14087 fputs ("nle", file);
14088 break;
14089 case ORDERED:
14090 fputs ("ord", file);
14091 break;
14092 default:
14093 output_operand_lossage ("operand is not a condition code, "
14094 "invalid operand code 'D'");
14095 return;
14096 }
14097 }
14098 else
14099 {
14100 switch (GET_CODE (x))
14101 {
14102 case EQ:
14103 case UNEQ:
14104 fputs ("eq", file);
14105 break;
14106 case LT:
14107 case UNLT:
14108 fputs ("lt", file);
14109 break;
14110 case LE:
14111 case UNLE:
14112 fputs ("le", file);
14113 break;
14114 case UNORDERED:
14115 fputs ("unord", file);
14116 break;
14117 case NE:
14118 case LTGT:
14119 fputs ("neq", file);
14120 break;
14121 case UNGE:
14122 case GE:
14123 fputs ("nlt", file);
14124 break;
14125 case UNGT:
14126 case GT:
14127 fputs ("nle", file);
14128 break;
14129 case ORDERED:
14130 fputs ("ord", file);
14131 break;
14132 default:
14133 output_operand_lossage ("operand is not a condition code, "
14134 "invalid operand code 'D'");
14135 return;
14136 }
14137 }
14138 return;
14139 case 'O':
14140 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14141 if (ASSEMBLER_DIALECT == ASM_ATT)
14142 {
14143 switch (GET_MODE (x))
14144 {
14145 case HImode: putc ('w', file); break;
14146 case SImode:
14147 case SFmode: putc ('l', file); break;
14148 case DImode:
14149 case DFmode: putc ('q', file); break;
14150 default: gcc_unreachable ();
14151 }
14152 putc ('.', file);
14153 }
14154 #endif
14155 return;
14156 case 'C':
14157 if (!COMPARISON_P (x))
14158 {
14159 output_operand_lossage ("operand is neither a constant nor a "
14160 "condition code, invalid operand code "
14161 "'C'");
14162 return;
14163 }
14164 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
14165 return;
14166 case 'F':
14167 if (!COMPARISON_P (x))
14168 {
14169 output_operand_lossage ("operand is neither a constant nor a "
14170 "condition code, invalid operand code "
14171 "'F'");
14172 return;
14173 }
14174 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14175 if (ASSEMBLER_DIALECT == ASM_ATT)
14176 putc ('.', file);
14177 #endif
14178 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
14179 return;
14180
14181 /* Like above, but reverse condition */
14182 case 'c':
14183 /* Check to see if argument to %c is really a constant
14184 and not a condition code which needs to be reversed. */
14185 if (!COMPARISON_P (x))
14186 {
14187 output_operand_lossage ("operand is neither a constant nor a "
14188 "condition code, invalid operand "
14189 "code 'c'");
14190 return;
14191 }
14192 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
14193 return;
14194 case 'f':
14195 if (!COMPARISON_P (x))
14196 {
14197 output_operand_lossage ("operand is neither a constant nor a "
14198 "condition code, invalid operand "
14199 "code 'f'");
14200 return;
14201 }
14202 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14203 if (ASSEMBLER_DIALECT == ASM_ATT)
14204 putc ('.', file);
14205 #endif
14206 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
14207 return;
14208
14209 case 'H':
14210 if (!offsettable_memref_p (x))
14211 {
14212 output_operand_lossage ("operand is not an offsettable memory "
14213 "reference, invalid operand "
14214 "code 'H'");
14215 return;
14216 }
14217 /* It doesn't actually matter what mode we use here, as we're
14218 only going to use this for printing. */
14219 x = adjust_address_nv (x, DImode, 8);
14220 break;
14221
14222 case '+':
14223 {
14224 rtx x;
14225
14226 if (!optimize
14227 || optimize_function_for_size_p (cfun)
14228 || !TARGET_BRANCH_PREDICTION_HINTS)
14229 return;
14230
14231 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14232 if (x)
14233 {
14234 int pred_val = INTVAL (XEXP (x, 0));
14235
14236 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14237 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14238 {
14239 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14240 bool cputaken
14241 = final_forward_branch_p (current_output_insn) == 0;
14242
14243 /* Emit hints only in the case where the default branch prediction
14244 heuristics would fail. */
14245 if (taken != cputaken)
14246 {
14247 /* We use 3e (DS) prefix for taken branches and
14248 2e (CS) prefix for not taken branches. */
14249 if (taken)
14250 fputs ("ds ; ", file);
14251 else
14252 fputs ("cs ; ", file);
14253 }
14254 }
14255 }
14256 return;
14257 }
14258
14259 case 'Y':
14260 switch (GET_CODE (x))
14261 {
14262 case NE:
14263 fputs ("neq", file);
14264 break;
14265 case EQ:
14266 fputs ("eq", file);
14267 break;
14268 case GE:
14269 case GEU:
14270 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14271 break;
14272 case GT:
14273 case GTU:
14274 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14275 break;
14276 case LE:
14277 case LEU:
14278 fputs ("le", file);
14279 break;
14280 case LT:
14281 case LTU:
14282 fputs ("lt", file);
14283 break;
14284 case UNORDERED:
14285 fputs ("unord", file);
14286 break;
14287 case ORDERED:
14288 fputs ("ord", file);
14289 break;
14290 case UNEQ:
14291 fputs ("ueq", file);
14292 break;
14293 case UNGE:
14294 fputs ("nlt", file);
14295 break;
14296 case UNGT:
14297 fputs ("nle", file);
14298 break;
14299 case UNLE:
14300 fputs ("ule", file);
14301 break;
14302 case UNLT:
14303 fputs ("ult", file);
14304 break;
14305 case LTGT:
14306 fputs ("une", file);
14307 break;
14308 default:
14309 output_operand_lossage ("operand is not a condition code, "
14310 "invalid operand code 'Y'");
14311 return;
14312 }
14313 return;
14314
14315 case ';':
14316 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14317 putc (';', file);
14318 #endif
14319 return;
14320
14321 case '@':
14322 if (ASSEMBLER_DIALECT == ASM_ATT)
14323 putc ('%', file);
14324
14325 /* The kernel uses a different segment register for performance
14326 reasons; a system call would not have to trash the userspace
14327 segment register, which would be expensive. */
14328 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14329 fputs ("fs", file);
14330 else
14331 fputs ("gs", file);
14332 return;
14333
14334 case '~':
14335 putc (TARGET_AVX2 ? 'i' : 'f', file);
14336 return;
14337
14338 case '^':
14339 if (TARGET_64BIT && Pmode != word_mode)
14340 fputs ("addr32 ", file);
14341 return;
14342
14343 default:
14344 output_operand_lossage ("invalid operand code '%c'", code);
14345 }
14346 }
14347
14348 if (REG_P (x))
14349 print_reg (x, code, file);
14350
14351 else if (MEM_P (x))
14352 {
14353 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14354 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14355 && GET_MODE (x) != BLKmode)
14356 {
14357 const char * size;
14358 switch (GET_MODE_SIZE (GET_MODE (x)))
14359 {
14360 case 1: size = "BYTE"; break;
14361 case 2: size = "WORD"; break;
14362 case 4: size = "DWORD"; break;
14363 case 8: size = "QWORD"; break;
14364 case 12: size = "TBYTE"; break;
14365 case 16:
14366 if (GET_MODE (x) == XFmode)
14367 size = "TBYTE";
14368 else
14369 size = "XMMWORD";
14370 break;
14371 case 32: size = "YMMWORD"; break;
14372 default:
14373 gcc_unreachable ();
14374 }
14375
14376 /* Check for explicit size override (codes 'b', 'w', 'k',
14377 'q' and 'x') */
14378 if (code == 'b')
14379 size = "BYTE";
14380 else if (code == 'w')
14381 size = "WORD";
14382 else if (code == 'k')
14383 size = "DWORD";
14384 else if (code == 'q')
14385 size = "QWORD";
14386 else if (code == 'x')
14387 size = "XMMWORD";
14388
14389 fputs (size, file);
14390 fputs (" PTR ", file);
14391 }
14392
14393 x = XEXP (x, 0);
14394 /* Avoid (%rip) for call operands. */
14395 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14396 && !CONST_INT_P (x))
14397 output_addr_const (file, x);
14398 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14399 output_operand_lossage ("invalid constraints for operand");
14400 else
14401 output_address (x);
14402 }
14403
14404 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14405 {
14406 REAL_VALUE_TYPE r;
14407 long l;
14408
14409 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14410 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14411
14412 if (ASSEMBLER_DIALECT == ASM_ATT)
14413 putc ('$', file);
14414 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14415 if (code == 'q')
14416 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14417 else
14418 fprintf (file, "0x%08x", (unsigned int) l);
14419 }
14420
14421 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14422 {
14423 REAL_VALUE_TYPE r;
14424 long l[2];
14425
14426 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14427 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14428
14429 if (ASSEMBLER_DIALECT == ASM_ATT)
14430 putc ('$', file);
14431 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14432 }
14433
14434 /* These float cases don't actually occur as immediate operands. */
14435 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14436 {
14437 char dstr[30];
14438
14439 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14440 fputs (dstr, file);
14441 }
14442
14443 else
14444 {
14445 /* We have patterns that allow zero sets of memory, for instance.
14446 In 64-bit mode, we should probably support all 8-byte vectors,
14447 since we can in fact encode that into an immediate. */
14448 if (GET_CODE (x) == CONST_VECTOR)
14449 {
14450 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14451 x = const0_rtx;
14452 }
14453
14454 if (code != 'P' && code != 'p')
14455 {
14456 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14457 {
14458 if (ASSEMBLER_DIALECT == ASM_ATT)
14459 putc ('$', file);
14460 }
14461 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14462 || GET_CODE (x) == LABEL_REF)
14463 {
14464 if (ASSEMBLER_DIALECT == ASM_ATT)
14465 putc ('$', file);
14466 else
14467 fputs ("OFFSET FLAT:", file);
14468 }
14469 }
14470 if (CONST_INT_P (x))
14471 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14472 else if (flag_pic || MACHOPIC_INDIRECT)
14473 output_pic_addr_const (file, x, code);
14474 else
14475 output_addr_const (file, x);
14476 }
14477 }
14478
14479 static bool
14480 ix86_print_operand_punct_valid_p (unsigned char code)
14481 {
14482 return (code == '@' || code == '*' || code == '+' || code == '&'
14483 || code == ';' || code == '~' || code == '^');
14484 }
14485 \f
14486 /* Print a memory operand whose address is ADDR. */
14487
14488 static void
14489 ix86_print_operand_address (FILE *file, rtx addr)
14490 {
14491 struct ix86_address parts;
14492 rtx base, index, disp;
14493 int scale;
14494 int ok;
14495 bool vsib = false;
14496 int code = 0;
14497
14498 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14499 {
14500 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14501 gcc_assert (parts.index == NULL_RTX);
14502 parts.index = XVECEXP (addr, 0, 1);
14503 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14504 addr = XVECEXP (addr, 0, 0);
14505 vsib = true;
14506 }
14507 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14508 {
14509 gcc_assert (TARGET_64BIT);
14510 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14511 code = 'q';
14512 }
14513 else
14514 ok = ix86_decompose_address (addr, &parts);
14515
14516 gcc_assert (ok);
14517
14518 if (parts.base && GET_CODE (parts.base) == SUBREG)
14519 {
14520 rtx tmp = SUBREG_REG (parts.base);
14521 parts.base = simplify_subreg (GET_MODE (parts.base),
14522 tmp, GET_MODE (tmp), 0);
14523 }
14524
14525 if (parts.index && GET_CODE (parts.index) == SUBREG)
14526 {
14527 rtx tmp = SUBREG_REG (parts.index);
14528 parts.index = simplify_subreg (GET_MODE (parts.index),
14529 tmp, GET_MODE (tmp), 0);
14530 }
14531
14532 base = parts.base;
14533 index = parts.index;
14534 disp = parts.disp;
14535 scale = parts.scale;
14536
14537 switch (parts.seg)
14538 {
14539 case SEG_DEFAULT:
14540 break;
14541 case SEG_FS:
14542 case SEG_GS:
14543 if (ASSEMBLER_DIALECT == ASM_ATT)
14544 putc ('%', file);
14545 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14546 break;
14547 default:
14548 gcc_unreachable ();
14549 }
14550
14551 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14552 if (TARGET_64BIT && !base && !index)
14553 {
14554 rtx symbol = disp;
14555
14556 if (GET_CODE (disp) == CONST
14557 && GET_CODE (XEXP (disp, 0)) == PLUS
14558 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14559 symbol = XEXP (XEXP (disp, 0), 0);
14560
14561 if (GET_CODE (symbol) == LABEL_REF
14562 || (GET_CODE (symbol) == SYMBOL_REF
14563 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14564 base = pc_rtx;
14565 }
14566 if (!base && !index)
14567 {
14568 /* A displacement-only address requires special attention. */
14569
14570 if (CONST_INT_P (disp))
14571 {
14572 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14573 fputs ("ds:", file);
14574 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14575 }
14576 else if (flag_pic)
14577 output_pic_addr_const (file, disp, 0);
14578 else
14579 output_addr_const (file, disp);
14580 }
14581 else
14582 {
14583 /* Print SImode register names for zero-extended
14584 addresses to force addr32 prefix. */
14585 if (TARGET_64BIT
14586 && (GET_CODE (addr) == ZERO_EXTEND
14587 || GET_CODE (addr) == AND))
14588 {
14589 gcc_assert (!code);
14590 code = 'l';
14591 }
14592
14593 if (ASSEMBLER_DIALECT == ASM_ATT)
14594 {
14595 if (disp)
14596 {
14597 if (flag_pic)
14598 output_pic_addr_const (file, disp, 0);
14599 else if (GET_CODE (disp) == LABEL_REF)
14600 output_asm_label (disp);
14601 else
14602 output_addr_const (file, disp);
14603 }
14604
14605 putc ('(', file);
14606 if (base)
14607 print_reg (base, code, file);
14608 if (index)
14609 {
14610 putc (',', file);
14611 print_reg (index, vsib ? 0 : code, file);
14612 if (scale != 1 || vsib)
14613 fprintf (file, ",%d", scale);
14614 }
14615 putc (')', file);
14616 }
14617 else
14618 {
14619 rtx offset = NULL_RTX;
14620
14621 if (disp)
14622 {
14623 /* Pull out the offset of a symbol; print any symbol itself. */
14624 if (GET_CODE (disp) == CONST
14625 && GET_CODE (XEXP (disp, 0)) == PLUS
14626 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14627 {
14628 offset = XEXP (XEXP (disp, 0), 1);
14629 disp = gen_rtx_CONST (VOIDmode,
14630 XEXP (XEXP (disp, 0), 0));
14631 }
14632
14633 if (flag_pic)
14634 output_pic_addr_const (file, disp, 0);
14635 else if (GET_CODE (disp) == LABEL_REF)
14636 output_asm_label (disp);
14637 else if (CONST_INT_P (disp))
14638 offset = disp;
14639 else
14640 output_addr_const (file, disp);
14641 }
14642
14643 putc ('[', file);
14644 if (base)
14645 {
14646 print_reg (base, code, file);
14647 if (offset)
14648 {
14649 if (INTVAL (offset) >= 0)
14650 putc ('+', file);
14651 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14652 }
14653 }
14654 else if (offset)
14655 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14656 else
14657 putc ('0', file);
14658
14659 if (index)
14660 {
14661 putc ('+', file);
14662 print_reg (index, vsib ? 0 : code, file);
14663 if (scale != 1 || vsib)
14664 fprintf (file, "*%d", scale);
14665 }
14666 putc (']', file);
14667 }
14668 }
14669 }
14670
14671 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
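/* Illustrative note (an addition, not from the original sources): for
   an UNSPEC_NTPOFF reference to a symbol "foo", the routine below
   prints "foo@tpoff" when targeting x86-64 and "foo@ntpoff" for ia32,
   matching the TLS relocation operators the assembler expects. */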
14672
14673 static bool
14674 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14675 {
14676 rtx op;
14677
14678 if (GET_CODE (x) != UNSPEC)
14679 return false;
14680
14681 op = XVECEXP (x, 0, 0);
14682 switch (XINT (x, 1))
14683 {
14684 case UNSPEC_GOTTPOFF:
14685 output_addr_const (file, op);
14686 /* FIXME: This might be @TPOFF in Sun ld. */
14687 fputs ("@gottpoff", file);
14688 break;
14689 case UNSPEC_TPOFF:
14690 output_addr_const (file, op);
14691 fputs ("@tpoff", file);
14692 break;
14693 case UNSPEC_NTPOFF:
14694 output_addr_const (file, op);
14695 if (TARGET_64BIT)
14696 fputs ("@tpoff", file);
14697 else
14698 fputs ("@ntpoff", file);
14699 break;
14700 case UNSPEC_DTPOFF:
14701 output_addr_const (file, op);
14702 fputs ("@dtpoff", file);
14703 break;
14704 case UNSPEC_GOTNTPOFF:
14705 output_addr_const (file, op);
14706 if (TARGET_64BIT)
14707 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14708 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14709 else
14710 fputs ("@gotntpoff", file);
14711 break;
14712 case UNSPEC_INDNTPOFF:
14713 output_addr_const (file, op);
14714 fputs ("@indntpoff", file);
14715 break;
14716 #if TARGET_MACHO
14717 case UNSPEC_MACHOPIC_OFFSET:
14718 output_addr_const (file, op);
14719 putc ('-', file);
14720 machopic_output_function_base_name (file);
14721 break;
14722 #endif
14723
14724 case UNSPEC_STACK_CHECK:
14725 {
14726 int offset;
14727
14728 gcc_assert (flag_split_stack);
14729
14730 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14731 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14732 #else
14733 gcc_unreachable ();
14734 #endif
14735
14736 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14737 }
14738 break;
14739
14740 default:
14741 return false;
14742 }
14743
14744 return true;
14745 }
14746 \f
14747 /* Split one or more double-mode RTL references into pairs of half-mode
14748 references. The RTL can be REG, offsettable MEM, integer constant, or
14749 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14750 split and "num" is its length. lo_half and hi_half are output arrays
14751 that parallel "operands". */
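/* Illustrative sketch (an addition, not from the original sources):
   for a DImode register operand on ia32 the halves produced below are
   the SImode subregs at byte offsets 0 and 4, roughly

     lo_half[i] = simplify_gen_subreg (SImode, op, DImode, 0);
     hi_half[i] = simplify_gen_subreg (SImode, op, DImode, 4);

   while an offsettable MEM is split with adjust_address instead, as
   the loop in split_double_mode shows. */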
14752
14753 void
14754 split_double_mode (enum machine_mode mode, rtx operands[],
14755 int num, rtx lo_half[], rtx hi_half[])
14756 {
14757 enum machine_mode half_mode;
14758 unsigned int byte;
14759
14760 switch (mode)
14761 {
14762 case TImode:
14763 half_mode = DImode;
14764 break;
14765 case DImode:
14766 half_mode = SImode;
14767 break;
14768 default:
14769 gcc_unreachable ();
14770 }
14771
14772 byte = GET_MODE_SIZE (half_mode);
14773
14774 while (num--)
14775 {
14776 rtx op = operands[num];
14777
14778 /* simplify_subreg refuses to split volatile memory addresses,
14779 but we still have to handle them here. */
14780 if (MEM_P (op))
14781 {
14782 lo_half[num] = adjust_address (op, half_mode, 0);
14783 hi_half[num] = adjust_address (op, half_mode, byte);
14784 }
14785 else
14786 {
14787 lo_half[num] = simplify_gen_subreg (half_mode, op,
14788 GET_MODE (op) == VOIDmode
14789 ? mode : GET_MODE (op), 0);
14790 hi_half[num] = simplify_gen_subreg (half_mode, op,
14791 GET_MODE (op) == VOIDmode
14792 ? mode : GET_MODE (op), byte);
14793 }
14794 }
14795 }
14796 \f
14797 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14798 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14799 is the expression of the binary operation. The output may either be
14800 emitted here, or returned to the caller, like all output_* functions.
14801
14802 There is no guarantee that the operands are the same mode, as they
14803 might be within FLOAT or FLOAT_EXTEND expressions. */
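/* Illustrative examples (an addition, not from the original sources):
   for an SFmode SSE add output_387_binary_op returns
   "addss\t{%2, %0|%0, %2}", and with AVX the three-operand form
   "vaddss\t{%2, %1, %0|%0, %1, %2}". For the x87 stack cases the
   template is the "fadd"/"fsub"/"fmul"/"fdiv" prefix plus a suffix
   (popping and/or reversed) chosen by the operand analysis below. */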
14804
14805 #ifndef SYSV386_COMPAT
14806 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14807 wants to fix the assemblers because that causes incompatibility
14808 with gcc. No-one wants to fix gcc because that causes
14809 incompatibility with assemblers... You can use the option of
14810 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14811 #define SYSV386_COMPAT 1
14812 #endif
14813
14814 const char *
14815 output_387_binary_op (rtx insn, rtx *operands)
14816 {
14817 static char buf[40];
14818 const char *p;
14819 const char *ssep;
14820 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14821
14822 #ifdef ENABLE_CHECKING
14823 /* Even if we do not want to check the inputs, this documents the input
14824 constraints, which helps in understanding the following code. */
14825 if (STACK_REG_P (operands[0])
14826 && ((REG_P (operands[1])
14827 && REGNO (operands[0]) == REGNO (operands[1])
14828 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14829 || (REG_P (operands[2])
14830 && REGNO (operands[0]) == REGNO (operands[2])
14831 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14832 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14833 ; /* ok */
14834 else
14835 gcc_assert (is_sse);
14836 #endif
14837
14838 switch (GET_CODE (operands[3]))
14839 {
14840 case PLUS:
14841 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14842 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14843 p = "fiadd";
14844 else
14845 p = "fadd";
14846 ssep = "vadd";
14847 break;
14848
14849 case MINUS:
14850 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14851 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14852 p = "fisub";
14853 else
14854 p = "fsub";
14855 ssep = "vsub";
14856 break;
14857
14858 case MULT:
14859 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14860 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14861 p = "fimul";
14862 else
14863 p = "fmul";
14864 ssep = "vmul";
14865 break;
14866
14867 case DIV:
14868 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14869 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14870 p = "fidiv";
14871 else
14872 p = "fdiv";
14873 ssep = "vdiv";
14874 break;
14875
14876 default:
14877 gcc_unreachable ();
14878 }
14879
14880 if (is_sse)
14881 {
14882 if (TARGET_AVX)
14883 {
14884 strcpy (buf, ssep);
14885 if (GET_MODE (operands[0]) == SFmode)
14886 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14887 else
14888 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14889 }
14890 else
14891 {
14892 strcpy (buf, ssep + 1);
14893 if (GET_MODE (operands[0]) == SFmode)
14894 strcat (buf, "ss\t{%2, %0|%0, %2}");
14895 else
14896 strcat (buf, "sd\t{%2, %0|%0, %2}");
14897 }
14898 return buf;
14899 }
14900 strcpy (buf, p);
14901
14902 switch (GET_CODE (operands[3]))
14903 {
14904 case MULT:
14905 case PLUS:
14906 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
14907 {
14908 rtx temp = operands[2];
14909 operands[2] = operands[1];
14910 operands[1] = temp;
14911 }
14912
14913 /* We know operands[0] == operands[1]. */
14914
14915 if (MEM_P (operands[2]))
14916 {
14917 p = "%Z2\t%2";
14918 break;
14919 }
14920
14921 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14922 {
14923 if (STACK_TOP_P (operands[0]))
14924 /* How is it that we are storing to a dead operand[2]?
14925 Well, presumably operands[1] is dead too. We can't
14926 store the result to st(0) as st(0) gets popped on this
14927 instruction. Instead store to operands[2] (which I
14928 think has to be st(1)). st(1) will be popped later.
14929 gcc <= 2.8.1 didn't have this check and generated
14930 assembly code that the Unixware assembler rejected. */
14931 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14932 else
14933 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14934 break;
14935 }
14936
14937 if (STACK_TOP_P (operands[0]))
14938 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14939 else
14940 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14941 break;
14942
14943 case MINUS:
14944 case DIV:
14945 if (MEM_P (operands[1]))
14946 {
14947 p = "r%Z1\t%1";
14948 break;
14949 }
14950
14951 if (MEM_P (operands[2]))
14952 {
14953 p = "%Z2\t%2";
14954 break;
14955 }
14956
14957 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14958 {
14959 #if SYSV386_COMPAT
14960 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
14961 derived assemblers, confusingly reverse the direction of
14962 the operation for fsub{r} and fdiv{r} when the
14963 destination register is not st(0). The Intel assembler
14964 doesn't have this brain damage. Read !SYSV386_COMPAT to
14965 figure out what the hardware really does. */
14966 if (STACK_TOP_P (operands[0]))
14967 p = "{p\t%0, %2|rp\t%2, %0}";
14968 else
14969 p = "{rp\t%2, %0|p\t%0, %2}";
14970 #else
14971 if (STACK_TOP_P (operands[0]))
14972 /* As above for fmul/fadd, we can't store to st(0). */
14973 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14974 else
14975 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14976 #endif
14977 break;
14978 }
14979
14980 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
14981 {
14982 #if SYSV386_COMPAT
14983 if (STACK_TOP_P (operands[0]))
14984 p = "{rp\t%0, %1|p\t%1, %0}";
14985 else
14986 p = "{p\t%1, %0|rp\t%0, %1}";
14987 #else
14988 if (STACK_TOP_P (operands[0]))
14989 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
14990 else
14991 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
14992 #endif
14993 break;
14994 }
14995
14996 if (STACK_TOP_P (operands[0]))
14997 {
14998 if (STACK_TOP_P (operands[1]))
14999 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15000 else
15001 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15002 break;
15003 }
15004 else if (STACK_TOP_P (operands[1]))
15005 {
15006 #if SYSV386_COMPAT
15007 p = "{\t%1, %0|r\t%0, %1}";
15008 #else
15009 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15010 #endif
15011 }
15012 else
15013 {
15014 #if SYSV386_COMPAT
15015 p = "{r\t%2, %0|\t%0, %2}";
15016 #else
15017 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15018 #endif
15019 }
15020 break;
15021
15022 default:
15023 gcc_unreachable ();
15024 }
15025
15026 strcat (buf, p);
15027 return buf;
15028 }
15029
15030 /* Return needed mode for entity in optimize_mode_switching pass. */
15031
15032 int
15033 ix86_mode_needed (int entity, rtx insn)
15034 {
15035 enum attr_i387_cw mode;
15036
15037 /* The mode UNINITIALIZED is used to store the control word after a
15038 function call or ASM pattern. The mode ANY specifies that the function
15039 has no requirements on the control word and makes no changes in the
15040 bits we are interested in. */
15041
15042 if (CALL_P (insn)
15043 || (NONJUMP_INSN_P (insn)
15044 && (asm_noperands (PATTERN (insn)) >= 0
15045 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15046 return I387_CW_UNINITIALIZED;
15047
15048 if (recog_memoized (insn) < 0)
15049 return I387_CW_ANY;
15050
15051 mode = get_attr_i387_cw (insn);
15052
15053 switch (entity)
15054 {
15055 case I387_TRUNC:
15056 if (mode == I387_CW_TRUNC)
15057 return mode;
15058 break;
15059
15060 case I387_FLOOR:
15061 if (mode == I387_CW_FLOOR)
15062 return mode;
15063 break;
15064
15065 case I387_CEIL:
15066 if (mode == I387_CW_CEIL)
15067 return mode;
15068 break;
15069
15070 case I387_MASK_PM:
15071 if (mode == I387_CW_MASK_PM)
15072 return mode;
15073 break;
15074
15075 default:
15076 gcc_unreachable ();
15077 }
15078
15079 return I387_CW_ANY;
15080 }
15081
15082 /* Output code to initialize control word copies used by trunc?f?i and
15083 rounding patterns. The current control word is saved, and a copy
15084 modified for rounding mode MODE is stored to a stack slot. */
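/* Illustrative note (an addition, not from the original sources): the
   x87 control word's rounding-control field occupies bits 11:10, so
   the constants ORed in below select

     0x0c00   RC = 11   round toward zero (trunc)
     0x0400   RC = 01   round down toward -inf (floor)
     0x0800   RC = 10   round up toward +inf (ceil)
     0x0020   PM bit    mask the precision exception (nearbyint) */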
15085
15086 void
15087 emit_i387_cw_initialization (int mode)
15088 {
15089 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15090 rtx new_mode;
15091
15092 enum ix86_stack_slot slot;
15093
15094 rtx reg = gen_reg_rtx (HImode);
15095
15096 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15097 emit_move_insn (reg, copy_rtx (stored_mode));
15098
15099 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15100 || optimize_function_for_size_p (cfun))
15101 {
15102 switch (mode)
15103 {
15104 case I387_CW_TRUNC:
15105 /* round toward zero (truncate) */
15106 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15107 slot = SLOT_CW_TRUNC;
15108 break;
15109
15110 case I387_CW_FLOOR:
15111 /* round down toward -oo */
15112 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15113 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15114 slot = SLOT_CW_FLOOR;
15115 break;
15116
15117 case I387_CW_CEIL:
15118 /* round up toward +oo */
15119 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15120 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15121 slot = SLOT_CW_CEIL;
15122 break;
15123
15124 case I387_CW_MASK_PM:
15125 /* mask precision exception for nearbyint() */
15126 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15127 slot = SLOT_CW_MASK_PM;
15128 break;
15129
15130 default:
15131 gcc_unreachable ();
15132 }
15133 }
15134 else
15135 {
15136 switch (mode)
15137 {
15138 case I387_CW_TRUNC:
15139 /* round toward zero (truncate) */
15140 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15141 slot = SLOT_CW_TRUNC;
15142 break;
15143
15144 case I387_CW_FLOOR:
15145 /* round down toward -oo */
15146 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15147 slot = SLOT_CW_FLOOR;
15148 break;
15149
15150 case I387_CW_CEIL:
15151 /* round up toward +oo */
15152 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15153 slot = SLOT_CW_CEIL;
15154 break;
15155
15156 case I387_CW_MASK_PM:
15157 /* mask precision exception for nearbyint() */
15158 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15159 slot = SLOT_CW_MASK_PM;
15160 break;
15161
15162 default:
15163 gcc_unreachable ();
15164 }
15165 }
15166
15167 gcc_assert (slot < MAX_386_STACK_LOCALS);
15168
15169 new_mode = assign_386_stack_local (HImode, slot);
15170 emit_move_insn (new_mode, reg);
15171 }
15172
15173 /* Output code for INSN to convert a float to a signed int. OPERANDS
15174 are the insn operands. The output may be [HSD]Imode and the input
15175 operand may be [SDX]Fmode. */
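/* Illustrative example (an addition, not from the original sources):
   for an SImode result without fisttp, the emitted sequence is roughly

     fldcw  %3        ; load the truncating control word
     fistpl %0        ; store the integer and pop (fistl if the stack
                      ;  top must survive)
     fldcw  %2        ; restore the original control word */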
15176
15177 const char *
15178 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15179 {
15180 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15181 int dimode_p = GET_MODE (operands[0]) == DImode;
15182 int round_mode = get_attr_i387_cw (insn);
15183
15184 /* Jump through a hoop or two for DImode, since the hardware has no
15185 non-popping instruction. We used to do this a different way, but
15186 that was somewhat fragile and broke with post-reload splitters. */
15187 if ((dimode_p || fisttp) && !stack_top_dies)
15188 output_asm_insn ("fld\t%y1", operands);
15189
15190 gcc_assert (STACK_TOP_P (operands[1]));
15191 gcc_assert (MEM_P (operands[0]));
15192 gcc_assert (GET_MODE (operands[1]) != TFmode);
15193
15194 if (fisttp)
15195 output_asm_insn ("fisttp%Z0\t%0", operands);
15196 else
15197 {
15198 if (round_mode != I387_CW_ANY)
15199 output_asm_insn ("fldcw\t%3", operands);
15200 if (stack_top_dies || dimode_p)
15201 output_asm_insn ("fistp%Z0\t%0", operands);
15202 else
15203 output_asm_insn ("fist%Z0\t%0", operands);
15204 if (round_mode != I387_CW_ANY)
15205 output_asm_insn ("fldcw\t%2", operands);
15206 }
15207
15208 return "";
15209 }
15210
15211 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15212 have the values zero or one, indicates the ffreep insn's operand
15213 from the OPERANDS array. */
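/* Illustrative note (an addition, not from the original sources): when
   the assembler cannot encode ffreep itself, the bytes are emitted
   directly; for %st(1) the template becomes ASM_SHORT "0xc1df", whose
   little-endian encoding df c1 is the machine code for
   "ffreep %st(1)". */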
15214
15215 static const char *
15216 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15217 {
15218 if (TARGET_USE_FFREEP)
15219 #ifdef HAVE_AS_IX86_FFREEP
15220 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15221 #else
15222 {
15223 static char retval[32];
15224 int regno = REGNO (operands[opno]);
15225
15226 gcc_assert (FP_REGNO_P (regno));
15227
15228 regno -= FIRST_STACK_REG;
15229
15230 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15231 return retval;
15232 }
15233 #endif
15234
15235 return opno ? "fstp\t%y1" : "fstp\t%y0";
15236 }
15237
15238
15239 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15240 should be used. UNORDERED_P is true when fucom should be used. */
15241
15242 const char *
15243 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15244 {
15245 int stack_top_dies;
15246 rtx cmp_op0, cmp_op1;
15247 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15248
15249 if (eflags_p)
15250 {
15251 cmp_op0 = operands[0];
15252 cmp_op1 = operands[1];
15253 }
15254 else
15255 {
15256 cmp_op0 = operands[1];
15257 cmp_op1 = operands[2];
15258 }
15259
15260 if (is_sse)
15261 {
15262 if (GET_MODE (operands[0]) == SFmode)
15263 if (unordered_p)
15264 return "%vucomiss\t{%1, %0|%0, %1}";
15265 else
15266 return "%vcomiss\t{%1, %0|%0, %1}";
15267 else
15268 if (unordered_p)
15269 return "%vucomisd\t{%1, %0|%0, %1}";
15270 else
15271 return "%vcomisd\t{%1, %0|%0, %1}";
15272 }
15273
15274 gcc_assert (STACK_TOP_P (cmp_op0));
15275
15276 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15277
15278 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15279 {
15280 if (stack_top_dies)
15281 {
15282 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15283 return output_387_ffreep (operands, 1);
15284 }
15285 else
15286 return "ftst\n\tfnstsw\t%0";
15287 }
15288
15289 if (STACK_REG_P (cmp_op1)
15290 && stack_top_dies
15291 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15292 && REGNO (cmp_op1) != FIRST_STACK_REG)
15293 {
15294 /* If the top of the 387 stack dies, and the other operand is also
15295 a stack register that dies, then this must be a `fcompp'
15296 float compare. */
15297
15298 if (eflags_p)
15299 {
15300 /* There is no double-popping fcomi variant. Fortunately,
15301 eflags is immune to the fstp's cc clobbering. */
15302 if (unordered_p)
15303 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15304 else
15305 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15306 return output_387_ffreep (operands, 0);
15307 }
15308 else
15309 {
15310 if (unordered_p)
15311 return "fucompp\n\tfnstsw\t%0";
15312 else
15313 return "fcompp\n\tfnstsw\t%0";
15314 }
15315 }
15316 else
15317 {
15318 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15319
15320 static const char * const alt[16] =
15321 {
15322 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15323 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15324 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15325 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15326
15327 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15328 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15329 NULL,
15330 NULL,
15331
15332 "fcomi\t{%y1, %0|%0, %y1}",
15333 "fcomip\t{%y1, %0|%0, %y1}",
15334 "fucomi\t{%y1, %0|%0, %y1}",
15335 "fucomip\t{%y1, %0|%0, %y1}",
15336
15337 NULL,
15338 NULL,
15339 NULL,
15340 NULL
15341 };
15342
15343 int mask;
15344 const char *ret;
15345
15346 mask = eflags_p << 3;
15347 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15348 mask |= unordered_p << 1;
15349 mask |= stack_top_dies;
15350
15351 gcc_assert (mask < 16);
15352 ret = alt[mask];
15353 gcc_assert (ret);
15354
15355 return ret;
15356 }
15357 }
15358
15359 void
15360 ix86_output_addr_vec_elt (FILE *file, int value)
15361 {
15362 const char *directive = ASM_LONG;
15363
15364 #ifdef ASM_QUAD
15365 if (TARGET_LP64)
15366 directive = ASM_QUAD;
15367 #else
15368 gcc_assert (!TARGET_64BIT);
15369 #endif
15370
15371 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15372 }
15373
15374 void
15375 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15376 {
15377 const char *directive = ASM_LONG;
15378
15379 #ifdef ASM_QUAD
15380 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15381 directive = ASM_QUAD;
15382 #else
15383 gcc_assert (!TARGET_64BIT);
15384 #endif
15385 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15386 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15387 fprintf (file, "%s%s%d-%s%d\n",
15388 directive, LPREFIX, value, LPREFIX, rel);
15389 else if (HAVE_AS_GOTOFF_IN_DATA)
15390 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15391 #if TARGET_MACHO
15392 else if (TARGET_MACHO)
15393 {
15394 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15395 machopic_output_function_base_name (file);
15396 putc ('\n', file);
15397 }
15398 #endif
15399 else
15400 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15401 GOT_SYMBOL_NAME, LPREFIX, value);
15402 }
15403 \f
15404 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15405 for the target. */
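/* Illustrative example (an addition, not from the original sources):
   clearing (reg:SI ax) emits either

     xorl %eax, %eax    ; with a (clobber (reg:CC flags)) in the pattern
     movl $0, %eax      ; only if TARGET_USE_MOV0 and optimizing for size

   HImode destinations are widened to SImode first to avoid the 66h
   operand-size prefix. */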
15406
15407 void
15408 ix86_expand_clear (rtx dest)
15409 {
15410 rtx tmp;
15411
15412 /* We play register width games, which are only valid after reload. */
15413 gcc_assert (reload_completed);
15414
15415 /* Avoid HImode and its attendant prefix byte. */
15416 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15417 dest = gen_rtx_REG (SImode, REGNO (dest));
15418 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15419
15420 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15421 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15422 {
15423 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15424 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15425 }
15426
15427 emit_insn (tmp);
15428 }
15429
15430 /* X is an unchanging MEM. If it is a constant pool reference, return
15431 the constant pool rtx, else NULL. */
15432
15433 rtx
15434 maybe_get_pool_constant (rtx x)
15435 {
15436 x = ix86_delegitimize_address (XEXP (x, 0));
15437
15438 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15439 return get_pool_constant (x);
15440
15441 return NULL_RTX;
15442 }
15443
15444 void
15445 ix86_expand_move (enum machine_mode mode, rtx operands[])
15446 {
15447 rtx op0, op1;
15448 enum tls_model model;
15449
15450 op0 = operands[0];
15451 op1 = operands[1];
15452
15453 if (GET_CODE (op1) == SYMBOL_REF)
15454 {
15455 model = SYMBOL_REF_TLS_MODEL (op1);
15456 if (model)
15457 {
15458 op1 = legitimize_tls_address (op1, model, true);
15459 op1 = force_operand (op1, op0);
15460 if (op1 == op0)
15461 return;
15462 if (GET_MODE (op1) != mode)
15463 op1 = convert_to_mode (mode, op1, 1);
15464 }
15465 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15466 && SYMBOL_REF_DLLIMPORT_P (op1))
15467 op1 = legitimize_dllimport_symbol (op1, false);
15468 }
15469 else if (GET_CODE (op1) == CONST
15470 && GET_CODE (XEXP (op1, 0)) == PLUS
15471 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15472 {
15473 rtx addend = XEXP (XEXP (op1, 0), 1);
15474 rtx symbol = XEXP (XEXP (op1, 0), 0);
15475 rtx tmp = NULL;
15476
15477 model = SYMBOL_REF_TLS_MODEL (symbol);
15478 if (model)
15479 tmp = legitimize_tls_address (symbol, model, true);
15480 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15481 && SYMBOL_REF_DLLIMPORT_P (symbol))
15482 tmp = legitimize_dllimport_symbol (symbol, true);
15483
15484 if (tmp)
15485 {
15486 tmp = force_operand (tmp, NULL);
15487 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15488 op0, 1, OPTAB_DIRECT);
15489 if (tmp == op0)
15490 return;
15491 if (GET_MODE (tmp) != mode)
15492 op1 = convert_to_mode (mode, tmp, 1);
15493 }
15494 }
15495
15496 if ((flag_pic || MACHOPIC_INDIRECT)
15497 && symbolic_operand (op1, mode))
15498 {
15499 if (TARGET_MACHO && !TARGET_64BIT)
15500 {
15501 #if TARGET_MACHO
15502 /* dynamic-no-pic */
15503 if (MACHOPIC_INDIRECT)
15504 {
15505 rtx temp = ((reload_in_progress
15506 || ((op0 && REG_P (op0))
15507 && mode == Pmode))
15508 ? op0 : gen_reg_rtx (Pmode));
15509 op1 = machopic_indirect_data_reference (op1, temp);
15510 if (MACHOPIC_PURE)
15511 op1 = machopic_legitimize_pic_address (op1, mode,
15512 temp == op1 ? 0 : temp);
15513 }
15514 if (op0 != op1 && GET_CODE (op0) != MEM)
15515 {
15516 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15517 emit_insn (insn);
15518 return;
15519 }
15520 if (GET_CODE (op0) == MEM)
15521 op1 = force_reg (Pmode, op1);
15522 else
15523 {
15524 rtx temp = op0;
15525 if (GET_CODE (temp) != REG)
15526 temp = gen_reg_rtx (Pmode);
15527 temp = legitimize_pic_address (op1, temp);
15528 if (temp == op0)
15529 return;
15530 op1 = temp;
15531 }
15532 /* dynamic-no-pic */
15533 #endif
15534 }
15535 else
15536 {
15537 if (MEM_P (op0))
15538 op1 = force_reg (mode, op1);
15539 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15540 {
15541 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15542 op1 = legitimize_pic_address (op1, reg);
15543 if (op0 == op1)
15544 return;
15545 if (GET_MODE (op1) != mode)
15546 op1 = convert_to_mode (mode, op1, 1);
15547 }
15548 }
15549 }
15550 else
15551 {
15552 if (MEM_P (op0)
15553 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15554 || !push_operand (op0, mode))
15555 && MEM_P (op1))
15556 op1 = force_reg (mode, op1);
15557
15558 if (push_operand (op0, mode)
15559 && ! general_no_elim_operand (op1, mode))
15560 op1 = copy_to_mode_reg (mode, op1);
15561
15562 /* Force large constants in 64bit compilation into a register
15563 to get them CSEed. */
15564 if (can_create_pseudo_p ()
15565 && (mode == DImode) && TARGET_64BIT
15566 && immediate_operand (op1, mode)
15567 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15568 && !register_operand (op0, mode)
15569 && optimize)
15570 op1 = copy_to_mode_reg (mode, op1);
15571
15572 if (can_create_pseudo_p ()
15573 && FLOAT_MODE_P (mode)
15574 && GET_CODE (op1) == CONST_DOUBLE)
15575 {
15576 /* If we are loading a floating point constant to a register,
15577 force the value to memory now, since we'll get better code
15578 out of the back end. */
15579
15580 op1 = validize_mem (force_const_mem (mode, op1));
15581 if (!register_operand (op0, mode))
15582 {
15583 rtx temp = gen_reg_rtx (mode);
15584 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15585 emit_move_insn (op0, temp);
15586 return;
15587 }
15588 }
15589 }
15590
15591 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15592 }
15593
15594 void
15595 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15596 {
15597 rtx op0 = operands[0], op1 = operands[1];
15598 unsigned int align = GET_MODE_ALIGNMENT (mode);
15599
15600 /* Force constants other than zero into memory. We do not know how
15601 the instructions used to build constants modify the upper 64 bits
15602 of the register; once we have that information we may be able
15603 to handle some of them more efficiently. */
15604 if (can_create_pseudo_p ()
15605 && register_operand (op0, mode)
15606 && (CONSTANT_P (op1)
15607 || (GET_CODE (op1) == SUBREG
15608 && CONSTANT_P (SUBREG_REG (op1))))
15609 && !standard_sse_constant_p (op1))
15610 op1 = validize_mem (force_const_mem (mode, op1));
15611
15612 /* We need to check memory alignment for SSE mode since an attribute
15613 can make operands unaligned. */
15614 if (can_create_pseudo_p ()
15615 && SSE_REG_MODE_P (mode)
15616 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15617 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15618 {
15619 rtx tmp[2];
15620
15621 /* ix86_expand_vector_move_misalign() does not like constants ... */
15622 if (CONSTANT_P (op1)
15623 || (GET_CODE (op1) == SUBREG
15624 && CONSTANT_P (SUBREG_REG (op1))))
15625 op1 = validize_mem (force_const_mem (mode, op1));
15626
15627 /* ... nor both arguments in memory. */
15628 if (!register_operand (op0, mode)
15629 && !register_operand (op1, mode))
15630 op1 = force_reg (mode, op1);
15631
15632 tmp[0] = op0; tmp[1] = op1;
15633 ix86_expand_vector_move_misalign (mode, tmp);
15634 return;
15635 }
15636
15637 /* Make operand1 a register if it isn't already. */
15638 if (can_create_pseudo_p ()
15639 && !register_operand (op0, mode)
15640 && !register_operand (op1, mode))
15641 {
15642 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15643 return;
15644 }
15645
15646 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15647 }
15648
15649 /* Split 32-byte AVX unaligned load and store if needed. */
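/* Illustrative example (an addition, not from the original sources):
   with -mavx256-split-unaligned-load an unaligned V8SF load is emitted
   roughly as

     vmovups     (mem), %xmm0
     vinsertf128 $1, 16(mem), %ymm0, %ymm0

   and with -mavx256-split-unaligned-store the store side uses two
   vextractf128 instructions to memory. */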
15650
15651 static void
15652 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15653 {
15654 rtx m;
15655 rtx (*extract) (rtx, rtx, rtx);
15656 rtx (*move_unaligned) (rtx, rtx);
15657 enum machine_mode mode;
15658
15659 switch (GET_MODE (op0))
15660 {
15661 default:
15662 gcc_unreachable ();
15663 case V32QImode:
15664 extract = gen_avx_vextractf128v32qi;
15665 move_unaligned = gen_avx_movdqu256;
15666 mode = V16QImode;
15667 break;
15668 case V8SFmode:
15669 extract = gen_avx_vextractf128v8sf;
15670 move_unaligned = gen_avx_movups256;
15671 mode = V4SFmode;
15672 break;
15673 case V4DFmode:
15674 extract = gen_avx_vextractf128v4df;
15675 move_unaligned = gen_avx_movupd256;
15676 mode = V2DFmode;
15677 break;
15678 }
15679
15680 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15681 {
15682 rtx r = gen_reg_rtx (mode);
15683 m = adjust_address (op1, mode, 0);
15684 emit_move_insn (r, m);
15685 m = adjust_address (op1, mode, 16);
15686 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15687 emit_move_insn (op0, r);
15688 }
15689 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15690 {
15691 m = adjust_address (op0, mode, 0);
15692 emit_insn (extract (m, op1, const0_rtx));
15693 m = adjust_address (op0, mode, 16);
15694 emit_insn (extract (m, op1, const1_rtx));
15695 }
15696 else
15697 emit_insn (move_unaligned (op0, op1));
15698 }
15699
15700 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15701 straight to ix86_expand_vector_move. */
15702 /* Code generation for scalar reg-reg moves of single and double precision data:
15703 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15704 movaps reg, reg
15705 else
15706 movss reg, reg
15707 if (x86_sse_partial_reg_dependency == true)
15708 movapd reg, reg
15709 else
15710 movsd reg, reg
15711
15712 Code generation for scalar loads of double precision data:
15713 if (x86_sse_split_regs == true)
15714 movlpd mem, reg (gas syntax)
15715 else
15716 movsd mem, reg
15717
15718 Code generation for unaligned packed loads of single precision data
15719 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15720 if (x86_sse_unaligned_move_optimal)
15721 movups mem, reg
15722
15723 if (x86_sse_partial_reg_dependency == true)
15724 {
15725 xorps reg, reg
15726 movlps mem, reg
15727 movhps mem+8, reg
15728 }
15729 else
15730 {
15731 movlps mem, reg
15732 movhps mem+8, reg
15733 }
15734
15735 Code generation for unaligned packed loads of double precision data
15736 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15737 if (x86_sse_unaligned_move_optimal)
15738 movupd mem, reg
15739
15740 if (x86_sse_split_regs == true)
15741 {
15742 movlpd mem, reg
15743 movhpd mem+8, reg
15744 }
15745 else
15746 {
15747 movsd mem, reg
15748 movhpd mem+8, reg
15749 }
15750 */
15751
15752 void
15753 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15754 {
15755 rtx op0, op1, m;
15756
15757 op0 = operands[0];
15758 op1 = operands[1];
15759
15760 if (TARGET_AVX)
15761 {
15762 switch (GET_MODE_CLASS (mode))
15763 {
15764 case MODE_VECTOR_INT:
15765 case MODE_INT:
15766 switch (GET_MODE_SIZE (mode))
15767 {
15768 case 16:
15769 /* If we're optimizing for size, movups is the smallest. */
15770 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15771 {
15772 op0 = gen_lowpart (V4SFmode, op0);
15773 op1 = gen_lowpart (V4SFmode, op1);
15774 emit_insn (gen_sse_movups (op0, op1));
15775 return;
15776 }
15777 op0 = gen_lowpart (V16QImode, op0);
15778 op1 = gen_lowpart (V16QImode, op1);
15779 emit_insn (gen_sse2_movdqu (op0, op1));
15780 break;
15781 case 32:
15782 op0 = gen_lowpart (V32QImode, op0);
15783 op1 = gen_lowpart (V32QImode, op1);
15784 ix86_avx256_split_vector_move_misalign (op0, op1);
15785 break;
15786 default:
15787 gcc_unreachable ();
15788 }
15789 break;
15790 case MODE_VECTOR_FLOAT:
15791 op0 = gen_lowpart (mode, op0);
15792 op1 = gen_lowpart (mode, op1);
15793
15794 switch (mode)
15795 {
15796 case V4SFmode:
15797 emit_insn (gen_sse_movups (op0, op1));
15798 break;
15799 case V8SFmode:
15800 ix86_avx256_split_vector_move_misalign (op0, op1);
15801 break;
15802 case V2DFmode:
15803 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15804 {
15805 op0 = gen_lowpart (V4SFmode, op0);
15806 op1 = gen_lowpart (V4SFmode, op1);
15807 emit_insn (gen_sse_movups (op0, op1));
15808 return;
15809 }
15810 emit_insn (gen_sse2_movupd (op0, op1));
15811 break;
15812 case V4DFmode:
15813 ix86_avx256_split_vector_move_misalign (op0, op1);
15814 break;
15815 default:
15816 gcc_unreachable ();
15817 }
15818 break;
15819
15820 default:
15821 gcc_unreachable ();
15822 }
15823
15824 return;
15825 }
15826
15827 if (MEM_P (op1))
15828 {
15829 /* If we're optimizing for size, movups is the smallest. */
15830 if (optimize_insn_for_size_p ()
15831 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15832 {
15833 op0 = gen_lowpart (V4SFmode, op0);
15834 op1 = gen_lowpart (V4SFmode, op1);
15835 emit_insn (gen_sse_movups (op0, op1));
15836 return;
15837 }
15838
15839 /* ??? If we have typed data, then it would appear that using
15840 movdqu is the only way to get unaligned data loaded with
15841 integer type. */
15842 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15843 {
15844 op0 = gen_lowpart (V16QImode, op0);
15845 op1 = gen_lowpart (V16QImode, op1);
15846 emit_insn (gen_sse2_movdqu (op0, op1));
15847 return;
15848 }
15849
15850 if (TARGET_SSE2 && mode == V2DFmode)
15851 {
15852 rtx zero;
15853
15854 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15855 {
15856 op0 = gen_lowpart (V2DFmode, op0);
15857 op1 = gen_lowpart (V2DFmode, op1);
15858 emit_insn (gen_sse2_movupd (op0, op1));
15859 return;
15860 }
15861
15862 /* When SSE registers are split into halves, we can avoid
15863 writing to the top half twice. */
15864 if (TARGET_SSE_SPLIT_REGS)
15865 {
15866 emit_clobber (op0);
15867 zero = op0;
15868 }
15869 else
15870 {
15871 /* ??? Not sure about the best option for the Intel chips.
15872 The following would seem to satisfy; the register is
15873 entirely cleared, breaking the dependency chain. We
15874 then store to the upper half, with a dependency depth
15875 of one. A rumor has it that Intel recommends two movsd
15876 followed by an unpacklpd, but this is unconfirmed. And
15877 given that the dependency depth of the unpacklpd would
15878 still be one, I'm not sure why this would be better. */
15879 zero = CONST0_RTX (V2DFmode);
15880 }
15881
15882 m = adjust_address (op1, DFmode, 0);
15883 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15884 m = adjust_address (op1, DFmode, 8);
15885 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15886 }
15887 else
15888 {
15889 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15890 {
15891 op0 = gen_lowpart (V4SFmode, op0);
15892 op1 = gen_lowpart (V4SFmode, op1);
15893 emit_insn (gen_sse_movups (op0, op1));
15894 return;
15895 }
15896
15897 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15898 emit_move_insn (op0, CONST0_RTX (mode));
15899 else
15900 emit_clobber (op0);
15901
15902 if (mode != V4SFmode)
15903 op0 = gen_lowpart (V4SFmode, op0);
15904 m = adjust_address (op1, V2SFmode, 0);
15905 emit_insn (gen_sse_loadlps (op0, op0, m));
15906 m = adjust_address (op1, V2SFmode, 8);
15907 emit_insn (gen_sse_loadhps (op0, op0, m));
15908 }
15909 }
15910 else if (MEM_P (op0))
15911 {
15912 /* If we're optimizing for size, movups is the smallest. */
15913 if (optimize_insn_for_size_p ()
15914 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15915 {
15916 op0 = gen_lowpart (V4SFmode, op0);
15917 op1 = gen_lowpart (V4SFmode, op1);
15918 emit_insn (gen_sse_movups (op0, op1));
15919 return;
15920 }
15921
15922 /* ??? Similar to above, only less clear because of quote
15923 typeless stores unquote. */
15924 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
15925 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15926 {
15927 op0 = gen_lowpart (V16QImode, op0);
15928 op1 = gen_lowpart (V16QImode, op1);
15929 emit_insn (gen_sse2_movdqu (op0, op1));
15930 return;
15931 }
15932
15933 if (TARGET_SSE2 && mode == V2DFmode)
15934 {
15935 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15936 {
15937 op0 = gen_lowpart (V2DFmode, op0);
15938 op1 = gen_lowpart (V2DFmode, op1);
15939 emit_insn (gen_sse2_movupd (op0, op1));
15940 }
15941 else
15942 {
15943 m = adjust_address (op0, DFmode, 0);
15944 emit_insn (gen_sse2_storelpd (m, op1));
15945 m = adjust_address (op0, DFmode, 8);
15946 emit_insn (gen_sse2_storehpd (m, op1));
15947 }
15948 }
15949 else
15950 {
15951 if (mode != V4SFmode)
15952 op1 = gen_lowpart (V4SFmode, op1);
15953
15954 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15955 {
15956 op0 = gen_lowpart (V4SFmode, op0);
15957 emit_insn (gen_sse_movups (op0, op1));
15958 }
15959 else
15960 {
15961 m = adjust_address (op0, V2SFmode, 0);
15962 emit_insn (gen_sse_storelps (m, op1));
15963 m = adjust_address (op0, V2SFmode, 8);
15964 emit_insn (gen_sse_storehps (m, op1));
15965 }
15966 }
15967 }
15968 else
15969 gcc_unreachable ();
15970 }
15971
15972 /* Expand a push in MODE. This is some mode for which we do not support
15973 proper push instructions, at least from the registers that we expect
15974 the value to live in. */
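/* Illustrative expansion (an addition, not from the original sources):
   pushing a 16-byte vector value effectively becomes

     sub  $16, %rsp          ; adjust the stack pointer by the mode size
     mov  value, (%rsp)      ; store through a MEM at the new top

   rather than a real push instruction. */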
15975
15976 void
15977 ix86_expand_push (enum machine_mode mode, rtx x)
15978 {
15979 rtx tmp;
15980
15981 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
15982 GEN_INT (-GET_MODE_SIZE (mode)),
15983 stack_pointer_rtx, 1, OPTAB_DIRECT);
15984 if (tmp != stack_pointer_rtx)
15985 emit_move_insn (stack_pointer_rtx, tmp);
15986
15987 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
15988
15989 /* When we push an operand onto the stack, it has to be aligned at least
15990 at the function argument boundary. However, since we don't have
15991 the argument type, we can't determine the actual argument
15992 boundary. */
15993 emit_move_insn (tmp, x);
15994 }
15995
15996 /* Helper function of ix86_fixup_binary_operands to canonicalize
15997 operand order. Returns true if the operands should be swapped. */
15998
15999 static bool
16000 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16001 rtx operands[])
16002 {
16003 rtx dst = operands[0];
16004 rtx src1 = operands[1];
16005 rtx src2 = operands[2];
16006
16007 /* If the operation is not commutative, we can't do anything. */
16008 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16009 return false;
16010
16011 /* Highest priority is that src1 should match dst. */
16012 if (rtx_equal_p (dst, src1))
16013 return false;
16014 if (rtx_equal_p (dst, src2))
16015 return true;
16016
16017 /* Next highest priority is that immediate constants come second. */
16018 if (immediate_operand (src2, mode))
16019 return false;
16020 if (immediate_operand (src1, mode))
16021 return true;
16022
16023 /* Lowest priority is that memory references should come second. */
16024 if (MEM_P (src2))
16025 return false;
16026 if (MEM_P (src1))
16027 return true;
16028
16029 return false;
16030 }
16031
16032
16033 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16034 destination to use for the operation. If different from the true
16035 destination in operands[0], a copy operation will be required. */
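/* Illustrative example (an addition, not from the original sources):
   for "mem1 = mem2 + mem3" the fixups below load mem3 (and then mem2)
   into registers and redirect the destination to a fresh pseudo, so
   the caller ends up emitting "tmp = reg2 + reg3" followed by a copy
   of tmp into mem1. */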
16036
16037 rtx
16038 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16039 rtx operands[])
16040 {
16041 rtx dst = operands[0];
16042 rtx src1 = operands[1];
16043 rtx src2 = operands[2];
16044
16045 /* Canonicalize operand order. */
16046 if (ix86_swap_binary_operands_p (code, mode, operands))
16047 {
16048 rtx temp;
16049
16050 /* It is invalid to swap operands of different modes. */
16051 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16052
16053 temp = src1;
16054 src1 = src2;
16055 src2 = temp;
16056 }
16057
16058 /* Both source operands cannot be in memory. */
16059 if (MEM_P (src1) && MEM_P (src2))
16060 {
16061 /* Optimization: Only read from memory once. */
16062 if (rtx_equal_p (src1, src2))
16063 {
16064 src2 = force_reg (mode, src2);
16065 src1 = src2;
16066 }
16067 else
16068 src2 = force_reg (mode, src2);
16069 }
16070
16071 /* If the destination is memory, and we do not have matching source
16072 operands, do things in registers. */
16073 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16074 dst = gen_reg_rtx (mode);
16075
16076 /* Source 1 cannot be a constant. */
16077 if (CONSTANT_P (src1))
16078 src1 = force_reg (mode, src1);
16079
16080 /* Source 1 cannot be a non-matching memory. */
16081 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16082 src1 = force_reg (mode, src1);
16083
16084 /* Improve address combine. */
16085 if (code == PLUS
16086 && GET_MODE_CLASS (mode) == MODE_INT
16087 && MEM_P (src2))
16088 src2 = force_reg (mode, src2);
16089
16090 operands[1] = src1;
16091 operands[2] = src2;
16092 return dst;
16093 }
16094
16095 /* Similarly, but assume that the destination has already been
16096 set up properly. */
16097
16098 void
16099 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16100 enum machine_mode mode, rtx operands[])
16101 {
16102 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16103 gcc_assert (dst == operands[0]);
16104 }
16105
16106 /* Attempt to expand a binary operator. Make the expansion closer to the
16107 actual machine than just general_operand, which would allow 3 separate
16108 memory references (one output, two inputs) in a single insn. */
16109
16110 void
16111 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16112 rtx operands[])
16113 {
16114 rtx src1, src2, dst, op, clob;
16115
16116 dst = ix86_fixup_binary_operands (code, mode, operands);
16117 src1 = operands[1];
16118 src2 = operands[2];
16119
16120 /* Emit the instruction. */
16121
16122 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16123 if (reload_in_progress)
16124 {
16125 /* Reload doesn't know about the flags register, and doesn't know that
16126 it doesn't want to clobber it. We can only do this with PLUS. */
16127 gcc_assert (code == PLUS);
16128 emit_insn (op);
16129 }
16130 else if (reload_completed
16131 && code == PLUS
16132 && !rtx_equal_p (dst, src1))
16133 {
16134 /* This is going to be an LEA; avoid splitting it later. */
16135 emit_insn (op);
16136 }
16137 else
16138 {
16139 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16140 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16141 }
16142
16143 /* Fix up the destination if needed. */
16144 if (dst != operands[0])
16145 emit_move_insn (operands[0], dst);
16146 }
16147
16148 /* Return TRUE or FALSE depending on whether the binary operator meets the
16149 appropriate constraints. */
16150
16151 bool
16152 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16153 rtx operands[3])
16154 {
16155 rtx dst = operands[0];
16156 rtx src1 = operands[1];
16157 rtx src2 = operands[2];
16158
16159 /* Both source operands cannot be in memory. */
16160 if (MEM_P (src1) && MEM_P (src2))
16161 return false;
16162
16163 /* Canonicalize operand order for commutative operators. */
16164 if (ix86_swap_binary_operands_p (code, mode, operands))
16165 {
16166 rtx temp = src1;
16167 src1 = src2;
16168 src2 = temp;
16169 }
16170
16171 /* If the destination is memory, we must have a matching source operand. */
16172 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16173 return false;
16174
16175 /* Source 1 cannot be a constant. */
16176 if (CONSTANT_P (src1))
16177 return false;
16178
16179 /* Source 1 cannot be a non-matching memory. */
16180 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16181 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16182 return (code == AND
16183 && (mode == HImode
16184 || mode == SImode
16185 || (TARGET_64BIT && mode == DImode))
16186 && satisfies_constraint_L (src2));
16187
16188 return true;
16189 }
16190
16191 /* Attempt to expand a unary operator. Make the expansion closer to the
16192 actual machine than just general_operand, which would allow 2 separate
16193 memory references (one output, one input) in a single insn. */
16194
16195 void
16196 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16197 rtx operands[])
16198 {
16199 int matching_memory;
16200 rtx src, dst, op, clob;
16201
16202 dst = operands[0];
16203 src = operands[1];
16204
16205 /* If the destination is memory, and we do not have matching source
16206 operands, do things in registers. */
16207 matching_memory = 0;
16208 if (MEM_P (dst))
16209 {
16210 if (rtx_equal_p (dst, src))
16211 matching_memory = 1;
16212 else
16213 dst = gen_reg_rtx (mode);
16214 }
16215
16216 /* When source operand is memory, destination must match. */
16217 if (MEM_P (src) && !matching_memory)
16218 src = force_reg (mode, src);
16219
16220 /* Emit the instruction. */
16221
16222 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16223 if (reload_in_progress || code == NOT)
16224 {
16225 /* Reload doesn't know about the flags register, and doesn't know that
16226 it doesn't want to clobber it. */
16227 gcc_assert (code == NOT);
16228 emit_insn (op);
16229 }
16230 else
16231 {
16232 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16233 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16234 }
16235
16236 /* Fix up the destination if needed. */
16237 if (dst != operands[0])
16238 emit_move_insn (operands[0], dst);
16239 }
16240
16241 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16242 divisor are within the range [0-255]. */
16243
16244 void
16245 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16246 bool signed_p)
16247 {
16248 rtx end_label, qimode_label;
16249 rtx insn, div, mod;
16250 rtx scratch, tmp0, tmp1, tmp2;
16251 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16252 rtx (*gen_zero_extend) (rtx, rtx);
16253 rtx (*gen_test_ccno_1) (rtx, rtx);
16254
16255 switch (mode)
16256 {
16257 case SImode:
16258 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16259 gen_test_ccno_1 = gen_testsi_ccno_1;
16260 gen_zero_extend = gen_zero_extendqisi2;
16261 break;
16262 case DImode:
16263 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16264 gen_test_ccno_1 = gen_testdi_ccno_1;
16265 gen_zero_extend = gen_zero_extendqidi2;
16266 break;
16267 default:
16268 gcc_unreachable ();
16269 }
16270
16271 end_label = gen_label_rtx ();
16272 qimode_label = gen_label_rtx ();
16273
16274 scratch = gen_reg_rtx (mode);
16275
16276 /* Use 8bit unsigned divmod if dividend and divisor are within
16277 the range [0-255]. */
16278 emit_move_insn (scratch, operands[2]);
16279 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16280 scratch, 1, OPTAB_DIRECT);
16281 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16282 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16283 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16284 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16285 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16286 pc_rtx);
16287 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16288 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16289 JUMP_LABEL (insn) = qimode_label;
16290
16291 /* Generate original signed/unsigned divmod. */
16292 div = gen_divmod4_1 (operands[0], operands[1],
16293 operands[2], operands[3]);
16294 emit_insn (div);
16295
16296 /* Branch to the end. */
16297 emit_jump_insn (gen_jump (end_label));
16298 emit_barrier ();
16299
16300 /* Generate 8bit unsigned divide. */
16301 emit_label (qimode_label);
16302 /* Don't use operands[0] for result of 8bit divide since not all
16303 registers support QImode ZERO_EXTRACT. */
16304 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16305 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16306 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16307 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16308
16309 if (signed_p)
16310 {
16311 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16312 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16313 }
16314 else
16315 {
16316 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16317 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16318 }
16319
16320 /* Extract remainder from AH. */
16321 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16322 if (REG_P (operands[1]))
16323 insn = emit_move_insn (operands[1], tmp1);
16324 else
16325 {
16326 /* Need a new scratch register since the old one has result
16327 of 8bit divide. */
16328 scratch = gen_reg_rtx (mode);
16329 emit_move_insn (scratch, tmp1);
16330 insn = emit_move_insn (operands[1], scratch);
16331 }
16332 set_unique_reg_note (insn, REG_EQUAL, mod);
16333
16334 /* Zero extend quotient from AL. */
16335 tmp1 = gen_lowpart (QImode, tmp0);
16336 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16337 set_unique_reg_note (insn, REG_EQUAL, div);
16338
16339 emit_label (end_label);
16340 }
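
/* A rough C-level sketch of what the expansion above computes for the
   unsigned SImode case (illustrative only; the function emits RTL, not C):

     if ((((unsigned) op2 | (unsigned) op3) & ~0xffU) == 0)
       {
         op0 = (unsigned char) op2 / (unsigned char) op3;   one divb
         op1 = (unsigned char) op2 % (unsigned char) op3;   gives both
       }
     else
       {
         op0 = op2 / op3;
         op1 = op2 % op3;
       }

   where op0/op1 are the quotient/remainder outputs and op2/op3 the
   dividend/divisor inputs, matching the operands[] order above. */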
16341
16342 #define LEA_MAX_STALL (3)
16343 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16344
16345 /* Increase given DISTANCE in half-cycles according to
16346 dependencies between PREV and NEXT instructions.
16347 Add 1 half-cycle if there is no dependency and
16348 go to the next cycle if there is some dependency. */
16349
16350 static unsigned int
16351 increase_distance (rtx prev, rtx next, unsigned int distance)
16352 {
16353 df_ref *use_rec;
16354 df_ref *def_rec;
16355
16356 if (!prev || !next)
16357 return distance + (distance & 1) + 2;
16358
16359 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16360 return distance + 1;
16361
16362 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16363 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16364 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16365 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16366 return distance + (distance & 1) + 2;
16367
16368 return distance + 1;
16369 }
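
/* For example, with DISTANCE == 3: if either insn is missing or NEXT
   uses a register defined by PREV, the result is 3 + (3 & 1) + 2 == 6,
   i.e. DISTANCE is rounded up to an even number of half-cycles and then
   advanced by one full cycle; without such a dependency the result is
   simply 4 (one extra half-cycle). */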
16370
16371 /* Return true if instruction INSN defines register number
16372 REGNO1 or REGNO2. */
16373
16374 static bool
16375 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16376 rtx insn)
16377 {
16378 df_ref *def_rec;
16379
16380 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16381 if (DF_REF_REG_DEF_P (*def_rec)
16382 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16383 && (regno1 == DF_REF_REGNO (*def_rec)
16384 || regno2 == DF_REF_REGNO (*def_rec)))
16385 {
16386 return true;
16387 }
16388
16389 return false;
16390 }
16391
16392 /* Return true if instruction INSN uses register number
16393 REGNO as a part of an address expression. */
16394
16395 static bool
16396 insn_uses_reg_mem (unsigned int regno, rtx insn)
16397 {
16398 df_ref *use_rec;
16399
16400 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16401 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16402 return true;
16403
16404 return false;
16405 }
16406
16407 /* Search backward for non-agu definition of register number REGNO1
16408 or register number REGNO2 in basic block starting from instruction
16409 START up to head of basic block or instruction INSN.
16410
16411 Sets *FOUND to true if a definition was found
16412 and to false otherwise.
16413
16414 Distance in half-cycles between START and found instruction or head
16415 of BB is added to DISTANCE and returned. */
16416
16417 static int
16418 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16419 rtx insn, int distance,
16420 rtx start, bool *found)
16421 {
16422 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16423 rtx prev = start;
16424 rtx next = NULL;
16425
16426 *found = false;
16427
16428 while (prev
16429 && prev != insn
16430 && distance < LEA_SEARCH_THRESHOLD)
16431 {
16432 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16433 {
16434 distance = increase_distance (prev, next, distance);
16435 if (insn_defines_reg (regno1, regno2, prev))
16436 {
16437 if (recog_memoized (prev) < 0
16438 || get_attr_type (prev) != TYPE_LEA)
16439 {
16440 *found = true;
16441 return distance;
16442 }
16443 }
16444
16445 next = prev;
16446 }
16447 if (prev == BB_HEAD (bb))
16448 break;
16449
16450 prev = PREV_INSN (prev);
16451 }
16452
16453 return distance;
16454 }
16455
16456 /* Search backward for non-agu definition of register number REGNO1
16457 or register number REGNO2 in INSN's basic block until
16458 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16459 2. Reach neighbour BBs boundary, or
16460 3. Reach agu definition.
16461 Returns the distance between the non-agu definition point and INSN.
16462 If no definition point, returns -1. */
16463
16464 static int
16465 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16466 rtx insn)
16467 {
16468 basic_block bb = BLOCK_FOR_INSN (insn);
16469 int distance = 0;
16470 bool found = false;
16471
16472 if (insn != BB_HEAD (bb))
16473 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16474 distance, PREV_INSN (insn),
16475 &found);
16476
16477 if (!found && distance < LEA_SEARCH_THRESHOLD)
16478 {
16479 edge e;
16480 edge_iterator ei;
16481 bool simple_loop = false;
16482
16483 FOR_EACH_EDGE (e, ei, bb->preds)
16484 if (e->src == bb)
16485 {
16486 simple_loop = true;
16487 break;
16488 }
16489
16490 if (simple_loop)
16491 distance = distance_non_agu_define_in_bb (regno1, regno2,
16492 insn, distance,
16493 BB_END (bb), &found);
16494 else
16495 {
16496 int shortest_dist = -1;
16497 bool found_in_bb = false;
16498
16499 FOR_EACH_EDGE (e, ei, bb->preds)
16500 {
16501 int bb_dist
16502 = distance_non_agu_define_in_bb (regno1, regno2,
16503 insn, distance,
16504 BB_END (e->src),
16505 &found_in_bb);
16506 if (found_in_bb)
16507 {
16508 if (shortest_dist < 0)
16509 shortest_dist = bb_dist;
16510 else if (bb_dist > 0)
16511 shortest_dist = MIN (bb_dist, shortest_dist);
16512
16513 found = true;
16514 }
16515 }
16516
16517 distance = shortest_dist;
16518 }
16519 }
16520
16521 /* get_attr_type may modify recog data. We want to make sure
16522 that recog data is valid for instruction INSN, on which
16523 distance_non_agu_define is called. INSN is unchanged here. */
16524 extract_insn_cached (insn);
16525
16526 if (!found)
16527 return -1;
16528
16529 return distance >> 1;
16530 }
16531
16532 /* Return the distance in half-cycles between INSN and the next
16533 insn that uses register number REGNO in memory address added
16534 to DISTANCE. Return -1 if REGNO is set.
16535
16536 Sets *FOUND to true if a register usage was found and
16537 to false otherwise.
16538 Sets *REDEFINED to true if a register redefinition was
16539 found and to false otherwise. */
16540
16541 static int
16542 distance_agu_use_in_bb (unsigned int regno,
16543 rtx insn, int distance, rtx start,
16544 bool *found, bool *redefined)
16545 {
16546 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16547 rtx next = start;
16548 rtx prev = NULL;
16549
16550 *found = false;
16551 *redefined = false;
16552
16553 while (next
16554 && next != insn
16555 && distance < LEA_SEARCH_THRESHOLD)
16556 {
16557 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16558 {
16559 distance = increase_distance(prev, next, distance);
16560 if (insn_uses_reg_mem (regno, next))
16561 {
16562 /* Return DISTANCE if OP0 is used in memory
16563 address in NEXT. */
16564 *found = true;
16565 return distance;
16566 }
16567
16568 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16569 {
16570 /* Return -1 if OP0 is set in NEXT. */
16571 *redefined = true;
16572 return -1;
16573 }
16574
16575 prev = next;
16576 }
16577
16578 if (next == BB_END (bb))
16579 break;
16580
16581 next = NEXT_INSN (next);
16582 }
16583
16584 return distance;
16585 }
16586
16587 /* Return the distance between INSN and the next insn that uses
16588 register number REGNO0 in a memory address. Return -1 if no
16589 such use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
16590
16591 static int
16592 distance_agu_use (unsigned int regno0, rtx insn)
16593 {
16594 basic_block bb = BLOCK_FOR_INSN (insn);
16595 int distance = 0;
16596 bool found = false;
16597 bool redefined = false;
16598
16599 if (insn != BB_END (bb))
16600 distance = distance_agu_use_in_bb (regno0, insn, distance,
16601 NEXT_INSN (insn),
16602 &found, &redefined);
16603
16604 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16605 {
16606 edge e;
16607 edge_iterator ei;
16608 bool simple_loop = false;
16609
16610 FOR_EACH_EDGE (e, ei, bb->succs)
16611 if (e->dest == bb)
16612 {
16613 simple_loop = true;
16614 break;
16615 }
16616
16617 if (simple_loop)
16618 distance = distance_agu_use_in_bb (regno0, insn,
16619 distance, BB_HEAD (bb),
16620 &found, &redefined);
16621 else
16622 {
16623 int shortest_dist = -1;
16624 bool found_in_bb = false;
16625 bool redefined_in_bb = false;
16626
16627 FOR_EACH_EDGE (e, ei, bb->succs)
16628 {
16629 int bb_dist
16630 = distance_agu_use_in_bb (regno0, insn,
16631 distance, BB_HEAD (e->dest),
16632 &found_in_bb, &redefined_in_bb);
16633 if (found_in_bb)
16634 {
16635 if (shortest_dist < 0)
16636 shortest_dist = bb_dist;
16637 else if (bb_dist > 0)
16638 shortest_dist = MIN (bb_dist, shortest_dist);
16639
16640 found = true;
16641 }
16642 }
16643
16644 distance = shortest_dist;
16645 }
16646 }
16647
16648 if (!found || redefined)
16649 return -1;
16650
16651 return distance >> 1;
16652 }
16653
16654 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16655 there is a dilemma of choosing LEA or ADD.
16656 Negative value: ADD is preferred over LEA
16657 Zero: Neutral
16658 Positive value: LEA is preferred over ADD  */
16659 #define IX86_LEA_PRIORITY 0
16660
16661 /* Return true if using the lea INSN has a performance advantage
16662 over a sequence of instructions. The instruction sequence has
16663 SPLIT_COST cycles higher latency than the lea latency. */
16664
16665 bool
16666 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16667 unsigned int regno2, unsigned int split_cost)
16668 {
16669 int dist_define, dist_use;
16670
16671 dist_define = distance_non_agu_define (regno1, regno2, insn);
16672 dist_use = distance_agu_use (regno0, insn);
16673
16674 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16675 {
16676 /* If there is no non-AGU operand definition, no AGU
16677 operand usage, and the split cost is 0, then both the lea
16678 and non-lea variants have the same priority. Currently
16679 we prefer lea for 64-bit code and non-lea for 32-bit
16680 code. */
16681 if (dist_use < 0 && split_cost == 0)
16682 return TARGET_64BIT || IX86_LEA_PRIORITY;
16683 else
16684 return true;
16685 }
16686
16687 /* With a longer definition distance, lea is preferable.
16688 Here we adjust it to take into account the splitting cost and
16689 the lea priority. */
16690 dist_define += split_cost + IX86_LEA_PRIORITY;
16691
16692 /* If there is no use in a memory address then we just check
16693 that the split cost does not exceed the AGU stall. */
16694 if (dist_use < 0)
16695 return dist_define >= LEA_MAX_STALL;
16696
16697 /* If this insn has both backward non-agu dependence and forward
16698 agu dependence, the one with the shorter distance takes effect. */
16699 return dist_define >= dist_use;
16700 }
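
/* A worked example of the decision above, with IX86_LEA_PRIORITY == 0:
   suppose distance_non_agu_define returns 1, distance_agu_use returns 3
   and SPLIT_COST is 1. Then dist_define becomes 1 + 1 + 0 == 2, which is
   smaller than dist_use == 3, so the function returns false and the split
   add sequence is preferred. Had dist_define been negative (no nearby
   definition) or >= LEA_MAX_STALL, the function would have returned true
   and the lea would be kept. */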
16701
16702 /* Return true if it is legal to clobber flags by INSN and
16703 false otherwise. */
16704
16705 static bool
16706 ix86_ok_to_clobber_flags (rtx insn)
16707 {
16708 basic_block bb = BLOCK_FOR_INSN (insn);
16709 df_ref *use;
16710 bitmap live;
16711
16712 while (insn)
16713 {
16714 if (NONDEBUG_INSN_P (insn))
16715 {
16716 for (use = DF_INSN_USES (insn); *use; use++)
16717 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16718 return false;
16719
16720 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16721 return true;
16722 }
16723
16724 if (insn == BB_END (bb))
16725 break;
16726
16727 insn = NEXT_INSN (insn);
16728 }
16729
16730 live = df_get_live_out(bb);
16731 return !REGNO_REG_SET_P (live, FLAGS_REG);
16732 }
16733
16734 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16735 move and add to avoid AGU stalls. */
16736
16737 bool
16738 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16739 {
16740 unsigned int regno0 = true_regnum (operands[0]);
16741 unsigned int regno1 = true_regnum (operands[1]);
16742 unsigned int regno2 = true_regnum (operands[2]);
16743
16744 /* Check if we need to optimize. */
16745 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16746 return false;
16747
16748 /* Check it is correct to split here. */
16749 if (!ix86_ok_to_clobber_flags(insn))
16750 return false;
16751
16752 /* We need to split only adds with a non-destructive
16753 destination operand. */
16754 if (regno0 == regno1 || regno0 == regno2)
16755 return false;
16756 else
16757 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16758 }
16759
16760 /* Return true if we should emit lea instruction instead of mov
16761 instruction. */
16762
16763 bool
16764 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16765 {
16766 unsigned int regno0;
16767 unsigned int regno1;
16768
16769 /* Check if we need to optimize. */
16770 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16771 return false;
16772
16773 /* Use lea for reg to reg moves only. */
16774 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16775 return false;
16776
16777 regno0 = true_regnum (operands[0]);
16778 regno1 = true_regnum (operands[1]);
16779
16780 return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
16781 }
16782
16783 /* Return true if we need to split lea into a sequence of
16784 instructions to avoid AGU stalls. */
16785
16786 bool
16787 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16788 {
16789 unsigned int regno0 = true_regnum (operands[0]) ;
16790 unsigned int regno1 = -1;
16791 unsigned int regno2 = -1;
16792 unsigned int split_cost = 0;
16793 struct ix86_address parts;
16794 int ok;
16795
16796 /* Check if we need to optimize. */
16797 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16798 return false;
16799
16800 /* Check it is correct to split here. */
16801 if (!ix86_ok_to_clobber_flags(insn))
16802 return false;
16803
16804 ok = ix86_decompose_address (operands[1], &parts);
16805 gcc_assert (ok);
16806
16807 /* We should not split into an add if a non-legitimate pic
16808 operand is used as the displacement. */
16809 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16810 return false;
16811
16812 if (parts.base)
16813 regno1 = true_regnum (parts.base);
16814 if (parts.index)
16815 regno2 = true_regnum (parts.index);
16816
16817 /* Compute how many cycles we will add to execution time
16818 if we split the lea into a sequence of instructions. */
16819 if (parts.base || parts.index)
16820 {
16821 /* Have to use a mov instruction if the non-destructive
16822 destination form is used. */
16823 if (regno1 != regno0 && regno2 != regno0)
16824 split_cost += 1;
16825
16826 /* Have to add index to base if both exist. */
16827 if (parts.base && parts.index)
16828 split_cost += 1;
16829
16830 /* Have to use shift and adds if scale is 2 or greater. */
16831 if (parts.scale > 1)
16832 {
16833 if (regno0 != regno1)
16834 split_cost += 1;
16835 else if (regno2 == regno0)
16836 split_cost += 4;
16837 else
16838 split_cost += parts.scale;
16839 }
16840
16841 /* Have to use an add instruction with an immediate if
16842 disp is non-zero. */
16843 if (parts.disp && parts.disp != const0_rtx)
16844 split_cost += 1;
16845
16846 /* Subtract the price of lea. */
16847 split_cost -= 1;
16848 }
16849
16850 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16851 }
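
/* For example, for dest = base + index + disp (scale 1) with dest distinct
   from both base and index, the split needs a mov (+1), an add of the index
   (+1) and an add of the displacement (+1); subtracting the lea itself (-1)
   gives split_cost == 2, which ix86_lea_outperforms then weighs against the
   measured AGU distances. */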
16852
16853 /* Emit the x86 binary operator CODE in mode MODE, where the first operand
16854 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
16855
16856 static void
16857 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16858 rtx dst, rtx src)
16859 {
16860 rtx op, clob;
16861
16862 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16863 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16864
16865 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16866 }
16867
16868 /* Split lea instructions into a sequence of instructions
16869 which are executed on the ALU to avoid AGU stalls.
16870 It is assumed that it is allowed to clobber the flags register
16871 at the lea position. */
16872
16873 extern void
16874 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16875 {
16876 unsigned int regno0 = true_regnum (operands[0]) ;
16877 unsigned int regno1 = INVALID_REGNUM;
16878 unsigned int regno2 = INVALID_REGNUM;
16879 struct ix86_address parts;
16880 rtx tmp;
16881 int ok, adds;
16882
16883 ok = ix86_decompose_address (operands[1], &parts);
16884 gcc_assert (ok);
16885
16886 if (parts.base)
16887 {
16888 if (GET_MODE (parts.base) != mode)
16889 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16890 regno1 = true_regnum (parts.base);
16891 }
16892
16893 if (parts.index)
16894 {
16895 if (GET_MODE (parts.index) != mode)
16896 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16897 regno2 = true_regnum (parts.index);
16898 }
16899
16900 if (parts.scale > 1)
16901 {
16902 /* Case r1 = r1 + ... */
16903 if (regno1 == regno0)
16904 {
16905 /* If we have a case r1 = r1 + C * r1 then we
16906 should use multiplication which is very
16907 expensive. Assume the cost model is wrong if we
16908 have such a case here. */
16909 gcc_assert (regno2 != regno0);
16910
16911 for (adds = parts.scale; adds > 0; adds--)
16912 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16913 }
16914 else
16915 {
16916 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
16917 if (regno0 != regno2)
16918 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16919
16920 /* Use shift for scaling. */
16921 ix86_emit_binop (ASHIFT, mode, operands[0],
16922 GEN_INT (exact_log2 (parts.scale)));
16923
16924 if (parts.base)
16925 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
16926
16927 if (parts.disp && parts.disp != const0_rtx)
16928 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16929 }
16930 }
16931 else if (!parts.base && !parts.index)
16932 {
16933 gcc_assert(parts.disp);
16934 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
16935 }
16936 else
16937 {
16938 if (!parts.base)
16939 {
16940 if (regno0 != regno2)
16941 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16942 }
16943 else if (!parts.index)
16944 {
16945 if (regno0 != regno1)
16946 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16947 }
16948 else
16949 {
16950 if (regno0 == regno1)
16951 tmp = parts.index;
16952 else if (regno0 == regno2)
16953 tmp = parts.base;
16954 else
16955 {
16956 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16957 tmp = parts.index;
16958 }
16959
16960 ix86_emit_binop (PLUS, mode, operands[0], tmp);
16961 }
16962
16963 if (parts.disp && parts.disp != const0_rtx)
16964 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16965 }
16966 }
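
/* As an illustration, an address of the form base + index * 4 + disp,
   with the destination distinct from both base and index, is split by
   the code above into roughly the following sequence (C-like pseudocode;
   the first line is a plain move, the rest go through ix86_emit_binop
   and clobber the flags):

     dest = index;
     dest <<= 2;        ASHIFT by exact_log2 (4)
     dest += base;
     dest += disp;
*/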
16967
16968 /* Return true if it is ok to optimize an ADD operation to LEA
16969 operation to avoid flag register consumption. For most processors,
16970 ADD is faster than LEA. For processors like ATOM, if the
16971 destination register of LEA holds an actual address which will be
16972 used soon, LEA is better; otherwise ADD is better. */
16973
16974 bool
16975 ix86_lea_for_add_ok (rtx insn, rtx operands[])
16976 {
16977 unsigned int regno0 = true_regnum (operands[0]);
16978 unsigned int regno1 = true_regnum (operands[1]);
16979 unsigned int regno2 = true_regnum (operands[2]);
16980
16981 /* If a = b + c, (a!=b && a!=c), we must use the lea form. */
16982 if (regno0 != regno1 && regno0 != regno2)
16983 return true;
16984
16985 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16986 return false;
16987
16988 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
16989 }
16990
16991 /* Return true if destination reg of SET_BODY is shift count of
16992 USE_BODY. */
16993
16994 static bool
16995 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
16996 {
16997 rtx set_dest;
16998 rtx shift_rtx;
16999 int i;
17000
17001 /* Retrieve destination of SET_BODY. */
17002 switch (GET_CODE (set_body))
17003 {
17004 case SET:
17005 set_dest = SET_DEST (set_body);
17006 if (!set_dest || !REG_P (set_dest))
17007 return false;
17008 break;
17009 case PARALLEL:
17010 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17011 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17012 use_body))
17013 return true;
17014 default:
17015 return false;
17016 break;
17017 }
17018
17019 /* Retrieve shift count of USE_BODY. */
17020 switch (GET_CODE (use_body))
17021 {
17022 case SET:
17023 shift_rtx = XEXP (use_body, 1);
17024 break;
17025 case PARALLEL:
17026 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17027 if (ix86_dep_by_shift_count_body (set_body,
17028 XVECEXP (use_body, 0, i)))
17029 return true;
17030 default:
17031 return false;
17032 break;
17033 }
17034
17035 if (shift_rtx
17036 && (GET_CODE (shift_rtx) == ASHIFT
17037 || GET_CODE (shift_rtx) == LSHIFTRT
17038 || GET_CODE (shift_rtx) == ASHIFTRT
17039 || GET_CODE (shift_rtx) == ROTATE
17040 || GET_CODE (shift_rtx) == ROTATERT))
17041 {
17042 rtx shift_count = XEXP (shift_rtx, 1);
17043
17044 /* Return true if shift count is dest of SET_BODY. */
17045 if (REG_P (shift_count)
17046 && true_regnum (set_dest) == true_regnum (shift_count))
17047 return true;
17048 }
17049
17050 return false;
17051 }
17052
17053 /* Return true if destination reg of SET_INSN is shift count of
17054 USE_INSN. */
17055
17056 bool
17057 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17058 {
17059 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17060 PATTERN (use_insn));
17061 }
17062
17063 /* Return TRUE or FALSE depending on whether the unary operator meets the
17064 appropriate constraints. */
17065
17066 bool
17067 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17068 enum machine_mode mode ATTRIBUTE_UNUSED,
17069 rtx operands[2] ATTRIBUTE_UNUSED)
17070 {
17071 /* If one of operands is memory, source and destination must match. */
17072 if ((MEM_P (operands[0])
17073 || MEM_P (operands[1]))
17074 && ! rtx_equal_p (operands[0], operands[1]))
17075 return false;
17076 return true;
17077 }
17078
17079 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17080 are ok, keeping in mind the possible movddup alternative. */
17081
17082 bool
17083 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17084 {
17085 if (MEM_P (operands[0]))
17086 return rtx_equal_p (operands[0], operands[1 + high]);
17087 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17088 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17089 return true;
17090 }
17091
17092 /* Post-reload splitter for converting an SF or DFmode value in an
17093 SSE register into an unsigned SImode. */
17094
17095 void
17096 ix86_split_convert_uns_si_sse (rtx operands[])
17097 {
17098 enum machine_mode vecmode;
17099 rtx value, large, zero_or_two31, input, two31, x;
17100
17101 large = operands[1];
17102 zero_or_two31 = operands[2];
17103 input = operands[3];
17104 two31 = operands[4];
17105 vecmode = GET_MODE (large);
17106 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17107
17108 /* Load up the value into the low element. We must ensure that the other
17109 elements are valid floats -- zero is the easiest such value. */
17110 if (MEM_P (input))
17111 {
17112 if (vecmode == V4SFmode)
17113 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17114 else
17115 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17116 }
17117 else
17118 {
17119 input = gen_rtx_REG (vecmode, REGNO (input));
17120 emit_move_insn (value, CONST0_RTX (vecmode));
17121 if (vecmode == V4SFmode)
17122 emit_insn (gen_sse_movss (value, value, input));
17123 else
17124 emit_insn (gen_sse2_movsd (value, value, input));
17125 }
17126
17127 emit_move_insn (large, two31);
17128 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17129
17130 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17131 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17132
17133 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17134 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17135
17136 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17137 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17138
17139 large = gen_rtx_REG (V4SImode, REGNO (large));
17140 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17141
17142 x = gen_rtx_REG (V4SImode, REGNO (value));
17143 if (vecmode == V4SFmode)
17144 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17145 else
17146 emit_insn (gen_sse2_cvttpd2dq (x, value));
17147 value = x;
17148
17149 emit_insn (gen_xorv4si3 (value, value, large));
17150 }
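
/* A worked example of the conversion above: for the input 3000000000.0,
   which does not fit in a signed int, the mask selects two31, the
   subtraction yields 3000000000 - 2147483648 == 852516352, the signed
   truncation gives 0x32d05e00, and the final xor with 0x80000000 restores
   the unsigned result 0xb2d05e00 == 3000000000. Inputs below 0x1p31 are
   converted directly, since the mask and the xor value are then zero. */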
17151
17152 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17153 Expects the 64-bit DImode to be supplied in a pair of integral
17154 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17155 -mfpmath=sse, !optimize_size only. */
17156
17157 void
17158 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17159 {
17160 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17161 rtx int_xmm, fp_xmm;
17162 rtx biases, exponents;
17163 rtx x;
17164
17165 int_xmm = gen_reg_rtx (V4SImode);
17166 if (TARGET_INTER_UNIT_MOVES)
17167 emit_insn (gen_movdi_to_sse (int_xmm, input));
17168 else if (TARGET_SSE_SPLIT_REGS)
17169 {
17170 emit_clobber (int_xmm);
17171 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17172 }
17173 else
17174 {
17175 x = gen_reg_rtx (V2DImode);
17176 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17177 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17178 }
17179
17180 x = gen_rtx_CONST_VECTOR (V4SImode,
17181 gen_rtvec (4, GEN_INT (0x43300000UL),
17182 GEN_INT (0x45300000UL),
17183 const0_rtx, const0_rtx));
17184 exponents = validize_mem (force_const_mem (V4SImode, x));
17185
17186 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17187 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17188
17189 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17190 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17191 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17192 (0x1.0p84 + double(fp_value_hi_xmm)).
17193 Note these exponents differ by 32. */
17194
17195 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17196
17197 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17198 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17199 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17200 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17201 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17202 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17203 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17204 biases = validize_mem (force_const_mem (V2DFmode, biases));
17205 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17206
17207 /* Add the upper and lower DFmode values together. */
17208 if (TARGET_SSE3)
17209 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17210 else
17211 {
17212 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17213 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17214 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17215 }
17216
17217 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17218 }
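
/* A worked example of the bias trick above: for the 64-bit input
   0x0000000200000005 (high word 2, low word 5) the interleave produces
   the two doubles 0x1.0p52 + 5 and 0x1.0p84 + 2 * 0x1.0p32; subtracting
   the biases leaves 5.0 and 8589934592.0, and the final add yields
   8589934597.0, the exact unsigned value of the input. */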
17219
17220 /* Not used, but eases macroization of patterns. */
17221 void
17222 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17223 rtx input ATTRIBUTE_UNUSED)
17224 {
17225 gcc_unreachable ();
17226 }
17227
17228 /* Convert an unsigned SImode value into a DFmode. Only currently used
17229 for SSE, but applicable anywhere. */
17230
17231 void
17232 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17233 {
17234 REAL_VALUE_TYPE TWO31r;
17235 rtx x, fp;
17236
17237 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17238 NULL, 1, OPTAB_DIRECT);
17239
17240 fp = gen_reg_rtx (DFmode);
17241 emit_insn (gen_floatsidf2 (fp, x));
17242
17243 real_ldexp (&TWO31r, &dconst1, 31);
17244 x = const_double_from_real_value (TWO31r, DFmode);
17245
17246 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17247 if (x != target)
17248 emit_move_insn (target, x);
17249 }
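
/* A worked example: for the input 0xffffffff (4294967295), the SImode
   addition of -0x80000000 wraps to 0x7fffffff, the signed conversion
   gives 2147483647.0, and adding 0x1.0p31 back yields 4294967295.0.
   For inputs below 0x1p31 the wrap makes the intermediate value negative,
   and adding 0x1.0p31 cancels that out exactly in the same way. */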
17250
17251 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17252 32-bit mode; otherwise we have a direct convert instruction. */
17253
17254 void
17255 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17256 {
17257 REAL_VALUE_TYPE TWO32r;
17258 rtx fp_lo, fp_hi, x;
17259
17260 fp_lo = gen_reg_rtx (DFmode);
17261 fp_hi = gen_reg_rtx (DFmode);
17262
17263 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17264
17265 real_ldexp (&TWO32r, &dconst1, 32);
17266 x = const_double_from_real_value (TWO32r, DFmode);
17267 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17268
17269 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17270
17271 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17272 0, OPTAB_DIRECT);
17273 if (x != target)
17274 emit_move_insn (target, x);
17275 }
17276
17277 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17278 For x86_32, -mfpmath=sse, !optimize_size only. */
17279 void
17280 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17281 {
17282 REAL_VALUE_TYPE ONE16r;
17283 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17284
17285 real_ldexp (&ONE16r, &dconst1, 16);
17286 x = const_double_from_real_value (ONE16r, SFmode);
17287 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17288 NULL, 0, OPTAB_DIRECT);
17289 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17290 NULL, 0, OPTAB_DIRECT);
17291 fp_hi = gen_reg_rtx (SFmode);
17292 fp_lo = gen_reg_rtx (SFmode);
17293 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17294 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17295 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17296 0, OPTAB_DIRECT);
17297 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17298 0, OPTAB_DIRECT);
17299 if (!rtx_equal_p (target, fp_hi))
17300 emit_move_insn (target, fp_hi);
17301 }
17302
17303 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17304 a vector of unsigned ints VAL to vector of floats TARGET. */
17305
17306 void
17307 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17308 {
17309 rtx tmp[8];
17310 REAL_VALUE_TYPE TWO16r;
17311 enum machine_mode intmode = GET_MODE (val);
17312 enum machine_mode fltmode = GET_MODE (target);
17313 rtx (*cvt) (rtx, rtx);
17314
17315 if (intmode == V4SImode)
17316 cvt = gen_floatv4siv4sf2;
17317 else
17318 cvt = gen_floatv8siv8sf2;
17319 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17320 tmp[0] = force_reg (intmode, tmp[0]);
17321 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17322 OPTAB_DIRECT);
17323 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17324 NULL_RTX, 1, OPTAB_DIRECT);
17325 tmp[3] = gen_reg_rtx (fltmode);
17326 emit_insn (cvt (tmp[3], tmp[1]));
17327 tmp[4] = gen_reg_rtx (fltmode);
17328 emit_insn (cvt (tmp[4], tmp[2]));
17329 real_ldexp (&TWO16r, &dconst1, 16);
17330 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17331 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17332 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17333 OPTAB_DIRECT);
17334 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17335 OPTAB_DIRECT);
17336 if (tmp[7] != target)
17337 emit_move_insn (target, tmp[7]);
17338 }
17339
17340 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17341 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17342 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17343 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17344
17345 rtx
17346 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17347 {
17348 REAL_VALUE_TYPE TWO31r;
17349 rtx two31r, tmp[4];
17350 enum machine_mode mode = GET_MODE (val);
17351 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17352 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17353 rtx (*cmp) (rtx, rtx, rtx, rtx);
17354 int i;
17355
17356 for (i = 0; i < 3; i++)
17357 tmp[i] = gen_reg_rtx (mode);
17358 real_ldexp (&TWO31r, &dconst1, 31);
17359 two31r = const_double_from_real_value (TWO31r, scalarmode);
17360 two31r = ix86_build_const_vector (mode, 1, two31r);
17361 two31r = force_reg (mode, two31r);
17362 switch (mode)
17363 {
17364 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17365 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17366 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17367 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17368 default: gcc_unreachable ();
17369 }
17370 tmp[3] = gen_rtx_LE (mode, two31r, val);
17371 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17372 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17373 0, OPTAB_DIRECT);
17374 if (intmode == V4SImode || TARGET_AVX2)
17375 *xorp = expand_simple_binop (intmode, ASHIFT,
17376 gen_lowpart (intmode, tmp[0]),
17377 GEN_INT (31), NULL_RTX, 0,
17378 OPTAB_DIRECT);
17379 else
17380 {
17381 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17382 two31 = ix86_build_const_vector (intmode, 1, two31);
17383 *xorp = expand_simple_binop (intmode, AND,
17384 gen_lowpart (intmode, tmp[0]),
17385 two31, NULL_RTX, 0,
17386 OPTAB_DIRECT);
17387 }
17388 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17389 0, OPTAB_DIRECT);
17390 }
17391
17392 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17393 then replicate the value for all elements of the vector
17394 register. */
17395
17396 rtx
17397 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17398 {
17399 int i, n_elt;
17400 rtvec v;
17401 enum machine_mode scalar_mode;
17402
17403 switch (mode)
17404 {
17405 case V32QImode:
17406 case V16QImode:
17407 case V16HImode:
17408 case V8HImode:
17409 case V8SImode:
17410 case V4SImode:
17411 case V4DImode:
17412 case V2DImode:
17413 gcc_assert (vect);
17414 case V8SFmode:
17415 case V4SFmode:
17416 case V4DFmode:
17417 case V2DFmode:
17418 n_elt = GET_MODE_NUNITS (mode);
17419 v = rtvec_alloc (n_elt);
17420 scalar_mode = GET_MODE_INNER (mode);
17421
17422 RTVEC_ELT (v, 0) = value;
17423
17424 for (i = 1; i < n_elt; ++i)
17425 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17426
17427 return gen_rtx_CONST_VECTOR (mode, v);
17428
17429 default:
17430 gcc_unreachable ();
17431 }
17432 }
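
/* For example, ix86_build_const_vector (V4SImode, true, GEN_INT (0xffff))
   yields (const_vector:V4SI [0xffff 0xffff 0xffff 0xffff]); this is how
   the 16-bit mask is built in ix86_expand_vector_convert_uns_vsivsf above.
   With VECT false only element 0 holds VALUE and the rest are zero. */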
17433
17434 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17435 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17436 for an SSE register. If VECT is true, then replicate the mask for
17437 all elements of the vector register. If INVERT is true, then create
17438 a mask excluding the sign bit. */
17439
17440 rtx
17441 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17442 {
17443 enum machine_mode vec_mode, imode;
17444 HOST_WIDE_INT hi, lo;
17445 int shift = 63;
17446 rtx v;
17447 rtx mask;
17448
17449 /* Find the sign bit, sign extended to 2*HWI. */
17450 switch (mode)
17451 {
17452 case V8SImode:
17453 case V4SImode:
17454 case V8SFmode:
17455 case V4SFmode:
17456 vec_mode = mode;
17457 mode = GET_MODE_INNER (mode);
17458 imode = SImode;
17459 lo = 0x80000000, hi = lo < 0;
17460 break;
17461
17462 case V4DImode:
17463 case V2DImode:
17464 case V4DFmode:
17465 case V2DFmode:
17466 vec_mode = mode;
17467 mode = GET_MODE_INNER (mode);
17468 imode = DImode;
17469 if (HOST_BITS_PER_WIDE_INT >= 64)
17470 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17471 else
17472 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17473 break;
17474
17475 case TImode:
17476 case TFmode:
17477 vec_mode = VOIDmode;
17478 if (HOST_BITS_PER_WIDE_INT >= 64)
17479 {
17480 imode = TImode;
17481 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17482 }
17483 else
17484 {
17485 rtvec vec;
17486
17487 imode = DImode;
17488 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17489
17490 if (invert)
17491 {
17492 lo = ~lo, hi = ~hi;
17493 v = constm1_rtx;
17494 }
17495 else
17496 v = const0_rtx;
17497
17498 mask = immed_double_const (lo, hi, imode);
17499
17500 vec = gen_rtvec (2, v, mask);
17501 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17502 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17503
17504 return v;
17505 }
17506 break;
17507
17508 default:
17509 gcc_unreachable ();
17510 }
17511
17512 if (invert)
17513 lo = ~lo, hi = ~hi;
17514
17515 /* Force this value into the low part of a fp vector constant. */
17516 mask = immed_double_const (lo, hi, imode);
17517 mask = gen_lowpart (mode, mask);
17518
17519 if (vec_mode == VOIDmode)
17520 return force_reg (mode, mask);
17521
17522 v = ix86_build_const_vector (vec_mode, vect, mask);
17523 return force_reg (vec_mode, v);
17524 }
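
/* For example, for V4SFmode with VECT true and INVERT false the result is
   a register holding four copies of the bit pattern 0x80000000 (only the
   IEEE sign bit set); with INVERT true the elements are 0x7fffffff,
   selecting everything but the sign bit. */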
17525
17526 /* Generate code for floating point ABS or NEG. */
17527
17528 void
17529 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17530 rtx operands[])
17531 {
17532 rtx mask, set, dst, src;
17533 bool use_sse = false;
17534 bool vector_mode = VECTOR_MODE_P (mode);
17535 enum machine_mode vmode = mode;
17536
17537 if (vector_mode)
17538 use_sse = true;
17539 else if (mode == TFmode)
17540 use_sse = true;
17541 else if (TARGET_SSE_MATH)
17542 {
17543 use_sse = SSE_FLOAT_MODE_P (mode);
17544 if (mode == SFmode)
17545 vmode = V4SFmode;
17546 else if (mode == DFmode)
17547 vmode = V2DFmode;
17548 }
17549
17550 /* NEG and ABS performed with SSE use bitwise mask operations.
17551 Create the appropriate mask now. */
17552 if (use_sse)
17553 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17554 else
17555 mask = NULL_RTX;
17556
17557 dst = operands[0];
17558 src = operands[1];
17559
17560 set = gen_rtx_fmt_e (code, mode, src);
17561 set = gen_rtx_SET (VOIDmode, dst, set);
17562
17563 if (mask)
17564 {
17565 rtx use, clob;
17566 rtvec par;
17567
17568 use = gen_rtx_USE (VOIDmode, mask);
17569 if (vector_mode)
17570 par = gen_rtvec (2, set, use);
17571 else
17572 {
17573 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17574 par = gen_rtvec (3, set, use, clob);
17575 }
17576 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17577 }
17578 else
17579 emit_insn (set);
17580 }
17581
17582 /* Expand a copysign operation. Special case operand 0 being a constant. */
17583
17584 void
17585 ix86_expand_copysign (rtx operands[])
17586 {
17587 enum machine_mode mode, vmode;
17588 rtx dest, op0, op1, mask, nmask;
17589
17590 dest = operands[0];
17591 op0 = operands[1];
17592 op1 = operands[2];
17593
17594 mode = GET_MODE (dest);
17595
17596 if (mode == SFmode)
17597 vmode = V4SFmode;
17598 else if (mode == DFmode)
17599 vmode = V2DFmode;
17600 else
17601 vmode = mode;
17602
17603 if (GET_CODE (op0) == CONST_DOUBLE)
17604 {
17605 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17606
17607 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17608 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17609
17610 if (mode == SFmode || mode == DFmode)
17611 {
17612 if (op0 == CONST0_RTX (mode))
17613 op0 = CONST0_RTX (vmode);
17614 else
17615 {
17616 rtx v = ix86_build_const_vector (vmode, false, op0);
17617
17618 op0 = force_reg (vmode, v);
17619 }
17620 }
17621 else if (op0 != CONST0_RTX (mode))
17622 op0 = force_reg (mode, op0);
17623
17624 mask = ix86_build_signbit_mask (vmode, 0, 0);
17625
17626 if (mode == SFmode)
17627 copysign_insn = gen_copysignsf3_const;
17628 else if (mode == DFmode)
17629 copysign_insn = gen_copysigndf3_const;
17630 else
17631 copysign_insn = gen_copysigntf3_const;
17632
17633 emit_insn (copysign_insn (dest, op0, op1, mask));
17634 }
17635 else
17636 {
17637 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17638
17639 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17640 mask = ix86_build_signbit_mask (vmode, 0, 0);
17641
17642 if (mode == SFmode)
17643 copysign_insn = gen_copysignsf3_var;
17644 else if (mode == DFmode)
17645 copysign_insn = gen_copysigndf3_var;
17646 else
17647 copysign_insn = gen_copysigntf3_var;
17648
17649 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17650 }
17651 }
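
/* The identity implemented here and in the two split routines below is

     copysign (x, y) == (x & ~signmask) | (y & signmask)

   where signmask has only the sign bit set in each element; for instance
   copysign (2.0, -3.0) keeps the magnitude bits of 2.0 and the sign bit
   of -3.0, giving -2.0. */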
17652
17653 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17654 be a constant, and so has already been expanded into a vector constant. */
17655
17656 void
17657 ix86_split_copysign_const (rtx operands[])
17658 {
17659 enum machine_mode mode, vmode;
17660 rtx dest, op0, mask, x;
17661
17662 dest = operands[0];
17663 op0 = operands[1];
17664 mask = operands[3];
17665
17666 mode = GET_MODE (dest);
17667 vmode = GET_MODE (mask);
17668
17669 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17670 x = gen_rtx_AND (vmode, dest, mask);
17671 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17672
17673 if (op0 != CONST0_RTX (vmode))
17674 {
17675 x = gen_rtx_IOR (vmode, dest, op0);
17676 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17677 }
17678 }
17679
17680 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17681 so we have to do two masks. */
17682
17683 void
17684 ix86_split_copysign_var (rtx operands[])
17685 {
17686 enum machine_mode mode, vmode;
17687 rtx dest, scratch, op0, op1, mask, nmask, x;
17688
17689 dest = operands[0];
17690 scratch = operands[1];
17691 op0 = operands[2];
17692 op1 = operands[3];
17693 nmask = operands[4];
17694 mask = operands[5];
17695
17696 mode = GET_MODE (dest);
17697 vmode = GET_MODE (mask);
17698
17699 if (rtx_equal_p (op0, op1))
17700 {
17701 /* Shouldn't happen often (it's useless, obviously), but when it does
17702 we'd generate incorrect code if we continue below. */
17703 emit_move_insn (dest, op0);
17704 return;
17705 }
17706
17707 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17708 {
17709 gcc_assert (REGNO (op1) == REGNO (scratch));
17710
17711 x = gen_rtx_AND (vmode, scratch, mask);
17712 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17713
17714 dest = mask;
17715 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17716 x = gen_rtx_NOT (vmode, dest);
17717 x = gen_rtx_AND (vmode, x, op0);
17718 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17719 }
17720 else
17721 {
17722 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17723 {
17724 x = gen_rtx_AND (vmode, scratch, mask);
17725 }
17726 else /* alternative 2,4 */
17727 {
17728 gcc_assert (REGNO (mask) == REGNO (scratch));
17729 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17730 x = gen_rtx_AND (vmode, scratch, op1);
17731 }
17732 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17733
17734 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17735 {
17736 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17737 x = gen_rtx_AND (vmode, dest, nmask);
17738 }
17739 else /* alternative 3,4 */
17740 {
17741 gcc_assert (REGNO (nmask) == REGNO (dest));
17742 dest = nmask;
17743 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17744 x = gen_rtx_AND (vmode, dest, op0);
17745 }
17746 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17747 }
17748
17749 x = gen_rtx_IOR (vmode, dest, scratch);
17750 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17751 }
17752
17753 /* Return TRUE or FALSE depending on whether the first SET in INSN
17754 has source and destination with matching CC modes, and that the
17755 CC mode is at least as constrained as REQ_MODE. */
17756
17757 bool
17758 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17759 {
17760 rtx set;
17761 enum machine_mode set_mode;
17762
17763 set = PATTERN (insn);
17764 if (GET_CODE (set) == PARALLEL)
17765 set = XVECEXP (set, 0, 0);
17766 gcc_assert (GET_CODE (set) == SET);
17767 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17768
17769 set_mode = GET_MODE (SET_DEST (set));
17770 switch (set_mode)
17771 {
17772 case CCNOmode:
17773 if (req_mode != CCNOmode
17774 && (req_mode != CCmode
17775 || XEXP (SET_SRC (set), 1) != const0_rtx))
17776 return false;
17777 break;
17778 case CCmode:
17779 if (req_mode == CCGCmode)
17780 return false;
17781 /* FALLTHRU */
17782 case CCGCmode:
17783 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17784 return false;
17785 /* FALLTHRU */
17786 case CCGOCmode:
17787 if (req_mode == CCZmode)
17788 return false;
17789 /* FALLTHRU */
17790 case CCZmode:
17791 break;
17792
17793 case CCAmode:
17794 case CCCmode:
17795 case CCOmode:
17796 case CCSmode:
17797 if (set_mode != req_mode)
17798 return false;
17799 break;
17800
17801 default:
17802 gcc_unreachable ();
17803 }
17804
17805 return GET_MODE (SET_SRC (set)) == set_mode;
17806 }
17807
17808 /* Generate insn patterns to do an integer compare of OPERANDS. */
17809
17810 static rtx
17811 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17812 {
17813 enum machine_mode cmpmode;
17814 rtx tmp, flags;
17815
17816 cmpmode = SELECT_CC_MODE (code, op0, op1);
17817 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17818
17819 /* This is very simple, but making the interface the same as in the
17820 FP case makes the rest of the code easier. */
17821 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17822 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17823
17824 /* Return the test that should be put into the flags user, i.e.
17825 the bcc, scc, or cmov instruction. */
17826 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17827 }
17828
17829 /* Figure out whether to use ordered or unordered fp comparisons.
17830 Return the appropriate mode to use. */
17831
17832 enum machine_mode
17833 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17834 {
17835 /* ??? In order to make all comparisons reversible, we do all comparisons
17836 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17837 all forms of trapping and nontrapping comparisons, we can make inequality
17838 comparisons trapping again, since it results in better code when using
17839 FCOM based compares. */
17840 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17841 }
17842
17843 enum machine_mode
17844 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17845 {
17846 enum machine_mode mode = GET_MODE (op0);
17847
17848 if (SCALAR_FLOAT_MODE_P (mode))
17849 {
17850 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17851 return ix86_fp_compare_mode (code);
17852 }
17853
17854 switch (code)
17855 {
17856 /* Only zero flag is needed. */
17857 case EQ: /* ZF=0 */
17858 case NE: /* ZF!=0 */
17859 return CCZmode;
17860 /* Codes needing carry flag. */
17861 case GEU: /* CF=0 */
17862 case LTU: /* CF=1 */
17863 /* Detect overflow checks. They need just the carry flag. */
17864 if (GET_CODE (op0) == PLUS
17865 && rtx_equal_p (op1, XEXP (op0, 0)))
17866 return CCCmode;
17867 else
17868 return CCmode;
17869 case GTU: /* CF=0 & ZF=0 */
17870 case LEU: /* CF=1 | ZF=1 */
17871 /* Detect overflow checks. They need just the carry flag. */
17872 if (GET_CODE (op0) == MINUS
17873 && rtx_equal_p (op1, XEXP (op0, 0)))
17874 return CCCmode;
17875 else
17876 return CCmode;
17877 /* Codes possibly doable only with sign flag when
17878 comparing against zero. */
17879 case GE: /* SF=OF or SF=0 */
17880 case LT: /* SF<>OF or SF=1 */
17881 if (op1 == const0_rtx)
17882 return CCGOCmode;
17883 else
17884 /* For other cases Carry flag is not required. */
17885 return CCGCmode;
17886 /* Codes doable only with sign flag when comparing
17887 against zero, but we miss the jump instruction for it,
17888 so we need to use relational tests against overflow,
17889 which thus needs to be zero. */
17890 case GT: /* ZF=0 & SF=OF */
17891 case LE: /* ZF=1 | SF<>OF */
17892 if (op1 == const0_rtx)
17893 return CCNOmode;
17894 else
17895 return CCGCmode;
17896 /* The strcmp pattern does (use flags), and combine may ask us for the
17897 proper mode. */
17898 case USE:
17899 return CCmode;
17900 default:
17901 gcc_unreachable ();
17902 }
17903 }
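
/* Some examples of the mapping above: an equality test x == y needs only
   ZF and gets CCZmode; an unsigned x < y gets CCmode, or CCCmode when it
   is really an overflow check of the form x + y < x; and a signed x < 0
   gets CCGOCmode, since it can be done with the sign flag alone when
   comparing against zero. */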
17904
17905 /* Return the fixed registers used for condition codes. */
17906
17907 static bool
17908 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17909 {
17910 *p1 = FLAGS_REG;
17911 *p2 = FPSR_REG;
17912 return true;
17913 }
17914
17915 /* If two condition code modes are compatible, return a condition code
17916 mode which is compatible with both. Otherwise, return
17917 VOIDmode. */
17918
17919 static enum machine_mode
17920 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
17921 {
17922 if (m1 == m2)
17923 return m1;
17924
17925 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
17926 return VOIDmode;
17927
17928 if ((m1 == CCGCmode && m2 == CCGOCmode)
17929 || (m1 == CCGOCmode && m2 == CCGCmode))
17930 return CCGCmode;
17931
17932 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
17933 return m2;
17934 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
17935 return m1;
17936
17937 switch (m1)
17938 {
17939 default:
17940 gcc_unreachable ();
17941
17942 case CCmode:
17943 case CCGCmode:
17944 case CCGOCmode:
17945 case CCNOmode:
17946 case CCAmode:
17947 case CCCmode:
17948 case CCOmode:
17949 case CCSmode:
17950 case CCZmode:
17951 switch (m2)
17952 {
17953 default:
17954 return VOIDmode;
17955
17956 case CCmode:
17957 case CCGCmode:
17958 case CCGOCmode:
17959 case CCNOmode:
17960 case CCAmode:
17961 case CCCmode:
17962 case CCOmode:
17963 case CCSmode:
17964 case CCZmode:
17965 return CCmode;
17966 }
17967
17968 case CCFPmode:
17969 case CCFPUmode:
17970 /* These are only compatible with themselves, which we already
17971 checked above. */
17972 return VOIDmode;
17973 }
17974 }
17975
17976
17977 /* Return a comparison we can do that is equivalent to
17978 swap_condition (code), apart possibly from orderedness.
17979 But never change orderedness if TARGET_IEEE_FP, returning
17980 UNKNOWN in that case if necessary. */
17981
17982 static enum rtx_code
17983 ix86_fp_swap_condition (enum rtx_code code)
17984 {
17985 switch (code)
17986 {
17987 case GT: /* GTU - CF=0 & ZF=0 */
17988 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
17989 case GE: /* GEU - CF=0 */
17990 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
17991 case UNLT: /* LTU - CF=1 */
17992 return TARGET_IEEE_FP ? UNKNOWN : GT;
17993 case UNLE: /* LEU - CF=1 | ZF=1 */
17994 return TARGET_IEEE_FP ? UNKNOWN : GE;
17995 default:
17996 return swap_condition (code);
17997 }
17998 }
17999
18000 /* Return the cost of comparison CODE using the best strategy for performance.
18001 All following functions use the number of instructions as a cost metric.
18002 In the future this should be tweaked to compute bytes for optimize_size and
18003 take into account the performance of various instructions on various CPUs. */
18004
18005 static int
18006 ix86_fp_comparison_cost (enum rtx_code code)
18007 {
18008 int arith_cost;
18009
18010 /* The cost of code using bit-twiddling on %ah. */
18011 switch (code)
18012 {
18013 case UNLE:
18014 case UNLT:
18015 case LTGT:
18016 case GT:
18017 case GE:
18018 case UNORDERED:
18019 case ORDERED:
18020 case UNEQ:
18021 arith_cost = 4;
18022 break;
18023 case LT:
18024 case NE:
18025 case EQ:
18026 case UNGE:
18027 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18028 break;
18029 case LE:
18030 case UNGT:
18031 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18032 break;
18033 default:
18034 gcc_unreachable ();
18035 }
18036
18037 switch (ix86_fp_comparison_strategy (code))
18038 {
18039 case IX86_FPCMP_COMI:
18040 return arith_cost > 4 ? 3 : 2;
18041 case IX86_FPCMP_SAHF:
18042 return arith_cost > 4 ? 4 : 3;
18043 default:
18044 return arith_cost;
18045 }
18046 }
18047
18048 /* Return strategy to use for floating-point. We assume that fcomi is always
18049 preferable where available, since that is also true when looking at size
18050 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18051
18052 enum ix86_fpcmp_strategy
18053 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18054 {
18055 /* Do fcomi/sahf based test when profitable. */
18056
18057 if (TARGET_CMOVE)
18058 return IX86_FPCMP_COMI;
18059
18060 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18061 return IX86_FPCMP_SAHF;
18062
18063 return IX86_FPCMP_ARITH;
18064 }
18065
18066 /* Swap, force into registers, or otherwise massage the two operands
18067 to a fp comparison. The operands are updated in place; the new
18068 comparison code is returned. */
18069
18070 static enum rtx_code
18071 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18072 {
18073 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18074 rtx op0 = *pop0, op1 = *pop1;
18075 enum machine_mode op_mode = GET_MODE (op0);
18076 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18077
18078 /* All of the unordered compare instructions only work on registers.
18079 The same is true of the fcomi compare instructions. The XFmode
18080 compare instructions require registers except when comparing
18081 against zero or when converting operand 1 from fixed point to
18082 floating point. */
18083
18084 if (!is_sse
18085 && (fpcmp_mode == CCFPUmode
18086 || (op_mode == XFmode
18087 && ! (standard_80387_constant_p (op0) == 1
18088 || standard_80387_constant_p (op1) == 1)
18089 && GET_CODE (op1) != FLOAT)
18090 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18091 {
18092 op0 = force_reg (op_mode, op0);
18093 op1 = force_reg (op_mode, op1);
18094 }
18095 else
18096 {
18097 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18098 things around if they appear profitable, otherwise force op0
18099 into a register. */
18100
18101 if (standard_80387_constant_p (op0) == 0
18102 || (MEM_P (op0)
18103 && ! (standard_80387_constant_p (op1) == 0
18104 || MEM_P (op1))))
18105 {
18106 enum rtx_code new_code = ix86_fp_swap_condition (code);
18107 if (new_code != UNKNOWN)
18108 {
18109 rtx tmp;
18110 tmp = op0, op0 = op1, op1 = tmp;
18111 code = new_code;
18112 }
18113 }
18114
18115 if (!REG_P (op0))
18116 op0 = force_reg (op_mode, op0);
18117
18118 if (CONSTANT_P (op1))
18119 {
18120 int tmp = standard_80387_constant_p (op1);
18121 if (tmp == 0)
18122 op1 = validize_mem (force_const_mem (op_mode, op1));
18123 else if (tmp == 1)
18124 {
18125 if (TARGET_CMOVE)
18126 op1 = force_reg (op_mode, op1);
18127 }
18128 else
18129 op1 = force_reg (op_mode, op1);
18130 }
18131 }
18132
18133 /* Try to rearrange the comparison to make it cheaper. */
18134 if (ix86_fp_comparison_cost (code)
18135 > ix86_fp_comparison_cost (swap_condition (code))
18136 && (REG_P (op1) || can_create_pseudo_p ()))
18137 {
18138 rtx tmp;
18139 tmp = op0, op0 = op1, op1 = tmp;
18140 code = swap_condition (code);
18141 if (!REG_P (op0))
18142 op0 = force_reg (op_mode, op0);
18143 }
18144
18145 *pop0 = op0;
18146 *pop1 = op1;
18147 return code;
18148 }
18149
18150 /* Convert the comparison codes we use to represent an FP comparison to the
18151 integer code that will result in a proper branch. Return UNKNOWN if no such
18152 code is available. */
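/* This mapping works because fcomi/fucomi (and the fnstsw+sahf sequence)
   set ZF, PF and CF from C3, C2 and C0, so the flags after an FP compare
   read back like those of an unsigned integer compare; hence GT maps to
   GTU, GE to GEU, and so on.  */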
18153
18154 enum rtx_code
18155 ix86_fp_compare_code_to_integer (enum rtx_code code)
18156 {
18157 switch (code)
18158 {
18159 case GT:
18160 return GTU;
18161 case GE:
18162 return GEU;
18163 case ORDERED:
18164 case UNORDERED:
18165 return code;
18166 break;
18167 case UNEQ:
18168 return EQ;
18169 break;
18170 case UNLT:
18171 return LTU;
18172 break;
18173 case UNLE:
18174 return LEU;
18175 break;
18176 case LTGT:
18177 return NE;
18178 break;
18179 default:
18180 return UNKNOWN;
18181 }
18182 }
18183
18184 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18185
18186 static rtx
18187 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18188 {
18189 enum machine_mode fpcmp_mode, intcmp_mode;
18190 rtx tmp, tmp2;
18191
18192 fpcmp_mode = ix86_fp_compare_mode (code);
18193 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18194
18195 /* Do fcomi/sahf based test when profitable. */
18196 switch (ix86_fp_comparison_strategy (code))
18197 {
18198 case IX86_FPCMP_COMI:
18199 intcmp_mode = fpcmp_mode;
18200 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18201 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18202 tmp);
18203 emit_insn (tmp);
18204 break;
18205
18206 case IX86_FPCMP_SAHF:
18207 intcmp_mode = fpcmp_mode;
18208 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18209 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18210 tmp);
18211
18212 if (!scratch)
18213 scratch = gen_reg_rtx (HImode);
18214 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18215 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18216 break;
18217
18218 case IX86_FPCMP_ARITH:
18219 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18220 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18221 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18222 if (!scratch)
18223 scratch = gen_reg_rtx (HImode);
18224 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18225
18226 /* In the unordered case, we have to check C2 for NaN's, which
18227 doesn't happen to work out to anything nice combination-wise.
18228 So do some bit twiddling on the value we've got in AH to come
18229 up with an appropriate set of condition codes. */
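/* For reference: after fnstsw the status word is examined through %ah,
   so the masks below select x87 condition code bits as they appear
   there -- 0x01 is C0 (below), 0x04 is C2 (unordered), 0x40 is C3
   (equal), and 0x45, 0x44, 0x05 are their combinations. E.g. GT tests
   (ah & 0x45) == 0, i.e. neither below, unordered nor equal.  */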
18230
18231 intcmp_mode = CCNOmode;
18232 switch (code)
18233 {
18234 case GT:
18235 case UNGT:
18236 if (code == GT || !TARGET_IEEE_FP)
18237 {
18238 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18239 code = EQ;
18240 }
18241 else
18242 {
18243 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18244 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18245 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18246 intcmp_mode = CCmode;
18247 code = GEU;
18248 }
18249 break;
18250 case LT:
18251 case UNLT:
18252 if (code == LT && TARGET_IEEE_FP)
18253 {
18254 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18255 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18256 intcmp_mode = CCmode;
18257 code = EQ;
18258 }
18259 else
18260 {
18261 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18262 code = NE;
18263 }
18264 break;
18265 case GE:
18266 case UNGE:
18267 if (code == GE || !TARGET_IEEE_FP)
18268 {
18269 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18270 code = EQ;
18271 }
18272 else
18273 {
18274 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18275 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18276 code = NE;
18277 }
18278 break;
18279 case LE:
18280 case UNLE:
18281 if (code == LE && TARGET_IEEE_FP)
18282 {
18283 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18284 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18285 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18286 intcmp_mode = CCmode;
18287 code = LTU;
18288 }
18289 else
18290 {
18291 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18292 code = NE;
18293 }
18294 break;
18295 case EQ:
18296 case UNEQ:
18297 if (code == EQ && TARGET_IEEE_FP)
18298 {
18299 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18300 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18301 intcmp_mode = CCmode;
18302 code = EQ;
18303 }
18304 else
18305 {
18306 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18307 code = NE;
18308 }
18309 break;
18310 case NE:
18311 case LTGT:
18312 if (code == NE && TARGET_IEEE_FP)
18313 {
18314 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18315 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18316 GEN_INT (0x40)));
18317 code = NE;
18318 }
18319 else
18320 {
18321 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18322 code = EQ;
18323 }
18324 break;
18325
18326 case UNORDERED:
18327 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18328 code = NE;
18329 break;
18330 case ORDERED:
18331 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18332 code = EQ;
18333 break;
18334
18335 default:
18336 gcc_unreachable ();
18337 }
18338 break;
18339
18340 default:
18341 gcc_unreachable ();
18342 }
18343
18344 /* Return the test that should be put into the flags user, i.e.
18345 the bcc, scc, or cmov instruction. */
18346 return gen_rtx_fmt_ee (code, VOIDmode,
18347 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18348 const0_rtx);
18349 }
18350
18351 static rtx
18352 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18353 {
18354 rtx ret;
18355
18356 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18357 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18358
18359 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18360 {
18361 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18362 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18363 }
18364 else
18365 ret = ix86_expand_int_compare (code, op0, op1);
18366
18367 return ret;
18368 }
18369
18370 void
18371 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18372 {
18373 enum machine_mode mode = GET_MODE (op0);
18374 rtx tmp;
18375
18376 switch (mode)
18377 {
18378 case SFmode:
18379 case DFmode:
18380 case XFmode:
18381 case QImode:
18382 case HImode:
18383 case SImode:
18384 simple:
18385 tmp = ix86_expand_compare (code, op0, op1);
18386 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18387 gen_rtx_LABEL_REF (VOIDmode, label),
18388 pc_rtx);
18389 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18390 return;
18391
18392 case DImode:
18393 if (TARGET_64BIT)
18394 goto simple;
18395 case TImode:
18396 /* Expand DImode branch into multiple compare+branch. */
18397 {
18398 rtx lo[2], hi[2], label2;
18399 enum rtx_code code1, code2, code3;
18400 enum machine_mode submode;
18401
18402 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18403 {
18404 tmp = op0, op0 = op1, op1 = tmp;
18405 code = swap_condition (code);
18406 }
18407
18408 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18409 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18410
18411 submode = mode == DImode ? SImode : DImode;
18412
18413 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18414 avoid two branches. This costs one extra insn, so disable when
18415 optimizing for size. */
18416
18417 if ((code == EQ || code == NE)
18418 && (!optimize_insn_for_size_p ()
18419 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18420 {
18421 rtx xor0, xor1;
18422
18423 xor1 = hi[0];
18424 if (hi[1] != const0_rtx)
18425 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18426 NULL_RTX, 0, OPTAB_WIDEN);
18427
18428 xor0 = lo[0];
18429 if (lo[1] != const0_rtx)
18430 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18431 NULL_RTX, 0, OPTAB_WIDEN);
18432
18433 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18434 NULL_RTX, 0, OPTAB_WIDEN);
18435
18436 ix86_expand_branch (code, tmp, const0_rtx, label);
18437 return;
18438 }
18439
18440 /* Otherwise, if we are doing a less-than or greater-or-equal-than
18441 comparison, op1 is a constant and its low word is zero, then we can just
18442 examine the high word. Similarly for a low word of -1 and
18443 less-or-equal-than or greater-than. */
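/* A quick sketch: with op1 == 0x100000000 (high word 1, low word 0),
   "x < op1" reduces to "hi(x) < 1"; with op1 == 0x1ffffffff (high word 1,
   low word -1), "x <= op1" reduces to "hi(x) <= 1".  */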
18444
18445 if (CONST_INT_P (hi[1]))
18446 switch (code)
18447 {
18448 case LT: case LTU: case GE: case GEU:
18449 if (lo[1] == const0_rtx)
18450 {
18451 ix86_expand_branch (code, hi[0], hi[1], label);
18452 return;
18453 }
18454 break;
18455 case LE: case LEU: case GT: case GTU:
18456 if (lo[1] == constm1_rtx)
18457 {
18458 ix86_expand_branch (code, hi[0], hi[1], label);
18459 return;
18460 }
18461 break;
18462 default:
18463 break;
18464 }
18465
18466 /* Otherwise, we need two or three jumps. */
18467
18468 label2 = gen_label_rtx ();
18469
18470 code1 = code;
18471 code2 = swap_condition (code);
18472 code3 = unsigned_condition (code);
18473
18474 switch (code)
18475 {
18476 case LT: case GT: case LTU: case GTU:
18477 break;
18478
18479 case LE: code1 = LT; code2 = GT; break;
18480 case GE: code1 = GT; code2 = LT; break;
18481 case LEU: code1 = LTU; code2 = GTU; break;
18482 case GEU: code1 = GTU; code2 = LTU; break;
18483
18484 case EQ: code1 = UNKNOWN; code2 = NE; break;
18485 case NE: code2 = UNKNOWN; break;
18486
18487 default:
18488 gcc_unreachable ();
18489 }
18490
18491 /*
18492 * a < b =>
18493 * if (hi(a) < hi(b)) goto true;
18494 * if (hi(a) > hi(b)) goto false;
18495 * if (lo(a) < lo(b)) goto true;
18496 * false:
18497 */
18498
18499 if (code1 != UNKNOWN)
18500 ix86_expand_branch (code1, hi[0], hi[1], label);
18501 if (code2 != UNKNOWN)
18502 ix86_expand_branch (code2, hi[0], hi[1], label2);
18503
18504 ix86_expand_branch (code3, lo[0], lo[1], label);
18505
18506 if (code2 != UNKNOWN)
18507 emit_label (label2);
18508 return;
18509 }
18510
18511 default:
18512 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18513 goto simple;
18514 }
18515 }
18516
18517 /* Split branch based on floating point condition. */
18518 void
18519 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18520 rtx target1, rtx target2, rtx tmp, rtx pushed)
18521 {
18522 rtx condition;
18523 rtx i;
18524
18525 if (target2 != pc_rtx)
18526 {
18527 rtx tmp = target2;
18528 code = reverse_condition_maybe_unordered (code);
18529 target2 = target1;
18530 target1 = tmp;
18531 }
18532
18533 condition = ix86_expand_fp_compare (code, op1, op2,
18534 tmp);
18535
18536 /* Remove pushed operand from stack. */
18537 if (pushed)
18538 ix86_free_from_memory (GET_MODE (pushed));
18539
18540 i = emit_jump_insn (gen_rtx_SET
18541 (VOIDmode, pc_rtx,
18542 gen_rtx_IF_THEN_ELSE (VOIDmode,
18543 condition, target1, target2)));
18544 if (split_branch_probability >= 0)
18545 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18546 }
18547
18548 void
18549 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18550 {
18551 rtx ret;
18552
18553 gcc_assert (GET_MODE (dest) == QImode);
18554
18555 ret = ix86_expand_compare (code, op0, op1);
18556 PUT_MODE (ret, QImode);
18557 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18558 }
18559
18560 /* Expand a comparison setting or clearing the carry flag. Return true when
18561 successful and set *POP to the comparison operation. */
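/* For example, as the conversions below show, a == 0 can be tested as
   (unsigned) a < 1 and a >= 0 as (unsigned) a < 0x80000000; both leave
   the result in the carry flag where sbb-style sequences can consume
   it.  */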
18562 static bool
18563 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18564 {
18565 enum machine_mode mode =
18566 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18567
18568 /* Do not handle double-mode compares that go through the special path. */
18569 if (mode == (TARGET_64BIT ? TImode : DImode))
18570 return false;
18571
18572 if (SCALAR_FLOAT_MODE_P (mode))
18573 {
18574 rtx compare_op, compare_seq;
18575
18576 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18577
18578 /* Shortcut: the following common codes never translate
18579 into carry flag compares. */
18580 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18581 || code == ORDERED || code == UNORDERED)
18582 return false;
18583
18584 /* These comparisons require the zero flag; swap the operands so they won't need it. */
18585 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18586 && !TARGET_IEEE_FP)
18587 {
18588 rtx tmp = op0;
18589 op0 = op1;
18590 op1 = tmp;
18591 code = swap_condition (code);
18592 }
18593
18594 /* Try to expand the comparison and verify that we end up with
18595 a carry flag based comparison. This fails to be true only when
18596 we decide to expand the comparison using arithmetic, which is not
18597 a common scenario. */
18598 start_sequence ();
18599 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18600 compare_seq = get_insns ();
18601 end_sequence ();
18602
18603 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18604 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18605 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18606 else
18607 code = GET_CODE (compare_op);
18608
18609 if (code != LTU && code != GEU)
18610 return false;
18611
18612 emit_insn (compare_seq);
18613 *pop = compare_op;
18614 return true;
18615 }
18616
18617 if (!INTEGRAL_MODE_P (mode))
18618 return false;
18619
18620 switch (code)
18621 {
18622 case LTU:
18623 case GEU:
18624 break;
18625
18626 /* Convert a==0 into (unsigned)a<1. */
18627 case EQ:
18628 case NE:
18629 if (op1 != const0_rtx)
18630 return false;
18631 op1 = const1_rtx;
18632 code = (code == EQ ? LTU : GEU);
18633 break;
18634
18635 /* Convert a>b into b<a or a>=b+1. */
18636 case GTU:
18637 case LEU:
18638 if (CONST_INT_P (op1))
18639 {
18640 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18641 /* Bail out on overflow. We still can swap operands but that
18642 would force loading of the constant into register. */
18643 if (op1 == const0_rtx
18644 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18645 return false;
18646 code = (code == GTU ? GEU : LTU);
18647 }
18648 else
18649 {
18650 rtx tmp = op1;
18651 op1 = op0;
18652 op0 = tmp;
18653 code = (code == GTU ? LTU : GEU);
18654 }
18655 break;
18656
18657 /* Convert a>=0 into (unsigned)a<0x80000000. */
18658 case LT:
18659 case GE:
18660 if (mode == DImode || op1 != const0_rtx)
18661 return false;
18662 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18663 code = (code == LT ? GEU : LTU);
18664 break;
18665 case LE:
18666 case GT:
18667 if (mode == DImode || op1 != constm1_rtx)
18668 return false;
18669 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18670 code = (code == LE ? GEU : LTU);
18671 break;
18672
18673 default:
18674 return false;
18675 }
18676 /* Swapping operands may cause constant to appear as first operand. */
18677 if (!nonimmediate_operand (op0, VOIDmode))
18678 {
18679 if (!can_create_pseudo_p ())
18680 return false;
18681 op0 = force_reg (mode, op0);
18682 }
18683 *pop = ix86_expand_compare (code, op0, op1);
18684 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18685 return true;
18686 }
18687
18688 bool
18689 ix86_expand_int_movcc (rtx operands[])
18690 {
18691 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18692 rtx compare_seq, compare_op;
18693 enum machine_mode mode = GET_MODE (operands[0]);
18694 bool sign_bit_compare_p = false;
18695 rtx op0 = XEXP (operands[1], 0);
18696 rtx op1 = XEXP (operands[1], 1);
18697
18698 start_sequence ();
18699 compare_op = ix86_expand_compare (code, op0, op1);
18700 compare_seq = get_insns ();
18701 end_sequence ();
18702
18703 compare_code = GET_CODE (compare_op);
18704
18705 if ((op1 == const0_rtx && (code == GE || code == LT))
18706 || (op1 == constm1_rtx && (code == GT || code == LE)))
18707 sign_bit_compare_p = true;
18708
18709 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18710 HImode insns, we'd be swallowed in word prefix ops. */
18711
18712 if ((mode != HImode || TARGET_FAST_PREFIX)
18713 && (mode != (TARGET_64BIT ? TImode : DImode))
18714 && CONST_INT_P (operands[2])
18715 && CONST_INT_P (operands[3]))
18716 {
18717 rtx out = operands[0];
18718 HOST_WIDE_INT ct = INTVAL (operands[2]);
18719 HOST_WIDE_INT cf = INTVAL (operands[3]);
18720 HOST_WIDE_INT diff;
18721
18722 diff = ct - cf;
18723 /* Sign bit compares are better done using shifts than by using
18724 sbb. */
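/* The carry flag sequences below rely on "sbb %reg, %reg" computing
   reg - reg - CF, i.e. materializing all-ones when the carry flag is set
   and zero otherwise; the x86_mov{si,di}cc_0_m1 patterns emitted here
   expand to exactly that.  */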
18725 if (sign_bit_compare_p
18726 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18727 {
18728 /* Detect overlap between destination and compare sources. */
18729 rtx tmp = out;
18730
18731 if (!sign_bit_compare_p)
18732 {
18733 rtx flags;
18734 bool fpcmp = false;
18735
18736 compare_code = GET_CODE (compare_op);
18737
18738 flags = XEXP (compare_op, 0);
18739
18740 if (GET_MODE (flags) == CCFPmode
18741 || GET_MODE (flags) == CCFPUmode)
18742 {
18743 fpcmp = true;
18744 compare_code
18745 = ix86_fp_compare_code_to_integer (compare_code);
18746 }
18747
18748 /* To simplify rest of code, restrict to the GEU case. */
18749 if (compare_code == LTU)
18750 {
18751 HOST_WIDE_INT tmp = ct;
18752 ct = cf;
18753 cf = tmp;
18754 compare_code = reverse_condition (compare_code);
18755 code = reverse_condition (code);
18756 }
18757 else
18758 {
18759 if (fpcmp)
18760 PUT_CODE (compare_op,
18761 reverse_condition_maybe_unordered
18762 (GET_CODE (compare_op)));
18763 else
18764 PUT_CODE (compare_op,
18765 reverse_condition (GET_CODE (compare_op)));
18766 }
18767 diff = ct - cf;
18768
18769 if (reg_overlap_mentioned_p (out, op0)
18770 || reg_overlap_mentioned_p (out, op1))
18771 tmp = gen_reg_rtx (mode);
18772
18773 if (mode == DImode)
18774 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18775 else
18776 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18777 flags, compare_op));
18778 }
18779 else
18780 {
18781 if (code == GT || code == GE)
18782 code = reverse_condition (code);
18783 else
18784 {
18785 HOST_WIDE_INT tmp = ct;
18786 ct = cf;
18787 cf = tmp;
18788 diff = ct - cf;
18789 }
18790 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18791 }
18792
18793 if (diff == 1)
18794 {
18795 /*
18796 * cmpl op0,op1
18797 * sbbl dest,dest
18798 * [addl dest, ct]
18799 *
18800 * Size 5 - 8.
18801 */
18802 if (ct)
18803 tmp = expand_simple_binop (mode, PLUS,
18804 tmp, GEN_INT (ct),
18805 copy_rtx (tmp), 1, OPTAB_DIRECT);
18806 }
18807 else if (cf == -1)
18808 {
18809 /*
18810 * cmpl op0,op1
18811 * sbbl dest,dest
18812 * orl $ct, dest
18813 *
18814 * Size 8.
18815 */
18816 tmp = expand_simple_binop (mode, IOR,
18817 tmp, GEN_INT (ct),
18818 copy_rtx (tmp), 1, OPTAB_DIRECT);
18819 }
18820 else if (diff == -1 && ct)
18821 {
18822 /*
18823 * cmpl op0,op1
18824 * sbbl dest,dest
18825 * notl dest
18826 * [addl dest, cf]
18827 *
18828 * Size 8 - 11.
18829 */
18830 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18831 if (cf)
18832 tmp = expand_simple_binop (mode, PLUS,
18833 copy_rtx (tmp), GEN_INT (cf),
18834 copy_rtx (tmp), 1, OPTAB_DIRECT);
18835 }
18836 else
18837 {
18838 /*
18839 * cmpl op0,op1
18840 * sbbl dest,dest
18841 * [notl dest]
18842 * andl cf - ct, dest
18843 * [addl dest, ct]
18844 *
18845 * Size 8 - 11.
18846 */
18847
18848 if (cf == 0)
18849 {
18850 cf = ct;
18851 ct = 0;
18852 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18853 }
18854
18855 tmp = expand_simple_binop (mode, AND,
18856 copy_rtx (tmp),
18857 gen_int_mode (cf - ct, mode),
18858 copy_rtx (tmp), 1, OPTAB_DIRECT);
18859 if (ct)
18860 tmp = expand_simple_binop (mode, PLUS,
18861 copy_rtx (tmp), GEN_INT (ct),
18862 copy_rtx (tmp), 1, OPTAB_DIRECT);
18863 }
18864
18865 if (!rtx_equal_p (tmp, out))
18866 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18867
18868 return true;
18869 }
18870
18871 if (diff < 0)
18872 {
18873 enum machine_mode cmp_mode = GET_MODE (op0);
18874
18875 HOST_WIDE_INT tmp;
18876 tmp = ct, ct = cf, cf = tmp;
18877 diff = -diff;
18878
18879 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18880 {
18881 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18882
18883 /* We may be reversing unordered compare to normal compare, that
18884 is not valid in general (we may convert non-trapping condition
18885 to trapping one), however on i386 we currently emit all
18886 comparisons unordered. */
18887 compare_code = reverse_condition_maybe_unordered (compare_code);
18888 code = reverse_condition_maybe_unordered (code);
18889 }
18890 else
18891 {
18892 compare_code = reverse_condition (compare_code);
18893 code = reverse_condition (code);
18894 }
18895 }
18896
18897 compare_code = UNKNOWN;
18898 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18899 && CONST_INT_P (op1))
18900 {
18901 if (op1 == const0_rtx
18902 && (code == LT || code == GE))
18903 compare_code = code;
18904 else if (op1 == constm1_rtx)
18905 {
18906 if (code == LE)
18907 compare_code = LT;
18908 else if (code == GT)
18909 compare_code = GE;
18910 }
18911 }
18912
18913 /* Optimize dest = (op0 < 0) ? -1 : cf. */
18914 if (compare_code != UNKNOWN
18915 && GET_MODE (op0) == GET_MODE (out)
18916 && (cf == -1 || ct == -1))
18917 {
18918 /* If lea code below could be used, only optimize
18919 if it results in a 2 insn sequence. */
18920
18921 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
18922 || diff == 3 || diff == 5 || diff == 9)
18923 || (compare_code == LT && ct == -1)
18924 || (compare_code == GE && cf == -1))
18925 {
18926 /*
18927 * notl op1 (if necessary)
18928 * sarl $31, op1
18929 * orl cf, op1
18930 */
18931 if (ct != -1)
18932 {
18933 cf = ct;
18934 ct = -1;
18935 code = reverse_condition (code);
18936 }
18937
18938 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18939
18940 out = expand_simple_binop (mode, IOR,
18941 out, GEN_INT (cf),
18942 out, 1, OPTAB_DIRECT);
18943 if (out != operands[0])
18944 emit_move_insn (operands[0], out);
18945
18946 return true;
18947 }
18948 }
18949
18950
18951 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
18952 || diff == 3 || diff == 5 || diff == 9)
18953 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
18954 && (mode != DImode
18955 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
18956 {
18957 /*
18958 * xorl dest,dest
18959 * cmpl op1,op2
18960 * setcc dest
18961 * lea cf(dest*(ct-cf)),dest
18962 *
18963 * Size 14.
18964 *
18965 * This also catches the degenerate setcc-only case.
18966 */
18967
18968 rtx tmp;
18969 int nops;
18970
18971 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18972
18973 nops = 0;
18974 /* On x86_64 the lea instruction operates on Pmode, so we need
18975 to get the arithmetic done in the proper mode to match. */
18976 if (diff == 1)
18977 tmp = copy_rtx (out);
18978 else
18979 {
18980 rtx out1;
18981 out1 = copy_rtx (out);
18982 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
18983 nops++;
18984 if (diff & 1)
18985 {
18986 tmp = gen_rtx_PLUS (mode, tmp, out1);
18987 nops++;
18988 }
18989 }
18990 if (cf != 0)
18991 {
18992 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
18993 nops++;
18994 }
18995 if (!rtx_equal_p (tmp, out))
18996 {
18997 if (nops == 1)
18998 out = force_operand (tmp, copy_rtx (out));
18999 else
19000 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19001 }
19002 if (!rtx_equal_p (out, operands[0]))
19003 emit_move_insn (operands[0], copy_rtx (out));
19004
19005 return true;
19006 }
19007
19008 /*
19009 * General case: Jumpful:
19010 * xorl dest,dest cmpl op1, op2
19011 * cmpl op1, op2 movl ct, dest
19012 * setcc dest jcc 1f
19013 * decl dest movl cf, dest
19014 * andl (cf-ct),dest 1:
19015 * addl ct,dest
19016 *
19017 * Size 20. Size 14.
19018 *
19019 * This is reasonably steep, but branch mispredict costs are
19020 * high on modern cpus, so consider failing only if optimizing
19021 * for space.
19022 */
19023
19024 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19025 && BRANCH_COST (optimize_insn_for_speed_p (),
19026 false) >= 2)
19027 {
19028 if (cf == 0)
19029 {
19030 enum machine_mode cmp_mode = GET_MODE (op0);
19031
19032 cf = ct;
19033 ct = 0;
19034
19035 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19036 {
19037 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19038
19039 /* We may be reversing unordered compare to normal compare,
19040 that is not valid in general (we may convert non-trapping
19041 condition to trapping one), however on i386 we currently
19042 emit all comparisons unordered. */
19043 code = reverse_condition_maybe_unordered (code);
19044 }
19045 else
19046 {
19047 code = reverse_condition (code);
19048 if (compare_code != UNKNOWN)
19049 compare_code = reverse_condition (compare_code);
19050 }
19051 }
19052
19053 if (compare_code != UNKNOWN)
19054 {
19055 /* notl op1 (if needed)
19056 sarl $31, op1
19057 andl (cf-ct), op1
19058 addl ct, op1
19059
19060 For x < 0 (resp. x <= -1) there will be no notl,
19061 so if possible swap the constants to get rid of the
19062 complement.
19063 True/false will be -1/0 while code below (store flag
19064 followed by decrement) is 0/-1, so the constants need
19065 to be exchanged once more. */
19066
19067 if (compare_code == GE || !cf)
19068 {
19069 code = reverse_condition (code);
19070 compare_code = LT;
19071 }
19072 else
19073 {
19074 HOST_WIDE_INT tmp = cf;
19075 cf = ct;
19076 ct = tmp;
19077 }
19078
19079 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19080 }
19081 else
19082 {
19083 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19084
19085 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19086 constm1_rtx,
19087 copy_rtx (out), 1, OPTAB_DIRECT);
19088 }
19089
19090 out = expand_simple_binop (mode, AND, copy_rtx (out),
19091 gen_int_mode (cf - ct, mode),
19092 copy_rtx (out), 1, OPTAB_DIRECT);
19093 if (ct)
19094 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19095 copy_rtx (out), 1, OPTAB_DIRECT);
19096 if (!rtx_equal_p (out, operands[0]))
19097 emit_move_insn (operands[0], copy_rtx (out));
19098
19099 return true;
19100 }
19101 }
19102
19103 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19104 {
19105 /* Try a few things more with specific constants and a variable. */
19106
19107 optab op;
19108 rtx var, orig_out, out, tmp;
19109
19110 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19111 return false;
19112
19113 /* If one of the two operands is an interesting constant, load a
19114 constant with the above and mask it in with a logical operation. */
19115
19116 if (CONST_INT_P (operands[2]))
19117 {
19118 var = operands[3];
19119 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19120 operands[3] = constm1_rtx, op = and_optab;
19121 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19122 operands[3] = const0_rtx, op = ior_optab;
19123 else
19124 return false;
19125 }
19126 else if (CONST_INT_P (operands[3]))
19127 {
19128 var = operands[2];
19129 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19130 operands[2] = constm1_rtx, op = and_optab;
19131 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19132 operands[2] = const0_rtx, op = ior_optab;
19133 else
19134 return false;
19135 }
19136 else
19137 return false;
19138
19139 orig_out = operands[0];
19140 tmp = gen_reg_rtx (mode);
19141 operands[0] = tmp;
19142
19143 /* Recurse to get the constant loaded. */
19144 if (ix86_expand_int_movcc (operands) == 0)
19145 return false;
19146
19147 /* Mask in the interesting variable. */
19148 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19149 OPTAB_WIDEN);
19150 if (!rtx_equal_p (out, orig_out))
19151 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19152
19153 return true;
19154 }
19155
19156 /*
19157 * For comparison with above,
19158 *
19159 * movl cf,dest
19160 * movl ct,tmp
19161 * cmpl op1,op2
19162 * cmovcc tmp,dest
19163 *
19164 * Size 15.
19165 */
19166
19167 if (! nonimmediate_operand (operands[2], mode))
19168 operands[2] = force_reg (mode, operands[2]);
19169 if (! nonimmediate_operand (operands[3], mode))
19170 operands[3] = force_reg (mode, operands[3]);
19171
19172 if (! register_operand (operands[2], VOIDmode)
19173 && (mode == QImode
19174 || ! register_operand (operands[3], VOIDmode)))
19175 operands[2] = force_reg (mode, operands[2]);
19176
19177 if (mode == QImode
19178 && ! register_operand (operands[3], VOIDmode))
19179 operands[3] = force_reg (mode, operands[3]);
19180
19181 emit_insn (compare_seq);
19182 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19183 gen_rtx_IF_THEN_ELSE (mode,
19184 compare_op, operands[2],
19185 operands[3])));
19186 return true;
19187 }
19188
19189 /* Swap, force into registers, or otherwise massage the two operands
19190 to an sse comparison with a mask result. Thus we differ a bit from
19191 ix86_prepare_fp_compare_args which expects to produce a flags result.
19192
19193 The DEST operand exists to help determine whether to commute commutative
19194 operators. The POP0/POP1 operands are updated in place. The new
19195 comparison code is returned, or UNKNOWN if not implementable. */
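/* As a reminder, the pre-AVX cmpps/cmppd predicates only cover EQ, LT,
   LE, UNORD, NEQ, NLT (i.e. UNGE), NLE (i.e. UNGT) and ORD, so e.g. GT
   has to be rewritten as LT with the operands swapped; that is the
   transformation performed below.  */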
19196
19197 static enum rtx_code
19198 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19199 rtx *pop0, rtx *pop1)
19200 {
19201 rtx tmp;
19202
19203 switch (code)
19204 {
19205 case LTGT:
19206 case UNEQ:
19207 /* AVX supports all the needed comparisons. */
19208 if (TARGET_AVX)
19209 break;
19210 /* We have no LTGT as an operator. We could implement it with
19211 NE & ORDERED, but this requires an extra temporary. It's
19212 not clear that it's worth it. */
19213 return UNKNOWN;
19214
19215 case LT:
19216 case LE:
19217 case UNGT:
19218 case UNGE:
19219 /* These are supported directly. */
19220 break;
19221
19222 case EQ:
19223 case NE:
19224 case UNORDERED:
19225 case ORDERED:
19226 /* AVX has 3 operand comparisons, no need to swap anything. */
19227 if (TARGET_AVX)
19228 break;
19229 /* For commutative operators, try to canonicalize the destination
19230 operand to be first in the comparison - this helps reload to
19231 avoid extra moves. */
19232 if (!dest || !rtx_equal_p (dest, *pop1))
19233 break;
19234 /* FALLTHRU */
19235
19236 case GE:
19237 case GT:
19238 case UNLE:
19239 case UNLT:
19240 /* These are not supported directly before AVX, and furthermore
19241 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19242 comparison operands to transform into something that is
19243 supported. */
19244 tmp = *pop0;
19245 *pop0 = *pop1;
19246 *pop1 = tmp;
19247 code = swap_condition (code);
19248 break;
19249
19250 default:
19251 gcc_unreachable ();
19252 }
19253
19254 return code;
19255 }
19256
19257 /* Detect conditional moves that exactly match min/max operational
19258 semantics. Note that this is IEEE safe, as long as we don't
19259 interchange the operands.
19260
19261 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19262 and TRUE if the operation is successful and instructions are emitted. */
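/* Only LT and UNGE are recognized because they match the semantics of
   minps/maxps directly; those instructions return the second source
   operand when the operands compare equal (e.g. -0.0 vs. +0.0) or when a
   NaN is involved, so operand order matters. That is why, when NaNs or
   signed zeros must be honored, the code below wraps the operation in
   UNSPEC_IEEE_MIN/MAX instead of using the commutative SMIN/SMAX codes.  */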
19263
19264 static bool
19265 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19266 rtx cmp_op1, rtx if_true, rtx if_false)
19267 {
19268 enum machine_mode mode;
19269 bool is_min;
19270 rtx tmp;
19271
19272 if (code == LT)
19273 ;
19274 else if (code == UNGE)
19275 {
19276 tmp = if_true;
19277 if_true = if_false;
19278 if_false = tmp;
19279 }
19280 else
19281 return false;
19282
19283 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19284 is_min = true;
19285 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19286 is_min = false;
19287 else
19288 return false;
19289
19290 mode = GET_MODE (dest);
19291
19292 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19293 but MODE may be a vector mode and thus not appropriate. */
19294 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19295 {
19296 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19297 rtvec v;
19298
19299 if_true = force_reg (mode, if_true);
19300 v = gen_rtvec (2, if_true, if_false);
19301 tmp = gen_rtx_UNSPEC (mode, v, u);
19302 }
19303 else
19304 {
19305 code = is_min ? SMIN : SMAX;
19306 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19307 }
19308
19309 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19310 return true;
19311 }
19312
19313 /* Expand an sse vector comparison. Return the register with the result. */
19314
19315 static rtx
19316 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19317 rtx op_true, rtx op_false)
19318 {
19319 enum machine_mode mode = GET_MODE (dest);
19320 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19321 rtx x;
19322
19323 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19324 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19325 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19326
19327 if (optimize
19328 || reg_overlap_mentioned_p (dest, op_true)
19329 || reg_overlap_mentioned_p (dest, op_false))
19330 dest = gen_reg_rtx (mode);
19331
19332 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19333 if (cmp_mode != mode)
19334 {
19335 x = force_reg (cmp_mode, x);
19336 convert_move (dest, x, false);
19337 }
19338 else
19339 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19340
19341 return dest;
19342 }
19343
19344 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19345 operations. This is used for both scalar and vector conditional moves. */
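/* In the general case, when no blend instruction applies, the expansion
   below computes dest = (cmp & op_true) | (~cmp & op_false), relying on
   the comparison result being all-ones or all-zeros in each element.  */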
19346
19347 static void
19348 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19349 {
19350 enum machine_mode mode = GET_MODE (dest);
19351 rtx t2, t3, x;
19352
19353 if (vector_all_ones_operand (op_true, mode)
19354 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19355 {
19356 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19357 }
19358 else if (op_false == CONST0_RTX (mode))
19359 {
19360 op_true = force_reg (mode, op_true);
19361 x = gen_rtx_AND (mode, cmp, op_true);
19362 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19363 }
19364 else if (op_true == CONST0_RTX (mode))
19365 {
19366 op_false = force_reg (mode, op_false);
19367 x = gen_rtx_NOT (mode, cmp);
19368 x = gen_rtx_AND (mode, x, op_false);
19369 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19370 }
19371 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19372 {
19373 op_false = force_reg (mode, op_false);
19374 x = gen_rtx_IOR (mode, cmp, op_false);
19375 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19376 }
19377 else if (TARGET_XOP)
19378 {
19379 op_true = force_reg (mode, op_true);
19380
19381 if (!nonimmediate_operand (op_false, mode))
19382 op_false = force_reg (mode, op_false);
19383
19384 emit_insn (gen_rtx_SET (mode, dest,
19385 gen_rtx_IF_THEN_ELSE (mode, cmp,
19386 op_true,
19387 op_false)));
19388 }
19389 else
19390 {
19391 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19392
19393 if (!nonimmediate_operand (op_true, mode))
19394 op_true = force_reg (mode, op_true);
19395
19396 op_false = force_reg (mode, op_false);
19397
19398 switch (mode)
19399 {
19400 case V4SFmode:
19401 if (TARGET_SSE4_1)
19402 gen = gen_sse4_1_blendvps;
19403 break;
19404 case V2DFmode:
19405 if (TARGET_SSE4_1)
19406 gen = gen_sse4_1_blendvpd;
19407 break;
19408 case V16QImode:
19409 case V8HImode:
19410 case V4SImode:
19411 case V2DImode:
19412 if (TARGET_SSE4_1)
19413 {
19414 gen = gen_sse4_1_pblendvb;
19415 dest = gen_lowpart (V16QImode, dest);
19416 op_false = gen_lowpart (V16QImode, op_false);
19417 op_true = gen_lowpart (V16QImode, op_true);
19418 cmp = gen_lowpart (V16QImode, cmp);
19419 }
19420 break;
19421 case V8SFmode:
19422 if (TARGET_AVX)
19423 gen = gen_avx_blendvps256;
19424 break;
19425 case V4DFmode:
19426 if (TARGET_AVX)
19427 gen = gen_avx_blendvpd256;
19428 break;
19429 case V32QImode:
19430 case V16HImode:
19431 case V8SImode:
19432 case V4DImode:
19433 if (TARGET_AVX2)
19434 {
19435 gen = gen_avx2_pblendvb;
19436 dest = gen_lowpart (V32QImode, dest);
19437 op_false = gen_lowpart (V32QImode, op_false);
19438 op_true = gen_lowpart (V32QImode, op_true);
19439 cmp = gen_lowpart (V32QImode, cmp);
19440 }
19441 break;
19442 default:
19443 break;
19444 }
19445
19446 if (gen != NULL)
19447 emit_insn (gen (dest, op_false, op_true, cmp));
19448 else
19449 {
19450 op_true = force_reg (mode, op_true);
19451
19452 t2 = gen_reg_rtx (mode);
19453 if (optimize)
19454 t3 = gen_reg_rtx (mode);
19455 else
19456 t3 = dest;
19457
19458 x = gen_rtx_AND (mode, op_true, cmp);
19459 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19460
19461 x = gen_rtx_NOT (mode, cmp);
19462 x = gen_rtx_AND (mode, x, op_false);
19463 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19464
19465 x = gen_rtx_IOR (mode, t3, t2);
19466 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19467 }
19468 }
19469 }
19470
19471 /* Expand a floating-point conditional move. Return true if successful. */
19472
19473 bool
19474 ix86_expand_fp_movcc (rtx operands[])
19475 {
19476 enum machine_mode mode = GET_MODE (operands[0]);
19477 enum rtx_code code = GET_CODE (operands[1]);
19478 rtx tmp, compare_op;
19479 rtx op0 = XEXP (operands[1], 0);
19480 rtx op1 = XEXP (operands[1], 1);
19481
19482 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19483 {
19484 enum machine_mode cmode;
19485
19486 /* Since we've no cmove for sse registers, don't force bad register
19487 allocation just to gain access to it. Deny movcc when the
19488 comparison mode doesn't match the move mode. */
19489 cmode = GET_MODE (op0);
19490 if (cmode == VOIDmode)
19491 cmode = GET_MODE (op1);
19492 if (cmode != mode)
19493 return false;
19494
19495 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19496 if (code == UNKNOWN)
19497 return false;
19498
19499 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19500 operands[2], operands[3]))
19501 return true;
19502
19503 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19504 operands[2], operands[3]);
19505 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19506 return true;
19507 }
19508
19509 /* The floating point conditional move instructions don't directly
19510 support conditions resulting from a signed integer comparison. */
19511
19512 compare_op = ix86_expand_compare (code, op0, op1);
19513 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19514 {
19515 tmp = gen_reg_rtx (QImode);
19516 ix86_expand_setcc (tmp, code, op0, op1);
19517
19518 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19519 }
19520
19521 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19522 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19523 operands[2], operands[3])));
19524
19525 return true;
19526 }
19527
19528 /* Expand a floating-point vector conditional move; a vcond operation
19529 rather than a movcc operation. */
19530
19531 bool
19532 ix86_expand_fp_vcond (rtx operands[])
19533 {
19534 enum rtx_code code = GET_CODE (operands[3]);
19535 rtx cmp;
19536
19537 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19538 &operands[4], &operands[5]);
19539 if (code == UNKNOWN)
19540 {
19541 rtx temp;
19542 switch (GET_CODE (operands[3]))
19543 {
19544 case LTGT:
19545 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19546 operands[5], operands[0], operands[0]);
19547 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19548 operands[5], operands[1], operands[2]);
19549 code = AND;
19550 break;
19551 case UNEQ:
19552 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19553 operands[5], operands[0], operands[0]);
19554 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19555 operands[5], operands[1], operands[2]);
19556 code = IOR;
19557 break;
19558 default:
19559 gcc_unreachable ();
19560 }
19561 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19562 OPTAB_DIRECT);
19563 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19564 return true;
19565 }
19566
19567 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19568 operands[5], operands[1], operands[2]))
19569 return true;
19570
19571 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19572 operands[1], operands[2]);
19573 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19574 return true;
19575 }
19576
19577 /* Expand a signed/unsigned integral vector conditional move. */
19578
19579 bool
19580 ix86_expand_int_vcond (rtx operands[])
19581 {
19582 enum machine_mode data_mode = GET_MODE (operands[0]);
19583 enum machine_mode mode = GET_MODE (operands[4]);
19584 enum rtx_code code = GET_CODE (operands[3]);
19585 bool negate = false;
19586 rtx x, cop0, cop1;
19587
19588 cop0 = operands[4];
19589 cop1 = operands[5];
19590
19591 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19592 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
19593 if ((code == LT || code == GE)
19594 && data_mode == mode
19595 && cop1 == CONST0_RTX (mode)
19596 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19597 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19598 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19599 && (GET_MODE_SIZE (data_mode) == 16
19600 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19601 {
19602 rtx negop = operands[2 - (code == LT)];
19603 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19604 if (negop == CONST1_RTX (data_mode))
19605 {
19606 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19607 operands[0], 1, OPTAB_DIRECT);
19608 if (res != operands[0])
19609 emit_move_insn (operands[0], res);
19610 return true;
19611 }
19612 else if (GET_MODE_INNER (data_mode) != DImode
19613 && vector_all_ones_operand (negop, data_mode))
19614 {
19615 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19616 operands[0], 0, OPTAB_DIRECT);
19617 if (res != operands[0])
19618 emit_move_insn (operands[0], res);
19619 return true;
19620 }
19621 }
19622
19623 if (!nonimmediate_operand (cop1, mode))
19624 cop1 = force_reg (mode, cop1);
19625 if (!general_operand (operands[1], data_mode))
19626 operands[1] = force_reg (data_mode, operands[1]);
19627 if (!general_operand (operands[2], data_mode))
19628 operands[2] = force_reg (data_mode, operands[2]);
19629
19630 /* XOP supports all of the comparisons on all 128-bit vector int types. */
19631 if (TARGET_XOP
19632 && (mode == V16QImode || mode == V8HImode
19633 || mode == V4SImode || mode == V2DImode))
19634 ;
19635 else
19636 {
19637 /* Canonicalize the comparison to EQ, GT, GTU. */
19638 switch (code)
19639 {
19640 case EQ:
19641 case GT:
19642 case GTU:
19643 break;
19644
19645 case NE:
19646 case LE:
19647 case LEU:
19648 code = reverse_condition (code);
19649 negate = true;
19650 break;
19651
19652 case GE:
19653 case GEU:
19654 code = reverse_condition (code);
19655 negate = true;
19656 /* FALLTHRU */
19657
19658 case LT:
19659 case LTU:
19660 code = swap_condition (code);
19661 x = cop0, cop0 = cop1, cop1 = x;
19662 break;
19663
19664 default:
19665 gcc_unreachable ();
19666 }
19667
19668 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19669 if (mode == V2DImode)
19670 {
19671 switch (code)
19672 {
19673 case EQ:
19674 /* SSE4.1 supports EQ. */
19675 if (!TARGET_SSE4_1)
19676 return false;
19677 break;
19678
19679 case GT:
19680 case GTU:
19681 /* SSE4.2 supports GT/GTU. */
19682 if (!TARGET_SSE4_2)
19683 return false;
19684 break;
19685
19686 default:
19687 gcc_unreachable ();
19688 }
19689 }
19690
19691 /* Unsigned parallel compare is not supported by the hardware.
19692 Play some tricks to turn this into a signed comparison
19693 against 0. */
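/* Two standard tricks are used: for SImode/DImode elements,
   x >u y is equivalent to (x - 0x80..0) >s (y - 0x80..0), i.e. bias both
   operands by the sign bit and compare signed; for QImode/HImode
   elements, x >u y is equivalent to (x -us y) != 0 using unsigned
   saturating subtraction.  */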
19694 if (code == GTU)
19695 {
19696 cop0 = force_reg (mode, cop0);
19697
19698 switch (mode)
19699 {
19700 case V8SImode:
19701 case V4DImode:
19702 case V4SImode:
19703 case V2DImode:
19704 {
19705 rtx t1, t2, mask;
19706 rtx (*gen_sub3) (rtx, rtx, rtx);
19707
19708 switch (mode)
19709 {
19710 case V8SImode: gen_sub3 = gen_subv8si3; break;
19711 case V4DImode: gen_sub3 = gen_subv4di3; break;
19712 case V4SImode: gen_sub3 = gen_subv4si3; break;
19713 case V2DImode: gen_sub3 = gen_subv2di3; break;
19714 default:
19715 gcc_unreachable ();
19716 }
19717 /* Subtract (-(INT MAX) - 1) from both operands to make
19718 them signed. */
19719 mask = ix86_build_signbit_mask (mode, true, false);
19720 t1 = gen_reg_rtx (mode);
19721 emit_insn (gen_sub3 (t1, cop0, mask));
19722
19723 t2 = gen_reg_rtx (mode);
19724 emit_insn (gen_sub3 (t2, cop1, mask));
19725
19726 cop0 = t1;
19727 cop1 = t2;
19728 code = GT;
19729 }
19730 break;
19731
19732 case V32QImode:
19733 case V16HImode:
19734 case V16QImode:
19735 case V8HImode:
19736 /* Perform a parallel unsigned saturating subtraction. */
19737 x = gen_reg_rtx (mode);
19738 emit_insn (gen_rtx_SET (VOIDmode, x,
19739 gen_rtx_US_MINUS (mode, cop0, cop1)));
19740
19741 cop0 = x;
19742 cop1 = CONST0_RTX (mode);
19743 code = EQ;
19744 negate = !negate;
19745 break;
19746
19747 default:
19748 gcc_unreachable ();
19749 }
19750 }
19751 }
19752
19753 /* Allow the comparison to be done in one mode, but the movcc to
19754 happen in another mode. */
19755 if (data_mode == mode)
19756 {
19757 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19758 operands[1+negate], operands[2-negate]);
19759 }
19760 else
19761 {
19762 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19763 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19764 code, cop0, cop1,
19765 operands[1+negate], operands[2-negate]);
19766 x = gen_lowpart (data_mode, x);
19767 }
19768
19769 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19770 operands[2-negate]);
19771 return true;
19772 }
19773
19774 /* Expand a variable vector permutation. */
19775
19776 void
19777 ix86_expand_vec_perm (rtx operands[])
19778 {
19779 rtx target = operands[0];
19780 rtx op0 = operands[1];
19781 rtx op1 = operands[2];
19782 rtx mask = operands[3];
19783 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19784 enum machine_mode mode = GET_MODE (op0);
19785 enum machine_mode maskmode = GET_MODE (mask);
19786 int w, e, i;
19787 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19788
19789 /* Number of elements in the vector. */
19790 w = GET_MODE_NUNITS (mode);
19791 e = GET_MODE_UNIT_SIZE (mode);
19792 gcc_assert (w <= 32);
19793
19794 if (TARGET_AVX2)
19795 {
19796 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19797 {
19798 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19799 a constant shuffle operand. With a tiny bit of effort we can
19800 use VPERMD instead. A re-interpretation stall for V4DFmode is
19801 unfortunate but there's no avoiding it.
19802 Similarly for V16HImode we don't have instructions for variable
19803 shuffling, while for V32QImode, after preparing suitable masks,
19804 we can use vpshufb; vpshufb; vpermq; vpor. */
19805
19806 if (mode == V16HImode)
19807 {
19808 maskmode = mode = V32QImode;
19809 w = 32;
19810 e = 1;
19811 }
19812 else
19813 {
19814 maskmode = mode = V8SImode;
19815 w = 8;
19816 e = 4;
19817 }
19818 t1 = gen_reg_rtx (maskmode);
19819
19820 /* Replicate the low bits of the V4DImode mask into V8SImode:
19821 mask = { A B C D }
19822 t1 = { A A B B C C D D }. */
19823 for (i = 0; i < w / 2; ++i)
19824 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19825 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19826 vt = force_reg (maskmode, vt);
19827 mask = gen_lowpart (maskmode, mask);
19828 if (maskmode == V8SImode)
19829 emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
19830 else
19831 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19832
19833 /* Multiply the shuffle indices by two. */
19834 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19835 OPTAB_DIRECT);
19836
19837 /* Add one to the odd shuffle indices:
19838 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19839 for (i = 0; i < w / 2; ++i)
19840 {
19841 vec[i * 2] = const0_rtx;
19842 vec[i * 2 + 1] = const1_rtx;
19843 }
19844 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19845 vt = force_const_mem (maskmode, vt);
19846 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19847 OPTAB_DIRECT);
19848
19849 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19850 operands[3] = mask = t1;
19851 target = gen_lowpart (mode, target);
19852 op0 = gen_lowpart (mode, op0);
19853 op1 = gen_lowpart (mode, op1);
19854 }
19855
19856 switch (mode)
19857 {
19858 case V8SImode:
19859 /* The VPERMD and VPERMPS instructions already properly ignore
19860 the high bits of the shuffle elements. No need for us to
19861 perform an AND ourselves. */
19862 if (one_operand_shuffle)
19863 emit_insn (gen_avx2_permvarv8si (target, mask, op0));
19864 else
19865 {
19866 t1 = gen_reg_rtx (V8SImode);
19867 t2 = gen_reg_rtx (V8SImode);
19868 emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
19869 emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
19870 goto merge_two;
19871 }
19872 return;
19873
19874 case V8SFmode:
19875 mask = gen_lowpart (V8SFmode, mask);
19876 if (one_operand_shuffle)
19877 emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
19878 else
19879 {
19880 t1 = gen_reg_rtx (V8SFmode);
19881 t2 = gen_reg_rtx (V8SFmode);
19882 emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
19883 emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
19884 goto merge_two;
19885 }
19886 return;
19887
19888 case V4SImode:
19889 /* By combining the two 128-bit input vectors into one 256-bit
19890 input vector, we can use VPERMD and VPERMPS for the full
19891 two-operand shuffle. */
19892 t1 = gen_reg_rtx (V8SImode);
19893 t2 = gen_reg_rtx (V8SImode);
19894 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19895 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19896 emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
19897 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19898 return;
19899
19900 case V4SFmode:
19901 t1 = gen_reg_rtx (V8SFmode);
19902 t2 = gen_reg_rtx (V8SFmode);
19903 mask = gen_lowpart (V4SFmode, mask);
19904 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19905 emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
19906 emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
19907 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19908 return;
19909
19910 case V32QImode:
19911 t1 = gen_reg_rtx (V32QImode);
19912 t2 = gen_reg_rtx (V32QImode);
19913 t3 = gen_reg_rtx (V32QImode);
19914 vt2 = GEN_INT (128);
19915 for (i = 0; i < 32; i++)
19916 vec[i] = vt2;
19917 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19918 vt = force_reg (V32QImode, vt);
19919 for (i = 0; i < 32; i++)
19920 vec[i] = i < 16 ? vt2 : const0_rtx;
19921 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19922 vt2 = force_reg (V32QImode, vt2);
19923 /* From mask create two adjusted masks, which contain the same
19924 bits as mask in the low 7 bits of each vector element.
19925 The first mask will have the most significant bit clear
19926 if it requests element from the same 128-bit lane
19927 and MSB set if it requests element from the other 128-bit lane.
19928 The second mask will have the opposite values of the MSB,
19929 and additionally will have its 128-bit lanes swapped.
19930 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
19931 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
19932 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
19933 stands for other 12 bytes. */
19934 /* The bit whether element is from the same lane or the other
19935 lane is bit 4, so shift it up by 3 to the MSB position. */
19936 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
19937 gen_lowpart (V4DImode, mask),
19938 GEN_INT (3)));
19939 /* Clear MSB bits from the mask just in case it had them set. */
19940 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
19941 /* After this t1 will have MSB set for elements from other lane. */
19942 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
19943 /* Clear bits other than MSB. */
19944 emit_insn (gen_andv32qi3 (t1, t1, vt));
19945 /* Or in the lower bits from mask into t3. */
19946 emit_insn (gen_iorv32qi3 (t3, t1, t2));
19947 /* And invert MSB bits in t1, so MSB is set for elements from the same
19948 lane. */
19949 emit_insn (gen_xorv32qi3 (t1, t1, vt));
19950 /* Swap 128-bit lanes in t3. */
19951 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19952 gen_lowpart (V4DImode, t3),
19953 const2_rtx, GEN_INT (3),
19954 const0_rtx, const1_rtx));
19955 /* And or in the lower bits from mask into t1. */
19956 emit_insn (gen_iorv32qi3 (t1, t1, t2));
19957 if (one_operand_shuffle)
19958 {
19959 /* Each of these shuffles will put 0s in places where
19960 element from the other 128-bit lane is needed, otherwise
19961 will shuffle in the requested value. */
19962 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
19963 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
19964 /* For t3 the 128-bit lanes are swapped again. */
19965 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19966 gen_lowpart (V4DImode, t3),
19967 const2_rtx, GEN_INT (3),
19968 const0_rtx, const1_rtx));
19969 /* And oring both together leads to the result. */
19970 emit_insn (gen_iorv32qi3 (target, t1, t3));
19971 return;
19972 }
19973
19974 t4 = gen_reg_rtx (V32QImode);
19975 /* Similarly to the one_operand_shuffle code above,
19976 just repeated twice, once for each operand. The merge_two:
19977 code will merge the two results together. */
19978 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
19979 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
19980 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
19981 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
19982 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
19983 gen_lowpart (V4DImode, t4),
19984 const2_rtx, GEN_INT (3),
19985 const0_rtx, const1_rtx));
19986 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19987 gen_lowpart (V4DImode, t3),
19988 const2_rtx, GEN_INT (3),
19989 const0_rtx, const1_rtx));
19990 emit_insn (gen_iorv32qi3 (t4, t2, t4));
19991 emit_insn (gen_iorv32qi3 (t3, t1, t3));
19992 t1 = t4;
19993 t2 = t3;
19994 goto merge_two;
19995
19996 default:
19997 gcc_assert (GET_MODE_SIZE (mode) <= 16);
19998 break;
19999 }
20000 }
20001
20002 if (TARGET_XOP)
20003 {
20004 /* The XOP VPPERM insn supports three inputs. By ignoring the
20005 one_operand_shuffle special case, we avoid creating another
20006 set of constant vectors in memory. */
20007 one_operand_shuffle = false;
20008
20009 /* mask = mask & {2*w-1, ...} */
20010 vt = GEN_INT (2*w - 1);
20011 }
20012 else
20013 {
20014 /* mask = mask & {w-1, ...} */
20015 vt = GEN_INT (w - 1);
20016 }
20017
20018 for (i = 0; i < w; i++)
20019 vec[i] = vt;
20020 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20021 mask = expand_simple_binop (maskmode, AND, mask, vt,
20022 NULL_RTX, 0, OPTAB_DIRECT);
20023
20024 /* For non-QImode operations, convert the word permutation control
20025 into a byte permutation control. */
20026 if (mode != V16QImode)
20027 {
20028 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20029 GEN_INT (exact_log2 (e)),
20030 NULL_RTX, 0, OPTAB_DIRECT);
20031
20032 /* Convert mask to vector of chars. */
20033 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20034
20035 /* Replicate each of the input bytes into byte positions:
20036 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20037 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20038 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20039 for (i = 0; i < 16; ++i)
20040 vec[i] = GEN_INT (i/e * e);
20041 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20042 vt = force_const_mem (V16QImode, vt);
20043 if (TARGET_XOP)
20044 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20045 else
20046 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20047
20048 /* Convert it into the byte positions by doing
20049 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20050 for (i = 0; i < 16; ++i)
20051 vec[i] = GEN_INT (i % e);
20052 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20053 vt = force_const_mem (V16QImode, vt);
20054 emit_insn (gen_addv16qi3 (mask, mask, vt));
20055 }
20056
20057 /* The actual shuffle operations all operate on V16QImode. */
20058 op0 = gen_lowpart (V16QImode, op0);
20059 op1 = gen_lowpart (V16QImode, op1);
20060 target = gen_lowpart (V16QImode, target);
20061
20062 if (TARGET_XOP)
20063 {
20064 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20065 }
20066 else if (one_operand_shuffle)
20067 {
20068 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20069 }
20070 else
20071 {
20072 rtx xops[6];
20073 bool ok;
20074
20075 /* Shuffle the two input vectors independently. */
20076 t1 = gen_reg_rtx (V16QImode);
20077 t2 = gen_reg_rtx (V16QImode);
20078 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20079 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20080
20081 merge_two:
20082 /* Then merge them together. The key is whether any given control
20083 element contained a bit set that indicates the second word. */
20084 mask = operands[3];
20085 vt = GEN_INT (w);
20086 if (maskmode == V2DImode && !TARGET_SSE4_1)
20087 {
20088 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20089 more shuffle to convert the V2DI input mask into a V4SI
20090 input mask, at which point the masking that expand_int_vcond
20091 performs will work as desired. */
20092 rtx t3 = gen_reg_rtx (V4SImode);
20093 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20094 const0_rtx, const0_rtx,
20095 const2_rtx, const2_rtx));
20096 mask = t3;
20097 maskmode = V4SImode;
20098 e = w = 4;
20099 }
20100
20101 for (i = 0; i < w; i++)
20102 vec[i] = vt;
20103 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20104 vt = force_reg (maskmode, vt);
20105 mask = expand_simple_binop (maskmode, AND, mask, vt,
20106 NULL_RTX, 0, OPTAB_DIRECT);
20107
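/* Each element of MASK is now either W or 0, depending on whether the
control selected the second or the first input vector; the EQ against VT
below turns this into the condition for the vcond-based blend. */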
20108 xops[0] = gen_lowpart (mode, operands[0]);
20109 xops[1] = gen_lowpart (mode, t2);
20110 xops[2] = gen_lowpart (mode, t1);
20111 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20112 xops[4] = mask;
20113 xops[5] = vt;
20114 ok = ix86_expand_int_vcond (xops);
20115 gcc_assert (ok);
20116 }
20117 }
20118
20119 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
20120 true if we should do zero extension, else sign extension. HIGH_P is
20121 true if we want the N/2 high elements, else the low elements. */
20122
20123 void
20124 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
20125 {
20126 enum machine_mode imode = GET_MODE (operands[1]);
20127 rtx tmp, dest;
20128
20129 if (TARGET_SSE4_1)
20130 {
20131 rtx (*unpack)(rtx, rtx);
20132 rtx (*extract)(rtx, rtx) = NULL;
20133 enum machine_mode halfmode = BLKmode;
20134
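/* With SSE4.1 (or AVX2 for the 256-bit modes) there are direct sign/zero
extension patterns. For 256-bit inputs the requested 128-bit half is
extracted first; for 128-bit inputs the high half is instead brought down
by a shift before extending (see below). */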
20135 switch (imode)
20136 {
20137 case V32QImode:
20138 if (unsigned_p)
20139 unpack = gen_avx2_zero_extendv16qiv16hi2;
20140 else
20141 unpack = gen_avx2_sign_extendv16qiv16hi2;
20142 halfmode = V16QImode;
20143 extract
20144 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20145 break;
20146 case V16HImode:
20147 if (unsigned_p)
20148 unpack = gen_avx2_zero_extendv8hiv8si2;
20149 else
20150 unpack = gen_avx2_sign_extendv8hiv8si2;
20151 halfmode = V8HImode;
20152 extract
20153 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20154 break;
20155 case V8SImode:
20156 if (unsigned_p)
20157 unpack = gen_avx2_zero_extendv4siv4di2;
20158 else
20159 unpack = gen_avx2_sign_extendv4siv4di2;
20160 halfmode = V4SImode;
20161 extract
20162 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20163 break;
20164 case V16QImode:
20165 if (unsigned_p)
20166 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20167 else
20168 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20169 break;
20170 case V8HImode:
20171 if (unsigned_p)
20172 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20173 else
20174 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20175 break;
20176 case V4SImode:
20177 if (unsigned_p)
20178 unpack = gen_sse4_1_zero_extendv2siv2di2;
20179 else
20180 unpack = gen_sse4_1_sign_extendv2siv2di2;
20181 break;
20182 default:
20183 gcc_unreachable ();
20184 }
20185
20186 if (GET_MODE_SIZE (imode) == 32)
20187 {
20188 tmp = gen_reg_rtx (halfmode);
20189 emit_insn (extract (tmp, operands[1]));
20190 }
20191 else if (high_p)
20192 {
20193 /* Shift higher 8 bytes to lower 8 bytes. */
20194 tmp = gen_reg_rtx (imode);
20195 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20196 gen_lowpart (V1TImode, operands[1]),
20197 GEN_INT (64)));
20198 }
20199 else
20200 tmp = operands[1];
20201
20202 emit_insn (unpack (operands[0], tmp));
20203 }
20204 else
20205 {
20206 rtx (*unpack)(rtx, rtx, rtx);
20207
20208 switch (imode)
20209 {
20210 case V16QImode:
20211 if (high_p)
20212 unpack = gen_vec_interleave_highv16qi;
20213 else
20214 unpack = gen_vec_interleave_lowv16qi;
20215 break;
20216 case V8HImode:
20217 if (high_p)
20218 unpack = gen_vec_interleave_highv8hi;
20219 else
20220 unpack = gen_vec_interleave_lowv8hi;
20221 break;
20222 case V4SImode:
20223 if (high_p)
20224 unpack = gen_vec_interleave_highv4si;
20225 else
20226 unpack = gen_vec_interleave_lowv4si;
20227 break;
20228 default:
20229 gcc_unreachable ();
20230 }
20231
20232 dest = gen_lowpart (imode, operands[0]);
20233
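/* The second interleave operand supplies the high halves of the widened
elements: zero for zero extension, or (0 > OP) computed elementwise
(all-ones exactly where OP is negative) for sign extension. */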
20234 if (unsigned_p)
20235 tmp = force_reg (imode, CONST0_RTX (imode));
20236 else
20237 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20238 operands[1], pc_rtx, pc_rtx);
20239
20240 emit_insn (unpack (dest, operands[1], tmp));
20241 }
20242 }
20243
20244 /* Expand conditional increment or decrement using adc/sbb instructions.
20245 The default case using setcc followed by the conditional move can be
20246 done by generic code. */
20247 bool
20248 ix86_expand_int_addcc (rtx operands[])
20249 {
20250 enum rtx_code code = GET_CODE (operands[1]);
20251 rtx flags;
20252 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20253 rtx compare_op;
20254 rtx val = const0_rtx;
20255 bool fpcmp = false;
20256 enum machine_mode mode;
20257 rtx op0 = XEXP (operands[1], 0);
20258 rtx op1 = XEXP (operands[1], 1);
20259
20260 if (operands[3] != const1_rtx
20261 && operands[3] != constm1_rtx)
20262 return false;
20263 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20264 return false;
20265 code = GET_CODE (compare_op);
20266
20267 flags = XEXP (compare_op, 0);
20268
20269 if (GET_MODE (flags) == CCFPmode
20270 || GET_MODE (flags) == CCFPUmode)
20271 {
20272 fpcmp = true;
20273 code = ix86_fp_compare_code_to_integer (code);
20274 }
20275
20276 if (code != LTU)
20277 {
20278 val = constm1_rtx;
20279 if (fpcmp)
20280 PUT_CODE (compare_op,
20281 reverse_condition_maybe_unordered
20282 (GET_CODE (compare_op)));
20283 else
20284 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20285 }
20286
20287 mode = GET_MODE (operands[0]);
20288
20289 /* Construct either adc or sbb insn. */
20290 if ((code == LTU) == (operands[3] == constm1_rtx))
20291 {
20292 switch (mode)
20293 {
20294 case QImode:
20295 insn = gen_subqi3_carry;
20296 break;
20297 case HImode:
20298 insn = gen_subhi3_carry;
20299 break;
20300 case SImode:
20301 insn = gen_subsi3_carry;
20302 break;
20303 case DImode:
20304 insn = gen_subdi3_carry;
20305 break;
20306 default:
20307 gcc_unreachable ();
20308 }
20309 }
20310 else
20311 {
20312 switch (mode)
20313 {
20314 case QImode:
20315 insn = gen_addqi3_carry;
20316 break;
20317 case HImode:
20318 insn = gen_addhi3_carry;
20319 break;
20320 case SImode:
20321 insn = gen_addsi3_carry;
20322 break;
20323 case DImode:
20324 insn = gen_adddi3_carry;
20325 break;
20326 default:
20327 gcc_unreachable ();
20328 }
20329 }
20330 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20331
20332 return true;
20333 }
20334
20335
20336 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20337 but works for floating point parameters and non-offsettable memories.
20338 For pushes, it returns just stack offsets; the values will be saved
20339 in the right order. Maximally four parts are generated. */
20340
20341 static int
20342 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20343 {
20344 int size;
20345
20346 if (!TARGET_64BIT)
20347 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20348 else
20349 size = (GET_MODE_SIZE (mode) + 4) / 8;
20350
20351 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20352 gcc_assert (size >= 2 && size <= 4);
20353
20354 /* Optimize constant pool references to immediates. This is used by fp
20355 moves that force all constants to memory to allow combining. */
20356 if (MEM_P (operand) && MEM_READONLY_P (operand))
20357 {
20358 rtx tmp = maybe_get_pool_constant (operand);
20359 if (tmp)
20360 operand = tmp;
20361 }
20362
20363 if (MEM_P (operand) && !offsettable_memref_p (operand))
20364 {
20365 /* The only non-offsettable memories we handle are pushes. */
20366 int ok = push_operand (operand, VOIDmode);
20367
20368 gcc_assert (ok);
20369
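/* All parts refer to the same auto-decrementing push destination; the
stack pointer adjustment happens implicitly with each push emitted later. */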
20370 operand = copy_rtx (operand);
20371 PUT_MODE (operand, word_mode);
20372 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20373 return size;
20374 }
20375
20376 if (GET_CODE (operand) == CONST_VECTOR)
20377 {
20378 enum machine_mode imode = int_mode_for_mode (mode);
20379 /* Caution: if we looked through a constant pool memory above,
20380 the operand may actually have a different mode now. That's
20381 ok, since we want to pun this all the way back to an integer. */
20382 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20383 gcc_assert (operand != NULL);
20384 mode = imode;
20385 }
20386
20387 if (!TARGET_64BIT)
20388 {
20389 if (mode == DImode)
20390 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20391 else
20392 {
20393 int i;
20394
20395 if (REG_P (operand))
20396 {
20397 gcc_assert (reload_completed);
20398 for (i = 0; i < size; i++)
20399 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20400 }
20401 else if (offsettable_memref_p (operand))
20402 {
20403 operand = adjust_address (operand, SImode, 0);
20404 parts[0] = operand;
20405 for (i = 1; i < size; i++)
20406 parts[i] = adjust_address (operand, SImode, 4 * i);
20407 }
20408 else if (GET_CODE (operand) == CONST_DOUBLE)
20409 {
20410 REAL_VALUE_TYPE r;
20411 long l[4];
20412
20413 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20414 switch (mode)
20415 {
20416 case TFmode:
20417 real_to_target (l, &r, mode);
20418 parts[3] = gen_int_mode (l[3], SImode);
20419 parts[2] = gen_int_mode (l[2], SImode);
20420 break;
20421 case XFmode:
20422 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20423 parts[2] = gen_int_mode (l[2], SImode);
20424 break;
20425 case DFmode:
20426 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20427 break;
20428 default:
20429 gcc_unreachable ();
20430 }
20431 parts[1] = gen_int_mode (l[1], SImode);
20432 parts[0] = gen_int_mode (l[0], SImode);
20433 }
20434 else
20435 gcc_unreachable ();
20436 }
20437 }
20438 else
20439 {
20440 if (mode == TImode)
20441 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20442 if (mode == XFmode || mode == TFmode)
20443 {
20444 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20445 if (REG_P (operand))
20446 {
20447 gcc_assert (reload_completed);
20448 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20449 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20450 }
20451 else if (offsettable_memref_p (operand))
20452 {
20453 operand = adjust_address (operand, DImode, 0);
20454 parts[0] = operand;
20455 parts[1] = adjust_address (operand, upper_mode, 8);
20456 }
20457 else if (GET_CODE (operand) == CONST_DOUBLE)
20458 {
20459 REAL_VALUE_TYPE r;
20460 long l[4];
20461
20462 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20463 real_to_target (l, &r, mode);
20464
20465 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20466 if (HOST_BITS_PER_WIDE_INT >= 64)
20467 parts[0]
20468 = gen_int_mode
20469 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20470 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20471 DImode);
20472 else
20473 parts[0] = immed_double_const (l[0], l[1], DImode);
20474
20475 if (upper_mode == SImode)
20476 parts[1] = gen_int_mode (l[2], SImode);
20477 else if (HOST_BITS_PER_WIDE_INT >= 64)
20478 parts[1]
20479 = gen_int_mode
20480 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20481 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20482 DImode);
20483 else
20484 parts[1] = immed_double_const (l[2], l[3], DImode);
20485 }
20486 else
20487 gcc_unreachable ();
20488 }
20489 }
20490
20491 return size;
20492 }
20493
20494 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20495 All required insns are emitted here, so the caller need not emit any
20496 normal moves. The destination parts are placed in operands 2 onwards
20497 and the source parts in operands 6 onwards, in the correct order. */
20498
20499 void
20500 ix86_split_long_move (rtx operands[])
20501 {
20502 rtx part[2][4];
20503 int nparts, i, j;
20504 int push = 0;
20505 int collisions = 0;
20506 enum machine_mode mode = GET_MODE (operands[0]);
20507 bool collisionparts[4];
20508
20509 /* The DFmode expanders may ask us to move a double.
20510 For a 64bit target this is a single move. By hiding the fact
20511 here we simplify the i386.md splitters. */
20512 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20513 {
20514 /* Optimize constant pool references to immediates. This is used by
20515 fp moves that force all constants to memory to allow combining. */
20516
20517 if (MEM_P (operands[1])
20518 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20519 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20520 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20521 if (push_operand (operands[0], VOIDmode))
20522 {
20523 operands[0] = copy_rtx (operands[0]);
20524 PUT_MODE (operands[0], word_mode);
20525 }
20526 else
20527 operands[0] = gen_lowpart (DImode, operands[0]);
20528 operands[1] = gen_lowpart (DImode, operands[1]);
20529 emit_move_insn (operands[0], operands[1]);
20530 return;
20531 }
20532
20533 /* The only non-offsettable memory we handle is a push. */
20534 if (push_operand (operands[0], VOIDmode))
20535 push = 1;
20536 else
20537 gcc_assert (!MEM_P (operands[0])
20538 || offsettable_memref_p (operands[0]));
20539
20540 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20541 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20542
20543 /* When emitting a push, take care of source operands on the stack. */
20544 if (push && MEM_P (operands[1])
20545 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20546 {
20547 rtx src_base = XEXP (part[1][nparts - 1], 0);
20548
20549 /* Compensate for the stack decrement by 4. */
20550 if (!TARGET_64BIT && nparts == 3
20551 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20552 src_base = plus_constant (src_base, 4);
20553
20554 /* src_base refers to the stack pointer and is
20555 automatically decreased by each emitted push. */
20556 for (i = 0; i < nparts; i++)
20557 part[1][i] = change_address (part[1][i],
20558 GET_MODE (part[1][i]), src_base);
20559 }
20560
20561 /* We need to do the copy in the right order in case an address register
20562 of the source overlaps the destination. */
20563 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20564 {
20565 rtx tmp;
20566
20567 for (i = 0; i < nparts; i++)
20568 {
20569 collisionparts[i]
20570 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20571 if (collisionparts[i])
20572 collisions++;
20573 }
20574
20575 /* Collision in the middle part can be handled by reordering. */
20576 if (collisions == 1 && nparts == 3 && collisionparts [1])
20577 {
20578 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20579 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20580 }
20581 else if (collisions == 1
20582 && nparts == 4
20583 && (collisionparts [1] || collisionparts [2]))
20584 {
20585 if (collisionparts [1])
20586 {
20587 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20588 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20589 }
20590 else
20591 {
20592 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20593 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20594 }
20595 }
20596
20597 /* If there are more collisions, we can't handle it by reordering.
20598 Do an lea to the last part and use only one colliding move. */
20599 else if (collisions > 1)
20600 {
20601 rtx base;
20602
20603 collisions = 1;
20604
20605 base = part[0][nparts - 1];
20606
20607 /* Handle the case when the last part isn't valid for lea.
20608 Happens in 64-bit mode storing the 12-byte XFmode. */
20609 if (GET_MODE (base) != Pmode)
20610 base = gen_rtx_REG (Pmode, REGNO (base));
20611
20612 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20613 part[1][0] = replace_equiv_address (part[1][0], base);
20614 for (i = 1; i < nparts; i++)
20615 {
20616 tmp = plus_constant (base, UNITS_PER_WORD * i);
20617 part[1][i] = replace_equiv_address (part[1][i], tmp);
20618 }
20619 }
20620 }
20621
20622 if (push)
20623 {
20624 if (!TARGET_64BIT)
20625 {
20626 if (nparts == 3)
20627 {
20628 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20629 emit_insn (gen_addsi3 (stack_pointer_rtx,
20630 stack_pointer_rtx, GEN_INT (-4)));
20631 emit_move_insn (part[0][2], part[1][2]);
20632 }
20633 else if (nparts == 4)
20634 {
20635 emit_move_insn (part[0][3], part[1][3]);
20636 emit_move_insn (part[0][2], part[1][2]);
20637 }
20638 }
20639 else
20640 {
20641 /* In 64bit mode we don't have a 32bit push available. In case this is
20642 a register, that is OK - we will just use the larger counterpart. We also
20643 retype memory - this comes from an attempt to avoid the REX prefix on
20644 moving the second half of a TFmode value. */
20645 if (GET_MODE (part[1][1]) == SImode)
20646 {
20647 switch (GET_CODE (part[1][1]))
20648 {
20649 case MEM:
20650 part[1][1] = adjust_address (part[1][1], DImode, 0);
20651 break;
20652
20653 case REG:
20654 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20655 break;
20656
20657 default:
20658 gcc_unreachable ();
20659 }
20660
20661 if (GET_MODE (part[1][0]) == SImode)
20662 part[1][0] = part[1][1];
20663 }
20664 }
20665 emit_move_insn (part[0][1], part[1][1]);
20666 emit_move_insn (part[0][0], part[1][0]);
20667 return;
20668 }
20669
20670 /* Choose the correct order so as not to overwrite the source before it is copied. */
20671 if ((REG_P (part[0][0])
20672 && REG_P (part[1][1])
20673 && (REGNO (part[0][0]) == REGNO (part[1][1])
20674 || (nparts == 3
20675 && REGNO (part[0][0]) == REGNO (part[1][2]))
20676 || (nparts == 4
20677 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20678 || (collisions > 0
20679 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20680 {
20681 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20682 {
20683 operands[2 + i] = part[0][j];
20684 operands[6 + i] = part[1][j];
20685 }
20686 }
20687 else
20688 {
20689 for (i = 0; i < nparts; i++)
20690 {
20691 operands[2 + i] = part[0][i];
20692 operands[6 + i] = part[1][i];
20693 }
20694 }
20695
20696 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20697 if (optimize_insn_for_size_p ())
20698 {
20699 for (j = 0; j < nparts - 1; j++)
20700 if (CONST_INT_P (operands[6 + j])
20701 && operands[6 + j] != const0_rtx
20702 && REG_P (operands[2 + j]))
20703 for (i = j; i < nparts - 1; i++)
20704 if (CONST_INT_P (operands[7 + i])
20705 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20706 operands[7 + i] = operands[2 + j];
20707 }
20708
20709 for (i = 0; i < nparts; i++)
20710 emit_move_insn (operands[2 + i], operands[6 + i]);
20711
20712 return;
20713 }
20714
20715 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20716 left shift by a constant, either using a single shift or
20717 a sequence of add instructions. */
20718
20719 static void
20720 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20721 {
20722 rtx (*insn)(rtx, rtx, rtx);
20723
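/* MODE is the double-word mode being split, so its halves take the
single-word add/shift patterns. Adding OPERAND to itself doubles it, so
COUNT self-additions implement the shift whenever they are cheaper than
a single shift by a constant. */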
20724 if (count == 1
20725 || (count * ix86_cost->add <= ix86_cost->shift_const
20726 && !optimize_insn_for_size_p ()))
20727 {
20728 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20729 while (count-- > 0)
20730 emit_insn (insn (operand, operand, operand));
20731 }
20732 else
20733 {
20734 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20735 emit_insn (insn (operand, operand, GEN_INT (count)));
20736 }
20737 }
20738
20739 void
20740 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20741 {
20742 rtx (*gen_ashl3)(rtx, rtx, rtx);
20743 rtx (*gen_shld)(rtx, rtx, rtx);
20744 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20745
20746 rtx low[2], high[2];
20747 int count;
20748
20749 if (CONST_INT_P (operands[2]))
20750 {
20751 split_double_mode (mode, operands, 2, low, high);
20752 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20753
20754 if (count >= half_width)
20755 {
20756 emit_move_insn (high[0], low[1]);
20757 emit_move_insn (low[0], const0_rtx);
20758
20759 if (count > half_width)
20760 ix86_expand_ashl_const (high[0], count - half_width, mode);
20761 }
20762 else
20763 {
20764 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20765
20766 if (!rtx_equal_p (operands[0], operands[1]))
20767 emit_move_insn (operands[0], operands[1]);
20768
20769 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20770 ix86_expand_ashl_const (low[0], count, mode);
20771 }
20772 return;
20773 }
20774
20775 split_double_mode (mode, operands, 1, low, high);
20776
20777 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20778
20779 if (operands[1] == const1_rtx)
20780 {
20781 /* Assuming we've chosen QImode-capable registers, 1 << N
20782 can be done with two 32/64-bit shifts, no branches, no cmoves. */
20783 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20784 {
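/* Test bit log2(half_width) of the shift count to learn which half
receives the 1, materialize 0/1 into the two halves with setcc on ZF,
and let the two shifts emitted after this block (which the hardware
performs modulo the half width) move the 1 into its final position. */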
20785 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20786
20787 ix86_expand_clear (low[0]);
20788 ix86_expand_clear (high[0]);
20789 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20790
20791 d = gen_lowpart (QImode, low[0]);
20792 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20793 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20794 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20795
20796 d = gen_lowpart (QImode, high[0]);
20797 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20798 s = gen_rtx_NE (QImode, flags, const0_rtx);
20799 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20800 }
20801
20802 /* Otherwise, we can get the same results by manually performing
20803 a bit extract operation on bit 5/6, and then performing the two
20804 shifts. The two methods of getting 0/1 into low/high are exactly
20805 the same size. Avoiding the shift in the bit extract case helps
20806 pentium4 a bit; no one else seems to care much either way. */
20807 else
20808 {
20809 enum machine_mode half_mode;
20810 rtx (*gen_lshr3)(rtx, rtx, rtx);
20811 rtx (*gen_and3)(rtx, rtx, rtx);
20812 rtx (*gen_xor3)(rtx, rtx, rtx);
20813 HOST_WIDE_INT bits;
20814 rtx x;
20815
20816 if (mode == DImode)
20817 {
20818 half_mode = SImode;
20819 gen_lshr3 = gen_lshrsi3;
20820 gen_and3 = gen_andsi3;
20821 gen_xor3 = gen_xorsi3;
20822 bits = 5;
20823 }
20824 else
20825 {
20826 half_mode = DImode;
20827 gen_lshr3 = gen_lshrdi3;
20828 gen_and3 = gen_anddi3;
20829 gen_xor3 = gen_xordi3;
20830 bits = 6;
20831 }
20832
20833 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20834 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20835 else
20836 x = gen_lowpart (half_mode, operands[2]);
20837 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20838
20839 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20840 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20841 emit_move_insn (low[0], high[0]);
20842 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20843 }
20844
20845 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20846 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20847 return;
20848 }
20849
20850 if (operands[1] == constm1_rtx)
20851 {
20852 /* For -1 << N, we can avoid the shld instruction, because we
20853 know that we're shifting 0...31/63 ones into a -1. */
20854 emit_move_insn (low[0], constm1_rtx);
20855 if (optimize_insn_for_size_p ())
20856 emit_move_insn (high[0], low[0]);
20857 else
20858 emit_move_insn (high[0], constm1_rtx);
20859 }
20860 else
20861 {
20862 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20863
20864 if (!rtx_equal_p (operands[0], operands[1]))
20865 emit_move_insn (operands[0], operands[1]);
20866
20867 split_double_mode (mode, operands, 1, low, high);
20868 emit_insn (gen_shld (high[0], low[0], operands[2]));
20869 }
20870
20871 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20872
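/* The shift sequence above is only correct for counts below the half
width. The adj patterns handle counts with the half-width bit set: the
low result must become the high half and the low half must be cleared -
adj_1 does this with cmov (hence the zeroed scratch), adj_2 with a
conditional jump. */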
20873 if (TARGET_CMOVE && scratch)
20874 {
20875 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20876 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20877
20878 ix86_expand_clear (scratch);
20879 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20880 }
20881 else
20882 {
20883 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20884 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20885
20886 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20887 }
20888 }
20889
20890 void
20891 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20892 {
20893 rtx (*gen_ashr3)(rtx, rtx, rtx)
20894 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20895 rtx (*gen_shrd)(rtx, rtx, rtx);
20896 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20897
20898 rtx low[2], high[2];
20899 int count;
20900
20901 if (CONST_INT_P (operands[2]))
20902 {
20903 split_double_mode (mode, operands, 2, low, high);
20904 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20905
20906 if (count == GET_MODE_BITSIZE (mode) - 1)
20907 {
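/* An arithmetic shift by the full width minus one leaves only the sign;
both halves become the sign mask, obtained with a single arithmetic
shift of the high input by half_width - 1. */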
20908 emit_move_insn (high[0], high[1]);
20909 emit_insn (gen_ashr3 (high[0], high[0],
20910 GEN_INT (half_width - 1)));
20911 emit_move_insn (low[0], high[0]);
20912
20913 }
20914 else if (count >= half_width)
20915 {
20916 emit_move_insn (low[0], high[1]);
20917 emit_move_insn (high[0], low[0]);
20918 emit_insn (gen_ashr3 (high[0], high[0],
20919 GEN_INT (half_width - 1)));
20920
20921 if (count > half_width)
20922 emit_insn (gen_ashr3 (low[0], low[0],
20923 GEN_INT (count - half_width)));
20924 }
20925 else
20926 {
20927 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20928
20929 if (!rtx_equal_p (operands[0], operands[1]))
20930 emit_move_insn (operands[0], operands[1]);
20931
20932 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20933 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
20934 }
20935 }
20936 else
20937 {
20938 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20939
20940 if (!rtx_equal_p (operands[0], operands[1]))
20941 emit_move_insn (operands[0], operands[1]);
20942
20943 split_double_mode (mode, operands, 1, low, high);
20944
20945 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20946 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
20947
20948 if (TARGET_CMOVE && scratch)
20949 {
20950 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20951 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20952
20953 emit_move_insn (scratch, high[0]);
20954 emit_insn (gen_ashr3 (scratch, scratch,
20955 GEN_INT (half_width - 1)));
20956 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20957 scratch));
20958 }
20959 else
20960 {
20961 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
20962 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
20963
20964 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
20965 }
20966 }
20967 }
20968
20969 void
20970 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
20971 {
20972 rtx (*gen_lshr3)(rtx, rtx, rtx)
20973 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
20974 rtx (*gen_shrd)(rtx, rtx, rtx);
20975 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20976
20977 rtx low[2], high[2];
20978 int count;
20979
20980 if (CONST_INT_P (operands[2]))
20981 {
20982 split_double_mode (mode, operands, 2, low, high);
20983 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20984
20985 if (count >= half_width)
20986 {
20987 emit_move_insn (low[0], high[1]);
20988 ix86_expand_clear (high[0]);
20989
20990 if (count > half_width)
20991 emit_insn (gen_lshr3 (low[0], low[0],
20992 GEN_INT (count - half_width)));
20993 }
20994 else
20995 {
20996 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20997
20998 if (!rtx_equal_p (operands[0], operands[1]))
20999 emit_move_insn (operands[0], operands[1]);
21000
21001 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21002 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21003 }
21004 }
21005 else
21006 {
21007 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21008
21009 if (!rtx_equal_p (operands[0], operands[1]))
21010 emit_move_insn (operands[0], operands[1]);
21011
21012 split_double_mode (mode, operands, 1, low, high);
21013
21014 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21015 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21016
21017 if (TARGET_CMOVE && scratch)
21018 {
21019 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21020 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21021
21022 ix86_expand_clear (scratch);
21023 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21024 scratch));
21025 }
21026 else
21027 {
21028 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21029 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21030
21031 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21032 }
21033 }
21034 }
21035
21036 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
21037 static void
21038 predict_jump (int prob)
21039 {
21040 rtx insn = get_last_insn ();
21041 gcc_assert (JUMP_P (insn));
21042 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21043 }
21044
21045 /* Helper function for the string operations below. Test whether VARIABLE
21046 has the VALUE bits clear (i.e. is aligned with respect to VALUE); if so, jump to the returned label. */
21047 static rtx
21048 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21049 {
21050 rtx label = gen_label_rtx ();
21051 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21052 if (GET_MODE (variable) == DImode)
21053 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21054 else
21055 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21056 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21057 1, label);
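/* In an epilogue the tested bits of a residual count are essentially
unpredictable, hence the 50% estimate below; in a prologue the pointer is
usually already aligned, so the skip is predicted as likely. */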
21058 if (epilogue)
21059 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21060 else
21061 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21062 return label;
21063 }
21064
21065 /* Decrease COUNTREG by VALUE. */
21066 static void
21067 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21068 {
21069 rtx (*gen_add)(rtx, rtx, rtx)
21070 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21071
21072 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21073 }
21074
21075 /* Zero extend possibly SImode EXP to Pmode register. */
21076 rtx
21077 ix86_zero_extend_to_Pmode (rtx exp)
21078 {
21079 if (GET_MODE (exp) != Pmode)
21080 exp = convert_to_mode (Pmode, exp, 1);
21081 return force_reg (Pmode, exp);
21082 }
21083
21084 /* Divide COUNTREG by SCALE. */
21085 static rtx
21086 scale_counter (rtx countreg, int scale)
21087 {
21088 rtx sc;
21089
21090 if (scale == 1)
21091 return countreg;
21092 if (CONST_INT_P (countreg))
21093 return GEN_INT (INTVAL (countreg) / scale);
21094 gcc_assert (REG_P (countreg));
21095
21096 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21097 GEN_INT (exact_log2 (scale)),
21098 NULL, 1, OPTAB_DIRECT);
21099 return sc;
21100 }
21101
21102 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21103 DImode for constant loop counts. */
21104
21105 static enum machine_mode
21106 counter_mode (rtx count_exp)
21107 {
21108 if (GET_MODE (count_exp) != VOIDmode)
21109 return GET_MODE (count_exp);
21110 if (!CONST_INT_P (count_exp))
21111 return Pmode;
21112 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21113 return DImode;
21114 return SImode;
21115 }
21116
21117 /* When SRCPTR is non-NULL, output a simple loop to copy memory
21118 pointed to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21119 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
21120 the equivalent loop to set memory to VALUE (supposed to be in MODE).
21121
21122 The size is rounded down to a whole number of chunks moved at once.
21123 SRCMEM and DESTMEM provide the MEM rtx to feed proper aliasing info. */
21124
21125
21126 static void
21127 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21128 rtx destptr, rtx srcptr, rtx value,
21129 rtx count, enum machine_mode mode, int unroll,
21130 int expected_size)
21131 {
21132 rtx out_label, top_label, iter, tmp;
21133 enum machine_mode iter_mode = counter_mode (count);
21134 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21135 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21136 rtx size;
21137 rtx x_addr;
21138 rtx y_addr;
21139 int i;
21140
21141 top_label = gen_label_rtx ();
21142 out_label = gen_label_rtx ();
21143 iter = gen_reg_rtx (iter_mode);
21144
21145 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21146 NULL, 1, OPTAB_DIRECT);
21147 /* Those two should combine. */
21148 if (piece_size == const1_rtx)
21149 {
21150 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21151 true, out_label);
21152 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21153 }
21154 emit_move_insn (iter, const0_rtx);
21155
21156 emit_label (top_label);
21157
21158 tmp = convert_modes (Pmode, iter_mode, iter, true);
21159 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21160 destmem = change_address (destmem, mode, x_addr);
21161
21162 if (srcmem)
21163 {
21164 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21165 srcmem = change_address (srcmem, mode, y_addr);
21166
21167 /* When unrolling for chips that reorder memory reads and writes,
21168 we can save registers by using a single temporary.
21169 Also, using 4 temporaries is overkill in 32bit mode. */
21170 if (!TARGET_64BIT && 0)
21171 {
21172 for (i = 0; i < unroll; i++)
21173 {
21174 if (i)
21175 {
21176 destmem =
21177 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21178 srcmem =
21179 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21180 }
21181 emit_move_insn (destmem, srcmem);
21182 }
21183 }
21184 else
21185 {
21186 rtx tmpreg[4];
21187 gcc_assert (unroll <= 4);
21188 for (i = 0; i < unroll; i++)
21189 {
21190 tmpreg[i] = gen_reg_rtx (mode);
21191 if (i)
21192 {
21193 srcmem =
21194 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21195 }
21196 emit_move_insn (tmpreg[i], srcmem);
21197 }
21198 for (i = 0; i < unroll; i++)
21199 {
21200 if (i)
21201 {
21202 destmem =
21203 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21204 }
21205 emit_move_insn (destmem, tmpreg[i]);
21206 }
21207 }
21208 }
21209 else
21210 for (i = 0; i < unroll; i++)
21211 {
21212 if (i)
21213 destmem =
21214 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21215 emit_move_insn (destmem, value);
21216 }
21217
21218 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21219 true, OPTAB_LIB_WIDEN);
21220 if (tmp != iter)
21221 emit_move_insn (iter, tmp);
21222
21223 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21224 true, top_label);
21225 if (expected_size != -1)
21226 {
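/* Convert EXPECTED_SIZE to an expected iteration count; the loop back
edge is taken on all iterations but the last, so it is predicted taken
with probability 1 - 1/EXPECTED_SIZE (rounded), clamped to the valid
range. */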
21227 expected_size /= GET_MODE_SIZE (mode) * unroll;
21228 if (expected_size == 0)
21229 predict_jump (0);
21230 else if (expected_size > REG_BR_PROB_BASE)
21231 predict_jump (REG_BR_PROB_BASE - 1);
21232 else
21233 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21234 }
21235 else
21236 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21237 iter = ix86_zero_extend_to_Pmode (iter);
21238 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21239 true, OPTAB_LIB_WIDEN);
21240 if (tmp != destptr)
21241 emit_move_insn (destptr, tmp);
21242 if (srcptr)
21243 {
21244 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21245 true, OPTAB_LIB_WIDEN);
21246 if (tmp != srcptr)
21247 emit_move_insn (srcptr, tmp);
21248 }
21249 emit_label (out_label);
21250 }
21251
21252 /* Output "rep; mov" instruction.
21253 Arguments have same meaning as for previous function */
21254 static void
21255 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21256 rtx destptr, rtx srcptr,
21257 rtx count,
21258 enum machine_mode mode)
21259 {
21260 rtx destexp;
21261 rtx srcexp;
21262 rtx countreg;
21263 HOST_WIDE_INT rounded_count;
21264
21265 /* If the size is known to be a multiple of 4, it is better to use SImode
rep movs (one quarter of the iterations of the QImode variant). */
21266 if (mode == QImode && CONST_INT_P (count)
21267 && !(INTVAL (count) & 3))
21268 mode = SImode;
21269
21270 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21271 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21272 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21273 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21274 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
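/* DESTEXP and SRCEXP describe the final pointer values (pointer plus the
number of bytes moved); the rep_mov pattern uses them to express how
rep movs advances the pointer registers. */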
21275 if (mode != QImode)
21276 {
21277 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21278 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21279 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21280 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21281 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21282 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21283 }
21284 else
21285 {
21286 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21287 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21288 }
21289 if (CONST_INT_P (count))
21290 {
21291 rounded_count = (INTVAL (count)
21292 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21293 destmem = shallow_copy_rtx (destmem);
21294 srcmem = shallow_copy_rtx (srcmem);
21295 set_mem_size (destmem, rounded_count);
21296 set_mem_size (srcmem, rounded_count);
21297 }
21298 else
21299 {
21300 if (MEM_SIZE_KNOWN_P (destmem))
21301 clear_mem_size (destmem);
21302 if (MEM_SIZE_KNOWN_P (srcmem))
21303 clear_mem_size (srcmem);
21304 }
21305 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21306 destexp, srcexp));
21307 }
21308
21309 /* Output "rep; stos" instruction.
21310 Arguments have same meaning as for previous function */
21311 static void
21312 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21313 rtx count, enum machine_mode mode,
21314 rtx orig_value)
21315 {
21316 rtx destexp;
21317 rtx countreg;
21318 HOST_WIDE_INT rounded_count;
21319
21320 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21321 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21322 value = force_reg (mode, gen_lowpart (mode, value));
21323 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21324 if (mode != QImode)
21325 {
21326 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21327 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21328 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21329 }
21330 else
21331 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21332 if (orig_value == const0_rtx && CONST_INT_P (count))
21333 {
21334 rounded_count = (INTVAL (count)
21335 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21336 destmem = shallow_copy_rtx (destmem);
21337 set_mem_size (destmem, rounded_count);
21338 }
21339 else if (MEM_SIZE_KNOWN_P (destmem))
21340 clear_mem_size (destmem);
21341 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21342 }
21343
21344 static void
21345 emit_strmov (rtx destmem, rtx srcmem,
21346 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21347 {
21348 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21349 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21350 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21351 }
21352
21353 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21354 static void
21355 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21356 rtx destptr, rtx srcptr, rtx count, int max_size)
21357 {
21358 rtx src, dest;
21359 if (CONST_INT_P (count))
21360 {
21361 HOST_WIDE_INT countval = INTVAL (count);
21362 int offset = 0;
21363
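/* With a known residual count, its individual bits say exactly which
power-of-two sized chunks remain, so each chunk size below is emitted at
most once. */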
21364 if ((countval & 0x10) && max_size > 16)
21365 {
21366 if (TARGET_64BIT)
21367 {
21368 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21369 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21370 }
21371 else
21372 gcc_unreachable ();
21373 offset += 16;
21374 }
21375 if ((countval & 0x08) && max_size > 8)
21376 {
21377 if (TARGET_64BIT)
21378 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21379 else
21380 {
21381 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21382 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21383 }
21384 offset += 8;
21385 }
21386 if ((countval & 0x04) && max_size > 4)
21387 {
21388 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21389 offset += 4;
21390 }
21391 if ((countval & 0x02) && max_size > 2)
21392 {
21393 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21394 offset += 2;
21395 }
21396 if ((countval & 0x01) && max_size > 1)
21397 {
21398 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21399 offset += 1;
21400 }
21401 return;
21402 }
21403 if (max_size > 8)
21404 {
21405 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21406 count, 1, OPTAB_DIRECT);
21407 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21408 count, QImode, 1, 4);
21409 return;
21410 }
21411
21412 /* When single-instruction stringops are available, we can cheaply advance
21413 the dest and src pointers. Otherwise we save code size by maintaining an
21414 offset (zero is readily available from the preceding rep operation) and
21415 using x86 addressing modes. */
21416 if (TARGET_SINGLE_STRINGOP)
21417 {
21418 if (max_size > 4)
21419 {
21420 rtx label = ix86_expand_aligntest (count, 4, true);
21421 src = change_address (srcmem, SImode, srcptr);
21422 dest = change_address (destmem, SImode, destptr);
21423 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21424 emit_label (label);
21425 LABEL_NUSES (label) = 1;
21426 }
21427 if (max_size > 2)
21428 {
21429 rtx label = ix86_expand_aligntest (count, 2, true);
21430 src = change_address (srcmem, HImode, srcptr);
21431 dest = change_address (destmem, HImode, destptr);
21432 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21433 emit_label (label);
21434 LABEL_NUSES (label) = 1;
21435 }
21436 if (max_size > 1)
21437 {
21438 rtx label = ix86_expand_aligntest (count, 1, true);
21439 src = change_address (srcmem, QImode, srcptr);
21440 dest = change_address (destmem, QImode, destptr);
21441 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21442 emit_label (label);
21443 LABEL_NUSES (label) = 1;
21444 }
21445 }
21446 else
21447 {
21448 rtx offset = force_reg (Pmode, const0_rtx);
21449 rtx tmp;
21450
21451 if (max_size > 4)
21452 {
21453 rtx label = ix86_expand_aligntest (count, 4, true);
21454 src = change_address (srcmem, SImode, srcptr);
21455 dest = change_address (destmem, SImode, destptr);
21456 emit_move_insn (dest, src);
21457 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21458 true, OPTAB_LIB_WIDEN);
21459 if (tmp != offset)
21460 emit_move_insn (offset, tmp);
21461 emit_label (label);
21462 LABEL_NUSES (label) = 1;
21463 }
21464 if (max_size > 2)
21465 {
21466 rtx label = ix86_expand_aligntest (count, 2, true);
21467 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21468 src = change_address (srcmem, HImode, tmp);
21469 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21470 dest = change_address (destmem, HImode, tmp);
21471 emit_move_insn (dest, src);
21472 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21473 true, OPTAB_LIB_WIDEN);
21474 if (tmp != offset)
21475 emit_move_insn (offset, tmp);
21476 emit_label (label);
21477 LABEL_NUSES (label) = 1;
21478 }
21479 if (max_size > 1)
21480 {
21481 rtx label = ix86_expand_aligntest (count, 1, true);
21482 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21483 src = change_address (srcmem, QImode, tmp);
21484 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21485 dest = change_address (destmem, QImode, tmp);
21486 emit_move_insn (dest, src);
21487 emit_label (label);
21488 LABEL_NUSES (label) = 1;
21489 }
21490 }
21491 }
21492
21493 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21494 static void
21495 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21496 rtx count, int max_size)
21497 {
21498 count =
21499 expand_simple_binop (counter_mode (count), AND, count,
21500 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21501 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21502 gen_lowpart (QImode, value), count, QImode,
21503 1, max_size / 2);
21504 }
21505
21506 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21507 static void
21508 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21509 {
21510 rtx dest;
21511
21512 if (CONST_INT_P (count))
21513 {
21514 HOST_WIDE_INT countval = INTVAL (count);
21515 int offset = 0;
21516
21517 if ((countval & 0x10) && max_size > 16)
21518 {
21519 if (TARGET_64BIT)
21520 {
21521 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21522 emit_insn (gen_strset (destptr, dest, value));
21523 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21524 emit_insn (gen_strset (destptr, dest, value));
21525 }
21526 else
21527 gcc_unreachable ();
21528 offset += 16;
21529 }
21530 if ((countval & 0x08) && max_size > 8)
21531 {
21532 if (TARGET_64BIT)
21533 {
21534 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21535 emit_insn (gen_strset (destptr, dest, value));
21536 }
21537 else
21538 {
21539 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21540 emit_insn (gen_strset (destptr, dest, value));
21541 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21542 emit_insn (gen_strset (destptr, dest, value));
21543 }
21544 offset += 8;
21545 }
21546 if ((countval & 0x04) && max_size > 4)
21547 {
21548 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21549 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21550 offset += 4;
21551 }
21552 if ((countval & 0x02) && max_size > 2)
21553 {
21554 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21555 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21556 offset += 2;
21557 }
21558 if ((countval & 0x01) && max_size > 1)
21559 {
21560 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21561 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21562 offset += 1;
21563 }
21564 return;
21565 }
21566 if (max_size > 32)
21567 {
21568 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21569 return;
21570 }
21571 if (max_size > 16)
21572 {
21573 rtx label = ix86_expand_aligntest (count, 16, true);
21574 if (TARGET_64BIT)
21575 {
21576 dest = change_address (destmem, DImode, destptr);
21577 emit_insn (gen_strset (destptr, dest, value));
21578 emit_insn (gen_strset (destptr, dest, value));
21579 }
21580 else
21581 {
21582 dest = change_address (destmem, SImode, destptr);
21583 emit_insn (gen_strset (destptr, dest, value));
21584 emit_insn (gen_strset (destptr, dest, value));
21585 emit_insn (gen_strset (destptr, dest, value));
21586 emit_insn (gen_strset (destptr, dest, value));
21587 }
21588 emit_label (label);
21589 LABEL_NUSES (label) = 1;
21590 }
21591 if (max_size > 8)
21592 {
21593 rtx label = ix86_expand_aligntest (count, 8, true);
21594 if (TARGET_64BIT)
21595 {
21596 dest = change_address (destmem, DImode, destptr);
21597 emit_insn (gen_strset (destptr, dest, value));
21598 }
21599 else
21600 {
21601 dest = change_address (destmem, SImode, destptr);
21602 emit_insn (gen_strset (destptr, dest, value));
21603 emit_insn (gen_strset (destptr, dest, value));
21604 }
21605 emit_label (label);
21606 LABEL_NUSES (label) = 1;
21607 }
21608 if (max_size > 4)
21609 {
21610 rtx label = ix86_expand_aligntest (count, 4, true);
21611 dest = change_address (destmem, SImode, destptr);
21612 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21613 emit_label (label);
21614 LABEL_NUSES (label) = 1;
21615 }
21616 if (max_size > 2)
21617 {
21618 rtx label = ix86_expand_aligntest (count, 2, true);
21619 dest = change_address (destmem, HImode, destptr);
21620 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21621 emit_label (label);
21622 LABEL_NUSES (label) = 1;
21623 }
21624 if (max_size > 1)
21625 {
21626 rtx label = ix86_expand_aligntest (count, 1, true);
21627 dest = change_address (destmem, QImode, destptr);
21628 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21629 emit_label (label);
21630 LABEL_NUSES (label) = 1;
21631 }
21632 }
21633
21634 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
21635 to DESIRED_ALIGNMENT. */
21636 static void
21637 expand_movmem_prologue (rtx destmem, rtx srcmem,
21638 rtx destptr, rtx srcptr, rtx count,
21639 int align, int desired_alignment)
21640 {
21641 if (align <= 1 && desired_alignment > 1)
21642 {
21643 rtx label = ix86_expand_aligntest (destptr, 1, false);
21644 srcmem = change_address (srcmem, QImode, srcptr);
21645 destmem = change_address (destmem, QImode, destptr);
21646 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21647 ix86_adjust_counter (count, 1);
21648 emit_label (label);
21649 LABEL_NUSES (label) = 1;
21650 }
21651 if (align <= 2 && desired_alignment > 2)
21652 {
21653 rtx label = ix86_expand_aligntest (destptr, 2, false);
21654 srcmem = change_address (srcmem, HImode, srcptr);
21655 destmem = change_address (destmem, HImode, destptr);
21656 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21657 ix86_adjust_counter (count, 2);
21658 emit_label (label);
21659 LABEL_NUSES (label) = 1;
21660 }
21661 if (align <= 4 && desired_alignment > 4)
21662 {
21663 rtx label = ix86_expand_aligntest (destptr, 4, false);
21664 srcmem = change_address (srcmem, SImode, srcptr);
21665 destmem = change_address (destmem, SImode, destptr);
21666 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21667 ix86_adjust_counter (count, 4);
21668 emit_label (label);
21669 LABEL_NUSES (label) = 1;
21670 }
21671 gcc_assert (desired_alignment <= 8);
21672 }
21673
21674 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
21675 ALIGN_BYTES is how many bytes need to be copied. */
21676 static rtx
21677 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21678 int desired_align, int align_bytes)
21679 {
21680 rtx src = *srcp;
21681 rtx orig_dst = dst;
21682 rtx orig_src = src;
21683 int off = 0;
21684 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21685 if (src_align_bytes >= 0)
21686 src_align_bytes = desired_align - src_align_bytes;
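/* src_align_bytes is now (if known) the number of bytes that would bring
SRC up to DESIRED_ALIGN; comparing its low bits with ALIGN_BYTES below
tells how well aligned SRC becomes once the prologue copies are done. */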
21687 if (align_bytes & 1)
21688 {
21689 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21690 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21691 off = 1;
21692 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21693 }
21694 if (align_bytes & 2)
21695 {
21696 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21697 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21698 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21699 set_mem_align (dst, 2 * BITS_PER_UNIT);
21700 if (src_align_bytes >= 0
21701 && (src_align_bytes & 1) == (align_bytes & 1)
21702 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21703 set_mem_align (src, 2 * BITS_PER_UNIT);
21704 off = 2;
21705 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21706 }
21707 if (align_bytes & 4)
21708 {
21709 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21710 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21711 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21712 set_mem_align (dst, 4 * BITS_PER_UNIT);
21713 if (src_align_bytes >= 0)
21714 {
21715 unsigned int src_align = 0;
21716 if ((src_align_bytes & 3) == (align_bytes & 3))
21717 src_align = 4;
21718 else if ((src_align_bytes & 1) == (align_bytes & 1))
21719 src_align = 2;
21720 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21721 set_mem_align (src, src_align * BITS_PER_UNIT);
21722 }
21723 off = 4;
21724 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21725 }
21726 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21727 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21728 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21729 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21730 if (src_align_bytes >= 0)
21731 {
21732 unsigned int src_align = 0;
21733 if ((src_align_bytes & 7) == (align_bytes & 7))
21734 src_align = 8;
21735 else if ((src_align_bytes & 3) == (align_bytes & 3))
21736 src_align = 4;
21737 else if ((src_align_bytes & 1) == (align_bytes & 1))
21738 src_align = 2;
21739 if (src_align > (unsigned int) desired_align)
21740 src_align = desired_align;
21741 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21742 set_mem_align (src, src_align * BITS_PER_UNIT);
21743 }
21744 if (MEM_SIZE_KNOWN_P (orig_dst))
21745 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21746 if (MEM_SIZE_KNOWN_P (orig_src))
21747 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21748 *srcp = src;
21749 return dst;
21750 }
21751
21752 /* Set enough of DEST to align DEST, known to be aligned by ALIGN,
21753 to DESIRED_ALIGNMENT. */
21754 static void
21755 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21756 int align, int desired_alignment)
21757 {
21758 if (align <= 1 && desired_alignment > 1)
21759 {
21760 rtx label = ix86_expand_aligntest (destptr, 1, false);
21761 destmem = change_address (destmem, QImode, destptr);
21762 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21763 ix86_adjust_counter (count, 1);
21764 emit_label (label);
21765 LABEL_NUSES (label) = 1;
21766 }
21767 if (align <= 2 && desired_alignment > 2)
21768 {
21769 rtx label = ix86_expand_aligntest (destptr, 2, false);
21770 destmem = change_address (destmem, HImode, destptr);
21771 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21772 ix86_adjust_counter (count, 2);
21773 emit_label (label);
21774 LABEL_NUSES (label) = 1;
21775 }
21776 if (align <= 4 && desired_alignment > 4)
21777 {
21778 rtx label = ix86_expand_aligntest (destptr, 4, false);
21779 destmem = change_address (destmem, SImode, destptr);
21780 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21781 ix86_adjust_counter (count, 4);
21782 emit_label (label);
21783 LABEL_NUSES (label) = 1;
21784 }
21785 gcc_assert (desired_alignment <= 8);
21786 }
21787
21788 /* Set enough of DST to align DST, known to be aligned by ALIGN,
21789 to DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
21790 static rtx
21791 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21792 int desired_align, int align_bytes)
21793 {
21794 int off = 0;
21795 rtx orig_dst = dst;
21796 if (align_bytes & 1)
21797 {
21798 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21799 off = 1;
21800 emit_insn (gen_strset (destreg, dst,
21801 gen_lowpart (QImode, value)));
21802 }
21803 if (align_bytes & 2)
21804 {
21805 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21806 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21807 set_mem_align (dst, 2 * BITS_PER_UNIT);
21808 off = 2;
21809 emit_insn (gen_strset (destreg, dst,
21810 gen_lowpart (HImode, value)));
21811 }
21812 if (align_bytes & 4)
21813 {
21814 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21815 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21816 set_mem_align (dst, 4 * BITS_PER_UNIT);
21817 off = 4;
21818 emit_insn (gen_strset (destreg, dst,
21819 gen_lowpart (SImode, value)));
21820 }
21821 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21822 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21823 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21824 if (MEM_SIZE_KNOWN_P (orig_dst))
21825 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21826 return dst;
21827 }
21828
21829 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
21830 static enum stringop_alg
21831 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21832 int *dynamic_check)
21833 {
21834 const struct stringop_algs * algs;
21835 bool optimize_for_speed;
21836 /* Algorithms using the rep prefix want at least edi and ecx;
21837 additionally, memset wants eax and memcpy wants esi. Don't
21838 consider such algorithms if the user has appropriated those
21839 registers for their own purposes. */
21840 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21841 || (memset
21842 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21843
21844 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21845 || (alg != rep_prefix_1_byte \
21846 && alg != rep_prefix_4_byte \
21847 && alg != rep_prefix_8_byte))
21848 const struct processor_costs *cost;
21849
21850 /* Even if the string operation call is cold, we still might spend a lot
21851 of time processing large blocks. */
21852 if (optimize_function_for_size_p (cfun)
21853 || (optimize_insn_for_size_p ()
21854 && expected_size != -1 && expected_size < 256))
21855 optimize_for_speed = false;
21856 else
21857 optimize_for_speed = true;
21858
21859 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21860
21861 *dynamic_check = -1;
21862 if (memset)
21863 algs = &cost->memset[TARGET_64BIT != 0];
21864 else
21865 algs = &cost->memcpy[TARGET_64BIT != 0];
21866 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21867 return ix86_stringop_alg;
21868 /* rep; movq or rep; movl is the smallest variant. */
21869 else if (!optimize_for_speed)
21870 {
21871 if (!count || (count & 3))
21872 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21873 else
21874 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21875 }
21876 /* Very tiny blocks are best handled via the loop; REP is expensive to
21877 set up. */
21878 else if (expected_size != -1 && expected_size < 4)
21879 return loop_1_byte;
21880 else if (expected_size != -1)
21881 {
21882 unsigned int i;
21883 enum stringop_alg alg = libcall;
21884 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21885 {
21886 /* We get here if the algorithms that were not libcall-based
21887 were rep-prefix based and we are unable to use rep prefixes
21888 based on global register usage. Break out of the loop and
21889 use the heuristic below. */
21890 if (algs->size[i].max == 0)
21891 break;
21892 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
21893 {
21894 enum stringop_alg candidate = algs->size[i].alg;
21895
21896 if (candidate != libcall && ALG_USABLE_P (candidate))
21897 alg = candidate;
21898 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
21899 last non-libcall inline algorithm. */
21900 if (TARGET_INLINE_ALL_STRINGOPS)
21901 {
21902 /* When the current size is best copied by a libcall, but we
21903 are still forced to inline, run the heuristic below that
21904 will pick code for medium-sized blocks. */
21905 if (alg != libcall)
21906 return alg;
21907 break;
21908 }
21909 else if (ALG_USABLE_P (candidate))
21910 return candidate;
21911 }
21912 }
21913 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
21914 }
21915 /* When asked to inline the call anyway, try to pick a meaningful choice.
21916 We look for the maximal block size that is faster to copy by hand and
21917 take blocks of at most that size, guessing that the average size will
21918 be roughly half of the block.
21919
21920 If this turns out to be bad, we might simply specify the preferred
21921 choice in ix86_costs. */
21922 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21923 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
21924 {
21925 int max = -1;
21926 enum stringop_alg alg;
21927 int i;
21928 bool any_alg_usable_p = true;
21929
21930 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21931 {
21932 enum stringop_alg candidate = algs->size[i].alg;
21933 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
21934
21935 if (candidate != libcall && candidate
21936 && ALG_USABLE_P (candidate))
21937 max = algs->size[i].max;
21938 }
21939 /* If there aren't any usable algorithms, then recursing on
21940 smaller sizes isn't going to find anything. Just return the
21941 simple byte-at-a-time copy loop. */
21942 if (!any_alg_usable_p)
21943 {
21944 /* Pick something reasonable. */
21945 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21946 *dynamic_check = 128;
21947 return loop_1_byte;
21948 }
21949 if (max == -1)
21950 max = 4096;
21951 alg = decide_alg (count, max / 2, memset, dynamic_check);
21952 gcc_assert (*dynamic_check == -1);
21953 gcc_assert (alg != libcall);
21954 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21955 *dynamic_check = max;
21956 return alg;
21957 }
21958 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
21959 #undef ALG_USABLE_P
21960 }
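
/* Usage sketch (illustrative only; the concrete values are just examples):
   with an unknown block size and -minline-stringops-dynamically, a caller
   does roughly

       int dynamic_check;
       enum stringop_alg alg = decide_alg (0, -1, false, &dynamic_check);

   and may get back, say, alg == unrolled_loop with dynamic_check == 4096,
   meaning blocks larger than 4096 bytes should be handed to the libcall
   through a runtime size test emitted by the caller.  */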
21961
21962 /* Decide on alignment. We know that the operand is already aligned to ALIGN
21963 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
21964 static int
21965 decide_alignment (int align,
21966 enum stringop_alg alg,
21967 int expected_size)
21968 {
21969 int desired_align = 0;
21970 switch (alg)
21971 {
21972 case no_stringop:
21973 gcc_unreachable ();
21974 case loop:
21975 case unrolled_loop:
21976 desired_align = GET_MODE_SIZE (Pmode);
21977 break;
21978 case rep_prefix_8_byte:
21979 desired_align = 8;
21980 break;
21981 case rep_prefix_4_byte:
21982 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
21983 copying whole cachelines at once. */
21984 if (TARGET_PENTIUMPRO)
21985 desired_align = 8;
21986 else
21987 desired_align = 4;
21988 break;
21989 case rep_prefix_1_byte:
21990 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
21991 copying whole cachelines at once. */
21992 if (TARGET_PENTIUMPRO)
21993 desired_align = 8;
21994 else
21995 desired_align = 1;
21996 break;
21997 case loop_1_byte:
21998 desired_align = 1;
21999 break;
22000 case libcall:
22001 return 0;
22002 }
22003
22004 if (optimize_size)
22005 desired_align = 1;
22006 if (desired_align < align)
22007 desired_align = align;
22008 if (expected_size != -1 && expected_size < 4)
22009 desired_align = align;
22010 return desired_align;
22011 }
22012
22013 /* Return the smallest power of 2 greater than VAL. */
22014 static int
22015 smallest_pow2_greater_than (int val)
22016 {
22017 int ret = 1;
22018 while (ret <= val)
22019 ret <<= 1;
22020 return ret;
22021 }
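
/* For example, smallest_pow2_greater_than (0) == 1,
   smallest_pow2_greater_than (7) == 8 and, because the result must be
   strictly greater than VAL, smallest_pow2_greater_than (8) == 16.  */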
22022
22023 /* Expand string move (memcpy) operation. Use i386 string operations
22024 when profitable. expand_setmem contains similar code. The code
22025 depends upon architecture, block size and alignment, but always has
22026 the same overall structure:
22027
22028 1) Prologue guard: Conditional that jumps to the epilogue for small
22029 blocks that can be handled by the epilogue alone. This is faster
22030 but also needed for correctness, since the prologue assumes the block
22031 is larger than the desired alignment.
22032
22033 Optional dynamic check for size and libcall for large
22034 blocks is emitted here too, with -minline-stringops-dynamically.
22035
22036 2) Prologue: copy first few bytes in order to get destination
22037 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22038 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22039 copied. We emit either a jump tree on power of two sized
22040 blocks, or a byte loop.
22041
22042 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22043 with specified algorithm.
22044
22045 4) Epilogue: code copying tail of the block that is too small to be
22046 handled by main body (or up to size guarded by prologue guard). */
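
/* An illustrative sketch only (the real output depends on the chosen
   algorithm, the alignment and the target): for a memcpy expanded with
   rep_prefix_8_byte and a run-time count, the emitted code has roughly
   this shape:

       cmp   $8, count            ; 1) prologue guard: small blocks
       jb    .Lepilogue           ;    go straight to the epilogue
       ...byte/word copies...     ; 2) align the destination
       mov   count, %rcx
       shr   $3, %rcx             ; 3) main body
       rep movsq
   .Lepilogue:
       ...copy count & 7 remaining bytes...   ; 4) epilogue
*/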
22047
22048 bool
22049 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22050 rtx expected_align_exp, rtx expected_size_exp)
22051 {
22052 rtx destreg;
22053 rtx srcreg;
22054 rtx label = NULL;
22055 rtx tmp;
22056 rtx jump_around_label = NULL;
22057 HOST_WIDE_INT align = 1;
22058 unsigned HOST_WIDE_INT count = 0;
22059 HOST_WIDE_INT expected_size = -1;
22060 int size_needed = 0, epilogue_size_needed;
22061 int desired_align = 0, align_bytes = 0;
22062 enum stringop_alg alg;
22063 int dynamic_check;
22064 bool need_zero_guard = false;
22065
22066 if (CONST_INT_P (align_exp))
22067 align = INTVAL (align_exp);
22068 /* i386 can do misaligned access at a reasonably increased cost. */
22069 if (CONST_INT_P (expected_align_exp)
22070 && INTVAL (expected_align_exp) > align)
22071 align = INTVAL (expected_align_exp);
22072 /* ALIGN is the minimum of destination and source alignment, but we care here
22073 just about destination alignment. */
22074 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22075 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22076
22077 if (CONST_INT_P (count_exp))
22078 count = expected_size = INTVAL (count_exp);
22079 if (CONST_INT_P (expected_size_exp) && count == 0)
22080 expected_size = INTVAL (expected_size_exp);
22081
22082 /* Make sure we don't need to care about overflow later on. */
22083 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22084 return false;
22085
22086 /* Step 0: Decide on preferred algorithm, desired alignment and
22087 size of chunks to be copied by main loop. */
22088
22089 alg = decide_alg (count, expected_size, false, &dynamic_check);
22090 desired_align = decide_alignment (align, alg, expected_size);
22091
22092 if (!TARGET_ALIGN_STRINGOPS)
22093 align = desired_align;
22094
22095 if (alg == libcall)
22096 return false;
22097 gcc_assert (alg != no_stringop);
22098 if (!count)
22099 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22100 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22101 srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
22102 switch (alg)
22103 {
22104 case libcall:
22105 case no_stringop:
22106 gcc_unreachable ();
22107 case loop:
22108 need_zero_guard = true;
22109 size_needed = GET_MODE_SIZE (word_mode);
22110 break;
22111 case unrolled_loop:
22112 need_zero_guard = true;
22113 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22114 break;
22115 case rep_prefix_8_byte:
22116 size_needed = 8;
22117 break;
22118 case rep_prefix_4_byte:
22119 size_needed = 4;
22120 break;
22121 case rep_prefix_1_byte:
22122 size_needed = 1;
22123 break;
22124 case loop_1_byte:
22125 need_zero_guard = true;
22126 size_needed = 1;
22127 break;
22128 }
22129
22130 epilogue_size_needed = size_needed;
22131
22132 /* Step 1: Prologue guard. */
22133
22134 /* Alignment code needs count to be in a register. */
22135 if (CONST_INT_P (count_exp) && desired_align > align)
22136 {
22137 if (INTVAL (count_exp) > desired_align
22138 && INTVAL (count_exp) > size_needed)
22139 {
22140 align_bytes
22141 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22142 if (align_bytes <= 0)
22143 align_bytes = 0;
22144 else
22145 align_bytes = desired_align - align_bytes;
22146 }
22147 if (align_bytes == 0)
22148 count_exp = force_reg (counter_mode (count_exp), count_exp);
22149 }
22150 gcc_assert (desired_align >= 1 && align >= 1);
22151
22152 /* Ensure that alignment prologue won't copy past end of block. */
22153 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22154 {
22155 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22156 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22157 Make sure it is a power of 2. */
22158 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22159
22160 if (count)
22161 {
22162 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22163 {
22164 /* If main algorithm works on QImode, no epilogue is needed.
22165 For small sizes just don't align anything. */
22166 if (size_needed == 1)
22167 desired_align = align;
22168 else
22169 goto epilogue;
22170 }
22171 }
22172 else
22173 {
22174 label = gen_label_rtx ();
22175 emit_cmp_and_jump_insns (count_exp,
22176 GEN_INT (epilogue_size_needed),
22177 LTU, 0, counter_mode (count_exp), 1, label);
22178 if (expected_size == -1 || expected_size < epilogue_size_needed)
22179 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22180 else
22181 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22182 }
22183 }
22184
22185 /* Emit code to decide at runtime whether a library call or inline code
22186 should be used. */
22187 if (dynamic_check != -1)
22188 {
22189 if (CONST_INT_P (count_exp))
22190 {
22191 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22192 {
22193 emit_block_move_via_libcall (dst, src, count_exp, false);
22194 count_exp = const0_rtx;
22195 goto epilogue;
22196 }
22197 }
22198 else
22199 {
22200 rtx hot_label = gen_label_rtx ();
22201 jump_around_label = gen_label_rtx ();
22202 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22203 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22204 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22205 emit_block_move_via_libcall (dst, src, count_exp, false);
22206 emit_jump (jump_around_label);
22207 emit_label (hot_label);
22208 }
22209 }
22210
22211 /* Step 2: Alignment prologue. */
22212
22213 if (desired_align > align)
22214 {
22215 if (align_bytes == 0)
22216 {
22217 /* Except for the first move in the epilogue, we no longer know
22218 the constant offset in the aliasing info. It doesn't seem worth
22219 the pain to maintain it for the first move, so throw away
22220 the info early. */
22221 src = change_address (src, BLKmode, srcreg);
22222 dst = change_address (dst, BLKmode, destreg);
22223 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22224 desired_align);
22225 }
22226 else
22227 {
22228 /* If we know how many bytes need to be stored before dst is
22229 sufficiently aligned, maintain aliasing info accurately. */
22230 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22231 desired_align, align_bytes);
22232 count_exp = plus_constant (count_exp, -align_bytes);
22233 count -= align_bytes;
22234 }
22235 if (need_zero_guard
22236 && (count < (unsigned HOST_WIDE_INT) size_needed
22237 || (align_bytes == 0
22238 && count < ((unsigned HOST_WIDE_INT) size_needed
22239 + desired_align - align))))
22240 {
22241 /* It is possible that we copied enough so the main loop will not
22242 execute. */
22243 gcc_assert (size_needed > 1);
22244 if (label == NULL_RTX)
22245 label = gen_label_rtx ();
22246 emit_cmp_and_jump_insns (count_exp,
22247 GEN_INT (size_needed),
22248 LTU, 0, counter_mode (count_exp), 1, label);
22249 if (expected_size == -1
22250 || expected_size < (desired_align - align) / 2 + size_needed)
22251 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22252 else
22253 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22254 }
22255 }
22256 if (label && size_needed == 1)
22257 {
22258 emit_label (label);
22259 LABEL_NUSES (label) = 1;
22260 label = NULL;
22261 epilogue_size_needed = 1;
22262 }
22263 else if (label == NULL_RTX)
22264 epilogue_size_needed = size_needed;
22265
22266 /* Step 3: Main loop. */
22267
22268 switch (alg)
22269 {
22270 case libcall:
22271 case no_stringop:
22272 gcc_unreachable ();
22273 case loop_1_byte:
22274 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22275 count_exp, QImode, 1, expected_size);
22276 break;
22277 case loop:
22278 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22279 count_exp, word_mode, 1, expected_size);
22280 break;
22281 case unrolled_loop:
22282 /* Unroll only by a factor of 2 in 32bit mode, since we don't have enough
22283 registers for 4 temporaries anyway. */
22284 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22285 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22286 expected_size);
22287 break;
22288 case rep_prefix_8_byte:
22289 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22290 DImode);
22291 break;
22292 case rep_prefix_4_byte:
22293 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22294 SImode);
22295 break;
22296 case rep_prefix_1_byte:
22297 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22298 QImode);
22299 break;
22300 }
22301 /* Properly adjust the offset of the src and dest memory for aliasing. */
22302 if (CONST_INT_P (count_exp))
22303 {
22304 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22305 (count / size_needed) * size_needed);
22306 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22307 (count / size_needed) * size_needed);
22308 }
22309 else
22310 {
22311 src = change_address (src, BLKmode, srcreg);
22312 dst = change_address (dst, BLKmode, destreg);
22313 }
22314
22315 /* Step 4: Epilogue to copy the remaining bytes. */
22316 epilogue:
22317 if (label)
22318 {
22319 /* When the main loop is done, COUNT_EXP might hold the original count,
22320 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22321 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22322 bytes. Compensate if needed. */
22323
22324 if (size_needed < epilogue_size_needed)
22325 {
22326 tmp =
22327 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22328 GEN_INT (size_needed - 1), count_exp, 1,
22329 OPTAB_DIRECT);
22330 if (tmp != count_exp)
22331 emit_move_insn (count_exp, tmp);
22332 }
22333 emit_label (label);
22334 LABEL_NUSES (label) = 1;
22335 }
22336
22337 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22338 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22339 epilogue_size_needed);
22340 if (jump_around_label)
22341 emit_label (jump_around_label);
22342 return true;
22343 }
22344
22345 /* Helper function for memset. For the QImode value 0xXY produce
22346 0xXYXYXYXY of the width specified by MODE. This is essentially
22347 a * 0x01010101, but we can do slightly better than
22348 synth_mult by unwinding the sequence by hand on CPUs with
22349 a slow multiply. */
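/* Worked example (illustrative): for VAL == 0xAB and MODE == SImode the
   constant path below computes

       v = 0xAB;  v |= v << 8;  v |= v << 16;   =>  0xABABABAB

   which equals 0xAB * 0x01010101; the non-constant path builds the same
   value with an insv/shift-and-or sequence or an actual multiply,
   whichever the cost model says is cheaper.  */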
22350 static rtx
22351 promote_duplicated_reg (enum machine_mode mode, rtx val)
22352 {
22353 enum machine_mode valmode = GET_MODE (val);
22354 rtx tmp;
22355 int nops = mode == DImode ? 3 : 2;
22356
22357 gcc_assert (mode == SImode || mode == DImode);
22358 if (val == const0_rtx)
22359 return copy_to_mode_reg (mode, const0_rtx);
22360 if (CONST_INT_P (val))
22361 {
22362 HOST_WIDE_INT v = INTVAL (val) & 255;
22363
22364 v |= v << 8;
22365 v |= v << 16;
22366 if (mode == DImode)
22367 v |= (v << 16) << 16;
22368 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22369 }
22370
22371 if (valmode == VOIDmode)
22372 valmode = QImode;
22373 if (valmode != QImode)
22374 val = gen_lowpart (QImode, val);
22375 if (mode == QImode)
22376 return val;
22377 if (!TARGET_PARTIAL_REG_STALL)
22378 nops--;
22379 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22380 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22381 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22382 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22383 {
22384 rtx reg = convert_modes (mode, QImode, val, true);
22385 tmp = promote_duplicated_reg (mode, const1_rtx);
22386 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22387 OPTAB_DIRECT);
22388 }
22389 else
22390 {
22391 rtx reg = convert_modes (mode, QImode, val, true);
22392
22393 if (!TARGET_PARTIAL_REG_STALL)
22394 if (mode == SImode)
22395 emit_insn (gen_movsi_insv_1 (reg, reg));
22396 else
22397 emit_insn (gen_movdi_insv_1 (reg, reg));
22398 else
22399 {
22400 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22401 NULL, 1, OPTAB_DIRECT);
22402 reg =
22403 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22404 }
22405 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22406 NULL, 1, OPTAB_DIRECT);
22407 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22408 if (mode == SImode)
22409 return reg;
22410 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22411 NULL, 1, OPTAB_DIRECT);
22412 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22413 return reg;
22414 }
22415 }
22416
22417 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
22418 will be needed by the main loop copying SIZE_NEEDED chunks and by the
22419 prologue raising the alignment from ALIGN to DESIRED_ALIGN. */
22420 static rtx
22421 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22422 {
22423 rtx promoted_val;
22424
22425 if (TARGET_64BIT
22426 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22427 promoted_val = promote_duplicated_reg (DImode, val);
22428 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22429 promoted_val = promote_duplicated_reg (SImode, val);
22430 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22431 promoted_val = promote_duplicated_reg (HImode, val);
22432 else
22433 promoted_val = val;
22434
22435 return promoted_val;
22436 }
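
/* For instance (illustrative): on 64-bit with SIZE_NEEDED == 8 the value is
   replicated into a DImode register; with SIZE_NEEDED == 4 and no alignment
   above 4 left to establish, an SImode replica suffices.  */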
22437
22438 /* Expand string set operation (memset / bzero). Use i386 string operations
22439 when profitable. See the expand_movmem comment for an explanation of the
22440 individual steps performed. */
22441 bool
22442 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22443 rtx expected_align_exp, rtx expected_size_exp)
22444 {
22445 rtx destreg;
22446 rtx label = NULL;
22447 rtx tmp;
22448 rtx jump_around_label = NULL;
22449 HOST_WIDE_INT align = 1;
22450 unsigned HOST_WIDE_INT count = 0;
22451 HOST_WIDE_INT expected_size = -1;
22452 int size_needed = 0, epilogue_size_needed;
22453 int desired_align = 0, align_bytes = 0;
22454 enum stringop_alg alg;
22455 rtx promoted_val = NULL;
22456 bool force_loopy_epilogue = false;
22457 int dynamic_check;
22458 bool need_zero_guard = false;
22459
22460 if (CONST_INT_P (align_exp))
22461 align = INTVAL (align_exp);
22462 /* i386 can do misaligned access at a reasonably increased cost. */
22463 if (CONST_INT_P (expected_align_exp)
22464 && INTVAL (expected_align_exp) > align)
22465 align = INTVAL (expected_align_exp);
22466 if (CONST_INT_P (count_exp))
22467 count = expected_size = INTVAL (count_exp);
22468 if (CONST_INT_P (expected_size_exp) && count == 0)
22469 expected_size = INTVAL (expected_size_exp);
22470
22471 /* Make sure we don't need to care about overflow later on. */
22472 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22473 return false;
22474
22475 /* Step 0: Decide on preferred algorithm, desired alignment and
22476 size of chunks to be copied by main loop. */
22477
22478 alg = decide_alg (count, expected_size, true, &dynamic_check);
22479 desired_align = decide_alignment (align, alg, expected_size);
22480
22481 if (!TARGET_ALIGN_STRINGOPS)
22482 align = desired_align;
22483
22484 if (alg == libcall)
22485 return false;
22486 gcc_assert (alg != no_stringop);
22487 if (!count)
22488 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22489 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22490 switch (alg)
22491 {
22492 case libcall:
22493 case no_stringop:
22494 gcc_unreachable ();
22495 case loop:
22496 need_zero_guard = true;
22497 size_needed = GET_MODE_SIZE (word_mode);
22498 break;
22499 case unrolled_loop:
22500 need_zero_guard = true;
22501 size_needed = GET_MODE_SIZE (word_mode) * 4;
22502 break;
22503 case rep_prefix_8_byte:
22504 size_needed = 8;
22505 break;
22506 case rep_prefix_4_byte:
22507 size_needed = 4;
22508 break;
22509 case rep_prefix_1_byte:
22510 size_needed = 1;
22511 break;
22512 case loop_1_byte:
22513 need_zero_guard = true;
22514 size_needed = 1;
22515 break;
22516 }
22517 epilogue_size_needed = size_needed;
22518
22519 /* Step 1: Prologue guard. */
22520
22521 /* Alignment code needs count to be in a register. */
22522 if (CONST_INT_P (count_exp) && desired_align > align)
22523 {
22524 if (INTVAL (count_exp) > desired_align
22525 && INTVAL (count_exp) > size_needed)
22526 {
22527 align_bytes
22528 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22529 if (align_bytes <= 0)
22530 align_bytes = 0;
22531 else
22532 align_bytes = desired_align - align_bytes;
22533 }
22534 if (align_bytes == 0)
22535 {
22536 enum machine_mode mode = SImode;
22537 if (TARGET_64BIT && (count & ~0xffffffff))
22538 mode = DImode;
22539 count_exp = force_reg (mode, count_exp);
22540 }
22541 }
22542 /* Do the cheap promotion to allow better CSE across the
22543 main loop and the epilogue (i.e. one load of the big constant in
22544 front of all the code). */
22545 if (CONST_INT_P (val_exp))
22546 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22547 desired_align, align);
22548 /* Ensure that alignment prologue won't copy past end of block. */
22549 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22550 {
22551 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22552 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22553 Make sure it is a power of 2. */
22554 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22555
22556 /* To improve performance on small blocks, we jump around the VAL
22557 promoting code. This means that if the promoted VAL is not constant,
22558 we might not use it in the epilogue and have to fall back to the byte
22559 loop variant. */
22560 if (epilogue_size_needed > 2 && !promoted_val)
22561 force_loopy_epilogue = true;
22562 if (count)
22563 {
22564 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22565 {
22566 /* If main algorithm works on QImode, no epilogue is needed.
22567 For small sizes just don't align anything. */
22568 if (size_needed == 1)
22569 desired_align = align;
22570 else
22571 goto epilogue;
22572 }
22573 }
22574 else
22575 {
22576 label = gen_label_rtx ();
22577 emit_cmp_and_jump_insns (count_exp,
22578 GEN_INT (epilogue_size_needed),
22579 LTU, 0, counter_mode (count_exp), 1, label);
22580 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22581 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22582 else
22583 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22584 }
22585 }
22586 if (dynamic_check != -1)
22587 {
22588 rtx hot_label = gen_label_rtx ();
22589 jump_around_label = gen_label_rtx ();
22590 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22591 LEU, 0, counter_mode (count_exp), 1, hot_label);
22592 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22593 set_storage_via_libcall (dst, count_exp, val_exp, false);
22594 emit_jump (jump_around_label);
22595 emit_label (hot_label);
22596 }
22597
22598 /* Step 2: Alignment prologue. */
22599
22600 /* Do the expensive promotion once we have branched off the small blocks. */
22601 if (!promoted_val)
22602 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22603 desired_align, align);
22604 gcc_assert (desired_align >= 1 && align >= 1);
22605
22606 if (desired_align > align)
22607 {
22608 if (align_bytes == 0)
22609 {
22610 /* Except for the first move in the epilogue, we no longer know
22611 the constant offset in the aliasing info. It doesn't seem worth
22612 the pain to maintain it for the first move, so throw away
22613 the info early. */
22614 dst = change_address (dst, BLKmode, destreg);
22615 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22616 desired_align);
22617 }
22618 else
22619 {
22620 /* If we know how many bytes need to be stored before dst is
22621 sufficiently aligned, maintain aliasing info accurately. */
22622 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22623 desired_align, align_bytes);
22624 count_exp = plus_constant (count_exp, -align_bytes);
22625 count -= align_bytes;
22626 }
22627 if (need_zero_guard
22628 && (count < (unsigned HOST_WIDE_INT) size_needed
22629 || (align_bytes == 0
22630 && count < ((unsigned HOST_WIDE_INT) size_needed
22631 + desired_align - align))))
22632 {
22633 /* It is possible that we copied enough so the main loop will not
22634 execute. */
22635 gcc_assert (size_needed > 1);
22636 if (label == NULL_RTX)
22637 label = gen_label_rtx ();
22638 emit_cmp_and_jump_insns (count_exp,
22639 GEN_INT (size_needed),
22640 LTU, 0, counter_mode (count_exp), 1, label);
22641 if (expected_size == -1
22642 || expected_size < (desired_align - align) / 2 + size_needed)
22643 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22644 else
22645 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22646 }
22647 }
22648 if (label && size_needed == 1)
22649 {
22650 emit_label (label);
22651 LABEL_NUSES (label) = 1;
22652 label = NULL;
22653 promoted_val = val_exp;
22654 epilogue_size_needed = 1;
22655 }
22656 else if (label == NULL_RTX)
22657 epilogue_size_needed = size_needed;
22658
22659 /* Step 3: Main loop. */
22660
22661 switch (alg)
22662 {
22663 case libcall:
22664 case no_stringop:
22665 gcc_unreachable ();
22666 case loop_1_byte:
22667 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22668 count_exp, QImode, 1, expected_size);
22669 break;
22670 case loop:
22671 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22672 count_exp, word_mode, 1, expected_size);
22673 break;
22674 case unrolled_loop:
22675 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22676 count_exp, word_mode, 4, expected_size);
22677 break;
22678 case rep_prefix_8_byte:
22679 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22680 DImode, val_exp);
22681 break;
22682 case rep_prefix_4_byte:
22683 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22684 SImode, val_exp);
22685 break;
22686 case rep_prefix_1_byte:
22687 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22688 QImode, val_exp);
22689 break;
22690 }
22691 /* Properly adjust the offset of the src and dest memory for aliasing. */
22692 if (CONST_INT_P (count_exp))
22693 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22694 (count / size_needed) * size_needed);
22695 else
22696 dst = change_address (dst, BLKmode, destreg);
22697
22698 /* Step 4: Epilogue to copy the remaining bytes. */
22699
22700 if (label)
22701 {
22702 /* When the main loop is done, COUNT_EXP might hold the original count,
22703 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22704 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22705 bytes. Compensate if needed. */
22706
22707 if (size_needed < epilogue_size_needed)
22708 {
22709 tmp =
22710 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22711 GEN_INT (size_needed - 1), count_exp, 1,
22712 OPTAB_DIRECT);
22713 if (tmp != count_exp)
22714 emit_move_insn (count_exp, tmp);
22715 }
22716 emit_label (label);
22717 LABEL_NUSES (label) = 1;
22718 }
22719 epilogue:
22720 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22721 {
22722 if (force_loopy_epilogue)
22723 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22724 epilogue_size_needed);
22725 else
22726 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22727 epilogue_size_needed);
22728 }
22729 if (jump_around_label)
22730 emit_label (jump_around_label);
22731 return true;
22732 }
22733
22734 /* Expand the appropriate insns for doing strlen if not just doing
22735 repnz; scasb
22736
22737 out = result, initialized with the start address
22738 align_rtx = alignment of the address.
22739 scratch = scratch register, initialized with the start address when
22740 not aligned, otherwise undefined
22741
22742 This is just the body. It needs the initializations mentioned above and
22743 some address computing at the end. These things are done in i386.md. */
22744
22745 static void
22746 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22747 {
22748 int align;
22749 rtx tmp;
22750 rtx align_2_label = NULL_RTX;
22751 rtx align_3_label = NULL_RTX;
22752 rtx align_4_label = gen_label_rtx ();
22753 rtx end_0_label = gen_label_rtx ();
22754 rtx mem;
22755 rtx tmpreg = gen_reg_rtx (SImode);
22756 rtx scratch = gen_reg_rtx (SImode);
22757 rtx cmp;
22758
22759 align = 0;
22760 if (CONST_INT_P (align_rtx))
22761 align = INTVAL (align_rtx);
22762
22763 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22764
22765 /* Is there a known alignment and is it less than 4? */
22766 if (align < 4)
22767 {
22768 rtx scratch1 = gen_reg_rtx (Pmode);
22769 emit_move_insn (scratch1, out);
22770 /* Is there a known alignment and is it not 2? */
22771 if (align != 2)
22772 {
22773 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22774 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22775
22776 /* Leave just the 3 lower bits. */
22777 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22778 NULL_RTX, 0, OPTAB_WIDEN);
22779
22780 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22781 Pmode, 1, align_4_label);
22782 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22783 Pmode, 1, align_2_label);
22784 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22785 Pmode, 1, align_3_label);
22786 }
22787 else
22788 {
22789 /* Since the alignment is 2, we have to check 2 or 0 bytes;
22790 check whether it is aligned to 4 bytes. */
22791
22792 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22793 NULL_RTX, 0, OPTAB_WIDEN);
22794
22795 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22796 Pmode, 1, align_4_label);
22797 }
22798
22799 mem = change_address (src, QImode, out);
22800
22801 /* Now compare the bytes. */
22802
22803 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
22804 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22805 QImode, 1, end_0_label);
22806
22807 /* Increment the address. */
22808 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22809
22810 /* Not needed with an alignment of 2. */
22811 if (align != 2)
22812 {
22813 emit_label (align_2_label);
22814
22815 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22816 end_0_label);
22817
22818 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22819
22820 emit_label (align_3_label);
22821 }
22822
22823 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22824 end_0_label);
22825
22826 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22827 }
22828
22829 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
22830 align this loop; it only makes the program larger and does not help
22831 speed it up. */
22832 emit_label (align_4_label);
22833
22834 mem = change_address (src, SImode, out);
22835 emit_move_insn (scratch, mem);
22836 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22837
22838 /* This formula yields a nonzero result iff one of the bytes is zero.
22839 This saves three branches inside the loop and many cycles. */
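/* A hedged illustration of why this works: the three logical insns below
   compute (s - 0x01010101) & ~s & 0x80808080 for the loaded word s.
   For s = 0x12340078, which contains one zero byte:

       s - 0x01010101 = 0x1132FF77
       ~s             = 0xEDCBFF87
       and of both    = 0x0102FF07
       & 0x80808080   = 0x00008000   (nonzero: a zero byte was found)

   while a word with no zero byte yields 0.  */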
22840
22841 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22842 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22843 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22844 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22845 gen_int_mode (0x80808080, SImode)));
22846 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22847 align_4_label);
22848
22849 if (TARGET_CMOVE)
22850 {
22851 rtx reg = gen_reg_rtx (SImode);
22852 rtx reg2 = gen_reg_rtx (Pmode);
22853 emit_move_insn (reg, tmpreg);
22854 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22855
22856 /* If zero is not in the first two bytes, move two bytes forward. */
22857 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22858 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22859 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22860 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22861 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22862 reg,
22863 tmpreg)));
22864 /* Emit lea manually to avoid clobbering of flags. */
22865 emit_insn (gen_rtx_SET (SImode, reg2,
22866 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22867
22868 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22869 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22870 emit_insn (gen_rtx_SET (VOIDmode, out,
22871 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22872 reg2,
22873 out)));
22874 }
22875 else
22876 {
22877 rtx end_2_label = gen_label_rtx ();
22878 /* Is zero in the first two bytes? */
22879
22880 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22881 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22882 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
22883 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22884 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
22885 pc_rtx);
22886 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
22887 JUMP_LABEL (tmp) = end_2_label;
22888
22889 /* Not in the first two. Move two bytes forward. */
22890 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
22891 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
22892
22893 emit_label (end_2_label);
22894
22895 }
22896
22897 /* Avoid branch in fixing the byte. */
22898 tmpreg = gen_lowpart (QImode, tmpreg);
22899 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
22900 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
22901 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
22902 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
22903
22904 emit_label (end_0_label);
22905 }
22906
22907 /* Expand strlen. */
22908
22909 bool
22910 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
22911 {
22912 rtx addr, scratch1, scratch2, scratch3, scratch4;
22913
22914 /* The generic case of the strlen expander is long. Avoid expanding it
22915 unless TARGET_INLINE_ALL_STRINGOPS. */
22916
22917 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22918 && !TARGET_INLINE_ALL_STRINGOPS
22919 && !optimize_insn_for_size_p ()
22920 && (!CONST_INT_P (align) || INTVAL (align) < 4))
22921 return false;
22922
22923 addr = force_reg (Pmode, XEXP (src, 0));
22924 scratch1 = gen_reg_rtx (Pmode);
22925
22926 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22927 && !optimize_insn_for_size_p ())
22928 {
22929 /* Well, it seems that some optimizer does not combine a call like
22930 foo(strlen(bar), strlen(bar));
22931 when the move and the subtraction are done here. It does calculate
22932 the length just once when these instructions are done inside
22933 output_strlen_unroll(). But since &bar[strlen(bar)] is often used,
22934 and this uses one fewer register for the lifetime of
22935 output_strlen_unroll(), this is better. */
22936
22937 emit_move_insn (out, addr);
22938
22939 ix86_expand_strlensi_unroll_1 (out, src, align);
22940
22941 /* strlensi_unroll_1 returns the address of the zero at the end of
22942 the string, like memchr(), so compute the length by subtracting
22943 the start address. */
22944 emit_insn (ix86_gen_sub3 (out, out, addr));
22945 }
22946 else
22947 {
22948 rtx unspec;
22949
22950 /* Can't use this if the user has appropriated eax, ecx, or edi. */
22951 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
22952 return false;
22953
22954 scratch2 = gen_reg_rtx (Pmode);
22955 scratch3 = gen_reg_rtx (Pmode);
22956 scratch4 = force_reg (Pmode, constm1_rtx);
22957
22958 emit_move_insn (scratch3, addr);
22959 eoschar = force_reg (QImode, eoschar);
22960
22961 src = replace_equiv_address_nv (src, scratch3);
22962
22963 /* If .md starts supporting :P, this can be done in .md. */
22964 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
22965 scratch4), UNSPEC_SCAS);
22966 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
22967 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
22968 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
22969 }
22970 return true;
22971 }
22972
22973 /* For a given symbol (function), construct code to compute the address of its
22974 PLT entry in the large x86-64 PIC model. */
22975 rtx
22976 construct_plt_address (rtx symbol)
22977 {
22978 rtx tmp = gen_reg_rtx (Pmode);
22979 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
22980
22981 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
22982 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
22983
22984 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
22985 emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
22986 return tmp;
22987 }
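
/* Illustrative sketch (assuming the usual large-PIC code generation, not a
   verbatim dump): for a call to foo this materializes something like

       movabs $foo@PLTOFF, %tmp
       add    pic_offset_table, %tmp

   after which ix86_expand_call below emits an indirect call through %tmp.  */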
22988
22989 rtx
22990 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
22991 rtx callarg2,
22992 rtx pop, bool sibcall)
22993 {
22994 /* We need to represent that the SI, DI and XMM6-XMM15 registers are
22995 clobbered by SYSV calls. */
22996 static int clobbered_registers[] = {
22997 XMM6_REG, XMM7_REG, XMM8_REG,
22998 XMM9_REG, XMM10_REG, XMM11_REG,
22999 XMM12_REG, XMM13_REG, XMM14_REG,
23000 XMM15_REG, SI_REG, DI_REG
23001 };
23002 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23003 rtx use = NULL, call;
23004 unsigned int vec_len;
23005
23006 if (pop == const0_rtx)
23007 pop = NULL;
23008 gcc_assert (!TARGET_64BIT || !pop);
23009
23010 if (TARGET_MACHO && !TARGET_64BIT)
23011 {
23012 #if TARGET_MACHO
23013 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23014 fnaddr = machopic_indirect_call_target (fnaddr);
23015 #endif
23016 }
23017 else
23018 {
23019 /* Static functions and indirect calls don't need the pic register. */
23020 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23021 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23022 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23023 use_reg (&use, pic_offset_table_rtx);
23024 }
23025
23026 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23027 {
23028 rtx al = gen_rtx_REG (QImode, AX_REG);
23029 emit_move_insn (al, callarg2);
23030 use_reg (&use, al);
23031 }
23032
23033 if (ix86_cmodel == CM_LARGE_PIC
23034 && MEM_P (fnaddr)
23035 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23036 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23037 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23038 else if (sibcall
23039 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23040 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23041 {
23042 fnaddr = XEXP (fnaddr, 0);
23043 if (GET_MODE (fnaddr) != word_mode)
23044 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
23045 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23046 }
23047
23048 vec_len = 0;
23049 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23050 if (retval)
23051 call = gen_rtx_SET (VOIDmode, retval, call);
23052 vec[vec_len++] = call;
23053
23054 if (pop)
23055 {
23056 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23057 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23058 vec[vec_len++] = pop;
23059 }
23060
23061 if (TARGET_64BIT_MS_ABI
23062 && (!callarg2 || INTVAL (callarg2) != -2))
23063 {
23064 unsigned i;
23065
23066 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23067 UNSPEC_MS_TO_SYSV_CALL);
23068
23069 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23070 vec[vec_len++]
23071 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23072 ? TImode : DImode,
23073 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23074 ? TImode : DImode,
23075 clobbered_registers[i]));
23076 }
23077
23078 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
23079 if (TARGET_VZEROUPPER)
23080 {
23081 int avx256;
23082 if (cfun->machine->callee_pass_avx256_p)
23083 {
23084 if (cfun->machine->callee_return_avx256_p)
23085 avx256 = callee_return_pass_avx256;
23086 else
23087 avx256 = callee_pass_avx256;
23088 }
23089 else if (cfun->machine->callee_return_avx256_p)
23090 avx256 = callee_return_avx256;
23091 else
23092 avx256 = call_no_avx256;
23093
23094 if (reload_completed)
23095 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23096 else
23097 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23098 gen_rtvec (1, GEN_INT (avx256)),
23099 UNSPEC_CALL_NEEDS_VZEROUPPER);
23100 }
23101
23102 if (vec_len > 1)
23103 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23104 call = emit_call_insn (call);
23105 if (use)
23106 CALL_INSN_FUNCTION_USAGE (call) = use;
23107
23108 return call;
23109 }
23110
23111 void
23112 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23113 {
23114 rtx pat = PATTERN (insn);
23115 rtvec vec = XVEC (pat, 0);
23116 int len = GET_NUM_ELEM (vec) - 1;
23117
23118 /* Strip off the last entry of the parallel. */
23119 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23120 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23121 if (len == 1)
23122 pat = RTVEC_ELT (vec, 0);
23123 else
23124 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23125
23126 emit_insn (gen_avx_vzeroupper (vzeroupper));
23127 emit_call_insn (pat);
23128 }
23129
23130 /* Output the assembly for a call instruction. */
23131
23132 const char *
23133 ix86_output_call_insn (rtx insn, rtx call_op)
23134 {
23135 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23136 bool seh_nop_p = false;
23137 const char *xasm;
23138
23139 if (SIBLING_CALL_P (insn))
23140 {
23141 if (direct_p)
23142 xasm = "jmp\t%P0";
23143 /* SEH epilogue detection requires the indirect branch case
23144 to include REX.W. */
23145 else if (TARGET_SEH)
23146 xasm = "rex.W jmp %A0";
23147 else
23148 xasm = "jmp\t%A0";
23149
23150 output_asm_insn (xasm, &call_op);
23151 return "";
23152 }
23153
23154 /* SEH unwinding can require an extra nop to be emitted in several
23155 circumstances. Determine if we have one of those. */
23156 if (TARGET_SEH)
23157 {
23158 rtx i;
23159
23160 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23161 {
23162 /* If we get to another real insn, we don't need the nop. */
23163 if (INSN_P (i))
23164 break;
23165
23166 /* If we get to the epilogue note, prevent a catch region from
23167 being adjacent to the standard epilogue sequence. If non-call
23168 exceptions are enabled, we'll have done this during epilogue emission. */
23169 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23170 && !flag_non_call_exceptions
23171 && !can_throw_internal (insn))
23172 {
23173 seh_nop_p = true;
23174 break;
23175 }
23176 }
23177
23178 /* If we didn't find a real insn following the call, prevent the
23179 unwinder from looking into the next function. */
23180 if (i == NULL)
23181 seh_nop_p = true;
23182 }
23183
23184 if (direct_p)
23185 xasm = "call\t%P0";
23186 else
23187 xasm = "call\t%A0";
23188
23189 output_asm_insn (xasm, &call_op);
23190
23191 if (seh_nop_p)
23192 return "nop";
23193
23194 return "";
23195 }
23196 \f
23197 /* Clear stack slot assignments remembered from previous functions.
23198 This is called from INIT_EXPANDERS once before RTL is emitted for each
23199 function. */
23200
23201 static struct machine_function *
23202 ix86_init_machine_status (void)
23203 {
23204 struct machine_function *f;
23205
23206 f = ggc_alloc_cleared_machine_function ();
23207 f->use_fast_prologue_epilogue_nregs = -1;
23208 f->tls_descriptor_call_expanded_p = 0;
23209 f->call_abi = ix86_abi;
23210
23211 return f;
23212 }
23213
23214 /* Return a MEM corresponding to a stack slot with mode MODE.
23215 Allocate a new slot if necessary.
23216
23217 The RTL for a function can have several slots available: N is
23218 which slot to use. */
23219
23220 rtx
23221 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23222 {
23223 struct stack_local_entry *s;
23224
23225 gcc_assert (n < MAX_386_STACK_LOCALS);
23226
23227 /* Virtual slot is valid only before vregs are instantiated. */
23228 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23229
23230 for (s = ix86_stack_locals; s; s = s->next)
23231 if (s->mode == mode && s->n == n)
23232 return validize_mem (copy_rtx (s->rtl));
23233
23234 s = ggc_alloc_stack_local_entry ();
23235 s->n = n;
23236 s->mode = mode;
23237 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23238
23239 s->next = ix86_stack_locals;
23240 ix86_stack_locals = s;
23241 return validize_mem (s->rtl);
23242 }
23243 \f
23244 /* Calculate the length of the memory address in the instruction encoding.
23245 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23246 or other prefixes. */
23247
23248 int
23249 memory_address_length (rtx addr)
23250 {
23251 struct ix86_address parts;
23252 rtx base, index, disp;
23253 int len;
23254 int ok;
23255
23256 if (GET_CODE (addr) == PRE_DEC
23257 || GET_CODE (addr) == POST_INC
23258 || GET_CODE (addr) == PRE_MODIFY
23259 || GET_CODE (addr) == POST_MODIFY)
23260 return 0;
23261
23262 ok = ix86_decompose_address (addr, &parts);
23263 gcc_assert (ok);
23264
23265 if (parts.base && GET_CODE (parts.base) == SUBREG)
23266 parts.base = SUBREG_REG (parts.base);
23267 if (parts.index && GET_CODE (parts.index) == SUBREG)
23268 parts.index = SUBREG_REG (parts.index);
23269
23270 base = parts.base;
23271 index = parts.index;
23272 disp = parts.disp;
23273
23274 /* Add length of addr32 prefix. */
23275 len = (GET_CODE (addr) == ZERO_EXTEND
23276 || GET_CODE (addr) == AND);
23277
23278 /* Rule of thumb:
23279 - esp as the base always wants an index,
23280 - ebp as the base always wants a displacement,
23281 - r12 as the base always wants an index,
23282 - r13 as the base always wants a displacement. */
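/* A few illustrative results (a sketch, assuming no addr32 prefix and no
   segment override; lengths exclude the modrm and opcode bytes, per the
   rules above):

       (%eax)          -> 0
       (%esp)          -> 1   (SIB byte)
       (%ebp)          -> 1   (disp8 of 0)
       4(%eax)         -> 1   (disp8)
       4(%eax,%ebx,2)  -> 2   (SIB + disp8)
       %fs:16(%eax)    -> 2   (segment prefix + disp8)  */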
23283
23284 /* Register Indirect. */
23285 if (base && !index && !disp)
23286 {
23287 /* esp (for its index) and ebp (for its displacement) need
23288 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23289 code. */
23290 if (REG_P (addr)
23291 && (addr == arg_pointer_rtx
23292 || addr == frame_pointer_rtx
23293 || REGNO (addr) == SP_REG
23294 || REGNO (addr) == BP_REG
23295 || REGNO (addr) == R12_REG
23296 || REGNO (addr) == R13_REG))
23297 len = 1;
23298 }
23299
23300 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23301 is not disp32, but disp32(%rip), so a SIB byte is
23302 needed for plain disp32, unless print_operand_address
23303 optimizes it into disp32(%rip) or (%rip) is implied
23304 by UNSPEC. */
23305 else if (disp && !base && !index)
23306 {
23307 len = 4;
23308 if (TARGET_64BIT)
23309 {
23310 rtx symbol = disp;
23311
23312 if (GET_CODE (disp) == CONST)
23313 symbol = XEXP (disp, 0);
23314 if (GET_CODE (symbol) == PLUS
23315 && CONST_INT_P (XEXP (symbol, 1)))
23316 symbol = XEXP (symbol, 0);
23317
23318 if (GET_CODE (symbol) != LABEL_REF
23319 && (GET_CODE (symbol) != SYMBOL_REF
23320 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23321 && (GET_CODE (symbol) != UNSPEC
23322 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23323 && XINT (symbol, 1) != UNSPEC_PCREL
23324 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23325 len += 1;
23326 }
23327 }
23328
23329 else
23330 {
23331 /* Find the length of the displacement constant. */
23332 if (disp)
23333 {
23334 if (base && satisfies_constraint_K (disp))
23335 len = 1;
23336 else
23337 len = 4;
23338 }
23339 /* ebp always wants a displacement. Similarly r13. */
23340 else if (base && REG_P (base)
23341 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23342 len = 1;
23343
23344 /* An index requires the two-byte modrm form.... */
23345 if (index
23346 /* ...like esp (or r12), which always wants an index. */
23347 || base == arg_pointer_rtx
23348 || base == frame_pointer_rtx
23349 || (base && REG_P (base)
23350 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23351 len += 1;
23352 }
23353
23354 switch (parts.seg)
23355 {
23356 case SEG_FS:
23357 case SEG_GS:
23358 len += 1;
23359 break;
23360 default:
23361 break;
23362 }
23363
23364 return len;
23365 }
23366
23367 /* Compute the default value for the "length_immediate" attribute. When
23368 SHORTFORM is set, expect that the insn has an 8bit immediate alternative. */
23369 int
23370 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23371 {
23372 int len = 0;
23373 int i;
23374 extract_insn_cached (insn);
23375 for (i = recog_data.n_operands - 1; i >= 0; --i)
23376 if (CONSTANT_P (recog_data.operand[i]))
23377 {
23378 enum attr_mode mode = get_attr_mode (insn);
23379
23380 gcc_assert (!len);
23381 if (shortform && CONST_INT_P (recog_data.operand[i]))
23382 {
23383 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23384 switch (mode)
23385 {
23386 case MODE_QI:
23387 len = 1;
23388 continue;
23389 case MODE_HI:
23390 ival = trunc_int_for_mode (ival, HImode);
23391 break;
23392 case MODE_SI:
23393 ival = trunc_int_for_mode (ival, SImode);
23394 break;
23395 default:
23396 break;
23397 }
23398 if (IN_RANGE (ival, -128, 127))
23399 {
23400 len = 1;
23401 continue;
23402 }
23403 }
23404 switch (mode)
23405 {
23406 case MODE_QI:
23407 len = 1;
23408 break;
23409 case MODE_HI:
23410 len = 2;
23411 break;
23412 case MODE_SI:
23413 len = 4;
23414 break;
23415 /* Immediates for DImode instructions are encoded as 32bit sign extended values. */
23416 case MODE_DI:
23417 len = 4;
23418 break;
23419 default:
23420 fatal_insn ("unknown insn mode", insn);
23421 }
23422 }
23423 return len;
23424 }
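
/* For example (illustrative, assuming the insn provides a shortform imm8
   alternative): "addl $3, %eax" gets length_immediate 1 (imm8),
   "addl $1000, %eax" gets 4 (imm32), and a DImode "addq $1000, %rax" also
   gets 4, since 64-bit immediates are encoded as 32-bit sign-extended
   values.  */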
23425 /* Compute default value for "length_address" attribute. */
23426 int
23427 ix86_attr_length_address_default (rtx insn)
23428 {
23429 int i;
23430
23431 if (get_attr_type (insn) == TYPE_LEA)
23432 {
23433 rtx set = PATTERN (insn), addr;
23434
23435 if (GET_CODE (set) == PARALLEL)
23436 set = XVECEXP (set, 0, 0);
23437
23438 gcc_assert (GET_CODE (set) == SET);
23439
23440 addr = SET_SRC (set);
23441 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23442 {
23443 if (GET_CODE (addr) == ZERO_EXTEND)
23444 addr = XEXP (addr, 0);
23445 if (GET_CODE (addr) == SUBREG)
23446 addr = SUBREG_REG (addr);
23447 }
23448
23449 return memory_address_length (addr);
23450 }
23451
23452 extract_insn_cached (insn);
23453 for (i = recog_data.n_operands - 1; i >= 0; --i)
23454 if (MEM_P (recog_data.operand[i]))
23455 {
23456 constrain_operands_cached (reload_completed);
23457 if (which_alternative != -1)
23458 {
23459 const char *constraints = recog_data.constraints[i];
23460 int alt = which_alternative;
23461
23462 while (*constraints == '=' || *constraints == '+')
23463 constraints++;
23464 while (alt-- > 0)
23465 while (*constraints++ != ',')
23466 ;
23467 /* Skip ignored operands. */
23468 if (*constraints == 'X')
23469 continue;
23470 }
23471 return memory_address_length (XEXP (recog_data.operand[i], 0));
23472 }
23473 return 0;
23474 }
23475
23476 /* Compute default value for "length_vex" attribute. It includes
23477 2 or 3 byte VEX prefix and 1 opcode byte. */
23478
23479 int
23480 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23481 {
23482 int i;
23483
23484 /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX W bit uses
23485 the 3-byte VEX prefix. */
23486 if (!has_0f_opcode || has_vex_w)
23487 return 3 + 1;
23488
23489 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
23490 if (!TARGET_64BIT)
23491 return 2 + 1;
23492
23493 extract_insn_cached (insn);
23494
23495 for (i = recog_data.n_operands - 1; i >= 0; --i)
23496 if (REG_P (recog_data.operand[i]))
23497 {
23498 /* REX.W bit uses 3 byte VEX prefix. */
23499 if (GET_MODE (recog_data.operand[i]) == DImode
23500 && GENERAL_REG_P (recog_data.operand[i]))
23501 return 3 + 1;
23502 }
23503 else
23504 {
23505 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23506 if (MEM_P (recog_data.operand[i])
23507 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23508 return 3 + 1;
23509 }
23510
23511 return 2 + 1;
23512 }
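
/* Illustrative summary (a sketch, not exhaustive): an insn outside the 0f
   opcode map, or one that needs VEX.W, is counted as 3 + 1; in 64-bit mode
   an insn with a DImode general register operand, or a memory operand whose
   address mentions %r8-%r15, is also counted as 3 + 1; everything else is
   counted as 2 + 1.  */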
23513 \f
23514 /* Return the maximum number of instructions a cpu can issue. */
23515
23516 static int
23517 ix86_issue_rate (void)
23518 {
23519 switch (ix86_tune)
23520 {
23521 case PROCESSOR_PENTIUM:
23522 case PROCESSOR_ATOM:
23523 case PROCESSOR_K6:
23524 return 2;
23525
23526 case PROCESSOR_PENTIUMPRO:
23527 case PROCESSOR_PENTIUM4:
23528 case PROCESSOR_CORE2_32:
23529 case PROCESSOR_CORE2_64:
23530 case PROCESSOR_COREI7_32:
23531 case PROCESSOR_COREI7_64:
23532 case PROCESSOR_ATHLON:
23533 case PROCESSOR_K8:
23534 case PROCESSOR_AMDFAM10:
23535 case PROCESSOR_NOCONA:
23536 case PROCESSOR_GENERIC32:
23537 case PROCESSOR_GENERIC64:
23538 case PROCESSOR_BDVER1:
23539 case PROCESSOR_BDVER2:
23540 case PROCESSOR_BTVER1:
23541 return 3;
23542
23543 default:
23544 return 1;
23545 }
23546 }
23547
23548 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
23549 by DEP_INSN and nothing else set by DEP_INSN. */
23550
23551 static bool
23552 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23553 {
23554 rtx set, set2;
23555
23556 /* Simplify the test for uninteresting insns. */
23557 if (insn_type != TYPE_SETCC
23558 && insn_type != TYPE_ICMOV
23559 && insn_type != TYPE_FCMOV
23560 && insn_type != TYPE_IBR)
23561 return false;
23562
23563 if ((set = single_set (dep_insn)) != 0)
23564 {
23565 set = SET_DEST (set);
23566 set2 = NULL_RTX;
23567 }
23568 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23569 && XVECLEN (PATTERN (dep_insn), 0) == 2
23570 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23571 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23572 {
23573 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23574 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23575 }
23576 else
23577 return false;
23578
23579 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23580 return false;
23581
23582 /* This test is true if the dependent insn reads the flags but
23583 not any other potentially set register. */
23584 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23585 return false;
23586
23587 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23588 return false;
23589
23590 return true;
23591 }
23592
23593 /* Return true iff USE_INSN has a memory address with operands set by
23594 SET_INSN. */
23595
23596 bool
23597 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23598 {
23599 int i;
23600 extract_insn_cached (use_insn);
23601 for (i = recog_data.n_operands - 1; i >= 0; --i)
23602 if (MEM_P (recog_data.operand[i]))
23603 {
23604 rtx addr = XEXP (recog_data.operand[i], 0);
23605 return modified_in_p (addr, set_insn) != 0;
23606 }
23607 return false;
23608 }
23609
23610 static int
23611 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23612 {
23613 enum attr_type insn_type, dep_insn_type;
23614 enum attr_memory memory;
23615 rtx set, set2;
23616 int dep_insn_code_number;
23617
23618 /* Anti and output dependencies have zero cost on all CPUs. */
23619 if (REG_NOTE_KIND (link) != 0)
23620 return 0;
23621
23622 dep_insn_code_number = recog_memoized (dep_insn);
23623
23624 /* If we can't recognize the insns, we can't really do anything. */
23625 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23626 return cost;
23627
23628 insn_type = get_attr_type (insn);
23629 dep_insn_type = get_attr_type (dep_insn);
23630
23631 switch (ix86_tune)
23632 {
23633 case PROCESSOR_PENTIUM:
23634 /* Address Generation Interlock adds a cycle of latency. */
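/* For illustration: on the original Pentium, a load such as
   "movl 4(%eax), %ebx" pays one extra cycle if %eax was written by the
   immediately preceding instruction, because the address is generated
   one pipeline stage before the previous result becomes available.  */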
23635 if (insn_type == TYPE_LEA)
23636 {
23637 rtx addr = PATTERN (insn);
23638
23639 if (GET_CODE (addr) == PARALLEL)
23640 addr = XVECEXP (addr, 0, 0);
23641
23642 gcc_assert (GET_CODE (addr) == SET);
23643
23644 addr = SET_SRC (addr);
23645 if (modified_in_p (addr, dep_insn))
23646 cost += 1;
23647 }
23648 else if (ix86_agi_dependent (dep_insn, insn))
23649 cost += 1;
23650
23651 /* ??? Compares pair with jump/setcc. */
23652 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23653 cost = 0;
23654
23655 /* Floating point stores require value to be ready one cycle earlier. */
23656 if (insn_type == TYPE_FMOV
23657 && get_attr_memory (insn) == MEMORY_STORE
23658 && !ix86_agi_dependent (dep_insn, insn))
23659 cost += 1;
23660 break;
23661
23662 case PROCESSOR_PENTIUMPRO:
23663 memory = get_attr_memory (insn);
23664
23665 /* INT->FP conversion is expensive. */
23666 if (get_attr_fp_int_src (dep_insn))
23667 cost += 5;
23668
23669 /* There is one cycle extra latency between an FP op and a store. */
23670 if (insn_type == TYPE_FMOV
23671 && (set = single_set (dep_insn)) != NULL_RTX
23672 && (set2 = single_set (insn)) != NULL_RTX
23673 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23674 && MEM_P (SET_DEST (set2)))
23675 cost += 1;
23676
23677 /* Show the ability of the reorder buffer to hide the latency of a load
23678 by executing it in parallel with the previous instruction when the
23679 previous instruction is not needed to compute the address. */
23680 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23681 && !ix86_agi_dependent (dep_insn, insn))
23682 {
23683 /* Claim moves to take one cycle, as the core can issue one load
23684 at a time and the next load can start a cycle later. */
23685 if (dep_insn_type == TYPE_IMOV
23686 || dep_insn_type == TYPE_FMOV)
23687 cost = 1;
23688 else if (cost > 1)
23689 cost--;
23690 }
23691 break;
23692
23693 case PROCESSOR_K6:
23694 memory = get_attr_memory (insn);
23695
23696 /* The esp dependency is resolved before the instruction is really
23697 finished. */
23698 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23699 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23700 return 1;
23701
23702 /* INT->FP conversion is expensive. */
23703 if (get_attr_fp_int_src (dep_insn))
23704 cost += 5;
23705
23706 /* Show the ability of the reorder buffer to hide the latency of a load
23707 by executing it in parallel with the previous instruction when the
23708 previous instruction is not needed to compute the address. */
23709 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23710 && !ix86_agi_dependent (dep_insn, insn))
23711 {
23712 /* Claim moves to take one cycle, as the core can issue one load
23713 at a time and the next load can start a cycle later. */
23714 if (dep_insn_type == TYPE_IMOV
23715 || dep_insn_type == TYPE_FMOV)
23716 cost = 1;
23717 else if (cost > 2)
23718 cost -= 2;
23719 else
23720 cost = 1;
23721 }
23722 break;
23723
23724 case PROCESSOR_ATHLON:
23725 case PROCESSOR_K8:
23726 case PROCESSOR_AMDFAM10:
23727 case PROCESSOR_BDVER1:
23728 case PROCESSOR_BDVER2:
23729 case PROCESSOR_BTVER1:
23730 case PROCESSOR_ATOM:
23731 case PROCESSOR_GENERIC32:
23732 case PROCESSOR_GENERIC64:
23733 memory = get_attr_memory (insn);
23734
23735 /* Show the ability of the reorder buffer to hide the latency of a load
23736 by executing it in parallel with the previous instruction when the
23737 previous instruction is not needed to compute the address. */
23738 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23739 && !ix86_agi_dependent (dep_insn, insn))
23740 {
23741 enum attr_unit unit = get_attr_unit (insn);
23742 int loadcost = 3;
23743
23744 /* Because of the difference between the length of integer and
23745 floating unit pipeline preparation stages, the memory operands
23746 for floating point are cheaper.
23747
23748 ??? For Athlon the difference is most probably 2. */
23749 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23750 loadcost = 3;
23751 else
23752 loadcost = TARGET_ATHLON ? 2 : 0;
23753
23754 if (cost >= loadcost)
23755 cost -= loadcost;
23756 else
23757 cost = 0;
23758 }
23759
23760 default:
23761 break;
23762 }
23763
23764 return cost;
23765 }
23766
23767 /* How many alternative schedules to try. This should be as wide as the
23768 scheduling freedom in the DFA, but no wider. Making this value too
23769 large results in extra work for the scheduler. */
23770
23771 static int
23772 ia32_multipass_dfa_lookahead (void)
23773 {
23774 switch (ix86_tune)
23775 {
23776 case PROCESSOR_PENTIUM:
23777 return 2;
23778
23779 case PROCESSOR_PENTIUMPRO:
23780 case PROCESSOR_K6:
23781 return 1;
23782
23783 case PROCESSOR_CORE2_32:
23784 case PROCESSOR_CORE2_64:
23785 case PROCESSOR_COREI7_32:
23786 case PROCESSOR_COREI7_64:
23787 /* Generally, we want haifa-sched:max_issue() to look ahead as far
23788 as the number of instructions that can be issued in one cycle, i.e.,
23789 issue_rate. I wonder why tuning for many CPUs does not do this. */
23790 return ix86_issue_rate ();
23791
23792 default:
23793 return 0;
23794 }
23795 }
23796
23797 \f
23798
23799 /* Model the decoder of Core 2/i7.
23800 The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
23801 track the instruction fetch block boundaries and make sure that long
23802 (9+ byte) instructions are assigned to D0. */
23803
23804 /* Maximum length of an insn that can be handled by
23805 a secondary decoder unit. '8' for Core 2/i7. */
23806 static int core2i7_secondary_decoder_max_insn_size;
23807
23808 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
23809 '16' for Core 2/i7. */
23810 static int core2i7_ifetch_block_size;
23811
23812 /* Maximum number of instructions decoder can handle per cycle.
23813 '6' for Core 2/i7. */
23814 static int core2i7_ifetch_block_max_insns;
23815
23816 typedef struct ix86_first_cycle_multipass_data_ *
23817 ix86_first_cycle_multipass_data_t;
23818 typedef const struct ix86_first_cycle_multipass_data_ *
23819 const_ix86_first_cycle_multipass_data_t;
23820
23821 /* A variable to store target state across calls to max_issue within
23822 one cycle. */
23823 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
23824 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
23825
23826 /* Initialize DATA. */
23827 static void
23828 core2i7_first_cycle_multipass_init (void *_data)
23829 {
23830 ix86_first_cycle_multipass_data_t data
23831 = (ix86_first_cycle_multipass_data_t) _data;
23832
23833 data->ifetch_block_len = 0;
23834 data->ifetch_block_n_insns = 0;
23835 data->ready_try_change = NULL;
23836 data->ready_try_change_size = 0;
23837 }
23838
23839 /* Advancing the cycle; reset ifetch block counts. */
23840 static void
23841 core2i7_dfa_post_advance_cycle (void)
23842 {
23843 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
23844
23845 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23846
23847 data->ifetch_block_len = 0;
23848 data->ifetch_block_n_insns = 0;
23849 }
23850
23851 static int min_insn_size (rtx);
23852
23853 /* Filter out insns from ready_try that the core will not be able to issue
23854 on the current cycle due to decoder restrictions. */
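/* For illustration, using the Core 2/i7 parameters set up in
   ix86_sched_init_global (16-byte fetch block, at most 6 insns per
   block, 8-byte limit for the secondary decoders): an insn of size 5
   is masked out once ifetch_block_len has reached 12, since 12 + 5
   would overflow the 16-byte block; it becomes schedulable again after
   core2i7_dfa_post_advance_cycle resets the counters.  */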
23855 static void
23856 core2i7_first_cycle_multipass_filter_ready_try
23857 (const_ix86_first_cycle_multipass_data_t data,
23858 char *ready_try, int n_ready, bool first_cycle_insn_p)
23859 {
23860 while (n_ready--)
23861 {
23862 rtx insn;
23863 int insn_size;
23864
23865 if (ready_try[n_ready])
23866 continue;
23867
23868 insn = get_ready_element (n_ready);
23869 insn_size = min_insn_size (insn);
23870
23871 if (/* If this insn is too long for a secondary decoder ... */
23872 (!first_cycle_insn_p
23873 && insn_size > core2i7_secondary_decoder_max_insn_size)
23874 /* ... or it would not fit into the ifetch block ... */
23875 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
23876 /* ... or the decoder is full already ... */
23877 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
23878 /* ... mask the insn out. */
23879 {
23880 ready_try[n_ready] = 1;
23881
23882 if (data->ready_try_change)
23883 SET_BIT (data->ready_try_change, n_ready);
23884 }
23885 }
23886 }
23887
23888 /* Prepare for a new round of multipass lookahead scheduling. */
23889 static void
23890 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
23891 bool first_cycle_insn_p)
23892 {
23893 ix86_first_cycle_multipass_data_t data
23894 = (ix86_first_cycle_multipass_data_t) _data;
23895 const_ix86_first_cycle_multipass_data_t prev_data
23896 = ix86_first_cycle_multipass_data;
23897
23898 /* Restore the state from the end of the previous round. */
23899 data->ifetch_block_len = prev_data->ifetch_block_len;
23900 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
23901
23902 /* Filter instructions that cannot be issued on the current cycle due to
23903 decoder restrictions. */
23904 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23905 first_cycle_insn_p);
23906 }
23907
23908 /* INSN is being issued in current solution. Account for its impact on
23909 the decoder model. */
23910 static void
23911 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
23912 rtx insn, const void *_prev_data)
23913 {
23914 ix86_first_cycle_multipass_data_t data
23915 = (ix86_first_cycle_multipass_data_t) _data;
23916 const_ix86_first_cycle_multipass_data_t prev_data
23917 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
23918
23919 int insn_size = min_insn_size (insn);
23920
23921 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
23922 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
23923 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
23924 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23925
23926 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
23927 if (!data->ready_try_change)
23928 {
23929 data->ready_try_change = sbitmap_alloc (n_ready);
23930 data->ready_try_change_size = n_ready;
23931 }
23932 else if (data->ready_try_change_size < n_ready)
23933 {
23934 data->ready_try_change = sbitmap_resize (data->ready_try_change,
23935 n_ready, 0);
23936 data->ready_try_change_size = n_ready;
23937 }
23938 sbitmap_zero (data->ready_try_change);
23939
23940 /* Filter out insns from ready_try that the core will not be able to issue
23941 on the current cycle due to decoder restrictions. */
23942 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23943 false);
23944 }
23945
23946 /* Revert the effect on ready_try. */
23947 static void
23948 core2i7_first_cycle_multipass_backtrack (const void *_data,
23949 char *ready_try,
23950 int n_ready ATTRIBUTE_UNUSED)
23951 {
23952 const_ix86_first_cycle_multipass_data_t data
23953 = (const_ix86_first_cycle_multipass_data_t) _data;
23954 unsigned int i = 0;
23955 sbitmap_iterator sbi;
23956
23957 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
23958 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
23959 {
23960 ready_try[i] = 0;
23961 }
23962 }
23963
23964 /* Save the result of multipass lookahead scheduling for the next round. */
23965 static void
23966 core2i7_first_cycle_multipass_end (const void *_data)
23967 {
23968 const_ix86_first_cycle_multipass_data_t data
23969 = (const_ix86_first_cycle_multipass_data_t) _data;
23970 ix86_first_cycle_multipass_data_t next_data
23971 = ix86_first_cycle_multipass_data;
23972
23973 if (data != NULL)
23974 {
23975 next_data->ifetch_block_len = data->ifetch_block_len;
23976 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
23977 }
23978 }
23979
23980 /* Deallocate target data. */
23981 static void
23982 core2i7_first_cycle_multipass_fini (void *_data)
23983 {
23984 ix86_first_cycle_multipass_data_t data
23985 = (ix86_first_cycle_multipass_data_t) _data;
23986
23987 if (data->ready_try_change)
23988 {
23989 sbitmap_free (data->ready_try_change);
23990 data->ready_try_change = NULL;
23991 data->ready_try_change_size = 0;
23992 }
23993 }
23994
23995 /* Prepare for scheduling pass. */
23996 static void
23997 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
23998 int verbose ATTRIBUTE_UNUSED,
23999 int max_uid ATTRIBUTE_UNUSED)
24000 {
24001 /* Install scheduling hooks for current CPU. Some of these hooks are used
24002 in time-critical parts of the scheduler, so we only set them up when
24003 they are actually used. */
24004 switch (ix86_tune)
24005 {
24006 case PROCESSOR_CORE2_32:
24007 case PROCESSOR_CORE2_64:
24008 case PROCESSOR_COREI7_32:
24009 case PROCESSOR_COREI7_64:
24010 targetm.sched.dfa_post_advance_cycle
24011 = core2i7_dfa_post_advance_cycle;
24012 targetm.sched.first_cycle_multipass_init
24013 = core2i7_first_cycle_multipass_init;
24014 targetm.sched.first_cycle_multipass_begin
24015 = core2i7_first_cycle_multipass_begin;
24016 targetm.sched.first_cycle_multipass_issue
24017 = core2i7_first_cycle_multipass_issue;
24018 targetm.sched.first_cycle_multipass_backtrack
24019 = core2i7_first_cycle_multipass_backtrack;
24020 targetm.sched.first_cycle_multipass_end
24021 = core2i7_first_cycle_multipass_end;
24022 targetm.sched.first_cycle_multipass_fini
24023 = core2i7_first_cycle_multipass_fini;
24024
24025 /* Set decoder parameters. */
24026 core2i7_secondary_decoder_max_insn_size = 8;
24027 core2i7_ifetch_block_size = 16;
24028 core2i7_ifetch_block_max_insns = 6;
24029 break;
24030
24031 default:
24032 targetm.sched.dfa_post_advance_cycle = NULL;
24033 targetm.sched.first_cycle_multipass_init = NULL;
24034 targetm.sched.first_cycle_multipass_begin = NULL;
24035 targetm.sched.first_cycle_multipass_issue = NULL;
24036 targetm.sched.first_cycle_multipass_backtrack = NULL;
24037 targetm.sched.first_cycle_multipass_end = NULL;
24038 targetm.sched.first_cycle_multipass_fini = NULL;
24039 break;
24040 }
24041 }
24042
24043 \f
24044 /* Compute the alignment given to a constant that is being placed in memory.
24045 EXP is the constant and ALIGN is the alignment that the object would
24046 ordinarily have.
24047 The value of this function is used instead of that alignment to align
24048 the object. */
24049
24050 int
24051 ix86_constant_alignment (tree exp, int align)
24052 {
24053 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24054 || TREE_CODE (exp) == INTEGER_CST)
24055 {
24056 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24057 return 64;
24058 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24059 return 128;
24060 }
24061 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24062 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24063 return BITS_PER_WORD;
24064
24065 return align;
24066 }
24067
24068 /* Compute the alignment for a static variable.
24069 TYPE is the data type, and ALIGN is the alignment that
24070 the object would ordinarily have. The value of this function is used
24071 instead of that alignment to align the object. */
24072
24073 int
24074 ix86_data_alignment (tree type, int align)
24075 {
24076 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24077
24078 if (AGGREGATE_TYPE_P (type)
24079 && TYPE_SIZE (type)
24080 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24081 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24082 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24083 && align < max_align)
24084 align = max_align;
24085
24086 /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
24087 to a 16-byte boundary. */
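/* For illustration: under this rule a 32-byte global array such as
   "static char buf[32];" is given 128-bit alignment, which lets the
   vectorizer use aligned 16-byte SSE loads and stores on it.  */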
24088 if (TARGET_64BIT)
24089 {
24090 if (AGGREGATE_TYPE_P (type)
24091 && TYPE_SIZE (type)
24092 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24093 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24094 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24095 return 128;
24096 }
24097
24098 if (TREE_CODE (type) == ARRAY_TYPE)
24099 {
24100 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24101 return 64;
24102 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24103 return 128;
24104 }
24105 else if (TREE_CODE (type) == COMPLEX_TYPE)
24106 {
24107
24108 if (TYPE_MODE (type) == DCmode && align < 64)
24109 return 64;
24110 if ((TYPE_MODE (type) == XCmode
24111 || TYPE_MODE (type) == TCmode) && align < 128)
24112 return 128;
24113 }
24114 else if ((TREE_CODE (type) == RECORD_TYPE
24115 || TREE_CODE (type) == UNION_TYPE
24116 || TREE_CODE (type) == QUAL_UNION_TYPE)
24117 && TYPE_FIELDS (type))
24118 {
24119 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24120 return 64;
24121 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24122 return 128;
24123 }
24124 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24125 || TREE_CODE (type) == INTEGER_TYPE)
24126 {
24127 if (TYPE_MODE (type) == DFmode && align < 64)
24128 return 64;
24129 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24130 return 128;
24131 }
24132
24133 return align;
24134 }
24135
24136 /* Compute the alignment for a local variable or a stack slot. EXP is
24137 the data type or decl itself, MODE is the widest mode available and
24138 ALIGN is the alignment that the object would ordinarily have. The
24139 value of this macro is used instead of that alignment to align the
24140 object. */
24141
24142 unsigned int
24143 ix86_local_alignment (tree exp, enum machine_mode mode,
24144 unsigned int align)
24145 {
24146 tree type, decl;
24147
24148 if (exp && DECL_P (exp))
24149 {
24150 type = TREE_TYPE (exp);
24151 decl = exp;
24152 }
24153 else
24154 {
24155 type = exp;
24156 decl = NULL;
24157 }
24158
24159 /* Don't do dynamic stack realignment for long long objects with
24160 -mpreferred-stack-boundary=2. */
24161 if (!TARGET_64BIT
24162 && align == 64
24163 && ix86_preferred_stack_boundary < 64
24164 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24165 && (!type || !TYPE_USER_ALIGN (type))
24166 && (!decl || !DECL_USER_ALIGN (decl)))
24167 align = 32;
24168
24169 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
24170 register in MODE. We will return the largest alignment of XFmode
24171 and DFmode. */
24172 if (!type)
24173 {
24174 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24175 align = GET_MODE_ALIGNMENT (DFmode);
24176 return align;
24177 }
24178
24179 /* The x86-64 ABI requires arrays of 16 bytes or larger to be aligned
24180 to a 16-byte boundary. The exact wording is:
24181
24182 An array uses the same alignment as its elements, except that a local or
24183 global array variable of length at least 16 bytes or
24184 a C99 variable-length array variable always has alignment of at least 16 bytes.
24185
24186 This was added to allow use of aligned SSE instructions on arrays. The
24187 rule is meant for static storage (where the compiler cannot do the
24188 analysis by itself). We follow it for automatic variables only when
24189 convenient: we fully control everything in the function being compiled,
24190 and functions from other units cannot rely on the alignment.
24191
24192 Exclude the va_list type. It is the common case of a local array where
24193 we cannot benefit from the alignment. */
24194 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24195 && TARGET_SSE)
24196 {
24197 if (AGGREGATE_TYPE_P (type)
24198 && (va_list_type_node == NULL_TREE
24199 || (TYPE_MAIN_VARIANT (type)
24200 != TYPE_MAIN_VARIANT (va_list_type_node)))
24201 && TYPE_SIZE (type)
24202 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24203 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24204 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24205 return 128;
24206 }
24207 if (TREE_CODE (type) == ARRAY_TYPE)
24208 {
24209 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24210 return 64;
24211 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24212 return 128;
24213 }
24214 else if (TREE_CODE (type) == COMPLEX_TYPE)
24215 {
24216 if (TYPE_MODE (type) == DCmode && align < 64)
24217 return 64;
24218 if ((TYPE_MODE (type) == XCmode
24219 || TYPE_MODE (type) == TCmode) && align < 128)
24220 return 128;
24221 }
24222 else if ((TREE_CODE (type) == RECORD_TYPE
24223 || TREE_CODE (type) == UNION_TYPE
24224 || TREE_CODE (type) == QUAL_UNION_TYPE)
24225 && TYPE_FIELDS (type))
24226 {
24227 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24228 return 64;
24229 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24230 return 128;
24231 }
24232 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24233 || TREE_CODE (type) == INTEGER_TYPE)
24234 {
24235
24236 if (TYPE_MODE (type) == DFmode && align < 64)
24237 return 64;
24238 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24239 return 128;
24240 }
24241 return align;
24242 }
24243
24244 /* Compute the minimum required alignment for dynamic stack realignment
24245 purposes for a local variable, parameter or a stack slot. EXP is
24246 the data type or decl itself, MODE is its mode and ALIGN is the
24247 alignment that the object would ordinarily have. */
24248
24249 unsigned int
24250 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24251 unsigned int align)
24252 {
24253 tree type, decl;
24254
24255 if (exp && DECL_P (exp))
24256 {
24257 type = TREE_TYPE (exp);
24258 decl = exp;
24259 }
24260 else
24261 {
24262 type = exp;
24263 decl = NULL;
24264 }
24265
24266 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24267 return align;
24268
24269 /* Don't do dynamic stack realignment for long long objects with
24270 -mpreferred-stack-boundary=2. */
24271 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24272 && (!type || !TYPE_USER_ALIGN (type))
24273 && (!decl || !DECL_USER_ALIGN (decl)))
24274 return 32;
24275
24276 return align;
24277 }
24278 \f
24279 /* Find a location for the static chain incoming to a nested function.
24280 This is a register, unless all free registers are used by arguments. */
24281
24282 static rtx
24283 ix86_static_chain (const_tree fndecl, bool incoming_p)
24284 {
24285 unsigned regno;
24286
24287 if (!DECL_STATIC_CHAIN (fndecl))
24288 return NULL;
24289
24290 if (TARGET_64BIT)
24291 {
24292 /* We always use R10 in 64-bit mode. */
24293 regno = R10_REG;
24294 }
24295 else
24296 {
24297 tree fntype;
24298 unsigned int ccvt;
24299
24300 /* By default in 32-bit mode we use ECX to pass the static chain. */
24301 regno = CX_REG;
24302
24303 fntype = TREE_TYPE (fndecl);
24304 ccvt = ix86_get_callcvt (fntype);
24305 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24306 {
24307 /* Fastcall functions use ecx/edx for arguments, which leaves
24308 us with EAX for the static chain.
24309 Thiscall functions use ecx for arguments, which also
24310 leaves us with EAX for the static chain. */
24311 regno = AX_REG;
24312 }
24313 else if (ix86_function_regparm (fntype, fndecl) == 3)
24314 {
24315 /* For regparm 3, we have no free call-clobbered registers in
24316 which to store the static chain. In order to implement this,
24317 we have the trampoline push the static chain to the stack.
24318 However, we can't push a value below the return address when
24319 we call the nested function directly, so we have to use an
24320 alternate entry point. For this we use ESI, and have the
24321 alternate entry point push ESI, so that things appear the
24322 same once we're executing the nested function. */
24323 if (incoming_p)
24324 {
24325 if (fndecl == current_function_decl)
24326 ix86_static_chain_on_stack = true;
24327 return gen_frame_mem (SImode,
24328 plus_constant (arg_pointer_rtx, -8));
24329 }
24330 regno = SI_REG;
24331 }
24332 }
24333
24334 return gen_rtx_REG (Pmode, regno);
24335 }
24336
24337 /* Emit RTL insns to initialize the variable parts of a trampoline.
24338 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24339 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24340 to be passed to the target function. */
24341
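/* For illustration, derived from the opcode constants emitted below,
   the 64-bit trampoline is laid out as

       41 bb imm32   movl   $fnaddr, %r11d   (or 49 bb imm64, movabs)
       41 ba imm32   movl   $chain,  %r10d   (or 49 ba imm64, movabs)
       49 ff e3 90   jmp    *%r11; nop

   and the 32-bit trampoline as

       b8/b9 imm32   movl   $chain, %eax/%ecx   (or 68 imm32, pushl)
       e9 rel32      jmp    fnaddr  */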
24342 static void
24343 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24344 {
24345 rtx mem, fnaddr;
24346 int opcode;
24347 int offset = 0;
24348
24349 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24350
24351 if (TARGET_64BIT)
24352 {
24353 int size;
24354
24355 /* Load the function address into r11. Try to load the address using
24356 the shorter movl instead of movabs. We may want to support
24357 movq for kernel mode, but the kernel does not use trampolines at
24358 the moment. FNADDR is a 32-bit address and may not be in
24359 DImode when ptr_mode == SImode. Always use movl in this
24360 case. */
24361 if (ptr_mode == SImode
24362 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24363 {
24364 fnaddr = copy_to_mode_reg (Pmode, fnaddr);
24365
24366 mem = adjust_address (m_tramp, HImode, offset);
24367 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24368
24369 mem = adjust_address (m_tramp, SImode, offset + 2);
24370 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24371 offset += 6;
24372 }
24373 else
24374 {
24375 mem = adjust_address (m_tramp, HImode, offset);
24376 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24377
24378 mem = adjust_address (m_tramp, DImode, offset + 2);
24379 emit_move_insn (mem, fnaddr);
24380 offset += 10;
24381 }
24382
24383 /* Load static chain using movabs to r10. Use the shorter movl
24384 instead of movabs when ptr_mode == SImode. */
24385 if (ptr_mode == SImode)
24386 {
24387 opcode = 0xba41;
24388 size = 6;
24389 }
24390 else
24391 {
24392 opcode = 0xba49;
24393 size = 10;
24394 }
24395
24396 mem = adjust_address (m_tramp, HImode, offset);
24397 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24398
24399 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24400 emit_move_insn (mem, chain_value);
24401 offset += size;
24402
24403 /* Jump to r11; the last (unused) byte is a nop, only there to
24404 pad the write out to a single 32-bit store. */
24405 mem = adjust_address (m_tramp, SImode, offset);
24406 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24407 offset += 4;
24408 }
24409 else
24410 {
24411 rtx disp, chain;
24412
24413 /* Depending on the static chain location, either load a register
24414 with a constant, or push the constant to the stack. All of the
24415 instructions are the same size. */
24416 chain = ix86_static_chain (fndecl, true);
24417 if (REG_P (chain))
24418 {
24419 switch (REGNO (chain))
24420 {
24421 case AX_REG:
24422 opcode = 0xb8; break;
24423 case CX_REG:
24424 opcode = 0xb9; break;
24425 default:
24426 gcc_unreachable ();
24427 }
24428 }
24429 else
24430 opcode = 0x68;
24431
24432 mem = adjust_address (m_tramp, QImode, offset);
24433 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24434
24435 mem = adjust_address (m_tramp, SImode, offset + 1);
24436 emit_move_insn (mem, chain_value);
24437 offset += 5;
24438
24439 mem = adjust_address (m_tramp, QImode, offset);
24440 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24441
24442 mem = adjust_address (m_tramp, SImode, offset + 1);
24443
24444 /* Compute offset from the end of the jmp to the target function.
24445 In the case in which the trampoline stores the static chain on
24446 the stack, we need to skip the first insn which pushes the
24447 (call-saved) register static chain; this push is 1 byte. */
24448 offset += 5;
24449 disp = expand_binop (SImode, sub_optab, fnaddr,
24450 plus_constant (XEXP (m_tramp, 0),
24451 offset - (MEM_P (chain) ? 1 : 0)),
24452 NULL_RTX, 1, OPTAB_DIRECT);
24453 emit_move_insn (mem, disp);
24454 }
24455
24456 gcc_assert (offset <= TRAMPOLINE_SIZE);
24457
24458 #ifdef HAVE_ENABLE_EXECUTE_STACK
24459 #ifdef CHECK_EXECUTE_STACK_ENABLED
24460 if (CHECK_EXECUTE_STACK_ENABLED)
24461 #endif
24462 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24463 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24464 #endif
24465 }
24466 \f
24467 /* The following file contains several enumerations and data structures
24468 built from the definitions in i386-builtin-types.def. */
24469
24470 #include "i386-builtin-types.inc"
24471
24472 /* Table for the ix86 builtin non-function types. */
24473 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24474
24475 /* Retrieve an element from the above table, building some of
24476 the types lazily. */
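/* For illustration: type codes after IX86_BT_LAST_PRIM up to
   IX86_BT_LAST_VECT name vector types and are built from their element
   type and machine mode via build_vector_type_for_mode; the remaining
   codes name (possibly const-qualified) pointer types and are built
   with build_pointer_type.  Each result is memoized in the table above,
   so the work is done at most once per type code.  */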
24477
24478 static tree
24479 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24480 {
24481 unsigned int index;
24482 tree type, itype;
24483
24484 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24485
24486 type = ix86_builtin_type_tab[(int) tcode];
24487 if (type != NULL)
24488 return type;
24489
24490 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24491 if (tcode <= IX86_BT_LAST_VECT)
24492 {
24493 enum machine_mode mode;
24494
24495 index = tcode - IX86_BT_LAST_PRIM - 1;
24496 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24497 mode = ix86_builtin_type_vect_mode[index];
24498
24499 type = build_vector_type_for_mode (itype, mode);
24500 }
24501 else
24502 {
24503 int quals;
24504
24505 index = tcode - IX86_BT_LAST_VECT - 1;
24506 if (tcode <= IX86_BT_LAST_PTR)
24507 quals = TYPE_UNQUALIFIED;
24508 else
24509 quals = TYPE_QUAL_CONST;
24510
24511 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24512 if (quals != TYPE_UNQUALIFIED)
24513 itype = build_qualified_type (itype, quals);
24514
24515 type = build_pointer_type (itype);
24516 }
24517
24518 ix86_builtin_type_tab[(int) tcode] = type;
24519 return type;
24520 }
24521
24522 /* Table for the ix86 builtin function types. */
24523 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24524
24525 /* Retrieve an element from the above table, building some of
24526 the types lazily. */
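/* For illustration: for a non-alias code, the slice
   ix86_builtin_func_args[start .. after) generated from
   i386-builtin-types.def holds the return type code followed by the
   argument type codes in order; the loop below walks it backwards so
   that the resulting TREE_LIST of arguments ends up in source order,
   terminated by void_list_node.  Alias codes simply reuse the type of
   the function they alias.  */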
24527
24528 static tree
24529 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24530 {
24531 tree type;
24532
24533 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24534
24535 type = ix86_builtin_func_type_tab[(int) tcode];
24536 if (type != NULL)
24537 return type;
24538
24539 if (tcode <= IX86_BT_LAST_FUNC)
24540 {
24541 unsigned start = ix86_builtin_func_start[(int) tcode];
24542 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24543 tree rtype, atype, args = void_list_node;
24544 unsigned i;
24545
24546 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24547 for (i = after - 1; i > start; --i)
24548 {
24549 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24550 args = tree_cons (NULL, atype, args);
24551 }
24552
24553 type = build_function_type (rtype, args);
24554 }
24555 else
24556 {
24557 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24558 enum ix86_builtin_func_type icode;
24559
24560 icode = ix86_builtin_func_alias_base[index];
24561 type = ix86_get_builtin_func_type (icode);
24562 }
24563
24564 ix86_builtin_func_type_tab[(int) tcode] = type;
24565 return type;
24566 }
24567
24568
24569 /* Codes for all the SSE/MMX builtins. */
24570 enum ix86_builtins
24571 {
24572 IX86_BUILTIN_ADDPS,
24573 IX86_BUILTIN_ADDSS,
24574 IX86_BUILTIN_DIVPS,
24575 IX86_BUILTIN_DIVSS,
24576 IX86_BUILTIN_MULPS,
24577 IX86_BUILTIN_MULSS,
24578 IX86_BUILTIN_SUBPS,
24579 IX86_BUILTIN_SUBSS,
24580
24581 IX86_BUILTIN_CMPEQPS,
24582 IX86_BUILTIN_CMPLTPS,
24583 IX86_BUILTIN_CMPLEPS,
24584 IX86_BUILTIN_CMPGTPS,
24585 IX86_BUILTIN_CMPGEPS,
24586 IX86_BUILTIN_CMPNEQPS,
24587 IX86_BUILTIN_CMPNLTPS,
24588 IX86_BUILTIN_CMPNLEPS,
24589 IX86_BUILTIN_CMPNGTPS,
24590 IX86_BUILTIN_CMPNGEPS,
24591 IX86_BUILTIN_CMPORDPS,
24592 IX86_BUILTIN_CMPUNORDPS,
24593 IX86_BUILTIN_CMPEQSS,
24594 IX86_BUILTIN_CMPLTSS,
24595 IX86_BUILTIN_CMPLESS,
24596 IX86_BUILTIN_CMPNEQSS,
24597 IX86_BUILTIN_CMPNLTSS,
24598 IX86_BUILTIN_CMPNLESS,
24599 IX86_BUILTIN_CMPNGTSS,
24600 IX86_BUILTIN_CMPNGESS,
24601 IX86_BUILTIN_CMPORDSS,
24602 IX86_BUILTIN_CMPUNORDSS,
24603
24604 IX86_BUILTIN_COMIEQSS,
24605 IX86_BUILTIN_COMILTSS,
24606 IX86_BUILTIN_COMILESS,
24607 IX86_BUILTIN_COMIGTSS,
24608 IX86_BUILTIN_COMIGESS,
24609 IX86_BUILTIN_COMINEQSS,
24610 IX86_BUILTIN_UCOMIEQSS,
24611 IX86_BUILTIN_UCOMILTSS,
24612 IX86_BUILTIN_UCOMILESS,
24613 IX86_BUILTIN_UCOMIGTSS,
24614 IX86_BUILTIN_UCOMIGESS,
24615 IX86_BUILTIN_UCOMINEQSS,
24616
24617 IX86_BUILTIN_CVTPI2PS,
24618 IX86_BUILTIN_CVTPS2PI,
24619 IX86_BUILTIN_CVTSI2SS,
24620 IX86_BUILTIN_CVTSI642SS,
24621 IX86_BUILTIN_CVTSS2SI,
24622 IX86_BUILTIN_CVTSS2SI64,
24623 IX86_BUILTIN_CVTTPS2PI,
24624 IX86_BUILTIN_CVTTSS2SI,
24625 IX86_BUILTIN_CVTTSS2SI64,
24626
24627 IX86_BUILTIN_MAXPS,
24628 IX86_BUILTIN_MAXSS,
24629 IX86_BUILTIN_MINPS,
24630 IX86_BUILTIN_MINSS,
24631
24632 IX86_BUILTIN_LOADUPS,
24633 IX86_BUILTIN_STOREUPS,
24634 IX86_BUILTIN_MOVSS,
24635
24636 IX86_BUILTIN_MOVHLPS,
24637 IX86_BUILTIN_MOVLHPS,
24638 IX86_BUILTIN_LOADHPS,
24639 IX86_BUILTIN_LOADLPS,
24640 IX86_BUILTIN_STOREHPS,
24641 IX86_BUILTIN_STORELPS,
24642
24643 IX86_BUILTIN_MASKMOVQ,
24644 IX86_BUILTIN_MOVMSKPS,
24645 IX86_BUILTIN_PMOVMSKB,
24646
24647 IX86_BUILTIN_MOVNTPS,
24648 IX86_BUILTIN_MOVNTQ,
24649
24650 IX86_BUILTIN_LOADDQU,
24651 IX86_BUILTIN_STOREDQU,
24652
24653 IX86_BUILTIN_PACKSSWB,
24654 IX86_BUILTIN_PACKSSDW,
24655 IX86_BUILTIN_PACKUSWB,
24656
24657 IX86_BUILTIN_PADDB,
24658 IX86_BUILTIN_PADDW,
24659 IX86_BUILTIN_PADDD,
24660 IX86_BUILTIN_PADDQ,
24661 IX86_BUILTIN_PADDSB,
24662 IX86_BUILTIN_PADDSW,
24663 IX86_BUILTIN_PADDUSB,
24664 IX86_BUILTIN_PADDUSW,
24665 IX86_BUILTIN_PSUBB,
24666 IX86_BUILTIN_PSUBW,
24667 IX86_BUILTIN_PSUBD,
24668 IX86_BUILTIN_PSUBQ,
24669 IX86_BUILTIN_PSUBSB,
24670 IX86_BUILTIN_PSUBSW,
24671 IX86_BUILTIN_PSUBUSB,
24672 IX86_BUILTIN_PSUBUSW,
24673
24674 IX86_BUILTIN_PAND,
24675 IX86_BUILTIN_PANDN,
24676 IX86_BUILTIN_POR,
24677 IX86_BUILTIN_PXOR,
24678
24679 IX86_BUILTIN_PAVGB,
24680 IX86_BUILTIN_PAVGW,
24681
24682 IX86_BUILTIN_PCMPEQB,
24683 IX86_BUILTIN_PCMPEQW,
24684 IX86_BUILTIN_PCMPEQD,
24685 IX86_BUILTIN_PCMPGTB,
24686 IX86_BUILTIN_PCMPGTW,
24687 IX86_BUILTIN_PCMPGTD,
24688
24689 IX86_BUILTIN_PMADDWD,
24690
24691 IX86_BUILTIN_PMAXSW,
24692 IX86_BUILTIN_PMAXUB,
24693 IX86_BUILTIN_PMINSW,
24694 IX86_BUILTIN_PMINUB,
24695
24696 IX86_BUILTIN_PMULHUW,
24697 IX86_BUILTIN_PMULHW,
24698 IX86_BUILTIN_PMULLW,
24699
24700 IX86_BUILTIN_PSADBW,
24701 IX86_BUILTIN_PSHUFW,
24702
24703 IX86_BUILTIN_PSLLW,
24704 IX86_BUILTIN_PSLLD,
24705 IX86_BUILTIN_PSLLQ,
24706 IX86_BUILTIN_PSRAW,
24707 IX86_BUILTIN_PSRAD,
24708 IX86_BUILTIN_PSRLW,
24709 IX86_BUILTIN_PSRLD,
24710 IX86_BUILTIN_PSRLQ,
24711 IX86_BUILTIN_PSLLWI,
24712 IX86_BUILTIN_PSLLDI,
24713 IX86_BUILTIN_PSLLQI,
24714 IX86_BUILTIN_PSRAWI,
24715 IX86_BUILTIN_PSRADI,
24716 IX86_BUILTIN_PSRLWI,
24717 IX86_BUILTIN_PSRLDI,
24718 IX86_BUILTIN_PSRLQI,
24719
24720 IX86_BUILTIN_PUNPCKHBW,
24721 IX86_BUILTIN_PUNPCKHWD,
24722 IX86_BUILTIN_PUNPCKHDQ,
24723 IX86_BUILTIN_PUNPCKLBW,
24724 IX86_BUILTIN_PUNPCKLWD,
24725 IX86_BUILTIN_PUNPCKLDQ,
24726
24727 IX86_BUILTIN_SHUFPS,
24728
24729 IX86_BUILTIN_RCPPS,
24730 IX86_BUILTIN_RCPSS,
24731 IX86_BUILTIN_RSQRTPS,
24732 IX86_BUILTIN_RSQRTPS_NR,
24733 IX86_BUILTIN_RSQRTSS,
24734 IX86_BUILTIN_RSQRTF,
24735 IX86_BUILTIN_SQRTPS,
24736 IX86_BUILTIN_SQRTPS_NR,
24737 IX86_BUILTIN_SQRTSS,
24738
24739 IX86_BUILTIN_UNPCKHPS,
24740 IX86_BUILTIN_UNPCKLPS,
24741
24742 IX86_BUILTIN_ANDPS,
24743 IX86_BUILTIN_ANDNPS,
24744 IX86_BUILTIN_ORPS,
24745 IX86_BUILTIN_XORPS,
24746
24747 IX86_BUILTIN_EMMS,
24748 IX86_BUILTIN_LDMXCSR,
24749 IX86_BUILTIN_STMXCSR,
24750 IX86_BUILTIN_SFENCE,
24751
24752 /* 3DNow! Original */
24753 IX86_BUILTIN_FEMMS,
24754 IX86_BUILTIN_PAVGUSB,
24755 IX86_BUILTIN_PF2ID,
24756 IX86_BUILTIN_PFACC,
24757 IX86_BUILTIN_PFADD,
24758 IX86_BUILTIN_PFCMPEQ,
24759 IX86_BUILTIN_PFCMPGE,
24760 IX86_BUILTIN_PFCMPGT,
24761 IX86_BUILTIN_PFMAX,
24762 IX86_BUILTIN_PFMIN,
24763 IX86_BUILTIN_PFMUL,
24764 IX86_BUILTIN_PFRCP,
24765 IX86_BUILTIN_PFRCPIT1,
24766 IX86_BUILTIN_PFRCPIT2,
24767 IX86_BUILTIN_PFRSQIT1,
24768 IX86_BUILTIN_PFRSQRT,
24769 IX86_BUILTIN_PFSUB,
24770 IX86_BUILTIN_PFSUBR,
24771 IX86_BUILTIN_PI2FD,
24772 IX86_BUILTIN_PMULHRW,
24773
24774 /* 3DNow! Athlon Extensions */
24775 IX86_BUILTIN_PF2IW,
24776 IX86_BUILTIN_PFNACC,
24777 IX86_BUILTIN_PFPNACC,
24778 IX86_BUILTIN_PI2FW,
24779 IX86_BUILTIN_PSWAPDSI,
24780 IX86_BUILTIN_PSWAPDSF,
24781
24782 /* SSE2 */
24783 IX86_BUILTIN_ADDPD,
24784 IX86_BUILTIN_ADDSD,
24785 IX86_BUILTIN_DIVPD,
24786 IX86_BUILTIN_DIVSD,
24787 IX86_BUILTIN_MULPD,
24788 IX86_BUILTIN_MULSD,
24789 IX86_BUILTIN_SUBPD,
24790 IX86_BUILTIN_SUBSD,
24791
24792 IX86_BUILTIN_CMPEQPD,
24793 IX86_BUILTIN_CMPLTPD,
24794 IX86_BUILTIN_CMPLEPD,
24795 IX86_BUILTIN_CMPGTPD,
24796 IX86_BUILTIN_CMPGEPD,
24797 IX86_BUILTIN_CMPNEQPD,
24798 IX86_BUILTIN_CMPNLTPD,
24799 IX86_BUILTIN_CMPNLEPD,
24800 IX86_BUILTIN_CMPNGTPD,
24801 IX86_BUILTIN_CMPNGEPD,
24802 IX86_BUILTIN_CMPORDPD,
24803 IX86_BUILTIN_CMPUNORDPD,
24804 IX86_BUILTIN_CMPEQSD,
24805 IX86_BUILTIN_CMPLTSD,
24806 IX86_BUILTIN_CMPLESD,
24807 IX86_BUILTIN_CMPNEQSD,
24808 IX86_BUILTIN_CMPNLTSD,
24809 IX86_BUILTIN_CMPNLESD,
24810 IX86_BUILTIN_CMPORDSD,
24811 IX86_BUILTIN_CMPUNORDSD,
24812
24813 IX86_BUILTIN_COMIEQSD,
24814 IX86_BUILTIN_COMILTSD,
24815 IX86_BUILTIN_COMILESD,
24816 IX86_BUILTIN_COMIGTSD,
24817 IX86_BUILTIN_COMIGESD,
24818 IX86_BUILTIN_COMINEQSD,
24819 IX86_BUILTIN_UCOMIEQSD,
24820 IX86_BUILTIN_UCOMILTSD,
24821 IX86_BUILTIN_UCOMILESD,
24822 IX86_BUILTIN_UCOMIGTSD,
24823 IX86_BUILTIN_UCOMIGESD,
24824 IX86_BUILTIN_UCOMINEQSD,
24825
24826 IX86_BUILTIN_MAXPD,
24827 IX86_BUILTIN_MAXSD,
24828 IX86_BUILTIN_MINPD,
24829 IX86_BUILTIN_MINSD,
24830
24831 IX86_BUILTIN_ANDPD,
24832 IX86_BUILTIN_ANDNPD,
24833 IX86_BUILTIN_ORPD,
24834 IX86_BUILTIN_XORPD,
24835
24836 IX86_BUILTIN_SQRTPD,
24837 IX86_BUILTIN_SQRTSD,
24838
24839 IX86_BUILTIN_UNPCKHPD,
24840 IX86_BUILTIN_UNPCKLPD,
24841
24842 IX86_BUILTIN_SHUFPD,
24843
24844 IX86_BUILTIN_LOADUPD,
24845 IX86_BUILTIN_STOREUPD,
24846 IX86_BUILTIN_MOVSD,
24847
24848 IX86_BUILTIN_LOADHPD,
24849 IX86_BUILTIN_LOADLPD,
24850
24851 IX86_BUILTIN_CVTDQ2PD,
24852 IX86_BUILTIN_CVTDQ2PS,
24853
24854 IX86_BUILTIN_CVTPD2DQ,
24855 IX86_BUILTIN_CVTPD2PI,
24856 IX86_BUILTIN_CVTPD2PS,
24857 IX86_BUILTIN_CVTTPD2DQ,
24858 IX86_BUILTIN_CVTTPD2PI,
24859
24860 IX86_BUILTIN_CVTPI2PD,
24861 IX86_BUILTIN_CVTSI2SD,
24862 IX86_BUILTIN_CVTSI642SD,
24863
24864 IX86_BUILTIN_CVTSD2SI,
24865 IX86_BUILTIN_CVTSD2SI64,
24866 IX86_BUILTIN_CVTSD2SS,
24867 IX86_BUILTIN_CVTSS2SD,
24868 IX86_BUILTIN_CVTTSD2SI,
24869 IX86_BUILTIN_CVTTSD2SI64,
24870
24871 IX86_BUILTIN_CVTPS2DQ,
24872 IX86_BUILTIN_CVTPS2PD,
24873 IX86_BUILTIN_CVTTPS2DQ,
24874
24875 IX86_BUILTIN_MOVNTI,
24876 IX86_BUILTIN_MOVNTI64,
24877 IX86_BUILTIN_MOVNTPD,
24878 IX86_BUILTIN_MOVNTDQ,
24879
24880 IX86_BUILTIN_MOVQ128,
24881
24882 /* SSE2 MMX */
24883 IX86_BUILTIN_MASKMOVDQU,
24884 IX86_BUILTIN_MOVMSKPD,
24885 IX86_BUILTIN_PMOVMSKB128,
24886
24887 IX86_BUILTIN_PACKSSWB128,
24888 IX86_BUILTIN_PACKSSDW128,
24889 IX86_BUILTIN_PACKUSWB128,
24890
24891 IX86_BUILTIN_PADDB128,
24892 IX86_BUILTIN_PADDW128,
24893 IX86_BUILTIN_PADDD128,
24894 IX86_BUILTIN_PADDQ128,
24895 IX86_BUILTIN_PADDSB128,
24896 IX86_BUILTIN_PADDSW128,
24897 IX86_BUILTIN_PADDUSB128,
24898 IX86_BUILTIN_PADDUSW128,
24899 IX86_BUILTIN_PSUBB128,
24900 IX86_BUILTIN_PSUBW128,
24901 IX86_BUILTIN_PSUBD128,
24902 IX86_BUILTIN_PSUBQ128,
24903 IX86_BUILTIN_PSUBSB128,
24904 IX86_BUILTIN_PSUBSW128,
24905 IX86_BUILTIN_PSUBUSB128,
24906 IX86_BUILTIN_PSUBUSW128,
24907
24908 IX86_BUILTIN_PAND128,
24909 IX86_BUILTIN_PANDN128,
24910 IX86_BUILTIN_POR128,
24911 IX86_BUILTIN_PXOR128,
24912
24913 IX86_BUILTIN_PAVGB128,
24914 IX86_BUILTIN_PAVGW128,
24915
24916 IX86_BUILTIN_PCMPEQB128,
24917 IX86_BUILTIN_PCMPEQW128,
24918 IX86_BUILTIN_PCMPEQD128,
24919 IX86_BUILTIN_PCMPGTB128,
24920 IX86_BUILTIN_PCMPGTW128,
24921 IX86_BUILTIN_PCMPGTD128,
24922
24923 IX86_BUILTIN_PMADDWD128,
24924
24925 IX86_BUILTIN_PMAXSW128,
24926 IX86_BUILTIN_PMAXUB128,
24927 IX86_BUILTIN_PMINSW128,
24928 IX86_BUILTIN_PMINUB128,
24929
24930 IX86_BUILTIN_PMULUDQ,
24931 IX86_BUILTIN_PMULUDQ128,
24932 IX86_BUILTIN_PMULHUW128,
24933 IX86_BUILTIN_PMULHW128,
24934 IX86_BUILTIN_PMULLW128,
24935
24936 IX86_BUILTIN_PSADBW128,
24937 IX86_BUILTIN_PSHUFHW,
24938 IX86_BUILTIN_PSHUFLW,
24939 IX86_BUILTIN_PSHUFD,
24940
24941 IX86_BUILTIN_PSLLDQI128,
24942 IX86_BUILTIN_PSLLWI128,
24943 IX86_BUILTIN_PSLLDI128,
24944 IX86_BUILTIN_PSLLQI128,
24945 IX86_BUILTIN_PSRAWI128,
24946 IX86_BUILTIN_PSRADI128,
24947 IX86_BUILTIN_PSRLDQI128,
24948 IX86_BUILTIN_PSRLWI128,
24949 IX86_BUILTIN_PSRLDI128,
24950 IX86_BUILTIN_PSRLQI128,
24951
24952 IX86_BUILTIN_PSLLDQ128,
24953 IX86_BUILTIN_PSLLW128,
24954 IX86_BUILTIN_PSLLD128,
24955 IX86_BUILTIN_PSLLQ128,
24956 IX86_BUILTIN_PSRAW128,
24957 IX86_BUILTIN_PSRAD128,
24958 IX86_BUILTIN_PSRLW128,
24959 IX86_BUILTIN_PSRLD128,
24960 IX86_BUILTIN_PSRLQ128,
24961
24962 IX86_BUILTIN_PUNPCKHBW128,
24963 IX86_BUILTIN_PUNPCKHWD128,
24964 IX86_BUILTIN_PUNPCKHDQ128,
24965 IX86_BUILTIN_PUNPCKHQDQ128,
24966 IX86_BUILTIN_PUNPCKLBW128,
24967 IX86_BUILTIN_PUNPCKLWD128,
24968 IX86_BUILTIN_PUNPCKLDQ128,
24969 IX86_BUILTIN_PUNPCKLQDQ128,
24970
24971 IX86_BUILTIN_CLFLUSH,
24972 IX86_BUILTIN_MFENCE,
24973 IX86_BUILTIN_LFENCE,
24974 IX86_BUILTIN_PAUSE,
24975
24976 IX86_BUILTIN_BSRSI,
24977 IX86_BUILTIN_BSRDI,
24978 IX86_BUILTIN_RDPMC,
24979 IX86_BUILTIN_RDTSC,
24980 IX86_BUILTIN_RDTSCP,
24981 IX86_BUILTIN_ROLQI,
24982 IX86_BUILTIN_ROLHI,
24983 IX86_BUILTIN_RORQI,
24984 IX86_BUILTIN_RORHI,
24985
24986 /* SSE3. */
24987 IX86_BUILTIN_ADDSUBPS,
24988 IX86_BUILTIN_HADDPS,
24989 IX86_BUILTIN_HSUBPS,
24990 IX86_BUILTIN_MOVSHDUP,
24991 IX86_BUILTIN_MOVSLDUP,
24992 IX86_BUILTIN_ADDSUBPD,
24993 IX86_BUILTIN_HADDPD,
24994 IX86_BUILTIN_HSUBPD,
24995 IX86_BUILTIN_LDDQU,
24996
24997 IX86_BUILTIN_MONITOR,
24998 IX86_BUILTIN_MWAIT,
24999
25000 /* SSSE3. */
25001 IX86_BUILTIN_PHADDW,
25002 IX86_BUILTIN_PHADDD,
25003 IX86_BUILTIN_PHADDSW,
25004 IX86_BUILTIN_PHSUBW,
25005 IX86_BUILTIN_PHSUBD,
25006 IX86_BUILTIN_PHSUBSW,
25007 IX86_BUILTIN_PMADDUBSW,
25008 IX86_BUILTIN_PMULHRSW,
25009 IX86_BUILTIN_PSHUFB,
25010 IX86_BUILTIN_PSIGNB,
25011 IX86_BUILTIN_PSIGNW,
25012 IX86_BUILTIN_PSIGND,
25013 IX86_BUILTIN_PALIGNR,
25014 IX86_BUILTIN_PABSB,
25015 IX86_BUILTIN_PABSW,
25016 IX86_BUILTIN_PABSD,
25017
25018 IX86_BUILTIN_PHADDW128,
25019 IX86_BUILTIN_PHADDD128,
25020 IX86_BUILTIN_PHADDSW128,
25021 IX86_BUILTIN_PHSUBW128,
25022 IX86_BUILTIN_PHSUBD128,
25023 IX86_BUILTIN_PHSUBSW128,
25024 IX86_BUILTIN_PMADDUBSW128,
25025 IX86_BUILTIN_PMULHRSW128,
25026 IX86_BUILTIN_PSHUFB128,
25027 IX86_BUILTIN_PSIGNB128,
25028 IX86_BUILTIN_PSIGNW128,
25029 IX86_BUILTIN_PSIGND128,
25030 IX86_BUILTIN_PALIGNR128,
25031 IX86_BUILTIN_PABSB128,
25032 IX86_BUILTIN_PABSW128,
25033 IX86_BUILTIN_PABSD128,
25034
25035 /* AMDFAM10 - SSE4A New Instructions. */
25036 IX86_BUILTIN_MOVNTSD,
25037 IX86_BUILTIN_MOVNTSS,
25038 IX86_BUILTIN_EXTRQI,
25039 IX86_BUILTIN_EXTRQ,
25040 IX86_BUILTIN_INSERTQI,
25041 IX86_BUILTIN_INSERTQ,
25042
25043 /* SSE4.1. */
25044 IX86_BUILTIN_BLENDPD,
25045 IX86_BUILTIN_BLENDPS,
25046 IX86_BUILTIN_BLENDVPD,
25047 IX86_BUILTIN_BLENDVPS,
25048 IX86_BUILTIN_PBLENDVB128,
25049 IX86_BUILTIN_PBLENDW128,
25050
25051 IX86_BUILTIN_DPPD,
25052 IX86_BUILTIN_DPPS,
25053
25054 IX86_BUILTIN_INSERTPS128,
25055
25056 IX86_BUILTIN_MOVNTDQA,
25057 IX86_BUILTIN_MPSADBW128,
25058 IX86_BUILTIN_PACKUSDW128,
25059 IX86_BUILTIN_PCMPEQQ,
25060 IX86_BUILTIN_PHMINPOSUW128,
25061
25062 IX86_BUILTIN_PMAXSB128,
25063 IX86_BUILTIN_PMAXSD128,
25064 IX86_BUILTIN_PMAXUD128,
25065 IX86_BUILTIN_PMAXUW128,
25066
25067 IX86_BUILTIN_PMINSB128,
25068 IX86_BUILTIN_PMINSD128,
25069 IX86_BUILTIN_PMINUD128,
25070 IX86_BUILTIN_PMINUW128,
25071
25072 IX86_BUILTIN_PMOVSXBW128,
25073 IX86_BUILTIN_PMOVSXBD128,
25074 IX86_BUILTIN_PMOVSXBQ128,
25075 IX86_BUILTIN_PMOVSXWD128,
25076 IX86_BUILTIN_PMOVSXWQ128,
25077 IX86_BUILTIN_PMOVSXDQ128,
25078
25079 IX86_BUILTIN_PMOVZXBW128,
25080 IX86_BUILTIN_PMOVZXBD128,
25081 IX86_BUILTIN_PMOVZXBQ128,
25082 IX86_BUILTIN_PMOVZXWD128,
25083 IX86_BUILTIN_PMOVZXWQ128,
25084 IX86_BUILTIN_PMOVZXDQ128,
25085
25086 IX86_BUILTIN_PMULDQ128,
25087 IX86_BUILTIN_PMULLD128,
25088
25089 IX86_BUILTIN_ROUNDSD,
25090 IX86_BUILTIN_ROUNDSS,
25091
25092 IX86_BUILTIN_ROUNDPD,
25093 IX86_BUILTIN_ROUNDPS,
25094
25095 IX86_BUILTIN_FLOORPD,
25096 IX86_BUILTIN_CEILPD,
25097 IX86_BUILTIN_TRUNCPD,
25098 IX86_BUILTIN_RINTPD,
25099 IX86_BUILTIN_ROUNDPD_AZ,
25100
25101 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25102 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25103 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25104
25105 IX86_BUILTIN_FLOORPS,
25106 IX86_BUILTIN_CEILPS,
25107 IX86_BUILTIN_TRUNCPS,
25108 IX86_BUILTIN_RINTPS,
25109 IX86_BUILTIN_ROUNDPS_AZ,
25110
25111 IX86_BUILTIN_FLOORPS_SFIX,
25112 IX86_BUILTIN_CEILPS_SFIX,
25113 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25114
25115 IX86_BUILTIN_PTESTZ,
25116 IX86_BUILTIN_PTESTC,
25117 IX86_BUILTIN_PTESTNZC,
25118
25119 IX86_BUILTIN_VEC_INIT_V2SI,
25120 IX86_BUILTIN_VEC_INIT_V4HI,
25121 IX86_BUILTIN_VEC_INIT_V8QI,
25122 IX86_BUILTIN_VEC_EXT_V2DF,
25123 IX86_BUILTIN_VEC_EXT_V2DI,
25124 IX86_BUILTIN_VEC_EXT_V4SF,
25125 IX86_BUILTIN_VEC_EXT_V4SI,
25126 IX86_BUILTIN_VEC_EXT_V8HI,
25127 IX86_BUILTIN_VEC_EXT_V2SI,
25128 IX86_BUILTIN_VEC_EXT_V4HI,
25129 IX86_BUILTIN_VEC_EXT_V16QI,
25130 IX86_BUILTIN_VEC_SET_V2DI,
25131 IX86_BUILTIN_VEC_SET_V4SF,
25132 IX86_BUILTIN_VEC_SET_V4SI,
25133 IX86_BUILTIN_VEC_SET_V8HI,
25134 IX86_BUILTIN_VEC_SET_V4HI,
25135 IX86_BUILTIN_VEC_SET_V16QI,
25136
25137 IX86_BUILTIN_VEC_PACK_SFIX,
25138 IX86_BUILTIN_VEC_PACK_SFIX256,
25139
25140 /* SSE4.2. */
25141 IX86_BUILTIN_CRC32QI,
25142 IX86_BUILTIN_CRC32HI,
25143 IX86_BUILTIN_CRC32SI,
25144 IX86_BUILTIN_CRC32DI,
25145
25146 IX86_BUILTIN_PCMPESTRI128,
25147 IX86_BUILTIN_PCMPESTRM128,
25148 IX86_BUILTIN_PCMPESTRA128,
25149 IX86_BUILTIN_PCMPESTRC128,
25150 IX86_BUILTIN_PCMPESTRO128,
25151 IX86_BUILTIN_PCMPESTRS128,
25152 IX86_BUILTIN_PCMPESTRZ128,
25153 IX86_BUILTIN_PCMPISTRI128,
25154 IX86_BUILTIN_PCMPISTRM128,
25155 IX86_BUILTIN_PCMPISTRA128,
25156 IX86_BUILTIN_PCMPISTRC128,
25157 IX86_BUILTIN_PCMPISTRO128,
25158 IX86_BUILTIN_PCMPISTRS128,
25159 IX86_BUILTIN_PCMPISTRZ128,
25160
25161 IX86_BUILTIN_PCMPGTQ,
25162
25163 /* AES instructions */
25164 IX86_BUILTIN_AESENC128,
25165 IX86_BUILTIN_AESENCLAST128,
25166 IX86_BUILTIN_AESDEC128,
25167 IX86_BUILTIN_AESDECLAST128,
25168 IX86_BUILTIN_AESIMC128,
25169 IX86_BUILTIN_AESKEYGENASSIST128,
25170
25171 /* PCLMUL instruction */
25172 IX86_BUILTIN_PCLMULQDQ128,
25173
25174 /* AVX */
25175 IX86_BUILTIN_ADDPD256,
25176 IX86_BUILTIN_ADDPS256,
25177 IX86_BUILTIN_ADDSUBPD256,
25178 IX86_BUILTIN_ADDSUBPS256,
25179 IX86_BUILTIN_ANDPD256,
25180 IX86_BUILTIN_ANDPS256,
25181 IX86_BUILTIN_ANDNPD256,
25182 IX86_BUILTIN_ANDNPS256,
25183 IX86_BUILTIN_BLENDPD256,
25184 IX86_BUILTIN_BLENDPS256,
25185 IX86_BUILTIN_BLENDVPD256,
25186 IX86_BUILTIN_BLENDVPS256,
25187 IX86_BUILTIN_DIVPD256,
25188 IX86_BUILTIN_DIVPS256,
25189 IX86_BUILTIN_DPPS256,
25190 IX86_BUILTIN_HADDPD256,
25191 IX86_BUILTIN_HADDPS256,
25192 IX86_BUILTIN_HSUBPD256,
25193 IX86_BUILTIN_HSUBPS256,
25194 IX86_BUILTIN_MAXPD256,
25195 IX86_BUILTIN_MAXPS256,
25196 IX86_BUILTIN_MINPD256,
25197 IX86_BUILTIN_MINPS256,
25198 IX86_BUILTIN_MULPD256,
25199 IX86_BUILTIN_MULPS256,
25200 IX86_BUILTIN_ORPD256,
25201 IX86_BUILTIN_ORPS256,
25202 IX86_BUILTIN_SHUFPD256,
25203 IX86_BUILTIN_SHUFPS256,
25204 IX86_BUILTIN_SUBPD256,
25205 IX86_BUILTIN_SUBPS256,
25206 IX86_BUILTIN_XORPD256,
25207 IX86_BUILTIN_XORPS256,
25208 IX86_BUILTIN_CMPSD,
25209 IX86_BUILTIN_CMPSS,
25210 IX86_BUILTIN_CMPPD,
25211 IX86_BUILTIN_CMPPS,
25212 IX86_BUILTIN_CMPPD256,
25213 IX86_BUILTIN_CMPPS256,
25214 IX86_BUILTIN_CVTDQ2PD256,
25215 IX86_BUILTIN_CVTDQ2PS256,
25216 IX86_BUILTIN_CVTPD2PS256,
25217 IX86_BUILTIN_CVTPS2DQ256,
25218 IX86_BUILTIN_CVTPS2PD256,
25219 IX86_BUILTIN_CVTTPD2DQ256,
25220 IX86_BUILTIN_CVTPD2DQ256,
25221 IX86_BUILTIN_CVTTPS2DQ256,
25222 IX86_BUILTIN_EXTRACTF128PD256,
25223 IX86_BUILTIN_EXTRACTF128PS256,
25224 IX86_BUILTIN_EXTRACTF128SI256,
25225 IX86_BUILTIN_VZEROALL,
25226 IX86_BUILTIN_VZEROUPPER,
25227 IX86_BUILTIN_VPERMILVARPD,
25228 IX86_BUILTIN_VPERMILVARPS,
25229 IX86_BUILTIN_VPERMILVARPD256,
25230 IX86_BUILTIN_VPERMILVARPS256,
25231 IX86_BUILTIN_VPERMILPD,
25232 IX86_BUILTIN_VPERMILPS,
25233 IX86_BUILTIN_VPERMILPD256,
25234 IX86_BUILTIN_VPERMILPS256,
25235 IX86_BUILTIN_VPERMIL2PD,
25236 IX86_BUILTIN_VPERMIL2PS,
25237 IX86_BUILTIN_VPERMIL2PD256,
25238 IX86_BUILTIN_VPERMIL2PS256,
25239 IX86_BUILTIN_VPERM2F128PD256,
25240 IX86_BUILTIN_VPERM2F128PS256,
25241 IX86_BUILTIN_VPERM2F128SI256,
25242 IX86_BUILTIN_VBROADCASTSS,
25243 IX86_BUILTIN_VBROADCASTSD256,
25244 IX86_BUILTIN_VBROADCASTSS256,
25245 IX86_BUILTIN_VBROADCASTPD256,
25246 IX86_BUILTIN_VBROADCASTPS256,
25247 IX86_BUILTIN_VINSERTF128PD256,
25248 IX86_BUILTIN_VINSERTF128PS256,
25249 IX86_BUILTIN_VINSERTF128SI256,
25250 IX86_BUILTIN_LOADUPD256,
25251 IX86_BUILTIN_LOADUPS256,
25252 IX86_BUILTIN_STOREUPD256,
25253 IX86_BUILTIN_STOREUPS256,
25254 IX86_BUILTIN_LDDQU256,
25255 IX86_BUILTIN_MOVNTDQ256,
25256 IX86_BUILTIN_MOVNTPD256,
25257 IX86_BUILTIN_MOVNTPS256,
25258 IX86_BUILTIN_LOADDQU256,
25259 IX86_BUILTIN_STOREDQU256,
25260 IX86_BUILTIN_MASKLOADPD,
25261 IX86_BUILTIN_MASKLOADPS,
25262 IX86_BUILTIN_MASKSTOREPD,
25263 IX86_BUILTIN_MASKSTOREPS,
25264 IX86_BUILTIN_MASKLOADPD256,
25265 IX86_BUILTIN_MASKLOADPS256,
25266 IX86_BUILTIN_MASKSTOREPD256,
25267 IX86_BUILTIN_MASKSTOREPS256,
25268 IX86_BUILTIN_MOVSHDUP256,
25269 IX86_BUILTIN_MOVSLDUP256,
25270 IX86_BUILTIN_MOVDDUP256,
25271
25272 IX86_BUILTIN_SQRTPD256,
25273 IX86_BUILTIN_SQRTPS256,
25274 IX86_BUILTIN_SQRTPS_NR256,
25275 IX86_BUILTIN_RSQRTPS256,
25276 IX86_BUILTIN_RSQRTPS_NR256,
25277
25278 IX86_BUILTIN_RCPPS256,
25279
25280 IX86_BUILTIN_ROUNDPD256,
25281 IX86_BUILTIN_ROUNDPS256,
25282
25283 IX86_BUILTIN_FLOORPD256,
25284 IX86_BUILTIN_CEILPD256,
25285 IX86_BUILTIN_TRUNCPD256,
25286 IX86_BUILTIN_RINTPD256,
25287 IX86_BUILTIN_ROUNDPD_AZ256,
25288
25289 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25290 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25291 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25292
25293 IX86_BUILTIN_FLOORPS256,
25294 IX86_BUILTIN_CEILPS256,
25295 IX86_BUILTIN_TRUNCPS256,
25296 IX86_BUILTIN_RINTPS256,
25297 IX86_BUILTIN_ROUNDPS_AZ256,
25298
25299 IX86_BUILTIN_FLOORPS_SFIX256,
25300 IX86_BUILTIN_CEILPS_SFIX256,
25301 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25302
25303 IX86_BUILTIN_UNPCKHPD256,
25304 IX86_BUILTIN_UNPCKLPD256,
25305 IX86_BUILTIN_UNPCKHPS256,
25306 IX86_BUILTIN_UNPCKLPS256,
25307
25308 IX86_BUILTIN_SI256_SI,
25309 IX86_BUILTIN_PS256_PS,
25310 IX86_BUILTIN_PD256_PD,
25311 IX86_BUILTIN_SI_SI256,
25312 IX86_BUILTIN_PS_PS256,
25313 IX86_BUILTIN_PD_PD256,
25314
25315 IX86_BUILTIN_VTESTZPD,
25316 IX86_BUILTIN_VTESTCPD,
25317 IX86_BUILTIN_VTESTNZCPD,
25318 IX86_BUILTIN_VTESTZPS,
25319 IX86_BUILTIN_VTESTCPS,
25320 IX86_BUILTIN_VTESTNZCPS,
25321 IX86_BUILTIN_VTESTZPD256,
25322 IX86_BUILTIN_VTESTCPD256,
25323 IX86_BUILTIN_VTESTNZCPD256,
25324 IX86_BUILTIN_VTESTZPS256,
25325 IX86_BUILTIN_VTESTCPS256,
25326 IX86_BUILTIN_VTESTNZCPS256,
25327 IX86_BUILTIN_PTESTZ256,
25328 IX86_BUILTIN_PTESTC256,
25329 IX86_BUILTIN_PTESTNZC256,
25330
25331 IX86_BUILTIN_MOVMSKPD256,
25332 IX86_BUILTIN_MOVMSKPS256,
25333
25334 /* AVX2 */
25335 IX86_BUILTIN_MPSADBW256,
25336 IX86_BUILTIN_PABSB256,
25337 IX86_BUILTIN_PABSW256,
25338 IX86_BUILTIN_PABSD256,
25339 IX86_BUILTIN_PACKSSDW256,
25340 IX86_BUILTIN_PACKSSWB256,
25341 IX86_BUILTIN_PACKUSDW256,
25342 IX86_BUILTIN_PACKUSWB256,
25343 IX86_BUILTIN_PADDB256,
25344 IX86_BUILTIN_PADDW256,
25345 IX86_BUILTIN_PADDD256,
25346 IX86_BUILTIN_PADDQ256,
25347 IX86_BUILTIN_PADDSB256,
25348 IX86_BUILTIN_PADDSW256,
25349 IX86_BUILTIN_PADDUSB256,
25350 IX86_BUILTIN_PADDUSW256,
25351 IX86_BUILTIN_PALIGNR256,
25352 IX86_BUILTIN_AND256I,
25353 IX86_BUILTIN_ANDNOT256I,
25354 IX86_BUILTIN_PAVGB256,
25355 IX86_BUILTIN_PAVGW256,
25356 IX86_BUILTIN_PBLENDVB256,
25357 IX86_BUILTIN_PBLENDVW256,
25358 IX86_BUILTIN_PCMPEQB256,
25359 IX86_BUILTIN_PCMPEQW256,
25360 IX86_BUILTIN_PCMPEQD256,
25361 IX86_BUILTIN_PCMPEQQ256,
25362 IX86_BUILTIN_PCMPGTB256,
25363 IX86_BUILTIN_PCMPGTW256,
25364 IX86_BUILTIN_PCMPGTD256,
25365 IX86_BUILTIN_PCMPGTQ256,
25366 IX86_BUILTIN_PHADDW256,
25367 IX86_BUILTIN_PHADDD256,
25368 IX86_BUILTIN_PHADDSW256,
25369 IX86_BUILTIN_PHSUBW256,
25370 IX86_BUILTIN_PHSUBD256,
25371 IX86_BUILTIN_PHSUBSW256,
25372 IX86_BUILTIN_PMADDUBSW256,
25373 IX86_BUILTIN_PMADDWD256,
25374 IX86_BUILTIN_PMAXSB256,
25375 IX86_BUILTIN_PMAXSW256,
25376 IX86_BUILTIN_PMAXSD256,
25377 IX86_BUILTIN_PMAXUB256,
25378 IX86_BUILTIN_PMAXUW256,
25379 IX86_BUILTIN_PMAXUD256,
25380 IX86_BUILTIN_PMINSB256,
25381 IX86_BUILTIN_PMINSW256,
25382 IX86_BUILTIN_PMINSD256,
25383 IX86_BUILTIN_PMINUB256,
25384 IX86_BUILTIN_PMINUW256,
25385 IX86_BUILTIN_PMINUD256,
25386 IX86_BUILTIN_PMOVMSKB256,
25387 IX86_BUILTIN_PMOVSXBW256,
25388 IX86_BUILTIN_PMOVSXBD256,
25389 IX86_BUILTIN_PMOVSXBQ256,
25390 IX86_BUILTIN_PMOVSXWD256,
25391 IX86_BUILTIN_PMOVSXWQ256,
25392 IX86_BUILTIN_PMOVSXDQ256,
25393 IX86_BUILTIN_PMOVZXBW256,
25394 IX86_BUILTIN_PMOVZXBD256,
25395 IX86_BUILTIN_PMOVZXBQ256,
25396 IX86_BUILTIN_PMOVZXWD256,
25397 IX86_BUILTIN_PMOVZXWQ256,
25398 IX86_BUILTIN_PMOVZXDQ256,
25399 IX86_BUILTIN_PMULDQ256,
25400 IX86_BUILTIN_PMULHRSW256,
25401 IX86_BUILTIN_PMULHUW256,
25402 IX86_BUILTIN_PMULHW256,
25403 IX86_BUILTIN_PMULLW256,
25404 IX86_BUILTIN_PMULLD256,
25405 IX86_BUILTIN_PMULUDQ256,
25406 IX86_BUILTIN_POR256,
25407 IX86_BUILTIN_PSADBW256,
25408 IX86_BUILTIN_PSHUFB256,
25409 IX86_BUILTIN_PSHUFD256,
25410 IX86_BUILTIN_PSHUFHW256,
25411 IX86_BUILTIN_PSHUFLW256,
25412 IX86_BUILTIN_PSIGNB256,
25413 IX86_BUILTIN_PSIGNW256,
25414 IX86_BUILTIN_PSIGND256,
25415 IX86_BUILTIN_PSLLDQI256,
25416 IX86_BUILTIN_PSLLWI256,
25417 IX86_BUILTIN_PSLLW256,
25418 IX86_BUILTIN_PSLLDI256,
25419 IX86_BUILTIN_PSLLD256,
25420 IX86_BUILTIN_PSLLQI256,
25421 IX86_BUILTIN_PSLLQ256,
25422 IX86_BUILTIN_PSRAWI256,
25423 IX86_BUILTIN_PSRAW256,
25424 IX86_BUILTIN_PSRADI256,
25425 IX86_BUILTIN_PSRAD256,
25426 IX86_BUILTIN_PSRLDQI256,
25427 IX86_BUILTIN_PSRLWI256,
25428 IX86_BUILTIN_PSRLW256,
25429 IX86_BUILTIN_PSRLDI256,
25430 IX86_BUILTIN_PSRLD256,
25431 IX86_BUILTIN_PSRLQI256,
25432 IX86_BUILTIN_PSRLQ256,
25433 IX86_BUILTIN_PSUBB256,
25434 IX86_BUILTIN_PSUBW256,
25435 IX86_BUILTIN_PSUBD256,
25436 IX86_BUILTIN_PSUBQ256,
25437 IX86_BUILTIN_PSUBSB256,
25438 IX86_BUILTIN_PSUBSW256,
25439 IX86_BUILTIN_PSUBUSB256,
25440 IX86_BUILTIN_PSUBUSW256,
25441 IX86_BUILTIN_PUNPCKHBW256,
25442 IX86_BUILTIN_PUNPCKHWD256,
25443 IX86_BUILTIN_PUNPCKHDQ256,
25444 IX86_BUILTIN_PUNPCKHQDQ256,
25445 IX86_BUILTIN_PUNPCKLBW256,
25446 IX86_BUILTIN_PUNPCKLWD256,
25447 IX86_BUILTIN_PUNPCKLDQ256,
25448 IX86_BUILTIN_PUNPCKLQDQ256,
25449 IX86_BUILTIN_PXOR256,
25450 IX86_BUILTIN_MOVNTDQA256,
25451 IX86_BUILTIN_VBROADCASTSS_PS,
25452 IX86_BUILTIN_VBROADCASTSS_PS256,
25453 IX86_BUILTIN_VBROADCASTSD_PD256,
25454 IX86_BUILTIN_VBROADCASTSI256,
25455 IX86_BUILTIN_PBLENDD256,
25456 IX86_BUILTIN_PBLENDD128,
25457 IX86_BUILTIN_PBROADCASTB256,
25458 IX86_BUILTIN_PBROADCASTW256,
25459 IX86_BUILTIN_PBROADCASTD256,
25460 IX86_BUILTIN_PBROADCASTQ256,
25461 IX86_BUILTIN_PBROADCASTB128,
25462 IX86_BUILTIN_PBROADCASTW128,
25463 IX86_BUILTIN_PBROADCASTD128,
25464 IX86_BUILTIN_PBROADCASTQ128,
25465 IX86_BUILTIN_VPERMVARSI256,
25466 IX86_BUILTIN_VPERMDF256,
25467 IX86_BUILTIN_VPERMVARSF256,
25468 IX86_BUILTIN_VPERMDI256,
25469 IX86_BUILTIN_VPERMTI256,
25470 IX86_BUILTIN_VEXTRACT128I256,
25471 IX86_BUILTIN_VINSERT128I256,
25472 IX86_BUILTIN_MASKLOADD,
25473 IX86_BUILTIN_MASKLOADQ,
25474 IX86_BUILTIN_MASKLOADD256,
25475 IX86_BUILTIN_MASKLOADQ256,
25476 IX86_BUILTIN_MASKSTORED,
25477 IX86_BUILTIN_MASKSTOREQ,
25478 IX86_BUILTIN_MASKSTORED256,
25479 IX86_BUILTIN_MASKSTOREQ256,
25480 IX86_BUILTIN_PSLLVV4DI,
25481 IX86_BUILTIN_PSLLVV2DI,
25482 IX86_BUILTIN_PSLLVV8SI,
25483 IX86_BUILTIN_PSLLVV4SI,
25484 IX86_BUILTIN_PSRAVV8SI,
25485 IX86_BUILTIN_PSRAVV4SI,
25486 IX86_BUILTIN_PSRLVV4DI,
25487 IX86_BUILTIN_PSRLVV2DI,
25488 IX86_BUILTIN_PSRLVV8SI,
25489 IX86_BUILTIN_PSRLVV4SI,
25490
25491 IX86_BUILTIN_GATHERSIV2DF,
25492 IX86_BUILTIN_GATHERSIV4DF,
25493 IX86_BUILTIN_GATHERDIV2DF,
25494 IX86_BUILTIN_GATHERDIV4DF,
25495 IX86_BUILTIN_GATHERSIV4SF,
25496 IX86_BUILTIN_GATHERSIV8SF,
25497 IX86_BUILTIN_GATHERDIV4SF,
25498 IX86_BUILTIN_GATHERDIV8SF,
25499 IX86_BUILTIN_GATHERSIV2DI,
25500 IX86_BUILTIN_GATHERSIV4DI,
25501 IX86_BUILTIN_GATHERDIV2DI,
25502 IX86_BUILTIN_GATHERDIV4DI,
25503 IX86_BUILTIN_GATHERSIV4SI,
25504 IX86_BUILTIN_GATHERSIV8SI,
25505 IX86_BUILTIN_GATHERDIV4SI,
25506 IX86_BUILTIN_GATHERDIV8SI,
25507
25508 /* Alternate 4-element gather builtins for the vectorizer, where
25509 all operands are 32-byte wide. */
25510 IX86_BUILTIN_GATHERALTSIV4DF,
25511 IX86_BUILTIN_GATHERALTDIV8SF,
25512 IX86_BUILTIN_GATHERALTSIV4DI,
25513 IX86_BUILTIN_GATHERALTDIV8SI,
25514
25515 /* TFmode support builtins. */
25516 IX86_BUILTIN_INFQ,
25517 IX86_BUILTIN_HUGE_VALQ,
25518 IX86_BUILTIN_FABSQ,
25519 IX86_BUILTIN_COPYSIGNQ,
25520
25521 /* Vectorizer support builtins. */
25522 IX86_BUILTIN_CPYSGNPS,
25523 IX86_BUILTIN_CPYSGNPD,
25524 IX86_BUILTIN_CPYSGNPS256,
25525 IX86_BUILTIN_CPYSGNPD256,
25526
25527 /* FMA4 instructions. */
25528 IX86_BUILTIN_VFMADDSS,
25529 IX86_BUILTIN_VFMADDSD,
25530 IX86_BUILTIN_VFMADDPS,
25531 IX86_BUILTIN_VFMADDPD,
25532 IX86_BUILTIN_VFMADDPS256,
25533 IX86_BUILTIN_VFMADDPD256,
25534 IX86_BUILTIN_VFMADDSUBPS,
25535 IX86_BUILTIN_VFMADDSUBPD,
25536 IX86_BUILTIN_VFMADDSUBPS256,
25537 IX86_BUILTIN_VFMADDSUBPD256,
25538
25539 /* FMA3 instructions. */
25540 IX86_BUILTIN_VFMADDSS3,
25541 IX86_BUILTIN_VFMADDSD3,
25542
25543 /* XOP instructions. */
25544 IX86_BUILTIN_VPCMOV,
25545 IX86_BUILTIN_VPCMOV_V2DI,
25546 IX86_BUILTIN_VPCMOV_V4SI,
25547 IX86_BUILTIN_VPCMOV_V8HI,
25548 IX86_BUILTIN_VPCMOV_V16QI,
25549 IX86_BUILTIN_VPCMOV_V4SF,
25550 IX86_BUILTIN_VPCMOV_V2DF,
25551 IX86_BUILTIN_VPCMOV256,
25552 IX86_BUILTIN_VPCMOV_V4DI256,
25553 IX86_BUILTIN_VPCMOV_V8SI256,
25554 IX86_BUILTIN_VPCMOV_V16HI256,
25555 IX86_BUILTIN_VPCMOV_V32QI256,
25556 IX86_BUILTIN_VPCMOV_V8SF256,
25557 IX86_BUILTIN_VPCMOV_V4DF256,
25558
25559 IX86_BUILTIN_VPPERM,
25560
25561 IX86_BUILTIN_VPMACSSWW,
25562 IX86_BUILTIN_VPMACSWW,
25563 IX86_BUILTIN_VPMACSSWD,
25564 IX86_BUILTIN_VPMACSWD,
25565 IX86_BUILTIN_VPMACSSDD,
25566 IX86_BUILTIN_VPMACSDD,
25567 IX86_BUILTIN_VPMACSSDQL,
25568 IX86_BUILTIN_VPMACSSDQH,
25569 IX86_BUILTIN_VPMACSDQL,
25570 IX86_BUILTIN_VPMACSDQH,
25571 IX86_BUILTIN_VPMADCSSWD,
25572 IX86_BUILTIN_VPMADCSWD,
25573
25574 IX86_BUILTIN_VPHADDBW,
25575 IX86_BUILTIN_VPHADDBD,
25576 IX86_BUILTIN_VPHADDBQ,
25577 IX86_BUILTIN_VPHADDWD,
25578 IX86_BUILTIN_VPHADDWQ,
25579 IX86_BUILTIN_VPHADDDQ,
25580 IX86_BUILTIN_VPHADDUBW,
25581 IX86_BUILTIN_VPHADDUBD,
25582 IX86_BUILTIN_VPHADDUBQ,
25583 IX86_BUILTIN_VPHADDUWD,
25584 IX86_BUILTIN_VPHADDUWQ,
25585 IX86_BUILTIN_VPHADDUDQ,
25586 IX86_BUILTIN_VPHSUBBW,
25587 IX86_BUILTIN_VPHSUBWD,
25588 IX86_BUILTIN_VPHSUBDQ,
25589
25590 IX86_BUILTIN_VPROTB,
25591 IX86_BUILTIN_VPROTW,
25592 IX86_BUILTIN_VPROTD,
25593 IX86_BUILTIN_VPROTQ,
25594 IX86_BUILTIN_VPROTB_IMM,
25595 IX86_BUILTIN_VPROTW_IMM,
25596 IX86_BUILTIN_VPROTD_IMM,
25597 IX86_BUILTIN_VPROTQ_IMM,
25598
25599 IX86_BUILTIN_VPSHLB,
25600 IX86_BUILTIN_VPSHLW,
25601 IX86_BUILTIN_VPSHLD,
25602 IX86_BUILTIN_VPSHLQ,
25603 IX86_BUILTIN_VPSHAB,
25604 IX86_BUILTIN_VPSHAW,
25605 IX86_BUILTIN_VPSHAD,
25606 IX86_BUILTIN_VPSHAQ,
25607
25608 IX86_BUILTIN_VFRCZSS,
25609 IX86_BUILTIN_VFRCZSD,
25610 IX86_BUILTIN_VFRCZPS,
25611 IX86_BUILTIN_VFRCZPD,
25612 IX86_BUILTIN_VFRCZPS256,
25613 IX86_BUILTIN_VFRCZPD256,
25614
25615 IX86_BUILTIN_VPCOMEQUB,
25616 IX86_BUILTIN_VPCOMNEUB,
25617 IX86_BUILTIN_VPCOMLTUB,
25618 IX86_BUILTIN_VPCOMLEUB,
25619 IX86_BUILTIN_VPCOMGTUB,
25620 IX86_BUILTIN_VPCOMGEUB,
25621 IX86_BUILTIN_VPCOMFALSEUB,
25622 IX86_BUILTIN_VPCOMTRUEUB,
25623
25624 IX86_BUILTIN_VPCOMEQUW,
25625 IX86_BUILTIN_VPCOMNEUW,
25626 IX86_BUILTIN_VPCOMLTUW,
25627 IX86_BUILTIN_VPCOMLEUW,
25628 IX86_BUILTIN_VPCOMGTUW,
25629 IX86_BUILTIN_VPCOMGEUW,
25630 IX86_BUILTIN_VPCOMFALSEUW,
25631 IX86_BUILTIN_VPCOMTRUEUW,
25632
25633 IX86_BUILTIN_VPCOMEQUD,
25634 IX86_BUILTIN_VPCOMNEUD,
25635 IX86_BUILTIN_VPCOMLTUD,
25636 IX86_BUILTIN_VPCOMLEUD,
25637 IX86_BUILTIN_VPCOMGTUD,
25638 IX86_BUILTIN_VPCOMGEUD,
25639 IX86_BUILTIN_VPCOMFALSEUD,
25640 IX86_BUILTIN_VPCOMTRUEUD,
25641
25642 IX86_BUILTIN_VPCOMEQUQ,
25643 IX86_BUILTIN_VPCOMNEUQ,
25644 IX86_BUILTIN_VPCOMLTUQ,
25645 IX86_BUILTIN_VPCOMLEUQ,
25646 IX86_BUILTIN_VPCOMGTUQ,
25647 IX86_BUILTIN_VPCOMGEUQ,
25648 IX86_BUILTIN_VPCOMFALSEUQ,
25649 IX86_BUILTIN_VPCOMTRUEUQ,
25650
25651 IX86_BUILTIN_VPCOMEQB,
25652 IX86_BUILTIN_VPCOMNEB,
25653 IX86_BUILTIN_VPCOMLTB,
25654 IX86_BUILTIN_VPCOMLEB,
25655 IX86_BUILTIN_VPCOMGTB,
25656 IX86_BUILTIN_VPCOMGEB,
25657 IX86_BUILTIN_VPCOMFALSEB,
25658 IX86_BUILTIN_VPCOMTRUEB,
25659
25660 IX86_BUILTIN_VPCOMEQW,
25661 IX86_BUILTIN_VPCOMNEW,
25662 IX86_BUILTIN_VPCOMLTW,
25663 IX86_BUILTIN_VPCOMLEW,
25664 IX86_BUILTIN_VPCOMGTW,
25665 IX86_BUILTIN_VPCOMGEW,
25666 IX86_BUILTIN_VPCOMFALSEW,
25667 IX86_BUILTIN_VPCOMTRUEW,
25668
25669 IX86_BUILTIN_VPCOMEQD,
25670 IX86_BUILTIN_VPCOMNED,
25671 IX86_BUILTIN_VPCOMLTD,
25672 IX86_BUILTIN_VPCOMLED,
25673 IX86_BUILTIN_VPCOMGTD,
25674 IX86_BUILTIN_VPCOMGED,
25675 IX86_BUILTIN_VPCOMFALSED,
25676 IX86_BUILTIN_VPCOMTRUED,
25677
25678 IX86_BUILTIN_VPCOMEQQ,
25679 IX86_BUILTIN_VPCOMNEQ,
25680 IX86_BUILTIN_VPCOMLTQ,
25681 IX86_BUILTIN_VPCOMLEQ,
25682 IX86_BUILTIN_VPCOMGTQ,
25683 IX86_BUILTIN_VPCOMGEQ,
25684 IX86_BUILTIN_VPCOMFALSEQ,
25685 IX86_BUILTIN_VPCOMTRUEQ,
25686
25687 /* LWP instructions. */
25688 IX86_BUILTIN_LLWPCB,
25689 IX86_BUILTIN_SLWPCB,
25690 IX86_BUILTIN_LWPVAL32,
25691 IX86_BUILTIN_LWPVAL64,
25692 IX86_BUILTIN_LWPINS32,
25693 IX86_BUILTIN_LWPINS64,
25694
25695 IX86_BUILTIN_CLZS,
25696
25697 /* RTM */
25698 IX86_BUILTIN_XBEGIN,
25699 IX86_BUILTIN_XEND,
25700 IX86_BUILTIN_XABORT,
25701 IX86_BUILTIN_XTEST,
25702
25703 /* BMI instructions. */
25704 IX86_BUILTIN_BEXTR32,
25705 IX86_BUILTIN_BEXTR64,
25706 IX86_BUILTIN_CTZS,
25707
25708 /* TBM instructions. */
25709 IX86_BUILTIN_BEXTRI32,
25710 IX86_BUILTIN_BEXTRI64,
25711
25712 /* BMI2 instructions. */
25713 IX86_BUILTIN_BZHI32,
25714 IX86_BUILTIN_BZHI64,
25715 IX86_BUILTIN_PDEP32,
25716 IX86_BUILTIN_PDEP64,
25717 IX86_BUILTIN_PEXT32,
25718 IX86_BUILTIN_PEXT64,
25719
25720 /* FSGSBASE instructions. */
25721 IX86_BUILTIN_RDFSBASE32,
25722 IX86_BUILTIN_RDFSBASE64,
25723 IX86_BUILTIN_RDGSBASE32,
25724 IX86_BUILTIN_RDGSBASE64,
25725 IX86_BUILTIN_WRFSBASE32,
25726 IX86_BUILTIN_WRFSBASE64,
25727 IX86_BUILTIN_WRGSBASE32,
25728 IX86_BUILTIN_WRGSBASE64,
25729
25730 /* RDRND instructions. */
25731 IX86_BUILTIN_RDRAND16_STEP,
25732 IX86_BUILTIN_RDRAND32_STEP,
25733 IX86_BUILTIN_RDRAND64_STEP,
25734
25735 /* F16C instructions. */
25736 IX86_BUILTIN_CVTPH2PS,
25737 IX86_BUILTIN_CVTPH2PS256,
25738 IX86_BUILTIN_CVTPS2PH,
25739 IX86_BUILTIN_CVTPS2PH256,
25740
25741 /* CFString built-in for Darwin. */
25742 IX86_BUILTIN_CFSTRING,
25743
25744 IX86_BUILTIN_MAX
25745 };
25746
25747 /* Table for the ix86 builtin decls. */
25748 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25749
25750 /* Table of all the builtin functions that are possible with different ISAs
25751 but are waiting to be built until a function is declared to use that
25752 ISA. */
25753 struct builtin_isa {
25754 const char *name; /* function name */
25755 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25756 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25757 bool const_p; /* true if the declaration is constant */
25758 bool set_and_not_built_p; /* true if recorded but the decl is not built yet */
25759 };
25760
25761 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
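
/* Illustrative sketch (hypothetical values, not part of the original source):
   a deferred entry in ix86_builtins_isa for an AVX2-only builtin would read
   roughly

     { "__builtin_ia32_maskloadd", V4SI_FTYPE_PCV4SI_V4SI,
       OPTION_MASK_ISA_AVX2, false, true }

   i.e. name, function type, required ISA, not "const", and recorded but not
   yet built.  */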
25762
25763
25764 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the
25765 MASK of isa_flags the builtin requires in the ix86_builtins_isa array.
25766 Store the function decl in the ix86_builtins array. Return the function
25767 decl, or NULL_TREE if the builtin was not added.
25768
25769 If the front end has a special hook for builtin functions, delay adding
25770 builtin functions that aren't in the current ISA until the ISA is changed
25771 with function specific optimization. Doing so can save about 300K for the
25772 default compiler. When the builtin is expanded, check at that time whether
25773 it is valid.
25774
25775 If the front end doesn't have a special hook, record all builtins, even if
25776 they aren't in the current ISA, in case the user uses function specific
25777 options for a different ISA; that way we don't get scope errors if a
25778 builtin is added in the middle of a function scope. */
25779
25780 static inline tree
25781 def_builtin (HOST_WIDE_INT mask, const char *name,
25782 enum ix86_builtin_func_type tcode,
25783 enum ix86_builtins code)
25784 {
25785 tree decl = NULL_TREE;
25786
25787 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25788 {
25789 ix86_builtins_isa[(int) code].isa = mask;
25790
25791 mask &= ~OPTION_MASK_ISA_64BIT;
25792 if (mask == 0
25793 || (mask & ix86_isa_flags) != 0
25794 || (lang_hooks.builtin_function
25795 == lang_hooks.builtin_function_ext_scope))
25797 {
25798 tree type = ix86_get_builtin_func_type (tcode);
25799 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25800 NULL, NULL_TREE);
25801 ix86_builtins[(int) code] = decl;
25802 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25803 }
25804 else
25805 {
25806 ix86_builtins[(int) code] = NULL_TREE;
25807 ix86_builtins_isa[(int) code].tcode = tcode;
25808 ix86_builtins_isa[(int) code].name = name;
25809 ix86_builtins_isa[(int) code].const_p = false;
25810 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25811 }
25812 }
25813
25814 return decl;
25815 }
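
/* Illustrative sketch (not part of the original source): a direct call to
   def_builtin for one of the entries listed in bdesc_special_args below
   would look like

     def_builtin (~OPTION_MASK_ISA_64BIT, "__builtin_ia32_rdtsc",
                  UINT64_FTYPE_VOID, IX86_BUILTIN_RDTSC);

   i.e. the ISA mask the builtin requires, the user-visible name, the
   ix86_builtin_func_type describing its signature, and its IX86_BUILTIN_
   code.  Whether the decl is created immediately or merely recorded in
   ix86_builtins_isa depends on the ISA check above.  */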
25816
25817 /* Like def_builtin, but also marks the function decl "const". */
25818
25819 static inline tree
25820 def_builtin_const (HOST_WIDE_INT mask, const char *name,
25821 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
25822 {
25823 tree decl = def_builtin (mask, name, tcode, code);
25824 if (decl)
25825 TREE_READONLY (decl) = 1;
25826 else
25827 ix86_builtins_isa[(int) code].const_p = true;
25828
25829 return decl;
25830 }
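
/* Illustrative sketch (hypothetical call, mirroring a row of bdesc_args
   below): a "const" builtin such as sqrtpd would be registered as

     def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_sqrtpd",
                        V2DF_FTYPE_V2DF, IX86_BUILTIN_SQRTPD);

   TREE_READONLY on the decl lets the optimizers treat calls to it as having
   no side effects, so they can be CSEd and moved freely.  */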
25831
25832 /* Add any new builtin functions for a given ISA that may not have been
25833 declared. This saves a bit of space compared to eagerly adding all of the
25834 declarations to the tree, whether or not they are used. */
25835
25836 static void
25837 ix86_add_new_builtins (HOST_WIDE_INT isa)
25838 {
25839 int i;
25840
25841 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
25842 {
25843 if ((ix86_builtins_isa[i].isa & isa) != 0
25844 && ix86_builtins_isa[i].set_and_not_built_p)
25845 {
25846 tree decl, type;
25847
25848 /* Don't define the builtin again. */
25849 ix86_builtins_isa[i].set_and_not_built_p = false;
25850
25851 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
25852 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
25853 type, i, BUILT_IN_MD, NULL,
25854 NULL_TREE);
25855
25856 ix86_builtins[i] = decl;
25857 if (ix86_builtins_isa[i].const_p)
25858 TREE_READONLY (decl) = 1;
25859 }
25860 }
25861 }
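
/* Illustrative note (assumption, not taken from the original source): with
   the deferral in def_builtin above, a translation unit compiled with plain
   -msse2 does not get the AVX2 builtin decls up front; they are created here
   once an AVX2 ISA bit is actually enabled, for instance by a function
   carrying a target attribute:

     __attribute__((target ("avx2")))
     void f (void)
     {
       /-* ... may now use __builtin_ia32_maskloadd and friends ... *-/
     }
*/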
25862
25863 /* Bits for builtin_description.flag. */
25864
25865 /* Set when we don't support the comparison natively, and should
25866 swap the comparison operands in order to support it. */
25867 #define BUILTIN_DESC_SWAP_OPERANDS 1
25868
25869 struct builtin_description
25870 {
25871 const HOST_WIDE_INT mask;
25872 const enum insn_code icode;
25873 const char *const name;
25874 const enum ix86_builtins code;
25875 const enum rtx_code comparison;
25876 const int flag;
25877 };
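
/* Illustrative reading of a table row (sketch, not original commentary):
   in bdesc_args below, the entry

     { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps",
       IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP }

   says: the builtin needs SSE, expands through the sse_maskcmpv4sf3 insn
   pattern, implements a greater-than compare by emitting LT with the two
   operands swapped (the _SWAP suffix in the type code), and its prototype
   is V4SF (V4SF, V4SF).  */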
25878
25879 static const struct builtin_description bdesc_comi[] =
25880 {
25881 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
25882 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
25883 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
25884 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
25885 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
25886 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
25887 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
25888 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
25889 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
25890 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
25891 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
25892 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
25893 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
25894 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
25895 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
25896 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
25897 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
25898 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
25899 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
25900 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
25901 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
25902 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
25903 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
25904 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
25905 };
25906
25907 static const struct builtin_description bdesc_pcmpestr[] =
25908 {
25909 /* SSE4.2 */
25910 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
25911 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
25912 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
25913 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
25914 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
25915 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
25916 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
25917 };
25918
25919 static const struct builtin_description bdesc_pcmpistr[] =
25920 {
25921 /* SSE4.2 */
25922 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
25923 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
25924 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
25925 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
25926 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
25927 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
25928 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
25929 };
25930
25931 /* Special builtins with variable number of arguments. */
25932 static const struct builtin_description bdesc_special_args[] =
25933 {
25934 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
25935 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
25936 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
25937
25938 /* MMX */
25939 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25940
25941 /* 3DNow! */
25942 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25943
25944 /* SSE */
25945 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25946 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25947 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25948
25949 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25950 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25951 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25952 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25953
25954 /* SSE or 3DNow!A */
25955 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25956 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
25957
25958 /* SSE2 */
25959 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25960 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25961 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25962 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
25963 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25964 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
25965 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
25966 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
25967 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
25968 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25969
25970 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25971 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25972
25973 /* SSE3 */
25974 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25975
25976 /* SSE4.1 */
25977 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
25978
25979 /* SSE4A */
25980 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25981 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25982
25983 /* AVX */
25984 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
25985 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
25986
25987 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25988 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25989 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25990 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
25991 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
25992
25993 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25994 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25995 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25996 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25997 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25998 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
25999 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26000
26001 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26002 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26003 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26004
26005 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26006 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26007 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26008 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26009 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26010 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26011 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26012 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26013
26014 /* AVX2 */
26015 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26016 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26017 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26018 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26019 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26020 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26021 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26022 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26023 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26024
26025 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26026 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26027 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26028 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26029 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26030 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26031
26032 /* FSGSBASE */
26033 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26034 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26035 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26036 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26037 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26038 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26039 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26040 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26041
26042 /* RTM */
26043 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26044 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26045 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26046 };
26047
26048 /* Builtins with variable number of arguments. */
26049 static const struct builtin_description bdesc_args[] =
26050 {
26051 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26052 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26053 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26054 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26055 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26056 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26057 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26058
26059 /* MMX */
26060 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26061 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26062 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26063 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26064 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26065 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26066
26067 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26068 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26069 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26070 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26071 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26072 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26073 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26074 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26075
26076 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26077 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26078
26079 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26080 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26081 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26082 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26083
26084 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26085 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26086 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26087 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26088 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26089 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26090
26091 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26092 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26093 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26094 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26095 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
26096 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
26097
26098 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26099 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26100 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26101
26102 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26103
26104 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26105 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26106 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26107 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26108 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26109 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26110
26111 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26112 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26113 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26114 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26115 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26116 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26117
26118 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26119 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26120 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26121 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26122
26123 /* 3DNow! */
26124 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26125 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26126 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26127 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26128
26129 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26130 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26131 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26132 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26133 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26134 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26135 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26136 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26137 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26138 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26139 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26140 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26141 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26142 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26143 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26144
26145 /* 3DNow!A */
26146 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26147 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26148 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26149 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26150 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26151 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26152
26153 /* SSE */
26154 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26155 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26156 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26157 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26158 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26159 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26160 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26161 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26162 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26163 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26164 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26165 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26166
26167 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26168
26169 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26170 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26171 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26172 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26173 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26174 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26175 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26176 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26177
26178 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26179 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26180 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26181 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26182 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26183 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26184 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26185 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26186 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26187 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26188 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
26189 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26190 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26191 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26192 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26193 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26194 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26195 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26196 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26197 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26198 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26199 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26200
26201 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26202 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26203 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26204 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26205
26206 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26207 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26208 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26209 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26210
26211 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26212
26213 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26214 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26215 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26216 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26217 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26218
26219 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26220 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26221 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, V4SF_FTYPE_V4SF_DI },
26222
26223 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26224
26225 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26226 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26227 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26228
26229 /* SSE MMX or 3DNow!A */
26230 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26231 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26232 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26233
26234 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26235 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26236 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26237 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26238
26239 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26240 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26241
26242 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26243
26244 /* SSE2 */
26245 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26246
26247 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26248 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26249 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26250 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26251 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26252
26253 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26254 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26255 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26256 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26257 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26258
26259 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26260
26261 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26262 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26263 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26264 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26265
26266 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26267 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26268 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26269
26270 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26271 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26272 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26273 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26274 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26275 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26276 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26277 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26278
26279 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26280 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26281 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26282 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26283 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
26284 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26285 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26286 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26287 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26288 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26289 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26290 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26291 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26292 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26293 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26294 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26295 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26296 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26297 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26298 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26299
26300 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26301 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26302 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26303 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26304
26305 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26306 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26307 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26308 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26309
26310 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26311
26312 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26313 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26314 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26315
26316 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26317
26318 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26319 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26320 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26321 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26322 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26323 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26324 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26325 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26326
26327 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26328 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26329 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26330 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26331 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26332 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26333 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26334 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26335
26336 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26337 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26338
26339 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26340 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26341 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26342 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26343
26344 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26345 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26346
26347 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26348 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26349 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26350 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26351 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26352 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26353
26354 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26355 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26356 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26357 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26358
26359 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26360 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26361 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26362 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26363 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26364 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26365 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26366 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26367
26368 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26369 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26370 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26371
26372 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26373 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26374
26375 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26376 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26377
26378 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26379
26380 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26381 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26382 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26383 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26384
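  /* Editorial note on the shift entries below: the *_INT_CONVERT function
     types mark builtins whose operands are reinterpreted in another mode by
     the underlying pattern (the whole-register byte shifts go through the
     V1TI ashl/lshr patterns), while the *_COUNT types mark shifts whose last
     operand is a shift count, either an SImode value or a vector holding the
     count.  */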
26385 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26386 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26387 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26388 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26389 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26390 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26391 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26392
26393 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26394 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26395 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26396 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26397 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26398 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26399 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26400
26401 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26402 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26403 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26404 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26405
26406 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26407 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26408 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26409
26410 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26411
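  /* Editorial note: entries with a null name, such as the __float128
     fabs/copysign pair below and the AES/PCLMUL entries further down, are
     skipped when the table is walked to register builtins by name; they
     appear to be registered separately (see ix86_init_builtins and
     ix86_init_mmx_sse_builtins) so that their own names and ISA masks can
     be applied.  */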
26412 { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26413 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26414
26415 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26416
26417 /* SSE2 MMX */
26418 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26419 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26420
26421 /* SSE3 */
26422 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26423 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26424
26425 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26426 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26427 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26428 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26429 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26430 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26431
26432 /* SSSE3 */
26433 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26434 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26435 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26436 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26437 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26438 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26439
26440 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26441 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26442 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26443 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26444 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26445 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26446 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26447 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26448 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26449 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26450 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26451 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26452 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26453 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26454 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26455 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26456 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26457 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26458 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26459 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26460 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26461 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26462 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26463 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26464
26465 /* SSSE3. */
26466 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26467 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26468
26469 /* SSE4.1 */
26470 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26471 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26472 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26473 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26474 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26475 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26476 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26477 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26478 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26479 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26480
26481 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26482 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26483 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26484 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26485 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26486 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26487 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26488 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26489 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26490 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26491 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26492 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26493 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26494
26495 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26496 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26497 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26498 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26499 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26500 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26501 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26502 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26503 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26504 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26505 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26506 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26507
26508 /* SSE4.1 */
26509 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26510 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26511 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26512 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26513
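  /* Editorial note: in the floor/ceil/trunc/rint variants below, the
     comparison-code slot carries the rounding-mode constant (ROUND_FLOOR,
     ROUND_CEIL, ROUND_TRUNC, ROUND_MXCSR) instead of a comparison; it
     supplies the immediate rounding operand when the builtin is expanded.  */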
26514 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26515 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26516 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26517 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26518
26519 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26520 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26521
26522 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26523 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26524
26525 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26526 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26527 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26528 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26529
26530 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26531 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26532
26533 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26534 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26535
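  /* Editorial note: for the ptest entries (and the AVX vtest* entries
     further down) the comparison code picks which PTEST/VTESTP[SD] result is
     returned: EQ for the ZF-based testz builtins, LTU for the CF-based testc
     builtins, and GTU for the testnzc ("neither flag set") builtins.  */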
26536 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26537 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26538 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26539
26540 /* SSE4.2 */
26541 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26542 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26543 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26544 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26545 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26546
26547 /* SSE4A */
26548 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26549 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26550 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26551 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26552
26553 /* AES */
26554 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26555 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26556
26557 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26558 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26559 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26560 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26561
26562 /* PCLMUL */
26563 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26564
26565 /* AVX */
26566 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26567 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26568 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26569 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26570 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26571 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26572 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26573 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26574 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26575 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26576 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26577 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26579 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26580 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26581 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26582 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26583 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26584 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26585 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26586 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26587 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26588 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26589 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26590 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26591 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26592
26593 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26594 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26595 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26596 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26597
26598 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26599 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26600 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26601 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26602 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26603 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26604 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26605 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26606 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26607 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26608 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26609 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26610 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26611 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26612 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26613 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26614 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26615 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26616 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26617 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26618 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26619 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26620 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26621 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26622 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26623 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26624 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26625 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26626 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26627 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26628 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26629 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26630 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26631 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26632
26633 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26634 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26635 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26636
26637 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26638 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26639 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26640 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26641 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26642
26643 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26644
26645 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26646 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26647
26648 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26649 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26650 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26651 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26652
26653 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26654 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26655
26656 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26657 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26658
26659 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26660 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26661 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26662 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26663
26664 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
26665 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
26666
26667 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26668 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26669
26670 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26671 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26672 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26673 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26674
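  /* Editorial note: the si256_si/ps256_ps/pd256_pd entries and the
     vec_extract_lo entries that follow back the 128<->256-bit cast
     intrinsics (_mm256_castsi128_si256, _mm256_castsi256_si128 and friends);
     in the widening direction the upper 128 bits of the result are left
     undefined.  */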
26675 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26676 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26677 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26678 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26679 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26680 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26681
26682 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26683 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26684 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26685 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26686 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26687 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26688 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26689 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26690 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26691 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26692 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26693 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26694 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26695 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26696 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26697
26698 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26699 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26700
26701 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26702 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26703
26704 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26705
26706 /* AVX2 */
26707 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26708 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26709 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26710 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26711 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26712 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26713 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26714 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26715 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26716 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26717 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26718 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26719 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26720 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26721 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26722 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26723 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26724 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26725 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26726 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26727 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26728 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26729 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26730 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26731 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26732 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26733 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26734 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26735 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26736 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26737 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26738 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26739 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26740 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26741 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26742 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26743 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26744 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26745 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26746 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26747 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26748 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26749 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26750 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26751 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26752 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26753 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26754 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26755 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26756 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26757 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26758 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26759 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26760 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26761 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26762 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26763 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26764 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26765 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26766 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26767 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26768 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26769 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26770 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26771 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26772 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26773 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26774 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26775 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26776 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26777 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26778 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26779 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26780 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26781 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26782 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26783 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26784 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26785 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26786 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26787 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26788 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26789 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26790 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26791 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26792 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26793 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26794 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26795 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26796 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26797 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26798 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26799 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26800 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26801 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26802 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26803 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26804 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26805 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26806 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26807 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26808 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26809 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26810 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26811 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26812 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26813 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26814 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26815 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26816 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26817 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26818 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26819 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26820 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26821 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26822 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26823 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26824 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26825 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26826 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
26827 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26828 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
26829 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
26830 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26831 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26832 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26833 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26834 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26835 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26836 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26837 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26838 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26839 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
26840 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
26841 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
26842 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
26843 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26844 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26845 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26846 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26847 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26848 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26849 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26850 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26851 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26852 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26853
26854 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26855
26856 /* BMI */
26857 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26858 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26859 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26860
26861 /* TBM */
26862 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26863 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26864
26865 /* F16C */
26866 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
26867 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
26868 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
26869 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
26870
26871 /* BMI2 */
26872 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26873 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26874 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26875 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26876 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26877 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26878 };
26879
26880 /* FMA4 and XOP. */
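/* Editor's note: each MULTI_ARG_* name below is simply an alias for one of
   the ix86_builtin_func_type values; the suffix encodes the number of
   operands and their vector element type, as used by bdesc_multi_arg.  */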
26881 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
26882 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
26883 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
26884 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
26885 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
26886 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
26887 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
26888 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
26889 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
26890 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
26891 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
26892 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
26893 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
26894 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
26895 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
26896 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
26897 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
26898 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
26899 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
26900 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
26901 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
26902 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
26903 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
26904 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
26905 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
26906 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
26907 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
26908 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
26909 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
26910 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
26911 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
26912 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
26913 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
26914 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
26915 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
26916 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
26917 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
26918 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
26919 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
26920 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
26921 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
26922 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
26923 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
26924 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
26925 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
26926 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
26927 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
26928 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
26929 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
26930 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
26931 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
26932 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
26933
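/* Table of FMA4/XOP builtins.  Each entry gives the ISA option mask, the
   insn code, the builtin's name, its IX86_BUILTIN_* code, the rtx
   comparison code where one is needed (UNKNOWN otherwise), and the
   MULTI_ARG_* prototype defined above.  */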
26934 static const struct builtin_description bdesc_multi_arg[] =
26935 {
26936 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
26937 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
26938 UNKNOWN, (int)MULTI_ARG_3_SF },
26939 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
26940 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
26941 UNKNOWN, (int)MULTI_ARG_3_DF },
26942
26943 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
26944 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
26945 UNKNOWN, (int)MULTI_ARG_3_SF },
26946 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
26947 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
26948 UNKNOWN, (int)MULTI_ARG_3_DF },
26949
26950 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
26951 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
26952 UNKNOWN, (int)MULTI_ARG_3_SF },
26953 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
26954 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
26955 UNKNOWN, (int)MULTI_ARG_3_DF },
26956 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
26957 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
26958 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26959 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
26960 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
26961 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26962
26963 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
26964 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
26965 UNKNOWN, (int)MULTI_ARG_3_SF },
26966 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
26967 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
26968 UNKNOWN, (int)MULTI_ARG_3_DF },
26969 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
26970 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
26971 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26972 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
26973 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
26974 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26975
26976 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
26977 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
26978 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
26979 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
26980 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
26981 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
26982 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
26983
26984 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
26987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
26988 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
26989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
26990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
26991
26992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
26993
26994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27002 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27003 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27004 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27006
27007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27010 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27013 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27015 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27019 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27021 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27022 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27023
27024 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27026 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27027 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27028 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27029 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27030
27031 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27032 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27033 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27034 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27035 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27036 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27037 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27038 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27039 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27040 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27041 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27042 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27043 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27044 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27045 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27046
27047 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27048 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27049 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27050 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27051 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27052 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27053 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27054
27055 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27056 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27057 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27058 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27059 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27060 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27061 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27062
27063 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27064 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27065 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27066 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27067 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27068 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27069 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27070
27071 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27072 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27073 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27074 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27075 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27076 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27077 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27078
27079 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27080 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27081 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27082 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27083 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27084 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27085 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27086
27087 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27088 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27089 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27090 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27091 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27092 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27093 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27094
27095 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27096 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27097 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27098 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27099 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27100 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27101 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27102
27103 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27104 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27105 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27106 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
27107 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
27108 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
27109 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
27110
27111 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27112 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27113 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27114 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27115 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27116 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27117 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27118 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27119
27120 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27121 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27122 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27123 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27124 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27125 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27126 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27127 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27128
27129 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27130 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27131 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27132 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27133
27134 };
27135 \f
27136 /* TM vector builtins. */
27137
27138 /* Reuse the existing x86-specific `struct builtin_description' because
27139 we're lazy. Add casts to make them fit. */
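/* Each entry below pairs a "__builtin__ITM_*" name with the generic
   BUILT_IN_TM_* code (hence the casts) and a vector function type; no insn
   pattern is involved, so the icode field is CODE_FOR_nothing.  */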
27140 static const struct builtin_description bdesc_tm[] =
27141 {
27142 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27143 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27144 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27145 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27146 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27147 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27148 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27149
27150 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27151 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27152 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27153 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27154 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27155 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27156 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27157
27158 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27159 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27160 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27161 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27162 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27163 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27164 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27165
27166 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27167 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27168 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27169 };
27170
27171 /* TM callbacks. */
27172
27173 /* Return the builtin decl needed to load a vector of TYPE. */
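/* For example, a 128-bit vector type yields the decl registered for
   BUILT_IN_TM_LOAD_M128; non-vector types and unsupported sizes return
   NULL_TREE.  */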
27174
27175 static tree
27176 ix86_builtin_tm_load (tree type)
27177 {
27178 if (TREE_CODE (type) == VECTOR_TYPE)
27179 {
27180 switch (tree_low_cst (TYPE_SIZE (type), 1))
27181 {
27182 case 64:
27183 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27184 case 128:
27185 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27186 case 256:
27187 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27188 }
27189 }
27190 return NULL_TREE;
27191 }
27192
27193 /* Return the builtin decl needed to store a vector of TYPE. */
27194
27195 static tree
27196 ix86_builtin_tm_store (tree type)
27197 {
27198 if (TREE_CODE (type) == VECTOR_TYPE)
27199 {
27200 switch (tree_low_cst (TYPE_SIZE (type), 1))
27201 {
27202 case 64:
27203 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27204 case 128:
27205 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27206 case 256:
27207 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27208 }
27209 }
27210 return NULL_TREE;
27211 }
27212 \f
27213 /* Initialize the transactional memory vector load/store builtins. */
27214
27215 static void
27216 ix86_init_tm_builtins (void)
27217 {
27218 enum ix86_builtin_func_type ftype;
27219 const struct builtin_description *d;
27220 size_t i;
27221 tree decl;
27222 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27223 tree attrs_log, attrs_type_log;
27224
27225 if (!flag_tm)
27226 return;
27227
27228 /* If there are no builtins defined, we must be compiling in a
27229 language without trans-mem support. */
27230 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27231 return;
27232
27233 /* Use whatever attributes a normal TM load has. */
27234 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27235 attrs_load = DECL_ATTRIBUTES (decl);
27236 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27237 /* Use whatever attributes a normal TM store has. */
27238 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27239 attrs_store = DECL_ATTRIBUTES (decl);
27240 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27241 /* Use whatever attributes a normal TM log has. */
27242 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27243 attrs_log = DECL_ATTRIBUTES (decl);
27244 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27245
27246 for (i = 0, d = bdesc_tm;
27247 i < ARRAY_SIZE (bdesc_tm);
27248 i++, d++)
27249 {
27250 if ((d->mask & ix86_isa_flags) != 0
27251 || (lang_hooks.builtin_function
27252 == lang_hooks.builtin_function_ext_scope))
27253 {
27254 tree type, attrs, attrs_type;
27255 enum built_in_function code = (enum built_in_function) d->code;
27256
27257 ftype = (enum ix86_builtin_func_type) d->flag;
27258 type = ix86_get_builtin_func_type (ftype);
27259
27260 if (BUILTIN_TM_LOAD_P (code))
27261 {
27262 attrs = attrs_load;
27263 attrs_type = attrs_type_load;
27264 }
27265 else if (BUILTIN_TM_STORE_P (code))
27266 {
27267 attrs = attrs_store;
27268 attrs_type = attrs_type_store;
27269 }
27270 else
27271 {
27272 attrs = attrs_log;
27273 attrs_type = attrs_type_log;
27274 }
27275 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27276 /* The name without the "__builtin_" prefix, so the
27277 function can also be called directly
27278 (e.g. "_ITM_WM64"). */
27278 d->name + strlen ("__builtin_"),
27279 attrs);
27280 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
27281 set the TYPE_ATTRIBUTES. */
27282 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27283
27284 set_builtin_decl (code, decl, false);
27285 }
27286 }
27287 }
27288
27289 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
27290 not in the current target ISA, so that the user can compile particular
27291 modules with target-specific options that differ from the command-line
27292 options. */
27293 static void
27294 ix86_init_mmx_sse_builtins (void)
27295 {
27296 const struct builtin_description * d;
27297 enum ix86_builtin_func_type ftype;
27298 size_t i;
27299
27300 /* Add all special builtins with variable number of operands. */
27301 for (i = 0, d = bdesc_special_args;
27302 i < ARRAY_SIZE (bdesc_special_args);
27303 i++, d++)
27304 {
27305 if (d->name == 0)
27306 continue;
27307
27308 ftype = (enum ix86_builtin_func_type) d->flag;
27309 def_builtin (d->mask, d->name, ftype, d->code);
27310 }
27311
27312 /* Add all builtins with variable number of operands. */
27313 for (i = 0, d = bdesc_args;
27314 i < ARRAY_SIZE (bdesc_args);
27315 i++, d++)
27316 {
27317 if (d->name == 0)
27318 continue;
27319
27320 ftype = (enum ix86_builtin_func_type) d->flag;
27321 def_builtin_const (d->mask, d->name, ftype, d->code);
27322 }
27323
27324 /* pcmpestr[im] insns. */
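/* Only IX86_BUILTIN_PCMPESTRM128 returns the 128-bit mask vector; the
   index and flag variants return an int, so the prototype is chosen
   accordingly.  */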
27325 for (i = 0, d = bdesc_pcmpestr;
27326 i < ARRAY_SIZE (bdesc_pcmpestr);
27327 i++, d++)
27328 {
27329 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27330 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27331 else
27332 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27333 def_builtin_const (d->mask, d->name, ftype, d->code);
27334 }
27335
27336 /* pcmpistr[im] insns. */
27337 for (i = 0, d = bdesc_pcmpistr;
27338 i < ARRAY_SIZE (bdesc_pcmpistr);
27339 i++, d++)
27340 {
27341 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27342 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27343 else
27344 ftype = INT_FTYPE_V16QI_V16QI_INT;
27345 def_builtin_const (d->mask, d->name, ftype, d->code);
27346 }
27347
27348 /* comi/ucomi insns. */
27349 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27350 {
27351 if (d->mask == OPTION_MASK_ISA_SSE2)
27352 ftype = INT_FTYPE_V2DF_V2DF;
27353 else
27354 ftype = INT_FTYPE_V4SF_V4SF;
27355 def_builtin_const (d->mask, d->name, ftype, d->code);
27356 }
27357
27358 /* SSE */
27359 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27360 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27361 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27362 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27363
27364 /* SSE or 3DNow!A */
27365 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27366 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27367 IX86_BUILTIN_MASKMOVQ);
27368
27369 /* SSE2 */
27370 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27371 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27372
27373 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27374 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27375 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27376 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27377
27378 /* SSE3. */
27379 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27380 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27381 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27382 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27383
27384 /* AES */
27385 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27386 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27387 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27388 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27389 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27390 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27391 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27392 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27393 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27394 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27395 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27396 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27397
27398 /* PCLMUL */
27399 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27400 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27401
27402 /* RDRND */
27403 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27404 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27405 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27406 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27407 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27408 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27409 IX86_BUILTIN_RDRAND64_STEP);
27410
27411 /* AVX2 */
27412 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27413 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27414 IX86_BUILTIN_GATHERSIV2DF);
27415
27416 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27417 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27418 IX86_BUILTIN_GATHERSIV4DF);
27419
27420 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27421 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27422 IX86_BUILTIN_GATHERDIV2DF);
27423
27424 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27425 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27426 IX86_BUILTIN_GATHERDIV4DF);
27427
27428 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27429 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27430 IX86_BUILTIN_GATHERSIV4SF);
27431
27432 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27433 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27434 IX86_BUILTIN_GATHERSIV8SF);
27435
27436 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27437 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27438 IX86_BUILTIN_GATHERDIV4SF);
27439
27440 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27441 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27442 IX86_BUILTIN_GATHERDIV8SF);
27443
27444 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27445 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27446 IX86_BUILTIN_GATHERSIV2DI);
27447
27448 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27449 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27450 IX86_BUILTIN_GATHERSIV4DI);
27451
27452 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27453 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27454 IX86_BUILTIN_GATHERDIV2DI);
27455
27456 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27457 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27458 IX86_BUILTIN_GATHERDIV4DI);
27459
27460 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27461 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27462 IX86_BUILTIN_GATHERSIV4SI);
27463
27464 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27465 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27466 IX86_BUILTIN_GATHERSIV8SI);
27467
27468 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27469 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27470 IX86_BUILTIN_GATHERDIV4SI);
27471
27472 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27473 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27474 IX86_BUILTIN_GATHERDIV8SI);
27475
27476 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
27477 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27478 IX86_BUILTIN_GATHERALTSIV4DF);
27479
27480 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
27481 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27482 IX86_BUILTIN_GATHERALTDIV8SF);
27483
27484 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
27485 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27486 IX86_BUILTIN_GATHERALTSIV4DI);
27487
27488 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
27489 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27490 IX86_BUILTIN_GATHERALTDIV8SI);
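  /* Naming note for the gather builtins above: the "siv" variants take a
     vector of 32-bit (SImode) indices and the "div" variants a vector of
     64-bit (DImode) indices; the "alt" forms, whose index vector has a
     different element count than the data vector, exist for the
     vectorizer's use (see ix86_vectorize_builtin_gather).  */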
27491
27492 /* RTM. */
27493 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
27494 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
27495
27496 /* MMX access to the vec_init patterns. */
27497 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27498 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27499
27500 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27501 V4HI_FTYPE_HI_HI_HI_HI,
27502 IX86_BUILTIN_VEC_INIT_V4HI);
27503
27504 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27505 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27506 IX86_BUILTIN_VEC_INIT_V8QI);
27507
27508 /* Access to the vec_extract patterns. */
27509 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27510 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27511 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27512 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27513 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27514 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27515 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27516 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27517 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27518 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27519
27520 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27521 "__builtin_ia32_vec_ext_v4hi",
27522 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27523
27524 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27525 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27526
27527 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27528 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
27529
27530 /* Access to the vec_set patterns. */
27531 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27532 "__builtin_ia32_vec_set_v2di",
27533 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27534
27535 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27536 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27537
27538 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27539 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27540
27541 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27542 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27543
27544 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27545 "__builtin_ia32_vec_set_v4hi",
27546 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27547
27548 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27549 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
27550
27551 /* Add FMA4 multi-arg instructions. */
27552 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27553 {
27554 if (d->name == 0)
27555 continue;
27556
27557 ftype = (enum ix86_builtin_func_type) d->flag;
27558 def_builtin_const (d->mask, d->name, ftype, d->code);
27559 }
27560 }
27561
27562 /* Internal helper for ix86_init_builtins. */
27563
27564 static void
27565 ix86_init_builtins_va_builtins_abi (void)
27566 {
27567 tree ms_va_ref, sysv_va_ref;
27568 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
27569 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
27570 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
27571 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
27572
27573 if (!TARGET_64BIT)
27574 return;
27575 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
27576 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
27577 ms_va_ref = build_reference_type (ms_va_list_type_node);
27578 sysv_va_ref =
27579 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
27580
27581 fnvoid_va_end_ms =
27582 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27583 fnvoid_va_start_ms =
27584 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27585 fnvoid_va_end_sysv =
27586 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
27587 fnvoid_va_start_sysv =
27588 build_varargs_function_type_list (void_type_node, sysv_va_ref,
27589 NULL_TREE);
27590 fnvoid_va_copy_ms =
27591 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
27592 NULL_TREE);
27593 fnvoid_va_copy_sysv =
27594 build_function_type_list (void_type_node, sysv_va_ref,
27595 sysv_va_ref, NULL_TREE);
27596
27597 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
27598 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
27599 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
27600 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
27601 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
27602 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
27603 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
27604 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27605 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
27606 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27607 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
27608 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27609 }
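/* The entries above give explicit __builtin_ms_va_* and __builtin_sysv_va_*
   spellings of va_start/va_end/va_copy, each carrying the matching ABI
   attribute, so 64-bit code can handle variadic arguments for the
   non-default calling convention (illustrative summary, inferred from the
   registrations above).  */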
27610
27611 static void
27612 ix86_init_builtin_types (void)
27613 {
27614 tree float128_type_node, float80_type_node;
27615
27616 /* The __float80 type. */
27617 float80_type_node = long_double_type_node;
27618 if (TYPE_MODE (float80_type_node) != XFmode)
27619 {
27620 /* long double is not XFmode; build a distinct 80-bit type. */
27621 float80_type_node = make_node (REAL_TYPE);
27622
27623 TYPE_PRECISION (float80_type_node) = 80;
27624 layout_type (float80_type_node);
27625 }
27626 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
27627
27628 /* The __float128 type. */
27629 float128_type_node = make_node (REAL_TYPE);
27630 TYPE_PRECISION (float128_type_node) = 128;
27631 layout_type (float128_type_node);
27632 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
27633
27634 /* This macro is built by i386-builtin-types.awk. */
27635 DEFINE_BUILTIN_PRIMITIVE_TYPES;
27636 }
27637
27638 static void
27639 ix86_init_builtins (void)
27640 {
27641 tree t;
27642
27643 ix86_init_builtin_types ();
27644
27645 /* TFmode support builtins. */
27646 def_builtin_const (0, "__builtin_infq",
27647 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
27648 def_builtin_const (0, "__builtin_huge_valq",
27649 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
27650
27651 /* These are expanded to normal calls if SSE2 isn't available,
27652 since they are used by libgcc. */
27653 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
27654 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
27655 BUILT_IN_MD, "__fabstf2", NULL_TREE);
27656 TREE_READONLY (t) = 1;
27657 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
27658
27659 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
27660 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
27661 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
27662 TREE_READONLY (t) = 1;
27663 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
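  /* Usage sketch (illustrative, variables hypothetical): both builtins
     operate on the __float128 type registered above, e.g.

       __float128 a = __builtin_fabsq (x);
       __float128 s = __builtin_copysignq (a, y);

     and, as noted, fall back to the libgcc routines __fabstf2 and
     __copysigntf3 when SSE2 is unavailable.  */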
27664
27665 ix86_init_tm_builtins ();
27666 ix86_init_mmx_sse_builtins ();
27667
27668 if (TARGET_LP64)
27669 ix86_init_builtins_va_builtins_abi ();
27670
27671 #ifdef SUBTARGET_INIT_BUILTINS
27672 SUBTARGET_INIT_BUILTINS;
27673 #endif
27674 }
27675
27676 /* Return the ix86 builtin for CODE. */
27677
27678 static tree
27679 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
27680 {
27681 if (code >= IX86_BUILTIN_MAX)
27682 return error_mark_node;
27683
27684 return ix86_builtins[code];
27685 }
27686
27687 /* Errors in the source file can cause expand_expr to return const0_rtx
27688 where we expect a vector. To avoid crashing, use one of the vector
27689 clear instructions. */
27690 static rtx
27691 safe_vector_operand (rtx x, enum machine_mode mode)
27692 {
27693 if (x == const0_rtx)
27694 x = CONST0_RTX (mode);
27695 return x;
27696 }
27697
27698 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
27699
27700 static rtx
27701 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
27702 {
27703 rtx pat;
27704 tree arg0 = CALL_EXPR_ARG (exp, 0);
27705 tree arg1 = CALL_EXPR_ARG (exp, 1);
27706 rtx op0 = expand_normal (arg0);
27707 rtx op1 = expand_normal (arg1);
27708 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27709 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27710 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
27711
27712 if (VECTOR_MODE_P (mode0))
27713 op0 = safe_vector_operand (op0, mode0);
27714 if (VECTOR_MODE_P (mode1))
27715 op1 = safe_vector_operand (op1, mode1);
27716
27717 if (optimize || !target
27718 || GET_MODE (target) != tmode
27719 || !insn_data[icode].operand[0].predicate (target, tmode))
27720 target = gen_reg_rtx (tmode);
27721
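  /* When the insn expects a 128-bit (TImode) operand but the argument was
     expanded as a 32-bit SImode value, load it into the low element of an
     XMM register (zeroing the rest) and view that register as TImode.  */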
27722 if (GET_MODE (op1) == SImode && mode1 == TImode)
27723 {
27724 rtx x = gen_reg_rtx (V4SImode);
27725 emit_insn (gen_sse2_loadd (x, op1));
27726 op1 = gen_lowpart (TImode, x);
27727 }
27728
27729 if (!insn_data[icode].operand[1].predicate (op0, mode0))
27730 op0 = copy_to_mode_reg (mode0, op0);
27731 if (!insn_data[icode].operand[2].predicate (op1, mode1))
27732 op1 = copy_to_mode_reg (mode1, op1);
27733
27734 pat = GEN_FCN (icode) (target, op0, op1);
27735 if (! pat)
27736 return 0;
27737
27738 emit_insn (pat);
27739
27740 return target;
27741 }
27742
27743 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
27744
27745 static rtx
27746 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
27747 enum ix86_builtin_func_type m_type,
27748 enum rtx_code sub_code)
27749 {
27750 rtx pat;
27751 int i;
27752 int nargs;
27753 bool comparison_p = false;
27754 bool tf_p = false;
27755 bool last_arg_constant = false;
27756 int num_memory = 0;
27757 struct {
27758 rtx op;
27759 enum machine_mode mode;
27760 } args[4];
27761
27762 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27763
27764 switch (m_type)
27765 {
27766 case MULTI_ARG_4_DF2_DI_I:
27767 case MULTI_ARG_4_DF2_DI_I1:
27768 case MULTI_ARG_4_SF2_SI_I:
27769 case MULTI_ARG_4_SF2_SI_I1:
27770 nargs = 4;
27771 last_arg_constant = true;
27772 break;
27773
27774 case MULTI_ARG_3_SF:
27775 case MULTI_ARG_3_DF:
27776 case MULTI_ARG_3_SF2:
27777 case MULTI_ARG_3_DF2:
27778 case MULTI_ARG_3_DI:
27779 case MULTI_ARG_3_SI:
27780 case MULTI_ARG_3_SI_DI:
27781 case MULTI_ARG_3_HI:
27782 case MULTI_ARG_3_HI_SI:
27783 case MULTI_ARG_3_QI:
27784 case MULTI_ARG_3_DI2:
27785 case MULTI_ARG_3_SI2:
27786 case MULTI_ARG_3_HI2:
27787 case MULTI_ARG_3_QI2:
27788 nargs = 3;
27789 break;
27790
27791 case MULTI_ARG_2_SF:
27792 case MULTI_ARG_2_DF:
27793 case MULTI_ARG_2_DI:
27794 case MULTI_ARG_2_SI:
27795 case MULTI_ARG_2_HI:
27796 case MULTI_ARG_2_QI:
27797 nargs = 2;
27798 break;
27799
27800 case MULTI_ARG_2_DI_IMM:
27801 case MULTI_ARG_2_SI_IMM:
27802 case MULTI_ARG_2_HI_IMM:
27803 case MULTI_ARG_2_QI_IMM:
27804 nargs = 2;
27805 last_arg_constant = true;
27806 break;
27807
27808 case MULTI_ARG_1_SF:
27809 case MULTI_ARG_1_DF:
27810 case MULTI_ARG_1_SF2:
27811 case MULTI_ARG_1_DF2:
27812 case MULTI_ARG_1_DI:
27813 case MULTI_ARG_1_SI:
27814 case MULTI_ARG_1_HI:
27815 case MULTI_ARG_1_QI:
27816 case MULTI_ARG_1_SI_DI:
27817 case MULTI_ARG_1_HI_DI:
27818 case MULTI_ARG_1_HI_SI:
27819 case MULTI_ARG_1_QI_DI:
27820 case MULTI_ARG_1_QI_SI:
27821 case MULTI_ARG_1_QI_HI:
27822 nargs = 1;
27823 break;
27824
27825 case MULTI_ARG_2_DI_CMP:
27826 case MULTI_ARG_2_SI_CMP:
27827 case MULTI_ARG_2_HI_CMP:
27828 case MULTI_ARG_2_QI_CMP:
27829 nargs = 2;
27830 comparison_p = true;
27831 break;
27832
27833 case MULTI_ARG_2_SF_TF:
27834 case MULTI_ARG_2_DF_TF:
27835 case MULTI_ARG_2_DI_TF:
27836 case MULTI_ARG_2_SI_TF:
27837 case MULTI_ARG_2_HI_TF:
27838 case MULTI_ARG_2_QI_TF:
27839 nargs = 2;
27840 tf_p = true;
27841 break;
27842
27843 default:
27844 gcc_unreachable ();
27845 }
27846
27847 if (optimize || !target
27848 || GET_MODE (target) != tmode
27849 || !insn_data[icode].operand[0].predicate (target, tmode))
27850 target = gen_reg_rtx (tmode);
27851
27852 gcc_assert (nargs <= 4);
27853
27854 for (i = 0; i < nargs; i++)
27855 {
27856 tree arg = CALL_EXPR_ARG (exp, i);
27857 rtx op = expand_normal (arg);
27858 int adjust = (comparison_p) ? 1 : 0;
27859 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
27860
27861 if (last_arg_constant && i == nargs - 1)
27862 {
27863 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
27864 {
27865 enum insn_code new_icode = icode;
27866 switch (icode)
27867 {
27868 case CODE_FOR_xop_vpermil2v2df3:
27869 case CODE_FOR_xop_vpermil2v4sf3:
27870 case CODE_FOR_xop_vpermil2v4df3:
27871 case CODE_FOR_xop_vpermil2v8sf3:
27872 error ("the last argument must be a 2-bit immediate");
27873 return gen_reg_rtx (tmode);
27874 case CODE_FOR_xop_rotlv2di3:
27875 new_icode = CODE_FOR_rotlv2di3;
27876 goto xop_rotl;
27877 case CODE_FOR_xop_rotlv4si3:
27878 new_icode = CODE_FOR_rotlv4si3;
27879 goto xop_rotl;
27880 case CODE_FOR_xop_rotlv8hi3:
27881 new_icode = CODE_FOR_rotlv8hi3;
27882 goto xop_rotl;
27883 case CODE_FOR_xop_rotlv16qi3:
27884 new_icode = CODE_FOR_rotlv16qi3;
27885 xop_rotl:
27886 if (CONST_INT_P (op))
27887 {
27888 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
27889 op = GEN_INT (INTVAL (op) & mask);
27890 gcc_checking_assert
27891 (insn_data[icode].operand[i + 1].predicate (op, mode));
27892 }
27893 else
27894 {
27895 gcc_checking_assert
27896 (nargs == 2
27897 && insn_data[new_icode].operand[0].mode == tmode
27898 && insn_data[new_icode].operand[1].mode == tmode
27899 && insn_data[new_icode].operand[2].mode == mode
27900 && insn_data[new_icode].operand[0].predicate
27901 == insn_data[icode].operand[0].predicate
27902 && insn_data[new_icode].operand[1].predicate
27903 == insn_data[icode].operand[1].predicate);
27904 icode = new_icode;
27905 goto non_constant;
27906 }
27907 break;
27908 default:
27909 gcc_unreachable ();
27910 }
27911 }
27912 }
27913 else
27914 {
27915 non_constant:
27916 if (VECTOR_MODE_P (mode))
27917 op = safe_vector_operand (op, mode);
27918
27919 /* If we aren't optimizing, only allow one memory operand to be
27920 generated. */
27921 if (memory_operand (op, mode))
27922 num_memory++;
27923
27924 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
27925
27926 if (optimize
27927 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
27928 || num_memory > 1)
27929 op = force_reg (mode, op);
27930 }
27931
27932 args[i].op = op;
27933 args[i].mode = mode;
27934 }
27935
27936 switch (nargs)
27937 {
27938 case 1:
27939 pat = GEN_FCN (icode) (target, args[0].op);
27940 break;
27941
27942 case 2:
27943 if (tf_p)
27944 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
27945 GEN_INT ((int)sub_code));
27946 else if (! comparison_p)
27947 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
27948 else
27949 {
27950 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
27951 args[0].op,
27952 args[1].op);
27953
27954 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
27955 }
27956 break;
27957
27958 case 3:
27959 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
27960 break;
27961
27962 case 4:
27963 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
27964 break;
27965
27966 default:
27967 gcc_unreachable ();
27968 }
27969
27970 if (! pat)
27971 return 0;
27972
27973 emit_insn (pat);
27974 return target;
27975 }
27976
27977 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
27978 insns with vec_merge. */
27979
27980 static rtx
27981 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
27982 rtx target)
27983 {
27984 rtx pat;
27985 tree arg0 = CALL_EXPR_ARG (exp, 0);
27986 rtx op1, op0 = expand_normal (arg0);
27987 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27988 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27989
27990 if (optimize || !target
27991 || GET_MODE (target) != tmode
27992 || !insn_data[icode].operand[0].predicate (target, tmode))
27993 target = gen_reg_rtx (tmode);
27994
27995 if (VECTOR_MODE_P (mode0))
27996 op0 = safe_vector_operand (op0, mode0);
27997
27998 if ((optimize && !register_operand (op0, mode0))
27999 || !insn_data[icode].operand[1].predicate (op0, mode0))
28000 op0 = copy_to_mode_reg (mode0, op0);
28001
28002 op1 = op0;
28003 if (!insn_data[icode].operand[2].predicate (op1, mode0))
28004 op1 = copy_to_mode_reg (mode0, op1);
28005
28006 pat = GEN_FCN (icode) (target, op0, op1);
28007 if (! pat)
28008 return 0;
28009 emit_insn (pat);
28010 return target;
28011 }
28012
28013 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
28014
28015 static rtx
28016 ix86_expand_sse_compare (const struct builtin_description *d,
28017 tree exp, rtx target, bool swap)
28018 {
28019 rtx pat;
28020 tree arg0 = CALL_EXPR_ARG (exp, 0);
28021 tree arg1 = CALL_EXPR_ARG (exp, 1);
28022 rtx op0 = expand_normal (arg0);
28023 rtx op1 = expand_normal (arg1);
28024 rtx op2;
28025 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28026 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28027 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28028 enum rtx_code comparison = d->comparison;
28029
28030 if (VECTOR_MODE_P (mode0))
28031 op0 = safe_vector_operand (op0, mode0);
28032 if (VECTOR_MODE_P (mode1))
28033 op1 = safe_vector_operand (op1, mode1);
28034
28035 /* Swap operands if we have a comparison that isn't available in
28036 hardware. */
28037 if (swap)
28038 {
28039 rtx tmp = gen_reg_rtx (mode1);
28040 emit_move_insn (tmp, op1);
28041 op1 = op0;
28042 op0 = tmp;
28043 }
28044
28045 if (optimize || !target
28046 || GET_MODE (target) != tmode
28047 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28048 target = gen_reg_rtx (tmode);
28049
28050 if ((optimize && !register_operand (op0, mode0))
28051 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
28052 op0 = copy_to_mode_reg (mode0, op0);
28053 if ((optimize && !register_operand (op1, mode1))
28054 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
28055 op1 = copy_to_mode_reg (mode1, op1);
28056
28057 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
28058 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28059 if (! pat)
28060 return 0;
28061 emit_insn (pat);
28062 return target;
28063 }
28064
28065 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
28066
28067 static rtx
28068 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
28069 rtx target)
28070 {
28071 rtx pat;
28072 tree arg0 = CALL_EXPR_ARG (exp, 0);
28073 tree arg1 = CALL_EXPR_ARG (exp, 1);
28074 rtx op0 = expand_normal (arg0);
28075 rtx op1 = expand_normal (arg1);
28076 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28077 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28078 enum rtx_code comparison = d->comparison;
28079
28080 if (VECTOR_MODE_P (mode0))
28081 op0 = safe_vector_operand (op0, mode0);
28082 if (VECTOR_MODE_P (mode1))
28083 op1 = safe_vector_operand (op1, mode1);
28084
28085 /* Swap operands if we have a comparison that isn't available in
28086 hardware. */
28087 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
28088 {
28089 rtx tmp = op1;
28090 op1 = op0;
28091 op0 = tmp;
28092 }
28093
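  /* The result is built in an SImode pseudo that is first cleared to zero;
     only its low QImode part is then set from the flags comparison below,
     so the builtin returns exactly 0 or 1.  */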
28094 target = gen_reg_rtx (SImode);
28095 emit_move_insn (target, const0_rtx);
28096 target = gen_rtx_SUBREG (QImode, target, 0);
28097
28098 if ((optimize && !register_operand (op0, mode0))
28099 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28100 op0 = copy_to_mode_reg (mode0, op0);
28101 if ((optimize && !register_operand (op1, mode1))
28102 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28103 op1 = copy_to_mode_reg (mode1, op1);
28104
28105 pat = GEN_FCN (d->icode) (op0, op1);
28106 if (! pat)
28107 return 0;
28108 emit_insn (pat);
28109 emit_insn (gen_rtx_SET (VOIDmode,
28110 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28111 gen_rtx_fmt_ee (comparison, QImode,
28112 SET_DEST (pat),
28113 const0_rtx)));
28114
28115 return SUBREG_REG (target);
28116 }
28117
28118 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
28119
28120 static rtx
28121 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
28122 rtx target)
28123 {
28124 rtx pat;
28125 tree arg0 = CALL_EXPR_ARG (exp, 0);
28126 rtx op1, op0 = expand_normal (arg0);
28127 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28128 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28129
28130 if (optimize || target == 0
28131 || GET_MODE (target) != tmode
28132 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28133 target = gen_reg_rtx (tmode);
28134
28135 if (VECTOR_MODE_P (mode0))
28136 op0 = safe_vector_operand (op0, mode0);
28137
28138 if ((optimize && !register_operand (op0, mode0))
28139 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28140 op0 = copy_to_mode_reg (mode0, op0);
28141
28142 op1 = GEN_INT (d->comparison);
28143
28144 pat = GEN_FCN (d->icode) (target, op0, op1);
28145 if (! pat)
28146 return 0;
28147 emit_insn (pat);
28148 return target;
28149 }
28150
28151 static rtx
28152 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28153 tree exp, rtx target)
28154 {
28155 rtx pat;
28156 tree arg0 = CALL_EXPR_ARG (exp, 0);
28157 tree arg1 = CALL_EXPR_ARG (exp, 1);
28158 rtx op0 = expand_normal (arg0);
28159 rtx op1 = expand_normal (arg1);
28160 rtx op2;
28161 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28162 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28163 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28164
28165 if (optimize || target == 0
28166 || GET_MODE (target) != tmode
28167 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28168 target = gen_reg_rtx (tmode);
28169
28170 op0 = safe_vector_operand (op0, mode0);
28171 op1 = safe_vector_operand (op1, mode1);
28172
28173 if ((optimize && !register_operand (op0, mode0))
28174 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28175 op0 = copy_to_mode_reg (mode0, op0);
28176 if ((optimize && !register_operand (op1, mode1))
28177 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28178 op1 = copy_to_mode_reg (mode1, op1);
28179
28180 op2 = GEN_INT (d->comparison);
28181
28182 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28183 if (! pat)
28184 return 0;
28185 emit_insn (pat);
28186 return target;
28187 }
28188
28189 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
28190
28191 static rtx
28192 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28193 rtx target)
28194 {
28195 rtx pat;
28196 tree arg0 = CALL_EXPR_ARG (exp, 0);
28197 tree arg1 = CALL_EXPR_ARG (exp, 1);
28198 rtx op0 = expand_normal (arg0);
28199 rtx op1 = expand_normal (arg1);
28200 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28201 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28202 enum rtx_code comparison = d->comparison;
28203
28204 if (VECTOR_MODE_P (mode0))
28205 op0 = safe_vector_operand (op0, mode0);
28206 if (VECTOR_MODE_P (mode1))
28207 op1 = safe_vector_operand (op1, mode1);
28208
28209 target = gen_reg_rtx (SImode);
28210 emit_move_insn (target, const0_rtx);
28211 target = gen_rtx_SUBREG (QImode, target, 0);
28212
28213 if ((optimize && !register_operand (op0, mode0))
28214 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28215 op0 = copy_to_mode_reg (mode0, op0);
28216 if ((optimize && !register_operand (op1, mode1))
28217 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28218 op1 = copy_to_mode_reg (mode1, op1);
28219
28220 pat = GEN_FCN (d->icode) (op0, op1);
28221 if (! pat)
28222 return 0;
28223 emit_insn (pat);
28224 emit_insn (gen_rtx_SET (VOIDmode,
28225 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28226 gen_rtx_fmt_ee (comparison, QImode,
28227 SET_DEST (pat),
28228 const0_rtx)));
28229
28230 return SUBREG_REG (target);
28231 }
28232
28233 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
28234
28235 static rtx
28236 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28237 tree exp, rtx target)
28238 {
28239 rtx pat;
28240 tree arg0 = CALL_EXPR_ARG (exp, 0);
28241 tree arg1 = CALL_EXPR_ARG (exp, 1);
28242 tree arg2 = CALL_EXPR_ARG (exp, 2);
28243 tree arg3 = CALL_EXPR_ARG (exp, 3);
28244 tree arg4 = CALL_EXPR_ARG (exp, 4);
28245 rtx scratch0, scratch1;
28246 rtx op0 = expand_normal (arg0);
28247 rtx op1 = expand_normal (arg1);
28248 rtx op2 = expand_normal (arg2);
28249 rtx op3 = expand_normal (arg3);
28250 rtx op4 = expand_normal (arg4);
28251 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28252
28253 tmode0 = insn_data[d->icode].operand[0].mode;
28254 tmode1 = insn_data[d->icode].operand[1].mode;
28255 modev2 = insn_data[d->icode].operand[2].mode;
28256 modei3 = insn_data[d->icode].operand[3].mode;
28257 modev4 = insn_data[d->icode].operand[4].mode;
28258 modei5 = insn_data[d->icode].operand[5].mode;
28259 modeimm = insn_data[d->icode].operand[6].mode;
28260
28261 if (VECTOR_MODE_P (modev2))
28262 op0 = safe_vector_operand (op0, modev2);
28263 if (VECTOR_MODE_P (modev4))
28264 op2 = safe_vector_operand (op2, modev4);
28265
28266 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28267 op0 = copy_to_mode_reg (modev2, op0);
28268 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28269 op1 = copy_to_mode_reg (modei3, op1);
28270 if ((optimize && !register_operand (op2, modev4))
28271 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28272 op2 = copy_to_mode_reg (modev4, op2);
28273 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28274 op3 = copy_to_mode_reg (modei5, op3);
28275
28276 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28277 {
28278 error ("the fifth argument must be an 8-bit immediate");
28279 return const0_rtx;
28280 }
28281
28282 if (d->code == IX86_BUILTIN_PCMPESTRI128)
28283 {
28284 if (optimize || !target
28285 || GET_MODE (target) != tmode0
28286 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28287 target = gen_reg_rtx (tmode0);
28288
28289 scratch1 = gen_reg_rtx (tmode1);
28290
28291 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
28292 }
28293 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
28294 {
28295 if (optimize || !target
28296 || GET_MODE (target) != tmode1
28297 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28298 target = gen_reg_rtx (tmode1);
28299
28300 scratch0 = gen_reg_rtx (tmode0);
28301
28302 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28303 }
28304 else
28305 {
28306 gcc_assert (d->flag);
28307
28308 scratch0 = gen_reg_rtx (tmode0);
28309 scratch1 = gen_reg_rtx (tmode1);
28310
28311 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28312 }
28313
28314 if (! pat)
28315 return 0;
28316
28317 emit_insn (pat);
28318
28319 if (d->flag)
28320 {
28321 target = gen_reg_rtx (SImode);
28322 emit_move_insn (target, const0_rtx);
28323 target = gen_rtx_SUBREG (QImode, target, 0);
28324
28325 emit_insn
28326 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28327 gen_rtx_fmt_ee (EQ, QImode,
28328 gen_rtx_REG ((enum machine_mode) d->flag,
28329 FLAGS_REG),
28330 const0_rtx)));
28331 return SUBREG_REG (target);
28332 }
28333 else
28334 return target;
28335 }
28336
28337
28338 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
28339
28340 static rtx
28341 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28342 tree exp, rtx target)
28343 {
28344 rtx pat;
28345 tree arg0 = CALL_EXPR_ARG (exp, 0);
28346 tree arg1 = CALL_EXPR_ARG (exp, 1);
28347 tree arg2 = CALL_EXPR_ARG (exp, 2);
28348 rtx scratch0, scratch1;
28349 rtx op0 = expand_normal (arg0);
28350 rtx op1 = expand_normal (arg1);
28351 rtx op2 = expand_normal (arg2);
28352 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28353
28354 tmode0 = insn_data[d->icode].operand[0].mode;
28355 tmode1 = insn_data[d->icode].operand[1].mode;
28356 modev2 = insn_data[d->icode].operand[2].mode;
28357 modev3 = insn_data[d->icode].operand[3].mode;
28358 modeimm = insn_data[d->icode].operand[4].mode;
28359
28360 if (VECTOR_MODE_P (modev2))
28361 op0 = safe_vector_operand (op0, modev2);
28362 if (VECTOR_MODE_P (modev3))
28363 op1 = safe_vector_operand (op1, modev3);
28364
28365 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28366 op0 = copy_to_mode_reg (modev2, op0);
28367 if ((optimize && !register_operand (op1, modev3))
28368 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28369 op1 = copy_to_mode_reg (modev3, op1);
28370
28371 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28372 {
28373 error ("the third argument must be an 8-bit immediate");
28374 return const0_rtx;
28375 }
28376
28377 if (d->code == IX86_BUILTIN_PCMPISTRI128)
28378 {
28379 if (optimize || !target
28380 || GET_MODE (target) != tmode0
28381 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28382 target = gen_reg_rtx (tmode0);
28383
28384 scratch1 = gen_reg_rtx (tmode1);
28385
28386 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28387 }
28388 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28389 {
28390 if (optimize || !target
28391 || GET_MODE (target) != tmode1
28392 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28393 target = gen_reg_rtx (tmode1);
28394
28395 scratch0 = gen_reg_rtx (tmode0);
28396
28397 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28398 }
28399 else
28400 {
28401 gcc_assert (d->flag);
28402
28403 scratch0 = gen_reg_rtx (tmode0);
28404 scratch1 = gen_reg_rtx (tmode1);
28405
28406 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28407 }
28408
28409 if (! pat)
28410 return 0;
28411
28412 emit_insn (pat);
28413
28414 if (d->flag)
28415 {
28416 target = gen_reg_rtx (SImode);
28417 emit_move_insn (target, const0_rtx);
28418 target = gen_rtx_SUBREG (QImode, target, 0);
28419
28420 emit_insn
28421 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28422 gen_rtx_fmt_ee (EQ, QImode,
28423 gen_rtx_REG ((enum machine_mode) d->flag,
28424 FLAGS_REG),
28425 const0_rtx)));
28426 return SUBREG_REG (target);
28427 }
28428 else
28429 return target;
28430 }
28431
28432 /* Subroutine of ix86_expand_builtin to take care of insns with
28433 a variable number of operands. */
28434
28435 static rtx
28436 ix86_expand_args_builtin (const struct builtin_description *d,
28437 tree exp, rtx target)
28438 {
28439 rtx pat, real_target;
28440 unsigned int i, nargs;
28441 unsigned int nargs_constant = 0;
28442 int num_memory = 0;
28443 struct
28444 {
28445 rtx op;
28446 enum machine_mode mode;
28447 } args[4];
28448 bool last_arg_count = false;
28449 enum insn_code icode = d->icode;
28450 const struct insn_data_d *insn_p = &insn_data[icode];
28451 enum machine_mode tmode = insn_p->operand[0].mode;
28452 enum machine_mode rmode = VOIDmode;
28453 bool swap = false;
28454 enum rtx_code comparison = d->comparison;
28455
28456 switch ((enum ix86_builtin_func_type) d->flag)
28457 {
28458 case V2DF_FTYPE_V2DF_ROUND:
28459 case V4DF_FTYPE_V4DF_ROUND:
28460 case V4SF_FTYPE_V4SF_ROUND:
28461 case V8SF_FTYPE_V8SF_ROUND:
28462 case V4SI_FTYPE_V4SF_ROUND:
28463 case V8SI_FTYPE_V8SF_ROUND:
28464 return ix86_expand_sse_round (d, exp, target);
28465 case V4SI_FTYPE_V2DF_V2DF_ROUND:
28466 case V8SI_FTYPE_V4DF_V4DF_ROUND:
28467 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
28468 case INT_FTYPE_V8SF_V8SF_PTEST:
28469 case INT_FTYPE_V4DI_V4DI_PTEST:
28470 case INT_FTYPE_V4DF_V4DF_PTEST:
28471 case INT_FTYPE_V4SF_V4SF_PTEST:
28472 case INT_FTYPE_V2DI_V2DI_PTEST:
28473 case INT_FTYPE_V2DF_V2DF_PTEST:
28474 return ix86_expand_sse_ptest (d, exp, target);
28475 case FLOAT128_FTYPE_FLOAT128:
28476 case FLOAT_FTYPE_FLOAT:
28477 case INT_FTYPE_INT:
28478 case UINT64_FTYPE_INT:
28479 case UINT16_FTYPE_UINT16:
28480 case INT64_FTYPE_INT64:
28481 case INT64_FTYPE_V4SF:
28482 case INT64_FTYPE_V2DF:
28483 case INT_FTYPE_V16QI:
28484 case INT_FTYPE_V8QI:
28485 case INT_FTYPE_V8SF:
28486 case INT_FTYPE_V4DF:
28487 case INT_FTYPE_V4SF:
28488 case INT_FTYPE_V2DF:
28489 case INT_FTYPE_V32QI:
28490 case V16QI_FTYPE_V16QI:
28491 case V8SI_FTYPE_V8SF:
28492 case V8SI_FTYPE_V4SI:
28493 case V8HI_FTYPE_V8HI:
28494 case V8HI_FTYPE_V16QI:
28495 case V8QI_FTYPE_V8QI:
28496 case V8SF_FTYPE_V8SF:
28497 case V8SF_FTYPE_V8SI:
28498 case V8SF_FTYPE_V4SF:
28499 case V8SF_FTYPE_V8HI:
28500 case V4SI_FTYPE_V4SI:
28501 case V4SI_FTYPE_V16QI:
28502 case V4SI_FTYPE_V4SF:
28503 case V4SI_FTYPE_V8SI:
28504 case V4SI_FTYPE_V8HI:
28505 case V4SI_FTYPE_V4DF:
28506 case V4SI_FTYPE_V2DF:
28507 case V4HI_FTYPE_V4HI:
28508 case V4DF_FTYPE_V4DF:
28509 case V4DF_FTYPE_V4SI:
28510 case V4DF_FTYPE_V4SF:
28511 case V4DF_FTYPE_V2DF:
28512 case V4SF_FTYPE_V4SF:
28513 case V4SF_FTYPE_V4SI:
28514 case V4SF_FTYPE_V8SF:
28515 case V4SF_FTYPE_V4DF:
28516 case V4SF_FTYPE_V8HI:
28517 case V4SF_FTYPE_V2DF:
28518 case V2DI_FTYPE_V2DI:
28519 case V2DI_FTYPE_V16QI:
28520 case V2DI_FTYPE_V8HI:
28521 case V2DI_FTYPE_V4SI:
28522 case V2DF_FTYPE_V2DF:
28523 case V2DF_FTYPE_V4SI:
28524 case V2DF_FTYPE_V4DF:
28525 case V2DF_FTYPE_V4SF:
28526 case V2DF_FTYPE_V2SI:
28527 case V2SI_FTYPE_V2SI:
28528 case V2SI_FTYPE_V4SF:
28529 case V2SI_FTYPE_V2SF:
28530 case V2SI_FTYPE_V2DF:
28531 case V2SF_FTYPE_V2SF:
28532 case V2SF_FTYPE_V2SI:
28533 case V32QI_FTYPE_V32QI:
28534 case V32QI_FTYPE_V16QI:
28535 case V16HI_FTYPE_V16HI:
28536 case V16HI_FTYPE_V8HI:
28537 case V8SI_FTYPE_V8SI:
28538 case V16HI_FTYPE_V16QI:
28539 case V8SI_FTYPE_V16QI:
28540 case V4DI_FTYPE_V16QI:
28541 case V8SI_FTYPE_V8HI:
28542 case V4DI_FTYPE_V8HI:
28543 case V4DI_FTYPE_V4SI:
28544 case V4DI_FTYPE_V2DI:
28545 nargs = 1;
28546 break;
28547 case V4SF_FTYPE_V4SF_VEC_MERGE:
28548 case V2DF_FTYPE_V2DF_VEC_MERGE:
28549 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
28550 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
28551 case V16QI_FTYPE_V16QI_V16QI:
28552 case V16QI_FTYPE_V8HI_V8HI:
28553 case V8QI_FTYPE_V8QI_V8QI:
28554 case V8QI_FTYPE_V4HI_V4HI:
28555 case V8HI_FTYPE_V8HI_V8HI:
28556 case V8HI_FTYPE_V16QI_V16QI:
28557 case V8HI_FTYPE_V4SI_V4SI:
28558 case V8SF_FTYPE_V8SF_V8SF:
28559 case V8SF_FTYPE_V8SF_V8SI:
28560 case V4SI_FTYPE_V4SI_V4SI:
28561 case V4SI_FTYPE_V8HI_V8HI:
28562 case V4SI_FTYPE_V4SF_V4SF:
28563 case V4SI_FTYPE_V2DF_V2DF:
28564 case V4HI_FTYPE_V4HI_V4HI:
28565 case V4HI_FTYPE_V8QI_V8QI:
28566 case V4HI_FTYPE_V2SI_V2SI:
28567 case V4DF_FTYPE_V4DF_V4DF:
28568 case V4DF_FTYPE_V4DF_V4DI:
28569 case V4SF_FTYPE_V4SF_V4SF:
28570 case V4SF_FTYPE_V4SF_V4SI:
28571 case V4SF_FTYPE_V4SF_V2SI:
28572 case V4SF_FTYPE_V4SF_V2DF:
28573 case V4SF_FTYPE_V4SF_DI:
28574 case V4SF_FTYPE_V4SF_SI:
28575 case V2DI_FTYPE_V2DI_V2DI:
28576 case V2DI_FTYPE_V16QI_V16QI:
28577 case V2DI_FTYPE_V4SI_V4SI:
28578 case V2DI_FTYPE_V2DI_V16QI:
28579 case V2DI_FTYPE_V2DF_V2DF:
28580 case V2SI_FTYPE_V2SI_V2SI:
28581 case V2SI_FTYPE_V4HI_V4HI:
28582 case V2SI_FTYPE_V2SF_V2SF:
28583 case V2DF_FTYPE_V2DF_V2DF:
28584 case V2DF_FTYPE_V2DF_V4SF:
28585 case V2DF_FTYPE_V2DF_V2DI:
28586 case V2DF_FTYPE_V2DF_DI:
28587 case V2DF_FTYPE_V2DF_SI:
28588 case V2SF_FTYPE_V2SF_V2SF:
28589 case V1DI_FTYPE_V1DI_V1DI:
28590 case V1DI_FTYPE_V8QI_V8QI:
28591 case V1DI_FTYPE_V2SI_V2SI:
28592 case V32QI_FTYPE_V16HI_V16HI:
28593 case V16HI_FTYPE_V8SI_V8SI:
28594 case V32QI_FTYPE_V32QI_V32QI:
28595 case V16HI_FTYPE_V32QI_V32QI:
28596 case V16HI_FTYPE_V16HI_V16HI:
28597 case V8SI_FTYPE_V4DF_V4DF:
28598 case V8SI_FTYPE_V8SI_V8SI:
28599 case V8SI_FTYPE_V16HI_V16HI:
28600 case V4DI_FTYPE_V4DI_V4DI:
28601 case V4DI_FTYPE_V8SI_V8SI:
28602 if (comparison == UNKNOWN)
28603 return ix86_expand_binop_builtin (icode, exp, target);
28604 nargs = 2;
28605 break;
28606 case V4SF_FTYPE_V4SF_V4SF_SWAP:
28607 case V2DF_FTYPE_V2DF_V2DF_SWAP:
28608 gcc_assert (comparison != UNKNOWN);
28609 nargs = 2;
28610 swap = true;
28611 break;
28612 case V16HI_FTYPE_V16HI_V8HI_COUNT:
28613 case V16HI_FTYPE_V16HI_SI_COUNT:
28614 case V8SI_FTYPE_V8SI_V4SI_COUNT:
28615 case V8SI_FTYPE_V8SI_SI_COUNT:
28616 case V4DI_FTYPE_V4DI_V2DI_COUNT:
28617 case V4DI_FTYPE_V4DI_INT_COUNT:
28618 case V8HI_FTYPE_V8HI_V8HI_COUNT:
28619 case V8HI_FTYPE_V8HI_SI_COUNT:
28620 case V4SI_FTYPE_V4SI_V4SI_COUNT:
28621 case V4SI_FTYPE_V4SI_SI_COUNT:
28622 case V4HI_FTYPE_V4HI_V4HI_COUNT:
28623 case V4HI_FTYPE_V4HI_SI_COUNT:
28624 case V2DI_FTYPE_V2DI_V2DI_COUNT:
28625 case V2DI_FTYPE_V2DI_SI_COUNT:
28626 case V2SI_FTYPE_V2SI_V2SI_COUNT:
28627 case V2SI_FTYPE_V2SI_SI_COUNT:
28628 case V1DI_FTYPE_V1DI_V1DI_COUNT:
28629 case V1DI_FTYPE_V1DI_SI_COUNT:
28630 nargs = 2;
28631 last_arg_count = true;
28632 break;
28633 case UINT64_FTYPE_UINT64_UINT64:
28634 case UINT_FTYPE_UINT_UINT:
28635 case UINT_FTYPE_UINT_USHORT:
28636 case UINT_FTYPE_UINT_UCHAR:
28637 case UINT16_FTYPE_UINT16_INT:
28638 case UINT8_FTYPE_UINT8_INT:
28639 nargs = 2;
28640 break;
28641 case V2DI_FTYPE_V2DI_INT_CONVERT:
28642 nargs = 2;
28643 rmode = V1TImode;
28644 nargs_constant = 1;
28645 break;
28646 case V4DI_FTYPE_V4DI_INT_CONVERT:
28647 nargs = 2;
28648 rmode = V2TImode;
28649 nargs_constant = 1;
28650 break;
28651 case V8HI_FTYPE_V8HI_INT:
28652 case V8HI_FTYPE_V8SF_INT:
28653 case V8HI_FTYPE_V4SF_INT:
28654 case V8SF_FTYPE_V8SF_INT:
28655 case V4SI_FTYPE_V4SI_INT:
28656 case V4SI_FTYPE_V8SI_INT:
28657 case V4HI_FTYPE_V4HI_INT:
28658 case V4DF_FTYPE_V4DF_INT:
28659 case V4SF_FTYPE_V4SF_INT:
28660 case V4SF_FTYPE_V8SF_INT:
28661 case V2DI_FTYPE_V2DI_INT:
28662 case V2DF_FTYPE_V2DF_INT:
28663 case V2DF_FTYPE_V4DF_INT:
28664 case V16HI_FTYPE_V16HI_INT:
28665 case V8SI_FTYPE_V8SI_INT:
28666 case V4DI_FTYPE_V4DI_INT:
28667 case V2DI_FTYPE_V4DI_INT:
28668 nargs = 2;
28669 nargs_constant = 1;
28670 break;
28671 case V16QI_FTYPE_V16QI_V16QI_V16QI:
28672 case V8SF_FTYPE_V8SF_V8SF_V8SF:
28673 case V4DF_FTYPE_V4DF_V4DF_V4DF:
28674 case V4SF_FTYPE_V4SF_V4SF_V4SF:
28675 case V2DF_FTYPE_V2DF_V2DF_V2DF:
28676 case V32QI_FTYPE_V32QI_V32QI_V32QI:
28677 nargs = 3;
28678 break;
28679 case V32QI_FTYPE_V32QI_V32QI_INT:
28680 case V16HI_FTYPE_V16HI_V16HI_INT:
28681 case V16QI_FTYPE_V16QI_V16QI_INT:
28682 case V4DI_FTYPE_V4DI_V4DI_INT:
28683 case V8HI_FTYPE_V8HI_V8HI_INT:
28684 case V8SI_FTYPE_V8SI_V8SI_INT:
28685 case V8SI_FTYPE_V8SI_V4SI_INT:
28686 case V8SF_FTYPE_V8SF_V8SF_INT:
28687 case V8SF_FTYPE_V8SF_V4SF_INT:
28688 case V4SI_FTYPE_V4SI_V4SI_INT:
28689 case V4DF_FTYPE_V4DF_V4DF_INT:
28690 case V4DF_FTYPE_V4DF_V2DF_INT:
28691 case V4SF_FTYPE_V4SF_V4SF_INT:
28692 case V2DI_FTYPE_V2DI_V2DI_INT:
28693 case V4DI_FTYPE_V4DI_V2DI_INT:
28694 case V2DF_FTYPE_V2DF_V2DF_INT:
28695 nargs = 3;
28696 nargs_constant = 1;
28697 break;
28698 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
28699 nargs = 3;
28700 rmode = V4DImode;
28701 nargs_constant = 1;
28702 break;
28703 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
28704 nargs = 3;
28705 rmode = V2DImode;
28706 nargs_constant = 1;
28707 break;
28708 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
28709 nargs = 3;
28710 rmode = DImode;
28711 nargs_constant = 1;
28712 break;
28713 case V2DI_FTYPE_V2DI_UINT_UINT:
28714 nargs = 3;
28715 nargs_constant = 2;
28716 break;
28717 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
28718 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
28719 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
28720 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
28721 nargs = 4;
28722 nargs_constant = 1;
28723 break;
28724 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
28725 nargs = 4;
28726 nargs_constant = 2;
28727 break;
28728 default:
28729 gcc_unreachable ();
28730 }
28731
28732 gcc_assert (nargs <= ARRAY_SIZE (args));
28733
28734 if (comparison != UNKNOWN)
28735 {
28736 gcc_assert (nargs == 2);
28737 return ix86_expand_sse_compare (d, exp, target, swap);
28738 }
28739
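  /* For the *_CONVERT cases the insn produces its value in TMODE while the
     builtin is declared to return RMODE; allocate the result pseudo in
     RMODE and hand the insn a TMODE subreg (at offset 0) of it.  */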
28740 if (rmode == VOIDmode || rmode == tmode)
28741 {
28742 if (optimize
28743 || target == 0
28744 || GET_MODE (target) != tmode
28745 || !insn_p->operand[0].predicate (target, tmode))
28746 target = gen_reg_rtx (tmode);
28747 real_target = target;
28748 }
28749 else
28750 {
28751 target = gen_reg_rtx (rmode);
28752 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
28753 }
28754
28755 for (i = 0; i < nargs; i++)
28756 {
28757 tree arg = CALL_EXPR_ARG (exp, i);
28758 rtx op = expand_normal (arg);
28759 enum machine_mode mode = insn_p->operand[i + 1].mode;
28760 bool match = insn_p->operand[i + 1].predicate (op, mode);
28761
28762 if (last_arg_count && (i + 1) == nargs)
28763 {
28764 /* SIMD shift insns take either an 8-bit immediate or a register
28765 as the count, but the builtin functions take an int. If the
28766 count operand doesn't match, put it in a register. */
28767 if (!match)
28768 {
28769 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
28770 if (!insn_p->operand[i + 1].predicate (op, mode))
28771 op = copy_to_reg (op);
28772 }
28773 }
28774 else if ((nargs - i) <= nargs_constant)
28775 {
28776 if (!match)
28777 switch (icode)
28778 {
28779 case CODE_FOR_avx2_inserti128:
28780 case CODE_FOR_avx2_extracti128:
28781 error ("the last argument must be an 1-bit immediate");
28782 return const0_rtx;
28783
28784 case CODE_FOR_sse4_1_roundsd:
28785 case CODE_FOR_sse4_1_roundss:
28786
28787 case CODE_FOR_sse4_1_roundpd:
28788 case CODE_FOR_sse4_1_roundps:
28789 case CODE_FOR_avx_roundpd256:
28790 case CODE_FOR_avx_roundps256:
28791
28792 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
28793 case CODE_FOR_sse4_1_roundps_sfix:
28794 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
28795 case CODE_FOR_avx_roundps_sfix256:
28796
28797 case CODE_FOR_sse4_1_blendps:
28798 case CODE_FOR_avx_blendpd256:
28799 case CODE_FOR_avx_vpermilv4df:
28800 error ("the last argument must be a 4-bit immediate");
28801 return const0_rtx;
28802
28803 case CODE_FOR_sse4_1_blendpd:
28804 case CODE_FOR_avx_vpermilv2df:
28805 case CODE_FOR_xop_vpermil2v2df3:
28806 case CODE_FOR_xop_vpermil2v4sf3:
28807 case CODE_FOR_xop_vpermil2v4df3:
28808 case CODE_FOR_xop_vpermil2v8sf3:
28809 error ("the last argument must be a 2-bit immediate");
28810 return const0_rtx;
28811
28812 case CODE_FOR_avx_vextractf128v4df:
28813 case CODE_FOR_avx_vextractf128v8sf:
28814 case CODE_FOR_avx_vextractf128v8si:
28815 case CODE_FOR_avx_vinsertf128v4df:
28816 case CODE_FOR_avx_vinsertf128v8sf:
28817 case CODE_FOR_avx_vinsertf128v8si:
28818 error ("the last argument must be a 1-bit immediate");
28819 return const0_rtx;
28820
28821 case CODE_FOR_avx_vmcmpv2df3:
28822 case CODE_FOR_avx_vmcmpv4sf3:
28823 case CODE_FOR_avx_cmpv2df3:
28824 case CODE_FOR_avx_cmpv4sf3:
28825 case CODE_FOR_avx_cmpv4df3:
28826 case CODE_FOR_avx_cmpv8sf3:
28827 error ("the last argument must be a 5-bit immediate");
28828 return const0_rtx;
28829
28830 default:
28831 switch (nargs_constant)
28832 {
28833 case 2:
28834 if ((nargs - i) == nargs_constant)
28835 {
28836 error ("the next to last argument must be an 8-bit immediate");
28837 break;
28838 }
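      /* FALLTHRU */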
28839 case 1:
28840 error ("the last argument must be an 8-bit immediate");
28841 break;
28842 default:
28843 gcc_unreachable ();
28844 }
28845 return const0_rtx;
28846 }
28847 }
28848 else
28849 {
28850 if (VECTOR_MODE_P (mode))
28851 op = safe_vector_operand (op, mode);
28852
28853 /* If we aren't optimizing, only allow one memory operand to
28854 be generated. */
28855 if (memory_operand (op, mode))
28856 num_memory++;
28857
28858 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
28859 {
28860 if (optimize || !match || num_memory > 1)
28861 op = copy_to_mode_reg (mode, op);
28862 }
28863 else
28864 {
28865 op = copy_to_reg (op);
28866 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
28867 }
28868 }
28869
28870 args[i].op = op;
28871 args[i].mode = mode;
28872 }
28873
28874 switch (nargs)
28875 {
28876 case 1:
28877 pat = GEN_FCN (icode) (real_target, args[0].op);
28878 break;
28879 case 2:
28880 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
28881 break;
28882 case 3:
28883 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28884 args[2].op);
28885 break;
28886 case 4:
28887 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28888 args[2].op, args[3].op);
28889 break;
28890 default:
28891 gcc_unreachable ();
28892 }
28893
28894 if (! pat)
28895 return 0;
28896
28897 emit_insn (pat);
28898 return target;
28899 }
28900
28901 /* Subroutine of ix86_expand_builtin to take care of special insns
28902 with a variable number of operands. */
28903
28904 static rtx
28905 ix86_expand_special_args_builtin (const struct builtin_description *d,
28906 tree exp, rtx target)
28907 {
28908 tree arg;
28909 rtx pat, op;
28910 unsigned int i, nargs, arg_adjust, memory;
28911 struct
28912 {
28913 rtx op;
28914 enum machine_mode mode;
28915 } args[3];
28916 enum insn_code icode = d->icode;
28917 bool last_arg_constant = false;
28918 const struct insn_data_d *insn_p = &insn_data[icode];
28919 enum machine_mode tmode = insn_p->operand[0].mode;
28920 enum { load, store } klass;
28921
28922 switch ((enum ix86_builtin_func_type) d->flag)
28923 {
28924 case VOID_FTYPE_VOID:
28925 if (icode == CODE_FOR_avx_vzeroupper)
28926 target = GEN_INT (vzeroupper_intrinsic);
28927 emit_insn (GEN_FCN (icode) (target));
28928 return 0;
28929 case VOID_FTYPE_UINT64:
28930 case VOID_FTYPE_UNSIGNED:
28931 nargs = 0;
28932 klass = store;
28933 memory = 0;
28934 break;
28935
28936 case INT_FTYPE_VOID:
28937 case UINT64_FTYPE_VOID:
28938 case UNSIGNED_FTYPE_VOID:
28939 nargs = 0;
28940 klass = load;
28941 memory = 0;
28942 break;
28943 case UINT64_FTYPE_PUNSIGNED:
28944 case V2DI_FTYPE_PV2DI:
28945 case V4DI_FTYPE_PV4DI:
28946 case V32QI_FTYPE_PCCHAR:
28947 case V16QI_FTYPE_PCCHAR:
28948 case V8SF_FTYPE_PCV4SF:
28949 case V8SF_FTYPE_PCFLOAT:
28950 case V4SF_FTYPE_PCFLOAT:
28951 case V4DF_FTYPE_PCV2DF:
28952 case V4DF_FTYPE_PCDOUBLE:
28953 case V2DF_FTYPE_PCDOUBLE:
28954 case VOID_FTYPE_PVOID:
28955 nargs = 1;
28956 klass = load;
28957 memory = 0;
28958 break;
28959 case VOID_FTYPE_PV2SF_V4SF:
28960 case VOID_FTYPE_PV4DI_V4DI:
28961 case VOID_FTYPE_PV2DI_V2DI:
28962 case VOID_FTYPE_PCHAR_V32QI:
28963 case VOID_FTYPE_PCHAR_V16QI:
28964 case VOID_FTYPE_PFLOAT_V8SF:
28965 case VOID_FTYPE_PFLOAT_V4SF:
28966 case VOID_FTYPE_PDOUBLE_V4DF:
28967 case VOID_FTYPE_PDOUBLE_V2DF:
28968 case VOID_FTYPE_PLONGLONG_LONGLONG:
28969 case VOID_FTYPE_PULONGLONG_ULONGLONG:
28970 case VOID_FTYPE_PINT_INT:
28971 nargs = 1;
28972 klass = store;
28973 /* Reserve memory operand for target. */
28974 memory = ARRAY_SIZE (args);
28975 break;
28976 case V4SF_FTYPE_V4SF_PCV2SF:
28977 case V2DF_FTYPE_V2DF_PCDOUBLE:
28978 nargs = 2;
28979 klass = load;
28980 memory = 1;
28981 break;
28982 case V8SF_FTYPE_PCV8SF_V8SI:
28983 case V4DF_FTYPE_PCV4DF_V4DI:
28984 case V4SF_FTYPE_PCV4SF_V4SI:
28985 case V2DF_FTYPE_PCV2DF_V2DI:
28986 case V8SI_FTYPE_PCV8SI_V8SI:
28987 case V4DI_FTYPE_PCV4DI_V4DI:
28988 case V4SI_FTYPE_PCV4SI_V4SI:
28989 case V2DI_FTYPE_PCV2DI_V2DI:
28990 nargs = 2;
28991 klass = load;
28992 memory = 0;
28993 break;
28994 case VOID_FTYPE_PV8SF_V8SI_V8SF:
28995 case VOID_FTYPE_PV4DF_V4DI_V4DF:
28996 case VOID_FTYPE_PV4SF_V4SI_V4SF:
28997 case VOID_FTYPE_PV2DF_V2DI_V2DF:
28998 case VOID_FTYPE_PV8SI_V8SI_V8SI:
28999 case VOID_FTYPE_PV4DI_V4DI_V4DI:
29000 case VOID_FTYPE_PV4SI_V4SI_V4SI:
29001 case VOID_FTYPE_PV2DI_V2DI_V2DI:
29002 nargs = 2;
29003 klass = store;
29004 /* Reserve memory operand for target. */
29005 memory = ARRAY_SIZE (args);
29006 break;
29007 case VOID_FTYPE_UINT_UINT_UINT:
29008 case VOID_FTYPE_UINT64_UINT_UINT:
29009 case UCHAR_FTYPE_UINT_UINT_UINT:
29010 case UCHAR_FTYPE_UINT64_UINT_UINT:
29011 nargs = 3;
29012 klass = load;
29013 memory = ARRAY_SIZE (args);
29014 last_arg_constant = true;
29015 break;
29016 default:
29017 gcc_unreachable ();
29018 }
29019
29020 gcc_assert (nargs <= ARRAY_SIZE (args));
29021
29022 if (klass == store)
29023 {
29024 arg = CALL_EXPR_ARG (exp, 0);
29025 op = expand_normal (arg);
29026 gcc_assert (target == 0);
29027 if (memory)
29028 {
29029 if (GET_MODE (op) != Pmode)
29030 op = convert_to_mode (Pmode, op, 1);
29031 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
29032 }
29033 else
29034 target = force_reg (tmode, op);
29035 arg_adjust = 1;
29036 }
29037 else
29038 {
29039 arg_adjust = 0;
29040 if (optimize
29041 || target == 0
29042 || GET_MODE (target) != tmode
29043 || !insn_p->operand[0].predicate (target, tmode))
29044 target = gen_reg_rtx (tmode);
29045 }
29046
29047 for (i = 0; i < nargs; i++)
29048 {
29049 enum machine_mode mode = insn_p->operand[i + 1].mode;
29050 bool match;
29051
29052 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
29053 op = expand_normal (arg);
29054 match = insn_p->operand[i + 1].predicate (op, mode);
29055
29056 if (last_arg_constant && (i + 1) == nargs)
29057 {
29058 if (!match)
29059 {
29060 if (icode == CODE_FOR_lwp_lwpvalsi3
29061 || icode == CODE_FOR_lwp_lwpinssi3
29062 || icode == CODE_FOR_lwp_lwpvaldi3
29063 || icode == CODE_FOR_lwp_lwpinsdi3)
29064 error ("the last argument must be a 32-bit immediate");
29065 else
29066 error ("the last argument must be an 8-bit immediate");
29067 return const0_rtx;
29068 }
29069 }
29070 else
29071 {
29072 if (i == memory)
29073 {
29074 /* This must be the memory operand. */
29075 if (GET_MODE (op) != Pmode)
29076 op = convert_to_mode (Pmode, op, 1);
29077 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
29078 gcc_assert (GET_MODE (op) == mode
29079 || GET_MODE (op) == VOIDmode);
29080 }
29081 else
29082 {
29083 /* This must be a register operand. */
29084 if (VECTOR_MODE_P (mode))
29085 op = safe_vector_operand (op, mode);
29086
29087 gcc_assert (GET_MODE (op) == mode
29088 || GET_MODE (op) == VOIDmode);
29089 op = copy_to_mode_reg (mode, op);
29090 }
29091 }
29092
29093 args[i].op = op;
29094 args[i].mode = mode;
29095 }
29096
29097 switch (nargs)
29098 {
29099 case 0:
29100 pat = GEN_FCN (icode) (target);
29101 break;
29102 case 1:
29103 pat = GEN_FCN (icode) (target, args[0].op);
29104 break;
29105 case 2:
29106 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29107 break;
29108 case 3:
29109 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29110 break;
29111 default:
29112 gcc_unreachable ();
29113 }
29114
29115 if (! pat)
29116 return 0;
29117 emit_insn (pat);
29118 return klass == store ? 0 : target;
29119 }
29120
29121 /* Return the integer constant in ARG. Constrain it to be in the range
29122 of the subparts of VEC_TYPE; issue an error if not. */
29123
29124 static int
29125 get_element_number (tree vec_type, tree arg)
29126 {
29127 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
29128
29129 if (!host_integerp (arg, 1)
29130 || (elt = tree_low_cst (arg, 1), elt > max))
29131 {
29132 error ("selector must be an integer constant in the range 0..%wi", max);
29133 return 0;
29134 }
29135
29136 return elt;
29137 }
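/* get_element_number is used by the vec_ext/vec_set expanders below to
   validate the constant selector argument of the __builtin_ia32_vec_ext_*
   and __builtin_ia32_vec_set_* builtins.  */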
29138
29139 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29140 ix86_expand_vector_init. We DO have language-level syntax for this, in
29141 the form of (type){ init-list }. Except that since we can't place emms
29142 instructions from inside the compiler, we can't allow the use of MMX
29143 registers unless the user explicitly asks for it. So we do *not* define
29144 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
29145 we have builtins invoked by mmintrin.h that give us license to emit
29146 these sorts of instructions. */
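/* For example (assumed to follow the usual mmintrin.h wrappers, not quoted
   from them):

     extern __inline __m64
     _mm_set_pi32 (int __i1, int __i0)
     {
       return (__m64) __builtin_ia32_vec_init_v2si (__i1, __i0);
     }

   so the element values reach ix86_expand_vec_init_builtin below as
   ordinary call arguments.  */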
29147
29148 static rtx
29149 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29150 {
29151 enum machine_mode tmode = TYPE_MODE (type);
29152 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29153 int i, n_elt = GET_MODE_NUNITS (tmode);
29154 rtvec v = rtvec_alloc (n_elt);
29155
29156 gcc_assert (VECTOR_MODE_P (tmode));
29157 gcc_assert (call_expr_nargs (exp) == n_elt);
29158
29159 for (i = 0; i < n_elt; ++i)
29160 {
29161 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29162 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29163 }
29164
29165 if (!target || !register_operand (target, tmode))
29166 target = gen_reg_rtx (tmode);
29167
29168 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29169 return target;
29170 }
29171
29172 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29173 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
29174 had a language-level syntax for referencing vector elements. */
29175
29176 static rtx
29177 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29178 {
29179 enum machine_mode tmode, mode0;
29180 tree arg0, arg1;
29181 int elt;
29182 rtx op0;
29183
29184 arg0 = CALL_EXPR_ARG (exp, 0);
29185 arg1 = CALL_EXPR_ARG (exp, 1);
29186
29187 op0 = expand_normal (arg0);
29188 elt = get_element_number (TREE_TYPE (arg0), arg1);
29189
29190 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29191 mode0 = TYPE_MODE (TREE_TYPE (arg0));
29192 gcc_assert (VECTOR_MODE_P (mode0));
29193
29194 op0 = force_reg (mode0, op0);
29195
29196 if (optimize || !target || !register_operand (target, tmode))
29197 target = gen_reg_rtx (tmode);
29198
29199 ix86_expand_vector_extract (true, target, op0, elt);
29200
29201 return target;
29202 }
29203
29204 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29205 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
29206 a language-level syntax for referencing vector elements. */
29207
29208 static rtx
29209 ix86_expand_vec_set_builtin (tree exp)
29210 {
29211 enum machine_mode tmode, mode1;
29212 tree arg0, arg1, arg2;
29213 int elt;
29214 rtx op0, op1, target;
29215
29216 arg0 = CALL_EXPR_ARG (exp, 0);
29217 arg1 = CALL_EXPR_ARG (exp, 1);
29218 arg2 = CALL_EXPR_ARG (exp, 2);
29219
29220 tmode = TYPE_MODE (TREE_TYPE (arg0));
29221 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29222 gcc_assert (VECTOR_MODE_P (tmode));
29223
29224 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29225 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29226 elt = get_element_number (TREE_TYPE (arg0), arg2);
29227
29228 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29229 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29230
29231 op0 = force_reg (tmode, op0);
29232 op1 = force_reg (mode1, op1);
29233
29234 /* OP0 is the source of these builtin functions and shouldn't be
29235 modified. Create a copy, use it and return it as target. */
29236 target = gen_reg_rtx (tmode);
29237 emit_move_insn (target, op0);
29238 ix86_expand_vector_set (true, target, op1, elt);
29239
29240 return target;
29241 }
29242
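/* Illustrative mapping, assuming the usual emmintrin.h wrapper name
   (defined in the header, not here): a user call like

       __m128i r = _mm_insert_epi16 (v, 42, 3);

   is expected to expand to roughly

       (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi) v, 42, 3);

   i.e. arg0 is the source vector, arg1 the scalar value and arg2 the
   element index, handled by ix86_expand_vec_set_builtin above.  */
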
29243 /* Expand an expression EXP that calls a built-in function,
29244 with result going to TARGET if that's convenient
29245 (and in mode MODE if that's convenient).
29246 SUBTARGET may be used as the target for computing one of EXP's operands.
29247 IGNORE is nonzero if the value is to be ignored. */
29248
29249 static rtx
29250 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29251 enum machine_mode mode ATTRIBUTE_UNUSED,
29252 int ignore ATTRIBUTE_UNUSED)
29253 {
29254 const struct builtin_description *d;
29255 size_t i;
29256 enum insn_code icode;
29257 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29258 tree arg0, arg1, arg2, arg3, arg4;
29259 rtx op0, op1, op2, op3, op4, pat;
29260 enum machine_mode mode0, mode1, mode2, mode3, mode4;
29261 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29262
29263 /* Determine whether the builtin function is available under the current ISA.
29264 Originally the builtin was not created if it wasn't applicable to the
29265 current ISA based on the command line switches. With function specific
29266 options, we need to check in the context of the function making the call
29267 whether it is supported. */
29268 if (ix86_builtins_isa[fcode].isa
29269 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
29270 {
29271 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
29272 NULL, (enum fpmath_unit) 0, false);
29273
29274 if (!opts)
29275 error ("%qE needs unknown isa option", fndecl);
29276 else
29277 {
29278 gcc_assert (opts != NULL);
29279 error ("%qE needs isa option %s", fndecl, opts);
29280 free (opts);
29281 }
29282 return const0_rtx;
29283 }
29284
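/* For example, calling an AVX2 gather builtin from a function compiled
   without AVX2 enabled is rejected by the check above with a diagnostic
   along the lines of

       error: '__builtin_ia32_gathersiv4df' needs isa option -mavx2

   where the exact option string comes from ix86_target_string and may
   include additional flags.  */
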
29285 switch (fcode)
29286 {
29287 case IX86_BUILTIN_MASKMOVQ:
29288 case IX86_BUILTIN_MASKMOVDQU:
29289 icode = (fcode == IX86_BUILTIN_MASKMOVQ
29290 ? CODE_FOR_mmx_maskmovq
29291 : CODE_FOR_sse2_maskmovdqu);
29292 /* Note the arg order is different from the operand order. */
29293 arg1 = CALL_EXPR_ARG (exp, 0);
29294 arg2 = CALL_EXPR_ARG (exp, 1);
29295 arg0 = CALL_EXPR_ARG (exp, 2);
29296 op0 = expand_normal (arg0);
29297 op1 = expand_normal (arg1);
29298 op2 = expand_normal (arg2);
29299 mode0 = insn_data[icode].operand[0].mode;
29300 mode1 = insn_data[icode].operand[1].mode;
29301 mode2 = insn_data[icode].operand[2].mode;
29302
29303 if (GET_MODE (op0) != Pmode)
29304 op0 = convert_to_mode (Pmode, op0, 1);
29305 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
29306
29307 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29308 op0 = copy_to_mode_reg (mode0, op0);
29309 if (!insn_data[icode].operand[1].predicate (op1, mode1))
29310 op1 = copy_to_mode_reg (mode1, op1);
29311 if (!insn_data[icode].operand[2].predicate (op2, mode2))
29312 op2 = copy_to_mode_reg (mode2, op2);
29313 pat = GEN_FCN (icode) (op0, op1, op2);
29314 if (! pat)
29315 return 0;
29316 emit_insn (pat);
29317 return 0;
29318
29319 case IX86_BUILTIN_LDMXCSR:
29320 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29321 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29322 emit_move_insn (target, op0);
29323 emit_insn (gen_sse_ldmxcsr (target));
29324 return 0;
29325
29326 case IX86_BUILTIN_STMXCSR:
29327 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29328 emit_insn (gen_sse_stmxcsr (target));
29329 return copy_to_mode_reg (SImode, target);
29330
29331 case IX86_BUILTIN_CLFLUSH:
29332 arg0 = CALL_EXPR_ARG (exp, 0);
29333 op0 = expand_normal (arg0);
29334 icode = CODE_FOR_sse2_clflush;
29335 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29336 {
29337 if (GET_MODE (op0) != Pmode)
29338 op0 = convert_to_mode (Pmode, op0, 1);
29339 op0 = force_reg (Pmode, op0);
29340 }
29341
29342 emit_insn (gen_sse2_clflush (op0));
29343 return 0;
29344
29345 case IX86_BUILTIN_MONITOR:
29346 arg0 = CALL_EXPR_ARG (exp, 0);
29347 arg1 = CALL_EXPR_ARG (exp, 1);
29348 arg2 = CALL_EXPR_ARG (exp, 2);
29349 op0 = expand_normal (arg0);
29350 op1 = expand_normal (arg1);
29351 op2 = expand_normal (arg2);
29352 if (!REG_P (op0))
29353 {
29354 if (GET_MODE (op0) != Pmode)
29355 op0 = convert_to_mode (Pmode, op0, 1);
29356 op0 = force_reg (Pmode, op0);
29357 }
29358 if (!REG_P (op1))
29359 op1 = copy_to_mode_reg (SImode, op1);
29360 if (!REG_P (op2))
29361 op2 = copy_to_mode_reg (SImode, op2);
29362 emit_insn (ix86_gen_monitor (op0, op1, op2));
29363 return 0;
29364
29365 case IX86_BUILTIN_MWAIT:
29366 arg0 = CALL_EXPR_ARG (exp, 0);
29367 arg1 = CALL_EXPR_ARG (exp, 1);
29368 op0 = expand_normal (arg0);
29369 op1 = expand_normal (arg1);
29370 if (!REG_P (op0))
29371 op0 = copy_to_mode_reg (SImode, op0);
29372 if (!REG_P (op1))
29373 op1 = copy_to_mode_reg (SImode, op1);
29374 emit_insn (gen_sse3_mwait (op0, op1));
29375 return 0;
29376
29377 case IX86_BUILTIN_VEC_INIT_V2SI:
29378 case IX86_BUILTIN_VEC_INIT_V4HI:
29379 case IX86_BUILTIN_VEC_INIT_V8QI:
29380 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29381
29382 case IX86_BUILTIN_VEC_EXT_V2DF:
29383 case IX86_BUILTIN_VEC_EXT_V2DI:
29384 case IX86_BUILTIN_VEC_EXT_V4SF:
29385 case IX86_BUILTIN_VEC_EXT_V4SI:
29386 case IX86_BUILTIN_VEC_EXT_V8HI:
29387 case IX86_BUILTIN_VEC_EXT_V2SI:
29388 case IX86_BUILTIN_VEC_EXT_V4HI:
29389 case IX86_BUILTIN_VEC_EXT_V16QI:
29390 return ix86_expand_vec_ext_builtin (exp, target);
29391
29392 case IX86_BUILTIN_VEC_SET_V2DI:
29393 case IX86_BUILTIN_VEC_SET_V4SF:
29394 case IX86_BUILTIN_VEC_SET_V4SI:
29395 case IX86_BUILTIN_VEC_SET_V8HI:
29396 case IX86_BUILTIN_VEC_SET_V4HI:
29397 case IX86_BUILTIN_VEC_SET_V16QI:
29398 return ix86_expand_vec_set_builtin (exp);
29399
29400 case IX86_BUILTIN_INFQ:
29401 case IX86_BUILTIN_HUGE_VALQ:
29402 {
29403 REAL_VALUE_TYPE inf;
29404 rtx tmp;
29405
29406 real_inf (&inf);
29407 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29408
29409 tmp = validize_mem (force_const_mem (mode, tmp));
29410
29411 if (target == 0)
29412 target = gen_reg_rtx (mode);
29413
29414 emit_move_insn (target, tmp);
29415 return target;
29416 }
29417
29418 case IX86_BUILTIN_LLWPCB:
29419 arg0 = CALL_EXPR_ARG (exp, 0);
29420 op0 = expand_normal (arg0);
29421 icode = CODE_FOR_lwp_llwpcb;
29422 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29423 {
29424 if (GET_MODE (op0) != Pmode)
29425 op0 = convert_to_mode (Pmode, op0, 1);
29426 op0 = force_reg (Pmode, op0);
29427 }
29428 emit_insn (gen_lwp_llwpcb (op0));
29429 return 0;
29430
29431 case IX86_BUILTIN_SLWPCB:
29432 icode = CODE_FOR_lwp_slwpcb;
29433 if (!target
29434 || !insn_data[icode].operand[0].predicate (target, Pmode))
29435 target = gen_reg_rtx (Pmode);
29436 emit_insn (gen_lwp_slwpcb (target));
29437 return target;
29438
29439 case IX86_BUILTIN_BEXTRI32:
29440 case IX86_BUILTIN_BEXTRI64:
29441 arg0 = CALL_EXPR_ARG (exp, 0);
29442 arg1 = CALL_EXPR_ARG (exp, 1);
29443 op0 = expand_normal (arg0);
29444 op1 = expand_normal (arg1);
29445 icode = (fcode == IX86_BUILTIN_BEXTRI32
29446 ? CODE_FOR_tbm_bextri_si
29447 : CODE_FOR_tbm_bextri_di);
29448 if (!CONST_INT_P (op1))
29449 {
29450 error ("last argument must be an immediate");
29451 return const0_rtx;
29452 }
29453 else
29454 {
29455 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29456 unsigned char lsb_index = INTVAL (op1) & 0xFF;
29457 op1 = GEN_INT (length);
29458 op2 = GEN_INT (lsb_index);
29459 pat = GEN_FCN (icode) (target, op0, op1, op2);
29460 if (pat)
29461 emit_insn (pat);
29462 return target;
29463 }
29464
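/* As an illustration (the builtin name comes from tbmintrin.h and is an
   assumption here): __builtin_ia32_bextri_u32 (x, 0x0504) encodes a
   field length of 5 in bits 15:8 of the immediate and a start bit of 4
   in bits 7:0, so it extracts the 5-bit field of X beginning at bit 4,
   matching the length/lsb_index decoding above.  */
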
29465 case IX86_BUILTIN_RDRAND16_STEP:
29466 icode = CODE_FOR_rdrandhi_1;
29467 mode0 = HImode;
29468 goto rdrand_step;
29469
29470 case IX86_BUILTIN_RDRAND32_STEP:
29471 icode = CODE_FOR_rdrandsi_1;
29472 mode0 = SImode;
29473 goto rdrand_step;
29474
29475 case IX86_BUILTIN_RDRAND64_STEP:
29476 icode = CODE_FOR_rdranddi_1;
29477 mode0 = DImode;
29478
29479 rdrand_step:
29480 op0 = gen_reg_rtx (mode0);
29481 emit_insn (GEN_FCN (icode) (op0));
29482
29483 arg0 = CALL_EXPR_ARG (exp, 0);
29484 op1 = expand_normal (arg0);
29485 if (!address_operand (op1, VOIDmode))
29486 {
29487 op1 = convert_memory_address (Pmode, op1);
29488 op1 = copy_addr_to_reg (op1);
29489 }
29490 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
29491
29492 op1 = gen_reg_rtx (SImode);
29493 emit_move_insn (op1, CONST1_RTX (SImode));
29494
29495 /* Emit SImode conditional move. */
29496 if (mode0 == HImode)
29497 {
29498 op2 = gen_reg_rtx (SImode);
29499 emit_insn (gen_zero_extendhisi2 (op2, op0));
29500 }
29501 else if (mode0 == SImode)
29502 op2 = op0;
29503 else
29504 op2 = gen_rtx_SUBREG (SImode, op0, 0);
29505
29506 if (target == 0)
29507 target = gen_reg_rtx (SImode);
29508
29509 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
29510 const0_rtx);
29511 emit_insn (gen_rtx_SET (VOIDmode, target,
29512 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
29513 return target;
29514
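/* The rdrand*_step expansion above stores the hardware random value
   through the pointer argument and returns 1 when the carry flag
   signalled success, 0 otherwise.  Assuming the usual immintrin.h
   wrapper name (not defined in this file), a user call looks like

       unsigned int val;
       int ok = _rdrand32_step (&val);

   where OK is nonzero only if VAL received a valid random number.  */
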
29515 case IX86_BUILTIN_GATHERSIV2DF:
29516 icode = CODE_FOR_avx2_gathersiv2df;
29517 goto gather_gen;
29518 case IX86_BUILTIN_GATHERSIV4DF:
29519 icode = CODE_FOR_avx2_gathersiv4df;
29520 goto gather_gen;
29521 case IX86_BUILTIN_GATHERDIV2DF:
29522 icode = CODE_FOR_avx2_gatherdiv2df;
29523 goto gather_gen;
29524 case IX86_BUILTIN_GATHERDIV4DF:
29525 icode = CODE_FOR_avx2_gatherdiv4df;
29526 goto gather_gen;
29527 case IX86_BUILTIN_GATHERSIV4SF:
29528 icode = CODE_FOR_avx2_gathersiv4sf;
29529 goto gather_gen;
29530 case IX86_BUILTIN_GATHERSIV8SF:
29531 icode = CODE_FOR_avx2_gathersiv8sf;
29532 goto gather_gen;
29533 case IX86_BUILTIN_GATHERDIV4SF:
29534 icode = CODE_FOR_avx2_gatherdiv4sf;
29535 goto gather_gen;
29536 case IX86_BUILTIN_GATHERDIV8SF:
29537 icode = CODE_FOR_avx2_gatherdiv8sf;
29538 goto gather_gen;
29539 case IX86_BUILTIN_GATHERSIV2DI:
29540 icode = CODE_FOR_avx2_gathersiv2di;
29541 goto gather_gen;
29542 case IX86_BUILTIN_GATHERSIV4DI:
29543 icode = CODE_FOR_avx2_gathersiv4di;
29544 goto gather_gen;
29545 case IX86_BUILTIN_GATHERDIV2DI:
29546 icode = CODE_FOR_avx2_gatherdiv2di;
29547 goto gather_gen;
29548 case IX86_BUILTIN_GATHERDIV4DI:
29549 icode = CODE_FOR_avx2_gatherdiv4di;
29550 goto gather_gen;
29551 case IX86_BUILTIN_GATHERSIV4SI:
29552 icode = CODE_FOR_avx2_gathersiv4si;
29553 goto gather_gen;
29554 case IX86_BUILTIN_GATHERSIV8SI:
29555 icode = CODE_FOR_avx2_gathersiv8si;
29556 goto gather_gen;
29557 case IX86_BUILTIN_GATHERDIV4SI:
29558 icode = CODE_FOR_avx2_gatherdiv4si;
29559 goto gather_gen;
29560 case IX86_BUILTIN_GATHERDIV8SI:
29561 icode = CODE_FOR_avx2_gatherdiv8si;
29562 goto gather_gen;
29563 case IX86_BUILTIN_GATHERALTSIV4DF:
29564 icode = CODE_FOR_avx2_gathersiv4df;
29565 goto gather_gen;
29566 case IX86_BUILTIN_GATHERALTDIV8SF:
29567 icode = CODE_FOR_avx2_gatherdiv8sf;
29568 goto gather_gen;
29569 case IX86_BUILTIN_GATHERALTSIV4DI:
29570 icode = CODE_FOR_avx2_gathersiv4di;
29571 goto gather_gen;
29572 case IX86_BUILTIN_GATHERALTDIV8SI:
29573 icode = CODE_FOR_avx2_gatherdiv8si;
29574 goto gather_gen;
29575
29576 gather_gen:
29577 arg0 = CALL_EXPR_ARG (exp, 0);
29578 arg1 = CALL_EXPR_ARG (exp, 1);
29579 arg2 = CALL_EXPR_ARG (exp, 2);
29580 arg3 = CALL_EXPR_ARG (exp, 3);
29581 arg4 = CALL_EXPR_ARG (exp, 4);
29582 op0 = expand_normal (arg0);
29583 op1 = expand_normal (arg1);
29584 op2 = expand_normal (arg2);
29585 op3 = expand_normal (arg3);
29586 op4 = expand_normal (arg4);
29587 /* Note the arg order is different from the operand order. */
29588 mode0 = insn_data[icode].operand[1].mode;
29589 mode2 = insn_data[icode].operand[3].mode;
29590 mode3 = insn_data[icode].operand[4].mode;
29591 mode4 = insn_data[icode].operand[5].mode;
29592
29593 if (target == NULL_RTX
29594 || GET_MODE (target) != insn_data[icode].operand[0].mode)
29595 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
29596 else
29597 subtarget = target;
29598
29599 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
29600 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
29601 {
29602 rtx half = gen_reg_rtx (V4SImode);
29603 if (!nonimmediate_operand (op2, V8SImode))
29604 op2 = copy_to_mode_reg (V8SImode, op2);
29605 emit_insn (gen_vec_extract_lo_v8si (half, op2));
29606 op2 = half;
29607 }
29608 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
29609 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
29610 {
29611 rtx (*gen) (rtx, rtx);
29612 rtx half = gen_reg_rtx (mode0);
29613 if (mode0 == V4SFmode)
29614 gen = gen_vec_extract_lo_v8sf;
29615 else
29616 gen = gen_vec_extract_lo_v8si;
29617 if (!nonimmediate_operand (op0, GET_MODE (op0)))
29618 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
29619 emit_insn (gen (half, op0));
29620 op0 = half;
29621 if (!nonimmediate_operand (op3, GET_MODE (op3)))
29622 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
29623 emit_insn (gen (half, op3));
29624 op3 = half;
29625 }
29626
29627 /* Force memory operand only with base register here. But we
29628 don't want to do it on memory operand for other builtin
29629 functions. */
29630 if (GET_MODE (op1) != Pmode)
29631 op1 = convert_to_mode (Pmode, op1, 1);
29632 op1 = force_reg (Pmode, op1);
29633
29634 if (!insn_data[icode].operand[1].predicate (op0, mode0))
29635 op0 = copy_to_mode_reg (mode0, op0);
29636 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
29637 op1 = copy_to_mode_reg (Pmode, op1);
29638 if (!insn_data[icode].operand[3].predicate (op2, mode2))
29639 op2 = copy_to_mode_reg (mode2, op2);
29640 if (!insn_data[icode].operand[4].predicate (op3, mode3))
29641 op3 = copy_to_mode_reg (mode3, op3);
29642 if (!insn_data[icode].operand[5].predicate (op4, mode4))
29643 {
29644 error ("last argument must be scale 1, 2, 4, 8");
29645 return const0_rtx;
29646 }
29647
29648 /* Optimize. If mask is known to have all high bits set,
29649 replace op0 with pc_rtx to signal that the instruction
29650 overwrites the whole destination and doesn't use its
29651 previous contents. */
29652 if (optimize)
29653 {
29654 if (TREE_CODE (arg3) == VECTOR_CST)
29655 {
29656 unsigned int negative = 0;
29657 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
29658 {
29659 tree cst = VECTOR_CST_ELT (arg3, i);
29660 if (TREE_CODE (cst) == INTEGER_CST
29661 && tree_int_cst_sign_bit (cst))
29662 negative++;
29663 else if (TREE_CODE (cst) == REAL_CST
29664 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
29665 negative++;
29666 }
29667 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
29668 op0 = pc_rtx;
29669 }
29670 else if (TREE_CODE (arg3) == SSA_NAME)
29671 {
29672 /* Recognize also when mask is like:
29673 __v2df src = _mm_setzero_pd ();
29674 __v2df mask = _mm_cmpeq_pd (src, src);
29675 or
29676 __v8sf src = _mm256_setzero_ps ();
29677 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
29678 as that is a cheaper way to load all ones into
29679 a register than having to load a constant from
29680 memory. */
29681 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
29682 if (is_gimple_call (def_stmt))
29683 {
29684 tree fndecl = gimple_call_fndecl (def_stmt);
29685 if (fndecl
29686 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29687 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
29688 {
29689 case IX86_BUILTIN_CMPPD:
29690 case IX86_BUILTIN_CMPPS:
29691 case IX86_BUILTIN_CMPPD256:
29692 case IX86_BUILTIN_CMPPS256:
29693 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
29694 break;
29695 /* FALLTHRU */
29696 case IX86_BUILTIN_CMPEQPD:
29697 case IX86_BUILTIN_CMPEQPS:
29698 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
29699 && initializer_zerop (gimple_call_arg (def_stmt,
29700 1)))
29701 op0 = pc_rtx;
29702 break;
29703 default:
29704 break;
29705 }
29706 }
29707 }
29708 }
29709
29710 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
29711 if (! pat)
29712 return const0_rtx;
29713 emit_insn (pat);
29714
29715 if (fcode == IX86_BUILTIN_GATHERDIV8SF
29716 || fcode == IX86_BUILTIN_GATHERDIV8SI)
29717 {
29718 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
29719 ? V4SFmode : V4SImode;
29720 if (target == NULL_RTX)
29721 target = gen_reg_rtx (tmode);
29722 if (tmode == V4SFmode)
29723 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
29724 else
29725 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
29726 }
29727 else
29728 target = subtarget;
29729
29730 return target;
29731
29732 case IX86_BUILTIN_XABORT:
29733 icode = CODE_FOR_xabort;
29734 arg0 = CALL_EXPR_ARG (exp, 0);
29735 op0 = expand_normal (arg0);
29736 mode0 = insn_data[icode].operand[0].mode;
29737 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29738 {
29739 error ("the xabort's argument must be an 8-bit immediate");
29740 return const0_rtx;
29741 }
29742 emit_insn (gen_xabort (op0));
29743 return 0;
29744
29745 default:
29746 break;
29747 }
29748
29749 for (i = 0, d = bdesc_special_args;
29750 i < ARRAY_SIZE (bdesc_special_args);
29751 i++, d++)
29752 if (d->code == fcode)
29753 return ix86_expand_special_args_builtin (d, exp, target);
29754
29755 for (i = 0, d = bdesc_args;
29756 i < ARRAY_SIZE (bdesc_args);
29757 i++, d++)
29758 if (d->code == fcode)
29759 switch (fcode)
29760 {
29761 case IX86_BUILTIN_FABSQ:
29762 case IX86_BUILTIN_COPYSIGNQ:
29763 if (!TARGET_SSE2)
29764 /* Emit a normal call if SSE2 isn't available. */
29765 return expand_call (exp, target, ignore);
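	  /* FALLTHRU */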
29766 default:
29767 return ix86_expand_args_builtin (d, exp, target);
29768 }
29769
29770 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29771 if (d->code == fcode)
29772 return ix86_expand_sse_comi (d, exp, target);
29773
29774 for (i = 0, d = bdesc_pcmpestr;
29775 i < ARRAY_SIZE (bdesc_pcmpestr);
29776 i++, d++)
29777 if (d->code == fcode)
29778 return ix86_expand_sse_pcmpestr (d, exp, target);
29779
29780 for (i = 0, d = bdesc_pcmpistr;
29781 i < ARRAY_SIZE (bdesc_pcmpistr);
29782 i++, d++)
29783 if (d->code == fcode)
29784 return ix86_expand_sse_pcmpistr (d, exp, target);
29785
29786 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29787 if (d->code == fcode)
29788 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
29789 (enum ix86_builtin_func_type)
29790 d->flag, d->comparison);
29791
29792 gcc_unreachable ();
29793 }
29794
29795 /* Returns a function decl for a vectorized version of the builtin function
29796 FNDECL with result vector type TYPE_OUT and argument vector type TYPE_IN,
29797 or NULL_TREE if such a version is not available. */
29798
29799 static tree
29800 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
29801 tree type_in)
29802 {
29803 enum machine_mode in_mode, out_mode;
29804 int in_n, out_n;
29805 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
29806
29807 if (TREE_CODE (type_out) != VECTOR_TYPE
29808 || TREE_CODE (type_in) != VECTOR_TYPE
29809 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
29810 return NULL_TREE;
29811
29812 out_mode = TYPE_MODE (TREE_TYPE (type_out));
29813 out_n = TYPE_VECTOR_SUBPARTS (type_out);
29814 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29815 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29816
29817 switch (fn)
29818 {
29819 case BUILT_IN_SQRT:
29820 if (out_mode == DFmode && in_mode == DFmode)
29821 {
29822 if (out_n == 2 && in_n == 2)
29823 return ix86_builtins[IX86_BUILTIN_SQRTPD];
29824 else if (out_n == 4 && in_n == 4)
29825 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
29826 }
29827 break;
29828
29829 case BUILT_IN_SQRTF:
29830 if (out_mode == SFmode && in_mode == SFmode)
29831 {
29832 if (out_n == 4 && in_n == 4)
29833 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
29834 else if (out_n == 8 && in_n == 8)
29835 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
29836 }
29837 break;
29838
29839 case BUILT_IN_IFLOOR:
29840 case BUILT_IN_LFLOOR:
29841 case BUILT_IN_LLFLOOR:
29842 /* The round insn does not trap on denormals. */
29843 if (flag_trapping_math || !TARGET_ROUND)
29844 break;
29845
29846 if (out_mode == SImode && in_mode == DFmode)
29847 {
29848 if (out_n == 4 && in_n == 2)
29849 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
29850 else if (out_n == 8 && in_n == 4)
29851 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
29852 }
29853 break;
29854
29855 case BUILT_IN_IFLOORF:
29856 case BUILT_IN_LFLOORF:
29857 case BUILT_IN_LLFLOORF:
29858 /* The round insn does not trap on denormals. */
29859 if (flag_trapping_math || !TARGET_ROUND)
29860 break;
29861
29862 if (out_mode == SImode && in_mode == SFmode)
29863 {
29864 if (out_n == 4 && in_n == 4)
29865 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
29866 else if (out_n == 8 && in_n == 8)
29867 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
29868 }
29869 break;
29870
29871 case BUILT_IN_ICEIL:
29872 case BUILT_IN_LCEIL:
29873 case BUILT_IN_LLCEIL:
29874 /* The round insn does not trap on denormals. */
29875 if (flag_trapping_math || !TARGET_ROUND)
29876 break;
29877
29878 if (out_mode == SImode && in_mode == DFmode)
29879 {
29880 if (out_n == 4 && in_n == 2)
29881 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
29882 else if (out_n == 8 && in_n == 4)
29883 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
29884 }
29885 break;
29886
29887 case BUILT_IN_ICEILF:
29888 case BUILT_IN_LCEILF:
29889 case BUILT_IN_LLCEILF:
29890 /* The round insn does not trap on denormals. */
29891 if (flag_trapping_math || !TARGET_ROUND)
29892 break;
29893
29894 if (out_mode == SImode && in_mode == SFmode)
29895 {
29896 if (out_n == 4 && in_n == 4)
29897 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
29898 else if (out_n == 8 && in_n == 8)
29899 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
29900 }
29901 break;
29902
29903 case BUILT_IN_IRINT:
29904 case BUILT_IN_LRINT:
29905 case BUILT_IN_LLRINT:
29906 if (out_mode == SImode && in_mode == DFmode)
29907 {
29908 if (out_n == 4 && in_n == 2)
29909 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
29910 else if (out_n == 8 && in_n == 4)
29911 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
29912 }
29913 break;
29914
29915 case BUILT_IN_IRINTF:
29916 case BUILT_IN_LRINTF:
29917 case BUILT_IN_LLRINTF:
29918 if (out_mode == SImode && in_mode == SFmode)
29919 {
29920 if (out_n == 4 && in_n == 4)
29921 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
29922 else if (out_n == 8 && in_n == 8)
29923 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
29924 }
29925 break;
29926
29927 case BUILT_IN_IROUND:
29928 case BUILT_IN_LROUND:
29929 case BUILT_IN_LLROUND:
29930 /* The round insn does not trap on denormals. */
29931 if (flag_trapping_math || !TARGET_ROUND)
29932 break;
29933
29934 if (out_mode == SImode && in_mode == DFmode)
29935 {
29936 if (out_n == 4 && in_n == 2)
29937 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
29938 else if (out_n == 8 && in_n == 4)
29939 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
29940 }
29941 break;
29942
29943 case BUILT_IN_IROUNDF:
29944 case BUILT_IN_LROUNDF:
29945 case BUILT_IN_LLROUNDF:
29946 /* The round insn does not trap on denormals. */
29947 if (flag_trapping_math || !TARGET_ROUND)
29948 break;
29949
29950 if (out_mode == SImode && in_mode == SFmode)
29951 {
29952 if (out_n == 4 && in_n == 4)
29953 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
29954 else if (out_n == 8 && in_n == 8)
29955 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
29956 }
29957 break;
29958
29959 case BUILT_IN_COPYSIGN:
29960 if (out_mode == DFmode && in_mode == DFmode)
29961 {
29962 if (out_n == 2 && in_n == 2)
29963 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
29964 else if (out_n == 4 && in_n == 4)
29965 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
29966 }
29967 break;
29968
29969 case BUILT_IN_COPYSIGNF:
29970 if (out_mode == SFmode && in_mode == SFmode)
29971 {
29972 if (out_n == 4 && in_n == 4)
29973 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
29974 else if (out_n == 8 && in_n == 8)
29975 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
29976 }
29977 break;
29978
29979 case BUILT_IN_FLOOR:
29980 /* The round insn does not trap on denormals. */
29981 if (flag_trapping_math || !TARGET_ROUND)
29982 break;
29983
29984 if (out_mode == DFmode && in_mode == DFmode)
29985 {
29986 if (out_n == 2 && in_n == 2)
29987 return ix86_builtins[IX86_BUILTIN_FLOORPD];
29988 else if (out_n == 4 && in_n == 4)
29989 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
29990 }
29991 break;
29992
29993 case BUILT_IN_FLOORF:
29994 /* The round insn does not trap on denormals. */
29995 if (flag_trapping_math || !TARGET_ROUND)
29996 break;
29997
29998 if (out_mode == SFmode && in_mode == SFmode)
29999 {
30000 if (out_n == 4 && in_n == 4)
30001 return ix86_builtins[IX86_BUILTIN_FLOORPS];
30002 else if (out_n == 8 && in_n == 8)
30003 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
30004 }
30005 break;
30006
30007 case BUILT_IN_CEIL:
30008 /* The round insn does not trap on denormals. */
30009 if (flag_trapping_math || !TARGET_ROUND)
30010 break;
30011
30012 if (out_mode == DFmode && in_mode == DFmode)
30013 {
30014 if (out_n == 2 && in_n == 2)
30015 return ix86_builtins[IX86_BUILTIN_CEILPD];
30016 else if (out_n == 4 && in_n == 4)
30017 return ix86_builtins[IX86_BUILTIN_CEILPD256];
30018 }
30019 break;
30020
30021 case BUILT_IN_CEILF:
30022 /* The round insn does not trap on denormals. */
30023 if (flag_trapping_math || !TARGET_ROUND)
30024 break;
30025
30026 if (out_mode == SFmode && in_mode == SFmode)
30027 {
30028 if (out_n == 4 && in_n == 4)
30029 return ix86_builtins[IX86_BUILTIN_CEILPS];
30030 else if (out_n == 8 && in_n == 8)
30031 return ix86_builtins[IX86_BUILTIN_CEILPS256];
30032 }
30033 break;
30034
30035 case BUILT_IN_TRUNC:
30036 /* The round insn does not trap on denormals. */
30037 if (flag_trapping_math || !TARGET_ROUND)
30038 break;
30039
30040 if (out_mode == DFmode && in_mode == DFmode)
30041 {
30042 if (out_n == 2 && in_n == 2)
30043 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
30044 else if (out_n == 4 && in_n == 4)
30045 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
30046 }
30047 break;
30048
30049 case BUILT_IN_TRUNCF:
30050 /* The round insn does not trap on denormals. */
30051 if (flag_trapping_math || !TARGET_ROUND)
30052 break;
30053
30054 if (out_mode == SFmode && in_mode == SFmode)
30055 {
30056 if (out_n == 4 && in_n == 4)
30057 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
30058 else if (out_n == 8 && in_n == 8)
30059 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
30060 }
30061 break;
30062
30063 case BUILT_IN_RINT:
30064 /* The round insn does not trap on denormals. */
30065 if (flag_trapping_math || !TARGET_ROUND)
30066 break;
30067
30068 if (out_mode == DFmode && in_mode == DFmode)
30069 {
30070 if (out_n == 2 && in_n == 2)
30071 return ix86_builtins[IX86_BUILTIN_RINTPD];
30072 else if (out_n == 4 && in_n == 4)
30073 return ix86_builtins[IX86_BUILTIN_RINTPD256];
30074 }
30075 break;
30076
30077 case BUILT_IN_RINTF:
30078 /* The round insn does not trap on denormals. */
30079 if (flag_trapping_math || !TARGET_ROUND)
30080 break;
30081
30082 if (out_mode == SFmode && in_mode == SFmode)
30083 {
30084 if (out_n == 4 && in_n == 4)
30085 return ix86_builtins[IX86_BUILTIN_RINTPS];
30086 else if (out_n == 8 && in_n == 8)
30087 return ix86_builtins[IX86_BUILTIN_RINTPS256];
30088 }
30089 break;
30090
30091 case BUILT_IN_ROUND:
30092 /* The round insn does not trap on denormals. */
30093 if (flag_trapping_math || !TARGET_ROUND)
30094 break;
30095
30096 if (out_mode == DFmode && in_mode == DFmode)
30097 {
30098 if (out_n == 2 && in_n == 2)
30099 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
30100 else if (out_n == 4 && in_n == 4)
30101 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
30102 }
30103 break;
30104
30105 case BUILT_IN_ROUNDF:
30106 /* The round insn does not trap on denormals. */
30107 if (flag_trapping_math || !TARGET_ROUND)
30108 break;
30109
30110 if (out_mode == SFmode && in_mode == SFmode)
30111 {
30112 if (out_n == 4 && in_n == 4)
30113 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
30114 else if (out_n == 8 && in_n == 8)
30115 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
30116 }
30117 break;
30118
30119 case BUILT_IN_FMA:
30120 if (out_mode == DFmode && in_mode == DFmode)
30121 {
30122 if (out_n == 2 && in_n == 2)
30123 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
30124 if (out_n == 4 && in_n == 4)
30125 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
30126 }
30127 break;
30128
30129 case BUILT_IN_FMAF:
30130 if (out_mode == SFmode && in_mode == SFmode)
30131 {
30132 if (out_n == 4 && in_n == 4)
30133 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
30134 if (out_n == 8 && in_n == 8)
30135 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
30136 }
30137 break;
30138
30139 default:
30140 break;
30141 }
30142
30143 /* Dispatch to a handler for a vectorization library. */
30144 if (ix86_veclib_handler)
30145 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
30146 type_in);
30147
30148 return NULL_TREE;
30149 }
30150
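/* Worked example for ix86_builtin_vectorized_function above: when the
   vectorizer processes a loop such as

       for (i = 0; i < n; i++)
	 a[i] = __builtin_sqrt (b[i]);

   with double elements and V2DF vectors (SSE2), the BUILT_IN_SQRT case
   returns IX86_BUILTIN_SQRTPD, so each vector iteration becomes a
   single sqrtpd; with V4DF vectors (AVX) it returns
   IX86_BUILTIN_SQRTPD256 instead.  */
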
30151 /* Handler for an SVML-style interface to
30152 a library with vectorized intrinsics. */
30153
30154 static tree
30155 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
30156 {
30157 char name[20];
30158 tree fntype, new_fndecl, args;
30159 unsigned arity;
30160 const char *bname;
30161 enum machine_mode el_mode, in_mode;
30162 int n, in_n;
30163
30164 /* The SVML library is suitable for unsafe math only. */
30165 if (!flag_unsafe_math_optimizations)
30166 return NULL_TREE;
30167
30168 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30169 n = TYPE_VECTOR_SUBPARTS (type_out);
30170 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30171 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30172 if (el_mode != in_mode
30173 || n != in_n)
30174 return NULL_TREE;
30175
30176 switch (fn)
30177 {
30178 case BUILT_IN_EXP:
30179 case BUILT_IN_LOG:
30180 case BUILT_IN_LOG10:
30181 case BUILT_IN_POW:
30182 case BUILT_IN_TANH:
30183 case BUILT_IN_TAN:
30184 case BUILT_IN_ATAN:
30185 case BUILT_IN_ATAN2:
30186 case BUILT_IN_ATANH:
30187 case BUILT_IN_CBRT:
30188 case BUILT_IN_SINH:
30189 case BUILT_IN_SIN:
30190 case BUILT_IN_ASINH:
30191 case BUILT_IN_ASIN:
30192 case BUILT_IN_COSH:
30193 case BUILT_IN_COS:
30194 case BUILT_IN_ACOSH:
30195 case BUILT_IN_ACOS:
30196 if (el_mode != DFmode || n != 2)
30197 return NULL_TREE;
30198 break;
30199
30200 case BUILT_IN_EXPF:
30201 case BUILT_IN_LOGF:
30202 case BUILT_IN_LOG10F:
30203 case BUILT_IN_POWF:
30204 case BUILT_IN_TANHF:
30205 case BUILT_IN_TANF:
30206 case BUILT_IN_ATANF:
30207 case BUILT_IN_ATAN2F:
30208 case BUILT_IN_ATANHF:
30209 case BUILT_IN_CBRTF:
30210 case BUILT_IN_SINHF:
30211 case BUILT_IN_SINF:
30212 case BUILT_IN_ASINHF:
30213 case BUILT_IN_ASINF:
30214 case BUILT_IN_COSHF:
30215 case BUILT_IN_COSF:
30216 case BUILT_IN_ACOSHF:
30217 case BUILT_IN_ACOSF:
30218 if (el_mode != SFmode || n != 4)
30219 return NULL_TREE;
30220 break;
30221
30222 default:
30223 return NULL_TREE;
30224 }
30225
30226 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30227
30228 if (fn == BUILT_IN_LOGF)
30229 strcpy (name, "vmlsLn4");
30230 else if (fn == BUILT_IN_LOG)
30231 strcpy (name, "vmldLn2");
30232 else if (n == 4)
30233 {
30234 sprintf (name, "vmls%s", bname+10);
30235 name[strlen (name)-1] = '4';
30236 }
30237 else
30238 sprintf (name, "vmld%s2", bname+10);
30239
30240 /* Convert to uppercase. */
30241 name[4] &= ~0x20;
30242
30243 arity = 0;
30244 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30245 args;
30246 args = TREE_CHAIN (args))
30247 arity++;
30248
30249 if (arity == 1)
30250 fntype = build_function_type_list (type_out, type_in, NULL);
30251 else
30252 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30253
30254 /* Build a function declaration for the vectorized function. */
30255 new_fndecl = build_decl (BUILTINS_LOCATION,
30256 FUNCTION_DECL, get_identifier (name), fntype);
30257 TREE_PUBLIC (new_fndecl) = 1;
30258 DECL_EXTERNAL (new_fndecl) = 1;
30259 DECL_IS_NOVOPS (new_fndecl) = 1;
30260 TREE_READONLY (new_fndecl) = 1;
30261
30262 return new_fndecl;
30263 }
30264
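/* Examples of the SVML name mangling above: BUILT_IN_SINF over 4
   floats yields "vmlsSin4", BUILT_IN_SIN over 2 doubles yields
   "vmldSin2", and the special-cased BUILT_IN_LOGF/BUILT_IN_LOG yield
   "vmlsLn4"/"vmldLn2".  */
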
30265 /* Handler for an ACML-style interface to
30266 a library with vectorized intrinsics. */
30267
30268 static tree
30269 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
30270 {
30271 char name[20] = "__vr.._";
30272 tree fntype, new_fndecl, args;
30273 unsigned arity;
30274 const char *bname;
30275 enum machine_mode el_mode, in_mode;
30276 int n, in_n;
30277
30278 /* The ACML is 64-bit only and suitable for unsafe math only, as
30279 it does not correctly support parts of IEEE arithmetic (such as
30280 denormals) with the required precision. */
30281 if (!TARGET_64BIT
30282 || !flag_unsafe_math_optimizations)
30283 return NULL_TREE;
30284
30285 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30286 n = TYPE_VECTOR_SUBPARTS (type_out);
30287 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30288 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30289 if (el_mode != in_mode
30290 || n != in_n)
30291 return NULL_TREE;
30292
30293 switch (fn)
30294 {
30295 case BUILT_IN_SIN:
30296 case BUILT_IN_COS:
30297 case BUILT_IN_EXP:
30298 case BUILT_IN_LOG:
30299 case BUILT_IN_LOG2:
30300 case BUILT_IN_LOG10:
30301 name[4] = 'd';
30302 name[5] = '2';
30303 if (el_mode != DFmode
30304 || n != 2)
30305 return NULL_TREE;
30306 break;
30307
30308 case BUILT_IN_SINF:
30309 case BUILT_IN_COSF:
30310 case BUILT_IN_EXPF:
30311 case BUILT_IN_POWF:
30312 case BUILT_IN_LOGF:
30313 case BUILT_IN_LOG2F:
30314 case BUILT_IN_LOG10F:
30315 name[4] = 's';
30316 name[5] = '4';
30317 if (el_mode != SFmode
30318 || n != 4)
30319 return NULL_TREE;
30320 break;
30321
30322 default:
30323 return NULL_TREE;
30324 }
30325
30326 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30327 sprintf (name + 7, "%s", bname+10);
30328
30329 arity = 0;
30330 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30331 args;
30332 args = TREE_CHAIN (args))
30333 arity++;
30334
30335 if (arity == 1)
30336 fntype = build_function_type_list (type_out, type_in, NULL);
30337 else
30338 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30339
30340 /* Build a function declaration for the vectorized function. */
30341 new_fndecl = build_decl (BUILTINS_LOCATION,
30342 FUNCTION_DECL, get_identifier (name), fntype);
30343 TREE_PUBLIC (new_fndecl) = 1;
30344 DECL_EXTERNAL (new_fndecl) = 1;
30345 DECL_IS_NOVOPS (new_fndecl) = 1;
30346 TREE_READONLY (new_fndecl) = 1;
30347
30348 return new_fndecl;
30349 }
30350
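/* Examples of the ACML name mangling above: BUILT_IN_SIN over 2
   doubles yields "__vrd2_sin" and BUILT_IN_SINF over 4 floats yields
   "__vrs4_sinf".  */
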
30351 /* Returns a decl of a function that implements a gather load with
30352 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
30353 Return NULL_TREE if it is not available. */
30354
30355 static tree
30356 ix86_vectorize_builtin_gather (const_tree mem_vectype,
30357 const_tree index_type, int scale)
30358 {
30359 bool si;
30360 enum ix86_builtins code;
30361
30362 if (! TARGET_AVX2)
30363 return NULL_TREE;
30364
30365 if ((TREE_CODE (index_type) != INTEGER_TYPE
30366 && !POINTER_TYPE_P (index_type))
30367 || (TYPE_MODE (index_type) != SImode
30368 && TYPE_MODE (index_type) != DImode))
30369 return NULL_TREE;
30370
30371 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
30372 return NULL_TREE;
30373
30374 /* v*gather* insn sign extends index to pointer mode. */
30375 if (TYPE_PRECISION (index_type) < POINTER_SIZE
30376 && TYPE_UNSIGNED (index_type))
30377 return NULL_TREE;
30378
30379 if (scale <= 0
30380 || scale > 8
30381 || (scale & (scale - 1)) != 0)
30382 return NULL_TREE;
30383
30384 si = TYPE_MODE (index_type) == SImode;
30385 switch (TYPE_MODE (mem_vectype))
30386 {
30387 case V2DFmode:
30388 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
30389 break;
30390 case V4DFmode:
30391 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
30392 break;
30393 case V2DImode:
30394 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
30395 break;
30396 case V4DImode:
30397 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
30398 break;
30399 case V4SFmode:
30400 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
30401 break;
30402 case V8SFmode:
30403 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
30404 break;
30405 case V4SImode:
30406 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
30407 break;
30408 case V8SImode:
30409 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
30410 break;
30411 default:
30412 return NULL_TREE;
30413 }
30414
30415 return ix86_builtins[code];
30416 }
30417
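/* For example, a gather of V4DF elements with a 32-bit (SImode) index
   vector maps to IX86_BUILTIN_GATHERALTSIV4DF above, while a 64-bit
   (DImode) index maps to IX86_BUILTIN_GATHERDIV4DF; in either case
   SCALE must be 1, 2, 4 or 8.  */
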
30418 /* Returns a decl of a target-specific builtin that implements
30419 the reciprocal of the function FN, or NULL_TREE if not available. */
30420
30421 static tree
30422 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
30423 bool sqrt ATTRIBUTE_UNUSED)
30424 {
30425 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
30426 && flag_finite_math_only && !flag_trapping_math
30427 && flag_unsafe_math_optimizations))
30428 return NULL_TREE;
30429
30430 if (md_fn)
30431 /* Machine dependent builtins. */
30432 switch (fn)
30433 {
30434 /* Vectorized version of sqrt to rsqrt conversion. */
30435 case IX86_BUILTIN_SQRTPS_NR:
30436 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
30437
30438 case IX86_BUILTIN_SQRTPS_NR256:
30439 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
30440
30441 default:
30442 return NULL_TREE;
30443 }
30444 else
30445 /* Normal builtins. */
30446 switch (fn)
30447 {
30448 /* Sqrt to rsqrt conversion. */
30449 case BUILT_IN_SQRTF:
30450 return ix86_builtins[IX86_BUILTIN_RSQRTF];
30451
30452 default:
30453 return NULL_TREE;
30454 }
30455 }
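
/* For example, with SSE math and -ffast-math style flags (finite math,
   no trapping math, unsafe math optimizations), BUILT_IN_SQRTF maps to
   IX86_BUILTIN_RSQRTF above, allowing 1.0f / sqrtf (x) to be expanded
   through the reciprocal square root approximation instead of a full
   square root and divide.  */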
30456 \f
30457 /* Helper for avx_vpermilps256_operand et al. This is also used by
30458 the expansion functions to turn the parallel back into a mask.
30459 The return value is 0 for no match and the imm8+1 for a match. */
30460
30461 int
30462 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
30463 {
30464 unsigned i, nelt = GET_MODE_NUNITS (mode);
30465 unsigned mask = 0;
30466 unsigned char ipar[8];
30467
30468 if (XVECLEN (par, 0) != (int) nelt)
30469 return 0;
30470
30471 /* Validate that all of the elements are constants, and not totally
30472 out of range. Copy the data into an integral array to make the
30473 subsequent checks easier. */
30474 for (i = 0; i < nelt; ++i)
30475 {
30476 rtx er = XVECEXP (par, 0, i);
30477 unsigned HOST_WIDE_INT ei;
30478
30479 if (!CONST_INT_P (er))
30480 return 0;
30481 ei = INTVAL (er);
30482 if (ei >= nelt)
30483 return 0;
30484 ipar[i] = ei;
30485 }
30486
30487 switch (mode)
30488 {
30489 case V4DFmode:
30490 /* In the 256-bit DFmode case, we can only move elements within
30491 a 128-bit lane. */
30492 for (i = 0; i < 2; ++i)
30493 {
30494 if (ipar[i] >= 2)
30495 return 0;
30496 mask |= ipar[i] << i;
30497 }
30498 for (i = 2; i < 4; ++i)
30499 {
30500 if (ipar[i] < 2)
30501 return 0;
30502 mask |= (ipar[i] - 2) << i;
30503 }
30504 break;
30505
30506 case V8SFmode:
30507 /* In the 256-bit SFmode case, we have full freedom of movement
30508 within the low 128-bit lane, but the high 128-bit lane must
30509 mirror the exact same pattern. */
30510 for (i = 0; i < 4; ++i)
30511 if (ipar[i] + 4 != ipar[i + 4])
30512 return 0;
30513 nelt = 4;
30514 /* FALLTHRU */
30515
30516 case V2DFmode:
30517 case V4SFmode:
30518 /* In the 128-bit case, we have full freedom in the placement of
30519 the elements from the source operand. */
30520 for (i = 0; i < nelt; ++i)
30521 mask |= ipar[i] << (i * (nelt / 2));
30522 break;
30523
30524 default:
30525 gcc_unreachable ();
30526 }
30527
30528 /* Make sure success has a non-zero value by adding one. */
30529 return mask + 1;
30530 }
30531
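/* Worked example for avx_vpermilp_parallel above: for V8SFmode a
   parallel selecting (1 0 3 2 5 4 7 6) has the high lane mirroring the
   low lane, so with nelt reduced to 4 the mask is
   1<<0 | 0<<2 | 3<<4 | 2<<6 = 0xb1, and the function returns 0xb2
   (the imm8 plus one).  */
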
30532 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
30533 the expansion functions to turn the parallel back into a mask.
30534 The return value is 0 for no match and the imm8+1 for a match. */
30535
30536 int
30537 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
30538 {
30539 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
30540 unsigned mask = 0;
30541 unsigned char ipar[8];
30542
30543 if (XVECLEN (par, 0) != (int) nelt)
30544 return 0;
30545
30546 /* Validate that all of the elements are constants, and not totally
30547 out of range. Copy the data into an integral array to make the
30548 subsequent checks easier. */
30549 for (i = 0; i < nelt; ++i)
30550 {
30551 rtx er = XVECEXP (par, 0, i);
30552 unsigned HOST_WIDE_INT ei;
30553
30554 if (!CONST_INT_P (er))
30555 return 0;
30556 ei = INTVAL (er);
30557 if (ei >= 2 * nelt)
30558 return 0;
30559 ipar[i] = ei;
30560 }
30561
30562 /* Validate that each half of the permute selects a contiguous run. */
30563 for (i = 0; i < nelt2 - 1; ++i)
30564 if (ipar[i] + 1 != ipar[i + 1])
30565 return 0;
30566 for (i = nelt2; i < nelt - 1; ++i)
30567 if (ipar[i] + 1 != ipar[i + 1])
30568 return 0;
30569
30570 /* Reconstruct the mask. */
30571 for (i = 0; i < 2; ++i)
30572 {
30573 unsigned e = ipar[i * nelt2];
30574 if (e % nelt2)
30575 return 0;
30576 e /= nelt2;
30577 mask |= e << (i * 4);
30578 }
30579
30580 /* Make sure success has a non-zero value by adding one. */
30581 return mask + 1;
30582 }
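
/* Worked example for avx_vperm2f128_parallel above: for V8SFmode a
   parallel selecting (4 5 6 7 12 13 14 15) consists of two contiguous
   halves; ipar[0] = 4 gives e = 1 and ipar[4] = 12 gives e = 3, so the
   mask is 1 | 3<<4 = 0x31 and the function returns 0x32 (imm8 + 1).  */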
30583 \f
30584 /* Store OPERAND to memory after reload is completed. This means
30585 that we can't easily use assign_stack_local. */
30586 rtx
30587 ix86_force_to_memory (enum machine_mode mode, rtx operand)
30588 {
30589 rtx result;
30590
30591 gcc_assert (reload_completed);
30592 if (ix86_using_red_zone ())
30593 {
30594 result = gen_rtx_MEM (mode,
30595 gen_rtx_PLUS (Pmode,
30596 stack_pointer_rtx,
30597 GEN_INT (-RED_ZONE_SIZE)));
30598 emit_move_insn (result, operand);
30599 }
30600 else if (TARGET_64BIT)
30601 {
30602 switch (mode)
30603 {
30604 case HImode:
30605 case SImode:
30606 operand = gen_lowpart (DImode, operand);
30607 /* FALLTHRU */
30608 case DImode:
30609 emit_insn (
30610 gen_rtx_SET (VOIDmode,
30611 gen_rtx_MEM (DImode,
30612 gen_rtx_PRE_DEC (DImode,
30613 stack_pointer_rtx)),
30614 operand));
30615 break;
30616 default:
30617 gcc_unreachable ();
30618 }
30619 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30620 }
30621 else
30622 {
30623 switch (mode)
30624 {
30625 case DImode:
30626 {
30627 rtx operands[2];
30628 split_double_mode (mode, &operand, 1, operands, operands + 1);
30629 emit_insn (
30630 gen_rtx_SET (VOIDmode,
30631 gen_rtx_MEM (SImode,
30632 gen_rtx_PRE_DEC (Pmode,
30633 stack_pointer_rtx)),
30634 operands[1]));
30635 emit_insn (
30636 gen_rtx_SET (VOIDmode,
30637 gen_rtx_MEM (SImode,
30638 gen_rtx_PRE_DEC (Pmode,
30639 stack_pointer_rtx)),
30640 operands[0]));
30641 }
30642 break;
30643 case HImode:
30644 /* Store HImodes as SImodes. */
30645 operand = gen_lowpart (SImode, operand);
30646 /* FALLTHRU */
30647 case SImode:
30648 emit_insn (
30649 gen_rtx_SET (VOIDmode,
30650 gen_rtx_MEM (GET_MODE (operand),
30651 gen_rtx_PRE_DEC (SImode,
30652 stack_pointer_rtx)),
30653 operand));
30654 break;
30655 default:
30656 gcc_unreachable ();
30657 }
30658 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30659 }
30660 return result;
30661 }
30662
30663 /* Free the operand from memory. */
30664 void
30665 ix86_free_from_memory (enum machine_mode mode)
30666 {
30667 if (!ix86_using_red_zone ())
30668 {
30669 int size;
30670
30671 if (mode == DImode || TARGET_64BIT)
30672 size = 8;
30673 else
30674 size = 4;
30675 /* Use LEA to deallocate stack space. In peephole2 it will be converted
30676 to a pop or an add instruction if registers are available. */
30677 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
30678 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
30679 GEN_INT (size))));
30680 }
30681 }
30682
30683 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
30684
30685 Put float CONST_DOUBLE in the constant pool instead of fp regs.
30686 QImode must go into class Q_REGS.
30687 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
30688 movdf to do mem-to-mem moves through integer regs. */
30689
30690 static reg_class_t
30691 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
30692 {
30693 enum machine_mode mode = GET_MODE (x);
30694
30695 /* We're only allowed to return a subclass of CLASS. Many of the
30696 following checks fail for NO_REGS, so eliminate that early. */
30697 if (regclass == NO_REGS)
30698 return NO_REGS;
30699
30700 /* All classes can load zeros. */
30701 if (x == CONST0_RTX (mode))
30702 return regclass;
30703
30704 /* Force constants into memory if we are loading a (nonzero) constant into
30705 an MMX or SSE register. This is because there are no MMX/SSE instructions
30706 to load from a constant. */
30707 if (CONSTANT_P (x)
30708 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
30709 return NO_REGS;
30710
30711 /* Prefer SSE regs only, if we can use them for math. */
30712 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
30713 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
30714
30715 /* Floating-point constants need more complex checks. */
30716 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
30717 {
30718 /* General regs can load everything. */
30719 if (reg_class_subset_p (regclass, GENERAL_REGS))
30720 return regclass;
30721
30722 /* Floats can load 0 and 1 plus some others. Note that we eliminated
30723 zero above. We only want to wind up preferring 80387 registers if
30724 we plan on doing computation with them. */
30725 if (TARGET_80387
30726 && standard_80387_constant_p (x) > 0)
30727 {
30728 /* Limit class to non-sse. */
30729 if (regclass == FLOAT_SSE_REGS)
30730 return FLOAT_REGS;
30731 if (regclass == FP_TOP_SSE_REGS)
30732 return FP_TOP_REG;
30733 if (regclass == FP_SECOND_SSE_REGS)
30734 return FP_SECOND_REG;
30735 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
30736 return regclass;
30737 }
30738
30739 return NO_REGS;
30740 }
30741
30742 /* Generally when we see PLUS here, it's the function invariant
30743 (plus soft-fp const_int), which can only be computed into general
30744 regs. */
30745 if (GET_CODE (x) == PLUS)
30746 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
30747
30748 /* QImode constants are easy to load, but non-constant QImode data
30749 must go into Q_REGS. */
30750 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
30751 {
30752 if (reg_class_subset_p (regclass, Q_REGS))
30753 return regclass;
30754 if (reg_class_subset_p (Q_REGS, regclass))
30755 return Q_REGS;
30756 return NO_REGS;
30757 }
30758
30759 return regclass;
30760 }
30761
30762 /* Discourage putting floating-point values in SSE registers unless
30763 SSE math is being used, and likewise for the 387 registers. */
30764 static reg_class_t
30765 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
30766 {
30767 enum machine_mode mode = GET_MODE (x);
30768
30769 /* Restrict the output reload class to the register bank that we are doing
30770 math on. If we would like not to return a subset of CLASS, reject this
30771 alternative: if reload cannot do this, it will still use its choice. */
30772 mode = GET_MODE (x);
30773 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
30774 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
30775
30776 if (X87_FLOAT_MODE_P (mode))
30777 {
30778 if (regclass == FP_TOP_SSE_REGS)
30779 return FP_TOP_REG;
30780 else if (regclass == FP_SECOND_SSE_REGS)
30781 return FP_SECOND_REG;
30782 else
30783 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
30784 }
30785
30786 return regclass;
30787 }
30788
30789 static reg_class_t
30790 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
30791 enum machine_mode mode, secondary_reload_info *sri)
30792 {
30793 /* Double-word spills from general registers to non-offsettable memory
30794 references (zero-extended addresses) require special handling. */
30795 if (TARGET_64BIT
30796 && MEM_P (x)
30797 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
30798 && rclass == GENERAL_REGS
30799 && !offsettable_memref_p (x))
30800 {
30801 sri->icode = (in_p
30802 ? CODE_FOR_reload_noff_load
30803 : CODE_FOR_reload_noff_store);
30804 /* Add the cost of moving address to a temporary. */
30805 sri->extra_cost = 1;
30806
30807 return NO_REGS;
30808 }
30809
30810 /* QImode spills from non-QI registers require
30811 an intermediate register on 32-bit targets. */
30812 if (!TARGET_64BIT
30813 && !in_p && mode == QImode
30814 && (rclass == GENERAL_REGS
30815 || rclass == LEGACY_REGS
30816 || rclass == INDEX_REGS))
30817 {
30818 int regno;
30819
30820 if (REG_P (x))
30821 regno = REGNO (x);
30822 else
30823 regno = -1;
30824
30825 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
30826 regno = true_regnum (x);
30827
30828 /* Return Q_REGS if the operand is in memory. */
30829 if (regno == -1)
30830 return Q_REGS;
30831 }
30832
30833 /* This condition handles corner case where an expression involving
30834 pointers gets vectorized. We're trying to use the address of a
30835 stack slot as a vector initializer.
30836
30837 (set (reg:V2DI 74 [ vect_cst_.2 ])
30838 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
30839
30840 Eventually frame gets turned into sp+offset like this:
30841
30842 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30843 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30844 (const_int 392 [0x188]))))
30845
30846 That later gets turned into:
30847
30848 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30849 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30850 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
30851
30852 We'll have the following reload recorded:
30853
30854 Reload 0: reload_in (DI) =
30855 (plus:DI (reg/f:DI 7 sp)
30856 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
30857 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30858 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
30859 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
30860 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30861 reload_reg_rtx: (reg:V2DI 22 xmm1)
30862
30863 Which isn't going to work since SSE instructions can't handle scalar
30864 additions. Returning GENERAL_REGS forces the addition into integer
30865 register and reload can handle subsequent reloads without problems. */
30866
30867 if (in_p && GET_CODE (x) == PLUS
30868 && SSE_CLASS_P (rclass)
30869 && SCALAR_INT_MODE_P (mode))
30870 return GENERAL_REGS;
30871
30872 return NO_REGS;
30873 }
30874
30875 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
30876
30877 static bool
30878 ix86_class_likely_spilled_p (reg_class_t rclass)
30879 {
30880 switch (rclass)
30881 {
30882 case AREG:
30883 case DREG:
30884 case CREG:
30885 case BREG:
30886 case AD_REGS:
30887 case SIREG:
30888 case DIREG:
30889 case SSE_FIRST_REG:
30890 case FP_TOP_REG:
30891 case FP_SECOND_REG:
30892 return true;
30893
30894 default:
30895 break;
30896 }
30897
30898 return false;
30899 }
30900
30901 /* If we are copying between general and FP registers, we need a memory
30902 location. The same is true for SSE and MMX registers.
30903
30904 To optimize register_move_cost performance, allow inline variant.
30905
30906 The macro can't work reliably when one of the CLASSES is a class containing
30907 registers from multiple units (SSE, MMX, integer). We avoid this by never
30908 combining those units in a single alternative in the machine description.
30909 Ensure that this constraint holds to avoid unexpected surprises.
30910
30911 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
30912 enforce these sanity checks. */
30913
30914 static inline bool
30915 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30916 enum machine_mode mode, int strict)
30917 {
30918 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
30919 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
30920 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
30921 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
30922 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
30923 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
30924 {
30925 gcc_assert (!strict);
30926 return true;
30927 }
30928
30929 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
30930 return true;
30931
30932 /* ??? This is a lie. We do have moves between mmx/general, and between
30933 mmx/sse2. But by saying we need secondary memory we discourage the
30934 register allocator from using the mmx registers unless needed. */
30935 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
30936 return true;
30937
30938 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30939 {
30940 /* SSE1 doesn't have any direct moves from other classes. */
30941 if (!TARGET_SSE2)
30942 return true;
30943
30944 /* If the target says that inter-unit moves are more expensive
30945 than moving through memory, then don't generate them. */
30946 if (!TARGET_INTER_UNIT_MOVES)
30947 return true;
30948
30949 /* Between SSE and general, we have moves no larger than word size. */
30950 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
30951 return true;
30952 }
30953
30954 return false;
30955 }
30956
30957 bool
30958 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30959 enum machine_mode mode, int strict)
30960 {
30961 return inline_secondary_memory_needed (class1, class2, mode, strict);
30962 }
30963
30964 /* Implement the TARGET_CLASS_MAX_NREGS hook.
30965
30966 On the 80386, this is the size of MODE in words,
30967 except in the FP regs, where a single reg is always enough. */
30968
30969 static unsigned char
30970 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
30971 {
30972 if (MAYBE_INTEGER_CLASS_P (rclass))
30973 {
30974 if (mode == XFmode)
30975 return (TARGET_64BIT ? 2 : 3);
30976 else if (mode == XCmode)
30977 return (TARGET_64BIT ? 4 : 6);
30978 else
30979 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
30980 }
30981 else
30982 {
30983 if (COMPLEX_MODE_P (mode))
30984 return 2;
30985 else
30986 return 1;
30987 }
30988 }
30989
30990 /* Return true if the registers in CLASS cannot represent the change from
30991 modes FROM to TO. */
30992
30993 bool
30994 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
30995 enum reg_class regclass)
30996 {
30997 if (from == to)
30998 return false;
30999
31000 /* x87 registers can't do subreg at all, as all values are reformatted
31001 to extended precision. */
31002 if (MAYBE_FLOAT_CLASS_P (regclass))
31003 return true;
31004
31005 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
31006 {
31007 /* Vector registers do not support QI or HImode loads. If we don't
31008 disallow a change to these modes, reload will assume it's ok to
31009 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
31010 the vec_dupv4hi pattern. */
31011 if (GET_MODE_SIZE (from) < 4)
31012 return true;
31013
31014 /* Vector registers do not support subreg with nonzero offsets, which
31015 are otherwise valid for integer registers. Since we can't see
31016 whether we have a nonzero offset from here, prohibit all
31017 nonparadoxical subregs changing size. */
31018 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
31019 return true;
31020 }
31021
31022 return false;
31023 }
31024
31025 /* Return the cost of moving data of mode M between a
31026 register and memory. A value of 2 is the default; this cost is
31027 relative to those in `REGISTER_MOVE_COST'.
31028
31029 This function is used extensively by register_move_cost that is used to
31030 build tables at startup. Make it inline in this case.
31031 When IN is 2, return maximum of in and out move cost.
31032
31033 If moving between registers and memory is more expensive than
31034 between two registers, you should define this macro to express the
31035 relative cost.
31036
31037 Also model the increased cost of moving QImode registers in non
31038 Q_REGS classes.
31039 */
31040 static inline int
31041 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
31042 int in)
31043 {
31044 int cost;
31045 if (FLOAT_CLASS_P (regclass))
31046 {
31047 int index;
31048 switch (mode)
31049 {
31050 case SFmode:
31051 index = 0;
31052 break;
31053 case DFmode:
31054 index = 1;
31055 break;
31056 case XFmode:
31057 index = 2;
31058 break;
31059 default:
31060 return 100;
31061 }
31062 if (in == 2)
31063 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
31064 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
31065 }
31066 if (SSE_CLASS_P (regclass))
31067 {
31068 int index;
31069 switch (GET_MODE_SIZE (mode))
31070 {
31071 case 4:
31072 index = 0;
31073 break;
31074 case 8:
31075 index = 1;
31076 break;
31077 case 16:
31078 index = 2;
31079 break;
31080 default:
31081 return 100;
31082 }
31083 if (in == 2)
31084 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
31085 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
31086 }
31087 if (MMX_CLASS_P (regclass))
31088 {
31089 int index;
31090 switch (GET_MODE_SIZE (mode))
31091 {
31092 case 4:
31093 index = 0;
31094 break;
31095 case 8:
31096 index = 1;
31097 break;
31098 default:
31099 return 100;
31100 }
31101 if (in == 2)
31102 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
31103 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
31104 }
31105 switch (GET_MODE_SIZE (mode))
31106 {
31107 case 1:
31108 if (Q_CLASS_P (regclass) || TARGET_64BIT)
31109 {
31110 if (!in)
31111 return ix86_cost->int_store[0];
31112 if (TARGET_PARTIAL_REG_DEPENDENCY
31113 && optimize_function_for_speed_p (cfun))
31114 cost = ix86_cost->movzbl_load;
31115 else
31116 cost = ix86_cost->int_load[0];
31117 if (in == 2)
31118 return MAX (cost, ix86_cost->int_store[0]);
31119 return cost;
31120 }
31121 else
31122 {
31123 if (in == 2)
31124 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
31125 if (in)
31126 return ix86_cost->movzbl_load;
31127 else
31128 return ix86_cost->int_store[0] + 4;
31129 }
31130 break;
31131 case 2:
31132 if (in == 2)
31133 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
31134 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
31135 default:
31136 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
31137 if (mode == TFmode)
31138 mode = XFmode;
31139 if (in == 2)
31140 cost = MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]);
31141 else if (in)
31142 cost = ix86_cost->int_load[2];
31143 else
31144 cost = ix86_cost->int_store[2];
31145 return (cost * (((int) GET_MODE_SIZE (mode)
31146 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
31147 }
31148 }
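
/* Illustrative sketch, not part of the port: how a caller might use the
   IN convention of inline_memory_move_cost above.  The helper name and the
   cost numbers are made up; only the 0/1/2 encoding of IN mirrors the
   code above (0 = store, 1 = load, 2 = max of both).  */
static int
example_mem_move_cost (int in)
{
  const int load_cost = 4, store_cost = 6;	/* hypothetical table entries */

  if (in == 2)
    return load_cost > store_cost ? load_cost : store_cost;
  return in ? load_cost : store_cost;
}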
31149
31150 static int
31151 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
31152 bool in)
31153 {
31154 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
31155 }
31156
31157
31158 /* Return the cost of moving data from a register in class CLASS1 to
31159 one in class CLASS2.
31160
31161 It is not required that the cost always equal 2 when FROM is the same as TO;
31162 on some machines it is expensive to move between registers if they are not
31163 general registers. */
31164
31165 static int
31166 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
31167 reg_class_t class2_i)
31168 {
31169 enum reg_class class1 = (enum reg_class) class1_i;
31170 enum reg_class class2 = (enum reg_class) class2_i;
31171
31172 /* In case we require secondary memory, compute cost of the store followed
31173 by load. In order to avoid bad register allocation choices, we need
31174 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
31175
31176 if (inline_secondary_memory_needed (class1, class2, mode, 0))
31177 {
31178 int cost = 1;
31179
31180 cost += inline_memory_move_cost (mode, class1, 2);
31181 cost += inline_memory_move_cost (mode, class2, 2);
31182
31183 /* When copying from a general purpose register we may emit multiple
31184 stores followed by a single load, causing a memory size mismatch stall.
31185 Count this as an arbitrarily high cost of 20. */
31186 if (targetm.class_max_nregs (class1, mode)
31187 > targetm.class_max_nregs (class2, mode))
31188 cost += 20;
31189
31190 /* In the case of FP/MMX moves, the registers actually overlap, and we
31191 have to switch modes in order to treat them differently. */
31192 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31193 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31194 cost += 20;
31195
31196 return cost;
31197 }
31198
31199 /* Moves between SSE/MMX and integer unit are expensive. */
31200 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31201 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31202
31203 /* ??? By keeping the returned value relatively high, we limit the number
31204 of moves between integer and MMX/SSE registers for all targets.
31205 Additionally, a high value works around a problem with x86_modes_tieable_p(),
31206 where integer modes in MMX/SSE registers are not tieable
31207 because QImode and HImode moves to, from or between MMX/SSE
31208 registers are missing. */
31209 return MAX (8, ix86_cost->mmxsse_to_integer);
31210
31211 if (MAYBE_FLOAT_CLASS_P (class1))
31212 return ix86_cost->fp_move;
31213 if (MAYBE_SSE_CLASS_P (class1))
31214 return ix86_cost->sse_move;
31215 if (MAYBE_MMX_CLASS_P (class1))
31216 return ix86_cost->mmx_move;
31217 return 2;
31218 }
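
/* Illustrative sketch, not part of the port: the shape of the secondary
   memory cost computed above, with invented numbers and a hypothetical
   helper name.  A move through memory is priced as one setup unit plus a
   store and a load (each already the max of its in/out cost), plus the two
   +20 penalties when they apply.  */
static int
example_secondary_move_cost (int store_cost, int load_cost,
			     int size_mismatch, int fp_mmx_overlap)
{
  int cost = 1 + store_cost + load_cost;

  if (size_mismatch)
    cost += 20;		/* multiple stores feed a single wider load */
  if (fp_mmx_overlap)
    cost += 20;		/* FP and MMX registers overlap; mode switch needed */
  return cost;
}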
31219
31220 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31221 MODE. */
31222
31223 bool
31224 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31225 {
31226 /* The flags register, and only the flags register, can hold CCmode values. */
31227 if (CC_REGNO_P (regno))
31228 return GET_MODE_CLASS (mode) == MODE_CC;
31229 if (GET_MODE_CLASS (mode) == MODE_CC
31230 || GET_MODE_CLASS (mode) == MODE_RANDOM
31231 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31232 return false;
31233 if (FP_REGNO_P (regno))
31234 return VALID_FP_MODE_P (mode);
31235 if (SSE_REGNO_P (regno))
31236 {
31237 /* We implement the move patterns for all vector modes into and
31238 out of SSE registers, even when no operation instructions
31239 are available. OImode move is available only when AVX is
31240 enabled. */
31241 return ((TARGET_AVX && mode == OImode)
31242 || VALID_AVX256_REG_MODE (mode)
31243 || VALID_SSE_REG_MODE (mode)
31244 || VALID_SSE2_REG_MODE (mode)
31245 || VALID_MMX_REG_MODE (mode)
31246 || VALID_MMX_REG_MODE_3DNOW (mode));
31247 }
31248 if (MMX_REGNO_P (regno))
31249 {
31250 /* We implement the move patterns for 3DNOW modes even in MMX mode,
31251 so if the register is available at all, then we can move data of
31252 the given mode into or out of it. */
31253 return (VALID_MMX_REG_MODE (mode)
31254 || VALID_MMX_REG_MODE_3DNOW (mode));
31255 }
31256
31257 if (mode == QImode)
31258 {
31259 /* Take care with QImode values - they can live in non-QI regs,
31260 but then they cause partial register stalls. */
31261 if (regno <= BX_REG || TARGET_64BIT)
31262 return true;
31263 if (!TARGET_PARTIAL_REG_STALL)
31264 return true;
31265 return !can_create_pseudo_p ();
31266 }
31267 /* We handle both integers and floats in the general purpose registers. */
31268 else if (VALID_INT_MODE_P (mode))
31269 return true;
31270 else if (VALID_FP_MODE_P (mode))
31271 return true;
31272 else if (VALID_DFP_MODE_P (mode))
31273 return true;
31274 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
31275 on to use that value in smaller contexts, this can easily force a
31276 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
31277 supporting DImode, allow it. */
31278 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
31279 return true;
31280
31281 return false;
31282 }
31283
31284 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
31285 tieable integer mode. */
31286
31287 static bool
31288 ix86_tieable_integer_mode_p (enum machine_mode mode)
31289 {
31290 switch (mode)
31291 {
31292 case HImode:
31293 case SImode:
31294 return true;
31295
31296 case QImode:
31297 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
31298
31299 case DImode:
31300 return TARGET_64BIT;
31301
31302 default:
31303 return false;
31304 }
31305 }
31306
31307 /* Return true if MODE1 is accessible in a register that can hold MODE2
31308 without copying. That is, all register classes that can hold MODE2
31309 can also hold MODE1. */
31310
31311 bool
31312 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
31313 {
31314 if (mode1 == mode2)
31315 return true;
31316
31317 if (ix86_tieable_integer_mode_p (mode1)
31318 && ix86_tieable_integer_mode_p (mode2))
31319 return true;
31320
31321 /* MODE2 being XFmode implies fp stack or general regs, which means we
31322 can tie any smaller floating point modes to it. Note that we do not
31323 tie this with TFmode. */
31324 if (mode2 == XFmode)
31325 return mode1 == SFmode || mode1 == DFmode;
31326
31327 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
31328 that we can tie it with SFmode. */
31329 if (mode2 == DFmode)
31330 return mode1 == SFmode;
31331
31332 /* If MODE2 is only appropriate for an SSE register, then tie with
31333 any other mode acceptable to SSE registers. */
31334 if (GET_MODE_SIZE (mode2) == 16
31335 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31336 return (GET_MODE_SIZE (mode1) == 16
31337 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31338
31339 /* If MODE2 is appropriate for an MMX register, then tie
31340 with any other mode acceptable to MMX registers. */
31341 if (GET_MODE_SIZE (mode2) == 8
31342 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
31343 return (GET_MODE_SIZE (mode1) == 8
31344 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
31345
31346 return false;
31347 }
31348
31349 /* Compute a (partial) cost for rtx X. Return true if the complete
31350 cost has been computed, and false if subexpressions should be
31351 scanned. In either case, *TOTAL contains the cost result. */
31352
31353 static bool
31354 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
31355 bool speed)
31356 {
31357 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
31358 enum machine_mode mode = GET_MODE (x);
31359 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
31360
31361 switch (code)
31362 {
31363 case CONST_INT:
31364 case CONST:
31365 case LABEL_REF:
31366 case SYMBOL_REF:
31367 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
31368 *total = 3;
31369 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
31370 *total = 2;
31371 else if (flag_pic && SYMBOLIC_CONST (x)
31372 && (!TARGET_64BIT
31373 || (GET_CODE (x) != LABEL_REF
31374 && (GET_CODE (x) != SYMBOL_REF
31375 || !SYMBOL_REF_LOCAL_P (x)))))
31376 *total = 1;
31377 else
31378 *total = 0;
31379 return true;
31380
31381 case CONST_DOUBLE:
31382 if (mode == VOIDmode)
31383 *total = 0;
31384 else
31385 switch (standard_80387_constant_p (x))
31386 {
31387 case 1: /* 0.0 */
31388 *total = 1;
31389 break;
31390 default: /* Other constants */
31391 *total = 2;
31392 break;
31393 case 0:
31394 case -1:
31395 /* Start with (MEM (SYMBOL_REF)), since that's where
31396 it'll probably end up. Add a penalty for size. */
31397 *total = (COSTS_N_INSNS (1)
31398 + (flag_pic != 0 && !TARGET_64BIT)
31399 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
31400 break;
31401 }
31402 return true;
31403
31404 case ZERO_EXTEND:
31405 /* Zero extension is often completely free on x86_64, so make
31406 it as cheap as possible. */
31407 if (TARGET_64BIT && mode == DImode
31408 && GET_MODE (XEXP (x, 0)) == SImode)
31409 *total = 1;
31410 else if (TARGET_ZERO_EXTEND_WITH_AND)
31411 *total = cost->add;
31412 else
31413 *total = cost->movzx;
31414 return false;
31415
31416 case SIGN_EXTEND:
31417 *total = cost->movsx;
31418 return false;
31419
31420 case ASHIFT:
31421 if (CONST_INT_P (XEXP (x, 1))
31422 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
31423 {
31424 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31425 if (value == 1)
31426 {
31427 *total = cost->add;
31428 return false;
31429 }
31430 if ((value == 2 || value == 3)
31431 && cost->lea <= cost->shift_const)
31432 {
31433 *total = cost->lea;
31434 return false;
31435 }
31436 }
31437 /* FALLTHRU */
31438
31439 case ROTATE:
31440 case ASHIFTRT:
31441 case LSHIFTRT:
31442 case ROTATERT:
31443 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
31444 {
31445 if (CONST_INT_P (XEXP (x, 1)))
31446 {
31447 if (INTVAL (XEXP (x, 1)) > 32)
31448 *total = cost->shift_const + COSTS_N_INSNS (2);
31449 else
31450 *total = cost->shift_const * 2;
31451 }
31452 else
31453 {
31454 if (GET_CODE (XEXP (x, 1)) == AND)
31455 *total = cost->shift_var * 2;
31456 else
31457 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
31458 }
31459 }
31460 else
31461 {
31462 if (CONST_INT_P (XEXP (x, 1)))
31463 *total = cost->shift_const;
31464 else
31465 *total = cost->shift_var;
31466 }
31467 return false;
31468
31469 case FMA:
31470 {
31471 rtx sub;
31472
31473 gcc_assert (FLOAT_MODE_P (mode));
31474 gcc_assert (TARGET_FMA || TARGET_FMA4);
31475
31476 /* ??? SSE scalar/vector cost should be used here. */
31477 /* ??? Bald assumption that fma has the same cost as fmul. */
31478 *total = cost->fmul;
31479 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
31480
31481 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
31482 sub = XEXP (x, 0);
31483 if (GET_CODE (sub) == NEG)
31484 sub = XEXP (sub, 0);
31485 *total += rtx_cost (sub, FMA, 0, speed);
31486
31487 sub = XEXP (x, 2);
31488 if (GET_CODE (sub) == NEG)
31489 sub = XEXP (sub, 0);
31490 *total += rtx_cost (sub, FMA, 2, speed);
31491 return true;
31492 }
31493
31494 case MULT:
31495 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31496 {
31497 /* ??? SSE scalar cost should be used here. */
31498 *total = cost->fmul;
31499 return false;
31500 }
31501 else if (X87_FLOAT_MODE_P (mode))
31502 {
31503 *total = cost->fmul;
31504 return false;
31505 }
31506 else if (FLOAT_MODE_P (mode))
31507 {
31508 /* ??? SSE vector cost should be used here. */
31509 *total = cost->fmul;
31510 return false;
31511 }
31512 else
31513 {
31514 rtx op0 = XEXP (x, 0);
31515 rtx op1 = XEXP (x, 1);
31516 int nbits;
31517 if (CONST_INT_P (XEXP (x, 1)))
31518 {
31519 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31520 for (nbits = 0; value != 0; value &= value - 1)
31521 nbits++;
31522 }
31523 else
31524 /* This is arbitrary. */
31525 nbits = 7;
31526
31527 /* Compute costs correctly for widening multiplication. */
31528 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
31529 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
31530 == GET_MODE_SIZE (mode))
31531 {
31532 int is_mulwiden = 0;
31533 enum machine_mode inner_mode = GET_MODE (op0);
31534
31535 if (GET_CODE (op0) == GET_CODE (op1))
31536 is_mulwiden = 1, op1 = XEXP (op1, 0);
31537 else if (CONST_INT_P (op1))
31538 {
31539 if (GET_CODE (op0) == SIGN_EXTEND)
31540 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
31541 == INTVAL (op1);
31542 else
31543 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
31544 }
31545
31546 if (is_mulwiden)
31547 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
31548 }
31549
31550 *total = (cost->mult_init[MODE_INDEX (mode)]
31551 + nbits * cost->mult_bit
31552 + rtx_cost (op0, outer_code, opno, speed)
31553 + rtx_cost (op1, outer_code, opno, speed));
31554
31555 return true;
31556 }
31557
31558 case DIV:
31559 case UDIV:
31560 case MOD:
31561 case UMOD:
31562 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31563 /* ??? SSE cost should be used here. */
31564 *total = cost->fdiv;
31565 else if (X87_FLOAT_MODE_P (mode))
31566 *total = cost->fdiv;
31567 else if (FLOAT_MODE_P (mode))
31568 /* ??? SSE vector cost should be used here. */
31569 *total = cost->fdiv;
31570 else
31571 *total = cost->divide[MODE_INDEX (mode)];
31572 return false;
31573
31574 case PLUS:
31575 if (GET_MODE_CLASS (mode) == MODE_INT
31576 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
31577 {
31578 if (GET_CODE (XEXP (x, 0)) == PLUS
31579 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
31580 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
31581 && CONSTANT_P (XEXP (x, 1)))
31582 {
31583 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
31584 if (val == 2 || val == 4 || val == 8)
31585 {
31586 *total = cost->lea;
31587 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31588 outer_code, opno, speed);
31589 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
31590 outer_code, opno, speed);
31591 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31592 return true;
31593 }
31594 }
31595 else if (GET_CODE (XEXP (x, 0)) == MULT
31596 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
31597 {
31598 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
31599 if (val == 2 || val == 4 || val == 8)
31600 {
31601 *total = cost->lea;
31602 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31603 outer_code, opno, speed);
31604 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31605 return true;
31606 }
31607 }
31608 else if (GET_CODE (XEXP (x, 0)) == PLUS)
31609 {
31610 *total = cost->lea;
31611 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31612 outer_code, opno, speed);
31613 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31614 outer_code, opno, speed);
31615 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31616 return true;
31617 }
31618 }
31619 /* FALLTHRU */
31620
31621 case MINUS:
31622 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31623 {
31624 /* ??? SSE cost should be used here. */
31625 *total = cost->fadd;
31626 return false;
31627 }
31628 else if (X87_FLOAT_MODE_P (mode))
31629 {
31630 *total = cost->fadd;
31631 return false;
31632 }
31633 else if (FLOAT_MODE_P (mode))
31634 {
31635 /* ??? SSE vector cost should be used here. */
31636 *total = cost->fadd;
31637 return false;
31638 }
31639 /* FALLTHRU */
31640
31641 case AND:
31642 case IOR:
31643 case XOR:
31644 if (!TARGET_64BIT && mode == DImode)
31645 {
31646 *total = (cost->add * 2
31647 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
31648 << (GET_MODE (XEXP (x, 0)) != DImode))
31649 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
31650 << (GET_MODE (XEXP (x, 1)) != DImode)));
31651 return true;
31652 }
31653 /* FALLTHRU */
31654
31655 case NEG:
31656 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31657 {
31658 /* ??? SSE cost should be used here. */
31659 *total = cost->fchs;
31660 return false;
31661 }
31662 else if (X87_FLOAT_MODE_P (mode))
31663 {
31664 *total = cost->fchs;
31665 return false;
31666 }
31667 else if (FLOAT_MODE_P (mode))
31668 {
31669 /* ??? SSE vector cost should be used here. */
31670 *total = cost->fchs;
31671 return false;
31672 }
31673 /* FALLTHRU */
31674
31675 case NOT:
31676 if (!TARGET_64BIT && mode == DImode)
31677 *total = cost->add * 2;
31678 else
31679 *total = cost->add;
31680 return false;
31681
31682 case COMPARE:
31683 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
31684 && XEXP (XEXP (x, 0), 1) == const1_rtx
31685 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
31686 && XEXP (x, 1) == const0_rtx)
31687 {
31688 /* This kind of construct is implemented using test[bwl].
31689 Treat it as if we had an AND. */
31690 *total = (cost->add
31691 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
31692 + rtx_cost (const1_rtx, outer_code, opno, speed));
31693 return true;
31694 }
31695 return false;
31696
31697 case FLOAT_EXTEND:
31698 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
31699 *total = 0;
31700 return false;
31701
31702 case ABS:
31703 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31704 /* ??? SSE cost should be used here. */
31705 *total = cost->fabs;
31706 else if (X87_FLOAT_MODE_P (mode))
31707 *total = cost->fabs;
31708 else if (FLOAT_MODE_P (mode))
31709 /* ??? SSE vector cost should be used here. */
31710 *total = cost->fabs;
31711 return false;
31712
31713 case SQRT:
31714 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31715 /* ??? SSE cost should be used here. */
31716 *total = cost->fsqrt;
31717 else if (X87_FLOAT_MODE_P (mode))
31718 *total = cost->fsqrt;
31719 else if (FLOAT_MODE_P (mode))
31720 /* ??? SSE vector cost should be used here. */
31721 *total = cost->fsqrt;
31722 return false;
31723
31724 case UNSPEC:
31725 if (XINT (x, 1) == UNSPEC_TP)
31726 *total = 0;
31727 return false;
31728
31729 case VEC_SELECT:
31730 case VEC_CONCAT:
31731 case VEC_MERGE:
31732 case VEC_DUPLICATE:
31733 /* ??? Assume all of these vector manipulation patterns are
31734 recognizable, in which case they all pretty much have the
31735 same cost. */
31736 *total = COSTS_N_INSNS (1);
31737 return true;
31738
31739 default:
31740 return false;
31741 }
31742 }
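
/* Illustrative sketch, not part of the port: the bit-counting idiom used
   in the MULT case above, pulled out as a hypothetical helper.  Clearing
   the lowest set bit each iteration counts the set bits of the constant
   multiplier, which feeds the nbits * cost->mult_bit term.  */
static int
example_count_set_bits (unsigned long long value)
{
  int nbits = 0;

  for (; value != 0; value &= value - 1)	/* clear lowest set bit */
    nbits++;
  return nbits;
}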
31743
31744 #if TARGET_MACHO
31745
31746 static int current_machopic_label_num;
31747
31748 /* Given a symbol name and its associated stub, write out the
31749 definition of the stub. */
31750
31751 void
31752 machopic_output_stub (FILE *file, const char *symb, const char *stub)
31753 {
31754 unsigned int length;
31755 char *binder_name, *symbol_name, lazy_ptr_name[32];
31756 int label = ++current_machopic_label_num;
31757
31758 /* For 64-bit we shouldn't get here. */
31759 gcc_assert (!TARGET_64BIT);
31760
31761 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
31762 symb = targetm.strip_name_encoding (symb);
31763
31764 length = strlen (stub);
31765 binder_name = XALLOCAVEC (char, length + 32);
31766 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
31767
31768 length = strlen (symb);
31769 symbol_name = XALLOCAVEC (char, length + 32);
31770 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
31771
31772 sprintf (lazy_ptr_name, "L%d$lz", label);
31773
31774 if (MACHOPIC_ATT_STUB)
31775 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
31776 else if (MACHOPIC_PURE)
31777 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
31778 else
31779 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
31780
31781 fprintf (file, "%s:\n", stub);
31782 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31783
31784 if (MACHOPIC_ATT_STUB)
31785 {
31786 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
31787 }
31788 else if (MACHOPIC_PURE)
31789 {
31790 /* PIC stub. */
31791 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31792 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
31793 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
31794 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
31795 label, lazy_ptr_name, label);
31796 fprintf (file, "\tjmp\t*%%ecx\n");
31797 }
31798 else
31799 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
31800
31801 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
31802 it needs no stub-binding-helper. */
31803 if (MACHOPIC_ATT_STUB)
31804 return;
31805
31806 fprintf (file, "%s:\n", binder_name);
31807
31808 if (MACHOPIC_PURE)
31809 {
31810 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
31811 fprintf (file, "\tpushl\t%%ecx\n");
31812 }
31813 else
31814 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
31815
31816 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
31817
31818 /* N.B. Keep the correspondence of these
31819 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
31820 old-pic/new-pic/non-pic stubs; altering this will break
31821 compatibility with existing dylibs. */
31822 if (MACHOPIC_PURE)
31823 {
31824 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31825 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
31826 }
31827 else
31828 /* 16-byte -mdynamic-no-pic stub. */
31829 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
31830
31831 fprintf (file, "%s:\n", lazy_ptr_name);
31832 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31833 fprintf (file, ASM_LONG "%s\n", binder_name);
31834 }
31835 #endif /* TARGET_MACHO */
31836
31837 /* Order the registers for the register allocator. */
31838
31839 void
31840 x86_order_regs_for_local_alloc (void)
31841 {
31842 int pos = 0;
31843 int i;
31844
31845 /* First allocate the local general purpose registers. */
31846 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31847 if (GENERAL_REGNO_P (i) && call_used_regs[i])
31848 reg_alloc_order [pos++] = i;
31849
31850 /* Global general purpose registers. */
31851 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31852 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
31853 reg_alloc_order [pos++] = i;
31854
31855 /* x87 registers come first in case we are doing FP math
31856 using them. */
31857 if (!TARGET_SSE_MATH)
31858 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31859 reg_alloc_order [pos++] = i;
31860
31861 /* SSE registers. */
31862 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
31863 reg_alloc_order [pos++] = i;
31864 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
31865 reg_alloc_order [pos++] = i;
31866
31867 /* x87 registers. */
31868 if (TARGET_SSE_MATH)
31869 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31870 reg_alloc_order [pos++] = i;
31871
31872 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
31873 reg_alloc_order [pos++] = i;
31874
31875 /* Initialize the rest of the array, as we do not allocate some
31876 registers at all. */
31877 while (pos < FIRST_PSEUDO_REGISTER)
31878 reg_alloc_order [pos++] = 0;
31879 }
31880
31881 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
31882 in struct attribute_spec.handler. */
31883 static tree
31884 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
31885 tree args,
31886 int flags ATTRIBUTE_UNUSED,
31887 bool *no_add_attrs)
31888 {
31889 if (TREE_CODE (*node) != FUNCTION_TYPE
31890 && TREE_CODE (*node) != METHOD_TYPE
31891 && TREE_CODE (*node) != FIELD_DECL
31892 && TREE_CODE (*node) != TYPE_DECL)
31893 {
31894 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31895 name);
31896 *no_add_attrs = true;
31897 return NULL_TREE;
31898 }
31899 if (TARGET_64BIT)
31900 {
31901 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
31902 name);
31903 *no_add_attrs = true;
31904 return NULL_TREE;
31905 }
31906 if (is_attribute_p ("callee_pop_aggregate_return", name))
31907 {
31908 tree cst;
31909
31910 cst = TREE_VALUE (args);
31911 if (TREE_CODE (cst) != INTEGER_CST)
31912 {
31913 warning (OPT_Wattributes,
31914 "%qE attribute requires an integer constant argument",
31915 name);
31916 *no_add_attrs = true;
31917 }
31918 else if (compare_tree_int (cst, 0) != 0
31919 && compare_tree_int (cst, 1) != 0)
31920 {
31921 warning (OPT_Wattributes,
31922 "argument to %qE attribute is neither zero, nor one",
31923 name);
31924 *no_add_attrs = true;
31925 }
31926
31927 return NULL_TREE;
31928 }
31929
31930 return NULL_TREE;
31931 }
31932
31933 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
31934 struct attribute_spec.handler. */
31935 static tree
31936 ix86_handle_abi_attribute (tree *node, tree name,
31937 tree args ATTRIBUTE_UNUSED,
31938 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31939 {
31940 if (TREE_CODE (*node) != FUNCTION_TYPE
31941 && TREE_CODE (*node) != METHOD_TYPE
31942 && TREE_CODE (*node) != FIELD_DECL
31943 && TREE_CODE (*node) != TYPE_DECL)
31944 {
31945 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31946 name);
31947 *no_add_attrs = true;
31948 return NULL_TREE;
31949 }
31950
31951 /* Can combine regparm with all attributes but fastcall. */
31952 if (is_attribute_p ("ms_abi", name))
31953 {
31954 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
31955 {
31956 error ("ms_abi and sysv_abi attributes are not compatible");
31957 }
31958
31959 return NULL_TREE;
31960 }
31961 else if (is_attribute_p ("sysv_abi", name))
31962 {
31963 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
31964 {
31965 error ("ms_abi and sysv_abi attributes are not compatible");
31966 }
31967
31968 return NULL_TREE;
31969 }
31970
31971 return NULL_TREE;
31972 }
31973
31974 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
31975 struct attribute_spec.handler. */
31976 static tree
31977 ix86_handle_struct_attribute (tree *node, tree name,
31978 tree args ATTRIBUTE_UNUSED,
31979 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31980 {
31981 tree *type = NULL;
31982 if (DECL_P (*node))
31983 {
31984 if (TREE_CODE (*node) == TYPE_DECL)
31985 type = &TREE_TYPE (*node);
31986 }
31987 else
31988 type = node;
31989
31990 if (!(type && (TREE_CODE (*type) == RECORD_TYPE
31991 || TREE_CODE (*type) == UNION_TYPE)))
31992 {
31993 warning (OPT_Wattributes, "%qE attribute ignored",
31994 name);
31995 *no_add_attrs = true;
31996 }
31997
31998 else if ((is_attribute_p ("ms_struct", name)
31999 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
32000 || ((is_attribute_p ("gcc_struct", name)
32001 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
32002 {
32003 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
32004 name);
32005 *no_add_attrs = true;
32006 }
32007
32008 return NULL_TREE;
32009 }
32010
32011 static tree
32012 ix86_handle_fndecl_attribute (tree *node, tree name,
32013 tree args ATTRIBUTE_UNUSED,
32014 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32015 {
32016 if (TREE_CODE (*node) != FUNCTION_DECL)
32017 {
32018 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32019 name);
32020 *no_add_attrs = true;
32021 }
32022 return NULL_TREE;
32023 }
32024
32025 static bool
32026 ix86_ms_bitfield_layout_p (const_tree record_type)
32027 {
32028 return ((TARGET_MS_BITFIELD_LAYOUT
32029 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
32030 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
32031 }
32032
32033 /* Returns an expression indicating where the this parameter is
32034 located on entry to the FUNCTION. */
32035
32036 static rtx
32037 x86_this_parameter (tree function)
32038 {
32039 tree type = TREE_TYPE (function);
32040 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
32041 int nregs;
32042
32043 if (TARGET_64BIT)
32044 {
32045 const int *parm_regs;
32046
32047 if (ix86_function_type_abi (type) == MS_ABI)
32048 parm_regs = x86_64_ms_abi_int_parameter_registers;
32049 else
32050 parm_regs = x86_64_int_parameter_registers;
32051 return gen_rtx_REG (Pmode, parm_regs[aggr]);
32052 }
32053
32054 nregs = ix86_function_regparm (type, function);
32055
32056 if (nregs > 0 && !stdarg_p (type))
32057 {
32058 int regno;
32059 unsigned int ccvt = ix86_get_callcvt (type);
32060
32061 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
32062 regno = aggr ? DX_REG : CX_REG;
32063 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
32064 {
32065 regno = CX_REG;
32066 if (aggr)
32067 return gen_rtx_MEM (SImode,
32068 plus_constant (stack_pointer_rtx, 4));
32069 }
32070 else
32071 {
32072 regno = AX_REG;
32073 if (aggr)
32074 {
32075 regno = DX_REG;
32076 if (nregs == 1)
32077 return gen_rtx_MEM (SImode,
32078 plus_constant (stack_pointer_rtx, 4));
32079 }
32080 }
32081 return gen_rtx_REG (SImode, regno);
32082 }
32083
32084 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
32085 }
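
/* Illustrative sketch, not part of the port: where the `this' pointer
   lands for the 32-bit cases handled above, assuming a non-stdarg method.
   The helper and its string results are purely descriptive; the real code
   returns an rtx.  */
static const char *
example_this_location (int nregs, int fastcall, int thiscall, int aggr)
{
  if (nregs > 0)
    {
      if (fastcall)
	return aggr ? "%edx" : "%ecx";
      if (thiscall)
	return aggr ? "4(%esp)" : "%ecx";
      if (aggr)
	return nregs == 1 ? "4(%esp)" : "%edx";
      return "%eax";
    }
  return aggr ? "8(%esp)" : "4(%esp)";	/* everything stays on the stack */
}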
32086
32087 /* Determine whether x86_output_mi_thunk can succeed. */
32088
32089 static bool
32090 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
32091 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
32092 HOST_WIDE_INT vcall_offset, const_tree function)
32093 {
32094 /* 64-bit can handle anything. */
32095 if (TARGET_64BIT)
32096 return true;
32097
32098 /* For 32-bit, everything's fine if we have one free register. */
32099 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
32100 return true;
32101
32102 /* Need a free register for vcall_offset. */
32103 if (vcall_offset)
32104 return false;
32105
32106 /* Need a free register for GOT references. */
32107 if (flag_pic && !targetm.binds_local_p (function))
32108 return false;
32109
32110 /* Otherwise ok. */
32111 return true;
32112 }
32113
32114 /* Output the assembler code for a thunk function. THUNK_DECL is the
32115 declaration for the thunk function itself, FUNCTION is the decl for
32116 the target function. DELTA is an immediate constant offset to be
32117 added to THIS. If VCALL_OFFSET is nonzero, the word at
32118 *(*this + vcall_offset) should be added to THIS. */
32119
32120 static void
32121 x86_output_mi_thunk (FILE *file,
32122 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
32123 HOST_WIDE_INT vcall_offset, tree function)
32124 {
32125 rtx this_param = x86_this_parameter (function);
32126 rtx this_reg, tmp, fnaddr;
32127
32128 emit_note (NOTE_INSN_PROLOGUE_END);
32129
32130 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
32131 pull it in now and let DELTA benefit. */
32132 if (REG_P (this_param))
32133 this_reg = this_param;
32134 else if (vcall_offset)
32135 {
32136 /* Put the this parameter into %eax. */
32137 this_reg = gen_rtx_REG (Pmode, AX_REG);
32138 emit_move_insn (this_reg, this_param);
32139 }
32140 else
32141 this_reg = NULL_RTX;
32142
32143 /* Adjust the this parameter by a fixed constant. */
32144 if (delta)
32145 {
32146 rtx delta_rtx = GEN_INT (delta);
32147 rtx delta_dst = this_reg ? this_reg : this_param;
32148
32149 if (TARGET_64BIT)
32150 {
32151 if (!x86_64_general_operand (delta_rtx, Pmode))
32152 {
32153 tmp = gen_rtx_REG (Pmode, R10_REG);
32154 emit_move_insn (tmp, delta_rtx);
32155 delta_rtx = tmp;
32156 }
32157 }
32158
32159 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
32160 }
32161
32162 /* Adjust the this parameter by a value stored in the vtable. */
32163 if (vcall_offset)
32164 {
32165 rtx vcall_addr, vcall_mem, this_mem;
32166 unsigned int tmp_regno;
32167
32168 if (TARGET_64BIT)
32169 tmp_regno = R10_REG;
32170 else
32171 {
32172 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
32173 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
32174 tmp_regno = AX_REG;
32175 else
32176 tmp_regno = CX_REG;
32177 }
32178 tmp = gen_rtx_REG (Pmode, tmp_regno);
32179
32180 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
32181 if (Pmode != ptr_mode)
32182 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
32183 emit_move_insn (tmp, this_mem);
32184
32185 /* Adjust the this parameter. */
32186 vcall_addr = plus_constant (tmp, vcall_offset);
32187 if (TARGET_64BIT
32188 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
32189 {
32190 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
32191 emit_move_insn (tmp2, GEN_INT (vcall_offset));
32192 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
32193 }
32194
32195 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
32196 if (Pmode != ptr_mode)
32197 emit_insn (gen_addsi_1_zext (this_reg,
32198 gen_rtx_REG (ptr_mode,
32199 REGNO (this_reg)),
32200 vcall_mem));
32201 else
32202 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
32203 }
32204
32205 /* If necessary, drop THIS back to its stack slot. */
32206 if (this_reg && this_reg != this_param)
32207 emit_move_insn (this_param, this_reg);
32208
32209 fnaddr = XEXP (DECL_RTL (function), 0);
32210 if (TARGET_64BIT)
32211 {
32212 if (!flag_pic || targetm.binds_local_p (function)
32213 || cfun->machine->call_abi == MS_ABI)
32214 ;
32215 else
32216 {
32217 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
32218 tmp = gen_rtx_CONST (Pmode, tmp);
32219 fnaddr = gen_rtx_MEM (Pmode, tmp);
32220 }
32221 }
32222 else
32223 {
32224 if (!flag_pic || targetm.binds_local_p (function))
32225 ;
32226 #if TARGET_MACHO
32227 else if (TARGET_MACHO)
32228 {
32229 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
32230 fnaddr = XEXP (fnaddr, 0);
32231 }
32232 #endif /* TARGET_MACHO */
32233 else
32234 {
32235 tmp = gen_rtx_REG (Pmode, CX_REG);
32236 output_set_got (tmp, NULL_RTX);
32237
32238 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
32239 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
32240 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
32241 }
32242 }
32243
32244 /* Our sibling call patterns do not allow memories, because we have no
32245 predicate that can distinguish between frame and non-frame memory.
32246 For our purposes here, we can get away with (ab)using a jump pattern,
32247 because we're going to do no optimization. */
32248 if (MEM_P (fnaddr))
32249 emit_jump_insn (gen_indirect_jump (fnaddr));
32250 else
32251 {
32252 tmp = gen_rtx_MEM (QImode, fnaddr);
32253 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
32254 tmp = emit_call_insn (tmp);
32255 SIBLING_CALL_P (tmp) = 1;
32256 }
32257 emit_barrier ();
32258
32259 /* Emit just enough of rest_of_compilation to get the insns emitted.
32260 Note that use_thunk calls assemble_start_function et al. */
32261 tmp = get_insns ();
32262 insn_locators_alloc ();
32263 shorten_branches (tmp);
32264 final_start_function (tmp, file, 1);
32265 final (tmp, file, 1);
32266 final_end_function ();
32267 }
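
/* Illustrative sketch, not part of the port: what the emitted thunk
   computes, written as plain C with a hypothetical helper name.  DELTA is
   applied first; if VCALL_OFFSET is nonzero, the word found at
   vtable + VCALL_OFFSET (the vtable pointer being the first word of the
   already-adjusted object) is added as well before the tail call.  */
static void *
example_thunk_adjust (void *this_ptr, long delta, long vcall_offset)
{
  char *p = (char *) this_ptr + delta;

  if (vcall_offset)
    {
      char *vtable = *(char **) p;
      p += *(long *) (vtable + vcall_offset);
    }
  return p;
}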
32268
32269 static void
32270 x86_file_start (void)
32271 {
32272 default_file_start ();
32273 #if TARGET_MACHO
32274 darwin_file_start ();
32275 #endif
32276 if (X86_FILE_START_VERSION_DIRECTIVE)
32277 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
32278 if (X86_FILE_START_FLTUSED)
32279 fputs ("\t.global\t__fltused\n", asm_out_file);
32280 if (ix86_asm_dialect == ASM_INTEL)
32281 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
32282 }
32283
32284 int
32285 x86_field_alignment (tree field, int computed)
32286 {
32287 enum machine_mode mode;
32288 tree type = TREE_TYPE (field);
32289
32290 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
32291 return computed;
32292 mode = TYPE_MODE (strip_array_types (type));
32293 if (mode == DFmode || mode == DCmode
32294 || GET_MODE_CLASS (mode) == MODE_INT
32295 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
32296 return MIN (32, computed);
32297 return computed;
32298 }
32299
32300 /* Output assembler code to FILE to increment profiler label # LABELNO
32301 for profiling a function entry. */
32302 void
32303 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
32304 {
32305 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
32306 : MCOUNT_NAME);
32307
32308 if (TARGET_64BIT)
32309 {
32310 #ifndef NO_PROFILE_COUNTERS
32311 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
32312 #endif
32313
32314 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
32315 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
32316 else
32317 fprintf (file, "\tcall\t%s\n", mcount_name);
32318 }
32319 else if (flag_pic)
32320 {
32321 #ifndef NO_PROFILE_COUNTERS
32322 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
32323 LPREFIX, labelno);
32324 #endif
32325 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
32326 }
32327 else
32328 {
32329 #ifndef NO_PROFILE_COUNTERS
32330 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
32331 LPREFIX, labelno);
32332 #endif
32333 fprintf (file, "\tcall\t%s\n", mcount_name);
32334 }
32335 }
32336
32337 /* We don't have exact information about the insn sizes, but we may assume
32338 quite safely that we are informed about all 1 byte insns and memory
32339 address sizes. This is enough to eliminate unnecessary padding in
32340 99% of cases. */
32341
32342 static int
32343 min_insn_size (rtx insn)
32344 {
32345 int l = 0, len;
32346
32347 if (!INSN_P (insn) || !active_insn_p (insn))
32348 return 0;
32349
32350 /* Discard alignments we've emitted, and jump table data. */
32351 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
32352 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
32353 return 0;
32354 if (JUMP_TABLE_DATA_P (insn))
32355 return 0;
32356
32357 /* Important case - calls are always 5 bytes.
32358 It is common to have many calls in a row. */
32359 if (CALL_P (insn)
32360 && symbolic_reference_mentioned_p (PATTERN (insn))
32361 && !SIBLING_CALL_P (insn))
32362 return 5;
32363 len = get_attr_length (insn);
32364 if (len <= 1)
32365 return 1;
32366
32367 /* For normal instructions we rely on get_attr_length being exact,
32368 with a few exceptions. */
32369 if (!JUMP_P (insn))
32370 {
32371 enum attr_type type = get_attr_type (insn);
32372
32373 switch (type)
32374 {
32375 case TYPE_MULTI:
32376 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
32377 || asm_noperands (PATTERN (insn)) >= 0)
32378 return 0;
32379 break;
32380 case TYPE_OTHER:
32381 case TYPE_FCMP:
32382 break;
32383 default:
32384 /* Otherwise trust get_attr_length. */
32385 return len;
32386 }
32387
32388 l = get_attr_length_address (insn);
32389 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
32390 l = 4;
32391 }
32392 if (l)
32393 return 1+l;
32394 else
32395 return 2;
32396 }
32397
32398 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32399
32400 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
32401 window. */
32402
32403 static void
32404 ix86_avoid_jump_mispredicts (void)
32405 {
32406 rtx insn, start = get_insns ();
32407 int nbytes = 0, njumps = 0;
32408 int isjump = 0;
32409
32410 /* Look for all minimal intervals of instructions containing 4 jumps.
32411 The intervals are bounded by START and INSN. NBYTES is the total
32412 size of the instructions in the interval, including INSN but not
32413 including START. When NBYTES is smaller than 16 bytes, it is possible
32414 that START and INSN end up in the same 16-byte page.
32415
32416 The smallest offset in the page at which INSN can start is the case
32417 where START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
32418 We add a p2align to the 16-byte window with max skip 15 - NBYTES + sizeof (INSN).
32419 */
32420 for (insn = start; insn; insn = NEXT_INSN (insn))
32421 {
32422 int min_size;
32423
32424 if (LABEL_P (insn))
32425 {
32426 int align = label_to_alignment (insn);
32427 int max_skip = label_to_max_skip (insn);
32428
32429 if (max_skip > 15)
32430 max_skip = 15;
32431 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
32432 already in the current 16 byte page, because otherwise
32433 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
32434 bytes to reach 16 byte boundary. */
32435 if (align <= 0
32436 || (align <= 3 && max_skip != (1 << align) - 1))
32437 max_skip = 0;
32438 if (dump_file)
32439 fprintf (dump_file, "Label %i with max_skip %i\n",
32440 INSN_UID (insn), max_skip);
32441 if (max_skip)
32442 {
32443 while (nbytes + max_skip >= 16)
32444 {
32445 start = NEXT_INSN (start);
32446 if ((JUMP_P (start)
32447 && GET_CODE (PATTERN (start)) != ADDR_VEC
32448 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32449 || CALL_P (start))
32450 njumps--, isjump = 1;
32451 else
32452 isjump = 0;
32453 nbytes -= min_insn_size (start);
32454 }
32455 }
32456 continue;
32457 }
32458
32459 min_size = min_insn_size (insn);
32460 nbytes += min_size;
32461 if (dump_file)
32462 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
32463 INSN_UID (insn), min_size);
32464 if ((JUMP_P (insn)
32465 && GET_CODE (PATTERN (insn)) != ADDR_VEC
32466 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
32467 || CALL_P (insn))
32468 njumps++;
32469 else
32470 continue;
32471
32472 while (njumps > 3)
32473 {
32474 start = NEXT_INSN (start);
32475 if ((JUMP_P (start)
32476 && GET_CODE (PATTERN (start)) != ADDR_VEC
32477 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32478 || CALL_P (start))
32479 njumps--, isjump = 1;
32480 else
32481 isjump = 0;
32482 nbytes -= min_insn_size (start);
32483 }
32484 gcc_assert (njumps >= 0);
32485 if (dump_file)
32486 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
32487 INSN_UID (start), INSN_UID (insn), nbytes);
32488
32489 if (njumps == 3 && isjump && nbytes < 16)
32490 {
32491 int padsize = 15 - nbytes + min_insn_size (insn);
32492
32493 if (dump_file)
32494 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
32495 INSN_UID (insn), padsize);
32496 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
32497 }
32498 }
32499 }
32500 #endif
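
/* Illustrative sketch, not part of the port: the padding amount computed
   above for a window that already holds three jumps, as a hypothetical
   helper.  With START ending at offset 0, INSN starts at
   NBYTES - sizeof (INSN); padding by 15 - NBYTES + sizeof (INSN) pushes
   INSN past the 16-byte boundary so at most three of the four jumps share
   a 16-byte page.  */
static int
example_jump_padding (int nbytes, int insn_size)
{
  return 15 - nbytes + insn_size;	/* e.g. nbytes == 12, insn_size == 2 -> pad 5 */
}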
32501
32502 /* The AMD Athlon works faster
32503 when RET is not the destination of a conditional jump or directly preceded
32504 by another jump instruction. We avoid the penalty by inserting a NOP just
32505 before the RET instruction in such cases. */
32506 static void
32507 ix86_pad_returns (void)
32508 {
32509 edge e;
32510 edge_iterator ei;
32511
32512 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32513 {
32514 basic_block bb = e->src;
32515 rtx ret = BB_END (bb);
32516 rtx prev;
32517 bool replace = false;
32518
32519 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
32520 || optimize_bb_for_size_p (bb))
32521 continue;
32522 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
32523 if (active_insn_p (prev) || LABEL_P (prev))
32524 break;
32525 if (prev && LABEL_P (prev))
32526 {
32527 edge e;
32528 edge_iterator ei;
32529
32530 FOR_EACH_EDGE (e, ei, bb->preds)
32531 if (EDGE_FREQUENCY (e) && e->src->index >= 0
32532 && !(e->flags & EDGE_FALLTHRU))
32533 replace = true;
32534 }
32535 if (!replace)
32536 {
32537 prev = prev_active_insn (ret);
32538 if (prev
32539 && ((JUMP_P (prev) && any_condjump_p (prev))
32540 || CALL_P (prev)))
32541 replace = true;
32542 /* Empty functions get a branch mispredict even when
32543 the jump destination is not visible to us. */
32544 if (!prev && !optimize_function_for_size_p (cfun))
32545 replace = true;
32546 }
32547 if (replace)
32548 {
32549 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
32550 delete_insn (ret);
32551 }
32552 }
32553 }
32554
32555 /* Count the minimum number of instructions in BB. Return 4 if the
32556 number of instructions >= 4. */
32557
32558 static int
32559 ix86_count_insn_bb (basic_block bb)
32560 {
32561 rtx insn;
32562 int insn_count = 0;
32563
32564 /* Count number of instructions in this block. Return 4 if the number
32565 of instructions >= 4. */
32566 FOR_BB_INSNS (bb, insn)
32567 {
32568 /* This only happens in exit blocks. */
32569 if (JUMP_P (insn)
32570 && ANY_RETURN_P (PATTERN (insn)))
32571 break;
32572
32573 if (NONDEBUG_INSN_P (insn)
32574 && GET_CODE (PATTERN (insn)) != USE
32575 && GET_CODE (PATTERN (insn)) != CLOBBER)
32576 {
32577 insn_count++;
32578 if (insn_count >= 4)
32579 return insn_count;
32580 }
32581 }
32582
32583 return insn_count;
32584 }
32585
32586
32587 /* Count the minimum number of instructions in code path in BB.
32588 Return 4 if the number of instructions >= 4. */
32589
32590 static int
32591 ix86_count_insn (basic_block bb)
32592 {
32593 edge e;
32594 edge_iterator ei;
32595 int min_prev_count;
32596
32597 /* Only bother counting instructions along paths with no
32598 more than 2 basic blocks between entry and exit. Given
32599 that BB has an edge to exit, determine if a predecessor
32600 of BB has an edge from entry. If so, compute the number
32601 of instructions in the predecessor block. If there
32602 happen to be multiple such blocks, compute the minimum. */
32603 min_prev_count = 4;
32604 FOR_EACH_EDGE (e, ei, bb->preds)
32605 {
32606 edge prev_e;
32607 edge_iterator prev_ei;
32608
32609 if (e->src == ENTRY_BLOCK_PTR)
32610 {
32611 min_prev_count = 0;
32612 break;
32613 }
32614 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
32615 {
32616 if (prev_e->src == ENTRY_BLOCK_PTR)
32617 {
32618 int count = ix86_count_insn_bb (e->src);
32619 if (count < min_prev_count)
32620 min_prev_count = count;
32621 break;
32622 }
32623 }
32624 }
32625
32626 if (min_prev_count < 4)
32627 min_prev_count += ix86_count_insn_bb (bb);
32628
32629 return min_prev_count;
32630 }
32631
32632 /* Pad short functions to 4 instructions. */
32633
32634 static void
32635 ix86_pad_short_function (void)
32636 {
32637 edge e;
32638 edge_iterator ei;
32639
32640 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32641 {
32642 rtx ret = BB_END (e->src);
32643 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
32644 {
32645 int insn_count = ix86_count_insn (e->src);
32646
32647 /* Pad short function. */
32648 if (insn_count < 4)
32649 {
32650 rtx insn = ret;
32651
32652 /* Find epilogue. */
32653 while (insn
32654 && (!NOTE_P (insn)
32655 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
32656 insn = PREV_INSN (insn);
32657
32658 if (!insn)
32659 insn = ret;
32660
32661 /* Two NOPs count as one instruction. */
32662 insn_count = 2 * (4 - insn_count);
32663 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
32664 }
32665 }
32666 }
32667 }
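
/* Illustrative sketch, not part of the port: the padding count used above,
   as a hypothetical helper.  Two NOPs are counted as one instruction, so a
   body with INSN_COUNT real instructions is padded with
   2 * (4 - INSN_COUNT) NOPs to reach the four-instruction minimum.  */
static int
example_nop_pad_count (int insn_count)
{
  return insn_count < 4 ? 2 * (4 - insn_count) : 0;	/* e.g. 1 insn -> 6 NOPs */
}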
32668
32669 /* Implement machine specific optimizations. We implement padding of returns
32670 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
32671 static void
32672 ix86_reorg (void)
32673 {
32674 /* We are freeing block_for_insn in the toplev to keep compatibility
32675 with old MDEP_REORGS that are not CFG based. Recompute it now. */
32676 compute_bb_for_insn ();
32677
32678 /* Run the vzeroupper optimization if needed. */
32679 if (TARGET_VZEROUPPER)
32680 move_or_delete_vzeroupper ();
32681
32682 if (optimize && optimize_function_for_speed_p (cfun))
32683 {
32684 if (TARGET_PAD_SHORT_FUNCTION)
32685 ix86_pad_short_function ();
32686 else if (TARGET_PAD_RETURNS)
32687 ix86_pad_returns ();
32688 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32689 if (TARGET_FOUR_JUMP_LIMIT)
32690 ix86_avoid_jump_mispredicts ();
32691 #endif
32692 }
32693 }
32694
32695 /* Return nonzero when a QImode register that must be represented via a REX
32696 prefix is used. */
32697 bool
32698 x86_extended_QIreg_mentioned_p (rtx insn)
32699 {
32700 int i;
32701 extract_insn_cached (insn);
32702 for (i = 0; i < recog_data.n_operands; i++)
32703 if (REG_P (recog_data.operand[i])
32704 && REGNO (recog_data.operand[i]) > BX_REG)
32705 return true;
32706 return false;
32707 }
32708
32709 /* Return nonzero when P points to a register encoded via a REX prefix.
32710 Called via for_each_rtx. */
32711 static int
32712 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
32713 {
32714 unsigned int regno;
32715 if (!REG_P (*p))
32716 return 0;
32717 regno = REGNO (*p);
32718 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
32719 }
32720
32721 /* Return true when INSN mentions a register that must be encoded using a
32722 REX prefix. */
32723 bool
32724 x86_extended_reg_mentioned_p (rtx insn)
32725 {
32726 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
32727 extended_reg_mentioned_1, NULL);
32728 }
32729
32730 /* If profitable, negate (without causing overflow) the integer constant
32731 of mode MODE at location LOC. Return true in this case. */
32732 bool
32733 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
32734 {
32735 HOST_WIDE_INT val;
32736
32737 if (!CONST_INT_P (*loc))
32738 return false;
32739
32740 switch (mode)
32741 {
32742 case DImode:
32743 /* DImode x86_64 constants must fit in 32 bits. */
32744 gcc_assert (x86_64_immediate_operand (*loc, mode));
32745
32746 mode = SImode;
32747 break;
32748
32749 case SImode:
32750 case HImode:
32751 case QImode:
32752 break;
32753
32754 default:
32755 gcc_unreachable ();
32756 }
32757
32758 /* Avoid overflows. */
32759 if (mode_signbit_p (mode, *loc))
32760 return false;
32761
32762 val = INTVAL (*loc);
32763
32764 /* Make things pretty and use `subl $4,%eax' rather than `addl $-4,%eax'.
32765 Exception: -128 encodes smaller than 128, so swap the sign and the op. */
32766 if ((val < 0 && val != -128)
32767 || val == 128)
32768 {
32769 *loc = GEN_INT (-val);
32770 return true;
32771 }
32772
32773 return false;
32774 }
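
/* Illustrative sketch, not part of the port: the sign-flip heuristic above
   in plain C, under a hypothetical helper name.  Negative immediates other
   than -128 are negated so `subl $4,%eax' is emitted instead of
   `addl $-4,%eax'; 128 is negated to -128 because -128 still fits in a
   sign-extended 8-bit immediate.  */
static int
example_maybe_negate (long val, long *negated)
{
  if ((val < 0 && val != -128) || val == 128)
    {
      *negated = -val;
      return 1;
    }
  return 0;
}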
32775
32776 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
32777 optabs would emit if we didn't have TFmode patterns. */
32778
32779 void
32780 x86_emit_floatuns (rtx operands[2])
32781 {
32782 rtx neglab, donelab, i0, i1, f0, in, out;
32783 enum machine_mode mode, inmode;
32784
32785 inmode = GET_MODE (operands[1]);
32786 gcc_assert (inmode == SImode || inmode == DImode);
32787
32788 out = operands[0];
32789 in = force_reg (inmode, operands[1]);
32790 mode = GET_MODE (out);
32791 neglab = gen_label_rtx ();
32792 donelab = gen_label_rtx ();
32793 f0 = gen_reg_rtx (mode);
32794
32795 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
32796
32797 expand_float (out, in, 0);
32798
32799 emit_jump_insn (gen_jump (donelab));
32800 emit_barrier ();
32801
32802 emit_label (neglab);
32803
32804 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
32805 1, OPTAB_DIRECT);
32806 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
32807 1, OPTAB_DIRECT);
32808 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
32809
32810 expand_float (f0, i0, 0);
32811
32812 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
32813
32814 emit_label (donelab);
32815 }
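
/* Illustrative sketch, not part of the port: the conversion emitted above,
   written as plain C for a 64-bit input and IEEE double (helper name made
   up).  Values with the sign bit clear convert directly; otherwise the
   input is halved with the low bit folded back in (so rounding is
   unaffected), converted, and then doubled.  */
static double
example_floatuns (unsigned long long u)
{
  unsigned long long half;
  double f;

  if ((long long) u >= 0)
    return (double) (long long) u;

  half = (u >> 1) | (u & 1);
  f = (double) (long long) half;
  return f + f;
}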
32816 \f
32817 /* AVX2 does support 32-byte integer vector operations,
32818 thus the longest vector we are faced with is V32QImode. */
32819 #define MAX_VECT_LEN 32
32820
32821 struct expand_vec_perm_d
32822 {
32823 rtx target, op0, op1;
32824 unsigned char perm[MAX_VECT_LEN];
32825 enum machine_mode vmode;
32826 unsigned char nelt;
32827 bool testing_p;
32828 };
32829
32830 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
32831 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
32832
32833 /* Get a vector mode of the same size as the original but with elements
32834 twice as wide. This is only guaranteed to apply to integral vectors. */
32835
32836 static inline enum machine_mode
32837 get_mode_wider_vector (enum machine_mode o)
32838 {
32839 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
32840 enum machine_mode n = GET_MODE_WIDER_MODE (o);
32841 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
32842 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
32843 return n;
32844 }
32845
32846 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32847 with all elements equal to VAL. Return true if successful. */
32848
32849 static bool
32850 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
32851 rtx target, rtx val)
32852 {
32853 bool ok;
32854
32855 switch (mode)
32856 {
32857 case V2SImode:
32858 case V2SFmode:
32859 if (!mmx_ok)
32860 return false;
32861 /* FALLTHRU */
32862
32863 case V4DFmode:
32864 case V4DImode:
32865 case V8SFmode:
32866 case V8SImode:
32867 case V2DFmode:
32868 case V2DImode:
32869 case V4SFmode:
32870 case V4SImode:
32871 {
32872 rtx insn, dup;
32873
32874 /* First attempt to recognize VAL as-is. */
32875 dup = gen_rtx_VEC_DUPLICATE (mode, val);
32876 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
32877 if (recog_memoized (insn) < 0)
32878 {
32879 rtx seq;
32880 /* If that fails, force VAL into a register. */
32881
32882 start_sequence ();
32883 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
32884 seq = get_insns ();
32885 end_sequence ();
32886 if (seq)
32887 emit_insn_before (seq, insn);
32888
32889 ok = recog_memoized (insn) >= 0;
32890 gcc_assert (ok);
32891 }
32892 }
32893 return true;
32894
32895 case V4HImode:
32896 if (!mmx_ok)
32897 return false;
32898 if (TARGET_SSE || TARGET_3DNOW_A)
32899 {
32900 rtx x;
32901
32902 val = gen_lowpart (SImode, val);
32903 x = gen_rtx_TRUNCATE (HImode, val);
32904 x = gen_rtx_VEC_DUPLICATE (mode, x);
32905 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32906 return true;
32907 }
32908 goto widen;
32909
32910 case V8QImode:
32911 if (!mmx_ok)
32912 return false;
32913 goto widen;
32914
32915 case V8HImode:
32916 if (TARGET_SSE2)
32917 {
32918 struct expand_vec_perm_d dperm;
32919 rtx tmp1, tmp2;
32920
32921 permute:
32922 memset (&dperm, 0, sizeof (dperm));
32923 dperm.target = target;
32924 dperm.vmode = mode;
32925 dperm.nelt = GET_MODE_NUNITS (mode);
32926 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
32927
32928 /* Extend to SImode using a paradoxical SUBREG. */
32929 tmp1 = gen_reg_rtx (SImode);
32930 emit_move_insn (tmp1, gen_lowpart (SImode, val));
32931
32932 /* Insert the SImode value as low element of a V4SImode vector. */
32933 tmp2 = gen_lowpart (V4SImode, dperm.op0);
32934 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
32935
32936 ok = (expand_vec_perm_1 (&dperm)
32937 || expand_vec_perm_broadcast_1 (&dperm));
32938 gcc_assert (ok);
32939 return ok;
32940 }
32941 goto widen;
32942
32943 case V16QImode:
32944 if (TARGET_SSE2)
32945 goto permute;
32946 goto widen;
32947
32948 widen:
32949 /* Replicate the value once into the next wider mode and recurse. */
32950 {
32951 enum machine_mode smode, wsmode, wvmode;
32952 rtx x;
32953
32954 smode = GET_MODE_INNER (mode);
32955 wvmode = get_mode_wider_vector (mode);
32956 wsmode = GET_MODE_INNER (wvmode);
32957
32958 val = convert_modes (wsmode, smode, val, true);
32959 x = expand_simple_binop (wsmode, ASHIFT, val,
32960 GEN_INT (GET_MODE_BITSIZE (smode)),
32961 NULL_RTX, 1, OPTAB_LIB_WIDEN);
32962 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
32963
32964 x = gen_lowpart (wvmode, target);
32965 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
32966 gcc_assert (ok);
32967 return ok;
32968 }
32969
32970 case V16HImode:
32971 case V32QImode:
32972 {
32973 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
32974 rtx x = gen_reg_rtx (hvmode);
32975
32976 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
32977 gcc_assert (ok);
32978
32979 x = gen_rtx_VEC_CONCAT (mode, x, x);
32980 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32981 }
32982 return true;
32983
32984 default:
32985 return false;
32986 }
32987 }
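
/* Illustrative sketch, not part of the port: the "widen" strategy above,
   applied to scalar integers under a hypothetical helper name.  A value is
   replicated into a mode twice as wide by shifting and ORing, and the
   process repeats until the target width is reached.  */
static unsigned int
example_broadcast_byte (unsigned char b)
{
  unsigned int v = b;

  v |= v << 8;		/* two copies in 16 bits */
  v |= v << 16;		/* four copies in 32 bits */
  return v;
}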
32988
32989 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32990 whose ONE_VAR element is VAR, and other elements are zero. Return true
32991 if successful. */
32992
32993 static bool
32994 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
32995 rtx target, rtx var, int one_var)
32996 {
32997 enum machine_mode vsimode;
32998 rtx new_target;
32999 rtx x, tmp;
33000 bool use_vector_set = false;
33001
33002 switch (mode)
33003 {
33004 case V2DImode:
33005 /* For SSE4.1, we normally use vector set. But if the second
33006 element is zero and inter-unit moves are OK, we use movq
33007 instead. */
33008 use_vector_set = (TARGET_64BIT
33009 && TARGET_SSE4_1
33010 && !(TARGET_INTER_UNIT_MOVES
33011 && one_var == 0));
33012 break;
33013 case V16QImode:
33014 case V4SImode:
33015 case V4SFmode:
33016 use_vector_set = TARGET_SSE4_1;
33017 break;
33018 case V8HImode:
33019 use_vector_set = TARGET_SSE2;
33020 break;
33021 case V4HImode:
33022 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
33023 break;
33024 case V32QImode:
33025 case V16HImode:
33026 case V8SImode:
33027 case V8SFmode:
33028 case V4DFmode:
33029 use_vector_set = TARGET_AVX;
33030 break;
33031 case V4DImode:
33032 /* Use ix86_expand_vector_set in 64bit mode only. */
33033 use_vector_set = TARGET_AVX && TARGET_64BIT;
33034 break;
33035 default:
33036 break;
33037 }
33038
33039 if (use_vector_set)
33040 {
33041 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
33042 var = force_reg (GET_MODE_INNER (mode), var);
33043 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33044 return true;
33045 }
33046
33047 switch (mode)
33048 {
33049 case V2SFmode:
33050 case V2SImode:
33051 if (!mmx_ok)
33052 return false;
33053 /* FALLTHRU */
33054
33055 case V2DFmode:
33056 case V2DImode:
33057 if (one_var != 0)
33058 return false;
33059 var = force_reg (GET_MODE_INNER (mode), var);
33060 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
33061 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33062 return true;
33063
33064 case V4SFmode:
33065 case V4SImode:
33066 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
33067 new_target = gen_reg_rtx (mode);
33068 else
33069 new_target = target;
33070 var = force_reg (GET_MODE_INNER (mode), var);
33071 x = gen_rtx_VEC_DUPLICATE (mode, var);
33072 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
33073 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
33074 if (one_var != 0)
33075 {
33076 /* We need to shuffle the value to the correct position, so
33077 create a new pseudo to store the intermediate result. */
33078
33079 /* With SSE2, we can use the integer shuffle insns. */
33080 if (mode != V4SFmode && TARGET_SSE2)
33081 {
33082 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
33083 const1_rtx,
33084 GEN_INT (one_var == 1 ? 0 : 1),
33085 GEN_INT (one_var == 2 ? 0 : 1),
33086 GEN_INT (one_var == 3 ? 0 : 1)));
33087 if (target != new_target)
33088 emit_move_insn (target, new_target);
33089 return true;
33090 }
33091
33092 /* Otherwise convert the intermediate result to V4SFmode and
33093 use the SSE1 shuffle instructions. */
33094 if (mode != V4SFmode)
33095 {
33096 tmp = gen_reg_rtx (V4SFmode);
33097 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
33098 }
33099 else
33100 tmp = new_target;
33101
33102 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
33103 const1_rtx,
33104 GEN_INT (one_var == 1 ? 0 : 1),
33105 GEN_INT (one_var == 2 ? 0+4 : 1+4),
33106 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
33107
33108 if (mode != V4SFmode)
33109 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
33110 else if (tmp != target)
33111 emit_move_insn (target, tmp);
33112 }
33113 else if (target != new_target)
33114 emit_move_insn (target, new_target);
33115 return true;
33116
33117 case V8HImode:
33118 case V16QImode:
33119 vsimode = V4SImode;
33120 goto widen;
33121 case V4HImode:
33122 case V8QImode:
33123 if (!mmx_ok)
33124 return false;
33125 vsimode = V2SImode;
33126 goto widen;
33127 widen:
33128 if (one_var != 0)
33129 return false;
33130
33131 /* Zero extend the variable element to SImode and recurse. */
33132 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
33133
33134 x = gen_reg_rtx (vsimode);
33135 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
33136 var, one_var))
33137 gcc_unreachable ();
33138
33139 emit_move_insn (target, gen_lowpart (mode, x));
33140 return true;
33141
33142 default:
33143 return false;
33144 }
33145 }
33146
33147 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33148 consisting of the values in VALS. It is known that all elements
33149 except ONE_VAR are constants. Return true if successful. */
33150
33151 static bool
33152 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
33153 rtx target, rtx vals, int one_var)
33154 {
33155 rtx var = XVECEXP (vals, 0, one_var);
33156 enum machine_mode wmode;
33157 rtx const_vec, x;
33158
33159 const_vec = copy_rtx (vals);
33160 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
33161 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
33162
33163 switch (mode)
33164 {
33165 case V2DFmode:
33166 case V2DImode:
33167 case V2SFmode:
33168 case V2SImode:
33169 /* For the two element vectors, it's just as easy to use
33170 the general case. */
33171 return false;
33172
33173 case V4DImode:
33174 /* Use ix86_expand_vector_set in 64bit mode only. */
33175 if (!TARGET_64BIT)
33176 return false;
33177 case V4DFmode:
33178 case V8SFmode:
33179 case V8SImode:
33180 case V16HImode:
33181 case V32QImode:
33182 case V4SFmode:
33183 case V4SImode:
33184 case V8HImode:
33185 case V4HImode:
33186 break;
33187
33188 case V16QImode:
33189 if (TARGET_SSE4_1)
33190 break;
33191 wmode = V8HImode;
33192 goto widen;
33193 case V8QImode:
33194 wmode = V4HImode;
33195 goto widen;
33196 widen:
33197 /* There's no way to set one QImode entry easily. Combine
33198 the variable value with its adjacent constant value, and
33199 promote to an HImode set. */
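/* For example, with the variable byte at an odd index the combined
   halfword is (var << 8) | low_const, and at an even index it is
   var | (high_const << 8); the halfword is then inserted at position
   one_var / 2 of the wider vector.  */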
33200 x = XVECEXP (vals, 0, one_var ^ 1);
33201 if (one_var & 1)
33202 {
33203 var = convert_modes (HImode, QImode, var, true);
33204 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
33205 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33206 x = GEN_INT (INTVAL (x) & 0xff);
33207 }
33208 else
33209 {
33210 var = convert_modes (HImode, QImode, var, true);
33211 x = gen_int_mode (INTVAL (x) << 8, HImode);
33212 }
33213 if (x != const0_rtx)
33214 var = expand_simple_binop (HImode, IOR, var, x, var,
33215 1, OPTAB_LIB_WIDEN);
33216
33217 x = gen_reg_rtx (wmode);
33218 emit_move_insn (x, gen_lowpart (wmode, const_vec));
33219 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
33220
33221 emit_move_insn (target, gen_lowpart (mode, x));
33222 return true;
33223
33224 default:
33225 return false;
33226 }
33227
33228 emit_move_insn (target, const_vec);
33229 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33230 return true;
33231 }
33232
33233 /* A subroutine of ix86_expand_vector_init_general. Use vector
33234 concatenate to handle the most general case: all values variable,
33235 and none identical. */
33236
33237 static void
33238 ix86_expand_vector_init_concat (enum machine_mode mode,
33239 rtx target, rtx *ops, int n)
33240 {
33241 enum machine_mode cmode, hmode = VOIDmode;
33242 rtx first[8], second[4];
33243 rtvec v;
33244 int i, j;
33245
33246 switch (n)
33247 {
33248 case 2:
33249 switch (mode)
33250 {
33251 case V8SImode:
33252 cmode = V4SImode;
33253 break;
33254 case V8SFmode:
33255 cmode = V4SFmode;
33256 break;
33257 case V4DImode:
33258 cmode = V2DImode;
33259 break;
33260 case V4DFmode:
33261 cmode = V2DFmode;
33262 break;
33263 case V4SImode:
33264 cmode = V2SImode;
33265 break;
33266 case V4SFmode:
33267 cmode = V2SFmode;
33268 break;
33269 case V2DImode:
33270 cmode = DImode;
33271 break;
33272 case V2SImode:
33273 cmode = SImode;
33274 break;
33275 case V2DFmode:
33276 cmode = DFmode;
33277 break;
33278 case V2SFmode:
33279 cmode = SFmode;
33280 break;
33281 default:
33282 gcc_unreachable ();
33283 }
33284
33285 if (!register_operand (ops[1], cmode))
33286 ops[1] = force_reg (cmode, ops[1]);
33287 if (!register_operand (ops[0], cmode))
33288 ops[0] = force_reg (cmode, ops[0]);
33289 emit_insn (gen_rtx_SET (VOIDmode, target,
33290 gen_rtx_VEC_CONCAT (mode, ops[0],
33291 ops[1])));
33292 break;
33293
33294 case 4:
33295 switch (mode)
33296 {
33297 case V4DImode:
33298 cmode = V2DImode;
33299 break;
33300 case V4DFmode:
33301 cmode = V2DFmode;
33302 break;
33303 case V4SImode:
33304 cmode = V2SImode;
33305 break;
33306 case V4SFmode:
33307 cmode = V2SFmode;
33308 break;
33309 default:
33310 gcc_unreachable ();
33311 }
33312 goto half;
33313
33314 case 8:
33315 switch (mode)
33316 {
33317 case V8SImode:
33318 cmode = V2SImode;
33319 hmode = V4SImode;
33320 break;
33321 case V8SFmode:
33322 cmode = V2SFmode;
33323 hmode = V4SFmode;
33324 break;
33325 default:
33326 gcc_unreachable ();
33327 }
33328 goto half;
33329
33330 half:
33331 /* FIXME: We process inputs backward to help RA. PR 36222. */
33332 i = n - 1;
33333 j = (n >> 1) - 1;
33334 for (; i > 0; i -= 2, j--)
33335 {
33336 first[j] = gen_reg_rtx (cmode);
33337 v = gen_rtvec (2, ops[i - 1], ops[i]);
33338 ix86_expand_vector_init (false, first[j],
33339 gen_rtx_PARALLEL (cmode, v));
33340 }
33341
33342 n >>= 1;
33343 if (n > 2)
33344 {
33345 gcc_assert (hmode != VOIDmode);
33346 for (i = j = 0; i < n; i += 2, j++)
33347 {
33348 second[j] = gen_reg_rtx (hmode);
33349 ix86_expand_vector_init_concat (hmode, second [j],
33350 &first [i], 2);
33351 }
33352 n >>= 1;
33353 ix86_expand_vector_init_concat (mode, target, second, n);
33354 }
33355 else
33356 ix86_expand_vector_init_concat (mode, target, first, n);
33357 break;
33358
33359 default:
33360 gcc_unreachable ();
33361 }
33362 }
33363
33364 /* A subroutine of ix86_expand_vector_init_general. Use vector
33365 interleave to handle the most general case: all values variable,
33366 and none identical. */
33367
33368 static void
33369 ix86_expand_vector_init_interleave (enum machine_mode mode,
33370 rtx target, rtx *ops, int n)
33371 {
33372 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
33373 int i, j;
33374 rtx op0, op1;
33375 rtx (*gen_load_even) (rtx, rtx, rtx);
33376 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
33377 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
33378
33379 switch (mode)
33380 {
33381 case V8HImode:
33382 gen_load_even = gen_vec_setv8hi;
33383 gen_interleave_first_low = gen_vec_interleave_lowv4si;
33384 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33385 inner_mode = HImode;
33386 first_imode = V4SImode;
33387 second_imode = V2DImode;
33388 third_imode = VOIDmode;
33389 break;
33390 case V16QImode:
33391 gen_load_even = gen_vec_setv16qi;
33392 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
33393 gen_interleave_second_low = gen_vec_interleave_lowv4si;
33394 inner_mode = QImode;
33395 first_imode = V8HImode;
33396 second_imode = V4SImode;
33397 third_imode = V2DImode;
33398 break;
33399 default:
33400 gcc_unreachable ();
33401 }
33402
33403 for (i = 0; i < n; i++)
33404 {
33405 /* Extend the odd element to SImode using a paradoxical SUBREG. */
33406 op0 = gen_reg_rtx (SImode);
33407 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
33408
33409 /* Insert the SImode value as low element of V4SImode vector. */
33410 op1 = gen_reg_rtx (V4SImode);
33411 op0 = gen_rtx_VEC_MERGE (V4SImode,
33412 gen_rtx_VEC_DUPLICATE (V4SImode,
33413 op0),
33414 CONST0_RTX (V4SImode),
33415 const1_rtx);
33416 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
33417
33418 /* Cast the V4SImode vector back to a vector in original mode. */
33419 op0 = gen_reg_rtx (mode);
33420 emit_move_insn (op0, gen_lowpart (mode, op1));
33421
33422 /* Load even elements into the second position. */
33423 emit_insn (gen_load_even (op0,
33424 force_reg (inner_mode,
33425 ops [i + i + 1]),
33426 const1_rtx));
33427
33428 /* Cast vector to FIRST_IMODE vector. */
33429 ops[i] = gen_reg_rtx (first_imode);
33430 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
33431 }
33432
33433 /* Interleave low FIRST_IMODE vectors. */
33434 for (i = j = 0; i < n; i += 2, j++)
33435 {
33436 op0 = gen_reg_rtx (first_imode);
33437 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
33438
33439 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
33440 ops[j] = gen_reg_rtx (second_imode);
33441 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
33442 }
33443
33444 /* Interleave low SECOND_IMODE vectors. */
33445 switch (second_imode)
33446 {
33447 case V4SImode:
33448 for (i = j = 0; i < n / 2; i += 2, j++)
33449 {
33450 op0 = gen_reg_rtx (second_imode);
33451 emit_insn (gen_interleave_second_low (op0, ops[i],
33452 ops[i + 1]));
33453
33454 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
33455 vector. */
33456 ops[j] = gen_reg_rtx (third_imode);
33457 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
33458 }
33459 second_imode = V2DImode;
33460 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33461 /* FALLTHRU */
33462
33463 case V2DImode:
33464 op0 = gen_reg_rtx (second_imode);
33465 emit_insn (gen_interleave_second_low (op0, ops[0],
33466 ops[1]));
33467
33468 /* Cast the SECOND_IMODE vector back to a vector in original
33469 mode. */
33470 emit_insn (gen_rtx_SET (VOIDmode, target,
33471 gen_lowpart (mode, op0)));
33472 break;
33473
33474 default:
33475 gcc_unreachable ();
33476 }
33477 }
33478
33479 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
33480 all values variable, and none identical. */
33481
33482 static void
33483 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
33484 rtx target, rtx vals)
33485 {
33486 rtx ops[32], op0, op1;
33487 enum machine_mode half_mode = VOIDmode;
33488 int n, i;
33489
33490 switch (mode)
33491 {
33492 case V2SFmode:
33493 case V2SImode:
33494 if (!mmx_ok && !TARGET_SSE)
33495 break;
33496 /* FALLTHRU */
33497
33498 case V8SFmode:
33499 case V8SImode:
33500 case V4DFmode:
33501 case V4DImode:
33502 case V4SFmode:
33503 case V4SImode:
33504 case V2DFmode:
33505 case V2DImode:
33506 n = GET_MODE_NUNITS (mode);
33507 for (i = 0; i < n; i++)
33508 ops[i] = XVECEXP (vals, 0, i);
33509 ix86_expand_vector_init_concat (mode, target, ops, n);
33510 return;
33511
33512 case V32QImode:
33513 half_mode = V16QImode;
33514 goto half;
33515
33516 case V16HImode:
33517 half_mode = V8HImode;
33518 goto half;
33519
33520 half:
33521 n = GET_MODE_NUNITS (mode);
33522 for (i = 0; i < n; i++)
33523 ops[i] = XVECEXP (vals, 0, i);
33524 op0 = gen_reg_rtx (half_mode);
33525 op1 = gen_reg_rtx (half_mode);
33526 ix86_expand_vector_init_interleave (half_mode, op0, ops,
33527 n >> 2);
33528 ix86_expand_vector_init_interleave (half_mode, op1,
33529 &ops [n >> 1], n >> 2);
33530 emit_insn (gen_rtx_SET (VOIDmode, target,
33531 gen_rtx_VEC_CONCAT (mode, op0, op1)));
33532 return;
33533
33534 case V16QImode:
33535 if (!TARGET_SSE4_1)
33536 break;
33537 /* FALLTHRU */
33538
33539 case V8HImode:
33540 if (!TARGET_SSE2)
33541 break;
33542
33543 /* Don't use ix86_expand_vector_init_interleave if we can't
33544 move from GPR to SSE register directly. */
33545 if (!TARGET_INTER_UNIT_MOVES)
33546 break;
33547
33548 n = GET_MODE_NUNITS (mode);
33549 for (i = 0; i < n; i++)
33550 ops[i] = XVECEXP (vals, 0, i);
33551 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
33552 return;
33553
33554 case V4HImode:
33555 case V8QImode:
33556 break;
33557
33558 default:
33559 gcc_unreachable ();
33560 }
33561
33562 {
33563 int i, j, n_elts, n_words, n_elt_per_word;
33564 enum machine_mode inner_mode;
33565 rtx words[4], shift;
33566
33567 inner_mode = GET_MODE_INNER (mode);
33568 n_elts = GET_MODE_NUNITS (mode);
33569 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
33570 n_elt_per_word = n_elts / n_words;
33571 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
33572
33573 for (i = 0; i < n_words; ++i)
33574 {
33575 rtx word = NULL_RTX;
33576
33577 for (j = 0; j < n_elt_per_word; ++j)
33578 {
33579 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
33580 elt = convert_modes (word_mode, inner_mode, elt, true);
33581
33582 if (j == 0)
33583 word = elt;
33584 else
33585 {
33586 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
33587 word, 1, OPTAB_LIB_WIDEN);
33588 word = expand_simple_binop (word_mode, IOR, word, elt,
33589 word, 1, OPTAB_LIB_WIDEN);
33590 }
33591 }
33592
33593 words[i] = word;
33594 }
33595
33596 if (n_words == 1)
33597 emit_move_insn (target, gen_lowpart (mode, words[0]));
33598 else if (n_words == 2)
33599 {
33600 rtx tmp = gen_reg_rtx (mode);
33601 emit_clobber (tmp);
33602 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
33603 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
33604 emit_move_insn (target, tmp);
33605 }
33606 else if (n_words == 4)
33607 {
33608 rtx tmp = gen_reg_rtx (V4SImode);
33609 gcc_assert (word_mode == SImode);
33610 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
33611 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
33612 emit_move_insn (target, gen_lowpart (mode, tmp));
33613 }
33614 else
33615 gcc_unreachable ();
33616 }
33617 }
33618
33619 /* Initialize vector TARGET via VALS. Suppress the use of MMX
33620 instructions unless MMX_OK is true. */
33621
33622 void
33623 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
33624 {
33625 enum machine_mode mode = GET_MODE (target);
33626 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33627 int n_elts = GET_MODE_NUNITS (mode);
33628 int n_var = 0, one_var = -1;
33629 bool all_same = true, all_const_zero = true;
33630 int i;
33631 rtx x;
33632
33633 for (i = 0; i < n_elts; ++i)
33634 {
33635 x = XVECEXP (vals, 0, i);
33636 if (!(CONST_INT_P (x)
33637 || GET_CODE (x) == CONST_DOUBLE
33638 || GET_CODE (x) == CONST_FIXED))
33639 n_var++, one_var = i;
33640 else if (x != CONST0_RTX (inner_mode))
33641 all_const_zero = false;
33642 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
33643 all_same = false;
33644 }
33645
33646 /* Constants are best loaded from the constant pool. */
33647 if (n_var == 0)
33648 {
33649 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
33650 return;
33651 }
33652
33653 /* If all values are identical, broadcast the value. */
33654 if (all_same
33655 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
33656 XVECEXP (vals, 0, 0)))
33657 return;
33658
33659 /* Values where only one field is non-constant are best loaded from
33660 the pool and overwritten via move later. */
33661 if (n_var == 1)
33662 {
33663 if (all_const_zero
33664 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
33665 XVECEXP (vals, 0, one_var),
33666 one_var))
33667 return;
33668
33669 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
33670 return;
33671 }
33672
33673 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
33674 }
33675
33676 void
33677 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
33678 {
33679 enum machine_mode mode = GET_MODE (target);
33680 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33681 enum machine_mode half_mode;
33682 bool use_vec_merge = false;
33683 rtx tmp;
33684 static rtx (*gen_extract[6][2]) (rtx, rtx)
33685 = {
33686 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
33687 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
33688 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
33689 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
33690 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
33691 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
33692 };
33693 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
33694 = {
33695 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
33696 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
33697 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
33698 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
33699 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
33700 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
33701 };
33702 int i, j, n;
33703
33704 switch (mode)
33705 {
33706 case V2SFmode:
33707 case V2SImode:
33708 if (mmx_ok)
33709 {
33710 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33711 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
33712 if (elt == 0)
33713 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33714 else
33715 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33716 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33717 return;
33718 }
33719 break;
33720
33721 case V2DImode:
33722 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
33723 if (use_vec_merge)
33724 break;
33725
33726 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33727 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
33728 if (elt == 0)
33729 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33730 else
33731 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33732 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33733 return;
33734
33735 case V2DFmode:
33736 {
33737 rtx op0, op1;
33738
33739 /* For the two element vectors, we implement a VEC_CONCAT with
33740 the extraction of the other element. */
33741
33742 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
33743 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
33744
33745 if (elt == 0)
33746 op0 = val, op1 = tmp;
33747 else
33748 op0 = tmp, op1 = val;
33749
33750 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
33751 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33752 }
33753 return;
33754
33755 case V4SFmode:
33756 use_vec_merge = TARGET_SSE4_1;
33757 if (use_vec_merge)
33758 break;
33759
33760 switch (elt)
33761 {
33762 case 0:
33763 use_vec_merge = true;
33764 break;
33765
33766 case 1:
33767 /* tmp = target = A B C D */
33768 tmp = copy_to_reg (target);
33769 /* target = A A B B */
33770 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
33771 /* target = X A B B */
33772 ix86_expand_vector_set (false, target, val, 0);
33773 /* target = A X C D */
33774 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33775 const1_rtx, const0_rtx,
33776 GEN_INT (2+4), GEN_INT (3+4)));
33777 return;
33778
33779 case 2:
33780 /* tmp = target = A B C D */
33781 tmp = copy_to_reg (target);
33782 /* tmp = X B C D */
33783 ix86_expand_vector_set (false, tmp, val, 0);
33784 /* target = A B X D */
33785 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33786 const0_rtx, const1_rtx,
33787 GEN_INT (0+4), GEN_INT (3+4)));
33788 return;
33789
33790 case 3:
33791 /* tmp = target = A B C D */
33792 tmp = copy_to_reg (target);
33793 /* tmp = X B C D */
33794 ix86_expand_vector_set (false, tmp, val, 0);
33795 /* target = A B C X */
33796 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33797 const0_rtx, const1_rtx,
33798 GEN_INT (2+4), GEN_INT (0+4)));
33799 return;
33800
33801 default:
33802 gcc_unreachable ();
33803 }
33804 break;
33805
33806 case V4SImode:
33807 use_vec_merge = TARGET_SSE4_1;
33808 if (use_vec_merge)
33809 break;
33810
33811 /* Element 0 handled by vec_merge below. */
33812 if (elt == 0)
33813 {
33814 use_vec_merge = true;
33815 break;
33816 }
33817
33818 if (TARGET_SSE2)
33819 {
33820 /* With SSE2, use integer shuffles to swap element 0 and ELT,
33821 store into element 0, then shuffle them back. */
33822
33823 rtx order[4];
33824
33825 order[0] = GEN_INT (elt);
33826 order[1] = const1_rtx;
33827 order[2] = const2_rtx;
33828 order[3] = GEN_INT (3);
33829 order[elt] = const0_rtx;
33830
33831 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33832 order[1], order[2], order[3]));
33833
33834 ix86_expand_vector_set (false, target, val, 0);
33835
33836 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33837 order[1], order[2], order[3]));
33838 }
33839 else
33840 {
33841 /* For SSE1, we have to reuse the V4SF code. */
33842 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
33843 gen_lowpart (SFmode, val), elt);
33844 }
33845 return;
33846
33847 case V8HImode:
33848 use_vec_merge = TARGET_SSE2;
33849 break;
33850 case V4HImode:
33851 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33852 break;
33853
33854 case V16QImode:
33855 use_vec_merge = TARGET_SSE4_1;
33856 break;
33857
33858 case V8QImode:
33859 break;
33860
33861 case V32QImode:
33862 half_mode = V16QImode;
33863 j = 0;
33864 n = 16;
33865 goto half;
33866
33867 case V16HImode:
33868 half_mode = V8HImode;
33869 j = 1;
33870 n = 8;
33871 goto half;
33872
33873 case V8SImode:
33874 half_mode = V4SImode;
33875 j = 2;
33876 n = 4;
33877 goto half;
33878
33879 case V4DImode:
33880 half_mode = V2DImode;
33881 j = 3;
33882 n = 2;
33883 goto half;
33884
33885 case V8SFmode:
33886 half_mode = V4SFmode;
33887 j = 4;
33888 n = 4;
33889 goto half;
33890
33891 case V4DFmode:
33892 half_mode = V2DFmode;
33893 j = 5;
33894 n = 2;
33895 goto half;
33896
33897 half:
33898 /* Compute offset. */
33899 i = elt / n;
33900 elt %= n;
33901
33902 gcc_assert (i <= 1);
33903
33904 /* Extract the half. */
33905 tmp = gen_reg_rtx (half_mode);
33906 emit_insn (gen_extract[j][i] (tmp, target));
33907
33908 /* Put val in tmp at elt. */
33909 ix86_expand_vector_set (false, tmp, val, elt);
33910
33911 /* Put it back. */
33912 emit_insn (gen_insert[j][i] (target, target, tmp));
33913 return;
33914
33915 default:
33916 break;
33917 }
33918
33919 if (use_vec_merge)
33920 {
33921 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
33922 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
33923 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33924 }
33925 else
33926 {
33927 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33928
33929 emit_move_insn (mem, target);
33930
33931 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33932 emit_move_insn (tmp, val);
33933
33934 emit_move_insn (target, mem);
33935 }
33936 }
33937
33938 void
33939 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
33940 {
33941 enum machine_mode mode = GET_MODE (vec);
33942 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33943 bool use_vec_extr = false;
33944 rtx tmp;
33945
33946 switch (mode)
33947 {
33948 case V2SImode:
33949 case V2SFmode:
33950 if (!mmx_ok)
33951 break;
33952 /* FALLTHRU */
33953
33954 case V2DFmode:
33955 case V2DImode:
33956 use_vec_extr = true;
33957 break;
33958
33959 case V4SFmode:
33960 use_vec_extr = TARGET_SSE4_1;
33961 if (use_vec_extr)
33962 break;
33963
33964 switch (elt)
33965 {
33966 case 0:
33967 tmp = vec;
33968 break;
33969
33970 case 1:
33971 case 3:
33972 tmp = gen_reg_rtx (mode);
33973 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
33974 GEN_INT (elt), GEN_INT (elt),
33975 GEN_INT (elt+4), GEN_INT (elt+4)));
33976 break;
33977
33978 case 2:
33979 tmp = gen_reg_rtx (mode);
33980 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
33981 break;
33982
33983 default:
33984 gcc_unreachable ();
33985 }
33986 vec = tmp;
33987 use_vec_extr = true;
33988 elt = 0;
33989 break;
33990
33991 case V4SImode:
33992 use_vec_extr = TARGET_SSE4_1;
33993 if (use_vec_extr)
33994 break;
33995
33996 if (TARGET_SSE2)
33997 {
33998 switch (elt)
33999 {
34000 case 0:
34001 tmp = vec;
34002 break;
34003
34004 case 1:
34005 case 3:
34006 tmp = gen_reg_rtx (mode);
34007 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
34008 GEN_INT (elt), GEN_INT (elt),
34009 GEN_INT (elt), GEN_INT (elt)));
34010 break;
34011
34012 case 2:
34013 tmp = gen_reg_rtx (mode);
34014 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
34015 break;
34016
34017 default:
34018 gcc_unreachable ();
34019 }
34020 vec = tmp;
34021 use_vec_extr = true;
34022 elt = 0;
34023 }
34024 else
34025 {
34026 /* For SSE1, we have to reuse the V4SF code. */
34027 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
34028 gen_lowpart (V4SFmode, vec), elt);
34029 return;
34030 }
34031 break;
34032
34033 case V8HImode:
34034 use_vec_extr = TARGET_SSE2;
34035 break;
34036 case V4HImode:
34037 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34038 break;
34039
34040 case V16QImode:
34041 use_vec_extr = TARGET_SSE4_1;
34042 break;
34043
34044 case V8SFmode:
34045 if (TARGET_AVX)
34046 {
34047 tmp = gen_reg_rtx (V4SFmode);
34048 if (elt < 4)
34049 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
34050 else
34051 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
34052 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34053 return;
34054 }
34055 break;
34056
34057 case V4DFmode:
34058 if (TARGET_AVX)
34059 {
34060 tmp = gen_reg_rtx (V2DFmode);
34061 if (elt < 2)
34062 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
34063 else
34064 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
34065 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34066 return;
34067 }
34068 break;
34069
34070 case V32QImode:
34071 if (TARGET_AVX)
34072 {
34073 tmp = gen_reg_rtx (V16QImode);
34074 if (elt < 16)
34075 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
34076 else
34077 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
34078 ix86_expand_vector_extract (false, target, tmp, elt & 15);
34079 return;
34080 }
34081 break;
34082
34083 case V16HImode:
34084 if (TARGET_AVX)
34085 {
34086 tmp = gen_reg_rtx (V8HImode);
34087 if (elt < 8)
34088 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
34089 else
34090 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
34091 ix86_expand_vector_extract (false, target, tmp, elt & 7);
34092 return;
34093 }
34094 break;
34095
34096 case V8SImode:
34097 if (TARGET_AVX)
34098 {
34099 tmp = gen_reg_rtx (V4SImode);
34100 if (elt < 4)
34101 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
34102 else
34103 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
34104 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34105 return;
34106 }
34107 break;
34108
34109 case V4DImode:
34110 if (TARGET_AVX)
34111 {
34112 tmp = gen_reg_rtx (V2DImode);
34113 if (elt < 2)
34114 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
34115 else
34116 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
34117 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34118 return;
34119 }
34120 break;
34121
34122 case V8QImode:
34123 /* ??? Could extract the appropriate HImode element and shift. */
34124 default:
34125 break;
34126 }
34127
34128 if (use_vec_extr)
34129 {
34130 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
34131 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
34132
34133 /* Let the rtl optimizers know about the zero extension performed. */
34134 if (inner_mode == QImode || inner_mode == HImode)
34135 {
34136 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
34137 target = gen_lowpart (SImode, target);
34138 }
34139
34140 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34141 }
34142 else
34143 {
34144 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
34145
34146 emit_move_insn (mem, vec);
34147
34148 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34149 emit_move_insn (target, tmp);
34150 }
34151 }
34152
34153 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
34154 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
34155 The upper bits of DEST are undefined, though they shouldn't cause
34156 exceptions (some bits from src or all zeros are ok). */
34157
34158 static void
34159 emit_reduc_half (rtx dest, rtx src, int i)
34160 {
34161 rtx tem;
34162 switch (GET_MODE (src))
34163 {
34164 case V4SFmode:
34165 if (i == 128)
34166 tem = gen_sse_movhlps (dest, src, src);
34167 else
34168 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
34169 GEN_INT (1 + 4), GEN_INT (1 + 4));
34170 break;
34171 case V2DFmode:
34172 tem = gen_vec_interleave_highv2df (dest, src, src);
34173 break;
34174 case V16QImode:
34175 case V8HImode:
34176 case V4SImode:
34177 case V2DImode:
34178 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
34179 gen_lowpart (V1TImode, src),
34180 GEN_INT (i / 2));
34181 break;
34182 case V8SFmode:
34183 if (i == 256)
34184 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
34185 else
34186 tem = gen_avx_shufps256 (dest, src, src,
34187 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
34188 break;
34189 case V4DFmode:
34190 if (i == 256)
34191 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
34192 else
34193 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
34194 break;
34195 case V32QImode:
34196 case V16HImode:
34197 case V8SImode:
34198 case V4DImode:
34199 if (i == 256)
34200 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
34201 gen_lowpart (V4DImode, src),
34202 gen_lowpart (V4DImode, src),
34203 const1_rtx);
34204 else
34205 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
34206 gen_lowpart (V2TImode, src),
34207 GEN_INT (i / 2));
34208 break;
34209 default:
34210 gcc_unreachable ();
34211 }
34212 emit_insn (tem);
34213 }
34214
34215 /* Expand a vector reduction. FN is the binary pattern to reduce;
34216 DEST is the destination; IN is the input vector. */
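/* As an illustration, reducing a V4SImode vector {a, b, c, d} with an
   addition pattern first shifts the upper 64 bits down and adds, giving
   {a+c, b+d, ...}, then shifts the upper 32 bits of that down and adds
   again, leaving the total a+b+c+d in element 0.  */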
34217
34218 void
34219 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
34220 {
34221 rtx half, dst, vec = in;
34222 enum machine_mode mode = GET_MODE (in);
34223 int i;
34224
34225 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
34226 if (TARGET_SSE4_1
34227 && mode == V8HImode
34228 && fn == gen_uminv8hi3)
34229 {
34230 emit_insn (gen_sse4_1_phminposuw (dest, in));
34231 return;
34232 }
34233
34234 for (i = GET_MODE_BITSIZE (mode);
34235 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
34236 i >>= 1)
34237 {
34238 half = gen_reg_rtx (mode);
34239 emit_reduc_half (half, vec, i);
34240 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
34241 dst = dest;
34242 else
34243 dst = gen_reg_rtx (mode);
34244 emit_insn (fn (dst, half, vec));
34245 vec = dst;
34246 }
34247 }
34248 \f
34249 /* Target hook for scalar_mode_supported_p. */
34250 static bool
34251 ix86_scalar_mode_supported_p (enum machine_mode mode)
34252 {
34253 if (DECIMAL_FLOAT_MODE_P (mode))
34254 return default_decimal_float_supported_p ();
34255 else if (mode == TFmode)
34256 return true;
34257 else
34258 return default_scalar_mode_supported_p (mode);
34259 }
34260
34261 /* Implements target hook vector_mode_supported_p. */
34262 static bool
34263 ix86_vector_mode_supported_p (enum machine_mode mode)
34264 {
34265 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34266 return true;
34267 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34268 return true;
34269 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34270 return true;
34271 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
34272 return true;
34273 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
34274 return true;
34275 return false;
34276 }
34277
34278 /* Target hook for c_mode_for_suffix. */
34279 static enum machine_mode
34280 ix86_c_mode_for_suffix (char suffix)
34281 {
34282 if (suffix == 'q')
34283 return TFmode;
34284 if (suffix == 'w')
34285 return XFmode;
34286
34287 return VOIDmode;
34288 }
34289
34290 /* Worker function for TARGET_MD_ASM_CLOBBERS.
34291
34292 We do this in the new i386 backend to maintain source compatibility
34293 with the old cc0-based compiler. */
34294
34295 static tree
34296 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
34297 tree inputs ATTRIBUTE_UNUSED,
34298 tree clobbers)
34299 {
34300 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
34301 clobbers);
34302 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
34303 clobbers);
34304 return clobbers;
34305 }
34306
34307 /* Implements target vector targetm.asm.encode_section_info. */
34308
34309 static void ATTRIBUTE_UNUSED
34310 ix86_encode_section_info (tree decl, rtx rtl, int first)
34311 {
34312 default_encode_section_info (decl, rtl, first);
34313
34314 if (TREE_CODE (decl) == VAR_DECL
34315 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
34316 && ix86_in_large_data_p (decl))
34317 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
34318 }
34319
34320 /* Worker function for REVERSE_CONDITION. */
34321
34322 enum rtx_code
34323 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
34324 {
34325 return (mode != CCFPmode && mode != CCFPUmode
34326 ? reverse_condition (code)
34327 : reverse_condition_maybe_unordered (code));
34328 }
34329
34330 /* Output code to perform an x87 FP register move, from OPERANDS[1]
34331 to OPERANDS[0]. */
34332
34333 const char *
34334 output_387_reg_move (rtx insn, rtx *operands)
34335 {
34336 if (REG_P (operands[0]))
34337 {
34338 if (REG_P (operands[1])
34339 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34340 {
34341 if (REGNO (operands[0]) == FIRST_STACK_REG)
34342 return output_387_ffreep (operands, 0);
34343 return "fstp\t%y0";
34344 }
34345 if (STACK_TOP_P (operands[0]))
34346 return "fld%Z1\t%y1";
34347 return "fst\t%y0";
34348 }
34349 else if (MEM_P (operands[0]))
34350 {
34351 gcc_assert (REG_P (operands[1]));
34352 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34353 return "fstp%Z0\t%y0";
34354 else
34355 {
34356 /* There is no non-popping store to memory for XFmode.
34357 So if we need one, follow the store with a load. */
34358 if (GET_MODE (operands[0]) == XFmode)
34359 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
34360 else
34361 return "fst%Z0\t%y0";
34362 }
34363 }
34364 else
34365 gcc_unreachable();
34366 }
34367
34368 /* Output code to perform a conditional jump to LABEL, if C2 flag in
34369 FP status register is set. */
34370
34371 void
34372 ix86_emit_fp_unordered_jump (rtx label)
34373 {
34374 rtx reg = gen_reg_rtx (HImode);
34375 rtx temp;
34376
34377 emit_insn (gen_x86_fnstsw_1 (reg));
34378
34379 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
34380 {
34381 emit_insn (gen_x86_sahf_1 (reg));
34382
34383 temp = gen_rtx_REG (CCmode, FLAGS_REG);
34384 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
34385 }
34386 else
34387 {
34388 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
34389
34390 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
34391 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
34392 }
34393
34394 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
34395 gen_rtx_LABEL_REF (VOIDmode, label),
34396 pc_rtx);
34397 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
34398
34399 emit_jump_insn (temp);
34400 predict_jump (REG_BR_PROB_BASE * 10 / 100);
34401 }
34402
34403 /* Output code to perform a log1p XFmode calculation. */
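/* The x87 fyl2xp1 instruction computes y * log2(x + 1) but its operand is
   only specified for |x| < 1 - sqrt(2)/2 (about 0.29289, the constant
   tested below); within that range we use fyl2xp1 with y = ln(2)
   directly, otherwise we form 1 + x explicitly and use fyl2x instead.  */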
34404
34405 void ix86_emit_i387_log1p (rtx op0, rtx op1)
34406 {
34407 rtx label1 = gen_label_rtx ();
34408 rtx label2 = gen_label_rtx ();
34409
34410 rtx tmp = gen_reg_rtx (XFmode);
34411 rtx tmp2 = gen_reg_rtx (XFmode);
34412 rtx test;
34413
34414 emit_insn (gen_absxf2 (tmp, op1));
34415 test = gen_rtx_GE (VOIDmode, tmp,
34416 CONST_DOUBLE_FROM_REAL_VALUE (
34417 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
34418 XFmode));
34419 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
34420
34421 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34422 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
34423 emit_jump (label2);
34424
34425 emit_label (label1);
34426 emit_move_insn (tmp, CONST1_RTX (XFmode));
34427 emit_insn (gen_addxf3 (tmp, op1, tmp));
34428 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34429 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
34430
34431 emit_label (label2);
34432 }
34433
34434 /* Emit code for round calculation. */
34435 void ix86_emit_i387_round (rtx op0, rtx op1)
34436 {
34437 enum machine_mode inmode = GET_MODE (op1);
34438 enum machine_mode outmode = GET_MODE (op0);
34439 rtx e1, e2, res, tmp, tmp1, half;
34440 rtx scratch = gen_reg_rtx (HImode);
34441 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
34442 rtx jump_label = gen_label_rtx ();
34443 rtx insn;
34444 rtx (*gen_abs) (rtx, rtx);
34445 rtx (*gen_neg) (rtx, rtx);
34446
34447 switch (inmode)
34448 {
34449 case SFmode:
34450 gen_abs = gen_abssf2;
34451 break;
34452 case DFmode:
34453 gen_abs = gen_absdf2;
34454 break;
34455 case XFmode:
34456 gen_abs = gen_absxf2;
34457 break;
34458 default:
34459 gcc_unreachable ();
34460 }
34461
34462 switch (outmode)
34463 {
34464 case SFmode:
34465 gen_neg = gen_negsf2;
34466 break;
34467 case DFmode:
34468 gen_neg = gen_negdf2;
34469 break;
34470 case XFmode:
34471 gen_neg = gen_negxf2;
34472 break;
34473 case HImode:
34474 gen_neg = gen_neghi2;
34475 break;
34476 case SImode:
34477 gen_neg = gen_negsi2;
34478 break;
34479 case DImode:
34480 gen_neg = gen_negdi2;
34481 break;
34482 default:
34483 gcc_unreachable ();
34484 }
34485
34486 e1 = gen_reg_rtx (inmode);
34487 e2 = gen_reg_rtx (inmode);
34488 res = gen_reg_rtx (outmode);
34489
34490 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
34491
34492 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
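/* For example round (2.3) = 1 * floor (2.8) = 2 and
   round (-2.5) = -1 * floor (3.0) = -3, so halfway cases are rounded
   away from zero as round () requires.  */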
34493
34494 /* scratch = fxam(op1) */
34495 emit_insn (gen_rtx_SET (VOIDmode, scratch,
34496 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
34497 UNSPEC_FXAM)));
34498 /* e1 = fabs(op1) */
34499 emit_insn (gen_abs (e1, op1));
34500
34501 /* e2 = e1 + 0.5 */
34502 half = force_reg (inmode, half);
34503 emit_insn (gen_rtx_SET (VOIDmode, e2,
34504 gen_rtx_PLUS (inmode, e1, half)));
34505
34506 /* res = floor(e2) */
34507 if (inmode != XFmode)
34508 {
34509 tmp1 = gen_reg_rtx (XFmode);
34510
34511 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
34512 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
34513 }
34514 else
34515 tmp1 = e2;
34516
34517 switch (outmode)
34518 {
34519 case SFmode:
34520 case DFmode:
34521 {
34522 rtx tmp0 = gen_reg_rtx (XFmode);
34523
34524 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
34525
34526 emit_insn (gen_rtx_SET (VOIDmode, res,
34527 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
34528 UNSPEC_TRUNC_NOOP)));
34529 }
34530 break;
34531 case XFmode:
34532 emit_insn (gen_frndintxf2_floor (res, tmp1));
34533 break;
34534 case HImode:
34535 emit_insn (gen_lfloorxfhi2 (res, tmp1));
34536 break;
34537 case SImode:
34538 emit_insn (gen_lfloorxfsi2 (res, tmp1));
34539 break;
34540 case DImode:
34541 emit_insn (gen_lfloorxfdi2 (res, tmp1));
34542 break;
34543 default:
34544 gcc_unreachable ();
34545 }
34546
34547 /* flags = signbit(a) */
34548 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
34549
34550 /* if (flags) then res = -res */
34551 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
34552 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
34553 gen_rtx_LABEL_REF (VOIDmode, jump_label),
34554 pc_rtx);
34555 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34556 predict_jump (REG_BR_PROB_BASE * 50 / 100);
34557 JUMP_LABEL (insn) = jump_label;
34558
34559 emit_insn (gen_neg (res, res));
34560
34561 emit_label (jump_label);
34562 LABEL_NUSES (jump_label) = 1;
34563
34564 emit_move_insn (op0, res);
34565 }
34566
34567 /* Output code to perform a Newton-Raphson approximation of a single precision
34568 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
34569
34570 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
34571 {
34572 rtx x0, x1, e0, e1;
34573
34574 x0 = gen_reg_rtx (mode);
34575 e0 = gen_reg_rtx (mode);
34576 e1 = gen_reg_rtx (mode);
34577 x1 = gen_reg_rtx (mode);
34578
34579 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
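/* This is one Newton-Raphson step for 1/b: with x0 = rcp(b),
   x1 = x0 * (2 - b * x0) = 2*x0 - b*x0*x0, which roughly doubles the
   number of correct bits in the hardware estimate.  */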
34580
34581 b = force_reg (mode, b);
34582
34583 /* x0 = rcp(b) estimate */
34584 emit_insn (gen_rtx_SET (VOIDmode, x0,
34585 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
34586 UNSPEC_RCP)));
34587 /* e0 = x0 * b */
34588 emit_insn (gen_rtx_SET (VOIDmode, e0,
34589 gen_rtx_MULT (mode, x0, b)));
34590
34591 /* e0 = x0 * e0 */
34592 emit_insn (gen_rtx_SET (VOIDmode, e0,
34593 gen_rtx_MULT (mode, x0, e0)));
34594
34595 /* e1 = x0 + x0 */
34596 emit_insn (gen_rtx_SET (VOIDmode, e1,
34597 gen_rtx_PLUS (mode, x0, x0)));
34598
34599 /* x1 = e1 - e0 */
34600 emit_insn (gen_rtx_SET (VOIDmode, x1,
34601 gen_rtx_MINUS (mode, e1, e0)));
34602
34603 /* res = a * x1 */
34604 emit_insn (gen_rtx_SET (VOIDmode, res,
34605 gen_rtx_MULT (mode, a, x1)));
34606 }
34607
34608 /* Output code to perform a Newton-Raphson approximation of a
34609 single precision floating point [reciprocal] square root. */
34610
34611 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
34612 bool recip)
34613 {
34614 rtx x0, e0, e1, e2, e3, mthree, mhalf;
34615 REAL_VALUE_TYPE r;
34616
34617 x0 = gen_reg_rtx (mode);
34618 e0 = gen_reg_rtx (mode);
34619 e1 = gen_reg_rtx (mode);
34620 e2 = gen_reg_rtx (mode);
34621 e3 = gen_reg_rtx (mode);
34622
34623 real_from_integer (&r, VOIDmode, -3, -1, 0);
34624 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34625
34626 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
34627 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34628
34629 if (VECTOR_MODE_P (mode))
34630 {
34631 mthree = ix86_build_const_vector (mode, true, mthree);
34632 mhalf = ix86_build_const_vector (mode, true, mhalf);
34633 }
34634
34635 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
34636 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
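/* Both forms are one Newton-Raphson step for 1/sqrt(a): with
   x0 = rsqrtss(a), x1 = 0.5 * x0 * (3 - a*x0*x0), written above as
   -0.5 * x0 * (a*x0*x0 - 3.0); the sqrt variant additionally multiplies
   by a, since sqrt(a) = a / sqrt(a).  */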
34637
34638 a = force_reg (mode, a);
34639
34640 /* x0 = rsqrt(a) estimate */
34641 emit_insn (gen_rtx_SET (VOIDmode, x0,
34642 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
34643 UNSPEC_RSQRT)));
34644
34645 /* If a == 0.0, mask out the infinite rsqrt estimate to prevent a NaN result for sqrt(0.0). */
34646 if (!recip)
34647 {
34648 rtx zero, mask;
34649
34650 zero = gen_reg_rtx (mode);
34651 mask = gen_reg_rtx (mode);
34652
34653 zero = force_reg (mode, CONST0_RTX(mode));
34654 emit_insn (gen_rtx_SET (VOIDmode, mask,
34655 gen_rtx_NE (mode, zero, a)));
34656
34657 emit_insn (gen_rtx_SET (VOIDmode, x0,
34658 gen_rtx_AND (mode, x0, mask)));
34659 }
34660
34661 /* e0 = x0 * a */
34662 emit_insn (gen_rtx_SET (VOIDmode, e0,
34663 gen_rtx_MULT (mode, x0, a)));
34664 /* e1 = e0 * x0 */
34665 emit_insn (gen_rtx_SET (VOIDmode, e1,
34666 gen_rtx_MULT (mode, e0, x0)));
34667
34668 /* e2 = e1 - 3. */
34669 mthree = force_reg (mode, mthree);
34670 emit_insn (gen_rtx_SET (VOIDmode, e2,
34671 gen_rtx_PLUS (mode, e1, mthree)));
34672
34673 mhalf = force_reg (mode, mhalf);
34674 if (recip)
34675 /* e3 = -.5 * x0 */
34676 emit_insn (gen_rtx_SET (VOIDmode, e3,
34677 gen_rtx_MULT (mode, x0, mhalf)));
34678 else
34679 /* e3 = -.5 * e0 */
34680 emit_insn (gen_rtx_SET (VOIDmode, e3,
34681 gen_rtx_MULT (mode, e0, mhalf)));
34682 /* ret = e2 * e3 */
34683 emit_insn (gen_rtx_SET (VOIDmode, res,
34684 gen_rtx_MULT (mode, e2, e3)));
34685 }
34686
34687 #ifdef TARGET_SOLARIS
34688 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
34689
34690 static void
34691 i386_solaris_elf_named_section (const char *name, unsigned int flags,
34692 tree decl)
34693 {
34694 /* With Binutils 2.15, the "@unwind" marker must be specified on
34695 every occurrence of the ".eh_frame" section, not just the first
34696 one. */
34697 if (TARGET_64BIT
34698 && strcmp (name, ".eh_frame") == 0)
34699 {
34700 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
34701 flags & SECTION_WRITE ? "aw" : "a");
34702 return;
34703 }
34704
34705 #ifndef USE_GAS
34706 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
34707 {
34708 solaris_elf_asm_comdat_section (name, flags, decl);
34709 return;
34710 }
34711 #endif
34712
34713 default_elf_asm_named_section (name, flags, decl);
34714 }
34715 #endif /* TARGET_SOLARIS */
34716
34717 /* Return the mangling of TYPE if it is an extended fundamental type. */
34718
34719 static const char *
34720 ix86_mangle_type (const_tree type)
34721 {
34722 type = TYPE_MAIN_VARIANT (type);
34723
34724 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
34725 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
34726 return NULL;
34727
34728 switch (TYPE_MODE (type))
34729 {
34730 case TFmode:
34731 /* __float128 is "g". */
34732 return "g";
34733 case XFmode:
34734 /* "long double" or __float80 is "e". */
34735 return "e";
34736 default:
34737 return NULL;
34738 }
34739 }
34740
34741 /* For 32-bit code we can save PIC register setup by using
34742 __stack_chk_fail_local hidden function instead of calling
34743 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
34744 register, so it is better to call __stack_chk_fail directly. */
34745
34746 static tree ATTRIBUTE_UNUSED
34747 ix86_stack_protect_fail (void)
34748 {
34749 return TARGET_64BIT
34750 ? default_external_stack_protect_fail ()
34751 : default_hidden_stack_protect_fail ();
34752 }
34753
34754 /* Select a format to encode pointers in exception handling data. CODE
34755 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
34756 true if the symbol may be affected by dynamic relocations.
34757
34758 ??? All x86 object file formats are capable of representing this.
34759 After all, the relocation needed is the same as for the call insn.
34760 Whether or not a particular assembler allows us to enter such, I
34761 guess we'll have to see. */
34762 int
34763 asm_preferred_eh_data_format (int code, int global)
34764 {
34765 if (flag_pic)
34766 {
34767 int type = DW_EH_PE_sdata8;
34768 if (!TARGET_64BIT
34769 || ix86_cmodel == CM_SMALL_PIC
34770 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
34771 type = DW_EH_PE_sdata4;
34772 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
34773 }
34774 if (ix86_cmodel == CM_SMALL
34775 || (ix86_cmodel == CM_MEDIUM && code))
34776 return DW_EH_PE_udata4;
34777 return DW_EH_PE_absptr;
34778 }
34779 \f
34780 /* Expand copysign from SIGN to the positive value ABS_VALUE
34781 storing the result in RESULT. If MASK is non-null, it is the mask used
34782 to clear the sign bit (typically the mask produced by ix86_expand_sse_fabs). */
34783 static void
34784 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
34785 {
34786 enum machine_mode mode = GET_MODE (sign);
34787 rtx sgn = gen_reg_rtx (mode);
34788 if (mask == NULL_RTX)
34789 {
34790 enum machine_mode vmode;
34791
34792 if (mode == SFmode)
34793 vmode = V4SFmode;
34794 else if (mode == DFmode)
34795 vmode = V2DFmode;
34796 else
34797 vmode = mode;
34798
34799 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
34800 if (!VECTOR_MODE_P (mode))
34801 {
34802 /* We need to generate a scalar mode mask in this case. */
34803 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34804 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34805 mask = gen_reg_rtx (mode);
34806 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34807 }
34808 }
34809 else
34810 mask = gen_rtx_NOT (mode, mask);
34811 emit_insn (gen_rtx_SET (VOIDmode, sgn,
34812 gen_rtx_AND (mode, mask, sign)));
34813 emit_insn (gen_rtx_SET (VOIDmode, result,
34814 gen_rtx_IOR (mode, abs_value, sgn)));
34815 }
34816
34817 /* Expand fabs (OP0) and return a new rtx that holds the result. The
34818 mask for masking out the sign-bit is stored in *SMASK, if that is
34819 non-null. */
34820 static rtx
34821 ix86_expand_sse_fabs (rtx op0, rtx *smask)
34822 {
34823 enum machine_mode vmode, mode = GET_MODE (op0);
34824 rtx xa, mask;
34825
34826 xa = gen_reg_rtx (mode);
34827 if (mode == SFmode)
34828 vmode = V4SFmode;
34829 else if (mode == DFmode)
34830 vmode = V2DFmode;
34831 else
34832 vmode = mode;
34833 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
34834 if (!VECTOR_MODE_P (mode))
34835 {
34836 /* We need to generate a scalar mode mask in this case. */
34837 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34838 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34839 mask = gen_reg_rtx (mode);
34840 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34841 }
34842 emit_insn (gen_rtx_SET (VOIDmode, xa,
34843 gen_rtx_AND (mode, op0, mask)));
34844
34845 if (smask)
34846 *smask = mask;
34847
34848 return xa;
34849 }
34850
34851 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
34852 swapping the operands if SWAP_OPERANDS is true. The expanded
34853 code is a forward jump to a newly created label in case the
34854 comparison is true. The generated label rtx is returned. */
34855 static rtx
34856 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
34857 bool swap_operands)
34858 {
34859 rtx label, tmp;
34860
34861 if (swap_operands)
34862 {
34863 tmp = op0;
34864 op0 = op1;
34865 op1 = tmp;
34866 }
34867
34868 label = gen_label_rtx ();
34869 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
34870 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34871 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
34872 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
34873 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
34874 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
34875 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34876 JUMP_LABEL (tmp) = label;
34877
34878 return label;
34879 }
34880
34881 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
34882 using comparison code CODE. Operands are swapped for the comparison if
34883 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
34884 static rtx
34885 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
34886 bool swap_operands)
34887 {
34888 rtx (*insn)(rtx, rtx, rtx, rtx);
34889 enum machine_mode mode = GET_MODE (op0);
34890 rtx mask = gen_reg_rtx (mode);
34891
34892 if (swap_operands)
34893 {
34894 rtx tmp = op0;
34895 op0 = op1;
34896 op1 = tmp;
34897 }
34898
34899 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
34900
34901 emit_insn (insn (mask, op0, op1,
34902 gen_rtx_fmt_ee (code, mode, op0, op1)));
34903 return mask;
34904 }
34905
34906 /* Generate and return a rtx of mode MODE for 2**n where n is the number
34907 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
34908 static rtx
34909 ix86_gen_TWO52 (enum machine_mode mode)
34910 {
34911 REAL_VALUE_TYPE TWO52r;
34912 rtx TWO52;
34913
34914 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
34915 TWO52 = const_double_from_real_value (TWO52r, mode);
34916 TWO52 = force_reg (mode, TWO52);
34917
34918 return TWO52;
34919 }
34920
34921 /* Expand SSE sequence for computing lround from OP1 storing
34922 into OP0. */
34923 void
34924 ix86_expand_lround (rtx op0, rtx op1)
34925 {
34926 /* C code for the stuff we're doing below:
34927 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
34928 return (long)tmp;
34929 */
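/* Using nextafter (0.5, 0.0) instead of 0.5 keeps values just below a
   halfway point from being rounded up by the addition itself (e.g. the
   largest double below 0.5 plus a full 0.5 would round to 1.0 before the
   truncation), while exact halfway cases still round away from zero as
   lround requires.  */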
34930 enum machine_mode mode = GET_MODE (op1);
34931 const struct real_format *fmt;
34932 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34933 rtx adj;
34934
34935 /* load nextafter (0.5, 0.0) */
34936 fmt = REAL_MODE_FORMAT (mode);
34937 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34938 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34939
34940 /* adj = copysign (0.5, op1) */
34941 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
34942 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
34943
34944 /* adj = op1 + adj */
34945 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
34946
34947 /* op0 = (imode)adj */
34948 expand_fix (op0, adj, 0);
34949 }
34950
34951 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
34952 into OPERAND0. */
34953 void
34954 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
34955 {
34956 /* C code for the stuff we're doing below (for do_floor):
34957 xi = (long)op1;
34958 xi -= (double)xi > op1 ? 1 : 0;
34959 return xi;
34960 */
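/* For the ceil case (!do_floor) the comparison operands are swapped and
   the adjustment is an addition instead:
   xi += (double) xi < op1 ? 1 : 0;  */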
34961 enum machine_mode fmode = GET_MODE (op1);
34962 enum machine_mode imode = GET_MODE (op0);
34963 rtx ireg, freg, label, tmp;
34964
34965 /* reg = (long)op1 */
34966 ireg = gen_reg_rtx (imode);
34967 expand_fix (ireg, op1, 0);
34968
34969 /* freg = (double)reg */
34970 freg = gen_reg_rtx (fmode);
34971 expand_float (freg, ireg, 0);
34972
34973 /* ireg = (freg > op1) ? ireg - 1 : ireg */
34974 label = ix86_expand_sse_compare_and_jump (UNLE,
34975 freg, op1, !do_floor);
34976 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
34977 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
34978 emit_move_insn (ireg, tmp);
34979
34980 emit_label (label);
34981 LABEL_NUSES (label) = 1;
34982
34983 emit_move_insn (op0, ireg);
34984 }
34985
34986 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
34987 result in OPERAND0. */
34988 void
34989 ix86_expand_rint (rtx operand0, rtx operand1)
34990 {
34991 /* C code for the stuff we're doing below:
34992 xa = fabs (operand1);
34993 if (!isless (xa, 2**52))
34994 return operand1;
34995 xa = xa + 2**52 - 2**52;
34996 return copysign (xa, operand1);
34997 */
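/* Adding and then subtracting 2**52 works because any double with
   magnitude below 2**52 loses all of its fractional bits when 2**52 is
   added, so the addition rounds it to an integer in the current rounding
   mode and the subtraction recovers that rounded value; larger
   magnitudes are already integral and are returned unchanged.  */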
34998 enum machine_mode mode = GET_MODE (operand0);
34999 rtx res, xa, label, TWO52, mask;
35000
35001 res = gen_reg_rtx (mode);
35002 emit_move_insn (res, operand1);
35003
35004 /* xa = abs (operand1) */
35005 xa = ix86_expand_sse_fabs (res, &mask);
35006
35007 /* if (!isless (xa, TWO52)) goto label; */
35008 TWO52 = ix86_gen_TWO52 (mode);
35009 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35010
35011 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35012 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35013
35014 ix86_sse_copysign_to_positive (res, xa, res, mask);
35015
35016 emit_label (label);
35017 LABEL_NUSES (label) = 1;
35018
35019 emit_move_insn (operand0, res);
35020 }
35021
35022 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35023 into OPERAND0. */
35024 void
35025 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
35026 {
35027 /* C code for the stuff we expand below.
35028 double xa = fabs (x), x2;
35029 if (!isless (xa, TWO52))
35030 return x;
35031 xa = xa + TWO52 - TWO52;
35032 x2 = copysign (xa, x);
35033 Compensate. Floor:
35034 if (x2 > x)
35035 x2 -= 1;
35036 Compensate. Ceil:
35037 if (x2 < x)
35038 x2 -= -1;
35039 return x2;
35040 */
35041 enum machine_mode mode = GET_MODE (operand0);
35042 rtx xa, TWO52, tmp, label, one, res, mask;
35043
35044 TWO52 = ix86_gen_TWO52 (mode);
35045
35046 /* Temporary for holding the result, initialized to the input
35047 operand to ease control flow. */
35048 res = gen_reg_rtx (mode);
35049 emit_move_insn (res, operand1);
35050
35051 /* xa = abs (operand1) */
35052 xa = ix86_expand_sse_fabs (res, &mask);
35053
35054 /* if (!isless (xa, TWO52)) goto label; */
35055 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35056
35057 /* xa = xa + TWO52 - TWO52; */
35058 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35059 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35060
35061 /* xa = copysign (xa, operand1) */
35062 ix86_sse_copysign_to_positive (xa, xa, res, mask);
35063
35064 /* generate 1.0 or -1.0 */
35065 one = force_reg (mode,
35066 const_double_from_real_value (do_floor
35067 ? dconst1 : dconstm1, mode));
35068
35069 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35070 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35071 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35072 gen_rtx_AND (mode, one, tmp)));
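/* tmp is an all-ones or all-zeros comparison mask; ANDing it with the
   constant 1.0 (for floor) or -1.0 (for ceil) turns it into the
   floating-point adjustment that is subtracted below.  */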
35073 /* We always need to subtract here to preserve signed zero. */
35074 tmp = expand_simple_binop (mode, MINUS,
35075 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35076 emit_move_insn (res, tmp);
35077
35078 emit_label (label);
35079 LABEL_NUSES (label) = 1;
35080
35081 emit_move_insn (operand0, res);
35082 }
35083
35084 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35085 into OPERAND0. */
35086 void
35087 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
35088 {
35089 /* C code for the stuff we expand below.
35090 double xa = fabs (x), x2;
35091 if (!isless (xa, TWO52))
35092 return x;
35093 x2 = (double)(long)x;
35094 Compensate. Floor:
35095 if (x2 > x)
35096 x2 -= 1;
35097 Compensate. Ceil:
35098 if (x2 < x)
35099 x2 += 1;
35100 if (HONOR_SIGNED_ZEROS (mode))
35101 return copysign (x2, x);
35102 return x2;
35103 */
35104 enum machine_mode mode = GET_MODE (operand0);
35105 rtx xa, xi, TWO52, tmp, label, one, res, mask;
35106
35107 TWO52 = ix86_gen_TWO52 (mode);
35108
35109 /* Temporary for holding the result, initialized to the input
35110 operand to ease control flow. */
35111 res = gen_reg_rtx (mode);
35112 emit_move_insn (res, operand1);
35113
35114 /* xa = abs (operand1) */
35115 xa = ix86_expand_sse_fabs (res, &mask);
35116
35117 /* if (!isless (xa, TWO52)) goto label; */
35118 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35119
35120 /* xa = (double)(long)x */
35121 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35122 expand_fix (xi, res, 0);
35123 expand_float (xa, xi, 0);
35124
35125 /* generate 1.0 */
35126 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35127
35128 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35129 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35130 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35131 gen_rtx_AND (mode, one, tmp)));
35132 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
35133 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35134 emit_move_insn (res, tmp);
35135
35136 if (HONOR_SIGNED_ZEROS (mode))
35137 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35138
35139 emit_label (label);
35140 LABEL_NUSES (label) = 1;
35141
35142 emit_move_insn (operand0, res);
35143 }
35144
35145 /* Expand SSE sequence for computing round from OPERAND1 storing
35146 into OPERAND0. This sequence works without relying on DImode truncation
35147 via cvttsd2siq, which is only available on 64-bit targets. */
35148 void
35149 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
35150 {
35151 /* C code for the stuff we expand below.
35152 double xa = fabs (x), xa2, x2;
35153 if (!isless (xa, TWO52))
35154 return x;
35155 Using the absolute value and copying back sign makes
35156 -0.0 -> -0.0 correct.
35157 xa2 = xa + TWO52 - TWO52;
35158 Compensate.
35159 dxa = xa2 - xa;
35160 if (dxa <= -0.5)
35161 xa2 += 1;
35162 else if (dxa > 0.5)
35163 xa2 -= 1;
35164 x2 = copysign (xa2, x);
35165 return x2;
35166 */
35167 enum machine_mode mode = GET_MODE (operand0);
35168 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
35169
35170 TWO52 = ix86_gen_TWO52 (mode);
35171
35172 /* Temporary for holding the result, initialized to the input
35173 operand to ease control flow. */
35174 res = gen_reg_rtx (mode);
35175 emit_move_insn (res, operand1);
35176
35177 /* xa = abs (operand1) */
35178 xa = ix86_expand_sse_fabs (res, &mask);
35179
35180 /* if (!isless (xa, TWO52)) goto label; */
35181 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35182
35183 /* xa2 = xa + TWO52 - TWO52; */
35184 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35185 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
35186
35187 /* dxa = xa2 - xa; */
35188 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
35189
35190 /* generate 0.5, 1.0 and -0.5 */
35191 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
35192 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
35193 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
35194 0, OPTAB_DIRECT);
35195
35196 /* Compensate. */
35197 tmp = gen_reg_rtx (mode);
35198 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
35199 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
35200 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35201 gen_rtx_AND (mode, one, tmp)));
35202 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35203 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
35204 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
35205 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35206 gen_rtx_AND (mode, one, tmp)));
35207 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
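/* dxa is the signed error of the TWO52 rounding (round to nearest, ties
   to even).  The two corrections above adjust the half-way cases so that
   xa2 ends up as xa rounded half away from zero, which after the copysign
   below gives the semantics of round ().  */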
35208
35209 /* res = copysign (xa2, operand1) */
35210 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
35211
35212 emit_label (label);
35213 LABEL_NUSES (label) = 1;
35214
35215 emit_move_insn (operand0, res);
35216 }
35217
35218 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35219 into OPERAND0. */
35220 void
35221 ix86_expand_trunc (rtx operand0, rtx operand1)
35222 {
35223 /* C code for SSE variant we expand below.
35224 double xa = fabs (x), x2;
35225 if (!isless (xa, TWO52))
35226 return x;
35227 x2 = (double)(long)x;
35228 if (HONOR_SIGNED_ZEROS (mode))
35229 return copysign (x2, x);
35230 return x2;
35231 */
35232 enum machine_mode mode = GET_MODE (operand0);
35233 rtx xa, xi, TWO52, label, res, mask;
35234
35235 TWO52 = ix86_gen_TWO52 (mode);
35236
35237 /* Temporary for holding the result, initialized to the input
35238 operand to ease control flow. */
35239 res = gen_reg_rtx (mode);
35240 emit_move_insn (res, operand1);
35241
35242 /* xa = abs (operand1) */
35243 xa = ix86_expand_sse_fabs (res, &mask);
35244
35245 /* if (!isless (xa, TWO52)) goto label; */
35246 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35247
35248 /* x = (double)(long)x */
35249 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35250 expand_fix (xi, res, 0);
35251 expand_float (res, xi, 0);
35252
35253 if (HONOR_SIGNED_ZEROS (mode))
35254 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35255
35256 emit_label (label);
35257 LABEL_NUSES (label) = 1;
35258
35259 emit_move_insn (operand0, res);
35260 }
35261
35262 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35263 into OPERAND0. */
35264 void
35265 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
35266 {
35267 enum machine_mode mode = GET_MODE (operand0);
35268 rtx xa, mask, TWO52, label, one, res, smask, tmp;
35269
35270 /* C code for SSE variant we expand below.
35271 double xa = fabs (x), xa2, x2;
35272 if (!isless (xa, TWO52))
35273 return x;
35274 xa2 = xa + TWO52 - TWO52;
35275 Compensate:
35276 if (xa2 > xa)
35277 xa2 -= 1.0;
35278 x2 = copysign (xa2, x);
35279 return x2;
35280 */
35281
35282 TWO52 = ix86_gen_TWO52 (mode);
35283
35284 /* Temporary for holding the result, initialized to the input
35285 operand to ease control flow. */
35286 res = gen_reg_rtx (mode);
35287 emit_move_insn (res, operand1);
35288
35289 /* xa = abs (operand1) */
35290 xa = ix86_expand_sse_fabs (res, &smask);
35291
35292 /* if (!isless (xa, TWO52)) goto label; */
35293 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35294
35295 /* res = xa + TWO52 - TWO52; */
35296 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35297 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
35298 emit_move_insn (res, tmp);
35299
35300 /* generate 1.0 */
35301 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35302
35303 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
35304 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
35305 emit_insn (gen_rtx_SET (VOIDmode, mask,
35306 gen_rtx_AND (mode, mask, one)));
35307 tmp = expand_simple_binop (mode, MINUS,
35308 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
35309 emit_move_insn (res, tmp);
35310
35311 /* res = copysign (res, operand1) */
35312 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
35313
35314 emit_label (label);
35315 LABEL_NUSES (label) = 1;
35316
35317 emit_move_insn (operand0, res);
35318 }
35319
35320 /* Expand SSE sequence for computing round from OPERAND1 storing
35321 into OPERAND0. */
35322 void
35323 ix86_expand_round (rtx operand0, rtx operand1)
35324 {
35325 /* C code for the stuff we're doing below:
35326 double xa = fabs (x);
35327 if (!isless (xa, TWO52))
35328 return x;
35329 xa = (double)(long)(xa + nextafter (0.5, 0.0));
35330 return copysign (xa, x);
35331 */
35332 enum machine_mode mode = GET_MODE (operand0);
35333 rtx res, TWO52, xa, label, xi, half, mask;
35334 const struct real_format *fmt;
35335 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35336
35337 /* Temporary for holding the result, initialized to the input
35338 operand to ease control flow. */
35339 res = gen_reg_rtx (mode);
35340 emit_move_insn (res, operand1);
35341
35342 TWO52 = ix86_gen_TWO52 (mode);
35343 xa = ix86_expand_sse_fabs (res, &mask);
35344 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35345
35346 /* load nextafter (0.5, 0.0) */
35347 fmt = REAL_MODE_FORMAT (mode);
35348 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35349 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35350
35351 /* xa = xa + 0.5 */
35352 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
35353 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
35354
35355 /* xa = (double)(int64_t)xa */
35356 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35357 expand_fix (xi, xa, 0);
35358 expand_float (xa, xi, 0);
35359
35360 /* res = copysign (xa, operand1) */
35361 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
35362
35363 emit_label (label);
35364 LABEL_NUSES (label) = 1;
35365
35366 emit_move_insn (operand0, res);
35367 }
35368
35369 /* Expand SSE sequence for computing round
35370 from OP1 storing into OP0 using sse4 round insn. */
35371 void
35372 ix86_expand_round_sse4 (rtx op0, rtx op1)
35373 {
35374 enum machine_mode mode = GET_MODE (op0);
35375 rtx e1, e2, res, half;
35376 const struct real_format *fmt;
35377 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35378 rtx (*gen_copysign) (rtx, rtx, rtx);
35379 rtx (*gen_round) (rtx, rtx, rtx);
35380
35381 switch (mode)
35382 {
35383 case SFmode:
35384 gen_copysign = gen_copysignsf3;
35385 gen_round = gen_sse4_1_roundsf2;
35386 break;
35387 case DFmode:
35388 gen_copysign = gen_copysigndf3;
35389 gen_round = gen_sse4_1_rounddf2;
35390 break;
35391 default:
35392 gcc_unreachable ();
35393 }
35394
35395 /* round (a) = trunc (a + copysign (0.5, a)) */
35396
35397 /* load nextafter (0.5, 0.0) */
35398 fmt = REAL_MODE_FORMAT (mode);
35399 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35400 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35401 half = const_double_from_real_value (pred_half, mode);
35402
35403 /* e1 = copysign (0.5, op1) */
35404 e1 = gen_reg_rtx (mode);
35405 emit_insn (gen_copysign (e1, half, op1));
35406
35407 /* e2 = op1 + e1 */
35408 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
35409
35410 /* res = trunc (e2) */
35411 res = gen_reg_rtx (mode);
35412 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
35413
35414 emit_move_insn (op0, res);
35415 }
35416 \f
35417
35418 /* Table of valid machine attributes. */
35419 static const struct attribute_spec ix86_attribute_table[] =
35420 {
35421 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
35422 affects_type_identity } */
35423 /* Stdcall attribute says callee is responsible for popping arguments
35424 if they are not variable. */
35425 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35426 true },
35427 /* Fastcall attribute says callee is responsible for popping arguments
35428 if they are not variable. */
35429 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35430 true },
35431 /* Thiscall attribute says callee is responsible for popping arguments
35432 if they are not variable. */
35433 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35434 true },
35435 /* Cdecl attribute says the callee is a normal C declaration. */
35436 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35437 true },
35438 /* Regparm attribute specifies how many integer arguments are to be
35439 passed in registers. */
35440 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
35441 true },
35442 /* Sseregparm attribute says we are using x86_64 calling conventions
35443 for FP arguments. */
35444 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35445 true },
35446 /* The transactional memory builtins are implicitly regparm or fastcall
35447 depending on the ABI. Override the generic do-nothing attribute that
35448 these builtins were declared with. */
35449 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
35450 true },
35451 /* force_align_arg_pointer says this function realigns the stack at entry. */
35452 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
35453 false, true, true, ix86_handle_cconv_attribute, false },
35454 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
35455 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
35456 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
35457 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
35458 false },
35459 #endif
35460 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35461 false },
35462 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35463 false },
35464 #ifdef SUBTARGET_ATTRIBUTE_TABLE
35465 SUBTARGET_ATTRIBUTE_TABLE,
35466 #endif
35467 /* ms_abi and sysv_abi calling convention function attributes. */
35468 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35469 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35470 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
35471 false },
35472 { "callee_pop_aggregate_return", 1, 1, false, true, true,
35473 ix86_handle_callee_pop_aggregate_return, true },
35474 /* End element. */
35475 { NULL, 0, 0, false, false, false, NULL, false }
35476 };
35477
35478 /* Implement targetm.vectorize.builtin_vectorization_cost. */
35479 static int
35480 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
35481 tree vectype ATTRIBUTE_UNUSED,
35482 int misalign ATTRIBUTE_UNUSED)
35483 {
35484 switch (type_of_cost)
35485 {
35486 case scalar_stmt:
35487 return ix86_cost->scalar_stmt_cost;
35488
35489 case scalar_load:
35490 return ix86_cost->scalar_load_cost;
35491
35492 case scalar_store:
35493 return ix86_cost->scalar_store_cost;
35494
35495 case vector_stmt:
35496 return ix86_cost->vec_stmt_cost;
35497
35498 case vector_load:
35499 return ix86_cost->vec_align_load_cost;
35500
35501 case vector_store:
35502 return ix86_cost->vec_store_cost;
35503
35504 case vec_to_scalar:
35505 return ix86_cost->vec_to_scalar_cost;
35506
35507 case scalar_to_vec:
35508 return ix86_cost->scalar_to_vec_cost;
35509
35510 case unaligned_load:
35511 case unaligned_store:
35512 return ix86_cost->vec_unalign_load_cost;
35513
35514 case cond_branch_taken:
35515 return ix86_cost->cond_taken_branch_cost;
35516
35517 case cond_branch_not_taken:
35518 return ix86_cost->cond_not_taken_branch_cost;
35519
35520 case vec_perm:
35521 case vec_promote_demote:
35522 return ix86_cost->vec_stmt_cost;
35523
35524 default:
35525 gcc_unreachable ();
35526 }
35527 }
35528
35529 /* Construct (set target (vec_select op0 (parallel perm))) and
35530 return true if that's a valid instruction in the active ISA. */
35531
35532 static bool
35533 expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
35534 {
35535 rtx rperm[MAX_VECT_LEN], x;
35536 unsigned i;
35537
35538 for (i = 0; i < nelt; ++i)
35539 rperm[i] = GEN_INT (perm[i]);
35540
35541 x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
35542 x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
35543 x = gen_rtx_SET (VOIDmode, target, x);
35544
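/* Emit the candidate insn; if no insn pattern in the current ISA
   recognizes it, back it out again and report failure.  */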
35545 x = emit_insn (x);
35546 if (recog_memoized (x) < 0)
35547 {
35548 remove_insn (x);
35549 return false;
35550 }
35551 return true;
35552 }
35553
35554 /* Similar, but generate a vec_concat from op0 and op1 as well. */
35555
35556 static bool
35557 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
35558 const unsigned char *perm, unsigned nelt)
35559 {
35560 enum machine_mode v2mode;
35561 rtx x;
35562
35563 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
35564 x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
35565 return expand_vselect (target, x, perm, nelt);
35566 }
35567
35568 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35569 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
35570
35571 static bool
35572 expand_vec_perm_blend (struct expand_vec_perm_d *d)
35573 {
35574 enum machine_mode vmode = d->vmode;
35575 unsigned i, mask, nelt = d->nelt;
35576 rtx target, op0, op1, x;
35577 rtx rperm[32], vperm;
35578
35579 if (d->op0 == d->op1)
35580 return false;
35581 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
35582 ;
35583 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
35584 ;
35585 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
35586 ;
35587 else
35588 return false;
35589
35590 /* This is a blend, not a permute. Elements must stay in their
35591 respective lanes. */
35592 for (i = 0; i < nelt; ++i)
35593 {
35594 unsigned e = d->perm[i];
35595 if (!(e == i || e == i + nelt))
35596 return false;
35597 }
35598
35599 if (d->testing_p)
35600 return true;
35601
35602 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
35603 decision should be extracted elsewhere, so that we only try that
35604 sequence once all budget==3 options have been tried. */
35605 target = d->target;
35606 op0 = d->op0;
35607 op1 = d->op1;
35608 mask = 0;
35609
35610 switch (vmode)
35611 {
35612 case V4DFmode:
35613 case V8SFmode:
35614 case V2DFmode:
35615 case V4SFmode:
35616 case V8HImode:
35617 case V8SImode:
35618 for (i = 0; i < nelt; ++i)
35619 mask |= (d->perm[i] >= nelt) << i;
35620 break;
35621
35622 case V2DImode:
35623 for (i = 0; i < 2; ++i)
35624 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
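/* E.g. the V2DImode blend { 0, 3 } (low qword from op0, high qword
   from op1) becomes the V8HImode pblendw mask 0xf0.  */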
35625 vmode = V8HImode;
35626 goto do_subreg;
35627
35628 case V4SImode:
35629 for (i = 0; i < 4; ++i)
35630 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35631 vmode = V8HImode;
35632 goto do_subreg;
35633
35634 case V16QImode:
35635 /* See if bytes move in pairs so we can use pblendw with
35636 an immediate argument, rather than pblendvb with a vector
35637 argument. */
35638 for (i = 0; i < 16; i += 2)
35639 if (d->perm[i] + 1 != d->perm[i + 1])
35640 {
35641 use_pblendvb:
35642 for (i = 0; i < nelt; ++i)
35643 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
35644
35645 finish_pblendvb:
35646 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
35647 vperm = force_reg (vmode, vperm);
35648
35649 if (GET_MODE_SIZE (vmode) == 16)
35650 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
35651 else
35652 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
35653 return true;
35654 }
35655
35656 for (i = 0; i < 8; ++i)
35657 mask |= (d->perm[i * 2] >= 16) << i;
35658 vmode = V8HImode;
35659 /* FALLTHRU */
35660
35661 do_subreg:
35662 target = gen_lowpart (vmode, target);
35663 op0 = gen_lowpart (vmode, op0);
35664 op1 = gen_lowpart (vmode, op1);
35665 break;
35666
35667 case V32QImode:
35668 /* See if bytes move in pairs. If not, vpblendvb must be used. */
35669 for (i = 0; i < 32; i += 2)
35670 if (d->perm[i] + 1 != d->perm[i + 1])
35671 goto use_pblendvb;
35672 /* See if bytes move in quadruplets. If yes, vpblendd
35673 with immediate can be used. */
35674 for (i = 0; i < 32; i += 4)
35675 if (d->perm[i] + 2 != d->perm[i + 2])
35676 break;
35677 if (i < 32)
35678 {
35679 /* See if bytes move the same in both lanes. If yes,
35680 vpblendw with immediate can be used. */
35681 for (i = 0; i < 16; i += 2)
35682 if (d->perm[i] + 16 != d->perm[i + 16])
35683 goto use_pblendvb;
35684
35685 /* Use vpblendw. */
35686 for (i = 0; i < 16; ++i)
35687 mask |= (d->perm[i * 2] >= 32) << i;
35688 vmode = V16HImode;
35689 goto do_subreg;
35690 }
35691
35692 /* Use vpblendd. */
35693 for (i = 0; i < 8; ++i)
35694 mask |= (d->perm[i * 4] >= 32) << i;
35695 vmode = V8SImode;
35696 goto do_subreg;
35697
35698 case V16HImode:
35699 /* See if words move in pairs. If yes, vpblendd can be used. */
35700 for (i = 0; i < 16; i += 2)
35701 if (d->perm[i] + 1 != d->perm[i + 1])
35702 break;
35703 if (i < 16)
35704 {
35705 /* See if words move the same in both lanes. If not,
35706 vpblendvb must be used. */
35707 for (i = 0; i < 8; i++)
35708 if (d->perm[i] + 8 != d->perm[i + 8])
35709 {
35710 /* Use vpblendvb. */
35711 for (i = 0; i < 32; ++i)
35712 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
35713
35714 vmode = V32QImode;
35715 nelt = 32;
35716 target = gen_lowpart (vmode, target);
35717 op0 = gen_lowpart (vmode, op0);
35718 op1 = gen_lowpart (vmode, op1);
35719 goto finish_pblendvb;
35720 }
35721
35722 /* Use vpblendw. */
35723 for (i = 0; i < 16; ++i)
35724 mask |= (d->perm[i] >= 16) << i;
35725 break;
35726 }
35727
35728 /* Use vpblendd. */
35729 for (i = 0; i < 8; ++i)
35730 mask |= (d->perm[i * 2] >= 16) << i;
35731 vmode = V8SImode;
35732 goto do_subreg;
35733
35734 case V4DImode:
35735 /* Use vpblendd. */
35736 for (i = 0; i < 4; ++i)
35737 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35738 vmode = V8SImode;
35739 goto do_subreg;
35740
35741 default:
35742 gcc_unreachable ();
35743 }
35744
35745 /* This matches five different patterns with the different modes. */
35746 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
35747 x = gen_rtx_SET (VOIDmode, target, x);
35748 emit_insn (x);
35749
35750 return true;
35751 }
35752
35753 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35754 in terms of the variable form of vpermilps.
35755
35756 Note that we will have already failed the immediate input vpermilps,
35757 which requires that the high and low part shuffle be identical; the
35758 variable form doesn't require that. */
35759
35760 static bool
35761 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
35762 {
35763 rtx rperm[8], vperm;
35764 unsigned i;
35765
35766 if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
35767 return false;
35768
35769 /* We can only permute within the 128-bit lane. */
35770 for (i = 0; i < 8; ++i)
35771 {
35772 unsigned e = d->perm[i];
35773 if (i < 4 ? e >= 4 : e < 4)
35774 return false;
35775 }
35776
35777 if (d->testing_p)
35778 return true;
35779
35780 for (i = 0; i < 8; ++i)
35781 {
35782 unsigned e = d->perm[i];
35783
35784 /* Within each 128-bit lane, the elements of op0 are numbered
35785 from 0 and the elements of op1 are numbered from 4. */
35786 if (e >= 8 + 4)
35787 e -= 8;
35788 else if (e >= 4)
35789 e -= 4;
35790
35791 rperm[i] = GEN_INT (e);
35792 }
35793
35794 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
35795 vperm = force_reg (V8SImode, vperm);
35796 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
35797
35798 return true;
35799 }
35800
35801 /* Return true if permutation D can be performed as VMODE permutation
35802 instead. */
35803
35804 static bool
35805 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
35806 {
35807 unsigned int i, j, chunk;
35808
35809 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
35810 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
35811 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
35812 return false;
35813
35814 if (GET_MODE_NUNITS (vmode) >= d->nelt)
35815 return true;
35816
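/* E.g. a V16QImode permutation that starts { 4, 5, 6, 7, 12, 13, 14, 15,
   ... } moves whole 4-byte chunks and can therefore also be done as a
   V4SImode permutation starting { 1, 3, ... }.  */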
35817 chunk = d->nelt / GET_MODE_NUNITS (vmode);
35818 for (i = 0; i < d->nelt; i += chunk)
35819 if (d->perm[i] & (chunk - 1))
35820 return false;
35821 else
35822 for (j = 1; j < chunk; ++j)
35823 if (d->perm[i] + j != d->perm[i + j])
35824 return false;
35825
35826 return true;
35827 }
35828
35829 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35830 in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128. */
35831
35832 static bool
35833 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
35834 {
35835 unsigned i, nelt, eltsz, mask;
35836 unsigned char perm[32];
35837 enum machine_mode vmode = V16QImode;
35838 rtx rperm[32], vperm, target, op0, op1;
35839
35840 nelt = d->nelt;
35841
35842 if (d->op0 != d->op1)
35843 {
35844 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
35845 {
35846 if (TARGET_AVX2
35847 && valid_perm_using_mode_p (V2TImode, d))
35848 {
35849 if (d->testing_p)
35850 return true;
35851
35852 /* Use vperm2i128 insn. The pattern uses
35853 V4DImode instead of V2TImode. */
35854 target = gen_lowpart (V4DImode, d->target);
35855 op0 = gen_lowpart (V4DImode, d->op0);
35856 op1 = gen_lowpart (V4DImode, d->op1);
35857 rperm[0]
35858 = GEN_INT ((d->perm[0] / (nelt / 2))
35859 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
35860 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
35861 return true;
35862 }
35863 return false;
35864 }
35865 }
35866 else
35867 {
35868 if (GET_MODE_SIZE (d->vmode) == 16)
35869 {
35870 if (!TARGET_SSSE3)
35871 return false;
35872 }
35873 else if (GET_MODE_SIZE (d->vmode) == 32)
35874 {
35875 if (!TARGET_AVX2)
35876 return false;
35877
35878 /* V4DImode should be already handled through
35879 expand_vselect by vpermq instruction. */
35880 gcc_assert (d->vmode != V4DImode);
35881
35882 vmode = V32QImode;
35883 if (d->vmode == V8SImode
35884 || d->vmode == V16HImode
35885 || d->vmode == V32QImode)
35886 {
35887 /* First see if vpermq can be used for
35888 V8SImode/V16HImode/V32QImode. */
35889 if (valid_perm_using_mode_p (V4DImode, d))
35890 {
35891 for (i = 0; i < 4; i++)
35892 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
35893 if (d->testing_p)
35894 return true;
35895 return expand_vselect (gen_lowpart (V4DImode, d->target),
35896 gen_lowpart (V4DImode, d->op0),
35897 perm, 4);
35898 }
35899
35900 /* Next see if vpermd can be used. */
35901 if (valid_perm_using_mode_p (V8SImode, d))
35902 vmode = V8SImode;
35903 }
35904
35905 if (vmode == V32QImode)
35906 {
35907 /* vpshufb only works intra-lane; it is not
35908 possible to shuffle bytes between the lanes. */
35909 for (i = 0; i < nelt; ++i)
35910 if ((d->perm[i] ^ i) & (nelt / 2))
35911 return false;
35912 }
35913 }
35914 else
35915 return false;
35916 }
35917
35918 if (d->testing_p)
35919 return true;
35920
35921 if (vmode == V8SImode)
35922 for (i = 0; i < 8; ++i)
35923 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
35924 else
35925 {
35926 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35927 if (d->op0 != d->op1)
35928 mask = 2 * nelt - 1;
35929 else if (vmode == V16QImode)
35930 mask = nelt - 1;
35931 else
35932 mask = nelt / 2 - 1;
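/* The nelt / 2 - 1 mask applies to the 32-byte vpshufb case: each byte
   index is reduced modulo 16 because the insn only selects within its own
   128-bit lane (the intra-lane check above already guarantees that source
   and destination lanes agree).  */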
35933
35934 for (i = 0; i < nelt; ++i)
35935 {
35936 unsigned j, e = d->perm[i] & mask;
35937 for (j = 0; j < eltsz; ++j)
35938 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
35939 }
35940 }
35941
35942 vperm = gen_rtx_CONST_VECTOR (vmode,
35943 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
35944 vperm = force_reg (vmode, vperm);
35945
35946 target = gen_lowpart (vmode, d->target);
35947 op0 = gen_lowpart (vmode, d->op0);
35948 if (d->op0 == d->op1)
35949 {
35950 if (vmode == V16QImode)
35951 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
35952 else if (vmode == V32QImode)
35953 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
35954 else
35955 emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
35956 }
35957 else
35958 {
35959 op1 = gen_lowpart (vmode, d->op1);
35960 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
35961 }
35962
35963 return true;
35964 }
35965
35966 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
35967 in a single instruction. */
35968
35969 static bool
35970 expand_vec_perm_1 (struct expand_vec_perm_d *d)
35971 {
35972 unsigned i, nelt = d->nelt;
35973 unsigned char perm2[MAX_VECT_LEN];
35974
35975 /* Check plain VEC_SELECT first, because AVX has instructions that could
35976 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
35977 input where SEL+CONCAT may not. */
35978 if (d->op0 == d->op1)
35979 {
35980 int mask = nelt - 1;
35981 bool identity_perm = true;
35982 bool broadcast_perm = true;
35983
35984 for (i = 0; i < nelt; i++)
35985 {
35986 perm2[i] = d->perm[i] & mask;
35987 if (perm2[i] != i)
35988 identity_perm = false;
35989 if (perm2[i])
35990 broadcast_perm = false;
35991 }
35992
35993 if (identity_perm)
35994 {
35995 if (!d->testing_p)
35996 emit_move_insn (d->target, d->op0);
35997 return true;
35998 }
35999 else if (broadcast_perm && TARGET_AVX2)
36000 {
36001 /* Use vpbroadcast{b,w,d}. */
36002 rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
36003 switch (d->vmode)
36004 {
36005 case V32QImode:
36006 op = gen_lowpart (V16QImode, op);
36007 gen = gen_avx2_pbroadcastv32qi;
36008 break;
36009 case V16HImode:
36010 op = gen_lowpart (V8HImode, op);
36011 gen = gen_avx2_pbroadcastv16hi;
36012 break;
36013 case V8SImode:
36014 op = gen_lowpart (V4SImode, op);
36015 gen = gen_avx2_pbroadcastv8si;
36016 break;
36017 case V16QImode:
36018 gen = gen_avx2_pbroadcastv16qi;
36019 break;
36020 case V8HImode:
36021 gen = gen_avx2_pbroadcastv8hi;
36022 break;
36023 /* For other modes prefer other shuffles this function creates. */
36024 default: break;
36025 }
36026 if (gen != NULL)
36027 {
36028 if (!d->testing_p)
36029 emit_insn (gen (d->target, op));
36030 return true;
36031 }
36032 }
36033
36034 if (expand_vselect (d->target, d->op0, perm2, nelt))
36035 return true;
36036
36037 /* There are plenty of patterns in sse.md that are written for
36038 SEL+CONCAT and are not replicated for a single op. Perhaps
36039 that should be changed, to avoid the nastiness here. */
36040
36041 /* Recognize interleave style patterns, which means incrementing
36042 every other permutation operand. */
36043 for (i = 0; i < nelt; i += 2)
36044 {
36045 perm2[i] = d->perm[i] & mask;
36046 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
36047 }
36048 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
36049 return true;
36050
36051 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
36052 if (nelt >= 4)
36053 {
36054 for (i = 0; i < nelt; i += 4)
36055 {
36056 perm2[i + 0] = d->perm[i + 0] & mask;
36057 perm2[i + 1] = d->perm[i + 1] & mask;
36058 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
36059 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
36060 }
36061
36062 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
36063 return true;
36064 }
36065 }
36066
36067 /* Finally, try the fully general two operand permute. */
36068 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
36069 return true;
36070
36071 /* Recognize interleave style patterns with reversed operands. */
36072 if (d->op0 != d->op1)
36073 {
36074 for (i = 0; i < nelt; ++i)
36075 {
36076 unsigned e = d->perm[i];
36077 if (e >= nelt)
36078 e -= nelt;
36079 else
36080 e += nelt;
36081 perm2[i] = e;
36082 }
36083
36084 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
36085 return true;
36086 }
36087
36088 /* Try the SSE4.1 blend variable merge instructions. */
36089 if (expand_vec_perm_blend (d))
36090 return true;
36091
36092 /* Try one of the AVX vpermil variable permutations. */
36093 if (expand_vec_perm_vpermil (d))
36094 return true;
36095
36096 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
36097 vpshufb, vpermd or vpermq variable permutation. */
36098 if (expand_vec_perm_pshufb (d))
36099 return true;
36100
36101 return false;
36102 }
36103
36104 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36105 in terms of a pair of pshuflw + pshufhw instructions. */
36106
36107 static bool
36108 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
36109 {
36110 unsigned char perm2[MAX_VECT_LEN];
36111 unsigned i;
36112 bool ok;
36113
36114 if (d->vmode != V8HImode || d->op0 != d->op1)
36115 return false;
36116
36117 /* The two permutations only operate in 64-bit lanes. */
36118 for (i = 0; i < 4; ++i)
36119 if (d->perm[i] >= 4)
36120 return false;
36121 for (i = 4; i < 8; ++i)
36122 if (d->perm[i] < 4)
36123 return false;
36124
36125 if (d->testing_p)
36126 return true;
36127
36128 /* Emit the pshuflw. */
36129 memcpy (perm2, d->perm, 4);
36130 for (i = 4; i < 8; ++i)
36131 perm2[i] = i;
36132 ok = expand_vselect (d->target, d->op0, perm2, 8);
36133 gcc_assert (ok);
36134
36135 /* Emit the pshufhw. */
36136 memcpy (perm2 + 4, d->perm + 4, 4);
36137 for (i = 0; i < 4; ++i)
36138 perm2[i] = i;
36139 ok = expand_vselect (d->target, d->target, perm2, 8);
36140 gcc_assert (ok);
36141
36142 return true;
36143 }
36144
36145 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36146 the permutation using the SSSE3 palignr instruction. This succeeds
36147 when all of the elements in PERM fit within one vector and we merely
36148 need to shift them down so that a single vector permutation has a
36149 chance to succeed. */
36150
36151 static bool
36152 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
36153 {
36154 unsigned i, nelt = d->nelt;
36155 unsigned min, max;
36156 bool in_order, ok;
36157 rtx shift;
36158
36159 /* Even with AVX, palignr only operates on 128-bit vectors. */
36160 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36161 return false;
36162
36163 min = nelt, max = 0;
36164 for (i = 0; i < nelt; ++i)
36165 {
36166 unsigned e = d->perm[i];
36167 if (e < min)
36168 min = e;
36169 if (e > max)
36170 max = e;
36171 }
36172 if (min == 0 || max - min >= nelt)
36173 return false;
36174
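/* E.g. for V16QImode with all indices in [5, 20], a palignr by 5 bytes
   turns this into a single-operand permutation with indices in [0, 15].  */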
36175 /* Given that we have SSSE3, we know we'll be able to implement the
36176 single operand permutation after the palignr with pshufb. */
36177 if (d->testing_p)
36178 return true;
36179
36180 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
36181 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
36182 gen_lowpart (TImode, d->op1),
36183 gen_lowpart (TImode, d->op0), shift));
36184
36185 d->op0 = d->op1 = d->target;
36186
36187 in_order = true;
36188 for (i = 0; i < nelt; ++i)
36189 {
36190 unsigned e = d->perm[i] - min;
36191 if (e != i)
36192 in_order = false;
36193 d->perm[i] = e;
36194 }
36195
36196 /* Test for the degenerate case where the alignment by itself
36197 produces the desired permutation. */
36198 if (in_order)
36199 return true;
36200
36201 ok = expand_vec_perm_1 (d);
36202 gcc_assert (ok);
36203
36204 return ok;
36205 }
36206
36207 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
36208
36209 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36210 a two vector permutation into a single vector permutation by using
36211 an interleave operation to merge the vectors. */
36212
36213 static bool
36214 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
36215 {
36216 struct expand_vec_perm_d dremap, dfinal;
36217 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
36218 unsigned HOST_WIDE_INT contents;
36219 unsigned char remap[2 * MAX_VECT_LEN];
36220 rtx seq;
36221 bool ok, same_halves = false;
36222
36223 if (GET_MODE_SIZE (d->vmode) == 16)
36224 {
36225 if (d->op0 == d->op1)
36226 return false;
36227 }
36228 else if (GET_MODE_SIZE (d->vmode) == 32)
36229 {
36230 if (!TARGET_AVX)
36231 return false;
36232 /* For 32-byte modes allow even d->op0 == d->op1.
36233 The lack of cross-lane shuffling in some instructions
36234 might prevent a single insn shuffle. */
36235 dfinal = *d;
36236 dfinal.testing_p = true;
36237 /* If expand_vec_perm_interleave3 can expand this into
36238 a 3 insn sequence, give up and let it be expanded that
36239 way instead. While that is one insn longer, it doesn't
36240 need a memory operand, and in the common case where the
36241 interleave low and interleave high permutations with the
36242 same operands are adjacent, both together need only 4
36243 insns after CSE. */
36244 if (expand_vec_perm_interleave3 (&dfinal))
36245 return false;
36246 }
36247 else
36248 return false;
36249
36250 /* Examine from whence the elements come. */
36251 contents = 0;
36252 for (i = 0; i < nelt; ++i)
36253 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
36254
36255 memset (remap, 0xff, sizeof (remap));
36256 dremap = *d;
36257
36258 if (GET_MODE_SIZE (d->vmode) == 16)
36259 {
36260 unsigned HOST_WIDE_INT h1, h2, h3, h4;
36261
36262 /* Split the two input vectors into 4 halves. */
36263 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
36264 h2 = h1 << nelt2;
36265 h3 = h2 << nelt2;
36266 h4 = h3 << nelt2;
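/* In terms of the bits of CONTENTS, e.g. for V8HImode: h1 = 0x000f is the
   low half of op0, h2 = 0x00f0 the high half of op0, h3 = 0x0f00 the low
   half of op1 and h4 = 0xf000 the high half of op1.  */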
36267
36268 /* If the elements come from the low halves, use interleave low; similarly,
36269 for elements from the high halves use interleave high. If the elements are
36270 from mis-matched halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
36271 if ((contents & (h1 | h3)) == contents)
36272 {
36273 /* punpckl* */
36274 for (i = 0; i < nelt2; ++i)
36275 {
36276 remap[i] = i * 2;
36277 remap[i + nelt] = i * 2 + 1;
36278 dremap.perm[i * 2] = i;
36279 dremap.perm[i * 2 + 1] = i + nelt;
36280 }
36281 if (!TARGET_SSE2 && d->vmode == V4SImode)
36282 dremap.vmode = V4SFmode;
36283 }
36284 else if ((contents & (h2 | h4)) == contents)
36285 {
36286 /* punpckh* */
36287 for (i = 0; i < nelt2; ++i)
36288 {
36289 remap[i + nelt2] = i * 2;
36290 remap[i + nelt + nelt2] = i * 2 + 1;
36291 dremap.perm[i * 2] = i + nelt2;
36292 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
36293 }
36294 if (!TARGET_SSE2 && d->vmode == V4SImode)
36295 dremap.vmode = V4SFmode;
36296 }
36297 else if ((contents & (h1 | h4)) == contents)
36298 {
36299 /* shufps */
36300 for (i = 0; i < nelt2; ++i)
36301 {
36302 remap[i] = i;
36303 remap[i + nelt + nelt2] = i + nelt2;
36304 dremap.perm[i] = i;
36305 dremap.perm[i + nelt2] = i + nelt + nelt2;
36306 }
36307 if (nelt != 4)
36308 {
36309 /* shufpd */
36310 dremap.vmode = V2DImode;
36311 dremap.nelt = 2;
36312 dremap.perm[0] = 0;
36313 dremap.perm[1] = 3;
36314 }
36315 }
36316 else if ((contents & (h2 | h3)) == contents)
36317 {
36318 /* shufps */
36319 for (i = 0; i < nelt2; ++i)
36320 {
36321 remap[i + nelt2] = i;
36322 remap[i + nelt] = i + nelt2;
36323 dremap.perm[i] = i + nelt2;
36324 dremap.perm[i + nelt2] = i + nelt;
36325 }
36326 if (nelt != 4)
36327 {
36328 /* shufpd */
36329 dremap.vmode = V2DImode;
36330 dremap.nelt = 2;
36331 dremap.perm[0] = 1;
36332 dremap.perm[1] = 2;
36333 }
36334 }
36335 else
36336 return false;
36337 }
36338 else
36339 {
36340 unsigned int nelt4 = nelt / 4, nzcnt = 0;
36341 unsigned HOST_WIDE_INT q[8];
36342 unsigned int nonzero_halves[4];
36343
36344 /* Split the two input vectors into 8 quarters. */
36345 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
36346 for (i = 1; i < 8; ++i)
36347 q[i] = q[0] << (nelt4 * i);
36348 for (i = 0; i < 4; ++i)
36349 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
36350 {
36351 nonzero_halves[nzcnt] = i;
36352 ++nzcnt;
36353 }
36354
36355 if (nzcnt == 1)
36356 {
36357 gcc_assert (d->op0 == d->op1);
36358 nonzero_halves[1] = nonzero_halves[0];
36359 same_halves = true;
36360 }
36361 else if (d->op0 == d->op1)
36362 {
36363 gcc_assert (nonzero_halves[0] == 0);
36364 gcc_assert (nonzero_halves[1] == 1);
36365 }
36366
36367 if (nzcnt <= 2)
36368 {
36369 if (d->perm[0] / nelt2 == nonzero_halves[1])
36370 {
36371 /* Attempt to increase the likelihood that the dfinal
36372 shuffle will be intra-lane. */
36373 char tmph = nonzero_halves[0];
36374 nonzero_halves[0] = nonzero_halves[1];
36375 nonzero_halves[1] = tmph;
36376 }
36377
36378 /* vperm2f128 or vperm2i128. */
36379 for (i = 0; i < nelt2; ++i)
36380 {
36381 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
36382 remap[i + nonzero_halves[0] * nelt2] = i;
36383 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
36384 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
36385 }
36386
36387 if (d->vmode != V8SFmode
36388 && d->vmode != V4DFmode
36389 && d->vmode != V8SImode)
36390 {
36391 dremap.vmode = V8SImode;
36392 dremap.nelt = 8;
36393 for (i = 0; i < 4; ++i)
36394 {
36395 dremap.perm[i] = i + nonzero_halves[0] * 4;
36396 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
36397 }
36398 }
36399 }
36400 else if (d->op0 == d->op1)
36401 return false;
36402 else if (TARGET_AVX2
36403 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
36404 {
36405 /* vpunpckl* */
36406 for (i = 0; i < nelt4; ++i)
36407 {
36408 remap[i] = i * 2;
36409 remap[i + nelt] = i * 2 + 1;
36410 remap[i + nelt2] = i * 2 + nelt2;
36411 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
36412 dremap.perm[i * 2] = i;
36413 dremap.perm[i * 2 + 1] = i + nelt;
36414 dremap.perm[i * 2 + nelt2] = i + nelt2;
36415 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
36416 }
36417 }
36418 else if (TARGET_AVX2
36419 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
36420 {
36421 /* vpunpckh* */
36422 for (i = 0; i < nelt4; ++i)
36423 {
36424 remap[i + nelt4] = i * 2;
36425 remap[i + nelt + nelt4] = i * 2 + 1;
36426 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
36427 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
36428 dremap.perm[i * 2] = i + nelt4;
36429 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
36430 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
36431 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
36432 }
36433 }
36434 else
36435 return false;
36436 }
36437
36438 /* Use the remapping array set up above to move the elements from their
36439 swizzled locations into their final destinations. */
36440 dfinal = *d;
36441 for (i = 0; i < nelt; ++i)
36442 {
36443 unsigned e = remap[d->perm[i]];
36444 gcc_assert (e < nelt);
36445 /* If same_halves is true, both halves of the remapped vector are the
36446 same. Avoid cross-lane accesses if possible. */
36447 if (same_halves && i >= nelt2)
36448 {
36449 gcc_assert (e < nelt2);
36450 dfinal.perm[i] = e + nelt2;
36451 }
36452 else
36453 dfinal.perm[i] = e;
36454 }
36455 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
36456 dfinal.op1 = dfinal.op0;
36457 dremap.target = dfinal.op0;
36458
36459 /* Test if the final remap can be done with a single insn. For V4SFmode or
36460 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
36461 start_sequence ();
36462 ok = expand_vec_perm_1 (&dfinal);
36463 seq = get_insns ();
36464 end_sequence ();
36465
36466 if (!ok)
36467 return false;
36468
36469 if (d->testing_p)
36470 return true;
36471
36472 if (dremap.vmode != dfinal.vmode)
36473 {
36474 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
36475 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
36476 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
36477 }
36478
36479 ok = expand_vec_perm_1 (&dremap);
36480 gcc_assert (ok);
36481
36482 emit_insn (seq);
36483 return true;
36484 }
36485
36486 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36487 a single vector cross-lane permutation into vpermq followed
36488 by any of the single insn permutations. */
36489
36490 static bool
36491 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
36492 {
36493 struct expand_vec_perm_d dremap, dfinal;
36494 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
36495 unsigned contents[2];
36496 bool ok;
36497
36498 if (!(TARGET_AVX2
36499 && (d->vmode == V32QImode || d->vmode == V16HImode)
36500 && d->op0 == d->op1))
36501 return false;
36502
36503 contents[0] = 0;
36504 contents[1] = 0;
36505 for (i = 0; i < nelt2; ++i)
36506 {
36507 contents[0] |= 1u << (d->perm[i] / nelt4);
36508 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
36509 }
36510
36511 for (i = 0; i < 2; ++i)
36512 {
36513 unsigned int cnt = 0;
36514 for (j = 0; j < 4; ++j)
36515 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
36516 return false;
36517 }
36518
36519 if (d->testing_p)
36520 return true;
36521
36522 dremap = *d;
36523 dremap.vmode = V4DImode;
36524 dremap.nelt = 4;
36525 dremap.target = gen_reg_rtx (V4DImode);
36526 dremap.op0 = gen_lowpart (V4DImode, d->op0);
36527 dremap.op1 = dremap.op0;
36528 for (i = 0; i < 2; ++i)
36529 {
36530 unsigned int cnt = 0;
36531 for (j = 0; j < 4; ++j)
36532 if ((contents[i] & (1u << j)) != 0)
36533 dremap.perm[2 * i + cnt++] = j;
36534 for (; cnt < 2; ++cnt)
36535 dremap.perm[2 * i + cnt] = 0;
36536 }
36537
36538 dfinal = *d;
36539 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
36540 dfinal.op1 = dfinal.op0;
36541 for (i = 0, j = 0; i < nelt; ++i)
36542 {
36543 if (i == nelt2)
36544 j = 2;
36545 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
36546 if ((d->perm[i] / nelt4) == dremap.perm[j])
36547 ;
36548 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
36549 dfinal.perm[i] |= nelt4;
36550 else
36551 gcc_unreachable ();
36552 }
36553
36554 ok = expand_vec_perm_1 (&dremap);
36555 gcc_assert (ok);
36556
36557 ok = expand_vec_perm_1 (&dfinal);
36558 gcc_assert (ok);
36559
36560 return true;
36561 }
36562
36563 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36564 a two vector permutation using 2 intra-lane interleave insns
36565 and cross-lane shuffle for 32-byte vectors. */
36566
36567 static bool
36568 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
36569 {
36570 unsigned i, nelt;
36571 rtx (*gen) (rtx, rtx, rtx);
36572
36573 if (d->op0 == d->op1)
36574 return false;
36575 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
36576 ;
36577 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
36578 ;
36579 else
36580 return false;
36581
36582 nelt = d->nelt;
36583 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
36584 return false;
36585 for (i = 0; i < nelt; i += 2)
36586 if (d->perm[i] != d->perm[0] + i / 2
36587 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
36588 return false;
36589
36590 if (d->testing_p)
36591 return true;
36592
36593 switch (d->vmode)
36594 {
36595 case V32QImode:
36596 if (d->perm[0])
36597 gen = gen_vec_interleave_highv32qi;
36598 else
36599 gen = gen_vec_interleave_lowv32qi;
36600 break;
36601 case V16HImode:
36602 if (d->perm[0])
36603 gen = gen_vec_interleave_highv16hi;
36604 else
36605 gen = gen_vec_interleave_lowv16hi;
36606 break;
36607 case V8SImode:
36608 if (d->perm[0])
36609 gen = gen_vec_interleave_highv8si;
36610 else
36611 gen = gen_vec_interleave_lowv8si;
36612 break;
36613 case V4DImode:
36614 if (d->perm[0])
36615 gen = gen_vec_interleave_highv4di;
36616 else
36617 gen = gen_vec_interleave_lowv4di;
36618 break;
36619 case V8SFmode:
36620 if (d->perm[0])
36621 gen = gen_vec_interleave_highv8sf;
36622 else
36623 gen = gen_vec_interleave_lowv8sf;
36624 break;
36625 case V4DFmode:
36626 if (d->perm[0])
36627 gen = gen_vec_interleave_highv4df;
36628 else
36629 gen = gen_vec_interleave_lowv4df;
36630 break;
36631 default:
36632 gcc_unreachable ();
36633 }
36634
36635 emit_insn (gen (d->target, d->op0, d->op1));
36636 return true;
36637 }
36638
36639 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
36640 a single vector permutation using a single intra-lane vector
36641 permutation, vperm2f128 swapping the lanes and vblend* insn blending
36642 the non-swapped and swapped vectors together. */
36643
36644 static bool
36645 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
36646 {
36647 struct expand_vec_perm_d dfirst, dsecond;
36648 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
36649 rtx seq;
36650 bool ok;
36651 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
36652
36653 if (!TARGET_AVX
36654 || TARGET_AVX2
36655 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
36656 || d->op0 != d->op1)
36657 return false;
36658
36659 dfirst = *d;
36660 for (i = 0; i < nelt; i++)
36661 dfirst.perm[i] = 0xff;
36662 for (i = 0, msk = 0; i < nelt; i++)
36663 {
36664 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
36665 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
36666 return false;
36667 dfirst.perm[j] = d->perm[i];
36668 if (j != i)
36669 msk |= (1 << i);
36670 }
36671 for (i = 0; i < nelt; i++)
36672 if (dfirst.perm[i] == 0xff)
36673 dfirst.perm[i] = i;
36674
36675 if (!d->testing_p)
36676 dfirst.target = gen_reg_rtx (dfirst.vmode);
36677
36678 start_sequence ();
36679 ok = expand_vec_perm_1 (&dfirst);
36680 seq = get_insns ();
36681 end_sequence ();
36682
36683 if (!ok)
36684 return false;
36685
36686 if (d->testing_p)
36687 return true;
36688
36689 emit_insn (seq);
36690
36691 dsecond = *d;
36692 dsecond.op0 = dfirst.target;
36693 dsecond.op1 = dfirst.target;
36694 dsecond.target = gen_reg_rtx (dsecond.vmode);
36695 for (i = 0; i < nelt; i++)
36696 dsecond.perm[i] = i ^ nelt2;
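/* XORing each index with nelt2 swaps the two 128-bit lanes, which
   expand_vec_perm_1 can match as a vperm2f128.  */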
36697
36698 ok = expand_vec_perm_1 (&dsecond);
36699 gcc_assert (ok);
36700
36701 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
36702 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
36703 return true;
36704 }
36705
36706 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
36707 permutation with two pshufb insns and an ior. We should have already
36708 failed all two instruction sequences. */
36709
36710 static bool
36711 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
36712 {
36713 rtx rperm[2][16], vperm, l, h, op, m128;
36714 unsigned int i, nelt, eltsz;
36715
36716 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36717 return false;
36718 gcc_assert (d->op0 != d->op1);
36719
36720 nelt = d->nelt;
36721 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36722
36723 /* Generate two permutation masks. If the required element is within
36724 the given vector it is shuffled into the proper lane. If the required
36725 element is in the other vector, force a zero into the lane by setting
36726 bit 7 in the permutation mask. */
36727 m128 = GEN_INT (-128);
36728 for (i = 0; i < nelt; ++i)
36729 {
36730 unsigned j, e = d->perm[i];
36731 unsigned which = (e >= nelt);
36732 if (e >= nelt)
36733 e -= nelt;
36734
36735 for (j = 0; j < eltsz; ++j)
36736 {
36737 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
36738 rperm[1-which][i*eltsz + j] = m128;
36739 }
36740 }
36741
36742 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
36743 vperm = force_reg (V16QImode, vperm);
36744
36745 l = gen_reg_rtx (V16QImode);
36746 op = gen_lowpart (V16QImode, d->op0);
36747 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
36748
36749 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
36750 vperm = force_reg (V16QImode, vperm);
36751
36752 h = gen_reg_rtx (V16QImode);
36753 op = gen_lowpart (V16QImode, d->op1);
36754 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
36755
36756 op = gen_lowpart (V16QImode, d->target);
36757 emit_insn (gen_iorv16qi3 (op, l, h));
36758
36759 return true;
36760 }
36761
36762 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
36763 with two vpshufb insns, vpermq and vpor. We should have already failed
36764 all two or three instruction sequences. */
36765
36766 static bool
36767 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
36768 {
36769 rtx rperm[2][32], vperm, l, h, hp, op, m128;
36770 unsigned int i, nelt, eltsz;
36771
36772 if (!TARGET_AVX2
36773 || d->op0 != d->op1
36774 || (d->vmode != V32QImode && d->vmode != V16HImode))
36775 return false;
36776
36777 if (d->testing_p)
36778 return true;
36779
36780 nelt = d->nelt;
36781 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36782
36783 /* Generate two permutation masks. If the required element is within
36784 the same lane, it is shuffled in. If the required element is in the
36785 other lane, force a zero by setting bit 7 in the permutation mask.
36786 The other mask has a non-negative element wherever an element is
36787 requested from the other lane, but moves it to the opposite lane,
36788 so that the result of that vpshufb has its two V2TImode halves
36789 swapped. */
36790 m128 = GEN_INT (-128);
36791 for (i = 0; i < nelt; ++i)
36792 {
36793 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36794 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36795
36796 for (j = 0; j < eltsz; ++j)
36797 {
36798 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
36799 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
36800 }
36801 }
36802
36803 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36804 vperm = force_reg (V32QImode, vperm);
36805
36806 h = gen_reg_rtx (V32QImode);
36807 op = gen_lowpart (V32QImode, d->op0);
36808 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36809
36810 /* Swap the 128-bit lanes of h into hp. */
36811 hp = gen_reg_rtx (V4DImode);
36812 op = gen_lowpart (V4DImode, h);
36813 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
36814 const1_rtx));
36815
36816 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36817 vperm = force_reg (V32QImode, vperm);
36818
36819 l = gen_reg_rtx (V32QImode);
36820 op = gen_lowpart (V32QImode, d->op0);
36821 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36822
36823 op = gen_lowpart (V32QImode, d->target);
36824 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
36825
36826 return true;
36827 }
36828
36829 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
36830 and extract-odd permutations of two V32QImode or V16HImode operands
36831 with two vpshufb insns, vpor and vpermq. We should have already
36832 failed all two or three instruction sequences. */
36833
36834 static bool
36835 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
36836 {
36837 rtx rperm[2][32], vperm, l, h, ior, op, m128;
36838 unsigned int i, nelt, eltsz;
36839
36840 if (!TARGET_AVX2
36841 || d->op0 == d->op1
36842 || (d->vmode != V32QImode && d->vmode != V16HImode))
36843 return false;
36844
36845 for (i = 0; i < d->nelt; ++i)
36846 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
36847 return false;
36848
36849 if (d->testing_p)
36850 return true;
36851
36852 nelt = d->nelt;
36853 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36854
36855 /* Generate two permutation masks. In the first permutation mask
36856 the first quarter will contain indexes for the first half
36857 of the op0, the second quarter will contain bit 7 set, third quarter
36858 will contain indexes for the second half of the op0 and the
36859 last quarter bit 7 set. In the second permutation mask
36860 the first quarter will contain bit 7 set, the second quarter
36861 indexes for the first half of the op1, the third quarter bit 7 set
36862 and last quarter indexes for the second half of the op1.
36863 I.e. the first mask e.g. for V32QImode extract even will be:
36864 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
36865 (all values masked with 0xf except for -128) and second mask
36866 for extract even will be
36867 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
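/* Note: the xorv == 24 used in the loop below for the middle two
   quarters swaps the second and third 8-byte quarters of the
   destination position, so that after the final vpermq with the
   { 0, 2, 1, 3 } permutation the even (or odd) elements of op0 end up
   in the low half of the result and those of op1 in the high half. */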
36868 m128 = GEN_INT (-128);
36869 for (i = 0; i < nelt; ++i)
36870 {
36871 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36872 unsigned which = d->perm[i] >= nelt;
36873 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
36874
36875 for (j = 0; j < eltsz; ++j)
36876 {
36877 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
36878 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
36879 }
36880 }
36881
36882 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36883 vperm = force_reg (V32QImode, vperm);
36884
36885 l = gen_reg_rtx (V32QImode);
36886 op = gen_lowpart (V32QImode, d->op0);
36887 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36888
36889 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36890 vperm = force_reg (V32QImode, vperm);
36891
36892 h = gen_reg_rtx (V32QImode);
36893 op = gen_lowpart (V32QImode, d->op1);
36894 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36895
36896 ior = gen_reg_rtx (V32QImode);
36897 emit_insn (gen_iorv32qi3 (ior, l, h));
36898
36899 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
36900 op = gen_lowpart (V4DImode, d->target);
36901 ior = gen_lowpart (V4DImode, ior);
36902 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
36903 const1_rtx, GEN_INT (3)));
36904
36905 return true;
36906 }
36907
36908 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
36909 and extract-odd permutations. */
36910
36911 static bool
36912 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
36913 {
36914 rtx t1, t2, t3;
36915
36916 switch (d->vmode)
36917 {
36918 case V4DFmode:
36919 t1 = gen_reg_rtx (V4DFmode);
36920 t2 = gen_reg_rtx (V4DFmode);
36921
36922 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36923 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
36924 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
36925
36926 /* Now an unpck[lh]pd will produce the result required. */
36927 if (odd)
36928 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
36929 else
36930 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
36931 emit_insn (t3);
36932 break;
36933
36934 case V8SFmode:
36935 {
36936 int mask = odd ? 0xdd : 0x88;
36937
36938 t1 = gen_reg_rtx (V8SFmode);
36939 t2 = gen_reg_rtx (V8SFmode);
36940 t3 = gen_reg_rtx (V8SFmode);
36941
36942 /* Shuffle within the 128-bit lanes to produce:
36943 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
36944 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
36945 GEN_INT (mask)));
36946
36947 /* Shuffle the lanes around to produce:
36948 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
36949 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
36950 GEN_INT (0x3)));
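/* In the vperm2f128 above, immediate 0x3 selects the high 128-bit lane
   of the source for the low half of the destination and (bits 5:4 == 0)
   the low lane for the high half, i.e. it swaps the two lanes of t1. */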
36951
36952 /* Shuffle within the 128-bit lanes to produce:
36953 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
36954 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
36955
36956 /* Shuffle within the 128-bit lanes to produce:
36957 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
36958 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
36959
36960 /* Shuffle the lanes around to produce:
36961 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
36962 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
36963 GEN_INT (0x20)));
36964 }
36965 break;
36966
36967 case V2DFmode:
36968 case V4SFmode:
36969 case V2DImode:
36970 case V4SImode:
36971 /* These are always directly implementable by expand_vec_perm_1. */
36972 gcc_unreachable ();
36973
36974 case V8HImode:
36975 if (TARGET_SSSE3)
36976 return expand_vec_perm_pshufb2 (d);
36977 else
36978 {
36979 /* We need 2*log2(N)-1 operations to achieve odd/even
36980 with interleave. */
36981 t1 = gen_reg_rtx (V8HImode);
36982 t2 = gen_reg_rtx (V8HImode);
36983 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
36984 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
36985 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
36986 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
36987 if (odd)
36988 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
36989 else
36990 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
36991 emit_insn (t3);
36992 }
36993 break;
36994
36995 case V16QImode:
36996 if (TARGET_SSSE3)
36997 return expand_vec_perm_pshufb2 (d);
36998 else
36999 {
37000 t1 = gen_reg_rtx (V16QImode);
37001 t2 = gen_reg_rtx (V16QImode);
37002 t3 = gen_reg_rtx (V16QImode);
37003 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
37004 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
37005 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
37006 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
37007 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
37008 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
37009 if (odd)
37010 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
37011 else
37012 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
37013 emit_insn (t3);
37014 }
37015 break;
37016
37017 case V16HImode:
37018 case V32QImode:
37019 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
37020
37021 case V4DImode:
37022 if (!TARGET_AVX2)
37023 {
37024 struct expand_vec_perm_d d_copy = *d;
37025 d_copy.vmode = V4DFmode;
37026 d_copy.target = gen_lowpart (V4DFmode, d->target);
37027 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
37028 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
37029 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37030 }
37031
37032 t1 = gen_reg_rtx (V4DImode);
37033 t2 = gen_reg_rtx (V4DImode);
37034
37035 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
37036 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
37037 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
37038
37039 /* Now a vpunpck[lh]qdq will produce the result required. */
37040 if (odd)
37041 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
37042 else
37043 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
37044 emit_insn (t3);
37045 break;
37046
37047 case V8SImode:
37048 if (!TARGET_AVX2)
37049 {
37050 struct expand_vec_perm_d d_copy = *d;
37051 d_copy.vmode = V8SFmode;
37052 d_copy.target = gen_lowpart (V8SFmode, d->target);
37053 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
37054 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
37055 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37056 }
37057
37058 t1 = gen_reg_rtx (V8SImode);
37059 t2 = gen_reg_rtx (V8SImode);
37060
37061 /* Shuffle the lanes around into
37062 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
37063 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
37064 gen_lowpart (V4DImode, d->op0),
37065 gen_lowpart (V4DImode, d->op1),
37066 GEN_INT (0x20)));
37067 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
37068 gen_lowpart (V4DImode, d->op0),
37069 gen_lowpart (V4DImode, d->op1),
37070 GEN_INT (0x31)));
37071
37072 /* Swap the 2nd and 3rd position in each lane into
37073 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
37074 emit_insn (gen_avx2_pshufdv3 (t1, t1,
37075 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37076 emit_insn (gen_avx2_pshufdv3 (t2, t2,
37077 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
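/* The pshufd immediate 2 * 4 + 1 * 16 + 3 * 64 is 0xd8, i.e. the dword
   order { 0, 2, 1, 3 } within each 128-bit lane, which performs the
   swap described above. */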
37078
37079 /* Now a vpunpck[lh]qdq will produce
37080 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
37081 if (odd)
37082 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
37083 gen_lowpart (V4DImode, t1),
37084 gen_lowpart (V4DImode, t2));
37085 else
37086 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
37087 gen_lowpart (V4DImode, t1),
37088 gen_lowpart (V4DImode, t2));
37089 emit_insn (t3);
37090 break;
37091
37092 default:
37093 gcc_unreachable ();
37094 }
37095
37096 return true;
37097 }
37098
37099 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
37100 extract-even and extract-odd permutations. */
37101
37102 static bool
37103 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
37104 {
37105 unsigned i, odd, nelt = d->nelt;
37106
37107 odd = d->perm[0];
37108 if (odd != 0 && odd != 1)
37109 return false;
37110
37111 for (i = 1; i < nelt; ++i)
37112 if (d->perm[i] != 2 * i + odd)
37113 return false;
37114
37115 return expand_vec_perm_even_odd_1 (d, odd);
37116 }
37117
37118 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
37119 permutations. We assume that expand_vec_perm_1 has already failed. */
37120
37121 static bool
37122 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
37123 {
37124 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
37125 enum machine_mode vmode = d->vmode;
37126 unsigned char perm2[4];
37127 rtx op0 = d->op0;
37128 bool ok;
37129
37130 switch (vmode)
37131 {
37132 case V4DFmode:
37133 case V8SFmode:
37134 /* These are special-cased in sse.md so that we can optionally
37135 use the vbroadcast instruction. They expand to two insns
37136 if the input happens to be in a register. */
37137 gcc_unreachable ();
37138
37139 case V2DFmode:
37140 case V2DImode:
37141 case V4SFmode:
37142 case V4SImode:
37143 /* These are always implementable using standard shuffle patterns. */
37144 gcc_unreachable ();
37145
37146 case V8HImode:
37147 case V16QImode:
37148 /* These can be implemented via interleave. We save one insn by
37149 stopping once we have promoted to V4SImode and then use pshufd. */
37150 do
37151 {
37152 rtx dest;
37153 rtx (*gen) (rtx, rtx, rtx)
37154 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
37155 : gen_vec_interleave_lowv8hi;
37156
37157 if (elt >= nelt2)
37158 {
37159 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
37160 : gen_vec_interleave_highv8hi;
37161 elt -= nelt2;
37162 }
37163 nelt2 /= 2;
37164
37165 dest = gen_reg_rtx (vmode);
37166 emit_insn (gen (dest, op0, op0));
37167 vmode = get_mode_wider_vector (vmode);
37168 op0 = gen_lowpart (vmode, dest);
37169 }
37170 while (vmode != V4SImode);
37171
37172 memset (perm2, elt, 4);
37173 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
37174 gcc_assert (ok);
37175 return true;
37176
37177 case V32QImode:
37178 case V16HImode:
37179 case V8SImode:
37180 case V4DImode:
37181 /* For AVX2 broadcasts of the first element vpbroadcast* or
37182 vpermq should be used by expand_vec_perm_1. */
37183 gcc_assert (!TARGET_AVX2 || d->perm[0]);
37184 return false;
37185
37186 default:
37187 gcc_unreachable ();
37188 }
37189 }
37190
37191 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
37192 broadcast permutations. */
37193
37194 static bool
37195 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
37196 {
37197 unsigned i, elt, nelt = d->nelt;
37198
37199 if (d->op0 != d->op1)
37200 return false;
37201
37202 elt = d->perm[0];
37203 for (i = 1; i < nelt; ++i)
37204 if (d->perm[i] != elt)
37205 return false;
37206
37207 return expand_vec_perm_broadcast_1 (d);
37208 }
37209
37210 /* Implement arbitrary permutation of two V32QImode and V16HImode operands
37211 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
37212 all the shorter instruction sequences. */
37213
37214 static bool
37215 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
37216 {
37217 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
37218 unsigned int i, nelt, eltsz;
37219 bool used[4];
37220
37221 if (!TARGET_AVX2
37222 || d->op0 == d->op1
37223 || (d->vmode != V32QImode && d->vmode != V16HImode))
37224 return false;
37225
37226 if (d->testing_p)
37227 return true;
37228
37229 nelt = d->nelt;
37230 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37231
37232 /* Generate 4 permutation masks. If the required element is within
37233 the same lane, it is shuffled in. If the required element is from the
37234 other lane, force a zero by setting bit 7 in the permutation mask.
37235 The cross-lane masks have non-negative entries where an element is
37236 requested from the other lane; such elements are also moved to the
37237 other lane, so that the result of vpshufb can have the two V2TImode
37238 halves swapped. */
37239 m128 = GEN_INT (-128);
37240 for (i = 0; i < 32; ++i)
37241 {
37242 rperm[0][i] = m128;
37243 rperm[1][i] = m128;
37244 rperm[2][i] = m128;
37245 rperm[3][i] = m128;
37246 }
37247 used[0] = false;
37248 used[1] = false;
37249 used[2] = false;
37250 used[3] = false;
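/* Mask 0 collects elements taken from op0 that stay within their lane,
   mask 1 elements from op0 that cross lanes, mask 2 elements from op1
   within their lane, and mask 3 elements from op1 that cross lanes. */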
37251 for (i = 0; i < nelt; ++i)
37252 {
37253 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37254 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37255 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
37256
37257 for (j = 0; j < eltsz; ++j)
37258 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
37259 used[which] = true;
37260 }
37261
37262 for (i = 0; i < 2; ++i)
37263 {
37264 if (!used[2 * i + 1])
37265 {
37266 h[i] = NULL_RTX;
37267 continue;
37268 }
37269 vperm = gen_rtx_CONST_VECTOR (V32QImode,
37270 gen_rtvec_v (32, rperm[2 * i + 1]));
37271 vperm = force_reg (V32QImode, vperm);
37272 h[i] = gen_reg_rtx (V32QImode);
37273 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37274 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
37275 }
37276
37277 /* Swap the 128-bit lanes of h[X]. */
37278 for (i = 0; i < 2; ++i)
37279 {
37280 if (h[i] == NULL_RTX)
37281 continue;
37282 op = gen_reg_rtx (V4DImode);
37283 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
37284 const2_rtx, GEN_INT (3), const0_rtx,
37285 const1_rtx));
37286 h[i] = gen_lowpart (V32QImode, op);
37287 }
37288
37289 for (i = 0; i < 2; ++i)
37290 {
37291 if (!used[2 * i])
37292 {
37293 l[i] = NULL_RTX;
37294 continue;
37295 }
37296 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
37297 vperm = force_reg (V32QImode, vperm);
37298 l[i] = gen_reg_rtx (V32QImode);
37299 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37300 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
37301 }
37302
37303 for (i = 0; i < 2; ++i)
37304 {
37305 if (h[i] && l[i])
37306 {
37307 op = gen_reg_rtx (V32QImode);
37308 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
37309 l[i] = op;
37310 }
37311 else if (h[i])
37312 l[i] = h[i];
37313 }
37314
37315 gcc_assert (l[0] && l[1]);
37316 op = gen_lowpart (V32QImode, d->target);
37317 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
37318 return true;
37319 }
37320
37321 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
37322 With all of the interface bits taken care of, perform the expansion
37323 in D and return true on success. */
37324
37325 static bool
37326 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
37327 {
37328 /* Try a single instruction expansion. */
37329 if (expand_vec_perm_1 (d))
37330 return true;
37331
37332 /* Try sequences of two instructions. */
37333
37334 if (expand_vec_perm_pshuflw_pshufhw (d))
37335 return true;
37336
37337 if (expand_vec_perm_palignr (d))
37338 return true;
37339
37340 if (expand_vec_perm_interleave2 (d))
37341 return true;
37342
37343 if (expand_vec_perm_broadcast (d))
37344 return true;
37345
37346 if (expand_vec_perm_vpermq_perm_1 (d))
37347 return true;
37348
37349 /* Try sequences of three instructions. */
37350
37351 if (expand_vec_perm_pshufb2 (d))
37352 return true;
37353
37354 if (expand_vec_perm_interleave3 (d))
37355 return true;
37356
37357 if (expand_vec_perm_vperm2f128_vblend (d))
37358 return true;
37359
37360 /* Try sequences of four instructions. */
37361
37362 if (expand_vec_perm_vpshufb2_vpermq (d))
37363 return true;
37364
37365 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
37366 return true;
37367
37368 /* ??? Look for narrow permutations whose element orderings would
37369 allow the promotion to a wider mode. */
37370
37371 /* ??? Look for sequences of interleave or a wider permute that place
37372 the data into the correct lanes for a half-vector shuffle like
37373 pshuf[lh]w or vpermilps. */
37374
37375 /* ??? Look for sequences of interleave that produce the desired results.
37376 The combinatorics of punpck[lh] get pretty ugly... */
37377
37378 if (expand_vec_perm_even_odd (d))
37379 return true;
37380
37381 /* Even longer sequences. */
37382 if (expand_vec_perm_vpshufb4_vpermq2 (d))
37383 return true;
37384
37385 return false;
37386 }
37387
37388 bool
37389 ix86_expand_vec_perm_const (rtx operands[4])
37390 {
37391 struct expand_vec_perm_d d;
37392 unsigned char perm[MAX_VECT_LEN];
37393 int i, nelt, which;
37394 rtx sel;
37395
37396 d.target = operands[0];
37397 d.op0 = operands[1];
37398 d.op1 = operands[2];
37399 sel = operands[3];
37400
37401 d.vmode = GET_MODE (d.target);
37402 gcc_assert (VECTOR_MODE_P (d.vmode));
37403 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37404 d.testing_p = false;
37405
37406 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
37407 gcc_assert (XVECLEN (sel, 0) == nelt);
37408 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
37409
37410 for (i = which = 0; i < nelt; ++i)
37411 {
37412 rtx e = XVECEXP (sel, 0, i);
37413 int ei = INTVAL (e) & (2 * nelt - 1);
37414
37415 which |= (ei < nelt ? 1 : 2);
37416 d.perm[i] = ei;
37417 perm[i] = ei;
37418 }
37419
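/* WHICH is now a bitmask of the operands actually referenced: bit 0 is
   set if some element is taken from the first input vector, bit 1 if
   some element is taken from the second. */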
37420 switch (which)
37421 {
37422 default:
37423 gcc_unreachable();
37424
37425 case 3:
37426 if (!rtx_equal_p (d.op0, d.op1))
37427 break;
37428
37429 /* The elements of PERM do not suggest that only the first operand
37430 is used, but both operands are identical. Allow easier matching
37431 of the permutation by folding the permutation into the single
37432 input vector. */
37433 for (i = 0; i < nelt; ++i)
37434 if (d.perm[i] >= nelt)
37435 d.perm[i] -= nelt;
37436 /* FALLTHRU */
37437
37438 case 1:
37439 d.op1 = d.op0;
37440 break;
37441
37442 case 2:
37443 for (i = 0; i < nelt; ++i)
37444 d.perm[i] -= nelt;
37445 d.op0 = d.op1;
37446 break;
37447 }
37448
37449 if (ix86_expand_vec_perm_const_1 (&d))
37450 return true;
37451
37452 /* If the mask says both arguments are needed, but they are the same,
37453 the above tried to expand with d.op0 == d.op1. If that didn't work,
37454 retry with d.op0 != d.op1 as that is what testing has been done with. */
37455 if (which == 3 && d.op0 == d.op1)
37456 {
37457 rtx seq;
37458 bool ok;
37459
37460 memcpy (d.perm, perm, sizeof (perm));
37461 d.op1 = gen_reg_rtx (d.vmode);
37462 start_sequence ();
37463 ok = ix86_expand_vec_perm_const_1 (&d);
37464 seq = get_insns ();
37465 end_sequence ();
37466 if (ok)
37467 {
37468 emit_move_insn (d.op1, d.op0);
37469 emit_insn (seq);
37470 return true;
37471 }
37472 }
37473
37474 return false;
37475 }
37476
37477 /* Implement targetm.vectorize.vec_perm_const_ok. */
37478
37479 static bool
37480 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
37481 const unsigned char *sel)
37482 {
37483 struct expand_vec_perm_d d;
37484 unsigned int i, nelt, which;
37485 bool ret, one_vec;
37486
37487 d.vmode = vmode;
37488 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37489 d.testing_p = true;
37490
37491 /* Given sufficient ISA support we can just return true here
37492 for selected vector modes. */
37493 if (GET_MODE_SIZE (d.vmode) == 16)
37494 {
37495 /* All implementable with a single vpperm insn. */
37496 if (TARGET_XOP)
37497 return true;
37498 /* All implementable with 2 pshufb + 1 ior. */
37499 if (TARGET_SSSE3)
37500 return true;
37501 /* All implementable with shufpd or unpck[lh]pd. */
37502 if (d.nelt == 2)
37503 return true;
37504 }
37505
37506 /* Extract the values from the vector CST into the permutation
37507 array in D. */
37508 memcpy (d.perm, sel, nelt);
37509 for (i = which = 0; i < nelt; ++i)
37510 {
37511 unsigned char e = d.perm[i];
37512 gcc_assert (e < 2 * nelt);
37513 which |= (e < nelt ? 1 : 2);
37514 }
37515
37516 /* For all elements from second vector, fold the elements to first. */
37517 if (which == 2)
37518 for (i = 0; i < nelt; ++i)
37519 d.perm[i] -= nelt;
37520
37521 /* Check whether the mask can be applied to the vector type. */
37522 one_vec = (which != 3);
37523
37524 /* Implementable with shufps or pshufd. */
37525 if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
37526 return true;
37527
37528 /* Otherwise we have to go through the motions and see if we can
37529 figure out how to generate the requested permutation. */
37530 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
37531 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
37532 if (!one_vec)
37533 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
37534
37535 start_sequence ();
37536 ret = ix86_expand_vec_perm_const_1 (&d);
37537 end_sequence ();
37538
37539 return ret;
37540 }
37541
37542 void
37543 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
37544 {
37545 struct expand_vec_perm_d d;
37546 unsigned i, nelt;
37547
37548 d.target = targ;
37549 d.op0 = op0;
37550 d.op1 = op1;
37551 d.vmode = GET_MODE (targ);
37552 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37553 d.testing_p = false;
37554
37555 for (i = 0; i < nelt; ++i)
37556 d.perm[i] = i * 2 + odd;
37557
37558 /* We'll either be able to implement the permutation directly... */
37559 if (expand_vec_perm_1 (&d))
37560 return;
37561
37562 /* ... or we use the special-case patterns. */
37563 expand_vec_perm_even_odd_1 (&d, odd);
37564 }
37565
37566 /* Expand an insert into a vector register through pinsr insn.
37567 Return true if successful. */
37568
37569 bool
37570 ix86_expand_pinsr (rtx *operands)
37571 {
37572 rtx dst = operands[0];
37573 rtx src = operands[3];
37574
37575 unsigned int size = INTVAL (operands[1]);
37576 unsigned int pos = INTVAL (operands[2]);
37577
37578 if (GET_CODE (dst) == SUBREG)
37579 {
37580 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
37581 dst = SUBREG_REG (dst);
37582 }
37583
37584 if (GET_CODE (src) == SUBREG)
37585 src = SUBREG_REG (src);
37586
37587 switch (GET_MODE (dst))
37588 {
37589 case V16QImode:
37590 case V8HImode:
37591 case V4SImode:
37592 case V2DImode:
37593 {
37594 enum machine_mode srcmode, dstmode;
37595 rtx (*pinsr)(rtx, rtx, rtx, rtx);
37596
37597 srcmode = mode_for_size (size, MODE_INT, 0);
37598
37599 switch (srcmode)
37600 {
37601 case QImode:
37602 if (!TARGET_SSE4_1)
37603 return false;
37604 dstmode = V16QImode;
37605 pinsr = gen_sse4_1_pinsrb;
37606 break;
37607
37608 case HImode:
37609 if (!TARGET_SSE2)
37610 return false;
37611 dstmode = V8HImode;
37612 pinsr = gen_sse2_pinsrw;
37613 break;
37614
37615 case SImode:
37616 if (!TARGET_SSE4_1)
37617 return false;
37618 dstmode = V4SImode;
37619 pinsr = gen_sse4_1_pinsrd;
37620 break;
37621
37622 case DImode:
37623 gcc_assert (TARGET_64BIT);
37624 if (!TARGET_SSE4_1)
37625 return false;
37626 dstmode = V2DImode;
37627 pinsr = gen_sse4_1_pinsrq;
37628 break;
37629
37630 default:
37631 return false;
37632 }
37633
37634 dst = gen_lowpart (dstmode, dst);
37635 src = gen_lowpart (srcmode, src);
37636
37637 pos /= size;
37638
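/* The element position is passed encoded as a one-hot mask,
   GEN_INT (1 << pos), which is the vec_merge-style selector the pinsr
   insn patterns take rather than a plain index. */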
37639 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
37640 return true;
37641 }
37642
37643 default:
37644 return false;
37645 }
37646 }
37647 \f
37648 /* This function returns the calling-ABI-specific va_list type node.
37649 It returns the FNDECL-specific va_list type. */
37650
37651 static tree
37652 ix86_fn_abi_va_list (tree fndecl)
37653 {
37654 if (!TARGET_64BIT)
37655 return va_list_type_node;
37656 gcc_assert (fndecl != NULL_TREE);
37657
37658 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
37659 return ms_va_list_type_node;
37660 else
37661 return sysv_va_list_type_node;
37662 }
37663
37664 /* Returns the canonical va_list type specified by TYPE. If there
37665 is no valid TYPE provided, it returns NULL_TREE. */
37666
37667 static tree
37668 ix86_canonical_va_list_type (tree type)
37669 {
37670 tree wtype, htype;
37671
37672 /* Resolve references and pointers to va_list type. */
37673 if (TREE_CODE (type) == MEM_REF)
37674 type = TREE_TYPE (type);
37675 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
37676 type = TREE_TYPE (type);
37677 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
37678 type = TREE_TYPE (type);
37679
37680 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
37681 {
37682 wtype = va_list_type_node;
37683 gcc_assert (wtype != NULL_TREE);
37684 htype = type;
37685 if (TREE_CODE (wtype) == ARRAY_TYPE)
37686 {
37687 /* If va_list is an array type, the argument may have decayed
37688 to a pointer type, e.g. by being passed to another function.
37689 In that case, unwrap both types so that we can compare the
37690 underlying records. */
37691 if (TREE_CODE (htype) == ARRAY_TYPE
37692 || POINTER_TYPE_P (htype))
37693 {
37694 wtype = TREE_TYPE (wtype);
37695 htype = TREE_TYPE (htype);
37696 }
37697 }
37698 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37699 return va_list_type_node;
37700 wtype = sysv_va_list_type_node;
37701 gcc_assert (wtype != NULL_TREE);
37702 htype = type;
37703 if (TREE_CODE (wtype) == ARRAY_TYPE)
37704 {
37705 /* If va_list is an array type, the argument may have decayed
37706 to a pointer type, e.g. by being passed to another function.
37707 In that case, unwrap both types so that we can compare the
37708 underlying records. */
37709 if (TREE_CODE (htype) == ARRAY_TYPE
37710 || POINTER_TYPE_P (htype))
37711 {
37712 wtype = TREE_TYPE (wtype);
37713 htype = TREE_TYPE (htype);
37714 }
37715 }
37716 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37717 return sysv_va_list_type_node;
37718 wtype = ms_va_list_type_node;
37719 gcc_assert (wtype != NULL_TREE);
37720 htype = type;
37721 if (TREE_CODE (wtype) == ARRAY_TYPE)
37722 {
37723 /* If va_list is an array type, the argument may have decayed
37724 to a pointer type, e.g. by being passed to another function.
37725 In that case, unwrap both types so that we can compare the
37726 underlying records. */
37727 if (TREE_CODE (htype) == ARRAY_TYPE
37728 || POINTER_TYPE_P (htype))
37729 {
37730 wtype = TREE_TYPE (wtype);
37731 htype = TREE_TYPE (htype);
37732 }
37733 }
37734 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37735 return ms_va_list_type_node;
37736 return NULL_TREE;
37737 }
37738 return std_canonical_va_list_type (type);
37739 }
37740
37741 /* Iterate through the target-specific builtin types for va_list.
37742 IDX denotes the iterator, *PTREE is set to the result type of
37743 the va_list builtin, and *PNAME to its internal type.
37744 Returns zero if there is no element for this index, otherwise
37745 IDX should be increased upon the next call.
37746 Note, do not iterate a base builtin's name like __builtin_va_list.
37747 Used from c_common_nodes_and_builtins. */
37748
37749 static int
37750 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
37751 {
37752 if (TARGET_64BIT)
37753 {
37754 switch (idx)
37755 {
37756 default:
37757 break;
37758
37759 case 0:
37760 *ptree = ms_va_list_type_node;
37761 *pname = "__builtin_ms_va_list";
37762 return 1;
37763
37764 case 1:
37765 *ptree = sysv_va_list_type_node;
37766 *pname = "__builtin_sysv_va_list";
37767 return 1;
37768 }
37769 }
37770
37771 return 0;
37772 }
37773
37774 #undef TARGET_SCHED_DISPATCH
37775 #define TARGET_SCHED_DISPATCH has_dispatch
37776 #undef TARGET_SCHED_DISPATCH_DO
37777 #define TARGET_SCHED_DISPATCH_DO do_dispatch
37778 #undef TARGET_SCHED_REASSOCIATION_WIDTH
37779 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
37780
37781 /* The size of the dispatch window is the total number of bytes of
37782 object code allowed in a window. */
37783 #define DISPATCH_WINDOW_SIZE 16
37784
37785 /* Number of dispatch windows considered for scheduling. */
37786 #define MAX_DISPATCH_WINDOWS 3
37787
37788 /* Maximum number of instructions in a window. */
37789 #define MAX_INSN 4
37790
37791 /* Maximum number of immediate operands in a window. */
37792 #define MAX_IMM 4
37793
37794 /* Maximum number of immediate bits allowed in a window. */
37795 #define MAX_IMM_SIZE 128
37796
37797 /* Maximum number of 32 bit immediates allowed in a window. */
37798 #define MAX_IMM_32 4
37799
37800 /* Maximum number of 64 bit immediates allowed in a window. */
37801 #define MAX_IMM_64 2
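/* Note that MAX_IMM_32 * 32 == MAX_IMM_64 * 64 == MAX_IMM_SIZE, i.e.
   four 32-bit or two 64-bit immediates fill the 128 immediate bits
   allowed in a window. */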
37802
37803 /* Maximum total of loads or prefetches allowed in a window. */
37804 #define MAX_LOAD 2
37805
37806 /* Maximum total of stores allowed in a window. */
37807 #define MAX_STORE 1
37808
37809 #undef BIG
37810 #define BIG 100
37811
37812
37813 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
37814 enum dispatch_group {
37815 disp_no_group = 0,
37816 disp_load,
37817 disp_store,
37818 disp_load_store,
37819 disp_prefetch,
37820 disp_imm,
37821 disp_imm_32,
37822 disp_imm_64,
37823 disp_branch,
37824 disp_cmp,
37825 disp_jcc,
37826 disp_last
37827 };
37828
37829 /* Number of allowable groups in a dispatch window. It is an array
37830 indexed by the dispatch_group enum. 100 is used as a big number
37831 because the number of these kinds of operations does not have any
37832 effect on the dispatch window, but we need entries for them in
37833 the table. */
37834 static unsigned int num_allowable_groups[disp_last] = {
37835 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
37836 };
37837
37838 char group_name[disp_last + 1][16] = {
37839 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
37840 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
37841 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
37842 };
37843
37844 /* Instruction path. */
37845 enum insn_path {
37846 no_path = 0,
37847 path_single, /* Single micro op. */
37848 path_double, /* Double micro op. */
37849 path_multi, /* Instructions with more than 2 micro ops. */
37850 last_path
37851 };
37852
37853 /* sched_insn_info defines a window to the instructions scheduled in
37854 the basic block. It contains a pointer to the insn_info table and
37855 the instruction scheduled.
37856
37857 Windows are allocated for each basic block and are linked
37858 together. */
37859 typedef struct sched_insn_info_s {
37860 rtx insn;
37861 enum dispatch_group group;
37862 enum insn_path path;
37863 int byte_len;
37864 int imm_bytes;
37865 } sched_insn_info;
37866
37867 /* Linked list of dispatch windows. This is a two way list of
37868 dispatch windows of a basic block. It contains information about
37869 the number of uops in the window and the total number of
37870 instructions and of bytes in the object code for this dispatch
37871 window. */
37872 typedef struct dispatch_windows_s {
37873 int num_insn; /* Number of insn in the window. */
37874 int num_uops; /* Number of uops in the window. */
37875 int window_size; /* Number of bytes in the window. */
37876 int window_num; /* Window number, either 0 or 1. */
37877 int num_imm; /* Number of immediates in an insn. */
37878 int num_imm_32; /* Number of 32 bit immediates in an insn. */
37879 int num_imm_64; /* Number of 64 bit immediates in an insn. */
37880 int imm_size; /* Total immediates in the window. */
37881 int num_loads; /* Total memory loads in the window. */
37882 int num_stores; /* Total memory stores in the window. */
37883 int violation; /* Violation exists in window. */
37884 sched_insn_info *window; /* Pointer to the window. */
37885 struct dispatch_windows_s *next;
37886 struct dispatch_windows_s *prev;
37887 } dispatch_windows;
37888
37889 /* Immediate values used in an insn. */
37890 typedef struct imm_info_s
37891 {
37892 int imm;
37893 int imm32;
37894 int imm64;
37895 } imm_info;
37896
37897 static dispatch_windows *dispatch_window_list;
37898 static dispatch_windows *dispatch_window_list1;
37899
37900 /* Get dispatch group of insn. */
37901
37902 static enum dispatch_group
37903 get_mem_group (rtx insn)
37904 {
37905 enum attr_memory memory;
37906
37907 if (INSN_CODE (insn) < 0)
37908 return disp_no_group;
37909 memory = get_attr_memory (insn);
37910 if (memory == MEMORY_STORE)
37911 return disp_store;
37912
37913 if (memory == MEMORY_LOAD)
37914 return disp_load;
37915
37916 if (memory == MEMORY_BOTH)
37917 return disp_load_store;
37918
37919 return disp_no_group;
37920 }
37921
37922 /* Return true if insn is a compare instruction. */
37923
37924 static bool
37925 is_cmp (rtx insn)
37926 {
37927 enum attr_type type;
37928
37929 type = get_attr_type (insn);
37930 return (type == TYPE_TEST
37931 || type == TYPE_ICMP
37932 || type == TYPE_FCMP
37933 || GET_CODE (PATTERN (insn)) == COMPARE);
37934 }
37935
37936 /* Return true if a dispatch violation encountered. */
37937
37938 static bool
37939 dispatch_violation (void)
37940 {
37941 if (dispatch_window_list->next)
37942 return dispatch_window_list->next->violation;
37943 return dispatch_window_list->violation;
37944 }
37945
37946 /* Return true if insn is a branch instruction. */
37947
37948 static bool
37949 is_branch (rtx insn)
37950 {
37951 return (CALL_P (insn) || JUMP_P (insn));
37952 }
37953
37954 /* Return true if insn is a prefetch instruction. */
37955
37956 static bool
37957 is_prefetch (rtx insn)
37958 {
37959 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
37960 }
37961
37962 /* This function initializes a dispatch window and the list container holding a
37963 pointer to the window. */
37964
37965 static void
37966 init_window (int window_num)
37967 {
37968 int i;
37969 dispatch_windows *new_list;
37970
37971 if (window_num == 0)
37972 new_list = dispatch_window_list;
37973 else
37974 new_list = dispatch_window_list1;
37975
37976 new_list->num_insn = 0;
37977 new_list->num_uops = 0;
37978 new_list->window_size = 0;
37979 new_list->next = NULL;
37980 new_list->prev = NULL;
37981 new_list->window_num = window_num;
37982 new_list->num_imm = 0;
37983 new_list->num_imm_32 = 0;
37984 new_list->num_imm_64 = 0;
37985 new_list->imm_size = 0;
37986 new_list->num_loads = 0;
37987 new_list->num_stores = 0;
37988 new_list->violation = false;
37989
37990 for (i = 0; i < MAX_INSN; i++)
37991 {
37992 new_list->window[i].insn = NULL;
37993 new_list->window[i].group = disp_no_group;
37994 new_list->window[i].path = no_path;
37995 new_list->window[i].byte_len = 0;
37996 new_list->window[i].imm_bytes = 0;
37997 }
37998 return;
37999 }
38000
38001 /* This function allocates and initializes a dispatch window and the
38002 list container holding a pointer to the window. */
38003
38004 static dispatch_windows *
38005 allocate_window (void)
38006 {
38007 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
38008 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
38009
38010 return new_list;
38011 }
38012
38013 /* This routine initializes the dispatch scheduling information. It
38014 initiates building dispatch scheduler tables and constructs the
38015 first dispatch window. */
38016
38017 static void
38018 init_dispatch_sched (void)
38019 {
38020 /* Allocate a dispatch list and a window. */
38021 dispatch_window_list = allocate_window ();
38022 dispatch_window_list1 = allocate_window ();
38023 init_window (0);
38024 init_window (1);
38025 }
38026
38027 /* This function returns true if a branch is detected. End of a basic block
38028 does not have to be a branch, but here we assume only branches end a
38029 window. */
38030
38031 static bool
38032 is_end_basic_block (enum dispatch_group group)
38033 {
38034 return group == disp_branch;
38035 }
38036
38037 /* This function is called when the end of a window processing is reached. */
38038
38039 static void
38040 process_end_window (void)
38041 {
38042 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
38043 if (dispatch_window_list->next)
38044 {
38045 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
38046 gcc_assert (dispatch_window_list->window_size
38047 + dispatch_window_list1->window_size <= 48);
38048 init_window (1);
38049 }
38050 init_window (0);
38051 }
38052
38053 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
38054 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
38055 for 48 bytes of instructions. Note that these are not dispatch
38056 windows whose size is DISPATCH_WINDOW_SIZE. */
38057
38058 static dispatch_windows *
38059 allocate_next_window (int window_num)
38060 {
38061 if (window_num == 0)
38062 {
38063 if (dispatch_window_list->next)
38064 init_window (1);
38065 init_window (0);
38066 return dispatch_window_list;
38067 }
38068
38069 dispatch_window_list->next = dispatch_window_list1;
38070 dispatch_window_list1->prev = dispatch_window_list;
38071
38072 return dispatch_window_list1;
38073 }
38074
38075 /* Increment the number of immediate operands of an instruction. */
38076
38077 static int
38078 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
38079 {
38080 if (*in_rtx == 0)
38081 return 0;
38082
38083 switch ( GET_CODE (*in_rtx))
38084 {
38085 case CONST:
38086 case SYMBOL_REF:
38087 case CONST_INT:
38088 (imm_values->imm)++;
38089 if (x86_64_immediate_operand (*in_rtx, SImode))
38090 (imm_values->imm32)++;
38091 else
38092 (imm_values->imm64)++;
38093 break;
38094
38095 case CONST_DOUBLE:
38096 (imm_values->imm)++;
38097 (imm_values->imm64)++;
38098 break;
38099
38100 case CODE_LABEL:
38101 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
38102 {
38103 (imm_values->imm)++;
38104 (imm_values->imm32)++;
38105 }
38106 break;
38107
38108 default:
38109 break;
38110 }
38111
38112 return 0;
38113 }
38114
38115 /* Compute number of immediate operands of an instruction. */
38116
38117 static void
38118 find_constant (rtx in_rtx, imm_info *imm_values)
38119 {
38120 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
38121 (rtx_function) find_constant_1, (void *) imm_values);
38122 }
38123
38124 /* Return the total size in bytes of the immediate operands of an instruction,
38125 along with the number of corresponding immediate operands. It initializes
38126 its parameters to zero before calling FIND_CONSTANT.
38127 INSN is the input instruction. IMM is the total of immediates.
38128 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
38129 bit immediates. */
38130
38131 static int
38132 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
38133 {
38134 imm_info imm_values = {0, 0, 0};
38135
38136 find_constant (insn, &imm_values);
38137 *imm = imm_values.imm;
38138 *imm32 = imm_values.imm32;
38139 *imm64 = imm_values.imm64;
38140 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
38141 }
38142
38143 /* This function indicates whether an instruction has any immediate
38144 operands. */
38145
38146 static bool
38147 has_immediate (rtx insn)
38148 {
38149 int num_imm_operand;
38150 int num_imm32_operand;
38151 int num_imm64_operand;
38152
38153 if (insn)
38154 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38155 &num_imm64_operand);
38156 return false;
38157 }
38158
38159 /* Return single or double path for instructions. */
38160
38161 static enum insn_path
38162 get_insn_path (rtx insn)
38163 {
38164 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
38165
38166 if ((int)path == 0)
38167 return path_single;
38168
38169 if ((int)path == 1)
38170 return path_double;
38171
38172 return path_multi;
38173 }
38174
38175 /* Return insn dispatch group. */
38176
38177 static enum dispatch_group
38178 get_insn_group (rtx insn)
38179 {
38180 enum dispatch_group group = get_mem_group (insn);
38181 if (group)
38182 return group;
38183
38184 if (is_branch (insn))
38185 return disp_branch;
38186
38187 if (is_cmp (insn))
38188 return disp_cmp;
38189
38190 if (has_immediate (insn))
38191 return disp_imm;
38192
38193 if (is_prefetch (insn))
38194 return disp_prefetch;
38195
38196 return disp_no_group;
38197 }
38198
38199 /* Count number of GROUP restricted instructions in a dispatch
38200 window WINDOW_LIST. */
38201
38202 static int
38203 count_num_restricted (rtx insn, dispatch_windows *window_list)
38204 {
38205 enum dispatch_group group = get_insn_group (insn);
38206 int imm_size;
38207 int num_imm_operand;
38208 int num_imm32_operand;
38209 int num_imm64_operand;
38210
38211 if (group == disp_no_group)
38212 return 0;
38213
38214 if (group == disp_imm)
38215 {
38216 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38217 &num_imm64_operand);
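      /* Reject the insn (return BIG) if adding its immediates would exceed
	 any of the per-window limits: total immediate size, number of
	 immediates, 32-bit immediate slots (a 64-bit immediate occupies two
	 of them), 64-bit immediate slots, or the special case of a 64-bit
	 immediate exactly filling the immediate space of an already busy
	 window. */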
38218 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
38219 || num_imm_operand + window_list->num_imm > MAX_IMM
38220 || (num_imm32_operand > 0
38221 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
38222 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
38223 || (num_imm64_operand > 0
38224 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
38225 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
38226 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
38227 && num_imm64_operand > 0
38228 && ((window_list->num_imm_64 > 0
38229 && window_list->num_insn >= 2)
38230 || window_list->num_insn >= 3)))
38231 return BIG;
38232
38233 return 1;
38234 }
38235
38236 if ((group == disp_load_store
38237 && (window_list->num_loads >= MAX_LOAD
38238 || window_list->num_stores >= MAX_STORE))
38239 || ((group == disp_load
38240 || group == disp_prefetch)
38241 && window_list->num_loads >= MAX_LOAD)
38242 || (group == disp_store
38243 && window_list->num_stores >= MAX_STORE))
38244 return BIG;
38245
38246 return 1;
38247 }
38248
38249 /* This function returns true if insn satisfies dispatch rules on the
38250 last window scheduled. */
38251
38252 static bool
38253 fits_dispatch_window (rtx insn)
38254 {
38255 dispatch_windows *window_list = dispatch_window_list;
38256 dispatch_windows *window_list_next = dispatch_window_list->next;
38257 unsigned int num_restrict;
38258 enum dispatch_group group = get_insn_group (insn);
38259 enum insn_path path = get_insn_path (insn);
38260 int sum;
38261
38262 /* Make disp_cmp and disp_jcc get scheduled last. These
38263 instructions should be given the lowest priority in the
38264 scheduling process in the Haifa scheduler to make sure they will be
38265 scheduled in the same dispatch window as the reference to them. */
38266 if (group == disp_jcc || group == disp_cmp)
38267 return false;
38268
38269 /* Check nonrestricted. */
38270 if (group == disp_no_group || group == disp_branch)
38271 return true;
38272
38273 /* Get last dispatch window. */
38274 if (window_list_next)
38275 window_list = window_list_next;
38276
38277 if (window_list->window_num == 1)
38278 {
38279 sum = window_list->prev->window_size + window_list->window_size;
38280
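      /* The 32- and 48-byte limits below are 2 and 3 times
	 DISPATCH_WINDOW_SIZE; process_end_window asserts that two linked
	 windows never exceed 48 bytes of object code. */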
38281 if (sum == 32
38282 || (min_insn_size (insn) + sum) >= 48)
38283 /* Window 1 is full. Go for next window. */
38284 return true;
38285 }
38286
38287 num_restrict = count_num_restricted (insn, window_list);
38288
38289 if (num_restrict > num_allowable_groups[group])
38290 return false;
38291
38292 /* See if it fits in the first window. */
38293 if (window_list->window_num == 0)
38294 {
38295 /* The first window should have only single- and double-path
38296 uops. */
38297 if (path == path_double
38298 && (window_list->num_uops + 2) > MAX_INSN)
38299 return false;
38300 else if (path != path_single)
38301 return false;
38302 }
38303 return true;
38304 }
38305
38306 /* Add an instruction INSN with NUM_UOPS micro-operations to the
38307 dispatch window WINDOW_LIST. */
38308
38309 static void
38310 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
38311 {
38312 int byte_len = min_insn_size (insn);
38313 int num_insn = window_list->num_insn;
38314 int imm_size;
38315 sched_insn_info *window = window_list->window;
38316 enum dispatch_group group = get_insn_group (insn);
38317 enum insn_path path = get_insn_path (insn);
38318 int num_imm_operand;
38319 int num_imm32_operand;
38320 int num_imm64_operand;
38321
38322 if (!window_list->violation && group != disp_cmp
38323 && !fits_dispatch_window (insn))
38324 window_list->violation = true;
38325
38326 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38327 &num_imm64_operand);
38328
38329 /* Initialize window with new instruction. */
38330 window[num_insn].insn = insn;
38331 window[num_insn].byte_len = byte_len;
38332 window[num_insn].group = group;
38333 window[num_insn].path = path;
38334 window[num_insn].imm_bytes = imm_size;
38335
38336 window_list->window_size += byte_len;
38337 window_list->num_insn = num_insn + 1;
38338 window_list->num_uops = window_list->num_uops + num_uops;
38339 window_list->imm_size += imm_size;
38340 window_list->num_imm += num_imm_operand;
38341 window_list->num_imm_32 += num_imm32_operand;
38342 window_list->num_imm_64 += num_imm64_operand;
38343
38344 if (group == disp_store)
38345 window_list->num_stores += 1;
38346 else if (group == disp_load
38347 || group == disp_prefetch)
38348 window_list->num_loads += 1;
38349 else if (group == disp_load_store)
38350 {
38351 window_list->num_stores += 1;
38352 window_list->num_loads += 1;
38353 }
38354 }
38355
38356 /* Adds a scheduled instruction, INSN, to the current dispatch window.
38357 If the total bytes of instructions or the number of instructions in
38358 the window exceeds the allowed maximum, it allocates a new window. */
38359
38360 static void
38361 add_to_dispatch_window (rtx insn)
38362 {
38363 int byte_len;
38364 dispatch_windows *window_list;
38365 dispatch_windows *next_list;
38366 dispatch_windows *window0_list;
38367 enum insn_path path;
38368 enum dispatch_group insn_group;
38369 bool insn_fits;
38370 int num_insn;
38371 int num_uops;
38372 int window_num;
38373 int insn_num_uops;
38374 int sum;
38375
38376 if (INSN_CODE (insn) < 0)
38377 return;
38378
38379 byte_len = min_insn_size (insn);
38380 window_list = dispatch_window_list;
38381 next_list = window_list->next;
38382 path = get_insn_path (insn);
38383 insn_group = get_insn_group (insn);
38384
38385 /* Get the last dispatch window. */
38386 if (next_list)
38387 window_list = dispatch_window_list->next;
38388
38389 if (path == path_single)
38390 insn_num_uops = 1;
38391 else if (path == path_double)
38392 insn_num_uops = 2;
38393 else
38394 insn_num_uops = (int) path;
38395
38396 /* If the current window is full, get a new window.
38397 Window number zero is full if MAX_INSN uops are scheduled in it.
38398 Window number one is full if window zero's bytes plus window
38399 one's bytes equal 32, or if adding the bytes of the new instruction
38400 to the total makes it 48 or more, or if it already has MAX_INSN
38401 instructions in it. */
38402 num_insn = window_list->num_insn;
38403 num_uops = window_list->num_uops;
38404 window_num = window_list->window_num;
38405 insn_fits = fits_dispatch_window (insn);
38406
38407 if (num_insn >= MAX_INSN
38408 || num_uops + insn_num_uops > MAX_INSN
38409 || !(insn_fits))
38410 {
38411 window_num = ~window_num & 1;
38412 window_list = allocate_next_window (window_num);
38413 }
38414
38415 if (window_num == 0)
38416 {
38417 add_insn_window (insn, window_list, insn_num_uops);
38418 if (window_list->num_insn >= MAX_INSN
38419 && insn_group == disp_branch)
38420 {
38421 process_end_window ();
38422 return;
38423 }
38424 }
38425 else if (window_num == 1)
38426 {
38427 window0_list = window_list->prev;
38428 sum = window0_list->window_size + window_list->window_size;
38429 if (sum == 32
38430 || (byte_len + sum) >= 48)
38431 {
38432 process_end_window ();
38433 window_list = dispatch_window_list;
38434 }
38435
38436 add_insn_window (insn, window_list, insn_num_uops);
38437 }
38438 else
38439 gcc_unreachable ();
38440
38441 if (is_end_basic_block (insn_group))
38442 {
38443 /* End of basic block is reached do end-basic-block process. */
38444 process_end_window ();
38445 return;
38446 }
38447 }
38448
38449 /* Print the dispatch window, WINDOW_NUM, to FILE. */
38450
38451 DEBUG_FUNCTION static void
38452 debug_dispatch_window_file (FILE *file, int window_num)
38453 {
38454 dispatch_windows *list;
38455 int i;
38456
38457 if (window_num == 0)
38458 list = dispatch_window_list;
38459 else
38460 list = dispatch_window_list1;
38461
38462 fprintf (file, "Window #%d:\n", list->window_num);
38463 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
38464 list->num_insn, list->num_uops, list->window_size);
38465 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38466 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
38467
38468 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
38469 list->num_stores);
38470 fprintf (file, " insn info:\n");
38471
38472 for (i = 0; i < MAX_INSN; i++)
38473 {
38474 if (!list->window[i].insn)
38475 break;
38476 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
38477 i, group_name[list->window[i].group],
38478 i, (void *)list->window[i].insn,
38479 i, list->window[i].path,
38480 i, list->window[i].byte_len,
38481 i, list->window[i].imm_bytes);
38482 }
38483 }
38484
38485 /* Print to stdout a dispatch window. */
38486
38487 DEBUG_FUNCTION void
38488 debug_dispatch_window (int window_num)
38489 {
38490 debug_dispatch_window_file (stdout, window_num);
38491 }
38492
38493 /* Print INSN dispatch information to FILE. */
38494
38495 DEBUG_FUNCTION static void
38496 debug_insn_dispatch_info_file (FILE *file, rtx insn)
38497 {
38498 int byte_len;
38499 enum insn_path path;
38500 enum dispatch_group group;
38501 int imm_size;
38502 int num_imm_operand;
38503 int num_imm32_operand;
38504 int num_imm64_operand;
38505
38506 if (INSN_CODE (insn) < 0)
38507 return;
38508
38509 byte_len = min_insn_size (insn);
38510 path = get_insn_path (insn);
38511 group = get_insn_group (insn);
38512 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38513 &num_imm64_operand);
38514
38515 fprintf (file, " insn info:\n");
38516 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
38517 group_name[group], path, byte_len);
38518 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38519 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
38520 }
38521
38522 /* Print to STDERR the status of the ready list with respect to
38523 dispatch windows. */
38524
38525 DEBUG_FUNCTION void
38526 debug_ready_dispatch (void)
38527 {
38528 int i;
38529 int no_ready = number_in_ready ();
38530
38531 fprintf (stdout, "Number of ready: %d\n", no_ready);
38532
38533 for (i = 0; i < no_ready; i++)
38534 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
38535 }
38536
38537 /* This routine is the driver of the dispatch scheduler. */
38538
38539 static void
38540 do_dispatch (rtx insn, int mode)
38541 {
38542 if (mode == DISPATCH_INIT)
38543 init_dispatch_sched ();
38544 else if (mode == ADD_TO_DISPATCH_WINDOW)
38545 add_to_dispatch_window (insn);
38546 }
38547
38548 /* Return TRUE if Dispatch Scheduling is supported. */
38549
38550 static bool
38551 has_dispatch (rtx insn, int action)
38552 {
38553 if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
38554 && flag_dispatch_scheduler)
38555 switch (action)
38556 {
38557 default:
38558 return false;
38559
38560 case IS_DISPATCH_ON:
38561 return true;
38562 break;
38563
38564 case IS_CMP:
38565 return is_cmp (insn);
38566
38567 case DISPATCH_VIOLATION:
38568 return dispatch_violation ();
38569
38570 case FITS_DISPATCH_WINDOW:
38571 return fits_dispatch_window (insn);
38572 }
38573
38574 return false;
38575 }
38576
38577 /* Implementation of reassociation_width target hook used by
38578 the reassoc phase to identify the parallelism level in the reassociated
38579 tree. The statement's tree_code is passed in OPC. The arguments' type
38580 is passed in MODE.
38581
38582 Currently parallel reassociation is enabled for Atom
38583 processors only and we set reassociation width to be 2
38584 because Atom may issue up to 2 instructions per cycle.
38585
38586 Return value should be fixed if parallel reassociation is
38587 enabled for other processors. */
38588
38589 static int
38590 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
38591 enum machine_mode mode)
38592 {
38593 int res = 1;
38594
38595 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
38596 res = 2;
38597 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
38598 res = 2;
38599
38600 return res;
38601 }
38602
38603 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
38604 place emms and femms instructions. */
38605
38606 static enum machine_mode
38607 ix86_preferred_simd_mode (enum machine_mode mode)
38608 {
38609 if (!TARGET_SSE)
38610 return word_mode;
38611
38612 switch (mode)
38613 {
38614 case QImode:
38615 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
38616 case HImode:
38617 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
38618 case SImode:
38619 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
38620 case DImode:
38621 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
38622
38623 case SFmode:
38624 if (TARGET_AVX && !TARGET_PREFER_AVX128)
38625 return V8SFmode;
38626 else
38627 return V4SFmode;
38628
38629 case DFmode:
38630 if (!TARGET_VECTORIZE_DOUBLE)
38631 return word_mode;
38632 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
38633 return V4DFmode;
38634 else if (TARGET_SSE2)
38635 return V2DFmode;
38636 /* FALLTHRU */
38637
38638 default:
38639 return word_mode;
38640 }
38641 }
38642
38643 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
38644 vectors. */
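/* The value returned is a bitmask of vector sizes in bytes, so 32 | 16
   lets the vectorizer consider both 32-byte (256-bit) and 16-byte
   (128-bit) vectors. */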
38645
38646 static unsigned int
38647 ix86_autovectorize_vector_sizes (void)
38648 {
38649 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
38650 }
38651
38652 /* Initialize the GCC target structure. */
38653 #undef TARGET_RETURN_IN_MEMORY
38654 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
38655
38656 #undef TARGET_LEGITIMIZE_ADDRESS
38657 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
38658
38659 #undef TARGET_ATTRIBUTE_TABLE
38660 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
38661 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38662 # undef TARGET_MERGE_DECL_ATTRIBUTES
38663 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
38664 #endif
38665
38666 #undef TARGET_COMP_TYPE_ATTRIBUTES
38667 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
38668
38669 #undef TARGET_INIT_BUILTINS
38670 #define TARGET_INIT_BUILTINS ix86_init_builtins
38671 #undef TARGET_BUILTIN_DECL
38672 #define TARGET_BUILTIN_DECL ix86_builtin_decl
38673 #undef TARGET_EXPAND_BUILTIN
38674 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
38675
38676 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
38677 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
38678 ix86_builtin_vectorized_function
38679
38680 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
38681 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
38682
38683 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
38684 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
38685
38686 #undef TARGET_VECTORIZE_BUILTIN_GATHER
38687 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
38688
38689 #undef TARGET_BUILTIN_RECIPROCAL
38690 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
38691
38692 #undef TARGET_ASM_FUNCTION_EPILOGUE
38693 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
38694
38695 #undef TARGET_ENCODE_SECTION_INFO
38696 #ifndef SUBTARGET_ENCODE_SECTION_INFO
38697 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
38698 #else
38699 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
38700 #endif
38701
38702 #undef TARGET_ASM_OPEN_PAREN
38703 #define TARGET_ASM_OPEN_PAREN ""
38704 #undef TARGET_ASM_CLOSE_PAREN
38705 #define TARGET_ASM_CLOSE_PAREN ""
38706
38707 #undef TARGET_ASM_BYTE_OP
38708 #define TARGET_ASM_BYTE_OP ASM_BYTE
38709
38710 #undef TARGET_ASM_ALIGNED_HI_OP
38711 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
38712 #undef TARGET_ASM_ALIGNED_SI_OP
38713 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
38714 #ifdef ASM_QUAD
38715 #undef TARGET_ASM_ALIGNED_DI_OP
38716 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
38717 #endif
38718
38719 #undef TARGET_PROFILE_BEFORE_PROLOGUE
38720 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
38721
38722 #undef TARGET_ASM_UNALIGNED_HI_OP
38723 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
38724 #undef TARGET_ASM_UNALIGNED_SI_OP
38725 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
38726 #undef TARGET_ASM_UNALIGNED_DI_OP
38727 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
38728
38729 #undef TARGET_PRINT_OPERAND
38730 #define TARGET_PRINT_OPERAND ix86_print_operand
38731 #undef TARGET_PRINT_OPERAND_ADDRESS
38732 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
38733 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
38734 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
38735 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
38736 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
38737
38738 #undef TARGET_SCHED_INIT_GLOBAL
38739 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
38740 #undef TARGET_SCHED_ADJUST_COST
38741 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
38742 #undef TARGET_SCHED_ISSUE_RATE
38743 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
38744 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
38745 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
38746 ia32_multipass_dfa_lookahead
38747
38748 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
38749 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
38750
38751 #ifdef HAVE_AS_TLS
38752 #undef TARGET_HAVE_TLS
38753 #define TARGET_HAVE_TLS true
38754 #endif
38755 #undef TARGET_CANNOT_FORCE_CONST_MEM
38756 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
38757 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
38758 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
38759
38760 #undef TARGET_DELEGITIMIZE_ADDRESS
38761 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
38762
38763 #undef TARGET_MS_BITFIELD_LAYOUT_P
38764 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
38765
38766 #if TARGET_MACHO
38767 #undef TARGET_BINDS_LOCAL_P
38768 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
38769 #endif
38770 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38771 #undef TARGET_BINDS_LOCAL_P
38772 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
38773 #endif
38774
38775 #undef TARGET_ASM_OUTPUT_MI_THUNK
38776 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
38777 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
38778 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
38779
38780 #undef TARGET_ASM_FILE_START
38781 #define TARGET_ASM_FILE_START x86_file_start
38782
38783 #undef TARGET_OPTION_OVERRIDE
38784 #define TARGET_OPTION_OVERRIDE ix86_option_override
38785
38786 #undef TARGET_REGISTER_MOVE_COST
38787 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
38788 #undef TARGET_MEMORY_MOVE_COST
38789 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
38790 #undef TARGET_RTX_COSTS
38791 #define TARGET_RTX_COSTS ix86_rtx_costs
38792 #undef TARGET_ADDRESS_COST
38793 #define TARGET_ADDRESS_COST ix86_address_cost
38794
38795 #undef TARGET_FIXED_CONDITION_CODE_REGS
38796 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
38797 #undef TARGET_CC_MODES_COMPATIBLE
38798 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
38799
38800 #undef TARGET_MACHINE_DEPENDENT_REORG
38801 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
38802
38803 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
38804 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
38805
38806 #undef TARGET_BUILD_BUILTIN_VA_LIST
38807 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
38808
38809 #undef TARGET_ENUM_VA_LIST_P
38810 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
38811
38812 #undef TARGET_FN_ABI_VA_LIST
38813 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
38814
38815 #undef TARGET_CANONICAL_VA_LIST_TYPE
38816 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
38817
38818 #undef TARGET_EXPAND_BUILTIN_VA_START
38819 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
38820
38821 #undef TARGET_MD_ASM_CLOBBERS
38822 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
38823
38824 #undef TARGET_PROMOTE_PROTOTYPES
38825 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
38826 #undef TARGET_STRUCT_VALUE_RTX
38827 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
38828 #undef TARGET_SETUP_INCOMING_VARARGS
38829 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
38830 #undef TARGET_MUST_PASS_IN_STACK
38831 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
38832 #undef TARGET_FUNCTION_ARG_ADVANCE
38833 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
38834 #undef TARGET_FUNCTION_ARG
38835 #define TARGET_FUNCTION_ARG ix86_function_arg
38836 #undef TARGET_FUNCTION_ARG_BOUNDARY
38837 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
38838 #undef TARGET_PASS_BY_REFERENCE
38839 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
38840 #undef TARGET_INTERNAL_ARG_POINTER
38841 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
38842 #undef TARGET_UPDATE_STACK_BOUNDARY
38843 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
38844 #undef TARGET_GET_DRAP_RTX
38845 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
38846 #undef TARGET_STRICT_ARGUMENT_NAMING
38847 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
38848 #undef TARGET_STATIC_CHAIN
38849 #define TARGET_STATIC_CHAIN ix86_static_chain
38850 #undef TARGET_TRAMPOLINE_INIT
38851 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
38852 #undef TARGET_RETURN_POPS_ARGS
38853 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
38854
38855 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
38856 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
38857
38858 #undef TARGET_SCALAR_MODE_SUPPORTED_P
38859 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
38860
38861 #undef TARGET_VECTOR_MODE_SUPPORTED_P
38862 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
38863
38864 #undef TARGET_C_MODE_FOR_SUFFIX
38865 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
38866
38867 #ifdef HAVE_AS_TLS
38868 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
38869 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
38870 #endif
38871
38872 #ifdef SUBTARGET_INSERT_ATTRIBUTES
38873 #undef TARGET_INSERT_ATTRIBUTES
38874 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
38875 #endif
38876
38877 #undef TARGET_MANGLE_TYPE
38878 #define TARGET_MANGLE_TYPE ix86_mangle_type
38879
38880 #if !TARGET_MACHO
38881 #undef TARGET_STACK_PROTECT_FAIL
38882 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
38883 #endif
38884
38885 #undef TARGET_FUNCTION_VALUE
38886 #define TARGET_FUNCTION_VALUE ix86_function_value
38887
38888 #undef TARGET_FUNCTION_VALUE_REGNO_P
38889 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
38890
38891 #undef TARGET_PROMOTE_FUNCTION_MODE
38892 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
38893
38894 #undef TARGET_SECONDARY_RELOAD
38895 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
38896
38897 #undef TARGET_CLASS_MAX_NREGS
38898 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
38899
38900 #undef TARGET_PREFERRED_RELOAD_CLASS
38901 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
38902 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
38903 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
38904 #undef TARGET_CLASS_LIKELY_SPILLED_P
38905 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
38906
38907 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
38908 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
38909 ix86_builtin_vectorization_cost
38910 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
38911 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
38912 ix86_vectorize_vec_perm_const_ok
38913 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
38914 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
38915 ix86_preferred_simd_mode
38916 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
38917 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
38918 ix86_autovectorize_vector_sizes
38919
38920 #undef TARGET_SET_CURRENT_FUNCTION
38921 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
38922
38923 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
38924 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
38925
38926 #undef TARGET_OPTION_SAVE
38927 #define TARGET_OPTION_SAVE ix86_function_specific_save
38928
38929 #undef TARGET_OPTION_RESTORE
38930 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
38931
38932 #undef TARGET_OPTION_PRINT
38933 #define TARGET_OPTION_PRINT ix86_function_specific_print
38934
38935 #undef TARGET_CAN_INLINE_P
38936 #define TARGET_CAN_INLINE_P ix86_can_inline_p
38937
38938 #undef TARGET_EXPAND_TO_RTL_HOOK
38939 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
38940
38941 #undef TARGET_LEGITIMATE_ADDRESS_P
38942 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
38943
38944 #undef TARGET_LEGITIMATE_CONSTANT_P
38945 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
38946
38947 #undef TARGET_FRAME_POINTER_REQUIRED
38948 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
38949
38950 #undef TARGET_CAN_ELIMINATE
38951 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
38952
38953 #undef TARGET_EXTRA_LIVE_ON_ENTRY
38954 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
38955
38956 #undef TARGET_ASM_CODE_END
38957 #define TARGET_ASM_CODE_END ix86_code_end
38958
38959 #undef TARGET_CONDITIONAL_REGISTER_USAGE
38960 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
38961
38962 #if TARGET_MACHO
38963 #undef TARGET_INIT_LIBFUNCS
38964 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
38965 #endif
38966
38967 struct gcc_target targetm = TARGET_INITIALIZER;
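/* Usage note added for exposition: target-independent code reaches the
   hooks installed above through this structure, e.g. the vectorizer
   queries targetm.vectorize.autovectorize_vector_sizes ().  A
   hypothetical caller, kept out of the build with #if 0: */
#if 0
static unsigned int
example_query_vector_sizes (void)
{
  /* With -mavx and without -mprefer-avx128 this yields 32 | 16,
     otherwise 0 (see ix86_autovectorize_vector_sizes above).  */
  return targetm.vectorize.autovectorize_vector_sizes ();
}
#endif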
38968 \f
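/* Note added for exposition: gt-i386.h is generated by gengtype and
   provides the garbage-collector root tables and marking routines for
   the GTY-marked data declared in this file.  */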
38969 #include "gt-i386.h"