Add _mm_stream_si64.
gcc/config/i386/i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "cgraph.h"
51 #include "gimple.h"
52 #include "dwarf2.h"
53 #include "df.h"
54 #include "tm-constrs.h"
55 #include "params.h"
56 #include "cselib.h"
57 #include "debug.h"
58 #include "sched-int.h"
59 #include "sbitmap.h"
60 #include "fibheap.h"
61 #include "opts.h"
62 #include "diagnostic.h"
63
64 enum upper_128bits_state
65 {
66 unknown = 0,
67 unused,
68 used
69 };
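/* Note: zero is deliberately "unknown" -- the per-block aux data allocated
   by alloc_aux_for_blocks below starts out cleared, so every basic block
   begins in the unknown state.  */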
70
71 typedef struct block_info_def
72 {
73 /* State of the upper 128bits of AVX registers at exit. */
74 enum upper_128bits_state state;
75 /* TRUE if state of the upper 128bits of AVX registers is unchanged
76 in this block. */
77 bool unchanged;
78 /* TRUE if block has been processed. */
79 bool processed;
80 /* TRUE if block has been scanned. */
81 bool scanned;
82 /* Previous state of the upper 128bits of AVX registers at entry. */
83 enum upper_128bits_state prev;
84 } *block_info;
85
86 #define BLOCK_INFO(B) ((block_info) (B)->aux)
87
88 enum call_avx256_state
89 {
90 /* Callee returns 256bit AVX register. */
91 callee_return_avx256 = -1,
92 /* Callee returns and passes 256bit AVX register. */
93 callee_return_pass_avx256,
94 /* Callee passes 256bit AVX register. */
95 callee_pass_avx256,
  96   /* Callee neither returns nor passes a 256bit AVX register, or there
  97      is no 256bit AVX register in the function return.  */
98 call_no_avx256,
99 /* vzeroupper intrinsic. */
100 vzeroupper_intrinsic
101 };
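/* This classification is carried as the integer operand of the vzeroupper
   UNSPEC_VOLATILE pattern and is read back below via
   INTVAL (XVECEXP (pat, 0, 0)).  */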
102
103 /* Check if a 256bit AVX register is referenced in stores. */
104
105 static void
106 check_avx256_stores (rtx dest, const_rtx set, void *data)
107 {
108 if ((REG_P (dest)
109 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
110 || (GET_CODE (set) == SET
111 && REG_P (SET_SRC (set))
112 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
113 {
114 enum upper_128bits_state *state
115 = (enum upper_128bits_state *) data;
116 *state = used;
117 }
118 }
119
120 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
121 in basic block BB. Delete it if upper 128bit AVX registers are
122 unused. If it isn't deleted, move it to just before a jump insn.
123
124 STATE is state of the upper 128bits of AVX registers at entry. */
125
126 static void
127 move_or_delete_vzeroupper_2 (basic_block bb,
128 enum upper_128bits_state state)
129 {
130 rtx insn, bb_end;
131 rtx vzeroupper_insn = NULL_RTX;
132 rtx pat;
133 int avx256;
134 bool unchanged;
135
136 if (BLOCK_INFO (bb)->unchanged)
137 {
138 if (dump_file)
139 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
140 bb->index, state);
141
142 BLOCK_INFO (bb)->state = state;
143 return;
144 }
145
146 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
147 {
148 if (dump_file)
149 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
150 bb->index, BLOCK_INFO (bb)->state);
151 return;
152 }
153
154 BLOCK_INFO (bb)->prev = state;
155
156 if (dump_file)
157 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
158 bb->index, state);
159
160 unchanged = true;
161
162 /* BB_END changes when it is deleted. */
163 bb_end = BB_END (bb);
164 insn = BB_HEAD (bb);
165 while (insn != bb_end)
166 {
167 insn = NEXT_INSN (insn);
168
169 if (!NONDEBUG_INSN_P (insn))
170 continue;
171
172 /* Move vzeroupper before jump/call. */
173 if (JUMP_P (insn) || CALL_P (insn))
174 {
175 if (!vzeroupper_insn)
176 continue;
177
178 if (PREV_INSN (insn) != vzeroupper_insn)
179 {
180 if (dump_file)
181 {
182 fprintf (dump_file, "Move vzeroupper after:\n");
183 print_rtl_single (dump_file, PREV_INSN (insn));
184 fprintf (dump_file, "before:\n");
185 print_rtl_single (dump_file, insn);
186 }
187 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
188 PREV_INSN (insn));
189 }
190 vzeroupper_insn = NULL_RTX;
191 continue;
192 }
193
194 pat = PATTERN (insn);
195
196 /* Check insn for vzeroupper intrinsic. */
197 if (GET_CODE (pat) == UNSPEC_VOLATILE
198 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
199 {
200 if (dump_file)
201 {
202 /* Found vzeroupper intrinsic. */
203 fprintf (dump_file, "Found vzeroupper:\n");
204 print_rtl_single (dump_file, insn);
205 }
206 }
207 else
208 {
209 /* Check insn for vzeroall intrinsic. */
210 if (GET_CODE (pat) == PARALLEL
211 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
212 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
213 {
214 state = unused;
215 unchanged = false;
216
217 /* Delete pending vzeroupper insertion. */
218 if (vzeroupper_insn)
219 {
220 delete_insn (vzeroupper_insn);
221 vzeroupper_insn = NULL_RTX;
222 }
223 }
224 else if (state != used)
225 {
226 note_stores (pat, check_avx256_stores, &state);
227 if (state == used)
228 unchanged = false;
229 }
230 continue;
231 }
232
233 /* Process vzeroupper intrinsic. */
234 avx256 = INTVAL (XVECEXP (pat, 0, 0));
235
236 if (state == unused)
237 {
 238 	  /* Since the upper 128bits are cleared, the callee must not pass a
 239 	     256bit AVX register.  We only need to check whether the callee
 240 	     returns a 256bit AVX register.  */
241 if (avx256 == callee_return_avx256)
242 {
243 state = used;
244 unchanged = false;
245 }
246
247 /* Remove unnecessary vzeroupper since upper 128bits are
248 cleared. */
249 if (dump_file)
250 {
251 fprintf (dump_file, "Delete redundant vzeroupper:\n");
252 print_rtl_single (dump_file, insn);
253 }
254 delete_insn (insn);
255 }
256 else
257 {
258 /* Set state to UNUSED if callee doesn't return 256bit AVX
259 register. */
260 if (avx256 != callee_return_pass_avx256)
261 state = unused;
262
263 if (avx256 == callee_return_pass_avx256
264 || avx256 == callee_pass_avx256)
265 {
266 /* Must remove vzeroupper since callee passes in 256bit
267 AVX register. */
268 if (dump_file)
269 {
270 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
271 print_rtl_single (dump_file, insn);
272 }
273 delete_insn (insn);
274 }
275 else
276 {
277 vzeroupper_insn = insn;
278 unchanged = false;
279 }
280 }
281 }
282
283 BLOCK_INFO (bb)->state = state;
284 BLOCK_INFO (bb)->unchanged = unchanged;
285 BLOCK_INFO (bb)->scanned = true;
286
287 if (dump_file)
288 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
289 bb->index, unchanged ? "unchanged" : "changed",
290 state);
291 }
292
293 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
294 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
 295    as UNUSED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
296 state is changed. */
297
298 static bool
299 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
300 {
301 edge e;
302 edge_iterator ei;
303 enum upper_128bits_state state, old_state, new_state;
304 bool seen_unknown;
305
306 if (dump_file)
307 fprintf (dump_file, " Process [bb %i]: status: %d\n",
308 block->index, BLOCK_INFO (block)->processed);
309
310 if (BLOCK_INFO (block)->processed)
311 return false;
312
313 state = unused;
314
315 /* Check all predecessor edges of this block. */
316 seen_unknown = false;
317 FOR_EACH_EDGE (e, ei, block->preds)
318 {
319 if (e->src == block)
320 continue;
321 switch (BLOCK_INFO (e->src)->state)
322 {
323 case unknown:
324 if (!unknown_is_unused)
325 seen_unknown = true;
326 case unused:
327 break;
328 case used:
329 state = used;
330 goto done;
331 }
332 }
333
334 if (seen_unknown)
335 state = unknown;
336
337 done:
338 old_state = BLOCK_INFO (block)->state;
339 move_or_delete_vzeroupper_2 (block, state);
340 new_state = BLOCK_INFO (block)->state;
341
342 if (state != unknown || new_state == used)
343 BLOCK_INFO (block)->processed = true;
344
345 /* Need to rescan if the upper 128bits of AVX registers are changed
346 to USED at exit. */
347 if (new_state != old_state)
348 {
349 if (new_state == used)
350 cfun->machine->rescan_vzeroupper_p = 1;
351 return true;
352 }
353 else
354 return false;
355 }
356
357 /* Go through the instruction stream looking for vzeroupper. Delete
358 it if upper 128bit AVX registers are unused. If it isn't deleted,
359 move it to just before a jump insn. */
360
361 static void
362 move_or_delete_vzeroupper (void)
363 {
364 edge e;
365 edge_iterator ei;
366 basic_block bb;
367 fibheap_t worklist, pending, fibheap_swap;
368 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
369 int *bb_order;
370 int *rc_order;
371 int i;
372
373 /* Set up block info for each basic block. */
374 alloc_aux_for_blocks (sizeof (struct block_info_def));
375
376 /* Process outgoing edges of entry point. */
377 if (dump_file)
378 fprintf (dump_file, "Process outgoing edges of entry point\n");
379
380 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
381 {
382 move_or_delete_vzeroupper_2 (e->dest,
383 cfun->machine->caller_pass_avx256_p
384 ? used : unused);
385 BLOCK_INFO (e->dest)->processed = true;
386 }
387
388 /* Compute reverse completion order of depth first search of the CFG
389 so that the data-flow runs faster. */
390 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
391 bb_order = XNEWVEC (int, last_basic_block);
392 pre_and_rev_post_order_compute (NULL, rc_order, false);
393 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
394 bb_order[rc_order[i]] = i;
395 free (rc_order);
396
397 worklist = fibheap_new ();
398 pending = fibheap_new ();
399 visited = sbitmap_alloc (last_basic_block);
400 in_worklist = sbitmap_alloc (last_basic_block);
401 in_pending = sbitmap_alloc (last_basic_block);
402 sbitmap_zero (in_worklist);
403
404 /* Don't check outgoing edges of entry point. */
405 sbitmap_ones (in_pending);
406 FOR_EACH_BB (bb)
407 if (BLOCK_INFO (bb)->processed)
408 RESET_BIT (in_pending, bb->index);
409 else
410 {
411 move_or_delete_vzeroupper_1 (bb, false);
412 fibheap_insert (pending, bb_order[bb->index], bb);
413 }
414
415 if (dump_file)
416 fprintf (dump_file, "Check remaining basic blocks\n");
417
418 while (!fibheap_empty (pending))
419 {
420 fibheap_swap = pending;
421 pending = worklist;
422 worklist = fibheap_swap;
423 sbitmap_swap = in_pending;
424 in_pending = in_worklist;
425 in_worklist = sbitmap_swap;
426
427 sbitmap_zero (visited);
428
429 cfun->machine->rescan_vzeroupper_p = 0;
430
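      /* Drain the current round: a block whose exit state changed pushes its
         unprocessed successors either back into this round (if they have not
         been visited yet) or into the pending heap for the next round.  */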
431 while (!fibheap_empty (worklist))
432 {
433 bb = (basic_block) fibheap_extract_min (worklist);
434 RESET_BIT (in_worklist, bb->index);
435 gcc_assert (!TEST_BIT (visited, bb->index));
436 if (!TEST_BIT (visited, bb->index))
437 {
438 edge_iterator ei;
439
440 SET_BIT (visited, bb->index);
441
442 if (move_or_delete_vzeroupper_1 (bb, false))
443 FOR_EACH_EDGE (e, ei, bb->succs)
444 {
445 if (e->dest == EXIT_BLOCK_PTR
446 || BLOCK_INFO (e->dest)->processed)
447 continue;
448
449 if (TEST_BIT (visited, e->dest->index))
450 {
451 if (!TEST_BIT (in_pending, e->dest->index))
452 {
453 /* Send E->DEST to next round. */
454 SET_BIT (in_pending, e->dest->index);
455 fibheap_insert (pending,
456 bb_order[e->dest->index],
457 e->dest);
458 }
459 }
460 else if (!TEST_BIT (in_worklist, e->dest->index))
461 {
462 /* Add E->DEST to current round. */
463 SET_BIT (in_worklist, e->dest->index);
464 fibheap_insert (worklist, bb_order[e->dest->index],
465 e->dest);
466 }
467 }
468 }
469 }
470
471 if (!cfun->machine->rescan_vzeroupper_p)
472 break;
473 }
474
475 free (bb_order);
476 fibheap_delete (worklist);
477 fibheap_delete (pending);
478 sbitmap_free (visited);
479 sbitmap_free (in_worklist);
480 sbitmap_free (in_pending);
481
482 if (dump_file)
483 fprintf (dump_file, "Process remaining basic blocks\n");
484
485 FOR_EACH_BB (bb)
486 move_or_delete_vzeroupper_1 (bb, true);
487
488 free_aux_for_blocks ();
489 }
490
491 static rtx legitimize_dllimport_symbol (rtx, bool);
492
493 #ifndef CHECK_STACK_LIMIT
494 #define CHECK_STACK_LIMIT (-1)
495 #endif
496
497 /* Return index of given mode in mult and division cost tables. */
498 #define MODE_INDEX(mode) \
499 ((mode) == QImode ? 0 \
500 : (mode) == HImode ? 1 \
501 : (mode) == SImode ? 2 \
502 : (mode) == DImode ? 3 \
503 : 4)
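/* For example, the per-mode multiply and divide cost arrays below are
   indexed as array[MODE_INDEX (mode)]: SImode selects entry 2, and any mode
   not listed above falls through to entry 4, the "other" slot.  */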
504
505 /* Processor costs (relative to an add) */
506 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
507 #define COSTS_N_BYTES(N) ((N) * 2)
508
509 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
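/* Fallback string-operation strategy: always call the library routine.  It
   is used below to fill strategy slots for which a processor description
   makes no specific choice.  */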
510
511 const
512 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
513 COSTS_N_BYTES (2), /* cost of an add instruction */
514 COSTS_N_BYTES (3), /* cost of a lea instruction */
515 COSTS_N_BYTES (2), /* variable shift costs */
516 COSTS_N_BYTES (3), /* constant shift costs */
517 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
518 COSTS_N_BYTES (3), /* HI */
519 COSTS_N_BYTES (3), /* SI */
520 COSTS_N_BYTES (3), /* DI */
521 COSTS_N_BYTES (5)}, /* other */
522 0, /* cost of multiply per each bit set */
523 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
524 COSTS_N_BYTES (3), /* HI */
525 COSTS_N_BYTES (3), /* SI */
526 COSTS_N_BYTES (3), /* DI */
527 COSTS_N_BYTES (5)}, /* other */
528 COSTS_N_BYTES (3), /* cost of movsx */
529 COSTS_N_BYTES (3), /* cost of movzx */
530 0, /* "large" insn */
531 2, /* MOVE_RATIO */
532 2, /* cost for loading QImode using movzbl */
533 {2, 2, 2}, /* cost of loading integer registers
534 in QImode, HImode and SImode.
535 Relative to reg-reg move (2). */
536 {2, 2, 2}, /* cost of storing integer registers */
537 2, /* cost of reg,reg fld/fst */
538 {2, 2, 2}, /* cost of loading fp registers
539 in SFmode, DFmode and XFmode */
540 {2, 2, 2}, /* cost of storing fp registers
541 in SFmode, DFmode and XFmode */
542 3, /* cost of moving MMX register */
543 {3, 3}, /* cost of loading MMX registers
544 in SImode and DImode */
545 {3, 3}, /* cost of storing MMX registers
546 in SImode and DImode */
547 3, /* cost of moving SSE register */
548 {3, 3, 3}, /* cost of loading SSE registers
549 in SImode, DImode and TImode */
550 {3, 3, 3}, /* cost of storing SSE registers
551 in SImode, DImode and TImode */
552 3, /* MMX or SSE register to integer */
553 0, /* size of l1 cache */
554 0, /* size of l2 cache */
555 0, /* size of prefetch block */
556 0, /* number of parallel prefetches */
557 2, /* Branch cost */
558 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
559 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
560 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
561 COSTS_N_BYTES (2), /* cost of FABS instruction. */
562 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
563 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
564 {{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
565 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}},
568 {{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
569 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
570 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
571 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}},
572 1, /* scalar_stmt_cost. */
573 1, /* scalar load_cost. */
574 1, /* scalar_store_cost. */
575 1, /* vec_stmt_cost. */
576 1, /* vec_to_scalar_cost. */
577 1, /* scalar_to_vec_cost. */
578 1, /* vec_align_load_cost. */
579 1, /* vec_unalign_load_cost. */
580 1, /* vec_store_cost. */
581 1, /* cond_taken_branch_cost. */
582 1, /* cond_not_taken_branch_cost. */
583 };
584
585 /* Processor costs (relative to an add) */
586 static const
587 struct processor_costs i386_cost = { /* 386 specific costs */
588 COSTS_N_INSNS (1), /* cost of an add instruction */
589 COSTS_N_INSNS (1), /* cost of a lea instruction */
590 COSTS_N_INSNS (3), /* variable shift costs */
591 COSTS_N_INSNS (2), /* constant shift costs */
592 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
593 COSTS_N_INSNS (6), /* HI */
594 COSTS_N_INSNS (6), /* SI */
595 COSTS_N_INSNS (6), /* DI */
596 COSTS_N_INSNS (6)}, /* other */
597 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
598 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
599 COSTS_N_INSNS (23), /* HI */
600 COSTS_N_INSNS (23), /* SI */
601 COSTS_N_INSNS (23), /* DI */
602 COSTS_N_INSNS (23)}, /* other */
603 COSTS_N_INSNS (3), /* cost of movsx */
604 COSTS_N_INSNS (2), /* cost of movzx */
605 15, /* "large" insn */
606 3, /* MOVE_RATIO */
607 4, /* cost for loading QImode using movzbl */
608 {2, 4, 2}, /* cost of loading integer registers
609 in QImode, HImode and SImode.
610 Relative to reg-reg move (2). */
611 {2, 4, 2}, /* cost of storing integer registers */
612 2, /* cost of reg,reg fld/fst */
613 {8, 8, 8}, /* cost of loading fp registers
614 in SFmode, DFmode and XFmode */
615 {8, 8, 8}, /* cost of storing fp registers
616 in SFmode, DFmode and XFmode */
617 2, /* cost of moving MMX register */
618 {4, 8}, /* cost of loading MMX registers
619 in SImode and DImode */
620 {4, 8}, /* cost of storing MMX registers
621 in SImode and DImode */
622 2, /* cost of moving SSE register */
623 {4, 8, 16}, /* cost of loading SSE registers
624 in SImode, DImode and TImode */
625 {4, 8, 16}, /* cost of storing SSE registers
626 in SImode, DImode and TImode */
627 3, /* MMX or SSE register to integer */
628 0, /* size of l1 cache */
629 0, /* size of l2 cache */
630 0, /* size of prefetch block */
631 0, /* number of parallel prefetches */
632 1, /* Branch cost */
633 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
634 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
635 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
636 COSTS_N_INSNS (22), /* cost of FABS instruction. */
637 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
638 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
639 {{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
640 DUMMY_STRINGOP_ALGS},
641 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
642 DUMMY_STRINGOP_ALGS}},
643 {{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
644 DUMMY_STRINGOP_ALGS},
645 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
646 DUMMY_STRINGOP_ALGS}},
647 1, /* scalar_stmt_cost. */
648 1, /* scalar load_cost. */
649 1, /* scalar_store_cost. */
650 1, /* vec_stmt_cost. */
651 1, /* vec_to_scalar_cost. */
652 1, /* scalar_to_vec_cost. */
653 1, /* vec_align_load_cost. */
654 2, /* vec_unalign_load_cost. */
655 1, /* vec_store_cost. */
656 3, /* cond_taken_branch_cost. */
657 1, /* cond_not_taken_branch_cost. */
658 };
659
660 static const
661 struct processor_costs i486_cost = { /* 486 specific costs */
662 COSTS_N_INSNS (1), /* cost of an add instruction */
663 COSTS_N_INSNS (1), /* cost of a lea instruction */
664 COSTS_N_INSNS (3), /* variable shift costs */
665 COSTS_N_INSNS (2), /* constant shift costs */
666 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
667 COSTS_N_INSNS (12), /* HI */
668 COSTS_N_INSNS (12), /* SI */
669 COSTS_N_INSNS (12), /* DI */
670 COSTS_N_INSNS (12)}, /* other */
671 1, /* cost of multiply per each bit set */
672 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
673 COSTS_N_INSNS (40), /* HI */
674 COSTS_N_INSNS (40), /* SI */
675 COSTS_N_INSNS (40), /* DI */
676 COSTS_N_INSNS (40)}, /* other */
677 COSTS_N_INSNS (3), /* cost of movsx */
678 COSTS_N_INSNS (2), /* cost of movzx */
679 15, /* "large" insn */
680 3, /* MOVE_RATIO */
681 4, /* cost for loading QImode using movzbl */
682 {2, 4, 2}, /* cost of loading integer registers
683 in QImode, HImode and SImode.
684 Relative to reg-reg move (2). */
685 {2, 4, 2}, /* cost of storing integer registers */
686 2, /* cost of reg,reg fld/fst */
687 {8, 8, 8}, /* cost of loading fp registers
688 in SFmode, DFmode and XFmode */
689 {8, 8, 8}, /* cost of storing fp registers
690 in SFmode, DFmode and XFmode */
691 2, /* cost of moving MMX register */
692 {4, 8}, /* cost of loading MMX registers
693 in SImode and DImode */
694 {4, 8}, /* cost of storing MMX registers
695 in SImode and DImode */
696 2, /* cost of moving SSE register */
697 {4, 8, 16}, /* cost of loading SSE registers
698 in SImode, DImode and TImode */
699 {4, 8, 16}, /* cost of storing SSE registers
700 in SImode, DImode and TImode */
701 3, /* MMX or SSE register to integer */
702 4, /* size of l1 cache. 486 has 8kB cache
703 shared for code and data, so 4kB is
704 not really precise. */
705 4, /* size of l2 cache */
706 0, /* size of prefetch block */
707 0, /* number of parallel prefetches */
708 1, /* Branch cost */
709 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
710 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
711 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
712 COSTS_N_INSNS (3), /* cost of FABS instruction. */
713 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
714 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
715 {{{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
716 DUMMY_STRINGOP_ALGS},
717 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
718 DUMMY_STRINGOP_ALGS}},
719 {{{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
720 DUMMY_STRINGOP_ALGS},
721 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
722 DUMMY_STRINGOP_ALGS}},
723 1, /* scalar_stmt_cost. */
724 1, /* scalar load_cost. */
725 1, /* scalar_store_cost. */
726 1, /* vec_stmt_cost. */
727 1, /* vec_to_scalar_cost. */
728 1, /* scalar_to_vec_cost. */
729 1, /* vec_align_load_cost. */
730 2, /* vec_unalign_load_cost. */
731 1, /* vec_store_cost. */
732 3, /* cond_taken_branch_cost. */
733 1, /* cond_not_taken_branch_cost. */
734 };
735
736 static const
737 struct processor_costs pentium_cost = {
738 COSTS_N_INSNS (1), /* cost of an add instruction */
739 COSTS_N_INSNS (1), /* cost of a lea instruction */
740 COSTS_N_INSNS (4), /* variable shift costs */
741 COSTS_N_INSNS (1), /* constant shift costs */
742 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
743 COSTS_N_INSNS (11), /* HI */
744 COSTS_N_INSNS (11), /* SI */
745 COSTS_N_INSNS (11), /* DI */
746 COSTS_N_INSNS (11)}, /* other */
747 0, /* cost of multiply per each bit set */
748 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
749 COSTS_N_INSNS (25), /* HI */
750 COSTS_N_INSNS (25), /* SI */
751 COSTS_N_INSNS (25), /* DI */
752 COSTS_N_INSNS (25)}, /* other */
753 COSTS_N_INSNS (3), /* cost of movsx */
754 COSTS_N_INSNS (2), /* cost of movzx */
755 8, /* "large" insn */
756 6, /* MOVE_RATIO */
757 6, /* cost for loading QImode using movzbl */
758 {2, 4, 2}, /* cost of loading integer registers
759 in QImode, HImode and SImode.
760 Relative to reg-reg move (2). */
761 {2, 4, 2}, /* cost of storing integer registers */
762 2, /* cost of reg,reg fld/fst */
763 {2, 2, 6}, /* cost of loading fp registers
764 in SFmode, DFmode and XFmode */
765 {4, 4, 6}, /* cost of storing fp registers
766 in SFmode, DFmode and XFmode */
767 8, /* cost of moving MMX register */
768 {8, 8}, /* cost of loading MMX registers
769 in SImode and DImode */
770 {8, 8}, /* cost of storing MMX registers
771 in SImode and DImode */
772 2, /* cost of moving SSE register */
773 {4, 8, 16}, /* cost of loading SSE registers
774 in SImode, DImode and TImode */
775 {4, 8, 16}, /* cost of storing SSE registers
776 in SImode, DImode and TImode */
777 3, /* MMX or SSE register to integer */
778 8, /* size of l1 cache. */
779 8, /* size of l2 cache */
780 0, /* size of prefetch block */
781 0, /* number of parallel prefetches */
782 2, /* Branch cost */
783 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
784 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
785 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
786 COSTS_N_INSNS (1), /* cost of FABS instruction. */
787 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
788 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
789 {{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
790 DUMMY_STRINGOP_ALGS},
791 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
792 DUMMY_STRINGOP_ALGS}},
793 {{{libcall, {{-1, rep_prefix_4_byte}}},
794 DUMMY_STRINGOP_ALGS},
795 {{libcall, {{-1, rep_prefix_4_byte}}},
796 DUMMY_STRINGOP_ALGS}},
797 1, /* scalar_stmt_cost. */
798 1, /* scalar load_cost. */
799 1, /* scalar_store_cost. */
800 1, /* vec_stmt_cost. */
801 1, /* vec_to_scalar_cost. */
802 1, /* scalar_to_vec_cost. */
803 1, /* vec_align_load_cost. */
804 2, /* vec_unalign_load_cost. */
805 1, /* vec_store_cost. */
806 3, /* cond_taken_branch_cost. */
807 1, /* cond_not_taken_branch_cost. */
808 };
809
810 static const
811 struct processor_costs pentiumpro_cost = {
812 COSTS_N_INSNS (1), /* cost of an add instruction */
813 COSTS_N_INSNS (1), /* cost of a lea instruction */
814 COSTS_N_INSNS (1), /* variable shift costs */
815 COSTS_N_INSNS (1), /* constant shift costs */
816 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
817 COSTS_N_INSNS (4), /* HI */
818 COSTS_N_INSNS (4), /* SI */
819 COSTS_N_INSNS (4), /* DI */
820 COSTS_N_INSNS (4)}, /* other */
821 0, /* cost of multiply per each bit set */
822 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
823 COSTS_N_INSNS (17), /* HI */
824 COSTS_N_INSNS (17), /* SI */
825 COSTS_N_INSNS (17), /* DI */
826 COSTS_N_INSNS (17)}, /* other */
827 COSTS_N_INSNS (1), /* cost of movsx */
828 COSTS_N_INSNS (1), /* cost of movzx */
829 8, /* "large" insn */
830 6, /* MOVE_RATIO */
831 2, /* cost for loading QImode using movzbl */
832 {4, 4, 4}, /* cost of loading integer registers
833 in QImode, HImode and SImode.
834 Relative to reg-reg move (2). */
835 {2, 2, 2}, /* cost of storing integer registers */
836 2, /* cost of reg,reg fld/fst */
837 {2, 2, 6}, /* cost of loading fp registers
838 in SFmode, DFmode and XFmode */
839 {4, 4, 6}, /* cost of storing fp registers
840 in SFmode, DFmode and XFmode */
841 2, /* cost of moving MMX register */
842 {2, 2}, /* cost of loading MMX registers
843 in SImode and DImode */
844 {2, 2}, /* cost of storing MMX registers
845 in SImode and DImode */
846 2, /* cost of moving SSE register */
847 {2, 2, 8}, /* cost of loading SSE registers
848 in SImode, DImode and TImode */
849 {2, 2, 8}, /* cost of storing SSE registers
850 in SImode, DImode and TImode */
851 3, /* MMX or SSE register to integer */
852 8, /* size of l1 cache. */
853 256, /* size of l2 cache */
854 32, /* size of prefetch block */
855 6, /* number of parallel prefetches */
856 2, /* Branch cost */
857 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
858 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
859 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
860 COSTS_N_INSNS (2), /* cost of FABS instruction. */
861 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
862 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
863 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 864      (we ensure the alignment).  For small blocks an inline loop is still a
 865      noticeable win; for bigger blocks either rep movsl or rep movsb is the
 866      way to go.  Rep movsb apparently has a more expensive startup time in the
 867      CPU, but after 4K the difference is down in the noise.  */
868 {{{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
869 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
870 DUMMY_STRINGOP_ALGS},
871 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
872 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
873 DUMMY_STRINGOP_ALGS}},
874 {{{rep_prefix_4_byte, {{1024, unrolled_loop},
875 {8192, rep_prefix_4_byte}, {-1, libcall}}},
876 DUMMY_STRINGOP_ALGS},
877 {{rep_prefix_4_byte, {{1024, unrolled_loop},
878 {8192, rep_prefix_4_byte}, {-1, libcall}}},
879 DUMMY_STRINGOP_ALGS}},
880 1, /* scalar_stmt_cost. */
881 1, /* scalar load_cost. */
882 1, /* scalar_store_cost. */
883 1, /* vec_stmt_cost. */
884 1, /* vec_to_scalar_cost. */
885 1, /* scalar_to_vec_cost. */
886 1, /* vec_align_load_cost. */
887 2, /* vec_unalign_load_cost. */
888 1, /* vec_store_cost. */
889 3, /* cond_taken_branch_cost. */
890 1, /* cond_not_taken_branch_cost. */
891 };
892
893 static const
894 struct processor_costs geode_cost = {
895 COSTS_N_INSNS (1), /* cost of an add instruction */
896 COSTS_N_INSNS (1), /* cost of a lea instruction */
897 COSTS_N_INSNS (2), /* variable shift costs */
898 COSTS_N_INSNS (1), /* constant shift costs */
899 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
900 COSTS_N_INSNS (4), /* HI */
901 COSTS_N_INSNS (7), /* SI */
902 COSTS_N_INSNS (7), /* DI */
903 COSTS_N_INSNS (7)}, /* other */
904 0, /* cost of multiply per each bit set */
905 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
906 COSTS_N_INSNS (23), /* HI */
907 COSTS_N_INSNS (39), /* SI */
908 COSTS_N_INSNS (39), /* DI */
909 COSTS_N_INSNS (39)}, /* other */
910 COSTS_N_INSNS (1), /* cost of movsx */
911 COSTS_N_INSNS (1), /* cost of movzx */
912 8, /* "large" insn */
913 4, /* MOVE_RATIO */
914 1, /* cost for loading QImode using movzbl */
915 {1, 1, 1}, /* cost of loading integer registers
916 in QImode, HImode and SImode.
917 Relative to reg-reg move (2). */
918 {1, 1, 1}, /* cost of storing integer registers */
919 1, /* cost of reg,reg fld/fst */
920 {1, 1, 1}, /* cost of loading fp registers
921 in SFmode, DFmode and XFmode */
922 {4, 6, 6}, /* cost of storing fp registers
923 in SFmode, DFmode and XFmode */
924
925 1, /* cost of moving MMX register */
926 {1, 1}, /* cost of loading MMX registers
927 in SImode and DImode */
928 {1, 1}, /* cost of storing MMX registers
929 in SImode and DImode */
930 1, /* cost of moving SSE register */
931 {1, 1, 1}, /* cost of loading SSE registers
932 in SImode, DImode and TImode */
933 {1, 1, 1}, /* cost of storing SSE registers
934 in SImode, DImode and TImode */
935 1, /* MMX or SSE register to integer */
936 64, /* size of l1 cache. */
937 128, /* size of l2 cache. */
938 32, /* size of prefetch block */
939 1, /* number of parallel prefetches */
940 1, /* Branch cost */
941 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
942 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
943 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
944 COSTS_N_INSNS (1), /* cost of FABS instruction. */
945 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
946 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
947 {{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
948 DUMMY_STRINGOP_ALGS},
949 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
950 DUMMY_STRINGOP_ALGS}},
951 {{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
952 DUMMY_STRINGOP_ALGS},
953 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
954 DUMMY_STRINGOP_ALGS}},
955 1, /* scalar_stmt_cost. */
956 1, /* scalar load_cost. */
957 1, /* scalar_store_cost. */
958 1, /* vec_stmt_cost. */
959 1, /* vec_to_scalar_cost. */
960 1, /* scalar_to_vec_cost. */
961 1, /* vec_align_load_cost. */
962 2, /* vec_unalign_load_cost. */
963 1, /* vec_store_cost. */
964 3, /* cond_taken_branch_cost. */
965 1, /* cond_not_taken_branch_cost. */
966 };
967
968 static const
969 struct processor_costs k6_cost = {
970 COSTS_N_INSNS (1), /* cost of an add instruction */
971 COSTS_N_INSNS (2), /* cost of a lea instruction */
972 COSTS_N_INSNS (1), /* variable shift costs */
973 COSTS_N_INSNS (1), /* constant shift costs */
974 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
975 COSTS_N_INSNS (3), /* HI */
976 COSTS_N_INSNS (3), /* SI */
977 COSTS_N_INSNS (3), /* DI */
978 COSTS_N_INSNS (3)}, /* other */
979 0, /* cost of multiply per each bit set */
980 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
981 COSTS_N_INSNS (18), /* HI */
982 COSTS_N_INSNS (18), /* SI */
983 COSTS_N_INSNS (18), /* DI */
984 COSTS_N_INSNS (18)}, /* other */
985 COSTS_N_INSNS (2), /* cost of movsx */
986 COSTS_N_INSNS (2), /* cost of movzx */
987 8, /* "large" insn */
988 4, /* MOVE_RATIO */
989 3, /* cost for loading QImode using movzbl */
990 {4, 5, 4}, /* cost of loading integer registers
991 in QImode, HImode and SImode.
992 Relative to reg-reg move (2). */
993 {2, 3, 2}, /* cost of storing integer registers */
994 4, /* cost of reg,reg fld/fst */
995 {6, 6, 6}, /* cost of loading fp registers
996 in SFmode, DFmode and XFmode */
997 {4, 4, 4}, /* cost of storing fp registers
998 in SFmode, DFmode and XFmode */
999 2, /* cost of moving MMX register */
1000 {2, 2}, /* cost of loading MMX registers
1001 in SImode and DImode */
1002 {2, 2}, /* cost of storing MMX registers
1003 in SImode and DImode */
1004 2, /* cost of moving SSE register */
1005 {2, 2, 8}, /* cost of loading SSE registers
1006 in SImode, DImode and TImode */
1007 {2, 2, 8}, /* cost of storing SSE registers
1008 in SImode, DImode and TImode */
1009 6, /* MMX or SSE register to integer */
1010 32, /* size of l1 cache. */
1011 32, /* size of l2 cache. Some models
1012 have integrated l2 cache, but
1013 optimizing for k6 is not important
1014 enough to worry about that. */
1015 32, /* size of prefetch block */
1016 1, /* number of parallel prefetches */
1017 1, /* Branch cost */
1018 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
1019 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
1020 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
1021 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1022 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1023 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
1024 {{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1025 DUMMY_STRINGOP_ALGS},
1026 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1027 DUMMY_STRINGOP_ALGS}},
1028 {{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1029 DUMMY_STRINGOP_ALGS},
1030 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1031 DUMMY_STRINGOP_ALGS}},
1032 1, /* scalar_stmt_cost. */
1033 1, /* scalar load_cost. */
1034 1, /* scalar_store_cost. */
1035 1, /* vec_stmt_cost. */
1036 1, /* vec_to_scalar_cost. */
1037 1, /* scalar_to_vec_cost. */
1038 1, /* vec_align_load_cost. */
1039 2, /* vec_unalign_load_cost. */
1040 1, /* vec_store_cost. */
1041 3, /* cond_taken_branch_cost. */
1042 1, /* cond_not_taken_branch_cost. */
1043 };
1044
1045 static const
1046 struct processor_costs athlon_cost = {
1047 COSTS_N_INSNS (1), /* cost of an add instruction */
1048 COSTS_N_INSNS (2), /* cost of a lea instruction */
1049 COSTS_N_INSNS (1), /* variable shift costs */
1050 COSTS_N_INSNS (1), /* constant shift costs */
1051 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1052 COSTS_N_INSNS (5), /* HI */
1053 COSTS_N_INSNS (5), /* SI */
1054 COSTS_N_INSNS (5), /* DI */
1055 COSTS_N_INSNS (5)}, /* other */
1056 0, /* cost of multiply per each bit set */
1057 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1058 COSTS_N_INSNS (26), /* HI */
1059 COSTS_N_INSNS (42), /* SI */
1060 COSTS_N_INSNS (74), /* DI */
1061 COSTS_N_INSNS (74)}, /* other */
1062 COSTS_N_INSNS (1), /* cost of movsx */
1063 COSTS_N_INSNS (1), /* cost of movzx */
1064 8, /* "large" insn */
1065 9, /* MOVE_RATIO */
1066 4, /* cost for loading QImode using movzbl */
1067 {3, 4, 3}, /* cost of loading integer registers
1068 in QImode, HImode and SImode.
1069 Relative to reg-reg move (2). */
1070 {3, 4, 3}, /* cost of storing integer registers */
1071 4, /* cost of reg,reg fld/fst */
1072 {4, 4, 12}, /* cost of loading fp registers
1073 in SFmode, DFmode and XFmode */
1074 {6, 6, 8}, /* cost of storing fp registers
1075 in SFmode, DFmode and XFmode */
1076 2, /* cost of moving MMX register */
1077 {4, 4}, /* cost of loading MMX registers
1078 in SImode and DImode */
1079 {4, 4}, /* cost of storing MMX registers
1080 in SImode and DImode */
1081 2, /* cost of moving SSE register */
1082 {4, 4, 6}, /* cost of loading SSE registers
1083 in SImode, DImode and TImode */
1084 {4, 4, 5}, /* cost of storing SSE registers
1085 in SImode, DImode and TImode */
1086 5, /* MMX or SSE register to integer */
1087 64, /* size of l1 cache. */
1088 256, /* size of l2 cache. */
1089 64, /* size of prefetch block */
1090 6, /* number of parallel prefetches */
1091 5, /* Branch cost */
1092 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1093 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1094 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1095 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1096 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1097 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
 1098   /* For some reason, Athlon deals better with the REP prefix (relative to
 1099      loops) than K8 does.  Alignment becomes important after 8 bytes for
 1100      memcpy and 128 bytes for memset.  */
1101 {{{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1102 DUMMY_STRINGOP_ALGS},
1103 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1104 DUMMY_STRINGOP_ALGS}},
1105 {{{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1106 DUMMY_STRINGOP_ALGS},
1107 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1108 DUMMY_STRINGOP_ALGS}},
1109 1, /* scalar_stmt_cost. */
1110 1, /* scalar load_cost. */
1111 1, /* scalar_store_cost. */
1112 1, /* vec_stmt_cost. */
1113 1, /* vec_to_scalar_cost. */
1114 1, /* scalar_to_vec_cost. */
1115 1, /* vec_align_load_cost. */
1116 2, /* vec_unalign_load_cost. */
1117 1, /* vec_store_cost. */
1118 3, /* cond_taken_branch_cost. */
1119 1, /* cond_not_taken_branch_cost. */
1120 };
1121
1122 static const
1123 struct processor_costs k8_cost = {
1124 COSTS_N_INSNS (1), /* cost of an add instruction */
1125 COSTS_N_INSNS (2), /* cost of a lea instruction */
1126 COSTS_N_INSNS (1), /* variable shift costs */
1127 COSTS_N_INSNS (1), /* constant shift costs */
1128 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1129 COSTS_N_INSNS (4), /* HI */
1130 COSTS_N_INSNS (3), /* SI */
1131 COSTS_N_INSNS (4), /* DI */
1132 COSTS_N_INSNS (5)}, /* other */
1133 0, /* cost of multiply per each bit set */
1134 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1135 COSTS_N_INSNS (26), /* HI */
1136 COSTS_N_INSNS (42), /* SI */
1137 COSTS_N_INSNS (74), /* DI */
1138 COSTS_N_INSNS (74)}, /* other */
1139 COSTS_N_INSNS (1), /* cost of movsx */
1140 COSTS_N_INSNS (1), /* cost of movzx */
1141 8, /* "large" insn */
1142 9, /* MOVE_RATIO */
1143 4, /* cost for loading QImode using movzbl */
1144 {3, 4, 3}, /* cost of loading integer registers
1145 in QImode, HImode and SImode.
1146 Relative to reg-reg move (2). */
1147 {3, 4, 3}, /* cost of storing integer registers */
1148 4, /* cost of reg,reg fld/fst */
1149 {4, 4, 12}, /* cost of loading fp registers
1150 in SFmode, DFmode and XFmode */
1151 {6, 6, 8}, /* cost of storing fp registers
1152 in SFmode, DFmode and XFmode */
1153 2, /* cost of moving MMX register */
1154 {3, 3}, /* cost of loading MMX registers
1155 in SImode and DImode */
1156 {4, 4}, /* cost of storing MMX registers
1157 in SImode and DImode */
1158 2, /* cost of moving SSE register */
1159 {4, 3, 6}, /* cost of loading SSE registers
1160 in SImode, DImode and TImode */
1161 {4, 4, 5}, /* cost of storing SSE registers
1162 in SImode, DImode and TImode */
1163 5, /* MMX or SSE register to integer */
1164 64, /* size of l1 cache. */
1165 512, /* size of l2 cache. */
1166 64, /* size of prefetch block */
1167 /* New AMD processors never drop prefetches; if they cannot be performed
1168 immediately, they are queued. We set number of simultaneous prefetches
1169 to a large constant to reflect this (it probably is not a good idea not
1170 to limit number of prefetches at all, as their execution also takes some
1171 time). */
1172 100, /* number of parallel prefetches */
1173 3, /* Branch cost */
1174 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1175 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1176 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1177 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1178 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1179 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
 1180   /* K8 has an optimized REP instruction for medium-sized blocks, but for
 1181      very small blocks it is better to use a loop.  For large blocks, a
 1182      libcall can do nontemporal accesses and beat inline code considerably.  */
1183 {{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1184 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1185 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1186 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
1187 {{{libcall, {{8, loop}, {24, unrolled_loop},
1188 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1189 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1190 {{libcall, {{8, loop}, {24, unrolled_loop},
1191 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1192 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
1193 4, /* scalar_stmt_cost. */
1194 2, /* scalar load_cost. */
1195 2, /* scalar_store_cost. */
1196 5, /* vec_stmt_cost. */
1197 0, /* vec_to_scalar_cost. */
1198 2, /* scalar_to_vec_cost. */
1199 2, /* vec_align_load_cost. */
1200 3, /* vec_unalign_load_cost. */
1201 3, /* vec_store_cost. */
1202 3, /* cond_taken_branch_cost. */
1203 2, /* cond_not_taken_branch_cost. */
1204 };
1205
1206 struct processor_costs amdfam10_cost = {
1207 COSTS_N_INSNS (1), /* cost of an add instruction */
1208 COSTS_N_INSNS (2), /* cost of a lea instruction */
1209 COSTS_N_INSNS (1), /* variable shift costs */
1210 COSTS_N_INSNS (1), /* constant shift costs */
1211 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1212 COSTS_N_INSNS (4), /* HI */
1213 COSTS_N_INSNS (3), /* SI */
1214 COSTS_N_INSNS (4), /* DI */
1215 COSTS_N_INSNS (5)}, /* other */
1216 0, /* cost of multiply per each bit set */
1217 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1218 COSTS_N_INSNS (35), /* HI */
1219 COSTS_N_INSNS (51), /* SI */
1220 COSTS_N_INSNS (83), /* DI */
1221 COSTS_N_INSNS (83)}, /* other */
1222 COSTS_N_INSNS (1), /* cost of movsx */
1223 COSTS_N_INSNS (1), /* cost of movzx */
1224 8, /* "large" insn */
1225 9, /* MOVE_RATIO */
1226 4, /* cost for loading QImode using movzbl */
1227 {3, 4, 3}, /* cost of loading integer registers
1228 in QImode, HImode and SImode.
1229 Relative to reg-reg move (2). */
1230 {3, 4, 3}, /* cost of storing integer registers */
1231 4, /* cost of reg,reg fld/fst */
1232 {4, 4, 12}, /* cost of loading fp registers
1233 in SFmode, DFmode and XFmode */
1234 {6, 6, 8}, /* cost of storing fp registers
1235 in SFmode, DFmode and XFmode */
1236 2, /* cost of moving MMX register */
1237 {3, 3}, /* cost of loading MMX registers
1238 in SImode and DImode */
1239 {4, 4}, /* cost of storing MMX registers
1240 in SImode and DImode */
1241 2, /* cost of moving SSE register */
1242 {4, 4, 3}, /* cost of loading SSE registers
1243 in SImode, DImode and TImode */
1244 {4, 4, 5}, /* cost of storing SSE registers
1245 in SImode, DImode and TImode */
1246 3, /* MMX or SSE register to integer */
1247 /* On K8:
1248 MOVD reg64, xmmreg Double FSTORE 4
1249 MOVD reg32, xmmreg Double FSTORE 4
1250 On AMDFAM10:
1251 MOVD reg64, xmmreg Double FADD 3
1252 1/1 1/1
1253 MOVD reg32, xmmreg Double FADD 3
1254 1/1 1/1 */
1255 64, /* size of l1 cache. */
1256 512, /* size of l2 cache. */
1257 64, /* size of prefetch block */
1258 /* New AMD processors never drop prefetches; if they cannot be performed
1259 immediately, they are queued. We set number of simultaneous prefetches
1260 to a large constant to reflect this (it probably is not a good idea not
1261 to limit number of prefetches at all, as their execution also takes some
1262 time). */
1263 100, /* number of parallel prefetches */
1264 2, /* Branch cost */
1265 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1266 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1267 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1268 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1269 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1270 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1271
 1272   /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but
 1273      for very small blocks it is better to use a loop.  For large blocks, a
 1274      libcall can do nontemporal accesses and beat inline code considerably.  */
1275 {{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1276 {libcall, {{16, loop}, {512, rep_prefix_8_byte}, {-1, libcall}}}},
1277 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1278 {libcall, {{16, loop}, {512, rep_prefix_8_byte}, {-1, libcall}}}}},
1279 {{{libcall, {{8, loop}, {24, unrolled_loop},
1280 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1281 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1282 {{libcall, {{8, loop}, {24, unrolled_loop},
1283 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1284 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
1285 4, /* scalar_stmt_cost. */
1286 2, /* scalar load_cost. */
1287 2, /* scalar_store_cost. */
1288 6, /* vec_stmt_cost. */
1289 0, /* vec_to_scalar_cost. */
1290 2, /* scalar_to_vec_cost. */
1291 2, /* vec_align_load_cost. */
1292 2, /* vec_unalign_load_cost. */
1293 2, /* vec_store_cost. */
1294 2, /* cond_taken_branch_cost. */
1295 1, /* cond_not_taken_branch_cost. */
1296 };
1297
1298 struct processor_costs bdver1_cost = {
1299 COSTS_N_INSNS (1), /* cost of an add instruction */
1300 COSTS_N_INSNS (1), /* cost of a lea instruction */
1301 COSTS_N_INSNS (1), /* variable shift costs */
1302 COSTS_N_INSNS (1), /* constant shift costs */
1303 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1304 COSTS_N_INSNS (4), /* HI */
1305 COSTS_N_INSNS (4), /* SI */
1306 COSTS_N_INSNS (6), /* DI */
1307 COSTS_N_INSNS (6)}, /* other */
1308 0, /* cost of multiply per each bit set */
1309 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1310 COSTS_N_INSNS (35), /* HI */
1311 COSTS_N_INSNS (51), /* SI */
1312 COSTS_N_INSNS (83), /* DI */
1313 COSTS_N_INSNS (83)}, /* other */
1314 COSTS_N_INSNS (1), /* cost of movsx */
1315 COSTS_N_INSNS (1), /* cost of movzx */
1316 8, /* "large" insn */
1317 9, /* MOVE_RATIO */
1318 4, /* cost for loading QImode using movzbl */
1319 {5, 5, 4}, /* cost of loading integer registers
1320 in QImode, HImode and SImode.
1321 Relative to reg-reg move (2). */
1322 {4, 4, 4}, /* cost of storing integer registers */
1323 2, /* cost of reg,reg fld/fst */
1324 {5, 5, 12}, /* cost of loading fp registers
1325 in SFmode, DFmode and XFmode */
1326 {4, 4, 8}, /* cost of storing fp registers
1327 in SFmode, DFmode and XFmode */
1328 2, /* cost of moving MMX register */
1329 {4, 4}, /* cost of loading MMX registers
1330 in SImode and DImode */
1331 {4, 4}, /* cost of storing MMX registers
1332 in SImode and DImode */
1333 2, /* cost of moving SSE register */
1334 {4, 4, 4}, /* cost of loading SSE registers
1335 in SImode, DImode and TImode */
1336 {4, 4, 4}, /* cost of storing SSE registers
1337 in SImode, DImode and TImode */
1338 2, /* MMX or SSE register to integer */
1339 /* On K8:
1340 MOVD reg64, xmmreg Double FSTORE 4
1341 MOVD reg32, xmmreg Double FSTORE 4
1342 On AMDFAM10:
1343 MOVD reg64, xmmreg Double FADD 3
1344 1/1 1/1
1345 MOVD reg32, xmmreg Double FADD 3
1346 1/1 1/1 */
1347 16, /* size of l1 cache. */
1348 2048, /* size of l2 cache. */
1349 64, /* size of prefetch block */
1350 /* New AMD processors never drop prefetches; if they cannot be performed
1351 immediately, they are queued. We set number of simultaneous prefetches
1352 to a large constant to reflect this (it probably is not a good idea not
1353 to limit number of prefetches at all, as their execution also takes some
1354 time). */
1355 100, /* number of parallel prefetches */
1356 2, /* Branch cost */
1357 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1358 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1359 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1360 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1361 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1362 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1363
 1364   /* BDVER1 has an optimized REP instruction for medium-sized blocks, but
 1365      for very small blocks it is better to use a loop.  For large blocks, a
 1366      libcall can do nontemporal accesses and beat inline code considerably.  */
1367 {{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1368 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1369 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1370 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
1371 {{{libcall, {{8, loop}, {24, unrolled_loop},
1372 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1373 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1374 {{libcall, {{8, loop}, {24, unrolled_loop},
1375 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1376 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
1377 6, /* scalar_stmt_cost. */
1378 4, /* scalar load_cost. */
1379 4, /* scalar_store_cost. */
1380 6, /* vec_stmt_cost. */
1381 0, /* vec_to_scalar_cost. */
1382 2, /* scalar_to_vec_cost. */
1383 4, /* vec_align_load_cost. */
1384 4, /* vec_unalign_load_cost. */
1385 4, /* vec_store_cost. */
1386 2, /* cond_taken_branch_cost. */
1387 1, /* cond_not_taken_branch_cost. */
1388 };
1389
1390 struct processor_costs bdver2_cost = {
1391 COSTS_N_INSNS (1), /* cost of an add instruction */
1392 COSTS_N_INSNS (1), /* cost of a lea instruction */
1393 COSTS_N_INSNS (1), /* variable shift costs */
1394 COSTS_N_INSNS (1), /* constant shift costs */
1395 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1396 COSTS_N_INSNS (4), /* HI */
1397 COSTS_N_INSNS (4), /* SI */
1398 COSTS_N_INSNS (6), /* DI */
1399 COSTS_N_INSNS (6)}, /* other */
1400 0, /* cost of multiply per each bit set */
1401 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1402 COSTS_N_INSNS (35), /* HI */
1403 COSTS_N_INSNS (51), /* SI */
1404 COSTS_N_INSNS (83), /* DI */
1405 COSTS_N_INSNS (83)}, /* other */
1406 COSTS_N_INSNS (1), /* cost of movsx */
1407 COSTS_N_INSNS (1), /* cost of movzx */
1408 8, /* "large" insn */
1409 9, /* MOVE_RATIO */
1410 4, /* cost for loading QImode using movzbl */
1411 {5, 5, 4}, /* cost of loading integer registers
1412 in QImode, HImode and SImode.
1413 Relative to reg-reg move (2). */
1414 {4, 4, 4}, /* cost of storing integer registers */
1415 2, /* cost of reg,reg fld/fst */
1416 {5, 5, 12}, /* cost of loading fp registers
1417 in SFmode, DFmode and XFmode */
1418 {4, 4, 8}, /* cost of storing fp registers
1419 in SFmode, DFmode and XFmode */
1420 2, /* cost of moving MMX register */
1421 {4, 4}, /* cost of loading MMX registers
1422 in SImode and DImode */
1423 {4, 4}, /* cost of storing MMX registers
1424 in SImode and DImode */
1425 2, /* cost of moving SSE register */
1426 {4, 4, 4}, /* cost of loading SSE registers
1427 in SImode, DImode and TImode */
1428 {4, 4, 4}, /* cost of storing SSE registers
1429 in SImode, DImode and TImode */
1430 2, /* MMX or SSE register to integer */
1431 /* On K8:
1432 MOVD reg64, xmmreg Double FSTORE 4
1433 MOVD reg32, xmmreg Double FSTORE 4
1434 On AMDFAM10:
1435 MOVD reg64, xmmreg Double FADD 3
1436 1/1 1/1
1437 MOVD reg32, xmmreg Double FADD 3
1438 1/1 1/1 */
1439 16, /* size of l1 cache. */
1440 2048, /* size of l2 cache. */
1441 64, /* size of prefetch block */
1442 /* New AMD processors never drop prefetches; if they cannot be performed
1443 immediately, they are queued. We set number of simultaneous prefetches
1444 to a large constant to reflect this (it probably is not a good idea not
1445 to limit number of prefetches at all, as their execution also takes some
1446 time). */
1447 100, /* number of parallel prefetches */
1448 2, /* Branch cost */
1449 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1450 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1451 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1452 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1453 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1454 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1455
 1456   /* BDVER2 has an optimized REP instruction for medium-sized blocks, but
 1457      for very small blocks it is better to use a loop.  For large blocks, a
 1458      libcall can do nontemporal accesses and beat inline code considerably.  */
1459 {{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1460 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1461 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1462 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
1463 {{{libcall, {{8, loop}, {24, unrolled_loop},
1464 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1465 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1466 {{libcall, {{8, loop}, {24, unrolled_loop},
1467 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1468 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
1469 6, /* scalar_stmt_cost. */
1470 4, /* scalar load_cost. */
1471 4, /* scalar_store_cost. */
1472 6, /* vec_stmt_cost. */
1473 0, /* vec_to_scalar_cost. */
1474 2, /* scalar_to_vec_cost. */
1475 4, /* vec_align_load_cost. */
1476 4, /* vec_unalign_load_cost. */
1477 4, /* vec_store_cost. */
1478 2, /* cond_taken_branch_cost. */
1479 1, /* cond_not_taken_branch_cost. */
1480 };
1481
1482 struct processor_costs btver1_cost = {
1483 COSTS_N_INSNS (1), /* cost of an add instruction */
1484 COSTS_N_INSNS (2), /* cost of a lea instruction */
1485 COSTS_N_INSNS (1), /* variable shift costs */
1486 COSTS_N_INSNS (1), /* constant shift costs */
1487 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1488 COSTS_N_INSNS (4), /* HI */
1489 COSTS_N_INSNS (3), /* SI */
1490 COSTS_N_INSNS (4), /* DI */
1491 COSTS_N_INSNS (5)}, /* other */
1492 0, /* cost of multiply per each bit set */
1493 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1494 COSTS_N_INSNS (35), /* HI */
1495 COSTS_N_INSNS (51), /* SI */
1496 COSTS_N_INSNS (83), /* DI */
1497 COSTS_N_INSNS (83)}, /* other */
1498 COSTS_N_INSNS (1), /* cost of movsx */
1499 COSTS_N_INSNS (1), /* cost of movzx */
1500 8, /* "large" insn */
1501 9, /* MOVE_RATIO */
1502 4, /* cost for loading QImode using movzbl */
1503 {3, 4, 3}, /* cost of loading integer registers
1504 in QImode, HImode and SImode.
1505 Relative to reg-reg move (2). */
1506 {3, 4, 3}, /* cost of storing integer registers */
1507 4, /* cost of reg,reg fld/fst */
1508 {4, 4, 12}, /* cost of loading fp registers
1509 in SFmode, DFmode and XFmode */
1510 {6, 6, 8}, /* cost of storing fp registers
1511 in SFmode, DFmode and XFmode */
1512 2, /* cost of moving MMX register */
1513 {3, 3}, /* cost of loading MMX registers
1514 in SImode and DImode */
1515 {4, 4}, /* cost of storing MMX registers
1516 in SImode and DImode */
1517 2, /* cost of moving SSE register */
1518 {4, 4, 3}, /* cost of loading SSE registers
1519 in SImode, DImode and TImode */
1520 {4, 4, 5}, /* cost of storing SSE registers
1521 in SImode, DImode and TImode */
1522 3, /* MMX or SSE register to integer */
1523 /* On K8:
1524 MOVD reg64, xmmreg Double FSTORE 4
1525 MOVD reg32, xmmreg Double FSTORE 4
1526 On AMDFAM10:
1527 MOVD reg64, xmmreg Double FADD 3
1528 1/1 1/1
1529 MOVD reg32, xmmreg Double FADD 3
1530 1/1 1/1 */
1531 32, /* size of l1 cache. */
1532 512, /* size of l2 cache. */
1533 64, /* size of prefetch block */
1534 100, /* number of parallel prefetches */
1535 2, /* Branch cost */
1536 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1537 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1538 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1539 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1540 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1541 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1542
1543 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1544 very small blocks it is better to use a loop. For large blocks, a libcall
1545 can do nontemporal accesses and beat inline code considerably. */
1546 {{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1547 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1548 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1549 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
1550 {{{libcall, {{8, loop}, {24, unrolled_loop},
1551 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1552 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1553 {{libcall, {{8, loop}, {24, unrolled_loop},
1554 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1555 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
1556 4, /* scalar_stmt_cost. */
1557 2, /* scalar load_cost. */
1558 2, /* scalar_store_cost. */
1559 6, /* vec_stmt_cost. */
1560 0, /* vec_to_scalar_cost. */
1561 2, /* scalar_to_vec_cost. */
1562 2, /* vec_align_load_cost. */
1563 2, /* vec_unalign_load_cost. */
1564 2, /* vec_store_cost. */
1565 2, /* cond_taken_branch_cost. */
1566 1, /* cond_not_taken_branch_cost. */
1567 };
1568
1569 static const
1570 struct processor_costs pentium4_cost = {
1571 COSTS_N_INSNS (1), /* cost of an add instruction */
1572 COSTS_N_INSNS (3), /* cost of a lea instruction */
1573 COSTS_N_INSNS (4), /* variable shift costs */
1574 COSTS_N_INSNS (4), /* constant shift costs */
1575 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1576 COSTS_N_INSNS (15), /* HI */
1577 COSTS_N_INSNS (15), /* SI */
1578 COSTS_N_INSNS (15), /* DI */
1579 COSTS_N_INSNS (15)}, /* other */
1580 0, /* cost of multiply per each bit set */
1581 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1582 COSTS_N_INSNS (56), /* HI */
1583 COSTS_N_INSNS (56), /* SI */
1584 COSTS_N_INSNS (56), /* DI */
1585 COSTS_N_INSNS (56)}, /* other */
1586 COSTS_N_INSNS (1), /* cost of movsx */
1587 COSTS_N_INSNS (1), /* cost of movzx */
1588 16, /* "large" insn */
1589 6, /* MOVE_RATIO */
1590 2, /* cost for loading QImode using movzbl */
1591 {4, 5, 4}, /* cost of loading integer registers
1592 in QImode, HImode and SImode.
1593 Relative to reg-reg move (2). */
1594 {2, 3, 2}, /* cost of storing integer registers */
1595 2, /* cost of reg,reg fld/fst */
1596 {2, 2, 6}, /* cost of loading fp registers
1597 in SFmode, DFmode and XFmode */
1598 {4, 4, 6}, /* cost of storing fp registers
1599 in SFmode, DFmode and XFmode */
1600 2, /* cost of moving MMX register */
1601 {2, 2}, /* cost of loading MMX registers
1602 in SImode and DImode */
1603 {2, 2}, /* cost of storing MMX registers
1604 in SImode and DImode */
1605 12, /* cost of moving SSE register */
1606 {12, 12, 12}, /* cost of loading SSE registers
1607 in SImode, DImode and TImode */
1608 {2, 2, 8}, /* cost of storing SSE registers
1609 in SImode, DImode and TImode */
1610 10, /* MMX or SSE register to integer */
1611 8, /* size of l1 cache. */
1612 256, /* size of l2 cache. */
1613 64, /* size of prefetch block */
1614 6, /* number of parallel prefetches */
1615 2, /* Branch cost */
1616 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1617 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1618 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1619 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1620 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1621 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1622
1623 {{{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1624 DUMMY_STRINGOP_ALGS},
1625 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1626 DUMMY_STRINGOP_ALGS}},
1627
1628 {{{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1629 {-1, libcall}}},
1630 DUMMY_STRINGOP_ALGS},
1631 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1632 {-1, libcall}}},
1633 DUMMY_STRINGOP_ALGS}},
1634 1, /* scalar_stmt_cost. */
1635 1, /* scalar load_cost. */
1636 1, /* scalar_store_cost. */
1637 1, /* vec_stmt_cost. */
1638 1, /* vec_to_scalar_cost. */
1639 1, /* scalar_to_vec_cost. */
1640 1, /* vec_align_load_cost. */
1641 2, /* vec_unalign_load_cost. */
1642 1, /* vec_store_cost. */
1643 3, /* cond_taken_branch_cost. */
1644 1, /* cond_not_taken_branch_cost. */
1645 };
1646
1647 static const
1648 struct processor_costs nocona_cost = {
1649 COSTS_N_INSNS (1), /* cost of an add instruction */
1650 COSTS_N_INSNS (1), /* cost of a lea instruction */
1651 COSTS_N_INSNS (1), /* variable shift costs */
1652 COSTS_N_INSNS (1), /* constant shift costs */
1653 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1654 COSTS_N_INSNS (10), /* HI */
1655 COSTS_N_INSNS (10), /* SI */
1656 COSTS_N_INSNS (10), /* DI */
1657 COSTS_N_INSNS (10)}, /* other */
1658 0, /* cost of multiply per each bit set */
1659 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1660 COSTS_N_INSNS (66), /* HI */
1661 COSTS_N_INSNS (66), /* SI */
1662 COSTS_N_INSNS (66), /* DI */
1663 COSTS_N_INSNS (66)}, /* other */
1664 COSTS_N_INSNS (1), /* cost of movsx */
1665 COSTS_N_INSNS (1), /* cost of movzx */
1666 16, /* "large" insn */
1667 17, /* MOVE_RATIO */
1668 4, /* cost for loading QImode using movzbl */
1669 {4, 4, 4}, /* cost of loading integer registers
1670 in QImode, HImode and SImode.
1671 Relative to reg-reg move (2). */
1672 {4, 4, 4}, /* cost of storing integer registers */
1673 3, /* cost of reg,reg fld/fst */
1674 {12, 12, 12}, /* cost of loading fp registers
1675 in SFmode, DFmode and XFmode */
1676 {4, 4, 4}, /* cost of storing fp registers
1677 in SFmode, DFmode and XFmode */
1678 6, /* cost of moving MMX register */
1679 {12, 12}, /* cost of loading MMX registers
1680 in SImode and DImode */
1681 {12, 12}, /* cost of storing MMX registers
1682 in SImode and DImode */
1683 6, /* cost of moving SSE register */
1684 {12, 12, 12}, /* cost of loading SSE registers
1685 in SImode, DImode and TImode */
1686 {12, 12, 12}, /* cost of storing SSE registers
1687 in SImode, DImode and TImode */
1688 8, /* MMX or SSE register to integer */
1689 8, /* size of l1 cache. */
1690 1024, /* size of l2 cache. */
1691 128, /* size of prefetch block */
1692 8, /* number of parallel prefetches */
1693 1, /* Branch cost */
1694 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1695 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1696 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1697 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1698 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1699 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1700
1701 {{{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1702 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1703 {100000, unrolled_loop}, {-1, libcall}}}},
1704 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1705 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1706 {100000, unrolled_loop}, {-1, libcall}}}}},
1707
1708 {{{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1709 {-1, libcall}}},
1710 {libcall, {{24, loop}, {64, unrolled_loop},
1711 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1712 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1713 {-1, libcall}}},
1714 {libcall, {{24, loop}, {64, unrolled_loop},
1715 {8192, rep_prefix_8_byte}, {-1, libcall}}}}},
1716 1, /* scalar_stmt_cost. */
1717 1, /* scalar load_cost. */
1718 1, /* scalar_store_cost. */
1719 1, /* vec_stmt_cost. */
1720 1, /* vec_to_scalar_cost. */
1721 1, /* scalar_to_vec_cost. */
1722 1, /* vec_align_load_cost. */
1723 2, /* vec_unalign_load_cost. */
1724 1, /* vec_store_cost. */
1725 3, /* cond_taken_branch_cost. */
1726 1, /* cond_not_taken_branch_cost. */
1727 };
1728
1729 static const
1730 struct processor_costs atom_cost = {
1731 COSTS_N_INSNS (1), /* cost of an add instruction */
1732 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1733 COSTS_N_INSNS (1), /* variable shift costs */
1734 COSTS_N_INSNS (1), /* constant shift costs */
1735 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1736 COSTS_N_INSNS (4), /* HI */
1737 COSTS_N_INSNS (3), /* SI */
1738 COSTS_N_INSNS (4), /* DI */
1739 COSTS_N_INSNS (2)}, /* other */
1740 0, /* cost of multiply per each bit set */
1741 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1742 COSTS_N_INSNS (26), /* HI */
1743 COSTS_N_INSNS (42), /* SI */
1744 COSTS_N_INSNS (74), /* DI */
1745 COSTS_N_INSNS (74)}, /* other */
1746 COSTS_N_INSNS (1), /* cost of movsx */
1747 COSTS_N_INSNS (1), /* cost of movzx */
1748 8, /* "large" insn */
1749 17, /* MOVE_RATIO */
1750 4, /* cost for loading QImode using movzbl */
1751 {4, 4, 4}, /* cost of loading integer registers
1752 in QImode, HImode and SImode.
1753 Relative to reg-reg move (2). */
1754 {4, 4, 4}, /* cost of storing integer registers */
1755 4, /* cost of reg,reg fld/fst */
1756 {12, 12, 12}, /* cost of loading fp registers
1757 in SFmode, DFmode and XFmode */
1758 {6, 6, 8}, /* cost of storing fp registers
1759 in SFmode, DFmode and XFmode */
1760 2, /* cost of moving MMX register */
1761 {8, 8}, /* cost of loading MMX registers
1762 in SImode and DImode */
1763 {8, 8}, /* cost of storing MMX registers
1764 in SImode and DImode */
1765 2, /* cost of moving SSE register */
1766 {8, 8, 8}, /* cost of loading SSE registers
1767 in SImode, DImode and TImode */
1768 {8, 8, 8}, /* cost of storing SSE registers
1769 in SImode, DImode and TImode */
1770 5, /* MMX or SSE register to integer */
1771 32, /* size of l1 cache. */
1772 256, /* size of l2 cache. */
1773 64, /* size of prefetch block */
1774 6, /* number of parallel prefetches */
1775 3, /* Branch cost */
1776 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1777 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1778 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1779 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1780 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1781 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1782
1783 /* stringop_algs for memcpy.
1784 SSE loops work best on Atom, but fall back to the non-SSE unrolled loop
1785 variant if that fails. */
1786 {{{libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment. */
1787 {libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}},
1788 {{libcall, {{2048, sse_loop}, {2048, unrolled_loop}, {-1, libcall}}}, /* Unknown alignment. */
1789 {libcall, {{2048, sse_loop}, {2048, unrolled_loop},
1790 {-1, libcall}}}}},
1791
1792 /* stringop_algs for memset. */
1793 {{{libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment. */
1794 {libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}},
1795 {{libcall, {{1024, sse_loop}, {1024, unrolled_loop}, /* Unknown alignment. */
1796 {-1, libcall}}},
1797 {libcall, {{2048, sse_loop}, {2048, unrolled_loop},
1798 {-1, libcall}}}}},
1799 1, /* scalar_stmt_cost. */
1800 1, /* scalar load_cost. */
1801 1, /* scalar_store_cost. */
1802 1, /* vec_stmt_cost. */
1803 1, /* vec_to_scalar_cost. */
1804 1, /* scalar_to_vec_cost. */
1805 1, /* vec_align_load_cost. */
1806 2, /* vec_unalign_load_cost. */
1807 1, /* vec_store_cost. */
1808 3, /* cond_taken_branch_cost. */
1809 1, /* cond_not_taken_branch_cost. */
1810 };
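/* Editorial note (hedged sketch, not part of the original file): each
   stringop_algs initializer above is read as a list of {max_size,
   algorithm} pairs, terminated by a {-1, algorithm} entry that covers
   all larger sizes, with the leading algorithm used when the size is
   not known at compile time.  Assuming the field names from struct
   stringop_algs in i386.h (size[], max, alg, MAX_STRINGOP_ALGS), the
   selection for a known byte count would look roughly like this.  */
#if 0
static enum stringop_alg
example_pick_stringop_alg (const struct stringop_algs *algs,
			   HOST_WIDE_INT nbytes)
{
  unsigned int i;
  /* Walk the {max_size, algorithm} pairs; the {-1, alg} sentinel
     catches every larger size.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    if (algs->size[i].max == -1 || nbytes <= algs->size[i].max)
      return algs->size[i].alg;
  return libcall;
}
#endif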
1811
1812 /* Core should produce code tuned for Core variants. */
1813 static const
1814 struct processor_costs core_cost = {
1815 COSTS_N_INSNS (1), /* cost of an add instruction */
1816 /* On all chips taken into consideration lea is 2 cycles or more. With
1817 this cost, however, our current implementation of synth_mult results in
1818 the use of unnecessary temporary registers, causing regressions on several
1819 SPECfp benchmarks. */
1820 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1821 COSTS_N_INSNS (1), /* variable shift costs */
1822 COSTS_N_INSNS (1), /* constant shift costs */
1823 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1824 COSTS_N_INSNS (4), /* HI */
1825 COSTS_N_INSNS (3), /* SI */
1826 COSTS_N_INSNS (4), /* DI */
1827 COSTS_N_INSNS (2)}, /* other */
1828 0, /* cost of multiply per each bit set */
1829 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1830 COSTS_N_INSNS (26), /* HI */
1831 COSTS_N_INSNS (42), /* SI */
1832 COSTS_N_INSNS (74), /* DI */
1833 COSTS_N_INSNS (74)}, /* other */
1834 COSTS_N_INSNS (1), /* cost of movsx */
1835 COSTS_N_INSNS (1), /* cost of movzx */
1836 8, /* "large" insn */
1837 17, /* MOVE_RATIO */
1838 4, /* cost for loading QImode using movzbl */
1839 {4, 4, 4}, /* cost of loading integer registers
1840 in QImode, HImode and SImode.
1841 Relative to reg-reg move (2). */
1842 {4, 4, 4}, /* cost of storing integer registers */
1843 4, /* cost of reg,reg fld/fst */
1844 {12, 12, 12}, /* cost of loading fp registers
1845 in SFmode, DFmode and XFmode */
1846 {6, 6, 8}, /* cost of storing fp registers
1847 in SFmode, DFmode and XFmode */
1848 2, /* cost of moving MMX register */
1849 {8, 8}, /* cost of loading MMX registers
1850 in SImode and DImode */
1851 {8, 8}, /* cost of storing MMX registers
1852 in SImode and DImode */
1853 2, /* cost of moving SSE register */
1854 {8, 8, 8}, /* cost of loading SSE registers
1855 in SImode, DImode and TImode */
1856 {8, 8, 8}, /* cost of storing SSE registers
1857 in SImode, DImode and TImode */
1858 5, /* MMX or SSE register to integer */
1859 32, /* size of l1 cache. */
1860 512, /* size of l2 cache. */
1861 64, /* size of prefetch block */
1862 6, /* number of parallel prefetches */
1863 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1864 value is increased to the perhaps more appropriate value of 5. */
1865 3, /* Branch cost */
1866 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1867 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1868 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1869 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1870 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1871 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1872
1873 /* stringop_algs for memcpy. */
1874 {{{libcall, {{16, loop}, {24, unrolled_loop}, {1024, rep_prefix_4_byte}, {-1, libcall}}}, /* Known alignment. */
1875 {libcall, {{16, loop}, {24, unrolled_loop}, {1024, rep_prefix_8_byte}, {-1, libcall}}}},
1876 {{libcall, {{16, loop}, {24, unrolled_loop}, {1024, rep_prefix_4_byte}, {-1, libcall}}}, /* Unknown alignment. */
1877 {libcall, {{16, loop}, {24, unrolled_loop}, {1024, rep_prefix_8_byte}, {-1, libcall}}}}},
1878
1879 /* stringop_algs for memset. */
1880 {{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, /* Known alignment. */
1881 {libcall, {{256, rep_prefix_8_byte}, {-1, libcall}}}},
1882 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, /* Unknown alignment. */
1883 {libcall, {{256, rep_prefix_8_byte}, {-1, libcall}}}}},
1884 1, /* scalar_stmt_cost. */
1885 1, /* scalar load_cost. */
1886 1, /* scalar_store_cost. */
1887 1, /* vec_stmt_cost. */
1888 1, /* vec_to_scalar_cost. */
1889 1, /* scalar_to_vec_cost. */
1890 1, /* vec_align_load_cost. */
1891 2, /* vec_unalign_load_cost. */
1892 1, /* vec_store_cost. */
1893 3, /* cond_taken_branch_cost. */
1894 1, /* cond_not_taken_branch_cost. */
1895 };
1896
1897 /* Generic64 should produce code tuned for Nocona, Core, K8, Amdfam10 and Bulldozer. */
1898 static const
1899 struct processor_costs generic64_cost = {
1900 COSTS_N_INSNS (1), /* cost of an add instruction */
1901 /* On all chips taken into consideration lea is 2 cycles or more. With
1902 this cost, however, our current implementation of synth_mult results in
1903 the use of unnecessary temporary registers, causing regressions on several
1904 SPECfp benchmarks. */
1905 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1906 COSTS_N_INSNS (1), /* variable shift costs */
1907 COSTS_N_INSNS (1), /* constant shift costs */
1908 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1909 COSTS_N_INSNS (4), /* HI */
1910 COSTS_N_INSNS (3), /* SI */
1911 COSTS_N_INSNS (4), /* DI */
1912 COSTS_N_INSNS (2)}, /* other */
1913 0, /* cost of multiply per each bit set */
1914 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1915 COSTS_N_INSNS (26), /* HI */
1916 COSTS_N_INSNS (42), /* SI */
1917 COSTS_N_INSNS (74), /* DI */
1918 COSTS_N_INSNS (74)}, /* other */
1919 COSTS_N_INSNS (1), /* cost of movsx */
1920 COSTS_N_INSNS (1), /* cost of movzx */
1921 8, /* "large" insn */
1922 17, /* MOVE_RATIO */
1923 4, /* cost for loading QImode using movzbl */
1924 {4, 4, 4}, /* cost of loading integer registers
1925 in QImode, HImode and SImode.
1926 Relative to reg-reg move (2). */
1927 {4, 4, 4}, /* cost of storing integer registers */
1928 4, /* cost of reg,reg fld/fst */
1929 {12, 12, 12}, /* cost of loading fp registers
1930 in SFmode, DFmode and XFmode */
1931 {6, 6, 8}, /* cost of storing fp registers
1932 in SFmode, DFmode and XFmode */
1933 2, /* cost of moving MMX register */
1934 {8, 8}, /* cost of loading MMX registers
1935 in SImode and DImode */
1936 {8, 8}, /* cost of storing MMX registers
1937 in SImode and DImode */
1938 2, /* cost of moving SSE register */
1939 {8, 8, 8}, /* cost of loading SSE registers
1940 in SImode, DImode and TImode */
1941 {8, 8, 8}, /* cost of storing SSE registers
1942 in SImode, DImode and TImode */
1943 5, /* MMX or SSE register to integer */
1944 32, /* size of l1 cache. */
1945 512, /* size of l2 cache. */
1946 64, /* size of prefetch block */
1947 6, /* number of parallel prefetches */
1948 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1949 value is increased to the perhaps more appropriate value of 5. */
1950 3, /* Branch cost */
1951 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1952 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1953 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1954 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1955 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1956 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1957
1958 {{DUMMY_STRINGOP_ALGS,
1959 {libcall, {{16, rep_prefix_4_byte}, {128, rep_prefix_8_byte}, {4096, rep_prefix_1_byte}, {-1, libcall}}}},
1960 {DUMMY_STRINGOP_ALGS,
1961 {libcall, {{128, rep_prefix_4_byte}, {4096, rep_prefix_1_byte}, {-1, libcall}}}}},
1962
1963 {{DUMMY_STRINGOP_ALGS,
1964 {libcall, {{16, rep_prefix_4_byte}, {512, unrolled_loop}, {4096, rep_prefix_1_byte}, {-1, libcall}}}},
1965 {DUMMY_STRINGOP_ALGS,
1966 {libcall, {{16, rep_prefix_4_byte}, {512, unrolled_loop}, {4096, rep_prefix_1_byte}, {-1, libcall}}}}},
1967 1, /* scalar_stmt_cost. */
1968 1, /* scalar load_cost. */
1969 1, /* scalar_store_cost. */
1970 1, /* vec_stmt_cost. */
1971 1, /* vec_to_scalar_cost. */
1972 1, /* scalar_to_vec_cost. */
1973 1, /* vec_align_load_cost. */
1974 2, /* vec_unalign_load_cost. */
1975 1, /* vec_store_cost. */
1976 3, /* cond_taken_branch_cost. */
1977 1, /* cond_not_taken_branch_cost. */
1978 };
1979
1980 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, Core,
1981 Athlon, K8, amdfam10 and Bulldozer. */
1982 static const
1983 struct processor_costs generic32_cost = {
1984 COSTS_N_INSNS (1), /* cost of an add instruction */
1985 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1986 COSTS_N_INSNS (1), /* variable shift costs */
1987 COSTS_N_INSNS (1), /* constant shift costs */
1988 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1989 COSTS_N_INSNS (4), /* HI */
1990 COSTS_N_INSNS (3), /* SI */
1991 COSTS_N_INSNS (4), /* DI */
1992 COSTS_N_INSNS (2)}, /* other */
1993 0, /* cost of multiply per each bit set */
1994 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1995 COSTS_N_INSNS (26), /* HI */
1996 COSTS_N_INSNS (42), /* SI */
1997 COSTS_N_INSNS (74), /* DI */
1998 COSTS_N_INSNS (74)}, /* other */
1999 COSTS_N_INSNS (1), /* cost of movsx */
2000 COSTS_N_INSNS (1), /* cost of movzx */
2001 8, /* "large" insn */
2002 17, /* MOVE_RATIO */
2003 4, /* cost for loading QImode using movzbl */
2004 {4, 4, 4}, /* cost of loading integer registers
2005 in QImode, HImode and SImode.
2006 Relative to reg-reg move (2). */
2007 {4, 4, 4}, /* cost of storing integer registers */
2008 4, /* cost of reg,reg fld/fst */
2009 {12, 12, 12}, /* cost of loading fp registers
2010 in SFmode, DFmode and XFmode */
2011 {6, 6, 8}, /* cost of storing fp registers
2012 in SFmode, DFmode and XFmode */
2013 2, /* cost of moving MMX register */
2014 {8, 8}, /* cost of loading MMX registers
2015 in SImode and DImode */
2016 {8, 8}, /* cost of storing MMX registers
2017 in SImode and DImode */
2018 2, /* cost of moving SSE register */
2019 {8, 8, 8}, /* cost of loading SSE registers
2020 in SImode, DImode and TImode */
2021 {8, 8, 8}, /* cost of storing SSE registers
2022 in SImode, DImode and TImode */
2023 5, /* MMX or SSE register to integer */
2024 32, /* size of l1 cache. */
2025 256, /* size of l2 cache. */
2026 64, /* size of prefetch block */
2027 6, /* number of parallel prefetches */
2028 3, /* Branch cost */
2029 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2030 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2031 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2032 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2033 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2034 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2035 /* stringop_algs for memcpy. */
2036 {{{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
2037 DUMMY_STRINGOP_ALGS},
2038 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
2039 DUMMY_STRINGOP_ALGS}},
2040 /* stringop_algs for memset. */
2041 {{{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
2042 DUMMY_STRINGOP_ALGS},
2043 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
2044 DUMMY_STRINGOP_ALGS}},
2045 1, /* scalar_stmt_cost. */
2046 1, /* scalar load_cost. */
2047 1, /* scalar_store_cost. */
2048 1, /* vec_stmt_cost. */
2049 1, /* vec_to_scalar_cost. */
2050 1, /* scalar_to_vec_cost. */
2051 1, /* vec_align_load_cost. */
2052 2, /* vec_unalign_load_cost. */
2053 1, /* vec_store_cost. */
2054 3, /* cond_taken_branch_cost. */
2055 1, /* cond_not_taken_branch_cost. */
2056 };
2057
2058 const struct processor_costs *ix86_cost = &pentium_cost;
2059
2060 /* Processor feature/optimization bitmasks. */
2061 #define m_386 (1<<PROCESSOR_I386)
2062 #define m_486 (1<<PROCESSOR_I486)
2063 #define m_PENT (1<<PROCESSOR_PENTIUM)
2064 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
2065 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
2066 #define m_NOCONA (1<<PROCESSOR_NOCONA)
2067 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
2068 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
2069 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
2070 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
2071 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
2072 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
2073 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
2074 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
2075 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
2076 #define m_ATOM (1<<PROCESSOR_ATOM)
2077
2078 #define m_GEODE (1<<PROCESSOR_GEODE)
2079 #define m_K6 (1<<PROCESSOR_K6)
2080 #define m_K6_GEODE (m_K6 | m_GEODE)
2081 #define m_K8 (1<<PROCESSOR_K8)
2082 #define m_ATHLON (1<<PROCESSOR_ATHLON)
2083 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
2084 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
2085 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
2086 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
2087 #define m_BDVER (m_BDVER1 | m_BDVER2)
2088 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
2089 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
2090
2091 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
2092 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
2093
2094 /* Generic instruction choice should be the common subset of supported CPUs
2095 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
2096 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
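/* Editorial note (illustrative only, not part of the original file):
   each m_* macro above is a single bit keyed by the PROCESSOR_* enum
   value, so a tuning row below such as m_486 | m_PENT is simply the
   set of processors for which that feature is enabled.  A hypothetical
   membership test would be:  */
#if 0
static bool
example_feature_applies (unsigned int feature_mask, enum processor_type cpu)
{
  /* The bit for CPU is set iff the feature is enabled for it.  */
  return (feature_mask & (1U << (int) cpu)) != 0;
}
#endif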
2097
2098 /* Feature tests against the various tunings. */
2099 unsigned char ix86_tune_features[X86_TUNE_LAST];
2100
2101 /* Feature tests against the various tunings used to create ix86_tune_features
2102 based on the processor mask. */
2103 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
2104 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
2105 negatively, so enabling it for Generic64 seems like a good code-size
2106 tradeoff. We can't enable it for 32bit generic because it does not
2107 work well with PPro based chips. */
2108 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
2109
2110 /* X86_TUNE_PUSH_MEMORY */
2111 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2112
2113 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
2114 m_486 | m_PENT,
2115
2116 /* X86_TUNE_UNROLL_STRLEN */
2117 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
2118
2119 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
2120 on simulation results, but after P4 was made no performance benefit
2121 was observed with branch hints. They also increase the code size.
2122 As a result, icc never generates branch hints. */
2123 0,
2124
2125 /* X86_TUNE_DOUBLE_WITH_ADD */
2126 ~m_386,
2127
2128 /* X86_TUNE_USE_SAHF */
2129 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
2130
2131 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
2132 partial dependencies. */
2133 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2134
2135 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
2136 register stalls on the Generic32 compilation setting as well. However,
2137 in the current implementation the partial register stalls are not eliminated
2138 very well - they can be introduced via subregs synthesized by combine
2139 and can happen in caller/callee saving sequences. Because this option
2140 pays back little on PPro based chips and conflicts with the partial reg
2141 dependencies used by Athlon/P4 based chips, it is better to leave it off
2142 for generic32 for now. */
2143 m_PPRO,
2144
2145 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
2146 m_CORE2I7 | m_GENERIC,
2147
2148 /* X86_TUNE_USE_HIMODE_FIOP */
2149 m_386 | m_486 | m_K6_GEODE,
2150
2151 /* X86_TUNE_USE_SIMODE_FIOP */
2152 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
2153
2154 /* X86_TUNE_USE_MOV0 */
2155 m_K6,
2156
2157 /* X86_TUNE_USE_CLTD */
2158 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
2159
2160 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
2161 m_PENT4,
2162
2163 /* X86_TUNE_SPLIT_LONG_MOVES */
2164 m_PPRO,
2165
2166 /* X86_TUNE_READ_MODIFY_WRITE */
2167 ~m_PENT,
2168
2169 /* X86_TUNE_READ_MODIFY */
2170 ~(m_PENT | m_PPRO),
2171
2172 /* X86_TUNE_PROMOTE_QIMODE */
2173 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2174
2175 /* X86_TUNE_FAST_PREFIX */
2176 ~(m_386 | m_486 | m_PENT),
2177
2178 /* X86_TUNE_SINGLE_STRINGOP */
2179 m_386 | m_P4_NOCONA,
2180
2181 /* X86_TUNE_QIMODE_MATH */
2182 ~0,
2183
2184 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2185 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2186 might be considered for Generic32 if our scheme for avoiding partial
2187 stalls was more effective. */
2188 ~m_PPRO,
2189
2190 /* X86_TUNE_PROMOTE_QI_REGS */
2191 0,
2192
2193 /* X86_TUNE_PROMOTE_HI_REGS */
2194 m_PPRO,
2195
2196 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2197 over esp addition. */
2198 m_386 | m_486 | m_PENT | m_PPRO,
2199
2200 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2201 over esp addition. */
2202 m_PENT,
2203
2204 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2205 over esp subtraction. */
2206 m_386 | m_486 | m_PENT | m_K6_GEODE,
2207
2208 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
2209 over esp subtraction. */
2210 m_PENT | m_K6_GEODE,
2211
2212 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2213 for DFmode copies */
2214 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2215
2216 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2217 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2218
2219 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2220 conflict here between PPro/Pentium4 based chips that treat 128bit
2221 SSE registers as single units and K8 based chips that divide SSE
2222 registers into two 64bit halves. This knob promotes all store destinations
2223 to be 128bit to allow register renaming on 128bit SSE units, but usually
2224 results in one extra microop on 64bit SSE units. Experimental results
2225 show that disabling this option on P4 brings an over 20% SPECfp regression,
2226 while enabling it on K8 brings roughly a 2.4% regression that can be partly
2227 masked by careful scheduling of moves. */
2228 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2229
2230 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2231 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2232
2233 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2234 m_COREI7 | m_BDVER,
2235
2236 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2237 m_BDVER,
2238
2239 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2240 are resolved on SSE register parts instead of whole registers, so we may
2241 maintain just the lower part of scalar values in the proper format, leaving
2242 the upper part undefined. */
2243 m_ATHLON_K8,
2244
2245 /* X86_TUNE_SSE_TYPELESS_STORES */
2246 m_AMD_MULTIPLE,
2247
2248 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2249 m_PPRO | m_P4_NOCONA,
2250
2251 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2252 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2253
2254 /* X86_TUNE_PROLOGUE_USING_MOVE */
2255 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2256
2257 /* X86_TUNE_EPILOGUE_USING_MOVE */
2258 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2259
2260 /* X86_TUNE_SHIFT1 */
2261 ~m_486,
2262
2263 /* X86_TUNE_USE_FFREEP */
2264 m_AMD_MULTIPLE,
2265
2266 /* X86_TUNE_INTER_UNIT_MOVES */
2267 ~(m_AMD_MULTIPLE | m_GENERIC),
2268
2269 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2270 ~(m_AMDFAM10 | m_BDVER),
2271
2272 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2273 than 4 branch instructions in the 16 byte window. */
2274 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2275
2276 /* X86_TUNE_SCHEDULE */
2277 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2278
2279 /* X86_TUNE_USE_BT */
2280 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2281
2282 /* X86_TUNE_USE_INCDEC */
2283 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2284
2285 /* X86_TUNE_PAD_RETURNS */
2286 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2287
2288 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2289 m_ATOM,
2290
2291 /* X86_TUNE_EXT_80387_CONSTANTS */
2292 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2293
2294 /* X86_TUNE_SHORTEN_X87_SSE */
2295 ~m_K8,
2296
2297 /* X86_TUNE_AVOID_VECTOR_DECODE */
2298 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2299
2300 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
2301 and SImode multiply, but the 386 and 486 do HImode multiply faster. */
2302 ~(m_386 | m_486),
2303
2304 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of a 32-bit constant and memory is a
2305 vector path on AMD machines. */
2306 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2307
2308 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of an 8-bit constant is a vector path on AMD
2309 machines. */
2310 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2311
2312 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2313 than a MOV. */
2314 m_PENT,
2315
2316 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2317 but one byte longer. */
2318 m_PENT,
2319
2320 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
2321 operand that cannot be represented using a modRM byte. The XOR
2322 replacement is long decoded, so this split helps here as well. */
2323 m_K6,
2324
2325 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2326 from FP to FP. */
2327 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2328
2329 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2330 from integer to FP. */
2331 m_AMDFAM10,
2332
2333 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2334 with a subsequent conditional jump instruction into a single
2335 compare-and-branch uop. */
2336 m_BDVER,
2337
2338 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2339 will impact LEA instruction selection. */
2340 m_ATOM,
2341
2342 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2343 instructions. */
2344 ~m_ATOM,
2345
2346 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2347 at -O3. For the moment, the prefetching seems badly tuned for Intel
2348 chips. */
2349 m_K6_GEODE | m_AMD_MULTIPLE,
2350
2351 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2352 the auto-vectorizer. */
2353 m_BDVER,
2354
2355 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2356 during reassociation of integer computation. */
2357 m_ATOM,
2358
2359 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2360 during reassociation of fp computation. */
2361 m_ATOM
2362 };
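/* Editorial note (hedged sketch, not part of the original file): the
   masks above are folded into the boolean ix86_tune_features[] array
   once the -mtune= processor is known; the real code lives in
   ix86_option_override_internal further down, and i386.h then wraps
   individual entries in TARGET_* convenience macros.  The fold is
   roughly:  */
#if 0
static void
example_init_tune_features (enum processor_type tune)
{
  unsigned int i;
  unsigned int tune_mask = 1U << (int) tune;
  /* One boolean per X86_TUNE_* entry, taken from the per-processor
     bit in the table above.  */
  for (i = 0; i < X86_TUNE_LAST; i++)
    ix86_tune_features[i]
      = (initial_ix86_tune_features[i] & tune_mask) != 0;
}
#endif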
2363
2364 /* Feature tests against the various architecture variations. */
2365 unsigned char ix86_arch_features[X86_ARCH_LAST];
2366
2367 /* Feature tests against the various architecture variations, used to create
2368 ix86_arch_features based on the processor mask. */
2369 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2370 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
2371 ~(m_386 | m_486 | m_PENT | m_K6),
2372
2373 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2374 ~m_386,
2375
2376 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2377 ~(m_386 | m_486),
2378
2379 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2380 ~m_386,
2381
2382 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2383 ~m_386,
2384 };
2385
2386 static const unsigned int x86_accumulate_outgoing_args
2387 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2388
2389 static const unsigned int x86_arch_always_fancy_math_387
2390 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2391
2392 static const unsigned int x86_avx256_split_unaligned_load
2393 = m_COREI7 | m_GENERIC;
2394
2395 static const unsigned int x86_avx256_split_unaligned_store
2396 = m_COREI7 | m_BDVER | m_GENERIC;
2397
2398 /* In case the average insn count for a single function invocation is
2399 lower than this constant, emit fast (but longer) prologue and
2400 epilogue code. */
2401 #define FAST_PROLOGUE_INSN_COUNT 20
2402
2403 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2404 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2405 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2406 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2407
2408 /* Array of the smallest class containing reg number REGNO, indexed by
2409 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2410
2411 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2412 {
2413 /* ax, dx, cx, bx */
2414 AREG, DREG, CREG, BREG,
2415 /* si, di, bp, sp */
2416 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2417 /* FP registers */
2418 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2419 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2420 /* arg pointer */
2421 NON_Q_REGS,
2422 /* flags, fpsr, fpcr, frame */
2423 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2424 /* SSE registers */
2425 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2426 SSE_REGS, SSE_REGS,
2427 /* MMX registers */
2428 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2429 MMX_REGS, MMX_REGS,
2430 /* REX registers */
2431 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2432 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2433 /* SSE REX registers */
2434 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2435 SSE_REGS, SSE_REGS,
2436 };
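/* Editorial note (illustrative only, not part of the original file):
   with the table above, REGNO_REG_CLASS (defined in i386.h in terms of
   regclass_map, as the comment before the table says) answers queries
   such as "what is the smallest class containing %eax"; index 0 (%eax)
   yields AREG and index 7 (%esp) yields NON_Q_REGS.  */
#if 0
static enum reg_class
example_class_of_eax (void)
{
  return REGNO_REG_CLASS (0);	/* AREG, per the table above.  */
}
#endif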
2437
2438 /* The "default" register map used in 32bit mode. */
2439
2440 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2441 {
2442 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2443 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2444 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2445 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2446 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2447 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2448 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2449 };
2450
2451 /* The "default" register map used in 64bit mode. */
2452
2453 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2454 {
2455 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2456 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2457 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2458 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2459 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2460 8,9,10,11,12,13,14,15, /* extended integer registers */
2461 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2462 };
2463
2464 /* Define the register numbers to be used in Dwarf debugging information.
2465 The SVR4 reference port C compiler uses the following register numbers
2466 in its Dwarf output code:
2467 0 for %eax (gcc regno = 0)
2468 1 for %ecx (gcc regno = 2)
2469 2 for %edx (gcc regno = 1)
2470 3 for %ebx (gcc regno = 3)
2471 4 for %esp (gcc regno = 7)
2472 5 for %ebp (gcc regno = 6)
2473 6 for %esi (gcc regno = 4)
2474 7 for %edi (gcc regno = 5)
2475 The following three DWARF register numbers are never generated by
2476 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2477 believes these numbers have these meanings.
2478 8 for %eip (no gcc equivalent)
2479 9 for %eflags (gcc regno = 17)
2480 10 for %trapno (no gcc equivalent)
2481 It is not at all clear how we should number the FP stack registers
2482 for the x86 architecture. If the version of SDB on x86/svr4 were
2483 a bit less brain dead with respect to floating-point then we would
2484 have a precedent to follow with respect to DWARF register numbers
2485 for x86 FP registers, but the SDB on x86/svr4 is so completely
2486 broken with respect to FP registers that it is hardly worth thinking
2487 of it as something to strive for compatibility with.
2488 The version of x86/svr4 SDB I have at the moment does (partially)
2489 seem to believe that DWARF register number 11 is associated with
2490 the x86 register %st(0), but that's about all. Higher DWARF
2491 register numbers don't seem to be associated with anything in
2492 particular, and even for DWARF regno 11, SDB only seems to under-
2493 stand that it should say that a variable lives in %st(0) (when
2494 asked via an `=' command) if we said it was in DWARF regno 11,
2495 but SDB still prints garbage when asked for the value of the
2496 variable in question (via a `/' command).
2497 (Also note that the labels SDB prints for various FP stack regs
2498 when doing an `x' command are all wrong.)
2499 Note that these problems generally don't affect the native SVR4
2500 C compiler because it doesn't allow the use of -O with -g and
2501 because when it is *not* optimizing, it allocates a memory
2502 location for each floating-point variable, and the memory
2503 location is what gets described in the DWARF AT_location
2504 attribute for the variable in question.
2505 Regardless of the severe mental illness of the x86/svr4 SDB, we
2506 do something sensible here and we use the following DWARF
2507 register numbers. Note that these are all stack-top-relative
2508 numbers.
2509 11 for %st(0) (gcc regno = 8)
2510 12 for %st(1) (gcc regno = 9)
2511 13 for %st(2) (gcc regno = 10)
2512 14 for %st(3) (gcc regno = 11)
2513 15 for %st(4) (gcc regno = 12)
2514 16 for %st(5) (gcc regno = 13)
2515 17 for %st(6) (gcc regno = 14)
2516 18 for %st(7) (gcc regno = 15)
2517 */
2518 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2519 {
2520 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2521 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2522 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2523 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2524 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2525 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2526 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2527 };
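/* Editorial note (illustrative only, not part of the original file):
   as the long comment above spells out, these maps translate gcc hard
   register numbers into the numbering expected by the debug-info
   consumer; e.g. under the SVR4 map gcc regno 7 (%esp) is emitted as
   DWARF register 4 and gcc regno 8 (%st(0)) as DWARF register 11.  */
#if 0
static int
example_svr4_dwarf_number_of_esp (void)
{
  return svr4_dbx_register_map[7];	/* 4, per the table above.  */
}
#endif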
2528
2529 /* Define parameter passing and return registers. */
2530
2531 static int const x86_64_int_parameter_registers[6] =
2532 {
2533 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2534 };
2535
2536 static int const x86_64_ms_abi_int_parameter_registers[4] =
2537 {
2538 CX_REG, DX_REG, R8_REG, R9_REG
2539 };
2540
2541 static int const x86_64_int_return_registers[4] =
2542 {
2543 AX_REG, DX_REG, DI_REG, SI_REG
2544 };
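/* Editorial note (illustrative only, not part of the original file):
   the tables above encode the integer argument registers of the two
   64bit calling conventions: SysV passes the first six integer
   arguments in rdi, rsi, rdx, rcx, r8, r9, while the Microsoft ABI
   passes the first four in rcx, rdx, r8, r9.  A hypothetical lookup
   helper:  */
#if 0
static int
example_int_arg_regno (int arg_index, bool ms_abi)
{
  /* Returns -1 once arguments of that ABI spill to the stack.  */
  if (ms_abi)
    return (arg_index < 4
	    ? x86_64_ms_abi_int_parameter_registers[arg_index] : -1);
  return (arg_index < 6
	  ? x86_64_int_parameter_registers[arg_index] : -1);
}
#endif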
2545
2546 /* Define the structure for the machine field in struct function. */
2547
2548 struct GTY(()) stack_local_entry {
2549 unsigned short mode;
2550 unsigned short n;
2551 rtx rtl;
2552 struct stack_local_entry *next;
2553 };
2554
2555 /* Structure describing stack frame layout.
2556 Stack grows downward:
2557
2558 [arguments]
2559 <- ARG_POINTER
2560 saved pc
2561
2562 saved static chain if ix86_static_chain_on_stack
2563
2564 saved frame pointer if frame_pointer_needed
2565 <- HARD_FRAME_POINTER
2566 [saved regs]
2567 <- regs_save_offset
2568 [padding0]
2569
2570 [saved SSE regs]
2571 <- sse_regs_save_offset
2572 [padding1] |
2573 | <- FRAME_POINTER
2574 [va_arg registers] |
2575 |
2576 [frame] |
2577 |
2578 [padding2] | = to_allocate
2579 <- STACK_POINTER
2580 */
2581 struct ix86_frame
2582 {
2583 int nsseregs;
2584 int nregs;
2585 int va_arg_size;
2586 int red_zone_size;
2587 int outgoing_arguments_size;
2588 HOST_WIDE_INT frame;
2589
2590 /* The offsets relative to ARG_POINTER. */
2591 HOST_WIDE_INT frame_pointer_offset;
2592 HOST_WIDE_INT hard_frame_pointer_offset;
2593 HOST_WIDE_INT stack_pointer_offset;
2594 HOST_WIDE_INT hfp_save_offset;
2595 HOST_WIDE_INT reg_save_offset;
2596 HOST_WIDE_INT sse_reg_save_offset;
2597
2598 /* When save_regs_using_mov is set, emit prologue using
2599 move instead of push instructions. */
2600 bool save_regs_using_mov;
2601 };
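/* Editorial note (hedged sketch, not part of the original file): the
   offsets in struct ix86_frame are measured from ARG_POINTER toward
   lower addresses, following the diagram above, so stack_pointer_offset
   describes how far below the incoming argument pointer the final
   stack pointer ends up once ix86_compute_frame_layout has laid out
   the frame.  */
#if 0
static HOST_WIDE_INT
example_frame_depth_below_arg_pointer (const struct ix86_frame *frame)
{
  /* Distance from ARG_POINTER down to STACK_POINTER in the diagram.  */
  return frame->stack_pointer_offset;
}
#endif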
2602
2603 /* Which cpu are we scheduling for. */
2604 enum attr_cpu ix86_schedule;
2605
2606 /* Which cpu are we optimizing for. */
2607 enum processor_type ix86_tune;
2608
2609 /* Which instruction set architecture to use. */
2610 enum processor_type ix86_arch;
2611
2612 /* True if the SSE prefetch instruction is not a NOP. */
2613 int x86_prefetch_sse;
2614
2615 /* -mstackrealign option */
2616 static const char ix86_force_align_arg_pointer_string[]
2617 = "force_align_arg_pointer";
2618
2619 static rtx (*ix86_gen_leave) (void);
2620 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2621 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2622 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2623 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2624 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2625 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2626 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2627 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2628 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2629
2630 /* Preferred alignment for stack boundary in bits. */
2631 unsigned int ix86_preferred_stack_boundary;
2632
2633 /* Alignment for incoming stack boundary in bits specified at
2634 command line. */
2635 static unsigned int ix86_user_incoming_stack_boundary;
2636
2637 /* Default alignment for incoming stack boundary in bits. */
2638 static unsigned int ix86_default_incoming_stack_boundary;
2639
2640 /* Alignment for incoming stack boundary in bits. */
2641 unsigned int ix86_incoming_stack_boundary;
2642
2643 /* Calling abi specific va_list type nodes. */
2644 static GTY(()) tree sysv_va_list_type_node;
2645 static GTY(()) tree ms_va_list_type_node;
2646
2647 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2648 char internal_label_prefix[16];
2649 int internal_label_prefix_len;
2650
2651 /* Fence to use after loop using movnt. */
2652 tree x86_mfence;
2653
2654 /* Register class used for passing a given 64bit part of the argument.
2655 These represent classes as documented by the psABI, with the exception of
2656 the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2657 uses SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
2658 
2659 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2660 whenever possible (the upper half does contain padding). */
2661 enum x86_64_reg_class
2662 {
2663 X86_64_NO_CLASS,
2664 X86_64_INTEGER_CLASS,
2665 X86_64_INTEGERSI_CLASS,
2666 X86_64_SSE_CLASS,
2667 X86_64_SSESF_CLASS,
2668 X86_64_SSEDF_CLASS,
2669 X86_64_SSEUP_CLASS,
2670 X86_64_X87_CLASS,
2671 X86_64_X87UP_CLASS,
2672 X86_64_COMPLEX_X87_CLASS,
2673 X86_64_MEMORY_CLASS
2674 };
2675
2676 #define MAX_CLASSES 4
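/* Editorial note (hedged illustration, not part of the original file):
   per the psABI, each eightbyte of an argument gets one of the classes
   above, at most MAX_CLASSES of them per argument.  For a hypothetical
   struct { double d; int i; }, the classifier in classify_argument
   later in this file would be expected to produce SSEDF for the first
   eightbyte and INTEGERSI for the second, so the value travels in one
   SSE register and one general register.  */
#if 0
static void
example_expected_classes (enum x86_64_reg_class classes[MAX_CLASSES])
{
  classes[0] = X86_64_SSEDF_CLASS;	/* the lone double */
  classes[1] = X86_64_INTEGERSI_CLASS;	/* the int; upper half is padding */
}
#endif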
2677
2678 /* Table of constants used by fldpi, fldln2, etc.... */
2679 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2680 static bool ext_80387_constants_init = 0;
2681
2682 \f
2683 static struct machine_function * ix86_init_machine_status (void);
2684 static rtx ix86_function_value (const_tree, const_tree, bool);
2685 static bool ix86_function_value_regno_p (const unsigned int);
2686 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2687 const_tree);
2688 static rtx ix86_static_chain (const_tree, bool);
2689 static int ix86_function_regparm (const_tree, const_tree);
2690 static void ix86_compute_frame_layout (struct ix86_frame *);
2691 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2692 rtx, rtx, int);
2693 static void ix86_add_new_builtins (HOST_WIDE_INT);
2694 static tree ix86_canonical_va_list_type (tree);
2695 static void predict_jump (int);
2696 static unsigned int split_stack_prologue_scratch_regno (void);
2697 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2698
2699 enum ix86_function_specific_strings
2700 {
2701 IX86_FUNCTION_SPECIFIC_ARCH,
2702 IX86_FUNCTION_SPECIFIC_TUNE,
2703 IX86_FUNCTION_SPECIFIC_MAX
2704 };
2705
2706 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2707 const char *, enum fpmath_unit, bool);
2708 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2709 static void ix86_function_specific_save (struct cl_target_option *);
2710 static void ix86_function_specific_restore (struct cl_target_option *);
2711 static void ix86_function_specific_print (FILE *, int,
2712 struct cl_target_option *);
2713 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2714 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2715 struct gcc_options *);
2716 static bool ix86_can_inline_p (tree, tree);
2717 static void ix86_set_current_function (tree);
2718 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2719
2720 static enum calling_abi ix86_function_abi (const_tree);
2721 static rtx promote_duplicated_reg (enum machine_mode, rtx);
2722 static rtx promote_duplicated_reg_to_size (rtx, int, int, int);
2723
2724 \f
2725 #ifndef SUBTARGET32_DEFAULT_CPU
2726 #define SUBTARGET32_DEFAULT_CPU "i386"
2727 #endif
2728
2729 /* The svr4 ABI for the i386 says that records and unions are returned
2730 in memory. */
2731 #ifndef DEFAULT_PCC_STRUCT_RETURN
2732 #define DEFAULT_PCC_STRUCT_RETURN 1
2733 #endif
2734
2735 /* Whether -mtune= or -march= were specified */
2736 static int ix86_tune_defaulted;
2737 static int ix86_arch_specified;
2738
2739 /* Vectorization library interface and handlers. */
2740 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2741
2742 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2743 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2744
2745 /* Processor target table, indexed by processor number */
2746 struct ptt
2747 {
2748 const struct processor_costs *cost; /* Processor costs */
2749 const int align_loop; /* Default alignments. */
2750 const int align_loop_max_skip;
2751 const int align_jump;
2752 const int align_jump_max_skip;
2753 const int align_func;
2754 };
2755
2756 static const struct ptt processor_target_table[PROCESSOR_max] =
2757 {
2758 {&i386_cost, 4, 3, 4, 3, 4},
2759 {&i486_cost, 16, 15, 16, 15, 16},
2760 {&pentium_cost, 16, 7, 16, 7, 16},
2761 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2762 {&geode_cost, 0, 0, 0, 0, 0},
2763 {&k6_cost, 32, 7, 32, 7, 32},
2764 {&athlon_cost, 16, 7, 16, 7, 16},
2765 {&pentium4_cost, 0, 0, 0, 0, 0},
2766 {&k8_cost, 16, 7, 16, 7, 16},
2767 {&nocona_cost, 0, 0, 0, 0, 0},
2768 /* Core 2 32-bit. */
2769 {&core_cost, 16, 10, 16, 10, 16},
2770 /* Core 2 64-bit. */
2771 {&core_cost, 16, 10, 16, 10, 16},
2772 /* Core i7 32-bit. */
2773 {&core_cost, 16, 10, 16, 10, 16},
2774 /* Core i7 64-bit. */
2775 {&core_cost, 16, 10, 16, 10, 16},
2776 {&generic32_cost, 16, 7, 16, 7, 16},
2777 {&generic64_cost, 16, 10, 16, 10, 16},
2778 {&amdfam10_cost, 32, 24, 32, 7, 32},
2779 {&bdver1_cost, 32, 24, 32, 7, 32},
2780 {&bdver2_cost, 32, 24, 32, 7, 32},
2781 {&btver1_cost, 32, 24, 32, 7, 32},
2782 {&atom_cost, 16, 15, 16, 7, 16}
2783 };
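/* Editorial note (hedged sketch, not part of the original file): this
   table is indexed by the ix86_tune processor to pick the active cost
   model and the default code alignments during option processing (the
   real code is in ix86_option_override_internal); roughly:  */
#if 0
static void
example_apply_tuning_defaults (void)
{
  ix86_cost = processor_target_table[ix86_tune].cost;
  /* Only fill in alignments the user did not set explicitly.  */
  if (align_loops == 0)
    align_loops = processor_target_table[ix86_tune].align_loop;
  if (align_functions == 0)
    align_functions = processor_target_table[ix86_tune].align_func;
}
#endif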
2784
2785 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2786 {
2787 "generic",
2788 "i386",
2789 "i486",
2790 "pentium",
2791 "pentium-mmx",
2792 "pentiumpro",
2793 "pentium2",
2794 "pentium3",
2795 "pentium4",
2796 "pentium-m",
2797 "prescott",
2798 "nocona",
2799 "core2",
2800 "corei7",
2801 "atom",
2802 "geode",
2803 "k6",
2804 "k6-2",
2805 "k6-3",
2806 "athlon",
2807 "athlon-4",
2808 "k8",
2809 "amdfam10",
2810 "bdver1",
2811 "bdver2",
2812 "btver1"
2813 };
2814 \f
2815 /* Return true if a red-zone is in use. */
2816
2817 static inline bool
2818 ix86_using_red_zone (void)
2819 {
2820 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2821 }
2822 \f
2823 /* Return a string that documents the current -m options. The caller is
2824 responsible for freeing the string. */
2825
2826 static char *
2827 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2828 const char *tune, enum fpmath_unit fpmath,
2829 bool add_nl_p)
2830 {
2831 struct ix86_target_opts
2832 {
2833 const char *option; /* option string */
2834 HOST_WIDE_INT mask; /* isa mask options */
2835 };
2836
2837 /* This table is ordered so that options like -msse4.2 that imply
2838 the options listed after them are matched first. */
2839 static struct ix86_target_opts isa_opts[] =
2840 {
2841 { "-m64", OPTION_MASK_ISA_64BIT },
2842 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2843 { "-mfma", OPTION_MASK_ISA_FMA },
2844 { "-mxop", OPTION_MASK_ISA_XOP },
2845 { "-mlwp", OPTION_MASK_ISA_LWP },
2846 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2847 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2848 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2849 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2850 { "-msse3", OPTION_MASK_ISA_SSE3 },
2851 { "-msse2", OPTION_MASK_ISA_SSE2 },
2852 { "-msse", OPTION_MASK_ISA_SSE },
2853 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2854 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2855 { "-mmmx", OPTION_MASK_ISA_MMX },
2856 { "-mabm", OPTION_MASK_ISA_ABM },
2857 { "-mbmi", OPTION_MASK_ISA_BMI },
2858 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2859 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2860 { "-mtbm", OPTION_MASK_ISA_TBM },
2861 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2862 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2863 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2864 { "-maes", OPTION_MASK_ISA_AES },
2865 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2866 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2867 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2868 { "-mf16c", OPTION_MASK_ISA_F16C },
2869 };
2870
2871 /* Flag options. */
2872 static struct ix86_target_opts flag_opts[] =
2873 {
2874 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2875 { "-m80387", MASK_80387 },
2876 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2877 { "-malign-double", MASK_ALIGN_DOUBLE },
2878 { "-mcld", MASK_CLD },
2879 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2880 { "-mieee-fp", MASK_IEEE_FP },
2881 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2882 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2883 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2884 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2885 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2886 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2887 { "-mno-red-zone", MASK_NO_RED_ZONE },
2888 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2889 { "-mrecip", MASK_RECIP },
2890 { "-mrtd", MASK_RTD },
2891 { "-msseregparm", MASK_SSEREGPARM },
2892 { "-mstack-arg-probe", MASK_STACK_PROBE },
2893 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2894 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2895 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2896 { "-mvzeroupper", MASK_VZEROUPPER },
2897 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2898 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2899 { "-mprefer-avx128", MASK_PREFER_AVX128},
2900 };
2901
2902 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2903
2904 char isa_other[40];
2905 char target_other[40];
2906 unsigned num = 0;
2907 unsigned i, j;
2908 char *ret;
2909 char *ptr;
2910 size_t len;
2911 size_t line_len;
2912 size_t sep_len;
2913
2914 memset (opts, '\0', sizeof (opts));
2915
2916 /* Add -march= option. */
2917 if (arch)
2918 {
2919 opts[num][0] = "-march=";
2920 opts[num++][1] = arch;
2921 }
2922
2923 /* Add -mtune= option. */
2924 if (tune)
2925 {
2926 opts[num][0] = "-mtune=";
2927 opts[num++][1] = tune;
2928 }
2929
2930 /* Pick out the options in isa options. */
2931 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2932 {
2933 if ((isa & isa_opts[i].mask) != 0)
2934 {
2935 opts[num++][0] = isa_opts[i].option;
2936 isa &= ~ isa_opts[i].mask;
2937 }
2938 }
2939
2940 if (isa && add_nl_p)
2941 {
2942 opts[num++][0] = isa_other;
2943 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2944 isa);
2945 }
2946
2947 /* Add flag options. */
2948 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2949 {
2950 if ((flags & flag_opts[i].mask) != 0)
2951 {
2952 opts[num++][0] = flag_opts[i].option;
2953 flags &= ~ flag_opts[i].mask;
2954 }
2955 }
2956
2957 if (flags && add_nl_p)
2958 {
2959 opts[num++][0] = target_other;
2960 sprintf (target_other, "(other flags: %#x)", flags);
2961 }
2962
2963 /* Add -fpmath= option. */
2964 if (fpmath)
2965 {
2966 opts[num][0] = "-mfpmath=";
2967 switch ((int) fpmath)
2968 {
2969 case FPMATH_387:
2970 opts[num++][1] = "387";
2971 break;
2972
2973 case FPMATH_SSE:
2974 opts[num++][1] = "sse";
2975 break;
2976
2977 case FPMATH_387 | FPMATH_SSE:
2978 opts[num++][1] = "sse+387";
2979 break;
2980
2981 default:
2982 gcc_unreachable ();
2983 }
2984 }
2985
2986 /* Any options? */
2987 if (num == 0)
2988 return NULL;
2989
2990 gcc_assert (num < ARRAY_SIZE (opts));
2991
2992 /* Size the string. */
2993 len = 0;
2994 sep_len = (add_nl_p) ? 3 : 1;
2995 for (i = 0; i < num; i++)
2996 {
2997 len += sep_len;
2998 for (j = 0; j < 2; j++)
2999 if (opts[i][j])
3000 len += strlen (opts[i][j]);
3001 }
3002
3003 /* Build the string. */
3004 ret = ptr = (char *) xmalloc (len);
3005 line_len = 0;
3006
3007 for (i = 0; i < num; i++)
3008 {
3009 size_t len2[2];
3010
3011 for (j = 0; j < 2; j++)
3012 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
3013
3014 if (i != 0)
3015 {
3016 *ptr++ = ' ';
3017 line_len++;
3018
3019 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
3020 {
3021 *ptr++ = '\\';
3022 *ptr++ = '\n';
3023 line_len = 0;
3024 }
3025 }
3026
3027 for (j = 0; j < 2; j++)
3028 if (opts[i][j])
3029 {
3030 memcpy (ptr, opts[i][j], len2[j]);
3031 ptr += len2[j];
3032 line_len += len2[j];
3033 }
3034 }
3035
3036 *ptr = '\0';
3037 gcc_assert (ret + len >= ptr);
3038
3039 return ret;
3040 }
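/* Editorial note (illustrative only, not part of the original file):
   a minimal use of the printer above; with these arguments the result
   would be expected to read roughly
   "-march=core2 -mtune=generic -m64 -msse2 -m80387 -mfpmath=sse"
   (arch and tune first, then ISA bits, then flag bits, then fpmath,
   per the code above), and the caller must free the string, as
   ix86_debug_options below also does.  */
#if 0
static void
example_dump_target_string (void)
{
  char *s = ix86_target_string (OPTION_MASK_ISA_64BIT | OPTION_MASK_ISA_SSE2,
				MASK_80387, "core2", "generic",
				FPMATH_SSE, true);
  if (s)
    {
      fprintf (stderr, "%s\n", s);
      free (s);
    }
}
#endif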
3041
3042 /* Return true if profiling code should be emitted before the
3043 prologue, otherwise return false.
3044 Note: for x86 this is the case when -mfentry ("hotfix") is in use. */
3045 static bool
3046 ix86_profile_before_prologue (void)
3047 {
3048 return flag_fentry != 0;
3049 }
3050
3051 /* Function that is callable from the debugger to print the current
3052 options. */
3053 void
3054 ix86_debug_options (void)
3055 {
3056 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
3057 ix86_arch_string, ix86_tune_string,
3058 ix86_fpmath, true);
3059
3060 if (opts)
3061 {
3062 fprintf (stderr, "%s\n\n", opts);
3063 free (opts);
3064 }
3065 else
3066 fputs ("<no options>\n\n", stderr);
3067
3068 return;
3069 }
3070 \f
3071 /* Override various settings based on options. If MAIN_ARGS_P, the
3072 options are from the command line, otherwise they are from
3073 attributes. */
3074
3075 static void
3076 ix86_option_override_internal (bool main_args_p)
3077 {
3078 int i;
3079 unsigned int ix86_arch_mask, ix86_tune_mask;
3080 const bool ix86_tune_specified = (ix86_tune_string != NULL);
3081 const char *prefix;
3082 const char *suffix;
3083 const char *sw;
3084
3085 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3086 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3087 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3088 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3089 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3090 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3091 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3092 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3093 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3094 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3095 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3096 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3097 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3098 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3099 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3100 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3101 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3102 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3103 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3104 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3105 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3106 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3107 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3108 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3109 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3110 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3111 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3112 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3113 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3114 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3115 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3116 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3117 /* If this reaches 64, we need to widen the struct pta flags field below. */
3118
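/* Each alias entry below simply ORs PTA_* bits together; e.g. a
   hypothetical entry {"foo", PROCESSOR_K8, CPU_K8, PTA_64BIT | PTA_SSE2}
   would let -march=foo pick the K8 schedule and turn on SSE2 and 64-bit
   support through the per-bit checks further down.  */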
3119 static struct pta
3120 {
3121 const char *const name; /* processor name or nickname. */
3122 const enum processor_type processor;
3123 const enum attr_cpu schedule;
3124 const unsigned HOST_WIDE_INT flags;
3125 }
3126 const processor_alias_table[] =
3127 {
3128 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3129 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3130 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3131 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3132 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3133 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3134 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3135 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3136 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
3137 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3138 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3139 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
3140 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3141 PTA_MMX | PTA_SSE},
3142 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3143 PTA_MMX | PTA_SSE},
3144 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3145 PTA_MMX | PTA_SSE | PTA_SSE2},
3146 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3147 PTA_MMX |PTA_SSE | PTA_SSE2},
3148 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3149 PTA_MMX | PTA_SSE | PTA_SSE2},
3150 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3151 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
3152 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3153 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3154 | PTA_CX16 | PTA_NO_SAHF},
3155 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
3156 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3157 | PTA_SSSE3 | PTA_CX16},
3158 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
3159 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3160 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
3161 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
3162 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3163 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3164 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
3165 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
3166 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3167 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3168 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3169 | PTA_RDRND | PTA_F16C},
3170 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
3171 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3172 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3173 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3174 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3175 | PTA_FMA | PTA_MOVBE},
3176 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3177 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3178 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
3179 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3180 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A |PTA_PREFETCH_SSE},
3181 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3182 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3183 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3184 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3185 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3186 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3187 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3188 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3189 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3190 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3191 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3192 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3193 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3194 {"x86-64", PROCESSOR_K8, CPU_K8,
3195 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3196 {"k8", PROCESSOR_K8, CPU_K8,
3197 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3198 | PTA_SSE2 | PTA_NO_SAHF},
3199 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3200 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3201 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3202 {"opteron", PROCESSOR_K8, CPU_K8,
3203 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3204 | PTA_SSE2 | PTA_NO_SAHF},
3205 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3206 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3207 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3208 {"athlon64", PROCESSOR_K8, CPU_K8,
3209 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3210 | PTA_SSE2 | PTA_NO_SAHF},
3211 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3212 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3213 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3214 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3215 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3216 | PTA_SSE2 | PTA_NO_SAHF},
3217 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3218 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3219 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3220 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3221 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3222 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3223 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3224 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3225 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3226 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3227 | PTA_XOP | PTA_LWP},
3228 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3229 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3230 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3231 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3232 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3233 | PTA_FMA},
3234 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3235 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3236 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16},
3237 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3238 0 /* flags are only used for -march switch. */ },
3239 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3240 PTA_64BIT /* flags are only used for -march switch. */ },
3241 };
3242
3243 /* -mrecip options. */
3244 static struct
3245 {
3246 const char *string; /* option name */
3247 unsigned int mask; /* mask bits to set */
3248 }
3249 const recip_options[] =
3250 {
3251 { "all", RECIP_MASK_ALL },
3252 { "none", RECIP_MASK_NONE },
3253 { "div", RECIP_MASK_DIV },
3254 { "sqrt", RECIP_MASK_SQRT },
3255 { "vec-div", RECIP_MASK_VEC_DIV },
3256 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3257 };
3258
3259 int const pta_size = ARRAY_SIZE (processor_alias_table);
3260
3261 /* Set up prefix/suffix so the error messages refer to either the command
3262 line argument, or the attribute(target). */
3263 if (main_args_p)
3264 {
3265 prefix = "-m";
3266 suffix = "";
3267 sw = "switch";
3268 }
3269 else
3270 {
3271 prefix = "option(\"";
3272 suffix = "\")";
3273 sw = "attribute";
3274 }
3275
3276 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3277 SUBTARGET_OVERRIDE_OPTIONS;
3278 #endif
3279
3280 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3281 SUBSUBTARGET_OVERRIDE_OPTIONS;
3282 #endif
3283
3284 if (TARGET_X32)
3285 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3286
3287 /* -fPIC is the default for 64-bit Darwin (TARGET_MACHO). */
3288 if (TARGET_MACHO && TARGET_64BIT)
3289 flag_pic = 2;
3290
3291 /* Need to check -mtune=generic first. */
3292 if (ix86_tune_string)
3293 {
3294 if (!strcmp (ix86_tune_string, "generic")
3295 || !strcmp (ix86_tune_string, "i686")
3296 /* As special support for cross compilers we read -mtune=native
3297 as -mtune=generic. With native compilers we won't see the
3298 -mtune=native, as it was changed by the driver. */
3299 || !strcmp (ix86_tune_string, "native"))
3300 {
3301 if (TARGET_64BIT)
3302 ix86_tune_string = "generic64";
3303 else
3304 ix86_tune_string = "generic32";
3305 }
3306 /* If this call is for setting the option attribute, allow the
3307 generic32/generic64 that was previously set. */
3308 else if (!main_args_p
3309 && (!strcmp (ix86_tune_string, "generic32")
3310 || !strcmp (ix86_tune_string, "generic64")))
3311 ;
3312 else if (!strncmp (ix86_tune_string, "generic", 7))
3313 error ("bad value (%s) for %stune=%s %s",
3314 ix86_tune_string, prefix, suffix, sw);
3315 else if (!strcmp (ix86_tune_string, "x86-64"))
3316 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3317 "%stune=k8%s or %stune=generic%s instead as appropriate",
3318 prefix, suffix, prefix, suffix, prefix, suffix);
3319 }
3320 else
3321 {
3322 if (ix86_arch_string)
3323 ix86_tune_string = ix86_arch_string;
3324 if (!ix86_tune_string)
3325 {
3326 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3327 ix86_tune_defaulted = 1;
3328 }
3329
3330 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3331 need to use a sensible tune option. */
3332 if (!strcmp (ix86_tune_string, "generic")
3333 || !strcmp (ix86_tune_string, "x86-64")
3334 || !strcmp (ix86_tune_string, "i686"))
3335 {
3336 if (TARGET_64BIT)
3337 ix86_tune_string = "generic64";
3338 else
3339 ix86_tune_string = "generic32";
3340 }
3341 }
3342
3343 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3344 {
3345 /* rep; movq isn't available in 32-bit code. */
3346 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3347 ix86_stringop_alg = no_stringop;
3348 }
3349
3350 if (!ix86_arch_string)
3351 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3352 else
3353 ix86_arch_specified = 1;
3354
3355 if (!global_options_set.x_ix86_abi)
3356 ix86_abi = DEFAULT_ABI;
3357
3358 if (global_options_set.x_ix86_cmodel)
3359 {
3360 switch (ix86_cmodel)
3361 {
3362 case CM_SMALL:
3363 case CM_SMALL_PIC:
3364 if (flag_pic)
3365 ix86_cmodel = CM_SMALL_PIC;
3366 if (!TARGET_64BIT)
3367 error ("code model %qs not supported in the %s bit mode",
3368 "small", "32");
3369 break;
3370
3371 case CM_MEDIUM:
3372 case CM_MEDIUM_PIC:
3373 if (flag_pic)
3374 ix86_cmodel = CM_MEDIUM_PIC;
3375 if (!TARGET_64BIT)
3376 error ("code model %qs not supported in the %s bit mode",
3377 "medium", "32");
3378 else if (TARGET_X32)
3379 error ("code model %qs not supported in x32 mode",
3380 "medium");
3381 break;
3382
3383 case CM_LARGE:
3384 case CM_LARGE_PIC:
3385 if (flag_pic)
3386 ix86_cmodel = CM_LARGE_PIC;
3387 if (!TARGET_64BIT)
3388 error ("code model %qs not supported in the %s bit mode",
3389 "large", "32");
3390 else if (TARGET_X32)
3391 error ("code model %qs not supported in x32 mode",
3392 "large");
3393 break;
3394
3395 case CM_32:
3396 if (flag_pic)
3397 error ("code model %s does not support PIC mode", "32");
3398 if (TARGET_64BIT)
3399 error ("code model %qs not supported in the %s bit mode",
3400 "32", "64");
3401 break;
3402
3403 case CM_KERNEL:
3404 if (flag_pic)
3405 {
3406 error ("code model %s does not support PIC mode", "kernel");
3407 ix86_cmodel = CM_32;
3408 }
3409 if (!TARGET_64BIT)
3410 error ("code model %qs not supported in the %s bit mode",
3411 "kernel", "32");
3412 break;
3413
3414 default:
3415 gcc_unreachable ();
3416 }
3417 }
3418 else
3419 {
3420 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3421 use of rip-relative addressing. This eliminates fixups that
3422 would otherwise be needed if this object is to be placed in a
3423 DLL, and is essentially just as efficient as direct addressing. */
3424 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3425 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3426 else if (TARGET_64BIT)
3427 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3428 else
3429 ix86_cmodel = CM_32;
3430 }
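/* For example (illustrative), plain -m64 with neither -mcmodel= nor -fpic
   defaults above to CM_SMALL, while -m64 -fpic defaults to CM_SMALL_PIC.  */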
3431 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3432 {
3433 error ("-masm=intel not supported in this configuration");
3434 ix86_asm_dialect = ASM_ATT;
3435 }
3436 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3437 sorry ("%i-bit mode not compiled in",
3438 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3439
3440 for (i = 0; i < pta_size; i++)
3441 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3442 {
3443 ix86_schedule = processor_alias_table[i].schedule;
3444 ix86_arch = processor_alias_table[i].processor;
3445 /* Default cpu tuning to the architecture. */
3446 ix86_tune = ix86_arch;
3447
3448 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3449 error ("CPU you selected does not support x86-64 "
3450 "instruction set");
3451
3452 if (processor_alias_table[i].flags & PTA_MMX
3453 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3454 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3455 if (processor_alias_table[i].flags & PTA_3DNOW
3456 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3457 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3458 if (processor_alias_table[i].flags & PTA_3DNOW_A
3459 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3460 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3461 if (processor_alias_table[i].flags & PTA_SSE
3462 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3463 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3464 if (processor_alias_table[i].flags & PTA_SSE2
3465 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3466 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3467 if (processor_alias_table[i].flags & PTA_SSE3
3468 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3469 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3470 if (processor_alias_table[i].flags & PTA_SSSE3
3471 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3472 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3473 if (processor_alias_table[i].flags & PTA_SSE4_1
3474 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3475 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3476 if (processor_alias_table[i].flags & PTA_SSE4_2
3477 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3478 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3479 if (processor_alias_table[i].flags & PTA_AVX
3480 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3481 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3482 if (processor_alias_table[i].flags & PTA_AVX2
3483 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3484 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3485 if (processor_alias_table[i].flags & PTA_FMA
3486 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3487 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3488 if (processor_alias_table[i].flags & PTA_SSE4A
3489 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3490 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3491 if (processor_alias_table[i].flags & PTA_FMA4
3492 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3493 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3494 if (processor_alias_table[i].flags & PTA_XOP
3495 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3496 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3497 if (processor_alias_table[i].flags & PTA_LWP
3498 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3499 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3500 if (processor_alias_table[i].flags & PTA_ABM
3501 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3502 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3503 if (processor_alias_table[i].flags & PTA_BMI
3504 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3505 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3506 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3507 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3508 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3509 if (processor_alias_table[i].flags & PTA_TBM
3510 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3511 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3512 if (processor_alias_table[i].flags & PTA_BMI2
3513 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3514 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3515 if (processor_alias_table[i].flags & PTA_CX16
3516 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3517 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3518 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3519 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3520 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3521 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3522 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3523 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3524 if (processor_alias_table[i].flags & PTA_MOVBE
3525 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3526 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3527 if (processor_alias_table[i].flags & PTA_AES
3528 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3529 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3530 if (processor_alias_table[i].flags & PTA_PCLMUL
3531 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3532 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3533 if (processor_alias_table[i].flags & PTA_FSGSBASE
3534 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3535 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3536 if (processor_alias_table[i].flags & PTA_RDRND
3537 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3538 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3539 if (processor_alias_table[i].flags & PTA_F16C
3540 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3541 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3542 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3543 x86_prefetch_sse = true;
3544
3545 break;
3546 }
3547
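/* Illustrative: -march=corei7 matches the alias table entry above, so
   ix86_arch/ix86_tune become the corei7 processor and the PTA_* bits
   enable MMX through SSE4.2 plus CX16, except for anything the user
   explicitly disabled.  */
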
3548 if (!strcmp (ix86_arch_string, "generic"))
3549 error ("generic CPU can be used only for %stune=%s %s",
3550 prefix, suffix, sw);
3551 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3552 error ("bad value (%s) for %sarch=%s %s",
3553 ix86_arch_string, prefix, suffix, sw);
3554
3555 ix86_arch_mask = 1u << ix86_arch;
3556 for (i = 0; i < X86_ARCH_LAST; ++i)
3557 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3558
3559 for (i = 0; i < pta_size; i++)
3560 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3561 {
3562 ix86_schedule = processor_alias_table[i].schedule;
3563 ix86_tune = processor_alias_table[i].processor;
3564 if (TARGET_64BIT)
3565 {
3566 if (!(processor_alias_table[i].flags & PTA_64BIT))
3567 {
3568 if (ix86_tune_defaulted)
3569 {
3570 ix86_tune_string = "x86-64";
3571 for (i = 0; i < pta_size; i++)
3572 if (! strcmp (ix86_tune_string,
3573 processor_alias_table[i].name))
3574 break;
3575 ix86_schedule = processor_alias_table[i].schedule;
3576 ix86_tune = processor_alias_table[i].processor;
3577 }
3578 else
3579 error ("CPU you selected does not support x86-64 "
3580 "instruction set");
3581 }
3582 }
3583 else
3584 {
3585 /* Adjust tuning when compiling for 32-bit ABI. */
3586 switch (ix86_tune)
3587 {
3588 case PROCESSOR_GENERIC64:
3589 ix86_tune = PROCESSOR_GENERIC32;
3590 ix86_schedule = CPU_PENTIUMPRO;
3591 break;
3592
3593 case PROCESSOR_CORE2_64:
3594 ix86_tune = PROCESSOR_CORE2_32;
3595 break;
3596
3597 case PROCESSOR_COREI7_64:
3598 ix86_tune = PROCESSOR_COREI7_32;
3599 break;
3600
3601 default:
3602 break;
3603 }
3604 }
3605 /* Intel CPUs have always interpreted SSE prefetch instructions as
3606 NOPs; so, we can enable SSE prefetch instructions even when
3607 -mtune (rather than -march) points us to a processor that has them.
3608 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3609 higher processors. */
3610 if (TARGET_CMOVE
3611 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3612 x86_prefetch_sse = true;
3613 break;
3614 }
3615
3616 if (ix86_tune_specified && i == pta_size)
3617 error ("bad value (%s) for %stune=%s %s",
3618 ix86_tune_string, prefix, suffix, sw);
3619
3620 ix86_tune_mask = 1u << ix86_tune;
3621 for (i = 0; i < X86_TUNE_LAST; ++i)
3622 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3623
3624 #ifndef USE_IX86_FRAME_POINTER
3625 #define USE_IX86_FRAME_POINTER 0
3626 #endif
3627
3628 #ifndef USE_X86_64_FRAME_POINTER
3629 #define USE_X86_64_FRAME_POINTER 0
3630 #endif
3631
3632 /* Set the default values for switches whose default depends on TARGET_64BIT
3633 in case they weren't overwritten by command line options. */
3634 if (TARGET_64BIT)
3635 {
3636 if (optimize > 1 && !global_options_set.x_flag_zee)
3637 flag_zee = 1;
3638 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3639 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3640 if (flag_asynchronous_unwind_tables == 2)
3641 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3642 if (flag_pcc_struct_return == 2)
3643 flag_pcc_struct_return = 0;
3644 }
3645 else
3646 {
3647 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3648 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3649 if (flag_asynchronous_unwind_tables == 2)
3650 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3651 if (flag_pcc_struct_return == 2)
3652 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3653 }
3654
3655 if (optimize_size)
3656 ix86_cost = &ix86_size_cost;
3657 else
3658 ix86_cost = processor_target_table[ix86_tune].cost;
3659
3660 /* Arrange to set up i386_stack_locals for all functions. */
3661 init_machine_status = ix86_init_machine_status;
3662
3663 /* Validate -mregparm= value. */
3664 if (global_options_set.x_ix86_regparm)
3665 {
3666 if (TARGET_64BIT)
3667 warning (0, "-mregparm is ignored in 64-bit mode");
3668 if (ix86_regparm > REGPARM_MAX)
3669 {
3670 error ("-mregparm=%d is not between 0 and %d",
3671 ix86_regparm, REGPARM_MAX);
3672 ix86_regparm = 0;
3673 }
3674 }
3675 if (TARGET_64BIT)
3676 ix86_regparm = REGPARM_MAX;
3677
3678 /* Default align_* from the processor table. */
3679 if (align_loops == 0)
3680 {
3681 align_loops = processor_target_table[ix86_tune].align_loop;
3682 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3683 }
3684 if (align_jumps == 0)
3685 {
3686 align_jumps = processor_target_table[ix86_tune].align_jump;
3687 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3688 }
3689 if (align_functions == 0)
3690 {
3691 align_functions = processor_target_table[ix86_tune].align_func;
3692 }
3693
3694 /* Provide default for -mbranch-cost= value. */
3695 if (!global_options_set.x_ix86_branch_cost)
3696 ix86_branch_cost = ix86_cost->branch_cost;
3697
3698 if (TARGET_64BIT)
3699 {
3700 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3701
3702 /* Enable by default the SSE and MMX builtins. Do allow the user to
3703 explicitly disable any of these. In particular, disabling SSE and
3704 MMX for kernel code is extremely useful. */
3705 if (!ix86_arch_specified)
3706 ix86_isa_flags
3707 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3708 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3709
3710 if (TARGET_RTD)
3711 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3712 }
3713 else
3714 {
3715 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3716
3717 if (!ix86_arch_specified)
3718 ix86_isa_flags
3719 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3720
3721 /* The i386 ABI does not specify a red zone. It still makes sense to use
3722 one when the programmer takes care to keep the stack from being clobbered. */
3723 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3724 target_flags |= MASK_NO_RED_ZONE;
3725 }
3726
3727 /* Keep nonleaf frame pointers. */
3728 if (flag_omit_frame_pointer)
3729 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3730 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3731 flag_omit_frame_pointer = 1;
3732
3733 /* If we're doing fast math, we don't care about comparison order
3734 wrt NaNs. This lets us use a shorter comparison sequence. */
3735 if (flag_finite_math_only)
3736 target_flags &= ~MASK_IEEE_FP;
3737
3738 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3739 since the insns won't need emulation. */
3740 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3741 target_flags &= ~MASK_NO_FANCY_MATH_387;
3742
3743 /* Likewise, if the target doesn't have a 387, or we've specified
3744 software floating point, don't use 387 inline intrinsics. */
3745 if (!TARGET_80387)
3746 target_flags |= MASK_NO_FANCY_MATH_387;
3747
3748 /* Turn on MMX builtins for -msse. */
3749 if (TARGET_SSE)
3750 {
3751 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3752 x86_prefetch_sse = true;
3753 }
3754
3755 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3756 if (TARGET_SSE4_2 || TARGET_ABM)
3757 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3758
3759 /* Turn on lzcnt instruction for -mabm. */
3760 if (TARGET_ABM)
3761 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3762
3763 /* Validate -mpreferred-stack-boundary= value or default it to
3764 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3765 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3766 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3767 {
3768 int min = (TARGET_64BIT ? 4 : 2);
3769 int max = (TARGET_SEH ? 4 : 12);
3770
3771 if (ix86_preferred_stack_boundary_arg < min
3772 || ix86_preferred_stack_boundary_arg > max)
3773 {
3774 if (min == max)
3775 error ("-mpreferred-stack-boundary is not supported "
3776 "for this target");
3777 else
3778 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3779 ix86_preferred_stack_boundary_arg, min, max);
3780 }
3781 else
3782 ix86_preferred_stack_boundary
3783 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3784 }
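/* E.g. (illustrative) -mpreferred-stack-boundary=4 gives
   (1 << 4) * BITS_PER_UNIT = 128 bits, i.e. a 16-byte aligned stack.  */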
3785
3786 /* Set the default value for -mstackrealign. */
3787 if (ix86_force_align_arg_pointer == -1)
3788 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3789
3790 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3791
3792 /* Validate -mincoming-stack-boundary= value or default it to
3793 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3794 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3795 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3796 {
3797 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3798 || ix86_incoming_stack_boundary_arg > 12)
3799 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3800 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3801 else
3802 {
3803 ix86_user_incoming_stack_boundary
3804 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3805 ix86_incoming_stack_boundary
3806 = ix86_user_incoming_stack_boundary;
3807 }
3808 }
3809
3810 /* Accept -msseregparm only if at least SSE support is enabled. */
3811 if (TARGET_SSEREGPARM
3812 && ! TARGET_SSE)
3813 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3814
3815 if (global_options_set.x_ix86_fpmath)
3816 {
3817 if (ix86_fpmath & FPMATH_SSE)
3818 {
3819 if (!TARGET_SSE)
3820 {
3821 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3822 ix86_fpmath = FPMATH_387;
3823 }
3824 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3825 {
3826 warning (0, "387 instruction set disabled, using SSE arithmetics");
3827 ix86_fpmath = FPMATH_SSE;
3828 }
3829 }
3830 }
3831 else
3832 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3833
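/* Illustrative: -mfpmath=sse without any SSE-enabling option is downgraded
   above to 387 arithmetic with a warning; likewise a combined 387+SSE
   fpmath setting without an 80387 falls back to SSE only.  */
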
3834 /* If the i387 is disabled, then do not return values in it. */
3835 if (!TARGET_80387)
3836 target_flags &= ~MASK_FLOAT_RETURNS;
3837
3838 /* Use external vectorized library in vectorizing intrinsics. */
3839 if (global_options_set.x_ix86_veclibabi_type)
3840 switch (ix86_veclibabi_type)
3841 {
3842 case ix86_veclibabi_type_svml:
3843 ix86_veclib_handler = ix86_veclibabi_svml;
3844 break;
3845
3846 case ix86_veclibabi_type_acml:
3847 ix86_veclib_handler = ix86_veclibabi_acml;
3848 break;
3849
3850 default:
3851 gcc_unreachable ();
3852 }
3853
3854 if ((!USE_IX86_FRAME_POINTER
3855 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3856 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3857 && !optimize_size)
3858 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3859
3860 /* ??? Unwind info is not correct around the CFG unless either a frame
3861 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3862 unwind info generation to be aware of the CFG and propagating states
3863 around edges. */
3864 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3865 || flag_exceptions || flag_non_call_exceptions)
3866 && flag_omit_frame_pointer
3867 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3868 {
3869 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3870 warning (0, "unwind tables currently require either a frame pointer "
3871 "or %saccumulate-outgoing-args%s for correctness",
3872 prefix, suffix);
3873 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3874 }
3875
3876 /* If stack probes are required, the space used for large function
3877 arguments on the stack must also be probed, so enable
3878 -maccumulate-outgoing-args so this happens in the prologue. */
3879 if (TARGET_STACK_PROBE
3880 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3881 {
3882 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3883 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3884 "for correctness", prefix, suffix);
3885 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3886 }
3887
3888 /* For sane SSE instruction set generation we need the fcomi instruction.
3889 It is safe to enable all CMOVE instructions. Also, the RDRAND intrinsic
3890 expands to a sequence that includes a conditional move. */
3891 if (TARGET_SSE || TARGET_RDRND)
3892 TARGET_CMOVE = 1;
3893
3894 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3895 {
3896 char *p;
3897 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3898 p = strchr (internal_label_prefix, 'X');
3899 internal_label_prefix_len = p - internal_label_prefix;
3900 *p = '\0';
3901 }
3902
3903 /* When a scheduling description is not available, disable the scheduler
3904 pass so it won't slow down compilation or make x87 code slower. */
3905 if (!TARGET_SCHEDULE)
3906 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3907
3908 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3909 ix86_cost->simultaneous_prefetches,
3910 global_options.x_param_values,
3911 global_options_set.x_param_values);
3912 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3913 global_options.x_param_values,
3914 global_options_set.x_param_values);
3915 maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3916 global_options.x_param_values,
3917 global_options_set.x_param_values);
3918 maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3919 global_options.x_param_values,
3920 global_options_set.x_param_values);
3921
3922 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3923 if (flag_prefetch_loop_arrays < 0
3924 && HAVE_prefetch
3925 && optimize >= 3
3926 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3927 flag_prefetch_loop_arrays = 1;
3928
3929 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3930 can be optimized to ap = __builtin_next_arg (0). */
3931 if (!TARGET_64BIT && !flag_split_stack)
3932 targetm.expand_builtin_va_start = NULL;
3933
3934 if (TARGET_64BIT)
3935 {
3936 ix86_gen_leave = gen_leave_rex64;
3937 ix86_gen_add3 = gen_adddi3;
3938 ix86_gen_sub3 = gen_subdi3;
3939 ix86_gen_sub3_carry = gen_subdi3_carry;
3940 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3941 ix86_gen_monitor = gen_sse3_monitor64;
3942 ix86_gen_andsp = gen_anddi3;
3943 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3944 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3945 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3946 }
3947 else
3948 {
3949 ix86_gen_leave = gen_leave;
3950 ix86_gen_add3 = gen_addsi3;
3951 ix86_gen_sub3 = gen_subsi3;
3952 ix86_gen_sub3_carry = gen_subsi3_carry;
3953 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3954 ix86_gen_monitor = gen_sse3_monitor;
3955 ix86_gen_andsp = gen_andsi3;
3956 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3957 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3958 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3959 }
3960
3961 #ifdef USE_IX86_CLD
3962 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3963 if (!TARGET_64BIT)
3964 target_flags |= MASK_CLD & ~target_flags_explicit;
3965 #endif
3966
3967 if (!TARGET_64BIT && flag_pic)
3968 {
3969 if (flag_fentry > 0)
3970 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3971 "with -fpic");
3972 flag_fentry = 0;
3973 }
3974 else if (TARGET_SEH)
3975 {
3976 if (flag_fentry == 0)
3977 sorry ("-mno-fentry isn%'t compatible with SEH");
3978 flag_fentry = 1;
3979 }
3980 else if (flag_fentry < 0)
3981 {
3982 #if defined(PROFILE_BEFORE_PROLOGUE)
3983 flag_fentry = 1;
3984 #else
3985 flag_fentry = 0;
3986 #endif
3987 }
3988
3989 if (TARGET_AVX)
3990 {
3991 /* When not optimizing for size, enable the vzeroupper optimization for
3992 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3993 AVX unaligned loads/stores. */
3994 if (!optimize_size)
3995 {
3996 if (flag_expensive_optimizations
3997 && !(target_flags_explicit & MASK_VZEROUPPER))
3998 target_flags |= MASK_VZEROUPPER;
3999 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
4000 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4001 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4002 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
4003 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4004 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4005 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
4006 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
4007 target_flags |= MASK_PREFER_AVX128;
4008 }
4009 }
4010 else
4011 {
4012 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
4013 target_flags &= ~MASK_VZEROUPPER;
4014 }
4015
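/* Illustrative: -mrecip=all,!sqrt walks the comma-separated list below,
   first setting RECIP_MASK_ALL and then clearing RECIP_MASK_SQRT because
   of the leading '!'.  */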
4016 if (ix86_recip_name)
4017 {
4018 char *p = ASTRDUP (ix86_recip_name);
4019 char *q;
4020 unsigned int mask, i;
4021 bool invert;
4022
4023 while ((q = strtok (p, ",")) != NULL)
4024 {
4025 p = NULL;
4026 if (*q == '!')
4027 {
4028 invert = true;
4029 q++;
4030 }
4031 else
4032 invert = false;
4033
4034 if (!strcmp (q, "default"))
4035 mask = RECIP_MASK_ALL;
4036 else
4037 {
4038 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4039 if (!strcmp (q, recip_options[i].string))
4040 {
4041 mask = recip_options[i].mask;
4042 break;
4043 }
4044
4045 if (i == ARRAY_SIZE (recip_options))
4046 {
4047 error ("unknown option for -mrecip=%s", q);
4048 invert = false;
4049 mask = RECIP_MASK_NONE;
4050 }
4051 }
4052
4053 recip_mask_explicit |= mask;
4054 if (invert)
4055 recip_mask &= ~mask;
4056 else
4057 recip_mask |= mask;
4058 }
4059 }
4060
4061 if (TARGET_RECIP)
4062 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
4063 else if (target_flags_explicit & MASK_RECIP)
4064 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
4065
4066 /* Save the initial options in case the user does function specific
4067 options. */
4068 if (main_args_p)
4069 target_option_default_node = target_option_current_node
4070 = build_target_option_node ();
4071 }
4072
4073 /* Return TRUE if VAL is passed in a register with a 256bit AVX mode. */
4074
4075 static bool
4076 function_pass_avx256_p (const_rtx val)
4077 {
4078 if (!val)
4079 return false;
4080
4081 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
4082 return true;
4083
4084 if (GET_CODE (val) == PARALLEL)
4085 {
4086 int i;
4087 rtx r;
4088
4089 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
4090 {
4091 r = XVECEXP (val, 0, i);
4092 if (GET_CODE (r) == EXPR_LIST
4093 && XEXP (r, 0)
4094 && REG_P (XEXP (r, 0))
4095 && (GET_MODE (XEXP (r, 0)) == OImode
4096 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
4097 return true;
4098 }
4099 }
4100
4101 return false;
4102 }
4103
4104 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4105
4106 static void
4107 ix86_option_override (void)
4108 {
4109 ix86_option_override_internal (true);
4110 }
4111
4112 /* Update register usage after having seen the compiler flags. */
4113
4114 static void
4115 ix86_conditional_register_usage (void)
4116 {
4117 int i;
4118 unsigned int j;
4119
4120 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4121 {
4122 if (fixed_regs[i] > 1)
4123 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
4124 if (call_used_regs[i] > 1)
4125 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
4126 }
4127
4128 /* The PIC register, if it exists, is fixed. */
4129 j = PIC_OFFSET_TABLE_REGNUM;
4130 if (j != INVALID_REGNUM)
4131 fixed_regs[j] = call_used_regs[j] = 1;
4132
4133 /* The 64-bit MS_ABI changes the set of call-used registers. */
4134 if (TARGET_64BIT_MS_ABI)
4135 {
4136 call_used_regs[SI_REG] = 0;
4137 call_used_regs[DI_REG] = 0;
4138 call_used_regs[XMM6_REG] = 0;
4139 call_used_regs[XMM7_REG] = 0;
4140 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4141 call_used_regs[i] = 0;
4142 }
4143
4144 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
4145 other call-clobbered regs for 64-bit. */
4146 if (TARGET_64BIT)
4147 {
4148 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4149
4150 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4151 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4152 && call_used_regs[i])
4153 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4154 }
4155
4156 /* If MMX is disabled, squash the registers. */
4157 if (! TARGET_MMX)
4158 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4159 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4160 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4161
4162 /* If SSE is disabled, squash the registers. */
4163 if (! TARGET_SSE)
4164 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4165 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4166 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4167
4168 /* If the FPU is disabled, squash the registers. */
4169 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4170 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4171 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4172 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4173
4174 /* If 32-bit, squash the 64-bit registers. */
4175 if (! TARGET_64BIT)
4176 {
4177 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4178 reg_names[i] = "";
4179 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4180 reg_names[i] = "";
4181 }
4182 }
4183
4184 \f
4185 /* Save the current options */
4186
4187 static void
4188 ix86_function_specific_save (struct cl_target_option *ptr)
4189 {
4190 ptr->arch = ix86_arch;
4191 ptr->schedule = ix86_schedule;
4192 ptr->tune = ix86_tune;
4193 ptr->branch_cost = ix86_branch_cost;
4194 ptr->tune_defaulted = ix86_tune_defaulted;
4195 ptr->arch_specified = ix86_arch_specified;
4196 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4197 ptr->ix86_target_flags_explicit = target_flags_explicit;
4198 ptr->x_recip_mask_explicit = recip_mask_explicit;
4199
4200 /* The fields are char but the variables are not; make sure the
4201 values fit in the fields. */
4202 gcc_assert (ptr->arch == ix86_arch);
4203 gcc_assert (ptr->schedule == ix86_schedule);
4204 gcc_assert (ptr->tune == ix86_tune);
4205 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4206 }
4207
4208 /* Restore the current options */
4209
4210 static void
4211 ix86_function_specific_restore (struct cl_target_option *ptr)
4212 {
4213 enum processor_type old_tune = ix86_tune;
4214 enum processor_type old_arch = ix86_arch;
4215 unsigned int ix86_arch_mask, ix86_tune_mask;
4216 int i;
4217
4218 ix86_arch = (enum processor_type) ptr->arch;
4219 ix86_schedule = (enum attr_cpu) ptr->schedule;
4220 ix86_tune = (enum processor_type) ptr->tune;
4221 ix86_branch_cost = ptr->branch_cost;
4222 ix86_tune_defaulted = ptr->tune_defaulted;
4223 ix86_arch_specified = ptr->arch_specified;
4224 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4225 target_flags_explicit = ptr->ix86_target_flags_explicit;
4226 recip_mask_explicit = ptr->x_recip_mask_explicit;
4227
4228 /* Recreate the arch feature tests if the arch changed */
4229 if (old_arch != ix86_arch)
4230 {
4231 ix86_arch_mask = 1u << ix86_arch;
4232 for (i = 0; i < X86_ARCH_LAST; ++i)
4233 ix86_arch_features[i]
4234 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4235 }
4236
4237 /* Recreate the tune optimization tests */
4238 if (old_tune != ix86_tune)
4239 {
4240 ix86_tune_mask = 1u << ix86_tune;
4241 for (i = 0; i < X86_TUNE_LAST; ++i)
4242 ix86_tune_features[i]
4243 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4244 }
4245 }
4246
4247 /* Print the current options */
4248
4249 static void
4250 ix86_function_specific_print (FILE *file, int indent,
4251 struct cl_target_option *ptr)
4252 {
4253 char *target_string
4254 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4255 NULL, NULL, ptr->x_ix86_fpmath, false);
4256
4257 fprintf (file, "%*sarch = %d (%s)\n",
4258 indent, "",
4259 ptr->arch,
4260 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4261 ? cpu_names[ptr->arch]
4262 : "<unknown>"));
4263
4264 fprintf (file, "%*stune = %d (%s)\n",
4265 indent, "",
4266 ptr->tune,
4267 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4268 ? cpu_names[ptr->tune]
4269 : "<unknown>"));
4270
4271 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4272
4273 if (target_string)
4274 {
4275 fprintf (file, "%*s%s\n", indent, "", target_string);
4276 free (target_string);
4277 }
4278 }
4279
4280 \f
4281 /* Inner function to process the attribute((target(...))), take an argument and
4282 set the current options from the argument. If we have a list, recursively go
4283 over the list. */
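/* For example (illustrative), a declaration such as
   int foo (void) __attribute__ ((target ("sse4.2,no-avx,arch=core2")));
   reaches this function as the string "sse4.2,no-avx,arch=core2"; each
   comma-separated piece is matched against the attrs[] table below, with a
   "no-" prefix negating the option.  */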
4284
4285 static bool
4286 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4287 struct gcc_options *enum_opts_set)
4288 {
4289 char *next_optstr;
4290 bool ret = true;
4291
4292 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4293 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4294 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4295 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4296 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4297
4298 enum ix86_opt_type
4299 {
4300 ix86_opt_unknown,
4301 ix86_opt_yes,
4302 ix86_opt_no,
4303 ix86_opt_str,
4304 ix86_opt_enum,
4305 ix86_opt_isa
4306 };
4307
4308 static const struct
4309 {
4310 const char *string;
4311 size_t len;
4312 enum ix86_opt_type type;
4313 int opt;
4314 int mask;
4315 } attrs[] = {
4316 /* isa options */
4317 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4318 IX86_ATTR_ISA ("abm", OPT_mabm),
4319 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4320 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4321 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4322 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4323 IX86_ATTR_ISA ("aes", OPT_maes),
4324 IX86_ATTR_ISA ("avx", OPT_mavx),
4325 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4326 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4327 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4328 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4329 IX86_ATTR_ISA ("sse", OPT_msse),
4330 IX86_ATTR_ISA ("sse2", OPT_msse2),
4331 IX86_ATTR_ISA ("sse3", OPT_msse3),
4332 IX86_ATTR_ISA ("sse4", OPT_msse4),
4333 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4334 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4335 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4336 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4337 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4338 IX86_ATTR_ISA ("fma", OPT_mfma),
4339 IX86_ATTR_ISA ("xop", OPT_mxop),
4340 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4341 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4342 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4343 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4344
4345 /* enum options */
4346 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4347
4348 /* string options */
4349 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4350 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4351
4352 /* flag options */
4353 IX86_ATTR_YES ("cld",
4354 OPT_mcld,
4355 MASK_CLD),
4356
4357 IX86_ATTR_NO ("fancy-math-387",
4358 OPT_mfancy_math_387,
4359 MASK_NO_FANCY_MATH_387),
4360
4361 IX86_ATTR_YES ("ieee-fp",
4362 OPT_mieee_fp,
4363 MASK_IEEE_FP),
4364
4365 IX86_ATTR_YES ("inline-all-stringops",
4366 OPT_minline_all_stringops,
4367 MASK_INLINE_ALL_STRINGOPS),
4368
4369 IX86_ATTR_YES ("inline-stringops-dynamically",
4370 OPT_minline_stringops_dynamically,
4371 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4372
4373 IX86_ATTR_NO ("align-stringops",
4374 OPT_mno_align_stringops,
4375 MASK_NO_ALIGN_STRINGOPS),
4376
4377 IX86_ATTR_YES ("recip",
4378 OPT_mrecip,
4379 MASK_RECIP),
4380
4381 };
4382
4383 /* If this is a list, recurse to get the options. */
4384 if (TREE_CODE (args) == TREE_LIST)
4385 {
4386 bool ret = true;
4387
4388 for (; args; args = TREE_CHAIN (args))
4389 if (TREE_VALUE (args)
4390 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4391 p_strings, enum_opts_set))
4392 ret = false;
4393
4394 return ret;
4395 }
4396
4397 else if (TREE_CODE (args) != STRING_CST)
4398 gcc_unreachable ();
4399
4400 /* Handle multiple arguments separated by commas. */
4401 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4402
4403 while (next_optstr && *next_optstr != '\0')
4404 {
4405 char *p = next_optstr;
4406 char *orig_p = p;
4407 char *comma = strchr (next_optstr, ',');
4408 const char *opt_string;
4409 size_t len, opt_len;
4410 int opt;
4411 bool opt_set_p;
4412 char ch;
4413 unsigned i;
4414 enum ix86_opt_type type = ix86_opt_unknown;
4415 int mask = 0;
4416
4417 if (comma)
4418 {
4419 *comma = '\0';
4420 len = comma - next_optstr;
4421 next_optstr = comma + 1;
4422 }
4423 else
4424 {
4425 len = strlen (p);
4426 next_optstr = NULL;
4427 }
4428
4429 /* Recognize no-xxx. */
4430 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4431 {
4432 opt_set_p = false;
4433 p += 3;
4434 len -= 3;
4435 }
4436 else
4437 opt_set_p = true;
4438
4439 /* Find the option. */
4440 ch = *p;
4441 opt = N_OPTS;
4442 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4443 {
4444 type = attrs[i].type;
4445 opt_len = attrs[i].len;
4446 if (ch == attrs[i].string[0]
4447 && ((type != ix86_opt_str && type != ix86_opt_enum)
4448 ? len == opt_len
4449 : len > opt_len)
4450 && memcmp (p, attrs[i].string, opt_len) == 0)
4451 {
4452 opt = attrs[i].opt;
4453 mask = attrs[i].mask;
4454 opt_string = attrs[i].string;
4455 break;
4456 }
4457 }
4458
4459 /* Process the option. */
4460 if (opt == N_OPTS)
4461 {
4462 error ("attribute(target(\"%s\")) is unknown", orig_p);
4463 ret = false;
4464 }
4465
4466 else if (type == ix86_opt_isa)
4467 {
4468 struct cl_decoded_option decoded;
4469
4470 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4471 ix86_handle_option (&global_options, &global_options_set,
4472 &decoded, input_location);
4473 }
4474
4475 else if (type == ix86_opt_yes || type == ix86_opt_no)
4476 {
4477 if (type == ix86_opt_no)
4478 opt_set_p = !opt_set_p;
4479
4480 if (opt_set_p)
4481 target_flags |= mask;
4482 else
4483 target_flags &= ~mask;
4484 }
4485
4486 else if (type == ix86_opt_str)
4487 {
4488 if (p_strings[opt])
4489 {
4490 error ("option(\"%s\") was already specified", opt_string);
4491 ret = false;
4492 }
4493 else
4494 p_strings[opt] = xstrdup (p + opt_len);
4495 }
4496
4497 else if (type == ix86_opt_enum)
4498 {
4499 bool arg_ok;
4500 int value;
4501
4502 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4503 if (arg_ok)
4504 set_option (&global_options, enum_opts_set, opt, value,
4505 p + opt_len, DK_UNSPECIFIED, input_location,
4506 global_dc);
4507 else
4508 {
4509 error ("attribute(target(\"%s\")) is unknown", orig_p);
4510 ret = false;
4511 }
4512 }
4513
4514 else
4515 gcc_unreachable ();
4516 }
4517
4518 return ret;
4519 }
4520
4521 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4522
4523 tree
4524 ix86_valid_target_attribute_tree (tree args)
4525 {
4526 const char *orig_arch_string = ix86_arch_string;
4527 const char *orig_tune_string = ix86_tune_string;
4528 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4529 int orig_tune_defaulted = ix86_tune_defaulted;
4530 int orig_arch_specified = ix86_arch_specified;
4531 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4532 tree t = NULL_TREE;
4533 int i;
4534 struct cl_target_option *def
4535 = TREE_TARGET_OPTION (target_option_default_node);
4536 struct gcc_options enum_opts_set;
4537
4538 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4539
4540 /* Process each of the options on the chain. */
4541 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4542 &enum_opts_set))
4543 return NULL_TREE;
4544
4545 /* If the changed options are different from the default, rerun
4546 ix86_option_override_internal, and then save the options away.
4547 The string options are attribute options, and will be undone
4548 when we copy the save structure. */
4549 if (ix86_isa_flags != def->x_ix86_isa_flags
4550 || target_flags != def->x_target_flags
4551 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4552 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4553 || enum_opts_set.x_ix86_fpmath)
4554 {
4555 /* If we are using the default tune= or arch=, undo the string assigned,
4556 and use the default. */
4557 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4558 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4559 else if (!orig_arch_specified)
4560 ix86_arch_string = NULL;
4561
4562 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4563 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4564 else if (orig_tune_defaulted)
4565 ix86_tune_string = NULL;
4566
4567 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4568 if (enum_opts_set.x_ix86_fpmath)
4569 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4570 else if (!TARGET_64BIT && TARGET_SSE)
4571 {
4572 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4573 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4574 }
4575
4576 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4577 ix86_option_override_internal (false);
4578
4579 /* Add any builtin functions with the new isa if any. */
4580 ix86_add_new_builtins (ix86_isa_flags);
4581
4582 /* Save the current options unless we are validating options for
4583 #pragma. */
4584 t = build_target_option_node ();
4585
4586 ix86_arch_string = orig_arch_string;
4587 ix86_tune_string = orig_tune_string;
4588 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4589
4590 /* Free up memory allocated to hold the strings */
4591 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4592 free (option_strings[i]);
4593 }
4594
4595 return t;
4596 }
4597
4598 /* Hook to validate attribute((target("string"))). */
4599
4600 static bool
4601 ix86_valid_target_attribute_p (tree fndecl,
4602 tree ARG_UNUSED (name),
4603 tree args,
4604 int ARG_UNUSED (flags))
4605 {
4606 struct cl_target_option cur_target;
4607 bool ret = true;
4608 tree old_optimize = build_optimization_node ();
4609 tree new_target, new_optimize;
4610 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4611
4612 /* If the function changed the optimization levels as well as setting target
4613 options, start with the optimizations specified. */
4614 if (func_optimize && func_optimize != old_optimize)
4615 cl_optimization_restore (&global_options,
4616 TREE_OPTIMIZATION (func_optimize));
4617
4618 /* The target attributes may also change some optimization flags, so update
4619 the optimization options if necessary. */
4620 cl_target_option_save (&cur_target, &global_options);
4621 new_target = ix86_valid_target_attribute_tree (args);
4622 new_optimize = build_optimization_node ();
4623
4624 if (!new_target)
4625 ret = false;
4626
4627 else if (fndecl)
4628 {
4629 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4630
4631 if (old_optimize != new_optimize)
4632 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4633 }
4634
4635 cl_target_option_restore (&global_options, &cur_target);
4636
4637 if (old_optimize != new_optimize)
4638 cl_optimization_restore (&global_options,
4639 TREE_OPTIMIZATION (old_optimize));
4640
4641 return ret;
4642 }
4643
4644 \f
4645 /* Hook to determine if one function can safely inline another. */
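/* Illustrative: a callee carrying attribute((target("avx"))) whose ISA
   flags are not a subset of its caller's is rejected below, so a caller
   compiled without -mavx cannot inline it.  */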
4646
4647 static bool
4648 ix86_can_inline_p (tree caller, tree callee)
4649 {
4650 bool ret = false;
4651 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4652 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4653
4654 /* If callee has no option attributes, then it is ok to inline. */
4655 if (!callee_tree)
4656 ret = true;
4657
4658 /* If caller has no option attributes, but callee does then it is not ok to
4659 inline. */
4660 else if (!caller_tree)
4661 ret = false;
4662
4663 else
4664 {
4665 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4666 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4667
4668 /* Callee's isa options should be a subset of the caller's, i.e. an SSE4
4669 function can inline an SSE2 function but an SSE2 function can't inline
4670 an SSE4 function. */
4671 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4672 != callee_opts->x_ix86_isa_flags)
4673 ret = false;
4674
4675 /* See if we have the same non-isa options. */
4676 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4677 ret = false;
4678
4679 /* See if arch, tune, etc. are the same. */
4680 else if (caller_opts->arch != callee_opts->arch)
4681 ret = false;
4682
4683 else if (caller_opts->tune != callee_opts->tune)
4684 ret = false;
4685
4686 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4687 ret = false;
4688
4689 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4690 ret = false;
4691
4692 else
4693 ret = true;
4694 }
4695
4696 return ret;
4697 }
4698
4699 \f
4700 /* Remember the last target of ix86_set_current_function. */
4701 static GTY(()) tree ix86_previous_fndecl;
4702
4703 /* Establish appropriate back-end context for processing the function
4704 FNDECL. The argument might be NULL to indicate processing at top
4705 level, outside of any function scope. */
4706 static void
4707 ix86_set_current_function (tree fndecl)
4708 {
4709 /* Only change the context if the function changes. This hook is called
4710 several times in the course of compiling a function, and we don't want to
4711 slow things down too much or call target_reinit when it isn't safe. */
4712 if (fndecl && fndecl != ix86_previous_fndecl)
4713 {
4714 tree old_tree = (ix86_previous_fndecl
4715 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4716 : NULL_TREE);
4717
4718 tree new_tree = (fndecl
4719 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4720 : NULL_TREE);
4721
4722 ix86_previous_fndecl = fndecl;
4723 if (old_tree == new_tree)
4724 ;
4725
4726 else if (new_tree)
4727 {
4728 cl_target_option_restore (&global_options,
4729 TREE_TARGET_OPTION (new_tree));
4730 target_reinit ();
4731 }
4732
4733 else if (old_tree)
4734 {
4735 struct cl_target_option *def
4736 = TREE_TARGET_OPTION (target_option_current_node);
4737
4738 cl_target_option_restore (&global_options, def);
4739 target_reinit ();
4740 }
4741 }
4742 }
4743
4744 \f
4745 /* Return true if this goes in large data/bss. */
4746
4747 static bool
4748 ix86_in_large_data_p (tree exp)
4749 {
4750 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4751 return false;
4752
4753 /* Functions are never large data. */
4754 if (TREE_CODE (exp) == FUNCTION_DECL)
4755 return false;
4756
4757 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4758 {
4759 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4760 if (strcmp (section, ".ldata") == 0
4761 || strcmp (section, ".lbss") == 0)
4762 return true;
4763 return false;
4764 }
4765 else
4766 {
4767 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4768
4769 /* If this is an incomplete type with size 0, then we can't put it
4770 in data because it might be too big when completed. */
4771 if (!size || size > ix86_section_threshold)
4772 return true;
4773 }
4774
4775 return false;
4776 }
4777
4778 /* Switch to the appropriate section for output of DECL.
4779 DECL is either a `VAR_DECL' node or a constant of some sort.
4780 RELOC indicates whether forming the initial value of DECL requires
4781 link-time relocations. */
4782
4783 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4784 ATTRIBUTE_UNUSED;
4785
4786 static section *
4787 x86_64_elf_select_section (tree decl, int reloc,
4788 unsigned HOST_WIDE_INT align)
4789 {
4790 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4791 && ix86_in_large_data_p (decl))
4792 {
4793 const char *sname = NULL;
4794 unsigned int flags = SECTION_WRITE;
4795 switch (categorize_decl_for_section (decl, reloc))
4796 {
4797 case SECCAT_DATA:
4798 sname = ".ldata";
4799 break;
4800 case SECCAT_DATA_REL:
4801 sname = ".ldata.rel";
4802 break;
4803 case SECCAT_DATA_REL_LOCAL:
4804 sname = ".ldata.rel.local";
4805 break;
4806 case SECCAT_DATA_REL_RO:
4807 sname = ".ldata.rel.ro";
4808 break;
4809 case SECCAT_DATA_REL_RO_LOCAL:
4810 sname = ".ldata.rel.ro.local";
4811 break;
4812 case SECCAT_BSS:
4813 sname = ".lbss";
4814 flags |= SECTION_BSS;
4815 break;
4816 case SECCAT_RODATA:
4817 case SECCAT_RODATA_MERGE_STR:
4818 case SECCAT_RODATA_MERGE_STR_INIT:
4819 case SECCAT_RODATA_MERGE_CONST:
4820 sname = ".lrodata";
4821 flags = 0;
4822 break;
4823 case SECCAT_SRODATA:
4824 case SECCAT_SDATA:
4825 case SECCAT_SBSS:
4826 gcc_unreachable ();
4827 case SECCAT_TEXT:
4828 case SECCAT_TDATA:
4829 case SECCAT_TBSS:
4830 /* We don't split these for the medium model. Place them into
4831 default sections and hope for the best. */
4832 break;
4833 }
4834 if (sname)
4835 {
4836 /* We might get called with string constants, but get_named_section
4837 doesn't like them as they are not DECLs. Also, we need to set
4838 flags in that case. */
4839 if (!DECL_P (decl))
4840 return get_section (sname, flags, NULL);
4841 return get_named_section (decl, sname, reloc);
4842 }
4843 }
4844 return default_elf_select_section (decl, reloc, align);
4845 }
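
/* Illustrative example (assumption, not taken from this file): under
   -mcmodel=medium with the default -mlarge-data-threshold, a definition
   such as
     static char big_buffer[1 << 20];
   exceeds ix86_section_threshold, is categorized as SECCAT_BSS above,
   and is therefore placed in .lbss rather than .bss.  */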
4846
4847 /* Build up a unique section name, expressed as a
4848 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4849 RELOC indicates whether the initial value of EXP requires
4850 link-time relocations. */
4851
4852 static void ATTRIBUTE_UNUSED
4853 x86_64_elf_unique_section (tree decl, int reloc)
4854 {
4855 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4856 && ix86_in_large_data_p (decl))
4857 {
4858 const char *prefix = NULL;
4859 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4860 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4861
4862 switch (categorize_decl_for_section (decl, reloc))
4863 {
4864 case SECCAT_DATA:
4865 case SECCAT_DATA_REL:
4866 case SECCAT_DATA_REL_LOCAL:
4867 case SECCAT_DATA_REL_RO:
4868 case SECCAT_DATA_REL_RO_LOCAL:
4869 prefix = one_only ? ".ld" : ".ldata";
4870 break;
4871 case SECCAT_BSS:
4872 prefix = one_only ? ".lb" : ".lbss";
4873 break;
4874 case SECCAT_RODATA:
4875 case SECCAT_RODATA_MERGE_STR:
4876 case SECCAT_RODATA_MERGE_STR_INIT:
4877 case SECCAT_RODATA_MERGE_CONST:
4878 prefix = one_only ? ".lr" : ".lrodata";
4879 break;
4880 case SECCAT_SRODATA:
4881 case SECCAT_SDATA:
4882 case SECCAT_SBSS:
4883 gcc_unreachable ();
4884 case SECCAT_TEXT:
4885 case SECCAT_TDATA:
4886 case SECCAT_TBSS:
4887 /* We don't split these for the medium model. Place them into
4888 default sections and hope for the best. */
4889 break;
4890 }
4891 if (prefix)
4892 {
4893 const char *name, *linkonce;
4894 char *string;
4895
4896 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4897 name = targetm.strip_name_encoding (name);
4898
4899 /* If we're using one_only, then there needs to be a .gnu.linkonce
4900 prefix to the section name. */
4901 linkonce = one_only ? ".gnu.linkonce" : "";
4902
4903 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4904
4905 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4906 return;
4907 }
4908 }
4909 default_unique_section (decl, reloc);
4910 }
4911
4912 #ifdef COMMON_ASM_OP
4913 /* This says how to output assembler code to declare an
4914 uninitialized external linkage data object.
4915
4916 For medium model x86-64 we need to use the .largecomm directive for
4917 large objects. */
4918 void
4919 x86_elf_aligned_common (FILE *file,
4920 const char *name, unsigned HOST_WIDE_INT size,
4921 int align)
4922 {
4923 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4924 && size > (unsigned int)ix86_section_threshold)
4925 fputs (".largecomm\t", file);
4926 else
4927 fputs (COMMON_ASM_OP, file);
4928 assemble_name (file, name);
4929 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4930 size, align / BITS_PER_UNIT);
4931 }
4932 #endif
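
/* Sketch of the expected assembler output (illustrative, not captured
   from a real build): a 1 MiB common symbol under -mcmodel=medium
   would be announced as
     .largecomm  buf,1048576,32
   whereas a small one keeps the usual .comm directive.  */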
4933
4934 /* Utility function for targets to use in implementing
4935 ASM_OUTPUT_ALIGNED_BSS. */
4936
4937 void
4938 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4939 const char *name, unsigned HOST_WIDE_INT size,
4940 int align)
4941 {
4942 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4943 && size > (unsigned int)ix86_section_threshold)
4944 switch_to_section (get_named_section (decl, ".lbss", 0));
4945 else
4946 switch_to_section (bss_section);
4947 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4948 #ifdef ASM_DECLARE_OBJECT_NAME
4949 last_assemble_variable_decl = decl;
4950 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4951 #else
4952 /* Standard thing is just to output a label for the object. */
4953 ASM_OUTPUT_LABEL (file, name);
4954 #endif /* ASM_DECLARE_OBJECT_NAME */
4955 ASM_OUTPUT_SKIP (file, size ? size : 1);
4956 }
4957 \f
4958 /* Decide whether we must probe the stack before any space allocation
4959 on this target. It's essentially TARGET_STACK_PROBE except when
4960 -fstack-check causes the stack to be already probed differently. */
4961
4962 bool
4963 ix86_target_stack_probe (void)
4964 {
4965 /* Do not probe the stack twice if static stack checking is enabled. */
4966 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4967 return false;
4968
4969 return TARGET_STACK_PROBE;
4970 }
4971 \f
4972 /* Decide whether we can make a sibling call to a function. DECL is the
4973 declaration of the function being targeted by the call and EXP is the
4974 CALL_EXPR representing the call. */
4975
4976 static bool
4977 ix86_function_ok_for_sibcall (tree decl, tree exp)
4978 {
4979 tree type, decl_or_type;
4980 rtx a, b;
4981
4982 /* If we are generating position-independent code, we cannot sibcall
4983 optimize any indirect call, or a direct call to a global function,
4984 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4985 if (!TARGET_MACHO
4986 && !TARGET_64BIT
4987 && flag_pic
4988 && (!decl || !targetm.binds_local_p (decl)))
4989 return false;
4990
4991 /* If we need to align the outgoing stack, then sibcalling would
4992 unalign the stack, which may break the called function. */
4993 if (ix86_minimum_incoming_stack_boundary (true)
4994 < PREFERRED_STACK_BOUNDARY)
4995 return false;
4996
4997 if (decl)
4998 {
4999 decl_or_type = decl;
5000 type = TREE_TYPE (decl);
5001 }
5002 else
5003 {
5004 /* We're looking at the CALL_EXPR, we need the type of the function. */
5005 type = CALL_EXPR_FN (exp); /* pointer expression */
5006 type = TREE_TYPE (type); /* pointer type */
5007 type = TREE_TYPE (type); /* function type */
5008 decl_or_type = type;
5009 }
5010
5011 /* Check that the return value locations are the same. For example,
5012 if we are returning floats on the 80387 register stack, we cannot
5013 make a sibcall from a function that doesn't return a float to a
5014 function that does or, conversely, from a function that does return
5015 a float to a function that doesn't; the necessary stack adjustment
5016 would not be executed. This is also the place we notice
5017 differences in the return value ABI. Note that it is ok for one
5018 of the functions to have void return type as long as the return
5019 value of the other is passed in a register. */
5020 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5021 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5022 cfun->decl, false);
5023 if (STACK_REG_P (a) || STACK_REG_P (b))
5024 {
5025 if (!rtx_equal_p (a, b))
5026 return false;
5027 }
5028 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5029 {
5030 /* Disable sibcall if we need to generate vzeroupper after
5031 callee returns. */
5032 if (TARGET_VZEROUPPER
5033 && cfun->machine->callee_return_avx256_p
5034 && !cfun->machine->caller_return_avx256_p)
5035 return false;
5036 }
5037 else if (!rtx_equal_p (a, b))
5038 return false;
5039
5040 if (TARGET_64BIT)
5041 {
5042 /* The SYSV ABI has more call-clobbered registers;
5043 disallow sibcalls from MS to SYSV. */
5044 if (cfun->machine->call_abi == MS_ABI
5045 && ix86_function_type_abi (type) == SYSV_ABI)
5046 return false;
5047 }
5048 else
5049 {
5050 /* If this call is indirect, we'll need to be able to use a
5051 call-clobbered register for the address of the target function.
5052 Make sure that all such registers are not used for passing
5053 parameters. Note that DLLIMPORT functions are indirect. */
5054 if (!decl
5055 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5056 {
5057 if (ix86_function_regparm (type, NULL) >= 3)
5058 {
5059 /* ??? Need to count the actual number of registers to be used,
5060 not the possible number of registers. Fix later. */
5061 return false;
5062 }
5063 }
5064 }
5065
5066 /* Otherwise okay. That also includes certain types of indirect calls. */
5067 return true;
5068 }
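
/* Illustrative example (assumption, not from this file): on 32-bit x86
   with -fpic,
     extern int bar (int);
     int foo (int x) { return bar (x); }
   is rejected by the first test above, because a call through the PLT
   needs %ebx to hold the GOT pointer, which a sibcall cannot
   guarantee.  */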
5069
5070 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5071 and "sseregparm" calling convention attributes;
5072 arguments as in struct attribute_spec.handler. */
5073
5074 static tree
5075 ix86_handle_cconv_attribute (tree *node, tree name,
5076 tree args,
5077 int flags ATTRIBUTE_UNUSED,
5078 bool *no_add_attrs)
5079 {
5080 if (TREE_CODE (*node) != FUNCTION_TYPE
5081 && TREE_CODE (*node) != METHOD_TYPE
5082 && TREE_CODE (*node) != FIELD_DECL
5083 && TREE_CODE (*node) != TYPE_DECL)
5084 {
5085 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5086 name);
5087 *no_add_attrs = true;
5088 return NULL_TREE;
5089 }
5090
5091 /* Can combine regparm with all attributes but fastcall and thiscall. */
5092 if (is_attribute_p ("regparm", name))
5093 {
5094 tree cst;
5095
5096 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5097 {
5098 error ("fastcall and regparm attributes are not compatible");
5099 }
5100
5101 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5102 {
5103 error ("regparm and thiscall attributes are not compatible");
5104 }
5105
5106 cst = TREE_VALUE (args);
5107 if (TREE_CODE (cst) != INTEGER_CST)
5108 {
5109 warning (OPT_Wattributes,
5110 "%qE attribute requires an integer constant argument",
5111 name);
5112 *no_add_attrs = true;
5113 }
5114 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5115 {
5116 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5117 name, REGPARM_MAX);
5118 *no_add_attrs = true;
5119 }
5120
5121 return NULL_TREE;
5122 }
5123
5124 if (TARGET_64BIT)
5125 {
5126 /* Do not warn when emulating the MS ABI. */
5127 if ((TREE_CODE (*node) != FUNCTION_TYPE
5128 && TREE_CODE (*node) != METHOD_TYPE)
5129 || ix86_function_type_abi (*node) != MS_ABI)
5130 warning (OPT_Wattributes, "%qE attribute ignored",
5131 name);
5132 *no_add_attrs = true;
5133 return NULL_TREE;
5134 }
5135
5136 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5137 if (is_attribute_p ("fastcall", name))
5138 {
5139 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5140 {
5141 error ("fastcall and cdecl attributes are not compatible");
5142 }
5143 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5144 {
5145 error ("fastcall and stdcall attributes are not compatible");
5146 }
5147 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5148 {
5149 error ("fastcall and regparm attributes are not compatible");
5150 }
5151 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5152 {
5153 error ("fastcall and thiscall attributes are not compatible");
5154 }
5155 }
5156
5157 /* Can combine stdcall with fastcall (redundant), regparm and
5158 sseregparm. */
5159 else if (is_attribute_p ("stdcall", name))
5160 {
5161 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5162 {
5163 error ("stdcall and cdecl attributes are not compatible");
5164 }
5165 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5166 {
5167 error ("stdcall and fastcall attributes are not compatible");
5168 }
5169 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5170 {
5171 error ("stdcall and thiscall attributes are not compatible");
5172 }
5173 }
5174
5175 /* Can combine cdecl with regparm and sseregparm. */
5176 else if (is_attribute_p ("cdecl", name))
5177 {
5178 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5179 {
5180 error ("stdcall and cdecl attributes are not compatible");
5181 }
5182 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5183 {
5184 error ("fastcall and cdecl attributes are not compatible");
5185 }
5186 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5187 {
5188 error ("cdecl and thiscall attributes are not compatible");
5189 }
5190 }
5191 else if (is_attribute_p ("thiscall", name))
5192 {
5193 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5194 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5195 name);
5196 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5197 {
5198 error ("stdcall and thiscall attributes are not compatible");
5199 }
5200 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5201 {
5202 error ("fastcall and thiscall attributes are not compatible");
5203 }
5204 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5205 {
5206 error ("cdecl and thiscall attributes are not compatible");
5207 }
5208 }
5209
5210 /* Can combine sseregparm with all attributes. */
5211
5212 return NULL_TREE;
5213 }
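
/* Examples of the checks above (illustrative):
     void f (int) __attribute__ ((fastcall, stdcall));
   triggers one of the "not compatible" errors, while
     void g (int, int) __attribute__ ((regparm (2)));
   is accepted and ARGS carries the INTEGER_CST 2.  */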
5214
5215 /* The transactional memory builtins are implicitly regparm or fastcall
5216 depending on the ABI. Override the generic do-nothing attribute that
5217 these builtins were declared with, and replace it with one of the two
5218 attributes that we expect elsewhere. */
5219
5220 static tree
5221 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5222 tree args ATTRIBUTE_UNUSED,
5223 int flags ATTRIBUTE_UNUSED,
5224 bool *no_add_attrs)
5225 {
5226 tree alt;
5227
5228 /* In no case do we want to add the placeholder attribute. */
5229 *no_add_attrs = true;
5230
5231 /* The 64-bit ABI is unchanged for transactional memory. */
5232 if (TARGET_64BIT)
5233 return NULL_TREE;
5234
5235 /* ??? Is there a better way to validate 32-bit Windows? We have
5236 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5237 if (CHECK_STACK_LIMIT > 0)
5238 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5239 else
5240 {
5241 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5242 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5243 }
5244 decl_attributes (node, alt, flags);
5245
5246 return NULL_TREE;
5247 }
5248
5249 /* This function determines from TYPE the calling-convention. */
5250
5251 unsigned int
5252 ix86_get_callcvt (const_tree type)
5253 {
5254 unsigned int ret = 0;
5255 bool is_stdarg;
5256 tree attrs;
5257
5258 if (TARGET_64BIT)
5259 return IX86_CALLCVT_CDECL;
5260
5261 attrs = TYPE_ATTRIBUTES (type);
5262 if (attrs != NULL_TREE)
5263 {
5264 if (lookup_attribute ("cdecl", attrs))
5265 ret |= IX86_CALLCVT_CDECL;
5266 else if (lookup_attribute ("stdcall", attrs))
5267 ret |= IX86_CALLCVT_STDCALL;
5268 else if (lookup_attribute ("fastcall", attrs))
5269 ret |= IX86_CALLCVT_FASTCALL;
5270 else if (lookup_attribute ("thiscall", attrs))
5271 ret |= IX86_CALLCVT_THISCALL;
5272
5273 /* Regparm isn't allowed for thiscall and fastcall. */
5274 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5275 {
5276 if (lookup_attribute ("regparm", attrs))
5277 ret |= IX86_CALLCVT_REGPARM;
5278 if (lookup_attribute ("sseregparm", attrs))
5279 ret |= IX86_CALLCVT_SSEREGPARM;
5280 }
5281
5282 if (IX86_BASE_CALLCVT(ret) != 0)
5283 return ret;
5284 }
5285
5286 is_stdarg = stdarg_p (type);
5287 if (TARGET_RTD && !is_stdarg)
5288 return IX86_CALLCVT_STDCALL | ret;
5289
5290 if (ret != 0
5291 || is_stdarg
5292 || TREE_CODE (type) != METHOD_TYPE
5293 || ix86_function_type_abi (type) != MS_ABI)
5294 return IX86_CALLCVT_CDECL | ret;
5295
5296 return IX86_CALLCVT_THISCALL;
5297 }
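
/* Quick sketch of the mapping above (illustrative): a plain
     int f (int);
   prototype yields IX86_CALLCVT_CDECL on 32-bit targets, the same
   prototype under -mrtd yields IX86_CALLCVT_STDCALL, and an
   unattributed C++ METHOD_TYPE under the MS ABI falls through to
   IX86_CALLCVT_THISCALL.  */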
5298
5299 /* Return 0 if the attributes for two types are incompatible, 1 if they
5300 are compatible, and 2 if they are nearly compatible (which causes a
5301 warning to be generated). */
5302
5303 static int
5304 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5305 {
5306 unsigned int ccvt1, ccvt2;
5307
5308 if (TREE_CODE (type1) != FUNCTION_TYPE
5309 && TREE_CODE (type1) != METHOD_TYPE)
5310 return 1;
5311
5312 ccvt1 = ix86_get_callcvt (type1);
5313 ccvt2 = ix86_get_callcvt (type2);
5314 if (ccvt1 != ccvt2)
5315 return 0;
5316 if (ix86_function_regparm (type1, NULL)
5317 != ix86_function_regparm (type2, NULL))
5318 return 0;
5319
5320 return 1;
5321 }
5322 \f
5323 /* Return the regparm value for a function with the indicated TYPE and DECL.
5324 DECL may be NULL when calling function indirectly
5325 or considering a libcall. */
5326
5327 static int
5328 ix86_function_regparm (const_tree type, const_tree decl)
5329 {
5330 tree attr;
5331 int regparm;
5332 unsigned int ccvt;
5333
5334 if (TARGET_64BIT)
5335 return (ix86_function_type_abi (type) == SYSV_ABI
5336 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5337 ccvt = ix86_get_callcvt (type);
5338 regparm = ix86_regparm;
5339
5340 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5341 {
5342 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5343 if (attr)
5344 {
5345 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5346 return regparm;
5347 }
5348 }
5349 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5350 return 2;
5351 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5352 return 1;
5353
5354 /* Use register calling convention for local functions when possible. */
5355 if (decl
5356 && TREE_CODE (decl) == FUNCTION_DECL
5357 && optimize
5358 && !(profile_flag && !flag_fentry))
5359 {
5360 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5361 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5362 if (i && i->local && i->can_change_signature)
5363 {
5364 int local_regparm, globals = 0, regno;
5365
5366 /* Make sure no regparm register is taken by a
5367 fixed register variable. */
5368 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5369 if (fixed_regs[local_regparm])
5370 break;
5371
5372 /* We don't want to use regparm(3) for nested functions as
5373 these use a static chain pointer in the third argument. */
5374 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5375 local_regparm = 2;
5376
5377 /* In 32-bit mode save a register for the split stack. */
5378 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5379 local_regparm = 2;
5380
5381 /* Each fixed register usage increases register pressure,
5382 so fewer registers should be used for argument passing.
5383 This functionality can be overridden by an explicit
5384 regparm value. */
5385 for (regno = 0; regno <= DI_REG; regno++)
5386 if (fixed_regs[regno])
5387 globals++;
5388
5389 local_regparm
5390 = globals < local_regparm ? local_regparm - globals : 0;
5391
5392 if (local_regparm > regparm)
5393 regparm = local_regparm;
5394 }
5395 }
5396
5397 return regparm;
5398 }
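
/* For instance (illustrative): a file-local
     static int add3 (int a, int b, int c) { return a + b + c; }
   whose address never escapes can be promoted to regparm(3) by the
   code above when optimizing, unless a fixed register, a static chain
   or -fsplit-stack claims one of the candidate registers.  */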
5399
5400 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5401 DFmode (2) arguments in SSE registers for a function with the
5402 indicated TYPE and DECL. DECL may be NULL when calling function
5403 indirectly or considering a libcall. Otherwise return 0. */
5404
5405 static int
5406 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5407 {
5408 gcc_assert (!TARGET_64BIT);
5409
5410 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5411 by the sseregparm attribute. */
5412 if (TARGET_SSEREGPARM
5413 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5414 {
5415 if (!TARGET_SSE)
5416 {
5417 if (warn)
5418 {
5419 if (decl)
5420 error ("calling %qD with attribute sseregparm without "
5421 "SSE/SSE2 enabled", decl);
5422 else
5423 error ("calling %qT with attribute sseregparm without "
5424 "SSE/SSE2 enabled", type);
5425 }
5426 return 0;
5427 }
5428
5429 return 2;
5430 }
5431
5432 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5433 (and DFmode for SSE2) arguments in SSE registers. */
5434 if (decl && TARGET_SSE_MATH && optimize
5435 && !(profile_flag && !flag_fentry))
5436 {
5437 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5438 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5439 if (i && i->local && i->can_change_signature)
5440 return TARGET_SSE2 ? 2 : 1;
5441 }
5442
5443 return 0;
5444 }
5445
5446 /* Return true if EAX is live at the start of the function. Used by
5447 ix86_expand_prologue to determine if we need special help before
5448 calling allocate_stack_worker. */
5449
5450 static bool
5451 ix86_eax_live_at_start_p (void)
5452 {
5453 /* Cheat. Don't bother working forward from ix86_function_regparm
5454 to the function type to whether an actual argument is located in
5455 eax. Instead just look at cfg info, which is still close enough
5456 to correct at this point. This gives false positives for broken
5457 functions that might use uninitialized data that happens to be
5458 allocated in eax, but who cares? */
5459 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5460 }
5461
5462 static bool
5463 ix86_keep_aggregate_return_pointer (tree fntype)
5464 {
5465 tree attr;
5466
5467 if (!TARGET_64BIT)
5468 {
5469 attr = lookup_attribute ("callee_pop_aggregate_return",
5470 TYPE_ATTRIBUTES (fntype));
5471 if (attr)
5472 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5473
5474 /* For 32-bit MS-ABI the default is to keep aggregate
5475 return pointer. */
5476 if (ix86_function_type_abi (fntype) == MS_ABI)
5477 return true;
5478 }
5479 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5480 }
5481
5482 /* Value is the number of bytes of arguments automatically
5483 popped when returning from a subroutine call.
5484 FUNDECL is the declaration node of the function (as a tree),
5485 FUNTYPE is the data type of the function (as a tree),
5486 or for a library call it is an identifier node for the subroutine name.
5487 SIZE is the number of bytes of arguments passed on the stack.
5488
5489 On the 80386, the RTD insn may be used to pop them if the number
5490 of args is fixed, but if the number is variable then the caller
5491 must pop them all. RTD can't be used for library calls now
5492 because the library is compiled with the Unix compiler.
5493 Use of RTD is a selectable option, since it is incompatible with
5494 standard Unix calling sequences. If the option is not selected,
5495 the caller must always pop the args.
5496
5497 The attribute stdcall is equivalent to RTD on a per module basis. */
5498
5499 static int
5500 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5501 {
5502 unsigned int ccvt;
5503
5504 /* None of the 64-bit ABIs pop arguments. */
5505 if (TARGET_64BIT)
5506 return 0;
5507
5508 ccvt = ix86_get_callcvt (funtype);
5509
5510 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5511 | IX86_CALLCVT_THISCALL)) != 0
5512 && ! stdarg_p (funtype))
5513 return size;
5514
5515 /* Lose any fake structure return argument if it is passed on the stack. */
5516 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5517 && !ix86_keep_aggregate_return_pointer (funtype))
5518 {
5519 int nregs = ix86_function_regparm (funtype, fundecl);
5520 if (nregs == 0)
5521 return GET_MODE_SIZE (Pmode);
5522 }
5523
5524 return 0;
5525 }
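
/* Example (illustrative): for
     void __attribute__ ((stdcall)) f (int, int);
   SIZE is 8 on 32-bit targets and the callee pops those 8 bytes
   (e.g. via "ret $8"); a plain cdecl or variadic function returns 0
   here and the caller pops instead.  */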
5526 \f
5527 /* Argument support functions. */
5528
5529 /* Return true when register may be used to pass function parameters. */
5530 bool
5531 ix86_function_arg_regno_p (int regno)
5532 {
5533 int i;
5534 const int *parm_regs;
5535
5536 if (!TARGET_64BIT)
5537 {
5538 if (TARGET_MACHO)
5539 return (regno < REGPARM_MAX
5540 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5541 else
5542 return (regno < REGPARM_MAX
5543 || (TARGET_MMX && MMX_REGNO_P (regno)
5544 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5545 || (TARGET_SSE && SSE_REGNO_P (regno)
5546 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5547 }
5548
5549 if (TARGET_MACHO)
5550 {
5551 if (SSE_REGNO_P (regno) && TARGET_SSE)
5552 return true;
5553 }
5554 else
5555 {
5556 if (TARGET_SSE && SSE_REGNO_P (regno)
5557 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5558 return true;
5559 }
5560
5561 /* TODO: The function should depend on current function ABI but
5562 builtins.c would need updating then. Therefore we use the
5563 default ABI. */
5564
5565 /* RAX is used as hidden argument to va_arg functions. */
5566 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5567 return true;
5568
5569 if (ix86_abi == MS_ABI)
5570 parm_regs = x86_64_ms_abi_int_parameter_registers;
5571 else
5572 parm_regs = x86_64_int_parameter_registers;
5573 for (i = 0; i < (ix86_abi == MS_ABI
5574 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5575 if (regno == parm_regs[i])
5576 return true;
5577 return false;
5578 }
5579
5580 /* Return true if we do not know how to pass TYPE solely in registers. */
5581
5582 static bool
5583 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5584 {
5585 if (must_pass_in_stack_var_size_or_pad (mode, type))
5586 return true;
5587
5588 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5589 The layout_type routine is crafty and tries to trick us into passing
5590 currently unsupported vector types on the stack by using TImode. */
5591 return (!TARGET_64BIT && mode == TImode
5592 && type && TREE_CODE (type) != VECTOR_TYPE);
5593 }
5594
5595 /* Return the size, in bytes, of the area reserved for arguments passed
5596 in registers for the function represented by FNDECL, depending on the
5597 ABI format used. */
5598 int
5599 ix86_reg_parm_stack_space (const_tree fndecl)
5600 {
5601 enum calling_abi call_abi = SYSV_ABI;
5602 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5603 call_abi = ix86_function_abi (fndecl);
5604 else
5605 call_abi = ix86_function_type_abi (fndecl);
5606 if (TARGET_64BIT && call_abi == MS_ABI)
5607 return 32;
5608 return 0;
5609 }
5610
5611 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5612 call abi used. */
5613 enum calling_abi
5614 ix86_function_type_abi (const_tree fntype)
5615 {
5616 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5617 {
5618 enum calling_abi abi = ix86_abi;
5619 if (abi == SYSV_ABI)
5620 {
5621 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5622 abi = MS_ABI;
5623 }
5624 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5625 abi = SYSV_ABI;
5626 return abi;
5627 }
5628 return ix86_abi;
5629 }
5630
5631 static bool
5632 ix86_function_ms_hook_prologue (const_tree fn)
5633 {
5634 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5635 {
5636 if (decl_function_context (fn) != NULL_TREE)
5637 error_at (DECL_SOURCE_LOCATION (fn),
5638 "ms_hook_prologue is not compatible with nested function");
5639 else
5640 return true;
5641 }
5642 return false;
5643 }
5644
5645 static enum calling_abi
5646 ix86_function_abi (const_tree fndecl)
5647 {
5648 if (! fndecl)
5649 return ix86_abi;
5650 return ix86_function_type_abi (TREE_TYPE (fndecl));
5651 }
5652
5653 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5654 call abi used. */
5655 enum calling_abi
5656 ix86_cfun_abi (void)
5657 {
5658 if (! cfun)
5659 return ix86_abi;
5660 return cfun->machine->call_abi;
5661 }
5662
5663 /* Write the extra assembler code needed to declare a function properly. */
5664
5665 void
5666 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5667 tree decl)
5668 {
5669 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5670
5671 if (is_ms_hook)
5672 {
5673 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5674 unsigned int filler_cc = 0xcccccccc;
5675
5676 for (i = 0; i < filler_count; i += 4)
5677 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5678 }
5679
5680 #ifdef SUBTARGET_ASM_UNWIND_INIT
5681 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5682 #endif
5683
5684 ASM_OUTPUT_LABEL (asm_out_file, fname);
5685
5686 /* Output magic byte marker, if hot-patch attribute is set. */
5687 if (is_ms_hook)
5688 {
5689 if (TARGET_64BIT)
5690 {
5691 /* leaq [%rsp + 0], %rsp */
5692 asm_fprintf (asm_out_file, ASM_BYTE
5693 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5694 }
5695 else
5696 {
5697 /* movl.s %edi, %edi
5698 push %ebp
5699 movl.s %esp, %ebp */
5700 asm_fprintf (asm_out_file, ASM_BYTE
5701 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5702 }
5703 }
5704 }
5705
5706 /* regclass.c */
5707 extern void init_regs (void);
5708
5709 /* Implementation of the call abi switching target hook. The call-used
5710 register set specific to FNDECL is selected. See also
5711 ix86_conditional_register_usage for more details. */
5712 void
5713 ix86_call_abi_override (const_tree fndecl)
5714 {
5715 if (fndecl == NULL_TREE)
5716 cfun->machine->call_abi = ix86_abi;
5717 else
5718 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5719 }
5720
5721 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5722 Avoid the expensive re-initialization of init_regs each time we switch
5723 function context, since this is needed only during RTL expansion. */
5724 static void
5725 ix86_maybe_switch_abi (void)
5726 {
5727 if (TARGET_64BIT &&
5728 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5729 reinit_regs ();
5730 }
5731
5732 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5733 for a call to a function whose data type is FNTYPE.
5734 For a library call, FNTYPE is 0. */
5735
5736 void
5737 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5738 tree fntype, /* tree ptr for function decl */
5739 rtx libname, /* SYMBOL_REF of library name or 0 */
5740 tree fndecl,
5741 int caller)
5742 {
5743 struct cgraph_local_info *i;
5744 tree fnret_type;
5745
5746 memset (cum, 0, sizeof (*cum));
5747
5748 /* Initialize for the current callee. */
5749 if (caller)
5750 {
5751 cfun->machine->callee_pass_avx256_p = false;
5752 cfun->machine->callee_return_avx256_p = false;
5753 }
5754
5755 if (fndecl)
5756 {
5757 i = cgraph_local_info (fndecl);
5758 cum->call_abi = ix86_function_abi (fndecl);
5759 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5760 }
5761 else
5762 {
5763 i = NULL;
5764 cum->call_abi = ix86_function_type_abi (fntype);
5765 if (fntype)
5766 fnret_type = TREE_TYPE (fntype);
5767 else
5768 fnret_type = NULL;
5769 }
5770
5771 if (TARGET_VZEROUPPER && fnret_type)
5772 {
5773 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5774 false);
5775 if (function_pass_avx256_p (fnret_value))
5776 {
5777 /* The return value of this function uses 256bit AVX modes. */
5778 if (caller)
5779 cfun->machine->callee_return_avx256_p = true;
5780 else
5781 cfun->machine->caller_return_avx256_p = true;
5782 }
5783 }
5784
5785 cum->caller = caller;
5786
5787 /* Set up the number of registers to use for passing arguments. */
5788
5789 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5790 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5791 "or subtarget optimization implying it");
5792 cum->nregs = ix86_regparm;
5793 if (TARGET_64BIT)
5794 {
5795 cum->nregs = (cum->call_abi == SYSV_ABI
5796 ? X86_64_REGPARM_MAX
5797 : X86_64_MS_REGPARM_MAX);
5798 }
5799 if (TARGET_SSE)
5800 {
5801 cum->sse_nregs = SSE_REGPARM_MAX;
5802 if (TARGET_64BIT)
5803 {
5804 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5805 ? X86_64_SSE_REGPARM_MAX
5806 : X86_64_MS_SSE_REGPARM_MAX);
5807 }
5808 }
5809 if (TARGET_MMX)
5810 cum->mmx_nregs = MMX_REGPARM_MAX;
5811 cum->warn_avx = true;
5812 cum->warn_sse = true;
5813 cum->warn_mmx = true;
5814
5815 /* Because the type might mismatch between caller and callee, we need to
5816 use the actual type of the function for local calls.
5817 FIXME: cgraph_analyze can be told to actually record if the function uses
5818 va_start, so for local functions maybe_vaarg can be made aggressive,
5819 helping K&R code.
5820 FIXME: once the type system is fixed, we won't need this code anymore. */
5821 if (i && i->local && i->can_change_signature)
5822 fntype = TREE_TYPE (fndecl);
5823 cum->maybe_vaarg = (fntype
5824 ? (!prototype_p (fntype) || stdarg_p (fntype))
5825 : !libname);
5826
5827 if (!TARGET_64BIT)
5828 {
5829 /* If there are variable arguments, then we won't pass anything
5830 in registers in 32-bit mode. */
5831 if (stdarg_p (fntype))
5832 {
5833 cum->nregs = 0;
5834 cum->sse_nregs = 0;
5835 cum->mmx_nregs = 0;
5836 cum->warn_avx = 0;
5837 cum->warn_sse = 0;
5838 cum->warn_mmx = 0;
5839 return;
5840 }
5841
5842 /* Use ecx and edx registers if function has fastcall attribute,
5843 else look for regparm information. */
5844 if (fntype)
5845 {
5846 unsigned int ccvt = ix86_get_callcvt (fntype);
5847 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5848 {
5849 cum->nregs = 1;
5850 cum->fastcall = 1; /* Same first register as in fastcall. */
5851 }
5852 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5853 {
5854 cum->nregs = 2;
5855 cum->fastcall = 1;
5856 }
5857 else
5858 cum->nregs = ix86_function_regparm (fntype, fndecl);
5859 }
5860
5861 /* Set up the number of SSE registers used for passing SFmode
5862 and DFmode arguments. Warn for mismatching ABI. */
5863 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5864 }
5865 }
5866
5867 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5868 But in the case of vector types, it is some vector mode.
5869
5870 When we have only some of our vector isa extensions enabled, then there
5871 are some modes for which vector_mode_supported_p is false. For these
5872 modes, the generic vector support in gcc will choose some non-vector mode
5873 in order to implement the type. By computing the natural mode, we'll
5874 select the proper ABI location for the operand and not depend on whatever
5875 the middle-end decides to do with these vector types.
5876
5877 The middle-end can't deal with vector types > 16 bytes. In this
5878 case, we return the original mode and warn about the ABI change if
5879 CUM isn't NULL. */
5880
5881 static enum machine_mode
5882 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5883 {
5884 enum machine_mode mode = TYPE_MODE (type);
5885
5886 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5887 {
5888 HOST_WIDE_INT size = int_size_in_bytes (type);
5889 if ((size == 8 || size == 16 || size == 32)
5890 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5891 && TYPE_VECTOR_SUBPARTS (type) > 1)
5892 {
5893 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5894
5895 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5896 mode = MIN_MODE_VECTOR_FLOAT;
5897 else
5898 mode = MIN_MODE_VECTOR_INT;
5899
5900 /* Get the mode which has this inner mode and number of units. */
5901 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5902 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5903 && GET_MODE_INNER (mode) == innermode)
5904 {
5905 if (size == 32 && !TARGET_AVX)
5906 {
5907 static bool warnedavx;
5908
5909 if (cum
5910 && !warnedavx
5911 && cum->warn_avx)
5912 {
5913 warnedavx = true;
5914 warning (0, "AVX vector argument without AVX "
5915 "enabled changes the ABI");
5916 }
5917 return TYPE_MODE (type);
5918 }
5919 else
5920 return mode;
5921 }
5922
5923 gcc_unreachable ();
5924 }
5925 }
5926
5927 return mode;
5928 }
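
/* Illustrative example (assumption): for
     typedef float v8sf __attribute__ ((vector_size (32)));
   compiled without -mavx, the generic code picks a non-vector mode,
   the loop above finds V8SFmode, but because TARGET_AVX is off the
   original mode is returned and the one-time ABI warning can be
   issued.  */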
5929
5930 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5931 this may not agree with the mode that the type system has chosen for the
5932 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5933 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5934
5935 static rtx
5936 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5937 unsigned int regno)
5938 {
5939 rtx tmp;
5940
5941 if (orig_mode != BLKmode)
5942 tmp = gen_rtx_REG (orig_mode, regno);
5943 else
5944 {
5945 tmp = gen_rtx_REG (mode, regno);
5946 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5947 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5948 }
5949
5950 return tmp;
5951 }
5952
5953 /* x86-64 register passing implementation. See the x86-64 ABI for details.
5954 The goal of this code is to classify each 8-byte chunk of an incoming
5955 argument by register class and assign registers accordingly. */
5956
5957 /* Return the union class of CLASS1 and CLASS2.
5958 See the x86-64 PS ABI for details. */
5959
5960 static enum x86_64_reg_class
5961 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5962 {
5963 /* Rule #1: If both classes are equal, this is the resulting class. */
5964 if (class1 == class2)
5965 return class1;
5966
5967 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5968 the other class. */
5969 if (class1 == X86_64_NO_CLASS)
5970 return class2;
5971 if (class2 == X86_64_NO_CLASS)
5972 return class1;
5973
5974 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5975 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5976 return X86_64_MEMORY_CLASS;
5977
5978 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5979 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5980 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5981 return X86_64_INTEGERSI_CLASS;
5982 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5983 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5984 return X86_64_INTEGER_CLASS;
5985
5986 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5987 MEMORY is used. */
5988 if (class1 == X86_64_X87_CLASS
5989 || class1 == X86_64_X87UP_CLASS
5990 || class1 == X86_64_COMPLEX_X87_CLASS
5991 || class2 == X86_64_X87_CLASS
5992 || class2 == X86_64_X87UP_CLASS
5993 || class2 == X86_64_COMPLEX_X87_CLASS)
5994 return X86_64_MEMORY_CLASS;
5995
5996 /* Rule #6: Otherwise class SSE is used. */
5997 return X86_64_SSE_CLASS;
5998 }
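
/* Worked example (illustrative): for
     struct s { int i; float f; };
   the single eightbyte merges the integer class coming from I with the
   SSE class coming from F; rule #4 above resolves that to an integer
   class, so the whole struct travels in one general-purpose register.  */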
5999
6000 /* Classify the argument of type TYPE and mode MODE.
6001 CLASSES will be filled by the register class used to pass each word
6002 of the operand. The number of words is returned. In case the parameter
6003 should be passed in memory, 0 is returned. As a special case for zero
6004 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6005
6006 BIT_OFFSET is used internally for handling records and specifies offset
6007 of the offset in bits modulo 256 to avoid overflow cases.
6008
6009 See the x86-64 PS ABI for details.
6010 */
6011
6012 static int
6013 classify_argument (enum machine_mode mode, const_tree type,
6014 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6015 {
6016 HOST_WIDE_INT bytes =
6017 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6018 int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6019
6020 /* Variable sized entities are always passed/returned in memory. */
6021 if (bytes < 0)
6022 return 0;
6023
6024 if (mode != VOIDmode
6025 && targetm.calls.must_pass_in_stack (mode, type))
6026 return 0;
6027
6028 if (type && AGGREGATE_TYPE_P (type))
6029 {
6030 int i;
6031 tree field;
6032 enum x86_64_reg_class subclasses[MAX_CLASSES];
6033
6034 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6035 if (bytes > 32)
6036 return 0;
6037
6038 for (i = 0; i < words; i++)
6039 classes[i] = X86_64_NO_CLASS;
6040
6041 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
6042 signal the memory class, so handle it as a special case. */
6043 if (!words)
6044 {
6045 classes[0] = X86_64_NO_CLASS;
6046 return 1;
6047 }
6048
6049 /* Classify each field of record and merge classes. */
6050 switch (TREE_CODE (type))
6051 {
6052 case RECORD_TYPE:
6053 /* And now merge the fields of structure. */
6054 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6055 {
6056 if (TREE_CODE (field) == FIELD_DECL)
6057 {
6058 int num;
6059
6060 if (TREE_TYPE (field) == error_mark_node)
6061 continue;
6062
6063 /* Bitfields are always classified as integer. Handle them
6064 early, since later code would consider them to be
6065 misaligned integers. */
6066 if (DECL_BIT_FIELD (field))
6067 {
6068 for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
6069 i < ((int_bit_position (field) + (bit_offset % 64))
6070 + tree_low_cst (DECL_SIZE (field), 0)
6071 + 63) / 8 / 8; i++)
6072 classes[i] =
6073 merge_classes (X86_64_INTEGER_CLASS,
6074 classes[i]);
6075 }
6076 else
6077 {
6078 int pos;
6079
6080 type = TREE_TYPE (field);
6081
6082 /* Flexible array member is ignored. */
6083 if (TYPE_MODE (type) == BLKmode
6084 && TREE_CODE (type) == ARRAY_TYPE
6085 && TYPE_SIZE (type) == NULL_TREE
6086 && TYPE_DOMAIN (type) != NULL_TREE
6087 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6088 == NULL_TREE))
6089 {
6090 static bool warned;
6091
6092 if (!warned && warn_psabi)
6093 {
6094 warned = true;
6095 inform (input_location,
6096 "the ABI of passing struct with"
6097 " a flexible array member has"
6098 " changed in GCC 4.4");
6099 }
6100 continue;
6101 }
6102 num = classify_argument (TYPE_MODE (type), type,
6103 subclasses,
6104 (int_bit_position (field)
6105 + bit_offset) % 256);
6106 if (!num)
6107 return 0;
6108 pos = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
6109 for (i = 0; i < num && (i + pos) < words; i++)
6110 classes[i + pos] =
6111 merge_classes (subclasses[i], classes[i + pos]);
6112 }
6113 }
6114 }
6115 break;
6116
6117 case ARRAY_TYPE:
6118 /* Arrays are handled as small records. */
6119 {
6120 int num;
6121 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6122 TREE_TYPE (type), subclasses, bit_offset);
6123 if (!num)
6124 return 0;
6125
6126 /* The partial classes are now full classes. */
6127 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6128 subclasses[0] = X86_64_SSE_CLASS;
6129 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6130 && !((bit_offset % 64) == 0 && bytes == 4))
6131 subclasses[0] = X86_64_INTEGER_CLASS;
6132
6133 for (i = 0; i < words; i++)
6134 classes[i] = subclasses[i % num];
6135
6136 break;
6137 }
6138 case UNION_TYPE:
6139 case QUAL_UNION_TYPE:
6140 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
6142 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6143 {
6144 if (TREE_CODE (field) == FIELD_DECL)
6145 {
6146 int num;
6147
6148 if (TREE_TYPE (field) == error_mark_node)
6149 continue;
6150
6151 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6152 TREE_TYPE (field), subclasses,
6153 bit_offset);
6154 if (!num)
6155 return 0;
6156 for (i = 0; i < num; i++)
6157 classes[i] = merge_classes (subclasses[i], classes[i]);
6158 }
6159 }
6160 break;
6161
6162 default:
6163 gcc_unreachable ();
6164 }
6165
6166 if (words > 2)
6167 {
6168 /* When size > 16 bytes, if the first class isn't
6169 X86_64_SSE_CLASS or any of the others isn't
6170 X86_64_SSEUP_CLASS, everything should be passed in
6171 memory. */
6172 if (classes[0] != X86_64_SSE_CLASS)
6173 return 0;
6174
6175 for (i = 1; i < words; i++)
6176 if (classes[i] != X86_64_SSEUP_CLASS)
6177 return 0;
6178 }
6179
6180 /* Final merger cleanup. */
6181 for (i = 0; i < words; i++)
6182 {
6183 /* If one class is MEMORY, everything should be passed in
6184 memory. */
6185 if (classes[i] == X86_64_MEMORY_CLASS)
6186 return 0;
6187
6188 /* The X86_64_SSEUP_CLASS should be always preceded by
6189 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6190 if (classes[i] == X86_64_SSEUP_CLASS
6191 && classes[i - 1] != X86_64_SSE_CLASS
6192 && classes[i - 1] != X86_64_SSEUP_CLASS)
6193 {
6194 /* The first one should never be X86_64_SSEUP_CLASS. */
6195 gcc_assert (i != 0);
6196 classes[i] = X86_64_SSE_CLASS;
6197 }
6198
6199 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6200 everything should be passed in memory. */
6201 if (classes[i] == X86_64_X87UP_CLASS
6202 && (classes[i - 1] != X86_64_X87_CLASS))
6203 {
6204 static bool warned;
6205
6206 /* The first one should never be X86_64_X87UP_CLASS. */
6207 gcc_assert (i != 0);
6208 if (!warned && warn_psabi)
6209 {
6210 warned = true;
6211 inform (input_location,
6212 "the ABI of passing union with long double"
6213 " has changed in GCC 4.4");
6214 }
6215 return 0;
6216 }
6217 }
6218 return words;
6219 }
6220
6221 /* Compute the alignment needed. We align all types to natural boundaries
6222 with the exception of XFmode, which is aligned to 64bits. */
6223 if (mode != VOIDmode && mode != BLKmode)
6224 {
6225 int mode_alignment = GET_MODE_BITSIZE (mode);
6226
6227 if (mode == XFmode)
6228 mode_alignment = 128;
6229 else if (mode == XCmode)
6230 mode_alignment = 256;
6231 if (COMPLEX_MODE_P (mode))
6232 mode_alignment /= 2;
6233 /* Misaligned fields are always returned in memory. */
6234 if (bit_offset % mode_alignment)
6235 return 0;
6236 }
6237
6238 /* for V1xx modes, just use the base mode */
6239 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6240 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6241 mode = GET_MODE_INNER (mode);
6242
6243 /* Classification of atomic types. */
6244 switch (mode)
6245 {
6246 case SDmode:
6247 case DDmode:
6248 classes[0] = X86_64_SSE_CLASS;
6249 return 1;
6250 case TDmode:
6251 classes[0] = X86_64_SSE_CLASS;
6252 classes[1] = X86_64_SSEUP_CLASS;
6253 return 2;
6254 case DImode:
6255 case SImode:
6256 case HImode:
6257 case QImode:
6258 case CSImode:
6259 case CHImode:
6260 case CQImode:
6261 {
6262 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6263
6264 if (size <= 32)
6265 {
6266 classes[0] = X86_64_INTEGERSI_CLASS;
6267 return 1;
6268 }
6269 else if (size <= 64)
6270 {
6271 classes[0] = X86_64_INTEGER_CLASS;
6272 return 1;
6273 }
6274 else if (size <= 64+32)
6275 {
6276 classes[0] = X86_64_INTEGER_CLASS;
6277 classes[1] = X86_64_INTEGERSI_CLASS;
6278 return 2;
6279 }
6280 else if (size <= 64+64)
6281 {
6282 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6283 return 2;
6284 }
6285 else
6286 gcc_unreachable ();
6287 }
6288 case CDImode:
6289 case TImode:
6290 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6291 return 2;
6292 case COImode:
6293 case OImode:
6294 /* OImode shouldn't be used directly. */
6295 gcc_unreachable ();
6296 case CTImode:
6297 return 0;
6298 case SFmode:
6299 if (!(bit_offset % 64))
6300 classes[0] = X86_64_SSESF_CLASS;
6301 else
6302 classes[0] = X86_64_SSE_CLASS;
6303 return 1;
6304 case DFmode:
6305 classes[0] = X86_64_SSEDF_CLASS;
6306 return 1;
6307 case XFmode:
6308 classes[0] = X86_64_X87_CLASS;
6309 classes[1] = X86_64_X87UP_CLASS;
6310 return 2;
6311 case TFmode:
6312 classes[0] = X86_64_SSE_CLASS;
6313 classes[1] = X86_64_SSEUP_CLASS;
6314 return 2;
6315 case SCmode:
6316 classes[0] = X86_64_SSE_CLASS;
6317 if (!(bit_offset % 64))
6318 return 1;
6319 else
6320 {
6321 static bool warned;
6322
6323 if (!warned && warn_psabi)
6324 {
6325 warned = true;
6326 inform (input_location,
6327 "the ABI of passing structure with complex float"
6328 " member has changed in GCC 4.4");
6329 }
6330 classes[1] = X86_64_SSESF_CLASS;
6331 return 2;
6332 }
6333 case DCmode:
6334 classes[0] = X86_64_SSEDF_CLASS;
6335 classes[1] = X86_64_SSEDF_CLASS;
6336 return 2;
6337 case XCmode:
6338 classes[0] = X86_64_COMPLEX_X87_CLASS;
6339 return 1;
6340 case TCmode:
6341 /* This mode is larger than 16 bytes. */
6342 return 0;
6343 case V8SFmode:
6344 case V8SImode:
6345 case V32QImode:
6346 case V16HImode:
6347 case V4DFmode:
6348 case V4DImode:
6349 classes[0] = X86_64_SSE_CLASS;
6350 classes[1] = X86_64_SSEUP_CLASS;
6351 classes[2] = X86_64_SSEUP_CLASS;
6352 classes[3] = X86_64_SSEUP_CLASS;
6353 return 4;
6354 case V4SFmode:
6355 case V4SImode:
6356 case V16QImode:
6357 case V8HImode:
6358 case V2DFmode:
6359 case V2DImode:
6360 classes[0] = X86_64_SSE_CLASS;
6361 classes[1] = X86_64_SSEUP_CLASS;
6362 return 2;
6363 case V1TImode:
6364 case V1DImode:
6365 case V2SFmode:
6366 case V2SImode:
6367 case V4HImode:
6368 case V8QImode:
6369 classes[0] = X86_64_SSE_CLASS;
6370 return 1;
6371 case BLKmode:
6372 case VOIDmode:
6373 return 0;
6374 default:
6375 gcc_assert (VECTOR_MODE_P (mode));
6376
6377 if (bytes > 16)
6378 return 0;
6379
6380 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6381
6382 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6383 classes[0] = X86_64_INTEGERSI_CLASS;
6384 else
6385 classes[0] = X86_64_INTEGER_CLASS;
6386 classes[1] = X86_64_INTEGER_CLASS;
6387 return 1 + (bytes > 8);
6388 }
6389 }
6390
6391 /* Examine the argument and set the number of registers required in each
6392 class. Return 0 iff the parameter should be passed in memory. */
6393 static int
6394 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6395 int *int_nregs, int *sse_nregs)
6396 {
6397 enum x86_64_reg_class regclass[MAX_CLASSES];
6398 int n = classify_argument (mode, type, regclass, 0);
6399
6400 *int_nregs = 0;
6401 *sse_nregs = 0;
6402 if (!n)
6403 return 0;
6404 for (n--; n >= 0; n--)
6405 switch (regclass[n])
6406 {
6407 case X86_64_INTEGER_CLASS:
6408 case X86_64_INTEGERSI_CLASS:
6409 (*int_nregs)++;
6410 break;
6411 case X86_64_SSE_CLASS:
6412 case X86_64_SSESF_CLASS:
6413 case X86_64_SSEDF_CLASS:
6414 (*sse_nregs)++;
6415 break;
6416 case X86_64_NO_CLASS:
6417 case X86_64_SSEUP_CLASS:
6418 break;
6419 case X86_64_X87_CLASS:
6420 case X86_64_X87UP_CLASS:
6421 if (!in_return)
6422 return 0;
6423 break;
6424 case X86_64_COMPLEX_X87_CLASS:
6425 return in_return ? 2 : 0;
6426 case X86_64_MEMORY_CLASS:
6427 gcc_unreachable ();
6428 }
6429 return 1;
6430 }
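
/* For example (illustrative):
     struct p { double x, y; };
   classifies as two SSEDF eightbytes, so *SSE_NREGS becomes 2 and
   *INT_NREGS stays 0; when enough SSE registers remain it is passed
   in %xmm0 and %xmm1.  */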
6431
6432 /* Construct container for the argument used by GCC interface. See
6433 FUNCTION_ARG for the detailed description. */
6434
6435 static rtx
6436 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6437 const_tree type, int in_return, int nintregs, int nsseregs,
6438 const int *intreg, int sse_regno)
6439 {
6440 /* The following variables hold the static issued_error state. */
6441 static bool issued_sse_arg_error;
6442 static bool issued_sse_ret_error;
6443 static bool issued_x87_ret_error;
6444
6445 enum machine_mode tmpmode;
6446 int bytes =
6447 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6448 enum x86_64_reg_class regclass[MAX_CLASSES];
6449 int n;
6450 int i;
6451 int nexps = 0;
6452 int needed_sseregs, needed_intregs;
6453 rtx exp[MAX_CLASSES];
6454 rtx ret;
6455
6456 n = classify_argument (mode, type, regclass, 0);
6457 if (!n)
6458 return NULL;
6459 if (!examine_argument (mode, type, in_return, &needed_intregs,
6460 &needed_sseregs))
6461 return NULL;
6462 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6463 return NULL;
6464
6465 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6466 some less clueful developer tries to use floating-point anyway. */
6467 if (needed_sseregs && !TARGET_SSE)
6468 {
6469 if (in_return)
6470 {
6471 if (!issued_sse_ret_error)
6472 {
6473 error ("SSE register return with SSE disabled");
6474 issued_sse_ret_error = true;
6475 }
6476 }
6477 else if (!issued_sse_arg_error)
6478 {
6479 error ("SSE register argument with SSE disabled");
6480 issued_sse_arg_error = true;
6481 }
6482 return NULL;
6483 }
6484
6485 /* Likewise, error if the ABI requires us to return values in the
6486 x87 registers and the user specified -mno-80387. */
6487 if (!TARGET_80387 && in_return)
6488 for (i = 0; i < n; i++)
6489 if (regclass[i] == X86_64_X87_CLASS
6490 || regclass[i] == X86_64_X87UP_CLASS
6491 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6492 {
6493 if (!issued_x87_ret_error)
6494 {
6495 error ("x87 register return with x87 disabled");
6496 issued_x87_ret_error = true;
6497 }
6498 return NULL;
6499 }
6500
6501 /* First construct simple cases. Avoid SCmode, since we want to use
6502 a single register to pass this type. */
6503 if (n == 1 && mode != SCmode)
6504 switch (regclass[0])
6505 {
6506 case X86_64_INTEGER_CLASS:
6507 case X86_64_INTEGERSI_CLASS:
6508 return gen_rtx_REG (mode, intreg[0]);
6509 case X86_64_SSE_CLASS:
6510 case X86_64_SSESF_CLASS:
6511 case X86_64_SSEDF_CLASS:
6512 if (mode != BLKmode)
6513 return gen_reg_or_parallel (mode, orig_mode,
6514 SSE_REGNO (sse_regno));
6515 break;
6516 case X86_64_X87_CLASS:
6517 case X86_64_COMPLEX_X87_CLASS:
6518 return gen_rtx_REG (mode, FIRST_STACK_REG);
6519 case X86_64_NO_CLASS:
6520 /* Zero sized array, struct or class. */
6521 return NULL;
6522 default:
6523 gcc_unreachable ();
6524 }
6525 if (n == 2 && regclass[0] == X86_64_SSE_CLASS
6526 && regclass[1] == X86_64_SSEUP_CLASS && mode != BLKmode)
6527 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6528 if (n == 4
6529 && regclass[0] == X86_64_SSE_CLASS
6530 && regclass[1] == X86_64_SSEUP_CLASS
6531 && regclass[2] == X86_64_SSEUP_CLASS
6532 && regclass[3] == X86_64_SSEUP_CLASS
6533 && mode != BLKmode)
6534 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6535
6536 if (n == 2
6537 && regclass[0] == X86_64_X87_CLASS && regclass[1] == X86_64_X87UP_CLASS)
6538 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6539 if (n == 2 && regclass[0] == X86_64_INTEGER_CLASS
6540 && regclass[1] == X86_64_INTEGER_CLASS
6541 && (mode == CDImode || mode == TImode || mode == TFmode)
6542 && intreg[0] + 1 == intreg[1])
6543 return gen_rtx_REG (mode, intreg[0]);
6544
6545 /* Otherwise figure out the entries of the PARALLEL. */
6546 for (i = 0; i < n; i++)
6547 {
6548 int pos;
6549
6550 switch (regclass[i])
6551 {
6552 case X86_64_NO_CLASS:
6553 break;
6554 case X86_64_INTEGER_CLASS:
6555 case X86_64_INTEGERSI_CLASS:
6556 /* Merge TImodes on aligned occasions here too. */
6557 if (i * 8 + 8 > bytes)
6558 tmpmode = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6559 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6560 tmpmode = SImode;
6561 else
6562 tmpmode = DImode;
6563 /* We've requested 24 bytes we don't have a mode for. Use DImode. */
6564 if (tmpmode == BLKmode)
6565 tmpmode = DImode;
6566 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6567 gen_rtx_REG (tmpmode, *intreg),
6568 GEN_INT (i*8));
6569 intreg++;
6570 break;
6571 case X86_64_SSESF_CLASS:
6572 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6573 gen_rtx_REG (SFmode,
6574 SSE_REGNO (sse_regno)),
6575 GEN_INT (i*8));
6576 sse_regno++;
6577 break;
6578 case X86_64_SSEDF_CLASS:
6579 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6580 gen_rtx_REG (DFmode,
6581 SSE_REGNO (sse_regno)),
6582 GEN_INT (i*8));
6583 sse_regno++;
6584 break;
6585 case X86_64_SSE_CLASS:
6586 pos = i;
6587 switch (n)
6588 {
6589 case 1:
6590 tmpmode = DImode;
6591 break;
6592 case 2:
6593 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6594 {
6595 tmpmode = TImode;
6596 i++;
6597 }
6598 else
6599 tmpmode = DImode;
6600 break;
6601 case 4:
6602 gcc_assert (i == 0
6603 && regclass[1] == X86_64_SSEUP_CLASS
6604 && regclass[2] == X86_64_SSEUP_CLASS
6605 && regclass[3] == X86_64_SSEUP_CLASS);
6606 tmpmode = OImode;
6607 i += 3;
6608 break;
6609 default:
6610 gcc_unreachable ();
6611 }
6612 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6613 gen_rtx_REG (tmpmode,
6614 SSE_REGNO (sse_regno)),
6615 GEN_INT (pos*8));
6616 sse_regno++;
6617 break;
6618 default:
6619 gcc_unreachable ();
6620 }
6621 }
6622
6623 /* Empty aligned struct, union or class. */
6624 if (nexps == 0)
6625 return NULL;
6626
6627 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6628 for (i = 0; i < nexps; i++)
6629 XVECEXP (ret, 0, i) = exp [i];
6630 return ret;
6631 }
6632
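/* As a rough illustration of the classification handled above (the
   struct name below is only an example, not something the compiler
   consumes): under the SysV x86-64 ABI a structure such as

     struct s { long l; double d; };     two eightbytes, 16 bytes total

   is classified INTEGER + SSE, so construct_container returns a
   PARALLEL holding a DImode piece in the next integer register at
   byte offset 0 and a DFmode piece in the next SSE register at byte
   offset 8, while a single small scalar takes one of the early
   returns and comes back as a plain REG.  */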
6633 /* Update the data in CUM to advance over an argument of mode MODE
6634 and data type TYPE. (TYPE is null for libcalls where that information
6635 may not be available.) */
6636
6637 static void
6638 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6639 const_tree type, HOST_WIDE_INT bytes,
6640 HOST_WIDE_INT words)
6641 {
6642 switch (mode)
6643 {
6644 default:
6645 break;
6646
6647 case BLKmode:
6648 if (bytes < 0)
6649 break;
6650 /* FALLTHRU */
6651
6652 case DImode:
6653 case SImode:
6654 case HImode:
6655 case QImode:
6656 cum->words += words;
6657 cum->nregs -= words;
6658 cum->regno += words;
6659
6660 if (cum->nregs <= 0)
6661 {
6662 cum->nregs = 0;
6663 cum->regno = 0;
6664 }
6665 break;
6666
6667 case OImode:
6668 /* OImode shouldn't be used directly. */
6669 gcc_unreachable ();
6670
6671 case DFmode:
6672 if (cum->float_in_sse < 2)
6673 break;
6674 case SFmode:
6675 if (cum->float_in_sse < 1)
6676 break;
6677 /* FALLTHRU */
6678
6679 case V8SFmode:
6680 case V8SImode:
6681 case V32QImode:
6682 case V16HImode:
6683 case V4DFmode:
6684 case V4DImode:
6685 case TImode:
6686 case V16QImode:
6687 case V8HImode:
6688 case V4SImode:
6689 case V2DImode:
6690 case V4SFmode:
6691 case V2DFmode:
6692 if (!type || !AGGREGATE_TYPE_P (type))
6693 {
6694 cum->sse_words += words;
6695 cum->sse_nregs -= 1;
6696 cum->sse_regno += 1;
6697 if (cum->sse_nregs <= 0)
6698 {
6699 cum->sse_nregs = 0;
6700 cum->sse_regno = 0;
6701 }
6702 }
6703 break;
6704
6705 case V8QImode:
6706 case V4HImode:
6707 case V2SImode:
6708 case V2SFmode:
6709 case V1TImode:
6710 case V1DImode:
6711 if (!type || !AGGREGATE_TYPE_P (type))
6712 {
6713 cum->mmx_words += words;
6714 cum->mmx_nregs -= 1;
6715 cum->mmx_regno += 1;
6716 if (cum->mmx_nregs <= 0)
6717 {
6718 cum->mmx_nregs = 0;
6719 cum->mmx_regno = 0;
6720 }
6721 }
6722 break;
6723 }
6724 }
6725
6726 static void
6727 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6728 const_tree type, HOST_WIDE_INT words, bool named)
6729 {
6730 int int_nregs, sse_nregs;
6731
6732 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6733 if (!named && VALID_AVX256_REG_MODE (mode))
6734 return;
6735
6736 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6737 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6738 {
6739 cum->nregs -= int_nregs;
6740 cum->sse_nregs -= sse_nregs;
6741 cum->regno += int_nregs;
6742 cum->sse_regno += sse_nregs;
6743 }
6744 else
6745 {
6746 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6747 cum->words = (cum->words + align - 1) & ~(align - 1);
6748 cum->words += words;
6749 }
6750 }
6751
6752 static void
6753 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6754 HOST_WIDE_INT words)
6755 {
6756 /* Otherwise, this should be passed indirectly. */
6757 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6758
6759 cum->words += words;
6760 if (cum->nregs > 0)
6761 {
6762 cum->nregs -= 1;
6763 cum->regno += 1;
6764 }
6765 }
6766
6767 /* Update the data in CUM to advance over an argument of mode MODE and
6768 data type TYPE. (TYPE is null for libcalls where that information
6769 may not be available.) */
6770
6771 static void
6772 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6773 const_tree type, bool named)
6774 {
6775 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6776 HOST_WIDE_INT bytes, words;
6777
6778 if (mode == BLKmode)
6779 bytes = int_size_in_bytes (type);
6780 else
6781 bytes = GET_MODE_SIZE (mode);
6782 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6783
6784 if (type)
6785 mode = type_natural_mode (type, NULL);
6786
6787 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6788 function_arg_advance_ms_64 (cum, bytes, words);
6789 else if (TARGET_64BIT)
6790 function_arg_advance_64 (cum, mode, type, words, named);
6791 else
6792 function_arg_advance_32 (cum, mode, type, bytes, words);
6793 }
6794
6795 /* Define where to put the arguments to a function.
6796 Value is zero to push the argument on the stack,
6797 or a hard register in which to store the argument.
6798
6799 MODE is the argument's machine mode.
6800 TYPE is the data type of the argument (as a tree).
6801 This is null for libcalls where that information may
6802 not be available.
6803 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6804 the preceding args and about the function being called.
6805 NAMED is nonzero if this argument is a named parameter
6806 (otherwise it is an extra parameter matching an ellipsis). */
6807
6808 static rtx
6809 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6810 enum machine_mode orig_mode, const_tree type,
6811 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6812 {
6813 static bool warnedsse, warnedmmx;
6814
6815 /* Avoid the AL settings for the Unix64 ABI. */
6816 if (mode == VOIDmode)
6817 return constm1_rtx;
6818
6819 switch (mode)
6820 {
6821 default:
6822 break;
6823
6824 case BLKmode:
6825 if (bytes < 0)
6826 break;
6827 /* FALLTHRU */
6828 case DImode:
6829 case SImode:
6830 case HImode:
6831 case QImode:
6832 if (words <= cum->nregs)
6833 {
6834 int regno = cum->regno;
6835
6836 /* Fastcall allocates the first two DWORD (SImode) or
6837 smaller arguments to ECX and EDX, provided the argument
6838 is not an aggregate type. */
6839 if (cum->fastcall)
6840 {
6841 if (mode == BLKmode
6842 || mode == DImode
6843 || (type && AGGREGATE_TYPE_P (type)))
6844 break;
6845
6846 /* ECX, not EAX, is the first allocated register. */
6847 if (regno == AX_REG)
6848 regno = CX_REG;
6849 }
6850 return gen_rtx_REG (mode, regno);
6851 }
6852 break;
6853
6854 case DFmode:
6855 if (cum->float_in_sse < 2)
6856 break;
6857 case SFmode:
6858 if (cum->float_in_sse < 1)
6859 break;
6860 /* FALLTHRU */
6861 case TImode:
6862 /* In 32bit, we pass TImode in xmm registers. */
6863 case V16QImode:
6864 case V8HImode:
6865 case V4SImode:
6866 case V2DImode:
6867 case V4SFmode:
6868 case V2DFmode:
6869 if (!type || !AGGREGATE_TYPE_P (type))
6870 {
6871 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6872 {
6873 warnedsse = true;
6874 warning (0, "SSE vector argument without SSE enabled "
6875 "changes the ABI");
6876 }
6877 if (cum->sse_nregs)
6878 return gen_reg_or_parallel (mode, orig_mode,
6879 cum->sse_regno + FIRST_SSE_REG);
6880 }
6881 break;
6882
6883 case OImode:
6884 /* OImode shouldn't be used directly. */
6885 gcc_unreachable ();
6886
6887 case V8SFmode:
6888 case V8SImode:
6889 case V32QImode:
6890 case V16HImode:
6891 case V4DFmode:
6892 case V4DImode:
6893 if (!type || !AGGREGATE_TYPE_P (type))
6894 {
6895 if (cum->sse_nregs)
6896 return gen_reg_or_parallel (mode, orig_mode,
6897 cum->sse_regno + FIRST_SSE_REG);
6898 }
6899 break;
6900
6901 case V8QImode:
6902 case V4HImode:
6903 case V2SImode:
6904 case V2SFmode:
6905 case V1TImode:
6906 case V1DImode:
6907 if (!type || !AGGREGATE_TYPE_P (type))
6908 {
6909 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6910 {
6911 warnedmmx = true;
6912 warning (0, "MMX vector argument without MMX enabled "
6913 "changes the ABI");
6914 }
6915 if (cum->mmx_nregs)
6916 return gen_reg_or_parallel (mode, orig_mode,
6917 cum->mmx_regno + FIRST_MMX_REG);
6918 }
6919 break;
6920 }
6921
6922 return NULL_RTX;
6923 }
6924
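/* For concreteness, with the 32-bit fastcall handling above, a
   declaration along the lines of (hypothetical example)

     int __attribute__ ((fastcall)) f (int a, int b, int c);

   receives A in %ecx and B in %edx, while C, as well as any DImode
   or aggregate argument, is pushed on the stack.  */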
6925 static rtx
6926 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6927 enum machine_mode orig_mode, const_tree type, bool named)
6928 {
6929 /* Handle a hidden AL argument containing the number of registers
6930 for varargs x86-64 functions. */
6931 if (mode == VOIDmode)
6932 return GEN_INT (cum->maybe_vaarg
6933 ? (cum->sse_nregs < 0
6934 ? X86_64_SSE_REGPARM_MAX
6935 : cum->sse_regno)
6936 : -1);
6937
6938 switch (mode)
6939 {
6940 default:
6941 break;
6942
6943 case V8SFmode:
6944 case V8SImode:
6945 case V32QImode:
6946 case V16HImode:
6947 case V4DFmode:
6948 case V4DImode:
6949 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6950 if (!named)
6951 return NULL;
6952 break;
6953 }
6954
6955 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6956 cum->sse_nregs,
6957 &x86_64_int_parameter_registers [cum->regno],
6958 cum->sse_regno);
6959 }
6960
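/* The VOIDmode case above implements the SysV x86-64 rule that a
   variadic call reports in %al how many SSE argument registers it
   uses; e.g. a call such as printf ("%f", 1.0) is emitted with %al
   set to 1, and a variadic call using no floating-point argument
   registers sets %al to 0.  */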
6961 static rtx
6962 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6963 enum machine_mode orig_mode, bool named,
6964 HOST_WIDE_INT bytes)
6965 {
6966 unsigned int regno;
6967
6968 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6969 We use the value -2 to specify that the current function call follows the MS ABI. */
6970 if (mode == VOIDmode)
6971 return GEN_INT (-2);
6972
6973 /* If we've run out of registers, it goes on the stack. */
6974 if (cum->nregs == 0)
6975 return NULL_RTX;
6976
6977 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6978
6979 /* Only floating point modes are passed in anything but integer regs. */
6980 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6981 {
6982 if (named)
6983 regno = cum->regno + FIRST_SSE_REG;
6984 else
6985 {
6986 rtx t1, t2;
6987
6988 /* Unnamed floating parameters are passed in both the
6989 SSE and integer registers. */
6990 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6991 t2 = gen_rtx_REG (mode, regno);
6992 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6993 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6994 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6995 }
6996 }
6997 /* Handle aggregate types passed in registers. */
6998 if (orig_mode == BLKmode)
6999 {
7000 if (bytes > 0 && bytes <= 8)
7001 mode = (bytes > 4 ? DImode : SImode);
7002 if (mode == BLKmode)
7003 mode = DImode;
7004 }
7005
7006 return gen_reg_or_parallel (mode, orig_mode, regno);
7007 }
7008
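/* Under the Microsoft x64 convention handled above, argument slots
   are positional: the first four arguments use %rcx, %rdx, %r8 and
   %r9, with SFmode/DFmode arguments using %xmm0-%xmm3 instead, and an
   unnamed floating-point argument is returned as a PARALLEL so that
   it is passed in both the SSE and the integer register of its
   slot.  */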
7009 /* Return where to put the arguments to a function.
7010 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7011
7012 MODE is the argument's machine mode. TYPE is the data type of the
7013 argument. It is null for libcalls where that information may not be
7014 available. CUM gives information about the preceding args and about
7015 the function being called. NAMED is nonzero if this argument is a
7016 named parameter (otherwise it is an extra parameter matching an
7017 ellipsis). */
7018
7019 static rtx
7020 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7021 const_tree type, bool named)
7022 {
7023 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7024 enum machine_mode mode = omode;
7025 HOST_WIDE_INT bytes, words;
7026 rtx arg;
7027
7028 if (mode == BLKmode)
7029 bytes = int_size_in_bytes (type);
7030 else
7031 bytes = GET_MODE_SIZE (mode);
7032 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7033
7034 /* To simplify the code below, represent vector types with a vector mode
7035 even if MMX/SSE are not active. */
7036 if (type && TREE_CODE (type) == VECTOR_TYPE)
7037 mode = type_natural_mode (type, cum);
7038
7039 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7040 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7041 else if (TARGET_64BIT)
7042 arg = function_arg_64 (cum, mode, omode, type, named);
7043 else
7044 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7045
7046 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
7047 {
7048 /* This argument uses 256bit AVX modes. */
7049 if (cum->caller)
7050 cfun->machine->callee_pass_avx256_p = true;
7051 else
7052 cfun->machine->caller_pass_avx256_p = true;
7053 }
7054
7055 return arg;
7056 }
7057
7058 /* A C expression that indicates when an argument must be passed by
7059 reference. If nonzero for an argument, a copy of that argument is
7060 made in memory and a pointer to the argument is passed instead of
7061 the argument itself. The pointer is passed in whatever way is
7062 appropriate for passing a pointer to that type. */
7063
7064 static bool
7065 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
7066 enum machine_mode mode ATTRIBUTE_UNUSED,
7067 const_tree type, bool named ATTRIBUTE_UNUSED)
7068 {
7069 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7070
7071 /* See Windows x64 Software Convention. */
7072 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7073 {
7074 int msize = (int) GET_MODE_SIZE (mode);
7075 if (type)
7076 {
7077 /* Arrays are passed by reference. */
7078 if (TREE_CODE (type) == ARRAY_TYPE)
7079 return true;
7080
7081 if (AGGREGATE_TYPE_P (type))
7082 {
7083 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7084 are passed by reference. */
7085 msize = int_size_in_bytes (type);
7086 }
7087 }
7088
7089 /* __m128 is passed by reference. */
7090 switch (msize) {
7091 case 1: case 2: case 4: case 8:
7092 break;
7093 default:
7094 return true;
7095 }
7096 }
7097 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7098 return 1;
7099
7100 return 0;
7101 }
7102
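/* Some concrete consequences of the checks above: under the Windows
   x64 convention, arrays and any aggregate whose size is not 1, 2, 4
   or 8 bytes (a 12-byte struct, or __m128, for example) are passed by
   reference, whereas an 8-byte struct is passed by value in a
   register; under the SysV x86-64 convention only variable-sized
   types (int_size_in_bytes returning -1) are forced through this
   path.  */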
7103 /* Return true when TYPE should be 128bit aligned for 32bit argument
7104 passing ABI. XXX: This function is obsolete and is only used for
7105 checking psABI compatibility with previous versions of GCC. */
7106
7107 static bool
7108 ix86_compat_aligned_value_p (const_tree type)
7109 {
7110 enum machine_mode mode = TYPE_MODE (type);
7111 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7112 || mode == TDmode
7113 || mode == TFmode
7114 || mode == TCmode)
7115 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7116 return true;
7117 if (TYPE_ALIGN (type) < 128)
7118 return false;
7119
7120 if (AGGREGATE_TYPE_P (type))
7121 {
7122 /* Walk the aggregates recursively. */
7123 switch (TREE_CODE (type))
7124 {
7125 case RECORD_TYPE:
7126 case UNION_TYPE:
7127 case QUAL_UNION_TYPE:
7128 {
7129 tree field;
7130
7131 /* Walk all the structure fields. */
7132 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7133 {
7134 if (TREE_CODE (field) == FIELD_DECL
7135 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7136 return true;
7137 }
7138 break;
7139 }
7140
7141 case ARRAY_TYPE:
7142 /* Just for use if some languages pass arrays by value. */
7143 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7144 return true;
7145 break;
7146
7147 default:
7148 gcc_unreachable ();
7149 }
7150 }
7151 return false;
7152 }
7153
7154 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7155 XXX: This function is obsolete and is only used for checking psABI
7156 compatibility with previous versions of GCC. */
7157
7158 static unsigned int
7159 ix86_compat_function_arg_boundary (enum machine_mode mode,
7160 const_tree type, unsigned int align)
7161 {
7162 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7163 natural boundaries. */
7164 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7165 {
7166 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7167 make an exception for SSE modes since these require 128bit
7168 alignment.
7169
7170 The handling here differs from field_alignment. ICC aligns MMX
7171 arguments to 4 byte boundaries, while structure fields are aligned
7172 to 8 byte boundaries. */
7173 if (!type)
7174 {
7175 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7176 align = PARM_BOUNDARY;
7177 }
7178 else
7179 {
7180 if (!ix86_compat_aligned_value_p (type))
7181 align = PARM_BOUNDARY;
7182 }
7183 }
7184 if (align > BIGGEST_ALIGNMENT)
7185 align = BIGGEST_ALIGNMENT;
7186 return align;
7187 }
7188
7189 /* Return true when TYPE should be 128bit aligned for 32bit argument
7190 passing ABI. */
7191
7192 static bool
7193 ix86_contains_aligned_value_p (const_tree type)
7194 {
7195 enum machine_mode mode = TYPE_MODE (type);
7196
7197 if (mode == XFmode || mode == XCmode)
7198 return false;
7199
7200 if (TYPE_ALIGN (type) < 128)
7201 return false;
7202
7203 if (AGGREGATE_TYPE_P (type))
7204 {
7205 /* Walk the aggregates recursively. */
7206 switch (TREE_CODE (type))
7207 {
7208 case RECORD_TYPE:
7209 case UNION_TYPE:
7210 case QUAL_UNION_TYPE:
7211 {
7212 tree field;
7213
7214 /* Walk all the structure fields. */
7215 for (field = TYPE_FIELDS (type);
7216 field;
7217 field = DECL_CHAIN (field))
7218 {
7219 if (TREE_CODE (field) == FIELD_DECL
7220 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7221 return true;
7222 }
7223 break;
7224 }
7225
7226 case ARRAY_TYPE:
7227 /* Just for use if some languages pass arrays by value. */
7228 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7229 return true;
7230 break;
7231
7232 default:
7233 gcc_unreachable ();
7234 }
7235 }
7236 else
7237 return TYPE_ALIGN (type) >= 128;
7238
7239 return false;
7240 }
7241
7242 /* Gives the alignment boundary, in bits, of an argument with the
7243 specified mode and type. */
7244
7245 static unsigned int
7246 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7247 {
7248 unsigned int align;
7249 if (type)
7250 {
7251 /* Since the main variant type is used for the call, convert the
7252 type to its main variant. */
7253 type = TYPE_MAIN_VARIANT (type);
7254 align = TYPE_ALIGN (type);
7255 }
7256 else
7257 align = GET_MODE_ALIGNMENT (mode);
7258 if (align < PARM_BOUNDARY)
7259 align = PARM_BOUNDARY;
7260 else
7261 {
7262 static bool warned;
7263 unsigned int saved_align = align;
7264
7265 if (!TARGET_64BIT)
7266 {
7267 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7268 if (!type)
7269 {
7270 if (mode == XFmode || mode == XCmode)
7271 align = PARM_BOUNDARY;
7272 }
7273 else if (!ix86_contains_aligned_value_p (type))
7274 align = PARM_BOUNDARY;
7275
7276 if (align < 128)
7277 align = PARM_BOUNDARY;
7278 }
7279
7280 if (warn_psabi
7281 && !warned
7282 && align != ix86_compat_function_arg_boundary (mode, type,
7283 saved_align))
7284 {
7285 warned = true;
7286 inform (input_location,
7287 "The ABI for passing parameters with %d-byte"
7288 " alignment has changed in GCC 4.6",
7289 align / BITS_PER_UNIT);
7290 }
7291 }
7292
7293 return align;
7294 }
7295
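/* For example, with the 32-bit ABI a plain int or double argument
   stays at PARM_BOUNDARY (32 bits), while __m128, or an aggregate
   containing an __m128 field, yields a 128-bit boundary from this
   function; the warn_psabi inform above flags arguments for which
   this answer differs from the pre-4.6 result computed by
   ix86_compat_function_arg_boundary.  */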
7296 /* Return true if N is a possible register number of function value. */
7297
7298 static bool
7299 ix86_function_value_regno_p (const unsigned int regno)
7300 {
7301 switch (regno)
7302 {
7303 case AX_REG:
7304 return true;
7305
7306 case FIRST_FLOAT_REG:
7307 /* TODO: The function should depend on the current function's ABI, but
7308 builtins.c would need updating then. Therefore we use the
7309 default ABI. */
7310 if (TARGET_64BIT && ix86_abi == MS_ABI)
7311 return false;
7312 return TARGET_FLOAT_RETURNS_IN_80387;
7313
7314 case FIRST_SSE_REG:
7315 return TARGET_SSE;
7316
7317 case FIRST_MMX_REG:
7318 if (TARGET_MACHO || TARGET_64BIT)
7319 return false;
7320 return TARGET_MMX;
7321 }
7322
7323 return false;
7324 }
7325
7326 /* Define how to find the value returned by a function.
7327 VALTYPE is the data type of the value (as a tree).
7328 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7329 otherwise, FUNC is 0. */
7330
7331 static rtx
7332 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7333 const_tree fntype, const_tree fn)
7334 {
7335 unsigned int regno;
7336
7337 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7338 we normally prevent this case when mmx is not available. However
7339 some ABIs may require the result to be returned like DImode. */
7340 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7341 regno = FIRST_MMX_REG;
7342
7343 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7344 we prevent this case when sse is not available. However some ABIs
7345 may require the result to be returned like integer TImode. */
7346 else if (mode == TImode
7347 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7348 regno = FIRST_SSE_REG;
7349
7350 /* 32-byte vector modes in %ymm0. */
7351 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7352 regno = FIRST_SSE_REG;
7353
7354 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7355 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7356 regno = FIRST_FLOAT_REG;
7357 else
7358 /* Most things go in %eax. */
7359 regno = AX_REG;
7360
7361 /* Override FP return register with %xmm0 for local functions when
7362 SSE math is enabled or for functions with sseregparm attribute. */
7363 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7364 {
7365 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7366 if ((sse_level >= 1 && mode == SFmode)
7367 || (sse_level == 2 && mode == DFmode))
7368 regno = FIRST_SSE_REG;
7369 }
7370
7371 /* OImode shouldn't be used directly. */
7372 gcc_assert (mode != OImode);
7373
7374 return gen_rtx_REG (orig_mode, regno);
7375 }
7376
7377 static rtx
7378 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7379 const_tree valtype)
7380 {
7381 rtx ret;
7382
7383 /* Handle libcalls, which don't provide a type node. */
7384 if (valtype == NULL)
7385 {
7386 unsigned int regno;
7387
7388 switch (mode)
7389 {
7390 case SFmode:
7391 case SCmode:
7392 case DFmode:
7393 case DCmode:
7394 case TFmode:
7395 case SDmode:
7396 case DDmode:
7397 case TDmode:
7398 regno = FIRST_SSE_REG;
7399 break;
7400 case XFmode:
7401 case XCmode:
7402 regno = FIRST_FLOAT_REG;
7403 break;
7404 case TCmode:
7405 return NULL;
7406 default:
7407 regno = AX_REG;
7408 }
7409
7410 return gen_rtx_REG (mode, regno);
7411 }
7412 else if (POINTER_TYPE_P (valtype))
7413 {
7414 /* Pointers are always returned in Pmode. */
7415 mode = Pmode;
7416 }
7417
7418 ret = construct_container (mode, orig_mode, valtype, 1,
7419 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7420 x86_64_int_return_registers, 0);
7421
7422 /* For zero-sized structures, construct_container returns NULL, but we
7423 need to keep the rest of the compiler happy by returning a meaningful value. */
7424 if (!ret)
7425 ret = gen_rtx_REG (orig_mode, AX_REG);
7426
7427 return ret;
7428 }
7429
7430 static rtx
7431 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7432 {
7433 unsigned int regno = AX_REG;
7434
7435 if (TARGET_SSE)
7436 {
7437 switch (GET_MODE_SIZE (mode))
7438 {
7439 case 16:
7440 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7441 && !COMPLEX_MODE_P (mode))
7442 regno = FIRST_SSE_REG;
7443 break;
7444 case 8:
7445 case 4:
7446 if (mode == SFmode || mode == DFmode)
7447 regno = FIRST_SSE_REG;
7448 break;
7449 default:
7450 break;
7451 }
7452 }
7453 return gen_rtx_REG (orig_mode, regno);
7454 }
7455
7456 static rtx
7457 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7458 enum machine_mode orig_mode, enum machine_mode mode)
7459 {
7460 const_tree fn, fntype;
7461
7462 fn = NULL_TREE;
7463 if (fntype_or_decl && DECL_P (fntype_or_decl))
7464 fn = fntype_or_decl;
7465 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7466
7467 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7468 return function_value_ms_64 (orig_mode, mode);
7469 else if (TARGET_64BIT)
7470 return function_value_64 (orig_mode, mode, valtype);
7471 else
7472 return function_value_32 (orig_mode, mode, fntype, fn);
7473 }
7474
7475 static rtx
7476 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7477 bool outgoing ATTRIBUTE_UNUSED)
7478 {
7479 enum machine_mode mode, orig_mode;
7480
7481 orig_mode = TYPE_MODE (valtype);
7482 mode = type_natural_mode (valtype, NULL);
7483 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7484 }
7485
7486 /* Pointer function arguments and return values are promoted to Pmode. */
7487
7488 static enum machine_mode
7489 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7490 int *punsignedp, const_tree fntype,
7491 int for_return)
7492 {
7493 if (type != NULL_TREE && POINTER_TYPE_P (type))
7494 {
7495 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7496 return Pmode;
7497 }
7498 return default_promote_function_mode (type, mode, punsignedp, fntype,
7499 for_return);
7500 }
7501
7502 rtx
7503 ix86_libcall_value (enum machine_mode mode)
7504 {
7505 return ix86_function_value_1 (NULL, NULL, mode, mode);
7506 }
7507
7508 /* Return true iff type is returned in memory. */
7509
7510 static bool ATTRIBUTE_UNUSED
7511 return_in_memory_32 (const_tree type, enum machine_mode mode)
7512 {
7513 HOST_WIDE_INT size;
7514
7515 if (mode == BLKmode)
7516 return true;
7517
7518 size = int_size_in_bytes (type);
7519
7520 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7521 return false;
7522
7523 if (VECTOR_MODE_P (mode) || mode == TImode)
7524 {
7525 /* User-created vectors small enough to fit in EAX. */
7526 if (size < 8)
7527 return false;
7528
7529 /* MMX/3dNow values are returned in MM0,
7530 except when it doesn't exist or the ABI prescribes otherwise. */
7531 if (size == 8)
7532 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7533
7534 /* SSE values are returned in XMM0, except when it doesn't exist. */
7535 if (size == 16)
7536 return !TARGET_SSE;
7537
7538 /* AVX values are returned in YMM0, except when it doesn't exist. */
7539 if (size == 32)
7540 return !TARGET_AVX;
7541 }
7542
7543 if (mode == XFmode)
7544 return false;
7545
7546 if (size > 12)
7547 return true;
7548
7549 /* OImode shouldn't be used directly. */
7550 gcc_assert (mode != OImode);
7551
7552 return false;
7553 }
7554
7555 static bool ATTRIBUTE_UNUSED
7556 return_in_memory_64 (const_tree type, enum machine_mode mode)
7557 {
7558 int needed_intregs, needed_sseregs;
7559 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7560 }
7561
7562 static bool ATTRIBUTE_UNUSED
7563 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7564 {
7565 HOST_WIDE_INT size = int_size_in_bytes (type);
7566
7567 /* __m128 is returned in xmm0. */
7568 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7569 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7570 return false;
7571
7572 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
7573 return size != 1 && size != 2 && size != 4 && size != 8;
7574 }
7575
7576 static bool
7577 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7578 {
7579 #ifdef SUBTARGET_RETURN_IN_MEMORY
7580 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7581 #else
7582 const enum machine_mode mode = type_natural_mode (type, NULL);
7583
7584 if (TARGET_64BIT)
7585 {
7586 if (ix86_function_type_abi (fntype) == MS_ABI)
7587 return return_in_memory_ms_64 (type, mode);
7588 else
7589 return return_in_memory_64 (type, mode);
7590 }
7591 else
7592 return return_in_memory_32 (type, mode);
7593 #endif
7594 }
7595
7596 /* When returning SSE vector types, we have a choice of either
7597 (1) being abi incompatible with a -march switch, or
7598 (2) generating an error.
7599 Given no good solution, I think the safest thing is one warning.
7600 The user won't be able to use -Werror, but....
7601
7602 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7603 called in response to actually generating a caller or callee that
7604 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7605 via aggregate_value_p for general type probing from tree-ssa. */
7606
7607 static rtx
7608 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7609 {
7610 static bool warnedsse, warnedmmx;
7611
7612 if (!TARGET_64BIT && type)
7613 {
7614 /* Look at the return type of the function, not the function type. */
7615 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7616
7617 if (!TARGET_SSE && !warnedsse)
7618 {
7619 if (mode == TImode
7620 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7621 {
7622 warnedsse = true;
7623 warning (0, "SSE vector return without SSE enabled "
7624 "changes the ABI");
7625 }
7626 }
7627
7628 if (!TARGET_MMX && !warnedmmx)
7629 {
7630 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7631 {
7632 warnedmmx = true;
7633 warning (0, "MMX vector return without MMX enabled "
7634 "changes the ABI");
7635 }
7636 }
7637 }
7638
7639 return NULL;
7640 }
7641
7642 \f
7643 /* Create the va_list data type. */
7644
7645 /* Returns the calling convention specific va_list data type.
7646 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7647
7648 static tree
7649 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7650 {
7651 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7652
7653 /* For i386 we use plain pointer to argument area. */
7654 if (!TARGET_64BIT || abi == MS_ABI)
7655 return build_pointer_type (char_type_node);
7656
7657 record = lang_hooks.types.make_type (RECORD_TYPE);
7658 type_decl = build_decl (BUILTINS_LOCATION,
7659 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7660
7661 f_gpr = build_decl (BUILTINS_LOCATION,
7662 FIELD_DECL, get_identifier ("gp_offset"),
7663 unsigned_type_node);
7664 f_fpr = build_decl (BUILTINS_LOCATION,
7665 FIELD_DECL, get_identifier ("fp_offset"),
7666 unsigned_type_node);
7667 f_ovf = build_decl (BUILTINS_LOCATION,
7668 FIELD_DECL, get_identifier ("overflow_arg_area"),
7669 ptr_type_node);
7670 f_sav = build_decl (BUILTINS_LOCATION,
7671 FIELD_DECL, get_identifier ("reg_save_area"),
7672 ptr_type_node);
7673
7674 va_list_gpr_counter_field = f_gpr;
7675 va_list_fpr_counter_field = f_fpr;
7676
7677 DECL_FIELD_CONTEXT (f_gpr) = record;
7678 DECL_FIELD_CONTEXT (f_fpr) = record;
7679 DECL_FIELD_CONTEXT (f_ovf) = record;
7680 DECL_FIELD_CONTEXT (f_sav) = record;
7681
7682 TYPE_STUB_DECL (record) = type_decl;
7683 TYPE_NAME (record) = type_decl;
7684 TYPE_FIELDS (record) = f_gpr;
7685 DECL_CHAIN (f_gpr) = f_fpr;
7686 DECL_CHAIN (f_fpr) = f_ovf;
7687 DECL_CHAIN (f_ovf) = f_sav;
7688
7689 layout_type (record);
7690
7691 /* The correct type is an array type of one element. */
7692 return build_array_type (record, build_index_type (size_zero_node));
7693 }
7694
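/* At the C level the record built above is roughly equivalent to

     typedef struct __va_list_tag
     {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];

   i.e. an array of one element, so that a va_list decays to a pointer
   when passed to another function.  */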
7695 /* Set up the builtin va_list data type and, for 64-bit, the additional
7696 calling convention specific va_list data types. */
7697
7698 static tree
7699 ix86_build_builtin_va_list (void)
7700 {
7701 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7702
7703 /* Initialize abi specific va_list builtin types. */
7704 if (TARGET_64BIT)
7705 {
7706 tree t;
7707 if (ix86_abi == MS_ABI)
7708 {
7709 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7710 if (TREE_CODE (t) != RECORD_TYPE)
7711 t = build_variant_type_copy (t);
7712 sysv_va_list_type_node = t;
7713 }
7714 else
7715 {
7716 t = ret;
7717 if (TREE_CODE (t) != RECORD_TYPE)
7718 t = build_variant_type_copy (t);
7719 sysv_va_list_type_node = t;
7720 }
7721 if (ix86_abi != MS_ABI)
7722 {
7723 t = ix86_build_builtin_va_list_abi (MS_ABI);
7724 if (TREE_CODE (t) != RECORD_TYPE)
7725 t = build_variant_type_copy (t);
7726 ms_va_list_type_node = t;
7727 }
7728 else
7729 {
7730 t = ret;
7731 if (TREE_CODE (t) != RECORD_TYPE)
7732 t = build_variant_type_copy (t);
7733 ms_va_list_type_node = t;
7734 }
7735 }
7736
7737 return ret;
7738 }
7739
7740 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7741
7742 static void
7743 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7744 {
7745 rtx save_area, mem;
7746 alias_set_type set;
7747 int i, max;
7748
7749 /* GPR size of varargs save area. */
7750 if (cfun->va_list_gpr_size)
7751 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7752 else
7753 ix86_varargs_gpr_size = 0;
7754
7755 /* FPR size of varargs save area. We don't need it if we don't pass
7756 anything in SSE registers. */
7757 if (TARGET_SSE && cfun->va_list_fpr_size)
7758 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7759 else
7760 ix86_varargs_fpr_size = 0;
7761
7762 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7763 return;
7764
7765 save_area = frame_pointer_rtx;
7766 set = get_varargs_alias_set ();
7767
7768 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7769 if (max > X86_64_REGPARM_MAX)
7770 max = X86_64_REGPARM_MAX;
7771
7772 for (i = cum->regno; i < max; i++)
7773 {
7774 mem = gen_rtx_MEM (Pmode,
7775 plus_constant (save_area, i * UNITS_PER_WORD));
7776 MEM_NOTRAP_P (mem) = 1;
7777 set_mem_alias_set (mem, set);
7778 emit_move_insn (mem, gen_rtx_REG (Pmode,
7779 x86_64_int_parameter_registers[i]));
7780 }
7781
7782 if (ix86_varargs_fpr_size)
7783 {
7784 enum machine_mode smode;
7785 rtx label, test;
7786
7787 /* Now emit code to save SSE registers. The AX parameter contains number
7788 of SSE parameter registers used to call this function, though all we
7789 actually check here is the zero/non-zero status. */
7790
7791 label = gen_label_rtx ();
7792 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7793 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7794 label));
7795
7796 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7797 we used movdqa (i.e. TImode) instead? Perhaps even better would
7798 be if we could determine the real mode of the data, via a hook
7799 into pass_stdarg. Ignore all that for now. */
7800 smode = V4SFmode;
7801 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7802 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7803
7804 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7805 if (max > X86_64_SSE_REGPARM_MAX)
7806 max = X86_64_SSE_REGPARM_MAX;
7807
7808 for (i = cum->sse_regno; i < max; ++i)
7809 {
7810 mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7811 mem = gen_rtx_MEM (smode, mem);
7812 MEM_NOTRAP_P (mem) = 1;
7813 set_mem_alias_set (mem, set);
7814 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7815
7816 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7817 }
7818
7819 emit_label (label);
7820 }
7821 }
7822
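/* The register save area laid out above occupies at most
   X86_64_REGPARM_MAX * 8 = 48 bytes for the six integer argument
   registers, followed by X86_64_SSE_REGPARM_MAX * 16 = 128 bytes of
   16-byte slots for %xmm0-%xmm7; the SSE half is only written when
   %al indicates that the caller actually used vector registers.  */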
7823 static void
7824 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7825 {
7826 alias_set_type set = get_varargs_alias_set ();
7827 int i;
7828
7829 /* Reset to zero, as a SysV va_arg may have been used
7830 before. */
7831 ix86_varargs_gpr_size = 0;
7832 ix86_varargs_fpr_size = 0;
7833
7834 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7835 {
7836 rtx reg, mem;
7837
7838 mem = gen_rtx_MEM (Pmode,
7839 plus_constant (virtual_incoming_args_rtx,
7840 i * UNITS_PER_WORD));
7841 MEM_NOTRAP_P (mem) = 1;
7842 set_mem_alias_set (mem, set);
7843
7844 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7845 emit_move_insn (mem, reg);
7846 }
7847 }
7848
7849 static void
7850 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7851 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7852 int no_rtl)
7853 {
7854 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7855 CUMULATIVE_ARGS next_cum;
7856 tree fntype;
7857
7858 /* This argument doesn't appear to be used anymore, which is good,
7859 because the old code here didn't suppress rtl generation. */
7860 gcc_assert (!no_rtl);
7861
7862 if (!TARGET_64BIT)
7863 return;
7864
7865 fntype = TREE_TYPE (current_function_decl);
7866
7867 /* For varargs, we do not want to skip the dummy va_dcl argument.
7868 For stdargs, we do want to skip the last named argument. */
7869 next_cum = *cum;
7870 if (stdarg_p (fntype))
7871 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7872 true);
7873
7874 if (cum->call_abi == MS_ABI)
7875 setup_incoming_varargs_ms_64 (&next_cum);
7876 else
7877 setup_incoming_varargs_64 (&next_cum);
7878 }
7879
7880 /* Checks whether TYPE is the char * kind of va_list. */
7881
7882 static bool
7883 is_va_list_char_pointer (tree type)
7884 {
7885 tree canonic;
7886
7887 /* For 32-bit it is always true. */
7888 if (!TARGET_64BIT)
7889 return true;
7890 canonic = ix86_canonical_va_list_type (type);
7891 return (canonic == ms_va_list_type_node
7892 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7893 }
7894
7895 /* Implement va_start. */
7896
7897 static void
7898 ix86_va_start (tree valist, rtx nextarg)
7899 {
7900 HOST_WIDE_INT words, n_gpr, n_fpr;
7901 tree f_gpr, f_fpr, f_ovf, f_sav;
7902 tree gpr, fpr, ovf, sav, t;
7903 tree type;
7904 rtx ovf_rtx;
7905
7906 if (flag_split_stack
7907 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7908 {
7909 unsigned int scratch_regno;
7910
7911 /* When we are splitting the stack, we can't refer to the stack
7912 arguments using internal_arg_pointer, because they may be on
7913 the old stack. The split stack prologue will arrange to
7914 leave a pointer to the old stack arguments in a scratch
7915 register, which we here copy to a pseudo-register. The split
7916 stack prologue can't set the pseudo-register directly because
7917 it (the prologue) runs before any registers have been saved. */
7918
7919 scratch_regno = split_stack_prologue_scratch_regno ();
7920 if (scratch_regno != INVALID_REGNUM)
7921 {
7922 rtx reg, seq;
7923
7924 reg = gen_reg_rtx (Pmode);
7925 cfun->machine->split_stack_varargs_pointer = reg;
7926
7927 start_sequence ();
7928 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7929 seq = get_insns ();
7930 end_sequence ();
7931
7932 push_topmost_sequence ();
7933 emit_insn_after (seq, entry_of_function ());
7934 pop_topmost_sequence ();
7935 }
7936 }
7937
7938 /* Only 64bit target needs something special. */
7939 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7940 {
7941 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7942 std_expand_builtin_va_start (valist, nextarg);
7943 else
7944 {
7945 rtx va_r, next;
7946
7947 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7948 next = expand_binop (ptr_mode, add_optab,
7949 cfun->machine->split_stack_varargs_pointer,
7950 crtl->args.arg_offset_rtx,
7951 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7952 convert_move (va_r, next, 0);
7953 }
7954 return;
7955 }
7956
7957 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7958 f_fpr = DECL_CHAIN (f_gpr);
7959 f_ovf = DECL_CHAIN (f_fpr);
7960 f_sav = DECL_CHAIN (f_ovf);
7961
7962 valist = build_simple_mem_ref (valist);
7963 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7964 /* The following should be folded into the MEM_REF offset. */
7965 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7966 f_gpr, NULL_TREE);
7967 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7968 f_fpr, NULL_TREE);
7969 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7970 f_ovf, NULL_TREE);
7971 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7972 f_sav, NULL_TREE);
7973
7974 /* Count number of gp and fp argument registers used. */
7975 words = crtl->args.info.words;
7976 n_gpr = crtl->args.info.regno;
7977 n_fpr = crtl->args.info.sse_regno;
7978
7979 if (cfun->va_list_gpr_size)
7980 {
7981 type = TREE_TYPE (gpr);
7982 t = build2 (MODIFY_EXPR, type,
7983 gpr, build_int_cst (type, n_gpr * 8));
7984 TREE_SIDE_EFFECTS (t) = 1;
7985 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7986 }
7987
7988 if (TARGET_SSE && cfun->va_list_fpr_size)
7989 {
7990 type = TREE_TYPE (fpr);
7991 t = build2 (MODIFY_EXPR, type, fpr,
7992 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7993 TREE_SIDE_EFFECTS (t) = 1;
7994 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7995 }
7996
7997 /* Find the overflow area. */
7998 type = TREE_TYPE (ovf);
7999 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8000 ovf_rtx = crtl->args.internal_arg_pointer;
8001 else
8002 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8003 t = make_tree (type, ovf_rtx);
8004 if (words != 0)
8005 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8006 t = build2 (MODIFY_EXPR, type, ovf, t);
8007 TREE_SIDE_EFFECTS (t) = 1;
8008 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8009
8010 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8011 {
8012 /* Find the register save area.
8013 The function prologue saves it right above the stack frame. */
8014 type = TREE_TYPE (sav);
8015 t = make_tree (type, frame_pointer_rtx);
8016 if (!ix86_varargs_gpr_size)
8017 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8018 t = build2 (MODIFY_EXPR, type, sav, t);
8019 TREE_SIDE_EFFECTS (t) = 1;
8020 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8021 }
8022 }
8023
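/* For instance, in a function declared as (hypothetical example)

     void f (int a, double b, ...);

   the code above initializes gp_offset to 1 * 8 = 8 (one integer
   register consumed by A) and fp_offset to 48 + 1 * 16 = 64 (one SSE
   register consumed by B), points overflow_arg_area at the first
   stack-passed argument and reg_save_area at the area saved by the
   prologue.  */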
8024 /* Implement va_arg. */
8025
8026 static tree
8027 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8028 gimple_seq *post_p)
8029 {
8030 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8031 tree f_gpr, f_fpr, f_ovf, f_sav;
8032 tree gpr, fpr, ovf, sav, t;
8033 int size, rsize;
8034 tree lab_false, lab_over = NULL_TREE;
8035 tree addr, t2;
8036 rtx container;
8037 int indirect_p = 0;
8038 tree ptrtype;
8039 enum machine_mode nat_mode;
8040 unsigned int arg_boundary;
8041
8042 /* Only 64bit target needs something special. */
8043 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8044 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8045
8046 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8047 f_fpr = DECL_CHAIN (f_gpr);
8048 f_ovf = DECL_CHAIN (f_fpr);
8049 f_sav = DECL_CHAIN (f_ovf);
8050
8051 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8052 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8053 valist = build_va_arg_indirect_ref (valist);
8054 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8055 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8056 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8057
8058 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8059 if (indirect_p)
8060 type = build_pointer_type (type);
8061 size = int_size_in_bytes (type);
8062 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8063
8064 nat_mode = type_natural_mode (type, NULL);
8065 switch (nat_mode)
8066 {
8067 case V8SFmode:
8068 case V8SImode:
8069 case V32QImode:
8070 case V16HImode:
8071 case V4DFmode:
8072 case V4DImode:
8073 /* Unnamed 256bit vector mode parameters are passed on the stack. */
8074 if (!TARGET_64BIT_MS_ABI)
8075 {
8076 container = NULL;
8077 break;
8078 }
8079
8080 default:
8081 container = construct_container (nat_mode, TYPE_MODE (type),
8082 type, 0, X86_64_REGPARM_MAX,
8083 X86_64_SSE_REGPARM_MAX, intreg,
8084 0);
8085 break;
8086 }
8087
8088 /* Pull the value out of the saved registers. */
8089
8090 addr = create_tmp_var (ptr_type_node, "addr");
8091
8092 if (container)
8093 {
8094 int needed_intregs, needed_sseregs;
8095 bool need_temp;
8096 tree int_addr, sse_addr;
8097
8098 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8099 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8100
8101 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8102
8103 need_temp = (!REG_P (container)
8104 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8105 || TYPE_ALIGN (type) > 128));
8106
8107 /* If we are passing a structure, verify that it is a consecutive block
8108 in the register save area. If not, we need to do moves. */
8109 if (!need_temp && !REG_P (container))
8110 {
8111 /* Verify that all registers are strictly consecutive */
8112 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8113 {
8114 int i;
8115
8116 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8117 {
8118 rtx slot = XVECEXP (container, 0, i);
8119 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8120 || INTVAL (XEXP (slot, 1)) != i * 16)
8121 need_temp = 1;
8122 }
8123 }
8124 else
8125 {
8126 int i;
8127
8128 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8129 {
8130 rtx slot = XVECEXP (container, 0, i);
8131 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8132 || INTVAL (XEXP (slot, 1)) != i * 8)
8133 need_temp = 1;
8134 }
8135 }
8136 }
8137 if (!need_temp)
8138 {
8139 int_addr = addr;
8140 sse_addr = addr;
8141 }
8142 else
8143 {
8144 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8145 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8146 }
8147
8148 /* First ensure that we fit completely in registers. */
8149 if (needed_intregs)
8150 {
8151 t = build_int_cst (TREE_TYPE (gpr),
8152 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8153 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8154 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8155 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8156 gimplify_and_add (t, pre_p);
8157 }
8158 if (needed_sseregs)
8159 {
8160 t = build_int_cst (TREE_TYPE (fpr),
8161 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8162 + X86_64_REGPARM_MAX * 8);
8163 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8164 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8165 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8166 gimplify_and_add (t, pre_p);
8167 }
8168
8169 /* Compute index to start of area used for integer regs. */
8170 if (needed_intregs)
8171 {
8172 /* int_addr = gpr + sav; */
8173 t = fold_build_pointer_plus (sav, gpr);
8174 gimplify_assign (int_addr, t, pre_p);
8175 }
8176 if (needed_sseregs)
8177 {
8178 /* sse_addr = fpr + sav; */
8179 t = fold_build_pointer_plus (sav, fpr);
8180 gimplify_assign (sse_addr, t, pre_p);
8181 }
8182 if (need_temp)
8183 {
8184 int i, prev_size = 0;
8185 tree temp = create_tmp_var (type, "va_arg_tmp");
8186
8187 /* addr = &temp; */
8188 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8189 gimplify_assign (addr, t, pre_p);
8190
8191 for (i = 0; i < XVECLEN (container, 0); i++)
8192 {
8193 rtx slot = XVECEXP (container, 0, i);
8194 rtx reg = XEXP (slot, 0);
8195 enum machine_mode mode = GET_MODE (reg);
8196 tree piece_type;
8197 tree addr_type;
8198 tree daddr_type;
8199 tree src_addr, src;
8200 int src_offset;
8201 tree dest_addr, dest;
8202 int cur_size = GET_MODE_SIZE (mode);
8203
8204 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8205 prev_size = INTVAL (XEXP (slot, 1));
8206 if (prev_size + cur_size > size)
8207 {
8208 cur_size = size - prev_size;
8209 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8210 if (mode == BLKmode)
8211 mode = QImode;
8212 }
8213 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8214 if (mode == GET_MODE (reg))
8215 addr_type = build_pointer_type (piece_type);
8216 else
8217 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8218 true);
8219 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8220 true);
8221
8222 if (SSE_REGNO_P (REGNO (reg)))
8223 {
8224 src_addr = sse_addr;
8225 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8226 }
8227 else
8228 {
8229 src_addr = int_addr;
8230 src_offset = REGNO (reg) * 8;
8231 }
8232 src_addr = fold_convert (addr_type, src_addr);
8233 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8234
8235 dest_addr = fold_convert (daddr_type, addr);
8236 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8237 if (cur_size == GET_MODE_SIZE (mode))
8238 {
8239 src = build_va_arg_indirect_ref (src_addr);
8240 dest = build_va_arg_indirect_ref (dest_addr);
8241
8242 gimplify_assign (dest, src, pre_p);
8243 }
8244 else
8245 {
8246 tree copy
8247 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8248 3, dest_addr, src_addr,
8249 size_int (cur_size));
8250 gimplify_and_add (copy, pre_p);
8251 }
8252 prev_size += cur_size;
8253 }
8254 }
8255
8256 if (needed_intregs)
8257 {
8258 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8259 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8260 gimplify_assign (gpr, t, pre_p);
8261 }
8262
8263 if (needed_sseregs)
8264 {
8265 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8266 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8267 gimplify_assign (fpr, t, pre_p);
8268 }
8269
8270 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8271
8272 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8273 }
8274
8275 /* ... otherwise out of the overflow area. */
8276
8277 /* When the caller aligns a parameter on the stack, a parameter whose
8278 alignment exceeds MAX_SUPPORTED_STACK_ALIGNMENT is aligned at
8279 MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here with the
8280 caller. */
8281 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8282 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8283 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8284
8285 /* Care for on-stack alignment if needed. */
8286 if (arg_boundary <= 64 || size == 0)
8287 t = ovf;
8288 else
8289 {
8290 HOST_WIDE_INT align = arg_boundary / 8;
8291 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8292 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8293 build_int_cst (TREE_TYPE (t), -align));
8294 }
8295
8296 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8297 gimplify_assign (addr, t, pre_p);
8298
8299 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8300 gimplify_assign (unshare_expr (ovf), t, pre_p);
8301
8302 if (container)
8303 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8304
8305 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8306 addr = fold_convert (ptrtype, addr);
8307
8308 if (indirect_p)
8309 addr = build_va_arg_indirect_ref (addr);
8310 return build_va_arg_indirect_ref (addr);
8311 }
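/* The gimple emitted above for a simple case such as va_arg (ap, int)
   behaves roughly like

     if (ap->gp_offset >= 48)
       {
         addr = ap->overflow_arg_area;
         ap->overflow_arg_area += 8;
       }
     else
       {
         addr = ap->reg_save_area + ap->gp_offset;
         ap->gp_offset += 8;
       }
     result = *(int *) addr;

   with fp_offset and 16-byte steps used for SSE classes, and a
   temporary plus memcpy used when a structure is not laid out
   contiguously in the register save area.  */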
8312 \f
8313 /* Return true if OPNUM's MEM should be matched
8314 in movabs* patterns. */
8315
8316 bool
8317 ix86_check_movabs (rtx insn, int opnum)
8318 {
8319 rtx set, mem;
8320
8321 set = PATTERN (insn);
8322 if (GET_CODE (set) == PARALLEL)
8323 set = XVECEXP (set, 0, 0);
8324 gcc_assert (GET_CODE (set) == SET);
8325 mem = XEXP (set, opnum);
8326 while (GET_CODE (mem) == SUBREG)
8327 mem = SUBREG_REG (mem);
8328 gcc_assert (MEM_P (mem));
8329 return volatile_ok || !MEM_VOLATILE_P (mem);
8330 }
8331 \f
8332 /* Initialize the table of extra 80387 mathematical constants. */
8333
8334 static void
8335 init_ext_80387_constants (void)
8336 {
8337 static const char * cst[5] =
8338 {
8339 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8340 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8341 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8342 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8343 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8344 };
8345 int i;
8346
8347 for (i = 0; i < 5; i++)
8348 {
8349 real_from_string (&ext_80387_constants_table[i], cst[i]);
8350 /* Ensure each constant is rounded to XFmode precision. */
8351 real_convert (&ext_80387_constants_table[i],
8352 XFmode, &ext_80387_constants_table[i]);
8353 }
8354
8355 ext_80387_constants_init = 1;
8356 }
8357
8358 /* Return non-zero if the constant is something that
8359 can be loaded with a special instruction. */
8360
8361 int
8362 standard_80387_constant_p (rtx x)
8363 {
8364 enum machine_mode mode = GET_MODE (x);
8365
8366 REAL_VALUE_TYPE r;
8367
8368 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8369 return -1;
8370
8371 if (x == CONST0_RTX (mode))
8372 return 1;
8373 if (x == CONST1_RTX (mode))
8374 return 2;
8375
8376 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8377
8378 /* For XFmode constants, try to find a special 80387 instruction when
8379 optimizing for size or on those CPUs that benefit from them. */
8380 if (mode == XFmode
8381 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8382 {
8383 int i;
8384
8385 if (! ext_80387_constants_init)
8386 init_ext_80387_constants ();
8387
8388 for (i = 0; i < 5; i++)
8389 if (real_identical (&r, &ext_80387_constants_table[i]))
8390 return i + 3;
8391 }
8392
8393 /* A load of the constant -0.0 or -1.0 will be split into an
8394 fldz;fchs or fld1;fchs sequence. */
8395 if (real_isnegzero (&r))
8396 return 8;
8397 if (real_identical (&r, &dconstm1))
8398 return 9;
8399
8400 return 0;
8401 }
8402
8403 /* Return the opcode of the special instruction to be used to load
8404 the constant X. */
8405
8406 const char *
8407 standard_80387_constant_opcode (rtx x)
8408 {
8409 switch (standard_80387_constant_p (x))
8410 {
8411 case 1:
8412 return "fldz";
8413 case 2:
8414 return "fld1";
8415 case 3:
8416 return "fldlg2";
8417 case 4:
8418 return "fldln2";
8419 case 5:
8420 return "fldl2e";
8421 case 6:
8422 return "fldl2t";
8423 case 7:
8424 return "fldpi";
8425 case 8:
8426 case 9:
8427 return "#";
8428 default:
8429 gcc_unreachable ();
8430 }
8431 }
8432
8433 /* Return the CONST_DOUBLE representing the 80387 constant that is
8434 loaded by the specified special instruction. The argument IDX
8435 matches the return value from standard_80387_constant_p. */
8436
8437 rtx
8438 standard_80387_constant_rtx (int idx)
8439 {
8440 int i;
8441
8442 if (! ext_80387_constants_init)
8443 init_ext_80387_constants ();
8444
8445 switch (idx)
8446 {
8447 case 3:
8448 case 4:
8449 case 5:
8450 case 6:
8451 case 7:
8452 i = idx - 3;
8453 break;
8454
8455 default:
8456 gcc_unreachable ();
8457 }
8458
8459 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8460 XFmode);
8461 }
8462
8463 /* Return 1 if X is all 0s and 2 if X is all 1s
8464 in a supported SSE/AVX vector mode. */
8465
8466 int
8467 standard_sse_constant_p (rtx x)
8468 {
8469 enum machine_mode mode = GET_MODE (x);
8470
8471 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8472 return 1;
8473 if (vector_all_ones_operand (x, mode))
8474 switch (mode)
8475 {
8476 case V16QImode:
8477 case V8HImode:
8478 case V4SImode:
8479 case V2DImode:
8480 if (TARGET_SSE2)
8481 return 2;
8482 case V32QImode:
8483 case V16HImode:
8484 case V8SImode:
8485 case V4DImode:
8486 if (TARGET_AVX2)
8487 return 2;
8488 default:
8489 break;
8490 }
8491
8492 return 0;
8493 }
8494
8495 /* Return the opcode of the special instruction to be used to load
8496 the constant X. */
8497
8498 const char *
8499 standard_sse_constant_opcode (rtx insn, rtx x)
8500 {
8501 switch (standard_sse_constant_p (x))
8502 {
8503 case 1:
8504 switch (get_attr_mode (insn))
8505 {
8506 case MODE_TI:
8507 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8508 return "%vpxor\t%0, %d0";
8509 case MODE_V2DF:
8510 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8511 return "%vxorpd\t%0, %d0";
8512 case MODE_V4SF:
8513 return "%vxorps\t%0, %d0";
8514
8515 case MODE_OI:
8516 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8517 return "vpxor\t%x0, %x0, %x0";
8518 case MODE_V4DF:
8519 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8520 return "vxorpd\t%x0, %x0, %x0";
8521 case MODE_V8SF:
8522 return "vxorps\t%x0, %x0, %x0";
8523
8524 default:
8525 break;
8526 }
8527
8528 case 2:
8529 if (TARGET_AVX)
8530 return "vpcmpeqd\t%0, %0, %0";
8531 else
8532 return "pcmpeqd\t%0, %0";
8533
8534 default:
8535 break;
8536 }
8537 gcc_unreachable ();
8538 }
8539
8540 /* Returns true if OP contains a symbol reference. */
8541
8542 bool
8543 symbolic_reference_mentioned_p (rtx op)
8544 {
8545 const char *fmt;
8546 int i;
8547
8548 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8549 return true;
8550
8551 fmt = GET_RTX_FORMAT (GET_CODE (op));
8552 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8553 {
8554 if (fmt[i] == 'E')
8555 {
8556 int j;
8557
8558 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8559 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8560 return true;
8561 }
8562
8563 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8564 return true;
8565 }
8566
8567 return false;
8568 }
8569
8570 /* Return true if it is appropriate to emit `ret' instructions in the
8571 body of a function. Do this only if the epilogue is simple, needing a
8572 couple of insns. Prior to reloading, we can't tell how many registers
8573 must be saved, so return false then. Return false if there is no frame
8574 marker to de-allocate. */
8575
8576 bool
8577 ix86_can_use_return_insn_p (void)
8578 {
8579 struct ix86_frame frame;
8580
8581 if (! reload_completed || frame_pointer_needed)
8582 return 0;
8583
8584 /* Don't allow more than 32k pop, since that's all we can do
8585 with one instruction. */
8586 if (crtl->args.pops_args && crtl->args.size >= 32768)
8587 return 0;
8588
8589 ix86_compute_frame_layout (&frame);
8590 return (frame.stack_pointer_offset == UNITS_PER_WORD
8591 && (frame.nregs + frame.nsseregs) == 0);
8592 }
8593 \f
8594 /* Value should be nonzero if functions must have frame pointers.
8595 Zero means the frame pointer need not be set up (and parms may
8596 be accessed via the stack pointer) in functions that seem suitable. */
8597
8598 static bool
8599 ix86_frame_pointer_required (void)
8600 {
8601 /* If we accessed previous frames, then the generated code expects
8602 to be able to access the saved ebp value in our frame. */
8603 if (cfun->machine->accesses_prev_frame)
8604 return true;
8605
8606 /* Several x86 OSes need a frame pointer for other reasons,
8607 usually pertaining to setjmp. */
8608 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8609 return true;
8610
8611 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
8612 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8613 return true;
8614
8615 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8616 turns off the frame pointer by default. Turn it back on now if
8617 we do not have a leaf function. */
8618 if (TARGET_OMIT_LEAF_FRAME_POINTER
8619 && (!current_function_is_leaf
8620 || ix86_current_function_calls_tls_descriptor))
8621 return true;
8622
8623 if (crtl->profile && !flag_fentry)
8624 return true;
8625
8626 return false;
8627 }
8628
8629 /* Record that the current function accesses previous call frames. */
8630
8631 void
8632 ix86_setup_frame_addresses (void)
8633 {
8634 cfun->machine->accesses_prev_frame = 1;
8635 }
8636 \f
8637 #ifndef USE_HIDDEN_LINKONCE
8638 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8639 # define USE_HIDDEN_LINKONCE 1
8640 # else
8641 # define USE_HIDDEN_LINKONCE 0
8642 # endif
8643 #endif
8644
8645 static int pic_labels_used;
8646
8647 /* Fills in the label name that should be used for a pc thunk for
8648 the given register. */
8649
8650 static void
8651 get_pc_thunk_name (char name[32], unsigned int regno)
8652 {
8653 gcc_assert (!TARGET_64BIT);
8654
8655 if (USE_HIDDEN_LINKONCE)
8656 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8657 else
8658 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8659 }
8660
8661
8662 /* This function generates the pc thunks used by -fpic code: each thunk
8663 loads its target register with the return address of the caller and then returns. */
8664
8665 static void
8666 ix86_code_end (void)
8667 {
8668 rtx xops[2];
8669 int regno;
8670
8671 for (regno = AX_REG; regno <= SP_REG; regno++)
8672 {
8673 char name[32];
8674 tree decl;
8675
8676 if (!(pic_labels_used & (1 << regno)))
8677 continue;
8678
8679 get_pc_thunk_name (name, regno);
8680
8681 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8682 get_identifier (name),
8683 build_function_type_list (void_type_node, NULL_TREE));
8684 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8685 NULL_TREE, void_type_node);
8686 TREE_PUBLIC (decl) = 1;
8687 TREE_STATIC (decl) = 1;
8688
8689 #if TARGET_MACHO
8690 if (TARGET_MACHO)
8691 {
8692 switch_to_section (darwin_sections[text_coal_section]);
8693 fputs ("\t.weak_definition\t", asm_out_file);
8694 assemble_name (asm_out_file, name);
8695 fputs ("\n\t.private_extern\t", asm_out_file);
8696 assemble_name (asm_out_file, name);
8697 putc ('\n', asm_out_file);
8698 ASM_OUTPUT_LABEL (asm_out_file, name);
8699 DECL_WEAK (decl) = 1;
8700 }
8701 else
8702 #endif
8703 if (USE_HIDDEN_LINKONCE)
8704 {
8705 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8706
8707 targetm.asm_out.unique_section (decl, 0);
8708 switch_to_section (get_named_section (decl, NULL, 0));
8709
8710 targetm.asm_out.globalize_label (asm_out_file, name);
8711 fputs ("\t.hidden\t", asm_out_file);
8712 assemble_name (asm_out_file, name);
8713 putc ('\n', asm_out_file);
8714 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8715 }
8716 else
8717 {
8718 switch_to_section (text_section);
8719 ASM_OUTPUT_LABEL (asm_out_file, name);
8720 }
8721
8722 DECL_INITIAL (decl) = make_node (BLOCK);
8723 current_function_decl = decl;
8724 init_function_start (decl);
8725 first_function_block_is_cold = false;
8726 /* Make sure unwind info is emitted for the thunk if needed. */
8727 final_start_function (emit_barrier (), asm_out_file, 1);
8728
8729 /* Pad stack IP move with 4 instructions (two NOPs count
8730 as one instruction). */
8731 if (TARGET_PAD_SHORT_FUNCTION)
8732 {
8733 int i = 8;
8734
8735 while (i--)
8736 fputs ("\tnop\n", asm_out_file);
8737 }
8738
8739 xops[0] = gen_rtx_REG (Pmode, regno);
8740 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8741 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8742 fputs ("\tret\n", asm_out_file);
8743 final_end_function ();
8744 init_insn_lengths ();
8745 free_after_compilation (cfun);
8746 set_cfun (NULL);
8747 current_function_decl = NULL;
8748 }
8749
8750 if (flag_split_stack)
8751 file_end_indicate_split_stack ();
8752 }
8753
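/* For illustration, the thunk emitted above for a used PIC register such
   as %ebx looks roughly like

       __x86.get_pc_thunk.bx:
               movl    (%esp), %ebx
               ret

   i.e. it copies the return address (the address of the instruction
   following the call) into the requested register.  */
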
8754 /* Emit code for the SET_GOT patterns. */
8755
8756 const char *
8757 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8758 {
8759 rtx xops[3];
8760
8761 xops[0] = dest;
8762
8763 if (TARGET_VXWORKS_RTP && flag_pic)
8764 {
8765 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8766 xops[2] = gen_rtx_MEM (Pmode,
8767 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8768 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8769
8770 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8771 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8772 an unadorned address. */
8773 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8774 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8775 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8776 return "";
8777 }
8778
8779 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8780
8781 if (!flag_pic)
8782 {
8783 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8784
8785 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8786
8787 #if TARGET_MACHO
8788 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8789 is what will be referenced by the Mach-O PIC subsystem. */
8790 if (!label)
8791 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8792 #endif
8793
8794 targetm.asm_out.internal_label (asm_out_file, "L",
8795 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8796 }
8797 else
8798 {
8799 char name[32];
8800 get_pc_thunk_name (name, REGNO (dest));
8801 pic_labels_used |= 1 << REGNO (dest);
8802
8803 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8804 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8805 output_asm_insn ("call\t%X2", xops);
8806 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8807 is what will be referenced by the Mach-O PIC subsystem. */
8808 #if TARGET_MACHO
8809 if (!label)
8810 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8811 else
8812 targetm.asm_out.internal_label (asm_out_file, "L",
8813 CODE_LABEL_NUMBER (label));
8814 #endif
8815 }
8816
8817 if (!TARGET_MACHO)
8818 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8819
8820 return "";
8821 }
8822
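/* For illustration, in the common 32-bit -fpic case the sequence emitted
   through output_set_got for %ebx is roughly

       call    __x86.get_pc_thunk.bx
       addl    $_GLOBAL_OFFSET_TABLE_, %ebx

   where adding GOT_SYMBOL_NAME turns the return address loaded by the
   thunk into a pointer to the GOT.  (The TARGET_MACHO and VxWorks RTP
   paths above emit different sequences.)  */
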
8823 /* Generate a "push" pattern for input ARG. */
8824
8825 static rtx
8826 gen_push (rtx arg)
8827 {
8828 struct machine_function *m = cfun->machine;
8829
8830 if (m->fs.cfa_reg == stack_pointer_rtx)
8831 m->fs.cfa_offset += UNITS_PER_WORD;
8832 m->fs.sp_offset += UNITS_PER_WORD;
8833
8834 return gen_rtx_SET (VOIDmode,
8835 gen_rtx_MEM (Pmode,
8836 gen_rtx_PRE_DEC (Pmode,
8837 stack_pointer_rtx)),
8838 arg);
8839 }
8840
8841 /* Generate a "pop" pattern for input ARG. */
8842
8843 static rtx
8844 gen_pop (rtx arg)
8845 {
8846 return gen_rtx_SET (VOIDmode,
8847 arg,
8848 gen_rtx_MEM (Pmode,
8849 gen_rtx_POST_INC (Pmode,
8850 stack_pointer_rtx)));
8851 }
8852
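/* Illustrative note: for a word-mode register REG in 32-bit mode, gen_push
   above builds RTL of the form

       (set (mem:SI (pre_dec:SI (reg:SI sp))) (reg:SI REG))

   and gen_pop the mirror image

       (set (reg:SI REG) (mem:SI (post_inc:SI (reg:SI sp))))

   (DImode in 64-bit mode).  gen_push also advances the tracked CFA and SP
   offsets by UNITS_PER_WORD.  */
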
8853 /* Return >= 0 if there is an unused call-clobbered register available
8854 for the entire function; return INVALID_REGNUM otherwise. */
8855
8856 static unsigned int
8857 ix86_select_alt_pic_regnum (void)
8858 {
8859 if (current_function_is_leaf
8860 && !crtl->profile
8861 && !ix86_current_function_calls_tls_descriptor)
8862 {
8863 int i, drap;
8864 /* Can't use the same register for both PIC and DRAP. */
8865 if (crtl->drap_reg)
8866 drap = REGNO (crtl->drap_reg);
8867 else
8868 drap = -1;
8869 for (i = 2; i >= 0; --i)
8870 if (i != drap && !df_regs_ever_live_p (i))
8871 return i;
8872 }
8873
8874 return INVALID_REGNUM;
8875 }
8876
8877 /* Return TRUE if we need to save REGNO. */
8878
8879 static bool
8880 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8881 {
8882 if (pic_offset_table_rtx
8883 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8884 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8885 || crtl->profile
8886 || crtl->calls_eh_return
8887 || crtl->uses_const_pool))
8888 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8889
8890 if (crtl->calls_eh_return && maybe_eh_return)
8891 {
8892 unsigned i;
8893 for (i = 0; ; i++)
8894 {
8895 unsigned test = EH_RETURN_DATA_REGNO (i);
8896 if (test == INVALID_REGNUM)
8897 break;
8898 if (test == regno)
8899 return true;
8900 }
8901 }
8902
8903 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8904 return true;
8905
8906 return (df_regs_ever_live_p (regno)
8907 && !call_used_regs[regno]
8908 && !fixed_regs[regno]
8909 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8910 }
8911
8912 /* Return the number of saved general purpose registers. */
8913
8914 static int
8915 ix86_nsaved_regs (void)
8916 {
8917 int nregs = 0;
8918 int regno;
8919
8920 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8921 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8922 nregs ++;
8923 return nregs;
8924 }
8925
8926 /* Return the number of saved SSE registers. */
8927
8928 static int
8929 ix86_nsaved_sseregs (void)
8930 {
8931 int nregs = 0;
8932 int regno;
8933
8934 if (!TARGET_64BIT_MS_ABI)
8935 return 0;
8936 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8937 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8938 nregs ++;
8939 return nregs;
8940 }
8941
8942 /* Given FROM and TO register numbers, say whether this elimination is
8943 allowed. If stack alignment is needed, we can only replace argument
8944 pointer with hard frame pointer, or replace frame pointer with stack
8945 pointer. Otherwise, frame pointer elimination is automatically
8946 handled and all other eliminations are valid. */
8947
8948 static bool
8949 ix86_can_eliminate (const int from, const int to)
8950 {
8951 if (stack_realign_fp)
8952 return ((from == ARG_POINTER_REGNUM
8953 && to == HARD_FRAME_POINTER_REGNUM)
8954 || (from == FRAME_POINTER_REGNUM
8955 && to == STACK_POINTER_REGNUM));
8956 else
8957 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8958 }
8959
8960 /* Return the offset between two registers, one to be eliminated, and the other
8961 its replacement, at the start of a routine. */
8962
8963 HOST_WIDE_INT
8964 ix86_initial_elimination_offset (int from, int to)
8965 {
8966 struct ix86_frame frame;
8967 ix86_compute_frame_layout (&frame);
8968
8969 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8970 return frame.hard_frame_pointer_offset;
8971 else if (from == FRAME_POINTER_REGNUM
8972 && to == HARD_FRAME_POINTER_REGNUM)
8973 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8974 else
8975 {
8976 gcc_assert (to == STACK_POINTER_REGNUM);
8977
8978 if (from == ARG_POINTER_REGNUM)
8979 return frame.stack_pointer_offset;
8980
8981 gcc_assert (from == FRAME_POINTER_REGNUM);
8982 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8983 }
8984 }
8985
8986 /* In a dynamically-aligned function, we can't know the offset from
8987 stack pointer to frame pointer, so we must ensure that setjmp
8988 eliminates fp against the hard fp (%ebp) rather than trying to
8989 index from %esp up to the top of the frame across a gap that is
8990 of unknown (at compile-time) size. */
8991 static rtx
8992 ix86_builtin_setjmp_frame_value (void)
8993 {
8994 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8995 }
8996
8997 /* When using -fsplit-stack, the allocation routines set a field in
8998 the TCB to the bottom of the stack plus this much space, measured
8999 in bytes. */
9000
9001 #define SPLIT_STACK_AVAILABLE 256
9002
9003 /* Fill structure ix86_frame about frame of currently computed function. */
9004
9005 static void
9006 ix86_compute_frame_layout (struct ix86_frame *frame)
9007 {
9008 unsigned int stack_alignment_needed;
9009 HOST_WIDE_INT offset;
9010 unsigned int preferred_alignment;
9011 HOST_WIDE_INT size = get_frame_size ();
9012 HOST_WIDE_INT to_allocate;
9013
9014 frame->nregs = ix86_nsaved_regs ();
9015 frame->nsseregs = ix86_nsaved_sseregs ();
9016
9017 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9018 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9019
9020 /* The 64-bit MS ABI seems to require stack alignment to always be 16, except
9021 in function prologues and in leaf functions. */
9022 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9023 && (!current_function_is_leaf || cfun->calls_alloca != 0
9024 || ix86_current_function_calls_tls_descriptor))
9025 {
9026 preferred_alignment = 16;
9027 stack_alignment_needed = 16;
9028 crtl->preferred_stack_boundary = 128;
9029 crtl->stack_alignment_needed = 128;
9030 }
9031
9032 gcc_assert (!size || stack_alignment_needed);
9033 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9034 gcc_assert (preferred_alignment <= stack_alignment_needed);
9035
9036 /* For SEH we have to limit the amount of code movement into the prologue.
9037 At present we do this via a BLOCKAGE, at which point there's very little
9038 scheduling that can be done, which means that there's very little point
9039 in doing anything except PUSHs. */
9040 if (TARGET_SEH)
9041 cfun->machine->use_fast_prologue_epilogue = false;
9042
9043 /* During the reload iteration the number of registers saved can change.
9044 Recompute the value as needed. Do not recompute when the number of registers
9045 didn't change, as reload makes multiple calls to this function and does not
9046 expect the decision to change within a single iteration. */
9047 else if (!optimize_function_for_size_p (cfun)
9048 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9049 {
9050 int count = frame->nregs;
9051 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9052
9053 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9054
9055 /* The fast prologue uses move instead of push to save registers. This
9056 is significantly longer, but also executes faster as modern hardware
9057 can execute the moves in parallel, but can't do that for push/pop.
9058
9059 Be careful about choosing which prologue to emit: when the function takes
9060 many instructions to execute we may use the slow version, and likewise when
9061 the function is known to be outside a hot spot (this is known with
9062 feedback only). Weight the size of the function by the number of registers
9063 to save, as it is cheap to use one or two push instructions but very
9064 slow to use many of them. */
9065 if (count)
9066 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9067 if (node->frequency < NODE_FREQUENCY_NORMAL
9068 || (flag_branch_probabilities
9069 && node->frequency < NODE_FREQUENCY_HOT))
9070 cfun->machine->use_fast_prologue_epilogue = false;
9071 else
9072 cfun->machine->use_fast_prologue_epilogue
9073 = !expensive_function_p (count);
9074 }
9075
9076 frame->save_regs_using_mov
9077 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9078 /* If static stack checking is enabled and done with probes,
9079 the registers need to be saved before allocating the frame. */
9080 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9081
9082 /* Skip return address. */
9083 offset = UNITS_PER_WORD;
9084
9085 /* Skip pushed static chain. */
9086 if (ix86_static_chain_on_stack)
9087 offset += UNITS_PER_WORD;
9088
9089 /* Skip saved base pointer. */
9090 if (frame_pointer_needed)
9091 offset += UNITS_PER_WORD;
9092 frame->hfp_save_offset = offset;
9093
9094 /* The traditional frame pointer location is at the top of the frame. */
9095 frame->hard_frame_pointer_offset = offset;
9096
9097 /* Register save area */
9098 offset += frame->nregs * UNITS_PER_WORD;
9099 frame->reg_save_offset = offset;
9100
9101 /* Align and set SSE register save area. */
9102 if (frame->nsseregs)
9103 {
9104 /* The only ABI that has saved SSE registers (Win64) also has a
9105 16-byte aligned default stack, and thus we don't need to be
9106 within the re-aligned local stack frame to save them. */
9107 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9108 offset = (offset + 16 - 1) & -16;
9109 offset += frame->nsseregs * 16;
9110 }
9111 frame->sse_reg_save_offset = offset;
9112
9113 /* The re-aligned stack starts here. Values before this point are not
9114 directly comparable with values below this point. In order to make
9115 sure that no value happens to be the same before and after, force
9116 the alignment computation below to add a non-zero value. */
9117 if (stack_realign_fp)
9118 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9119
9120 /* Va-arg area */
9121 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9122 offset += frame->va_arg_size;
9123
9124 /* Align start of frame for local function. */
9125 if (stack_realign_fp
9126 || offset != frame->sse_reg_save_offset
9127 || size != 0
9128 || !current_function_is_leaf
9129 || cfun->calls_alloca
9130 || ix86_current_function_calls_tls_descriptor)
9131 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9132
9133 /* Frame pointer points here. */
9134 frame->frame_pointer_offset = offset;
9135
9136 offset += size;
9137
9138 /* Add the outgoing arguments area. It can be skipped if we eliminated
9139 all the function calls as dead code.
9140 Skipping is however impossible when the function calls alloca: the alloca
9141 expander assumes that the last crtl->outgoing_args_size bytes
9142 of the stack frame are unused. */
9143 if (ACCUMULATE_OUTGOING_ARGS
9144 && (!current_function_is_leaf || cfun->calls_alloca
9145 || ix86_current_function_calls_tls_descriptor))
9146 {
9147 offset += crtl->outgoing_args_size;
9148 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9149 }
9150 else
9151 frame->outgoing_arguments_size = 0;
9152
9153 /* Align stack boundary. Only needed if we're calling another function
9154 or using alloca. */
9155 if (!current_function_is_leaf || cfun->calls_alloca
9156 || ix86_current_function_calls_tls_descriptor)
9157 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9158
9159 /* We've reached end of stack frame. */
9160 frame->stack_pointer_offset = offset;
9161
9162 /* Size prologue needs to allocate. */
9163 to_allocate = offset - frame->sse_reg_save_offset;
9164
9165 if ((!to_allocate && frame->nregs <= 1)
9166 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9167 frame->save_regs_using_mov = false;
9168
9169 if (ix86_using_red_zone ()
9170 && current_function_sp_is_unchanging
9171 && current_function_is_leaf
9172 && !ix86_current_function_calls_tls_descriptor)
9173 {
9174 frame->red_zone_size = to_allocate;
9175 if (frame->save_regs_using_mov)
9176 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9177 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9178 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9179 }
9180 else
9181 frame->red_zone_size = 0;
9182 frame->stack_pointer_offset -= frame->red_zone_size;
9183
9184 /* The SEH frame pointer location is near the bottom of the frame.
9185 This is enforced by the fact that the difference between the
9186 stack pointer and the frame pointer is limited to 240 bytes in
9187 the unwind data structure. */
9188 if (TARGET_SEH)
9189 {
9190 HOST_WIDE_INT diff;
9191
9192 /* If we can leave the frame pointer where it is, do so. */
9193 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9194 if (diff > 240 || (diff & 15) != 0)
9195 {
9196 /* Ideally we'd determine what portion of the local stack frame
9197 (within the constraint of the lowest 240) is most heavily used.
9198 But without that complication, simply bias the frame pointer
9199 by 128 bytes so as to maximize the amount of the local stack
9200 frame that is addressable with 8-bit offsets. */
9201 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9202 }
9203 }
9204 }
9205
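/* Rough sketch of the layout computed above, from the CFA towards the
   stack pointer (illustrative only; the code above is authoritative):

       return address
       [pushed static chain]
       [saved frame pointer]          <- hard_frame_pointer_offset
       integer register save area     <- reg_save_offset
       SSE register save area         <- sse_reg_save_offset (16-byte aligned)
       va_arg register save area
       local variables                <- frame_pointer_offset
       outgoing argument area         <- stack_pointer_offset

   with the red zone, when usable, subtracted from the final
   stack_pointer_offset.  */
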
9206 /* This is semi-inlined memory_address_length, but simplified
9207 since we know that we're always dealing with reg+offset, and
9208 to avoid having to create and discard all that rtl. */
9209
9210 static inline int
9211 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9212 {
9213 int len = 4;
9214
9215 if (offset == 0)
9216 {
9217 /* EBP and R13 cannot be encoded without an offset. */
9218 len = (regno == BP_REG || regno == R13_REG);
9219 }
9220 else if (IN_RANGE (offset, -128, 127))
9221 len = 1;
9222
9223 /* ESP and R12 must be encoded with a SIB byte. */
9224 if (regno == SP_REG || regno == R12_REG)
9225 len++;
9226
9227 return len;
9228 }
9229
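/* A few illustrative data points for the length estimate above, mirroring
   the x86 ModRM/SIB encoding rules: 0(%eax) needs no displacement, so the
   extra length is 0; 0(%ebp) still needs a disp8 because EBP cannot be
   encoded without a displacement, so it is 1; 8(%ebp) is likewise 1; and
   256(%esp) needs a disp32 plus a SIB byte, giving 4 + 1 = 5.  */
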
9230 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9231 The valid base registers are taken from CFUN->MACHINE->FS. */
9232
9233 static rtx
9234 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9235 {
9236 const struct machine_function *m = cfun->machine;
9237 rtx base_reg = NULL;
9238 HOST_WIDE_INT base_offset = 0;
9239
9240 if (m->use_fast_prologue_epilogue)
9241 {
9242 /* Choose the base register most likely to allow the most scheduling
9243 opportunities. Generally FP is valid throughout the function,
9244 while DRAP must be reloaded within the epilogue. But choose either
9245 over the SP due to increased encoding size. */
9246
9247 if (m->fs.fp_valid)
9248 {
9249 base_reg = hard_frame_pointer_rtx;
9250 base_offset = m->fs.fp_offset - cfa_offset;
9251 }
9252 else if (m->fs.drap_valid)
9253 {
9254 base_reg = crtl->drap_reg;
9255 base_offset = 0 - cfa_offset;
9256 }
9257 else if (m->fs.sp_valid)
9258 {
9259 base_reg = stack_pointer_rtx;
9260 base_offset = m->fs.sp_offset - cfa_offset;
9261 }
9262 }
9263 else
9264 {
9265 HOST_WIDE_INT toffset;
9266 int len = 16, tlen;
9267
9268 /* Choose the base register with the smallest address encoding.
9269 With a tie, choose FP > DRAP > SP. */
9270 if (m->fs.sp_valid)
9271 {
9272 base_reg = stack_pointer_rtx;
9273 base_offset = m->fs.sp_offset - cfa_offset;
9274 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9275 }
9276 if (m->fs.drap_valid)
9277 {
9278 toffset = 0 - cfa_offset;
9279 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9280 if (tlen <= len)
9281 {
9282 base_reg = crtl->drap_reg;
9283 base_offset = toffset;
9284 len = tlen;
9285 }
9286 }
9287 if (m->fs.fp_valid)
9288 {
9289 toffset = m->fs.fp_offset - cfa_offset;
9290 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9291 if (tlen <= len)
9292 {
9293 base_reg = hard_frame_pointer_rtx;
9294 base_offset = toffset;
9295 len = tlen;
9296 }
9297 }
9298 }
9299 gcc_assert (base_reg != NULL);
9300
9301 return plus_constant (base_reg, base_offset);
9302 }
9303
9304 /* Emit code to save registers in the prologue. */
9305
9306 static void
9307 ix86_emit_save_regs (void)
9308 {
9309 unsigned int regno;
9310 rtx insn;
9311
9312 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9313 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9314 {
9315 insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
9316 RTX_FRAME_RELATED_P (insn) = 1;
9317 }
9318 }
9319
9320 /* Emit a single register save at CFA - CFA_OFFSET. */
9321
9322 static void
9323 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9324 HOST_WIDE_INT cfa_offset)
9325 {
9326 struct machine_function *m = cfun->machine;
9327 rtx reg = gen_rtx_REG (mode, regno);
9328 rtx mem, addr, base, insn;
9329
9330 addr = choose_baseaddr (cfa_offset);
9331 mem = gen_frame_mem (mode, addr);
9332
9333 /* For SSE saves, we need to indicate the 128-bit alignment. */
9334 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9335
9336 insn = emit_move_insn (mem, reg);
9337 RTX_FRAME_RELATED_P (insn) = 1;
9338
9339 base = addr;
9340 if (GET_CODE (base) == PLUS)
9341 base = XEXP (base, 0);
9342 gcc_checking_assert (REG_P (base));
9343
9344 /* When saving registers into a re-aligned local stack frame, avoid
9345 any tricky guessing by dwarf2out. */
9346 if (m->fs.realigned)
9347 {
9348 gcc_checking_assert (stack_realign_drap);
9349
9350 if (regno == REGNO (crtl->drap_reg))
9351 {
9352 /* A bit of a hack. We force the DRAP register to be saved in
9353 the re-aligned stack frame, which provides us with a copy
9354 of the CFA that will last past the prologue. Install it. */
9355 gcc_checking_assert (cfun->machine->fs.fp_valid);
9356 addr = plus_constant (hard_frame_pointer_rtx,
9357 cfun->machine->fs.fp_offset - cfa_offset);
9358 mem = gen_rtx_MEM (mode, addr);
9359 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9360 }
9361 else
9362 {
9363 /* The frame pointer is a stable reference within the
9364 aligned frame. Use it. */
9365 gcc_checking_assert (cfun->machine->fs.fp_valid);
9366 addr = plus_constant (hard_frame_pointer_rtx,
9367 cfun->machine->fs.fp_offset - cfa_offset);
9368 mem = gen_rtx_MEM (mode, addr);
9369 add_reg_note (insn, REG_CFA_EXPRESSION,
9370 gen_rtx_SET (VOIDmode, mem, reg));
9371 }
9372 }
9373
9374 /* The memory may not be relative to the current CFA register,
9375 which means that we may need to generate a new pattern for
9376 use by the unwind info. */
9377 else if (base != m->fs.cfa_reg)
9378 {
9379 addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9380 mem = gen_rtx_MEM (mode, addr);
9381 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9382 }
9383 }
9384
9385 /* Emit code to save registers using MOV insns.
9386 First register is stored at CFA - CFA_OFFSET. */
9387 static void
9388 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9389 {
9390 unsigned int regno;
9391
9392 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9393 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9394 {
9395 ix86_emit_save_reg_using_mov (Pmode, regno, cfa_offset);
9396 cfa_offset -= UNITS_PER_WORD;
9397 }
9398 }
9399
9400 /* Emit code to save SSE registers using MOV insns.
9401 First register is stored at CFA - CFA_OFFSET. */
9402 static void
9403 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9404 {
9405 unsigned int regno;
9406
9407 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9408 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9409 {
9410 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9411 cfa_offset -= 16;
9412 }
9413 }
9414
9415 static GTY(()) rtx queued_cfa_restores;
9416
9417 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9418 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9419 Don't add the note if the previously saved value will be left untouched
9420 within the stack red zone until return, as unwinders can find the same value
9421 in the register and on the stack. */
9422
9423 static void
9424 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9425 {
9426 if (!crtl->shrink_wrapped
9427 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9428 return;
9429
9430 if (insn)
9431 {
9432 add_reg_note (insn, REG_CFA_RESTORE, reg);
9433 RTX_FRAME_RELATED_P (insn) = 1;
9434 }
9435 else
9436 queued_cfa_restores
9437 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9438 }
9439
9440 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9441
9442 static void
9443 ix86_add_queued_cfa_restore_notes (rtx insn)
9444 {
9445 rtx last;
9446 if (!queued_cfa_restores)
9447 return;
9448 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9449 ;
9450 XEXP (last, 1) = REG_NOTES (insn);
9451 REG_NOTES (insn) = queued_cfa_restores;
9452 queued_cfa_restores = NULL_RTX;
9453 RTX_FRAME_RELATED_P (insn) = 1;
9454 }
9455
9456 /* Expand a prologue or epilogue stack adjustment.
9457 The pattern exists to put a dependency on all ebp-based memory accesses.
9458 STYLE should be negative if instructions should be marked as frame related,
9459 zero if the %r11 register is live and cannot be freely used, and positive
9460 otherwise. */
9461
9462 static void
9463 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9464 int style, bool set_cfa)
9465 {
9466 struct machine_function *m = cfun->machine;
9467 rtx insn;
9468 bool add_frame_related_expr = false;
9469
9470 if (! TARGET_64BIT)
9471 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9472 else if (x86_64_immediate_operand (offset, DImode))
9473 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9474 else
9475 {
9476 rtx tmp;
9477 /* r11 is used by indirect sibcall return as well, set before the
9478 epilogue and used after the epilogue. */
9479 if (style)
9480 tmp = gen_rtx_REG (DImode, R11_REG);
9481 else
9482 {
9483 gcc_assert (src != hard_frame_pointer_rtx
9484 && dest != hard_frame_pointer_rtx);
9485 tmp = hard_frame_pointer_rtx;
9486 }
9487 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9488 if (style < 0)
9489 add_frame_related_expr = true;
9490
9491 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9492 }
9493
9494 insn = emit_insn (insn);
9495 if (style >= 0)
9496 ix86_add_queued_cfa_restore_notes (insn);
9497
9498 if (set_cfa)
9499 {
9500 rtx r;
9501
9502 gcc_assert (m->fs.cfa_reg == src);
9503 m->fs.cfa_offset += INTVAL (offset);
9504 m->fs.cfa_reg = dest;
9505
9506 r = gen_rtx_PLUS (Pmode, src, offset);
9507 r = gen_rtx_SET (VOIDmode, dest, r);
9508 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9509 RTX_FRAME_RELATED_P (insn) = 1;
9510 }
9511 else if (style < 0)
9512 {
9513 RTX_FRAME_RELATED_P (insn) = 1;
9514 if (add_frame_related_expr)
9515 {
9516 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9517 r = gen_rtx_SET (VOIDmode, dest, r);
9518 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9519 }
9520 }
9521
9522 if (dest == stack_pointer_rtx)
9523 {
9524 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9525 bool valid = m->fs.sp_valid;
9526
9527 if (src == hard_frame_pointer_rtx)
9528 {
9529 valid = m->fs.fp_valid;
9530 ooffset = m->fs.fp_offset;
9531 }
9532 else if (src == crtl->drap_reg)
9533 {
9534 valid = m->fs.drap_valid;
9535 ooffset = 0;
9536 }
9537 else
9538 {
9539 /* Otherwise there are two possibilities: SP itself, which we set
9540 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9541 taken care of by hand along the eh_return path. */
9542 gcc_checking_assert (src == stack_pointer_rtx
9543 || offset == const0_rtx);
9544 }
9545
9546 m->fs.sp_offset = ooffset - INTVAL (offset);
9547 m->fs.sp_valid = valid;
9548 }
9549 }
9550
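/* Illustrative note: on x86-64, when the adjustment does not fit in a
   signed 32-bit immediate, the code above first loads the offset into a
   temporary (%r11, or the hard frame pointer when %r11 must stay live, as
   selected by STYLE) and then adds that register to the destination,
   roughly "movabsq $<offset>, %r11" followed by an add of %r11 into %rsp,
   where "$<offset>" stands for the actual constant.  */
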
9551 /* Find an available register to be used as the dynamic realign argument
9552 pointer register. Such a register will be written in the prologue and
9553 used at the beginning of the body, so it must not be
9554 1. a parameter passing register, or
9555 2. the GOT pointer.
9556 We reuse the static-chain register if it is available. Otherwise, we
9557 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9558 shorter encoding.
9559
9560 Return: the regno of the chosen register. */
9561
9562 static unsigned int
9563 find_drap_reg (void)
9564 {
9565 tree decl = cfun->decl;
9566
9567 if (TARGET_64BIT)
9568 {
9569 /* Use R13 for a nested function or a function that needs a static chain.
9570 Since a function with a tail call may use any caller-saved
9571 register in the epilogue, DRAP must not use a caller-saved
9572 register in that case. */
9573 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9574 return R13_REG;
9575
9576 return R10_REG;
9577 }
9578 else
9579 {
9580 /* Use DI for a nested function or a function that needs a static chain.
9581 Since a function with a tail call may use any caller-saved
9582 register in the epilogue, DRAP must not use a caller-saved
9583 register in that case. */
9584 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9585 return DI_REG;
9586
9587 /* Reuse static chain register if it isn't used for parameter
9588 passing. */
9589 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9590 {
9591 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9592 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9593 return CX_REG;
9594 }
9595 return DI_REG;
9596 }
9597 }
9598
9599 /* Return minimum incoming stack alignment. */
9600
9601 static unsigned int
9602 ix86_minimum_incoming_stack_boundary (bool sibcall)
9603 {
9604 unsigned int incoming_stack_boundary;
9605
9606 /* Prefer the one specified at command line. */
9607 if (ix86_user_incoming_stack_boundary)
9608 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9609 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9610 if -mstackrealign is used, this is not a sibcall check, and the
9611 estimated stack alignment is 128 bits. */
9612 else if (!sibcall
9613 && !TARGET_64BIT
9614 && ix86_force_align_arg_pointer
9615 && crtl->stack_alignment_estimated == 128)
9616 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9617 else
9618 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9619
9620 /* Incoming stack alignment can be changed on individual functions
9621 via force_align_arg_pointer attribute. We use the smallest
9622 incoming stack boundary. */
9623 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9624 && lookup_attribute (ix86_force_align_arg_pointer_string,
9625 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9626 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9627
9628 /* The incoming stack frame has to be aligned at least at
9629 parm_stack_boundary. */
9630 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9631 incoming_stack_boundary = crtl->parm_stack_boundary;
9632
9633 /* The stack at the entry of main is aligned by the runtime. We use the
9634 smallest incoming stack boundary. */
9635 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9636 && DECL_NAME (current_function_decl)
9637 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9638 && DECL_FILE_SCOPE_P (current_function_decl))
9639 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9640
9641 return incoming_stack_boundary;
9642 }
9643
9644 /* Update incoming stack boundary and estimated stack alignment. */
9645
9646 static void
9647 ix86_update_stack_boundary (void)
9648 {
9649 ix86_incoming_stack_boundary
9650 = ix86_minimum_incoming_stack_boundary (false);
9651
9652 /* x86_64 varargs need 16-byte stack alignment for the register save
9653 area. */
9654 if (TARGET_64BIT
9655 && cfun->stdarg
9656 && crtl->stack_alignment_estimated < 128)
9657 crtl->stack_alignment_estimated = 128;
9658 }
9659
9660 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9661 needed or an rtx for DRAP otherwise. */
9662
9663 static rtx
9664 ix86_get_drap_rtx (void)
9665 {
9666 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9667 crtl->need_drap = true;
9668
9669 if (stack_realign_drap)
9670 {
9671 /* Assign DRAP to vDRAP and return vDRAP. */
9672 unsigned int regno = find_drap_reg ();
9673 rtx drap_vreg;
9674 rtx arg_ptr;
9675 rtx seq, insn;
9676
9677 arg_ptr = gen_rtx_REG (Pmode, regno);
9678 crtl->drap_reg = arg_ptr;
9679
9680 start_sequence ();
9681 drap_vreg = copy_to_reg (arg_ptr);
9682 seq = get_insns ();
9683 end_sequence ();
9684
9685 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9686 if (!optimize)
9687 {
9688 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9689 RTX_FRAME_RELATED_P (insn) = 1;
9690 }
9691 return drap_vreg;
9692 }
9693 else
9694 return NULL;
9695 }
9696
9697 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9698
9699 static rtx
9700 ix86_internal_arg_pointer (void)
9701 {
9702 return virtual_incoming_args_rtx;
9703 }
9704
9705 struct scratch_reg {
9706 rtx reg;
9707 bool saved;
9708 };
9709
9710 /* Return a short-lived scratch register for use on function entry.
9711 In 32-bit mode, it is valid only after the registers are saved
9712 in the prologue. This register must be released by means of
9713 release_scratch_register_on_entry once it is dead. */
9714
9715 static void
9716 get_scratch_register_on_entry (struct scratch_reg *sr)
9717 {
9718 int regno;
9719
9720 sr->saved = false;
9721
9722 if (TARGET_64BIT)
9723 {
9724 /* We always use R11 in 64-bit mode. */
9725 regno = R11_REG;
9726 }
9727 else
9728 {
9729 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9730 bool fastcall_p
9731 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9732 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9733 int regparm = ix86_function_regparm (fntype, decl);
9734 int drap_regno
9735 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9736
9737 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9738 for the static chain register. */
9739 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9740 && drap_regno != AX_REG)
9741 regno = AX_REG;
9742 else if (regparm < 2 && drap_regno != DX_REG)
9743 regno = DX_REG;
9744 /* ecx is the static chain register. */
9745 else if (regparm < 3 && !fastcall_p && !static_chain_p
9746 && drap_regno != CX_REG)
9747 regno = CX_REG;
9748 else if (ix86_save_reg (BX_REG, true))
9749 regno = BX_REG;
9750 /* esi is the static chain register. */
9751 else if (!(regparm == 3 && static_chain_p)
9752 && ix86_save_reg (SI_REG, true))
9753 regno = SI_REG;
9754 else if (ix86_save_reg (DI_REG, true))
9755 regno = DI_REG;
9756 else
9757 {
9758 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9759 sr->saved = true;
9760 }
9761 }
9762
9763 sr->reg = gen_rtx_REG (Pmode, regno);
9764 if (sr->saved)
9765 {
9766 rtx insn = emit_insn (gen_push (sr->reg));
9767 RTX_FRAME_RELATED_P (insn) = 1;
9768 }
9769 }
9770
9771 /* Release a scratch register obtained from the preceding function. */
9772
9773 static void
9774 release_scratch_register_on_entry (struct scratch_reg *sr)
9775 {
9776 if (sr->saved)
9777 {
9778 rtx x, insn = emit_insn (gen_pop (sr->reg));
9779
9780 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9781 RTX_FRAME_RELATED_P (insn) = 1;
9782 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9783 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9784 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9785 }
9786 }
9787
9788 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9789
9790 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9791
9792 static void
9793 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9794 {
9795 /* We skip the probe for the first interval + a small dope of 4 words and
9796 probe that many bytes past the specified size to maintain a protection
9797 area at the bottom of the stack. */
9798 const int dope = 4 * UNITS_PER_WORD;
9799 rtx size_rtx = GEN_INT (size), last;
9800
9801 /* See if we have a constant small number of probes to generate. If so,
9802 that's the easy case. The run-time loop is made up of 11 insns in the
9803 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9804 for n # of intervals. */
9805 if (size <= 5 * PROBE_INTERVAL)
9806 {
9807 HOST_WIDE_INT i, adjust;
9808 bool first_probe = true;
9809
9810 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9811 values of N from 1 until it exceeds SIZE. If only one probe is
9812 needed, this will not generate any code. Then adjust and probe
9813 to PROBE_INTERVAL + SIZE. */
9814 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9815 {
9816 if (first_probe)
9817 {
9818 adjust = 2 * PROBE_INTERVAL + dope;
9819 first_probe = false;
9820 }
9821 else
9822 adjust = PROBE_INTERVAL;
9823
9824 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9825 plus_constant (stack_pointer_rtx, -adjust)));
9826 emit_stack_probe (stack_pointer_rtx);
9827 }
9828
9829 if (first_probe)
9830 adjust = size + PROBE_INTERVAL + dope;
9831 else
9832 adjust = size + PROBE_INTERVAL - i;
9833
9834 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9835 plus_constant (stack_pointer_rtx, -adjust)));
9836 emit_stack_probe (stack_pointer_rtx);
9837
9838 /* Adjust back to account for the additional first interval. */
9839 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9840 plus_constant (stack_pointer_rtx,
9841 PROBE_INTERVAL + dope)));
9842 }
9843
9844 /* Otherwise, do the same as above, but in a loop. Note that we must be
9845 extra careful with variables wrapping around because we might be at
9846 the very top (or the very bottom) of the address space and we have
9847 to be able to handle this case properly; in particular, we use an
9848 equality test for the loop condition. */
9849 else
9850 {
9851 HOST_WIDE_INT rounded_size;
9852 struct scratch_reg sr;
9853
9854 get_scratch_register_on_entry (&sr);
9855
9856
9857 /* Step 1: round SIZE to the previous multiple of the interval. */
9858
9859 rounded_size = size & -PROBE_INTERVAL;
9860
9861
9862 /* Step 2: compute initial and final value of the loop counter. */
9863
9864 /* SP = SP_0 + PROBE_INTERVAL. */
9865 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9866 plus_constant (stack_pointer_rtx,
9867 - (PROBE_INTERVAL + dope))));
9868
9869 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9870 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9871 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9872 gen_rtx_PLUS (Pmode, sr.reg,
9873 stack_pointer_rtx)));
9874
9875
9876 /* Step 3: the loop
9877
9878 while (SP != LAST_ADDR)
9879 {
9880 SP = SP + PROBE_INTERVAL
9881 probe at SP
9882 }
9883
9884 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9885 values of N from 1 until it is equal to ROUNDED_SIZE. */
9886
9887 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9888
9889
9890 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9891 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9892
9893 if (size != rounded_size)
9894 {
9895 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9896 plus_constant (stack_pointer_rtx,
9897 rounded_size - size)));
9898 emit_stack_probe (stack_pointer_rtx);
9899 }
9900
9901 /* Adjust back to account for the additional first interval. */
9902 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9903 plus_constant (stack_pointer_rtx,
9904 PROBE_INTERVAL + dope)));
9905
9906 release_scratch_register_on_entry (&sr);
9907 }
9908
9909 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9910
9911 /* Even if the stack pointer isn't the CFA register, we need to correctly
9912 describe the adjustments made to it, in particular differentiate the
9913 frame-related ones from the frame-unrelated ones. */
9914 if (size > 0)
9915 {
9916 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9917 XVECEXP (expr, 0, 0)
9918 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9919 plus_constant (stack_pointer_rtx, -size));
9920 XVECEXP (expr, 0, 1)
9921 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9922 plus_constant (stack_pointer_rtx,
9923 PROBE_INTERVAL + dope + size));
9924 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9925 RTX_FRAME_RELATED_P (last) = 1;
9926
9927 cfun->machine->fs.sp_offset += size;
9928 }
9929
9930 /* Make sure nothing is scheduled before we are done. */
9931 emit_insn (gen_blockage ());
9932 }
9933
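/* Worked example for the small-size path above, assuming
   PROBE_INTERVAL == 4096 and a 32-bit target (so dope == 16): for
   size == 10000, SP is first dropped by 2*4096 + 16 = 8208 and probed,
   then by another 4096 and probed, then by 1808 more (bringing the total
   to size + PROBE_INTERVAL + dope) and probed, and finally raised back by
   4096 + 16 so that the net adjustment is exactly 10000.  */
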
9934 /* Adjust the stack pointer up to REG while probing it. */
9935
9936 const char *
9937 output_adjust_stack_and_probe (rtx reg)
9938 {
9939 static int labelno = 0;
9940 char loop_lab[32], end_lab[32];
9941 rtx xops[2];
9942
9943 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9944 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9945
9946 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9947
9948 /* Jump to END_LAB if SP == LAST_ADDR. */
9949 xops[0] = stack_pointer_rtx;
9950 xops[1] = reg;
9951 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9952 fputs ("\tje\t", asm_out_file);
9953 assemble_name_raw (asm_out_file, end_lab);
9954 fputc ('\n', asm_out_file);
9955
9956 /* SP = SP + PROBE_INTERVAL. */
9957 xops[1] = GEN_INT (PROBE_INTERVAL);
9958 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9959
9960 /* Probe at SP. */
9961 xops[1] = const0_rtx;
9962 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9963
9964 fprintf (asm_out_file, "\tjmp\t");
9965 assemble_name_raw (asm_out_file, loop_lab);
9966 fputc ('\n', asm_out_file);
9967
9968 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9969
9970 return "";
9971 }
9972
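/* For illustration, the loop printed above comes out roughly as follows
   in AT&T syntax on a 32-bit target, assuming PROBE_INTERVAL == 4096 and
   that the scratch register holding LAST_ADDR happens to be %eax (label
   spellings are internal and target-dependent):

       .LPSRL0:
               cmpl    %eax, %esp
               je      .LPSRE0
               subl    $4096, %esp
               orl     $0, (%esp)
               jmp     .LPSRL0
       .LPSRE0:

   with the end label closing the loop.  */
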
9973 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9974 inclusive. These are offsets from the current stack pointer. */
9975
9976 static void
9977 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9978 {
9979 /* See if we have a constant small number of probes to generate. If so,
9980 that's the easy case. The run-time loop is made up of 7 insns in the
9981 generic case while the compile-time loop is made up of n insns for n #
9982 of intervals. */
9983 if (size <= 7 * PROBE_INTERVAL)
9984 {
9985 HOST_WIDE_INT i;
9986
9987 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9988 it exceeds SIZE. If only one probe is needed, this will not
9989 generate any code. Then probe at FIRST + SIZE. */
9990 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9991 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9992
9993 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9994 }
9995
9996 /* Otherwise, do the same as above, but in a loop. Note that we must be
9997 extra careful with variables wrapping around because we might be at
9998 the very top (or the very bottom) of the address space and we have
9999 to be able to handle this case properly; in particular, we use an
10000 equality test for the loop condition. */
10001 else
10002 {
10003 HOST_WIDE_INT rounded_size, last;
10004 struct scratch_reg sr;
10005
10006 get_scratch_register_on_entry (&sr);
10007
10008
10009 /* Step 1: round SIZE to the previous multiple of the interval. */
10010
10011 rounded_size = size & -PROBE_INTERVAL;
10012
10013
10014 /* Step 2: compute initial and final value of the loop counter. */
10015
10016 /* TEST_OFFSET = FIRST. */
10017 emit_move_insn (sr.reg, GEN_INT (-first));
10018
10019 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10020 last = first + rounded_size;
10021
10022
10023 /* Step 3: the loop
10024
10025 while (TEST_ADDR != LAST_ADDR)
10026 {
10027 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10028 probe at TEST_ADDR
10029 }
10030
10031 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10032 until it is equal to ROUNDED_SIZE. */
10033
10034 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10035
10036
10037 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10038 that SIZE is equal to ROUNDED_SIZE. */
10039
10040 if (size != rounded_size)
10041 emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
10042 stack_pointer_rtx,
10043 sr.reg),
10044 rounded_size - size));
10045
10046 release_scratch_register_on_entry (&sr);
10047 }
10048
10049 /* Make sure nothing is scheduled before we are done. */
10050 emit_insn (gen_blockage ());
10051 }
10052
10053 /* Probe a range of stack addresses from REG to END, inclusive. These are
10054 offsets from the current stack pointer. */
10055
10056 const char *
10057 output_probe_stack_range (rtx reg, rtx end)
10058 {
10059 static int labelno = 0;
10060 char loop_lab[32], end_lab[32];
10061 rtx xops[3];
10062
10063 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10064 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10065
10066 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10067
10068 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10069 xops[0] = reg;
10070 xops[1] = end;
10071 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10072 fputs ("\tje\t", asm_out_file);
10073 assemble_name_raw (asm_out_file, end_lab);
10074 fputc ('\n', asm_out_file);
10075
10076 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10077 xops[1] = GEN_INT (PROBE_INTERVAL);
10078 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10079
10080 /* Probe at TEST_ADDR. */
10081 xops[0] = stack_pointer_rtx;
10082 xops[1] = reg;
10083 xops[2] = const0_rtx;
10084 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10085
10086 fprintf (asm_out_file, "\tjmp\t");
10087 assemble_name_raw (asm_out_file, loop_lab);
10088 fputc ('\n', asm_out_file);
10089
10090 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10091
10092 return "";
10093 }
10094
10095 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10096 to be generated in correct form. */
10097 static void
10098 ix86_finalize_stack_realign_flags (void)
10099 {
10100 /* Check if stack realignment is really needed after reload, and
10101 store the result in cfun. */
10102 unsigned int incoming_stack_boundary
10103 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10104 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10105 unsigned int stack_realign = (incoming_stack_boundary
10106 < (current_function_is_leaf
10107 ? crtl->max_used_stack_slot_alignment
10108 : crtl->stack_alignment_needed));
10109
10110 if (crtl->stack_realign_finalized)
10111 {
10112 /* After stack_realign_needed is finalized, we can no longer
10113 change it. */
10114 gcc_assert (crtl->stack_realign_needed == stack_realign);
10115 return;
10116 }
10117
10118 /* If the only reason for frame_pointer_needed is that we conservatively
10119 assumed stack realignment might be needed, but in the end nothing that
10120 needed the stack alignment had been spilled, clear frame_pointer_needed
10121 and say we don't need stack realignment. */
10122 if (stack_realign
10123 && !crtl->need_drap
10124 && frame_pointer_needed
10125 && current_function_is_leaf
10126 && flag_omit_frame_pointer
10127 && current_function_sp_is_unchanging
10128 && !ix86_current_function_calls_tls_descriptor
10129 && !crtl->accesses_prior_frames
10130 && !cfun->calls_alloca
10131 && !crtl->calls_eh_return
10132 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10133 && !ix86_frame_pointer_required ()
10134 && get_frame_size () == 0
10135 && ix86_nsaved_sseregs () == 0
10136 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10137 {
10138 HARD_REG_SET set_up_by_prologue, prologue_used;
10139 basic_block bb;
10140
10141 CLEAR_HARD_REG_SET (prologue_used);
10142 CLEAR_HARD_REG_SET (set_up_by_prologue);
10143 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10144 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10145 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10146 HARD_FRAME_POINTER_REGNUM);
10147 FOR_EACH_BB (bb)
10148 {
10149 rtx insn;
10150 FOR_BB_INSNS (bb, insn)
10151 if (NONDEBUG_INSN_P (insn)
10152 && requires_stack_frame_p (insn, prologue_used,
10153 set_up_by_prologue))
10154 {
10155 crtl->stack_realign_needed = stack_realign;
10156 crtl->stack_realign_finalized = true;
10157 return;
10158 }
10159 }
10160
10161 frame_pointer_needed = false;
10162 stack_realign = false;
10163 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10164 crtl->stack_alignment_needed = incoming_stack_boundary;
10165 crtl->stack_alignment_estimated = incoming_stack_boundary;
10166 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10167 crtl->preferred_stack_boundary = incoming_stack_boundary;
10168 df_finish_pass (true);
10169 df_scan_alloc (NULL);
10170 df_scan_blocks ();
10171 df_compute_regs_ever_live (true);
10172 df_analyze ();
10173 }
10174
10175 crtl->stack_realign_needed = stack_realign;
10176 crtl->stack_realign_finalized = true;
10177 }
10178
10179 /* Expand the prologue into a bunch of separate insns. */
10180
10181 void
10182 ix86_expand_prologue (void)
10183 {
10184 struct machine_function *m = cfun->machine;
10185 rtx insn, t;
10186 bool pic_reg_used;
10187 struct ix86_frame frame;
10188 HOST_WIDE_INT allocate;
10189 bool int_registers_saved;
10190
10191 ix86_finalize_stack_realign_flags ();
10192
10193 /* DRAP should not coexist with stack_realign_fp */
10194 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10195
10196 memset (&m->fs, 0, sizeof (m->fs));
10197
10198 /* Initialize CFA state for before the prologue. */
10199 m->fs.cfa_reg = stack_pointer_rtx;
10200 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10201
10202 /* Track SP offset to the CFA. We continue tracking this after we've
10203 swapped the CFA register away from SP. In the case of re-alignment
10204 this is fudged; we're interested in offsets within the local frame. */
10205 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10206 m->fs.sp_valid = true;
10207
10208 ix86_compute_frame_layout (&frame);
10209
10210 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10211 {
10212 /* We should have already generated an error for any use of
10213 ms_hook on a nested function. */
10214 gcc_checking_assert (!ix86_static_chain_on_stack);
10215
10216 /* Check if profiling is active and we shall use the profiling-before-
10217 prologue variant. If so, issue a sorry. */
10218 if (crtl->profile && flag_fentry != 0)
10219 sorry ("ms_hook_prologue attribute isn%'t compatible "
10220 "with -mfentry for 32-bit");
10221
10222 /* In ix86_asm_output_function_label we emitted:
10223 8b ff movl.s %edi,%edi
10224 55 push %ebp
10225 8b ec movl.s %esp,%ebp
10226
10227 This matches the hookable function prologue in Win32 API
10228 functions in Microsoft Windows XP Service Pack 2 and newer.
10229 Wine uses this to enable Windows apps to hook the Win32 API
10230 functions provided by Wine.
10231
10232 What that means is that we've already set up the frame pointer. */
10233
10234 if (frame_pointer_needed
10235 && !(crtl->drap_reg && crtl->stack_realign_needed))
10236 {
10237 rtx push, mov;
10238
10239 /* We've decided to use the frame pointer already set up.
10240 Describe this to the unwinder by pretending that both
10241 push and mov insns happen right here.
10242
10243 Putting the unwind info here at the end of the ms_hook
10244 is done so that we can make absolutely certain we get
10245 the required byte sequence at the start of the function,
10246 rather than relying on an assembler that can produce
10247 the exact encoding required.
10248
10249 However it does mean (in the unpatched case) that we have
10250 a 1 insn window where the asynchronous unwind info is
10251 incorrect. However, if we placed the unwind info at
10252 its correct location we would have incorrect unwind info
10253 in the patched case. Which is probably all moot since
10254 I don't expect Wine generates dwarf2 unwind info for the
10255 system libraries that use this feature. */
10256
10257 insn = emit_insn (gen_blockage ());
10258
10259 push = gen_push (hard_frame_pointer_rtx);
10260 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10261 stack_pointer_rtx);
10262 RTX_FRAME_RELATED_P (push) = 1;
10263 RTX_FRAME_RELATED_P (mov) = 1;
10264
10265 RTX_FRAME_RELATED_P (insn) = 1;
10266 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10267 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10268
10269 /* Note that gen_push incremented m->fs.cfa_offset, even
10270 though we didn't emit the push insn here. */
10271 m->fs.cfa_reg = hard_frame_pointer_rtx;
10272 m->fs.fp_offset = m->fs.cfa_offset;
10273 m->fs.fp_valid = true;
10274 }
10275 else
10276 {
10277 /* The frame pointer is not needed so pop %ebp again.
10278 This leaves us with a pristine state. */
10279 emit_insn (gen_pop (hard_frame_pointer_rtx));
10280 }
10281 }
10282
10283 /* The first insn of a function that accepts its static chain on the
10284 stack is to push the register that would be filled in by a direct
10285 call. This insn will be skipped by the trampoline. */
10286 else if (ix86_static_chain_on_stack)
10287 {
10288 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10289 emit_insn (gen_blockage ());
10290
10291 /* We don't want to interpret this push insn as a register save,
10292 only as a stack adjustment. The real copy of the register as
10293 a save will be done later, if needed. */
10294 t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
10295 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10296 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10297 RTX_FRAME_RELATED_P (insn) = 1;
10298 }
10299
10300 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10301 DRAP is needed and stack realignment is really needed after reload. */
10302 if (stack_realign_drap)
10303 {
10304 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10305
10306 /* Only need to push parameter pointer reg if it is caller saved. */
10307 if (!call_used_regs[REGNO (crtl->drap_reg)])
10308 {
10309 /* Push arg pointer reg */
10310 insn = emit_insn (gen_push (crtl->drap_reg));
10311 RTX_FRAME_RELATED_P (insn) = 1;
10312 }
10313
10314 /* Grab the argument pointer. */
10315 t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
10316 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10317 RTX_FRAME_RELATED_P (insn) = 1;
10318 m->fs.cfa_reg = crtl->drap_reg;
10319 m->fs.cfa_offset = 0;
10320
10321 /* Align the stack. */
10322 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10323 stack_pointer_rtx,
10324 GEN_INT (-align_bytes)));
10325 RTX_FRAME_RELATED_P (insn) = 1;
10326
10327 /* Replicate the return address on the stack so that the return
10328 address can be reached via the (argp - 1) slot. This is needed
10329 to implement the macro RETURN_ADDR_RTX and the intrinsic function
10330 expand_builtin_return_addr etc. */
10331 t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
10332 t = gen_frame_mem (Pmode, t);
10333 insn = emit_insn (gen_push (t));
10334 RTX_FRAME_RELATED_P (insn) = 1;
10335
10336 /* For the purposes of frame and register save area addressing,
10337 we've started over with a new frame. */
10338 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10339 m->fs.realigned = true;
10340 }
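/* A rough sketch of the 32-bit code the block above typically produces
   when the DRAP register ends up being %ecx (the exact register, offset
   and alignment are target- and function-dependent):

       lea   4(%esp), %ecx      # grab the incoming argument pointer (DRAP)
       and   $-16, %esp         # align the stack
       push  -4(%ecx)           # replicate the return address

   after which the frame is addressed relative to the DRAP register.  */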
10341
10342 if (frame_pointer_needed && !m->fs.fp_valid)
10343 {
10344 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10345 slower on all targets. Also sdb doesn't like it. */
10346 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10347 RTX_FRAME_RELATED_P (insn) = 1;
10348
10349 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10350 {
10351 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10352 RTX_FRAME_RELATED_P (insn) = 1;
10353
10354 if (m->fs.cfa_reg == stack_pointer_rtx)
10355 m->fs.cfa_reg = hard_frame_pointer_rtx;
10356 m->fs.fp_offset = m->fs.sp_offset;
10357 m->fs.fp_valid = true;
10358 }
10359 }
10360
10361 int_registers_saved = (frame.nregs == 0);
10362
10363 if (!int_registers_saved)
10364 {
10365 /* If saving registers via PUSH, do so now. */
10366 if (!frame.save_regs_using_mov)
10367 {
10368 ix86_emit_save_regs ();
10369 int_registers_saved = true;
10370 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10371 }
10372
10373 /* When using the red zone we may start register saving before allocating
10374 the stack frame, saving one cycle of the prologue. However, avoid
10375 doing this if we have to probe the stack; at least on x86_64 the
10376 stack probe can turn into a call that clobbers a red zone location. */
10377 else if (ix86_using_red_zone ()
10378 && (! TARGET_STACK_PROBE
10379 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10380 {
10381 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10382 int_registers_saved = true;
10383 }
10384 }
10385
10386 if (stack_realign_fp)
10387 {
10388 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10389 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10390
10391 /* The computation of the size of the re-aligned stack frame means
10392 that we must allocate the size of the register save area before
10393 performing the actual alignment. Otherwise we cannot guarantee
10394 that there's enough storage above the realignment point. */
10395 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10396 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10397 GEN_INT (m->fs.sp_offset
10398 - frame.sse_reg_save_offset),
10399 -1, false);
10400
10401 /* Align the stack. */
10402 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10403 stack_pointer_rtx,
10404 GEN_INT (-align_bytes)));
10405
10406 /* For the purposes of register save area addressing, the stack
10407 pointer is no longer valid. As for the value of sp_offset,
10408 see ix86_compute_frame_layout, which we need to match in order
10409 to pass verification of stack_pointer_offset at the end. */
10410 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10411 m->fs.sp_valid = false;
10412 }
10413
10414 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10415
10416 if (flag_stack_usage_info)
10417 {
10418 /* We start to count from ARG_POINTER. */
10419 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10420
10421 /* If it was realigned, take into account the fake frame. */
10422 if (stack_realign_drap)
10423 {
10424 if (ix86_static_chain_on_stack)
10425 stack_size += UNITS_PER_WORD;
10426
10427 if (!call_used_regs[REGNO (crtl->drap_reg)])
10428 stack_size += UNITS_PER_WORD;
10429
10430 /* This over-estimates by 1 minimal-stack-alignment-unit but
10431 mitigates that by counting in the new return address slot. */
10432 current_function_dynamic_stack_size
10433 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10434 }
10435
10436 current_function_static_stack_size = stack_size;
10437 }
10438
10439 /* The stack has already been decremented by the instruction calling us
10440 so probe if the size is non-negative to preserve the protection area. */
10441 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10442 {
10443 /* We expect the registers to be saved when probes are used. */
10444 gcc_assert (int_registers_saved);
10445
10446 if (STACK_CHECK_MOVING_SP)
10447 {
10448 ix86_adjust_stack_and_probe (allocate);
10449 allocate = 0;
10450 }
10451 else
10452 {
10453 HOST_WIDE_INT size = allocate;
10454
10455 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10456 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10457
10458 if (TARGET_STACK_PROBE)
10459 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10460 else
10461 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10462 }
10463 }
10464
10465 if (allocate == 0)
10466 ;
10467 else if (!ix86_target_stack_probe ()
10468 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10469 {
10470 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10471 GEN_INT (-allocate), -1,
10472 m->fs.cfa_reg == stack_pointer_rtx);
10473 }
10474 else
10475 {
10476 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10477 rtx r10 = NULL;
10478 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10479
10480 bool eax_live = false;
10481 bool r10_live = false;
10482
10483 if (TARGET_64BIT)
10484 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10485 if (!TARGET_64BIT_MS_ABI)
10486 eax_live = ix86_eax_live_at_start_p ();
10487
10488 if (eax_live)
10489 {
10490 emit_insn (gen_push (eax));
10491 allocate -= UNITS_PER_WORD;
10492 }
10493 if (r10_live)
10494 {
10495 r10 = gen_rtx_REG (Pmode, R10_REG);
10496 emit_insn (gen_push (r10));
10497 allocate -= UNITS_PER_WORD;
10498 }
10499
10500 emit_move_insn (eax, GEN_INT (allocate));
10501 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
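/* A sketch of what this typically expands to on targets that need stack
   probing (the exact worker routine, e.g. ___chkstk_ms, is target
   specific): the call only probes the pages being allocated and leaves
   the stack pointer untouched; the real adjustment is emitted just below,
   reusing the size value still live in AX.  */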
10502
10503 /* Use the fact that AX still contains ALLOCATE. */
10504 adjust_stack_insn = (TARGET_64BIT
10505 ? gen_pro_epilogue_adjust_stack_di_sub
10506 : gen_pro_epilogue_adjust_stack_si_sub);
10507
10508 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10509 stack_pointer_rtx, eax));
10510
10511 /* Note that SEH directives need to continue tracking the stack
10512 pointer even after the frame pointer has been set up. */
10513 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10514 {
10515 if (m->fs.cfa_reg == stack_pointer_rtx)
10516 m->fs.cfa_offset += allocate;
10517
10518 RTX_FRAME_RELATED_P (insn) = 1;
10519 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10520 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10521 plus_constant (stack_pointer_rtx,
10522 -allocate)));
10523 }
10524 m->fs.sp_offset += allocate;
10525
10526 if (r10_live && eax_live)
10527 {
10528 t = choose_baseaddr (m->fs.sp_offset - allocate);
10529 emit_move_insn (r10, gen_frame_mem (Pmode, t));
10530 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10531 emit_move_insn (eax, gen_frame_mem (Pmode, t));
10532 }
10533 else if (eax_live || r10_live)
10534 {
10535 t = choose_baseaddr (m->fs.sp_offset - allocate);
10536 emit_move_insn ((eax_live ? eax : r10), gen_frame_mem (Pmode, t));
10537 }
10538 }
10539 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10540
10541 /* If we haven't already set up the frame pointer, do so now. */
10542 if (frame_pointer_needed && !m->fs.fp_valid)
10543 {
10544 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10545 GEN_INT (frame.stack_pointer_offset
10546 - frame.hard_frame_pointer_offset));
10547 insn = emit_insn (insn);
10548 RTX_FRAME_RELATED_P (insn) = 1;
10549 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10550
10551 if (m->fs.cfa_reg == stack_pointer_rtx)
10552 m->fs.cfa_reg = hard_frame_pointer_rtx;
10553 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10554 m->fs.fp_valid = true;
10555 }
10556
10557 if (!int_registers_saved)
10558 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10559 if (frame.nsseregs)
10560 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10561
10562 pic_reg_used = false;
10563 if (pic_offset_table_rtx
10564 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10565 || crtl->profile))
10566 {
10567 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10568
10569 if (alt_pic_reg_used != INVALID_REGNUM)
10570 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10571
10572 pic_reg_used = true;
10573 }
10574
10575 if (pic_reg_used)
10576 {
10577 if (TARGET_64BIT)
10578 {
10579 if (ix86_cmodel == CM_LARGE_PIC)
10580 {
10581 rtx tmp_reg = gen_rtx_REG (DImode, R11_REG);
10582 rtx label = gen_label_rtx ();
10583 emit_label (label);
10584 LABEL_PRESERVE_P (label) = 1;
10585 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10586 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label));
10587 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10588 insn = emit_insn (gen_adddi3 (pic_offset_table_rtx,
10589 pic_offset_table_rtx, tmp_reg));
10590 }
10591 else
10592 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10593 }
10594 else
10595 {
10596 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10597 RTX_FRAME_RELATED_P (insn) = 1;
10598 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10599 }
10600 }
10601
10602 /* In the pic_reg_used case, make sure that the got load isn't deleted
10603 when mcount needs it. A blockage to avoid call movement across the
10604 mcount call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10605 note. */
10606 if (crtl->profile && !flag_fentry && pic_reg_used)
10607 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10608
10609 if (crtl->drap_reg && !crtl->stack_realign_needed)
10610 {
10611 /* vDRAP is set up, but after reload it turns out stack realignment
10612 isn't necessary; here we emit the prologue to set up DRAP
10613 without the stack realignment adjustment. */
10614 t = choose_baseaddr (0);
10615 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10616 }
10617
10618 /* Prevent instructions from being scheduled into register save push
10619 sequence when access to the redzone area is done through frame pointer.
10620 The offset between the frame pointer and the stack pointer is calculated
10621 relative to the value of the stack pointer at the end of the function
10622 prologue, and moving instructions that access redzone area via frame
10623 pointer inside push sequence violates this assumption. */
10624 if (frame_pointer_needed && frame.red_zone_size)
10625 emit_insn (gen_memory_blockage ());
10626
10627 /* Emit cld instruction if stringops are used in the function. */
10628 if (TARGET_CLD && ix86_current_function_needs_cld)
10629 emit_insn (gen_cld ());
10630
10631 /* SEH requires that the prologue end within 256 bytes of the start of
10632 the function. Prevent instruction schedules that would extend that.
10633 Further, prevent alloca modifications to the stack pointer from being
10634 combined with prologue modifications. */
10635 if (TARGET_SEH)
10636 emit_insn (gen_prologue_use (stack_pointer_rtx));
10637 }
10638
10639 /* Emit code to restore REG using a POP insn. */
10640
10641 static void
10642 ix86_emit_restore_reg_using_pop (rtx reg)
10643 {
10644 struct machine_function *m = cfun->machine;
10645 rtx insn = emit_insn (gen_pop (reg));
10646
10647 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10648 m->fs.sp_offset -= UNITS_PER_WORD;
10649
10650 if (m->fs.cfa_reg == crtl->drap_reg
10651 && REGNO (reg) == REGNO (crtl->drap_reg))
10652 {
10653 /* Previously we'd represented the CFA as an expression
10654 like *(%ebp - 8). We've just popped that value from
10655 the stack, which means we need to reset the CFA to
10656 the drap register. This will remain until we restore
10657 the stack pointer. */
10658 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10659 RTX_FRAME_RELATED_P (insn) = 1;
10660
10661 /* This means that the DRAP register is valid for addressing too. */
10662 m->fs.drap_valid = true;
10663 return;
10664 }
10665
10666 if (m->fs.cfa_reg == stack_pointer_rtx)
10667 {
10668 rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10669 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10670 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10671 RTX_FRAME_RELATED_P (insn) = 1;
10672
10673 m->fs.cfa_offset -= UNITS_PER_WORD;
10674 }
10675
10676 /* When the frame pointer is the CFA, and we pop it, we are
10677 swapping back to the stack pointer as the CFA. This happens
10678 for stack frames that don't allocate other data, so we assume
10679 the stack pointer is now pointing at the return address, i.e.
10680 the function entry state, which makes the offset 1 word. */
10681 if (reg == hard_frame_pointer_rtx)
10682 {
10683 m->fs.fp_valid = false;
10684 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10685 {
10686 m->fs.cfa_reg = stack_pointer_rtx;
10687 m->fs.cfa_offset -= UNITS_PER_WORD;
10688
10689 add_reg_note (insn, REG_CFA_DEF_CFA,
10690 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10691 GEN_INT (m->fs.cfa_offset)));
10692 RTX_FRAME_RELATED_P (insn) = 1;
10693 }
10694 }
10695 }
10696
10697 /* Emit code to restore saved registers using POP insns. */
10698
10699 static void
10700 ix86_emit_restore_regs_using_pop (void)
10701 {
10702 unsigned int regno;
10703
10704 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10705 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10706 ix86_emit_restore_reg_using_pop (gen_rtx_REG (Pmode, regno));
10707 }
10708
10709 /* Emit code and notes for the LEAVE instruction. */
10710
10711 static void
10712 ix86_emit_leave (void)
10713 {
10714 struct machine_function *m = cfun->machine;
10715 rtx insn = emit_insn (ix86_gen_leave ());
10716
10717 ix86_add_queued_cfa_restore_notes (insn);
10718
10719 gcc_assert (m->fs.fp_valid);
10720 m->fs.sp_valid = true;
10721 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10722 m->fs.fp_valid = false;
10723
10724 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10725 {
10726 m->fs.cfa_reg = stack_pointer_rtx;
10727 m->fs.cfa_offset = m->fs.sp_offset;
10728
10729 add_reg_note (insn, REG_CFA_DEF_CFA,
10730 plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10731 RTX_FRAME_RELATED_P (insn) = 1;
10732 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10733 m->fs.fp_offset);
10734 }
10735 }
10736
10737 /* Emit code to restore saved registers using MOV insns.
10738 First register is restored from CFA - CFA_OFFSET. */
10739 static void
10740 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10741 bool maybe_eh_return)
10742 {
10743 struct machine_function *m = cfun->machine;
10744 unsigned int regno;
10745
10746 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10747 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10748 {
10749 rtx reg = gen_rtx_REG (Pmode, regno);
10750 rtx insn, mem;
10751
10752 mem = choose_baseaddr (cfa_offset);
10753 mem = gen_frame_mem (Pmode, mem);
10754 insn = emit_move_insn (reg, mem);
10755
10756 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10757 {
10758 /* Previously we'd represented the CFA as an expression
10759 like *(%ebp - 8). We've just popped that value from
10760 the stack, which means we need to reset the CFA to
10761 the drap register. This will remain until we restore
10762 the stack pointer. */
10763 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10764 RTX_FRAME_RELATED_P (insn) = 1;
10765
10766 /* This means that the DRAP register is valid for addressing. */
10767 m->fs.drap_valid = true;
10768 }
10769 else
10770 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10771
10772 cfa_offset -= UNITS_PER_WORD;
10773 }
10774 }
10775
10776 /* Emit code to restore saved registers using MOV insns.
10777 First register is restored from CFA - CFA_OFFSET. */
10778 static void
10779 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10780 bool maybe_eh_return)
10781 {
10782 unsigned int regno;
10783
10784 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10785 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10786 {
10787 rtx reg = gen_rtx_REG (V4SFmode, regno);
10788 rtx mem;
10789
10790 mem = choose_baseaddr (cfa_offset);
10791 mem = gen_rtx_MEM (V4SFmode, mem);
10792 set_mem_align (mem, 128);
10793 emit_move_insn (reg, mem);
10794
10795 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10796
10797 cfa_offset -= 16;
10798 }
10799 }
10800
10801 /* Emit vzeroupper if needed. */
10802
10803 void
10804 ix86_maybe_emit_epilogue_vzeroupper (void)
10805 {
10806 if (TARGET_VZEROUPPER
10807 && !TREE_THIS_VOLATILE (cfun->decl)
10808 && !cfun->machine->caller_return_avx256_p)
10809 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10810 }
10811
10812 /* Restore function stack, frame, and registers. */
10813
10814 void
10815 ix86_expand_epilogue (int style)
10816 {
10817 struct machine_function *m = cfun->machine;
10818 struct machine_frame_state frame_state_save = m->fs;
10819 struct ix86_frame frame;
10820 bool restore_regs_via_mov;
10821 bool using_drap;
10822
10823 ix86_finalize_stack_realign_flags ();
10824 ix86_compute_frame_layout (&frame);
10825
10826 m->fs.sp_valid = (!frame_pointer_needed
10827 || (current_function_sp_is_unchanging
10828 && !stack_realign_fp));
10829 gcc_assert (!m->fs.sp_valid
10830 || m->fs.sp_offset == frame.stack_pointer_offset);
10831
10832 /* The FP must be valid if the frame pointer is present. */
10833 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10834 gcc_assert (!m->fs.fp_valid
10835 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10836
10837 /* We must have *some* valid pointer to the stack frame. */
10838 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10839
10840 /* The DRAP is never valid at this point. */
10841 gcc_assert (!m->fs.drap_valid);
10842
10843 /* See the comment about red zone and frame
10844 pointer usage in ix86_expand_prologue. */
10845 if (frame_pointer_needed && frame.red_zone_size)
10846 emit_insn (gen_memory_blockage ());
10847
10848 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10849 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10850
10851 /* Determine the CFA offset of the end of the red-zone. */
10852 m->fs.red_zone_offset = 0;
10853 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10854 {
10855 /* The red-zone begins below the return address. */
10856 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10857
10858 /* When the register save area is in the aligned portion of
10859 the stack, determine the maximum runtime displacement that
10860 matches up with the aligned frame. */
10861 if (stack_realign_drap)
10862 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10863 + UNITS_PER_WORD);
10864 }
10865
10866 /* Special care must be taken for the normal return case of a function
10867 using eh_return: the eax and edx registers are marked as saved, but
10868 not restored along this path. Adjust the save location to match. */
10869 if (crtl->calls_eh_return && style != 2)
10870 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10871
10872 /* EH_RETURN requires the use of moves to function properly. */
10873 if (crtl->calls_eh_return)
10874 restore_regs_via_mov = true;
10875 /* SEH requires the use of pops to identify the epilogue. */
10876 else if (TARGET_SEH)
10877 restore_regs_via_mov = false;
10878 /* If we're only restoring one register and sp is not valid, then
10879 use a move instruction to restore the register, since it's
10880 less work than reloading sp and popping the register. */
10881 else if (!m->fs.sp_valid && frame.nregs <= 1)
10882 restore_regs_via_mov = true;
10883 else if (TARGET_EPILOGUE_USING_MOVE
10884 && cfun->machine->use_fast_prologue_epilogue
10885 && (frame.nregs > 1
10886 || m->fs.sp_offset != frame.reg_save_offset))
10887 restore_regs_via_mov = true;
10888 else if (frame_pointer_needed
10889 && !frame.nregs
10890 && m->fs.sp_offset != frame.reg_save_offset)
10891 restore_regs_via_mov = true;
10892 else if (frame_pointer_needed
10893 && TARGET_USE_LEAVE
10894 && cfun->machine->use_fast_prologue_epilogue
10895 && frame.nregs == 1)
10896 restore_regs_via_mov = true;
10897 else
10898 restore_regs_via_mov = false;
10899
10900 if (restore_regs_via_mov || frame.nsseregs)
10901 {
10902 /* Ensure that the entire register save area is addressable via
10903 the stack pointer, if we will restore via sp. */
10904 if (TARGET_64BIT
10905 && m->fs.sp_offset > 0x7fffffff
10906 && !(m->fs.fp_valid || m->fs.drap_valid)
10907 && (frame.nsseregs + frame.nregs) != 0)
10908 {
10909 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10910 GEN_INT (m->fs.sp_offset
10911 - frame.sse_reg_save_offset),
10912 style,
10913 m->fs.cfa_reg == stack_pointer_rtx);
10914 }
10915 }
10916
10917 /* If there are any SSE registers to restore, then we have to do it
10918 via moves, since there's obviously no pop for SSE regs. */
10919 if (frame.nsseregs)
10920 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10921 style == 2);
10922
10923 if (restore_regs_via_mov)
10924 {
10925 rtx t;
10926
10927 if (frame.nregs)
10928 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10929
10930 /* eh_return epilogues need %ecx added to the stack pointer. */
10931 if (style == 2)
10932 {
10933 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10934
10935 /* Stack align doesn't work with eh_return. */
10936 gcc_assert (!stack_realign_drap);
10937 /* Neither does regparm nested functions. */
10938 gcc_assert (!ix86_static_chain_on_stack);
10939
10940 if (frame_pointer_needed)
10941 {
10942 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10943 t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10944 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10945
10946 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10947 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10948
10949 /* Note that we use SA as a temporary CFA, as the return
10950 address is at the proper place relative to it. We
10951 pretend this happens at the FP restore insn because
10952 prior to this insn the FP would be stored at the wrong
10953 offset relative to SA, and after this insn we have no
10954 other reasonable register to use for the CFA. We don't
10955 bother resetting the CFA to the SP for the duration of
10956 the return insn. */
10957 add_reg_note (insn, REG_CFA_DEF_CFA,
10958 plus_constant (sa, UNITS_PER_WORD));
10959 ix86_add_queued_cfa_restore_notes (insn);
10960 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10961 RTX_FRAME_RELATED_P (insn) = 1;
10962
10963 m->fs.cfa_reg = sa;
10964 m->fs.cfa_offset = UNITS_PER_WORD;
10965 m->fs.fp_valid = false;
10966
10967 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10968 const0_rtx, style, false);
10969 }
10970 else
10971 {
10972 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10973 t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10974 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10975 ix86_add_queued_cfa_restore_notes (insn);
10976
10977 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10978 if (m->fs.cfa_offset != UNITS_PER_WORD)
10979 {
10980 m->fs.cfa_offset = UNITS_PER_WORD;
10981 add_reg_note (insn, REG_CFA_DEF_CFA,
10982 plus_constant (stack_pointer_rtx,
10983 UNITS_PER_WORD));
10984 RTX_FRAME_RELATED_P (insn) = 1;
10985 }
10986 }
10987 m->fs.sp_offset = UNITS_PER_WORD;
10988 m->fs.sp_valid = true;
10989 }
10990 }
10991 else
10992 {
10993 /* SEH requires that the function end with (1) a stack adjustment
10994 if necessary, (2) a sequence of pops, and (3) a return or
10995 jump instruction. Prevent insns from the function body from
10996 being scheduled into this sequence. */
10997 if (TARGET_SEH)
10998 {
10999 /* Prevent a catch region from being adjacent to the standard
11000 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11001 the several other flags that would be interesting to test are
11002 set up yet. */
11003 if (flag_non_call_exceptions)
11004 emit_insn (gen_nops (const1_rtx));
11005 else
11006 emit_insn (gen_blockage ());
11007 }
11008
11009 /* First step is to deallocate the stack frame so that we can
11010 pop the registers. */
11011 if (!m->fs.sp_valid)
11012 {
11013 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11014 GEN_INT (m->fs.fp_offset
11015 - frame.reg_save_offset),
11016 style, false);
11017 }
11018 else if (m->fs.sp_offset != frame.reg_save_offset)
11019 {
11020 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11021 GEN_INT (m->fs.sp_offset
11022 - frame.reg_save_offset),
11023 style,
11024 m->fs.cfa_reg == stack_pointer_rtx);
11025 }
11026
11027 ix86_emit_restore_regs_using_pop ();
11028 }
11029
11030 /* If we used a frame pointer and haven't already got rid of it,
11031 then do so now. */
11032 if (m->fs.fp_valid)
11033 {
11034 /* If the stack pointer is valid and pointing at the frame
11035 pointer store address, then we only need a pop. */
11036 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11037 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11038 /* Leave results in shorter dependency chains on CPUs that are
11039 able to grok it fast. */
11040 else if (TARGET_USE_LEAVE
11041 || optimize_function_for_size_p (cfun)
11042 || !cfun->machine->use_fast_prologue_epilogue)
11043 ix86_emit_leave ();
11044 else
11045 {
11046 pro_epilogue_adjust_stack (stack_pointer_rtx,
11047 hard_frame_pointer_rtx,
11048 const0_rtx, style, !using_drap);
11049 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11050 }
11051 }
11052
11053 if (using_drap)
11054 {
11055 int param_ptr_offset = UNITS_PER_WORD;
11056 rtx insn;
11057
11058 gcc_assert (stack_realign_drap);
11059
11060 if (ix86_static_chain_on_stack)
11061 param_ptr_offset += UNITS_PER_WORD;
11062 if (!call_used_regs[REGNO (crtl->drap_reg)])
11063 param_ptr_offset += UNITS_PER_WORD;
11064
11065 insn = emit_insn (gen_rtx_SET
11066 (VOIDmode, stack_pointer_rtx,
11067 gen_rtx_PLUS (Pmode,
11068 crtl->drap_reg,
11069 GEN_INT (-param_ptr_offset))));
11070 m->fs.cfa_reg = stack_pointer_rtx;
11071 m->fs.cfa_offset = param_ptr_offset;
11072 m->fs.sp_offset = param_ptr_offset;
11073 m->fs.realigned = false;
11074
11075 add_reg_note (insn, REG_CFA_DEF_CFA,
11076 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11077 GEN_INT (param_ptr_offset)));
11078 RTX_FRAME_RELATED_P (insn) = 1;
11079
11080 if (!call_used_regs[REGNO (crtl->drap_reg)])
11081 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11082 }
11083
11084 /* At this point the stack pointer must be valid, and we must have
11085 restored all of the registers. We may not have deallocated the
11086 entire stack frame. We've delayed this until now because it may
11087 be possible to merge the local stack deallocation with the
11088 deallocation forced by ix86_static_chain_on_stack. */
11089 gcc_assert (m->fs.sp_valid);
11090 gcc_assert (!m->fs.fp_valid);
11091 gcc_assert (!m->fs.realigned);
11092 if (m->fs.sp_offset != UNITS_PER_WORD)
11093 {
11094 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11095 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11096 style, true);
11097 }
11098 else
11099 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11100
11101 /* Sibcall epilogues don't want a return instruction. */
11102 if (style == 0)
11103 {
11104 m->fs = frame_state_save;
11105 return;
11106 }
11107
11108 /* Emit vzeroupper if needed. */
11109 ix86_maybe_emit_epilogue_vzeroupper ();
11110
11111 if (crtl->args.pops_args && crtl->args.size)
11112 {
11113 rtx popc = GEN_INT (crtl->args.pops_args);
11114
11115 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11116 address, do an explicit add, and jump indirectly to the caller. */
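/* Sketch of the resulting epilogue in this case (N being the pop count):

       pop   %ecx            # return address into %ecx
       add   $N, %esp        # drop the arguments
       jmp   *%ecx           # return to the caller

   which is what the code below emits via gen_pop, the stack adjustment
   and gen_simple_return_indirect_internal.  */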
11117
11118 if (crtl->args.pops_args >= 65536)
11119 {
11120 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11121 rtx insn;
11122
11123 /* There is no "pascal" calling convention in any 64bit ABI. */
11124 gcc_assert (!TARGET_64BIT);
11125
11126 insn = emit_insn (gen_pop (ecx));
11127 m->fs.cfa_offset -= UNITS_PER_WORD;
11128 m->fs.sp_offset -= UNITS_PER_WORD;
11129
11130 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11131 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11132 add_reg_note (insn, REG_CFA_REGISTER,
11133 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11134 RTX_FRAME_RELATED_P (insn) = 1;
11135
11136 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11137 popc, -1, true);
11138 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11139 }
11140 else
11141 emit_jump_insn (gen_simple_return_pop_internal (popc));
11142 }
11143 else
11144 emit_jump_insn (gen_simple_return_internal ());
11145
11146 /* Restore the state back to the state from the prologue,
11147 so that it's correct for the next epilogue. */
11148 m->fs = frame_state_save;
11149 }
11150
11151 /* Reset from the function's potential modifications. */
11152
11153 static void
11154 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11155 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11156 {
11157 if (pic_offset_table_rtx)
11158 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11159 #if TARGET_MACHO
11160 /* Mach-O doesn't support labels at the end of objects, so if
11161 it looks like we might want one, insert a NOP. */
11162 {
11163 rtx insn = get_last_insn ();
11164 rtx deleted_debug_label = NULL_RTX;
11165 while (insn
11166 && NOTE_P (insn)
11167 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11168 {
11169 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11170 notes; instead set their CODE_LABEL_NUMBER to -1.
11171 Otherwise there would be code generation differences
11172 between -g and -g0. */
11173 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11174 deleted_debug_label = insn;
11175 insn = PREV_INSN (insn);
11176 }
11177 if (insn
11178 && (LABEL_P (insn)
11179 || (NOTE_P (insn)
11180 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11181 fputs ("\tnop\n", file);
11182 else if (deleted_debug_label)
11183 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11184 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11185 CODE_LABEL_NUMBER (insn) = -1;
11186 }
11187 #endif
11188
11189 }
11190
11191 /* Return a scratch register to use in the split stack prologue. The
11192 split stack prologue is used for -fsplit-stack. Its instructions are
11193 the first in the function, even before the regular prologue.
11194 The scratch register can be any caller-saved register which is not
11195 used for parameters or for the static chain. */
11196
11197 static unsigned int
11198 split_stack_prologue_scratch_regno (void)
11199 {
11200 if (TARGET_64BIT)
11201 return R11_REG;
11202 else
11203 {
11204 bool is_fastcall;
11205 int regparm;
11206
11207 is_fastcall = (lookup_attribute ("fastcall",
11208 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11209 != NULL);
11210 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11211
11212 if (is_fastcall)
11213 {
11214 if (DECL_STATIC_CHAIN (cfun->decl))
11215 {
11216 sorry ("-fsplit-stack does not support fastcall with "
11217 "nested function");
11218 return INVALID_REGNUM;
11219 }
11220 return AX_REG;
11221 }
11222 else if (regparm < 3)
11223 {
11224 if (!DECL_STATIC_CHAIN (cfun->decl))
11225 return CX_REG;
11226 else
11227 {
11228 if (regparm >= 2)
11229 {
11230 sorry ("-fsplit-stack does not support 2 register "
11231 "parameters for a nested function");
11232 return INVALID_REGNUM;
11233 }
11234 return DX_REG;
11235 }
11236 }
11237 else
11238 {
11239 /* FIXME: We could make this work by pushing a register
11240 around the addition and comparison. */
11241 sorry ("-fsplit-stack does not support 3 register parameters");
11242 return INVALID_REGNUM;
11243 }
11244 }
11245 }
11246
11247 /* A SYMBOL_REF for the function which allocates new stack space for
11248 -fsplit-stack. */
11249
11250 static GTY(()) rtx split_stack_fn;
11251
11252 /* A SYMBOL_REF for the function which allocates more stack space
11253 when using the large model. */
11254
11255 static GTY(()) rtx split_stack_fn_large;
11256
11257 /* Handle -fsplit-stack. These are the first instructions in the
11258 function, even before the regular prologue. */
11259
11260 void
11261 ix86_expand_split_stack_prologue (void)
11262 {
11263 struct ix86_frame frame;
11264 HOST_WIDE_INT allocate;
11265 unsigned HOST_WIDE_INT args_size;
11266 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11267 rtx scratch_reg = NULL_RTX;
11268 rtx varargs_label = NULL_RTX;
11269 rtx fn;
11270
11271 gcc_assert (flag_split_stack && reload_completed);
11272
11273 ix86_finalize_stack_realign_flags ();
11274 ix86_compute_frame_layout (&frame);
11275 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11276
11277 /* This is the label we will branch to if we have enough stack
11278 space. We expect the basic block reordering pass to reverse this
11279 branch if optimizing, so that we branch in the unlikely case. */
11280 label = gen_label_rtx ();
11281
11282 /* We need to compare the stack pointer minus the frame size with
11283 the stack boundary in the TCB. The stack boundary always gives
11284 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11285 can compare directly. Otherwise we need to do an addition. */
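/* On 64-bit targets the comparison set up below typically assembles to
   something like (the TCB slot holding the stack boundary is libc- and
   target-specific):

       cmp   %fs:<boundary-offset>, %rsp
       jae   .Lhave_enough_stack

   with the __morestack call emitted on the fall-through path.  */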
11286
11287 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11288 UNSPEC_STACK_CHECK);
11289 limit = gen_rtx_CONST (Pmode, limit);
11290 limit = gen_rtx_MEM (Pmode, limit);
11291 if (allocate < SPLIT_STACK_AVAILABLE)
11292 current = stack_pointer_rtx;
11293 else
11294 {
11295 unsigned int scratch_regno;
11296 rtx offset;
11297
11298 /* We need a scratch register to hold the stack pointer minus
11299 the required frame size. Since this is the very start of the
11300 function, the scratch register can be any caller-saved
11301 register which is not used for parameters. */
11302 offset = GEN_INT (- allocate);
11303 scratch_regno = split_stack_prologue_scratch_regno ();
11304 if (scratch_regno == INVALID_REGNUM)
11305 return;
11306 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11307 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11308 {
11309 /* We don't use ix86_gen_add3 in this case because it will
11310 want to split to lea, but when not optimizing the insn
11311 will not be split after this point. */
11312 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11313 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11314 offset)));
11315 }
11316 else
11317 {
11318 emit_move_insn (scratch_reg, offset);
11319 emit_insn (gen_adddi3 (scratch_reg, scratch_reg,
11320 stack_pointer_rtx));
11321 }
11322 current = scratch_reg;
11323 }
11324
11325 ix86_expand_branch (GEU, current, limit, label);
11326 jump_insn = get_last_insn ();
11327 JUMP_LABEL (jump_insn) = label;
11328
11329 /* Mark the jump as very likely to be taken. */
11330 add_reg_note (jump_insn, REG_BR_PROB,
11331 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11332
11333 if (split_stack_fn == NULL_RTX)
11334 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11335 fn = split_stack_fn;
11336
11337 /* Get more stack space. We pass in the desired stack space and the
11338 size of the arguments to copy to the new stack. In 32-bit mode
11339 we push the parameters; __morestack will return on a new stack
11340 anyhow. In 64-bit mode we pass the parameters in r10 and
11341 r11. */
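/* In the ordinary (non-large-model) 64-bit case this boils down to
   roughly:

       mov   $<frame size>, %r10
       mov   $<argument size>, %r11
       call  __morestack

   as emitted a little further below.  */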
11342 allocate_rtx = GEN_INT (allocate);
11343 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11344 call_fusage = NULL_RTX;
11345 if (TARGET_64BIT)
11346 {
11347 rtx reg10, reg11;
11348
11349 reg10 = gen_rtx_REG (Pmode, R10_REG);
11350 reg11 = gen_rtx_REG (Pmode, R11_REG);
11351
11352 /* If this function uses a static chain, it will be in %r10.
11353 Preserve it across the call to __morestack. */
11354 if (DECL_STATIC_CHAIN (cfun->decl))
11355 {
11356 rtx rax;
11357
11358 rax = gen_rtx_REG (Pmode, AX_REG);
11359 emit_move_insn (rax, reg10);
11360 use_reg (&call_fusage, rax);
11361 }
11362
11363 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11364 {
11365 HOST_WIDE_INT argval;
11366
11367 /* When using the large model we need to load the address
11368 into a register, and we've run out of registers. So we
11369 switch to a different calling convention, and we call a
11370 different function: __morestack_large_model. We pass the
11371 argument size in the upper 32 bits of r10 and pass the
11372 frame size in the lower 32 bits. */
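/* For example (illustrative values only): an argument size of 0x20 and a
   frame size of 0x1000 yield r10 == 0x0000002000001000.  */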
11373 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11374 gcc_assert ((args_size & 0xffffffff) == args_size);
11375
11376 if (split_stack_fn_large == NULL_RTX)
11377 split_stack_fn_large =
11378 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11379
11380 if (ix86_cmodel == CM_LARGE_PIC)
11381 {
11382 rtx label, x;
11383
11384 label = gen_label_rtx ();
11385 emit_label (label);
11386 LABEL_PRESERVE_P (label) = 1;
11387 emit_insn (gen_set_rip_rex64 (reg10, label));
11388 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11389 emit_insn (gen_adddi3 (reg10, reg10, reg11));
11390 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11391 UNSPEC_GOT);
11392 x = gen_rtx_CONST (Pmode, x);
11393 emit_move_insn (reg11, x);
11394 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11395 x = gen_const_mem (Pmode, x);
11396 emit_move_insn (reg11, x);
11397 }
11398 else
11399 emit_move_insn (reg11, split_stack_fn_large);
11400
11401 fn = reg11;
11402
11403 argval = ((args_size << 16) << 16) + allocate;
11404 emit_move_insn (reg10, GEN_INT (argval));
11405 }
11406 else
11407 {
11408 emit_move_insn (reg10, allocate_rtx);
11409 emit_move_insn (reg11, GEN_INT (args_size));
11410 use_reg (&call_fusage, reg11);
11411 }
11412
11413 use_reg (&call_fusage, reg10);
11414 }
11415 else
11416 {
11417 emit_insn (gen_push (GEN_INT (args_size)));
11418 emit_insn (gen_push (allocate_rtx));
11419 }
11420 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11421 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11422 NULL_RTX, false);
11423 add_function_usage_to (call_insn, call_fusage);
11424
11425 /* In order to make call/return prediction work right, we now need
11426 to execute a return instruction. See
11427 libgcc/config/i386/morestack.S for the details on how this works.
11428
11429 For flow purposes gcc must not see this as a return
11430 instruction--we need control flow to continue at the subsequent
11431 label. Therefore, we use an unspec. */
11432 gcc_assert (crtl->args.pops_args < 65536);
11433 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11434
11435 /* If we are in 64-bit mode and this function uses a static chain,
11436 we saved %r10 in %rax before calling __morestack. */
11437 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11438 emit_move_insn (gen_rtx_REG (Pmode, R10_REG),
11439 gen_rtx_REG (Pmode, AX_REG));
11440
11441 /* If this function calls va_start, we need to store a pointer to
11442 the arguments on the old stack, because they may not have been
11443 all copied to the new stack. At this point the old stack can be
11444 found at the frame pointer value used by __morestack, because
11445 __morestack has set that up before calling back to us. Here we
11446 store that pointer in a scratch register, and in
11447 ix86_expand_prologue we store the scratch register in a stack
11448 slot. */
11449 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11450 {
11451 unsigned int scratch_regno;
11452 rtx frame_reg;
11453 int words;
11454
11455 scratch_regno = split_stack_prologue_scratch_regno ();
11456 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11457 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11458
11459 /* 64-bit:
11460 fp -> old fp value
11461 return address within this function
11462 return address of caller of this function
11463 stack arguments
11464 So we add three words to get to the stack arguments.
11465
11466 32-bit:
11467 fp -> old fp value
11468 return address within this function
11469 first argument to __morestack
11470 second argument to __morestack
11471 return address of caller of this function
11472 stack arguments
11473 So we add five words to get to the stack arguments.
11474 */
11475 words = TARGET_64BIT ? 3 : 5;
11476 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11477 gen_rtx_PLUS (Pmode, frame_reg,
11478 GEN_INT (words * UNITS_PER_WORD))));
11479
11480 varargs_label = gen_label_rtx ();
11481 emit_jump_insn (gen_jump (varargs_label));
11482 JUMP_LABEL (get_last_insn ()) = varargs_label;
11483
11484 emit_barrier ();
11485 }
11486
11487 emit_label (label);
11488 LABEL_NUSES (label) = 1;
11489
11490 /* If this function calls va_start, we now have to set the scratch
11491 register for the case where we do not call __morestack. In this
11492 case we need to set it based on the stack pointer. */
11493 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11494 {
11495 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11496 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11497 GEN_INT (UNITS_PER_WORD))));
11498
11499 emit_label (varargs_label);
11500 LABEL_NUSES (varargs_label) = 1;
11501 }
11502 }
11503
11504 /* We may have to tell the dataflow pass that the split stack prologue
11505 is initializing a scratch register. */
11506
11507 static void
11508 ix86_live_on_entry (bitmap regs)
11509 {
11510 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11511 {
11512 gcc_assert (flag_split_stack);
11513 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11514 }
11515 }
11516 \f
11517 /* Determine if op is a suitable SUBREG RTX for an address. */
11518
11519 static bool
11520 ix86_address_subreg_operand (rtx op)
11521 {
11522 enum machine_mode mode;
11523
11524 if (!REG_P (op))
11525 return false;
11526
11527 mode = GET_MODE (op);
11528
11529 if (GET_MODE_CLASS (mode) != MODE_INT)
11530 return false;
11531
11532 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11533 failures when the register is one word out of a two word structure. */
11534 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11535 return false;
11536
11537 /* Allow only SUBREGs of non-eliminable hard registers. */
11538 return register_no_elim_operand (op, mode);
11539 }
11540
11541 /* Extract the parts of an RTL expression that is a valid memory address
11542 for an instruction. Return 0 if the structure of the address is
11543 grossly off. Return -1 if the address contains ASHIFT, so it is not
11544 strictly valid, but still used for computing length of lea instruction. */
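/* For instance (an illustrative sketch; the precise RTL spelling depends
   on the mode and on target flags), an address such as

     (plus:SI (plus:SI (mult:SI (reg:SI cx) (const_int 4)) (reg:SI bx))
              (const_int 8))

   decomposes into base = bx, index = cx, scale = 4 and disp = 8,
   i.e. the operand 8(%ebx,%ecx,4) in AT&T syntax.  */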
11545
11546 int
11547 ix86_decompose_address (rtx addr, struct ix86_address *out)
11548 {
11549 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11550 rtx base_reg, index_reg;
11551 HOST_WIDE_INT scale = 1;
11552 rtx scale_rtx = NULL_RTX;
11553 rtx tmp;
11554 int retval = 1;
11555 enum ix86_address_seg seg = SEG_DEFAULT;
11556
11557 /* Allow zero-extended SImode addresses;
11558 they will be emitted with the addr32 prefix. */
11559 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11560 {
11561 if (GET_CODE (addr) == ZERO_EXTEND
11562 && GET_MODE (XEXP (addr, 0)) == SImode)
11563 addr = XEXP (addr, 0);
11564 else if (GET_CODE (addr) == AND
11565 && const_32bit_mask (XEXP (addr, 1), DImode))
11566 {
11567 addr = XEXP (addr, 0);
11568
11569 /* Strip subreg. */
11570 if (GET_CODE (addr) == SUBREG
11571 && GET_MODE (SUBREG_REG (addr)) == SImode)
11572 addr = SUBREG_REG (addr);
11573 }
11574 }
11575
11576 if (REG_P (addr))
11577 base = addr;
11578 else if (GET_CODE (addr) == SUBREG)
11579 {
11580 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11581 base = addr;
11582 else
11583 return 0;
11584 }
11585 else if (GET_CODE (addr) == PLUS)
11586 {
11587 rtx addends[4], op;
11588 int n = 0, i;
11589
11590 op = addr;
11591 do
11592 {
11593 if (n >= 4)
11594 return 0;
11595 addends[n++] = XEXP (op, 1);
11596 op = XEXP (op, 0);
11597 }
11598 while (GET_CODE (op) == PLUS);
11599 if (n >= 4)
11600 return 0;
11601 addends[n] = op;
11602
11603 for (i = n; i >= 0; --i)
11604 {
11605 op = addends[i];
11606 switch (GET_CODE (op))
11607 {
11608 case MULT:
11609 if (index)
11610 return 0;
11611 index = XEXP (op, 0);
11612 scale_rtx = XEXP (op, 1);
11613 break;
11614
11615 case ASHIFT:
11616 if (index)
11617 return 0;
11618 index = XEXP (op, 0);
11619 tmp = XEXP (op, 1);
11620 if (!CONST_INT_P (tmp))
11621 return 0;
11622 scale = INTVAL (tmp);
11623 if ((unsigned HOST_WIDE_INT) scale > 3)
11624 return 0;
11625 scale = 1 << scale;
11626 break;
11627
11628 case UNSPEC:
11629 if (XINT (op, 1) == UNSPEC_TP
11630 && TARGET_TLS_DIRECT_SEG_REFS
11631 && seg == SEG_DEFAULT)
11632 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11633 else
11634 return 0;
11635 break;
11636
11637 case SUBREG:
11638 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11639 return 0;
11640 /* FALLTHRU */
11641
11642 case REG:
11643 if (!base)
11644 base = op;
11645 else if (!index)
11646 index = op;
11647 else
11648 return 0;
11649 break;
11650
11651 case CONST:
11652 case CONST_INT:
11653 case SYMBOL_REF:
11654 case LABEL_REF:
11655 if (disp)
11656 return 0;
11657 disp = op;
11658 break;
11659
11660 default:
11661 return 0;
11662 }
11663 }
11664 }
11665 else if (GET_CODE (addr) == MULT)
11666 {
11667 index = XEXP (addr, 0); /* index*scale */
11668 scale_rtx = XEXP (addr, 1);
11669 }
11670 else if (GET_CODE (addr) == ASHIFT)
11671 {
11672 /* We're called for lea too, which implements ashift on occasion. */
11673 index = XEXP (addr, 0);
11674 tmp = XEXP (addr, 1);
11675 if (!CONST_INT_P (tmp))
11676 return 0;
11677 scale = INTVAL (tmp);
11678 if ((unsigned HOST_WIDE_INT) scale > 3)
11679 return 0;
11680 scale = 1 << scale;
11681 retval = -1;
11682 }
11683 else
11684 disp = addr; /* displacement */
11685
11686 if (index)
11687 {
11688 if (REG_P (index))
11689 ;
11690 else if (GET_CODE (index) == SUBREG
11691 && ix86_address_subreg_operand (SUBREG_REG (index)))
11692 ;
11693 else
11694 return 0;
11695 }
11696
11697 /* Extract the integral value of scale. */
11698 if (scale_rtx)
11699 {
11700 if (!CONST_INT_P (scale_rtx))
11701 return 0;
11702 scale = INTVAL (scale_rtx);
11703 }
11704
11705 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11706 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11707
11708 /* Avoid useless 0 displacement. */
11709 if (disp == const0_rtx && (base || index))
11710 disp = NULL_RTX;
11711
11712 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11713 if (base_reg && index_reg && scale == 1
11714 && (index_reg == arg_pointer_rtx
11715 || index_reg == frame_pointer_rtx
11716 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11717 {
11718 rtx tmp;
11719 tmp = base, base = index, index = tmp;
11720 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11721 }
11722
11723 /* Special case: %ebp cannot be encoded as a base without a displacement.
11724 Similarly %r13. */
11725 if (!disp
11726 && base_reg
11727 && (base_reg == hard_frame_pointer_rtx
11728 || base_reg == frame_pointer_rtx
11729 || base_reg == arg_pointer_rtx
11730 || (REG_P (base_reg)
11731 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11732 || REGNO (base_reg) == R13_REG))))
11733 disp = const0_rtx;
11734
11735 /* Special case: on K6, [%esi] makes the instruction vector decoded.
11736 Avoid this by transforming to [%esi+0].
11737 Reload calls address legitimization without cfun defined, so we need
11738 to test cfun for being non-NULL. */
11739 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11740 && base_reg && !index_reg && !disp
11741 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11742 disp = const0_rtx;
11743
11744 /* Special case: encode reg+reg instead of reg*2. */
11745 if (!base && index && scale == 2)
11746 base = index, base_reg = index_reg, scale = 1;
11747
11748 /* Special case: scaling cannot be encoded without base or displacement. */
11749 if (!base && !disp && index && scale != 1)
11750 disp = const0_rtx;
11751
11752 out->base = base;
11753 out->index = index;
11754 out->disp = disp;
11755 out->scale = scale;
11756 out->seg = seg;
11757
11758 return retval;
11759 }
11760 \f
11761 /* Return cost of the memory address x.
11762 For i386, it is better to use a complex address than let gcc copy
11763 the address into a reg and make a new pseudo. But not if the address
11764 requires two regs - that would mean more pseudos with longer
11765 lifetimes. */
11766 static int
11767 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11768 {
11769 struct ix86_address parts;
11770 int cost = 1;
11771 int ok = ix86_decompose_address (x, &parts);
11772
11773 gcc_assert (ok);
11774
11775 if (parts.base && GET_CODE (parts.base) == SUBREG)
11776 parts.base = SUBREG_REG (parts.base);
11777 if (parts.index && GET_CODE (parts.index) == SUBREG)
11778 parts.index = SUBREG_REG (parts.index);
11779
11780 /* Attempt to minimize number of registers in the address. */
11781 if ((parts.base
11782 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11783 || (parts.index
11784 && (!REG_P (parts.index)
11785 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11786 cost++;
11787
11788 if (parts.base
11789 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11790 && parts.index
11791 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11792 && parts.base != parts.index)
11793 cost++;
11794
11795 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11796 since its predecode logic can't detect the length of instructions
11797 and they degenerate to vector decoding. Increase the cost of such
11798 addresses here. The penalty is at least 2 cycles. It may be worthwhile
11799 to split such addresses or even refuse such addresses at all.
11800
11801 Following addressing modes are affected:
11802 [base+scale*index]
11803 [scale*index+disp]
11804 [base+index]
11805
11806 The first and last case may be avoidable by explicitly coding the zero in
11807 the memory address, but I don't have an AMD-K6 machine handy to check this
11808 theory. */
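/* For instance, (%esi,%ecx,2) falls into the penalized encoding, whereas
   spelling it with an explicit zero displacement, 0(%esi,%ecx,2), would
   not - the "explicitly coding the zero" idea mentioned above.  */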
11809
11810 if (TARGET_K6
11811 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11812 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11813 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11814 cost += 10;
11815
11816 return cost;
11817 }
11818 \f
11819 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11820 this is used to form addresses to local data when -fPIC is in
11821 use. */
11822
11823 static bool
11824 darwin_local_data_pic (rtx disp)
11825 {
11826 return (GET_CODE (disp) == UNSPEC
11827 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11828 }
11829
11830 /* Determine if a given RTX is a valid constant. We already know this
11831 satisfies CONSTANT_P. */
11832
11833 static bool
11834 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11835 {
11836 switch (GET_CODE (x))
11837 {
11838 case CONST:
11839 x = XEXP (x, 0);
11840
11841 if (GET_CODE (x) == PLUS)
11842 {
11843 if (!CONST_INT_P (XEXP (x, 1)))
11844 return false;
11845 x = XEXP (x, 0);
11846 }
11847
11848 if (TARGET_MACHO && darwin_local_data_pic (x))
11849 return true;
11850
11851 /* Only some unspecs are valid as "constants". */
11852 if (GET_CODE (x) == UNSPEC)
11853 switch (XINT (x, 1))
11854 {
11855 case UNSPEC_GOT:
11856 case UNSPEC_GOTOFF:
11857 case UNSPEC_PLTOFF:
11858 return TARGET_64BIT;
11859 case UNSPEC_TPOFF:
11860 case UNSPEC_NTPOFF:
11861 x = XVECEXP (x, 0, 0);
11862 return (GET_CODE (x) == SYMBOL_REF
11863 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11864 case UNSPEC_DTPOFF:
11865 x = XVECEXP (x, 0, 0);
11866 return (GET_CODE (x) == SYMBOL_REF
11867 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11868 default:
11869 return false;
11870 }
11871
11872 /* We must have drilled down to a symbol. */
11873 if (GET_CODE (x) == LABEL_REF)
11874 return true;
11875 if (GET_CODE (x) != SYMBOL_REF)
11876 return false;
11877 /* FALLTHRU */
11878
11879 case SYMBOL_REF:
11880 /* TLS symbols are never valid. */
11881 if (SYMBOL_REF_TLS_MODEL (x))
11882 return false;
11883
11884 /* DLLIMPORT symbols are never valid. */
11885 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11886 && SYMBOL_REF_DLLIMPORT_P (x))
11887 return false;
11888
11889 #if TARGET_MACHO
11890 /* mdynamic-no-pic */
11891 if (MACHO_DYNAMIC_NO_PIC_P)
11892 return machopic_symbol_defined_p (x);
11893 #endif
11894 break;
11895
11896 case CONST_DOUBLE:
11897 if (GET_MODE (x) == TImode
11898 && x != CONST0_RTX (TImode)
11899 && !TARGET_64BIT)
11900 return false;
11901 break;
11902
11903 case CONST_VECTOR:
11904 if (!standard_sse_constant_p (x))
11905 return false;
11906
11907 default:
11908 break;
11909 }
11910
11911 /* Otherwise we handle everything else in the move patterns. */
11912 return true;
11913 }
11914
11915 /* Determine if it's legal to put X into the constant pool. This
11916 is not possible for the address of thread-local symbols, which
11917 is checked above. */
11918
11919 static bool
11920 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11921 {
11922 /* We can always put integral constants and vectors in memory. */
11923 switch (GET_CODE (x))
11924 {
11925 case CONST_INT:
11926 case CONST_DOUBLE:
11927 case CONST_VECTOR:
11928 return false;
11929
11930 default:
11931 break;
11932 }
11933 return !ix86_legitimate_constant_p (mode, x);
11934 }
11935
11936
11937 /* Nonzero if the constant value X is a legitimate general operand
11938 when generating PIC code. It is given that flag_pic is on and
11939 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11940
11941 bool
11942 legitimate_pic_operand_p (rtx x)
11943 {
11944 rtx inner;
11945
11946 switch (GET_CODE (x))
11947 {
11948 case CONST:
11949 inner = XEXP (x, 0);
11950 if (GET_CODE (inner) == PLUS
11951 && CONST_INT_P (XEXP (inner, 1)))
11952 inner = XEXP (inner, 0);
11953
11954 /* Only some unspecs are valid as "constants". */
11955 if (GET_CODE (inner) == UNSPEC)
11956 switch (XINT (inner, 1))
11957 {
11958 case UNSPEC_GOT:
11959 case UNSPEC_GOTOFF:
11960 case UNSPEC_PLTOFF:
11961 return TARGET_64BIT;
11962 case UNSPEC_TPOFF:
11963 x = XVECEXP (inner, 0, 0);
11964 return (GET_CODE (x) == SYMBOL_REF
11965 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11966 case UNSPEC_MACHOPIC_OFFSET:
11967 return legitimate_pic_address_disp_p (x);
11968 default:
11969 return false;
11970 }
11971 /* FALLTHRU */
11972
11973 case SYMBOL_REF:
11974 case LABEL_REF:
11975 return legitimate_pic_address_disp_p (x);
11976
11977 default:
11978 return true;
11979 }
11980 }
11981
11982 /* Determine if a given CONST RTX is a valid memory displacement
11983 in PIC mode. */
11984
11985 bool
11986 legitimate_pic_address_disp_p (rtx disp)
11987 {
11988 bool saw_plus;
11989
11990 /* In 64bit mode we can allow direct addresses of symbols and labels
11991 when they are not dynamic symbols. */
11992 if (TARGET_64BIT)
11993 {
11994 rtx op0 = disp, op1;
11995
11996 switch (GET_CODE (disp))
11997 {
11998 case LABEL_REF:
11999 return true;
12000
12001 case CONST:
12002 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12003 break;
12004 op0 = XEXP (XEXP (disp, 0), 0);
12005 op1 = XEXP (XEXP (disp, 0), 1);
12006 if (!CONST_INT_P (op1)
12007 || INTVAL (op1) >= 16*1024*1024
12008 || INTVAL (op1) < -16*1024*1024)
12009 break;
12010 if (GET_CODE (op0) == LABEL_REF)
12011 return true;
12012 if (GET_CODE (op0) != SYMBOL_REF)
12013 break;
12014 /* FALLTHRU */
12015
12016 case SYMBOL_REF:
12017 /* TLS references should always be enclosed in UNSPEC. */
12018 if (SYMBOL_REF_TLS_MODEL (op0))
12019 return false;
12020 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12021 && ix86_cmodel != CM_LARGE_PIC)
12022 return true;
12023 break;
12024
12025 default:
12026 break;
12027 }
12028 }
12029 if (GET_CODE (disp) != CONST)
12030 return false;
12031 disp = XEXP (disp, 0);
12032
12033 if (TARGET_64BIT)
12034 {
12035 /* It is not safe to allow PLUS expressions here, because of the limited
12036 allowed distance of GOT tables. We should not need these anyway. */
12037 if (GET_CODE (disp) != UNSPEC
12038 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12039 && XINT (disp, 1) != UNSPEC_GOTOFF
12040 && XINT (disp, 1) != UNSPEC_PCREL
12041 && XINT (disp, 1) != UNSPEC_PLTOFF))
12042 return false;
12043
12044 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12045 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12046 return false;
12047 return true;
12048 }
12049
12050 saw_plus = false;
12051 if (GET_CODE (disp) == PLUS)
12052 {
12053 if (!CONST_INT_P (XEXP (disp, 1)))
12054 return false;
12055 disp = XEXP (disp, 0);
12056 saw_plus = true;
12057 }
12058
12059 if (TARGET_MACHO && darwin_local_data_pic (disp))
12060 return true;
12061
12062 if (GET_CODE (disp) != UNSPEC)
12063 return false;
12064
12065 switch (XINT (disp, 1))
12066 {
12067 case UNSPEC_GOT:
12068 if (saw_plus)
12069 return false;
12070 /* We need to check for both symbols and labels because VxWorks loads
12071 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12072 details. */
12073 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12074 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12075 case UNSPEC_GOTOFF:
12076 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12077 While the ABI also specifies a 32bit relocation, we don't produce it
12078 in the small PIC model at all.  */
12079 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12080 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12081 && !TARGET_64BIT)
12082 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12083 return false;
12084 case UNSPEC_GOTTPOFF:
12085 case UNSPEC_GOTNTPOFF:
12086 case UNSPEC_INDNTPOFF:
12087 if (saw_plus)
12088 return false;
12089 disp = XVECEXP (disp, 0, 0);
12090 return (GET_CODE (disp) == SYMBOL_REF
12091 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12092 case UNSPEC_NTPOFF:
12093 disp = XVECEXP (disp, 0, 0);
12094 return (GET_CODE (disp) == SYMBOL_REF
12095 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12096 case UNSPEC_DTPOFF:
12097 disp = XVECEXP (disp, 0, 0);
12098 return (GET_CODE (disp) == SYMBOL_REF
12099 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12100 }
12101
12102 return false;
12103 }
12104
12105 /* Recognizes RTL expressions that are valid memory addresses for an
12106 instruction. The MODE argument is the machine mode for the MEM
12107 expression that wants to use this address.
12108
12109 It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS should
12110 convert common non-canonical forms to canonical form so that they will
12111 be recognized. */
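/* As an illustration, the address 12(%ebx,%ecx,4) -- base %ebx, index
   %ecx, scale 4, displacement 12 -- is expected here in the canonical
   form

     (plus (plus (mult (reg) (const_int 4)) (reg)) (const_int 12))

   with the MULT first; the only valid scale factors are 1, 2, 4 and 8
   (illustrative example only).  */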
12112
12113 static bool
12114 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12115 rtx addr, bool strict)
12116 {
12117 struct ix86_address parts;
12118 rtx base, index, disp;
12119 HOST_WIDE_INT scale;
12120
12121 if (ix86_decompose_address (addr, &parts) <= 0)
12122 /* Decomposition failed. */
12123 return false;
12124
12125 base = parts.base;
12126 index = parts.index;
12127 disp = parts.disp;
12128 scale = parts.scale;
12129
12130 /* Validate base register. */
12131 if (base)
12132 {
12133 rtx reg;
12134
12135 if (REG_P (base))
12136 reg = base;
12137 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12138 reg = SUBREG_REG (base);
12139 else
12140 /* Base is not a register. */
12141 return false;
12142
12143 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12144 return false;
12145
12146 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12147 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12148 /* Base is not valid. */
12149 return false;
12150 }
12151
12152 /* Validate index register. */
12153 if (index)
12154 {
12155 rtx reg;
12156
12157 if (REG_P (index))
12158 reg = index;
12159 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12160 reg = SUBREG_REG (index);
12161 else
12162 /* Index is not a register. */
12163 return false;
12164
12165 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12166 return false;
12167
12168 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12169 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12170 /* Index is not valid. */
12171 return false;
12172 }
12173
12174 /* Index and base should have the same mode. */
12175 if (base && index
12176 && GET_MODE (base) != GET_MODE (index))
12177 return false;
12178
12179 /* Validate scale factor. */
12180 if (scale != 1)
12181 {
12182 if (!index)
12183 /* Scale without index. */
12184 return false;
12185
12186 if (scale != 2 && scale != 4 && scale != 8)
12187 /* Scale is not a valid multiplier. */
12188 return false;
12189 }
12190
12191 /* Validate displacement. */
12192 if (disp)
12193 {
12194 if (GET_CODE (disp) == CONST
12195 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12196 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12197 switch (XINT (XEXP (disp, 0), 1))
12198 {
12199 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12200 used.  While the ABI also specifies 32bit relocations, we don't produce
12201 them at all and use IP-relative addressing instead.  */
12202 case UNSPEC_GOT:
12203 case UNSPEC_GOTOFF:
12204 gcc_assert (flag_pic);
12205 if (!TARGET_64BIT)
12206 goto is_legitimate_pic;
12207
12208 /* 64bit address unspec. */
12209 return false;
12210
12211 case UNSPEC_GOTPCREL:
12212 case UNSPEC_PCREL:
12213 gcc_assert (flag_pic);
12214 goto is_legitimate_pic;
12215
12216 case UNSPEC_GOTTPOFF:
12217 case UNSPEC_GOTNTPOFF:
12218 case UNSPEC_INDNTPOFF:
12219 case UNSPEC_NTPOFF:
12220 case UNSPEC_DTPOFF:
12221 break;
12222
12223 case UNSPEC_STACK_CHECK:
12224 gcc_assert (flag_split_stack);
12225 break;
12226
12227 default:
12228 /* Invalid address unspec. */
12229 return false;
12230 }
12231
12232 else if (SYMBOLIC_CONST (disp)
12233 && (flag_pic
12234 || (TARGET_MACHO
12235 #if TARGET_MACHO
12236 && MACHOPIC_INDIRECT
12237 && !machopic_operand_p (disp)
12238 #endif
12239 )))
12240 {
12241
12242 is_legitimate_pic:
12243 if (TARGET_64BIT && (index || base))
12244 {
12245 /* foo@dtpoff(%rX) is ok. */
12246 if (GET_CODE (disp) != CONST
12247 || GET_CODE (XEXP (disp, 0)) != PLUS
12248 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12249 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12250 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12251 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12252 /* Non-constant pic memory reference. */
12253 return false;
12254 }
12255 else if ((!TARGET_MACHO || flag_pic)
12256 && ! legitimate_pic_address_disp_p (disp))
12257 /* Displacement is an invalid pic construct. */
12258 return false;
12259 #if TARGET_MACHO
12260 else if (MACHO_DYNAMIC_NO_PIC_P
12261 && !ix86_legitimate_constant_p (Pmode, disp))
12262 /* Displacement must be referenced via non_lazy_pointer.  */
12263 return false;
12264 #endif
12265
12266 /* This code used to verify that a symbolic pic displacement
12267 includes the pic_offset_table_rtx register.
12268
12269 While this is a good idea, unfortunately these constructs may be
12270 created by the "adds using lea" optimization for incorrect code like:
12271 code like:
12272
12273 int a;
12274 int foo(int i)
12275 {
12276 return *(&a+i);
12277 }
12278
12279 This code is nonsensical, but results in addressing the
12280 GOT table with the pic_offset_table_rtx base.  We can't
12281 just refuse it easily, since it gets matched by the
12282 "addsi3" pattern, which later gets split to lea in case
12283 the output register differs from the input.  While this
12284 could be handled by a separate addsi pattern for this case
12285 that never results in lea, disabling this test seems to be
12286 the easier and correct fix for the crash.  */
12287 }
12288 else if (GET_CODE (disp) != LABEL_REF
12289 && !CONST_INT_P (disp)
12290 && (GET_CODE (disp) != CONST
12291 || !ix86_legitimate_constant_p (Pmode, disp))
12292 && (GET_CODE (disp) != SYMBOL_REF
12293 || !ix86_legitimate_constant_p (Pmode, disp)))
12294 /* Displacement is not constant. */
12295 return false;
12296 else if (TARGET_64BIT
12297 && !x86_64_immediate_operand (disp, VOIDmode))
12298 /* Displacement is out of range. */
12299 return false;
12300 }
12301
12302 /* Everything looks valid. */
12303 return true;
12304 }
12305
12306 /* Determine if a given RTX is a valid constant address. */
12307
12308 bool
12309 constant_address_p (rtx x)
12310 {
12311 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12312 }
12313 \f
12314 /* Return a unique alias set for the GOT. */
12315
12316 static alias_set_type
12317 ix86_GOT_alias_set (void)
12318 {
12319 static alias_set_type set = -1;
12320 if (set == -1)
12321 set = new_alias_set ();
12322 return set;
12323 }
12324
12325 /* Return a legitimate reference for ORIG (an address) using the
12326 register REG. If REG is 0, a new pseudo is generated.
12327
12328 There are two types of references that must be handled:
12329
12330 1. Global data references must load the address from the GOT, via
12331 the PIC reg. An insn is emitted to do this load, and the reg is
12332 returned.
12333
12334 2. Static data references, constant pool addresses, and code labels
12335 compute the address as an offset from the GOT, whose base is in
12336 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12337 differentiate them from global data objects. The returned
12338 address is the PIC reg + an unspec constant.
12339
12340 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12341 reg also appears in the address. */
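/* For example, with 32bit ELF PIC and the GOT pointer in %ebx, a global
   object is typically loaded through the GOT,

     movl foo@GOT(%ebx), %eax
     movl (%eax), %eax

   while a local (SYMBOL_FLAG_LOCAL) object is addressed as an offset
   from the GOT base,

     movl bar@GOTOFF(%ebx), %eax

   These sequences are illustrative only; the code actually emitted
   depends on the target and options.  */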
12342
12343 static rtx
12344 legitimize_pic_address (rtx orig, rtx reg)
12345 {
12346 rtx addr = orig;
12347 rtx new_rtx = orig;
12348 rtx base;
12349
12350 #if TARGET_MACHO
12351 if (TARGET_MACHO && !TARGET_64BIT)
12352 {
12353 if (reg == 0)
12354 reg = gen_reg_rtx (Pmode);
12355 /* Use the generic Mach-O PIC machinery. */
12356 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12357 }
12358 #endif
12359
12360 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12361 new_rtx = addr;
12362 else if (TARGET_64BIT
12363 && ix86_cmodel != CM_SMALL_PIC
12364 && gotoff_operand (addr, Pmode))
12365 {
12366 rtx tmpreg;
12367 /* This symbol may be referenced via a displacement from the PIC
12368 base address (@GOTOFF). */
12369
12370 if (reload_in_progress)
12371 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12372 if (GET_CODE (addr) == CONST)
12373 addr = XEXP (addr, 0);
12374 if (GET_CODE (addr) == PLUS)
12375 {
12376 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12377 UNSPEC_GOTOFF);
12378 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12379 }
12380 else
12381 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12382 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12383 if (!reg)
12384 tmpreg = gen_reg_rtx (Pmode);
12385 else
12386 tmpreg = reg;
12387 emit_move_insn (tmpreg, new_rtx);
12388
12389 if (reg != 0)
12390 {
12391 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12392 tmpreg, 1, OPTAB_DIRECT);
12393 new_rtx = reg;
12394 }
12395 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12396 }
12397 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12398 {
12399 /* This symbol may be referenced via a displacement from the PIC
12400 base address (@GOTOFF). */
12401
12402 if (reload_in_progress)
12403 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12404 if (GET_CODE (addr) == CONST)
12405 addr = XEXP (addr, 0);
12406 if (GET_CODE (addr) == PLUS)
12407 {
12408 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12409 UNSPEC_GOTOFF);
12410 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12411 }
12412 else
12413 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12414 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12415 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12416
12417 if (reg != 0)
12418 {
12419 emit_move_insn (reg, new_rtx);
12420 new_rtx = reg;
12421 }
12422 }
12423 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12424 /* We can't use @GOTOFF for text labels on VxWorks;
12425 see gotoff_operand. */
12426 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12427 {
12428 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12429 {
12430 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12431 return legitimize_dllimport_symbol (addr, true);
12432 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12433 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12434 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12435 {
12436 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12437 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12438 }
12439 }
12440
12441 /* For x64 PE-COFF there is no GOT table, so we use the address
12442 directly.  */
12443 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12444 {
12445 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12446 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12447
12448 if (reg == 0)
12449 reg = gen_reg_rtx (Pmode);
12450 emit_move_insn (reg, new_rtx);
12451 new_rtx = reg;
12452 }
12453 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12454 {
12455 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12456 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12457 new_rtx = gen_const_mem (Pmode, new_rtx);
12458 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12459
12460 if (reg == 0)
12461 reg = gen_reg_rtx (Pmode);
12462 /* Use gen_movsi directly, otherwise the address is loaded into a
12463 register for CSE.  We don't want to CSE these addresses; instead
12464 we CSE addresses from the GOT table, so skip this.  */
12465 emit_insn (gen_movsi (reg, new_rtx));
12466 new_rtx = reg;
12467 }
12468 else
12469 {
12470 /* This symbol must be referenced via a load from the
12471 Global Offset Table (@GOT). */
12472
12473 if (reload_in_progress)
12474 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12475 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12476 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12477 if (TARGET_64BIT)
12478 new_rtx = force_reg (Pmode, new_rtx);
12479 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12480 new_rtx = gen_const_mem (Pmode, new_rtx);
12481 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12482
12483 if (reg == 0)
12484 reg = gen_reg_rtx (Pmode);
12485 emit_move_insn (reg, new_rtx);
12486 new_rtx = reg;
12487 }
12488 }
12489 else
12490 {
12491 if (CONST_INT_P (addr)
12492 && !x86_64_immediate_operand (addr, VOIDmode))
12493 {
12494 if (reg)
12495 {
12496 emit_move_insn (reg, addr);
12497 new_rtx = reg;
12498 }
12499 else
12500 new_rtx = force_reg (Pmode, addr);
12501 }
12502 else if (GET_CODE (addr) == CONST)
12503 {
12504 addr = XEXP (addr, 0);
12505
12506 /* We must match stuff we generate before. Assume the only
12507 unspecs that can get here are ours. Not that we could do
12508 anything with them anyway.... */
12509 if (GET_CODE (addr) == UNSPEC
12510 || (GET_CODE (addr) == PLUS
12511 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12512 return orig;
12513 gcc_assert (GET_CODE (addr) == PLUS);
12514 }
12515 if (GET_CODE (addr) == PLUS)
12516 {
12517 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12518
12519 /* Check first to see if this is a constant offset from a @GOTOFF
12520 symbol reference. */
12521 if (gotoff_operand (op0, Pmode)
12522 && CONST_INT_P (op1))
12523 {
12524 if (!TARGET_64BIT)
12525 {
12526 if (reload_in_progress)
12527 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12528 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12529 UNSPEC_GOTOFF);
12530 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12531 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12532 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12533
12534 if (reg != 0)
12535 {
12536 emit_move_insn (reg, new_rtx);
12537 new_rtx = reg;
12538 }
12539 }
12540 else
12541 {
12542 if (INTVAL (op1) < -16*1024*1024
12543 || INTVAL (op1) >= 16*1024*1024)
12544 {
12545 if (!x86_64_immediate_operand (op1, Pmode))
12546 op1 = force_reg (Pmode, op1);
12547 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12548 }
12549 }
12550 }
12551 else
12552 {
12553 base = legitimize_pic_address (XEXP (addr, 0), reg);
12554 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12555 base == reg ? NULL_RTX : reg);
12556
12557 if (CONST_INT_P (new_rtx))
12558 new_rtx = plus_constant (base, INTVAL (new_rtx));
12559 else
12560 {
12561 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12562 {
12563 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12564 new_rtx = XEXP (new_rtx, 1);
12565 }
12566 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12567 }
12568 }
12569 }
12570 }
12571 return new_rtx;
12572 }
12573 \f
12574 /* Load the thread pointer. If TO_REG is true, force it into a register. */
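/* On GNU/Linux targets the UNSPEC_TP expression ultimately expands to a
   load of the TLS segment base, i.e. %fs:0 in 64bit code and %gs:0 in
   32bit code (illustrative; the exact form is target dependent).  */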
12575
12576 static rtx
12577 get_thread_pointer (bool to_reg)
12578 {
12579 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12580
12581 if (GET_MODE (tp) != Pmode)
12582 tp = convert_to_mode (Pmode, tp, 1);
12583
12584 if (to_reg)
12585 tp = copy_addr_to_reg (tp);
12586
12587 return tp;
12588 }
12589
12590 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12591
12592 static GTY(()) rtx ix86_tls_symbol;
12593
12594 static rtx
12595 ix86_tls_get_addr (void)
12596 {
12597 if (!ix86_tls_symbol)
12598 {
12599 const char *sym
12600 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12601 ? "___tls_get_addr" : "__tls_get_addr");
12602
12603 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12604 }
12605
12606 return ix86_tls_symbol;
12607 }
12608
12609 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12610
12611 static GTY(()) rtx ix86_tls_module_base_symbol;
12612
12613 rtx
12614 ix86_tls_module_base (void)
12615 {
12616 if (!ix86_tls_module_base_symbol)
12617 {
12618 ix86_tls_module_base_symbol
12619 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12620
12621 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12622 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12623 }
12624
12625 return ix86_tls_module_base_symbol;
12626 }
12627
12628 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12629 false if we expect this to be used for a memory address and true if
12630 we expect to load the address into a register. */
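/* As a rough illustration of the TLS models handled below, on 64bit
   GNU/Linux an access to the thread-local variable X may expand to

     initial-exec:  movq  x@gottpoff(%rip), %rax
                    movl  %fs:(%rax), %eax

     local-exec:    movl  %fs:x@tpoff, %eax

   while the global-dynamic and local-dynamic models go through a call
   to __tls_get_addr (or through TLS descriptors when TARGET_GNU2_TLS).
   Illustrative sequences only.  */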
12631
12632 static rtx
12633 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12634 {
12635 rtx dest, base, off;
12636 rtx pic = NULL_RTX, tp = NULL_RTX;
12637 int type;
12638
12639 switch (model)
12640 {
12641 case TLS_MODEL_GLOBAL_DYNAMIC:
12642 dest = gen_reg_rtx (Pmode);
12643
12644 if (!TARGET_64BIT)
12645 {
12646 if (flag_pic)
12647 pic = pic_offset_table_rtx;
12648 else
12649 {
12650 pic = gen_reg_rtx (Pmode);
12651 emit_insn (gen_set_got (pic));
12652 }
12653 }
12654
12655 if (TARGET_GNU2_TLS)
12656 {
12657 if (TARGET_64BIT)
12658 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12659 else
12660 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12661
12662 tp = get_thread_pointer (true);
12663 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12664
12665 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12666 }
12667 else
12668 {
12669 rtx caddr = ix86_tls_get_addr ();
12670
12671 if (TARGET_64BIT)
12672 {
12673 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12674
12675 start_sequence ();
12676 emit_call_insn (gen_tls_global_dynamic_64 (rax, x, caddr));
12677 insns = get_insns ();
12678 end_sequence ();
12679
12680 RTL_CONST_CALL_P (insns) = 1;
12681 emit_libcall_block (insns, dest, rax, x);
12682 }
12683 else
12684 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12685 }
12686 break;
12687
12688 case TLS_MODEL_LOCAL_DYNAMIC:
12689 base = gen_reg_rtx (Pmode);
12690
12691 if (!TARGET_64BIT)
12692 {
12693 if (flag_pic)
12694 pic = pic_offset_table_rtx;
12695 else
12696 {
12697 pic = gen_reg_rtx (Pmode);
12698 emit_insn (gen_set_got (pic));
12699 }
12700 }
12701
12702 if (TARGET_GNU2_TLS)
12703 {
12704 rtx tmp = ix86_tls_module_base ();
12705
12706 if (TARGET_64BIT)
12707 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12708 else
12709 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12710
12711 tp = get_thread_pointer (true);
12712 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12713 gen_rtx_MINUS (Pmode, tmp, tp));
12714 }
12715 else
12716 {
12717 rtx caddr = ix86_tls_get_addr ();
12718
12719 if (TARGET_64BIT)
12720 {
12721 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12722
12723 start_sequence ();
12724 emit_call_insn (gen_tls_local_dynamic_base_64 (rax, caddr));
12725 insns = get_insns ();
12726 end_sequence ();
12727
12728 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12729 share the LD_BASE result with other LD model accesses. */
12730 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12731 UNSPEC_TLS_LD_BASE);
12732
12733 RTL_CONST_CALL_P (insns) = 1;
12734 emit_libcall_block (insns, base, rax, eqv);
12735 }
12736 else
12737 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12738 }
12739
12740 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12741 off = gen_rtx_CONST (Pmode, off);
12742
12743 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12744
12745 if (TARGET_GNU2_TLS)
12746 {
12747 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12748
12749 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12750 }
12751 break;
12752
12753 case TLS_MODEL_INITIAL_EXEC:
12754 if (TARGET_64BIT)
12755 {
12756 if (TARGET_SUN_TLS)
12757 {
12758 /* The Sun linker took the AMD64 TLS spec literally
12759 and can only handle %rax as the destination of the
12760 initial-exec code sequence.  */
12761
12762 dest = gen_reg_rtx (Pmode);
12763 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12764 return dest;
12765 }
12766
12767 pic = NULL;
12768 type = UNSPEC_GOTNTPOFF;
12769 }
12770 else if (flag_pic)
12771 {
12772 if (reload_in_progress)
12773 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12774 pic = pic_offset_table_rtx;
12775 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12776 }
12777 else if (!TARGET_ANY_GNU_TLS)
12778 {
12779 pic = gen_reg_rtx (Pmode);
12780 emit_insn (gen_set_got (pic));
12781 type = UNSPEC_GOTTPOFF;
12782 }
12783 else
12784 {
12785 pic = NULL;
12786 type = UNSPEC_INDNTPOFF;
12787 }
12788
12789 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
12790 off = gen_rtx_CONST (Pmode, off);
12791 if (pic)
12792 off = gen_rtx_PLUS (Pmode, pic, off);
12793 off = gen_const_mem (Pmode, off);
12794 set_mem_alias_set (off, ix86_GOT_alias_set ());
12795
12796 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12797 {
12798 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12799 off = force_reg (Pmode, off);
12800 return gen_rtx_PLUS (Pmode, base, off);
12801 }
12802 else
12803 {
12804 base = get_thread_pointer (true);
12805 dest = gen_reg_rtx (Pmode);
12806 emit_insn (gen_subsi3 (dest, base, off));
12807 }
12808 break;
12809
12810 case TLS_MODEL_LOCAL_EXEC:
12811 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12812 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12813 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12814 off = gen_rtx_CONST (Pmode, off);
12815
12816 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12817 {
12818 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12819 return gen_rtx_PLUS (Pmode, base, off);
12820 }
12821 else
12822 {
12823 base = get_thread_pointer (true);
12824 dest = gen_reg_rtx (Pmode);
12825 emit_insn (gen_subsi3 (dest, base, off));
12826 }
12827 break;
12828
12829 default:
12830 gcc_unreachable ();
12831 }
12832
12833 return dest;
12834 }
12835
12836 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12837 to symbol DECL. */
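/* For a dllimported entity FOO the import library provides a pointer
   named __imp_foo (or __imp__foo when user labels carry a leading
   underscore), and references go through that pointer, e.g.

     call  *__imp_foo

   for an indirect call (illustrative example only).  */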
12838
12839 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12840 htab_t dllimport_map;
12841
12842 static tree
12843 get_dllimport_decl (tree decl)
12844 {
12845 struct tree_map *h, in;
12846 void **loc;
12847 const char *name;
12848 const char *prefix;
12849 size_t namelen, prefixlen;
12850 char *imp_name;
12851 tree to;
12852 rtx rtl;
12853
12854 if (!dllimport_map)
12855 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12856
12857 in.hash = htab_hash_pointer (decl);
12858 in.base.from = decl;
12859 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12860 h = (struct tree_map *) *loc;
12861 if (h)
12862 return h->to;
12863
12864 *loc = h = ggc_alloc_tree_map ();
12865 h->hash = in.hash;
12866 h->base.from = decl;
12867 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12868 VAR_DECL, NULL, ptr_type_node);
12869 DECL_ARTIFICIAL (to) = 1;
12870 DECL_IGNORED_P (to) = 1;
12871 DECL_EXTERNAL (to) = 1;
12872 TREE_READONLY (to) = 1;
12873
12874 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12875 name = targetm.strip_name_encoding (name);
12876 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12877 ? "*__imp_" : "*__imp__";
12878 namelen = strlen (name);
12879 prefixlen = strlen (prefix);
12880 imp_name = (char *) alloca (namelen + prefixlen + 1);
12881 memcpy (imp_name, prefix, prefixlen);
12882 memcpy (imp_name + prefixlen, name, namelen + 1);
12883
12884 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12885 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12886 SET_SYMBOL_REF_DECL (rtl, to);
12887 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12888
12889 rtl = gen_const_mem (Pmode, rtl);
12890 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12891
12892 SET_DECL_RTL (to, rtl);
12893 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12894
12895 return to;
12896 }
12897
12898 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12899 true if we require the result be a register. */
12900
12901 static rtx
12902 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12903 {
12904 tree imp_decl;
12905 rtx x;
12906
12907 gcc_assert (SYMBOL_REF_DECL (symbol));
12908 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12909
12910 x = DECL_RTL (imp_decl);
12911 if (want_reg)
12912 x = force_reg (Pmode, x);
12913 return x;
12914 }
12915
12916 /* Try machine-dependent ways of modifying an illegitimate address
12917 to be legitimate. If we find one, return the new, valid address.
12918 This macro is used in only one place: `memory_address' in explow.c.
12919
12920 OLDX is the address as it was before break_out_memory_refs was called.
12921 In some cases it is useful to look at this to decide what needs to be done.
12922
12923 It is always safe for this macro to do nothing. It exists to recognize
12924 opportunities to optimize the output.
12925
12926 For the 80386, we handle X+REG by loading X into a register R and
12927 using R+REG. R will go in a general reg and indexing will be used.
12928 However, if REG is a broken-out memory address or multiplication,
12929 nothing needs to be done because REG can certainly go in a general reg.
12930
12931 When -fpic is used, special handling is needed for symbolic references.
12932 See comments by legitimize_pic_address in i386.c for details. */
12933
12934 static rtx
12935 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12936 enum machine_mode mode)
12937 {
12938 int changed = 0;
12939 unsigned log;
12940
12941 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12942 if (log)
12943 return legitimize_tls_address (x, (enum tls_model) log, false);
12944 if (GET_CODE (x) == CONST
12945 && GET_CODE (XEXP (x, 0)) == PLUS
12946 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12947 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12948 {
12949 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12950 (enum tls_model) log, false);
12951 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12952 }
12953
12954 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12955 {
12956 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12957 return legitimize_dllimport_symbol (x, true);
12958 if (GET_CODE (x) == CONST
12959 && GET_CODE (XEXP (x, 0)) == PLUS
12960 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12961 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12962 {
12963 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12964 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12965 }
12966 }
12967
12968 if (flag_pic && SYMBOLIC_CONST (x))
12969 return legitimize_pic_address (x, 0);
12970
12971 #if TARGET_MACHO
12972 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12973 return machopic_indirect_data_reference (x, 0);
12974 #endif
12975
12976 /* Canonicalize shifts by 0, 1, 2, 3 into multiply.  */
12977 if (GET_CODE (x) == ASHIFT
12978 && CONST_INT_P (XEXP (x, 1))
12979 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
12980 {
12981 changed = 1;
12982 log = INTVAL (XEXP (x, 1));
12983 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
12984 GEN_INT (1 << log));
12985 }
12986
12987 if (GET_CODE (x) == PLUS)
12988 {
12989 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12990
12991 if (GET_CODE (XEXP (x, 0)) == ASHIFT
12992 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12993 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
12994 {
12995 changed = 1;
12996 log = INTVAL (XEXP (XEXP (x, 0), 1));
12997 XEXP (x, 0) = gen_rtx_MULT (Pmode,
12998 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
12999 GEN_INT (1 << log));
13000 }
13001
13002 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13003 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13004 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13005 {
13006 changed = 1;
13007 log = INTVAL (XEXP (XEXP (x, 1), 1));
13008 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13009 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13010 GEN_INT (1 << log));
13011 }
13012
13013 /* Put multiply first if it isn't already. */
13014 if (GET_CODE (XEXP (x, 1)) == MULT)
13015 {
13016 rtx tmp = XEXP (x, 0);
13017 XEXP (x, 0) = XEXP (x, 1);
13018 XEXP (x, 1) = tmp;
13019 changed = 1;
13020 }
13021
13022 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13023 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13024 created by virtual register instantiation, register elimination, and
13025 similar optimizations. */
13026 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13027 {
13028 changed = 1;
13029 x = gen_rtx_PLUS (Pmode,
13030 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13031 XEXP (XEXP (x, 1), 0)),
13032 XEXP (XEXP (x, 1), 1));
13033 }
13034
13035 /* Canonicalize
13036 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13037 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13038 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13039 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13040 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13041 && CONSTANT_P (XEXP (x, 1)))
13042 {
13043 rtx constant;
13044 rtx other = NULL_RTX;
13045
13046 if (CONST_INT_P (XEXP (x, 1)))
13047 {
13048 constant = XEXP (x, 1);
13049 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13050 }
13051 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13052 {
13053 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13054 other = XEXP (x, 1);
13055 }
13056 else
13057 constant = 0;
13058
13059 if (constant)
13060 {
13061 changed = 1;
13062 x = gen_rtx_PLUS (Pmode,
13063 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13064 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13065 plus_constant (other, INTVAL (constant)));
13066 }
13067 }
13068
13069 if (changed && ix86_legitimate_address_p (mode, x, false))
13070 return x;
13071
13072 if (GET_CODE (XEXP (x, 0)) == MULT)
13073 {
13074 changed = 1;
13075 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13076 }
13077
13078 if (GET_CODE (XEXP (x, 1)) == MULT)
13079 {
13080 changed = 1;
13081 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13082 }
13083
13084 if (changed
13085 && REG_P (XEXP (x, 1))
13086 && REG_P (XEXP (x, 0)))
13087 return x;
13088
13089 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13090 {
13091 changed = 1;
13092 x = legitimize_pic_address (x, 0);
13093 }
13094
13095 if (changed && ix86_legitimate_address_p (mode, x, false))
13096 return x;
13097
13098 if (REG_P (XEXP (x, 0)))
13099 {
13100 rtx temp = gen_reg_rtx (Pmode);
13101 rtx val = force_operand (XEXP (x, 1), temp);
13102 if (val != temp)
13103 {
13104 if (GET_MODE (val) != Pmode)
13105 val = convert_to_mode (Pmode, val, 1);
13106 emit_move_insn (temp, val);
13107 }
13108
13109 XEXP (x, 1) = temp;
13110 return x;
13111 }
13112
13113 else if (REG_P (XEXP (x, 1)))
13114 {
13115 rtx temp = gen_reg_rtx (Pmode);
13116 rtx val = force_operand (XEXP (x, 0), temp);
13117 if (val != temp)
13118 {
13119 if (GET_MODE (val) != Pmode)
13120 val = convert_to_mode (Pmode, val, 1);
13121 emit_move_insn (temp, val);
13122 }
13123
13124 XEXP (x, 0) = temp;
13125 return x;
13126 }
13127 }
13128
13129 return x;
13130 }
13131 \f
13132 /* Print an integer constant expression in assembler syntax. Addition
13133 and subtraction are the only arithmetic that may appear in these
13134 expressions. FILE is the stdio stream to write to, X is the rtx, and
13135 CODE is the operand print code from the output string. */
13136
13137 static void
13138 output_pic_addr_const (FILE *file, rtx x, int code)
13139 {
13140 char buf[256];
13141
13142 switch (GET_CODE (x))
13143 {
13144 case PC:
13145 gcc_assert (flag_pic);
13146 putc ('.', file);
13147 break;
13148
13149 case SYMBOL_REF:
13150 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13151 output_addr_const (file, x);
13152 else
13153 {
13154 const char *name = XSTR (x, 0);
13155
13156 /* Mark the decl as referenced so that cgraph will
13157 output the function. */
13158 if (SYMBOL_REF_DECL (x))
13159 mark_decl_referenced (SYMBOL_REF_DECL (x));
13160
13161 #if TARGET_MACHO
13162 if (MACHOPIC_INDIRECT
13163 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13164 name = machopic_indirection_name (x, /*stub_p=*/true);
13165 #endif
13166 assemble_name (file, name);
13167 }
13168 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13169 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13170 fputs ("@PLT", file);
13171 break;
13172
13173 case LABEL_REF:
13174 x = XEXP (x, 0);
13175 /* FALLTHRU */
13176 case CODE_LABEL:
13177 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13178 assemble_name (asm_out_file, buf);
13179 break;
13180
13181 case CONST_INT:
13182 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13183 break;
13184
13185 case CONST:
13186 /* This used to output parentheses around the expression,
13187 but that does not work on the 386 (either ATT or BSD assembler). */
13188 output_pic_addr_const (file, XEXP (x, 0), code);
13189 break;
13190
13191 case CONST_DOUBLE:
13192 if (GET_MODE (x) == VOIDmode)
13193 {
13194 /* We can use %d if the number is <32 bits and positive. */
13195 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13196 fprintf (file, "0x%lx%08lx",
13197 (unsigned long) CONST_DOUBLE_HIGH (x),
13198 (unsigned long) CONST_DOUBLE_LOW (x));
13199 else
13200 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13201 }
13202 else
13203 /* We can't handle floating point constants;
13204 TARGET_PRINT_OPERAND must handle them. */
13205 output_operand_lossage ("floating constant misused");
13206 break;
13207
13208 case PLUS:
13209 /* Some assemblers need integer constants to appear first. */
13210 if (CONST_INT_P (XEXP (x, 0)))
13211 {
13212 output_pic_addr_const (file, XEXP (x, 0), code);
13213 putc ('+', file);
13214 output_pic_addr_const (file, XEXP (x, 1), code);
13215 }
13216 else
13217 {
13218 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13219 output_pic_addr_const (file, XEXP (x, 1), code);
13220 putc ('+', file);
13221 output_pic_addr_const (file, XEXP (x, 0), code);
13222 }
13223 break;
13224
13225 case MINUS:
13226 if (!TARGET_MACHO)
13227 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13228 output_pic_addr_const (file, XEXP (x, 0), code);
13229 putc ('-', file);
13230 output_pic_addr_const (file, XEXP (x, 1), code);
13231 if (!TARGET_MACHO)
13232 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13233 break;
13234
13235 case UNSPEC:
13236 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13237 {
13238 bool f = i386_asm_output_addr_const_extra (file, x);
13239 gcc_assert (f);
13240 break;
13241 }
13242
13243 gcc_assert (XVECLEN (x, 0) == 1);
13244 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13245 switch (XINT (x, 1))
13246 {
13247 case UNSPEC_GOT:
13248 fputs ("@GOT", file);
13249 break;
13250 case UNSPEC_GOTOFF:
13251 fputs ("@GOTOFF", file);
13252 break;
13253 case UNSPEC_PLTOFF:
13254 fputs ("@PLTOFF", file);
13255 break;
13256 case UNSPEC_PCREL:
13257 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13258 "(%rip)" : "[rip]", file);
13259 break;
13260 case UNSPEC_GOTPCREL:
13261 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13262 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13263 break;
13264 case UNSPEC_GOTTPOFF:
13265 /* FIXME: This might be @TPOFF in Sun ld too. */
13266 fputs ("@gottpoff", file);
13267 break;
13268 case UNSPEC_TPOFF:
13269 fputs ("@tpoff", file);
13270 break;
13271 case UNSPEC_NTPOFF:
13272 if (TARGET_64BIT)
13273 fputs ("@tpoff", file);
13274 else
13275 fputs ("@ntpoff", file);
13276 break;
13277 case UNSPEC_DTPOFF:
13278 fputs ("@dtpoff", file);
13279 break;
13280 case UNSPEC_GOTNTPOFF:
13281 if (TARGET_64BIT)
13282 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13283 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13284 else
13285 fputs ("@gotntpoff", file);
13286 break;
13287 case UNSPEC_INDNTPOFF:
13288 fputs ("@indntpoff", file);
13289 break;
13290 #if TARGET_MACHO
13291 case UNSPEC_MACHOPIC_OFFSET:
13292 putc ('-', file);
13293 machopic_output_function_base_name (file);
13294 break;
13295 #endif
13296 default:
13297 output_operand_lossage ("invalid UNSPEC as operand");
13298 break;
13299 }
13300 break;
13301
13302 default:
13303 output_operand_lossage ("invalid expression as operand");
13304 }
13305 }
13306
13307 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13308 We need to emit DTP-relative relocations. */
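/* E.g. for SIZE 4 this emits ".long foo@dtpoff" and for SIZE 8 it emits
   ".long foo@dtpoff, 0" (illustrative, assuming ASM_LONG is ".long").  */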
13309
13310 static void ATTRIBUTE_UNUSED
13311 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13312 {
13313 fputs (ASM_LONG, file);
13314 output_addr_const (file, x);
13315 fputs ("@dtpoff", file);
13316 switch (size)
13317 {
13318 case 4:
13319 break;
13320 case 8:
13321 fputs (", 0", file);
13322 break;
13323 default:
13324 gcc_unreachable ();
13325 }
13326 }
13327
13328 /* Return true if X is a representation of the PIC register. This copes
13329 with calls from ix86_find_base_term, where the register might have
13330 been replaced by a cselib value. */
13331
13332 static bool
13333 ix86_pic_register_p (rtx x)
13334 {
13335 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13336 return (pic_offset_table_rtx
13337 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13338 else
13339 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13340 }
13341
13342 /* Helper function for ix86_delegitimize_address.
13343 Attempt to delegitimize TLS local-exec accesses. */
13344
13345 static rtx
13346 ix86_delegitimize_tls_address (rtx orig_x)
13347 {
13348 rtx x = orig_x, unspec;
13349 struct ix86_address addr;
13350
13351 if (!TARGET_TLS_DIRECT_SEG_REFS)
13352 return orig_x;
13353 if (MEM_P (x))
13354 x = XEXP (x, 0);
13355 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13356 return orig_x;
13357 if (ix86_decompose_address (x, &addr) == 0
13358 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13359 || addr.disp == NULL_RTX
13360 || GET_CODE (addr.disp) != CONST)
13361 return orig_x;
13362 unspec = XEXP (addr.disp, 0);
13363 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13364 unspec = XEXP (unspec, 0);
13365 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13366 return orig_x;
13367 x = XVECEXP (unspec, 0, 0);
13368 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13369 if (unspec != XEXP (addr.disp, 0))
13370 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13371 if (addr.index)
13372 {
13373 rtx idx = addr.index;
13374 if (addr.scale != 1)
13375 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13376 x = gen_rtx_PLUS (Pmode, idx, x);
13377 }
13378 if (addr.base)
13379 x = gen_rtx_PLUS (Pmode, addr.base, x);
13380 if (MEM_P (orig_x))
13381 x = replace_equiv_address_nv (orig_x, x);
13382 return x;
13383 }
13384
13385 /* In the name of slightly smaller debug output, and to cater to
13386 general assembler lossage, recognize PIC+GOTOFF and turn it back
13387 into a direct symbol reference.
13388
13389 On Darwin, this is necessary to avoid a crash, because Darwin
13390 has a different PIC label for each routine but the DWARF debugging
13391 information is not associated with any particular routine, so it's
13392 necessary to remove references to the PIC label from RTL stored by
13393 the DWARF output code. */
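/* For instance, an address of the form

     (plus (reg) (const (unspec [(symbol_ref "foo")] UNSPEC_GOTOFF)))

   where the register is the PIC register is turned back into plain
   (symbol_ref "foo"), and a memory load through foo@GOT is handled
   similarly (illustrative; see below for addend handling).  */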
13394
13395 static rtx
13396 ix86_delegitimize_address (rtx x)
13397 {
13398 rtx orig_x = delegitimize_mem_from_attrs (x);
13399 /* addend is NULL or some rtx if x is something+GOTOFF where
13400 something doesn't include the PIC register. */
13401 rtx addend = NULL_RTX;
13402 /* reg_addend is NULL or a multiple of some register. */
13403 rtx reg_addend = NULL_RTX;
13404 /* const_addend is NULL or a const_int. */
13405 rtx const_addend = NULL_RTX;
13406 /* This is the result, or NULL. */
13407 rtx result = NULL_RTX;
13408
13409 x = orig_x;
13410
13411 if (MEM_P (x))
13412 x = XEXP (x, 0);
13413
13414 if (TARGET_64BIT)
13415 {
13416 if (GET_CODE (x) != CONST
13417 || GET_CODE (XEXP (x, 0)) != UNSPEC
13418 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13419 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13420 || !MEM_P (orig_x))
13421 return ix86_delegitimize_tls_address (orig_x);
13422 x = XVECEXP (XEXP (x, 0), 0, 0);
13423 if (GET_MODE (orig_x) != GET_MODE (x))
13424 {
13425 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13426 GET_MODE (x), 0);
13427 if (x == NULL_RTX)
13428 return orig_x;
13429 }
13430 return x;
13431 }
13432
13433 if (GET_CODE (x) != PLUS
13434 || GET_CODE (XEXP (x, 1)) != CONST)
13435 return ix86_delegitimize_tls_address (orig_x);
13436
13437 if (ix86_pic_register_p (XEXP (x, 0)))
13438 /* %ebx + GOT/GOTOFF */
13439 ;
13440 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13441 {
13442 /* %ebx + %reg * scale + GOT/GOTOFF */
13443 reg_addend = XEXP (x, 0);
13444 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13445 reg_addend = XEXP (reg_addend, 1);
13446 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13447 reg_addend = XEXP (reg_addend, 0);
13448 else
13449 {
13450 reg_addend = NULL_RTX;
13451 addend = XEXP (x, 0);
13452 }
13453 }
13454 else
13455 addend = XEXP (x, 0);
13456
13457 x = XEXP (XEXP (x, 1), 0);
13458 if (GET_CODE (x) == PLUS
13459 && CONST_INT_P (XEXP (x, 1)))
13460 {
13461 const_addend = XEXP (x, 1);
13462 x = XEXP (x, 0);
13463 }
13464
13465 if (GET_CODE (x) == UNSPEC
13466 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13467 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13468 result = XVECEXP (x, 0, 0);
13469
13470 if (TARGET_MACHO && darwin_local_data_pic (x)
13471 && !MEM_P (orig_x))
13472 result = XVECEXP (x, 0, 0);
13473
13474 if (! result)
13475 return ix86_delegitimize_tls_address (orig_x);
13476
13477 if (const_addend)
13478 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13479 if (reg_addend)
13480 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13481 if (addend)
13482 {
13483 /* If the rest of original X doesn't involve the PIC register, add
13484 addend and subtract pic_offset_table_rtx. This can happen e.g.
13485 for code like:
13486 leal (%ebx, %ecx, 4), %ecx
13487 ...
13488 movl foo@GOTOFF(%ecx), %edx
13489 in which case we return (%ecx - %ebx) + foo. */
13490 if (pic_offset_table_rtx)
13491 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13492 pic_offset_table_rtx),
13493 result);
13494 else
13495 return orig_x;
13496 }
13497 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13498 {
13499 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13500 if (result == NULL_RTX)
13501 return orig_x;
13502 }
13503 return result;
13504 }
13505
13506 /* If X is a machine specific address (i.e. a symbol or label being
13507 referenced as a displacement from the GOT implemented using an
13508 UNSPEC), then return the base term. Otherwise return X. */
13509
13510 rtx
13511 ix86_find_base_term (rtx x)
13512 {
13513 rtx term;
13514
13515 if (TARGET_64BIT)
13516 {
13517 if (GET_CODE (x) != CONST)
13518 return x;
13519 term = XEXP (x, 0);
13520 if (GET_CODE (term) == PLUS
13521 && (CONST_INT_P (XEXP (term, 1))
13522 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13523 term = XEXP (term, 0);
13524 if (GET_CODE (term) != UNSPEC
13525 || (XINT (term, 1) != UNSPEC_GOTPCREL
13526 && XINT (term, 1) != UNSPEC_PCREL))
13527 return x;
13528
13529 return XVECEXP (term, 0, 0);
13530 }
13531
13532 return ix86_delegitimize_address (x);
13533 }
13534 \f
13535 static void
13536 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13537 int fp, FILE *file)
13538 {
13539 const char *suffix;
13540
13541 if (mode == CCFPmode || mode == CCFPUmode)
13542 {
13543 code = ix86_fp_compare_code_to_integer (code);
13544 mode = CCmode;
13545 }
13546 if (reverse)
13547 code = reverse_condition (code);
13548
13549 switch (code)
13550 {
13551 case EQ:
13552 switch (mode)
13553 {
13554 case CCAmode:
13555 suffix = "a";
13556 break;
13557
13558 case CCCmode:
13559 suffix = "c";
13560 break;
13561
13562 case CCOmode:
13563 suffix = "o";
13564 break;
13565
13566 case CCSmode:
13567 suffix = "s";
13568 break;
13569
13570 default:
13571 suffix = "e";
13572 }
13573 break;
13574 case NE:
13575 switch (mode)
13576 {
13577 case CCAmode:
13578 suffix = "na";
13579 break;
13580
13581 case CCCmode:
13582 suffix = "nc";
13583 break;
13584
13585 case CCOmode:
13586 suffix = "no";
13587 break;
13588
13589 case CCSmode:
13590 suffix = "ns";
13591 break;
13592
13593 default:
13594 suffix = "ne";
13595 }
13596 break;
13597 case GT:
13598 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13599 suffix = "g";
13600 break;
13601 case GTU:
13602 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13603 Those same assemblers have the same but opposite lossage on cmov. */
13604 if (mode == CCmode)
13605 suffix = fp ? "nbe" : "a";
13606 else if (mode == CCCmode)
13607 suffix = "b";
13608 else
13609 gcc_unreachable ();
13610 break;
13611 case LT:
13612 switch (mode)
13613 {
13614 case CCNOmode:
13615 case CCGOCmode:
13616 suffix = "s";
13617 break;
13618
13619 case CCmode:
13620 case CCGCmode:
13621 suffix = "l";
13622 break;
13623
13624 default:
13625 gcc_unreachable ();
13626 }
13627 break;
13628 case LTU:
13629 gcc_assert (mode == CCmode || mode == CCCmode);
13630 suffix = "b";
13631 break;
13632 case GE:
13633 switch (mode)
13634 {
13635 case CCNOmode:
13636 case CCGOCmode:
13637 suffix = "ns";
13638 break;
13639
13640 case CCmode:
13641 case CCGCmode:
13642 suffix = "ge";
13643 break;
13644
13645 default:
13646 gcc_unreachable ();
13647 }
13648 break;
13649 case GEU:
13650 /* ??? As above. */
13651 gcc_assert (mode == CCmode || mode == CCCmode);
13652 suffix = fp ? "nb" : "ae";
13653 break;
13654 case LE:
13655 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13656 suffix = "le";
13657 break;
13658 case LEU:
13659 /* ??? As above. */
13660 if (mode == CCmode)
13661 suffix = "be";
13662 else if (mode == CCCmode)
13663 suffix = fp ? "nb" : "ae";
13664 else
13665 gcc_unreachable ();
13666 break;
13667 case UNORDERED:
13668 suffix = fp ? "u" : "p";
13669 break;
13670 case ORDERED:
13671 suffix = fp ? "nu" : "np";
13672 break;
13673 default:
13674 gcc_unreachable ();
13675 }
13676 fputs (suffix, file);
13677 }
13678
13679 /* Print the name of register X to FILE based on its machine mode and number.
13680 If CODE is 'w', pretend the mode is HImode.
13681 If CODE is 'b', pretend the mode is QImode.
13682 If CODE is 'k', pretend the mode is SImode.
13683 If CODE is 'q', pretend the mode is DImode.
13684 If CODE is 'x', pretend the mode is V4SFmode.
13685 If CODE is 't', pretend the mode is V8SFmode.
13686 If CODE is 'h', pretend the reg is the 'high' byte register.
13687 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
13688 If CODE is 'd', duplicate the operand for AVX instruction.
13689 */
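/* For example, with CODE 'k' the register AX is printed as "%eax" in
   AT&T syntax ("eax" for Intel syntax), and with CODE 'h' it is
   printed as "%ah" (illustrative).  */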
13690
13691 void
13692 print_reg (rtx x, int code, FILE *file)
13693 {
13694 const char *reg;
13695 bool duplicated = code == 'd' && TARGET_AVX;
13696
13697 gcc_assert (x == pc_rtx
13698 || (REGNO (x) != ARG_POINTER_REGNUM
13699 && REGNO (x) != FRAME_POINTER_REGNUM
13700 && REGNO (x) != FLAGS_REG
13701 && REGNO (x) != FPSR_REG
13702 && REGNO (x) != FPCR_REG));
13703
13704 if (ASSEMBLER_DIALECT == ASM_ATT)
13705 putc ('%', file);
13706
13707 if (x == pc_rtx)
13708 {
13709 gcc_assert (TARGET_64BIT);
13710 fputs ("rip", file);
13711 return;
13712 }
13713
13714 if (code == 'w' || MMX_REG_P (x))
13715 code = 2;
13716 else if (code == 'b')
13717 code = 1;
13718 else if (code == 'k')
13719 code = 4;
13720 else if (code == 'q')
13721 code = 8;
13722 else if (code == 'y')
13723 code = 3;
13724 else if (code == 'h')
13725 code = 0;
13726 else if (code == 'x')
13727 code = 16;
13728 else if (code == 't')
13729 code = 32;
13730 else
13731 code = GET_MODE_SIZE (GET_MODE (x));
13732
13733 /* Irritatingly, AMD extended registers use a different naming convention
13734 from the normal registers: "r%d[bwd]".  */
13735 if (REX_INT_REG_P (x))
13736 {
13737 gcc_assert (TARGET_64BIT);
13738 putc ('r', file);
13739 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
13740 switch (code)
13741 {
13742 case 0:
13743 error ("extended registers have no high halves");
13744 break;
13745 case 1:
13746 putc ('b', file);
13747 break;
13748 case 2:
13749 putc ('w', file);
13750 break;
13751 case 4:
13752 putc ('d', file);
13753 break;
13754 case 8:
13755 /* no suffix */
13756 break;
13757 default:
13758 error ("unsupported operand size for extended register");
13759 break;
13760 }
13761 return;
13762 }
13763
13764 reg = NULL;
13765 switch (code)
13766 {
13767 case 3:
13768 if (STACK_TOP_P (x))
13769 {
13770 reg = "st(0)";
13771 break;
13772 }
13773 /* FALLTHRU */
13774 case 8:
13775 case 4:
13776 case 12:
13777 if (! ANY_FP_REG_P (x))
13778 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13779 /* FALLTHRU */
13780 case 16:
13781 case 2:
13782 normal:
13783 reg = hi_reg_name[REGNO (x)];
13784 break;
13785 case 1:
13786 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13787 goto normal;
13788 reg = qi_reg_name[REGNO (x)];
13789 break;
13790 case 0:
13791 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13792 goto normal;
13793 reg = qi_high_reg_name[REGNO (x)];
13794 break;
13795 case 32:
13796 if (SSE_REG_P (x))
13797 {
13798 gcc_assert (!duplicated);
13799 putc ('y', file);
13800 fputs (hi_reg_name[REGNO (x)] + 1, file);
13801 return;
13802 }
13803 break;
13804 default:
13805 gcc_unreachable ();
13806 }
13807
13808 fputs (reg, file);
13809 if (duplicated)
13810 {
13811 if (ASSEMBLER_DIALECT == ASM_ATT)
13812 fprintf (file, ", %%%s", reg);
13813 else
13814 fprintf (file, ", %s", reg);
13815 }
13816 }
13817
13818 /* Locate some local-dynamic symbol still in use by this function
13819 so that we can print its name in some tls_local_dynamic_base
13820 pattern. */
13821
13822 static int
13823 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13824 {
13825 rtx x = *px;
13826
13827 if (GET_CODE (x) == SYMBOL_REF
13828 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13829 {
13830 cfun->machine->some_ld_name = XSTR (x, 0);
13831 return 1;
13832 }
13833
13834 return 0;
13835 }
13836
13837 static const char *
13838 get_some_local_dynamic_name (void)
13839 {
13840 rtx insn;
13841
13842 if (cfun->machine->some_ld_name)
13843 return cfun->machine->some_ld_name;
13844
13845 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13846 if (NONDEBUG_INSN_P (insn)
13847 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13848 return cfun->machine->some_ld_name;
13849
13850 return NULL;
13851 }
13852
13853 /* Meaning of CODE:
13854 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13855 C -- print opcode suffix for set/cmov insn.
13856 c -- like C, but print reversed condition
13857 F,f -- likewise, but for floating-point.
13858 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13859 otherwise nothing
13860 R -- print the prefix for register names.
13861 z -- print the opcode suffix for the size of the current operand.
13862 Z -- likewise, with special suffixes for x87 instructions.
13863 * -- print a star (in certain assembler syntax)
13864 A -- print an absolute memory reference.
13865 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13866 s -- print a shift double count, followed by the assembler's argument
13867 delimiter.
13868 b -- print the QImode name of the register for the indicated operand.
13869 %b0 would print %al if operands[0] is reg 0.
13870 w -- likewise, print the HImode name of the register.
13871 k -- likewise, print the SImode name of the register.
13872 q -- likewise, print the DImode name of the register.
13873 x -- likewise, print the V4SFmode name of the register.
13874 t -- likewise, print the V8SFmode name of the register.
13875 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13876 y -- print "st(0)" instead of "st" as a register.
13877 d -- print duplicated register operand for AVX instruction.
13878 D -- print condition for SSE cmp instruction.
13879 P -- if PIC, print an @PLT suffix.
13880 p -- print raw symbol name.
13881 X -- don't print any sort of PIC '@' suffix for a symbol.
13882 & -- print some in-use local-dynamic symbol name.
13883 H -- print a memory address offset by 8; used for sse high-parts
13884 Y -- print condition for XOP pcom* instruction.
13885 + -- print a branch hint as 'cs' or 'ds' prefix
13886 ; -- print a semicolon (after prefixes due to a bug in older gas).
13887 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13888 @ -- print a segment register of thread base pointer load
13889 */
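/* These codes appear in the output templates of i386.md; for instance an
   (illustrative) template such as "mov%z0\t{%1, %0|%0, %1}" uses 'z' to
   pick the size suffix and the {att|intel} construct to select between
   the assembler dialects.  */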
13890
13891 void
13892 ix86_print_operand (FILE *file, rtx x, int code)
13893 {
13894 if (code)
13895 {
13896 switch (code)
13897 {
13898 case '*':
13899 if (ASSEMBLER_DIALECT == ASM_ATT)
13900 putc ('*', file);
13901 return;
13902
13903 case '&':
13904 {
13905 const char *name = get_some_local_dynamic_name ();
13906 if (name == NULL)
13907 output_operand_lossage ("'%%&' used without any "
13908 "local dynamic TLS references");
13909 else
13910 assemble_name (file, name);
13911 return;
13912 }
13913
13914 case 'A':
13915 switch (ASSEMBLER_DIALECT)
13916 {
13917 case ASM_ATT:
13918 putc ('*', file);
13919 break;
13920
13921 case ASM_INTEL:
13922 /* Intel syntax.  For absolute addresses, registers should not
13923 be surrounded by brackets.  */
13924 if (!REG_P (x))
13925 {
13926 putc ('[', file);
13927 ix86_print_operand (file, x, 0);
13928 putc (']', file);
13929 return;
13930 }
13931 break;
13932
13933 default:
13934 gcc_unreachable ();
13935 }
13936
13937 ix86_print_operand (file, x, 0);
13938 return;
13939
13940
13941 case 'L':
13942 if (ASSEMBLER_DIALECT == ASM_ATT)
13943 putc ('l', file);
13944 return;
13945
13946 case 'W':
13947 if (ASSEMBLER_DIALECT == ASM_ATT)
13948 putc ('w', file);
13949 return;
13950
13951 case 'B':
13952 if (ASSEMBLER_DIALECT == ASM_ATT)
13953 putc ('b', file);
13954 return;
13955
13956 case 'Q':
13957 if (ASSEMBLER_DIALECT == ASM_ATT)
13958 putc ('l', file);
13959 return;
13960
13961 case 'S':
13962 if (ASSEMBLER_DIALECT == ASM_ATT)
13963 putc ('s', file);
13964 return;
13965
13966 case 'T':
13967 if (ASSEMBLER_DIALECT == ASM_ATT)
13968 putc ('t', file);
13969 return;
13970
13971 case 'z':
13972 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13973 {
13974 /* Opcodes don't get size suffixes if using Intel syntax.  */
13975 if (ASSEMBLER_DIALECT == ASM_INTEL)
13976 return;
13977
13978 switch (GET_MODE_SIZE (GET_MODE (x)))
13979 {
13980 case 1:
13981 putc ('b', file);
13982 return;
13983
13984 case 2:
13985 putc ('w', file);
13986 return;
13987
13988 case 4:
13989 putc ('l', file);
13990 return;
13991
13992 case 8:
13993 putc ('q', file);
13994 return;
13995
13996 default:
13997 output_operand_lossage
13998 ("invalid operand size for operand code '%c'", code);
13999 return;
14000 }
14001 }
14002
14003 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14004 warning
14005 (0, "non-integer operand used with operand code '%c'", code);
14006 /* FALLTHRU */
14007
14008 case 'Z':
14009 /* 387 opcodes don't get size suffixes if using Intel syntax.  */
14010 if (ASSEMBLER_DIALECT == ASM_INTEL)
14011 return;
14012
14013 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14014 {
14015 switch (GET_MODE_SIZE (GET_MODE (x)))
14016 {
14017 case 2:
14018 #ifdef HAVE_AS_IX86_FILDS
14019 putc ('s', file);
14020 #endif
14021 return;
14022
14023 case 4:
14024 putc ('l', file);
14025 return;
14026
14027 case 8:
14028 #ifdef HAVE_AS_IX86_FILDQ
14029 putc ('q', file);
14030 #else
14031 fputs ("ll", file);
14032 #endif
14033 return;
14034
14035 default:
14036 break;
14037 }
14038 }
14039 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14040 {
14041 /* 387 opcodes don't get size suffixes
14042 if the operands are registers. */
14043 if (STACK_REG_P (x))
14044 return;
14045
14046 switch (GET_MODE_SIZE (GET_MODE (x)))
14047 {
14048 case 4:
14049 putc ('s', file);
14050 return;
14051
14052 case 8:
14053 putc ('l', file);
14054 return;
14055
14056 case 12:
14057 case 16:
14058 putc ('t', file);
14059 return;
14060
14061 default:
14062 break;
14063 }
14064 }
14065 else
14066 {
14067 output_operand_lossage
14068 ("invalid operand type used with operand code '%c'", code);
14069 return;
14070 }
14071
14072 output_operand_lossage
14073 ("invalid operand size for operand code '%c'", code);
14074 return;
14075
14076 case 'd':
14077 case 'b':
14078 case 'w':
14079 case 'k':
14080 case 'q':
14081 case 'h':
14082 case 't':
14083 case 'y':
14084 case 'x':
14085 case 'X':
14086 case 'P':
14087 case 'p':
14088 break;
14089
14090 case 's':
14091 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14092 {
14093 ix86_print_operand (file, x, 0);
14094 fputs (", ", file);
14095 }
14096 return;
14097
14098 case 'D':
14099 	      /* A little bit of brain damage here.  The SSE compare
14100 		 instructions use completely different names for the
14101 		 comparisons than the fp conditional moves do.  */
14102 if (TARGET_AVX)
14103 {
14104 switch (GET_CODE (x))
14105 {
14106 case EQ:
14107 fputs ("eq", file);
14108 break;
14109 case UNEQ:
14110 fputs ("eq_us", file);
14111 break;
14112 case LT:
14113 fputs ("lt", file);
14114 break;
14115 case UNLT:
14116 fputs ("nge", file);
14117 break;
14118 case LE:
14119 fputs ("le", file);
14120 break;
14121 case UNLE:
14122 fputs ("ngt", file);
14123 break;
14124 case UNORDERED:
14125 fputs ("unord", file);
14126 break;
14127 case NE:
14128 fputs ("neq", file);
14129 break;
14130 case LTGT:
14131 fputs ("neq_oq", file);
14132 break;
14133 case GE:
14134 fputs ("ge", file);
14135 break;
14136 case UNGE:
14137 fputs ("nlt", file);
14138 break;
14139 case GT:
14140 fputs ("gt", file);
14141 break;
14142 case UNGT:
14143 fputs ("nle", file);
14144 break;
14145 case ORDERED:
14146 fputs ("ord", file);
14147 break;
14148 default:
14149 output_operand_lossage ("operand is not a condition code, "
14150 "invalid operand code 'D'");
14151 return;
14152 }
14153 }
14154 else
14155 {
14156 switch (GET_CODE (x))
14157 {
14158 case EQ:
14159 case UNEQ:
14160 fputs ("eq", file);
14161 break;
14162 case LT:
14163 case UNLT:
14164 fputs ("lt", file);
14165 break;
14166 case LE:
14167 case UNLE:
14168 fputs ("le", file);
14169 break;
14170 case UNORDERED:
14171 fputs ("unord", file);
14172 break;
14173 case NE:
14174 case LTGT:
14175 fputs ("neq", file);
14176 break;
14177 case UNGE:
14178 case GE:
14179 fputs ("nlt", file);
14180 break;
14181 case UNGT:
14182 case GT:
14183 fputs ("nle", file);
14184 break;
14185 case ORDERED:
14186 fputs ("ord", file);
14187 break;
14188 default:
14189 output_operand_lossage ("operand is not a condition code, "
14190 "invalid operand code 'D'");
14191 return;
14192 }
14193 }
14194 return;
14195 case 'O':
14196 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14197 if (ASSEMBLER_DIALECT == ASM_ATT)
14198 {
14199 switch (GET_MODE (x))
14200 {
14201 case HImode: putc ('w', file); break;
14202 case SImode:
14203 case SFmode: putc ('l', file); break;
14204 case DImode:
14205 case DFmode: putc ('q', file); break;
14206 default: gcc_unreachable ();
14207 }
14208 putc ('.', file);
14209 }
14210 #endif
14211 return;
14212 case 'C':
14213 if (!COMPARISON_P (x))
14214 {
14215 output_operand_lossage ("operand is neither a constant nor a "
14216 "condition code, invalid operand code "
14217 "'C'");
14218 return;
14219 }
14220 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
14221 return;
14222 case 'F':
14223 if (!COMPARISON_P (x))
14224 {
14225 output_operand_lossage ("operand is neither a constant nor a "
14226 "condition code, invalid operand code "
14227 "'F'");
14228 return;
14229 }
14230 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14231 if (ASSEMBLER_DIALECT == ASM_ATT)
14232 putc ('.', file);
14233 #endif
14234 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
14235 return;
14236
14237 /* Like above, but reverse condition */
14238 case 'c':
14239 /* Check to see if argument to %c is really a constant
14240 and not a condition code which needs to be reversed. */
14241 if (!COMPARISON_P (x))
14242 {
14243 output_operand_lossage ("operand is neither a constant nor a "
14244 "condition code, invalid operand "
14245 "code 'c'");
14246 return;
14247 }
14248 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
14249 return;
14250 case 'f':
14251 if (!COMPARISON_P (x))
14252 {
14253 output_operand_lossage ("operand is neither a constant nor a "
14254 "condition code, invalid operand "
14255 "code 'f'");
14256 return;
14257 }
14258 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14259 if (ASSEMBLER_DIALECT == ASM_ATT)
14260 putc ('.', file);
14261 #endif
14262 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
14263 return;
14264
14265 case 'H':
14266 /* It doesn't actually matter what mode we use here, as we're
14267 only going to use this for printing. */
14268 x = adjust_address_nv (x, DImode, 8);
14269 break;
14270
14271 case '+':
14272 {
14273 rtx x;
14274
14275 if (!optimize
14276 || optimize_function_for_size_p (cfun) || !TARGET_BRANCH_PREDICTION_HINTS)
14277 return;
14278
14279 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14280 if (x)
14281 {
14282 int pred_val = INTVAL (XEXP (x, 0));
14283
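		/* Hint only clearly biased branches: those whose predicted
		   probability lies outside the 45%-55% band of
		   REG_BR_PROB_BASE.  */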
14284 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14285 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14286 {
14287 int taken = pred_val > REG_BR_PROB_BASE / 2;
14288 int cputaken = final_forward_branch_p (current_output_insn) == 0;
14289
14290 		  /* Emit hints only in the case where the default branch
14291 		     prediction heuristics would fail.  */
14292 if (taken != cputaken)
14293 {
14294 /* We use 3e (DS) prefix for taken branches and
14295 2e (CS) prefix for not taken branches. */
14296 if (taken)
14297 fputs ("ds ; ", file);
14298 else
14299 fputs ("cs ; ", file);
14300 }
14301 }
14302 }
14303 return;
14304 }
14305
14306 case 'Y':
14307 switch (GET_CODE (x))
14308 {
14309 case NE:
14310 fputs ("neq", file);
14311 break;
14312 case EQ:
14313 fputs ("eq", file);
14314 break;
14315 case GE:
14316 case GEU:
14317 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14318 break;
14319 case GT:
14320 case GTU:
14321 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14322 break;
14323 case LE:
14324 case LEU:
14325 fputs ("le", file);
14326 break;
14327 case LT:
14328 case LTU:
14329 fputs ("lt", file);
14330 break;
14331 case UNORDERED:
14332 fputs ("unord", file);
14333 break;
14334 case ORDERED:
14335 fputs ("ord", file);
14336 break;
14337 case UNEQ:
14338 fputs ("ueq", file);
14339 break;
14340 case UNGE:
14341 fputs ("nlt", file);
14342 break;
14343 case UNGT:
14344 fputs ("nle", file);
14345 break;
14346 case UNLE:
14347 fputs ("ule", file);
14348 break;
14349 case UNLT:
14350 fputs ("ult", file);
14351 break;
14352 case LTGT:
14353 fputs ("une", file);
14354 break;
14355 default:
14356 output_operand_lossage ("operand is not a condition code, "
14357 "invalid operand code 'Y'");
14358 return;
14359 }
14360 return;
14361
14362 case ';':
14363 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14364 putc (';', file);
14365 #endif
14366 return;
14367
14368 case '@':
14369 if (ASSEMBLER_DIALECT == ASM_ATT)
14370 putc ('%', file);
14371
14372 /* The kernel uses a different segment register for performance
14373 reasons; a system call would not have to trash the userspace
14374 segment register, which would be expensive. */
14375 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14376 fputs ("fs", file);
14377 else
14378 fputs ("gs", file);
14379 return;
14380
14381 case '~':
14382 putc (TARGET_AVX2 ? 'i' : 'f', file);
14383 return;
14384
14385 default:
14386 output_operand_lossage ("invalid operand code '%c'", code);
14387 }
14388 }
14389
14390 if (REG_P (x))
14391 print_reg (x, code, file);
14392
14393 else if (MEM_P (x))
14394 {
14395 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14396 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14397 && GET_MODE (x) != BLKmode)
14398 {
14399 const char * size;
14400 switch (GET_MODE_SIZE (GET_MODE (x)))
14401 {
14402 case 1: size = "BYTE"; break;
14403 case 2: size = "WORD"; break;
14404 case 4: size = "DWORD"; break;
14405 case 8: size = "QWORD"; break;
14406 case 12: size = "TBYTE"; break;
14407 case 16:
14408 if (GET_MODE (x) == XFmode)
14409 size = "TBYTE";
14410 else
14411 size = "XMMWORD";
14412 break;
14413 case 32: size = "YMMWORD"; break;
14414 default:
14415 gcc_unreachable ();
14416 }
14417
14418 /* Check for explicit size override (codes 'b', 'w', 'k',
14419 'q' and 'x') */
14420 if (code == 'b')
14421 size = "BYTE";
14422 else if (code == 'w')
14423 size = "WORD";
14424 else if (code == 'k')
14425 size = "DWORD";
14426 else if (code == 'q')
14427 size = "QWORD";
14428 else if (code == 'x')
14429 size = "XMMWORD";
14430
14431 fputs (size, file);
14432 fputs (" PTR ", file);
14433 }
14434
14435 x = XEXP (x, 0);
14436 /* Avoid (%rip) for call operands. */
14437 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14438 && !CONST_INT_P (x))
14439 output_addr_const (file, x);
14440 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14441 output_operand_lossage ("invalid constraints for operand");
14442 else
14443 output_address (x);
14444 }
14445
14446 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14447 {
14448 REAL_VALUE_TYPE r;
14449 long l;
14450
14451 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14452 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14453
14454 if (ASSEMBLER_DIALECT == ASM_ATT)
14455 putc ('$', file);
14456 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14457 if (code == 'q')
14458 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14459 else
14460 fprintf (file, "0x%08x", (unsigned int) l);
14461 }
14462
14463 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14464 {
14465 REAL_VALUE_TYPE r;
14466 long l[2];
14467
14468 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14469 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14470
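      /* l[0] holds the low 32 bits and l[1] the high 32 bits of the target
	 double image; print them as a single 64-bit hex constant.  */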
14471 if (ASSEMBLER_DIALECT == ASM_ATT)
14472 putc ('$', file);
14473 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14474 }
14475
14476 /* These float cases don't actually occur as immediate operands. */
14477 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14478 {
14479 char dstr[30];
14480
14481 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14482 fputs (dstr, file);
14483 }
14484
14485 else
14486 {
14487 /* We have patterns that allow zero sets of memory, for instance.
14488 In 64-bit mode, we should probably support all 8-byte vectors,
14489 since we can in fact encode that into an immediate. */
14490 if (GET_CODE (x) == CONST_VECTOR)
14491 {
14492 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14493 x = const0_rtx;
14494 }
14495
14496 if (code != 'P' && code != 'p')
14497 {
14498 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14499 {
14500 if (ASSEMBLER_DIALECT == ASM_ATT)
14501 putc ('$', file);
14502 }
14503 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14504 || GET_CODE (x) == LABEL_REF)
14505 {
14506 if (ASSEMBLER_DIALECT == ASM_ATT)
14507 putc ('$', file);
14508 else
14509 fputs ("OFFSET FLAT:", file);
14510 }
14511 }
14512 if (CONST_INT_P (x))
14513 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14514 else if (flag_pic || MACHOPIC_INDIRECT)
14515 output_pic_addr_const (file, x, code);
14516 else
14517 output_addr_const (file, x);
14518 }
14519 }
14520
14521 static bool
14522 ix86_print_operand_punct_valid_p (unsigned char code)
14523 {
14524 return (code == '@' || code == '*' || code == '+'
14525 || code == '&' || code == ';' || code == '~');
14526 }
14527 \f
14528 /* Print a memory operand whose address is ADDR. */
14529
14530 static void
14531 ix86_print_operand_address (FILE *file, rtx addr)
14532 {
14533 struct ix86_address parts;
14534 rtx base, index, disp;
14535 int scale;
14536 int ok;
14537 bool vsib = false;
14538
14539 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14540 {
14541 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14542 gcc_assert (parts.index == NULL_RTX);
14543 parts.index = XVECEXP (addr, 0, 1);
14544 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14545 addr = XVECEXP (addr, 0, 0);
14546 vsib = true;
14547 }
14548 else
14549 ok = ix86_decompose_address (addr, &parts);
14550
14551 gcc_assert (ok);
14552
14553 if (parts.base && GET_CODE (parts.base) == SUBREG)
14554 {
14555 rtx tmp = SUBREG_REG (parts.base);
14556 parts.base = simplify_subreg (GET_MODE (parts.base),
14557 tmp, GET_MODE (tmp), 0);
14558 }
14559
14560 if (parts.index && GET_CODE (parts.index) == SUBREG)
14561 {
14562 rtx tmp = SUBREG_REG (parts.index);
14563 parts.index = simplify_subreg (GET_MODE (parts.index),
14564 tmp, GET_MODE (tmp), 0);
14565 }
14566
14567 base = parts.base;
14568 index = parts.index;
14569 disp = parts.disp;
14570 scale = parts.scale;
14571
14572 switch (parts.seg)
14573 {
14574 case SEG_DEFAULT:
14575 break;
14576 case SEG_FS:
14577 case SEG_GS:
14578 if (ASSEMBLER_DIALECT == ASM_ATT)
14579 putc ('%', file);
14580 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14581 break;
14582 default:
14583 gcc_unreachable ();
14584 }
14585
14586 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14587 if (TARGET_64BIT && !base && !index)
14588 {
14589 rtx symbol = disp;
14590
14591 if (GET_CODE (disp) == CONST
14592 && GET_CODE (XEXP (disp, 0)) == PLUS
14593 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14594 symbol = XEXP (XEXP (disp, 0), 0);
14595
14596 if (GET_CODE (symbol) == LABEL_REF
14597 || (GET_CODE (symbol) == SYMBOL_REF
14598 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14599 base = pc_rtx;
14600 }
14601 if (!base && !index)
14602 {
14603       /* A displacement-only address requires special attention.  */
14604
14605 if (CONST_INT_P (disp))
14606 {
14607 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14608 fputs ("ds:", file);
14609 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14610 }
14611 else if (flag_pic)
14612 output_pic_addr_const (file, disp, 0);
14613 else
14614 output_addr_const (file, disp);
14615 }
14616 else
14617 {
14618 int code = 0;
14619
14620 /* Print SImode registers for zero-extended addresses to force
14621 addr32 prefix. Otherwise print DImode registers to avoid it. */
14622 if (TARGET_64BIT)
14623 code = ((GET_CODE (addr) == ZERO_EXTEND
14624 || GET_CODE (addr) == AND)
14625 ? 'l'
14626 : 'q');
14627
14628 if (ASSEMBLER_DIALECT == ASM_ATT)
14629 {
14630 if (disp)
14631 {
14632 if (flag_pic)
14633 output_pic_addr_const (file, disp, 0);
14634 else if (GET_CODE (disp) == LABEL_REF)
14635 output_asm_label (disp);
14636 else
14637 output_addr_const (file, disp);
14638 }
14639
14640 putc ('(', file);
14641 if (base)
14642 print_reg (base, code, file);
14643 if (index)
14644 {
14645 putc (',', file);
14646 print_reg (index, vsib ? 0 : code, file);
14647 if (scale != 1 || vsib)
14648 fprintf (file, ",%d", scale);
14649 }
14650 putc (')', file);
14651 }
14652 else
14653 {
14654 rtx offset = NULL_RTX;
14655
14656 if (disp)
14657 {
14658 /* Pull out the offset of a symbol; print any symbol itself. */
14659 if (GET_CODE (disp) == CONST
14660 && GET_CODE (XEXP (disp, 0)) == PLUS
14661 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14662 {
14663 offset = XEXP (XEXP (disp, 0), 1);
14664 disp = gen_rtx_CONST (VOIDmode,
14665 XEXP (XEXP (disp, 0), 0));
14666 }
14667
14668 if (flag_pic)
14669 output_pic_addr_const (file, disp, 0);
14670 else if (GET_CODE (disp) == LABEL_REF)
14671 output_asm_label (disp);
14672 else if (CONST_INT_P (disp))
14673 offset = disp;
14674 else
14675 output_addr_const (file, disp);
14676 }
14677
14678 putc ('[', file);
14679 if (base)
14680 {
14681 print_reg (base, code, file);
14682 if (offset)
14683 {
14684 if (INTVAL (offset) >= 0)
14685 putc ('+', file);
14686 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14687 }
14688 }
14689 else if (offset)
14690 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14691 else
14692 putc ('0', file);
14693
14694 if (index)
14695 {
14696 putc ('+', file);
14697 print_reg (index, vsib ? 0 : code, file);
14698 if (scale != 1 || vsib)
14699 fprintf (file, "*%d", scale);
14700 }
14701 putc (']', file);
14702 }
14703 }
14704 }
14705
14706 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14707
14708 static bool
14709 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14710 {
14711 rtx op;
14712
14713 if (GET_CODE (x) != UNSPEC)
14714 return false;
14715
14716 op = XVECEXP (x, 0, 0);
14717 switch (XINT (x, 1))
14718 {
14719 case UNSPEC_GOTTPOFF:
14720 output_addr_const (file, op);
14721 /* FIXME: This might be @TPOFF in Sun ld. */
14722 fputs ("@gottpoff", file);
14723 break;
14724 case UNSPEC_TPOFF:
14725 output_addr_const (file, op);
14726 fputs ("@tpoff", file);
14727 break;
14728 case UNSPEC_NTPOFF:
14729 output_addr_const (file, op);
14730 if (TARGET_64BIT)
14731 fputs ("@tpoff", file);
14732 else
14733 fputs ("@ntpoff", file);
14734 break;
14735 case UNSPEC_DTPOFF:
14736 output_addr_const (file, op);
14737 fputs ("@dtpoff", file);
14738 break;
14739 case UNSPEC_GOTNTPOFF:
14740 output_addr_const (file, op);
14741 if (TARGET_64BIT)
14742 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14743 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14744 else
14745 fputs ("@gotntpoff", file);
14746 break;
14747 case UNSPEC_INDNTPOFF:
14748 output_addr_const (file, op);
14749 fputs ("@indntpoff", file);
14750 break;
14751 #if TARGET_MACHO
14752 case UNSPEC_MACHOPIC_OFFSET:
14753 output_addr_const (file, op);
14754 putc ('-', file);
14755 machopic_output_function_base_name (file);
14756 break;
14757 #endif
14758
14759 case UNSPEC_STACK_CHECK:
14760 {
14761 int offset;
14762
14763 gcc_assert (flag_split_stack);
14764
14765 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14766 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14767 #else
14768 gcc_unreachable ();
14769 #endif
14770
14771 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14772 }
14773 break;
14774
14775 default:
14776 return false;
14777 }
14778
14779 return true;
14780 }
14781 \f
14782 /* Split one or more double-mode RTL references into pairs of half-mode
14783 references. The RTL can be REG, offsettable MEM, integer constant, or
14784 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14785 split and "num" is its length. lo_half and hi_half are output arrays
14786 that parallel "operands". */
14787
14788 void
14789 split_double_mode (enum machine_mode mode, rtx operands[],
14790 int num, rtx lo_half[], rtx hi_half[])
14791 {
14792 enum machine_mode half_mode;
14793 unsigned int byte;
14794
14795 switch (mode)
14796 {
14797 case TImode:
14798 half_mode = DImode;
14799 break;
14800 case DImode:
14801 half_mode = SImode;
14802 break;
14803 default:
14804 gcc_unreachable ();
14805 }
14806
14807 byte = GET_MODE_SIZE (half_mode);
14808
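  /* E.g. a DImode operand is split into two SImode halves with the high
     half at byte offset 4; a TImode operand into DImode halves at offset 8.  */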
14809 while (num--)
14810 {
14811 rtx op = operands[num];
14812
14813       /* simplify_subreg refuses to split volatile memory addresses,
14814 	 but we still have to handle them.  */
14815 if (MEM_P (op))
14816 {
14817 lo_half[num] = adjust_address (op, half_mode, 0);
14818 hi_half[num] = adjust_address (op, half_mode, byte);
14819 }
14820 else
14821 {
14822 lo_half[num] = simplify_gen_subreg (half_mode, op,
14823 GET_MODE (op) == VOIDmode
14824 ? mode : GET_MODE (op), 0);
14825 hi_half[num] = simplify_gen_subreg (half_mode, op,
14826 GET_MODE (op) == VOIDmode
14827 ? mode : GET_MODE (op), byte);
14828 }
14829 }
14830 }
14831 \f
14832 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14833 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14834 is the expression of the binary operation. The output may either be
14835 emitted here, or returned to the caller, like all output_* functions.
14836
14837 There is no guarantee that the operands are the same mode, as they
14838 might be within FLOAT or FLOAT_EXTEND expressions. */
14839
14840 #ifndef SYSV386_COMPAT
14841 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14842 wants to fix the assemblers because that causes incompatibility
14843 with gcc. No-one wants to fix gcc because that causes
14844 incompatibility with assemblers... You can use the option of
14845 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14846 #define SYSV386_COMPAT 1
14847 #endif
14848
14849 const char *
14850 output_387_binary_op (rtx insn, rtx *operands)
14851 {
14852 static char buf[40];
14853 const char *p;
14854 const char *ssep;
14855 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14856
14857 #ifdef ENABLE_CHECKING
14858   /* Even if we do not want to check the inputs, this documents the input
14859      constraints, which helps in understanding the following code.  */
14860 if (STACK_REG_P (operands[0])
14861 && ((REG_P (operands[1])
14862 && REGNO (operands[0]) == REGNO (operands[1])
14863 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14864 || (REG_P (operands[2])
14865 && REGNO (operands[0]) == REGNO (operands[2])
14866 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14867 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14868 ; /* ok */
14869 else
14870 gcc_assert (is_sse);
14871 #endif
14872
14873 switch (GET_CODE (operands[3]))
14874 {
14875 case PLUS:
14876 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14877 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14878 p = "fiadd";
14879 else
14880 p = "fadd";
14881 ssep = "vadd";
14882 break;
14883
14884 case MINUS:
14885 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14886 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14887 p = "fisub";
14888 else
14889 p = "fsub";
14890 ssep = "vsub";
14891 break;
14892
14893 case MULT:
14894 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14895 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14896 p = "fimul";
14897 else
14898 p = "fmul";
14899 ssep = "vmul";
14900 break;
14901
14902 case DIV:
14903 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14904 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14905 p = "fidiv";
14906 else
14907 p = "fdiv";
14908 ssep = "vdiv";
14909 break;
14910
14911 default:
14912 gcc_unreachable ();
14913 }
14914
14915 if (is_sse)
14916 {
14917 if (TARGET_AVX)
14918 {
14919 strcpy (buf, ssep);
14920 if (GET_MODE (operands[0]) == SFmode)
14921 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14922 else
14923 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14924 }
14925 else
14926 {
14927 strcpy (buf, ssep + 1);
14928 if (GET_MODE (operands[0]) == SFmode)
14929 strcat (buf, "ss\t{%2, %0|%0, %2}");
14930 else
14931 strcat (buf, "sd\t{%2, %0|%0, %2}");
14932 }
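      /* E.g. an SFmode add yields "vaddss\t{%2, %1, %0|%0, %1, %2}" with AVX
	 and "addss\t{%2, %0|%0, %2}" with plain SSE.  */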
14933 return buf;
14934 }
14935 strcpy (buf, p);
14936
14937 switch (GET_CODE (operands[3]))
14938 {
14939 case MULT:
14940 case PLUS:
14941 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
14942 {
14943 rtx temp = operands[2];
14944 operands[2] = operands[1];
14945 operands[1] = temp;
14946 }
14947
14948       /* We know operands[0] == operands[1].  */
14949
14950 if (MEM_P (operands[2]))
14951 {
14952 p = "%Z2\t%2";
14953 break;
14954 }
14955
14956 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14957 {
14958 if (STACK_TOP_P (operands[0]))
14959 /* How is it that we are storing to a dead operand[2]?
14960 Well, presumably operands[1] is dead too. We can't
14961 store the result to st(0) as st(0) gets popped on this
14962 instruction. Instead store to operands[2] (which I
14963 think has to be st(1)). st(1) will be popped later.
14964 gcc <= 2.8.1 didn't have this check and generated
14965 assembly code that the Unixware assembler rejected. */
14966 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14967 else
14968 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14969 break;
14970 }
14971
14972 if (STACK_TOP_P (operands[0]))
14973 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14974 else
14975 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14976 break;
14977
14978 case MINUS:
14979 case DIV:
14980 if (MEM_P (operands[1]))
14981 {
14982 p = "r%Z1\t%1";
14983 break;
14984 }
14985
14986 if (MEM_P (operands[2]))
14987 {
14988 p = "%Z2\t%2";
14989 break;
14990 }
14991
14992 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14993 {
14994 #if SYSV386_COMPAT
14995 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
14996 derived assemblers, confusingly reverse the direction of
14997 the operation for fsub{r} and fdiv{r} when the
14998 destination register is not st(0). The Intel assembler
14999 doesn't have this brain damage. Read !SYSV386_COMPAT to
15000 figure out what the hardware really does. */
15001 if (STACK_TOP_P (operands[0]))
15002 p = "{p\t%0, %2|rp\t%2, %0}";
15003 else
15004 p = "{rp\t%2, %0|p\t%0, %2}";
15005 #else
15006 if (STACK_TOP_P (operands[0]))
15007 /* As above for fmul/fadd, we can't store to st(0). */
15008 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15009 else
15010 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15011 #endif
15012 break;
15013 }
15014
15015 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15016 {
15017 #if SYSV386_COMPAT
15018 if (STACK_TOP_P (operands[0]))
15019 p = "{rp\t%0, %1|p\t%1, %0}";
15020 else
15021 p = "{p\t%1, %0|rp\t%0, %1}";
15022 #else
15023 if (STACK_TOP_P (operands[0]))
15024 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15025 else
15026 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15027 #endif
15028 break;
15029 }
15030
15031 if (STACK_TOP_P (operands[0]))
15032 {
15033 if (STACK_TOP_P (operands[1]))
15034 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15035 else
15036 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15037 break;
15038 }
15039 else if (STACK_TOP_P (operands[1]))
15040 {
15041 #if SYSV386_COMPAT
15042 p = "{\t%1, %0|r\t%0, %1}";
15043 #else
15044 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15045 #endif
15046 }
15047 else
15048 {
15049 #if SYSV386_COMPAT
15050 p = "{r\t%2, %0|\t%0, %2}";
15051 #else
15052 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15053 #endif
15054 }
15055 break;
15056
15057 default:
15058 gcc_unreachable ();
15059 }
15060
15061 strcat (buf, p);
15062 return buf;
15063 }
15064
15065 /* Return needed mode for entity in optimize_mode_switching pass. */
15066
15067 int
15068 ix86_mode_needed (int entity, rtx insn)
15069 {
15070 enum attr_i387_cw mode;
15071
15072   /* The mode UNINITIALIZED is used to store the control word after a
15073      function call or ASM pattern.  The mode ANY specifies that the function
15074      has no requirements on the control word and makes no changes to the
15075      bits we are interested in.  */
15076
15077 if (CALL_P (insn)
15078 || (NONJUMP_INSN_P (insn)
15079 && (asm_noperands (PATTERN (insn)) >= 0
15080 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15081 return I387_CW_UNINITIALIZED;
15082
15083 if (recog_memoized (insn) < 0)
15084 return I387_CW_ANY;
15085
15086 mode = get_attr_i387_cw (insn);
15087
15088 switch (entity)
15089 {
15090 case I387_TRUNC:
15091 if (mode == I387_CW_TRUNC)
15092 return mode;
15093 break;
15094
15095 case I387_FLOOR:
15096 if (mode == I387_CW_FLOOR)
15097 return mode;
15098 break;
15099
15100 case I387_CEIL:
15101 if (mode == I387_CW_CEIL)
15102 return mode;
15103 break;
15104
15105 case I387_MASK_PM:
15106 if (mode == I387_CW_MASK_PM)
15107 return mode;
15108 break;
15109
15110 default:
15111 gcc_unreachable ();
15112 }
15113
15114 return I387_CW_ANY;
15115 }
15116
15117 /* Output code to initialize the control word copies used by trunc?f?i and
15118    rounding patterns.  MODE selects the required rounding mode; the current
15119    control word is saved and a modified copy is stored in the slot for MODE.  */
15120
15121 void
15122 emit_i387_cw_initialization (int mode)
15123 {
15124 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15125 rtx new_mode;
15126
15127 enum ix86_stack_slot slot;
15128
15129 rtx reg = gen_reg_rtx (HImode);
15130
15131 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15132 emit_move_insn (reg, copy_rtx (stored_mode));
15133
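  /* In the x87 control word, bits 10-11 form the rounding control field
     (00 = to nearest, 01 = down, 10 = up, 11 = toward zero) and bit 5 masks
     the precision exception; the masks below select exactly those bits.  */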
15134 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15135 || optimize_function_for_size_p (cfun))
15136 {
15137 switch (mode)
15138 {
15139 case I387_CW_TRUNC:
15140 /* round toward zero (truncate) */
15141 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15142 slot = SLOT_CW_TRUNC;
15143 break;
15144
15145 case I387_CW_FLOOR:
15146 /* round down toward -oo */
15147 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15148 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15149 slot = SLOT_CW_FLOOR;
15150 break;
15151
15152 case I387_CW_CEIL:
15153 /* round up toward +oo */
15154 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15155 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15156 slot = SLOT_CW_CEIL;
15157 break;
15158
15159 case I387_CW_MASK_PM:
15160 /* mask precision exception for nearbyint() */
15161 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15162 slot = SLOT_CW_MASK_PM;
15163 break;
15164
15165 default:
15166 gcc_unreachable ();
15167 }
15168 }
15169 else
15170 {
15171 switch (mode)
15172 {
15173 case I387_CW_TRUNC:
15174 /* round toward zero (truncate) */
15175 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15176 slot = SLOT_CW_TRUNC;
15177 break;
15178
15179 case I387_CW_FLOOR:
15180 /* round down toward -oo */
15181 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15182 slot = SLOT_CW_FLOOR;
15183 break;
15184
15185 case I387_CW_CEIL:
15186 /* round up toward +oo */
15187 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15188 slot = SLOT_CW_CEIL;
15189 break;
15190
15191 case I387_CW_MASK_PM:
15192 /* mask precision exception for nearbyint() */
15193 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15194 slot = SLOT_CW_MASK_PM;
15195 break;
15196
15197 default:
15198 gcc_unreachable ();
15199 }
15200 }
15201
15202 gcc_assert (slot < MAX_386_STACK_LOCALS);
15203
15204 new_mode = assign_386_stack_local (HImode, slot);
15205 emit_move_insn (new_mode, reg);
15206 }
15207
15208 /* Output code for INSN to convert a float to a signed int. OPERANDS
15209 are the insn operands. The output may be [HSD]Imode and the input
15210 operand may be [SDX]Fmode. */
15211
15212 const char *
15213 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15214 {
15215 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15216 int dimode_p = GET_MODE (operands[0]) == DImode;
15217 int round_mode = get_attr_i387_cw (insn);
15218
15219 /* Jump through a hoop or two for DImode, since the hardware has no
15220 non-popping instruction. We used to do this a different way, but
15221 that was somewhat fragile and broke with post-reload splitters. */
15222 if ((dimode_p || fisttp) && !stack_top_dies)
15223 output_asm_insn ("fld\t%y1", operands);
15224
15225 gcc_assert (STACK_TOP_P (operands[1]));
15226 gcc_assert (MEM_P (operands[0]));
15227 gcc_assert (GET_MODE (operands[1]) != TFmode);
15228
15229 if (fisttp)
15230 output_asm_insn ("fisttp%Z0\t%0", operands);
15231 else
15232 {
15233 if (round_mode != I387_CW_ANY)
15234 output_asm_insn ("fldcw\t%3", operands);
15235 if (stack_top_dies || dimode_p)
15236 output_asm_insn ("fistp%Z0\t%0", operands);
15237 else
15238 output_asm_insn ("fist%Z0\t%0", operands);
15239 if (round_mode != I387_CW_ANY)
15240 output_asm_insn ("fldcw\t%2", operands);
15241 }
15242
15243 return "";
15244 }
15245
15246 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15247 have the values zero or one, indicates the ffreep insn's operand
15248 from the OPERANDS array. */
15249
15250 static const char *
15251 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15252 {
15253 if (TARGET_USE_FFREEP)
15254 #ifdef HAVE_AS_IX86_FFREEP
15255 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15256 #else
15257 {
15258 static char retval[32];
15259 int regno = REGNO (operands[opno]);
15260
15261 gcc_assert (FP_REGNO_P (regno));
15262
15263 regno -= FIRST_STACK_REG;
15264
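      /* ffreep %st(N) encodes as the bytes 0xdf 0xc0+N; emitting the 16-bit
	 word 0xc<N>df below produces exactly those bytes on this
	 little-endian target when the assembler lacks the mnemonic.  */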
15265 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15266 return retval;
15267 }
15268 #endif
15269
15270 return opno ? "fstp\t%y1" : "fstp\t%y0";
15271 }
15272
15273
15274 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15275 should be used. UNORDERED_P is true when fucom should be used. */
15276
15277 const char *
15278 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15279 {
15280 int stack_top_dies;
15281 rtx cmp_op0, cmp_op1;
15282 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15283
15284 if (eflags_p)
15285 {
15286 cmp_op0 = operands[0];
15287 cmp_op1 = operands[1];
15288 }
15289 else
15290 {
15291 cmp_op0 = operands[1];
15292 cmp_op1 = operands[2];
15293 }
15294
15295 if (is_sse)
15296 {
15297 if (GET_MODE (operands[0]) == SFmode)
15298 if (unordered_p)
15299 return "%vucomiss\t{%1, %0|%0, %1}";
15300 else
15301 return "%vcomiss\t{%1, %0|%0, %1}";
15302 else
15303 if (unordered_p)
15304 return "%vucomisd\t{%1, %0|%0, %1}";
15305 else
15306 return "%vcomisd\t{%1, %0|%0, %1}";
15307 }
15308
15309 gcc_assert (STACK_TOP_P (cmp_op0));
15310
15311 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15312
15313 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15314 {
15315 if (stack_top_dies)
15316 {
15317 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15318 return output_387_ffreep (operands, 1);
15319 }
15320 else
15321 return "ftst\n\tfnstsw\t%0";
15322 }
15323
15324 if (STACK_REG_P (cmp_op1)
15325 && stack_top_dies
15326 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15327 && REGNO (cmp_op1) != FIRST_STACK_REG)
15328 {
15329       /* If the top of the 387 stack dies, and the other operand is
15330 	 also a stack register that dies, then this must be an
15331 	 `fcompp' float compare.  */
15332
15333 if (eflags_p)
15334 {
15335 /* There is no double popping fcomi variant. Fortunately,
15336 eflags is immune from the fstp's cc clobbering. */
15337 if (unordered_p)
15338 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15339 else
15340 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15341 return output_387_ffreep (operands, 0);
15342 }
15343 else
15344 {
15345 if (unordered_p)
15346 return "fucompp\n\tfnstsw\t%0";
15347 else
15348 return "fcompp\n\tfnstsw\t%0";
15349 }
15350 }
15351 else
15352 {
15353       /* Encoded here as (eflags_p << 3) | (intmode << 2) | (unordered_p << 1) | stack_top_dies.  */
15354
15355 static const char * const alt[16] =
15356 {
15357 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15358 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15359 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15360 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15361
15362 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15363 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15364 NULL,
15365 NULL,
15366
15367 "fcomi\t{%y1, %0|%0, %y1}",
15368 "fcomip\t{%y1, %0|%0, %y1}",
15369 "fucomi\t{%y1, %0|%0, %y1}",
15370 "fucomip\t{%y1, %0|%0, %y1}",
15371
15372 NULL,
15373 NULL,
15374 NULL,
15375 NULL
15376 };
15377
15378 int mask;
15379 const char *ret;
15380
15381 mask = eflags_p << 3;
15382 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15383 mask |= unordered_p << 1;
15384 mask |= stack_top_dies;
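      /* E.g. a dying-top fcomi-style unordered compare of fp operands has
	 eflags_p = 1, unordered_p = 1, stack_top_dies = 1, giving
	 mask = 8 + 2 + 1 = 11 and selecting "fucomip" above.  */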
15385
15386 gcc_assert (mask < 16);
15387 ret = alt[mask];
15388 gcc_assert (ret);
15389
15390 return ret;
15391 }
15392 }
15393
15394 void
15395 ix86_output_addr_vec_elt (FILE *file, int value)
15396 {
15397 const char *directive = ASM_LONG;
15398
15399 #ifdef ASM_QUAD
15400 if (TARGET_LP64)
15401 directive = ASM_QUAD;
15402 #else
15403 gcc_assert (!TARGET_64BIT);
15404 #endif
15405
15406 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15407 }
15408
15409 void
15410 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15411 {
15412 const char *directive = ASM_LONG;
15413
15414 #ifdef ASM_QUAD
15415 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15416 directive = ASM_QUAD;
15417 #else
15418 gcc_assert (!TARGET_64BIT);
15419 #endif
15420 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15421 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15422 fprintf (file, "%s%s%d-%s%d\n",
15423 directive, LPREFIX, value, LPREFIX, rel);
15424 else if (HAVE_AS_GOTOFF_IN_DATA)
15425 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15426 #if TARGET_MACHO
15427 else if (TARGET_MACHO)
15428 {
15429 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15430 machopic_output_function_base_name (file);
15431 putc ('\n', file);
15432 }
15433 #endif
15434 else
15435 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15436 GOT_SYMBOL_NAME, LPREFIX, value);
15437 }
15438 \f
15439 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15440 for the target. */
15441
15442 void
15443 ix86_expand_clear (rtx dest)
15444 {
15445 rtx tmp;
15446
15447 /* We play register width games, which are only valid after reload. */
15448 gcc_assert (reload_completed);
15449
15450 /* Avoid HImode and its attendant prefix byte. */
15451 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15452 dest = gen_rtx_REG (SImode, REGNO (dest));
15453 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15454
15455 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15456 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15457 {
15458 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15459 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15460 }
15461
15462 emit_insn (tmp);
15463 }
15464
15465 /* X is an unchanging MEM. If it is a constant pool reference, return
15466 the constant pool rtx, else NULL. */
15467
15468 rtx
15469 maybe_get_pool_constant (rtx x)
15470 {
15471 x = ix86_delegitimize_address (XEXP (x, 0));
15472
15473 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15474 return get_pool_constant (x);
15475
15476 return NULL_RTX;
15477 }
15478
15479 void
15480 ix86_expand_move (enum machine_mode mode, rtx operands[])
15481 {
15482 rtx op0, op1;
15483 enum tls_model model;
15484
15485 op0 = operands[0];
15486 op1 = operands[1];
15487
15488 if (GET_CODE (op1) == SYMBOL_REF)
15489 {
15490 model = SYMBOL_REF_TLS_MODEL (op1);
15491 if (model)
15492 {
15493 op1 = legitimize_tls_address (op1, model, true);
15494 op1 = force_operand (op1, op0);
15495 if (op1 == op0)
15496 return;
15497 if (GET_MODE (op1) != mode)
15498 op1 = convert_to_mode (mode, op1, 1);
15499 }
15500 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15501 && SYMBOL_REF_DLLIMPORT_P (op1))
15502 op1 = legitimize_dllimport_symbol (op1, false);
15503 }
15504 else if (GET_CODE (op1) == CONST
15505 && GET_CODE (XEXP (op1, 0)) == PLUS
15506 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15507 {
15508 rtx addend = XEXP (XEXP (op1, 0), 1);
15509 rtx symbol = XEXP (XEXP (op1, 0), 0);
15510 rtx tmp = NULL;
15511
15512 model = SYMBOL_REF_TLS_MODEL (symbol);
15513 if (model)
15514 tmp = legitimize_tls_address (symbol, model, true);
15515 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15516 && SYMBOL_REF_DLLIMPORT_P (symbol))
15517 tmp = legitimize_dllimport_symbol (symbol, true);
15518
15519 if (tmp)
15520 {
15521 tmp = force_operand (tmp, NULL);
15522 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15523 op0, 1, OPTAB_DIRECT);
15524 if (tmp == op0)
15525 return;
15526 if (GET_MODE (tmp) != mode)
15527 op1 = convert_to_mode (mode, tmp, 1);
15528 }
15529 }
15530
15531 if ((flag_pic || MACHOPIC_INDIRECT)
15532 && symbolic_operand (op1, mode))
15533 {
15534 if (TARGET_MACHO && !TARGET_64BIT)
15535 {
15536 #if TARGET_MACHO
15537 /* dynamic-no-pic */
15538 if (MACHOPIC_INDIRECT)
15539 {
15540 rtx temp = ((reload_in_progress
15541 || ((op0 && REG_P (op0))
15542 && mode == Pmode))
15543 ? op0 : gen_reg_rtx (Pmode));
15544 op1 = machopic_indirect_data_reference (op1, temp);
15545 if (MACHOPIC_PURE)
15546 op1 = machopic_legitimize_pic_address (op1, mode,
15547 temp == op1 ? 0 : temp);
15548 }
15549 if (op0 != op1 && GET_CODE (op0) != MEM)
15550 {
15551 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15552 emit_insn (insn);
15553 return;
15554 }
15555 if (GET_CODE (op0) == MEM)
15556 op1 = force_reg (Pmode, op1);
15557 else
15558 {
15559 rtx temp = op0;
15560 if (GET_CODE (temp) != REG)
15561 temp = gen_reg_rtx (Pmode);
15562 temp = legitimize_pic_address (op1, temp);
15563 if (temp == op0)
15564 return;
15565 op1 = temp;
15566 }
15567 /* dynamic-no-pic */
15568 #endif
15569 }
15570 else
15571 {
15572 if (MEM_P (op0))
15573 op1 = force_reg (mode, op1);
15574 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15575 {
15576 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15577 op1 = legitimize_pic_address (op1, reg);
15578 if (op0 == op1)
15579 return;
15580 if (GET_MODE (op1) != mode)
15581 op1 = convert_to_mode (mode, op1, 1);
15582 }
15583 }
15584 }
15585 else
15586 {
15587 if (MEM_P (op0)
15588 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15589 || !push_operand (op0, mode))
15590 && MEM_P (op1))
15591 op1 = force_reg (mode, op1);
15592
15593 if (push_operand (op0, mode)
15594 && ! general_no_elim_operand (op1, mode))
15595 op1 = copy_to_mode_reg (mode, op1);
15596
15597       /* Force large constants in 64-bit compilation into a register
15598 	 to get them CSEed.  */
15599 if (can_create_pseudo_p ()
15600 && (mode == DImode) && TARGET_64BIT
15601 && immediate_operand (op1, mode)
15602 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15603 && !register_operand (op0, mode)
15604 && optimize)
15605 op1 = copy_to_mode_reg (mode, op1);
15606
15607 if (can_create_pseudo_p ()
15608 && FLOAT_MODE_P (mode)
15609 && GET_CODE (op1) == CONST_DOUBLE)
15610 {
15611 	  /* If we are loading a floating point constant to a register,
15612 	     force the value to memory now, since we'll get better code
15613 	     out of the back end.  */
15614
15615 op1 = validize_mem (force_const_mem (mode, op1));
15616 if (!register_operand (op0, mode))
15617 {
15618 rtx temp = gen_reg_rtx (mode);
15619 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15620 emit_move_insn (op0, temp);
15621 return;
15622 }
15623 }
15624 }
15625
15626 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15627 }
15628
15629 void
15630 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15631 {
15632 rtx op0 = operands[0], op1 = operands[1];
15633 unsigned int align = GET_MODE_ALIGNMENT (mode);
15634
15635   /* Force constants other than zero into memory.  We do not know how
15636      the instructions used to build constants modify the upper 64 bits
15637      of the register; once we have that information we may be able
15638      to handle some of them more efficiently.  */
15639 if (can_create_pseudo_p ()
15640 && register_operand (op0, mode)
15641 && (CONSTANT_P (op1)
15642 || (GET_CODE (op1) == SUBREG
15643 && CONSTANT_P (SUBREG_REG (op1))))
15644 && !standard_sse_constant_p (op1))
15645 op1 = validize_mem (force_const_mem (mode, op1));
15646
15647   /* We need to check memory alignment for SSE modes since an attribute
15648      can make operands unaligned.  */
15649 if (can_create_pseudo_p ()
15650 && SSE_REG_MODE_P (mode)
15651 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15652 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15653 {
15654 rtx tmp[2];
15655
15656 /* ix86_expand_vector_move_misalign() does not like constants ... */
15657 if (CONSTANT_P (op1)
15658 || (GET_CODE (op1) == SUBREG
15659 && CONSTANT_P (SUBREG_REG (op1))))
15660 op1 = validize_mem (force_const_mem (mode, op1));
15661
15662 /* ... nor both arguments in memory. */
15663 if (!register_operand (op0, mode)
15664 && !register_operand (op1, mode))
15665 op1 = force_reg (mode, op1);
15666
15667 tmp[0] = op0; tmp[1] = op1;
15668 ix86_expand_vector_move_misalign (mode, tmp);
15669 return;
15670 }
15671
15672 /* Make operand1 a register if it isn't already. */
15673 if (can_create_pseudo_p ()
15674 && !register_operand (op0, mode)
15675 && !register_operand (op1, mode))
15676 {
15677 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15678 return;
15679 }
15680
15681 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15682 }
15683
15684 /* Split 32-byte AVX unaligned load and store if needed. */
15685
15686 static void
15687 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15688 {
15689 rtx m;
15690 rtx (*extract) (rtx, rtx, rtx);
15691 rtx (*move_unaligned) (rtx, rtx);
15692 enum machine_mode mode;
15693
15694 switch (GET_MODE (op0))
15695 {
15696 default:
15697 gcc_unreachable ();
15698 case V32QImode:
15699 extract = gen_avx_vextractf128v32qi;
15700 move_unaligned = gen_avx_movdqu256;
15701 mode = V16QImode;
15702 break;
15703 case V8SFmode:
15704 extract = gen_avx_vextractf128v8sf;
15705 move_unaligned = gen_avx_movups256;
15706 mode = V4SFmode;
15707 break;
15708 case V4DFmode:
15709 extract = gen_avx_vextractf128v4df;
15710 move_unaligned = gen_avx_movupd256;
15711 mode = V2DFmode;
15712 break;
15713 }
15714
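  /* When the corresponding tuning flag is set, split an unaligned 32-byte
     load into two 16-byte loads concatenated into the destination, and an
     unaligned 32-byte store into two vextractf128 stores of the halves;
     otherwise emit a single unaligned 256-bit move.  */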
15715 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15716 {
15717 rtx r = gen_reg_rtx (mode);
15718 m = adjust_address (op1, mode, 0);
15719 emit_move_insn (r, m);
15720 m = adjust_address (op1, mode, 16);
15721 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15722 emit_move_insn (op0, r);
15723 }
15724 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15725 {
15726 m = adjust_address (op0, mode, 0);
15727 emit_insn (extract (m, op1, const0_rtx));
15728 m = adjust_address (op0, mode, 16);
15729 emit_insn (extract (m, op1, const1_rtx));
15730 }
15731 else
15732 emit_insn (move_unaligned (op0, op1));
15733 }
15734
15735 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15736 straight to ix86_expand_vector_move. */
15737 /* Code generation for scalar reg-reg moves of single and double precision data:
15738 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15739 movaps reg, reg
15740 else
15741 movss reg, reg
15742 if (x86_sse_partial_reg_dependency == true)
15743 movapd reg, reg
15744 else
15745 movsd reg, reg
15746
15747 Code generation for scalar loads of double precision data:
15748 if (x86_sse_split_regs == true)
15749 movlpd mem, reg (gas syntax)
15750 else
15751 movsd mem, reg
15752
15753 Code generation for unaligned packed loads of single precision data
15754 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15755 if (x86_sse_unaligned_move_optimal)
15756 movups mem, reg
15757
15758 if (x86_sse_partial_reg_dependency == true)
15759 {
15760 xorps reg, reg
15761 movlps mem, reg
15762 movhps mem+8, reg
15763 }
15764 else
15765 {
15766 movlps mem, reg
15767 movhps mem+8, reg
15768 }
15769
15770 Code generation for unaligned packed loads of double precision data
15771 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15772 if (x86_sse_unaligned_move_optimal)
15773 movupd mem, reg
15774
15775 if (x86_sse_split_regs == true)
15776 {
15777 movlpd mem, reg
15778 movhpd mem+8, reg
15779 }
15780 else
15781 {
15782 movsd mem, reg
15783 movhpd mem+8, reg
15784 }
15785 */
15786
15787 void
15788 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15789 {
15790 rtx op0, op1, m;
15791
15792 op0 = operands[0];
15793 op1 = operands[1];
15794
15795 if (TARGET_AVX)
15796 {
15797 switch (GET_MODE_CLASS (mode))
15798 {
15799 case MODE_VECTOR_INT:
15800 case MODE_INT:
15801 switch (GET_MODE_SIZE (mode))
15802 {
15803 case 16:
15804 /* If we're optimizing for size, movups is the smallest. */
15805 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15806 {
15807 op0 = gen_lowpart (V4SFmode, op0);
15808 op1 = gen_lowpart (V4SFmode, op1);
15809 emit_insn (gen_sse_movups (op0, op1));
15810 return;
15811 }
15812 op0 = gen_lowpart (V16QImode, op0);
15813 op1 = gen_lowpart (V16QImode, op1);
15814 emit_insn (gen_sse2_movdqu (op0, op1));
15815 break;
15816 case 32:
15817 op0 = gen_lowpart (V32QImode, op0);
15818 op1 = gen_lowpart (V32QImode, op1);
15819 ix86_avx256_split_vector_move_misalign (op0, op1);
15820 break;
15821 default:
15822 gcc_unreachable ();
15823 }
15824 break;
15825 case MODE_VECTOR_FLOAT:
15826 op0 = gen_lowpart (mode, op0);
15827 op1 = gen_lowpart (mode, op1);
15828
15829 switch (mode)
15830 {
15831 case V4SFmode:
15832 emit_insn (gen_sse_movups (op0, op1));
15833 break;
15834 case V8SFmode:
15835 ix86_avx256_split_vector_move_misalign (op0, op1);
15836 break;
15837 case V2DFmode:
15838 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15839 {
15840 op0 = gen_lowpart (V4SFmode, op0);
15841 op1 = gen_lowpart (V4SFmode, op1);
15842 emit_insn (gen_sse_movups (op0, op1));
15843 return;
15844 }
15845 emit_insn (gen_sse2_movupd (op0, op1));
15846 break;
15847 case V4DFmode:
15848 ix86_avx256_split_vector_move_misalign (op0, op1);
15849 break;
15850 default:
15851 gcc_unreachable ();
15852 }
15853 break;
15854
15855 default:
15856 gcc_unreachable ();
15857 }
15858
15859 return;
15860 }
15861
15862 if (MEM_P (op1))
15863 {
15864 /* If we're optimizing for size, movups is the smallest. */
15865 if (optimize_insn_for_size_p ()
15866 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15867 {
15868 op0 = gen_lowpart (V4SFmode, op0);
15869 op1 = gen_lowpart (V4SFmode, op1);
15870 emit_insn (gen_sse_movups (op0, op1));
15871 return;
15872 }
15873
15874 /* ??? If we have typed data, then it would appear that using
15875 movdqu is the only way to get unaligned data loaded with
15876 integer type. */
15877 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15878 {
15879 op0 = gen_lowpart (V16QImode, op0);
15880 op1 = gen_lowpart (V16QImode, op1);
15881 emit_insn (gen_sse2_movdqu (op0, op1));
15882 return;
15883 }
15884
15885 if (TARGET_SSE2 && mode == V2DFmode)
15886 {
15887 rtx zero;
15888
15889 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15890 {
15891 op0 = gen_lowpart (V2DFmode, op0);
15892 op1 = gen_lowpart (V2DFmode, op1);
15893 emit_insn (gen_sse2_movupd (op0, op1));
15894 return;
15895 }
15896
15897 /* When SSE registers are split into halves, we can avoid
15898 writing to the top half twice. */
15899 if (TARGET_SSE_SPLIT_REGS)
15900 {
15901 emit_clobber (op0);
15902 zero = op0;
15903 }
15904 else
15905 {
15906 /* ??? Not sure about the best option for the Intel chips.
15907 The following would seem to satisfy; the register is
15908 entirely cleared, breaking the dependency chain. We
15909 then store to the upper half, with a dependency depth
15910 of one. A rumor has it that Intel recommends two movsd
15911 followed by an unpacklpd, but this is unconfirmed. And
15912 given that the dependency depth of the unpacklpd would
15913 still be one, I'm not sure why this would be better. */
15914 zero = CONST0_RTX (V2DFmode);
15915 }
15916
15917 m = adjust_address (op1, DFmode, 0);
15918 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15919 m = adjust_address (op1, DFmode, 8);
15920 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15921 }
15922 else
15923 {
15924 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15925 {
15926 op0 = gen_lowpart (V4SFmode, op0);
15927 op1 = gen_lowpart (V4SFmode, op1);
15928 emit_insn (gen_sse_movups (op0, op1));
15929 return;
15930 }
15931
15932 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15933 emit_move_insn (op0, CONST0_RTX (mode));
15934 else
15935 emit_clobber (op0);
15936
15937 if (mode != V4SFmode)
15938 op0 = gen_lowpart (V4SFmode, op0);
15939 m = adjust_address (op1, V2SFmode, 0);
15940 emit_insn (gen_sse_loadlps (op0, op0, m));
15941 m = adjust_address (op1, V2SFmode, 8);
15942 emit_insn (gen_sse_loadhps (op0, op0, m));
15943 }
15944 }
15945 else if (MEM_P (op0))
15946 {
15947 /* If we're optimizing for size, movups is the smallest. */
15948 if (optimize_insn_for_size_p ()
15949 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15950 {
15951 op0 = gen_lowpart (V4SFmode, op0);
15952 op1 = gen_lowpart (V4SFmode, op1);
15953 emit_insn (gen_sse_movups (op0, op1));
15954 return;
15955 }
15956
15957 /* ??? Similar to above, only less clear because of quote
15958 typeless stores unquote. */
15959 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
15960 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15961 {
15962 op0 = gen_lowpart (V16QImode, op0);
15963 op1 = gen_lowpart (V16QImode, op1);
15964 emit_insn (gen_sse2_movdqu (op0, op1));
15965 return;
15966 }
15967
15968 if (TARGET_SSE2 && mode == V2DFmode)
15969 {
15970 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15971 {
15972 op0 = gen_lowpart (V2DFmode, op0);
15973 op1 = gen_lowpart (V2DFmode, op1);
15974 emit_insn (gen_sse2_movupd (op0, op1));
15975 }
15976 else
15977 {
15978 m = adjust_address (op0, DFmode, 0);
15979 emit_insn (gen_sse2_storelpd (m, op1));
15980 m = adjust_address (op0, DFmode, 8);
15981 emit_insn (gen_sse2_storehpd (m, op1));
15982 }
15983 }
15984 else
15985 {
15986 if (mode != V4SFmode)
15987 op1 = gen_lowpart (V4SFmode, op1);
15988
15989 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15990 {
15991 op0 = gen_lowpart (V4SFmode, op0);
15992 emit_insn (gen_sse_movups (op0, op1));
15993 }
15994 else
15995 {
15996 m = adjust_address (op0, V2SFmode, 0);
15997 emit_insn (gen_sse_storelps (m, op1));
15998 m = adjust_address (op0, V2SFmode, 8);
15999 emit_insn (gen_sse_storehps (m, op1));
16000 }
16001 }
16002 }
16003 else
16004 gcc_unreachable ();
16005 }
16006
16007 /* Expand a push in MODE. This is some mode for which we do not support
16008 proper push instructions, at least from the registers that we expect
16009 the value to live in. */
16010
16011 void
16012 ix86_expand_push (enum machine_mode mode, rtx x)
16013 {
16014 rtx tmp;
16015
16016 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16017 GEN_INT (-GET_MODE_SIZE (mode)),
16018 stack_pointer_rtx, 1, OPTAB_DIRECT);
16019 if (tmp != stack_pointer_rtx)
16020 emit_move_insn (stack_pointer_rtx, tmp);
16021
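  /* The push is open-coded as a stack pointer adjustment followed by a
     store to the new top of stack.  */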
16022 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16023
16024   /* When we push an operand onto the stack, it has to be aligned at least
16025      at the function argument boundary.  However, since we don't have
16026      the argument type, we can't determine the actual argument
16027      boundary.  */
16028 emit_move_insn (tmp, x);
16029 }
16030
16031 /* Helper function of ix86_fixup_binary_operands to canonicalize
16032 operand order. Returns true if the operands should be swapped. */
16033
16034 static bool
16035 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16036 rtx operands[])
16037 {
16038 rtx dst = operands[0];
16039 rtx src1 = operands[1];
16040 rtx src2 = operands[2];
16041
16042 /* If the operation is not commutative, we can't do anything. */
16043 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16044 return false;
16045
16046 /* Highest priority is that src1 should match dst. */
16047 if (rtx_equal_p (dst, src1))
16048 return false;
16049 if (rtx_equal_p (dst, src2))
16050 return true;
16051
16052 /* Next highest priority is that immediate constants come second. */
16053 if (immediate_operand (src2, mode))
16054 return false;
16055 if (immediate_operand (src1, mode))
16056 return true;
16057
16058 /* Lowest priority is that memory references should come second. */
16059 if (MEM_P (src2))
16060 return false;
16061 if (MEM_P (src1))
16062 return true;
16063
16064 return false;
16065 }
16066
16067
16068 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16069 destination to use for the operation. If different from the true
16070 destination in operands[0], a copy operation will be required. */
16071
16072 rtx
16073 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16074 rtx operands[])
16075 {
16076 rtx dst = operands[0];
16077 rtx src1 = operands[1];
16078 rtx src2 = operands[2];
16079
16080 /* Canonicalize operand order. */
16081 if (ix86_swap_binary_operands_p (code, mode, operands))
16082 {
16083 rtx temp;
16084
16085 /* It is invalid to swap operands of different modes. */
16086 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16087
16088 temp = src1;
16089 src1 = src2;
16090 src2 = temp;
16091 }
16092
16093 /* Both source operands cannot be in memory. */
16094 if (MEM_P (src1) && MEM_P (src2))
16095 {
16096 /* Optimization: Only read from memory once. */
16097 if (rtx_equal_p (src1, src2))
16098 {
16099 src2 = force_reg (mode, src2);
16100 src1 = src2;
16101 }
16102 else
16103 src2 = force_reg (mode, src2);
16104 }
16105
16106 /* If the destination is memory, and we do not have matching source
16107 operands, do things in registers. */
16108 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16109 dst = gen_reg_rtx (mode);
16110
16111 /* Source 1 cannot be a constant. */
16112 if (CONSTANT_P (src1))
16113 src1 = force_reg (mode, src1);
16114
16115 /* Source 1 cannot be a non-matching memory. */
16116 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16117 src1 = force_reg (mode, src1);
16118
16119 /* Improve address combine. */
16120 if (code == PLUS
16121 && GET_MODE_CLASS (mode) == MODE_INT
16122 && MEM_P (src2))
16123 src2 = force_reg (mode, src2);
16124
16125 operands[1] = src1;
16126 operands[2] = src2;
16127 return dst;
16128 }
16129
16130 /* Similarly, but assume that the destination has already been
16131 set up properly. */
16132
16133 void
16134 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16135 enum machine_mode mode, rtx operands[])
16136 {
16137 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16138 gcc_assert (dst == operands[0]);
16139 }
16140
16141 /* Attempt to expand a binary operator. Make the expansion closer to the
16142 actual machine, than just general_operand, which would allow 3 separate
16143 memory references (one output, two input) in a single insn. */
16144
16145 void
16146 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16147 rtx operands[])
16148 {
16149 rtx src1, src2, dst, op, clob;
16150
16151 dst = ix86_fixup_binary_operands (code, mode, operands);
16152 src1 = operands[1];
16153 src2 = operands[2];
16154
16155 /* Emit the instruction. */
16156
16157 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16158 if (reload_in_progress)
16159 {
16160 /* Reload doesn't know about the flags register, and doesn't know that
16161 it doesn't want to clobber it. We can only do this with PLUS. */
16162 gcc_assert (code == PLUS);
16163 emit_insn (op);
16164 }
16165 else if (reload_completed
16166 && code == PLUS
16167 && !rtx_equal_p (dst, src1))
16168 {
16169 /* This is going to be an LEA; avoid splitting it later. */
16170 emit_insn (op);
16171 }
16172 else
16173 {
16174 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16175 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16176 }
16177
16178 /* Fix up the destination if needed. */
16179 if (dst != operands[0])
16180 emit_move_insn (operands[0], dst);
16181 }
16182
16183 /* Return TRUE or FALSE depending on whether the binary operator meets the
16184 appropriate constraints. */
16185
16186 bool
16187 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16188 rtx operands[3])
16189 {
16190 rtx dst = operands[0];
16191 rtx src1 = operands[1];
16192 rtx src2 = operands[2];
16193
16194 /* Both source operands cannot be in memory. */
16195 if (MEM_P (src1) && MEM_P (src2))
16196 return false;
16197
16198 /* Canonicalize operand order for commutative operators. */
16199 if (ix86_swap_binary_operands_p (code, mode, operands))
16200 {
16201 rtx temp = src1;
16202 src1 = src2;
16203 src2 = temp;
16204 }
16205
16206 /* If the destination is memory, we must have a matching source operand. */
16207 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16208 return false;
16209
16210 /* Source 1 cannot be a constant. */
16211 if (CONSTANT_P (src1))
16212 return false;
16213
16214 /* Source 1 cannot be a non-matching memory. */
16215 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16216 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16217 return (code == AND
16218 && (mode == HImode
16219 || mode == SImode
16220 || (TARGET_64BIT && mode == DImode))
16221 && satisfies_constraint_L (src2));
16222
16223 return true;
16224 }
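/* As an illustration of the AND special case above: a non-matching memory
   source such as (set (reg:SI x) (and:SI (mem:SI y) (const_int 0xff)))
   is accepted because it can be emitted as a zero-extending load
   (movzbl) rather than a read-modify-write and.  */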
16225
16226 /* Attempt to expand a unary operator. Make the expansion closer to the
16227 actual machine, than just general_operand, which would allow 2 separate
16228 memory references (one output, one input) in a single insn. */
16229
16230 void
16231 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16232 rtx operands[])
16233 {
16234 int matching_memory;
16235 rtx src, dst, op, clob;
16236
16237 dst = operands[0];
16238 src = operands[1];
16239
16240 /* If the destination is memory, and we do not have matching source
16241 operands, do things in registers. */
16242 matching_memory = 0;
16243 if (MEM_P (dst))
16244 {
16245 if (rtx_equal_p (dst, src))
16246 matching_memory = 1;
16247 else
16248 dst = gen_reg_rtx (mode);
16249 }
16250
16251 /* When source operand is memory, destination must match. */
16252 if (MEM_P (src) && !matching_memory)
16253 src = force_reg (mode, src);
16254
16255 /* Emit the instruction. */
16256
16257 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16258 if (reload_in_progress || code == NOT)
16259 {
16260 /* Reload doesn't know about the flags register, and doesn't know that
16261 it doesn't want to clobber it. */
16262 gcc_assert (code == NOT);
16263 emit_insn (op);
16264 }
16265 else
16266 {
16267 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16268 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16269 }
16270
16271 /* Fix up the destination if needed. */
16272 if (dst != operands[0])
16273 emit_move_insn (operands[0], dst);
16274 }
16275
16276 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16277 divisor are within the range [0-255]. */
16278
16279 void
16280 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16281 bool signed_p)
16282 {
16283 rtx end_label, qimode_label;
16284 rtx insn, div, mod;
16285 rtx scratch, tmp0, tmp1, tmp2;
16286 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16287 rtx (*gen_zero_extend) (rtx, rtx);
16288 rtx (*gen_test_ccno_1) (rtx, rtx);
16289
16290 switch (mode)
16291 {
16292 case SImode:
16293 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16294 gen_test_ccno_1 = gen_testsi_ccno_1;
16295 gen_zero_extend = gen_zero_extendqisi2;
16296 break;
16297 case DImode:
16298 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16299 gen_test_ccno_1 = gen_testdi_ccno_1;
16300 gen_zero_extend = gen_zero_extendqidi2;
16301 break;
16302 default:
16303 gcc_unreachable ();
16304 }
16305
16306 end_label = gen_label_rtx ();
16307 qimode_label = gen_label_rtx ();
16308
16309 scratch = gen_reg_rtx (mode);
16310
16311 /* Use 8bit unsigned divmod if dividend and divisor are within
16312 the range [0-255]. */
16313 emit_move_insn (scratch, operands[2]);
16314 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16315 scratch, 1, OPTAB_DIRECT);
16316 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16317 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16318 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16319 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16320 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16321 pc_rtx);
16322 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16323 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16324 JUMP_LABEL (insn) = qimode_label;
16325
16326 /* Generate original signed/unsigned divmod.  */
16327 div = gen_divmod4_1 (operands[0], operands[1],
16328 operands[2], operands[3]);
16329 emit_insn (div);
16330
16331 /* Branch to the end. */
16332 emit_jump_insn (gen_jump (end_label));
16333 emit_barrier ();
16334
16335 /* Generate 8bit unsigned divide. */
16336 emit_label (qimode_label);
16337 /* Don't use operands[0] for result of 8bit divide since not all
16338 registers support QImode ZERO_EXTRACT. */
16339 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16340 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16341 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16342 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16343
16344 if (signed_p)
16345 {
16346 div = gen_rtx_DIV (mode, operands[2], operands[3]);
16347 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
16348 }
16349 else
16350 {
16351 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
16352 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
16353 }
16354
16355 /* Extract remainder from AH. */
16356 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16357 if (REG_P (operands[1]))
16358 insn = emit_move_insn (operands[1], tmp1);
16359 else
16360 {
16361 /* Need a new scratch register since the old one has result
16362 of 8bit divide. */
16363 scratch = gen_reg_rtx (mode);
16364 emit_move_insn (scratch, tmp1);
16365 insn = emit_move_insn (operands[1], scratch);
16366 }
16367 set_unique_reg_note (insn, REG_EQUAL, mod);
16368
16369 /* Zero extend quotient from AL. */
16370 tmp1 = gen_lowpart (QImode, tmp0);
16371 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16372 set_unique_reg_note (insn, REG_EQUAL, div);
16373
16374 emit_label (end_label);
16375 }
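/* For the SImode case the emitted sequence is roughly:

       scratch = op2 | op3
       test    scratch, 0xffffff00
       je      .Lqimode                ; both inputs fit in 8 bits
       <full 32-bit (u)divmod>         ; quotient -> op0, remainder -> op1
       jmp     .Lend
   .Lqimode:
       <8-bit unsigned divide>         ; quotient in AL, remainder in AH
       op0 = zero_extend (AL)
       op1 = AH (zero-extended)
   .Lend:
*/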
16376
16377 #define LEA_MAX_STALL (3)
16378 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16379
16380 /* Increase given DISTANCE in half-cycles according to
16381 dependencies between PREV and NEXT instructions.
16382 Add 1 half-cycle if there is no dependency and
16383 go to the next cycle if there is some dependency.  */
16384
16385 static unsigned int
16386 increase_distance (rtx prev, rtx next, unsigned int distance)
16387 {
16388 df_ref *use_rec;
16389 df_ref *def_rec;
16390
16391 if (!prev || !next)
16392 return distance + (distance & 1) + 2;
16393
16394 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16395 return distance + 1;
16396
16397 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16398 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16399 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16400 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16401 return distance + (distance & 1) + 2;
16402
16403 return distance + 1;
16404 }
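/* For example, if PREV sets a register that NEXT reads, DISTANCE is
   rounded up to a whole cycle (an even number of half-cycles) and one
   more full cycle is added; with no dependency only one half-cycle is
   added.  */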
16405
16406 /* Function checks if instruction INSN defines register number
16407 REGNO1 or REGNO2. */
16408
16409 static bool
16410 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16411 rtx insn)
16412 {
16413 df_ref *def_rec;
16414
16415 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16416 if (DF_REF_REG_DEF_P (*def_rec)
16417 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16418 && (regno1 == DF_REF_REGNO (*def_rec)
16419 || regno2 == DF_REF_REGNO (*def_rec)))
16420 {
16421 return true;
16422 }
16423
16424 return false;
16425 }
16426
16427 /* Function checks if instruction INSN uses register number
16428 REGNO as part of an address expression.  */
16429
16430 static bool
16431 insn_uses_reg_mem (unsigned int regno, rtx insn)
16432 {
16433 df_ref *use_rec;
16434
16435 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16436 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16437 return true;
16438
16439 return false;
16440 }
16441
16442 /* Search backward for non-agu definition of register number REGNO1
16443 or register number REGNO2 in basic block starting from instruction
16444 START up to head of basic block or instruction INSN.
16445
16446 The function stores true in *FOUND if a definition was found
16447 and false otherwise.
16448
16449 Distance in half-cycles between START and found instruction or head
16450 of BB is added to DISTANCE and returned. */
16451
16452 static int
16453 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16454 rtx insn, int distance,
16455 rtx start, bool *found)
16456 {
16457 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16458 rtx prev = start;
16459 rtx next = NULL;
16460 enum attr_type insn_type;
16461
16462 *found = false;
16463
16464 while (prev
16465 && prev != insn
16466 && distance < LEA_SEARCH_THRESHOLD)
16467 {
16468 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16469 {
16470 distance = increase_distance (prev, next, distance);
16471 if (insn_defines_reg (regno1, regno2, prev))
16472 {
16473 insn_type = get_attr_type (prev);
16474 if (insn_type != TYPE_LEA)
16475 {
16476 *found = true;
16477 return distance;
16478 }
16479 }
16480
16481 next = prev;
16482 }
16483 if (prev == BB_HEAD (bb))
16484 break;
16485
16486 prev = PREV_INSN (prev);
16487 }
16488
16489 return distance;
16490 }
16491
16492 /* Search backward for non-agu definition of register number REGNO1
16493 or register number REGNO2 in INSN's basic block until
16494 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16495 2. Reach a neighbouring BB's boundary, or
16496 3. Reach agu definition.
16497 Returns the distance between the non-agu definition point and INSN.
16498 If no definition point, returns -1. */
16499
16500 static int
16501 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16502 rtx insn)
16503 {
16504 basic_block bb = BLOCK_FOR_INSN (insn);
16505 int distance = 0;
16506 bool found = false;
16507
16508 if (insn != BB_HEAD (bb))
16509 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16510 distance, PREV_INSN (insn),
16511 &found);
16512
16513 if (!found && distance < LEA_SEARCH_THRESHOLD)
16514 {
16515 edge e;
16516 edge_iterator ei;
16517 bool simple_loop = false;
16518
16519 FOR_EACH_EDGE (e, ei, bb->preds)
16520 if (e->src == bb)
16521 {
16522 simple_loop = true;
16523 break;
16524 }
16525
16526 if (simple_loop)
16527 distance = distance_non_agu_define_in_bb (regno1, regno2,
16528 insn, distance,
16529 BB_END (bb), &found);
16530 else
16531 {
16532 int shortest_dist = -1;
16533 bool found_in_bb = false;
16534
16535 FOR_EACH_EDGE (e, ei, bb->preds)
16536 {
16537 int bb_dist
16538 = distance_non_agu_define_in_bb (regno1, regno2,
16539 insn, distance,
16540 BB_END (e->src),
16541 &found_in_bb);
16542 if (found_in_bb)
16543 {
16544 if (shortest_dist < 0)
16545 shortest_dist = bb_dist;
16546 else if (bb_dist > 0)
16547 shortest_dist = MIN (bb_dist, shortest_dist);
16548
16549 found = true;
16550 }
16551 }
16552
16553 distance = shortest_dist;
16554 }
16555 }
16556
16557 /* get_attr_type may modify recog data. We want to make sure
16558 that recog data is valid for instruction INSN, on which
16559 distance_non_agu_define is called. INSN is unchanged here. */
16560 extract_insn_cached (insn);
16561
16562 if (!found)
16563 return -1;
16564
16565 return distance >> 1;
16566 }
16567
16568 /* Return the distance in half-cycles between INSN and the next
16569 insn that uses register number REGNO in a memory address, added
16570 to DISTANCE.  Return -1 if REGNO is set.
16571
16572 Store true in *FOUND if a register usage was found and
16573 false otherwise.
16574 Store true in *REDEFINED if a register redefinition was
16575 found and false otherwise.  */
16576
16577 static int
16578 distance_agu_use_in_bb (unsigned int regno,
16579 rtx insn, int distance, rtx start,
16580 bool *found, bool *redefined)
16581 {
16582 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16583 rtx next = start;
16584 rtx prev = NULL;
16585
16586 *found = false;
16587 *redefined = false;
16588
16589 while (next
16590 && next != insn
16591 && distance < LEA_SEARCH_THRESHOLD)
16592 {
16593 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16594 {
16595 distance = increase_distance (prev, next, distance);
16596 if (insn_uses_reg_mem (regno, next))
16597 {
16598 /* Return DISTANCE if OP0 is used in memory
16599 address in NEXT. */
16600 *found = true;
16601 return distance;
16602 }
16603
16604 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16605 {
16606 /* Return -1 if OP0 is set in NEXT. */
16607 *redefined = true;
16608 return -1;
16609 }
16610
16611 prev = next;
16612 }
16613
16614 if (next == BB_END (bb))
16615 break;
16616
16617 next = NEXT_INSN (next);
16618 }
16619
16620 return distance;
16621 }
16622
16623 /* Return the distance between INSN and the next insn that uses
16624 register number REGNO0 in a memory address.  Return -1 if no such
16625 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set.  */
16626
16627 static int
16628 distance_agu_use (unsigned int regno0, rtx insn)
16629 {
16630 basic_block bb = BLOCK_FOR_INSN (insn);
16631 int distance = 0;
16632 bool found = false;
16633 bool redefined = false;
16634
16635 if (insn != BB_END (bb))
16636 distance = distance_agu_use_in_bb (regno0, insn, distance,
16637 NEXT_INSN (insn),
16638 &found, &redefined);
16639
16640 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16641 {
16642 edge e;
16643 edge_iterator ei;
16644 bool simple_loop = false;
16645
16646 FOR_EACH_EDGE (e, ei, bb->succs)
16647 if (e->dest == bb)
16648 {
16649 simple_loop = true;
16650 break;
16651 }
16652
16653 if (simple_loop)
16654 distance = distance_agu_use_in_bb (regno0, insn,
16655 distance, BB_HEAD (bb),
16656 &found, &redefined);
16657 else
16658 {
16659 int shortest_dist = -1;
16660 bool found_in_bb = false;
16661 bool redefined_in_bb = false;
16662
16663 FOR_EACH_EDGE (e, ei, bb->succs)
16664 {
16665 int bb_dist
16666 = distance_agu_use_in_bb (regno0, insn,
16667 distance, BB_HEAD (e->dest),
16668 &found_in_bb, &redefined_in_bb);
16669 if (found_in_bb)
16670 {
16671 if (shortest_dist < 0)
16672 shortest_dist = bb_dist;
16673 else if (bb_dist > 0)
16674 shortest_dist = MIN (bb_dist, shortest_dist);
16675
16676 found = true;
16677 }
16678 }
16679
16680 distance = shortest_dist;
16681 }
16682 }
16683
16684 if (!found || redefined)
16685 return -1;
16686
16687 return distance >> 1;
16688 }
16689
16690 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16691 there is a dilemma of choosing LEA or ADD.
16692 Negative value: ADD is preferred over LEA.
16693 Zero: Neutral.
16694 Positive value: LEA is preferred over ADD.  */
16695 #define IX86_LEA_PRIORITY 0
16696
16697 /* Return true if using lea INSN has a performance advantage
16698 over a sequence of instructions.  The instruction sequence has
16699 SPLIT_COST cycles higher latency than the lea.  */
16700
16701 bool
16702 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16703 unsigned int regno2, unsigned int split_cost)
16704 {
16705 int dist_define, dist_use;
16706
16707 dist_define = distance_non_agu_define (regno1, regno2, insn);
16708 dist_use = distance_agu_use (regno0, insn);
16709
16710 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16711 {
16712 /* If there is no non-AGU operand definition, no AGU
16713 operand usage and the split cost is 0, then both the lea
16714 and non-lea variants have the same priority.  Currently
16715 we prefer lea for 64-bit code and non-lea for 32-bit
16716 code.  */
16717 if (dist_use < 0 && split_cost == 0)
16718 return TARGET_64BIT || IX86_LEA_PRIORITY;
16719 else
16720 return true;
16721 }
16722
16723 /* The longer the definition distance, the more preferable lea is.
16724 Adjust it here to take the splitting cost and the
16725 lea priority into account.  */
16726 dist_define += split_cost + IX86_LEA_PRIORITY;
16727
16728 /* If there is no use in a memory address then we just check
16729 that split cost does not exceed AGU stall. */
16730 if (dist_use < 0)
16731 return dist_define >= LEA_MAX_STALL;
16732
16733 /* If this insn has both backward non-agu dependence and forward
16734 agu dependence, the one with short distance takes effect. */
16735 return dist_define >= dist_use;
16736 }
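/* For example, with LEA_MAX_STALL == 3: if an input register was set by
   a non-AGU instruction one cycle before this insn (dist_define == 1)
   and the result feeds a memory address one cycle later (dist_use == 1),
   then a SPLIT_COST of 1 gives an adjusted dist_define of 2 >= dist_use,
   so the lea is considered to outperform the split sequence.  */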
16737
16738 /* Return true if it is legal to clobber flags by INSN and
16739 false otherwise. */
16740
16741 static bool
16742 ix86_ok_to_clobber_flags (rtx insn)
16743 {
16744 basic_block bb = BLOCK_FOR_INSN (insn);
16745 df_ref *use;
16746 bitmap live;
16747
16748 while (insn)
16749 {
16750 if (NONDEBUG_INSN_P (insn))
16751 {
16752 for (use = DF_INSN_USES (insn); *use; use++)
16753 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16754 return false;
16755
16756 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16757 return true;
16758 }
16759
16760 if (insn == BB_END (bb))
16761 break;
16762
16763 insn = NEXT_INSN (insn);
16764 }
16765
16766 live = df_get_live_out (bb);
16767 return !REGNO_REG_SET_P (live, FLAGS_REG);
16768 }
16769
16770 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16771 move and add to avoid AGU stalls. */
16772
16773 bool
16774 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16775 {
16776 unsigned int regno0 = true_regnum (operands[0]);
16777 unsigned int regno1 = true_regnum (operands[1]);
16778 unsigned int regno2 = true_regnum (operands[2]);
16779
16780 /* Check if we need to optimize. */
16781 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16782 return false;
16783
16784 /* Check it is correct to split here. */
16785 if (!ix86_ok_to_clobber_flags (insn))
16786 return false;
16787
16788 /* We only need to split adds with a non-destructive
16789 destination operand.  */
16790 if (regno0 == regno1 || regno0 == regno2)
16791 return false;
16792 else
16793 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16794 }
16795
16796 /* Return true if we should emit lea instruction instead of mov
16797 instruction. */
16798
16799 bool
16800 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16801 {
16802 unsigned int regno0;
16803 unsigned int regno1;
16804
16805 /* Check if we need to optimize. */
16806 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16807 return false;
16808
16809 /* Use lea for reg to reg moves only. */
16810 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16811 return false;
16812
16813 regno0 = true_regnum (operands[0]);
16814 regno1 = true_regnum (operands[1]);
16815
16816 return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
16817 }
16818
16819 /* Return true if we need to split lea into a sequence of
16820 instructions to avoid AGU stalls. */
16821
16822 bool
16823 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16824 {
16825 unsigned int regno0 = true_regnum (operands[0]);
16826 unsigned int regno1 = -1;
16827 unsigned int regno2 = -1;
16828 unsigned int split_cost = 0;
16829 struct ix86_address parts;
16830 int ok;
16831
16832 /* Check we need to optimize. */
16833 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16834 return false;
16835
16836 /* Check it is correct to split here. */
16837 if (!ix86_ok_to_clobber_flags (insn))
16838 return false;
16839
16840 ok = ix86_decompose_address (operands[1], &parts);
16841 gcc_assert (ok);
16842
16843 /* We should not split into add if a non-legitimate PIC
16844 operand is used as the displacement.  */
16845 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16846 return false;
16847
16848 if (parts.base)
16849 regno1 = true_regnum (parts.base);
16850 if (parts.index)
16851 regno2 = true_regnum (parts.index);
16852
16853 /* Compute how many cycles we will add to execution time
16854 if we split the lea into a sequence of instructions.  */
16855 if (parts.base || parts.index)
16856 {
16857 /* Have to use a mov instruction if the non-destructive
16858 destination form is used. */
16859 if (regno1 != regno0 && regno2 != regno0)
16860 split_cost += 1;
16861
16862 /* Have to add index to base if both exist. */
16863 if (parts.base && parts.index)
16864 split_cost += 1;
16865
16866 /* Have to use shift and adds if scale is 2 or greater. */
16867 if (parts.scale > 1)
16868 {
16869 if (regno0 != regno1)
16870 split_cost += 1;
16871 else if (regno2 == regno0)
16872 split_cost += 4;
16873 else
16874 split_cost += parts.scale;
16875 }
16876
16877 /* Have to use add instruction with immediate if
16878 disp is non-zero.  */
16879 if (parts.disp && parts.disp != const0_rtx)
16880 split_cost += 1;
16881
16882 /* Subtract the price of lea. */
16883 split_cost -= 1;
16884 }
16885
16886 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16887 }
16888
16889 /* Emit x86 binary operand CODE in mode MODE, where the first operand
16890 matches destination. RTX includes clobber of FLAGS_REG. */
16891
16892 static void
16893 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16894 rtx dst, rtx src)
16895 {
16896 rtx op, clob;
16897
16898 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16899 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16900
16901 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16902 }
16903
16904 /* Split lea instructions into a sequence of instructions
16905 which are executed on ALU to avoid AGU stalls.
16906 It is assumed that it is allowed to clobber flags register
16907 at lea position. */
16908
16909 void
16910 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16911 {
16912 unsigned int regno0 = true_regnum (operands[0]);
16913 unsigned int regno1 = INVALID_REGNUM;
16914 unsigned int regno2 = INVALID_REGNUM;
16915 struct ix86_address parts;
16916 rtx tmp;
16917 int ok, adds;
16918
16919 ok = ix86_decompose_address (operands[1], &parts);
16920 gcc_assert (ok);
16921
16922 if (parts.base)
16923 {
16924 if (GET_MODE (parts.base) != mode)
16925 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16926 regno1 = true_regnum (parts.base);
16927 }
16928
16929 if (parts.index)
16930 {
16931 if (GET_MODE (parts.index) != mode)
16932 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16933 regno2 = true_regnum (parts.index);
16934 }
16935
16936 if (parts.scale > 1)
16937 {
16938 /* Case r1 = r1 + ... */
16939 if (regno1 == regno0)
16940 {
16941 /* If we have a case r1 = r1 + C * r1 then we
16942 should use multiplication which is very
16943 expensive.  Assume the cost model is wrong if we
16944 have such a case here.  */
16945 gcc_assert (regno2 != regno0);
16946
16947 for (adds = parts.scale; adds > 0; adds--)
16948 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16949 }
16950 else
16951 {
16952 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
16953 if (regno0 != regno2)
16954 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16955
16956 /* Use shift for scaling. */
16957 ix86_emit_binop (ASHIFT, mode, operands[0],
16958 GEN_INT (exact_log2 (parts.scale)));
16959
16960 if (parts.base)
16961 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
16962
16963 if (parts.disp && parts.disp != const0_rtx)
16964 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16965 }
16966 }
16967 else if (!parts.base && !parts.index)
16968 {
16969 gcc_assert (parts.disp);
16970 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
16971 }
16972 else
16973 {
16974 if (!parts.base)
16975 {
16976 if (regno0 != regno2)
16977 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16978 }
16979 else if (!parts.index)
16980 {
16981 if (regno0 != regno1)
16982 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16983 }
16984 else
16985 {
16986 if (regno0 == regno1)
16987 tmp = parts.index;
16988 else if (regno0 == regno2)
16989 tmp = parts.base;
16990 else
16991 {
16992 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16993 tmp = parts.index;
16994 }
16995
16996 ix86_emit_binop (PLUS, mode, operands[0], tmp);
16997 }
16998
16999 if (parts.disp && parts.disp != const0_rtx)
17000 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
17001 }
17002 }
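/* For example, an SImode lea with base %ebx, index %ecx, scale 4 and
   displacement 4, whose destination %eax differs from both inputs, is
   split into roughly:

       mov   %ecx, %eax
       shl   $2, %eax
       add   %ebx, %eax
       add   $4, %eax
*/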
17003
17004 /* Return true if it is ok to optimize an ADD operation to a LEA
17005 operation to avoid flags register consumption.  For most processors,
17006 ADD is faster than LEA.  For processors like ATOM, if the
17007 destination register of LEA holds an actual address which will be
17008 used soon, LEA is better and otherwise ADD is better. */
17009
17010 bool
17011 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17012 {
17013 unsigned int regno0 = true_regnum (operands[0]);
17014 unsigned int regno1 = true_regnum (operands[1]);
17015 unsigned int regno2 = true_regnum (operands[2]);
17016
17017 /* If a = b + c, (a != b && a != c), we must use the lea form.  */
17018 if (regno0 != regno1 && regno0 != regno2)
17019 return true;
17020
17021 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17022 return false;
17023
17024 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17025 }
17026
17027 /* Return true if destination reg of SET_BODY is shift count of
17028 USE_BODY. */
17029
17030 static bool
17031 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17032 {
17033 rtx set_dest;
17034 rtx shift_rtx;
17035 int i;
17036
17037 /* Retrieve destination of SET_BODY. */
17038 switch (GET_CODE (set_body))
17039 {
17040 case SET:
17041 set_dest = SET_DEST (set_body);
17042 if (!set_dest || !REG_P (set_dest))
17043 return false;
17044 break;
17045 case PARALLEL:
17046 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17047 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17048 use_body))
17049 return true;
17050 default:
17051 return false;
17052 break;
17053 }
17054
17055 /* Retrieve shift count of USE_BODY. */
17056 switch (GET_CODE (use_body))
17057 {
17058 case SET:
17059 shift_rtx = XEXP (use_body, 1);
17060 break;
17061 case PARALLEL:
17062 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17063 if (ix86_dep_by_shift_count_body (set_body,
17064 XVECEXP (use_body, 0, i)))
17065 return true;
17066 default:
17067 return false;
17068 break;
17069 }
17070
17071 if (shift_rtx
17072 && (GET_CODE (shift_rtx) == ASHIFT
17073 || GET_CODE (shift_rtx) == LSHIFTRT
17074 || GET_CODE (shift_rtx) == ASHIFTRT
17075 || GET_CODE (shift_rtx) == ROTATE
17076 || GET_CODE (shift_rtx) == ROTATERT))
17077 {
17078 rtx shift_count = XEXP (shift_rtx, 1);
17079
17080 /* Return true if shift count is dest of SET_BODY. */
17081 if (REG_P (shift_count)
17082 && true_regnum (set_dest) == true_regnum (shift_count))
17083 return true;
17084 }
17085
17086 return false;
17087 }
17088
17089 /* Return true if destination reg of SET_INSN is shift count of
17090 USE_INSN. */
17091
17092 bool
17093 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17094 {
17095 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17096 PATTERN (use_insn));
17097 }
17098
17099 /* Return TRUE or FALSE depending on whether the unary operator meets the
17100 appropriate constraints. */
17101
17102 bool
17103 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17104 enum machine_mode mode ATTRIBUTE_UNUSED,
17105 rtx operands[2] ATTRIBUTE_UNUSED)
17106 {
17107 /* If one of operands is memory, source and destination must match. */
17108 if ((MEM_P (operands[0])
17109 || MEM_P (operands[1]))
17110 && ! rtx_equal_p (operands[0], operands[1]))
17111 return false;
17112 return true;
17113 }
17114
17115 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17116 are ok, keeping in mind the possible movddup alternative. */
17117
17118 bool
17119 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17120 {
17121 if (MEM_P (operands[0]))
17122 return rtx_equal_p (operands[0], operands[1 + high]);
17123 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17124 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17125 return true;
17126 }
17127
17128 /* Post-reload splitter for converting an SF or DFmode value in an
17129 SSE register into an unsigned SImode. */
17130
17131 void
17132 ix86_split_convert_uns_si_sse (rtx operands[])
17133 {
17134 enum machine_mode vecmode;
17135 rtx value, large, zero_or_two31, input, two31, x;
17136
17137 large = operands[1];
17138 zero_or_two31 = operands[2];
17139 input = operands[3];
17140 two31 = operands[4];
17141 vecmode = GET_MODE (large);
17142 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17143
17144 /* Load up the value into the low element. We must ensure that the other
17145 elements are valid floats -- zero is the easiest such value. */
17146 if (MEM_P (input))
17147 {
17148 if (vecmode == V4SFmode)
17149 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17150 else
17151 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17152 }
17153 else
17154 {
17155 input = gen_rtx_REG (vecmode, REGNO (input));
17156 emit_move_insn (value, CONST0_RTX (vecmode));
17157 if (vecmode == V4SFmode)
17158 emit_insn (gen_sse_movss (value, value, input));
17159 else
17160 emit_insn (gen_sse2_movsd (value, value, input));
17161 }
17162
17163 emit_move_insn (large, two31);
17164 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17165
17166 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17167 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17168
17169 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17170 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17171
17172 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17173 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17174
17175 large = gen_rtx_REG (V4SImode, REGNO (large));
17176 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17177
17178 x = gen_rtx_REG (V4SImode, REGNO (value));
17179 if (vecmode == V4SFmode)
17180 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17181 else
17182 emit_insn (gen_sse2_cvttpd2dq (x, value));
17183 value = x;
17184
17185 emit_insn (gen_xorv4si3 (value, value, large));
17186 }
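/* In effect: inputs >= 2^31 are reduced by 2^31 before the signed
   cvttps2dq/cvttpd2dq conversion, and the sign bit is then added back
   by XORing the result with 0x80000000, so the whole unsigned SImode
   range is handled without an unsigned conversion instruction.  */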
17187
17188 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17189 Expects the 64-bit DImode to be supplied in a pair of integral
17190 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17191 -mfpmath=sse, !optimize_size only. */
17192
17193 void
17194 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17195 {
17196 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17197 rtx int_xmm, fp_xmm;
17198 rtx biases, exponents;
17199 rtx x;
17200
17201 int_xmm = gen_reg_rtx (V4SImode);
17202 if (TARGET_INTER_UNIT_MOVES)
17203 emit_insn (gen_movdi_to_sse (int_xmm, input));
17204 else if (TARGET_SSE_SPLIT_REGS)
17205 {
17206 emit_clobber (int_xmm);
17207 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17208 }
17209 else
17210 {
17211 x = gen_reg_rtx (V2DImode);
17212 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17213 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17214 }
17215
17216 x = gen_rtx_CONST_VECTOR (V4SImode,
17217 gen_rtvec (4, GEN_INT (0x43300000UL),
17218 GEN_INT (0x45300000UL),
17219 const0_rtx, const0_rtx));
17220 exponents = validize_mem (force_const_mem (V4SImode, x));
17221
17222 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17223 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17224
17225 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17226 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17227 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17228 (0x1.0p84 + double(fp_value_hi_xmm)).
17229 Note these exponents differ by 32. */
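  /* Worked example: for the input 0x0000000500000003 the low half becomes
     the double 0x1.0p52 + 3.0 and the high half 0x1.0p84 + 5.0 * 0x1.0p32;
     after subtracting the two biases and adding the halves the result is
     5.0 * 0x1.0p32 + 3.0, the exact unsigned value.  */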
17230
17231 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17232
17233 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17234 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17235 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17236 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17237 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17238 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17239 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17240 biases = validize_mem (force_const_mem (V2DFmode, biases));
17241 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17242
17243 /* Add the upper and lower DFmode values together. */
17244 if (TARGET_SSE3)
17245 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17246 else
17247 {
17248 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17249 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17250 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17251 }
17252
17253 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17254 }
17255
17256 /* Not used, but eases macroization of patterns. */
17257 void
17258 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17259 rtx input ATTRIBUTE_UNUSED)
17260 {
17261 gcc_unreachable ();
17262 }
17263
17264 /* Convert an unsigned SImode value into a DFmode. Only currently used
17265 for SSE, but applicable anywhere. */
17266
17267 void
17268 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17269 {
17270 REAL_VALUE_TYPE TWO31r;
17271 rtx x, fp;
17272
17273 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17274 NULL, 1, OPTAB_DIRECT);
17275
17276 fp = gen_reg_rtx (DFmode);
17277 emit_insn (gen_floatsidf2 (fp, x));
17278
17279 real_ldexp (&TWO31r, &dconst1, 31);
17280 x = const_double_from_real_value (TWO31r, DFmode);
17281
17282 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17283 if (x != target)
17284 emit_move_insn (target, x);
17285 }
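/* For example, for the input 0x80000005 (2^31 + 5): adding -2^31 wraps
   the value to 5, the signed SImode->DFmode conversion yields 5.0, and
   adding 0x1.0p31 back gives 2147483653.0, the correct unsigned value.  */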
17286
17287 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17288 32-bit mode; otherwise we have a direct convert instruction. */
17289
17290 void
17291 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17292 {
17293 REAL_VALUE_TYPE TWO32r;
17294 rtx fp_lo, fp_hi, x;
17295
17296 fp_lo = gen_reg_rtx (DFmode);
17297 fp_hi = gen_reg_rtx (DFmode);
17298
17299 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17300
17301 real_ldexp (&TWO32r, &dconst1, 32);
17302 x = const_double_from_real_value (TWO32r, DFmode);
17303 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17304
17305 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17306
17307 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17308 0, OPTAB_DIRECT);
17309 if (x != target)
17310 emit_move_insn (target, x);
17311 }
17312
17313 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17314 For x86_32, -mfpmath=sse, !optimize_size only. */
17315 void
17316 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17317 {
17318 REAL_VALUE_TYPE ONE16r;
17319 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17320
17321 real_ldexp (&ONE16r, &dconst1, 16);
17322 x = const_double_from_real_value (ONE16r, SFmode);
17323 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
17324 NULL, 0, OPTAB_DIRECT);
17325 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
17326 NULL, 0, OPTAB_DIRECT);
17327 fp_hi = gen_reg_rtx (SFmode);
17328 fp_lo = gen_reg_rtx (SFmode);
17329 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17330 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17331 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17332 0, OPTAB_DIRECT);
17333 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17334 0, OPTAB_DIRECT);
17335 if (!rtx_equal_p (target, fp_hi))
17336 emit_move_insn (target, fp_hi);
17337 }
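/* The input is split as input = hi * 0x10000 + lo with hi, lo < 2^16;
   both halves convert exactly to SFmode, and the result is recombined
   as hi * 0x1.0p16 + lo.  */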
17338
17339 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17340 a vector of unsigned ints VAL to vector of floats TARGET. */
17341
17342 void
17343 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17344 {
17345 rtx tmp[8];
17346 REAL_VALUE_TYPE TWO16r;
17347 enum machine_mode intmode = GET_MODE (val);
17348 enum machine_mode fltmode = GET_MODE (target);
17349 rtx (*cvt) (rtx, rtx);
17350
17351 if (intmode == V4SImode)
17352 cvt = gen_floatv4siv4sf2;
17353 else
17354 cvt = gen_floatv8siv8sf2;
17355 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17356 tmp[0] = force_reg (intmode, tmp[0]);
17357 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17358 OPTAB_DIRECT);
17359 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17360 NULL_RTX, 1, OPTAB_DIRECT);
17361 tmp[3] = gen_reg_rtx (fltmode);
17362 emit_insn (cvt (tmp[3], tmp[1]));
17363 tmp[4] = gen_reg_rtx (fltmode);
17364 emit_insn (cvt (tmp[4], tmp[2]));
17365 real_ldexp (&TWO16r, &dconst1, 16);
17366 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17367 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17368 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17369 OPTAB_DIRECT);
17370 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17371 OPTAB_DIRECT);
17372 if (tmp[7] != target)
17373 emit_move_insn (target, tmp[7]);
17374 }
17375
17376 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17377 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17378 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17379 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17380
17381 rtx
17382 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17383 {
17384 REAL_VALUE_TYPE TWO31r;
17385 rtx two31r, tmp[4];
17386 enum machine_mode mode = GET_MODE (val);
17387 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17388 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17389 rtx (*cmp) (rtx, rtx, rtx, rtx);
17390 int i;
17391
17392 for (i = 0; i < 3; i++)
17393 tmp[i] = gen_reg_rtx (mode);
17394 real_ldexp (&TWO31r, &dconst1, 31);
17395 two31r = const_double_from_real_value (TWO31r, scalarmode);
17396 two31r = ix86_build_const_vector (mode, 1, two31r);
17397 two31r = force_reg (mode, two31r);
17398 switch (mode)
17399 {
17400 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17401 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17402 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17403 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17404 default: gcc_unreachable ();
17405 }
17406 tmp[3] = gen_rtx_LE (mode, two31r, val);
17407 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17408 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17409 0, OPTAB_DIRECT);
17410 if (intmode == V4SImode || TARGET_AVX2)
17411 *xorp = expand_simple_binop (intmode, ASHIFT,
17412 gen_lowpart (intmode, tmp[0]),
17413 GEN_INT (31), NULL_RTX, 0,
17414 OPTAB_DIRECT);
17415 else
17416 {
17417 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17418 two31 = ix86_build_const_vector (intmode, 1, two31);
17419 *xorp = expand_simple_binop (intmode, AND,
17420 gen_lowpart (intmode, tmp[0]),
17421 two31, NULL_RTX, 0,
17422 OPTAB_DIRECT);
17423 }
17424 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17425 0, OPTAB_DIRECT);
17426 }
17427
17428 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17429 then replicate the value for all elements of the vector
17430 register. */
17431
17432 rtx
17433 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17434 {
17435 int i, n_elt;
17436 rtvec v;
17437 enum machine_mode scalar_mode;
17438
17439 switch (mode)
17440 {
17441 case V32QImode:
17442 case V16QImode:
17443 case V16HImode:
17444 case V8HImode:
17445 case V8SImode:
17446 case V4SImode:
17447 case V4DImode:
17448 case V2DImode:
17449 gcc_assert (vect);
17450 case V8SFmode:
17451 case V4SFmode:
17452 case V4DFmode:
17453 case V2DFmode:
17454 n_elt = GET_MODE_NUNITS (mode);
17455 v = rtvec_alloc (n_elt);
17456 scalar_mode = GET_MODE_INNER (mode);
17457
17458 RTVEC_ELT (v, 0) = value;
17459
17460 for (i = 1; i < n_elt; ++i)
17461 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17462
17463 return gen_rtx_CONST_VECTOR (mode, v);
17464
17465 default:
17466 gcc_unreachable ();
17467 }
17468 }
17469
17470 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17471 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17472 for an SSE register. If VECT is true, then replicate the mask for
17473 all elements of the vector register. If INVERT is true, then create
17474 a mask excluding the sign bit. */
17475
17476 rtx
17477 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17478 {
17479 enum machine_mode vec_mode, imode;
17480 HOST_WIDE_INT hi, lo;
17481 int shift = 63;
17482 rtx v;
17483 rtx mask;
17484
17485 /* Find the sign bit, sign extended to 2*HWI. */
17486 switch (mode)
17487 {
17488 case V8SImode:
17489 case V4SImode:
17490 case V8SFmode:
17491 case V4SFmode:
17492 vec_mode = mode;
17493 mode = GET_MODE_INNER (mode);
17494 imode = SImode;
17495 lo = 0x80000000, hi = lo < 0;
17496 break;
17497
17498 case V4DImode:
17499 case V2DImode:
17500 case V4DFmode:
17501 case V2DFmode:
17502 vec_mode = mode;
17503 mode = GET_MODE_INNER (mode);
17504 imode = DImode;
17505 if (HOST_BITS_PER_WIDE_INT >= 64)
17506 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17507 else
17508 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17509 break;
17510
17511 case TImode:
17512 case TFmode:
17513 vec_mode = VOIDmode;
17514 if (HOST_BITS_PER_WIDE_INT >= 64)
17515 {
17516 imode = TImode;
17517 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17518 }
17519 else
17520 {
17521 rtvec vec;
17522
17523 imode = DImode;
17524 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17525
17526 if (invert)
17527 {
17528 lo = ~lo, hi = ~hi;
17529 v = constm1_rtx;
17530 }
17531 else
17532 v = const0_rtx;
17533
17534 mask = immed_double_const (lo, hi, imode);
17535
17536 vec = gen_rtvec (2, v, mask);
17537 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17538 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17539
17540 return v;
17541 }
17542 break;
17543
17544 default:
17545 gcc_unreachable ();
17546 }
17547
17548 if (invert)
17549 lo = ~lo, hi = ~hi;
17550
17551 /* Force this value into the low part of a fp vector constant. */
17552 mask = immed_double_const (lo, hi, imode);
17553 mask = gen_lowpart (mode, mask);
17554
17555 if (vec_mode == VOIDmode)
17556 return force_reg (mode, mask);
17557
17558 v = ix86_build_const_vector (vec_mode, vect, mask);
17559 return force_reg (vec_mode, v);
17560 }
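/* For example, for V4SFmode with VECT true and INVERT false this yields
   the vector constant { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }
   (just the sign bits); with INVERT true it yields { 0x7fffffff, ... }
   (everything but the sign bits).  */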
17561
17562 /* Generate code for floating point ABS or NEG. */
17563
17564 void
17565 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17566 rtx operands[])
17567 {
17568 rtx mask, set, dst, src;
17569 bool use_sse = false;
17570 bool vector_mode = VECTOR_MODE_P (mode);
17571 enum machine_mode vmode = mode;
17572
17573 if (vector_mode)
17574 use_sse = true;
17575 else if (mode == TFmode)
17576 use_sse = true;
17577 else if (TARGET_SSE_MATH)
17578 {
17579 use_sse = SSE_FLOAT_MODE_P (mode);
17580 if (mode == SFmode)
17581 vmode = V4SFmode;
17582 else if (mode == DFmode)
17583 vmode = V2DFmode;
17584 }
17585
17586 /* NEG and ABS performed with SSE use bitwise mask operations.
17587 Create the appropriate mask now. */
17588 if (use_sse)
17589 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17590 else
17591 mask = NULL_RTX;
17592
17593 dst = operands[0];
17594 src = operands[1];
17595
17596 set = gen_rtx_fmt_e (code, mode, src);
17597 set = gen_rtx_SET (VOIDmode, dst, set);
17598
17599 if (mask)
17600 {
17601 rtx use, clob;
17602 rtvec par;
17603
17604 use = gen_rtx_USE (VOIDmode, mask);
17605 if (vector_mode)
17606 par = gen_rtvec (2, set, use);
17607 else
17608 {
17609 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17610 par = gen_rtvec (3, set, use, clob);
17611 }
17612 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17613 }
17614 else
17615 emit_insn (set);
17616 }
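/* With SSE, ABS is thus emitted as a bitwise AND with the inverted
   sign-bit mask (clearing the sign bit) and NEG as an XOR with the
   sign-bit mask (flipping it); the mask is attached to the insn via
   the USE above.  */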
17617
17618 /* Expand a copysign operation. Special case operand 0 being a constant. */
17619
17620 void
17621 ix86_expand_copysign (rtx operands[])
17622 {
17623 enum machine_mode mode, vmode;
17624 rtx dest, op0, op1, mask, nmask;
17625
17626 dest = operands[0];
17627 op0 = operands[1];
17628 op1 = operands[2];
17629
17630 mode = GET_MODE (dest);
17631
17632 if (mode == SFmode)
17633 vmode = V4SFmode;
17634 else if (mode == DFmode)
17635 vmode = V2DFmode;
17636 else
17637 vmode = mode;
17638
17639 if (GET_CODE (op0) == CONST_DOUBLE)
17640 {
17641 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17642
17643 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17644 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17645
17646 if (mode == SFmode || mode == DFmode)
17647 {
17648 if (op0 == CONST0_RTX (mode))
17649 op0 = CONST0_RTX (vmode);
17650 else
17651 {
17652 rtx v = ix86_build_const_vector (vmode, false, op0);
17653
17654 op0 = force_reg (vmode, v);
17655 }
17656 }
17657 else if (op0 != CONST0_RTX (mode))
17658 op0 = force_reg (mode, op0);
17659
17660 mask = ix86_build_signbit_mask (vmode, 0, 0);
17661
17662 if (mode == SFmode)
17663 copysign_insn = gen_copysignsf3_const;
17664 else if (mode == DFmode)
17665 copysign_insn = gen_copysigndf3_const;
17666 else
17667 copysign_insn = gen_copysigntf3_const;
17668
17669 emit_insn (copysign_insn (dest, op0, op1, mask));
17670 }
17671 else
17672 {
17673 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17674
17675 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17676 mask = ix86_build_signbit_mask (vmode, 0, 0);
17677
17678 if (mode == SFmode)
17679 copysign_insn = gen_copysignsf3_var;
17680 else if (mode == DFmode)
17681 copysign_insn = gen_copysigndf3_var;
17682 else
17683 copysign_insn = gen_copysigntf3_var;
17684
17685 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17686 }
17687 }
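/* Both variants compute dest = (op0 & ~signmask) | (op1 & signmask),
   i.e. the magnitude comes from op0 and the sign bit from op1; when op0
   is a constant its absolute value is folded up front so only the
   AND/IOR pair of the _const pattern remains.  */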
17688
17689 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17690 be a constant, and so has already been expanded into a vector constant. */
17691
17692 void
17693 ix86_split_copysign_const (rtx operands[])
17694 {
17695 enum machine_mode mode, vmode;
17696 rtx dest, op0, mask, x;
17697
17698 dest = operands[0];
17699 op0 = operands[1];
17700 mask = operands[3];
17701
17702 mode = GET_MODE (dest);
17703 vmode = GET_MODE (mask);
17704
17705 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17706 x = gen_rtx_AND (vmode, dest, mask);
17707 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17708
17709 if (op0 != CONST0_RTX (vmode))
17710 {
17711 x = gen_rtx_IOR (vmode, dest, op0);
17712 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17713 }
17714 }
17715
17716 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17717 so we have to do two masks. */
17718
17719 void
17720 ix86_split_copysign_var (rtx operands[])
17721 {
17722 enum machine_mode mode, vmode;
17723 rtx dest, scratch, op0, op1, mask, nmask, x;
17724
17725 dest = operands[0];
17726 scratch = operands[1];
17727 op0 = operands[2];
17728 op1 = operands[3];
17729 nmask = operands[4];
17730 mask = operands[5];
17731
17732 mode = GET_MODE (dest);
17733 vmode = GET_MODE (mask);
17734
17735 if (rtx_equal_p (op0, op1))
17736 {
17737 /* Shouldn't happen often (it's useless, obviously), but when it does
17738 we'd generate incorrect code if we continue below. */
17739 emit_move_insn (dest, op0);
17740 return;
17741 }
17742
17743 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17744 {
17745 gcc_assert (REGNO (op1) == REGNO (scratch));
17746
17747 x = gen_rtx_AND (vmode, scratch, mask);
17748 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17749
17750 dest = mask;
17751 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17752 x = gen_rtx_NOT (vmode, dest);
17753 x = gen_rtx_AND (vmode, x, op0);
17754 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17755 }
17756 else
17757 {
17758 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17759 {
17760 x = gen_rtx_AND (vmode, scratch, mask);
17761 }
17762 else /* alternative 2,4 */
17763 {
17764 gcc_assert (REGNO (mask) == REGNO (scratch));
17765 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17766 x = gen_rtx_AND (vmode, scratch, op1);
17767 }
17768 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17769
17770 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17771 {
17772 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17773 x = gen_rtx_AND (vmode, dest, nmask);
17774 }
17775 else /* alternative 3,4 */
17776 {
17777 gcc_assert (REGNO (nmask) == REGNO (dest));
17778 dest = nmask;
17779 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17780 x = gen_rtx_AND (vmode, dest, op0);
17781 }
17782 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17783 }
17784
17785 x = gen_rtx_IOR (vmode, dest, scratch);
17786 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17787 }
17788
17789 /* Return TRUE or FALSE depending on whether the first SET in INSN
17790 has source and destination with matching CC modes, and that the
17791 CC mode is at least as constrained as REQ_MODE. */
17792
17793 bool
17794 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17795 {
17796 rtx set;
17797 enum machine_mode set_mode;
17798
17799 set = PATTERN (insn);
17800 if (GET_CODE (set) == PARALLEL)
17801 set = XVECEXP (set, 0, 0);
17802 gcc_assert (GET_CODE (set) == SET);
17803 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17804
17805 set_mode = GET_MODE (SET_DEST (set));
17806 switch (set_mode)
17807 {
17808 case CCNOmode:
17809 if (req_mode != CCNOmode
17810 && (req_mode != CCmode
17811 || XEXP (SET_SRC (set), 1) != const0_rtx))
17812 return false;
17813 break;
17814 case CCmode:
17815 if (req_mode == CCGCmode)
17816 return false;
17817 /* FALLTHRU */
17818 case CCGCmode:
17819 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17820 return false;
17821 /* FALLTHRU */
17822 case CCGOCmode:
17823 if (req_mode == CCZmode)
17824 return false;
17825 /* FALLTHRU */
17826 case CCZmode:
17827 break;
17828
17829 case CCAmode:
17830 case CCCmode:
17831 case CCOmode:
17832 case CCSmode:
17833 if (set_mode != req_mode)
17834 return false;
17835 break;
17836
17837 default:
17838 gcc_unreachable ();
17839 }
17840
17841 return GET_MODE (SET_SRC (set)) == set_mode;
17842 }
17843
17844 /* Generate insn patterns to do an integer compare of OPERANDS. */
17845
17846 static rtx
17847 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17848 {
17849 enum machine_mode cmpmode;
17850 rtx tmp, flags;
17851
17852 cmpmode = SELECT_CC_MODE (code, op0, op1);
17853 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17854
17855 /* This is very simple, but making the interface the same as in the
17856 FP case makes the rest of the code easier. */
17857 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17858 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17859
17860 /* Return the test that should be put into the flags user, i.e.
17861 the bcc, scc, or cmov instruction. */
17862 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17863 }
17864
17865 /* Figure out whether to use ordered or unordered fp comparisons.
17866 Return the appropriate mode to use. */
17867
17868 enum machine_mode
17869 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17870 {
17871 /* ??? In order to make all comparisons reversible, we do all comparisons
17872 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17873 between trapping and nontrapping forms of comparisons, we can make inequality
17874 comparisons trapping again, since it results in better code when using
17875 FCOM based compares. */
17876 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17877 }
17878
17879 enum machine_mode
17880 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17881 {
17882 enum machine_mode mode = GET_MODE (op0);
17883
17884 if (SCALAR_FLOAT_MODE_P (mode))
17885 {
17886 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17887 return ix86_fp_compare_mode (code);
17888 }
17889
17890 switch (code)
17891 {
17892 /* Only zero flag is needed. */
17893 case EQ: /* ZF=0 */
17894 case NE: /* ZF!=0 */
17895 return CCZmode;
17896 /* Codes needing carry flag. */
17897 case GEU: /* CF=0 */
17898 case LTU: /* CF=1 */
17899 /* Detect overflow checks. They need just the carry flag. */
17900 if (GET_CODE (op0) == PLUS
17901 && rtx_equal_p (op1, XEXP (op0, 0)))
17902 return CCCmode;
17903 else
17904 return CCmode;
17905 case GTU: /* CF=0 & ZF=0 */
17906 case LEU: /* CF=1 | ZF=1 */
17907 /* Detect overflow checks. They need just the carry flag. */
17908 if (GET_CODE (op0) == MINUS
17909 && rtx_equal_p (op1, XEXP (op0, 0)))
17910 return CCCmode;
17911 else
17912 return CCmode;
17913 /* Codes possibly doable only with sign flag when
17914 comparing against zero. */
17915 case GE: /* SF=OF or SF=0 */
17916 case LT: /* SF<>OF or SF=1 */
17917 if (op1 == const0_rtx)
17918 return CCGOCmode;
17919 else
17920 /* For other cases Carry flag is not required. */
17921 return CCGCmode;
17922 /* Codes doable only with sign flag when comparing
17923 against zero, but we miss the jump instruction for it,
17924 so we need to use relational tests against the overflow
17925 flag, which thus needs to be zero.  */
17926 case GT: /* ZF=0 & SF=OF */
17927 case LE: /* ZF=1 | SF<>OF */
17928 if (op1 == const0_rtx)
17929 return CCNOmode;
17930 else
17931 return CCGCmode;
17932 /* The strcmp pattern does (use flags) and combine may ask us for a proper
17933 mode. */
17934 case USE:
17935 return CCmode;
17936 default:
17937 gcc_unreachable ();
17938 }
17939 }
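/* For example, an unsigned overflow test such as (LTU (plus:SI a b) a)
   selects CCCmode because only the carry flag is needed, while a signed
   GE/LT compare against zero selects CCGOCmode.  */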
17940
17941 /* Return the fixed registers used for condition codes. */
17942
17943 static bool
17944 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17945 {
17946 *p1 = FLAGS_REG;
17947 *p2 = FPSR_REG;
17948 return true;
17949 }
17950
17951 /* If two condition code modes are compatible, return a condition code
17952 mode which is compatible with both. Otherwise, return
17953 VOIDmode. */
17954
17955 static enum machine_mode
17956 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
17957 {
17958 if (m1 == m2)
17959 return m1;
17960
17961 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
17962 return VOIDmode;
17963
17964 if ((m1 == CCGCmode && m2 == CCGOCmode)
17965 || (m1 == CCGOCmode && m2 == CCGCmode))
17966 return CCGCmode;
17967
17968 switch (m1)
17969 {
17970 default:
17971 gcc_unreachable ();
17972
17973 case CCmode:
17974 case CCGCmode:
17975 case CCGOCmode:
17976 case CCNOmode:
17977 case CCAmode:
17978 case CCCmode:
17979 case CCOmode:
17980 case CCSmode:
17981 case CCZmode:
17982 switch (m2)
17983 {
17984 default:
17985 return VOIDmode;
17986
17987 case CCmode:
17988 case CCGCmode:
17989 case CCGOCmode:
17990 case CCNOmode:
17991 case CCAmode:
17992 case CCCmode:
17993 case CCOmode:
17994 case CCSmode:
17995 case CCZmode:
17996 return CCmode;
17997 }
17998
17999 case CCFPmode:
18000 case CCFPUmode:
18001 /* These are only compatible with themselves, which we already
18002 checked above. */
18003 return VOIDmode;
18004 }
18005 }
18006
18007
18008 /* Return a comparison we can do that is equivalent to
18009 swap_condition (code), apart possibly from orderedness.
18010 But never change orderedness if TARGET_IEEE_FP, returning
18011 UNKNOWN in that case if necessary. */
18012
18013 static enum rtx_code
18014 ix86_fp_swap_condition (enum rtx_code code)
18015 {
18016 switch (code)
18017 {
18018 case GT: /* GTU - CF=0 & ZF=0 */
18019 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18020 case GE: /* GEU - CF=0 */
18021 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18022 case UNLT: /* LTU - CF=1 */
18023 return TARGET_IEEE_FP ? UNKNOWN : GT;
18024 case UNLE: /* LEU - CF=1 | ZF=1 */
18025 return TARGET_IEEE_FP ? UNKNOWN : GE;
18026 default:
18027 return swap_condition (code);
18028 }
18029 }
18030
18031 /* Return the cost of comparison CODE using the best strategy for performance.
18032 All following functions use the number of instructions as a cost metric.
18033 In the future this should be tweaked to compute bytes for optimize_size and
18034 take into account the performance of various instructions on various CPUs. */
18035
18036 static int
18037 ix86_fp_comparison_cost (enum rtx_code code)
18038 {
18039 int arith_cost;
18040
18041 /* The cost of code using bit-twiddling on %ah. */
18042 switch (code)
18043 {
18044 case UNLE:
18045 case UNLT:
18046 case LTGT:
18047 case GT:
18048 case GE:
18049 case UNORDERED:
18050 case ORDERED:
18051 case UNEQ:
18052 arith_cost = 4;
18053 break;
18054 case LT:
18055 case NE:
18056 case EQ:
18057 case UNGE:
18058 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18059 break;
18060 case LE:
18061 case UNGT:
18062 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18063 break;
18064 default:
18065 gcc_unreachable ();
18066 }
18067
18068 switch (ix86_fp_comparison_strategy (code))
18069 {
18070 case IX86_FPCMP_COMI:
18071 return arith_cost > 4 ? 3 : 2;
18072 case IX86_FPCMP_SAHF:
18073 return arith_cost > 4 ? 4 : 3;
18074 default:
18075 return arith_cost;
18076 }
18077 }
18078
18079 /* Return the strategy to use for floating-point compares. We assume that
18080 fcomi is always preferable where available, since that is also true when
18081 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18082
18083 enum ix86_fpcmp_strategy
18084 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18085 {
18086 /* Do fcomi/sahf based test when profitable. */
18087
18088 if (TARGET_CMOVE)
18089 return IX86_FPCMP_COMI;
18090
18091 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18092 return IX86_FPCMP_SAHF;
18093
18094 return IX86_FPCMP_ARITH;
18095 }
18096
18097 /* Swap, force into registers, or otherwise massage the two operands
18098 to a fp comparison. The operands are updated in place; the new
18099 comparison code is returned. */
18100
18101 static enum rtx_code
18102 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18103 {
18104 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18105 rtx op0 = *pop0, op1 = *pop1;
18106 enum machine_mode op_mode = GET_MODE (op0);
18107 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18108
18109 /* All of the unordered compare instructions only work on registers.
18110 The same is true of the fcomi compare instructions. The XFmode
18111 compare instructions require registers except when comparing
18112 against zero or when converting operand 1 from fixed point to
18113 floating point. */
18114
18115 if (!is_sse
18116 && (fpcmp_mode == CCFPUmode
18117 || (op_mode == XFmode
18118 && ! (standard_80387_constant_p (op0) == 1
18119 || standard_80387_constant_p (op1) == 1)
18120 && GET_CODE (op1) != FLOAT)
18121 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18122 {
18123 op0 = force_reg (op_mode, op0);
18124 op1 = force_reg (op_mode, op1);
18125 }
18126 else
18127 {
18128 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18129 things around if they appear profitable, otherwise force op0
18130 into a register. */
18131
18132 if (standard_80387_constant_p (op0) == 0
18133 || (MEM_P (op0)
18134 && ! (standard_80387_constant_p (op1) == 0
18135 || MEM_P (op1))))
18136 {
18137 enum rtx_code new_code = ix86_fp_swap_condition (code);
18138 if (new_code != UNKNOWN)
18139 {
18140 rtx tmp;
18141 tmp = op0, op0 = op1, op1 = tmp;
18142 code = new_code;
18143 }
18144 }
18145
18146 if (!REG_P (op0))
18147 op0 = force_reg (op_mode, op0);
18148
18149 if (CONSTANT_P (op1))
18150 {
18151 int tmp = standard_80387_constant_p (op1);
18152 if (tmp == 0)
18153 op1 = validize_mem (force_const_mem (op_mode, op1));
18154 else if (tmp == 1)
18155 {
18156 if (TARGET_CMOVE)
18157 op1 = force_reg (op_mode, op1);
18158 }
18159 else
18160 op1 = force_reg (op_mode, op1);
18161 }
18162 }
18163
18164 /* Try to rearrange the comparison to make it cheaper. */
18165 if (ix86_fp_comparison_cost (code)
18166 > ix86_fp_comparison_cost (swap_condition (code))
18167 && (REG_P (op1) || can_create_pseudo_p ()))
18168 {
18169 rtx tmp;
18170 tmp = op0, op0 = op1, op1 = tmp;
18171 code = swap_condition (code);
18172 if (!REG_P (op0))
18173 op0 = force_reg (op_mode, op0);
18174 }
18175
18176 *pop0 = op0;
18177 *pop1 = op1;
18178 return code;
18179 }
18180
18181 /* Convert the comparison codes we use to represent FP comparisons to the
18182 integer codes that will result in a proper branch. Return UNKNOWN if no
18183 such code is available. */
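/* fcomi and fnstsw+sahf deposit the x87 result in CF/PF/ZF, the flags
   tested by the unsigned integer conditions, so GT and GE map to the
   "above" forms GTU and GEU.  */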
18184
18185 enum rtx_code
18186 ix86_fp_compare_code_to_integer (enum rtx_code code)
18187 {
18188 switch (code)
18189 {
18190 case GT:
18191 return GTU;
18192 case GE:
18193 return GEU;
18194 case ORDERED:
18195 case UNORDERED:
18196 return code;
18197 break;
18198 case UNEQ:
18199 return EQ;
18200 break;
18201 case UNLT:
18202 return LTU;
18203 break;
18204 case UNLE:
18205 return LEU;
18206 break;
18207 case LTGT:
18208 return NE;
18209 break;
18210 default:
18211 return UNKNOWN;
18212 }
18213 }
18214
18215 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18216
18217 static rtx
18218 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18219 {
18220 enum machine_mode fpcmp_mode, intcmp_mode;
18221 rtx tmp, tmp2;
18222
18223 fpcmp_mode = ix86_fp_compare_mode (code);
18224 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18225
18226 /* Do fcomi/sahf based test when profitable. */
18227 switch (ix86_fp_comparison_strategy (code))
18228 {
18229 case IX86_FPCMP_COMI:
18230 intcmp_mode = fpcmp_mode;
18231 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18232 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18233 tmp);
18234 emit_insn (tmp);
18235 break;
18236
18237 case IX86_FPCMP_SAHF:
18238 intcmp_mode = fpcmp_mode;
18239 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18240 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18241 tmp);
18242
18243 if (!scratch)
18244 scratch = gen_reg_rtx (HImode);
18245 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18246 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18247 break;
18248
18249 case IX86_FPCMP_ARITH:
18250 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18251 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18252 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18253 if (!scratch)
18254 scratch = gen_reg_rtx (HImode);
18255 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18256
18257 /* In the unordered case, we have to check C2 for NaN's, which
18258 doesn't happen to work out to anything nice combination-wise.
18259 So do some bit twiddling on the value we've got in AH to come
18260 up with an appropriate set of condition codes. */
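/* After fnstsw, AH holds the x87 condition bits as C0 = 0x01, C2 = 0x04
   and C3 = 0x40; the constants used below (0x45, 0x44, 0x40, 0x05,
   0x04, 0x01) are combinations of C3/C2/C0.  */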
18261
18262 intcmp_mode = CCNOmode;
18263 switch (code)
18264 {
18265 case GT:
18266 case UNGT:
18267 if (code == GT || !TARGET_IEEE_FP)
18268 {
18269 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18270 code = EQ;
18271 }
18272 else
18273 {
18274 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18275 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18276 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18277 intcmp_mode = CCmode;
18278 code = GEU;
18279 }
18280 break;
18281 case LT:
18282 case UNLT:
18283 if (code == LT && TARGET_IEEE_FP)
18284 {
18285 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18286 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18287 intcmp_mode = CCmode;
18288 code = EQ;
18289 }
18290 else
18291 {
18292 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18293 code = NE;
18294 }
18295 break;
18296 case GE:
18297 case UNGE:
18298 if (code == GE || !TARGET_IEEE_FP)
18299 {
18300 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18301 code = EQ;
18302 }
18303 else
18304 {
18305 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18306 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18307 code = NE;
18308 }
18309 break;
18310 case LE:
18311 case UNLE:
18312 if (code == LE && TARGET_IEEE_FP)
18313 {
18314 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18315 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18316 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18317 intcmp_mode = CCmode;
18318 code = LTU;
18319 }
18320 else
18321 {
18322 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18323 code = NE;
18324 }
18325 break;
18326 case EQ:
18327 case UNEQ:
18328 if (code == EQ && TARGET_IEEE_FP)
18329 {
18330 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18331 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18332 intcmp_mode = CCmode;
18333 code = EQ;
18334 }
18335 else
18336 {
18337 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18338 code = NE;
18339 }
18340 break;
18341 case NE:
18342 case LTGT:
18343 if (code == NE && TARGET_IEEE_FP)
18344 {
18345 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18346 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18347 GEN_INT (0x40)));
18348 code = NE;
18349 }
18350 else
18351 {
18352 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18353 code = EQ;
18354 }
18355 break;
18356
18357 case UNORDERED:
18358 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18359 code = NE;
18360 break;
18361 case ORDERED:
18362 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18363 code = EQ;
18364 break;
18365
18366 default:
18367 gcc_unreachable ();
18368 }
18369 break;
18370
18371 default:
18372 gcc_unreachable();
18373 }
18374
18375 /* Return the test that should be put into the flags user, i.e.
18376 the bcc, scc, or cmov instruction. */
18377 return gen_rtx_fmt_ee (code, VOIDmode,
18378 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18379 const0_rtx);
18380 }
18381
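/* Emit whatever insns are needed to compare OP0 with OP1 using CODE,
   and return an rtx describing the resulting flags test for use in a
   branch, setcc or cmov.  */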
18382 static rtx
18383 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18384 {
18385 rtx ret;
18386
18387 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18388 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18389
18390 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18391 {
18392 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18393 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18394 }
18395 else
18396 ret = ix86_expand_int_compare (code, op0, op1);
18397
18398 return ret;
18399 }
18400
18401 void
18402 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18403 {
18404 enum machine_mode mode = GET_MODE (op0);
18405 rtx tmp;
18406
18407 switch (mode)
18408 {
18409 case SFmode:
18410 case DFmode:
18411 case XFmode:
18412 case QImode:
18413 case HImode:
18414 case SImode:
18415 simple:
18416 tmp = ix86_expand_compare (code, op0, op1);
18417 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18418 gen_rtx_LABEL_REF (VOIDmode, label),
18419 pc_rtx);
18420 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18421 return;
18422
18423 case DImode:
18424 if (TARGET_64BIT)
18425 goto simple;
18426 case TImode:
18427 /* Expand DImode branch into multiple compare+branch. */
18428 {
18429 rtx lo[2], hi[2], label2;
18430 enum rtx_code code1, code2, code3;
18431 enum machine_mode submode;
18432
18433 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18434 {
18435 tmp = op0, op0 = op1, op1 = tmp;
18436 code = swap_condition (code);
18437 }
18438
18439 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18440 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18441
18442 submode = mode == DImode ? SImode : DImode;
18443
18444 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18445 avoid two branches. This costs one extra insn, so disable when
18446 optimizing for size. */
18447
18448 if ((code == EQ || code == NE)
18449 && (!optimize_insn_for_size_p ()
18450 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18451 {
18452 rtx xor0, xor1;
18453
18454 xor1 = hi[0];
18455 if (hi[1] != const0_rtx)
18456 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18457 NULL_RTX, 0, OPTAB_WIDEN);
18458
18459 xor0 = lo[0];
18460 if (lo[1] != const0_rtx)
18461 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18462 NULL_RTX, 0, OPTAB_WIDEN);
18463
18464 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18465 NULL_RTX, 0, OPTAB_WIDEN);
18466
18467 ix86_expand_branch (code, tmp, const0_rtx, label);
18468 return;
18469 }
18470
18471 /* Otherwise, if we are doing a less-than or greater-or-equal-than
18472 comparison, op1 is a constant and its low word is zero, then we can
18473 just examine the high word. Similarly for a low word of -1 and
18474 less-or-equal-than or greater-than. */
18475
18476 if (CONST_INT_P (hi[1]))
18477 switch (code)
18478 {
18479 case LT: case LTU: case GE: case GEU:
18480 if (lo[1] == const0_rtx)
18481 {
18482 ix86_expand_branch (code, hi[0], hi[1], label);
18483 return;
18484 }
18485 break;
18486 case LE: case LEU: case GT: case GTU:
18487 if (lo[1] == constm1_rtx)
18488 {
18489 ix86_expand_branch (code, hi[0], hi[1], label);
18490 return;
18491 }
18492 break;
18493 default:
18494 break;
18495 }
18496
18497 /* Otherwise, we need two or three jumps. */
18498
18499 label2 = gen_label_rtx ();
18500
18501 code1 = code;
18502 code2 = swap_condition (code);
18503 code3 = unsigned_condition (code);
18504
18505 switch (code)
18506 {
18507 case LT: case GT: case LTU: case GTU:
18508 break;
18509
18510 case LE: code1 = LT; code2 = GT; break;
18511 case GE: code1 = GT; code2 = LT; break;
18512 case LEU: code1 = LTU; code2 = GTU; break;
18513 case GEU: code1 = GTU; code2 = LTU; break;
18514
18515 case EQ: code1 = UNKNOWN; code2 = NE; break;
18516 case NE: code2 = UNKNOWN; break;
18517
18518 default:
18519 gcc_unreachable ();
18520 }
18521
18522 /*
18523 * a < b =>
18524 * if (hi(a) < hi(b)) goto true;
18525 * if (hi(a) > hi(b)) goto false;
18526 * if (lo(a) < lo(b)) goto true;
18527 * false:
18528 */
18529
18530 if (code1 != UNKNOWN)
18531 ix86_expand_branch (code1, hi[0], hi[1], label);
18532 if (code2 != UNKNOWN)
18533 ix86_expand_branch (code2, hi[0], hi[1], label2);
18534
18535 ix86_expand_branch (code3, lo[0], lo[1], label);
18536
18537 if (code2 != UNKNOWN)
18538 emit_label (label2);
18539 return;
18540 }
18541
18542 default:
18543 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18544 goto simple;
18545 }
18546 }
18547
18548 /* Split branch based on floating point condition. */
18549 void
18550 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18551 rtx target1, rtx target2, rtx tmp, rtx pushed)
18552 {
18553 rtx condition;
18554 rtx i;
18555
18556 if (target2 != pc_rtx)
18557 {
18558 rtx tmp = target2;
18559 code = reverse_condition_maybe_unordered (code);
18560 target2 = target1;
18561 target1 = tmp;
18562 }
18563
18564 condition = ix86_expand_fp_compare (code, op1, op2,
18565 tmp);
18566
18567 /* Remove pushed operand from stack. */
18568 if (pushed)
18569 ix86_free_from_memory (GET_MODE (pushed));
18570
18571 i = emit_jump_insn (gen_rtx_SET
18572 (VOIDmode, pc_rtx,
18573 gen_rtx_IF_THEN_ELSE (VOIDmode,
18574 condition, target1, target2)));
18575 if (split_branch_probability >= 0)
18576 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18577 }
18578
18579 void
18580 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18581 {
18582 rtx ret;
18583
18584 gcc_assert (GET_MODE (dest) == QImode);
18585
18586 ret = ix86_expand_compare (code, op0, op1);
18587 PUT_MODE (ret, QImode);
18588 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18589 }
18590
18591 /* Expand a comparison setting or clearing the carry flag. Return true when
18592 successful and set *POP to the comparison operation. */
18593 static bool
18594 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18595 {
18596 enum machine_mode mode =
18597 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18598
18599 /* Do not handle double-mode compares that go through special path. */
18600 if (mode == (TARGET_64BIT ? TImode : DImode))
18601 return false;
18602
18603 if (SCALAR_FLOAT_MODE_P (mode))
18604 {
18605 rtx compare_op, compare_seq;
18606
18607 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18608
18609 /* Shortcut: following common codes never translate
18610 into carry flag compares. */
18611 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18612 || code == ORDERED || code == UNORDERED)
18613 return false;
18614
18615 /* These comparisons require zero flag; swap operands so they won't. */
18616 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18617 && !TARGET_IEEE_FP)
18618 {
18619 rtx tmp = op0;
18620 op0 = op1;
18621 op1 = tmp;
18622 code = swap_condition (code);
18623 }
18624
18625 /* Try to expand the comparison and verify that we end up with a
18626 carry flag based comparison. This fails to be true only when
18627 we decide to expand the comparison using arithmetic, which is not
18628 a common scenario. */
18629 start_sequence ();
18630 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18631 compare_seq = get_insns ();
18632 end_sequence ();
18633
18634 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18635 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18636 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18637 else
18638 code = GET_CODE (compare_op);
18639
18640 if (code != LTU && code != GEU)
18641 return false;
18642
18643 emit_insn (compare_seq);
18644 *pop = compare_op;
18645 return true;
18646 }
18647
18648 if (!INTEGRAL_MODE_P (mode))
18649 return false;
18650
18651 switch (code)
18652 {
18653 case LTU:
18654 case GEU:
18655 break;
18656
18657 /* Convert a==0 into (unsigned)a<1. */
18658 case EQ:
18659 case NE:
18660 if (op1 != const0_rtx)
18661 return false;
18662 op1 = const1_rtx;
18663 code = (code == EQ ? LTU : GEU);
18664 break;
18665
18666 /* Convert a>b into b<a or a>=b+1. */
18667 case GTU:
18668 case LEU:
18669 if (CONST_INT_P (op1))
18670 {
18671 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18672 /* Bail out on overflow. We could still swap the operands, but that
18673 would force loading of the constant into a register. */
18674 if (op1 == const0_rtx
18675 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18676 return false;
18677 code = (code == GTU ? GEU : LTU);
18678 }
18679 else
18680 {
18681 rtx tmp = op1;
18682 op1 = op0;
18683 op0 = tmp;
18684 code = (code == GTU ? LTU : GEU);
18685 }
18686 break;
18687
18688 /* Convert a>=0 into (unsigned)a<0x80000000. */
18689 case LT:
18690 case GE:
18691 if (mode == DImode || op1 != const0_rtx)
18692 return false;
18693 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18694 code = (code == LT ? GEU : LTU);
18695 break;
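/* Convert a<=-1 into (unsigned)a>=0x80000000: a <= -1 holds exactly
   when the sign bit of a is set.  */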
18696 case LE:
18697 case GT:
18698 if (mode == DImode || op1 != constm1_rtx)
18699 return false;
18700 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18701 code = (code == LE ? GEU : LTU);
18702 break;
18703
18704 default:
18705 return false;
18706 }
18707 /* Swapping operands may cause constant to appear as first operand. */
18708 if (!nonimmediate_operand (op0, VOIDmode))
18709 {
18710 if (!can_create_pseudo_p ())
18711 return false;
18712 op0 = force_reg (mode, op0);
18713 }
18714 *pop = ix86_expand_compare (code, op0, op1);
18715 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18716 return true;
18717 }
18718
18719 bool
18720 ix86_expand_int_movcc (rtx operands[])
18721 {
18722 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18723 rtx compare_seq, compare_op;
18724 enum machine_mode mode = GET_MODE (operands[0]);
18725 bool sign_bit_compare_p = false;
18726 rtx op0 = XEXP (operands[1], 0);
18727 rtx op1 = XEXP (operands[1], 1);
18728
18729 start_sequence ();
18730 compare_op = ix86_expand_compare (code, op0, op1);
18731 compare_seq = get_insns ();
18732 end_sequence ();
18733
18734 compare_code = GET_CODE (compare_op);
18735
18736 if ((op1 == const0_rtx && (code == GE || code == LT))
18737 || (op1 == constm1_rtx && (code == GT || code == LE)))
18738 sign_bit_compare_p = true;
18739
18740 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18741 HImode insns, we'd be swallowed in word prefix ops. */
18742
18743 if ((mode != HImode || TARGET_FAST_PREFIX)
18744 && (mode != (TARGET_64BIT ? TImode : DImode))
18745 && CONST_INT_P (operands[2])
18746 && CONST_INT_P (operands[3]))
18747 {
18748 rtx out = operands[0];
18749 HOST_WIDE_INT ct = INTVAL (operands[2]);
18750 HOST_WIDE_INT cf = INTVAL (operands[3]);
18751 HOST_WIDE_INT diff;
18752
18753 diff = ct - cf;
18754 /* Sign bit compares are better done using shifts than using
18755 sbb. */
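/* Either way TMP ends up holding an all-zeros/all-ones mask: sbb %reg,%reg
   copies the carry flag into every bit, and the sign-bit path uses a
   store-flag sequence normalized to -1.  */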
18756 if (sign_bit_compare_p
18757 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18758 {
18759 /* Detect overlap between destination and compare sources. */
18760 rtx tmp = out;
18761
18762 if (!sign_bit_compare_p)
18763 {
18764 rtx flags;
18765 bool fpcmp = false;
18766
18767 compare_code = GET_CODE (compare_op);
18768
18769 flags = XEXP (compare_op, 0);
18770
18771 if (GET_MODE (flags) == CCFPmode
18772 || GET_MODE (flags) == CCFPUmode)
18773 {
18774 fpcmp = true;
18775 compare_code
18776 = ix86_fp_compare_code_to_integer (compare_code);
18777 }
18778
18779 /* To simplify rest of code, restrict to the GEU case. */
18780 if (compare_code == LTU)
18781 {
18782 HOST_WIDE_INT tmp = ct;
18783 ct = cf;
18784 cf = tmp;
18785 compare_code = reverse_condition (compare_code);
18786 code = reverse_condition (code);
18787 }
18788 else
18789 {
18790 if (fpcmp)
18791 PUT_CODE (compare_op,
18792 reverse_condition_maybe_unordered
18793 (GET_CODE (compare_op)));
18794 else
18795 PUT_CODE (compare_op,
18796 reverse_condition (GET_CODE (compare_op)));
18797 }
18798 diff = ct - cf;
18799
18800 if (reg_overlap_mentioned_p (out, op0)
18801 || reg_overlap_mentioned_p (out, op1))
18802 tmp = gen_reg_rtx (mode);
18803
18804 if (mode == DImode)
18805 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18806 else
18807 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18808 flags, compare_op));
18809 }
18810 else
18811 {
18812 if (code == GT || code == GE)
18813 code = reverse_condition (code);
18814 else
18815 {
18816 HOST_WIDE_INT tmp = ct;
18817 ct = cf;
18818 cf = tmp;
18819 diff = ct - cf;
18820 }
18821 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18822 }
18823
18824 if (diff == 1)
18825 {
18826 /*
18827 * cmpl op0,op1
18828 * sbbl dest,dest
18829 * [addl dest, ct]
18830 *
18831 * Size 5 - 8.
18832 */
18833 if (ct)
18834 tmp = expand_simple_binop (mode, PLUS,
18835 tmp, GEN_INT (ct),
18836 copy_rtx (tmp), 1, OPTAB_DIRECT);
18837 }
18838 else if (cf == -1)
18839 {
18840 /*
18841 * cmpl op0,op1
18842 * sbbl dest,dest
18843 * orl $ct, dest
18844 *
18845 * Size 8.
18846 */
18847 tmp = expand_simple_binop (mode, IOR,
18848 tmp, GEN_INT (ct),
18849 copy_rtx (tmp), 1, OPTAB_DIRECT);
18850 }
18851 else if (diff == -1 && ct)
18852 {
18853 /*
18854 * cmpl op0,op1
18855 * sbbl dest,dest
18856 * notl dest
18857 * [addl dest, cf]
18858 *
18859 * Size 8 - 11.
18860 */
18861 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18862 if (cf)
18863 tmp = expand_simple_binop (mode, PLUS,
18864 copy_rtx (tmp), GEN_INT (cf),
18865 copy_rtx (tmp), 1, OPTAB_DIRECT);
18866 }
18867 else
18868 {
18869 /*
18870 * cmpl op0,op1
18871 * sbbl dest,dest
18872 * [notl dest]
18873 * andl cf - ct, dest
18874 * [addl dest, ct]
18875 *
18876 * Size 8 - 11.
18877 */
18878
18879 if (cf == 0)
18880 {
18881 cf = ct;
18882 ct = 0;
18883 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18884 }
18885
18886 tmp = expand_simple_binop (mode, AND,
18887 copy_rtx (tmp),
18888 gen_int_mode (cf - ct, mode),
18889 copy_rtx (tmp), 1, OPTAB_DIRECT);
18890 if (ct)
18891 tmp = expand_simple_binop (mode, PLUS,
18892 copy_rtx (tmp), GEN_INT (ct),
18893 copy_rtx (tmp), 1, OPTAB_DIRECT);
18894 }
18895
18896 if (!rtx_equal_p (tmp, out))
18897 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18898
18899 return true;
18900 }
18901
18902 if (diff < 0)
18903 {
18904 enum machine_mode cmp_mode = GET_MODE (op0);
18905
18906 HOST_WIDE_INT tmp;
18907 tmp = ct, ct = cf, cf = tmp;
18908 diff = -diff;
18909
18910 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18911 {
18912 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18913
18914 /* We may be reversing an unordered compare to a normal compare, which
18915 is not valid in general (we may convert a non-trapping condition
18916 to a trapping one); however, on i386 we currently emit all
18917 comparisons unordered. */
18918 compare_code = reverse_condition_maybe_unordered (compare_code);
18919 code = reverse_condition_maybe_unordered (code);
18920 }
18921 else
18922 {
18923 compare_code = reverse_condition (compare_code);
18924 code = reverse_condition (code);
18925 }
18926 }
18927
18928 compare_code = UNKNOWN;
18929 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18930 && CONST_INT_P (op1))
18931 {
18932 if (op1 == const0_rtx
18933 && (code == LT || code == GE))
18934 compare_code = code;
18935 else if (op1 == constm1_rtx)
18936 {
18937 if (code == LE)
18938 compare_code = LT;
18939 else if (code == GT)
18940 compare_code = GE;
18941 }
18942 }
18943
18944 /* Optimize dest = (op0 < 0) ? -1 : cf. */
18945 if (compare_code != UNKNOWN
18946 && GET_MODE (op0) == GET_MODE (out)
18947 && (cf == -1 || ct == -1))
18948 {
18949 /* If lea code below could be used, only optimize
18950 if it results in a 2 insn sequence. */
18951
18952 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
18953 || diff == 3 || diff == 5 || diff == 9)
18954 || (compare_code == LT && ct == -1)
18955 || (compare_code == GE && cf == -1))
18956 {
18957 /*
18958 * notl op1 (if necessary)
18959 * sarl $31, op1
18960 * orl cf, op1
18961 */
18962 if (ct != -1)
18963 {
18964 cf = ct;
18965 ct = -1;
18966 code = reverse_condition (code);
18967 }
18968
18969 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18970
18971 out = expand_simple_binop (mode, IOR,
18972 out, GEN_INT (cf),
18973 out, 1, OPTAB_DIRECT);
18974 if (out != operands[0])
18975 emit_move_insn (operands[0], out);
18976
18977 return true;
18978 }
18979 }
18980
18981
18982 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
18983 || diff == 3 || diff == 5 || diff == 9)
18984 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
18985 && (mode != DImode
18986 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
18987 {
18988 /*
18989 * xorl dest,dest
18990 * cmpl op1,op2
18991 * setcc dest
18992 * lea cf(dest*(ct-cf)),dest
18993 *
18994 * Size 14.
18995 *
18996 * This also catches the degenerate setcc-only case.
18997 */
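/* I.e. dest = cf + setcc * (ct - cf), which is ct when the condition
   holds and cf otherwise; the multiply and add fold into a single lea
   when diff is 1, 2, 3, 4, 5, 8 or 9.  */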
18998
18999 rtx tmp;
19000 int nops;
19001
19002 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19003
19004 nops = 0;
19005 /* On x86_64 the lea instruction operates on Pmode, so we need
19006 to get the arithmetic done in the proper mode to match. */
19007 if (diff == 1)
19008 tmp = copy_rtx (out);
19009 else
19010 {
19011 rtx out1;
19012 out1 = copy_rtx (out);
19013 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19014 nops++;
19015 if (diff & 1)
19016 {
19017 tmp = gen_rtx_PLUS (mode, tmp, out1);
19018 nops++;
19019 }
19020 }
19021 if (cf != 0)
19022 {
19023 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19024 nops++;
19025 }
19026 if (!rtx_equal_p (tmp, out))
19027 {
19028 if (nops == 1)
19029 out = force_operand (tmp, copy_rtx (out));
19030 else
19031 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19032 }
19033 if (!rtx_equal_p (out, operands[0]))
19034 emit_move_insn (operands[0], copy_rtx (out));
19035
19036 return true;
19037 }
19038
19039 /*
19040 * General case: Jumpful:
19041 * xorl dest,dest cmpl op1, op2
19042 * cmpl op1, op2 movl ct, dest
19043 * setcc dest jcc 1f
19044 * decl dest movl cf, dest
19045 * andl (cf-ct),dest 1:
19046 * addl ct,dest
19047 *
19048 * Size 20. Size 14.
19049 *
19050 * This is reasonably steep, but branch mispredict costs are
19051 * high on modern cpus, so consider failing only if optimizing
19052 * for space.
19053 */
19054
19055 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19056 && BRANCH_COST (optimize_insn_for_speed_p (),
19057 false) >= 2)
19058 {
19059 if (cf == 0)
19060 {
19061 enum machine_mode cmp_mode = GET_MODE (op0);
19062
19063 cf = ct;
19064 ct = 0;
19065
19066 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19067 {
19068 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19069
19070 /* We may be reversing an unordered compare to a normal compare,
19071 which is not valid in general (we may convert a non-trapping
19072 condition to a trapping one); however, on i386 we currently
19073 emit all comparisons unordered. */
19074 code = reverse_condition_maybe_unordered (code);
19075 }
19076 else
19077 {
19078 code = reverse_condition (code);
19079 if (compare_code != UNKNOWN)
19080 compare_code = reverse_condition (compare_code);
19081 }
19082 }
19083
19084 if (compare_code != UNKNOWN)
19085 {
19086 /* notl op1 (if needed)
19087 sarl $31, op1
19088 andl (cf-ct), op1
19089 addl ct, op1
19090
19091 For x < 0 (resp. x <= -1) there will be no notl,
19092 so if possible swap the constants to get rid of the
19093 complement.
19094 True/false will be -1/0 while code below (store flag
19095 followed by decrement) is 0/-1, so the constants need
19096 to be exchanged once more. */
19097
19098 if (compare_code == GE || !cf)
19099 {
19100 code = reverse_condition (code);
19101 compare_code = LT;
19102 }
19103 else
19104 {
19105 HOST_WIDE_INT tmp = cf;
19106 cf = ct;
19107 ct = tmp;
19108 }
19109
19110 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19111 }
19112 else
19113 {
19114 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19115
19116 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19117 constm1_rtx,
19118 copy_rtx (out), 1, OPTAB_DIRECT);
19119 }
19120
19121 out = expand_simple_binop (mode, AND, copy_rtx (out),
19122 gen_int_mode (cf - ct, mode),
19123 copy_rtx (out), 1, OPTAB_DIRECT);
19124 if (ct)
19125 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19126 copy_rtx (out), 1, OPTAB_DIRECT);
19127 if (!rtx_equal_p (out, operands[0]))
19128 emit_move_insn (operands[0], copy_rtx (out));
19129
19130 return true;
19131 }
19132 }
19133
19134 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19135 {
19136 /* Try a few things more with specific constants and a variable. */
19137
19138 optab op;
19139 rtx var, orig_out, out, tmp;
19140
19141 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19142 return false;
19143
19144 /* If one of the two operands is an interesting constant, recurse to
19145 compute a 0/-1 mask and then combine the variable with it using AND or IOR. */
19146
19147 if (CONST_INT_P (operands[2]))
19148 {
19149 var = operands[3];
19150 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19151 operands[3] = constm1_rtx, op = and_optab;
19152 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19153 operands[3] = const0_rtx, op = ior_optab;
19154 else
19155 return false;
19156 }
19157 else if (CONST_INT_P (operands[3]))
19158 {
19159 var = operands[2];
19160 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19161 operands[2] = constm1_rtx, op = and_optab;
19162 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19163 operands[2] = const0_rtx, op = ior_optab;
19164 else
19165 return false;
19166 }
19167 else
19168 return false;
19169
19170 orig_out = operands[0];
19171 tmp = gen_reg_rtx (mode);
19172 operands[0] = tmp;
19173
19174 /* Recurse to get the constant loaded. */
19175 if (ix86_expand_int_movcc (operands) == 0)
19176 return false;
19177
19178 /* Mask in the interesting variable. */
19179 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19180 OPTAB_WIDEN);
19181 if (!rtx_equal_p (out, orig_out))
19182 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19183
19184 return true;
19185 }
19186
19187 /*
19188 * For comparison with above,
19189 *
19190 * movl cf,dest
19191 * movl ct,tmp
19192 * cmpl op1,op2
19193 * cmovcc tmp,dest
19194 *
19195 * Size 15.
19196 */
19197
19198 if (! nonimmediate_operand (operands[2], mode))
19199 operands[2] = force_reg (mode, operands[2]);
19200 if (! nonimmediate_operand (operands[3], mode))
19201 operands[3] = force_reg (mode, operands[3]);
19202
19203 if (! register_operand (operands[2], VOIDmode)
19204 && (mode == QImode
19205 || ! register_operand (operands[3], VOIDmode)))
19206 operands[2] = force_reg (mode, operands[2]);
19207
19208 if (mode == QImode
19209 && ! register_operand (operands[3], VOIDmode))
19210 operands[3] = force_reg (mode, operands[3]);
19211
19212 emit_insn (compare_seq);
19213 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19214 gen_rtx_IF_THEN_ELSE (mode,
19215 compare_op, operands[2],
19216 operands[3])));
19217 return true;
19218 }
19219
19220 /* Swap, force into registers, or otherwise massage the two operands
19221 to an sse comparison with a mask result. Thus we differ a bit from
19222 ix86_prepare_fp_compare_args which expects to produce a flags result.
19223
19224 The DEST operand exists to help determine whether to commute commutative
19225 operators. The POP0/POP1 operands are updated in place. The new
19226 comparison code is returned, or UNKNOWN if not implementable. */
19227
19228 static enum rtx_code
19229 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19230 rtx *pop0, rtx *pop1)
19231 {
19232 rtx tmp;
19233
19234 switch (code)
19235 {
19236 case LTGT:
19237 case UNEQ:
19238 /* AVX supports all the needed comparisons. */
19239 if (TARGET_AVX)
19240 break;
19241 /* We have no LTGT as an operator. We could implement it with
19242 NE & ORDERED, but this requires an extra temporary. It's
19243 not clear that it's worth it. */
19244 return UNKNOWN;
19245
19246 case LT:
19247 case LE:
19248 case UNGT:
19249 case UNGE:
19250 /* These are supported directly. */
19251 break;
19252
19253 case EQ:
19254 case NE:
19255 case UNORDERED:
19256 case ORDERED:
19257 /* AVX has 3 operand comparisons, no need to swap anything. */
19258 if (TARGET_AVX)
19259 break;
19260 /* For commutative operators, try to canonicalize the destination
19261 operand to be first in the comparison - this helps reload to
19262 avoid extra moves. */
19263 if (!dest || !rtx_equal_p (dest, *pop1))
19264 break;
19265 /* FALLTHRU */
19266
19267 case GE:
19268 case GT:
19269 case UNLE:
19270 case UNLT:
19271 /* These are not supported directly before AVX, and furthermore
19272 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19273 comparison operands to transform into something that is
19274 supported. */
19275 tmp = *pop0;
19276 *pop0 = *pop1;
19277 *pop1 = tmp;
19278 code = swap_condition (code);
19279 break;
19280
19281 default:
19282 gcc_unreachable ();
19283 }
19284
19285 return code;
19286 }
19287
19288 /* Detect conditional moves that exactly match min/max operational
19289 semantics. Note that this is IEEE safe, as long as we don't
19290 interchange the operands.
19291
19292 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19293 and TRUE if the operation is successful and instructions are emitted. */
19294
19295 static bool
19296 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19297 rtx cmp_op1, rtx if_true, rtx if_false)
19298 {
19299 enum machine_mode mode;
19300 bool is_min;
19301 rtx tmp;
19302
19303 if (code == LT)
19304 ;
19305 else if (code == UNGE)
19306 {
19307 tmp = if_true;
19308 if_true = if_false;
19309 if_false = tmp;
19310 }
19311 else
19312 return false;
19313
19314 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19315 is_min = true;
19316 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19317 is_min = false;
19318 else
19319 return false;
19320
19321 mode = GET_MODE (dest);
19322
19323 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19324 but MODE may be a vector mode and thus not appropriate. */
19325 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19326 {
19327 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19328 rtvec v;
19329
19330 if_true = force_reg (mode, if_true);
19331 v = gen_rtvec (2, if_true, if_false);
19332 tmp = gen_rtx_UNSPEC (mode, v, u);
19333 }
19334 else
19335 {
19336 code = is_min ? SMIN : SMAX;
19337 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19338 }
19339
19340 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19341 return true;
19342 }
19343
19344 /* Expand an sse vector comparison. Return the register with the result. */
19345
19346 static rtx
19347 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19348 rtx op_true, rtx op_false)
19349 {
19350 enum machine_mode mode = GET_MODE (dest);
19351 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19352 rtx x;
19353
19354 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19355 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19356 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19357
19358 if (optimize
19359 || reg_overlap_mentioned_p (dest, op_true)
19360 || reg_overlap_mentioned_p (dest, op_false))
19361 dest = gen_reg_rtx (mode);
19362
19363 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19364 if (cmp_mode != mode)
19365 {
19366 x = force_reg (cmp_mode, x);
19367 convert_move (dest, x, false);
19368 }
19369 else
19370 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19371
19372 return dest;
19373 }
19374
19375 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19376 operations. This is used for both scalar and vector conditional moves. */
19377
19378 static void
19379 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19380 {
19381 enum machine_mode mode = GET_MODE (dest);
19382 rtx t2, t3, x;
19383
19384 if (vector_all_ones_operand (op_true, mode)
19385 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19386 {
19387 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19388 }
19389 else if (op_false == CONST0_RTX (mode))
19390 {
19391 op_true = force_reg (mode, op_true);
19392 x = gen_rtx_AND (mode, cmp, op_true);
19393 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19394 }
19395 else if (op_true == CONST0_RTX (mode))
19396 {
19397 op_false = force_reg (mode, op_false);
19398 x = gen_rtx_NOT (mode, cmp);
19399 x = gen_rtx_AND (mode, x, op_false);
19400 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19401 }
19402 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19403 {
19404 op_false = force_reg (mode, op_false);
19405 x = gen_rtx_IOR (mode, cmp, op_false);
19406 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19407 }
19408 else if (TARGET_XOP)
19409 {
19410 op_true = force_reg (mode, op_true);
19411
19412 if (!nonimmediate_operand (op_false, mode))
19413 op_false = force_reg (mode, op_false);
19414
19415 emit_insn (gen_rtx_SET (mode, dest,
19416 gen_rtx_IF_THEN_ELSE (mode, cmp,
19417 op_true,
19418 op_false)));
19419 }
19420 else
19421 {
19422 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19423
19424 if (!nonimmediate_operand (op_true, mode))
19425 op_true = force_reg (mode, op_true);
19426
19427 op_false = force_reg (mode, op_false);
19428
19429 switch (mode)
19430 {
19431 case V4SFmode:
19432 if (TARGET_SSE4_1)
19433 gen = gen_sse4_1_blendvps;
19434 break;
19435 case V2DFmode:
19436 if (TARGET_SSE4_1)
19437 gen = gen_sse4_1_blendvpd;
19438 break;
19439 case V16QImode:
19440 case V8HImode:
19441 case V4SImode:
19442 case V2DImode:
19443 if (TARGET_SSE4_1)
19444 {
19445 gen = gen_sse4_1_pblendvb;
19446 dest = gen_lowpart (V16QImode, dest);
19447 op_false = gen_lowpart (V16QImode, op_false);
19448 op_true = gen_lowpart (V16QImode, op_true);
19449 cmp = gen_lowpart (V16QImode, cmp);
19450 }
19451 break;
19452 case V8SFmode:
19453 if (TARGET_AVX)
19454 gen = gen_avx_blendvps256;
19455 break;
19456 case V4DFmode:
19457 if (TARGET_AVX)
19458 gen = gen_avx_blendvpd256;
19459 break;
19460 case V32QImode:
19461 case V16HImode:
19462 case V8SImode:
19463 case V4DImode:
19464 if (TARGET_AVX2)
19465 {
19466 gen = gen_avx2_pblendvb;
19467 dest = gen_lowpart (V32QImode, dest);
19468 op_false = gen_lowpart (V32QImode, op_false);
19469 op_true = gen_lowpart (V32QImode, op_true);
19470 cmp = gen_lowpart (V32QImode, cmp);
19471 }
19472 break;
19473 default:
19474 break;
19475 }
19476
19477 if (gen != NULL)
19478 emit_insn (gen (dest, op_false, op_true, cmp));
19479 else
19480 {
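/* No blend instruction is available, so compute the classic mask blend
   dest = (op_true & cmp) | (op_false & ~cmp) by hand.  */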
19481 op_true = force_reg (mode, op_true);
19482
19483 t2 = gen_reg_rtx (mode);
19484 if (optimize)
19485 t3 = gen_reg_rtx (mode);
19486 else
19487 t3 = dest;
19488
19489 x = gen_rtx_AND (mode, op_true, cmp);
19490 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19491
19492 x = gen_rtx_NOT (mode, cmp);
19493 x = gen_rtx_AND (mode, x, op_false);
19494 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19495
19496 x = gen_rtx_IOR (mode, t3, t2);
19497 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19498 }
19499 }
19500 }
19501
19502 /* Expand a floating-point conditional move. Return true if successful. */
19503
19504 bool
19505 ix86_expand_fp_movcc (rtx operands[])
19506 {
19507 enum machine_mode mode = GET_MODE (operands[0]);
19508 enum rtx_code code = GET_CODE (operands[1]);
19509 rtx tmp, compare_op;
19510 rtx op0 = XEXP (operands[1], 0);
19511 rtx op1 = XEXP (operands[1], 1);
19512
19513 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19514 {
19515 enum machine_mode cmode;
19516
19517 /* Since we've no cmove for sse registers, don't force bad register
19518 allocation just to gain access to it. Deny movcc when the
19519 comparison mode doesn't match the move mode. */
19520 cmode = GET_MODE (op0);
19521 if (cmode == VOIDmode)
19522 cmode = GET_MODE (op1);
19523 if (cmode != mode)
19524 return false;
19525
19526 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19527 if (code == UNKNOWN)
19528 return false;
19529
19530 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19531 operands[2], operands[3]))
19532 return true;
19533
19534 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19535 operands[2], operands[3]);
19536 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19537 return true;
19538 }
19539
19540 /* The floating point conditional move instructions don't directly
19541 support conditions resulting from a signed integer comparison. */
19542
19543 compare_op = ix86_expand_compare (code, op0, op1);
19544 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19545 {
19546 tmp = gen_reg_rtx (QImode);
19547 ix86_expand_setcc (tmp, code, op0, op1);
19548
19549 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19550 }
19551
19552 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19553 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19554 operands[2], operands[3])));
19555
19556 return true;
19557 }
19558
19559 /* Expand a floating-point vector conditional move; a vcond operation
19560 rather than a movcc operation. */
19561
19562 bool
19563 ix86_expand_fp_vcond (rtx operands[])
19564 {
19565 enum rtx_code code = GET_CODE (operands[3]);
19566 rtx cmp;
19567
19568 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19569 &operands[4], &operands[5]);
19570 if (code == UNKNOWN)
19571 {
19572 rtx temp;
19573 switch (GET_CODE (operands[3]))
19574 {
19575 case LTGT:
19576 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19577 operands[5], operands[0], operands[0]);
19578 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19579 operands[5], operands[1], operands[2]);
19580 code = AND;
19581 break;
19582 case UNEQ:
19583 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19584 operands[5], operands[0], operands[0]);
19585 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19586 operands[5], operands[1], operands[2]);
19587 code = IOR;
19588 break;
19589 default:
19590 gcc_unreachable ();
19591 }
19592 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19593 OPTAB_DIRECT);
19594 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19595 return true;
19596 }
19597
19598 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19599 operands[5], operands[1], operands[2]))
19600 return true;
19601
19602 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19603 operands[1], operands[2]);
19604 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19605 return true;
19606 }
19607
19608 /* Expand a signed/unsigned integral vector conditional move. */
19609
19610 bool
19611 ix86_expand_int_vcond (rtx operands[])
19612 {
19613 enum machine_mode data_mode = GET_MODE (operands[0]);
19614 enum machine_mode mode = GET_MODE (operands[4]);
19615 enum rtx_code code = GET_CODE (operands[3]);
19616 bool negate = false;
19617 rtx x, cop0, cop1;
19618
19619 cop0 = operands[4];
19620 cop1 = operands[5];
19621
19622 /* XOP supports all of the comparisons on all vector int types. */
19623 if (!TARGET_XOP)
19624 {
19625 /* Canonicalize the comparison to EQ, GT, GTU. */
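/* NE, LE and LEU become the negation of EQ, GT and GTU; GE/GEU and
   LT/LTU become GT/GTU with the comparison operands swapped (and, for
   GE/GEU, the movcc arms swapped as well via NEGATE).  */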
19626 switch (code)
19627 {
19628 case EQ:
19629 case GT:
19630 case GTU:
19631 break;
19632
19633 case NE:
19634 case LE:
19635 case LEU:
19636 code = reverse_condition (code);
19637 negate = true;
19638 break;
19639
19640 case GE:
19641 case GEU:
19642 code = reverse_condition (code);
19643 negate = true;
19644 /* FALLTHRU */
19645
19646 case LT:
19647 case LTU:
19648 code = swap_condition (code);
19649 x = cop0, cop0 = cop1, cop1 = x;
19650 break;
19651
19652 default:
19653 gcc_unreachable ();
19654 }
19655
19656 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19657 if (mode == V2DImode)
19658 {
19659 switch (code)
19660 {
19661 case EQ:
19662 /* SSE4.1 supports EQ. */
19663 if (!TARGET_SSE4_1)
19664 return false;
19665 break;
19666
19667 case GT:
19668 case GTU:
19669 /* SSE4.2 supports GT/GTU. */
19670 if (!TARGET_SSE4_2)
19671 return false;
19672 break;
19673
19674 default:
19675 gcc_unreachable ();
19676 }
19677 }
19678
19679 /* Unsigned parallel compare is not supported by the hardware.
19680 Play some tricks to turn this into a signed comparison
19681 against 0. */
19682 if (code == GTU)
19683 {
19684 cop0 = force_reg (mode, cop0);
19685
19686 switch (mode)
19687 {
19688 case V8SImode:
19689 case V4DImode:
19690 case V4SImode:
19691 case V2DImode:
19692 {
19693 rtx t1, t2, mask;
19694 rtx (*gen_sub3) (rtx, rtx, rtx);
19695
19696 switch (mode)
19697 {
19698 case V8SImode: gen_sub3 = gen_subv8si3; break;
19699 case V4DImode: gen_sub3 = gen_subv4di3; break;
19700 case V4SImode: gen_sub3 = gen_subv4si3; break;
19701 case V2DImode: gen_sub3 = gen_subv2di3; break;
19702 default:
19703 gcc_unreachable ();
19704 }
19705 /* Subtract (-(INT MAX) - 1) from both operands to make
19706 them signed. */
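/* Flipping the sign bit of both operands preserves their relative
   order, so the unsigned comparison becomes the signed GT that the
   hardware provides.  */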
19707 mask = ix86_build_signbit_mask (mode, true, false);
19708 t1 = gen_reg_rtx (mode);
19709 emit_insn (gen_sub3 (t1, cop0, mask));
19710
19711 t2 = gen_reg_rtx (mode);
19712 emit_insn (gen_sub3 (t2, cop1, mask));
19713
19714 cop0 = t1;
19715 cop1 = t2;
19716 code = GT;
19717 }
19718 break;
19719
19720 case V32QImode:
19721 case V16HImode:
19722 case V16QImode:
19723 case V8HImode:
19724 /* Perform a parallel unsigned saturating subtraction. */
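/* The saturating difference is zero exactly when cop0 <=u cop1,
   i.e. the inverse of GTU, so compare it for equality with zero and
   flip NEGATE.  */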
19725 x = gen_reg_rtx (mode);
19726 emit_insn (gen_rtx_SET (VOIDmode, x,
19727 gen_rtx_US_MINUS (mode, cop0, cop1)));
19728
19729 cop0 = x;
19730 cop1 = CONST0_RTX (mode);
19731 code = EQ;
19732 negate = !negate;
19733 break;
19734
19735 default:
19736 gcc_unreachable ();
19737 }
19738 }
19739 }
19740
19741 /* Allow the comparison to be done in one mode, but the movcc to
19742 happen in another mode. */
19743 if (data_mode == mode)
19744 {
19745 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19746 operands[1+negate], operands[2-negate]);
19747 }
19748 else
19749 {
19750 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19751 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19752 code, cop0, cop1,
19753 operands[1+negate], operands[2-negate]);
19754 x = gen_lowpart (data_mode, x);
19755 }
19756
19757 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19758 operands[2-negate]);
19759 return true;
19760 }
19761
19762 /* Expand a variable vector permutation. */
19763
19764 void
19765 ix86_expand_vec_perm (rtx operands[])
19766 {
19767 rtx target = operands[0];
19768 rtx op0 = operands[1];
19769 rtx op1 = operands[2];
19770 rtx mask = operands[3];
19771 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19772 enum machine_mode mode = GET_MODE (op0);
19773 enum machine_mode maskmode = GET_MODE (mask);
19774 int w, e, i;
19775 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19776
19777 /* Number of elements in the vector. */
19778 w = GET_MODE_NUNITS (mode);
19779 e = GET_MODE_UNIT_SIZE (mode);
19780 gcc_assert (w <= 32);
19781
19782 if (TARGET_AVX2)
19783 {
19784 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19785 {
19786 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19787 a constant shuffle operand. With a tiny bit of effort we can
19788 use VPERMD instead. A re-interpretation stall for V4DFmode is
19789 unfortunate but there's no avoiding it.
19790 Similarly for V16HImode we don't have instructions for variable
19791 shuffling, while for V32QImode we can, after preparing suitable
19792 masks, use vpshufb; vpshufb; vpermq; vpor. */
19793
19794 if (mode == V16HImode)
19795 {
19796 maskmode = mode = V32QImode;
19797 w = 32;
19798 e = 1;
19799 }
19800 else
19801 {
19802 maskmode = mode = V8SImode;
19803 w = 8;
19804 e = 4;
19805 }
19806 t1 = gen_reg_rtx (maskmode);
19807
19808 /* Replicate the low bits of the V4DImode mask into V8SImode:
19809 mask = { A B C D }
19810 t1 = { A A B B C C D D }. */
19811 for (i = 0; i < w / 2; ++i)
19812 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19813 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19814 vt = force_reg (maskmode, vt);
19815 mask = gen_lowpart (maskmode, mask);
19816 if (maskmode == V8SImode)
19817 emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
19818 else
19819 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19820
19821 /* Multiply the shuffle indices by two. */
19822 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19823 OPTAB_DIRECT);
19824
19825 /* Add one to the odd shuffle indices:
19826 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19827 for (i = 0; i < w / 2; ++i)
19828 {
19829 vec[i * 2] = const0_rtx;
19830 vec[i * 2 + 1] = const1_rtx;
19831 }
19832 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19833 vt = force_const_mem (maskmode, vt);
19834 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19835 OPTAB_DIRECT);
19836
19837 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19838 operands[3] = mask = t1;
19839 target = gen_lowpart (mode, target);
19840 op0 = gen_lowpart (mode, op0);
19841 op1 = gen_lowpart (mode, op1);
19842 }
19843
19844 switch (mode)
19845 {
19846 case V8SImode:
19847 /* The VPERMD and VPERMPS instructions already properly ignore
19848 the high bits of the shuffle elements. No need for us to
19849 perform an AND ourselves. */
19850 if (one_operand_shuffle)
19851 emit_insn (gen_avx2_permvarv8si (target, mask, op0));
19852 else
19853 {
19854 t1 = gen_reg_rtx (V8SImode);
19855 t2 = gen_reg_rtx (V8SImode);
19856 emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
19857 emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
19858 goto merge_two;
19859 }
19860 return;
19861
19862 case V8SFmode:
19863 mask = gen_lowpart (V8SFmode, mask);
19864 if (one_operand_shuffle)
19865 emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
19866 else
19867 {
19868 t1 = gen_reg_rtx (V8SFmode);
19869 t2 = gen_reg_rtx (V8SFmode);
19870 emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
19871 emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
19872 goto merge_two;
19873 }
19874 return;
19875
19876 case V4SImode:
19877 /* By combining the two 128-bit input vectors into one 256-bit
19878 input vector, we can use VPERMD and VPERMPS for the full
19879 two-operand shuffle. */
19880 t1 = gen_reg_rtx (V8SImode);
19881 t2 = gen_reg_rtx (V8SImode);
19882 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19883 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19884 emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
19885 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19886 return;
19887
19888 case V4SFmode:
19889 t1 = gen_reg_rtx (V8SFmode);
19890 t2 = gen_reg_rtx (V8SFmode);
19891 mask = gen_lowpart (V4SFmode, mask);
19892 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19893 emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
19894 emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
19895 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19896 return;
19897
19898 case V32QImode:
19899 t1 = gen_reg_rtx (V32QImode);
19900 t2 = gen_reg_rtx (V32QImode);
19901 t3 = gen_reg_rtx (V32QImode);
19902 vt2 = GEN_INT (128);
19903 for (i = 0; i < 32; i++)
19904 vec[i] = vt2;
19905 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19906 vt = force_reg (V32QImode, vt);
19907 for (i = 0; i < 32; i++)
19908 vec[i] = i < 16 ? vt2 : const0_rtx;
19909 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19910 vt2 = force_reg (V32QImode, vt2);
19911 /* From mask create two adjusted masks, which contain the same
19912 bits as mask in the low 7 bits of each vector element.
19913 The first mask will have the most significant bit clear
19914 if it requests an element from the same 128-bit lane
19915 and the MSB set if it requests an element from the other 128-bit lane.
19916 The second mask will have the opposite value of the MSB,
19917 and additionally will have its 128-bit lanes swapped.
19918 E.g. a { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
19919 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
19920 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
19921 stands for another 12 bytes. */
19922 /* The bit telling whether an element is from the same lane or the other
19923 lane is bit 4, so shift it up by 3 to the MSB position. */
19924 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
19925 gen_lowpart (V4DImode, mask),
19926 GEN_INT (3)));
19927 /* Clear MSB bits from the mask just in case it had them set. */
19928 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
19929 /* After this t1 will have MSB set for elements from other lane. */
19930 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
19931 /* Clear bits other than MSB. */
19932 emit_insn (gen_andv32qi3 (t1, t1, vt));
19933 /* Or in the lower bits from mask into t3. */
19934 emit_insn (gen_iorv32qi3 (t3, t1, t2));
19935 /* And invert MSB bits in t1, so MSB is set for elements from the same
19936 lane. */
19937 emit_insn (gen_xorv32qi3 (t1, t1, vt));
19938 /* Swap 128-bit lanes in t3. */
19939 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19940 gen_lowpart (V4DImode, t3),
19941 const2_rtx, GEN_INT (3),
19942 const0_rtx, const1_rtx));
19943 /* And or in the lower bits from mask into t1. */
19944 emit_insn (gen_iorv32qi3 (t1, t1, t2));
19945 if (one_operand_shuffle)
19946 {
19947 /* Each of these shuffles will put 0s in places where an
19948 element from the other 128-bit lane is needed; otherwise it
19949 will shuffle in the requested value. */
19950 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
19951 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
19952 /* For t3 the 128-bit lanes are swapped again. */
19953 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19954 gen_lowpart (V4DImode, t3),
19955 const2_rtx, GEN_INT (3),
19956 const0_rtx, const1_rtx));
19957 /* And oring both together leads to the result. */
19958 emit_insn (gen_iorv32qi3 (target, t1, t3));
19959 return;
19960 }
19961
19962 t4 = gen_reg_rtx (V32QImode);
19963 /* Similar to the one_operand_shuffle code above,
19964 just repeated twice, once for each operand. The merge_two:
19965 code will merge the two results together. */
19966 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
19967 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
19968 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
19969 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
19970 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
19971 gen_lowpart (V4DImode, t4),
19972 const2_rtx, GEN_INT (3),
19973 const0_rtx, const1_rtx));
19974 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19975 gen_lowpart (V4DImode, t3),
19976 const2_rtx, GEN_INT (3),
19977 const0_rtx, const1_rtx));
19978 emit_insn (gen_iorv32qi3 (t4, t2, t4));
19979 emit_insn (gen_iorv32qi3 (t3, t1, t3));
19980 t1 = t4;
19981 t2 = t3;
19982 goto merge_two;
19983
19984 default:
19985 gcc_assert (GET_MODE_SIZE (mode) <= 16);
19986 break;
19987 }
19988 }
19989
19990 if (TARGET_XOP)
19991 {
19992 /* The XOP VPPERM insn supports three inputs. By ignoring the
19993 one_operand_shuffle special case, we avoid creating another
19994 set of constant vectors in memory. */
19995 one_operand_shuffle = false;
19996
19997 /* mask = mask & {2*w-1, ...} */
19998 vt = GEN_INT (2*w - 1);
19999 }
20000 else
20001 {
20002 /* mask = mask & {w-1, ...} */
20003 vt = GEN_INT (w - 1);
20004 }
20005
20006 for (i = 0; i < w; i++)
20007 vec[i] = vt;
20008 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20009 mask = expand_simple_binop (maskmode, AND, mask, vt,
20010 NULL_RTX, 0, OPTAB_DIRECT);
20011
20012 /* For non-QImode operations, convert the word permutation control
20013 into a byte permutation control. */
20014 if (mode != V16QImode)
20015 {
20016 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20017 GEN_INT (exact_log2 (e)),
20018 NULL_RTX, 0, OPTAB_DIRECT);
20019
20020 /* Convert mask to vector of chars. */
20021 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20022
20023 /* Replicate each of the input bytes into byte positions:
20024 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20025 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20026 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20027 for (i = 0; i < 16; ++i)
20028 vec[i] = GEN_INT (i/e * e);
20029 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20030 vt = force_const_mem (V16QImode, vt);
20031 if (TARGET_XOP)
20032 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20033 else
20034 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20035
20036 /* Convert it into the byte positions by doing
20037 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20038 for (i = 0; i < 16; ++i)
20039 vec[i] = GEN_INT (i % e);
20040 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20041 vt = force_const_mem (V16QImode, vt);
20042 emit_insn (gen_addv16qi3 (mask, mask, vt));
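/* A worked example, illustrative only: for a V4SImode shuffle e == 4,
   so a mask word selecting element 2 is first shifted left by 2 to
   give 8, the pshufb above replicates it into every byte of its word,
   {8,8,8,8}, and the addition of {0,1,2,3} turns that into {8,9,10,11},
   i.e. exactly the byte positions of source element 2. */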
20043 }
20044
20045 /* The actual shuffle operations all operate on V16QImode. */
20046 op0 = gen_lowpart (V16QImode, op0);
20047 op1 = gen_lowpart (V16QImode, op1);
20048 target = gen_lowpart (V16QImode, target);
20049
20050 if (TARGET_XOP)
20051 {
20052 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20053 }
20054 else if (one_operand_shuffle)
20055 {
20056 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20057 }
20058 else
20059 {
20060 rtx xops[6];
20061 bool ok;
20062
20063 /* Shuffle the two input vectors independently. */
20064 t1 = gen_reg_rtx (V16QImode);
20065 t2 = gen_reg_rtx (V16QImode);
20066 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20067 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20068
20069 merge_two:
20070 /* Then merge them together. The key is whether any given control
20071 element contained a bit set that indicates the second word. */
20072 mask = operands[3];
20073 vt = GEN_INT (w);
20074 if (maskmode == V2DImode && !TARGET_SSE4_1)
20075 {
20076 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20077 more shuffle to convert the V2DI input mask into a V4SI
20078 input mask, at which point the masking performed by
20079 expand_int_vcond will work as desired. */
20080 rtx t3 = gen_reg_rtx (V4SImode);
20081 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20082 const0_rtx, const0_rtx,
20083 const2_rtx, const2_rtx));
20084 mask = t3;
20085 maskmode = V4SImode;
20086 e = w = 4;
20087 }
20088
20089 for (i = 0; i < w; i++)
20090 vec[i] = vt;
20091 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20092 vt = force_reg (maskmode, vt);
20093 mask = expand_simple_binop (maskmode, AND, mask, vt,
20094 NULL_RTX, 0, OPTAB_DIRECT);
20095
20096 xops[0] = gen_lowpart (mode, operands[0]);
20097 xops[1] = gen_lowpart (mode, t2);
20098 xops[2] = gen_lowpart (mode, t1);
20099 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20100 xops[4] = mask;
20101 xops[5] = vt;
20102 ok = ix86_expand_int_vcond (xops);
20103 gcc_assert (ok);
20104 }
20105 }
20106
20107 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
20108 true if we should do zero extension, else sign extension. HIGH_P is
20109 true if we want the N/2 high elements, else the low elements. */
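/* For instance (a sketch of the generated code, not a literal pattern from
   this file): sign-unpacking the low half of a V8HImode register into
   V4SImode uses a single
     pmovsxwd %xmm1, %xmm0
   when SSE4.1 is available; without SSE4.1 the same result is obtained by
   computing a sign mask with pcmpgtw against zero and interleaving it with
   the input via punpcklwd. */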
20110
20111 void
20112 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
20113 {
20114 enum machine_mode imode = GET_MODE (operands[1]);
20115 rtx tmp, dest;
20116
20117 if (TARGET_SSE4_1)
20118 {
20119 rtx (*unpack)(rtx, rtx);
20120 rtx (*extract)(rtx, rtx) = NULL;
20121 enum machine_mode halfmode = BLKmode;
20122
20123 switch (imode)
20124 {
20125 case V32QImode:
20126 if (unsigned_p)
20127 unpack = gen_avx2_zero_extendv16qiv16hi2;
20128 else
20129 unpack = gen_avx2_sign_extendv16qiv16hi2;
20130 halfmode = V16QImode;
20131 extract
20132 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20133 break;
20134 case V16HImode:
20135 if (unsigned_p)
20136 unpack = gen_avx2_zero_extendv8hiv8si2;
20137 else
20138 unpack = gen_avx2_sign_extendv8hiv8si2;
20139 halfmode = V8HImode;
20140 extract
20141 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20142 break;
20143 case V8SImode:
20144 if (unsigned_p)
20145 unpack = gen_avx2_zero_extendv4siv4di2;
20146 else
20147 unpack = gen_avx2_sign_extendv4siv4di2;
20148 halfmode = V4SImode;
20149 extract
20150 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20151 break;
20152 case V16QImode:
20153 if (unsigned_p)
20154 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20155 else
20156 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20157 break;
20158 case V8HImode:
20159 if (unsigned_p)
20160 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20161 else
20162 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20163 break;
20164 case V4SImode:
20165 if (unsigned_p)
20166 unpack = gen_sse4_1_zero_extendv2siv2di2;
20167 else
20168 unpack = gen_sse4_1_sign_extendv2siv2di2;
20169 break;
20170 default:
20171 gcc_unreachable ();
20172 }
20173
20174 if (GET_MODE_SIZE (imode) == 32)
20175 {
20176 tmp = gen_reg_rtx (halfmode);
20177 emit_insn (extract (tmp, operands[1]));
20178 }
20179 else if (high_p)
20180 {
20181 /* Shift higher 8 bytes to lower 8 bytes. */
20182 tmp = gen_reg_rtx (imode);
20183 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20184 gen_lowpart (V1TImode, operands[1]),
20185 GEN_INT (64)));
20186 }
20187 else
20188 tmp = operands[1];
20189
20190 emit_insn (unpack (operands[0], tmp));
20191 }
20192 else
20193 {
20194 rtx (*unpack)(rtx, rtx, rtx);
20195
20196 switch (imode)
20197 {
20198 case V16QImode:
20199 if (high_p)
20200 unpack = gen_vec_interleave_highv16qi;
20201 else
20202 unpack = gen_vec_interleave_lowv16qi;
20203 break;
20204 case V8HImode:
20205 if (high_p)
20206 unpack = gen_vec_interleave_highv8hi;
20207 else
20208 unpack = gen_vec_interleave_lowv8hi;
20209 break;
20210 case V4SImode:
20211 if (high_p)
20212 unpack = gen_vec_interleave_highv4si;
20213 else
20214 unpack = gen_vec_interleave_lowv4si;
20215 break;
20216 default:
20217 gcc_unreachable ();
20218 }
20219
20220 dest = gen_lowpart (imode, operands[0]);
20221
20222 if (unsigned_p)
20223 tmp = force_reg (imode, CONST0_RTX (imode));
20224 else
20225 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20226 operands[1], pc_rtx, pc_rtx);
20227
20228 emit_insn (unpack (dest, operands[1], tmp));
20229 }
20230 }
20231
20232 /* Expand conditional increment or decrement using adc/sbb instructions.
20233 The default case using setcc followed by the conditional move can be
20234 done by generic code. */
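/* A sketch of the kind of sequence this produces (AT&T syntax, names
   illustrative only): for
     r = r + (a < b ? 1 : 0)
   with an unsigned comparison we can emit
     cmp b, a        # sets CF when a < b
     adc $0, r       # r += CF
   while the mirrored sbb form covers the decrement case. */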
20235 bool
20236 ix86_expand_int_addcc (rtx operands[])
20237 {
20238 enum rtx_code code = GET_CODE (operands[1]);
20239 rtx flags;
20240 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20241 rtx compare_op;
20242 rtx val = const0_rtx;
20243 bool fpcmp = false;
20244 enum machine_mode mode;
20245 rtx op0 = XEXP (operands[1], 0);
20246 rtx op1 = XEXP (operands[1], 1);
20247
20248 if (operands[3] != const1_rtx
20249 && operands[3] != constm1_rtx)
20250 return false;
20251 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20252 return false;
20253 code = GET_CODE (compare_op);
20254
20255 flags = XEXP (compare_op, 0);
20256
20257 if (GET_MODE (flags) == CCFPmode
20258 || GET_MODE (flags) == CCFPUmode)
20259 {
20260 fpcmp = true;
20261 code = ix86_fp_compare_code_to_integer (code);
20262 }
20263
20264 if (code != LTU)
20265 {
20266 val = constm1_rtx;
20267 if (fpcmp)
20268 PUT_CODE (compare_op,
20269 reverse_condition_maybe_unordered
20270 (GET_CODE (compare_op)));
20271 else
20272 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20273 }
20274
20275 mode = GET_MODE (operands[0]);
20276
20277 /* Construct either adc or sbb insn. */
20278 if ((code == LTU) == (operands[3] == constm1_rtx))
20279 {
20280 switch (mode)
20281 {
20282 case QImode:
20283 insn = gen_subqi3_carry;
20284 break;
20285 case HImode:
20286 insn = gen_subhi3_carry;
20287 break;
20288 case SImode:
20289 insn = gen_subsi3_carry;
20290 break;
20291 case DImode:
20292 insn = gen_subdi3_carry;
20293 break;
20294 default:
20295 gcc_unreachable ();
20296 }
20297 }
20298 else
20299 {
20300 switch (mode)
20301 {
20302 case QImode:
20303 insn = gen_addqi3_carry;
20304 break;
20305 case HImode:
20306 insn = gen_addhi3_carry;
20307 break;
20308 case SImode:
20309 insn = gen_addsi3_carry;
20310 break;
20311 case DImode:
20312 insn = gen_adddi3_carry;
20313 break;
20314 default:
20315 gcc_unreachable ();
20316 }
20317 }
20318 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20319
20320 return true;
20321 }
20322
20323
20324 /* Split OPERAND into word-sized parts. Similar to split_double_mode,
20325 but works for floating point parameters and non-offsettable memories.
20326 For pushes, it returns just stack offsets; the values will be saved
20327 in the right order. At most four parts are generated. */
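/* For example (illustrative, not exhaustive): on a 32-bit target a DFmode
   value splits into two SImode parts, XFmode into three (the third part
   holding the sign/exponent word), and TFmode into four; on a 64-bit
   target XFmode splits into a DImode part plus an SImode part. */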
20328
20329 static int
20330 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20331 {
20332 int size;
20333
20334 if (!TARGET_64BIT)
20335 size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20336 else
20337 size = (GET_MODE_SIZE (mode) + 4) / 8;
20338
20339 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20340 gcc_assert (size >= 2 && size <= 4);
20341
20342 /* Optimize constant pool reference to immediates. This is used by fp
20343 moves, that force all constants to memory to allow combining. */
20344 if (MEM_P (operand) && MEM_READONLY_P (operand))
20345 {
20346 rtx tmp = maybe_get_pool_constant (operand);
20347 if (tmp)
20348 operand = tmp;
20349 }
20350
20351 if (MEM_P (operand) && !offsettable_memref_p (operand))
20352 {
20353 /* The only non-offsettable memories we handle are pushes. */
20354 int ok = push_operand (operand, VOIDmode);
20355
20356 gcc_assert (ok);
20357
20358 operand = copy_rtx (operand);
20359 PUT_MODE (operand, Pmode);
20360 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20361 return size;
20362 }
20363
20364 if (GET_CODE (operand) == CONST_VECTOR)
20365 {
20366 enum machine_mode imode = int_mode_for_mode (mode);
20367 /* Caution: if we looked through a constant pool memory above,
20368 the operand may actually have a different mode now. That's
20369 ok, since we want to pun this all the way back to an integer. */
20370 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20371 gcc_assert (operand != NULL);
20372 mode = imode;
20373 }
20374
20375 if (!TARGET_64BIT)
20376 {
20377 if (mode == DImode)
20378 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20379 else
20380 {
20381 int i;
20382
20383 if (REG_P (operand))
20384 {
20385 gcc_assert (reload_completed);
20386 for (i = 0; i < size; i++)
20387 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20388 }
20389 else if (offsettable_memref_p (operand))
20390 {
20391 operand = adjust_address (operand, SImode, 0);
20392 parts[0] = operand;
20393 for (i = 1; i < size; i++)
20394 parts[i] = adjust_address (operand, SImode, 4 * i);
20395 }
20396 else if (GET_CODE (operand) == CONST_DOUBLE)
20397 {
20398 REAL_VALUE_TYPE r;
20399 long l[4];
20400
20401 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20402 switch (mode)
20403 {
20404 case TFmode:
20405 real_to_target (l, &r, mode);
20406 parts[3] = gen_int_mode (l[3], SImode);
20407 parts[2] = gen_int_mode (l[2], SImode);
20408 break;
20409 case XFmode:
20410 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20411 parts[2] = gen_int_mode (l[2], SImode);
20412 break;
20413 case DFmode:
20414 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20415 break;
20416 default:
20417 gcc_unreachable ();
20418 }
20419 parts[1] = gen_int_mode (l[1], SImode);
20420 parts[0] = gen_int_mode (l[0], SImode);
20421 }
20422 else
20423 gcc_unreachable ();
20424 }
20425 }
20426 else
20427 {
20428 if (mode == TImode)
20429 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20430 if (mode == XFmode || mode == TFmode)
20431 {
20432 enum machine_mode upper_mode = mode == XFmode ? SImode : DImode;
20433 if (REG_P (operand))
20434 {
20435 gcc_assert (reload_completed);
20436 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20437 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20438 }
20439 else if (offsettable_memref_p (operand))
20440 {
20441 operand = adjust_address (operand, DImode, 0);
20442 parts[0] = operand;
20443 parts[1] = adjust_address (operand, upper_mode, 8);
20444 }
20445 else if (GET_CODE (operand) == CONST_DOUBLE)
20446 {
20447 REAL_VALUE_TYPE r;
20448 long l[4];
20449
20450 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20451 real_to_target (l, &r, mode);
20452
20453 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20454 if (HOST_BITS_PER_WIDE_INT >= 64)
20455 parts[0]
20456 = gen_int_mode
20457 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20458 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20459 DImode);
20460 else
20461 parts[0] = immed_double_const (l[0], l[1], DImode);
20462
20463 if (upper_mode == SImode)
20464 parts[1] = gen_int_mode (l[2], SImode);
20465 else if (HOST_BITS_PER_WIDE_INT >= 64)
20466 parts[1]
20467 = gen_int_mode
20468 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20469 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20470 DImode);
20471 else
20472 parts[1] = immed_double_const (l[2], l[3], DImode);
20473 }
20474 else
20475 gcc_unreachable ();
20476 }
20477 }
20478
20479 return size;
20480 }
20481
20482 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20483 The value is split into word-sized parts; operands 2-5 receive the
20484 destination parts and operands 6-9 the corresponding source parts,
20485 in the order in which the moves are emitted. */
20486
20487 void
20488 ix86_split_long_move (rtx operands[])
20489 {
20490 rtx part[2][4];
20491 int nparts, i, j;
20492 int push = 0;
20493 int collisions = 0;
20494 enum machine_mode mode = GET_MODE (operands[0]);
20495 bool collisionparts[4];
20496
20497 /* The DFmode expanders may ask us to move a double.
20498 For a 64-bit target this is a single move. By hiding that fact
20499 here we simplify the i386.md splitters. */
20500 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20501 {
20502 /* Optimize constant pool reference to immediates. This is used by
20503 fp moves, that force all constants to memory to allow combining. */
20504
20505 if (MEM_P (operands[1])
20506 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20507 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20508 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20509 if (push_operand (operands[0], VOIDmode))
20510 {
20511 operands[0] = copy_rtx (operands[0]);
20512 PUT_MODE (operands[0], Pmode);
20513 }
20514 else
20515 operands[0] = gen_lowpart (DImode, operands[0]);
20516 operands[1] = gen_lowpart (DImode, operands[1]);
20517 emit_move_insn (operands[0], operands[1]);
20518 return;
20519 }
20520
20521 /* The only non-offsettable memory we handle is a push. */
20522 if (push_operand (operands[0], VOIDmode))
20523 push = 1;
20524 else
20525 gcc_assert (!MEM_P (operands[0])
20526 || offsettable_memref_p (operands[0]));
20527
20528 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20529 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20530
20531 /* When emitting push, take care for source operands on the stack. */
20532 if (push && MEM_P (operands[1])
20533 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20534 {
20535 rtx src_base = XEXP (part[1][nparts - 1], 0);
20536
20537 /* Compensate for the stack decrement by 4. */
20538 if (!TARGET_64BIT && nparts == 3
20539 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20540 src_base = plus_constant (src_base, 4);
20541
20542 /* src_base refers to the stack pointer and is
20543 automatically decreased by emitted push. */
20544 for (i = 0; i < nparts; i++)
20545 part[1][i] = change_address (part[1][i],
20546 GET_MODE (part[1][i]), src_base);
20547 }
20548
20549 /* We need to do the copy in the right order in case an address register
20550 of the source overlaps the destination. */
20551 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20552 {
20553 rtx tmp;
20554
20555 for (i = 0; i < nparts; i++)
20556 {
20557 collisionparts[i]
20558 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20559 if (collisionparts[i])
20560 collisions++;
20561 }
20562
20563 /* Collision in the middle part can be handled by reordering. */
20564 if (collisions == 1 && nparts == 3 && collisionparts [1])
20565 {
20566 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20567 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20568 }
20569 else if (collisions == 1
20570 && nparts == 4
20571 && (collisionparts [1] || collisionparts [2]))
20572 {
20573 if (collisionparts [1])
20574 {
20575 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20576 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20577 }
20578 else
20579 {
20580 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20581 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20582 }
20583 }
20584
20585 /* If there are more collisions, we can't handle it by reordering.
20586 Do an lea to the last part and use only one colliding move. */
20587 else if (collisions > 1)
20588 {
20589 rtx base;
20590
20591 collisions = 1;
20592
20593 base = part[0][nparts - 1];
20594
20595 /* Handle the case when the last part isn't valid for lea.
20596 Happens in 64-bit mode storing the 12-byte XFmode. */
20597 if (GET_MODE (base) != Pmode)
20598 base = gen_rtx_REG (Pmode, REGNO (base));
20599
20600 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20601 part[1][0] = replace_equiv_address (part[1][0], base);
20602 for (i = 1; i < nparts; i++)
20603 {
20604 tmp = plus_constant (base, UNITS_PER_WORD * i);
20605 part[1][i] = replace_equiv_address (part[1][i], tmp);
20606 }
20607 }
20608 }
20609
20610 if (push)
20611 {
20612 if (!TARGET_64BIT)
20613 {
20614 if (nparts == 3)
20615 {
20616 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20617 emit_insn (gen_addsi3 (stack_pointer_rtx,
20618 stack_pointer_rtx, GEN_INT (-4)));
20619 emit_move_insn (part[0][2], part[1][2]);
20620 }
20621 else if (nparts == 4)
20622 {
20623 emit_move_insn (part[0][3], part[1][3]);
20624 emit_move_insn (part[0][2], part[1][2]);
20625 }
20626 }
20627 else
20628 {
20629 /* In 64-bit mode we don't have a 32-bit push available. If this is a
20630 register, that is OK - we will just use the larger counterpart. We also
20631 retype memory - this comes from an attempt to avoid a REX prefix when
20632 moving the second half of a TFmode value. */
20633 if (GET_MODE (part[1][1]) == SImode)
20634 {
20635 switch (GET_CODE (part[1][1]))
20636 {
20637 case MEM:
20638 part[1][1] = adjust_address (part[1][1], DImode, 0);
20639 break;
20640
20641 case REG:
20642 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20643 break;
20644
20645 default:
20646 gcc_unreachable ();
20647 }
20648
20649 if (GET_MODE (part[1][0]) == SImode)
20650 part[1][0] = part[1][1];
20651 }
20652 }
20653 emit_move_insn (part[0][1], part[1][1]);
20654 emit_move_insn (part[0][0], part[1][0]);
20655 return;
20656 }
20657
20658 /* Choose correct order to not overwrite the source before it is copied. */
20659 if ((REG_P (part[0][0])
20660 && REG_P (part[1][1])
20661 && (REGNO (part[0][0]) == REGNO (part[1][1])
20662 || (nparts == 3
20663 && REGNO (part[0][0]) == REGNO (part[1][2]))
20664 || (nparts == 4
20665 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20666 || (collisions > 0
20667 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20668 {
20669 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20670 {
20671 operands[2 + i] = part[0][j];
20672 operands[6 + i] = part[1][j];
20673 }
20674 }
20675 else
20676 {
20677 for (i = 0; i < nparts; i++)
20678 {
20679 operands[2 + i] = part[0][i];
20680 operands[6 + i] = part[1][i];
20681 }
20682 }
20683
20684 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20685 if (optimize_insn_for_size_p ())
20686 {
20687 for (j = 0; j < nparts - 1; j++)
20688 if (CONST_INT_P (operands[6 + j])
20689 && operands[6 + j] != const0_rtx
20690 && REG_P (operands[2 + j]))
20691 for (i = j; i < nparts - 1; i++)
20692 if (CONST_INT_P (operands[7 + i])
20693 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20694 operands[7 + i] = operands[2 + j];
20695 }
20696
20697 for (i = 0; i < nparts; i++)
20698 emit_move_insn (operands[2 + i], operands[6 + i]);
20699
20700 return;
20701 }
20702
20703 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20704 left shift by a constant, either using a single shift or
20705 a sequence of add instructions. */
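/* For example (illustrative AT&T syntax): a left shift by 2, when two adds
   are cheaper than one constant shift and we are not optimizing for size,
   is emitted as
     addl %eax, %eax
     addl %eax, %eax
   since each add doubles the value; otherwise a single
     sall $2, %eax
   is used. */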
20706
20707 static void
20708 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20709 {
20710 rtx (*insn)(rtx, rtx, rtx);
20711
20712 if (count == 1
20713 || (count * ix86_cost->add <= ix86_cost->shift_const
20714 && !optimize_insn_for_size_p ()))
20715 {
20716 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20717 while (count-- > 0)
20718 emit_insn (insn (operand, operand, operand));
20719 }
20720 else
20721 {
20722 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20723 emit_insn (insn (operand, operand, GEN_INT (count)));
20724 }
20725 }
20726
20727 void
20728 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20729 {
20730 rtx (*gen_ashl3)(rtx, rtx, rtx);
20731 rtx (*gen_shld)(rtx, rtx, rtx);
20732 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20733
20734 rtx low[2], high[2];
20735 int count;
20736
20737 if (CONST_INT_P (operands[2]))
20738 {
20739 split_double_mode (mode, operands, 2, low, high);
20740 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20741
20742 if (count >= half_width)
20743 {
20744 emit_move_insn (high[0], low[1]);
20745 emit_move_insn (low[0], const0_rtx);
20746
20747 if (count > half_width)
20748 ix86_expand_ashl_const (high[0], count - half_width, mode);
20749 }
20750 else
20751 {
20752 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20753
20754 if (!rtx_equal_p (operands[0], operands[1]))
20755 emit_move_insn (operands[0], operands[1]);
20756
20757 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20758 ix86_expand_ashl_const (low[0], count, mode);
20759 }
20760 return;
20761 }
20762
20763 split_double_mode (mode, operands, 1, low, high);
20764
20765 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20766
20767 if (operands[1] == const1_rtx)
20768 {
20769 /* Assuming we've chosen QImode capable registers, then 1 << N
20770 can be done with two 32/64-bit shifts, no branches, no cmoves. */
20771 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20772 {
20773 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20774
20775 ix86_expand_clear (low[0]);
20776 ix86_expand_clear (high[0]);
20777 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20778
20779 d = gen_lowpart (QImode, low[0]);
20780 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20781 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20782 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20783
20784 d = gen_lowpart (QImode, high[0]);
20785 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20786 s = gen_rtx_NE (QImode, flags, const0_rtx);
20787 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20788 }
20789
20790 /* Otherwise, we can get the same results by manually performing
20791 a bit extract operation on bit 5/6, and then performing the two
20792 shifts. The two methods of getting 0/1 into low/high are exactly
20793 the same size. Avoiding the shift in the bit extract case helps
20794 pentium4 a bit; no one else seems to care much either way. */
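/* A worked example of this path (values chosen for illustration): for a
   DImode shift 1 << 37 on a 32-bit target, bit 5 of the count is set, so
   high becomes 1 and low becomes 0; the final 32-bit shifts then use
   37 & 31 == 5, giving high == 32 and low == 0, which is exactly
   2^37 when read as a 64-bit value. */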
20795 else
20796 {
20797 enum machine_mode half_mode;
20798 rtx (*gen_lshr3)(rtx, rtx, rtx);
20799 rtx (*gen_and3)(rtx, rtx, rtx);
20800 rtx (*gen_xor3)(rtx, rtx, rtx);
20801 HOST_WIDE_INT bits;
20802 rtx x;
20803
20804 if (mode == DImode)
20805 {
20806 half_mode = SImode;
20807 gen_lshr3 = gen_lshrsi3;
20808 gen_and3 = gen_andsi3;
20809 gen_xor3 = gen_xorsi3;
20810 bits = 5;
20811 }
20812 else
20813 {
20814 half_mode = DImode;
20815 gen_lshr3 = gen_lshrdi3;
20816 gen_and3 = gen_anddi3;
20817 gen_xor3 = gen_xordi3;
20818 bits = 6;
20819 }
20820
20821 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20822 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20823 else
20824 x = gen_lowpart (half_mode, operands[2]);
20825 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20826
20827 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20828 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20829 emit_move_insn (low[0], high[0]);
20830 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20831 }
20832
20833 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20834 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20835 return;
20836 }
20837
20838 if (operands[1] == constm1_rtx)
20839 {
20840 /* For -1 << N, we can avoid the shld instruction, because we
20841 know that we're shifting 0...31/63 ones into a -1. */
20842 emit_move_insn (low[0], constm1_rtx);
20843 if (optimize_insn_for_size_p ())
20844 emit_move_insn (high[0], low[0]);
20845 else
20846 emit_move_insn (high[0], constm1_rtx);
20847 }
20848 else
20849 {
20850 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20851
20852 if (!rtx_equal_p (operands[0], operands[1]))
20853 emit_move_insn (operands[0], operands[1]);
20854
20855 split_double_mode (mode, operands, 1, low, high);
20856 emit_insn (gen_shld (high[0], low[0], operands[2]));
20857 }
20858
20859 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20860
20861 if (TARGET_CMOVE && scratch)
20862 {
20863 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20864 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20865
20866 ix86_expand_clear (scratch);
20867 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20868 }
20869 else
20870 {
20871 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20872 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20873
20874 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20875 }
20876 }
20877
20878 void
20879 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20880 {
20881 rtx (*gen_ashr3)(rtx, rtx, rtx)
20882 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20883 rtx (*gen_shrd)(rtx, rtx, rtx);
20884 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20885
20886 rtx low[2], high[2];
20887 int count;
20888
20889 if (CONST_INT_P (operands[2]))
20890 {
20891 split_double_mode (mode, operands, 2, low, high);
20892 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20893
20894 if (count == GET_MODE_BITSIZE (mode) - 1)
20895 {
20896 emit_move_insn (high[0], high[1]);
20897 emit_insn (gen_ashr3 (high[0], high[0],
20898 GEN_INT (half_width - 1)));
20899 emit_move_insn (low[0], high[0]);
20900
20901 }
20902 else if (count >= half_width)
20903 {
20904 emit_move_insn (low[0], high[1]);
20905 emit_move_insn (high[0], low[0]);
20906 emit_insn (gen_ashr3 (high[0], high[0],
20907 GEN_INT (half_width - 1)));
20908
20909 if (count > half_width)
20910 emit_insn (gen_ashr3 (low[0], low[0],
20911 GEN_INT (count - half_width)));
20912 }
20913 else
20914 {
20915 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20916
20917 if (!rtx_equal_p (operands[0], operands[1]))
20918 emit_move_insn (operands[0], operands[1]);
20919
20920 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20921 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
20922 }
20923 }
20924 else
20925 {
20926 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20927
20928 if (!rtx_equal_p (operands[0], operands[1]))
20929 emit_move_insn (operands[0], operands[1]);
20930
20931 split_double_mode (mode, operands, 1, low, high);
20932
20933 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20934 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
20935
20936 if (TARGET_CMOVE && scratch)
20937 {
20938 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20939 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20940
20941 emit_move_insn (scratch, high[0]);
20942 emit_insn (gen_ashr3 (scratch, scratch,
20943 GEN_INT (half_width - 1)));
20944 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20945 scratch));
20946 }
20947 else
20948 {
20949 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
20950 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
20951
20952 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
20953 }
20954 }
20955 }
20956
20957 void
20958 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
20959 {
20960 rtx (*gen_lshr3)(rtx, rtx, rtx)
20961 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
20962 rtx (*gen_shrd)(rtx, rtx, rtx);
20963 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20964
20965 rtx low[2], high[2];
20966 int count;
20967
20968 if (CONST_INT_P (operands[2]))
20969 {
20970 split_double_mode (mode, operands, 2, low, high);
20971 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20972
20973 if (count >= half_width)
20974 {
20975 emit_move_insn (low[0], high[1]);
20976 ix86_expand_clear (high[0]);
20977
20978 if (count > half_width)
20979 emit_insn (gen_lshr3 (low[0], low[0],
20980 GEN_INT (count - half_width)));
20981 }
20982 else
20983 {
20984 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20985
20986 if (!rtx_equal_p (operands[0], operands[1]))
20987 emit_move_insn (operands[0], operands[1]);
20988
20989 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20990 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
20991 }
20992 }
20993 else
20994 {
20995 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20996
20997 if (!rtx_equal_p (operands[0], operands[1]))
20998 emit_move_insn (operands[0], operands[1]);
20999
21000 split_double_mode (mode, operands, 1, low, high);
21001
21002 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21003 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21004
21005 if (TARGET_CMOVE && scratch)
21006 {
21007 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21008 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21009
21010 ix86_expand_clear (scratch);
21011 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21012 scratch));
21013 }
21014 else
21015 {
21016 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21017 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21018
21019 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21020 }
21021 }
21022 }
21023
21024 /* Predict the just emitted jump instruction to be taken with probability PROB. */
21025 static void
21026 predict_jump (int prob)
21027 {
21028 rtx insn = get_last_insn ();
21029 gcc_assert (JUMP_P (insn));
21030 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21031 }
21032
21033 /* Helper function for the string operations below. Test whether VARIABLE
21034 is aligned to VALUE bytes. If it is, jump to the returned label. */
21035 static rtx
21036 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21037 {
21038 rtx label = gen_label_rtx ();
21039 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21040 if (GET_MODE (variable) == DImode)
21041 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21042 else
21043 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21044 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21045 1, label);
21046 if (epilogue)
21047 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21048 else
21049 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21050 return label;
21051 }
21052
21053 /* Decrease COUNTREG by VALUE. */
21054 static void
21055 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21056 {
21057 rtx (*gen_add)(rtx, rtx, rtx)
21058 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21059
21060 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21061 }
21062
21063 /* Zero extend possibly SImode EXP to Pmode register. */
21064 rtx
21065 ix86_zero_extend_to_Pmode (rtx exp)
21066 {
21067 rtx r;
21068 if (GET_MODE (exp) == VOIDmode)
21069 return force_reg (Pmode, exp);
21070 if (GET_MODE (exp) == Pmode)
21071 return copy_to_mode_reg (Pmode, exp);
21072 r = gen_reg_rtx (Pmode);
21073 emit_insn (gen_zero_extendsidi2 (r, exp));
21074 return r;
21075 }
21076
21077 /* Divide COUNTREG by SCALE. */
21078 static rtx
21079 scale_counter (rtx countreg, int scale)
21080 {
21081 rtx sc;
21082
21083 if (scale == 1)
21084 return countreg;
21085 if (CONST_INT_P (countreg))
21086 return GEN_INT (INTVAL (countreg) / scale);
21087 gcc_assert (REG_P (countreg));
21088
21089 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21090 GEN_INT (exact_log2 (scale)),
21091 NULL, 1, OPTAB_DIRECT);
21092 return sc;
21093 }
21094
21095 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21096 DImode for constant loop counts. */
21097
21098 static enum machine_mode
21099 counter_mode (rtx count_exp)
21100 {
21101 if (GET_MODE (count_exp) != VOIDmode)
21102 return GET_MODE (count_exp);
21103 if (!CONST_INT_P (count_exp))
21104 return Pmode;
21105 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21106 return DImode;
21107 return SImode;
21108 }
21109
21110 /* Helper function for expand_set_or_movmem_via_loop.
21111
21112 When SRCPTR is non-NULL, output a simple loop to move the memory
21113 pointed to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21114 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
21115 the equivalent loop to set memory to VALUE (supposed to be in MODE).
21116
21117 The size is rounded down to a whole number of chunks moved at once.
21118 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info.
21119
21120 If ITER isn't NULL, then it'll be used in the generated loop without
21121 initialization (that allows generating several consecutive loops using the
21122 same iterator).
21123 If CHANGE_PTRS is specified, DESTPTR and SRCPTR will be increased by the
21124 iterator value at the end of the function (as if they iterated in the loop).
21125 Otherwise, their values stay unchanged.
21126
21127 If EXPECTED_SIZE isn't -1, then it's used to compute branch probabilities
21128 on the loop backedge. When the expected size is unknown (it's -1), the
21129 probability is set to 80%.
21130
21131 The return value is the rtx of the iterator used in the loop; it can be
21132 reused in subsequent calls of this function. */
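/* The generated code has roughly the following shape (a sketch with
   UNROLL == 1; the memset case stores VALUE instead of loading from SRC):

       size = count & -piece_size;
       iter = 0;                          [only when no iterator is reused]
     top:
       *(dest + iter) = *(src + iter);
       iter += piece_size;
       if (iter < size) goto top;
     out:
       dest += iter;  src += iter;        [only when CHANGE_PTRS is set]  */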
21133 static rtx
21134 expand_set_or_movmem_via_loop_with_iter (rtx destmem, rtx srcmem,
21135 rtx destptr, rtx srcptr, rtx value,
21136 rtx count, rtx iter,
21137 enum machine_mode mode, int unroll,
21138 int expected_size, bool change_ptrs)
21139 {
21140 rtx out_label, top_label, tmp;
21141 enum machine_mode iter_mode = counter_mode (count);
21142 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21143 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21144 rtx size;
21145 rtx x_addr;
21146 rtx y_addr;
21147 int i;
21148 bool reuse_iter = (iter != NULL_RTX);
21149
21150 top_label = gen_label_rtx ();
21151 out_label = gen_label_rtx ();
21152 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21153 NULL, 1, OPTAB_DIRECT);
21154 if (!reuse_iter)
21155 {
21156 iter = gen_reg_rtx (iter_mode);
21157 /* Those two should combine. */
21158 if (piece_size == const1_rtx)
21159 {
21160 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21161 true, out_label);
21162 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21163 }
21164 emit_move_insn (iter, const0_rtx);
21165 }
21166 else
21167 {
21168 emit_cmp_and_jump_insns (iter, size, GE, NULL_RTX, iter_mode,
21169 true, out_label);
21170 }
21171
21172 emit_label (top_label);
21173
21174 tmp = convert_modes (Pmode, iter_mode, iter, true);
21175 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21176 destmem =
21177 adjust_automodify_address_nv (copy_rtx (destmem), mode, x_addr, 0);
21178
21179 if (srcmem)
21180 {
21181 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21182 srcmem =
21183 adjust_automodify_address_nv (copy_rtx (srcmem), mode, y_addr, 0);
21184
21185 /* When unrolling for chips that reorder memory reads and writes,
21186 we can save registers by using a single temporary.
21187 Also, using 4 temporaries is overkill in 32-bit mode. */
21188 if (!TARGET_64BIT && 0)
21189 {
21190 for (i = 0; i < unroll; i++)
21191 {
21192 if (i)
21193 {
21194 destmem =
21195 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21196 srcmem =
21197 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21198 }
21199 emit_move_insn (destmem, srcmem);
21200 }
21201 }
21202 else
21203 {
21204 rtx tmpreg[4];
21205 gcc_assert (unroll <= 4);
21206 for (i = 0; i < unroll; i++)
21207 {
21208 tmpreg[i] = gen_reg_rtx (mode);
21209 if (i)
21210 {
21211 srcmem =
21212 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21213 }
21214 emit_move_insn (tmpreg[i], srcmem);
21215 }
21216 for (i = 0; i < unroll; i++)
21217 {
21218 if (i)
21219 {
21220 destmem =
21221 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21222 }
21223 emit_move_insn (destmem, tmpreg[i]);
21224 }
21225 }
21226 }
21227 else
21228 for (i = 0; i < unroll; i++)
21229 {
21230 if (i)
21231 destmem =
21232 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21233 emit_move_insn (destmem, value);
21234 }
21235
21236 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21237 true, OPTAB_LIB_WIDEN);
21238 if (tmp != iter)
21239 emit_move_insn (iter, tmp);
21240
21241 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21242 true, top_label);
21243 if (expected_size != -1)
21244 {
21245 expected_size /= GET_MODE_SIZE (mode) * unroll;
21246 if (expected_size == 0)
21247 predict_jump (0);
21248 else if (expected_size > REG_BR_PROB_BASE)
21249 predict_jump (REG_BR_PROB_BASE - 1);
21250 else
21251 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21252 }
21253 else
21254 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21255 if (change_ptrs)
21256 {
21257 iter = ix86_zero_extend_to_Pmode (iter);
21258 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21259 true, OPTAB_LIB_WIDEN);
21260 if (tmp != destptr)
21261 emit_move_insn (destptr, tmp);
21262 if (srcptr)
21263 {
21264 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21265 true, OPTAB_LIB_WIDEN);
21266 if (tmp != srcptr)
21267 emit_move_insn (srcptr, tmp);
21268 }
21269 }
21270 emit_label (out_label);
21271 return iter;
21272 }
21273
21274 /* When SRCPTR is non-NULL, output a simple loop to move the memory
21275 pointed to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21276 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
21277 the equivalent loop to set memory to VALUE (supposed to be in MODE).
21278
21279 The size is rounded down to a whole number of chunks moved at once.
21280 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
21281
21282 static void
21283 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21284 rtx destptr, rtx srcptr, rtx value,
21285 rtx count, enum machine_mode mode, int unroll,
21286 int expected_size)
21287 {
21288 expand_set_or_movmem_via_loop_with_iter (destmem, srcmem,
21289 destptr, srcptr, value,
21290 count, NULL_RTX, mode, unroll,
21291 expected_size, true);
21292 }
21293
21294 /* Output a "rep; mov" instruction.
21295 Arguments have the same meaning as for the previous function. */
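/* For example, a known count of 32 bytes boils down to (illustrative
   AT&T syntax, 32-bit target):
     movl $8, %ecx
     rep movsl
   i.e. the byte count is scaled down to a dword count and a single
   rep-prefixed string move copies the whole block. */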
21296 static void
21297 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21298 rtx destptr, rtx srcptr,
21299 rtx count,
21300 enum machine_mode mode)
21301 {
21302 rtx destexp;
21303 rtx srcexp;
21304 rtx countreg;
21305 HOST_WIDE_INT rounded_count;
21306
21307 /* If the size is known and a multiple of 4, it is shorter to use SImode rep movs. */
21308 if (mode == QImode && CONST_INT_P (count)
21309 && !(INTVAL (count) & 3))
21310 mode = SImode;
21311
21312 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21313 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21314 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21315 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21316 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21317 if (mode != QImode)
21318 {
21319 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21320 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21321 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21322 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21323 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21324 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21325 }
21326 else
21327 {
21328 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21329 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21330 }
21331 if (CONST_INT_P (count))
21332 {
21333 rounded_count = (INTVAL (count)
21334 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21335 destmem = shallow_copy_rtx (destmem);
21336 srcmem = shallow_copy_rtx (srcmem);
21337 set_mem_size (destmem, rounded_count);
21338 set_mem_size (srcmem, rounded_count);
21339 }
21340 else
21341 {
21342 if (MEM_SIZE_KNOWN_P (destmem))
21343 clear_mem_size (destmem);
21344 if (MEM_SIZE_KNOWN_P (srcmem))
21345 clear_mem_size (srcmem);
21346 }
21347 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21348 destexp, srcexp));
21349 }
21350
21351 /* Output a "rep; stos" instruction.
21352 Arguments have the same meaning as for the previous function. */
21353 static void
21354 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21355 rtx count, enum machine_mode mode,
21356 rtx orig_value)
21357 {
21358 rtx destexp;
21359 rtx countreg;
21360 HOST_WIDE_INT rounded_count;
21361
21362 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21363 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21364 value = force_reg (mode, gen_lowpart (mode, value));
21365 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21366 if (mode != QImode)
21367 {
21368 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21369 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21370 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21371 }
21372 else
21373 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21374 if (orig_value == const0_rtx && CONST_INT_P (count))
21375 {
21376 rounded_count = (INTVAL (count)
21377 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21378 destmem = shallow_copy_rtx (destmem);
21379 set_mem_size (destmem, rounded_count);
21380 }
21381 else if (MEM_SIZE_KNOWN_P (destmem))
21382 clear_mem_size (destmem);
21383 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21384 }
21385
21386 static void
21387 emit_strmov (rtx destmem, rtx srcmem,
21388 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21389 {
21390 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21391 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21392 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21393 }
21394
21395 /* Emit a strset instruction. If RHS is constant, and a vector mode will be
21396 used, then move this constant to a vector register before emitting strset. */
21397 static void
21398 emit_strset (rtx destmem, rtx value,
21399 rtx destptr, enum machine_mode mode, int offset)
21400 {
21401 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21402 emit_insn (gen_strset (destptr, dest, value));
21403 }
21404
21405 /* Output code to copy (COUNT % MAX_SIZE) bytes from SRCPTR to DESTPTR.
21406 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
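/* A worked example (numbers illustrative only): with a constant COUNT of 23
   and MAX_SIZE of 16 the remainder is 7 bytes, which is copied as one
   SImode move, one HImode move and one QImode move at offsets 0, 4 and 6
   respectively. */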
21407 static void
21408 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21409 rtx destptr, rtx srcptr, rtx count, int max_size)
21410 {
21411 rtx src, dest;
21412 if (CONST_INT_P (count))
21413 {
21414 HOST_WIDE_INT countval = INTVAL (count);
21415 int offset = 0;
21416
21417 int remainder_size = countval % max_size;
21418 enum machine_mode move_mode = Pmode;
21419
21420 /* First, try to move data using the widest possible mode.
21421 The remaining part will be moved using Pmode and narrower modes. */
21422 if (TARGET_SSE)
21423 {
21424 if (max_size >= GET_MODE_SIZE (V4SImode))
21425 move_mode = V4SImode;
21426 else if (max_size >= GET_MODE_SIZE (DImode))
21427 move_mode = DImode;
21428 }
21429
21430 while (remainder_size >= GET_MODE_SIZE (move_mode))
21431 {
21432 emit_strmov (destmem, srcmem, destptr, srcptr, move_mode, offset);
21433 offset += GET_MODE_SIZE (move_mode);
21434 remainder_size -= GET_MODE_SIZE (move_mode);
21435 }
21436
21437 /* Move the remaining part of the epilogue - its size might still be
21438 up to the size of the widest mode used above. */
21439 move_mode = Pmode;
21440 while (remainder_size >= GET_MODE_SIZE (move_mode))
21441 {
21442 emit_strmov (destmem, srcmem, destptr, srcptr, move_mode, offset);
21443 offset += GET_MODE_SIZE (move_mode);
21444 remainder_size -= GET_MODE_SIZE (move_mode);
21445 }
21446
21447 if (remainder_size >= 4)
21448 {
21449 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21450 offset += 4;
21451 remainder_size -= 4;
21452 }
21453 if (remainder_size >= 2)
21454 {
21455 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21456 offset += 2;
21457 remainder_size -= 2;
21458 }
21459 if (remainder_size >= 1)
21460 {
21461 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21462 offset += 1;
21463 remainder_size -= 1;
21464 }
21465 gcc_assert (remainder_size == 0);
21466 return;
21467 }
21468 if (max_size > 16)
21469 {
21470 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21471 count, 1, OPTAB_DIRECT);
21472 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21473 count, QImode, 1, 4);
21474 return;
21475 }
21476
21477 /* When single stringops are available, we can cheaply advance the dest
21478 and src pointers. Otherwise we save code size by maintaining an offset
21479 (zero is readily available from the preceding rep operation) and using
21480 x86 addressing modes. */
21481 if (TARGET_SINGLE_STRINGOP)
21482 {
21483 if (max_size > 8)
21484 {
21485 rtx label = ix86_expand_aligntest (count, 8, true);
21486 if (TARGET_64BIT)
21487 {
21488 src = change_address (srcmem, DImode, srcptr);
21489 dest = change_address (destmem, DImode, destptr);
21490 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21491 }
21492 else
21493 {
21494 src = change_address (srcmem, SImode, srcptr);
21495 dest = change_address (destmem, SImode, destptr);
21496 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21497 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21498 }
21499 emit_label (label);
21500 LABEL_NUSES (label) = 1;
21501 }
21502 if (max_size > 4)
21503 {
21504 rtx label = ix86_expand_aligntest (count, 4, true);
21505 src = change_address (srcmem, SImode, srcptr);
21506 dest = change_address (destmem, SImode, destptr);
21507 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21508 emit_label (label);
21509 LABEL_NUSES (label) = 1;
21510 }
21511 if (max_size > 2)
21512 {
21513 rtx label = ix86_expand_aligntest (count, 2, true);
21514 src = change_address (srcmem, HImode, srcptr);
21515 dest = change_address (destmem, HImode, destptr);
21516 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21517 emit_label (label);
21518 LABEL_NUSES (label) = 1;
21519 }
21520 if (max_size > 1)
21521 {
21522 rtx label = ix86_expand_aligntest (count, 1, true);
21523 src = change_address (srcmem, QImode, srcptr);
21524 dest = change_address (destmem, QImode, destptr);
21525 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21526 emit_label (label);
21527 LABEL_NUSES (label) = 1;
21528 }
21529 }
21530 else
21531 {
21532 rtx offset = force_reg (Pmode, const0_rtx);
21533 rtx tmp;
21534
21535 if (max_size > 8)
21536 {
21537 rtx label = ix86_expand_aligntest (count, 8, true);
21538 if (TARGET_64BIT)
21539 {
21540 src = change_address (srcmem, DImode, srcptr);
21541 dest = change_address (destmem, DImode, destptr);
21542 emit_move_insn (dest, src);
21543 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (8), NULL,
21544 true, OPTAB_LIB_WIDEN);
21545 }
21546 else
21547 {
21548 src = change_address (srcmem, SImode, srcptr);
21549 dest = change_address (destmem, SImode, destptr);
21550 emit_move_insn (dest, src);
21551 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21552 true, OPTAB_LIB_WIDEN);
21553 if (tmp != offset)
21554 emit_move_insn (offset, tmp);
21555 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21556 true, OPTAB_LIB_WIDEN);
21557 emit_move_insn (dest, src);
21558 }
21559 if (tmp != offset)
21560 emit_move_insn (offset, tmp);
21561 emit_label (label);
21562 LABEL_NUSES (label) = 1;
21563 }
21564 if (max_size > 4)
21565 {
21566 rtx label = ix86_expand_aligntest (count, 4, true);
21567 src = change_address (srcmem, SImode, srcptr);
21568 dest = change_address (destmem, SImode, destptr);
21569 emit_move_insn (dest, src);
21570 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21571 true, OPTAB_LIB_WIDEN);
21572 if (tmp != offset)
21573 emit_move_insn (offset, tmp);
21574 emit_label (label);
21575 LABEL_NUSES (label) = 1;
21576 }
21577 if (max_size > 2)
21578 {
21579 rtx label = ix86_expand_aligntest (count, 2, true);
21580 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21581 src = change_address (srcmem, HImode, tmp);
21582 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21583 dest = change_address (destmem, HImode, tmp);
21584 emit_move_insn (dest, src);
21585 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21586 true, OPTAB_LIB_WIDEN);
21587 if (tmp != offset)
21588 emit_move_insn (offset, tmp);
21589 emit_label (label);
21590 LABEL_NUSES (label) = 1;
21591 }
21592 if (max_size > 1)
21593 {
21594 rtx label = ix86_expand_aligntest (count, 1, true);
21595 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21596 src = change_address (srcmem, QImode, tmp);
21597 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21598 dest = change_address (destmem, QImode, tmp);
21599 emit_move_insn (dest, src);
21600 emit_label (label);
21601 LABEL_NUSES (label) = 1;
21602 }
21603 }
21604 }
21605
21606 /* Output code to set at most COUNT & (MAX_SIZE - 1) bytes starting at DEST. */
21607 static void
21608 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21609 rtx count, int max_size)
21610 {
21611 count =
21612 expand_simple_binop (counter_mode (count), AND, count,
21613 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21614 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21615 gen_lowpart (QImode, value), count, QImode,
21616 1, max_size / 2);
21617 }
21618
21619 /* Output code to set with VALUE at most (COUNT % MAX_SIZE) bytes starting from
21620 DESTPTR.
21621 DESTMEM provides MEMrtx to feed proper aliasing info.
21622 PROMOTED_TO_GPR_VALUE is rtx representing a GPR containing broadcasted VALUE.
21623 PROMOTED_TO_VECTOR_VALUE is rtx representing a vector register containing
21624 broadcasted VALUE.
21625 PROMOTED_TO_GPR_VALUE and PROMOTED_TO_VECTOR_VALUE could be NULL if the
21626 promotion hasn't been generated before. */
21627 static void
21628 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx promoted_to_vector_value,
21629 rtx promoted_to_gpr_value, rtx value, rtx count,
21630 int max_size)
21631 {
21632 if (CONST_INT_P (count))
21633 {
21634 HOST_WIDE_INT countval = INTVAL (count);
21635 int offset = 0;
21636
21637 int remainder_size = countval % max_size;
21638 enum machine_mode move_mode = Pmode;
21639
21640 /* First, try to store data using the widest possible mode.
21641 The remaining part will be stored using Pmode and narrower modes. */
21642
21643 if (promoted_to_vector_value)
21644 {
21645 if (promoted_to_vector_value)
21646 {
21647 if (max_size >= GET_MODE_SIZE (V4SImode))
21648 move_mode = V4SImode;
21649 else if (max_size >= GET_MODE_SIZE (DImode))
21650 move_mode = DImode;
21651 }
21652 while (remainder_size >= GET_MODE_SIZE (move_mode))
21653 {
21654 if (GET_MODE (destmem) != move_mode)
21655 destmem = adjust_automodify_address_nv (destmem, move_mode,
21656 destptr, offset);
21657 emit_strset (destmem,
21658 promoted_to_vector_value,
21659 destptr,
21660 move_mode, offset);
21661
21662 offset += GET_MODE_SIZE (move_mode);
21663 remainder_size -= GET_MODE_SIZE (move_mode);
21664 }
21665 }
21666
21667 /* Move the remaining part of the epilogue - its size might still be
21668 up to the size of the widest mode used above. */
21669 while (remainder_size >= GET_MODE_SIZE (Pmode))
21670 {
21671 if (!promoted_to_gpr_value)
21672 promoted_to_gpr_value = promote_duplicated_reg (Pmode, value);
21673 emit_strset (destmem, promoted_to_gpr_value, destptr, Pmode, offset);
21674 offset += GET_MODE_SIZE (Pmode);
21675 remainder_size -= GET_MODE_SIZE (Pmode);
21676 }
21677
21678 if (!promoted_to_gpr_value && remainder_size > 1)
21679 promoted_to_gpr_value = promote_duplicated_reg (remainder_size >= 4
21680 ? SImode : HImode, value);
21681 if (remainder_size >= 4)
21682 {
21683 emit_strset (destmem, gen_lowpart (SImode, promoted_to_gpr_value), destptr,
21684 SImode, offset);
21685 offset += 4;
21686 remainder_size -= 4;
21687 }
21688 if (remainder_size >= 2)
21689 {
21690 emit_strset (destmem, gen_lowpart (HImode, promoted_to_gpr_value), destptr,
21691 HImode, offset);
21692 offset += 2;
21693 remainder_size -= 2;
21694 }
21695 if (remainder_size >= 1)
21696 {
21697 emit_strset (destmem,
21698 promoted_to_gpr_value ? gen_lowpart (QImode, promoted_to_gpr_value) : value,
21699 destptr,
21700 QImode, offset);
21701 offset += 1;
21702 remainder_size -= 1;
21703 }
21704 gcc_assert (remainder_size == 0);
21705 return;
21706 }
21707
21708 /* count isn't const. */
21709 if (max_size > 32)
21710 {
21711 expand_setmem_epilogue_via_loop (destmem, destptr, value, count,
21712 max_size);
21713 return;
21714 }
21715
21716 if (!promoted_to_gpr_value)
21717 promoted_to_gpr_value = promote_duplicated_reg_to_size (value,
21718 GET_MODE_SIZE (Pmode),
21719 GET_MODE_SIZE (Pmode),
21720 GET_MODE_SIZE (Pmode));
21721
21722 if (max_size > 16)
21723 {
21724 rtx label = ix86_expand_aligntest (count, 16, true);
21725 if (TARGET_SSE && promoted_to_vector_value)
21726 {
21727 destmem = change_address (destmem,
21728 GET_MODE (promoted_to_vector_value),
21729 destptr);
21730 emit_insn (gen_strset (destptr, destmem, promoted_to_vector_value));
21731 }
21732 else if (TARGET_64BIT)
21733 {
21734 destmem = change_address (destmem, DImode, destptr);
21735 emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
21736 emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
21737 }
21738 else
21739 {
21740 destmem = change_address (destmem, SImode, destptr);
21741 emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
21742 emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
21743 emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
21744 emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
21745 }
21746 emit_label (label);
21747 LABEL_NUSES (label) = 1;
21748 }
21749 if (max_size > 8)
21750 {
21751 rtx label = ix86_expand_aligntest (count, 8, true);
21752 if (TARGET_64BIT)
21753 {
21754 destmem = change_address (destmem, DImode, destptr);
21755 emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
21756 }
21757 /* FIXME: When this hunk is output, IRA classifies promoted_to_vector_value
21758 as NO_REGS. */
21759 else if (TARGET_SSE && promoted_to_vector_value && 0)
21760 {
21761 destmem = change_address (destmem, V2SImode, destptr);
21762 emit_insn (gen_strset (destptr, destmem,
21763 gen_lowpart (V2SImode, promoted_to_vector_value)));
21764 }
21765 else
21766 {
21767 destmem = change_address (destmem, SImode, destptr);
21768 emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
21769 emit_insn (gen_strset (destptr, destmem, promoted_to_gpr_value));
21770 }
21771 emit_label (label);
21772 LABEL_NUSES (label) = 1;
21773 }
21774 if (max_size > 4)
21775 {
21776 rtx label = ix86_expand_aligntest (count, 4, true);
21777 destmem = change_address (destmem, SImode, destptr);
21778 emit_insn (gen_strset (destptr, destmem,
21779 gen_lowpart (SImode, promoted_to_gpr_value)));
21780 emit_label (label);
21781 LABEL_NUSES (label) = 1;
21782 }
21783 if (max_size > 2)
21784 {
21785 rtx label = ix86_expand_aligntest (count, 2, true);
21786 destmem = change_address (destmem, HImode, destptr);
21787 emit_insn (gen_strset (destptr, destmem,
21788 gen_lowpart (HImode, promoted_to_gpr_value)));
21789 emit_label (label);
21790 LABEL_NUSES (label) = 1;
21791 }
21792 if (max_size > 1)
21793 {
21794 rtx label = ix86_expand_aligntest (count, 1, true);
21795 destmem = change_address (destmem, QImode, destptr);
21796 emit_insn (gen_strset (destptr, destmem,
21797 gen_lowpart (QImode, promoted_to_gpr_value)));
21798 emit_label (label);
21799 LABEL_NUSES (label) = 1;
21800 }
21801 }
21802
21803 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
21804 to DESIRED_ALIGNMENT. */
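/* Illustrative example (an assumed typical call, not taken from the code):
   with ALIGN == 1 and DESIRED_ALIGNMENT == 16, the tests below conditionally
   copy 1, 2, 4 and then 8 bytes, each guarded by a runtime test of the low
   bits of DESTPTR, adjusting COUNT as they go.  */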
21805 static void
21806 expand_movmem_prologue (rtx destmem, rtx srcmem,
21807 rtx destptr, rtx srcptr, rtx count,
21808 int align, int desired_alignment)
21809 {
21810 if (align <= 1 && desired_alignment > 1)
21811 {
21812 rtx label = ix86_expand_aligntest (destptr, 1, false);
21813 srcmem = adjust_automodify_address_nv (srcmem, QImode, srcptr, 0);
21814 destmem = adjust_automodify_address_nv (destmem, QImode, destptr, 0);
21815 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21816 ix86_adjust_counter (count, 1);
21817 emit_label (label);
21818 LABEL_NUSES (label) = 1;
21819 }
21820 if (align <= 2 && desired_alignment > 2)
21821 {
21822 rtx label = ix86_expand_aligntest (destptr, 2, false);
21823 srcmem = adjust_automodify_address_nv (srcmem, HImode, srcptr, 0);
21824 destmem = adjust_automodify_address_nv (destmem, HImode, destptr, 0);
21825 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21826 ix86_adjust_counter (count, 2);
21827 emit_label (label);
21828 LABEL_NUSES (label) = 1;
21829 }
21830 if (align <= 4 && desired_alignment > 4)
21831 {
21832 rtx label = ix86_expand_aligntest (destptr, 4, false);
21833 srcmem = adjust_automodify_address_nv (srcmem, SImode, srcptr, 0);
21834 destmem = adjust_automodify_address_nv (destmem, SImode, destptr, 0);
21835 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21836 ix86_adjust_counter (count, 4);
21837 emit_label (label);
21838 LABEL_NUSES (label) = 1;
21839 }
21840 if (align <= 8 && desired_alignment > 8)
21841 {
21842 rtx label = ix86_expand_aligntest (destptr, 8, false);
21843 if (TARGET_64BIT || TARGET_SSE)
21844 {
21845 srcmem = adjust_automodify_address_nv (srcmem, DImode, srcptr, 0);
21846 destmem = adjust_automodify_address_nv (destmem, DImode, destptr, 0);
21847 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21848 }
21849 else
21850 {
21851 srcmem = adjust_automodify_address_nv (srcmem, SImode, srcptr, 0);
21852 destmem = adjust_automodify_address_nv (destmem, SImode, destptr, 0);
21853 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21854 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21855 }
21856 ix86_adjust_counter (count, 8);
21857 emit_label (label);
21858 LABEL_NUSES (label) = 1;
21859 }
21860 gcc_assert (desired_alignment <= 16);
21861 }
21862
21863 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
21864 ALIGN_BYTES is how many bytes need to be copied. */
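/* For instance (illustrative only): ALIGN_BYTES == 7 results in one QImode,
   one HImode and one SImode copy below, after which the remaining BLKmode
   references carry the improved alignment and size information.  */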
21865 static rtx
21866 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21867 int desired_align, int align_bytes)
21868 {
21869 rtx src = *srcp;
21870 rtx orig_dst = dst;
21871 rtx orig_src = src;
21872 int off = 0;
21873 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21874 if (src_align_bytes >= 0)
21875 src_align_bytes = desired_align - src_align_bytes;
21876 if (align_bytes & 1)
21877 {
21878 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21879 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21880 off = 1;
21881 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21882 }
21883 if (align_bytes & 2)
21884 {
21885 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21886 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21887 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21888 set_mem_align (dst, 2 * BITS_PER_UNIT);
21889 if (src_align_bytes >= 0
21890 && (src_align_bytes & 1) == (align_bytes & 1)
21891 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21892 set_mem_align (src, 2 * BITS_PER_UNIT);
21893 off = 2;
21894 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21895 }
21896 if (align_bytes & 4)
21897 {
21898 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21899 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21900 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21901 set_mem_align (dst, 4 * BITS_PER_UNIT);
21902 if (src_align_bytes >= 0)
21903 {
21904 unsigned int src_align = 0;
21905 if ((src_align_bytes & 3) == (align_bytes & 3))
21906 src_align = 4;
21907 else if ((src_align_bytes & 1) == (align_bytes & 1))
21908 src_align = 2;
21909 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21910 set_mem_align (src, src_align * BITS_PER_UNIT);
21911 }
21912 off = 4;
21913 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21914 }
21915 if (align_bytes & 8)
21916 {
21917 if (TARGET_64BIT || TARGET_SSE)
21918 {
21919 dst = adjust_automodify_address_nv (dst, DImode, destreg, off);
21920 src = adjust_automodify_address_nv (src, DImode, srcreg, off);
21921 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21922 }
21923 else
21924 {
21925 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21926 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21927 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21928 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21929 }
21930 if (MEM_ALIGN (dst) < 8 * BITS_PER_UNIT)
21931 set_mem_align (dst, 8 * BITS_PER_UNIT);
21932 if (src_align_bytes >= 0)
21933 {
21934 unsigned int src_align = 0;
21935 if ((src_align_bytes & 7) == (align_bytes & 7))
21936 src_align = 8;
21937 else if ((src_align_bytes & 3) == (align_bytes & 3))
21938 src_align = 4;
21939 else if ((src_align_bytes & 1) == (align_bytes & 1))
21940 src_align = 2;
21941 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21942 set_mem_align (src, src_align * BITS_PER_UNIT);
21943 }
21944 off = 8;
21945 }
21946 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21947 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21948 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21949 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21950 if (src_align_bytes >= 0)
21951 {
21952 unsigned int src_align = 0;
21953 if ((src_align_bytes & 15) == (align_bytes & 15))
21954 src_align = 16;
21955 else if ((src_align_bytes & 7) == (align_bytes & 7))
21956 src_align = 8;
21957 else if ((src_align_bytes & 3) == (align_bytes & 3))
21958 src_align = 4;
21959 else if ((src_align_bytes & 1) == (align_bytes & 1))
21960 src_align = 2;
21961 if (src_align > (unsigned int) desired_align)
21962 src_align = desired_align;
21963 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21964 set_mem_align (src, src_align * BITS_PER_UNIT);
21965 }
21966 if (MEM_SIZE_KNOWN_P (orig_dst))
21967 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21968 if (MEM_SIZE_KNOWN_P (orig_src))
21969 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21970 *srcp = src;
21971 return dst;
21972 }
21973
21974 /* Store enough into DEST to align DEST, known to be aligned by ALIGN,
21975 to DESIRED_ALIGNMENT. */
21976 static void
21977 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21978 int align, int desired_alignment)
21979 {
21980 if (align <= 1 && desired_alignment > 1)
21981 {
21982 rtx label = ix86_expand_aligntest (destptr, 1, false);
21983 destmem = adjust_automodify_address_nv (destmem, QImode, destptr, 0);
21984 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21985 ix86_adjust_counter (count, 1);
21986 emit_label (label);
21987 LABEL_NUSES (label) = 1;
21988 }
21989 if (align <= 2 && desired_alignment > 2)
21990 {
21991 rtx label = ix86_expand_aligntest (destptr, 2, false);
21992 destmem = adjust_automodify_address_nv (destmem, HImode, destptr, 0);
21993 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21994 ix86_adjust_counter (count, 2);
21995 emit_label (label);
21996 LABEL_NUSES (label) = 1;
21997 }
21998 if (align <= 4 && desired_alignment > 4)
21999 {
22000 rtx label = ix86_expand_aligntest (destptr, 4, false);
22001 destmem = adjust_automodify_address_nv (destmem, SImode, destptr, 0);
22002 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22003 ix86_adjust_counter (count, 4);
22004 emit_label (label);
22005 LABEL_NUSES (label) = 1;
22006 }
22007 if (align <= 8 && desired_alignment > 8)
22008 {
22009 rtx label = ix86_expand_aligntest (destptr, 8, false);
22010 destmem = adjust_automodify_address_nv (destmem, SImode, destptr, 0);
22011 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22012 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22013 ix86_adjust_counter (count, 8);
22014 emit_label (label);
22015 LABEL_NUSES (label) = 1;
22016 }
22017 gcc_assert (desired_alignment <= 16);
22018 }
22019
22020 /* Store enough into DST to align DST to DESIRED_ALIGN. ALIGN_BYTES is how
22021 many bytes need to be stored. */
22022 static rtx
22023 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22024 int desired_align, int align_bytes)
22025 {
22026 int off = 0;
22027 rtx orig_dst = dst;
22028 if (align_bytes & 1)
22029 {
22030 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22031 off = 1;
22032 emit_insn (gen_strset (destreg, dst,
22033 gen_lowpart (QImode, value)));
22034 }
22035 if (align_bytes & 2)
22036 {
22037 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22038 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22039 set_mem_align (dst, 2 * BITS_PER_UNIT);
22040 off = 2;
22041 emit_insn (gen_strset (destreg, dst,
22042 gen_lowpart (HImode, value)));
22043 }
22044 if (align_bytes & 4)
22045 {
22046 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22047 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22048 set_mem_align (dst, 4 * BITS_PER_UNIT);
22049 off = 4;
22050 emit_insn (gen_strset (destreg, dst,
22051 gen_lowpart (SImode, value)));
22052 }
22053 if (align_bytes & 8)
22054 {
22055 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22056 emit_insn (gen_strset (destreg, dst,
22057 gen_lowpart (SImode, value)));
22058 off = 4;
22059 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22060 emit_insn (gen_strset (destreg, dst,
22061 gen_lowpart (SImode, value)));
22062 if (MEM_ALIGN (dst) < 8 * BITS_PER_UNIT)
22063 set_mem_align (dst, 8 * BITS_PER_UNIT);
22064 off = 4;
22065 }
22066 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22067 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22068 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22069 if (MEM_SIZE_KNOWN_P (orig_dst))
22070 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22071 return dst;
22072 }
22073
22074 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22075 static enum stringop_alg
22076 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22077 int *dynamic_check, bool align_unknown)
22078 {
22079 const struct stringop_algs * algs;
22080 bool optimize_for_speed;
22081 /* Algorithms using the rep prefix want at least edi and ecx;
22082 additionally, memset wants eax and memcpy wants esi. Don't
22083 consider such algorithms if the user has appropriated those
22084 registers for their own purposes. */
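   /* For example (illustrative): compiling with -ffixed-ecx marks
      fixed_regs[CX_REG], so the rep_prefix_* algorithms are rejected by the
      check below.  */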
22085 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22086 || (memset
22087 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22088
22089 #define ALG_USABLE_P(alg) ((rep_prefix_usable \
22090 || (alg != rep_prefix_1_byte \
22091 && alg != rep_prefix_4_byte \
22092 && alg != rep_prefix_8_byte)) \
22093 && (TARGET_SSE2 || alg != sse_loop))
22094 const struct processor_costs *cost;
22095
22096 /* Even if the string operation call is cold, we still might spend a lot
22097 of time processing large blocks. */
22098 if (optimize_function_for_size_p (cfun)
22099 || (optimize_insn_for_size_p ()
22100 && expected_size != -1 && expected_size < 256))
22101 optimize_for_speed = false;
22102 else
22103 optimize_for_speed = true;
22104
22105 if (!optimize)
22106 return (rep_prefix_usable ? rep_prefix_1_byte : libcall);
22107
22108 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22109
22110 *dynamic_check = -1;
22111 if (memset)
22112 algs = &cost->memset[align_unknown][TARGET_64BIT != 0];
22113 else
22114 algs = &cost->memcpy[align_unknown][TARGET_64BIT != 0];
22115 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22116 return ix86_stringop_alg;
22117 /* rep; movq or rep; movl is the smallest variant. */
22118 else if (!optimize_for_speed)
22119 {
22120 if (!count || (count & 3) || memset)
22121 return rep_prefix_usable ? rep_prefix_1_byte : libcall;
22122 else
22123 return rep_prefix_usable ? rep_prefix_4_byte : libcall;
22124 }
22125 /* Very tiny blocks are best handled via the loop; REP is expensive
22126 to set up. */
22127 else if (expected_size != -1 && expected_size < 4)
22128 return loop_1_byte;
22129 else if (expected_size != -1)
22130 {
22131 unsigned int i;
22132 enum stringop_alg alg = libcall;
22133 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22134 {
22135 /* We get here if the algorithms that were not libcall-based
22136 were rep-prefix based and we are unable to use rep prefixes
22137 based on global register usage. Break out of the loop and
22138 use the heuristic below. */
22139 if (algs->size[i].max == 0)
22140 break;
22141 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22142 {
22143 enum stringop_alg candidate = algs->size[i].alg;
22144
22145 if (candidate != libcall && ALG_USABLE_P (candidate))
22146 alg = candidate;
22147 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22148 last non-libcall inline algorithm. */
22149 if (TARGET_INLINE_ALL_STRINGOPS)
22150 {
22151 /* When the current size is best copied by a libcall,
22152 but we are still forced to inline, run the heuristic below
22153 that will pick code for medium-sized blocks. */
22154 if (alg != libcall)
22155 return alg;
22156 break;
22157 }
22158 else if (ALG_USABLE_P (candidate))
22159 return candidate;
22160 }
22161 }
22162 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22163 }
22164 /* When asked to inline the call anyway, try to pick a meaningful choice.
22165 We look for the maximal size of block that is faster to copy by hand
22166 and take blocks of at most that size, guessing that the average size
22167 will be roughly half of the block.
22168
22169 If this turns out to be bad, we might simply specify the preferred
22170 choice in ix86_costs. */
22171 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22172 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22173 {
22174 int max = -1;
22175 enum stringop_alg alg;
22176 int i;
22177 bool only_libcall_fits = true;
22178
22179 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22180 {
22181 enum stringop_alg candidate = algs->size[i].alg;
22182
22183 if (candidate != libcall && candidate
22184 && ALG_USABLE_P (candidate))
22185 {
22186 max = algs->size[i].max;
22187 only_libcall_fits = false;
22188 }
22189 }
22190 /* If there aren't any usable algorithms, then recursing on
22191 smaller sizes isn't going to find anything. Just return the
22192 simple byte-at-a-time copy loop. */
22193 if (only_libcall_fits)
22194 {
22195 /* Pick something reasonable. */
22196 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22197 *dynamic_check = 128;
22198 return loop_1_byte;
22199 }
22200 if (max == -1)
22201 max = 4096;
22202 alg = decide_alg (count, max / 2, memset, dynamic_check, align_unknown);
22203 gcc_assert (*dynamic_check == -1);
22204 gcc_assert (alg != libcall);
22205 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22206 *dynamic_check = max;
22207 return alg;
22208 }
22209 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22210 #undef ALG_USABLE_P
22211 }
22212
22213 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22214 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22215 static int
22216 decide_alignment (int align,
22217 enum stringop_alg alg,
22218 int expected_size)
22219 {
22220 int desired_align = 0;
22221 switch (alg)
22222 {
22223 case no_stringop:
22224 gcc_unreachable ();
22225 case loop:
22226 desired_align = GET_MODE_SIZE (Pmode);
22227 break;
22228 case unrolled_loop:
22229 desired_align = GET_MODE_SIZE (Pmode);
22230 break;
22231 case sse_loop:
22232 desired_align = 16;
22233 break;
22234 case rep_prefix_8_byte:
22235 desired_align = 8;
22236 break;
22237 case rep_prefix_4_byte:
22238 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
22239 copying a whole cache line at once. */
22240 if (TARGET_PENTIUMPRO)
22241 desired_align = 8;
22242 else
22243 desired_align = 4;
22244 break;
22245 case rep_prefix_1_byte:
22246 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
22247 copying a whole cache line at once. */
22248 if (TARGET_PENTIUMPRO)
22249 desired_align = 8;
22250 else
22251 desired_align = 1;
22252 break;
22253 case loop_1_byte:
22254 desired_align = 1;
22255 break;
22256 case libcall:
22257 return 0;
22258 }
22259
22260 if (optimize_size)
22261 desired_align = 1;
22262 if (desired_align < align)
22263 desired_align = align;
22264 if (expected_size != -1 && expected_size < 4)
22265 desired_align = align;
22266 return desired_align;
22267 }
22268
22269 /* Return the smallest power of 2 greater than VAL. */
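/* For example: 0 -> 1, 1 -> 2, 4 -> 8, 5 -> 8 and 8 -> 16; the result is
   always strictly greater than VAL.  */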
22270 static int
22271 smallest_pow2_greater_than (int val)
22272 {
22273 int ret = 1;
22274 while (ret <= val)
22275 ret <<= 1;
22276 return ret;
22277 }
22278
22279 /* Expand string move (memcpy) operation. Use i386 string operations
22280 when profitable. expand_setmem contains similar code. The code
22281 depends upon architecture, block size and alignment, but always has
22282 the same overall structure:
22283
22284 1) Prologue guard: Conditional that jumps to the epilogue for small
22285 blocks that can be handled by the epilogue alone. This is faster
22286 but also needed for correctness, since the prologue assumes the block
22287 is larger than the desired alignment.
22288
22289 Optional dynamic check for size and libcall for large
22290 blocks is emitted here too, with -minline-stringops-dynamically.
22291
22292 2) Prologue: copy first few bytes in order to get destination
22293 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22294 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22295 copied. We emit either a jump tree on power of two sized
22296 blocks, or a byte loop.
22297
22298 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22299 with specified algorithm.
22300
22301 4) Epilogue: code copying tail of the block that is too small to be
22302 handled by main body (or up to size guarded by prologue guard). */
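/* A rough sketch of the emitted control flow (illustrative pseudo-code only,
   not the exact RTL):

       if (count < epilogue_size_needed) goto epilogue;       (1: prologue guard)
       copy a few bytes until dst reaches desired_align;       (2: prologue)
       while (count >= size_needed) copy size_needed bytes;    (3: main body)
     epilogue:
       copy the remaining count & (epilogue_size_needed - 1) bytes.  (4)  */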
22303
22304 bool
22305 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22306 rtx expected_align_exp, rtx expected_size_exp)
22307 {
22308 rtx destreg;
22309 rtx srcreg;
22310 rtx label = NULL;
22311 rtx tmp;
22312 rtx jump_around_label = NULL;
22313 HOST_WIDE_INT align = 1;
22314 unsigned HOST_WIDE_INT count = 0;
22315 HOST_WIDE_INT expected_size = -1;
22316 int size_needed = 0, epilogue_size_needed;
22317 int desired_align = 0, align_bytes = 0;
22318 enum stringop_alg alg;
22319 int dynamic_check;
22320 bool need_zero_guard = false;
22321 bool align_unknown;
22322 unsigned int unroll_factor;
22323 enum machine_mode move_mode;
22324 rtx loop_iter = NULL_RTX;
22325 int dst_offset, src_offset;
22326
22327 if (CONST_INT_P (align_exp))
22328 align = INTVAL (align_exp);
22329 /* i386 can do misaligned access at a reasonably increased cost. */
22330 if (CONST_INT_P (expected_align_exp)
22331 && INTVAL (expected_align_exp) > align)
22332 align = INTVAL (expected_align_exp);
22333 /* ALIGN is the minimum of destination and source alignment, but we care here
22334 just about destination alignment. */
22335 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22336 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22337
22338 if (CONST_INT_P (count_exp))
22339 count = expected_size = INTVAL (count_exp);
22340 if (CONST_INT_P (expected_size_exp) && count == 0)
22341 expected_size = INTVAL (expected_size_exp);
22342
22343 /* Make sure we don't need to care about overflow later on. */
22344 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22345 return false;
22346
22347 /* Step 0: Decide on preferred algorithm, desired alignment and
22348 size of chunks to be copied by main loop. */
22349 dst_offset = get_mem_align_offset (dst, MOVE_MAX*BITS_PER_UNIT);
22350 src_offset = get_mem_align_offset (src, MOVE_MAX*BITS_PER_UNIT);
22351 align_unknown = (dst_offset < 0
22352 || src_offset < 0
22353 || src_offset != dst_offset);
22354 alg = decide_alg (count, expected_size, false, &dynamic_check, align_unknown);
22355 desired_align = decide_alignment (align, alg, expected_size);
22356 if (align_unknown)
22357 desired_align = align;
22358 unroll_factor = 1;
22359 move_mode = Pmode;
22360
22361 if (!TARGET_ALIGN_STRINGOPS)
22362 align = desired_align;
22363
22364 if (alg == libcall)
22365 return false;
22366 gcc_assert (alg != no_stringop);
22367 if (!count)
22368 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22369 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22370 srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
22371 switch (alg)
22372 {
22373 case libcall:
22374 case no_stringop:
22375 gcc_unreachable ();
22376 case loop:
22377 need_zero_guard = true;
22378 move_mode = Pmode;
22379 unroll_factor = 1;
22380 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
22381 break;
22382 case unrolled_loop:
22383 need_zero_guard = true;
22384 move_mode = Pmode;
22385 unroll_factor = 1;
22386 /* Select the maximal available unroll factor of 1, 2 or 4.
22387 In 32-bit mode we cannot afford to use 4 registers inside the loop. */
22388 if (!count)
22389 unroll_factor = TARGET_64BIT ? 4 : 2;
22390 else
22391 while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
22392 && unroll_factor < (TARGET_64BIT ? 4 : 2))
22393 unroll_factor *= 2;
22394 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
22395 break;
22396 case sse_loop:
22397 need_zero_guard = true;
22398 /* Use SSE instructions, if possible. */
22399 move_mode = V4SImode;
22400 /* Select the maximal available unroll factor of 1, 2 or 4. */
22401 if (!count)
22402 unroll_factor = 4;
22403 else
22404 while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
22405 && unroll_factor < 4)
22406 unroll_factor *= 2;
22407 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
22408 break;
22409 case rep_prefix_8_byte:
22410 size_needed = 8;
22411 break;
22412 case rep_prefix_4_byte:
22413 size_needed = 4;
22414 break;
22415 case rep_prefix_1_byte:
22416 size_needed = 1;
22417 break;
22418 case loop_1_byte:
22419 need_zero_guard = true;
22420 size_needed = 1;
22421 break;
22422 }
22423
22424 epilogue_size_needed = size_needed;
22425
22426 /* Step 1: Prologue guard. */
22427
22428 /* Alignment code needs count to be in register. */
22429 if (CONST_INT_P (count_exp) && desired_align > align)
22430 {
22431 if (INTVAL (count_exp) > desired_align
22432 && INTVAL (count_exp) > size_needed)
22433 {
22434 align_bytes
22435 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22436 if (align_bytes <= 0)
22437 align_bytes = 0;
22438 else
22439 align_bytes = desired_align - align_bytes;
22440 }
22441 if (align_bytes == 0)
22442 count_exp = force_reg (counter_mode (count_exp), count_exp);
22443 }
22444 gcc_assert (desired_align >= 1 && align >= 1);
22445
22446 /* Ensure that alignment prologue won't copy past end of block. */
22447 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22448 {
22449 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22450 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22451 Make sure it is a power of 2. */
22452 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22453
22454 if (count)
22455 {
22456 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22457 {
22458 /* If main algorithm works on QImode, no epilogue is needed.
22459 For small sizes just don't align anything. */
22460 if (size_needed == 1)
22461 desired_align = align;
22462 else
22463 goto epilogue;
22464 }
22465 }
22466 else
22467 {
22468 /* SSE and unrolled algorithms re-use the iteration counter in the epilogue. */
22469 if (alg == sse_loop || alg == unrolled_loop)
22470 {
22471 loop_iter = gen_reg_rtx (counter_mode (count_exp));
22472 emit_move_insn (loop_iter, const0_rtx);
22473 }
22474 label = gen_label_rtx ();
22475 emit_cmp_and_jump_insns (count_exp,
22476 GEN_INT (epilogue_size_needed),
22477 LTU, 0, counter_mode (count_exp), 1, label);
22478 if (expected_size == -1 || expected_size < epilogue_size_needed)
22479 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22480 else
22481 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22482 }
22483 }
22484
22485 /* Emit code to decide at runtime whether a library call or inline code
22486 should be used. */
22487 if (dynamic_check != -1)
22488 {
22489 if (CONST_INT_P (count_exp))
22490 {
22491 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22492 {
22493 emit_block_move_via_libcall (dst, src, count_exp, false);
22494 count_exp = const0_rtx;
22495 goto epilogue;
22496 }
22497 }
22498 else
22499 {
22500 rtx hot_label = gen_label_rtx ();
22501 jump_around_label = gen_label_rtx ();
22502 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22503 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22504 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22505 emit_block_move_via_libcall (dst, src, count_exp, false);
22506 emit_jump (jump_around_label);
22507 emit_label (hot_label);
22508 }
22509 }
22510
22511 /* Step 2: Alignment prologue. */
22512
22513 if (desired_align > align)
22514 {
22515 if (align_bytes == 0)
22516 {
22517 /* Except for the first move in the epilogue, we no longer know
22518 the constant offset in the aliasing info. It doesn't seem worth
22519 the pain to maintain it for the first move, so throw away
22520 the info early. */
22521 src = change_address (src, BLKmode, srcreg);
22522 dst = change_address (dst, BLKmode, destreg);
22523 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22524 desired_align);
22525 set_mem_align (src, desired_align*BITS_PER_UNIT);
22526 set_mem_align (dst, desired_align*BITS_PER_UNIT);
22527 }
22528 else
22529 {
22530 /* If we know how many bytes need to be stored before dst is
22531 sufficiently aligned, maintain aliasing info accurately. */
22532 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22533 desired_align, align_bytes);
22534 count_exp = plus_constant (count_exp, -align_bytes);
22535 count -= align_bytes;
22536 }
22537 if (need_zero_guard
22538 && (count < (unsigned HOST_WIDE_INT) size_needed
22539 || (align_bytes == 0
22540 && count < ((unsigned HOST_WIDE_INT) size_needed
22541 + desired_align - align))))
22542 {
22543 /* It is possible that we copied enough so the main loop will not
22544 execute. */
22545 gcc_assert (size_needed > 1);
22546 if (label == NULL_RTX)
22547 label = gen_label_rtx ();
22548 emit_cmp_and_jump_insns (count_exp,
22549 GEN_INT (size_needed),
22550 LTU, 0, counter_mode (count_exp), 1, label);
22551 if (expected_size == -1
22552 || expected_size < (desired_align - align) / 2 + size_needed)
22553 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22554 else
22555 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22556 }
22557 }
22558 if (label && size_needed == 1)
22559 {
22560 emit_label (label);
22561 LABEL_NUSES (label) = 1;
22562 label = NULL;
22563 epilogue_size_needed = 1;
22564 }
22565 else if (label == NULL_RTX)
22566 epilogue_size_needed = size_needed;
22567
22568 /* Step 3: Main loop. */
22569
22570 switch (alg)
22571 {
22572 case libcall:
22573 case no_stringop:
22574 gcc_unreachable ();
22575 case loop_1_byte:
22576 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22577 count_exp, QImode, 1, expected_size);
22578 break;
22579 case loop:
22580 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22581 count_exp, Pmode, 1, expected_size);
22582 break;
22583 case sse_loop:
22584 case unrolled_loop:
22585 /* In some cases we want to use the same iterator in several adjacent
22586 loops, so here we save loop iterator rtx and don't update addresses. */
22587 loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, src, destreg,
22588 srcreg, NULL,
22589 count_exp, loop_iter,
22590 move_mode,
22591 unroll_factor,
22592 expected_size, false);
22593 break;
22594 case rep_prefix_8_byte:
22595 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22596 DImode);
22597 break;
22598 case rep_prefix_4_byte:
22599 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22600 SImode);
22601 break;
22602 case rep_prefix_1_byte:
22603 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22604 QImode);
22605 break;
22606 }
22607 /* Properly adjust the offsets of the src and dest memory for aliasing. */
22608 if (CONST_INT_P (count_exp))
22609 {
22610 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22611 (count / size_needed) * size_needed);
22612 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22613 (count / size_needed) * size_needed);
22614 }
22615 else
22616 {
22617 src = change_address (src, BLKmode, srcreg);
22618 dst = change_address (dst, BLKmode, destreg);
22619 }
22620
22621 /* Step 4: Epilogue to copy the remaining bytes. */
22622 epilogue:
22623 if (label)
22624 {
22625 /* When the main loop is done, COUNT_EXP might hold the original count,
22626 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22627 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22628 bytes. Compensate if needed. */
22629
22630 if (size_needed < epilogue_size_needed)
22631 {
22632 tmp =
22633 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22634 GEN_INT (size_needed - 1), count_exp, 1,
22635 OPTAB_DIRECT);
22636 if (tmp != count_exp)
22637 emit_move_insn (count_exp, tmp);
22638 }
22639 emit_label (label);
22640 LABEL_NUSES (label) = 1;
22641 }
22642
22643 /* We haven't updated addresses, so we'll do it now.
22644 Also, if the epilogue seems to be big, we'll generate a loop (not
22645 unrolled) in it. We do this only if alignment is unknown, because in
22646 that case the epilogue has to perform the move byte by byte, which is
22647 very slow. */
22648 if (alg == sse_loop || alg == unrolled_loop)
22649 {
22650 rtx tmp;
22651 int remainder_size = epilogue_size_needed;
22652
22653 /* We may not need the epilogue loop at all when the count is known
22654 and alignment is not adjusted. */
22655 if (count && desired_align <= align)
22656 remainder_size = count % epilogue_size_needed;
22657 if (remainder_size > 31)
22658 {
22659 /* Reduce the epilogue's size by creating a non-unrolled loop. If we don't
22660 do this, we can end up with a very big epilogue - when alignment is
22661 statically unknown we'd handle the epilogue byte by byte, which may be very slow. */
22662 loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, src, destreg,
22663 srcreg, NULL, count_exp,
22664 loop_iter, move_mode, 1,
22665 expected_size, false);
22666 src = change_address (src, BLKmode, srcreg);
22667 dst = change_address (dst, BLKmode, destreg);
22668 epilogue_size_needed = GET_MODE_SIZE (move_mode);
22669 }
22670 tmp = expand_simple_binop (Pmode, PLUS, destreg, loop_iter, destreg,
22671 true, OPTAB_LIB_WIDEN);
22672 if (tmp != destreg)
22673 emit_move_insn (destreg, tmp);
22674
22675 tmp = expand_simple_binop (Pmode, PLUS, srcreg, loop_iter, srcreg,
22676 true, OPTAB_LIB_WIDEN);
22677 if (tmp != srcreg)
22678 emit_move_insn (srcreg, tmp);
22679 }
22680 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22681 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22682 epilogue_size_needed);
22683
22684 if (jump_around_label)
22685 emit_label (jump_around_label);
22686 return true;
22687 }
22688
22689 /* Helper function for memset. For the QImode value 0xXY produce
22690 0xXYXYXYXY of the width specified by MODE. This is essentially
22691 a multiplication by 0x01010101, but we can do slightly better than
22692 synth_mult by unwinding the sequence by hand on CPUs with
22693 a slow multiply. */
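/* For example (illustrative, SImode case): starting from the QImode value
   0xXY, the hand-unwound sequence below is essentially

       v |= v << 8;      producing 0x0000XYXY
       v |= v << 16;     producing 0xXYXYXYXY

   with one more 32-bit shift-and-IOR step for DImode.  */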
22694 static rtx
22695 promote_duplicated_reg (enum machine_mode mode, rtx val)
22696 {
22697 enum machine_mode valmode = GET_MODE (val);
22698 rtx tmp;
22699 int nops = mode == DImode ? 3 : 2;
22700
22701 if (VECTOR_MODE_P (mode))
22702 {
22703 enum machine_mode inner = GET_MODE_INNER (mode);
22704 rtx promoted_val, vec_reg;
22705 if (CONST_INT_P (val))
22706 return ix86_build_const_vector (mode, true, val);
22707
22708 promoted_val = promote_duplicated_reg (inner, val);
22709 vec_reg = gen_reg_rtx (mode);
22710 switch (mode)
22711 {
22712 case V2DImode:
22713 emit_insn (gen_vec_dupv2di (vec_reg, promoted_val));
22714 break;
22715 case V4SImode:
22716 emit_insn (gen_vec_dupv4si (vec_reg, promoted_val));
22717 break;
22718 default:
22719 gcc_unreachable ();
22720 break;
22721 }
22722
22723 return vec_reg;
22724 }
22725 gcc_assert (mode == SImode || mode == DImode);
22726 if (mode == DImode && !TARGET_64BIT)
22727 {
22728 rtx vec_reg = promote_duplicated_reg (V4SImode, val);
22729 vec_reg = convert_to_mode (V2DImode, vec_reg, 1);
22730 return vec_reg;
22731 }
22732 if (val == const0_rtx)
22733 return copy_to_mode_reg (mode, const0_rtx);
22734 if (CONST_INT_P (val))
22735 {
22736 HOST_WIDE_INT v = INTVAL (val) & 255;
22737
22738 v |= v << 8;
22739 v |= v << 16;
22740 if (mode == DImode)
22741 v |= (v << 16) << 16;
22742 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22743 }
22744
22745 if (valmode == VOIDmode)
22746 valmode = QImode;
22747 if (valmode != QImode)
22748 val = gen_lowpart (QImode, val);
22749 if (mode == QImode)
22750 return val;
22751 if (!TARGET_PARTIAL_REG_STALL)
22752 nops--;
22753 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22754 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22755 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22756 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22757 {
22758 rtx reg = convert_modes (mode, QImode, val, true);
22759 tmp = promote_duplicated_reg (mode, const1_rtx);
22760 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22761 OPTAB_DIRECT);
22762 }
22763 else
22764 {
22765 rtx reg = convert_modes (mode, QImode, val, true);
22766
22767 if (!TARGET_PARTIAL_REG_STALL)
22768 if (mode == SImode)
22769 emit_insn (gen_movsi_insv_1 (reg, reg));
22770 else
22771 emit_insn (gen_movdi_insv_1 (reg, reg));
22772 else
22773 {
22774 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22775 NULL, 1, OPTAB_DIRECT);
22776 reg =
22777 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22778 }
22779 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22780 NULL, 1, OPTAB_DIRECT);
22781 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22782 if (mode == SImode)
22783 return reg;
22784 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22785 NULL, 1, OPTAB_DIRECT);
22786 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22787 return reg;
22788 }
22789 }
22790
22791 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size
22792 that will be needed by the main loop copying SIZE_NEEDED chunks and by the
22793 prologue getting the alignment from ALIGN to DESIRED_ALIGN. */
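/* For example (illustrative): SIZE_NEEDED of 16 yields a V2DImode or V4SImode
   promotion, SIZE_NEEDED of 8 on a 64-bit target yields DImode, and smaller
   sizes fall back to SImode, HImode or the unpromoted value.  */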
22794 static rtx
22795 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22796 {
22797 rtx promoted_val = NULL_RTX;
22798
22799 if (size_needed > 8)
22800 {
22801 /* We want to promote to vector register, so we expect that at least SSE
22802 is available. */
22803 gcc_assert (TARGET_SSE);
22804
22805 /* In case of promotion to vector register, we expect that val is a
22806 constant or already promoted to GPR value. */
22807 gcc_assert (GET_MODE (val) == Pmode || CONSTANT_P (val));
22808 if (TARGET_64BIT)
22809 promoted_val = promote_duplicated_reg (V2DImode, val);
22810 else
22811 promoted_val = promote_duplicated_reg (V4SImode, val);
22812 }
22813 else if (size_needed > 4)
22814 {
22815 gcc_assert (TARGET_64BIT);
22816 promoted_val = promote_duplicated_reg (DImode, val);
22817 }
22818 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22819 promoted_val = promote_duplicated_reg (SImode, val);
22820 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22821 promoted_val = promote_duplicated_reg (HImode, val);
22822 else
22823 promoted_val = val;
22824
22825 return promoted_val;
22826 }
22827
22828 /* Expand string set operation (memset). Use i386 string operations when
22829 profitable. See the expand_movmem comment for an explanation of the
22830 individual steps performed. */
22831 bool
22832 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22833 rtx expected_align_exp, rtx expected_size_exp)
22834 {
22835 rtx destreg;
22836 rtx label = NULL;
22837 rtx tmp;
22838 rtx jump_around_label = NULL;
22839 HOST_WIDE_INT align = 1;
22840 unsigned HOST_WIDE_INT count = 0;
22841 HOST_WIDE_INT expected_size = -1;
22842 int size_needed = 0, epilogue_size_needed;
22843 int desired_align = 0, align_bytes = 0;
22844 enum stringop_alg alg;
22845 rtx gpr_promoted_val = NULL;
22846 rtx vec_promoted_val = NULL;
22847 int dynamic_check;
22848 bool need_zero_guard = false;
22849 bool align_unknown;
22850 unsigned int unroll_factor;
22851 enum machine_mode move_mode;
22852 rtx loop_iter = NULL_RTX;
22853 bool early_jump = false;
22854
22855 if (CONST_INT_P (align_exp))
22856 align = INTVAL (align_exp);
22857 /* i386 can do misaligned access at a reasonably increased cost. */
22858 if (CONST_INT_P (expected_align_exp)
22859 && INTVAL (expected_align_exp) > align)
22860 align = INTVAL (expected_align_exp);
22861 if (CONST_INT_P (count_exp))
22862 count = expected_size = INTVAL (count_exp);
22863 if (CONST_INT_P (expected_size_exp) && count == 0)
22864 expected_size = INTVAL (expected_size_exp);
22865
22866 /* Make sure we don't need to care about overflow later on. */
22867 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22868 return false;
22869
22870 /* Step 0: Decide on preferred algorithm, desired alignment and
22871 size of chunks to be copied by main loop. */
22872
22873 align_unknown = !(CONST_INT_P (align_exp) && INTVAL (align_exp) > 0);
22874 alg = decide_alg (count, expected_size, true, &dynamic_check, align_unknown);
22875 desired_align = decide_alignment (align, alg, expected_size);
22876 unroll_factor = 1;
22877 move_mode = Pmode;
22878
22879 if (!TARGET_ALIGN_STRINGOPS)
22880 align = desired_align;
22881
22882 if (alg == libcall)
22883 return false;
22884 gcc_assert (alg != no_stringop);
22885 if (!count)
22886 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22887 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22888 switch (alg)
22889 {
22890 case libcall:
22891 case no_stringop:
22892 gcc_unreachable ();
22893 case loop:
22894 need_zero_guard = true;
22895 move_mode = Pmode;
22896 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
22897 break;
22898 case unrolled_loop:
22899 need_zero_guard = true;
22900 move_mode = Pmode;
22901 unroll_factor = 1;
22902 /* Select the maximal available unroll factor of 1, 2 or 4. */
22903 if (!count)
22904 unroll_factor = 4;
22905 else
22906 while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
22907 && unroll_factor < 4)
22908 unroll_factor *= 2;
22909 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
22910 break;
22911 case sse_loop:
22912 need_zero_guard = true;
22913 move_mode = TARGET_64BIT ? V2DImode : V4SImode;
22914 unroll_factor = 1;
22915 /* Select the maximal available unroll factor of 1, 2 or 4. */
22916 if (!count)
22917 unroll_factor = 4;
22918 else
22919 while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
22920 && unroll_factor < 4)
22921 unroll_factor *= 2;
22922 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
22923 break;
22924 case rep_prefix_8_byte:
22925 size_needed = 8;
22926 break;
22927 case rep_prefix_4_byte:
22928 size_needed = 4;
22929 break;
22930 case rep_prefix_1_byte:
22931 size_needed = 1;
22932 break;
22933 case loop_1_byte:
22934 need_zero_guard = true;
22935 size_needed = 1;
22936 break;
22937 }
22938 epilogue_size_needed = size_needed;
22939
22940 /* Step 1: Prologue guard. */
22941
22942 /* Alignment code needs count to be in register. */
22943 if (CONST_INT_P (count_exp) && desired_align > align)
22944 {
22945 if (INTVAL (count_exp) > desired_align
22946 && INTVAL (count_exp) > size_needed)
22947 {
22948 align_bytes
22949 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22950 if (align_bytes <= 0)
22951 align_bytes = 0;
22952 else
22953 align_bytes = desired_align - align_bytes;
22954 }
22955 if (align_bytes == 0)
22956 {
22957 enum machine_mode mode = SImode;
22958 if (TARGET_64BIT && (count & ~0xffffffff))
22959 mode = DImode;
22960 count_exp = force_reg (mode, count_exp);
22961 }
22962 }
22963 /* Do the cheap promotion to allow better CSE across the
22964 main loop and epilogue (i.e. one load of the big constant in
22965 front of all the code). */
22966 if (CONST_INT_P (val_exp))
22967 gpr_promoted_val = promote_duplicated_reg_to_size (val_exp,
22968 GET_MODE_SIZE (Pmode),
22969 GET_MODE_SIZE (Pmode),
22970 align);
22971 /* Ensure that alignment prologue won't copy past end of block. */
22972 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22973 {
22974 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22975 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22976 Make sure it is power of 2. */
22977 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22978
22979 if (count)
22980 {
22981 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22982 {
22983 /* If main algorithm works on QImode, no epilogue is needed.
22984 For small sizes just don't align anything. */
22985 if (size_needed == 1)
22986 desired_align = align;
22987 else
22988 goto epilogue;
22989 }
22990 }
22991 else
22992 {
22993 /* SSE and unrolled_loop algorithms re-use the iteration counter in the epilogue. */
22994 if (alg == sse_loop || alg == unrolled_loop)
22995 {
22996 loop_iter = gen_reg_rtx (counter_mode (count_exp));
22997 emit_move_insn (loop_iter, const0_rtx);
22998 }
22999 label = gen_label_rtx ();
23000 early_jump = true;
23001 emit_cmp_and_jump_insns (count_exp,
23002 GEN_INT (epilogue_size_needed),
23003 LTU, 0, counter_mode (count_exp), 1, label);
23004 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23005 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23006 else
23007 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23008 }
23009 }
23010 if (dynamic_check != -1)
23011 {
23012 rtx hot_label = gen_label_rtx ();
23013 jump_around_label = gen_label_rtx ();
23014 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23015 LEU, 0, counter_mode (count_exp), 1, hot_label);
23016 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23017 set_storage_via_libcall (dst, count_exp, val_exp, false);
23018 emit_jump (jump_around_label);
23019 emit_label (hot_label);
23020 }
23021
23022 /* Step 2: Alignment prologue. */
23023
23024 /* Do the expensive promotion once we branched off the small blocks. */
23025 if (!gpr_promoted_val)
23026 gpr_promoted_val = promote_duplicated_reg_to_size (val_exp,
23027 GET_MODE_SIZE (Pmode),
23028 GET_MODE_SIZE (Pmode),
23029 align);
23030 gcc_assert (desired_align >= 1 && align >= 1);
23031
23032 if (desired_align > align)
23033 {
23034 if (align_bytes == 0)
23035 {
23036 /* Except for the first move in the epilogue, we no longer know
23037 the constant offset in the aliasing info. It doesn't seem worth
23038 the pain to maintain it for the first move, so throw away
23039 the info early. */
23040 dst = change_address (dst, BLKmode, destreg);
23041 expand_setmem_prologue (dst, destreg, gpr_promoted_val, count_exp, align,
23042 desired_align);
23043 set_mem_align (dst, desired_align*BITS_PER_UNIT);
23044 }
23045 else
23046 {
23047 /* If we know how many bytes need to be stored before dst is
23048 sufficiently aligned, maintain aliasing info accurately. */
23049 dst = expand_constant_setmem_prologue (dst, destreg, gpr_promoted_val,
23050 desired_align, align_bytes);
23051 count_exp = plus_constant (count_exp, -align_bytes);
23052 count -= align_bytes;
23053 if (count < (unsigned HOST_WIDE_INT) size_needed)
23054 goto epilogue;
23055 }
23056 if (need_zero_guard
23057 && (count < (unsigned HOST_WIDE_INT) size_needed
23058 || (align_bytes == 0
23059 && count < ((unsigned HOST_WIDE_INT) size_needed
23060 + desired_align - align))))
23061 {
23062 /* It is possible that we copied enough so the main loop will not
23063 execute. */
23064 gcc_assert (size_needed > 1);
23065 if (label == NULL_RTX)
23066 label = gen_label_rtx ();
23067 emit_cmp_and_jump_insns (count_exp,
23068 GEN_INT (size_needed),
23069 LTU, 0, counter_mode (count_exp), 1, label);
23070 if (expected_size == -1
23071 || expected_size < (desired_align - align) / 2 + size_needed)
23072 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23073 else
23074 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23075 }
23076 }
23077 if (label && size_needed == 1)
23078 {
23079 emit_label (label);
23080 LABEL_NUSES (label) = 1;
23081 label = NULL;
23082 gpr_promoted_val = val_exp;
23083 epilogue_size_needed = 1;
23084 }
23085 else if (label == NULL_RTX)
23086 epilogue_size_needed = size_needed;
23087
23088 /* Step 3: Main loop. */
23089
23090 switch (alg)
23091 {
23092 case libcall:
23093 case no_stringop:
23094 gcc_unreachable ();
23095 case loop_1_byte:
23096 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, val_exp,
23097 count_exp, QImode, 1, expected_size);
23098 break;
23099 case loop:
23100 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, gpr_promoted_val,
23101 count_exp, Pmode, 1, expected_size);
23102 break;
23103 case unrolled_loop:
23104 loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
23105 NULL, gpr_promoted_val, count_exp,
23106 loop_iter, move_mode, unroll_factor,
23107 expected_size, false);
23108 break;
23109 case sse_loop:
23110 vec_promoted_val =
23111 promote_duplicated_reg_to_size (gpr_promoted_val,
23112 GET_MODE_SIZE (move_mode),
23113 GET_MODE_SIZE (move_mode), align);
23114 loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
23115 NULL, vec_promoted_val, count_exp,
23116 loop_iter, move_mode, unroll_factor,
23117 expected_size, false);
23118 break;
23119 case rep_prefix_8_byte:
23120 gcc_assert (TARGET_64BIT);
23121 expand_setmem_via_rep_stos (dst, destreg, gpr_promoted_val, count_exp,
23122 DImode, val_exp);
23123 break;
23124 case rep_prefix_4_byte:
23125 expand_setmem_via_rep_stos (dst, destreg, gpr_promoted_val, count_exp,
23126 SImode, val_exp);
23127 break;
23128 case rep_prefix_1_byte:
23129 expand_setmem_via_rep_stos (dst, destreg, gpr_promoted_val, count_exp,
23130 QImode, val_exp);
23131 break;
23132 }
23133 /* Properly adjust the offset of the dest memory for aliasing. */
23134 if (CONST_INT_P (count_exp))
23135 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23136 (count / size_needed) * size_needed);
23137 else
23138 dst = change_address (dst, BLKmode, destreg);
23139
23140 /* Step 4: Epilogue to copy the remaining bytes. */
23141
23142 if (label)
23143 {
23144 /* When the main loop is done, COUNT_EXP might hold the original count,
23145 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23146 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23147 bytes. Compensate if needed. */
23148
23149 if (size_needed < epilogue_size_needed)
23150 {
23151 tmp =
23152 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23153 GEN_INT (size_needed - 1), count_exp, 1,
23154 OPTAB_DIRECT);
23155 if (tmp != count_exp)
23156 emit_move_insn (count_exp, tmp);
23157 }
23158 emit_label (label);
23159 LABEL_NUSES (label) = 1;
23160 /* We cannot rely on the fact that the promoted value is known. */
23161 vec_promoted_val = 0;
23162 if (early_jump)
23163 gpr_promoted_val = 0;
23164 }
23165 epilogue:
23166 if (alg == unrolled_loop || alg == sse_loop)
23167 {
23168 rtx tmp;
23169 int remainder_size = epilogue_size_needed;
23170 if (count && desired_align <= align)
23171 remainder_size = count % epilogue_size_needed;
23172 /* We may not need the epilogue loop at all when the count is known
23173 and alignment is not adjusted. */
23174 if (remainder_size > 31
23175 && (alg == sse_loop ? vec_promoted_val : gpr_promoted_val))
23176 {
23177 /* Reduce the epilogue's size by creating a non-unrolled loop. If we don't
23178 do this, we can end up with a very big epilogue - when alignment is
23179 statically unknown we'd handle the epilogue byte by byte, which may be very slow. */
23180 loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
23181 NULL, (alg == sse_loop ? vec_promoted_val : gpr_promoted_val), count_exp,
23182 loop_iter, move_mode, 1,
23183 expected_size, false);
23184 dst = change_address (dst, BLKmode, destreg);
23185 epilogue_size_needed = GET_MODE_SIZE (move_mode);
23186 }
23187 tmp = expand_simple_binop (Pmode, PLUS, destreg, loop_iter, destreg,
23188 true, OPTAB_LIB_WIDEN);
23189 if (tmp != destreg)
23190 emit_move_insn (destreg, tmp);
23191 }
23192 if (count_exp == const0_rtx || epilogue_size_needed <= 1)
23193 ;
23194 else if (!gpr_promoted_val)
23195 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23196 epilogue_size_needed);
23197 else
23198 expand_setmem_epilogue (dst, destreg, vec_promoted_val, gpr_promoted_val,
23199 val_exp, count_exp, epilogue_size_needed);
23200 if (jump_around_label)
23201 emit_label (jump_around_label);
23202 return true;
23203 }
23204
23205 /* Expand the appropriate insns for doing strlen if not just doing
23206 repnz; scasb
23207
23208 out = result, initialized with the start address
23209 align_rtx = alignment of the address.
23210 scratch = scratch register, initialized with the start address when
23211 not aligned, otherwise undefined
23212
23213 This is just the body. It needs the initializations mentioned above and
23214 some address computing at the end. These things are done in i386.md. */
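/* A rough outline of the expansion (illustrative only): up to three
   single-byte compares bring OUT to a 4-byte boundary, then a loop scans one
   SImode word per iteration using the zero-byte test described below, and
   finally the exact position of the zero byte within the last word is fixed
   up (branchlessly when cmov is available).  */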
23215
23216 static void
23217 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23218 {
23219 int align;
23220 rtx tmp;
23221 rtx align_2_label = NULL_RTX;
23222 rtx align_3_label = NULL_RTX;
23223 rtx align_4_label = gen_label_rtx ();
23224 rtx end_0_label = gen_label_rtx ();
23225 rtx mem;
23226 rtx tmpreg = gen_reg_rtx (SImode);
23227 rtx scratch = gen_reg_rtx (SImode);
23228 rtx cmp;
23229
23230 align = 0;
23231 if (CONST_INT_P (align_rtx))
23232 align = INTVAL (align_rtx);
23233
23234 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23235
23236 /* Is there a known alignment and is it less than 4? */
23237 if (align < 4)
23238 {
23239 rtx scratch1 = gen_reg_rtx (Pmode);
23240 emit_move_insn (scratch1, out);
23241 /* Is there a known alignment and is it not 2? */
23242 if (align != 2)
23243 {
23244 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23245 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23246
23247 /* Leave just the 3 lower bits. */
23248 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23249 NULL_RTX, 0, OPTAB_WIDEN);
23250
23251 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23252 Pmode, 1, align_4_label);
23253 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23254 Pmode, 1, align_2_label);
23255 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23256 Pmode, 1, align_3_label);
23257 }
23258 else
23259 {
23260 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23261 check if it is aligned to 4 bytes. */
23262
23263 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23264 NULL_RTX, 0, OPTAB_WIDEN);
23265
23266 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23267 Pmode, 1, align_4_label);
23268 }
23269
23270 mem = change_address (src, QImode, out);
23271
23272 /* Now compare the bytes. */
23273
23274 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23275 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23276 QImode, 1, end_0_label);
23277
23278 /* Increment the address. */
23279 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23280
23281 /* Not needed with an alignment of 2 */
23282 if (align != 2)
23283 {
23284 emit_label (align_2_label);
23285
23286 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23287 end_0_label);
23288
23289 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23290
23291 emit_label (align_3_label);
23292 }
23293
23294 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23295 end_0_label);
23296
23297 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23298 }
23299
23300 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23301 align this loop - it only makes the program bigger and does not help
23302 speed it up. */
23303 emit_label (align_4_label);
23304
23305 mem = change_address (src, SImode, out);
23306 emit_move_insn (scratch, mem);
23307 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23308
23309 /* This formula yields a nonzero result iff one of the bytes is zero.
23310 This saves three branches inside the loop and many cycles. */
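   /* Spelled out (illustrative): with X being the four loaded bytes, the
      insns below compute (X - 0x01010101) & ~X & 0x80808080, which is
      nonzero exactly when some byte of X is zero; e.g. X = 0x12003456
      yields 0x00800000, with the 0x80 bit set in the byte that was zero.  */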
23311
23312 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23313 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23314 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23315 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23316 gen_int_mode (0x80808080, SImode)));
23317 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23318 align_4_label);
23319
23320 if (TARGET_CMOVE)
23321 {
23322 rtx reg = gen_reg_rtx (SImode);
23323 rtx reg2 = gen_reg_rtx (Pmode);
23324 emit_move_insn (reg, tmpreg);
23325 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23326
23327 /* If zero is not in the first two bytes, move two bytes forward. */
23328 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23329 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23330 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23331 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23332 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23333 reg,
23334 tmpreg)));
23335 /* Emit lea manually to avoid clobbering of flags. */
23336 emit_insn (gen_rtx_SET (SImode, reg2,
23337 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23338
23339 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23340 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23341 emit_insn (gen_rtx_SET (VOIDmode, out,
23342 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23343 reg2,
23344 out)));
23345 }
23346 else
23347 {
23348 rtx end_2_label = gen_label_rtx ();
23349 /* Is zero in the first two bytes? */
23350
23351 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23352 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23353 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23354 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23355 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23356 pc_rtx);
23357 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23358 JUMP_LABEL (tmp) = end_2_label;
23359
23360 /* Not in the first two. Move two bytes forward. */
23361 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23362 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23363
23364 emit_label (end_2_label);
23365
23366 }
23367
23368 /* Avoid branch in fixing the byte. */
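/* In more detail: let A be the address of the SImode word containing the
   terminating zero.  At this point OUT is A+4 (or A+6 if the zero was in
   the upper half and OUT was advanced by 2 above), and bit 7 of the low
   byte of TMPREG is set iff the first byte of the remaining pair is the
   zero.  Doubling that byte moves bit 7 into the carry flag, and the
   sbb-style subtraction of 3 plus carry below then yields the address of
   the zero: A+4-3-1 = A, A+4-3 = A+1, A+6-3-1 = A+2, A+6-3 = A+3.  */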
23369 tmpreg = gen_lowpart (QImode, tmpreg);
23370 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23371 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23372 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23373 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23374
23375 emit_label (end_0_label);
23376 }
23377
23378 /* Expand strlen. */
23379
23380 bool
23381 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23382 {
23383 rtx addr, scratch1, scratch2, scratch3, scratch4;
23384
23385 /* The generic case of the strlen expander is long. Avoid expanding
23386 it unless TARGET_INLINE_ALL_STRINGOPS. */
23387
23388 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23389 && !TARGET_INLINE_ALL_STRINGOPS
23390 && !optimize_insn_for_size_p ()
23391 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23392 return false;
23393
23394 addr = force_reg (Pmode, XEXP (src, 0));
23395 scratch1 = gen_reg_rtx (Pmode);
23396
23397 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23398 && !optimize_insn_for_size_p ())
23399 {
23400 /* Well, it seems that some optimizer does not combine a call like
23401 foo(strlen(bar), strlen(bar));
23402 when the move and the subtraction are done here. It does calculate
23403 the length just once when these instructions are done inside
23404 output_strlen_unroll(). But since &bar[strlen(bar)] is often used
23405 and this uses one fewer register for the lifetime of
23406 output_strlen_unroll(), this is better. */
23407
23408 emit_move_insn (out, addr);
23409
23410 ix86_expand_strlensi_unroll_1 (out, src, align);
23411
23412 /* strlensi_unroll_1 returns the address of the zero at the end of
23413 the string, like memchr(), so compute the length by subtracting
23414 the start address. */
23415 emit_insn (ix86_gen_sub3 (out, out, addr));
23416 }
23417 else
23418 {
23419 rtx unspec;
23420
23421 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23422 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23423 return false;
23424
23425 scratch2 = gen_reg_rtx (Pmode);
23426 scratch3 = gen_reg_rtx (Pmode);
23427 scratch4 = force_reg (Pmode, constm1_rtx);
23428
23429 emit_move_insn (scratch3, addr);
23430 eoschar = force_reg (QImode, eoschar);
23431
23432 src = replace_equiv_address_nv (src, scratch3);
23433
23434 /* If .md starts supporting :P, this can be done in .md. */
23435 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23436 scratch4), UNSPEC_SCAS);
23437 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23438 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23439 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
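/* Roughly, assuming strlenqi_1 expands to a repnz scasb with the count
   register initialized from SCRATCH4 = -1: after the scan the count is
   -(len + 2), its one's complement is len + 1, and adding -1 leaves the
   string length in OUT.  */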
23440 }
23441 return true;
23442 }
23443
23444 /* For a given symbol (function), construct code to compute the address
23445 of its PLT entry in the large x86-64 PIC model. */
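/* Illustration: the sequence built here corresponds roughly to
       movabs $symbol@PLTOFF, %tmp
       add    <pic register>, %tmp
   where the second operand is the GOT base held in pic_offset_table_rtx,
   leaving the absolute address of SYMBOL's PLT entry in TMP.  */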
23446 rtx
23447 construct_plt_address (rtx symbol)
23448 {
23449 rtx tmp = gen_reg_rtx (Pmode);
23450 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23451
23452 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23453 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23454
23455 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23456 emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
23457 return tmp;
23458 }
23459
23460 rtx
23461 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23462 rtx callarg2,
23463 rtx pop, bool sibcall)
23464 {
23465 /* We need to represent that the SI, DI and XMM6-XMM15 registers are
23466 clobbered by SYSV calls. */
23467 static int clobbered_registers[] = {
23468 XMM6_REG, XMM7_REG, XMM8_REG,
23469 XMM9_REG, XMM10_REG, XMM11_REG,
23470 XMM12_REG, XMM13_REG, XMM14_REG,
23471 XMM15_REG, SI_REG, DI_REG
23472 };
23473 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23474 rtx use = NULL, call;
23475 unsigned int vec_len;
23476
23477 if (pop == const0_rtx)
23478 pop = NULL;
23479 gcc_assert (!TARGET_64BIT || !pop);
23480
23481 if (TARGET_MACHO && !TARGET_64BIT)
23482 {
23483 #if TARGET_MACHO
23484 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23485 fnaddr = machopic_indirect_call_target (fnaddr);
23486 #endif
23487 }
23488 else
23489 {
23490 /* Static functions and indirect calls don't need the pic register. */
23491 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23492 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23493 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23494 use_reg (&use, pic_offset_table_rtx);
23495 }
23496
23497 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23498 {
23499 rtx al = gen_rtx_REG (QImode, AX_REG);
23500 emit_move_insn (al, callarg2);
23501 use_reg (&use, al);
23502 }
23503
23504 if (ix86_cmodel == CM_LARGE_PIC
23505 && MEM_P (fnaddr)
23506 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23507 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23508 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23509 else if (sibcall
23510 ? !sibcall_insn_operand (XEXP (fnaddr, 0), Pmode)
23511 : !call_insn_operand (XEXP (fnaddr, 0), Pmode))
23512 {
23513 fnaddr = XEXP (fnaddr, 0);
23514 if (GET_MODE (fnaddr) != Pmode)
23515 fnaddr = convert_to_mode (Pmode, fnaddr, 1);
23516 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (Pmode, fnaddr));
23517 }
23518
23519 vec_len = 0;
23520 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23521 if (retval)
23522 call = gen_rtx_SET (VOIDmode, retval, call);
23523 vec[vec_len++] = call;
23524
23525 if (pop)
23526 {
23527 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23528 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23529 vec[vec_len++] = pop;
23530 }
23531
23532 if (TARGET_64BIT_MS_ABI
23533 && (!callarg2 || INTVAL (callarg2) != -2))
23534 {
23535 unsigned i;
23536
23537 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23538 UNSPEC_MS_TO_SYSV_CALL);
23539
23540 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23541 vec[vec_len++]
23542 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
23543 ? TImode : DImode,
23544 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23545 ? TImode : DImode,
23546 clobbered_registers[i]));
23547 }
23548
23549 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
23550 if (TARGET_VZEROUPPER)
23551 {
23552 int avx256;
23553 if (cfun->machine->callee_pass_avx256_p)
23554 {
23555 if (cfun->machine->callee_return_avx256_p)
23556 avx256 = callee_return_pass_avx256;
23557 else
23558 avx256 = callee_pass_avx256;
23559 }
23560 else if (cfun->machine->callee_return_avx256_p)
23561 avx256 = callee_return_avx256;
23562 else
23563 avx256 = call_no_avx256;
23564
23565 if (reload_completed)
23566 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23567 else
23568 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23569 gen_rtvec (1, GEN_INT (avx256)),
23570 UNSPEC_CALL_NEEDS_VZEROUPPER);
23571 }
23572
23573 if (vec_len > 1)
23574 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23575 call = emit_call_insn (call);
23576 if (use)
23577 CALL_INSN_FUNCTION_USAGE (call) = use;
23578
23579 return call;
23580 }
23581
23582 void
23583 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23584 {
23585 rtx pat = PATTERN (insn);
23586 rtvec vec = XVEC (pat, 0);
23587 int len = GET_NUM_ELEM (vec) - 1;
23588
23589 /* Strip off the last entry of the parallel. */
23590 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23591 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23592 if (len == 1)
23593 pat = RTVEC_ELT (vec, 0);
23594 else
23595 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23596
23597 emit_insn (gen_avx_vzeroupper (vzeroupper));
23598 emit_call_insn (pat);
23599 }
23600
23601 /* Output the assembly for a call instruction. */
23602
23603 const char *
23604 ix86_output_call_insn (rtx insn, rtx call_op)
23605 {
23606 bool direct_p = constant_call_address_operand (call_op, Pmode);
23607 bool seh_nop_p = false;
23608 const char *xasm;
23609
23610 if (SIBLING_CALL_P (insn))
23611 {
23612 if (direct_p)
23613 xasm = "jmp\t%P0";
23614 /* SEH epilogue detection requires the indirect branch case
23615 to include REX.W. */
23616 else if (TARGET_SEH)
23617 xasm = "rex.W jmp %A0";
23618 else
23619 xasm = "jmp\t%A0";
23620
23621 output_asm_insn (xasm, &call_op);
23622 return "";
23623 }
23624
23625 /* SEH unwinding can require an extra nop to be emitted in several
23626 circumstances. Determine if we have one of those. */
23627 if (TARGET_SEH)
23628 {
23629 rtx i;
23630
23631 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23632 {
23633 /* If we get to another real insn, we don't need the nop. */
23634 if (INSN_P (i))
23635 break;
23636
23637 /* If we get to the epilogue note, prevent a catch region from
23638 being adjacent to the standard epilogue sequence. If non-
23639 call-exceptions, we'll have done this during epilogue emission. */
23640 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23641 && !flag_non_call_exceptions
23642 && !can_throw_internal (insn))
23643 {
23644 seh_nop_p = true;
23645 break;
23646 }
23647 }
23648
23649 /* If we didn't find a real insn following the call, prevent the
23650 unwinder from looking into the next function. */
23651 if (i == NULL)
23652 seh_nop_p = true;
23653 }
23654
23655 if (direct_p)
23656 xasm = "call\t%P0";
23657 else
23658 xasm = "call\t%A0";
23659
23660 output_asm_insn (xasm, &call_op);
23661
23662 if (seh_nop_p)
23663 return "nop";
23664
23665 return "";
23666 }
23667 \f
23668 /* Clear stack slot assignments remembered from previous functions.
23669 This is called from INIT_EXPANDERS once before RTL is emitted for each
23670 function. */
23671
23672 static struct machine_function *
23673 ix86_init_machine_status (void)
23674 {
23675 struct machine_function *f;
23676
23677 f = ggc_alloc_cleared_machine_function ();
23678 f->use_fast_prologue_epilogue_nregs = -1;
23679 f->tls_descriptor_call_expanded_p = 0;
23680 f->call_abi = ix86_abi;
23681
23682 return f;
23683 }
23684
23685 /* Return a MEM corresponding to a stack slot with mode MODE.
23686 Allocate a new slot if necessary.
23687
23688 The RTL for a function can have several slots available: N is
23689 which slot to use. */
23690
23691 rtx
23692 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23693 {
23694 struct stack_local_entry *s;
23695
23696 gcc_assert (n < MAX_386_STACK_LOCALS);
23697
23698 /* Virtual slot is valid only before vregs are instantiated. */
23699 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23700
23701 for (s = ix86_stack_locals; s; s = s->next)
23702 if (s->mode == mode && s->n == n)
23703 return validize_mem (copy_rtx (s->rtl));
23704
23705 s = ggc_alloc_stack_local_entry ();
23706 s->n = n;
23707 s->mode = mode;
23708 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23709
23710 s->next = ix86_stack_locals;
23711 ix86_stack_locals = s;
23712 return validize_mem (s->rtl);
23713 }
23714 \f
23715 /* Calculate the length of the memory address in the instruction encoding.
23716 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23717 or other prefixes. */
23718
23719 int
23720 memory_address_length (rtx addr)
23721 {
23722 struct ix86_address parts;
23723 rtx base, index, disp;
23724 int len;
23725 int ok;
23726
23727 if (GET_CODE (addr) == PRE_DEC
23728 || GET_CODE (addr) == POST_INC
23729 || GET_CODE (addr) == PRE_MODIFY
23730 || GET_CODE (addr) == POST_MODIFY)
23731 return 0;
23732
23733 ok = ix86_decompose_address (addr, &parts);
23734 gcc_assert (ok);
23735
23736 if (parts.base && GET_CODE (parts.base) == SUBREG)
23737 parts.base = SUBREG_REG (parts.base);
23738 if (parts.index && GET_CODE (parts.index) == SUBREG)
23739 parts.index = SUBREG_REG (parts.index);
23740
23741 base = parts.base;
23742 index = parts.index;
23743 disp = parts.disp;
23744
23745 /* Add length of addr32 prefix. */
23746 len = (GET_CODE (addr) == ZERO_EXTEND
23747 || GET_CODE (addr) == AND);
23748
23749 /* Rule of thumb:
23750 - esp as the base always wants an index,
23751 - ebp as the base always wants a displacement,
23752 - r12 as the base always wants an index,
23753 - r13 as the base always wants a displacement. */
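/* Illustrative examples: a plain (%eax) needs no extra bytes, so this
   function returns 0 for it; (%esp) needs a SIB byte and (%ebp) a zero
   disp8, so both return 1; 16(%esp,%eax) needs a SIB byte plus a disp8
   and returns 2.  */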
23754
23755 /* Register Indirect. */
23756 if (base && !index && !disp)
23757 {
23758 /* esp (for its index) and ebp (for its displacement) need
23759 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23760 code. */
23761 if (REG_P (addr)
23762 && (addr == arg_pointer_rtx
23763 || addr == frame_pointer_rtx
23764 || REGNO (addr) == SP_REG
23765 || REGNO (addr) == BP_REG
23766 || REGNO (addr) == R12_REG
23767 || REGNO (addr) == R13_REG))
23768 len = 1;
23769 }
23770
23771 /* Direct Addressing. In 64-bit mode, mod 00 with r/m 5 means
23772 disp32(%rip) rather than plain disp32, so for an absolute disp32
23773 a SIB byte is needed, unless print_operand_address
23774 optimizes it into disp32(%rip) or (%rip) is implied
23775 by the UNSPEC. */
23776 else if (disp && !base && !index)
23777 {
23778 len = 4;
23779 if (TARGET_64BIT)
23780 {
23781 rtx symbol = disp;
23782
23783 if (GET_CODE (disp) == CONST)
23784 symbol = XEXP (disp, 0);
23785 if (GET_CODE (symbol) == PLUS
23786 && CONST_INT_P (XEXP (symbol, 1)))
23787 symbol = XEXP (symbol, 0);
23788
23789 if (GET_CODE (symbol) != LABEL_REF
23790 && (GET_CODE (symbol) != SYMBOL_REF
23791 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23792 && (GET_CODE (symbol) != UNSPEC
23793 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23794 && XINT (symbol, 1) != UNSPEC_PCREL
23795 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23796 len += 1;
23797 }
23798 }
23799
23800 else
23801 {
23802 /* Find the length of the displacement constant. */
23803 if (disp)
23804 {
23805 if (base && satisfies_constraint_K (disp))
23806 len = 1;
23807 else
23808 len = 4;
23809 }
23810 /* ebp always wants a displacement. Similarly r13. */
23811 else if (base && REG_P (base)
23812 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23813 len = 1;
23814
23815 /* An index requires the two-byte modrm form.... */
23816 if (index
23817 /* ...like esp (or r12), which always wants an index. */
23818 || base == arg_pointer_rtx
23819 || base == frame_pointer_rtx
23820 || (base && REG_P (base)
23821 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23822 len += 1;
23823 }
23824
23825 switch (parts.seg)
23826 {
23827 case SEG_FS:
23828 case SEG_GS:
23829 len += 1;
23830 break;
23831 default:
23832 break;
23833 }
23834
23835 return len;
23836 }
23837
23838 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23839 is set, expect that the insn has an 8-bit immediate alternative. */
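/* Example (illustrative): for "addl $5, %eax" with SHORTFORM set the
   immediate fits in a signed byte, so the attribute is 1; for
   "addl $1000, %eax" it does not, so the attribute is 4.  */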
23840 int
23841 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23842 {
23843 int len = 0;
23844 int i;
23845 extract_insn_cached (insn);
23846 for (i = recog_data.n_operands - 1; i >= 0; --i)
23847 if (CONSTANT_P (recog_data.operand[i]))
23848 {
23849 enum attr_mode mode = get_attr_mode (insn);
23850
23851 gcc_assert (!len);
23852 if (shortform && CONST_INT_P (recog_data.operand[i]))
23853 {
23854 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23855 switch (mode)
23856 {
23857 case MODE_QI:
23858 len = 1;
23859 continue;
23860 case MODE_HI:
23861 ival = trunc_int_for_mode (ival, HImode);
23862 break;
23863 case MODE_SI:
23864 ival = trunc_int_for_mode (ival, SImode);
23865 break;
23866 default:
23867 break;
23868 }
23869 if (IN_RANGE (ival, -128, 127))
23870 {
23871 len = 1;
23872 continue;
23873 }
23874 }
23875 switch (mode)
23876 {
23877 case MODE_QI:
23878 len = 1;
23879 break;
23880 case MODE_HI:
23881 len = 2;
23882 break;
23883 case MODE_SI:
23884 len = 4;
23885 break;
23886 /* Immediates for DImode instructions are encoded as 32-bit sign-extended values. */
23887 case MODE_DI:
23888 len = 4;
23889 break;
23890 default:
23891 fatal_insn ("unknown insn mode", insn);
23892 }
23893 }
23894 return len;
23895 }
23896 /* Compute default value for "length_address" attribute. */
23897 int
23898 ix86_attr_length_address_default (rtx insn)
23899 {
23900 int i;
23901
23902 if (get_attr_type (insn) == TYPE_LEA)
23903 {
23904 rtx set = PATTERN (insn), addr;
23905
23906 if (GET_CODE (set) == PARALLEL)
23907 set = XVECEXP (set, 0, 0);
23908
23909 gcc_assert (GET_CODE (set) == SET);
23910
23911 addr = SET_SRC (set);
23912 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23913 {
23914 if (GET_CODE (addr) == ZERO_EXTEND)
23915 addr = XEXP (addr, 0);
23916 if (GET_CODE (addr) == SUBREG)
23917 addr = SUBREG_REG (addr);
23918 }
23919
23920 return memory_address_length (addr);
23921 }
23922
23923 extract_insn_cached (insn);
23924 for (i = recog_data.n_operands - 1; i >= 0; --i)
23925 if (MEM_P (recog_data.operand[i]))
23926 {
23927 constrain_operands_cached (reload_completed);
23928 if (which_alternative != -1)
23929 {
23930 const char *constraints = recog_data.constraints[i];
23931 int alt = which_alternative;
23932
23933 while (*constraints == '=' || *constraints == '+')
23934 constraints++;
23935 while (alt-- > 0)
23936 while (*constraints++ != ',')
23937 ;
23938 /* Skip ignored operands. */
23939 if (*constraints == 'X')
23940 continue;
23941 }
23942 return memory_address_length (XEXP (recog_data.operand[i], 0));
23943 }
23944 return 0;
23945 }
23946
23947 /* Compute default value for "length_vex" attribute. It includes
23948 the 2- or 3-byte VEX prefix and 1 opcode byte. */
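/* Example (illustrative): "vaddps %xmm1, %xmm2, %xmm0" can use the 2-byte
   VEX prefix, so the attribute is 2 + 1 = 3; a DImode general register
   operand (needing REX.W) or a memory operand whose address uses
   %r8..%r15 (needing REX.X/REX.B) forces the 3-byte prefix, giving
   3 + 1 = 4.  */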
23949
23950 int
23951 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23952 {
23953 int i;
23954
23955 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX W bit
23956 requires the 3-byte VEX prefix. */
23957 if (!has_0f_opcode || has_vex_w)
23958 return 3 + 1;
23959
23960 /* We can always use the 2-byte VEX prefix in 32-bit mode. */
23961 if (!TARGET_64BIT)
23962 return 2 + 1;
23963
23964 extract_insn_cached (insn);
23965
23966 for (i = recog_data.n_operands - 1; i >= 0; --i)
23967 if (REG_P (recog_data.operand[i]))
23968 {
23969 /* REX.W bit uses 3 byte VEX prefix. */
23970 if (GET_MODE (recog_data.operand[i]) == DImode
23971 && GENERAL_REG_P (recog_data.operand[i]))
23972 return 3 + 1;
23973 }
23974 else
23975 {
23976 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23977 if (MEM_P (recog_data.operand[i])
23978 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23979 return 3 + 1;
23980 }
23981
23982 return 2 + 1;
23983 }
23984 \f
23985 /* Return the maximum number of instructions a cpu can issue. */
23986
23987 static int
23988 ix86_issue_rate (void)
23989 {
23990 switch (ix86_tune)
23991 {
23992 case PROCESSOR_PENTIUM:
23993 case PROCESSOR_ATOM:
23994 case PROCESSOR_K6:
23995 return 2;
23996
23997 case PROCESSOR_PENTIUMPRO:
23998 case PROCESSOR_PENTIUM4:
23999 case PROCESSOR_CORE2_32:
24000 case PROCESSOR_CORE2_64:
24001 case PROCESSOR_COREI7_32:
24002 case PROCESSOR_COREI7_64:
24003 case PROCESSOR_ATHLON:
24004 case PROCESSOR_K8:
24005 case PROCESSOR_AMDFAM10:
24006 case PROCESSOR_NOCONA:
24007 case PROCESSOR_GENERIC32:
24008 case PROCESSOR_GENERIC64:
24009 case PROCESSOR_BDVER1:
24010 case PROCESSOR_BDVER2:
24011 case PROCESSOR_BTVER1:
24012 return 3;
24013
24014 default:
24015 return 1;
24016 }
24017 }
24018
24019 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
24020 set by DEP_INSN and nothing else set by DEP_INSN. */
24021
24022 static bool
24023 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24024 {
24025 rtx set, set2;
24026
24027 /* Simplify the test for uninteresting insns. */
24028 if (insn_type != TYPE_SETCC
24029 && insn_type != TYPE_ICMOV
24030 && insn_type != TYPE_FCMOV
24031 && insn_type != TYPE_IBR)
24032 return false;
24033
24034 if ((set = single_set (dep_insn)) != 0)
24035 {
24036 set = SET_DEST (set);
24037 set2 = NULL_RTX;
24038 }
24039 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24040 && XVECLEN (PATTERN (dep_insn), 0) == 2
24041 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24042 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24043 {
24044 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24045 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24046 }
24047 else
24048 return false;
24049
24050 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24051 return false;
24052
24053 /* This test is true if the dependent insn reads the flags but
24054 not any other potentially set register. */
24055 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24056 return false;
24057
24058 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24059 return false;
24060
24061 return true;
24062 }
24063
24064 /* Return true iff USE_INSN has a memory address with operands set by
24065 SET_INSN. */
24066
24067 bool
24068 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24069 {
24070 int i;
24071 extract_insn_cached (use_insn);
24072 for (i = recog_data.n_operands - 1; i >= 0; --i)
24073 if (MEM_P (recog_data.operand[i]))
24074 {
24075 rtx addr = XEXP (recog_data.operand[i], 0);
24076 return modified_in_p (addr, set_insn) != 0;
24077 }
24078 return false;
24079 }
24080
24081 static int
24082 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24083 {
24084 enum attr_type insn_type, dep_insn_type;
24085 enum attr_memory memory;
24086 rtx set, set2;
24087 int dep_insn_code_number;
24088
24089 /* Anti and output dependencies have zero cost on all CPUs. */
24090 if (REG_NOTE_KIND (link) != 0)
24091 return 0;
24092
24093 dep_insn_code_number = recog_memoized (dep_insn);
24094
24095 /* If we can't recognize the insns, we can't really do anything. */
24096 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24097 return cost;
24098
24099 insn_type = get_attr_type (insn);
24100 dep_insn_type = get_attr_type (dep_insn);
24101
24102 switch (ix86_tune)
24103 {
24104 case PROCESSOR_PENTIUM:
24105 /* Address Generation Interlock adds a cycle of latency. */
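/* Example (illustrative): on the original Pentium a load such as
   "movl (%eax), %ebx" issued right after an "addl $4, %eax" that
   computes the address pays this extra cycle.  */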
24106 if (insn_type == TYPE_LEA)
24107 {
24108 rtx addr = PATTERN (insn);
24109
24110 if (GET_CODE (addr) == PARALLEL)
24111 addr = XVECEXP (addr, 0, 0);
24112
24113 gcc_assert (GET_CODE (addr) == SET);
24114
24115 addr = SET_SRC (addr);
24116 if (modified_in_p (addr, dep_insn))
24117 cost += 1;
24118 }
24119 else if (ix86_agi_dependent (dep_insn, insn))
24120 cost += 1;
24121
24122 /* ??? Compares pair with jump/setcc. */
24123 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24124 cost = 0;
24125
24126 /* Floating point stores require the value to be ready one cycle earlier. */
24127 if (insn_type == TYPE_FMOV
24128 && get_attr_memory (insn) == MEMORY_STORE
24129 && !ix86_agi_dependent (dep_insn, insn))
24130 cost += 1;
24131 break;
24132
24133 case PROCESSOR_PENTIUMPRO:
24134 memory = get_attr_memory (insn);
24135
24136 /* INT->FP conversion is expensive. */
24137 if (get_attr_fp_int_src (dep_insn))
24138 cost += 5;
24139
24140 /* There is one cycle extra latency between an FP op and a store. */
24141 if (insn_type == TYPE_FMOV
24142 && (set = single_set (dep_insn)) != NULL_RTX
24143 && (set2 = single_set (insn)) != NULL_RTX
24144 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24145 && MEM_P (SET_DEST (set2)))
24146 cost += 1;
24147
24148 /* Show ability of the reorder buffer to hide the latency of a load by
24149 executing it in parallel with the previous instruction when the
24150 previous instruction is not needed to compute the address. */
24151 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24152 && !ix86_agi_dependent (dep_insn, insn))
24153 {
24154 /* Claim moves to take one cycle, as the core can issue one load
24155 at a time and the next load can start a cycle later. */
24156 if (dep_insn_type == TYPE_IMOV
24157 || dep_insn_type == TYPE_FMOV)
24158 cost = 1;
24159 else if (cost > 1)
24160 cost--;
24161 }
24162 break;
24163
24164 case PROCESSOR_K6:
24165 memory = get_attr_memory (insn);
24166
24167 /* The esp dependency is resolved before the instruction is really
24168 finished. */
24169 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24170 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24171 return 1;
24172
24173 /* INT->FP conversion is expensive. */
24174 if (get_attr_fp_int_src (dep_insn))
24175 cost += 5;
24176
24177 /* Show ability of the reorder buffer to hide the latency of a load by
24178 executing it in parallel with the previous instruction when the
24179 previous instruction is not needed to compute the address. */
24180 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24181 && !ix86_agi_dependent (dep_insn, insn))
24182 {
24183 /* Claim moves to take one cycle, as the core can issue one load
24184 at a time and the next load can start a cycle later. */
24185 if (dep_insn_type == TYPE_IMOV
24186 || dep_insn_type == TYPE_FMOV)
24187 cost = 1;
24188 else if (cost > 2)
24189 cost -= 2;
24190 else
24191 cost = 1;
24192 }
24193 break;
24194
24195 case PROCESSOR_ATHLON:
24196 case PROCESSOR_K8:
24197 case PROCESSOR_AMDFAM10:
24198 case PROCESSOR_BDVER1:
24199 case PROCESSOR_BDVER2:
24200 case PROCESSOR_BTVER1:
24201 case PROCESSOR_ATOM:
24202 case PROCESSOR_GENERIC32:
24203 case PROCESSOR_GENERIC64:
24204 memory = get_attr_memory (insn);
24205
24206 /* Show ability of the reorder buffer to hide the latency of a load by
24207 executing it in parallel with the previous instruction when the
24208 previous instruction is not needed to compute the address. */
24209 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24210 && !ix86_agi_dependent (dep_insn, insn))
24211 {
24212 enum attr_unit unit = get_attr_unit (insn);
24213 int loadcost = 3;
24214
24215 /* Because of the difference between the length of integer and
24216 floating unit pipeline preparation stages, the memory operands
24217 for floating point are cheaper.
24218
24219 ??? For Athlon the difference is most probably 2. */
24220 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24221 loadcost = 3;
24222 else
24223 loadcost = TARGET_ATHLON ? 2 : 0;
24224
24225 if (cost >= loadcost)
24226 cost -= loadcost;
24227 else
24228 cost = 0;
24229 }
24230
24231 default:
24232 break;
24233 }
24234
24235 return cost;
24236 }
24237
24238 /* How many alternative schedules to try. This should be as wide as the
24239 scheduling freedom in the DFA, but no wider. Making this value too
24240 large results in extra work for the scheduler. */
24241
24242 static int
24243 ia32_multipass_dfa_lookahead (void)
24244 {
24245 switch (ix86_tune)
24246 {
24247 case PROCESSOR_PENTIUM:
24248 return 2;
24249
24250 case PROCESSOR_PENTIUMPRO:
24251 case PROCESSOR_K6:
24252 return 1;
24253
24254 case PROCESSOR_CORE2_32:
24255 case PROCESSOR_CORE2_64:
24256 case PROCESSOR_COREI7_32:
24257 case PROCESSOR_COREI7_64:
24258 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24259 as the number of instructions that can be executed in a cycle, i.e.,
24260 issue_rate. I wonder why tuning for many CPUs does not do this. */
24261 return ix86_issue_rate ();
24262
24263 default:
24264 return 0;
24265 }
24266 }
24267
24268 \f
24269
24270 /* Model decoder of Core 2/i7.
24271 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
24272 track the instruction fetch block boundaries and make sure that long
24273 (9+ bytes) instructions are assigned to D0. */
24274
24275 /* Maximum length of an insn that can be handled by
24276 a secondary decoder unit. '8' for Core 2/i7. */
24277 static int core2i7_secondary_decoder_max_insn_size;
24278
24279 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24280 '16' for Core 2/i7. */
24281 static int core2i7_ifetch_block_size;
24282
24283 /* Maximum number of instructions decoder can handle per cycle.
24284 '6' for Core 2/i7. */
24285 static int core2i7_ifetch_block_max_insns;
24286
24287 typedef struct ix86_first_cycle_multipass_data_ *
24288 ix86_first_cycle_multipass_data_t;
24289 typedef const struct ix86_first_cycle_multipass_data_ *
24290 const_ix86_first_cycle_multipass_data_t;
24291
24292 /* A variable to store target state across calls to max_issue within
24293 one cycle. */
24294 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24295 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24296
24297 /* Initialize DATA. */
24298 static void
24299 core2i7_first_cycle_multipass_init (void *_data)
24300 {
24301 ix86_first_cycle_multipass_data_t data
24302 = (ix86_first_cycle_multipass_data_t) _data;
24303
24304 data->ifetch_block_len = 0;
24305 data->ifetch_block_n_insns = 0;
24306 data->ready_try_change = NULL;
24307 data->ready_try_change_size = 0;
24308 }
24309
24310 /* Advancing the cycle; reset ifetch block counts. */
24311 static void
24312 core2i7_dfa_post_advance_cycle (void)
24313 {
24314 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24315
24316 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24317
24318 data->ifetch_block_len = 0;
24319 data->ifetch_block_n_insns = 0;
24320 }
24321
24322 static int min_insn_size (rtx);
24323
24324 /* Filter out insns from ready_try that the core will not be able to issue
24325 on current cycle due to decoder. */
24326 static void
24327 core2i7_first_cycle_multipass_filter_ready_try
24328 (const_ix86_first_cycle_multipass_data_t data,
24329 char *ready_try, int n_ready, bool first_cycle_insn_p)
24330 {
24331 while (n_ready--)
24332 {
24333 rtx insn;
24334 int insn_size;
24335
24336 if (ready_try[n_ready])
24337 continue;
24338
24339 insn = get_ready_element (n_ready);
24340 insn_size = min_insn_size (insn);
24341
24342 if (/* If this insn is too long for a secondary decoder ... */
24343 (!first_cycle_insn_p
24344 && insn_size > core2i7_secondary_decoder_max_insn_size)
24345 /* ... or it would not fit into the ifetch block ... */
24346 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24347 /* ... or the decoder is full already ... */
24348 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24349 /* ... mask the insn out. */
24350 {
24351 ready_try[n_ready] = 1;
24352
24353 if (data->ready_try_change)
24354 SET_BIT (data->ready_try_change, n_ready);
24355 }
24356 }
24357 }
24358
24359 /* Prepare for a new round of multipass lookahead scheduling. */
24360 static void
24361 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24362 bool first_cycle_insn_p)
24363 {
24364 ix86_first_cycle_multipass_data_t data
24365 = (ix86_first_cycle_multipass_data_t) _data;
24366 const_ix86_first_cycle_multipass_data_t prev_data
24367 = ix86_first_cycle_multipass_data;
24368
24369 /* Restore the state from the end of the previous round. */
24370 data->ifetch_block_len = prev_data->ifetch_block_len;
24371 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24372
24373 /* Filter instructions that cannot be issued on current cycle due to
24374 decoder restrictions. */
24375 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24376 first_cycle_insn_p);
24377 }
24378
24379 /* INSN is being issued in current solution. Account for its impact on
24380 the decoder model. */
24381 static void
24382 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24383 rtx insn, const void *_prev_data)
24384 {
24385 ix86_first_cycle_multipass_data_t data
24386 = (ix86_first_cycle_multipass_data_t) _data;
24387 const_ix86_first_cycle_multipass_data_t prev_data
24388 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24389
24390 int insn_size = min_insn_size (insn);
24391
24392 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24393 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24394 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24395 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24396
24397 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24398 if (!data->ready_try_change)
24399 {
24400 data->ready_try_change = sbitmap_alloc (n_ready);
24401 data->ready_try_change_size = n_ready;
24402 }
24403 else if (data->ready_try_change_size < n_ready)
24404 {
24405 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24406 n_ready, 0);
24407 data->ready_try_change_size = n_ready;
24408 }
24409 sbitmap_zero (data->ready_try_change);
24410
24411 /* Filter out insns from ready_try that the core will not be able to issue
24412 on current cycle due to decoder. */
24413 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24414 false);
24415 }
24416
24417 /* Revert the effect on ready_try. */
24418 static void
24419 core2i7_first_cycle_multipass_backtrack (const void *_data,
24420 char *ready_try,
24421 int n_ready ATTRIBUTE_UNUSED)
24422 {
24423 const_ix86_first_cycle_multipass_data_t data
24424 = (const_ix86_first_cycle_multipass_data_t) _data;
24425 unsigned int i = 0;
24426 sbitmap_iterator sbi;
24427
24428 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
24429 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
24430 {
24431 ready_try[i] = 0;
24432 }
24433 }
24434
24435 /* Save the result of multipass lookahead scheduling for the next round. */
24436 static void
24437 core2i7_first_cycle_multipass_end (const void *_data)
24438 {
24439 const_ix86_first_cycle_multipass_data_t data
24440 = (const_ix86_first_cycle_multipass_data_t) _data;
24441 ix86_first_cycle_multipass_data_t next_data
24442 = ix86_first_cycle_multipass_data;
24443
24444 if (data != NULL)
24445 {
24446 next_data->ifetch_block_len = data->ifetch_block_len;
24447 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24448 }
24449 }
24450
24451 /* Deallocate target data. */
24452 static void
24453 core2i7_first_cycle_multipass_fini (void *_data)
24454 {
24455 ix86_first_cycle_multipass_data_t data
24456 = (ix86_first_cycle_multipass_data_t) _data;
24457
24458 if (data->ready_try_change)
24459 {
24460 sbitmap_free (data->ready_try_change);
24461 data->ready_try_change = NULL;
24462 data->ready_try_change_size = 0;
24463 }
24464 }
24465
24466 /* Prepare for scheduling pass. */
24467 static void
24468 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24469 int verbose ATTRIBUTE_UNUSED,
24470 int max_uid ATTRIBUTE_UNUSED)
24471 {
24472 /* Install scheduling hooks for current CPU. Some of these hooks are used
24473 in time-critical parts of the scheduler, so we only set them up when
24474 they are actually used. */
24475 switch (ix86_tune)
24476 {
24477 case PROCESSOR_CORE2_32:
24478 case PROCESSOR_CORE2_64:
24479 case PROCESSOR_COREI7_32:
24480 case PROCESSOR_COREI7_64:
24481 targetm.sched.dfa_post_advance_cycle
24482 = core2i7_dfa_post_advance_cycle;
24483 targetm.sched.first_cycle_multipass_init
24484 = core2i7_first_cycle_multipass_init;
24485 targetm.sched.first_cycle_multipass_begin
24486 = core2i7_first_cycle_multipass_begin;
24487 targetm.sched.first_cycle_multipass_issue
24488 = core2i7_first_cycle_multipass_issue;
24489 targetm.sched.first_cycle_multipass_backtrack
24490 = core2i7_first_cycle_multipass_backtrack;
24491 targetm.sched.first_cycle_multipass_end
24492 = core2i7_first_cycle_multipass_end;
24493 targetm.sched.first_cycle_multipass_fini
24494 = core2i7_first_cycle_multipass_fini;
24495
24496 /* Set decoder parameters. */
24497 core2i7_secondary_decoder_max_insn_size = 8;
24498 core2i7_ifetch_block_size = 16;
24499 core2i7_ifetch_block_max_insns = 6;
24500 break;
24501
24502 default:
24503 targetm.sched.dfa_post_advance_cycle = NULL;
24504 targetm.sched.first_cycle_multipass_init = NULL;
24505 targetm.sched.first_cycle_multipass_begin = NULL;
24506 targetm.sched.first_cycle_multipass_issue = NULL;
24507 targetm.sched.first_cycle_multipass_backtrack = NULL;
24508 targetm.sched.first_cycle_multipass_end = NULL;
24509 targetm.sched.first_cycle_multipass_fini = NULL;
24510 break;
24511 }
24512 }
24513
24514 \f
24515 /* Compute the alignment given to a constant that is being placed in memory.
24516 EXP is the constant and ALIGN is the alignment that the object would
24517 ordinarily have.
24518 The value of this function is used instead of that alignment to align
24519 the object. */
24520
24521 int
24522 ix86_constant_alignment (tree exp, int align)
24523 {
24524 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24525 || TREE_CODE (exp) == INTEGER_CST)
24526 {
24527 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24528 return 64;
24529 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24530 return 128;
24531 }
24532 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24533 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24534 return BITS_PER_WORD;
24535
24536 return align;
24537 }
24538
24539 /* Compute the alignment for a static variable.
24540 TYPE is the data type, and ALIGN is the alignment that
24541 the object would ordinarily have. The value of this function is used
24542 instead of that alignment to align the object. */
24543
24544 int
24545 ix86_data_alignment (tree type, int align)
24546 {
24547 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24548
24549 if (AGGREGATE_TYPE_P (type)
24550 && TYPE_SIZE (type)
24551 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24552 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24553 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24554 && align < max_align)
24555 align = max_align;
24556
24557 /* x86-64 ABI requires arrays of 16 bytes or more to be aligned
24558 to a 16-byte boundary. */
24559 if (TARGET_64BIT)
24560 {
24561 if (AGGREGATE_TYPE_P (type)
24562 && TYPE_SIZE (type)
24563 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24564 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24565 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24566 return 128;
24567 }
24568
24569 if (TREE_CODE (type) == ARRAY_TYPE)
24570 {
24571 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24572 return 64;
24573 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24574 return 128;
24575 }
24576 else if (TREE_CODE (type) == COMPLEX_TYPE)
24577 {
24578
24579 if (TYPE_MODE (type) == DCmode && align < 64)
24580 return 64;
24581 if ((TYPE_MODE (type) == XCmode
24582 || TYPE_MODE (type) == TCmode) && align < 128)
24583 return 128;
24584 }
24585 else if ((TREE_CODE (type) == RECORD_TYPE
24586 || TREE_CODE (type) == UNION_TYPE
24587 || TREE_CODE (type) == QUAL_UNION_TYPE)
24588 && TYPE_FIELDS (type))
24589 {
24590 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24591 return 64;
24592 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24593 return 128;
24594 }
24595 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24596 || TREE_CODE (type) == INTEGER_TYPE)
24597 {
24598 if (TYPE_MODE (type) == DFmode && align < 64)
24599 return 64;
24600 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24601 return 128;
24602 }
24603
24604 return align;
24605 }
24606
24607 /* Compute the alignment for a local variable or a stack slot. EXP is
24608 the data type or decl itself, MODE is the widest mode available and
24609 ALIGN is the alignment that the object would ordinarily have. The
24610 value of this macro is used instead of that alignment to align the
24611 object. */
24612
24613 unsigned int
24614 ix86_local_alignment (tree exp, enum machine_mode mode,
24615 unsigned int align)
24616 {
24617 tree type, decl;
24618
24619 if (exp && DECL_P (exp))
24620 {
24621 type = TREE_TYPE (exp);
24622 decl = exp;
24623 }
24624 else
24625 {
24626 type = exp;
24627 decl = NULL;
24628 }
24629
24630 /* Don't do dynamic stack realignment for long long objects with
24631 -mpreferred-stack-boundary=2. */
24632 if (!TARGET_64BIT
24633 && align == 64
24634 && ix86_preferred_stack_boundary < 64
24635 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24636 && (!type || !TYPE_USER_ALIGN (type))
24637 && (!decl || !DECL_USER_ALIGN (decl)))
24638 align = 32;
24639
24640 /* If TYPE is NULL, we are allocating a stack slot for caller-save
24641 register in MODE. We will return the largest alignment of XF
24642 and DF. */
24643 if (!type)
24644 {
24645 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24646 align = GET_MODE_ALIGNMENT (DFmode);
24647 return align;
24648 }
24649
24650 /* x86-64 ABI requires arrays of 16 bytes or more to be aligned
24651 to a 16-byte boundary. Exact wording is:
24652
24653 An array uses the same alignment as its elements, except that a local or
24654 global array variable of length at least 16 bytes or
24655 a C99 variable-length array variable always has alignment of at least 16 bytes.
24656
24657 This was added to allow use of aligned SSE instructions on arrays. This
24658 rule is meant for static storage (where the compiler cannot do the
24659 analysis by itself). We follow it for automatic variables only when
24660 convenient. We fully control everything in the function being compiled,
24661 and functions from other units cannot rely on the alignment.
24662
24663 Exclude the va_list type. It is the common case of a local array where
24664 we cannot benefit from the alignment. */
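/* Example (illustrative): when optimizing for speed on x86-64 with SSE
   enabled, a local "double buf[32]" (256 bytes) satisfies the size test
   below and is therefore given 128-bit alignment, so aligned SSE loads
   and stores can be used on it.  */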
24665 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24666 && TARGET_SSE)
24667 {
24668 if (AGGREGATE_TYPE_P (type)
24669 && (va_list_type_node == NULL_TREE
24670 || (TYPE_MAIN_VARIANT (type)
24671 != TYPE_MAIN_VARIANT (va_list_type_node)))
24672 && TYPE_SIZE (type)
24673 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24674 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24675 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24676 return 128;
24677 }
24678 if (TREE_CODE (type) == ARRAY_TYPE)
24679 {
24680 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24681 return 64;
24682 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24683 return 128;
24684 }
24685 else if (TREE_CODE (type) == COMPLEX_TYPE)
24686 {
24687 if (TYPE_MODE (type) == DCmode && align < 64)
24688 return 64;
24689 if ((TYPE_MODE (type) == XCmode
24690 || TYPE_MODE (type) == TCmode) && align < 128)
24691 return 128;
24692 }
24693 else if ((TREE_CODE (type) == RECORD_TYPE
24694 || TREE_CODE (type) == UNION_TYPE
24695 || TREE_CODE (type) == QUAL_UNION_TYPE)
24696 && TYPE_FIELDS (type))
24697 {
24698 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24699 return 64;
24700 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24701 return 128;
24702 }
24703 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24704 || TREE_CODE (type) == INTEGER_TYPE)
24705 {
24706
24707 if (TYPE_MODE (type) == DFmode && align < 64)
24708 return 64;
24709 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24710 return 128;
24711 }
24712 return align;
24713 }
24714
24715 /* Compute the minimum required alignment for dynamic stack realignment
24716 purposes for a local variable, parameter or a stack slot. EXP is
24717 the data type or decl itself, MODE is its mode and ALIGN is the
24718 alignment that the object would ordinarily have. */
24719
24720 unsigned int
24721 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24722 unsigned int align)
24723 {
24724 tree type, decl;
24725
24726 if (exp && DECL_P (exp))
24727 {
24728 type = TREE_TYPE (exp);
24729 decl = exp;
24730 }
24731 else
24732 {
24733 type = exp;
24734 decl = NULL;
24735 }
24736
24737 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24738 return align;
24739
24740 /* Don't do dynamic stack realignment for long long objects with
24741 -mpreferred-stack-boundary=2. */
24742 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24743 && (!type || !TYPE_USER_ALIGN (type))
24744 && (!decl || !DECL_USER_ALIGN (decl)))
24745 return 32;
24746
24747 return align;
24748 }
24749 \f
24750 /* Find a location for the static chain incoming to a nested function.
24751 This is a register, unless all free registers are used by arguments. */
24752
24753 static rtx
24754 ix86_static_chain (const_tree fndecl, bool incoming_p)
24755 {
24756 unsigned regno;
24757
24758 if (!DECL_STATIC_CHAIN (fndecl))
24759 return NULL;
24760
24761 if (TARGET_64BIT)
24762 {
24763 /* We always use R10 in 64-bit mode. */
24764 regno = R10_REG;
24765 }
24766 else
24767 {
24768 tree fntype;
24769 unsigned int ccvt;
24770
24771 /* By default in 32-bit mode we use ECX to pass the static chain. */
24772 regno = CX_REG;
24773
24774 fntype = TREE_TYPE (fndecl);
24775 ccvt = ix86_get_callcvt (fntype);
24776 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24777 {
24778 /* Fastcall functions use ecx/edx for arguments, which leaves
24779 us with EAX for the static chain.
24780 Thiscall functions use ecx for arguments, which also
24781 leaves us with EAX for the static chain. */
24782 regno = AX_REG;
24783 }
24784 else if (ix86_function_regparm (fntype, fndecl) == 3)
24785 {
24786 /* For regparm 3, we have no free call-clobbered registers in
24787 which to store the static chain. In order to implement this,
24788 we have the trampoline push the static chain to the stack.
24789 However, we can't push a value below the return address when
24790 we call the nested function directly, so we have to use an
24791 alternate entry point. For this we use ESI, and have the
24792 alternate entry point push ESI, so that things appear the
24793 same once we're executing the nested function. */
24794 if (incoming_p)
24795 {
24796 if (fndecl == current_function_decl)
24797 ix86_static_chain_on_stack = true;
24798 return gen_frame_mem (SImode,
24799 plus_constant (arg_pointer_rtx, -8));
24800 }
24801 regno = SI_REG;
24802 }
24803 }
24804
24805 return gen_rtx_REG (Pmode, regno);
24806 }
24807
24808 /* Emit RTL insns to initialize the variable parts of a trampoline.
24809 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24810 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24811 to be passed to the target function. */
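/* For illustration, in the 64-bit (non-x32) case the bytes stored below
   form the sequence
       49 bb <imm64>   movabs $fnaddr, %r11
       49 ba <imm64>   movabs $chain,  %r10
       49 ff e3        jmp    *%r11
       90              nop
   (the movabs for FNADDR becomes "41 bb <imm32>", a movl to %r11d, when
   the address fits in 32 bits).  */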
24812
24813 static void
24814 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24815 {
24816 rtx mem, fnaddr;
24817 int opcode;
24818 int offset = 0;
24819
24820 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24821
24822 if (TARGET_64BIT)
24823 {
24824 int size;
24825
24826 /* Load the function address to r11. Try to load the address using
24827 the shorter movl instead of movabs. We may want to support
24828 movq for kernel mode, but the kernel does not use trampolines at
24829 the moment. */
24830 if (x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24831 {
24832 fnaddr = copy_to_mode_reg (DImode, fnaddr);
24833
24834 mem = adjust_address (m_tramp, HImode, offset);
24835 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24836
24837 mem = adjust_address (m_tramp, SImode, offset + 2);
24838 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24839 offset += 6;
24840 }
24841 else
24842 {
24843 mem = adjust_address (m_tramp, HImode, offset);
24844 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24845
24846 mem = adjust_address (m_tramp, DImode, offset + 2);
24847 emit_move_insn (mem, fnaddr);
24848 offset += 10;
24849 }
24850
24851 /* Load static chain using movabs to r10. Use the
24852 shorter movl instead of movabs for x32. */
24853 if (TARGET_X32)
24854 {
24855 opcode = 0xba41;
24856 size = 6;
24857 }
24858 else
24859 {
24860 opcode = 0xba49;
24861 size = 10;
24862 }
24863
24864 mem = adjust_address (m_tramp, HImode, offset);
24865 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24866
24867 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24868 emit_move_insn (mem, chain_value);
24869 offset += size;
24870
24871 /* Jump to r11; the last (unused) byte is a nop, only there to
24872 pad the write out to a single 32-bit store. */
24873 mem = adjust_address (m_tramp, SImode, offset);
24874 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24875 offset += 4;
24876 }
24877 else
24878 {
24879 rtx disp, chain;
24880
24881 /* Depending on the static chain location, either load a register
24882 with a constant, or push the constant to the stack. All of the
24883 instructions are the same size. */
24884 chain = ix86_static_chain (fndecl, true);
24885 if (REG_P (chain))
24886 {
24887 switch (REGNO (chain))
24888 {
24889 case AX_REG:
24890 opcode = 0xb8; break;
24891 case CX_REG:
24892 opcode = 0xb9; break;
24893 default:
24894 gcc_unreachable ();
24895 }
24896 }
24897 else
24898 opcode = 0x68;
24899
24900 mem = adjust_address (m_tramp, QImode, offset);
24901 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24902
24903 mem = adjust_address (m_tramp, SImode, offset + 1);
24904 emit_move_insn (mem, chain_value);
24905 offset += 5;
24906
24907 mem = adjust_address (m_tramp, QImode, offset);
24908 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24909
24910 mem = adjust_address (m_tramp, SImode, offset + 1);
24911
24912 /* Compute offset from the end of the jmp to the target function.
24913 In the case in which the trampoline stores the static chain on
24914 the stack, we need to skip the first insn which pushes the
24915 (call-saved) register static chain; this push is 1 byte. */
24916 offset += 5;
24917 disp = expand_binop (SImode, sub_optab, fnaddr,
24918 plus_constant (XEXP (m_tramp, 0),
24919 offset - (MEM_P (chain) ? 1 : 0)),
24920 NULL_RTX, 1, OPTAB_DIRECT);
24921 emit_move_insn (mem, disp);
24922 }
24923
24924 gcc_assert (offset <= TRAMPOLINE_SIZE);
24925
24926 #ifdef HAVE_ENABLE_EXECUTE_STACK
24927 #ifdef CHECK_EXECUTE_STACK_ENABLED
24928 if (CHECK_EXECUTE_STACK_ENABLED)
24929 #endif
24930 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24931 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24932 #endif
24933 }
24934 \f
24935 /* The following file contains several enumerations and data structures
24936 built from the definitions in i386-builtin-types.def. */
24937
24938 #include "i386-builtin-types.inc"
24939
24940 /* Table for the ix86 builtin non-function types. */
24941 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24942
24943 /* Retrieve an element from the above table, building some of
24944 the types lazily. */
24945
24946 static tree
24947 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24948 {
24949 unsigned int index;
24950 tree type, itype;
24951
24952 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24953
24954 type = ix86_builtin_type_tab[(int) tcode];
24955 if (type != NULL)
24956 return type;
24957
24958 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24959 if (tcode <= IX86_BT_LAST_VECT)
24960 {
24961 enum machine_mode mode;
24962
24963 index = tcode - IX86_BT_LAST_PRIM - 1;
24964 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24965 mode = ix86_builtin_type_vect_mode[index];
24966
24967 type = build_vector_type_for_mode (itype, mode);
24968 }
24969 else
24970 {
24971 int quals;
24972
24973 index = tcode - IX86_BT_LAST_VECT - 1;
24974 if (tcode <= IX86_BT_LAST_PTR)
24975 quals = TYPE_UNQUALIFIED;
24976 else
24977 quals = TYPE_QUAL_CONST;
24978
24979 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24980 if (quals != TYPE_UNQUALIFIED)
24981 itype = build_qualified_type (itype, quals);
24982
24983 type = build_pointer_type (itype);
24984 }
24985
24986 ix86_builtin_type_tab[(int) tcode] = type;
24987 return type;
24988 }
24989
24990 /* Table for the ix86 builtin function types. */
24991 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24992
24993 /* Retrieve an element from the above table, building some of
24994 the types lazily. */
24995
24996 static tree
24997 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24998 {
24999 tree type;
25000
25001 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25002
25003 type = ix86_builtin_func_type_tab[(int) tcode];
25004 if (type != NULL)
25005 return type;
25006
25007 if (tcode <= IX86_BT_LAST_FUNC)
25008 {
25009 unsigned start = ix86_builtin_func_start[(int) tcode];
25010 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25011 tree rtype, atype, args = void_list_node;
25012 unsigned i;
25013
25014 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25015 for (i = after - 1; i > start; --i)
25016 {
25017 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25018 args = tree_cons (NULL, atype, args);
25019 }
25020
25021 type = build_function_type (rtype, args);
25022 }
25023 else
25024 {
25025 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25026 enum ix86_builtin_func_type icode;
25027
25028 icode = ix86_builtin_func_alias_base[index];
25029 type = ix86_get_builtin_func_type (icode);
25030 }
25031
25032 ix86_builtin_func_type_tab[(int) tcode] = type;
25033 return type;
25034 }
25035
25036
25037 /* Codes for all the SSE/MMX builtins. */
25038 enum ix86_builtins
25039 {
25040 IX86_BUILTIN_ADDPS,
25041 IX86_BUILTIN_ADDSS,
25042 IX86_BUILTIN_DIVPS,
25043 IX86_BUILTIN_DIVSS,
25044 IX86_BUILTIN_MULPS,
25045 IX86_BUILTIN_MULSS,
25046 IX86_BUILTIN_SUBPS,
25047 IX86_BUILTIN_SUBSS,
25048
25049 IX86_BUILTIN_CMPEQPS,
25050 IX86_BUILTIN_CMPLTPS,
25051 IX86_BUILTIN_CMPLEPS,
25052 IX86_BUILTIN_CMPGTPS,
25053 IX86_BUILTIN_CMPGEPS,
25054 IX86_BUILTIN_CMPNEQPS,
25055 IX86_BUILTIN_CMPNLTPS,
25056 IX86_BUILTIN_CMPNLEPS,
25057 IX86_BUILTIN_CMPNGTPS,
25058 IX86_BUILTIN_CMPNGEPS,
25059 IX86_BUILTIN_CMPORDPS,
25060 IX86_BUILTIN_CMPUNORDPS,
25061 IX86_BUILTIN_CMPEQSS,
25062 IX86_BUILTIN_CMPLTSS,
25063 IX86_BUILTIN_CMPLESS,
25064 IX86_BUILTIN_CMPNEQSS,
25065 IX86_BUILTIN_CMPNLTSS,
25066 IX86_BUILTIN_CMPNLESS,
25067 IX86_BUILTIN_CMPNGTSS,
25068 IX86_BUILTIN_CMPNGESS,
25069 IX86_BUILTIN_CMPORDSS,
25070 IX86_BUILTIN_CMPUNORDSS,
25071
25072 IX86_BUILTIN_COMIEQSS,
25073 IX86_BUILTIN_COMILTSS,
25074 IX86_BUILTIN_COMILESS,
25075 IX86_BUILTIN_COMIGTSS,
25076 IX86_BUILTIN_COMIGESS,
25077 IX86_BUILTIN_COMINEQSS,
25078 IX86_BUILTIN_UCOMIEQSS,
25079 IX86_BUILTIN_UCOMILTSS,
25080 IX86_BUILTIN_UCOMILESS,
25081 IX86_BUILTIN_UCOMIGTSS,
25082 IX86_BUILTIN_UCOMIGESS,
25083 IX86_BUILTIN_UCOMINEQSS,
25084
25085 IX86_BUILTIN_CVTPI2PS,
25086 IX86_BUILTIN_CVTPS2PI,
25087 IX86_BUILTIN_CVTSI2SS,
25088 IX86_BUILTIN_CVTSI642SS,
25089 IX86_BUILTIN_CVTSS2SI,
25090 IX86_BUILTIN_CVTSS2SI64,
25091 IX86_BUILTIN_CVTTPS2PI,
25092 IX86_BUILTIN_CVTTSS2SI,
25093 IX86_BUILTIN_CVTTSS2SI64,
25094
25095 IX86_BUILTIN_MAXPS,
25096 IX86_BUILTIN_MAXSS,
25097 IX86_BUILTIN_MINPS,
25098 IX86_BUILTIN_MINSS,
25099
25100 IX86_BUILTIN_LOADUPS,
25101 IX86_BUILTIN_STOREUPS,
25102 IX86_BUILTIN_MOVSS,
25103
25104 IX86_BUILTIN_MOVHLPS,
25105 IX86_BUILTIN_MOVLHPS,
25106 IX86_BUILTIN_LOADHPS,
25107 IX86_BUILTIN_LOADLPS,
25108 IX86_BUILTIN_STOREHPS,
25109 IX86_BUILTIN_STORELPS,
25110
25111 IX86_BUILTIN_MASKMOVQ,
25112 IX86_BUILTIN_MOVMSKPS,
25113 IX86_BUILTIN_PMOVMSKB,
25114
25115 IX86_BUILTIN_MOVNTPS,
25116 IX86_BUILTIN_MOVNTQ,
25117
25118 IX86_BUILTIN_LOADDQU,
25119 IX86_BUILTIN_STOREDQU,
25120
25121 IX86_BUILTIN_PACKSSWB,
25122 IX86_BUILTIN_PACKSSDW,
25123 IX86_BUILTIN_PACKUSWB,
25124
25125 IX86_BUILTIN_PADDB,
25126 IX86_BUILTIN_PADDW,
25127 IX86_BUILTIN_PADDD,
25128 IX86_BUILTIN_PADDQ,
25129 IX86_BUILTIN_PADDSB,
25130 IX86_BUILTIN_PADDSW,
25131 IX86_BUILTIN_PADDUSB,
25132 IX86_BUILTIN_PADDUSW,
25133 IX86_BUILTIN_PSUBB,
25134 IX86_BUILTIN_PSUBW,
25135 IX86_BUILTIN_PSUBD,
25136 IX86_BUILTIN_PSUBQ,
25137 IX86_BUILTIN_PSUBSB,
25138 IX86_BUILTIN_PSUBSW,
25139 IX86_BUILTIN_PSUBUSB,
25140 IX86_BUILTIN_PSUBUSW,
25141
25142 IX86_BUILTIN_PAND,
25143 IX86_BUILTIN_PANDN,
25144 IX86_BUILTIN_POR,
25145 IX86_BUILTIN_PXOR,
25146
25147 IX86_BUILTIN_PAVGB,
25148 IX86_BUILTIN_PAVGW,
25149
25150 IX86_BUILTIN_PCMPEQB,
25151 IX86_BUILTIN_PCMPEQW,
25152 IX86_BUILTIN_PCMPEQD,
25153 IX86_BUILTIN_PCMPGTB,
25154 IX86_BUILTIN_PCMPGTW,
25155 IX86_BUILTIN_PCMPGTD,
25156
25157 IX86_BUILTIN_PMADDWD,
25158
25159 IX86_BUILTIN_PMAXSW,
25160 IX86_BUILTIN_PMAXUB,
25161 IX86_BUILTIN_PMINSW,
25162 IX86_BUILTIN_PMINUB,
25163
25164 IX86_BUILTIN_PMULHUW,
25165 IX86_BUILTIN_PMULHW,
25166 IX86_BUILTIN_PMULLW,
25167
25168 IX86_BUILTIN_PSADBW,
25169 IX86_BUILTIN_PSHUFW,
25170
25171 IX86_BUILTIN_PSLLW,
25172 IX86_BUILTIN_PSLLD,
25173 IX86_BUILTIN_PSLLQ,
25174 IX86_BUILTIN_PSRAW,
25175 IX86_BUILTIN_PSRAD,
25176 IX86_BUILTIN_PSRLW,
25177 IX86_BUILTIN_PSRLD,
25178 IX86_BUILTIN_PSRLQ,
25179 IX86_BUILTIN_PSLLWI,
25180 IX86_BUILTIN_PSLLDI,
25181 IX86_BUILTIN_PSLLQI,
25182 IX86_BUILTIN_PSRAWI,
25183 IX86_BUILTIN_PSRADI,
25184 IX86_BUILTIN_PSRLWI,
25185 IX86_BUILTIN_PSRLDI,
25186 IX86_BUILTIN_PSRLQI,
25187
25188 IX86_BUILTIN_PUNPCKHBW,
25189 IX86_BUILTIN_PUNPCKHWD,
25190 IX86_BUILTIN_PUNPCKHDQ,
25191 IX86_BUILTIN_PUNPCKLBW,
25192 IX86_BUILTIN_PUNPCKLWD,
25193 IX86_BUILTIN_PUNPCKLDQ,
25194
25195 IX86_BUILTIN_SHUFPS,
25196
25197 IX86_BUILTIN_RCPPS,
25198 IX86_BUILTIN_RCPSS,
25199 IX86_BUILTIN_RSQRTPS,
25200 IX86_BUILTIN_RSQRTPS_NR,
25201 IX86_BUILTIN_RSQRTSS,
25202 IX86_BUILTIN_RSQRTF,
25203 IX86_BUILTIN_SQRTPS,
25204 IX86_BUILTIN_SQRTPS_NR,
25205 IX86_BUILTIN_SQRTSS,
25206
25207 IX86_BUILTIN_UNPCKHPS,
25208 IX86_BUILTIN_UNPCKLPS,
25209
25210 IX86_BUILTIN_ANDPS,
25211 IX86_BUILTIN_ANDNPS,
25212 IX86_BUILTIN_ORPS,
25213 IX86_BUILTIN_XORPS,
25214
25215 IX86_BUILTIN_EMMS,
25216 IX86_BUILTIN_LDMXCSR,
25217 IX86_BUILTIN_STMXCSR,
25218 IX86_BUILTIN_SFENCE,
25219
25220 /* 3DNow! Original */
25221 IX86_BUILTIN_FEMMS,
25222 IX86_BUILTIN_PAVGUSB,
25223 IX86_BUILTIN_PF2ID,
25224 IX86_BUILTIN_PFACC,
25225 IX86_BUILTIN_PFADD,
25226 IX86_BUILTIN_PFCMPEQ,
25227 IX86_BUILTIN_PFCMPGE,
25228 IX86_BUILTIN_PFCMPGT,
25229 IX86_BUILTIN_PFMAX,
25230 IX86_BUILTIN_PFMIN,
25231 IX86_BUILTIN_PFMUL,
25232 IX86_BUILTIN_PFRCP,
25233 IX86_BUILTIN_PFRCPIT1,
25234 IX86_BUILTIN_PFRCPIT2,
25235 IX86_BUILTIN_PFRSQIT1,
25236 IX86_BUILTIN_PFRSQRT,
25237 IX86_BUILTIN_PFSUB,
25238 IX86_BUILTIN_PFSUBR,
25239 IX86_BUILTIN_PI2FD,
25240 IX86_BUILTIN_PMULHRW,
25241
25242 /* 3DNow! Athlon Extensions */
25243 IX86_BUILTIN_PF2IW,
25244 IX86_BUILTIN_PFNACC,
25245 IX86_BUILTIN_PFPNACC,
25246 IX86_BUILTIN_PI2FW,
25247 IX86_BUILTIN_PSWAPDSI,
25248 IX86_BUILTIN_PSWAPDSF,
25249
25250 /* SSE2 */
25251 IX86_BUILTIN_ADDPD,
25252 IX86_BUILTIN_ADDSD,
25253 IX86_BUILTIN_DIVPD,
25254 IX86_BUILTIN_DIVSD,
25255 IX86_BUILTIN_MULPD,
25256 IX86_BUILTIN_MULSD,
25257 IX86_BUILTIN_SUBPD,
25258 IX86_BUILTIN_SUBSD,
25259
25260 IX86_BUILTIN_CMPEQPD,
25261 IX86_BUILTIN_CMPLTPD,
25262 IX86_BUILTIN_CMPLEPD,
25263 IX86_BUILTIN_CMPGTPD,
25264 IX86_BUILTIN_CMPGEPD,
25265 IX86_BUILTIN_CMPNEQPD,
25266 IX86_BUILTIN_CMPNLTPD,
25267 IX86_BUILTIN_CMPNLEPD,
25268 IX86_BUILTIN_CMPNGTPD,
25269 IX86_BUILTIN_CMPNGEPD,
25270 IX86_BUILTIN_CMPORDPD,
25271 IX86_BUILTIN_CMPUNORDPD,
25272 IX86_BUILTIN_CMPEQSD,
25273 IX86_BUILTIN_CMPLTSD,
25274 IX86_BUILTIN_CMPLESD,
25275 IX86_BUILTIN_CMPNEQSD,
25276 IX86_BUILTIN_CMPNLTSD,
25277 IX86_BUILTIN_CMPNLESD,
25278 IX86_BUILTIN_CMPORDSD,
25279 IX86_BUILTIN_CMPUNORDSD,
25280
25281 IX86_BUILTIN_COMIEQSD,
25282 IX86_BUILTIN_COMILTSD,
25283 IX86_BUILTIN_COMILESD,
25284 IX86_BUILTIN_COMIGTSD,
25285 IX86_BUILTIN_COMIGESD,
25286 IX86_BUILTIN_COMINEQSD,
25287 IX86_BUILTIN_UCOMIEQSD,
25288 IX86_BUILTIN_UCOMILTSD,
25289 IX86_BUILTIN_UCOMILESD,
25290 IX86_BUILTIN_UCOMIGTSD,
25291 IX86_BUILTIN_UCOMIGESD,
25292 IX86_BUILTIN_UCOMINEQSD,
25293
25294 IX86_BUILTIN_MAXPD,
25295 IX86_BUILTIN_MAXSD,
25296 IX86_BUILTIN_MINPD,
25297 IX86_BUILTIN_MINSD,
25298
25299 IX86_BUILTIN_ANDPD,
25300 IX86_BUILTIN_ANDNPD,
25301 IX86_BUILTIN_ORPD,
25302 IX86_BUILTIN_XORPD,
25303
25304 IX86_BUILTIN_SQRTPD,
25305 IX86_BUILTIN_SQRTSD,
25306
25307 IX86_BUILTIN_UNPCKHPD,
25308 IX86_BUILTIN_UNPCKLPD,
25309
25310 IX86_BUILTIN_SHUFPD,
25311
25312 IX86_BUILTIN_LOADUPD,
25313 IX86_BUILTIN_STOREUPD,
25314 IX86_BUILTIN_MOVSD,
25315
25316 IX86_BUILTIN_LOADHPD,
25317 IX86_BUILTIN_LOADLPD,
25318
25319 IX86_BUILTIN_CVTDQ2PD,
25320 IX86_BUILTIN_CVTDQ2PS,
25321
25322 IX86_BUILTIN_CVTPD2DQ,
25323 IX86_BUILTIN_CVTPD2PI,
25324 IX86_BUILTIN_CVTPD2PS,
25325 IX86_BUILTIN_CVTTPD2DQ,
25326 IX86_BUILTIN_CVTTPD2PI,
25327
25328 IX86_BUILTIN_CVTPI2PD,
25329 IX86_BUILTIN_CVTSI2SD,
25330 IX86_BUILTIN_CVTSI642SD,
25331
25332 IX86_BUILTIN_CVTSD2SI,
25333 IX86_BUILTIN_CVTSD2SI64,
25334 IX86_BUILTIN_CVTSD2SS,
25335 IX86_BUILTIN_CVTSS2SD,
25336 IX86_BUILTIN_CVTTSD2SI,
25337 IX86_BUILTIN_CVTTSD2SI64,
25338
25339 IX86_BUILTIN_CVTPS2DQ,
25340 IX86_BUILTIN_CVTPS2PD,
25341 IX86_BUILTIN_CVTTPS2DQ,
25342
25343 IX86_BUILTIN_MOVNTI,
25344 IX86_BUILTIN_MOVNTI64,
25345 IX86_BUILTIN_MOVNTPD,
25346 IX86_BUILTIN_MOVNTDQ,
25347
25348 IX86_BUILTIN_MOVQ128,
25349
25350 /* SSE2 MMX */
25351 IX86_BUILTIN_MASKMOVDQU,
25352 IX86_BUILTIN_MOVMSKPD,
25353 IX86_BUILTIN_PMOVMSKB128,
25354
25355 IX86_BUILTIN_PACKSSWB128,
25356 IX86_BUILTIN_PACKSSDW128,
25357 IX86_BUILTIN_PACKUSWB128,
25358
25359 IX86_BUILTIN_PADDB128,
25360 IX86_BUILTIN_PADDW128,
25361 IX86_BUILTIN_PADDD128,
25362 IX86_BUILTIN_PADDQ128,
25363 IX86_BUILTIN_PADDSB128,
25364 IX86_BUILTIN_PADDSW128,
25365 IX86_BUILTIN_PADDUSB128,
25366 IX86_BUILTIN_PADDUSW128,
25367 IX86_BUILTIN_PSUBB128,
25368 IX86_BUILTIN_PSUBW128,
25369 IX86_BUILTIN_PSUBD128,
25370 IX86_BUILTIN_PSUBQ128,
25371 IX86_BUILTIN_PSUBSB128,
25372 IX86_BUILTIN_PSUBSW128,
25373 IX86_BUILTIN_PSUBUSB128,
25374 IX86_BUILTIN_PSUBUSW128,
25375
25376 IX86_BUILTIN_PAND128,
25377 IX86_BUILTIN_PANDN128,
25378 IX86_BUILTIN_POR128,
25379 IX86_BUILTIN_PXOR128,
25380
25381 IX86_BUILTIN_PAVGB128,
25382 IX86_BUILTIN_PAVGW128,
25383
25384 IX86_BUILTIN_PCMPEQB128,
25385 IX86_BUILTIN_PCMPEQW128,
25386 IX86_BUILTIN_PCMPEQD128,
25387 IX86_BUILTIN_PCMPGTB128,
25388 IX86_BUILTIN_PCMPGTW128,
25389 IX86_BUILTIN_PCMPGTD128,
25390
25391 IX86_BUILTIN_PMADDWD128,
25392
25393 IX86_BUILTIN_PMAXSW128,
25394 IX86_BUILTIN_PMAXUB128,
25395 IX86_BUILTIN_PMINSW128,
25396 IX86_BUILTIN_PMINUB128,
25397
25398 IX86_BUILTIN_PMULUDQ,
25399 IX86_BUILTIN_PMULUDQ128,
25400 IX86_BUILTIN_PMULHUW128,
25401 IX86_BUILTIN_PMULHW128,
25402 IX86_BUILTIN_PMULLW128,
25403
25404 IX86_BUILTIN_PSADBW128,
25405 IX86_BUILTIN_PSHUFHW,
25406 IX86_BUILTIN_PSHUFLW,
25407 IX86_BUILTIN_PSHUFD,
25408
25409 IX86_BUILTIN_PSLLDQI128,
25410 IX86_BUILTIN_PSLLWI128,
25411 IX86_BUILTIN_PSLLDI128,
25412 IX86_BUILTIN_PSLLQI128,
25413 IX86_BUILTIN_PSRAWI128,
25414 IX86_BUILTIN_PSRADI128,
25415 IX86_BUILTIN_PSRLDQI128,
25416 IX86_BUILTIN_PSRLWI128,
25417 IX86_BUILTIN_PSRLDI128,
25418 IX86_BUILTIN_PSRLQI128,
25419
25420 IX86_BUILTIN_PSLLDQ128,
25421 IX86_BUILTIN_PSLLW128,
25422 IX86_BUILTIN_PSLLD128,
25423 IX86_BUILTIN_PSLLQ128,
25424 IX86_BUILTIN_PSRAW128,
25425 IX86_BUILTIN_PSRAD128,
25426 IX86_BUILTIN_PSRLW128,
25427 IX86_BUILTIN_PSRLD128,
25428 IX86_BUILTIN_PSRLQ128,
25429
25430 IX86_BUILTIN_PUNPCKHBW128,
25431 IX86_BUILTIN_PUNPCKHWD128,
25432 IX86_BUILTIN_PUNPCKHDQ128,
25433 IX86_BUILTIN_PUNPCKHQDQ128,
25434 IX86_BUILTIN_PUNPCKLBW128,
25435 IX86_BUILTIN_PUNPCKLWD128,
25436 IX86_BUILTIN_PUNPCKLDQ128,
25437 IX86_BUILTIN_PUNPCKLQDQ128,
25438
25439 IX86_BUILTIN_CLFLUSH,
25440 IX86_BUILTIN_MFENCE,
25441 IX86_BUILTIN_LFENCE,
25442 IX86_BUILTIN_PAUSE,
25443
25444 IX86_BUILTIN_BSRSI,
25445 IX86_BUILTIN_BSRDI,
25446 IX86_BUILTIN_RDPMC,
25447 IX86_BUILTIN_RDTSC,
25448 IX86_BUILTIN_RDTSCP,
25449 IX86_BUILTIN_ROLQI,
25450 IX86_BUILTIN_ROLHI,
25451 IX86_BUILTIN_RORQI,
25452 IX86_BUILTIN_RORHI,
25453
25454 /* SSE3. */
25455 IX86_BUILTIN_ADDSUBPS,
25456 IX86_BUILTIN_HADDPS,
25457 IX86_BUILTIN_HSUBPS,
25458 IX86_BUILTIN_MOVSHDUP,
25459 IX86_BUILTIN_MOVSLDUP,
25460 IX86_BUILTIN_ADDSUBPD,
25461 IX86_BUILTIN_HADDPD,
25462 IX86_BUILTIN_HSUBPD,
25463 IX86_BUILTIN_LDDQU,
25464
25465 IX86_BUILTIN_MONITOR,
25466 IX86_BUILTIN_MWAIT,
25467
25468 /* SSSE3. */
25469 IX86_BUILTIN_PHADDW,
25470 IX86_BUILTIN_PHADDD,
25471 IX86_BUILTIN_PHADDSW,
25472 IX86_BUILTIN_PHSUBW,
25473 IX86_BUILTIN_PHSUBD,
25474 IX86_BUILTIN_PHSUBSW,
25475 IX86_BUILTIN_PMADDUBSW,
25476 IX86_BUILTIN_PMULHRSW,
25477 IX86_BUILTIN_PSHUFB,
25478 IX86_BUILTIN_PSIGNB,
25479 IX86_BUILTIN_PSIGNW,
25480 IX86_BUILTIN_PSIGND,
25481 IX86_BUILTIN_PALIGNR,
25482 IX86_BUILTIN_PABSB,
25483 IX86_BUILTIN_PABSW,
25484 IX86_BUILTIN_PABSD,
25485
25486 IX86_BUILTIN_PHADDW128,
25487 IX86_BUILTIN_PHADDD128,
25488 IX86_BUILTIN_PHADDSW128,
25489 IX86_BUILTIN_PHSUBW128,
25490 IX86_BUILTIN_PHSUBD128,
25491 IX86_BUILTIN_PHSUBSW128,
25492 IX86_BUILTIN_PMADDUBSW128,
25493 IX86_BUILTIN_PMULHRSW128,
25494 IX86_BUILTIN_PSHUFB128,
25495 IX86_BUILTIN_PSIGNB128,
25496 IX86_BUILTIN_PSIGNW128,
25497 IX86_BUILTIN_PSIGND128,
25498 IX86_BUILTIN_PALIGNR128,
25499 IX86_BUILTIN_PABSB128,
25500 IX86_BUILTIN_PABSW128,
25501 IX86_BUILTIN_PABSD128,
25502
25503 /* AMDFAM10 - SSE4A New Instructions. */
25504 IX86_BUILTIN_MOVNTSD,
25505 IX86_BUILTIN_MOVNTSS,
25506 IX86_BUILTIN_EXTRQI,
25507 IX86_BUILTIN_EXTRQ,
25508 IX86_BUILTIN_INSERTQI,
25509 IX86_BUILTIN_INSERTQ,
25510
25511 /* SSE4.1. */
25512 IX86_BUILTIN_BLENDPD,
25513 IX86_BUILTIN_BLENDPS,
25514 IX86_BUILTIN_BLENDVPD,
25515 IX86_BUILTIN_BLENDVPS,
25516 IX86_BUILTIN_PBLENDVB128,
25517 IX86_BUILTIN_PBLENDW128,
25518
25519 IX86_BUILTIN_DPPD,
25520 IX86_BUILTIN_DPPS,
25521
25522 IX86_BUILTIN_INSERTPS128,
25523
25524 IX86_BUILTIN_MOVNTDQA,
25525 IX86_BUILTIN_MPSADBW128,
25526 IX86_BUILTIN_PACKUSDW128,
25527 IX86_BUILTIN_PCMPEQQ,
25528 IX86_BUILTIN_PHMINPOSUW128,
25529
25530 IX86_BUILTIN_PMAXSB128,
25531 IX86_BUILTIN_PMAXSD128,
25532 IX86_BUILTIN_PMAXUD128,
25533 IX86_BUILTIN_PMAXUW128,
25534
25535 IX86_BUILTIN_PMINSB128,
25536 IX86_BUILTIN_PMINSD128,
25537 IX86_BUILTIN_PMINUD128,
25538 IX86_BUILTIN_PMINUW128,
25539
25540 IX86_BUILTIN_PMOVSXBW128,
25541 IX86_BUILTIN_PMOVSXBD128,
25542 IX86_BUILTIN_PMOVSXBQ128,
25543 IX86_BUILTIN_PMOVSXWD128,
25544 IX86_BUILTIN_PMOVSXWQ128,
25545 IX86_BUILTIN_PMOVSXDQ128,
25546
25547 IX86_BUILTIN_PMOVZXBW128,
25548 IX86_BUILTIN_PMOVZXBD128,
25549 IX86_BUILTIN_PMOVZXBQ128,
25550 IX86_BUILTIN_PMOVZXWD128,
25551 IX86_BUILTIN_PMOVZXWQ128,
25552 IX86_BUILTIN_PMOVZXDQ128,
25553
25554 IX86_BUILTIN_PMULDQ128,
25555 IX86_BUILTIN_PMULLD128,
25556
25557 IX86_BUILTIN_ROUNDSD,
25558 IX86_BUILTIN_ROUNDSS,
25559
25560 IX86_BUILTIN_ROUNDPD,
25561 IX86_BUILTIN_ROUNDPS,
25562
25563 IX86_BUILTIN_FLOORPD,
25564 IX86_BUILTIN_CEILPD,
25565 IX86_BUILTIN_TRUNCPD,
25566 IX86_BUILTIN_RINTPD,
25567 IX86_BUILTIN_ROUNDPD_AZ,
25568
25569 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25570 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25571 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25572
25573 IX86_BUILTIN_FLOORPS,
25574 IX86_BUILTIN_CEILPS,
25575 IX86_BUILTIN_TRUNCPS,
25576 IX86_BUILTIN_RINTPS,
25577 IX86_BUILTIN_ROUNDPS_AZ,
25578
25579 IX86_BUILTIN_FLOORPS_SFIX,
25580 IX86_BUILTIN_CEILPS_SFIX,
25581 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25582
25583 IX86_BUILTIN_PTESTZ,
25584 IX86_BUILTIN_PTESTC,
25585 IX86_BUILTIN_PTESTNZC,
25586
25587 IX86_BUILTIN_VEC_INIT_V2SI,
25588 IX86_BUILTIN_VEC_INIT_V4HI,
25589 IX86_BUILTIN_VEC_INIT_V8QI,
25590 IX86_BUILTIN_VEC_EXT_V2DF,
25591 IX86_BUILTIN_VEC_EXT_V2DI,
25592 IX86_BUILTIN_VEC_EXT_V4SF,
25593 IX86_BUILTIN_VEC_EXT_V4SI,
25594 IX86_BUILTIN_VEC_EXT_V8HI,
25595 IX86_BUILTIN_VEC_EXT_V2SI,
25596 IX86_BUILTIN_VEC_EXT_V4HI,
25597 IX86_BUILTIN_VEC_EXT_V16QI,
25598 IX86_BUILTIN_VEC_SET_V2DI,
25599 IX86_BUILTIN_VEC_SET_V4SF,
25600 IX86_BUILTIN_VEC_SET_V4SI,
25601 IX86_BUILTIN_VEC_SET_V8HI,
25602 IX86_BUILTIN_VEC_SET_V4HI,
25603 IX86_BUILTIN_VEC_SET_V16QI,
25604
25605 IX86_BUILTIN_VEC_PACK_SFIX,
25606 IX86_BUILTIN_VEC_PACK_SFIX256,
25607
25608 /* SSE4.2. */
25609 IX86_BUILTIN_CRC32QI,
25610 IX86_BUILTIN_CRC32HI,
25611 IX86_BUILTIN_CRC32SI,
25612 IX86_BUILTIN_CRC32DI,
25613
25614 IX86_BUILTIN_PCMPESTRI128,
25615 IX86_BUILTIN_PCMPESTRM128,
25616 IX86_BUILTIN_PCMPESTRA128,
25617 IX86_BUILTIN_PCMPESTRC128,
25618 IX86_BUILTIN_PCMPESTRO128,
25619 IX86_BUILTIN_PCMPESTRS128,
25620 IX86_BUILTIN_PCMPESTRZ128,
25621 IX86_BUILTIN_PCMPISTRI128,
25622 IX86_BUILTIN_PCMPISTRM128,
25623 IX86_BUILTIN_PCMPISTRA128,
25624 IX86_BUILTIN_PCMPISTRC128,
25625 IX86_BUILTIN_PCMPISTRO128,
25626 IX86_BUILTIN_PCMPISTRS128,
25627 IX86_BUILTIN_PCMPISTRZ128,
25628
25629 IX86_BUILTIN_PCMPGTQ,
25630
25631 /* AES instructions */
25632 IX86_BUILTIN_AESENC128,
25633 IX86_BUILTIN_AESENCLAST128,
25634 IX86_BUILTIN_AESDEC128,
25635 IX86_BUILTIN_AESDECLAST128,
25636 IX86_BUILTIN_AESIMC128,
25637 IX86_BUILTIN_AESKEYGENASSIST128,
25638
25639 /* PCLMUL instruction */
25640 IX86_BUILTIN_PCLMULQDQ128,
25641
25642 /* AVX */
25643 IX86_BUILTIN_ADDPD256,
25644 IX86_BUILTIN_ADDPS256,
25645 IX86_BUILTIN_ADDSUBPD256,
25646 IX86_BUILTIN_ADDSUBPS256,
25647 IX86_BUILTIN_ANDPD256,
25648 IX86_BUILTIN_ANDPS256,
25649 IX86_BUILTIN_ANDNPD256,
25650 IX86_BUILTIN_ANDNPS256,
25651 IX86_BUILTIN_BLENDPD256,
25652 IX86_BUILTIN_BLENDPS256,
25653 IX86_BUILTIN_BLENDVPD256,
25654 IX86_BUILTIN_BLENDVPS256,
25655 IX86_BUILTIN_DIVPD256,
25656 IX86_BUILTIN_DIVPS256,
25657 IX86_BUILTIN_DPPS256,
25658 IX86_BUILTIN_HADDPD256,
25659 IX86_BUILTIN_HADDPS256,
25660 IX86_BUILTIN_HSUBPD256,
25661 IX86_BUILTIN_HSUBPS256,
25662 IX86_BUILTIN_MAXPD256,
25663 IX86_BUILTIN_MAXPS256,
25664 IX86_BUILTIN_MINPD256,
25665 IX86_BUILTIN_MINPS256,
25666 IX86_BUILTIN_MULPD256,
25667 IX86_BUILTIN_MULPS256,
25668 IX86_BUILTIN_ORPD256,
25669 IX86_BUILTIN_ORPS256,
25670 IX86_BUILTIN_SHUFPD256,
25671 IX86_BUILTIN_SHUFPS256,
25672 IX86_BUILTIN_SUBPD256,
25673 IX86_BUILTIN_SUBPS256,
25674 IX86_BUILTIN_XORPD256,
25675 IX86_BUILTIN_XORPS256,
25676 IX86_BUILTIN_CMPSD,
25677 IX86_BUILTIN_CMPSS,
25678 IX86_BUILTIN_CMPPD,
25679 IX86_BUILTIN_CMPPS,
25680 IX86_BUILTIN_CMPPD256,
25681 IX86_BUILTIN_CMPPS256,
25682 IX86_BUILTIN_CVTDQ2PD256,
25683 IX86_BUILTIN_CVTDQ2PS256,
25684 IX86_BUILTIN_CVTPD2PS256,
25685 IX86_BUILTIN_CVTPS2DQ256,
25686 IX86_BUILTIN_CVTPS2PD256,
25687 IX86_BUILTIN_CVTTPD2DQ256,
25688 IX86_BUILTIN_CVTPD2DQ256,
25689 IX86_BUILTIN_CVTTPS2DQ256,
25690 IX86_BUILTIN_EXTRACTF128PD256,
25691 IX86_BUILTIN_EXTRACTF128PS256,
25692 IX86_BUILTIN_EXTRACTF128SI256,
25693 IX86_BUILTIN_VZEROALL,
25694 IX86_BUILTIN_VZEROUPPER,
25695 IX86_BUILTIN_VPERMILVARPD,
25696 IX86_BUILTIN_VPERMILVARPS,
25697 IX86_BUILTIN_VPERMILVARPD256,
25698 IX86_BUILTIN_VPERMILVARPS256,
25699 IX86_BUILTIN_VPERMILPD,
25700 IX86_BUILTIN_VPERMILPS,
25701 IX86_BUILTIN_VPERMILPD256,
25702 IX86_BUILTIN_VPERMILPS256,
25703 IX86_BUILTIN_VPERMIL2PD,
25704 IX86_BUILTIN_VPERMIL2PS,
25705 IX86_BUILTIN_VPERMIL2PD256,
25706 IX86_BUILTIN_VPERMIL2PS256,
25707 IX86_BUILTIN_VPERM2F128PD256,
25708 IX86_BUILTIN_VPERM2F128PS256,
25709 IX86_BUILTIN_VPERM2F128SI256,
25710 IX86_BUILTIN_VBROADCASTSS,
25711 IX86_BUILTIN_VBROADCASTSD256,
25712 IX86_BUILTIN_VBROADCASTSS256,
25713 IX86_BUILTIN_VBROADCASTPD256,
25714 IX86_BUILTIN_VBROADCASTPS256,
25715 IX86_BUILTIN_VINSERTF128PD256,
25716 IX86_BUILTIN_VINSERTF128PS256,
25717 IX86_BUILTIN_VINSERTF128SI256,
25718 IX86_BUILTIN_LOADUPD256,
25719 IX86_BUILTIN_LOADUPS256,
25720 IX86_BUILTIN_STOREUPD256,
25721 IX86_BUILTIN_STOREUPS256,
25722 IX86_BUILTIN_LDDQU256,
25723 IX86_BUILTIN_MOVNTDQ256,
25724 IX86_BUILTIN_MOVNTPD256,
25725 IX86_BUILTIN_MOVNTPS256,
25726 IX86_BUILTIN_LOADDQU256,
25727 IX86_BUILTIN_STOREDQU256,
25728 IX86_BUILTIN_MASKLOADPD,
25729 IX86_BUILTIN_MASKLOADPS,
25730 IX86_BUILTIN_MASKSTOREPD,
25731 IX86_BUILTIN_MASKSTOREPS,
25732 IX86_BUILTIN_MASKLOADPD256,
25733 IX86_BUILTIN_MASKLOADPS256,
25734 IX86_BUILTIN_MASKSTOREPD256,
25735 IX86_BUILTIN_MASKSTOREPS256,
25736 IX86_BUILTIN_MOVSHDUP256,
25737 IX86_BUILTIN_MOVSLDUP256,
25738 IX86_BUILTIN_MOVDDUP256,
25739
25740 IX86_BUILTIN_SQRTPD256,
25741 IX86_BUILTIN_SQRTPS256,
25742 IX86_BUILTIN_SQRTPS_NR256,
25743 IX86_BUILTIN_RSQRTPS256,
25744 IX86_BUILTIN_RSQRTPS_NR256,
25745
25746 IX86_BUILTIN_RCPPS256,
25747
25748 IX86_BUILTIN_ROUNDPD256,
25749 IX86_BUILTIN_ROUNDPS256,
25750
25751 IX86_BUILTIN_FLOORPD256,
25752 IX86_BUILTIN_CEILPD256,
25753 IX86_BUILTIN_TRUNCPD256,
25754 IX86_BUILTIN_RINTPD256,
25755 IX86_BUILTIN_ROUNDPD_AZ256,
25756
25757 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25758 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25759 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25760
25761 IX86_BUILTIN_FLOORPS256,
25762 IX86_BUILTIN_CEILPS256,
25763 IX86_BUILTIN_TRUNCPS256,
25764 IX86_BUILTIN_RINTPS256,
25765 IX86_BUILTIN_ROUNDPS_AZ256,
25766
25767 IX86_BUILTIN_FLOORPS_SFIX256,
25768 IX86_BUILTIN_CEILPS_SFIX256,
25769 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25770
25771 IX86_BUILTIN_UNPCKHPD256,
25772 IX86_BUILTIN_UNPCKLPD256,
25773 IX86_BUILTIN_UNPCKHPS256,
25774 IX86_BUILTIN_UNPCKLPS256,
25775
25776 IX86_BUILTIN_SI256_SI,
25777 IX86_BUILTIN_PS256_PS,
25778 IX86_BUILTIN_PD256_PD,
25779 IX86_BUILTIN_SI_SI256,
25780 IX86_BUILTIN_PS_PS256,
25781 IX86_BUILTIN_PD_PD256,
25782
25783 IX86_BUILTIN_VTESTZPD,
25784 IX86_BUILTIN_VTESTCPD,
25785 IX86_BUILTIN_VTESTNZCPD,
25786 IX86_BUILTIN_VTESTZPS,
25787 IX86_BUILTIN_VTESTCPS,
25788 IX86_BUILTIN_VTESTNZCPS,
25789 IX86_BUILTIN_VTESTZPD256,
25790 IX86_BUILTIN_VTESTCPD256,
25791 IX86_BUILTIN_VTESTNZCPD256,
25792 IX86_BUILTIN_VTESTZPS256,
25793 IX86_BUILTIN_VTESTCPS256,
25794 IX86_BUILTIN_VTESTNZCPS256,
25795 IX86_BUILTIN_PTESTZ256,
25796 IX86_BUILTIN_PTESTC256,
25797 IX86_BUILTIN_PTESTNZC256,
25798
25799 IX86_BUILTIN_MOVMSKPD256,
25800 IX86_BUILTIN_MOVMSKPS256,
25801
25802 /* AVX2 */
25803 IX86_BUILTIN_MPSADBW256,
25804 IX86_BUILTIN_PABSB256,
25805 IX86_BUILTIN_PABSW256,
25806 IX86_BUILTIN_PABSD256,
25807 IX86_BUILTIN_PACKSSDW256,
25808 IX86_BUILTIN_PACKSSWB256,
25809 IX86_BUILTIN_PACKUSDW256,
25810 IX86_BUILTIN_PACKUSWB256,
25811 IX86_BUILTIN_PADDB256,
25812 IX86_BUILTIN_PADDW256,
25813 IX86_BUILTIN_PADDD256,
25814 IX86_BUILTIN_PADDQ256,
25815 IX86_BUILTIN_PADDSB256,
25816 IX86_BUILTIN_PADDSW256,
25817 IX86_BUILTIN_PADDUSB256,
25818 IX86_BUILTIN_PADDUSW256,
25819 IX86_BUILTIN_PALIGNR256,
25820 IX86_BUILTIN_AND256I,
25821 IX86_BUILTIN_ANDNOT256I,
25822 IX86_BUILTIN_PAVGB256,
25823 IX86_BUILTIN_PAVGW256,
25824 IX86_BUILTIN_PBLENDVB256,
25825 IX86_BUILTIN_PBLENDVW256,
25826 IX86_BUILTIN_PCMPEQB256,
25827 IX86_BUILTIN_PCMPEQW256,
25828 IX86_BUILTIN_PCMPEQD256,
25829 IX86_BUILTIN_PCMPEQQ256,
25830 IX86_BUILTIN_PCMPGTB256,
25831 IX86_BUILTIN_PCMPGTW256,
25832 IX86_BUILTIN_PCMPGTD256,
25833 IX86_BUILTIN_PCMPGTQ256,
25834 IX86_BUILTIN_PHADDW256,
25835 IX86_BUILTIN_PHADDD256,
25836 IX86_BUILTIN_PHADDSW256,
25837 IX86_BUILTIN_PHSUBW256,
25838 IX86_BUILTIN_PHSUBD256,
25839 IX86_BUILTIN_PHSUBSW256,
25840 IX86_BUILTIN_PMADDUBSW256,
25841 IX86_BUILTIN_PMADDWD256,
25842 IX86_BUILTIN_PMAXSB256,
25843 IX86_BUILTIN_PMAXSW256,
25844 IX86_BUILTIN_PMAXSD256,
25845 IX86_BUILTIN_PMAXUB256,
25846 IX86_BUILTIN_PMAXUW256,
25847 IX86_BUILTIN_PMAXUD256,
25848 IX86_BUILTIN_PMINSB256,
25849 IX86_BUILTIN_PMINSW256,
25850 IX86_BUILTIN_PMINSD256,
25851 IX86_BUILTIN_PMINUB256,
25852 IX86_BUILTIN_PMINUW256,
25853 IX86_BUILTIN_PMINUD256,
25854 IX86_BUILTIN_PMOVMSKB256,
25855 IX86_BUILTIN_PMOVSXBW256,
25856 IX86_BUILTIN_PMOVSXBD256,
25857 IX86_BUILTIN_PMOVSXBQ256,
25858 IX86_BUILTIN_PMOVSXWD256,
25859 IX86_BUILTIN_PMOVSXWQ256,
25860 IX86_BUILTIN_PMOVSXDQ256,
25861 IX86_BUILTIN_PMOVZXBW256,
25862 IX86_BUILTIN_PMOVZXBD256,
25863 IX86_BUILTIN_PMOVZXBQ256,
25864 IX86_BUILTIN_PMOVZXWD256,
25865 IX86_BUILTIN_PMOVZXWQ256,
25866 IX86_BUILTIN_PMOVZXDQ256,
25867 IX86_BUILTIN_PMULDQ256,
25868 IX86_BUILTIN_PMULHRSW256,
25869 IX86_BUILTIN_PMULHUW256,
25870 IX86_BUILTIN_PMULHW256,
25871 IX86_BUILTIN_PMULLW256,
25872 IX86_BUILTIN_PMULLD256,
25873 IX86_BUILTIN_PMULUDQ256,
25874 IX86_BUILTIN_POR256,
25875 IX86_BUILTIN_PSADBW256,
25876 IX86_BUILTIN_PSHUFB256,
25877 IX86_BUILTIN_PSHUFD256,
25878 IX86_BUILTIN_PSHUFHW256,
25879 IX86_BUILTIN_PSHUFLW256,
25880 IX86_BUILTIN_PSIGNB256,
25881 IX86_BUILTIN_PSIGNW256,
25882 IX86_BUILTIN_PSIGND256,
25883 IX86_BUILTIN_PSLLDQI256,
25884 IX86_BUILTIN_PSLLWI256,
25885 IX86_BUILTIN_PSLLW256,
25886 IX86_BUILTIN_PSLLDI256,
25887 IX86_BUILTIN_PSLLD256,
25888 IX86_BUILTIN_PSLLQI256,
25889 IX86_BUILTIN_PSLLQ256,
25890 IX86_BUILTIN_PSRAWI256,
25891 IX86_BUILTIN_PSRAW256,
25892 IX86_BUILTIN_PSRADI256,
25893 IX86_BUILTIN_PSRAD256,
25894 IX86_BUILTIN_PSRLDQI256,
25895 IX86_BUILTIN_PSRLWI256,
25896 IX86_BUILTIN_PSRLW256,
25897 IX86_BUILTIN_PSRLDI256,
25898 IX86_BUILTIN_PSRLD256,
25899 IX86_BUILTIN_PSRLQI256,
25900 IX86_BUILTIN_PSRLQ256,
25901 IX86_BUILTIN_PSUBB256,
25902 IX86_BUILTIN_PSUBW256,
25903 IX86_BUILTIN_PSUBD256,
25904 IX86_BUILTIN_PSUBQ256,
25905 IX86_BUILTIN_PSUBSB256,
25906 IX86_BUILTIN_PSUBSW256,
25907 IX86_BUILTIN_PSUBUSB256,
25908 IX86_BUILTIN_PSUBUSW256,
25909 IX86_BUILTIN_PUNPCKHBW256,
25910 IX86_BUILTIN_PUNPCKHWD256,
25911 IX86_BUILTIN_PUNPCKHDQ256,
25912 IX86_BUILTIN_PUNPCKHQDQ256,
25913 IX86_BUILTIN_PUNPCKLBW256,
25914 IX86_BUILTIN_PUNPCKLWD256,
25915 IX86_BUILTIN_PUNPCKLDQ256,
25916 IX86_BUILTIN_PUNPCKLQDQ256,
25917 IX86_BUILTIN_PXOR256,
25918 IX86_BUILTIN_MOVNTDQA256,
25919 IX86_BUILTIN_VBROADCASTSS_PS,
25920 IX86_BUILTIN_VBROADCASTSS_PS256,
25921 IX86_BUILTIN_VBROADCASTSD_PD256,
25922 IX86_BUILTIN_VBROADCASTSI256,
25923 IX86_BUILTIN_PBLENDD256,
25924 IX86_BUILTIN_PBLENDD128,
25925 IX86_BUILTIN_PBROADCASTB256,
25926 IX86_BUILTIN_PBROADCASTW256,
25927 IX86_BUILTIN_PBROADCASTD256,
25928 IX86_BUILTIN_PBROADCASTQ256,
25929 IX86_BUILTIN_PBROADCASTB128,
25930 IX86_BUILTIN_PBROADCASTW128,
25931 IX86_BUILTIN_PBROADCASTD128,
25932 IX86_BUILTIN_PBROADCASTQ128,
25933 IX86_BUILTIN_VPERMVARSI256,
25934 IX86_BUILTIN_VPERMDF256,
25935 IX86_BUILTIN_VPERMVARSF256,
25936 IX86_BUILTIN_VPERMDI256,
25937 IX86_BUILTIN_VPERMTI256,
25938 IX86_BUILTIN_VEXTRACT128I256,
25939 IX86_BUILTIN_VINSERT128I256,
25940 IX86_BUILTIN_MASKLOADD,
25941 IX86_BUILTIN_MASKLOADQ,
25942 IX86_BUILTIN_MASKLOADD256,
25943 IX86_BUILTIN_MASKLOADQ256,
25944 IX86_BUILTIN_MASKSTORED,
25945 IX86_BUILTIN_MASKSTOREQ,
25946 IX86_BUILTIN_MASKSTORED256,
25947 IX86_BUILTIN_MASKSTOREQ256,
25948 IX86_BUILTIN_PSLLVV4DI,
25949 IX86_BUILTIN_PSLLVV2DI,
25950 IX86_BUILTIN_PSLLVV8SI,
25951 IX86_BUILTIN_PSLLVV4SI,
25952 IX86_BUILTIN_PSRAVV8SI,
25953 IX86_BUILTIN_PSRAVV4SI,
25954 IX86_BUILTIN_PSRLVV4DI,
25955 IX86_BUILTIN_PSRLVV2DI,
25956 IX86_BUILTIN_PSRLVV8SI,
25957 IX86_BUILTIN_PSRLVV4SI,
25958
25959 IX86_BUILTIN_GATHERSIV2DF,
25960 IX86_BUILTIN_GATHERSIV4DF,
25961 IX86_BUILTIN_GATHERDIV2DF,
25962 IX86_BUILTIN_GATHERDIV4DF,
25963 IX86_BUILTIN_GATHERSIV4SF,
25964 IX86_BUILTIN_GATHERSIV8SF,
25965 IX86_BUILTIN_GATHERDIV4SF,
25966 IX86_BUILTIN_GATHERDIV8SF,
25967 IX86_BUILTIN_GATHERSIV2DI,
25968 IX86_BUILTIN_GATHERSIV4DI,
25969 IX86_BUILTIN_GATHERDIV2DI,
25970 IX86_BUILTIN_GATHERDIV4DI,
25971 IX86_BUILTIN_GATHERSIV4SI,
25972 IX86_BUILTIN_GATHERSIV8SI,
25973 IX86_BUILTIN_GATHERDIV4SI,
25974 IX86_BUILTIN_GATHERDIV8SI,
25975
25976 /* Alternate 4 element gather for the vectorizer where
25977 all operands are 32-byte wide. */
25978 IX86_BUILTIN_GATHERALTSIV4DF,
25979 IX86_BUILTIN_GATHERALTDIV8SF,
25980 IX86_BUILTIN_GATHERALTSIV4DI,
25981 IX86_BUILTIN_GATHERALTDIV8SI,
25982
25983 /* TFmode support builtins. */
25984 IX86_BUILTIN_INFQ,
25985 IX86_BUILTIN_HUGE_VALQ,
25986 IX86_BUILTIN_FABSQ,
25987 IX86_BUILTIN_COPYSIGNQ,
25988
25989 /* Vectorizer support builtins. */
25990 IX86_BUILTIN_CPYSGNPS,
25991 IX86_BUILTIN_CPYSGNPD,
25992 IX86_BUILTIN_CPYSGNPS256,
25993 IX86_BUILTIN_CPYSGNPD256,
25994
25995 /* FMA4 instructions. */
25996 IX86_BUILTIN_VFMADDSS,
25997 IX86_BUILTIN_VFMADDSD,
25998 IX86_BUILTIN_VFMADDPS,
25999 IX86_BUILTIN_VFMADDPD,
26000 IX86_BUILTIN_VFMADDPS256,
26001 IX86_BUILTIN_VFMADDPD256,
26002 IX86_BUILTIN_VFMADDSUBPS,
26003 IX86_BUILTIN_VFMADDSUBPD,
26004 IX86_BUILTIN_VFMADDSUBPS256,
26005 IX86_BUILTIN_VFMADDSUBPD256,
26006
26007 /* FMA3 instructions. */
26008 IX86_BUILTIN_VFMADDSS3,
26009 IX86_BUILTIN_VFMADDSD3,
26010
26011 /* XOP instructions. */
26012 IX86_BUILTIN_VPCMOV,
26013 IX86_BUILTIN_VPCMOV_V2DI,
26014 IX86_BUILTIN_VPCMOV_V4SI,
26015 IX86_BUILTIN_VPCMOV_V8HI,
26016 IX86_BUILTIN_VPCMOV_V16QI,
26017 IX86_BUILTIN_VPCMOV_V4SF,
26018 IX86_BUILTIN_VPCMOV_V2DF,
26019 IX86_BUILTIN_VPCMOV256,
26020 IX86_BUILTIN_VPCMOV_V4DI256,
26021 IX86_BUILTIN_VPCMOV_V8SI256,
26022 IX86_BUILTIN_VPCMOV_V16HI256,
26023 IX86_BUILTIN_VPCMOV_V32QI256,
26024 IX86_BUILTIN_VPCMOV_V8SF256,
26025 IX86_BUILTIN_VPCMOV_V4DF256,
26026
26027 IX86_BUILTIN_VPPERM,
26028
26029 IX86_BUILTIN_VPMACSSWW,
26030 IX86_BUILTIN_VPMACSWW,
26031 IX86_BUILTIN_VPMACSSWD,
26032 IX86_BUILTIN_VPMACSWD,
26033 IX86_BUILTIN_VPMACSSDD,
26034 IX86_BUILTIN_VPMACSDD,
26035 IX86_BUILTIN_VPMACSSDQL,
26036 IX86_BUILTIN_VPMACSSDQH,
26037 IX86_BUILTIN_VPMACSDQL,
26038 IX86_BUILTIN_VPMACSDQH,
26039 IX86_BUILTIN_VPMADCSSWD,
26040 IX86_BUILTIN_VPMADCSWD,
26041
26042 IX86_BUILTIN_VPHADDBW,
26043 IX86_BUILTIN_VPHADDBD,
26044 IX86_BUILTIN_VPHADDBQ,
26045 IX86_BUILTIN_VPHADDWD,
26046 IX86_BUILTIN_VPHADDWQ,
26047 IX86_BUILTIN_VPHADDDQ,
26048 IX86_BUILTIN_VPHADDUBW,
26049 IX86_BUILTIN_VPHADDUBD,
26050 IX86_BUILTIN_VPHADDUBQ,
26051 IX86_BUILTIN_VPHADDUWD,
26052 IX86_BUILTIN_VPHADDUWQ,
26053 IX86_BUILTIN_VPHADDUDQ,
26054 IX86_BUILTIN_VPHSUBBW,
26055 IX86_BUILTIN_VPHSUBWD,
26056 IX86_BUILTIN_VPHSUBDQ,
26057
26058 IX86_BUILTIN_VPROTB,
26059 IX86_BUILTIN_VPROTW,
26060 IX86_BUILTIN_VPROTD,
26061 IX86_BUILTIN_VPROTQ,
26062 IX86_BUILTIN_VPROTB_IMM,
26063 IX86_BUILTIN_VPROTW_IMM,
26064 IX86_BUILTIN_VPROTD_IMM,
26065 IX86_BUILTIN_VPROTQ_IMM,
26066
26067 IX86_BUILTIN_VPSHLB,
26068 IX86_BUILTIN_VPSHLW,
26069 IX86_BUILTIN_VPSHLD,
26070 IX86_BUILTIN_VPSHLQ,
26071 IX86_BUILTIN_VPSHAB,
26072 IX86_BUILTIN_VPSHAW,
26073 IX86_BUILTIN_VPSHAD,
26074 IX86_BUILTIN_VPSHAQ,
26075
26076 IX86_BUILTIN_VFRCZSS,
26077 IX86_BUILTIN_VFRCZSD,
26078 IX86_BUILTIN_VFRCZPS,
26079 IX86_BUILTIN_VFRCZPD,
26080 IX86_BUILTIN_VFRCZPS256,
26081 IX86_BUILTIN_VFRCZPD256,
26082
26083 IX86_BUILTIN_VPCOMEQUB,
26084 IX86_BUILTIN_VPCOMNEUB,
26085 IX86_BUILTIN_VPCOMLTUB,
26086 IX86_BUILTIN_VPCOMLEUB,
26087 IX86_BUILTIN_VPCOMGTUB,
26088 IX86_BUILTIN_VPCOMGEUB,
26089 IX86_BUILTIN_VPCOMFALSEUB,
26090 IX86_BUILTIN_VPCOMTRUEUB,
26091
26092 IX86_BUILTIN_VPCOMEQUW,
26093 IX86_BUILTIN_VPCOMNEUW,
26094 IX86_BUILTIN_VPCOMLTUW,
26095 IX86_BUILTIN_VPCOMLEUW,
26096 IX86_BUILTIN_VPCOMGTUW,
26097 IX86_BUILTIN_VPCOMGEUW,
26098 IX86_BUILTIN_VPCOMFALSEUW,
26099 IX86_BUILTIN_VPCOMTRUEUW,
26100
26101 IX86_BUILTIN_VPCOMEQUD,
26102 IX86_BUILTIN_VPCOMNEUD,
26103 IX86_BUILTIN_VPCOMLTUD,
26104 IX86_BUILTIN_VPCOMLEUD,
26105 IX86_BUILTIN_VPCOMGTUD,
26106 IX86_BUILTIN_VPCOMGEUD,
26107 IX86_BUILTIN_VPCOMFALSEUD,
26108 IX86_BUILTIN_VPCOMTRUEUD,
26109
26110 IX86_BUILTIN_VPCOMEQUQ,
26111 IX86_BUILTIN_VPCOMNEUQ,
26112 IX86_BUILTIN_VPCOMLTUQ,
26113 IX86_BUILTIN_VPCOMLEUQ,
26114 IX86_BUILTIN_VPCOMGTUQ,
26115 IX86_BUILTIN_VPCOMGEUQ,
26116 IX86_BUILTIN_VPCOMFALSEUQ,
26117 IX86_BUILTIN_VPCOMTRUEUQ,
26118
26119 IX86_BUILTIN_VPCOMEQB,
26120 IX86_BUILTIN_VPCOMNEB,
26121 IX86_BUILTIN_VPCOMLTB,
26122 IX86_BUILTIN_VPCOMLEB,
26123 IX86_BUILTIN_VPCOMGTB,
26124 IX86_BUILTIN_VPCOMGEB,
26125 IX86_BUILTIN_VPCOMFALSEB,
26126 IX86_BUILTIN_VPCOMTRUEB,
26127
26128 IX86_BUILTIN_VPCOMEQW,
26129 IX86_BUILTIN_VPCOMNEW,
26130 IX86_BUILTIN_VPCOMLTW,
26131 IX86_BUILTIN_VPCOMLEW,
26132 IX86_BUILTIN_VPCOMGTW,
26133 IX86_BUILTIN_VPCOMGEW,
26134 IX86_BUILTIN_VPCOMFALSEW,
26135 IX86_BUILTIN_VPCOMTRUEW,
26136
26137 IX86_BUILTIN_VPCOMEQD,
26138 IX86_BUILTIN_VPCOMNED,
26139 IX86_BUILTIN_VPCOMLTD,
26140 IX86_BUILTIN_VPCOMLED,
26141 IX86_BUILTIN_VPCOMGTD,
26142 IX86_BUILTIN_VPCOMGED,
26143 IX86_BUILTIN_VPCOMFALSED,
26144 IX86_BUILTIN_VPCOMTRUED,
26145
26146 IX86_BUILTIN_VPCOMEQQ,
26147 IX86_BUILTIN_VPCOMNEQ,
26148 IX86_BUILTIN_VPCOMLTQ,
26149 IX86_BUILTIN_VPCOMLEQ,
26150 IX86_BUILTIN_VPCOMGTQ,
26151 IX86_BUILTIN_VPCOMGEQ,
26152 IX86_BUILTIN_VPCOMFALSEQ,
26153 IX86_BUILTIN_VPCOMTRUEQ,
26154
26155 /* LWP instructions. */
26156 IX86_BUILTIN_LLWPCB,
26157 IX86_BUILTIN_SLWPCB,
26158 IX86_BUILTIN_LWPVAL32,
26159 IX86_BUILTIN_LWPVAL64,
26160 IX86_BUILTIN_LWPINS32,
26161 IX86_BUILTIN_LWPINS64,
26162
26163 IX86_BUILTIN_CLZS,
26164
26165 /* BMI instructions. */
26166 IX86_BUILTIN_BEXTR32,
26167 IX86_BUILTIN_BEXTR64,
26168 IX86_BUILTIN_CTZS,
26169
26170 /* TBM instructions. */
26171 IX86_BUILTIN_BEXTRI32,
26172 IX86_BUILTIN_BEXTRI64,
26173
26174 /* BMI2 instructions. */
26175 IX86_BUILTIN_BZHI32,
26176 IX86_BUILTIN_BZHI64,
26177 IX86_BUILTIN_PDEP32,
26178 IX86_BUILTIN_PDEP64,
26179 IX86_BUILTIN_PEXT32,
26180 IX86_BUILTIN_PEXT64,
26181
26182 /* FSGSBASE instructions. */
26183 IX86_BUILTIN_RDFSBASE32,
26184 IX86_BUILTIN_RDFSBASE64,
26185 IX86_BUILTIN_RDGSBASE32,
26186 IX86_BUILTIN_RDGSBASE64,
26187 IX86_BUILTIN_WRFSBASE32,
26188 IX86_BUILTIN_WRFSBASE64,
26189 IX86_BUILTIN_WRGSBASE32,
26190 IX86_BUILTIN_WRGSBASE64,
26191
26192 /* RDRND instructions. */
26193 IX86_BUILTIN_RDRAND16_STEP,
26194 IX86_BUILTIN_RDRAND32_STEP,
26195 IX86_BUILTIN_RDRAND64_STEP,
26196
26197 /* F16C instructions. */
26198 IX86_BUILTIN_CVTPH2PS,
26199 IX86_BUILTIN_CVTPH2PS256,
26200 IX86_BUILTIN_CVTPS2PH,
26201 IX86_BUILTIN_CVTPS2PH256,
26202
26203 /* CFString built-in for darwin */
26204 IX86_BUILTIN_CFSTRING,
26205
26206 IX86_BUILTIN_MAX
26207 };
26208
26209 /* Table for the ix86 builtin decls. */
26210 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26211
26212 /* Table of all of the builtin functions that are possible with different ISAs
26213 but are waiting to be built until a function is declared to use that
26214 ISA. */
26215 struct builtin_isa {
26216 const char *name; /* function name */
26217 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26218 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26219 bool const_p; /* true if the declaration is constant */
26220 bool set_and_not_built_p; /* true if the builtin was deferred and not built yet */
26221 };
26222
26223 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26224
26225
26226 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
26227 of isa_flags this builtin requires in the ix86_builtins_isa array. Store the
26228 function decl in the ix86_builtins array. Return the function decl, or
26229 NULL_TREE if the builtin was not added.
26230
26231 If the front end has a special hook for builtin functions, delay adding
26232 builtin functions that aren't in the current ISA until the ISA is changed
26233 with function specific optimization. Doing so can save about 300K for the
26234 default compiler. When the builtin is expanded, check at that time whether
26235 it is valid.
26236
26237 If the front end doesn't have a special hook, record all builtins, even
26238 those that aren't in the current ISA, in case the user uses function
26239 specific options for a different ISA, so that we don't get scope errors
26240 if a builtin is added in the middle of a function scope. */
26241
26242 static inline tree
26243 def_builtin (HOST_WIDE_INT mask, const char *name,
26244 enum ix86_builtin_func_type tcode,
26245 enum ix86_builtins code)
26246 {
26247 tree decl = NULL_TREE;
26248
26249 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26250 {
26251 ix86_builtins_isa[(int) code].isa = mask;
26252
26253 mask &= ~OPTION_MASK_ISA_64BIT;
26254 if (mask == 0
26255 || (mask & ix86_isa_flags) != 0
26256 || (lang_hooks.builtin_function
26257 == lang_hooks.builtin_function_ext_scope))
26258
26259 {
26260 tree type = ix86_get_builtin_func_type (tcode);
26261 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26262 NULL, NULL_TREE);
26263 ix86_builtins[(int) code] = decl;
26264 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26265 }
26266 else
26267 {
26268 ix86_builtins[(int) code] = NULL_TREE;
26269 ix86_builtins_isa[(int) code].tcode = tcode;
26270 ix86_builtins_isa[(int) code].name = name;
26271 ix86_builtins_isa[(int) code].const_p = false;
26272 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26273 }
26274 }
26275
26276 return decl;
26277 }
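/* A minimal sketch of how def_builtin is used; the particular mask, type
   code and enumerator below are illustrative choices, not a quote of the
   initialization code elsewhere in this file:

	def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
		     VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);

   If SSE is already part of ix86_isa_flags (or the front end's
   builtin_function hook is builtin_function_ext_scope), the decl is built
   immediately.  Otherwise only the name, type code and ISA mask are
   recorded in ix86_builtins_isa, and ix86_add_new_builtins builds the
   decl later once the ISA is enabled.  */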
26278
26279 /* Like def_builtin, but also marks the function decl "const". */
26280
26281 static inline tree
26282 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26283 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26284 {
26285 tree decl = def_builtin (mask, name, tcode, code);
26286 if (decl)
26287 TREE_READONLY (decl) = 1;
26288 else
26289 ix86_builtins_isa[(int) code].const_p = true;
26290
26291 return decl;
26292 }
26293
26294 /* Add any new builtin functions for a given ISA that may not have been
26295 declared. This saves a bit of space compared to adding all of the
26296 declarations to the tree up front, including the ones that are never used. */
26297
26298 static void
26299 ix86_add_new_builtins (HOST_WIDE_INT isa)
26300 {
26301 int i;
26302
26303 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26304 {
26305 if ((ix86_builtins_isa[i].isa & isa) != 0
26306 && ix86_builtins_isa[i].set_and_not_built_p)
26307 {
26308 tree decl, type;
26309
26310 /* Don't define the builtin again. */
26311 ix86_builtins_isa[i].set_and_not_built_p = false;
26312
26313 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26314 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26315 type, i, BUILT_IN_MD, NULL,
26316 NULL_TREE);
26317
26318 ix86_builtins[i] = decl;
26319 if (ix86_builtins_isa[i].const_p)
26320 TREE_READONLY (decl) = 1;
26321 }
26322 }
26323 }
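/* A sketch of the situation this enables, assuming a compilation without
   -mavx on the command line: the decl for __builtin_ia32_addps256 is only
   recorded by def_builtin, and this function materializes it once function
   specific options turn on the AVX ISA bit, so that user code of roughly
   this shape (illustrative)

	typedef float v8sf __attribute__ ((vector_size (32)));

	__attribute__ ((target ("avx")))
	v8sf add256 (v8sf a, v8sf b)
	{
	  return __builtin_ia32_addps256 (a, b);
	}

   can still resolve the builtin inside the function.  */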
26324
26325 /* Bits for builtin_description.flag. */
26326
26327 /* Set when we don't support the comparison natively, and should
26328 swap the comparison operands in order to support it. */
26329 #define BUILTIN_DESC_SWAP_OPERANDS 1
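/* For instance, SSE2 has no "compare greater than" form of cmppd, so a
   builtin_description entry of roughly this shape (illustrative, not
   copied from the tables below) would request the LT pattern with the
   operands swapped at expansion time:

	{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3,
	  "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT,
	  BUILTIN_DESC_SWAP_OPERANDS },  */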
26330
26331 struct builtin_description
26332 {
26333 const HOST_WIDE_INT mask;
26334 const enum insn_code icode;
26335 const char *const name;
26336 const enum ix86_builtins code;
26337 const enum rtx_code comparison;
26338 const int flag;
26339 };
26340
26341 static const struct builtin_description bdesc_comi[] =
26342 {
26343 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26344 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26345 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26346 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26347 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26348 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26349 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26350 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26351 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26352 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26353 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26354 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26355 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26356 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26357 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26358 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26359 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26360 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26361 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26362 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26363 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26364 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26365 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26366 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26367 };
26368
26369 static const struct builtin_description bdesc_pcmpestr[] =
26370 {
26371 /* SSE4.2 */
26372 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26373 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26374 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26375 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26376 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26377 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26378 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26379 };
26380
26381 static const struct builtin_description bdesc_pcmpistr[] =
26382 {
26383 /* SSE4.2 */
26384 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26385 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26386 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26387 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26388 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26389 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26390 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26391 };
26392
26393 /* Special builtins with variable number of arguments. */
26394 static const struct builtin_description bdesc_special_args[] =
26395 {
26396 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26397 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26398 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26399
26400 /* MMX */
26401 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26402
26403 /* 3DNow! */
26404 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26405
26406 /* SSE */
26407 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26408 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26409 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26410
26411 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26412 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26413 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26414 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26415
26416 /* SSE or 3DNow!A */
26417 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26418 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntdi, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26419
26420 /* SSE2 */
26421 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26422 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26423 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26424 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26425 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26426 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26427 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26428 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26429 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26430 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26431
26432 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26433 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26434
26435 /* SSE3 */
26436 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26437
26438 /* SSE4.1 */
26439 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26440
26441 /* SSE4A */
26442 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26443 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26444
26445 /* AVX */
26446 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26447 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26448
26449 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26450 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26451 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26452 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26453 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26454
26455 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26456 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26457 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26458 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26459 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26460 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26461 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26462
26463 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26464 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26465 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26466
26467 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26468 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26469 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26470 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26471 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26472 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26473 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26474 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26475
26476 /* AVX2 */
26477 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26478 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26479 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26480 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26481 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26482 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26483 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26484 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26485 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26486
26487 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26488 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26489 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26490 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26491 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26492 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26493
26494 /* FSGSBASE */
26495 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26496 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26497 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26498 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26499 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26500 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26501 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26502 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26503 };
26504
26505 /* Builtins with variable number of arguments. */
26506 static const struct builtin_description bdesc_args[] =
26507 {
26508 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26509 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26510 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26511 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26512 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26513 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26514 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26515
26516 /* MMX */
26517 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26518 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26519 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26520 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26521 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26522 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26523
26524 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26525 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26526 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26527 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26528 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26529 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26530 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26531 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26532
26533 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26534 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26535
26536 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26537 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26538 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26539 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26540
26541 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26542 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26543 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26544 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26545 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26546 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26547
26548 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26549 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26550 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26551 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26552 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26553 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26554
26555 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26556 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26557 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26558
26559 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26560
26561 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26562 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26563 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26564 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26565 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26566 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26567
26568 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26569 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26570 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26571 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26572 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26573 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26574
26575 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26576 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26577 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26578 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26579
26580 /* 3DNow! */
26581 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26582 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26583 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26584 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26585
26586 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26587 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26588 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26589 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26590 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26591 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26592 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26593 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26594 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26595 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26596 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26597 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26598 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26599 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26600 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26601
26602 /* 3DNow!A */
26603 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26604 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26605 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26606 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26607 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26608 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26609
26610 /* SSE */
26611 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26612 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26613 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26614 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26615 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26616 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26617 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26618 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26619 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26620 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26621 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26622 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26623
26624 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26625
26626 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26627 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26628 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26629 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26630 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26631 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26632 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26633 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26634
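  /* For the compare builtins below, the fifth field is the rtx comparison
     code handed to the expander.  SSE has no native packed GT/GE compare,
     so the *_SWAP flavors (cmpgtps, cmpgeps, ...) are expanded as the
     reversed LT/LE comparison with the two operands swapped.  */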
26635 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26636 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26637 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26638 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26639 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26640 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26641 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26642 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26643 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26644 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26645 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26646 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26647 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26648 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26649 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26650 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26651 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26652 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26653 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26654 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26655 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26656 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26657
26658 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26659 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26660 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26661 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26662
26663 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26664 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26665 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26666 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26667
26668 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26669
26670 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26671 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26672 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26673 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26674 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26675
26676 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26677 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26678 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
26679
26680 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26681
26682 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26683 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26684 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26685
26686 /* SSE MMX or 3DNow!A */
26687 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26688 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26689 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26690
26691 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26692 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26693 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26694 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26695
26696 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26697 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26698
26699 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26700
26701 /* SSE2 */
26702 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26703
26704 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26705 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26706 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26707 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26708 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26709
26710 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26711 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26712 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26713 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26714 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26715
26716 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26717
26718 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26719 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26720 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26721 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26722
26723 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26724 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26725 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26726
26727 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26728 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26729 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26730 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26731 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26732 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26733 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26734 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26735
26736 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26737 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26738 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26739 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26740 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26741 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26742 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26743 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26744 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26745 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26746 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26747 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26748 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26749 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26750 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26751 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26752 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26753 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26754 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26755 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26756
26757 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26758 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26759 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26760 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26761
26762 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26763 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26764 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26765 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26766
26767 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26768
26769 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26770 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26771 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26772
26773 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26774
26775 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26776 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26777 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26778 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26779 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26780 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26781 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26782 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26783
26784 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26785 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26786 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26787 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26788 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26789 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26790 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26791 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26792
26793 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26794 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26795
26796 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26797 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26798 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26799 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26800
26801 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26802 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26803
26804 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26805 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26806 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26807 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26808 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26809 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26810
26811 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26812 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26813 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26814 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26815
26816 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26817 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26818 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26819 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26820 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26821 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26822 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26823 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26824
26825 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26826 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26827 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26828
26829 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26830 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26831
26832 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26833 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26834
26835 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26836
26837 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26838 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26839 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26840 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26841
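  /* Descriptive note: the ..._INT_CONVERT signatures below indicate that
     the builtin is exposed on V2DImode while the underlying pattern
     (sse2_ashlv1ti3 / sse2_lshrv1ti3) operates on the full 128-bit
     V1TImode value, so the expander converts between the two modes.  */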
26842 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26843 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26844 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26845 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26846 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26847 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26848 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26849
26850 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26851 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26852 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26853 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26854 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26855 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26856 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26857
26858 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26859 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26860 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26861 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26862
26863 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26864 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26865 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26866
26867 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26868
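  /* Descriptive note: entries with a null name are not registered from
     this table; the corresponding builtins (e.g. the TFmode
     __builtin_fabsq and __builtin_copysignq) are declared separately
     elsewhere in this file, and the rows here only supply the insn
     mapping used at expansion time.  */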
26869 { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26870 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26871
26872 { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26873
26874 /* SSE2 MMX */
26875 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26876 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26877
26878 /* SSE3 */
26879 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26880 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26881
26882 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26883 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26884 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26885 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26886 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26887 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26888
26889 /* SSSE3 */
26890 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26891 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26892 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26893 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26894 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26895 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26896
26897 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26898 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26899 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26900 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26901 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26902 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26903 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26904 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26905 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26906 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26907 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26908 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26909 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26910 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26911 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26912 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26913 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26914 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26915 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26916 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26917 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26918 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26919 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26920 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26921
26922 /* SSSE3. */
26923 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26924 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26925
26926 /* SSE4.1 */
26927 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26928 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26929 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26930 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26931 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26932 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26933 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26934 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26935 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26936 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26937
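  /* Descriptive note: the pmovsx/pmovzx builtins take a full 128-bit
     source operand but only widen its low elements, e.g. pmovsxbw128
     sign-extends the low eight bytes of the V16QI argument into a V8HI
     result.  */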
26938 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26939 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26940 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26941 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26942 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26943 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26944 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26945 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26946 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26947 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26948 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26949 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26950 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26951
26952 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26953 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26954 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26955 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26956 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26957 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26958 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26959 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26960 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26961 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26962 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26963 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26964
26965 /* SSE4.1 */
26966 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26967 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26968 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26969 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26970
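  /* Descriptive note: for the floor/ceil/trunc/rint variants the
     comparison-code field is reused (hence the enum rtx_code casts) to
     carry the ROUND_* control value, which the expander passes as the
     rounding-mode immediate of the roundpd/roundps patterns.  */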
26971 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26972 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26973 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26974 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26975
26976 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26977 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26978
26979 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26980 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26981
26982 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26983 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26984 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26985 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26986
26987 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26988 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26989
26990 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26991 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26992
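  /* Descriptive note: all three ptest builtins expand through the same
     pattern; the comparison code selects which flag the result tests:
     EQ checks ZF (ptestz), LTU checks CF (ptestc), and GTU checks that
     neither flag is set (ptestnzc).  */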
26993 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26994 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26995 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26996
26997 /* SSE4.2 */
26998 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26999 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27000 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27001 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27002 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
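  /* Illustrative note: the crc32 builtins accumulate into their first
     operand, so a typical use is  crc = __builtin_ia32_crc32qi (crc, byte);  */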
27003
27004 /* SSE4A */
27005 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27006 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27007 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27008 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27009
27010 /* AES */
27011 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27012 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27013
27014 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27015 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27016 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27017 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27018
27019 /* PCLMUL */
27020 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27021
27022 /* AVX */
27023 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27024 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27025 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27026 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27027 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27028 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27029 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27030 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27031 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27032 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27033 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27034 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27035 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27036 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27037 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27038 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27039 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27040 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27041 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27042 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27043 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27044 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27045 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27046 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27047 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27048 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27049
27050 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27051 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27052 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27053 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27054
27055 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27056 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27057 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27058 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27059 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27060 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27061 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27062 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27063 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27064 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27065 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27066 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27067 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27068 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27069 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27070 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27071 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27072 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27073 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27074 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27075 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27076 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27077 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27078 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27079 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27080 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27081 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27082 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27083 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27084 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27085 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27086 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27087 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27088 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27089
27090 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27091 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27092 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27093
27094 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27095 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27096 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27097 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27098 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27099
27100 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27101
27102 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27103 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27104
27105 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27106 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27107 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27108 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27109
27110 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27111 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27112
27113 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27114 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27115
27116 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27117 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27118 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27119 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27120
27121 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27122 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27123
27124 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27125 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27126
27127 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27128 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27129 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27130 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27131
27132 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27133 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27134 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27135 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27136 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27137 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27138
27139 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27140 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27141 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27142 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27143 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27144 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27145 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27146 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27147 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27148 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27149 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27150 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27151 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27152 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27153 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27154
27155 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27156 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27157
27158 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27159 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27160
27161 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27162
27163 /* AVX2 */
27164 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27165 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27166 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27167 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27168 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27169 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27170 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27171 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27172 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27173 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27174 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27175 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27176 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27177 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27178 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27179 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27180 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27181 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27182 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27183 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27184 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27185 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27186 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27187 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27188 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27189 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27190 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27191 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27192 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27193 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27194 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27195 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27196 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27197 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27198 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27199 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27200 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27201 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27202 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27203 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27204 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27205 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27206 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27207 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27208 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27209 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27210 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27211 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27212 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27213 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27214 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27215 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27216 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27217 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27218 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27219 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27220 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27221 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27222 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27223 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27224 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27225 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27226 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27227 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27228 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27229 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27230 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27231 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27232 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27233 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27234 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27235 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27236 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27237 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27238 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27239 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27240 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27241 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27242 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27243 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27244 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27245 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27246 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27247 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27248 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27249 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27250 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27251 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27252 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27253 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27254 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27255 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27256 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27257 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27258 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27259 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27260 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27261 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27262 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27263 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27264 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27265 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27266 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27267 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27268 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27269 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27270 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27271 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27272 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27273 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27274 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27275 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27276 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27277 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27278 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27279 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27280 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27281 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27282 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27283 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27284 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27285 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27286 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27287 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27288 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27289 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27290 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27291 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27292 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27293 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27294 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27295 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27296 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27297 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27298 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27299 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27300 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27301 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27302 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27303 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27304 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27305 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27306 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27307 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27308 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27309 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27310
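  /* LZCNT */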
27311 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27312
27313 /* BMI */
27314 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27315 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27316 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27317
27318 /* TBM */
27319 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27320 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27321
27322 /* F16C */
27323 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27324 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27325 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27326 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27327
27328 /* BMI2 */
27329 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27330 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27331 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27332 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27333 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27334 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27335 };
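/* Each entry in the table above follows the same layout as the other
   bdesc_* tables in this file: the ISA option mask that must be enabled,
   the insn code of the pattern to expand, the user-visible builtin name,
   the IX86_BUILTIN_* enumerator, an rtx comparison code where one is
   needed (UNKNOWN otherwise), and a flag selecting the builtin's function
   prototype.  The tables are walked during target builtin initialization,
   where the mask decides whether each builtin is made available.  */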
27336
27337 /* FMA4 and XOP. */
27338 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27339 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27340 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27341 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27342 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27343 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27344 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27345 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27346 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27347 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27348 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27349 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27350 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27351 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27352 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27353 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27354 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27355 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27356 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27357 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27358 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27359 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27360 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27361 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27362 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27363 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27364 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27365 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27366 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27367 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27368 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27369 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27370 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27371 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27372 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27373 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27374 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27375 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27376 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27377 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27378 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27379 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27380 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27381 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27382 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27383 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27384 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27385 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27386 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27387 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27388 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27389 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
27390
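/* A minimal reading of the FMA4/XOP table that follows: the MULTI_ARG_*
   macros above are just shorthand for the V*_FTYPE_* prototype
   enumerators, so the first entry, for example, registers
   "__builtin_ia32_vfmaddss" under OPTION_MASK_ISA_FMA4, expands it via
   CODE_FOR_fma4i_vmfmadd_v4sf, and gives it the MULTI_ARG_3_SF
   prototype, i.e. V4SF (V4SF, V4SF, V4SF).  */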
27391 static const struct builtin_description bdesc_multi_arg[] =
27392 {
27393 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27394 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27395 UNKNOWN, (int)MULTI_ARG_3_SF },
27396 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27397 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27398 UNKNOWN, (int)MULTI_ARG_3_DF },
27399
27400 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27401 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27402 UNKNOWN, (int)MULTI_ARG_3_SF },
27403 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27404 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27405 UNKNOWN, (int)MULTI_ARG_3_DF },
27406
27407 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27408 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27409 UNKNOWN, (int)MULTI_ARG_3_SF },
27410 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27411 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27412 UNKNOWN, (int)MULTI_ARG_3_DF },
27413 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27414 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27415 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27416 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27417 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27418 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27419
27420 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27421 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27422 UNKNOWN, (int)MULTI_ARG_3_SF },
27423 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27424 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27425 UNKNOWN, (int)MULTI_ARG_3_DF },
27426 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27427 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27428 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27429 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27430 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27431 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27432
27433 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27434 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27435 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27436 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27437 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
27438 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27439 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27440
27441 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27442 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27443 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27444 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27445 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27446 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27447 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27448
27449 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27450
27451 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27452 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27453 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27454 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27455 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27456 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27457 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27458 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27459 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27460 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27461 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27462 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27463
27464 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27465 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27466 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27467 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27468 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27469 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27470 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27471 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27472 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27473 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27474 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27475 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27476 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27477 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27478 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27479 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27480
27481 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27482 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27483 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27484 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27485 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27486 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27487
27488 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27489 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27490 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27491 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27492 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27493 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27494 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27495 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27496 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27497 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27498 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27499 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27500 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27501 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27502 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27503
27504 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27505 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27506 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27507 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27508 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27509 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27510 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27511
27512 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27513 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27514 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27515 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27516 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27517 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27518 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27519
27520 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27521 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27522 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27523 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27524 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27525 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27526 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27527
27528 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27529 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27530 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27531 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27532 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27533 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27534 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27535
27536 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3, "__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27537 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3, "__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27538 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3, "__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27539 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27540 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27541 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27542 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27543
27544 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27545 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27546 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27547 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27548 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27549 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27550 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27551
27552 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27553 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27554 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27555 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27556 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27557 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27558 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27559
27560 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27561 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27562 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27563 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
27564 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
27565 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
27566 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
27567
27568 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27569 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27570 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27571 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27572 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27573 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27574 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27575 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27576
27577 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27578 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27579 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27580 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27581 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27582 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27583 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27584 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27585
27586 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27587 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27588 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27589 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27590
27591 };
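
/* Illustrative sketch, not part of the compiler: each vpcom* entry above
   exposes one signed or unsigned comparison as a two-operand builtin whose
   rtx_code selects the condition.  Assuming -mxop and the usual vector
   typedef, a direct use of the LT form looks roughly like this (the
   supported spelling is the xopintrin.h wrapper, e.g. _mm_comlt_epi32):

       typedef int __v4si __attribute__ ((__vector_size__ (16)));

       __v4si
       less_than_mask (__v4si a, __v4si b)
       {
         return __builtin_ia32_vpcomltd (a, b);
       }

   ix86_expand_multi_arg_builtin below turns this into the xop_maskcmpv4si3
   pattern with sub_code LT.  */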
27592 \f
27593 /* TM vector builtins. */
27594
27595 /* Reuse the existing x86-specific `struct builtin_description', since it
27596 is close enough.  Add casts to make the generic TM builtin codes fit. */
27597 static const struct builtin_description bdesc_tm[] =
27598 {
27599 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27600 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27601 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27602 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27603 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27604 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27605 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27606
27607 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27608 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27609 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27610 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27611 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27612 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27613 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27614
27615 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27616 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27617 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27618 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27619 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27620 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27621 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27622
27623 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27624 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27625 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27626 };
27627
27628 /* TM callbacks. */
27629
27630 /* Return the builtin decl needed to load a vector of TYPE. */
27631
27632 static tree
27633 ix86_builtin_tm_load (tree type)
27634 {
27635 if (TREE_CODE (type) == VECTOR_TYPE)
27636 {
27637 switch (tree_low_cst (TYPE_SIZE (type), 1))
27638 {
27639 case 64:
27640 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27641 case 128:
27642 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27643 case 256:
27644 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27645 }
27646 }
27647 return NULL_TREE;
27648 }
27649
27650 /* Return the builtin decl needed to store a vector of TYPE. */
27651
27652 static tree
27653 ix86_builtin_tm_store (tree type)
27654 {
27655 if (TREE_CODE (type) == VECTOR_TYPE)
27656 {
27657 switch (tree_low_cst (TYPE_SIZE (type), 1))
27658 {
27659 case 64:
27660 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27661 case 128:
27662 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27663 case 256:
27664 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27665 }
27666 }
27667 return NULL_TREE;
27668 }
27669 \f
27670 /* Initialize the transactional memory vector load/store builtins. */
27671
27672 static void
27673 ix86_init_tm_builtins (void)
27674 {
27675 enum ix86_builtin_func_type ftype;
27676 const struct builtin_description *d;
27677 size_t i;
27678 tree decl;
27679 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27680 tree attrs_log, attrs_type_log;
27681
27682 if (!flag_tm)
27683 return;
27684
27685 /* Use whatever attributes a normal TM load has. */
27686 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27687 attrs_load = DECL_ATTRIBUTES (decl);
27688 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27689 /* Use whatever attributes a normal TM store has. */
27690 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27691 attrs_store = DECL_ATTRIBUTES (decl);
27692 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27693 /* Use whatever attributes a normal TM log has. */
27694 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27695 attrs_log = DECL_ATTRIBUTES (decl);
27696 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27697
27698 for (i = 0, d = bdesc_tm;
27699 i < ARRAY_SIZE (bdesc_tm);
27700 i++, d++)
27701 {
27702 if ((d->mask & ix86_isa_flags) != 0
27703 || (lang_hooks.builtin_function
27704 == lang_hooks.builtin_function_ext_scope))
27705 {
27706 tree type, attrs, attrs_type;
27707 enum built_in_function code = (enum built_in_function) d->code;
27708
27709 ftype = (enum ix86_builtin_func_type) d->flag;
27710 type = ix86_get_builtin_func_type (ftype);
27711
27712 if (BUILTIN_TM_LOAD_P (code))
27713 {
27714 attrs = attrs_load;
27715 attrs_type = attrs_type_load;
27716 }
27717 else if (BUILTIN_TM_STORE_P (code))
27718 {
27719 attrs = attrs_store;
27720 attrs_type = attrs_type_store;
27721 }
27722 else
27723 {
27724 attrs = attrs_log;
27725 attrs_type = attrs_type_log;
27726 }
27727 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27728 /* The builtin name without the "__builtin_" prefix, for
27729 calling the function directly. */
27730 d->name + strlen ("__builtin_"),
27731 attrs);
27732 /* add_builtin_function () will set the DECL_ATTRIBUTES; now
27733 set the TYPE_ATTRIBUTES. */
27734 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27735
27736 set_builtin_decl (code, decl, false);
27737 }
27738 }
27739 }
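
/* Illustrative sketch, not part of the compiler: after the loop above each
   TM vector builtin is visible both under its "__builtin__ITM_*" name and,
   through the extra name passed to add_builtin_function, as the bare
   "_ITM_*" symbol.  Assuming -fgnu-tm -msse and the usual vector typedef,
   a transactional 128-bit store could in principle be written directly as:

       typedef float __v4sf __attribute__ ((__vector_size__ (16)));

       void
       tm_store_vec (__v4sf *p, __v4sf v)
       {
         __builtin__ITM_WM128 (p, v);
       }

   In practice the TM lowering pass emits these calls itself through
   ix86_builtin_tm_load and ix86_builtin_tm_store above; the direct call is
   shown only to document the registered VOID_FTYPE_PV4SF_V4SF signature.  */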
27740
27741 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
27742 not in the current target ISA, so that the user can compile particular
27743 modules with target-specific options that differ from the command-line
27744 options. */
27745 static void
27746 ix86_init_mmx_sse_builtins (void)
27747 {
27748 const struct builtin_description * d;
27749 enum ix86_builtin_func_type ftype;
27750 size_t i;
27751
27752 /* Add all special builtins with variable number of operands. */
27753 for (i = 0, d = bdesc_special_args;
27754 i < ARRAY_SIZE (bdesc_special_args);
27755 i++, d++)
27756 {
27757 if (d->name == 0)
27758 continue;
27759
27760 ftype = (enum ix86_builtin_func_type) d->flag;
27761 def_builtin (d->mask, d->name, ftype, d->code);
27762 }
27763
27764 /* Add all builtins with variable number of operands. */
27765 for (i = 0, d = bdesc_args;
27766 i < ARRAY_SIZE (bdesc_args);
27767 i++, d++)
27768 {
27769 if (d->name == 0)
27770 continue;
27771
27772 ftype = (enum ix86_builtin_func_type) d->flag;
27773 def_builtin_const (d->mask, d->name, ftype, d->code);
27774 }
27775
27776 /* pcmpestr[im] insns. */
27777 for (i = 0, d = bdesc_pcmpestr;
27778 i < ARRAY_SIZE (bdesc_pcmpestr);
27779 i++, d++)
27780 {
27781 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27782 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27783 else
27784 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27785 def_builtin_const (d->mask, d->name, ftype, d->code);
27786 }
27787
27788 /* pcmpistr[im] insns. */
27789 for (i = 0, d = bdesc_pcmpistr;
27790 i < ARRAY_SIZE (bdesc_pcmpistr);
27791 i++, d++)
27792 {
27793 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27794 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27795 else
27796 ftype = INT_FTYPE_V16QI_V16QI_INT;
27797 def_builtin_const (d->mask, d->name, ftype, d->code);
27798 }
27799
27800 /* comi/ucomi insns. */
27801 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27802 {
27803 if (d->mask == OPTION_MASK_ISA_SSE2)
27804 ftype = INT_FTYPE_V2DF_V2DF;
27805 else
27806 ftype = INT_FTYPE_V4SF_V4SF;
27807 def_builtin_const (d->mask, d->name, ftype, d->code);
27808 }
27809
27810 /* SSE */
27811 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27812 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27813 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27814 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27815
27816 /* SSE or 3DNow!A */
27817 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27818 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27819 IX86_BUILTIN_MASKMOVQ);
27820
27821 /* SSE2 */
27822 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27823 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27824
27825 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27826 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27827 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27828 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27829
27830 /* SSE3. */
27831 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27832 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27833 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27834 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27835
27836 /* AES */
27837 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27838 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27839 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27840 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27841 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27842 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27843 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27844 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27845 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27846 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27847 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27848 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27849
27850 /* PCLMUL */
27851 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27852 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27853
27854 /* RDRND */
27855 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27856 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27857 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27858 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27859 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27860 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27861 IX86_BUILTIN_RDRAND64_STEP);
27862
27863 /* AVX2 */
27864 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27865 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27866 IX86_BUILTIN_GATHERSIV2DF);
27867
27868 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27869 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27870 IX86_BUILTIN_GATHERSIV4DF);
27871
27872 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27873 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27874 IX86_BUILTIN_GATHERDIV2DF);
27875
27876 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27877 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27878 IX86_BUILTIN_GATHERDIV4DF);
27879
27880 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27881 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27882 IX86_BUILTIN_GATHERSIV4SF);
27883
27884 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27885 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27886 IX86_BUILTIN_GATHERSIV8SF);
27887
27888 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27889 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27890 IX86_BUILTIN_GATHERDIV4SF);
27891
27892 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27893 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27894 IX86_BUILTIN_GATHERDIV8SF);
27895
27896 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27897 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27898 IX86_BUILTIN_GATHERSIV2DI);
27899
27900 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27901 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27902 IX86_BUILTIN_GATHERSIV4DI);
27903
27904 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27905 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27906 IX86_BUILTIN_GATHERDIV2DI);
27907
27908 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27909 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27910 IX86_BUILTIN_GATHERDIV4DI);
27911
27912 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27913 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27914 IX86_BUILTIN_GATHERSIV4SI);
27915
27916 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27917 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27918 IX86_BUILTIN_GATHERSIV8SI);
27919
27920 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27921 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27922 IX86_BUILTIN_GATHERDIV4SI);
27923
27924 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27925 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27926 IX86_BUILTIN_GATHERDIV8SI);
27927
27928 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
27929 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27930 IX86_BUILTIN_GATHERALTSIV4DF);
27931
27932 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
27933 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27934 IX86_BUILTIN_GATHERALTDIV8SF);
27935
27936 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
27937 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27938 IX86_BUILTIN_GATHERALTSIV4DI);
27939
27940 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
27941 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27942 IX86_BUILTIN_GATHERALTDIV8SI);
27943
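/* Illustrative sketch, not part of the compiler: the gather builtins above
   all take (src, base, index, mask, scale) with scale required to be a 1,
   2, 4 or 8 immediate; the avx2intrin.h wrappers such as
   _mm256_i32gather_pd are built on them.  A direct use of
   __builtin_ia32_gathersiv4df, assuming the usual vector typedefs, is
   roughly:

       typedef double __v4df __attribute__ ((__vector_size__ (32)));
       typedef int    __v4si __attribute__ ((__vector_size__ (16)));

       __v4df
       gather_doubles (double const *base, __v4si idx, __v4df src,
                       __v4df mask)
       {
         return __builtin_ia32_gathersiv4df (src, base, idx, mask, 8);
       }

   Elements whose mask sign bit is set are loaded from base + idx * 8;
   the others keep the corresponding element of src.  */
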
27944 /* MMX access to the vec_init patterns. */
27945 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27946 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27947
27948 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27949 V4HI_FTYPE_HI_HI_HI_HI,
27950 IX86_BUILTIN_VEC_INIT_V4HI);
27951
27952 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27953 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27954 IX86_BUILTIN_VEC_INIT_V8QI);
27955
27956 /* Access to the vec_extract patterns. */
27957 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27958 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27959 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27960 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27961 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27962 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27963 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27964 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27965 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27966 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27967
27968 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27969 "__builtin_ia32_vec_ext_v4hi",
27970 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27971
27972 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27973 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27974
27975 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27976 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
27977
27978 /* Access to the vec_set patterns. */
27979 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27980 "__builtin_ia32_vec_set_v2di",
27981 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27982
27983 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27984 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27985
27986 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27987 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27988
27989 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27990 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27991
27992 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27993 "__builtin_ia32_vec_set_v4hi",
27994 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27995
27996 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27997 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
27998
27999 /* Add FMA4 and XOP multi-arg builtins. */
28000 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28001 {
28002 if (d->name == 0)
28003 continue;
28004
28005 ftype = (enum ix86_builtin_func_type) d->flag;
28006 def_builtin_const (d->mask, d->name, ftype, d->code);
28007 }
28008 }
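
/* Illustrative sketch, not part of the compiler: the ad-hoc definitions
   above give the MXCSR accessors the C signatures
   unsigned __builtin_ia32_stmxcsr (void) and
   void __builtin_ia32_ldmxcsr (unsigned).  A flush-to-zero toggle built on
   them (normally spelled through the xmmintrin.h _MM_SET_FLUSH_ZERO_MODE
   macro) would look roughly like:

       void
       enable_ftz (void)
       {
         unsigned int mxcsr = __builtin_ia32_stmxcsr ();
         __builtin_ia32_ldmxcsr (mxcsr | 0x8000);
       }

   0x8000 is the MXCSR flush-to-zero bit (_MM_FLUSH_ZERO_ON).  */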
28009
28010 /* Internal subroutine of ix86_init_builtins. */
28011
28012 static void
28013 ix86_init_builtins_va_builtins_abi (void)
28014 {
28015 tree ms_va_ref, sysv_va_ref;
28016 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
28017 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
28018 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
28019 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
28020
28021 if (!TARGET_64BIT)
28022 return;
28023 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
28024 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
28025 ms_va_ref = build_reference_type (ms_va_list_type_node);
28026 sysv_va_ref =
28027 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
28028
28029 fnvoid_va_end_ms =
28030 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28031 fnvoid_va_start_ms =
28032 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
28033 fnvoid_va_end_sysv =
28034 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
28035 fnvoid_va_start_sysv =
28036 build_varargs_function_type_list (void_type_node, sysv_va_ref,
28037 NULL_TREE);
28038 fnvoid_va_copy_ms =
28039 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
28040 NULL_TREE);
28041 fnvoid_va_copy_sysv =
28042 build_function_type_list (void_type_node, sysv_va_ref,
28043 sysv_va_ref, NULL_TREE);
28044
28045 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
28046 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
28047 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
28048 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
28049 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
28050 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
28051 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
28052 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28053 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
28054 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28055 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
28056 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
28057 }
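
/* Illustrative sketch, not part of the compiler: the builtins registered
   above are only available on 64-bit targets and let a compilation unit
   mix the two calling conventions, e.g. an ms_abi variadic function built
   in a SysV translation unit (all names other than the __builtin_ms_va_*
   builtins are illustrative; argument extraction is omitted):

       void consume (int x);

       __attribute__ ((ms_abi)) void
       ms_variadic (int count, ...)
       {
         __builtin_ms_va_list ap;

         __builtin_ms_va_start (ap, count);
         consume (count);
         __builtin_ms_va_end (ap);
       }

   __builtin_ms_va_copy pairs with these in the same way va_copy pairs
   with va_start and va_end.  */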
28058
28059 static void
28060 ix86_init_builtin_types (void)
28061 {
28062 tree float128_type_node, float80_type_node;
28063
28064 /* The __float80 type. */
28065 float80_type_node = long_double_type_node;
28066 if (TYPE_MODE (float80_type_node) != XFmode)
28067 {
28068 /* Build a distinct 80-bit type for __float80. */
28069 float80_type_node = make_node (REAL_TYPE);
28070
28071 TYPE_PRECISION (float80_type_node) = 80;
28072 layout_type (float80_type_node);
28073 }
28074 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
28075
28076 /* The __float128 type. */
28077 float128_type_node = make_node (REAL_TYPE);
28078 TYPE_PRECISION (float128_type_node) = 128;
28079 layout_type (float128_type_node);
28080 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
28081
28082 /* This macro is built by i386-builtin-types.awk. */
28083 DEFINE_BUILTIN_PRIMITIVE_TYPES;
28084 }
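
/* Illustrative sketch, not part of the compiler: after the registrations
   above, "__float80" names the 80-bit extended type (it is simply long
   double when long double already has XFmode, otherwise a separate 80-bit
   REAL_TYPE) and "__float128" names the 128-bit TFmode type, so user code
   such as the following is accepted on x86 targets (the 'q' suffix is the
   GNU extension for __float128 constants):

       __float80  extended = 1.0L;
       __float128 quad = 1.0q;
*/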
28085
28086 static void
28087 ix86_init_builtins (void)
28088 {
28089 tree t;
28090
28091 ix86_init_builtin_types ();
28092
28093 /* TFmode support builtins. */
28094 def_builtin_const (0, "__builtin_infq",
28095 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
28096 def_builtin_const (0, "__builtin_huge_valq",
28097 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
28098
28099 /* We will expand them to a normal call if SSE2 isn't available, since
28100 they are used by libgcc. */
28101 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
28102 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
28103 BUILT_IN_MD, "__fabstf2", NULL_TREE);
28104 TREE_READONLY (t) = 1;
28105 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
28106
28107 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
28108 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
28109 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
28110 TREE_READONLY (t) = 1;
28111 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
28112
28113 ix86_init_tm_builtins ();
28114 ix86_init_mmx_sse_builtins ();
28115
28116 if (TARGET_LP64)
28117 ix86_init_builtins_va_builtins_abi ();
28118
28119 #ifdef SUBTARGET_INIT_BUILTINS
28120 SUBTARGET_INIT_BUILTINS;
28121 #endif
28122 }
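
/* Illustrative sketch, not part of the compiler: the TFmode builtins
   defined above back the __float128 helpers; __builtin_fabsq and
   __builtin_copysignq fall back to the libgcc routines __fabstf2 and
   __copysigntf3 when they cannot be expanded inline.  For example:

       __float128
       magnitude_with_sign (__float128 x, __float128 sign)
       {
         return __builtin_copysignq (__builtin_fabsq (x), sign);
       }

   Both are marked TREE_READONLY above, i.e. treated as const functions.  */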
28123
28124 /* Return the ix86 builtin for CODE. */
28125
28126 static tree
28127 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
28128 {
28129 if (code >= IX86_BUILTIN_MAX)
28130 return error_mark_node;
28131
28132 return ix86_builtins[code];
28133 }
28134
28135 /* Errors in the source file can cause expand_expr to return const0_rtx
28136 where we expect a vector. To avoid crashing, use one of the vector
28137 clear instructions. */
28138 static rtx
28139 safe_vector_operand (rtx x, enum machine_mode mode)
28140 {
28141 if (x == const0_rtx)
28142 x = CONST0_RTX (mode);
28143 return x;
28144 }
28145
28146 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
28147
28148 static rtx
28149 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
28150 {
28151 rtx pat;
28152 tree arg0 = CALL_EXPR_ARG (exp, 0);
28153 tree arg1 = CALL_EXPR_ARG (exp, 1);
28154 rtx op0 = expand_normal (arg0);
28155 rtx op1 = expand_normal (arg1);
28156 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28157 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28158 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
28159
28160 if (VECTOR_MODE_P (mode0))
28161 op0 = safe_vector_operand (op0, mode0);
28162 if (VECTOR_MODE_P (mode1))
28163 op1 = safe_vector_operand (op1, mode1);
28164
28165 if (optimize || !target
28166 || GET_MODE (target) != tmode
28167 || !insn_data[icode].operand[0].predicate (target, tmode))
28168 target = gen_reg_rtx (tmode);
28169
28170 if (GET_MODE (op1) == SImode && mode1 == TImode)
28171 {
28172 rtx x = gen_reg_rtx (V4SImode);
28173 emit_insn (gen_sse2_loadd (x, op1));
28174 op1 = gen_lowpart (TImode, x);
28175 }
28176
28177 if (!insn_data[icode].operand[1].predicate (op0, mode0))
28178 op0 = copy_to_mode_reg (mode0, op0);
28179 if (!insn_data[icode].operand[2].predicate (op1, mode1))
28180 op1 = copy_to_mode_reg (mode1, op1);
28181
28182 pat = GEN_FCN (icode) (target, op0, op1);
28183 if (! pat)
28184 return 0;
28185
28186 emit_insn (pat);
28187
28188 return target;
28189 }
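
/* Illustrative sketch, not part of the compiler: for a typical two-operand
   builtin this routine just forces the operands into registers where the
   insn predicates require it and emits the single named pattern.  Assuming
   the usual vector typedef and a plain V2DI addition builtin from
   bdesc_args such as __builtin_ia32_paddq128, the body of

       typedef long long __v2di __attribute__ ((__vector_size__ (16)));

       __v2di
       add_v2di (__v2di a, __v2di b)
       {
         return __builtin_ia32_paddq128 (a, b);
       }

   expands to a single paddq instruction.  The SImode/TImode special case
   above handles builtins whose last operand is matched in TImode but
   arrives as an int; it is widened through a V4SI register with
   gen_sse2_loadd first.  */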
28190
28191 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
28192
28193 static rtx
28194 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
28195 enum ix86_builtin_func_type m_type,
28196 enum rtx_code sub_code)
28197 {
28198 rtx pat;
28199 int i;
28200 int nargs;
28201 bool comparison_p = false;
28202 bool tf_p = false;
28203 bool last_arg_constant = false;
28204 int num_memory = 0;
28205 struct {
28206 rtx op;
28207 enum machine_mode mode;
28208 } args[4];
28209
28210 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28211
28212 switch (m_type)
28213 {
28214 case MULTI_ARG_4_DF2_DI_I:
28215 case MULTI_ARG_4_DF2_DI_I1:
28216 case MULTI_ARG_4_SF2_SI_I:
28217 case MULTI_ARG_4_SF2_SI_I1:
28218 nargs = 4;
28219 last_arg_constant = true;
28220 break;
28221
28222 case MULTI_ARG_3_SF:
28223 case MULTI_ARG_3_DF:
28224 case MULTI_ARG_3_SF2:
28225 case MULTI_ARG_3_DF2:
28226 case MULTI_ARG_3_DI:
28227 case MULTI_ARG_3_SI:
28228 case MULTI_ARG_3_SI_DI:
28229 case MULTI_ARG_3_HI:
28230 case MULTI_ARG_3_HI_SI:
28231 case MULTI_ARG_3_QI:
28232 case MULTI_ARG_3_DI2:
28233 case MULTI_ARG_3_SI2:
28234 case MULTI_ARG_3_HI2:
28235 case MULTI_ARG_3_QI2:
28236 nargs = 3;
28237 break;
28238
28239 case MULTI_ARG_2_SF:
28240 case MULTI_ARG_2_DF:
28241 case MULTI_ARG_2_DI:
28242 case MULTI_ARG_2_SI:
28243 case MULTI_ARG_2_HI:
28244 case MULTI_ARG_2_QI:
28245 nargs = 2;
28246 break;
28247
28248 case MULTI_ARG_2_DI_IMM:
28249 case MULTI_ARG_2_SI_IMM:
28250 case MULTI_ARG_2_HI_IMM:
28251 case MULTI_ARG_2_QI_IMM:
28252 nargs = 2;
28253 last_arg_constant = true;
28254 break;
28255
28256 case MULTI_ARG_1_SF:
28257 case MULTI_ARG_1_DF:
28258 case MULTI_ARG_1_SF2:
28259 case MULTI_ARG_1_DF2:
28260 case MULTI_ARG_1_DI:
28261 case MULTI_ARG_1_SI:
28262 case MULTI_ARG_1_HI:
28263 case MULTI_ARG_1_QI:
28264 case MULTI_ARG_1_SI_DI:
28265 case MULTI_ARG_1_HI_DI:
28266 case MULTI_ARG_1_HI_SI:
28267 case MULTI_ARG_1_QI_DI:
28268 case MULTI_ARG_1_QI_SI:
28269 case MULTI_ARG_1_QI_HI:
28270 nargs = 1;
28271 break;
28272
28273 case MULTI_ARG_2_DI_CMP:
28274 case MULTI_ARG_2_SI_CMP:
28275 case MULTI_ARG_2_HI_CMP:
28276 case MULTI_ARG_2_QI_CMP:
28277 nargs = 2;
28278 comparison_p = true;
28279 break;
28280
28281 case MULTI_ARG_2_SF_TF:
28282 case MULTI_ARG_2_DF_TF:
28283 case MULTI_ARG_2_DI_TF:
28284 case MULTI_ARG_2_SI_TF:
28285 case MULTI_ARG_2_HI_TF:
28286 case MULTI_ARG_2_QI_TF:
28287 nargs = 2;
28288 tf_p = true;
28289 break;
28290
28291 default:
28292 gcc_unreachable ();
28293 }
28294
28295 if (optimize || !target
28296 || GET_MODE (target) != tmode
28297 || !insn_data[icode].operand[0].predicate (target, tmode))
28298 target = gen_reg_rtx (tmode);
28299
28300 gcc_assert (nargs <= 4);
28301
28302 for (i = 0; i < nargs; i++)
28303 {
28304 tree arg = CALL_EXPR_ARG (exp, i);
28305 rtx op = expand_normal (arg);
28306 int adjust = (comparison_p) ? 1 : 0;
28307 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
28308
28309 if (last_arg_constant && i == nargs - 1)
28310 {
28311 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
28312 {
28313 enum insn_code new_icode = icode;
28314 switch (icode)
28315 {
28316 case CODE_FOR_xop_vpermil2v2df3:
28317 case CODE_FOR_xop_vpermil2v4sf3:
28318 case CODE_FOR_xop_vpermil2v4df3:
28319 case CODE_FOR_xop_vpermil2v8sf3:
28320 error ("the last argument must be a 2-bit immediate");
28321 return gen_reg_rtx (tmode);
28322 case CODE_FOR_xop_rotlv2di3:
28323 new_icode = CODE_FOR_rotlv2di3;
28324 goto xop_rotl;
28325 case CODE_FOR_xop_rotlv4si3:
28326 new_icode = CODE_FOR_rotlv4si3;
28327 goto xop_rotl;
28328 case CODE_FOR_xop_rotlv8hi3:
28329 new_icode = CODE_FOR_rotlv8hi3;
28330 goto xop_rotl;
28331 case CODE_FOR_xop_rotlv16qi3:
28332 new_icode = CODE_FOR_rotlv16qi3;
28333 xop_rotl:
28334 if (CONST_INT_P (op))
28335 {
28336 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
28337 op = GEN_INT (INTVAL (op) & mask);
28338 gcc_checking_assert
28339 (insn_data[icode].operand[i + 1].predicate (op, mode));
28340 }
28341 else
28342 {
28343 gcc_checking_assert
28344 (nargs == 2
28345 && insn_data[new_icode].operand[0].mode == tmode
28346 && insn_data[new_icode].operand[1].mode == tmode
28347 && insn_data[new_icode].operand[2].mode == mode
28348 && insn_data[new_icode].operand[0].predicate
28349 == insn_data[icode].operand[0].predicate
28350 && insn_data[new_icode].operand[1].predicate
28351 == insn_data[icode].operand[1].predicate);
28352 icode = new_icode;
28353 goto non_constant;
28354 }
28355 break;
28356 default:
28357 gcc_unreachable ();
28358 }
28359 }
28360 }
28361 else
28362 {
28363 non_constant:
28364 if (VECTOR_MODE_P (mode))
28365 op = safe_vector_operand (op, mode);
28366
28367 /* If we aren't optimizing, only allow one memory operand to be
28368 generated. */
28369 if (memory_operand (op, mode))
28370 num_memory++;
28371
28372 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
28373
28374 if (optimize
28375 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
28376 || num_memory > 1)
28377 op = force_reg (mode, op);
28378 }
28379
28380 args[i].op = op;
28381 args[i].mode = mode;
28382 }
28383
28384 switch (nargs)
28385 {
28386 case 1:
28387 pat = GEN_FCN (icode) (target, args[0].op);
28388 break;
28389
28390 case 2:
28391 if (tf_p)
28392 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
28393 GEN_INT ((int)sub_code));
28394 else if (! comparison_p)
28395 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28396 else
28397 {
28398 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
28399 args[0].op,
28400 args[1].op);
28401
28402 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
28403 }
28404 break;
28405
28406 case 3:
28407 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28408 break;
28409
28410 case 4:
28411 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
28412 break;
28413
28414 default:
28415 gcc_unreachable ();
28416 }
28417
28418 if (! pat)
28419 return 0;
28420
28421 emit_insn (pat);
28422 return target;
28423 }
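
/* Illustrative sketch, not part of the compiler: the last_arg_constant
   handling above is what enforces the "2-bit immediate" diagnostic for the
   vpermil2 builtins in bdesc_multi_arg.  Assuming -mxop and the usual
   vector typedefs, the first of these compiles and the second is rejected
   with that error:

       typedef float __v4sf __attribute__ ((__vector_size__ (16)));
       typedef int   __v4si __attribute__ ((__vector_size__ (16)));

       __v4sf
       permute_ok (__v4sf a, __v4sf b, __v4si sel)
       {
         return __builtin_ia32_vpermil2ps (a, b, sel, 0);
       }

       __v4sf
       permute_rejected (__v4sf a, __v4sf b, __v4si sel, int imm)
       {
         return __builtin_ia32_vpermil2ps (a, b, sel, imm);
       }

   The XOP rotates take the other branch: a constant count is masked to the
   element width, while a variable count falls through to the plain rotl
   patterns via the xop_rotl label.  */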
28424
28425 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
28426 insns with vec_merge. */
28427
28428 static rtx
28429 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
28430 rtx target)
28431 {
28432 rtx pat;
28433 tree arg0 = CALL_EXPR_ARG (exp, 0);
28434 rtx op1, op0 = expand_normal (arg0);
28435 enum machine_mode tmode = insn_data[icode].operand[0].mode;
28436 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
28437
28438 if (optimize || !target
28439 || GET_MODE (target) != tmode
28440 || !insn_data[icode].operand[0].predicate (target, tmode))
28441 target = gen_reg_rtx (tmode);
28442
28443 if (VECTOR_MODE_P (mode0))
28444 op0 = safe_vector_operand (op0, mode0);
28445
28446 if ((optimize && !register_operand (op0, mode0))
28447 || !insn_data[icode].operand[1].predicate (op0, mode0))
28448 op0 = copy_to_mode_reg (mode0, op0);
28449
28450 op1 = op0;
28451 if (!insn_data[icode].operand[2].predicate (op1, mode0))
28452 op1 = copy_to_mode_reg (mode0, op1);
28453
28454 pat = GEN_FCN (icode) (target, op0, op1);
28455 if (! pat)
28456 return 0;
28457 emit_insn (pat);
28458 return target;
28459 }
28460
28461 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
28462
28463 static rtx
28464 ix86_expand_sse_compare (const struct builtin_description *d,
28465 tree exp, rtx target, bool swap)
28466 {
28467 rtx pat;
28468 tree arg0 = CALL_EXPR_ARG (exp, 0);
28469 tree arg1 = CALL_EXPR_ARG (exp, 1);
28470 rtx op0 = expand_normal (arg0);
28471 rtx op1 = expand_normal (arg1);
28472 rtx op2;
28473 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28474 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28475 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28476 enum rtx_code comparison = d->comparison;
28477
28478 if (VECTOR_MODE_P (mode0))
28479 op0 = safe_vector_operand (op0, mode0);
28480 if (VECTOR_MODE_P (mode1))
28481 op1 = safe_vector_operand (op1, mode1);
28482
28483 /* Swap operands if we have a comparison that isn't available in
28484 hardware. */
28485 if (swap)
28486 {
28487 rtx tmp = gen_reg_rtx (mode1);
28488 emit_move_insn (tmp, op1);
28489 op1 = op0;
28490 op0 = tmp;
28491 }
28492
28493 if (optimize || !target
28494 || GET_MODE (target) != tmode
28495 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28496 target = gen_reg_rtx (tmode);
28497
28498 if ((optimize && !register_operand (op0, mode0))
28499 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
28500 op0 = copy_to_mode_reg (mode0, op0);
28501 if ((optimize && !register_operand (op1, mode1))
28502 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
28503 op1 = copy_to_mode_reg (mode1, op1);
28504
28505 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
28506 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28507 if (! pat)
28508 return 0;
28509 emit_insn (pat);
28510 return target;
28511 }
28512
28513 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
28514
28515 static rtx
28516 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
28517 rtx target)
28518 {
28519 rtx pat;
28520 tree arg0 = CALL_EXPR_ARG (exp, 0);
28521 tree arg1 = CALL_EXPR_ARG (exp, 1);
28522 rtx op0 = expand_normal (arg0);
28523 rtx op1 = expand_normal (arg1);
28524 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28525 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28526 enum rtx_code comparison = d->comparison;
28527
28528 if (VECTOR_MODE_P (mode0))
28529 op0 = safe_vector_operand (op0, mode0);
28530 if (VECTOR_MODE_P (mode1))
28531 op1 = safe_vector_operand (op1, mode1);
28532
28533 /* Swap operands if we have a comparison that isn't available in
28534 hardware. */
28535 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
28536 {
28537 rtx tmp = op1;
28538 op1 = op0;
28539 op0 = tmp;
28540 }
28541
28542 target = gen_reg_rtx (SImode);
28543 emit_move_insn (target, const0_rtx);
28544 target = gen_rtx_SUBREG (QImode, target, 0);
28545
28546 if ((optimize && !register_operand (op0, mode0))
28547 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28548 op0 = copy_to_mode_reg (mode0, op0);
28549 if ((optimize && !register_operand (op1, mode1))
28550 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28551 op1 = copy_to_mode_reg (mode1, op1);
28552
28553 pat = GEN_FCN (d->icode) (op0, op1);
28554 if (! pat)
28555 return 0;
28556 emit_insn (pat);
28557 emit_insn (gen_rtx_SET (VOIDmode,
28558 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28559 gen_rtx_fmt_ee (comparison, QImode,
28560 SET_DEST (pat),
28561 const0_rtx)));
28562
28563 return SUBREG_REG (target);
28564 }
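
/* Illustrative sketch, not part of the compiler: the comi expander above
   materializes the comparison result as a QImode SUBREG of a zeroed SImode
   pseudo, so a bdesc_comi builtin such as __builtin_ia32_comieq (the one
   behind _mm_comieq_ss, assuming -msse) is usable as an ordinary scalar
   predicate:

       typedef float __v4sf __attribute__ ((__vector_size__ (16)));

       int
       first_elements_equal (__v4sf a, __v4sf b)
       {
         return __builtin_ia32_comieq (a, b);
       }
*/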
28565
28566 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
28567
28568 static rtx
28569 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
28570 rtx target)
28571 {
28572 rtx pat;
28573 tree arg0 = CALL_EXPR_ARG (exp, 0);
28574 rtx op1, op0 = expand_normal (arg0);
28575 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28576 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28577
28578 if (optimize || target == 0
28579 || GET_MODE (target) != tmode
28580 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28581 target = gen_reg_rtx (tmode);
28582
28583 if (VECTOR_MODE_P (mode0))
28584 op0 = safe_vector_operand (op0, mode0);
28585
28586 if ((optimize && !register_operand (op0, mode0))
28587 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28588 op0 = copy_to_mode_reg (mode0, op0);
28589
28590 op1 = GEN_INT (d->comparison);
28591
28592 pat = GEN_FCN (d->icode) (target, op0, op1);
28593 if (! pat)
28594 return 0;
28595 emit_insn (pat);
28596 return target;
28597 }
28598
28599 static rtx
28600 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28601 tree exp, rtx target)
28602 {
28603 rtx pat;
28604 tree arg0 = CALL_EXPR_ARG (exp, 0);
28605 tree arg1 = CALL_EXPR_ARG (exp, 1);
28606 rtx op0 = expand_normal (arg0);
28607 rtx op1 = expand_normal (arg1);
28608 rtx op2;
28609 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28610 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28611 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28612
28613 if (optimize || target == 0
28614 || GET_MODE (target) != tmode
28615 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28616 target = gen_reg_rtx (tmode);
28617
28618 op0 = safe_vector_operand (op0, mode0);
28619 op1 = safe_vector_operand (op1, mode1);
28620
28621 if ((optimize && !register_operand (op0, mode0))
28622 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28623 op0 = copy_to_mode_reg (mode0, op0);
28624 if ((optimize && !register_operand (op1, mode1))
28625 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28626 op1 = copy_to_mode_reg (mode1, op1);
28627
28628 op2 = GEN_INT (d->comparison);
28629
28630 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28631 if (! pat)
28632 return 0;
28633 emit_insn (pat);
28634 return target;
28635 }
28636
28637 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
28638
28639 static rtx
28640 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28641 rtx target)
28642 {
28643 rtx pat;
28644 tree arg0 = CALL_EXPR_ARG (exp, 0);
28645 tree arg1 = CALL_EXPR_ARG (exp, 1);
28646 rtx op0 = expand_normal (arg0);
28647 rtx op1 = expand_normal (arg1);
28648 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28649 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28650 enum rtx_code comparison = d->comparison;
28651
28652 if (VECTOR_MODE_P (mode0))
28653 op0 = safe_vector_operand (op0, mode0);
28654 if (VECTOR_MODE_P (mode1))
28655 op1 = safe_vector_operand (op1, mode1);
28656
28657 target = gen_reg_rtx (SImode);
28658 emit_move_insn (target, const0_rtx);
28659 target = gen_rtx_SUBREG (QImode, target, 0);
28660
28661 if ((optimize && !register_operand (op0, mode0))
28662 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28663 op0 = copy_to_mode_reg (mode0, op0);
28664 if ((optimize && !register_operand (op1, mode1))
28665 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28666 op1 = copy_to_mode_reg (mode1, op1);
28667
28668 pat = GEN_FCN (d->icode) (op0, op1);
28669 if (! pat)
28670 return 0;
28671 emit_insn (pat);
28672 emit_insn (gen_rtx_SET (VOIDmode,
28673 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28674 gen_rtx_fmt_ee (comparison, QImode,
28675 SET_DEST (pat),
28676 const0_rtx)));
28677
28678 return SUBREG_REG (target);
28679 }
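
/* Illustrative sketch, not part of the compiler: the ptest expander turns
   the SSE4.1/AVX PTEST builtins into a flag test read back as a scalar,
   e.g. a zero check through the builtin behind _mm_testz_si128 (assuming
   -msse4.1 and the usual vector typedef):

       typedef long long __v2di __attribute__ ((__vector_size__ (16)));

       int
       all_masked_bits_zero (__v2di mask, __v2di value)
       {
         return __builtin_ia32_ptestz128 (mask, value);
       }
*/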
28680
28681 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
28682
28683 static rtx
28684 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28685 tree exp, rtx target)
28686 {
28687 rtx pat;
28688 tree arg0 = CALL_EXPR_ARG (exp, 0);
28689 tree arg1 = CALL_EXPR_ARG (exp, 1);
28690 tree arg2 = CALL_EXPR_ARG (exp, 2);
28691 tree arg3 = CALL_EXPR_ARG (exp, 3);
28692 tree arg4 = CALL_EXPR_ARG (exp, 4);
28693 rtx scratch0, scratch1;
28694 rtx op0 = expand_normal (arg0);
28695 rtx op1 = expand_normal (arg1);
28696 rtx op2 = expand_normal (arg2);
28697 rtx op3 = expand_normal (arg3);
28698 rtx op4 = expand_normal (arg4);
28699 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28700
28701 tmode0 = insn_data[d->icode].operand[0].mode;
28702 tmode1 = insn_data[d->icode].operand[1].mode;
28703 modev2 = insn_data[d->icode].operand[2].mode;
28704 modei3 = insn_data[d->icode].operand[3].mode;
28705 modev4 = insn_data[d->icode].operand[4].mode;
28706 modei5 = insn_data[d->icode].operand[5].mode;
28707 modeimm = insn_data[d->icode].operand[6].mode;
28708
28709 if (VECTOR_MODE_P (modev2))
28710 op0 = safe_vector_operand (op0, modev2);
28711 if (VECTOR_MODE_P (modev4))
28712 op2 = safe_vector_operand (op2, modev4);
28713
28714 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28715 op0 = copy_to_mode_reg (modev2, op0);
28716 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28717 op1 = copy_to_mode_reg (modei3, op1);
28718 if ((optimize && !register_operand (op2, modev4))
28719 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28720 op2 = copy_to_mode_reg (modev4, op2);
28721 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28722 op3 = copy_to_mode_reg (modei5, op3);
28723
28724 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28725 {
28726 error ("the fifth argument must be an 8-bit immediate");
28727 return const0_rtx;
28728 }
28729
28730 if (d->code == IX86_BUILTIN_PCMPESTRI128)
28731 {
28732 if (optimize || !target
28733 || GET_MODE (target) != tmode0
28734 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28735 target = gen_reg_rtx (tmode0);
28736
28737 scratch1 = gen_reg_rtx (tmode1);
28738
28739 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
28740 }
28741 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
28742 {
28743 if (optimize || !target
28744 || GET_MODE (target) != tmode1
28745 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28746 target = gen_reg_rtx (tmode1);
28747
28748 scratch0 = gen_reg_rtx (tmode0);
28749
28750 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28751 }
28752 else
28753 {
28754 gcc_assert (d->flag);
28755
28756 scratch0 = gen_reg_rtx (tmode0);
28757 scratch1 = gen_reg_rtx (tmode1);
28758
28759 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28760 }
28761
28762 if (! pat)
28763 return 0;
28764
28765 emit_insn (pat);
28766
28767 if (d->flag)
28768 {
28769 target = gen_reg_rtx (SImode);
28770 emit_move_insn (target, const0_rtx);
28771 target = gen_rtx_SUBREG (QImode, target, 0);
28772
28773 emit_insn
28774 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28775 gen_rtx_fmt_ee (EQ, QImode,
28776 gen_rtx_REG ((enum machine_mode) d->flag,
28777 FLAGS_REG),
28778 const0_rtx)));
28779 return SUBREG_REG (target);
28780 }
28781 else
28782 return target;
28783 }
28784
28785
28786 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
28787
28788 static rtx
28789 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28790 tree exp, rtx target)
28791 {
28792 rtx pat;
28793 tree arg0 = CALL_EXPR_ARG (exp, 0);
28794 tree arg1 = CALL_EXPR_ARG (exp, 1);
28795 tree arg2 = CALL_EXPR_ARG (exp, 2);
28796 rtx scratch0, scratch1;
28797 rtx op0 = expand_normal (arg0);
28798 rtx op1 = expand_normal (arg1);
28799 rtx op2 = expand_normal (arg2);
28800 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28801
28802 tmode0 = insn_data[d->icode].operand[0].mode;
28803 tmode1 = insn_data[d->icode].operand[1].mode;
28804 modev2 = insn_data[d->icode].operand[2].mode;
28805 modev3 = insn_data[d->icode].operand[3].mode;
28806 modeimm = insn_data[d->icode].operand[4].mode;
28807
28808 if (VECTOR_MODE_P (modev2))
28809 op0 = safe_vector_operand (op0, modev2);
28810 if (VECTOR_MODE_P (modev3))
28811 op1 = safe_vector_operand (op1, modev3);
28812
28813 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28814 op0 = copy_to_mode_reg (modev2, op0);
28815 if ((optimize && !register_operand (op1, modev3))
28816 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28817 op1 = copy_to_mode_reg (modev3, op1);
28818
28819 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28820 {
28821 error ("the third argument must be an 8-bit immediate");
28822 return const0_rtx;
28823 }
28824
28825 if (d->code == IX86_BUILTIN_PCMPISTRI128)
28826 {
28827 if (optimize || !target
28828 || GET_MODE (target) != tmode0
28829 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28830 target = gen_reg_rtx (tmode0);
28831
28832 scratch1 = gen_reg_rtx (tmode1);
28833
28834 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28835 }
28836 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28837 {
28838 if (optimize || !target
28839 || GET_MODE (target) != tmode1
28840 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28841 target = gen_reg_rtx (tmode1);
28842
28843 scratch0 = gen_reg_rtx (tmode0);
28844
28845 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28846 }
28847 else
28848 {
28849 gcc_assert (d->flag);
28850
28851 scratch0 = gen_reg_rtx (tmode0);
28852 scratch1 = gen_reg_rtx (tmode1);
28853
28854 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28855 }
28856
28857 if (! pat)
28858 return 0;
28859
28860 emit_insn (pat);
28861
28862 if (d->flag)
28863 {
28864 target = gen_reg_rtx (SImode);
28865 emit_move_insn (target, const0_rtx);
28866 target = gen_rtx_SUBREG (QImode, target, 0);
28867
28868 emit_insn
28869 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28870 gen_rtx_fmt_ee (EQ, QImode,
28871 gen_rtx_REG ((enum machine_mode) d->flag,
28872 FLAGS_REG),
28873 const0_rtx)));
28874 return SUBREG_REG (target);
28875 }
28876 else
28877 return target;
28878 }
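
/* Illustrative sketch, not part of the compiler: for the pcmpistr family
   the expander above decides which hard output (the ECX index, the XMM0
   mask, or one of the flag bits selected by d->flag) becomes the builtin's
   value.  Assuming -msse4.2 and the usual vector typedef, the index form
   behind _mm_cmpistri is simply:

       typedef char __v16qi __attribute__ ((__vector_size__ (16)));

       int
       first_match_index (__v16qi a, __v16qi b)
       {
         return __builtin_ia32_pcmpistri128 (a, b, 0);
       }

   The flag variants instead read the condition encoded in d->flag out of
   FLAGS_REG, exactly as the tail of this function does.  */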
28879
28880 /* Subroutine of ix86_expand_builtin to take care of insns with
28881 variable number of operands. */
28882
28883 static rtx
28884 ix86_expand_args_builtin (const struct builtin_description *d,
28885 tree exp, rtx target)
28886 {
28887 rtx pat, real_target;
28888 unsigned int i, nargs;
28889 unsigned int nargs_constant = 0;
28890 int num_memory = 0;
28891 struct
28892 {
28893 rtx op;
28894 enum machine_mode mode;
28895 } args[4];
28896 bool last_arg_count = false;
28897 enum insn_code icode = d->icode;
28898 const struct insn_data_d *insn_p = &insn_data[icode];
28899 enum machine_mode tmode = insn_p->operand[0].mode;
28900 enum machine_mode rmode = VOIDmode;
28901 bool swap = false;
28902 enum rtx_code comparison = d->comparison;
28903
28904 switch ((enum ix86_builtin_func_type) d->flag)
28905 {
28906 case V2DF_FTYPE_V2DF_ROUND:
28907 case V4DF_FTYPE_V4DF_ROUND:
28908 case V4SF_FTYPE_V4SF_ROUND:
28909 case V8SF_FTYPE_V8SF_ROUND:
28910 case V4SI_FTYPE_V4SF_ROUND:
28911 case V8SI_FTYPE_V8SF_ROUND:
28912 return ix86_expand_sse_round (d, exp, target);
28913 case V4SI_FTYPE_V2DF_V2DF_ROUND:
28914 case V8SI_FTYPE_V4DF_V4DF_ROUND:
28915 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
28916 case INT_FTYPE_V8SF_V8SF_PTEST:
28917 case INT_FTYPE_V4DI_V4DI_PTEST:
28918 case INT_FTYPE_V4DF_V4DF_PTEST:
28919 case INT_FTYPE_V4SF_V4SF_PTEST:
28920 case INT_FTYPE_V2DI_V2DI_PTEST:
28921 case INT_FTYPE_V2DF_V2DF_PTEST:
28922 return ix86_expand_sse_ptest (d, exp, target);
28923 case FLOAT128_FTYPE_FLOAT128:
28924 case FLOAT_FTYPE_FLOAT:
28925 case INT_FTYPE_INT:
28926 case UINT64_FTYPE_INT:
28927 case UINT16_FTYPE_UINT16:
28928 case INT64_FTYPE_INT64:
28929 case INT64_FTYPE_V4SF:
28930 case INT64_FTYPE_V2DF:
28931 case INT_FTYPE_V16QI:
28932 case INT_FTYPE_V8QI:
28933 case INT_FTYPE_V8SF:
28934 case INT_FTYPE_V4DF:
28935 case INT_FTYPE_V4SF:
28936 case INT_FTYPE_V2DF:
28937 case INT_FTYPE_V32QI:
28938 case V16QI_FTYPE_V16QI:
28939 case V8SI_FTYPE_V8SF:
28940 case V8SI_FTYPE_V4SI:
28941 case V8HI_FTYPE_V8HI:
28942 case V8HI_FTYPE_V16QI:
28943 case V8QI_FTYPE_V8QI:
28944 case V8SF_FTYPE_V8SF:
28945 case V8SF_FTYPE_V8SI:
28946 case V8SF_FTYPE_V4SF:
28947 case V8SF_FTYPE_V8HI:
28948 case V4SI_FTYPE_V4SI:
28949 case V4SI_FTYPE_V16QI:
28950 case V4SI_FTYPE_V4SF:
28951 case V4SI_FTYPE_V8SI:
28952 case V4SI_FTYPE_V8HI:
28953 case V4SI_FTYPE_V4DF:
28954 case V4SI_FTYPE_V2DF:
28955 case V4HI_FTYPE_V4HI:
28956 case V4DF_FTYPE_V4DF:
28957 case V4DF_FTYPE_V4SI:
28958 case V4DF_FTYPE_V4SF:
28959 case V4DF_FTYPE_V2DF:
28960 case V4SF_FTYPE_V4SF:
28961 case V4SF_FTYPE_V4SI:
28962 case V4SF_FTYPE_V8SF:
28963 case V4SF_FTYPE_V4DF:
28964 case V4SF_FTYPE_V8HI:
28965 case V4SF_FTYPE_V2DF:
28966 case V2DI_FTYPE_V2DI:
28967 case V2DI_FTYPE_V16QI:
28968 case V2DI_FTYPE_V8HI:
28969 case V2DI_FTYPE_V4SI:
28970 case V2DF_FTYPE_V2DF:
28971 case V2DF_FTYPE_V4SI:
28972 case V2DF_FTYPE_V4DF:
28973 case V2DF_FTYPE_V4SF:
28974 case V2DF_FTYPE_V2SI:
28975 case V2SI_FTYPE_V2SI:
28976 case V2SI_FTYPE_V4SF:
28977 case V2SI_FTYPE_V2SF:
28978 case V2SI_FTYPE_V2DF:
28979 case V2SF_FTYPE_V2SF:
28980 case V2SF_FTYPE_V2SI:
28981 case V32QI_FTYPE_V32QI:
28982 case V32QI_FTYPE_V16QI:
28983 case V16HI_FTYPE_V16HI:
28984 case V16HI_FTYPE_V8HI:
28985 case V8SI_FTYPE_V8SI:
28986 case V16HI_FTYPE_V16QI:
28987 case V8SI_FTYPE_V16QI:
28988 case V4DI_FTYPE_V16QI:
28989 case V8SI_FTYPE_V8HI:
28990 case V4DI_FTYPE_V8HI:
28991 case V4DI_FTYPE_V4SI:
28992 case V4DI_FTYPE_V2DI:
28993 nargs = 1;
28994 break;
28995 case V4SF_FTYPE_V4SF_VEC_MERGE:
28996 case V2DF_FTYPE_V2DF_VEC_MERGE:
28997 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
28998 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
28999 case V16QI_FTYPE_V16QI_V16QI:
29000 case V16QI_FTYPE_V8HI_V8HI:
29001 case V8QI_FTYPE_V8QI_V8QI:
29002 case V8QI_FTYPE_V4HI_V4HI:
29003 case V8HI_FTYPE_V8HI_V8HI:
29004 case V8HI_FTYPE_V16QI_V16QI:
29005 case V8HI_FTYPE_V4SI_V4SI:
29006 case V8SF_FTYPE_V8SF_V8SF:
29007 case V8SF_FTYPE_V8SF_V8SI:
29008 case V4SI_FTYPE_V4SI_V4SI:
29009 case V4SI_FTYPE_V8HI_V8HI:
29010 case V4SI_FTYPE_V4SF_V4SF:
29011 case V4SI_FTYPE_V2DF_V2DF:
29012 case V4HI_FTYPE_V4HI_V4HI:
29013 case V4HI_FTYPE_V8QI_V8QI:
29014 case V4HI_FTYPE_V2SI_V2SI:
29015 case V4DF_FTYPE_V4DF_V4DF:
29016 case V4DF_FTYPE_V4DF_V4DI:
29017 case V4SF_FTYPE_V4SF_V4SF:
29018 case V4SF_FTYPE_V4SF_V4SI:
29019 case V4SF_FTYPE_V4SF_V2SI:
29020 case V4SF_FTYPE_V4SF_V2DF:
29021 case V4SF_FTYPE_V4SF_DI:
29022 case V4SF_FTYPE_V4SF_SI:
29023 case V2DI_FTYPE_V2DI_V2DI:
29024 case V2DI_FTYPE_V16QI_V16QI:
29025 case V2DI_FTYPE_V4SI_V4SI:
29026 case V2DI_FTYPE_V2DI_V16QI:
29027 case V2DI_FTYPE_V2DF_V2DF:
29028 case V2SI_FTYPE_V2SI_V2SI:
29029 case V2SI_FTYPE_V4HI_V4HI:
29030 case V2SI_FTYPE_V2SF_V2SF:
29031 case V2DF_FTYPE_V2DF_V2DF:
29032 case V2DF_FTYPE_V2DF_V4SF:
29033 case V2DF_FTYPE_V2DF_V2DI:
29034 case V2DF_FTYPE_V2DF_DI:
29035 case V2DF_FTYPE_V2DF_SI:
29036 case V2SF_FTYPE_V2SF_V2SF:
29037 case V1DI_FTYPE_V1DI_V1DI:
29038 case V1DI_FTYPE_V8QI_V8QI:
29039 case V1DI_FTYPE_V2SI_V2SI:
29040 case V32QI_FTYPE_V16HI_V16HI:
29041 case V16HI_FTYPE_V8SI_V8SI:
29042 case V32QI_FTYPE_V32QI_V32QI:
29043 case V16HI_FTYPE_V32QI_V32QI:
29044 case V16HI_FTYPE_V16HI_V16HI:
29045 case V8SI_FTYPE_V4DF_V4DF:
29046 case V8SI_FTYPE_V8SI_V8SI:
29047 case V8SI_FTYPE_V16HI_V16HI:
29048 case V4DI_FTYPE_V4DI_V4DI:
29049 case V4DI_FTYPE_V8SI_V8SI:
29050 if (comparison == UNKNOWN)
29051 return ix86_expand_binop_builtin (icode, exp, target);
29052 nargs = 2;
29053 break;
29054 case V4SF_FTYPE_V4SF_V4SF_SWAP:
29055 case V2DF_FTYPE_V2DF_V2DF_SWAP:
29056 gcc_assert (comparison != UNKNOWN);
29057 nargs = 2;
29058 swap = true;
29059 break;
29060 case V16HI_FTYPE_V16HI_V8HI_COUNT:
29061 case V16HI_FTYPE_V16HI_SI_COUNT:
29062 case V8SI_FTYPE_V8SI_V4SI_COUNT:
29063 case V8SI_FTYPE_V8SI_SI_COUNT:
29064 case V4DI_FTYPE_V4DI_V2DI_COUNT:
29065 case V4DI_FTYPE_V4DI_INT_COUNT:
29066 case V8HI_FTYPE_V8HI_V8HI_COUNT:
29067 case V8HI_FTYPE_V8HI_SI_COUNT:
29068 case V4SI_FTYPE_V4SI_V4SI_COUNT:
29069 case V4SI_FTYPE_V4SI_SI_COUNT:
29070 case V4HI_FTYPE_V4HI_V4HI_COUNT:
29071 case V4HI_FTYPE_V4HI_SI_COUNT:
29072 case V2DI_FTYPE_V2DI_V2DI_COUNT:
29073 case V2DI_FTYPE_V2DI_SI_COUNT:
29074 case V2SI_FTYPE_V2SI_V2SI_COUNT:
29075 case V2SI_FTYPE_V2SI_SI_COUNT:
29076 case V1DI_FTYPE_V1DI_V1DI_COUNT:
29077 case V1DI_FTYPE_V1DI_SI_COUNT:
29078 nargs = 2;
29079 last_arg_count = true;
29080 break;
29081 case UINT64_FTYPE_UINT64_UINT64:
29082 case UINT_FTYPE_UINT_UINT:
29083 case UINT_FTYPE_UINT_USHORT:
29084 case UINT_FTYPE_UINT_UCHAR:
29085 case UINT16_FTYPE_UINT16_INT:
29086 case UINT8_FTYPE_UINT8_INT:
29087 nargs = 2;
29088 break;
29089 case V2DI_FTYPE_V2DI_INT_CONVERT:
29090 nargs = 2;
29091 rmode = V1TImode;
29092 nargs_constant = 1;
29093 break;
29094 case V4DI_FTYPE_V4DI_INT_CONVERT:
29095 nargs = 2;
29096 rmode = V2TImode;
29097 nargs_constant = 1;
29098 break;
29099 case V8HI_FTYPE_V8HI_INT:
29100 case V8HI_FTYPE_V8SF_INT:
29101 case V8HI_FTYPE_V4SF_INT:
29102 case V8SF_FTYPE_V8SF_INT:
29103 case V4SI_FTYPE_V4SI_INT:
29104 case V4SI_FTYPE_V8SI_INT:
29105 case V4HI_FTYPE_V4HI_INT:
29106 case V4DF_FTYPE_V4DF_INT:
29107 case V4SF_FTYPE_V4SF_INT:
29108 case V4SF_FTYPE_V8SF_INT:
29109 case V2DI_FTYPE_V2DI_INT:
29110 case V2DF_FTYPE_V2DF_INT:
29111 case V2DF_FTYPE_V4DF_INT:
29112 case V16HI_FTYPE_V16HI_INT:
29113 case V8SI_FTYPE_V8SI_INT:
29114 case V4DI_FTYPE_V4DI_INT:
29115 case V2DI_FTYPE_V4DI_INT:
29116 nargs = 2;
29117 nargs_constant = 1;
29118 break;
29119 case V16QI_FTYPE_V16QI_V16QI_V16QI:
29120 case V8SF_FTYPE_V8SF_V8SF_V8SF:
29121 case V4DF_FTYPE_V4DF_V4DF_V4DF:
29122 case V4SF_FTYPE_V4SF_V4SF_V4SF:
29123 case V2DF_FTYPE_V2DF_V2DF_V2DF:
29124 case V32QI_FTYPE_V32QI_V32QI_V32QI:
29125 nargs = 3;
29126 break;
29127 case V32QI_FTYPE_V32QI_V32QI_INT:
29128 case V16HI_FTYPE_V16HI_V16HI_INT:
29129 case V16QI_FTYPE_V16QI_V16QI_INT:
29130 case V4DI_FTYPE_V4DI_V4DI_INT:
29131 case V8HI_FTYPE_V8HI_V8HI_INT:
29132 case V8SI_FTYPE_V8SI_V8SI_INT:
29133 case V8SI_FTYPE_V8SI_V4SI_INT:
29134 case V8SF_FTYPE_V8SF_V8SF_INT:
29135 case V8SF_FTYPE_V8SF_V4SF_INT:
29136 case V4SI_FTYPE_V4SI_V4SI_INT:
29137 case V4DF_FTYPE_V4DF_V4DF_INT:
29138 case V4DF_FTYPE_V4DF_V2DF_INT:
29139 case V4SF_FTYPE_V4SF_V4SF_INT:
29140 case V2DI_FTYPE_V2DI_V2DI_INT:
29141 case V4DI_FTYPE_V4DI_V2DI_INT:
29142 case V2DF_FTYPE_V2DF_V2DF_INT:
29143 nargs = 3;
29144 nargs_constant = 1;
29145 break;
29146 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
29147 nargs = 3;
29148 rmode = V4DImode;
29149 nargs_constant = 1;
29150 break;
29151 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
29152 nargs = 3;
29153 rmode = V2DImode;
29154 nargs_constant = 1;
29155 break;
29156 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
29157 nargs = 3;
29158 rmode = DImode;
29159 nargs_constant = 1;
29160 break;
29161 case V2DI_FTYPE_V2DI_UINT_UINT:
29162 nargs = 3;
29163 nargs_constant = 2;
29164 break;
29165 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
29166 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
29167 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
29168 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
29169 nargs = 4;
29170 nargs_constant = 1;
29171 break;
29172 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
29173 nargs = 4;
29174 nargs_constant = 2;
29175 break;
29176 default:
29177 gcc_unreachable ();
29178 }
29179
29180 gcc_assert (nargs <= ARRAY_SIZE (args));
29181
29182 if (comparison != UNKNOWN)
29183 {
29184 gcc_assert (nargs == 2);
29185 return ix86_expand_sse_compare (d, exp, target, swap);
29186 }
29187
29188 if (rmode == VOIDmode || rmode == tmode)
29189 {
29190 if (optimize
29191 || target == 0
29192 || GET_MODE (target) != tmode
29193 || !insn_p->operand[0].predicate (target, tmode))
29194 target = gen_reg_rtx (tmode);
29195 real_target = target;
29196 }
29197 else
29198 {
29199 target = gen_reg_rtx (rmode);
29200 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
29201 }
29202
29203 for (i = 0; i < nargs; i++)
29204 {
29205 tree arg = CALL_EXPR_ARG (exp, i);
29206 rtx op = expand_normal (arg);
29207 enum machine_mode mode = insn_p->operand[i + 1].mode;
29208 bool match = insn_p->operand[i + 1].predicate (op, mode);
29209
29210 if (last_arg_count && (i + 1) == nargs)
29211 {
29212 /* SIMD shift insns take either an 8-bit immediate or
29213 register as count. But builtin functions take int as
29214 count. If count doesn't match, we put it in register. */
29215 if (!match)
29216 {
29217 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
29218 if (!insn_p->operand[i + 1].predicate (op, mode))
29219 op = copy_to_reg (op);
29220 }
29221 }
29222 else if ((nargs - i) <= nargs_constant)
29223 {
29224 if (!match)
29225 switch (icode)
29226 {
29227 case CODE_FOR_avx2_inserti128:
29228 case CODE_FOR_avx2_extracti128:
29229 error ("the last argument must be an 1-bit immediate");
29230 return const0_rtx;
29231
29232 case CODE_FOR_sse4_1_roundsd:
29233 case CODE_FOR_sse4_1_roundss:
29234
29235 case CODE_FOR_sse4_1_roundpd:
29236 case CODE_FOR_sse4_1_roundps:
29237 case CODE_FOR_avx_roundpd256:
29238 case CODE_FOR_avx_roundps256:
29239
29240 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
29241 case CODE_FOR_sse4_1_roundps_sfix:
29242 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
29243 case CODE_FOR_avx_roundps_sfix256:
29244
29245 case CODE_FOR_sse4_1_blendps:
29246 case CODE_FOR_avx_blendpd256:
29247 case CODE_FOR_avx_vpermilv4df:
29248 error ("the last argument must be a 4-bit immediate");
29249 return const0_rtx;
29250
29251 case CODE_FOR_sse4_1_blendpd:
29252 case CODE_FOR_avx_vpermilv2df:
29253 case CODE_FOR_xop_vpermil2v2df3:
29254 case CODE_FOR_xop_vpermil2v4sf3:
29255 case CODE_FOR_xop_vpermil2v4df3:
29256 case CODE_FOR_xop_vpermil2v8sf3:
29257 error ("the last argument must be a 2-bit immediate");
29258 return const0_rtx;
29259
29260 case CODE_FOR_avx_vextractf128v4df:
29261 case CODE_FOR_avx_vextractf128v8sf:
29262 case CODE_FOR_avx_vextractf128v8si:
29263 case CODE_FOR_avx_vinsertf128v4df:
29264 case CODE_FOR_avx_vinsertf128v8sf:
29265 case CODE_FOR_avx_vinsertf128v8si:
29266 error ("the last argument must be a 1-bit immediate");
29267 return const0_rtx;
29268
29269 case CODE_FOR_avx_vmcmpv2df3:
29270 case CODE_FOR_avx_vmcmpv4sf3:
29271 case CODE_FOR_avx_cmpv2df3:
29272 case CODE_FOR_avx_cmpv4sf3:
29273 case CODE_FOR_avx_cmpv4df3:
29274 case CODE_FOR_avx_cmpv8sf3:
29275 error ("the last argument must be a 5-bit immediate");
29276 return const0_rtx;
29277
29278 default:
29279 switch (nargs_constant)
29280 {
29281 case 2:
29282 if ((nargs - i) == nargs_constant)
29283 {
29284 error ("the next to last argument must be an 8-bit immediate");
29285 break;
29286 }
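		      /* FALLTHRU */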
29287 case 1:
29288 error ("the last argument must be an 8-bit immediate");
29289 break;
29290 default:
29291 gcc_unreachable ();
29292 }
29293 return const0_rtx;
29294 }
29295 }
29296 else
29297 {
29298 if (VECTOR_MODE_P (mode))
29299 op = safe_vector_operand (op, mode);
29300
29301 /* If we aren't optimizing, only allow one memory operand to
29302 be generated. */
29303 if (memory_operand (op, mode))
29304 num_memory++;
29305
29306 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
29307 {
29308 if (optimize || !match || num_memory > 1)
29309 op = copy_to_mode_reg (mode, op);
29310 }
29311 else
29312 {
29313 op = copy_to_reg (op);
29314 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
29315 }
29316 }
29317
29318 args[i].op = op;
29319 args[i].mode = mode;
29320 }
29321
29322 switch (nargs)
29323 {
29324 case 1:
29325 pat = GEN_FCN (icode) (real_target, args[0].op);
29326 break;
29327 case 2:
29328 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
29329 break;
29330 case 3:
29331 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29332 args[2].op);
29333 break;
29334 case 4:
29335 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
29336 args[2].op, args[3].op);
29337 break;
29338 default:
29339 gcc_unreachable ();
29340 }
29341
29342 if (! pat)
29343 return 0;
29344
29345 emit_insn (pat);
29346 return target;
29347 }
29348
29349 /* Subroutine of ix86_expand_builtin to take care of special insns
29350 with variable number of operands. */
29351
29352 static rtx
29353 ix86_expand_special_args_builtin (const struct builtin_description *d,
29354 tree exp, rtx target)
29355 {
29356 tree arg;
29357 rtx pat, op;
29358 unsigned int i, nargs, arg_adjust, memory;
29359 struct
29360 {
29361 rtx op;
29362 enum machine_mode mode;
29363 } args[3];
29364 enum insn_code icode = d->icode;
29365 bool last_arg_constant = false;
29366 const struct insn_data_d *insn_p = &insn_data[icode];
29367 enum machine_mode tmode = insn_p->operand[0].mode;
29368 enum { load, store } klass;
29369
29370 switch ((enum ix86_builtin_func_type) d->flag)
29371 {
29372 case VOID_FTYPE_VOID:
29373 if (icode == CODE_FOR_avx_vzeroupper)
29374 target = GEN_INT (vzeroupper_intrinsic);
29375 emit_insn (GEN_FCN (icode) (target));
29376 return 0;
29377 case VOID_FTYPE_UINT64:
29378 case VOID_FTYPE_UNSIGNED:
29379 nargs = 0;
29380 klass = store;
29381 memory = 0;
29382 break;
29383 case UINT64_FTYPE_VOID:
29384 case UNSIGNED_FTYPE_VOID:
29385 nargs = 0;
29386 klass = load;
29387 memory = 0;
29388 break;
29389 case UINT64_FTYPE_PUNSIGNED:
29390 case V2DI_FTYPE_PV2DI:
29391 case V4DI_FTYPE_PV4DI:
29392 case V32QI_FTYPE_PCCHAR:
29393 case V16QI_FTYPE_PCCHAR:
29394 case V8SF_FTYPE_PCV4SF:
29395 case V8SF_FTYPE_PCFLOAT:
29396 case V4SF_FTYPE_PCFLOAT:
29397 case V4DF_FTYPE_PCV2DF:
29398 case V4DF_FTYPE_PCDOUBLE:
29399 case V2DF_FTYPE_PCDOUBLE:
29400 case VOID_FTYPE_PVOID:
29401 nargs = 1;
29402 klass = load;
29403 memory = 0;
29404 break;
29405 case VOID_FTYPE_PV2SF_V4SF:
29406 case VOID_FTYPE_PV4DI_V4DI:
29407 case VOID_FTYPE_PV2DI_V2DI:
29408 case VOID_FTYPE_PCHAR_V32QI:
29409 case VOID_FTYPE_PCHAR_V16QI:
29410 case VOID_FTYPE_PFLOAT_V8SF:
29411 case VOID_FTYPE_PFLOAT_V4SF:
29412 case VOID_FTYPE_PDOUBLE_V4DF:
29413 case VOID_FTYPE_PDOUBLE_V2DF:
29414 case VOID_FTYPE_PLONGLONG_LONGLONG:
29415 case VOID_FTYPE_PULONGLONG_ULONGLONG:
29416 case VOID_FTYPE_PINT_INT:
29417 nargs = 1;
29418 klass = store;
29419 /* Reserve memory operand for target. */
29420 memory = ARRAY_SIZE (args);
29421 break;
29422 case V4SF_FTYPE_V4SF_PCV2SF:
29423 case V2DF_FTYPE_V2DF_PCDOUBLE:
29424 nargs = 2;
29425 klass = load;
29426 memory = 1;
29427 break;
29428 case V8SF_FTYPE_PCV8SF_V8SI:
29429 case V4DF_FTYPE_PCV4DF_V4DI:
29430 case V4SF_FTYPE_PCV4SF_V4SI:
29431 case V2DF_FTYPE_PCV2DF_V2DI:
29432 case V8SI_FTYPE_PCV8SI_V8SI:
29433 case V4DI_FTYPE_PCV4DI_V4DI:
29434 case V4SI_FTYPE_PCV4SI_V4SI:
29435 case V2DI_FTYPE_PCV2DI_V2DI:
29436 nargs = 2;
29437 klass = load;
29438 memory = 0;
29439 break;
29440 case VOID_FTYPE_PV8SF_V8SI_V8SF:
29441 case VOID_FTYPE_PV4DF_V4DI_V4DF:
29442 case VOID_FTYPE_PV4SF_V4SI_V4SF:
29443 case VOID_FTYPE_PV2DF_V2DI_V2DF:
29444 case VOID_FTYPE_PV8SI_V8SI_V8SI:
29445 case VOID_FTYPE_PV4DI_V4DI_V4DI:
29446 case VOID_FTYPE_PV4SI_V4SI_V4SI:
29447 case VOID_FTYPE_PV2DI_V2DI_V2DI:
29448 nargs = 2;
29449 klass = store;
29450 /* Reserve memory operand for target. */
29451 memory = ARRAY_SIZE (args);
29452 break;
29453 case VOID_FTYPE_UINT_UINT_UINT:
29454 case VOID_FTYPE_UINT64_UINT_UINT:
29455 case UCHAR_FTYPE_UINT_UINT_UINT:
29456 case UCHAR_FTYPE_UINT64_UINT_UINT:
29457 nargs = 3;
29458 klass = load;
29459 memory = ARRAY_SIZE (args);
29460 last_arg_constant = true;
29461 break;
29462 default:
29463 gcc_unreachable ();
29464 }
29465
29466 gcc_assert (nargs <= ARRAY_SIZE (args));
29467
29468 if (klass == store)
29469 {
29470 arg = CALL_EXPR_ARG (exp, 0);
29471 op = expand_normal (arg);
29472 gcc_assert (target == 0);
29473 if (memory)
29474 {
29475 if (GET_MODE (op) != Pmode)
29476 op = convert_to_mode (Pmode, op, 1);
29477 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
29478 }
29479 else
29480 target = force_reg (tmode, op);
29481 arg_adjust = 1;
29482 }
29483 else
29484 {
29485 arg_adjust = 0;
29486 if (optimize
29487 || target == 0
29488 || GET_MODE (target) != tmode
29489 || !insn_p->operand[0].predicate (target, tmode))
29490 target = gen_reg_rtx (tmode);
29491 }
29492
29493 for (i = 0; i < nargs; i++)
29494 {
29495 enum machine_mode mode = insn_p->operand[i + 1].mode;
29496 bool match;
29497
29498 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
29499 op = expand_normal (arg);
29500 match = insn_p->operand[i + 1].predicate (op, mode);
29501
29502 if (last_arg_constant && (i + 1) == nargs)
29503 {
29504 if (!match)
29505 {
29506 if (icode == CODE_FOR_lwp_lwpvalsi3
29507 || icode == CODE_FOR_lwp_lwpinssi3
29508 || icode == CODE_FOR_lwp_lwpvaldi3
29509 || icode == CODE_FOR_lwp_lwpinsdi3)
29510 error ("the last argument must be a 32-bit immediate");
29511 else
29512 error ("the last argument must be an 8-bit immediate");
29513 return const0_rtx;
29514 }
29515 }
29516 else
29517 {
29518 if (i == memory)
29519 {
29520 /* This must be the memory operand. */
29521 if (GET_MODE (op) != Pmode)
29522 op = convert_to_mode (Pmode, op, 1);
29523 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
29524 gcc_assert (GET_MODE (op) == mode
29525 || GET_MODE (op) == VOIDmode);
29526 }
29527 else
29528 {
29529 	      /* This must be a register.  */
29530 if (VECTOR_MODE_P (mode))
29531 op = safe_vector_operand (op, mode);
29532
29533 gcc_assert (GET_MODE (op) == mode
29534 || GET_MODE (op) == VOIDmode);
29535 op = copy_to_mode_reg (mode, op);
29536 }
29537 }
29538
29539 args[i].op = op;
29540 args[i].mode = mode;
29541 }
29542
29543 switch (nargs)
29544 {
29545 case 0:
29546 pat = GEN_FCN (icode) (target);
29547 break;
29548 case 1:
29549 pat = GEN_FCN (icode) (target, args[0].op);
29550 break;
29551 case 2:
29552 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
29553 break;
29554 case 3:
29555 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
29556 break;
29557 default:
29558 gcc_unreachable ();
29559 }
29560
29561 if (! pat)
29562 return 0;
29563 emit_insn (pat);
29564 return klass == store ? 0 : target;
29565 }
29566
29567 /* Return the integer constant in ARG. Constrain it to be in the range
29568 of the subparts of VEC_TYPE; issue an error if not. */
29569
29570 static int
29571 get_element_number (tree vec_type, tree arg)
29572 {
29573 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
29574
29575 if (!host_integerp (arg, 1)
29576 || (elt = tree_low_cst (arg, 1), elt > max))
29577 {
29578 error ("selector must be an integer constant in the range 0..%wi", max);
29579 return 0;
29580 }
29581
29582 return elt;
29583 }
29584
29585 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29586 ix86_expand_vector_init. We DO have language-level syntax for this, in
29587 the form of (type){ init-list }. Except that since we can't place emms
29588 instructions from inside the compiler, we can't allow the use of MMX
29589 registers unless the user explicitly asks for it. So we do *not* define
29590 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
29591 we have builtins invoked by mmintrin.h that give us license to emit
29592 these sorts of instructions. */
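/* For example, mmintrin.h implements _mm_set_pi32 in terms of
   __builtin_ia32_vec_init_v2si, which is expanded here.  */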
29593
29594 static rtx
29595 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29596 {
29597 enum machine_mode tmode = TYPE_MODE (type);
29598 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29599 int i, n_elt = GET_MODE_NUNITS (tmode);
29600 rtvec v = rtvec_alloc (n_elt);
29601
29602 gcc_assert (VECTOR_MODE_P (tmode));
29603 gcc_assert (call_expr_nargs (exp) == n_elt);
29604
29605 for (i = 0; i < n_elt; ++i)
29606 {
29607 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29608 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29609 }
29610
29611 if (!target || !register_operand (target, tmode))
29612 target = gen_reg_rtx (tmode);
29613
29614 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29615 return target;
29616 }
29617
29618 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29619 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
29620 had a language-level syntax for referencing vector elements. */
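/* For example, emmintrin.h implements _mm_extract_epi16 in terms of
   __builtin_ia32_vec_ext_v8hi, which is expanded here.  */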
29621
29622 static rtx
29623 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29624 {
29625 enum machine_mode tmode, mode0;
29626 tree arg0, arg1;
29627 int elt;
29628 rtx op0;
29629
29630 arg0 = CALL_EXPR_ARG (exp, 0);
29631 arg1 = CALL_EXPR_ARG (exp, 1);
29632
29633 op0 = expand_normal (arg0);
29634 elt = get_element_number (TREE_TYPE (arg0), arg1);
29635
29636 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29637 mode0 = TYPE_MODE (TREE_TYPE (arg0));
29638 gcc_assert (VECTOR_MODE_P (mode0));
29639
29640 op0 = force_reg (mode0, op0);
29641
29642 if (optimize || !target || !register_operand (target, tmode))
29643 target = gen_reg_rtx (tmode);
29644
29645 ix86_expand_vector_extract (true, target, op0, elt);
29646
29647 return target;
29648 }
29649
29650 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29651 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
29652 a language-level syntax for referencing vector elements. */
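/* For example, emmintrin.h implements _mm_insert_epi16 in terms of
   __builtin_ia32_vec_set_v8hi, which is expanded here.  */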
29653
29654 static rtx
29655 ix86_expand_vec_set_builtin (tree exp)
29656 {
29657 enum machine_mode tmode, mode1;
29658 tree arg0, arg1, arg2;
29659 int elt;
29660 rtx op0, op1, target;
29661
29662 arg0 = CALL_EXPR_ARG (exp, 0);
29663 arg1 = CALL_EXPR_ARG (exp, 1);
29664 arg2 = CALL_EXPR_ARG (exp, 2);
29665
29666 tmode = TYPE_MODE (TREE_TYPE (arg0));
29667 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29668 gcc_assert (VECTOR_MODE_P (tmode));
29669
29670 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29671 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29672 elt = get_element_number (TREE_TYPE (arg0), arg2);
29673
29674 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29675 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29676
29677 op0 = force_reg (tmode, op0);
29678 op1 = force_reg (mode1, op1);
29679
29680 /* OP0 is the source of these builtin functions and shouldn't be
29681 modified. Create a copy, use it and return it as target. */
29682 target = gen_reg_rtx (tmode);
29683 emit_move_insn (target, op0);
29684 ix86_expand_vector_set (true, target, op1, elt);
29685
29686 return target;
29687 }
29688
29689 /* Expand an expression EXP that calls a built-in function,
29690 with result going to TARGET if that's convenient
29691 (and in mode MODE if that's convenient).
29692 SUBTARGET may be used as the target for computing one of EXP's operands.
29693 IGNORE is nonzero if the value is to be ignored. */
29694
29695 static rtx
29696 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29697 enum machine_mode mode ATTRIBUTE_UNUSED,
29698 int ignore ATTRIBUTE_UNUSED)
29699 {
29700 const struct builtin_description *d;
29701 size_t i;
29702 enum insn_code icode;
29703 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29704 tree arg0, arg1, arg2, arg3, arg4;
29705 rtx op0, op1, op2, op3, op4, pat;
29706 enum machine_mode mode0, mode1, mode2, mode3, mode4;
29707 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29708
29709 /* Determine whether the builtin function is available under the current ISA.
29710 Originally the builtin was not created if it wasn't applicable to the
29711 current ISA based on the command line switches. With function specific
29712 options, we need to check in the context of the function making the call
29713 whether it is supported. */
29714 if (ix86_builtins_isa[fcode].isa
29715 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
29716 {
29717 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
29718 NULL, (enum fpmath_unit) 0, false);
29719
29720 if (!opts)
29721 error ("%qE needs unknown isa option", fndecl);
29722 else
29723 {
29724 gcc_assert (opts != NULL);
29725 error ("%qE needs isa option %s", fndecl, opts);
29726 free (opts);
29727 }
29728 return const0_rtx;
29729 }
29730
29731 switch (fcode)
29732 {
29733 case IX86_BUILTIN_MASKMOVQ:
29734 case IX86_BUILTIN_MASKMOVDQU:
29735 icode = (fcode == IX86_BUILTIN_MASKMOVQ
29736 ? CODE_FOR_mmx_maskmovq
29737 : CODE_FOR_sse2_maskmovdqu);
29738 /* Note the arg order is different from the operand order. */
29739 arg1 = CALL_EXPR_ARG (exp, 0);
29740 arg2 = CALL_EXPR_ARG (exp, 1);
29741 arg0 = CALL_EXPR_ARG (exp, 2);
29742 op0 = expand_normal (arg0);
29743 op1 = expand_normal (arg1);
29744 op2 = expand_normal (arg2);
29745 mode0 = insn_data[icode].operand[0].mode;
29746 mode1 = insn_data[icode].operand[1].mode;
29747 mode2 = insn_data[icode].operand[2].mode;
29748
29749 if (GET_MODE (op0) != Pmode)
29750 op0 = convert_to_mode (Pmode, op0, 1);
29751 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
29752
29753 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29754 op0 = copy_to_mode_reg (mode0, op0);
29755 if (!insn_data[icode].operand[1].predicate (op1, mode1))
29756 op1 = copy_to_mode_reg (mode1, op1);
29757 if (!insn_data[icode].operand[2].predicate (op2, mode2))
29758 op2 = copy_to_mode_reg (mode2, op2);
29759 pat = GEN_FCN (icode) (op0, op1, op2);
29760 if (! pat)
29761 return 0;
29762 emit_insn (pat);
29763 return 0;
29764
29765 case IX86_BUILTIN_LDMXCSR:
29766 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29767 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29768 emit_move_insn (target, op0);
29769 emit_insn (gen_sse_ldmxcsr (target));
29770 return 0;
29771
29772 case IX86_BUILTIN_STMXCSR:
29773 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29774 emit_insn (gen_sse_stmxcsr (target));
29775 return copy_to_mode_reg (SImode, target);
29776
29777 case IX86_BUILTIN_CLFLUSH:
29778 arg0 = CALL_EXPR_ARG (exp, 0);
29779 op0 = expand_normal (arg0);
29780 icode = CODE_FOR_sse2_clflush;
29781 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29782 {
29783 if (GET_MODE (op0) != Pmode)
29784 op0 = convert_to_mode (Pmode, op0, 1);
29785 op0 = force_reg (Pmode, op0);
29786 }
29787
29788 emit_insn (gen_sse2_clflush (op0));
29789 return 0;
29790
29791 case IX86_BUILTIN_MONITOR:
29792 arg0 = CALL_EXPR_ARG (exp, 0);
29793 arg1 = CALL_EXPR_ARG (exp, 1);
29794 arg2 = CALL_EXPR_ARG (exp, 2);
29795 op0 = expand_normal (arg0);
29796 op1 = expand_normal (arg1);
29797 op2 = expand_normal (arg2);
29798 if (!REG_P (op0))
29799 {
29800 if (GET_MODE (op0) != Pmode)
29801 op0 = convert_to_mode (Pmode, op0, 1);
29802 op0 = force_reg (Pmode, op0);
29803 }
29804 if (!REG_P (op1))
29805 op1 = copy_to_mode_reg (SImode, op1);
29806 if (!REG_P (op2))
29807 op2 = copy_to_mode_reg (SImode, op2);
29808 emit_insn (ix86_gen_monitor (op0, op1, op2));
29809 return 0;
29810
29811 case IX86_BUILTIN_MWAIT:
29812 arg0 = CALL_EXPR_ARG (exp, 0);
29813 arg1 = CALL_EXPR_ARG (exp, 1);
29814 op0 = expand_normal (arg0);
29815 op1 = expand_normal (arg1);
29816 if (!REG_P (op0))
29817 op0 = copy_to_mode_reg (SImode, op0);
29818 if (!REG_P (op1))
29819 op1 = copy_to_mode_reg (SImode, op1);
29820 emit_insn (gen_sse3_mwait (op0, op1));
29821 return 0;
29822
29823 case IX86_BUILTIN_VEC_INIT_V2SI:
29824 case IX86_BUILTIN_VEC_INIT_V4HI:
29825 case IX86_BUILTIN_VEC_INIT_V8QI:
29826 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29827
29828 case IX86_BUILTIN_VEC_EXT_V2DF:
29829 case IX86_BUILTIN_VEC_EXT_V2DI:
29830 case IX86_BUILTIN_VEC_EXT_V4SF:
29831 case IX86_BUILTIN_VEC_EXT_V4SI:
29832 case IX86_BUILTIN_VEC_EXT_V8HI:
29833 case IX86_BUILTIN_VEC_EXT_V2SI:
29834 case IX86_BUILTIN_VEC_EXT_V4HI:
29835 case IX86_BUILTIN_VEC_EXT_V16QI:
29836 return ix86_expand_vec_ext_builtin (exp, target);
29837
29838 case IX86_BUILTIN_VEC_SET_V2DI:
29839 case IX86_BUILTIN_VEC_SET_V4SF:
29840 case IX86_BUILTIN_VEC_SET_V4SI:
29841 case IX86_BUILTIN_VEC_SET_V8HI:
29842 case IX86_BUILTIN_VEC_SET_V4HI:
29843 case IX86_BUILTIN_VEC_SET_V16QI:
29844 return ix86_expand_vec_set_builtin (exp);
29845
29846 case IX86_BUILTIN_INFQ:
29847 case IX86_BUILTIN_HUGE_VALQ:
29848 {
29849 REAL_VALUE_TYPE inf;
29850 rtx tmp;
29851
29852 real_inf (&inf);
29853 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29854
29855 tmp = validize_mem (force_const_mem (mode, tmp));
29856
29857 if (target == 0)
29858 target = gen_reg_rtx (mode);
29859
29860 emit_move_insn (target, tmp);
29861 return target;
29862 }
29863
29864 case IX86_BUILTIN_LLWPCB:
29865 arg0 = CALL_EXPR_ARG (exp, 0);
29866 op0 = expand_normal (arg0);
29867 icode = CODE_FOR_lwp_llwpcb;
29868 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29869 {
29870 if (GET_MODE (op0) != Pmode)
29871 op0 = convert_to_mode (Pmode, op0, 1);
29872 op0 = force_reg (Pmode, op0);
29873 }
29874 emit_insn (gen_lwp_llwpcb (op0));
29875 return 0;
29876
29877 case IX86_BUILTIN_SLWPCB:
29878 icode = CODE_FOR_lwp_slwpcb;
29879 if (!target
29880 || !insn_data[icode].operand[0].predicate (target, Pmode))
29881 target = gen_reg_rtx (Pmode);
29882 emit_insn (gen_lwp_slwpcb (target));
29883 return target;
29884
29885 case IX86_BUILTIN_BEXTRI32:
29886 case IX86_BUILTIN_BEXTRI64:
29887 arg0 = CALL_EXPR_ARG (exp, 0);
29888 arg1 = CALL_EXPR_ARG (exp, 1);
29889 op0 = expand_normal (arg0);
29890 op1 = expand_normal (arg1);
29891 icode = (fcode == IX86_BUILTIN_BEXTRI32
29892 ? CODE_FOR_tbm_bextri_si
29893 : CODE_FOR_tbm_bextri_di);
29894 if (!CONST_INT_P (op1))
29895 {
29896 error ("last argument must be an immediate");
29897 return const0_rtx;
29898 }
29899 else
29900 {
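	  /* The immediate encodes the bit field to extract: bits [15:8]
	     give the length and bits [7:0] the starting bit index, so
	     e.g. 0x0804 extracts an 8-bit field starting at bit 4.  */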
29901 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29902 unsigned char lsb_index = INTVAL (op1) & 0xFF;
29903 op1 = GEN_INT (length);
29904 op2 = GEN_INT (lsb_index);
29905 pat = GEN_FCN (icode) (target, op0, op1, op2);
29906 if (pat)
29907 emit_insn (pat);
29908 return target;
29909 }
29910
29911 case IX86_BUILTIN_RDRAND16_STEP:
29912 icode = CODE_FOR_rdrandhi_1;
29913 mode0 = HImode;
29914 goto rdrand_step;
29915
29916 case IX86_BUILTIN_RDRAND32_STEP:
29917 icode = CODE_FOR_rdrandsi_1;
29918 mode0 = SImode;
29919 goto rdrand_step;
29920
29921 case IX86_BUILTIN_RDRAND64_STEP:
29922 icode = CODE_FOR_rdranddi_1;
29923 mode0 = DImode;
29924
29925 rdrand_step:
29926 op0 = gen_reg_rtx (mode0);
29927 emit_insn (GEN_FCN (icode) (op0));
29928
29929 arg0 = CALL_EXPR_ARG (exp, 0);
29930 op1 = expand_normal (arg0);
29931 if (!address_operand (op1, VOIDmode))
29932 {
29933 op1 = convert_memory_address (Pmode, op1);
29934 op1 = copy_addr_to_reg (op1);
29935 }
29936 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
29937
29938 op1 = gen_reg_rtx (SImode);
29939 emit_move_insn (op1, CONST1_RTX (SImode));
29940
29941 /* Emit SImode conditional move. */
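      /* rdrand sets CF on success; on failure it clears CF and zeroes the
	 destination register.  The conditional move below therefore yields
	 1 on success and 0 on failure, as the *_step intrinsics expect.  */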
29942 if (mode0 == HImode)
29943 {
29944 op2 = gen_reg_rtx (SImode);
29945 emit_insn (gen_zero_extendhisi2 (op2, op0));
29946 }
29947 else if (mode0 == SImode)
29948 op2 = op0;
29949 else
29950 op2 = gen_rtx_SUBREG (SImode, op0, 0);
29951
29952 if (target == 0)
29953 target = gen_reg_rtx (SImode);
29954
29955 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
29956 const0_rtx);
29957 emit_insn (gen_rtx_SET (VOIDmode, target,
29958 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
29959 return target;
29960
29961 case IX86_BUILTIN_GATHERSIV2DF:
29962 icode = CODE_FOR_avx2_gathersiv2df;
29963 goto gather_gen;
29964 case IX86_BUILTIN_GATHERSIV4DF:
29965 icode = CODE_FOR_avx2_gathersiv4df;
29966 goto gather_gen;
29967 case IX86_BUILTIN_GATHERDIV2DF:
29968 icode = CODE_FOR_avx2_gatherdiv2df;
29969 goto gather_gen;
29970 case IX86_BUILTIN_GATHERDIV4DF:
29971 icode = CODE_FOR_avx2_gatherdiv4df;
29972 goto gather_gen;
29973 case IX86_BUILTIN_GATHERSIV4SF:
29974 icode = CODE_FOR_avx2_gathersiv4sf;
29975 goto gather_gen;
29976 case IX86_BUILTIN_GATHERSIV8SF:
29977 icode = CODE_FOR_avx2_gathersiv8sf;
29978 goto gather_gen;
29979 case IX86_BUILTIN_GATHERDIV4SF:
29980 icode = CODE_FOR_avx2_gatherdiv4sf;
29981 goto gather_gen;
29982 case IX86_BUILTIN_GATHERDIV8SF:
29983 icode = CODE_FOR_avx2_gatherdiv8sf;
29984 goto gather_gen;
29985 case IX86_BUILTIN_GATHERSIV2DI:
29986 icode = CODE_FOR_avx2_gathersiv2di;
29987 goto gather_gen;
29988 case IX86_BUILTIN_GATHERSIV4DI:
29989 icode = CODE_FOR_avx2_gathersiv4di;
29990 goto gather_gen;
29991 case IX86_BUILTIN_GATHERDIV2DI:
29992 icode = CODE_FOR_avx2_gatherdiv2di;
29993 goto gather_gen;
29994 case IX86_BUILTIN_GATHERDIV4DI:
29995 icode = CODE_FOR_avx2_gatherdiv4di;
29996 goto gather_gen;
29997 case IX86_BUILTIN_GATHERSIV4SI:
29998 icode = CODE_FOR_avx2_gathersiv4si;
29999 goto gather_gen;
30000 case IX86_BUILTIN_GATHERSIV8SI:
30001 icode = CODE_FOR_avx2_gathersiv8si;
30002 goto gather_gen;
30003 case IX86_BUILTIN_GATHERDIV4SI:
30004 icode = CODE_FOR_avx2_gatherdiv4si;
30005 goto gather_gen;
30006 case IX86_BUILTIN_GATHERDIV8SI:
30007 icode = CODE_FOR_avx2_gatherdiv8si;
30008 goto gather_gen;
30009 case IX86_BUILTIN_GATHERALTSIV4DF:
30010 icode = CODE_FOR_avx2_gathersiv4df;
30011 goto gather_gen;
30012 case IX86_BUILTIN_GATHERALTDIV8SF:
30013 icode = CODE_FOR_avx2_gatherdiv8sf;
30014 goto gather_gen;
30015 case IX86_BUILTIN_GATHERALTSIV4DI:
30016       icode = CODE_FOR_avx2_gathersiv4di;
30017 goto gather_gen;
30018 case IX86_BUILTIN_GATHERALTDIV8SI:
30019 icode = CODE_FOR_avx2_gatherdiv8si;
30020 goto gather_gen;
30021
30022 gather_gen:
30023 arg0 = CALL_EXPR_ARG (exp, 0);
30024 arg1 = CALL_EXPR_ARG (exp, 1);
30025 arg2 = CALL_EXPR_ARG (exp, 2);
30026 arg3 = CALL_EXPR_ARG (exp, 3);
30027 arg4 = CALL_EXPR_ARG (exp, 4);
30028 op0 = expand_normal (arg0);
30029 op1 = expand_normal (arg1);
30030 op2 = expand_normal (arg2);
30031 op3 = expand_normal (arg3);
30032 op4 = expand_normal (arg4);
30033 /* Note the arg order is different from the operand order. */
30034 mode0 = insn_data[icode].operand[1].mode;
30035 mode2 = insn_data[icode].operand[3].mode;
30036 mode3 = insn_data[icode].operand[4].mode;
30037 mode4 = insn_data[icode].operand[5].mode;
30038
30039 if (target == NULL_RTX
30040 || GET_MODE (target) != insn_data[icode].operand[0].mode)
30041 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
30042 else
30043 subtarget = target;
30044
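      /* The GATHERALT* variants exist for the vectorizer when the index
	 elements and the data elements have different widths; the operand
	 that carries twice as many elements as the instruction needs is
	 narrowed to its low half here.  */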
30045 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
30046 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
30047 {
30048 rtx half = gen_reg_rtx (V4SImode);
30049 if (!nonimmediate_operand (op2, V8SImode))
30050 op2 = copy_to_mode_reg (V8SImode, op2);
30051 emit_insn (gen_vec_extract_lo_v8si (half, op2));
30052 op2 = half;
30053 }
30054 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
30055 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
30056 {
30057 rtx (*gen) (rtx, rtx);
30058 rtx half = gen_reg_rtx (mode0);
30059 if (mode0 == V4SFmode)
30060 gen = gen_vec_extract_lo_v8sf;
30061 else
30062 gen = gen_vec_extract_lo_v8si;
30063 if (!nonimmediate_operand (op0, GET_MODE (op0)))
30064 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
30065 emit_insn (gen (half, op0));
30066 op0 = half;
30067 if (!nonimmediate_operand (op3, GET_MODE (op3)))
30068 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
30069 emit_insn (gen (half, op3));
30070 op3 = half;
30071 }
30072
30073       /* Force the memory operand to use only a base register here.
30074 	 We don't want to do this for the memory operands of other
30075 	 builtin functions.  */
30076 if (GET_MODE (op1) != Pmode)
30077 op1 = convert_to_mode (Pmode, op1, 1);
30078 op1 = force_reg (Pmode, op1);
30079
30080 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30081 op0 = copy_to_mode_reg (mode0, op0);
30082 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
30083 op1 = copy_to_mode_reg (Pmode, op1);
30084 if (!insn_data[icode].operand[3].predicate (op2, mode2))
30085 op2 = copy_to_mode_reg (mode2, op2);
30086 if (!insn_data[icode].operand[4].predicate (op3, mode3))
30087 op3 = copy_to_mode_reg (mode3, op3);
30088 if (!insn_data[icode].operand[5].predicate (op4, mode4))
30089 {
30090 error ("last argument must be scale 1, 2, 4, 8");
30091 return const0_rtx;
30092 }
30093
30094 /* Optimize. If mask is known to have all high bits set,
30095 replace op0 with pc_rtx to signal that the instruction
30096 overwrites the whole destination and doesn't use its
30097 previous contents. */
30098 if (optimize)
30099 {
30100 if (TREE_CODE (arg3) == VECTOR_CST)
30101 {
30102 tree elt;
30103 unsigned int negative = 0;
30104 for (elt = TREE_VECTOR_CST_ELTS (arg3);
30105 elt; elt = TREE_CHAIN (elt))
30106 {
30107 tree cst = TREE_VALUE (elt);
30108 if (TREE_CODE (cst) == INTEGER_CST
30109 && tree_int_cst_sign_bit (cst))
30110 negative++;
30111 else if (TREE_CODE (cst) == REAL_CST
30112 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
30113 negative++;
30114 }
30115 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
30116 op0 = pc_rtx;
30117 }
30118 else if (TREE_CODE (arg3) == SSA_NAME)
30119 {
30120 /* Recognize also when mask is like:
30121 __v2df src = _mm_setzero_pd ();
30122 __v2df mask = _mm_cmpeq_pd (src, src);
30123 or
30124 __v8sf src = _mm256_setzero_ps ();
30125 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
30126 as that is a cheaper way to load all ones into
30127 a register than having to load a constant from
30128 memory. */
30129 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
30130 if (is_gimple_call (def_stmt))
30131 {
30132 tree fndecl = gimple_call_fndecl (def_stmt);
30133 if (fndecl
30134 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
30135 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
30136 {
30137 case IX86_BUILTIN_CMPPD:
30138 case IX86_BUILTIN_CMPPS:
30139 case IX86_BUILTIN_CMPPD256:
30140 case IX86_BUILTIN_CMPPS256:
30141 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
30142 break;
30143 /* FALLTHRU */
30144 case IX86_BUILTIN_CMPEQPD:
30145 case IX86_BUILTIN_CMPEQPS:
30146 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
30147 && initializer_zerop (gimple_call_arg (def_stmt,
30148 1)))
30149 op0 = pc_rtx;
30150 break;
30151 default:
30152 break;
30153 }
30154 }
30155 }
30156 }
30157
30158 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
30159 if (! pat)
30160 return const0_rtx;
30161 emit_insn (pat);
30162
30163 if (fcode == IX86_BUILTIN_GATHERDIV8SF
30164 || fcode == IX86_BUILTIN_GATHERDIV8SI)
30165 {
30166 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
30167 ? V4SFmode : V4SImode;
30168 if (target == NULL_RTX)
30169 target = gen_reg_rtx (tmode);
30170 if (tmode == V4SFmode)
30171 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
30172 else
30173 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
30174 }
30175 else
30176 target = subtarget;
30177
30178 return target;
30179
30180 default:
30181 break;
30182 }
30183
30184 for (i = 0, d = bdesc_special_args;
30185 i < ARRAY_SIZE (bdesc_special_args);
30186 i++, d++)
30187 if (d->code == fcode)
30188 return ix86_expand_special_args_builtin (d, exp, target);
30189
30190 for (i = 0, d = bdesc_args;
30191 i < ARRAY_SIZE (bdesc_args);
30192 i++, d++)
30193 if (d->code == fcode)
30194 switch (fcode)
30195 {
30196 case IX86_BUILTIN_FABSQ:
30197 case IX86_BUILTIN_COPYSIGNQ:
30198 if (!TARGET_SSE2)
30199 /* Emit a normal call if SSE2 isn't available. */
30200 return expand_call (exp, target, ignore);
30201 default:
30202 return ix86_expand_args_builtin (d, exp, target);
30203 }
30204
30205 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
30206 if (d->code == fcode)
30207 return ix86_expand_sse_comi (d, exp, target);
30208
30209 for (i = 0, d = bdesc_pcmpestr;
30210 i < ARRAY_SIZE (bdesc_pcmpestr);
30211 i++, d++)
30212 if (d->code == fcode)
30213 return ix86_expand_sse_pcmpestr (d, exp, target);
30214
30215 for (i = 0, d = bdesc_pcmpistr;
30216 i < ARRAY_SIZE (bdesc_pcmpistr);
30217 i++, d++)
30218 if (d->code == fcode)
30219 return ix86_expand_sse_pcmpistr (d, exp, target);
30220
30221 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
30222 if (d->code == fcode)
30223 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
30224 (enum ix86_builtin_func_type)
30225 d->flag, d->comparison);
30226
30227 gcc_unreachable ();
30228 }
30229
30230 /* Returns a function decl for a vectorized version of the builtin function
30231    FNDECL, with result vector type TYPE_OUT and argument vector type TYPE_IN,
30232    or NULL_TREE if it is not available.  */
30233
30234 static tree
30235 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
30236 tree type_in)
30237 {
30238 enum machine_mode in_mode, out_mode;
30239 int in_n, out_n;
30240 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
30241
30242 if (TREE_CODE (type_out) != VECTOR_TYPE
30243 || TREE_CODE (type_in) != VECTOR_TYPE
30244 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
30245 return NULL_TREE;
30246
30247 out_mode = TYPE_MODE (TREE_TYPE (type_out));
30248 out_n = TYPE_VECTOR_SUBPARTS (type_out);
30249 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30250 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30251
30252 switch (fn)
30253 {
30254 case BUILT_IN_SQRT:
30255 if (out_mode == DFmode && in_mode == DFmode)
30256 {
30257 if (out_n == 2 && in_n == 2)
30258 return ix86_builtins[IX86_BUILTIN_SQRTPD];
30259 else if (out_n == 4 && in_n == 4)
30260 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
30261 }
30262 break;
30263
30264 case BUILT_IN_SQRTF:
30265 if (out_mode == SFmode && in_mode == SFmode)
30266 {
30267 if (out_n == 4 && in_n == 4)
30268 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
30269 else if (out_n == 8 && in_n == 8)
30270 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
30271 }
30272 break;
30273
30274 case BUILT_IN_IFLOOR:
30275 case BUILT_IN_LFLOOR:
30276 case BUILT_IN_LLFLOOR:
30277 /* The round insn does not trap on denormals. */
30278 if (flag_trapping_math || !TARGET_ROUND)
30279 break;
30280
30281 if (out_mode == SImode && in_mode == DFmode)
30282 {
30283 if (out_n == 4 && in_n == 2)
30284 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
30285 else if (out_n == 8 && in_n == 4)
30286 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
30287 }
30288 break;
30289
30290 case BUILT_IN_IFLOORF:
30291 case BUILT_IN_LFLOORF:
30292 case BUILT_IN_LLFLOORF:
30293 /* The round insn does not trap on denormals. */
30294 if (flag_trapping_math || !TARGET_ROUND)
30295 break;
30296
30297 if (out_mode == SImode && in_mode == SFmode)
30298 {
30299 if (out_n == 4 && in_n == 4)
30300 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
30301 else if (out_n == 8 && in_n == 8)
30302 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
30303 }
30304 break;
30305
30306 case BUILT_IN_ICEIL:
30307 case BUILT_IN_LCEIL:
30308 case BUILT_IN_LLCEIL:
30309 /* The round insn does not trap on denormals. */
30310 if (flag_trapping_math || !TARGET_ROUND)
30311 break;
30312
30313 if (out_mode == SImode && in_mode == DFmode)
30314 {
30315 if (out_n == 4 && in_n == 2)
30316 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
30317 else if (out_n == 8 && in_n == 4)
30318 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
30319 }
30320 break;
30321
30322 case BUILT_IN_ICEILF:
30323 case BUILT_IN_LCEILF:
30324 case BUILT_IN_LLCEILF:
30325 /* The round insn does not trap on denormals. */
30326 if (flag_trapping_math || !TARGET_ROUND)
30327 break;
30328
30329 if (out_mode == SImode && in_mode == SFmode)
30330 {
30331 if (out_n == 4 && in_n == 4)
30332 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
30333 else if (out_n == 8 && in_n == 8)
30334 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
30335 }
30336 break;
30337
30338 case BUILT_IN_IRINT:
30339 case BUILT_IN_LRINT:
30340 case BUILT_IN_LLRINT:
30341 if (out_mode == SImode && in_mode == DFmode)
30342 {
30343 if (out_n == 4 && in_n == 2)
30344 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
30345 else if (out_n == 8 && in_n == 4)
30346 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
30347 }
30348 break;
30349
30350 case BUILT_IN_IRINTF:
30351 case BUILT_IN_LRINTF:
30352 case BUILT_IN_LLRINTF:
30353 if (out_mode == SImode && in_mode == SFmode)
30354 {
30355 if (out_n == 4 && in_n == 4)
30356 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
30357 else if (out_n == 8 && in_n == 8)
30358 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
30359 }
30360 break;
30361
30362 case BUILT_IN_IROUND:
30363 case BUILT_IN_LROUND:
30364 case BUILT_IN_LLROUND:
30365 /* The round insn does not trap on denormals. */
30366 if (flag_trapping_math || !TARGET_ROUND)
30367 break;
30368
30369 if (out_mode == SImode && in_mode == DFmode)
30370 {
30371 if (out_n == 4 && in_n == 2)
30372 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
30373 else if (out_n == 8 && in_n == 4)
30374 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
30375 }
30376 break;
30377
30378 case BUILT_IN_IROUNDF:
30379 case BUILT_IN_LROUNDF:
30380 case BUILT_IN_LLROUNDF:
30381 /* The round insn does not trap on denormals. */
30382 if (flag_trapping_math || !TARGET_ROUND)
30383 break;
30384
30385 if (out_mode == SImode && in_mode == SFmode)
30386 {
30387 if (out_n == 4 && in_n == 4)
30388 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
30389 else if (out_n == 8 && in_n == 8)
30390 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
30391 }
30392 break;
30393
30394 case BUILT_IN_COPYSIGN:
30395 if (out_mode == DFmode && in_mode == DFmode)
30396 {
30397 if (out_n == 2 && in_n == 2)
30398 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
30399 else if (out_n == 4 && in_n == 4)
30400 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
30401 }
30402 break;
30403
30404 case BUILT_IN_COPYSIGNF:
30405 if (out_mode == SFmode && in_mode == SFmode)
30406 {
30407 if (out_n == 4 && in_n == 4)
30408 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
30409 else if (out_n == 8 && in_n == 8)
30410 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
30411 }
30412 break;
30413
30414 case BUILT_IN_FLOOR:
30415 /* The round insn does not trap on denormals. */
30416 if (flag_trapping_math || !TARGET_ROUND)
30417 break;
30418
30419 if (out_mode == DFmode && in_mode == DFmode)
30420 {
30421 if (out_n == 2 && in_n == 2)
30422 return ix86_builtins[IX86_BUILTIN_FLOORPD];
30423 else if (out_n == 4 && in_n == 4)
30424 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
30425 }
30426 break;
30427
30428 case BUILT_IN_FLOORF:
30429 /* The round insn does not trap on denormals. */
30430 if (flag_trapping_math || !TARGET_ROUND)
30431 break;
30432
30433 if (out_mode == SFmode && in_mode == SFmode)
30434 {
30435 if (out_n == 4 && in_n == 4)
30436 return ix86_builtins[IX86_BUILTIN_FLOORPS];
30437 else if (out_n == 8 && in_n == 8)
30438 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
30439 }
30440 break;
30441
30442 case BUILT_IN_CEIL:
30443 /* The round insn does not trap on denormals. */
30444 if (flag_trapping_math || !TARGET_ROUND)
30445 break;
30446
30447 if (out_mode == DFmode && in_mode == DFmode)
30448 {
30449 if (out_n == 2 && in_n == 2)
30450 return ix86_builtins[IX86_BUILTIN_CEILPD];
30451 else if (out_n == 4 && in_n == 4)
30452 return ix86_builtins[IX86_BUILTIN_CEILPD256];
30453 }
30454 break;
30455
30456 case BUILT_IN_CEILF:
30457 /* The round insn does not trap on denormals. */
30458 if (flag_trapping_math || !TARGET_ROUND)
30459 break;
30460
30461 if (out_mode == SFmode && in_mode == SFmode)
30462 {
30463 if (out_n == 4 && in_n == 4)
30464 return ix86_builtins[IX86_BUILTIN_CEILPS];
30465 else if (out_n == 8 && in_n == 8)
30466 return ix86_builtins[IX86_BUILTIN_CEILPS256];
30467 }
30468 break;
30469
30470 case BUILT_IN_TRUNC:
30471 /* The round insn does not trap on denormals. */
30472 if (flag_trapping_math || !TARGET_ROUND)
30473 break;
30474
30475 if (out_mode == DFmode && in_mode == DFmode)
30476 {
30477 if (out_n == 2 && in_n == 2)
30478 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
30479 else if (out_n == 4 && in_n == 4)
30480 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
30481 }
30482 break;
30483
30484 case BUILT_IN_TRUNCF:
30485 /* The round insn does not trap on denormals. */
30486 if (flag_trapping_math || !TARGET_ROUND)
30487 break;
30488
30489 if (out_mode == SFmode && in_mode == SFmode)
30490 {
30491 if (out_n == 4 && in_n == 4)
30492 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
30493 else if (out_n == 8 && in_n == 8)
30494 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
30495 }
30496 break;
30497
30498 case BUILT_IN_RINT:
30499 /* The round insn does not trap on denormals. */
30500 if (flag_trapping_math || !TARGET_ROUND)
30501 break;
30502
30503 if (out_mode == DFmode && in_mode == DFmode)
30504 {
30505 if (out_n == 2 && in_n == 2)
30506 return ix86_builtins[IX86_BUILTIN_RINTPD];
30507 else if (out_n == 4 && in_n == 4)
30508 return ix86_builtins[IX86_BUILTIN_RINTPD256];
30509 }
30510 break;
30511
30512 case BUILT_IN_RINTF:
30513 /* The round insn does not trap on denormals. */
30514 if (flag_trapping_math || !TARGET_ROUND)
30515 break;
30516
30517 if (out_mode == SFmode && in_mode == SFmode)
30518 {
30519 if (out_n == 4 && in_n == 4)
30520 return ix86_builtins[IX86_BUILTIN_RINTPS];
30521 else if (out_n == 8 && in_n == 8)
30522 return ix86_builtins[IX86_BUILTIN_RINTPS256];
30523 }
30524 break;
30525
30526 case BUILT_IN_ROUND:
30527 /* The round insn does not trap on denormals. */
30528 if (flag_trapping_math || !TARGET_ROUND)
30529 break;
30530
30531 if (out_mode == DFmode && in_mode == DFmode)
30532 {
30533 if (out_n == 2 && in_n == 2)
30534 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
30535 else if (out_n == 4 && in_n == 4)
30536 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
30537 }
30538 break;
30539
30540 case BUILT_IN_ROUNDF:
30541 /* The round insn does not trap on denormals. */
30542 if (flag_trapping_math || !TARGET_ROUND)
30543 break;
30544
30545 if (out_mode == SFmode && in_mode == SFmode)
30546 {
30547 if (out_n == 4 && in_n == 4)
30548 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
30549 else if (out_n == 8 && in_n == 8)
30550 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
30551 }
30552 break;
30553
30554 case BUILT_IN_FMA:
30555 if (out_mode == DFmode && in_mode == DFmode)
30556 {
30557 if (out_n == 2 && in_n == 2)
30558 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
30559 if (out_n == 4 && in_n == 4)
30560 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
30561 }
30562 break;
30563
30564 case BUILT_IN_FMAF:
30565 if (out_mode == SFmode && in_mode == SFmode)
30566 {
30567 if (out_n == 4 && in_n == 4)
30568 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
30569 if (out_n == 8 && in_n == 8)
30570 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
30571 }
30572 break;
30573
30574 default:
30575 break;
30576 }
30577
30578 /* Dispatch to a handler for a vectorization library. */
30579 if (ix86_veclib_handler)
30580 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
30581 type_in);
30582
30583 return NULL_TREE;
30584 }
30585
30586 /* Handler for an SVML-style interface to
30587 a library with vectorized intrinsics. */
30588
30589 static tree
30590 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
30591 {
30592 char name[20];
30593 tree fntype, new_fndecl, args;
30594 unsigned arity;
30595 const char *bname;
30596 enum machine_mode el_mode, in_mode;
30597 int n, in_n;
30598
30599 /* The SVML is suitable for unsafe math only. */
30600 if (!flag_unsafe_math_optimizations)
30601 return NULL_TREE;
30602
30603 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30604 n = TYPE_VECTOR_SUBPARTS (type_out);
30605 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30606 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30607 if (el_mode != in_mode
30608 || n != in_n)
30609 return NULL_TREE;
30610
30611 switch (fn)
30612 {
30613 case BUILT_IN_EXP:
30614 case BUILT_IN_LOG:
30615 case BUILT_IN_LOG10:
30616 case BUILT_IN_POW:
30617 case BUILT_IN_TANH:
30618 case BUILT_IN_TAN:
30619 case BUILT_IN_ATAN:
30620 case BUILT_IN_ATAN2:
30621 case BUILT_IN_ATANH:
30622 case BUILT_IN_CBRT:
30623 case BUILT_IN_SINH:
30624 case BUILT_IN_SIN:
30625 case BUILT_IN_ASINH:
30626 case BUILT_IN_ASIN:
30627 case BUILT_IN_COSH:
30628 case BUILT_IN_COS:
30629 case BUILT_IN_ACOSH:
30630 case BUILT_IN_ACOS:
30631 if (el_mode != DFmode || n != 2)
30632 return NULL_TREE;
30633 break;
30634
30635 case BUILT_IN_EXPF:
30636 case BUILT_IN_LOGF:
30637 case BUILT_IN_LOG10F:
30638 case BUILT_IN_POWF:
30639 case BUILT_IN_TANHF:
30640 case BUILT_IN_TANF:
30641 case BUILT_IN_ATANF:
30642 case BUILT_IN_ATAN2F:
30643 case BUILT_IN_ATANHF:
30644 case BUILT_IN_CBRTF:
30645 case BUILT_IN_SINHF:
30646 case BUILT_IN_SINF:
30647 case BUILT_IN_ASINHF:
30648 case BUILT_IN_ASINF:
30649 case BUILT_IN_COSHF:
30650 case BUILT_IN_COSF:
30651 case BUILT_IN_ACOSHF:
30652 case BUILT_IN_ACOSF:
30653 if (el_mode != SFmode || n != 4)
30654 return NULL_TREE;
30655 break;
30656
30657 default:
30658 return NULL_TREE;
30659 }
30660
30661 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30662
30663 if (fn == BUILT_IN_LOGF)
30664 strcpy (name, "vmlsLn4");
30665 else if (fn == BUILT_IN_LOG)
30666 strcpy (name, "vmldLn2");
30667 else if (n == 4)
30668 {
30669 sprintf (name, "vmls%s", bname+10);
30670 name[strlen (name)-1] = '4';
30671 }
30672 else
30673 sprintf (name, "vmld%s2", bname+10);
30674
30675 /* Convert to uppercase. */
30676 name[4] &= ~0x20;
30677
30678 arity = 0;
30679 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30680 args;
30681 args = TREE_CHAIN (args))
30682 arity++;
30683
30684 if (arity == 1)
30685 fntype = build_function_type_list (type_out, type_in, NULL);
30686 else
30687 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30688
30689 /* Build a function declaration for the vectorized function. */
30690 new_fndecl = build_decl (BUILTINS_LOCATION,
30691 FUNCTION_DECL, get_identifier (name), fntype);
30692 TREE_PUBLIC (new_fndecl) = 1;
30693 DECL_EXTERNAL (new_fndecl) = 1;
30694 DECL_IS_NOVOPS (new_fndecl) = 1;
30695 TREE_READONLY (new_fndecl) = 1;
30696
30697 return new_fndecl;
30698 }
30699
30700 /* Handler for an ACML-style interface to
30701 a library with vectorized intrinsics. */
30702
30703 static tree
30704 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
30705 {
30706 char name[20] = "__vr.._";
30707 tree fntype, new_fndecl, args;
30708 unsigned arity;
30709 const char *bname;
30710 enum machine_mode el_mode, in_mode;
30711 int n, in_n;
30712
30713   /* The ACML is 64-bit only and suitable only for unsafe math, as
30714      it does not correctly support parts of IEEE (such as denormals)
30715      with the required precision.  */
30716 if (!TARGET_64BIT
30717 || !flag_unsafe_math_optimizations)
30718 return NULL_TREE;
30719
30720 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30721 n = TYPE_VECTOR_SUBPARTS (type_out);
30722 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30723 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30724 if (el_mode != in_mode
30725 || n != in_n)
30726 return NULL_TREE;
30727
30728 switch (fn)
30729 {
30730 case BUILT_IN_SIN:
30731 case BUILT_IN_COS:
30732 case BUILT_IN_EXP:
30733 case BUILT_IN_LOG:
30734 case BUILT_IN_LOG2:
30735 case BUILT_IN_LOG10:
30736 name[4] = 'd';
30737 name[5] = '2';
30738 if (el_mode != DFmode
30739 || n != 2)
30740 return NULL_TREE;
30741 break;
30742
30743 case BUILT_IN_SINF:
30744 case BUILT_IN_COSF:
30745 case BUILT_IN_EXPF:
30746 case BUILT_IN_POWF:
30747 case BUILT_IN_LOGF:
30748 case BUILT_IN_LOG2F:
30749 case BUILT_IN_LOG10F:
30750 name[4] = 's';
30751 name[5] = '4';
30752 if (el_mode != SFmode
30753 || n != 4)
30754 return NULL_TREE;
30755 break;
30756
30757 default:
30758 return NULL_TREE;
30759 }
30760
30761 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30762 sprintf (name + 7, "%s", bname+10);
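  /* For example, BUILT_IN_SINF yields "__vrs4_sinf" and BUILT_IN_LOG
     yields "__vrd2_log".  */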
30763
30764 arity = 0;
30765 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30766 args;
30767 args = TREE_CHAIN (args))
30768 arity++;
30769
30770 if (arity == 1)
30771 fntype = build_function_type_list (type_out, type_in, NULL);
30772 else
30773 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30774
30775 /* Build a function declaration for the vectorized function. */
30776 new_fndecl = build_decl (BUILTINS_LOCATION,
30777 FUNCTION_DECL, get_identifier (name), fntype);
30778 TREE_PUBLIC (new_fndecl) = 1;
30779 DECL_EXTERNAL (new_fndecl) = 1;
30780 DECL_IS_NOVOPS (new_fndecl) = 1;
30781 TREE_READONLY (new_fndecl) = 1;
30782
30783 return new_fndecl;
30784 }
30785
30786 /* Returns a decl of a function that implements gather load with
30787 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
30788 Return NULL_TREE if it is not available. */
30789
30790 static tree
30791 ix86_vectorize_builtin_gather (const_tree mem_vectype,
30792 const_tree index_type, int scale)
30793 {
30794 bool si;
30795 enum ix86_builtins code;
30796
30797 if (! TARGET_AVX2)
30798 return NULL_TREE;
30799
30800 if ((TREE_CODE (index_type) != INTEGER_TYPE
30801 && !POINTER_TYPE_P (index_type))
30802 || (TYPE_MODE (index_type) != SImode
30803 && TYPE_MODE (index_type) != DImode))
30804 return NULL_TREE;
30805
30806 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
30807 return NULL_TREE;
30808
30809 /* v*gather* insn sign extends index to pointer mode. */
30810 if (TYPE_PRECISION (index_type) < POINTER_SIZE
30811 && TYPE_UNSIGNED (index_type))
30812 return NULL_TREE;
30813
30814 if (scale <= 0
30815 || scale > 8
30816 || (scale & (scale - 1)) != 0)
30817 return NULL_TREE;
30818
30819 si = TYPE_MODE (index_type) == SImode;
30820 switch (TYPE_MODE (mem_vectype))
30821 {
30822 case V2DFmode:
30823 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
30824 break;
30825 case V4DFmode:
30826 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
30827 break;
30828 case V2DImode:
30829 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
30830 break;
30831 case V4DImode:
30832 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
30833 break;
30834 case V4SFmode:
30835 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
30836 break;
30837 case V8SFmode:
30838 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
30839 break;
30840 case V4SImode:
30841 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
30842 break;
30843 case V8SImode:
30844 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
30845 break;
30846 default:
30847 return NULL_TREE;
30848 }
30849
30850 return ix86_builtins[code];
30851 }
30852
30853 /* Returns a decl of a target-specific builtin that implements the
30854    reciprocal of the function with code FN, or NULL_TREE if not available.  */
30855
30856 static tree
30857 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
30858 bool sqrt ATTRIBUTE_UNUSED)
30859 {
30860 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
30861 && flag_finite_math_only && !flag_trapping_math
30862 && flag_unsafe_math_optimizations))
30863 return NULL_TREE;
30864
30865 if (md_fn)
30866 /* Machine dependent builtins. */
30867 switch (fn)
30868 {
30869 /* Vectorized version of sqrt to rsqrt conversion. */
30870 case IX86_BUILTIN_SQRTPS_NR:
30871 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
30872
30873 case IX86_BUILTIN_SQRTPS_NR256:
30874 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
30875
30876 default:
30877 return NULL_TREE;
30878 }
30879 else
30880 /* Normal builtins. */
30881 switch (fn)
30882 {
30883 /* Sqrt to rsqrt conversion. */
30884 case BUILT_IN_SQRTF:
30885 return ix86_builtins[IX86_BUILTIN_RSQRTF];
30886
30887 default:
30888 return NULL_TREE;
30889 }
30890 }
30891 \f
30892 /* Helper for avx_vpermilps256_operand et al. This is also used by
30893 the expansion functions to turn the parallel back into a mask.
30894 The return value is 0 for no match and the imm8+1 for a match. */
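/* Worked example (illustrative only): with MODE == V4SFmode and
   PAR == (parallel [2 1 0 3]), the loop below builds
   mask = 2<<0 | 1<<2 | 0<<4 | 3<<6 = 0xc6, so the function returns 0xc7,
   i.e. the vpermilps immediate 0xc6 plus one.  */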
30895
30896 int
30897 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
30898 {
30899 unsigned i, nelt = GET_MODE_NUNITS (mode);
30900 unsigned mask = 0;
30901 unsigned char ipar[8];
30902
30903 if (XVECLEN (par, 0) != (int) nelt)
30904 return 0;
30905
30906 /* Validate that all of the elements are constants, and not totally
30907 out of range. Copy the data into an integral array to make the
30908 subsequent checks easier. */
30909 for (i = 0; i < nelt; ++i)
30910 {
30911 rtx er = XVECEXP (par, 0, i);
30912 unsigned HOST_WIDE_INT ei;
30913
30914 if (!CONST_INT_P (er))
30915 return 0;
30916 ei = INTVAL (er);
30917 if (ei >= nelt)
30918 return 0;
30919 ipar[i] = ei;
30920 }
30921
30922 switch (mode)
30923 {
30924 case V4DFmode:
30925 /* In the 256-bit DFmode case, we can only move elements within
30926 a 128-bit lane. */
30927 for (i = 0; i < 2; ++i)
30928 {
30929 if (ipar[i] >= 2)
30930 return 0;
30931 mask |= ipar[i] << i;
30932 }
30933 for (i = 2; i < 4; ++i)
30934 {
30935 if (ipar[i] < 2)
30936 return 0;
30937 mask |= (ipar[i] - 2) << i;
30938 }
30939 break;
30940
30941 case V8SFmode:
30942 /* In the 256-bit SFmode case, we have full freedom of movement
30943 within the low 128-bit lane, but the high 128-bit lane must
30944 mirror the exact same pattern. */
30945 for (i = 0; i < 4; ++i)
30946 if (ipar[i] + 4 != ipar[i + 4])
30947 return 0;
30948 nelt = 4;
30949 /* FALLTHRU */
30950
30951 case V2DFmode:
30952 case V4SFmode:
30953 /* In the 128-bit case, we've full freedom in the placement of
30954 the elements from the source operand. */
30955 for (i = 0; i < nelt; ++i)
30956 mask |= ipar[i] << (i * (nelt / 2));
30957 break;
30958
30959 default:
30960 gcc_unreachable ();
30961 }
30962
30963 /* Make sure success has a non-zero value by adding one. */
30964 return mask + 1;
30965 }
30966
30967 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
30968 the expansion functions to turn the parallel back into a mask.
30969 The return value is 0 for no match and the imm8+1 for a match. */
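/* Worked example (illustrative only): with MODE == V8SFmode and
   PAR == (parallel [0 1 2 3 4 5 6 7]), each half is consecutive and the
   two lane selectors are 0 and 1, so mask = 0 | (1 << 4) = 0x10 and the
   function returns 0x11 -- the vperm2f128 immediate 0x10 plus one.  */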
30970
30971 int
30972 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
30973 {
30974 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
30975 unsigned mask = 0;
30976 unsigned char ipar[8];
30977
30978 if (XVECLEN (par, 0) != (int) nelt)
30979 return 0;
30980
30981 /* Validate that all of the elements are constants, and not totally
30982 out of range. Copy the data into an integral array to make the
30983 subsequent checks easier. */
30984 for (i = 0; i < nelt; ++i)
30985 {
30986 rtx er = XVECEXP (par, 0, i);
30987 unsigned HOST_WIDE_INT ei;
30988
30989 if (!CONST_INT_P (er))
30990 return 0;
30991 ei = INTVAL (er);
30992 if (ei >= 2 * nelt)
30993 return 0;
30994 ipar[i] = ei;
30995 }
30996
30997 /* Validate that the elements within each half of the permute are consecutive. */
30998 for (i = 0; i < nelt2 - 1; ++i)
30999 if (ipar[i] + 1 != ipar[i + 1])
31000 return 0;
31001 for (i = nelt2; i < nelt - 1; ++i)
31002 if (ipar[i] + 1 != ipar[i + 1])
31003 return 0;
31004
31005 /* Reconstruct the mask. */
31006 for (i = 0; i < 2; ++i)
31007 {
31008 unsigned e = ipar[i * nelt2];
31009 if (e % nelt2)
31010 return 0;
31011 e /= nelt2;
31012 mask |= e << (i * 4);
31013 }
31014
31015 /* Make sure success has a non-zero value by adding one. */
31016 return mask + 1;
31017 }
31018 \f
31019 /* Store OPERAND into memory after reload is completed. This means
31020 that we can't easily use assign_stack_local. */
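/* A concrete illustration (not exhaustive): on a 64-bit target without a
   usable red zone, an SImode OPERAND is widened to DImode and pushed with
   a pre-decrement of the stack pointer, and the returned MEM addresses the
   freshly allocated slot at the new top of stack.  */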
31021 rtx
31022 ix86_force_to_memory (enum machine_mode mode, rtx operand)
31023 {
31024 rtx result;
31025
31026 gcc_assert (reload_completed);
31027 if (ix86_using_red_zone ())
31028 {
31029 result = gen_rtx_MEM (mode,
31030 gen_rtx_PLUS (Pmode,
31031 stack_pointer_rtx,
31032 GEN_INT (-RED_ZONE_SIZE)));
31033 emit_move_insn (result, operand);
31034 }
31035 else if (TARGET_64BIT)
31036 {
31037 switch (mode)
31038 {
31039 case HImode:
31040 case SImode:
31041 operand = gen_lowpart (DImode, operand);
31042 /* FALLTHRU */
31043 case DImode:
31044 emit_insn (
31045 gen_rtx_SET (VOIDmode,
31046 gen_rtx_MEM (DImode,
31047 gen_rtx_PRE_DEC (DImode,
31048 stack_pointer_rtx)),
31049 operand));
31050 break;
31051 default:
31052 gcc_unreachable ();
31053 }
31054 result = gen_rtx_MEM (mode, stack_pointer_rtx);
31055 }
31056 else
31057 {
31058 switch (mode)
31059 {
31060 case DImode:
31061 {
31062 rtx operands[2];
31063 split_double_mode (mode, &operand, 1, operands, operands + 1);
31064 emit_insn (
31065 gen_rtx_SET (VOIDmode,
31066 gen_rtx_MEM (SImode,
31067 gen_rtx_PRE_DEC (Pmode,
31068 stack_pointer_rtx)),
31069 operands[1]));
31070 emit_insn (
31071 gen_rtx_SET (VOIDmode,
31072 gen_rtx_MEM (SImode,
31073 gen_rtx_PRE_DEC (Pmode,
31074 stack_pointer_rtx)),
31075 operands[0]));
31076 }
31077 break;
31078 case HImode:
31079 /* Store HImodes as SImodes. */
31080 operand = gen_lowpart (SImode, operand);
31081 /* FALLTHRU */
31082 case SImode:
31083 emit_insn (
31084 gen_rtx_SET (VOIDmode,
31085 gen_rtx_MEM (GET_MODE (operand),
31086 gen_rtx_PRE_DEC (SImode,
31087 stack_pointer_rtx)),
31088 operand));
31089 break;
31090 default:
31091 gcc_unreachable ();
31092 }
31093 result = gen_rtx_MEM (mode, stack_pointer_rtx);
31094 }
31095 return result;
31096 }
31097
31098 /* Free the operand from memory. */
31099 void
31100 ix86_free_from_memory (enum machine_mode mode)
31101 {
31102 if (!ix86_using_red_zone ())
31103 {
31104 int size;
31105
31106 if (mode == DImode || TARGET_64BIT)
31107 size = 8;
31108 else
31109 size = 4;
31110 /* Use LEA to deallocate stack space. In peephole2 it will be converted
31111 to a pop or add instruction if registers are available. */
31112 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
31113 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
31114 GEN_INT (size))));
31115 }
31116 }
31117
31118 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
31119
31120 Put float CONST_DOUBLE in the constant pool instead of fp regs.
31121 QImode must go into class Q_REGS.
31122 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
31123 movdf to do mem-to-mem moves through integer regs. */
31124
31125 static reg_class_t
31126 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
31127 {
31128 enum machine_mode mode = GET_MODE (x);
31129
31130 /* We're only allowed to return a subclass of CLASS. Many of the
31131 following checks fail for NO_REGS, so eliminate that early. */
31132 if (regclass == NO_REGS)
31133 return NO_REGS;
31134
31135 /* All classes can load zeros. */
31136 if (x == CONST0_RTX (mode))
31137 return regclass;
31138
31139 /* Force constants into memory if we are loading a (nonzero) constant into
31140 an MMX or SSE register. This is because there are no MMX/SSE instructions
31141 to load from a constant. */
31142 if (CONSTANT_P (x)
31143 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
31144 return NO_REGS;
31145
31146 /* Prefer SSE regs only, if we can use them for math. */
31147 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
31148 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
31149
31150 /* Floating-point constants need more complex checks. */
31151 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
31152 {
31153 /* General regs can load everything. */
31154 if (reg_class_subset_p (regclass, GENERAL_REGS))
31155 return regclass;
31156
31157 /* Floats can load 0 and 1 plus some others. Note that we eliminated
31158 zero above. We only want to wind up preferring 80387 registers if
31159 we plan on doing computation with them. */
31160 if (TARGET_80387
31161 && standard_80387_constant_p (x) > 0)
31162 {
31163 /* Limit class to non-sse. */
31164 if (regclass == FLOAT_SSE_REGS)
31165 return FLOAT_REGS;
31166 if (regclass == FP_TOP_SSE_REGS)
31167 return FP_TOP_REG;
31168 if (regclass == FP_SECOND_SSE_REGS)
31169 return FP_SECOND_REG;
31170 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
31171 return regclass;
31172 }
31173
31174 return NO_REGS;
31175 }
31176
31177 /* Generally when we see PLUS here, it's the function invariant
31178 (plus soft-fp const_int), which can only be computed into general
31179 regs. */
31180 if (GET_CODE (x) == PLUS)
31181 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
31182
31183 /* QImode constants are easy to load, but non-constant QImode data
31184 must go into Q_REGS. */
31185 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
31186 {
31187 if (reg_class_subset_p (regclass, Q_REGS))
31188 return regclass;
31189 if (reg_class_subset_p (Q_REGS, regclass))
31190 return Q_REGS;
31191 return NO_REGS;
31192 }
31193
31194 return regclass;
31195 }
31196
31197 /* Discourage putting floating-point values in SSE registers unless
31198 SSE math is being used, and likewise for the 387 registers. */
31199 static reg_class_t
31200 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
31201 {
31202 enum machine_mode mode = GET_MODE (x);
31203
31204 /* Restrict the output reload class to the register bank that we are doing
31205 math on. If we would like not to return a subset of CLASS, reject this
31206 alternative: if reload cannot do this, it will still use its choice. */
31207 mode = GET_MODE (x);
31208 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
31209 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
31210
31211 if (X87_FLOAT_MODE_P (mode))
31212 {
31213 if (regclass == FP_TOP_SSE_REGS)
31214 return FP_TOP_REG;
31215 else if (regclass == FP_SECOND_SSE_REGS)
31216 return FP_SECOND_REG;
31217 else
31218 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
31219 }
31220
31221 return regclass;
31222 }
31223
31224 static reg_class_t
31225 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
31226 enum machine_mode mode, secondary_reload_info *sri)
31227 {
31228 /* Double-word spills from general registers to non-offsettable memory
31229 references (zero-extended addresses) require special handling. */
31230 if (TARGET_64BIT
31231 && MEM_P (x)
31232 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
31233 && rclass == GENERAL_REGS
31234 && !offsettable_memref_p (x))
31235 {
31236 sri->icode = (in_p
31237 ? CODE_FOR_reload_noff_load
31238 : CODE_FOR_reload_noff_store);
31239 /* Add the cost of moving address to a temporary. */
31240 sri->extra_cost = 1;
31241
31242 return NO_REGS;
31243 }
31244
31245 /* QImode spills from non-QI registers require
31246 an intermediate register on 32bit targets. */
31247 if (!TARGET_64BIT
31248 && !in_p && mode == QImode
31249 && (rclass == GENERAL_REGS
31250 || rclass == LEGACY_REGS
31251 || rclass == INDEX_REGS))
31252 {
31253 int regno;
31254
31255 if (REG_P (x))
31256 regno = REGNO (x);
31257 else
31258 regno = -1;
31259
31260 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
31261 regno = true_regnum (x);
31262
31263 /* Return Q_REGS if the operand is in memory. */
31264 if (regno == -1)
31265 return Q_REGS;
31266 }
31267
31268 /* This condition handles the corner case where an expression involving
31269 pointers gets vectorized. We're trying to use the address of a
31270 stack slot as a vector initializer.
31271
31272 (set (reg:V2DI 74 [ vect_cst_.2 ])
31273 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
31274
31275 Eventually frame gets turned into sp+offset like this:
31276
31277 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31278 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31279 (const_int 392 [0x188]))))
31280
31281 That later gets turned into:
31282
31283 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31284 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
31285 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
31286
31287 We'll have the following reload recorded:
31288
31289 Reload 0: reload_in (DI) =
31290 (plus:DI (reg/f:DI 7 sp)
31291 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
31292 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31293 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
31294 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
31295 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
31296 reload_reg_rtx: (reg:V2DI 22 xmm1)
31297
31298 Which isn't going to work since SSE instructions can't handle scalar
31299 additions. Returning GENERAL_REGS forces the addition into integer
31300 register and reload can handle subsequent reloads without problems. */
31301
31302 if (in_p && GET_CODE (x) == PLUS
31303 && SSE_CLASS_P (rclass)
31304 && SCALAR_INT_MODE_P (mode))
31305 return GENERAL_REGS;
31306
31307 return NO_REGS;
31308 }
31309
31310 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
31311
31312 static bool
31313 ix86_class_likely_spilled_p (reg_class_t rclass)
31314 {
31315 switch (rclass)
31316 {
31317 case AREG:
31318 case DREG:
31319 case CREG:
31320 case BREG:
31321 case AD_REGS:
31322 case SIREG:
31323 case DIREG:
31324 case SSE_FIRST_REG:
31325 case FP_TOP_REG:
31326 case FP_SECOND_REG:
31327 return true;
31328
31329 default:
31330 break;
31331 }
31332
31333 return false;
31334 }
31335
31336 /* If we are copying between general and FP registers, we need a memory
31337 location. The same is true for SSE and MMX registers.
31338
31339 To optimize register_move_cost performance, allow inline variant.
31340
31341 The macro can't work reliably when one of the CLASSES is a class containing
31342 registers from multiple units (SSE, MMX, integer). We avoid this by never
31343 combining those units in a single alternative in the machine description.
31344 Ensure that this constraint holds to avoid unexpected surprises.
31345
31346 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
31347 enforce these sanity checks. */
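/* Illustrative example: a copy between SSE_REGS and GENERAL_REGS is
   reported as needing a memory intermediate when SSE2 is unavailable, when
   the tuning marks inter-unit moves as expensive, or when the mode is
   wider than a word.  */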
31348
31349 static inline bool
31350 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31351 enum machine_mode mode, int strict)
31352 {
31353 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
31354 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
31355 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
31356 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
31357 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
31358 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
31359 {
31360 gcc_assert (!strict);
31361 return true;
31362 }
31363
31364 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
31365 return true;
31366
31367 /* ??? This is a lie. We do have moves between mmx/general, and between
31368 mmx/sse2. But by saying we need secondary memory we discourage the
31369 register allocator from using the mmx registers unless needed. */
31370 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
31371 return true;
31372
31373 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31374 {
31375 /* SSE1 doesn't have any direct moves from other classes. */
31376 if (!TARGET_SSE2)
31377 return true;
31378
31379 /* If the target says that inter-unit moves are more expensive
31380 than moving through memory, then don't generate them. */
31381 if (!TARGET_INTER_UNIT_MOVES)
31382 return true;
31383
31384 /* Between SSE and general, we have moves no larger than word size. */
31385 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
31386 return true;
31387 }
31388
31389 return false;
31390 }
31391
31392 bool
31393 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
31394 enum machine_mode mode, int strict)
31395 {
31396 return inline_secondary_memory_needed (class1, class2, mode, strict);
31397 }
31398
31399 /* Implement the TARGET_CLASS_MAX_NREGS hook.
31400
31401 On the 80386, this is the size of MODE in words,
31402 except in the FP regs, where a single reg is always enough. */
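/* For example (illustrative only): XFmode needs 3 general registers in
   32-bit mode (2 in 64-bit mode), while any non-integer class needs just
   one register for it; complex modes in non-integer classes need two.  */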
31403
31404 static unsigned char
31405 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
31406 {
31407 if (MAYBE_INTEGER_CLASS_P (rclass))
31408 {
31409 if (mode == XFmode)
31410 return (TARGET_64BIT ? 2 : 3);
31411 else if (mode == XCmode)
31412 return (TARGET_64BIT ? 4 : 6);
31413 else
31414 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
31415 }
31416 else
31417 {
31418 if (COMPLEX_MODE_P (mode))
31419 return 2;
31420 else
31421 return 1;
31422 }
31423 }
31424
31425 /* Return true if the registers in CLASS cannot represent the change from
31426 modes FROM to TO. */
31427
31428 bool
31429 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
31430 enum reg_class regclass)
31431 {
31432 if (from == to)
31433 return false;
31434
31435 /* x87 registers can't do subreg at all, as all values are reformatted
31436 to extended precision. */
31437 if (MAYBE_FLOAT_CLASS_P (regclass))
31438 return true;
31439
31440 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
31441 {
31442 /* Vector registers do not support QI or HImode loads. If we don't
31443 disallow a change to these modes, reload will assume it's ok to
31444 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
31445 the vec_dupv4hi pattern. */
31446 if (GET_MODE_SIZE (from) < 4)
31447 return true;
31448
31449 /* Vector registers do not support subreg with nonzero offsets, which
31450 are otherwise valid for integer registers. Since we can't see
31451 whether we have a nonzero offset from here, prohibit all
31452 nonparadoxical subregs changing size. */
31453 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
31454 return true;
31455 }
31456
31457 return false;
31458 }
31459
31460 /* Return the cost of moving data of mode M between a
31461 register and memory. A value of 2 is the default; this cost is
31462 relative to those in `REGISTER_MOVE_COST'.
31463
31464 This function is used extensively by register_move_cost that is used to
31465 build tables at startup. Make it inline in this case.
31466 When IN is 2, return maximum of in and out move cost.
31467
31468 If moving between registers and memory is more expensive than
31469 between two registers, you should define this macro to express the
31470 relative cost.
31471
31472 Also model the increased cost of moving QImode values held in non-Q_REGS
31473 classes.
31474 */
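/* Illustrative example: an SFmode value in FLOAT_REGS costs fp_load[0]
   when loaded, fp_store[0] when stored, and the maximum of the two when
   IN is 2.  */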
31475 static inline int
31476 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
31477 int in)
31478 {
31479 int cost;
31480 if (FLOAT_CLASS_P (regclass))
31481 {
31482 int index;
31483 switch (mode)
31484 {
31485 case SFmode:
31486 index = 0;
31487 break;
31488 case DFmode:
31489 index = 1;
31490 break;
31491 case XFmode:
31492 index = 2;
31493 break;
31494 default:
31495 return 100;
31496 }
31497 if (in == 2)
31498 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
31499 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
31500 }
31501 if (SSE_CLASS_P (regclass))
31502 {
31503 int index;
31504 switch (GET_MODE_SIZE (mode))
31505 {
31506 case 4:
31507 index = 0;
31508 break;
31509 case 8:
31510 index = 1;
31511 break;
31512 case 16:
31513 index = 2;
31514 break;
31515 default:
31516 return 100;
31517 }
31518 if (in == 2)
31519 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
31520 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
31521 }
31522 if (MMX_CLASS_P (regclass))
31523 {
31524 int index;
31525 switch (GET_MODE_SIZE (mode))
31526 {
31527 case 4:
31528 index = 0;
31529 break;
31530 case 8:
31531 index = 1;
31532 break;
31533 default:
31534 return 100;
31535 }
31536 if (in == 2)
31537 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
31538 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
31539 }
31540 switch (GET_MODE_SIZE (mode))
31541 {
31542 case 1:
31543 if (Q_CLASS_P (regclass) || TARGET_64BIT)
31544 {
31545 if (!in)
31546 return ix86_cost->int_store[0];
31547 if (TARGET_PARTIAL_REG_DEPENDENCY
31548 && optimize_function_for_speed_p (cfun))
31549 cost = ix86_cost->movzbl_load;
31550 else
31551 cost = ix86_cost->int_load[0];
31552 if (in == 2)
31553 return MAX (cost, ix86_cost->int_store[0]);
31554 return cost;
31555 }
31556 else
31557 {
31558 if (in == 2)
31559 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
31560 if (in)
31561 return ix86_cost->movzbl_load;
31562 else
31563 return ix86_cost->int_store[0] + 4;
31564 }
31565 break;
31566 case 2:
31567 if (in == 2)
31568 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
31569 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
31570 default:
31571 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
31572 if (mode == TFmode)
31573 mode = XFmode;
31574 if (in == 2)
31575 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
31576 else if (in)
31577 cost = ix86_cost->int_load[2];
31578 else
31579 cost = ix86_cost->int_store[2];
31580 return (cost * (((int) GET_MODE_SIZE (mode)
31581 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
31582 }
31583 }
31584
31585 static int
31586 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
31587 bool in)
31588 {
31589 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
31590 }
31591
31592
31593 /* Return the cost of moving data from a register in class CLASS1 to
31594 one in class CLASS2.
31595
31596 It is not required that the cost always equal 2 when FROM is the same as TO;
31597 on some machines it is expensive to move between registers if they are not
31598 general registers. */
31599
31600 static int
31601 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
31602 reg_class_t class2_i)
31603 {
31604 enum reg_class class1 = (enum reg_class) class1_i;
31605 enum reg_class class2 = (enum reg_class) class2_i;
31606
31607 /* In case we require secondary memory, compute cost of the store followed
31608 by load. In order to avoid bad register allocation choices, we need
31609 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
31610
31611 if (inline_secondary_memory_needed (class1, class2, mode, 0))
31612 {
31613 int cost = 1;
31614
31615 cost += inline_memory_move_cost (mode, class1, 2);
31616 cost += inline_memory_move_cost (mode, class2, 2);
31617
31618 /* In case of copying from a general purpose register we may emit multiple
31619 stores followed by a single load, causing a memory size mismatch stall.
31620 Count this as an arbitrarily high cost of 20.
31621 if (targetm.class_max_nregs (class1, mode)
31622 > targetm.class_max_nregs (class2, mode))
31623 cost += 20;
31624
31625 /* In the case of FP/MMX moves, the registers actually overlap, and we
31626 have to switch modes in order to treat them differently. */
31627 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31628 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31629 cost += 20;
31630
31631 return cost;
31632 }
31633
31634 /* Moves between SSE/MMX and integer unit are expensive. */
31635 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31636 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31637
31638 /* ??? By keeping the returned value relatively high, we limit the number
31639 of moves between integer and MMX/SSE registers for all targets.
31640 Additionally, a high value prevents a problem with x86_modes_tieable_p(),
31641 where integer modes in MMX/SSE registers are not tieable
31642 because of missing QImode and HImode moves to, from or between
31643 MMX/SSE registers. */
31644 return MAX (8, ix86_cost->mmxsse_to_integer);
31645
31646 if (MAYBE_FLOAT_CLASS_P (class1))
31647 return ix86_cost->fp_move;
31648 if (MAYBE_SSE_CLASS_P (class1))
31649 return ix86_cost->sse_move;
31650 if (MAYBE_MMX_CLASS_P (class1))
31651 return ix86_cost->mmx_move;
31652 return 2;
31653 }
31654
31655 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31656 MODE. */
31657
31658 bool
31659 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31660 {
31661 /* The flags register can hold only CCmode values, and only it can hold them. */
31662 if (CC_REGNO_P (regno))
31663 return GET_MODE_CLASS (mode) == MODE_CC;
31664 if (GET_MODE_CLASS (mode) == MODE_CC
31665 || GET_MODE_CLASS (mode) == MODE_RANDOM
31666 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31667 return false;
31668 if (FP_REGNO_P (regno))
31669 return VALID_FP_MODE_P (mode);
31670 if (SSE_REGNO_P (regno))
31671 {
31672 /* We implement the move patterns for all vector modes into and
31673 out of SSE registers, even when no operation instructions
31674 are available. OImode move is available only when AVX is
31675 enabled. */
31676 return ((TARGET_AVX && mode == OImode)
31677 || VALID_AVX256_REG_MODE (mode)
31678 || VALID_SSE_REG_MODE (mode)
31679 || VALID_SSE2_REG_MODE (mode)
31680 || VALID_MMX_REG_MODE (mode)
31681 || VALID_MMX_REG_MODE_3DNOW (mode));
31682 }
31683 if (MMX_REGNO_P (regno))
31684 {
31685 /* We implement the move patterns for 3DNOW modes even in MMX mode,
31686 so if the register is available at all, then we can move data of
31687 the given mode into or out of it. */
31688 return (VALID_MMX_REG_MODE (mode)
31689 || VALID_MMX_REG_MODE_3DNOW (mode));
31690 }
31691
31692 if (mode == QImode)
31693 {
31694 /* Take care for QImode values - they can be in non-QI regs,
31695 but then they do cause partial register stalls. */
31696 if (regno <= BX_REG || TARGET_64BIT)
31697 return true;
31698 if (!TARGET_PARTIAL_REG_STALL)
31699 return true;
31700 return !can_create_pseudo_p ();
31701 }
31702 /* We handle both integers and floats in the general purpose registers. */
31703 else if (VALID_INT_MODE_P (mode))
31704 return true;
31705 else if (VALID_FP_MODE_P (mode))
31706 return true;
31707 else if (VALID_DFP_MODE_P (mode))
31708 return true;
31709 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
31710 on to use that value in smaller contexts, this can easily force a
31711 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
31712 supporting DImode, allow it. */
31713 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
31714 return true;
31715
31716 return false;
31717 }
31718
31719 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
31720 tieable integer mode. */
31721
31722 static bool
31723 ix86_tieable_integer_mode_p (enum machine_mode mode)
31724 {
31725 switch (mode)
31726 {
31727 case HImode:
31728 case SImode:
31729 return true;
31730
31731 case QImode:
31732 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
31733
31734 case DImode:
31735 return TARGET_64BIT;
31736
31737 default:
31738 return false;
31739 }
31740 }
31741
31742 /* Return true if MODE1 is accessible in a register that can hold MODE2
31743 without copying. That is, all register classes that can hold MODE2
31744 can also hold MODE1. */
31745
31746 bool
31747 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
31748 {
31749 if (mode1 == mode2)
31750 return true;
31751
31752 if (ix86_tieable_integer_mode_p (mode1)
31753 && ix86_tieable_integer_mode_p (mode2))
31754 return true;
31755
31756 /* MODE2 being XFmode implies fp stack or general regs, which means we
31757 can tie any smaller floating point modes to it. Note that we do not
31758 tie this with TFmode. */
31759 if (mode2 == XFmode)
31760 return mode1 == SFmode || mode1 == DFmode;
31761
31762 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
31763 that we can tie it with SFmode. */
31764 if (mode2 == DFmode)
31765 return mode1 == SFmode;
31766
31767 /* If MODE2 is only appropriate for an SSE register, then tie with
31768 any other mode acceptable to SSE registers. */
31769 if (GET_MODE_SIZE (mode2) == 16
31770 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31771 return (GET_MODE_SIZE (mode1) == 16
31772 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31773
31774 /* If MODE2 is appropriate for an MMX register, then tie
31775 with any other mode acceptable to MMX registers. */
31776 if (GET_MODE_SIZE (mode2) == 8
31777 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
31778 return (GET_MODE_SIZE (mode1) == 8
31779 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
31780
31781 return false;
31782 }
31783
31784 /* Compute a (partial) cost for rtx X. Return true if the complete
31785 cost has been computed, and false if subexpressions should be
31786 scanned. In either case, *TOTAL contains the cost result. */
31787
31788 static bool
31789 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
31790 bool speed)
31791 {
31792 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
31793 enum machine_mode mode = GET_MODE (x);
31794 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
31795
31796 switch (code)
31797 {
31798 case CONST_INT:
31799 case CONST:
31800 case LABEL_REF:
31801 case SYMBOL_REF:
31802 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
31803 *total = 3;
31804 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
31805 *total = 2;
31806 else if (flag_pic && SYMBOLIC_CONST (x)
31807 && (!TARGET_64BIT
31808 || (GET_CODE (x) != LABEL_REF
31809 && (GET_CODE (x) != SYMBOL_REF
31810 || !SYMBOL_REF_LOCAL_P (x)))))
31811 *total = 1;
31812 else
31813 *total = 0;
31814 return true;
31815
31816 case CONST_DOUBLE:
31817 if (mode == VOIDmode)
31818 *total = 0;
31819 else
31820 switch (standard_80387_constant_p (x))
31821 {
31822 case 1: /* 0.0 */
31823 *total = 1;
31824 break;
31825 default: /* Other constants */
31826 *total = 2;
31827 break;
31828 case 0:
31829 case -1:
31830 /* Start with (MEM (SYMBOL_REF)), since that's where
31831 it'll probably end up. Add a penalty for size. */
31832 *total = (COSTS_N_INSNS (1)
31833 + (flag_pic != 0 && !TARGET_64BIT)
31834 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
31835 break;
31836 }
31837 return true;
31838
31839 case ZERO_EXTEND:
31840 /* The zero extension is often completely free on x86_64, so make
31841 it as cheap as possible. */
31842 if (TARGET_64BIT && mode == DImode
31843 && GET_MODE (XEXP (x, 0)) == SImode)
31844 *total = 1;
31845 else if (TARGET_ZERO_EXTEND_WITH_AND)
31846 *total = cost->add;
31847 else
31848 *total = cost->movzx;
31849 return false;
31850
31851 case SIGN_EXTEND:
31852 *total = cost->movsx;
31853 return false;
31854
31855 case ASHIFT:
31856 if (CONST_INT_P (XEXP (x, 1))
31857 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
31858 {
31859 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31860 if (value == 1)
31861 {
31862 *total = cost->add;
31863 return false;
31864 }
31865 if ((value == 2 || value == 3)
31866 && cost->lea <= cost->shift_const)
31867 {
31868 *total = cost->lea;
31869 return false;
31870 }
31871 }
31872 /* FALLTHRU */
31873
31874 case ROTATE:
31875 case ASHIFTRT:
31876 case LSHIFTRT:
31877 case ROTATERT:
31878 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
31879 {
31880 if (CONST_INT_P (XEXP (x, 1)))
31881 {
31882 if (INTVAL (XEXP (x, 1)) > 32)
31883 *total = cost->shift_const + COSTS_N_INSNS (2);
31884 else
31885 *total = cost->shift_const * 2;
31886 }
31887 else
31888 {
31889 if (GET_CODE (XEXP (x, 1)) == AND)
31890 *total = cost->shift_var * 2;
31891 else
31892 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
31893 }
31894 }
31895 else
31896 {
31897 if (CONST_INT_P (XEXP (x, 1)))
31898 *total = cost->shift_const;
31899 else
31900 *total = cost->shift_var;
31901 }
31902 return false;
31903
31904 case FMA:
31905 {
31906 rtx sub;
31907
31908 gcc_assert (FLOAT_MODE_P (mode));
31909 gcc_assert (TARGET_FMA || TARGET_FMA4);
31910
31911 /* ??? SSE scalar/vector cost should be used here. */
31912 /* ??? Bald assumption that fma has the same cost as fmul. */
31913 *total = cost->fmul;
31914 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
31915
31916 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
31917 sub = XEXP (x, 0);
31918 if (GET_CODE (sub) == NEG)
31919 sub = XEXP (sub, 0);
31920 *total += rtx_cost (sub, FMA, 0, speed);
31921
31922 sub = XEXP (x, 2);
31923 if (GET_CODE (sub) == NEG)
31924 sub = XEXP (sub, 0);
31925 *total += rtx_cost (sub, FMA, 2, speed);
31926 return true;
31927 }
31928
31929 case MULT:
31930 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31931 {
31932 /* ??? SSE scalar cost should be used here. */
31933 *total = cost->fmul;
31934 return false;
31935 }
31936 else if (X87_FLOAT_MODE_P (mode))
31937 {
31938 *total = cost->fmul;
31939 return false;
31940 }
31941 else if (FLOAT_MODE_P (mode))
31942 {
31943 /* ??? SSE vector cost should be used here. */
31944 *total = cost->fmul;
31945 return false;
31946 }
31947 else
31948 {
31949 rtx op0 = XEXP (x, 0);
31950 rtx op1 = XEXP (x, 1);
31951 int nbits;
31952 if (CONST_INT_P (XEXP (x, 1)))
31953 {
31954 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31955 for (nbits = 0; value != 0; value &= value - 1)
31956 nbits++;
31957 }
31958 else
31959 /* This is arbitrary. */
31960 nbits = 7;
31961
31962 /* Compute costs correctly for widening multiplication. */
31963 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
31964 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
31965 == GET_MODE_SIZE (mode))
31966 {
31967 int is_mulwiden = 0;
31968 enum machine_mode inner_mode = GET_MODE (op0);
31969
31970 if (GET_CODE (op0) == GET_CODE (op1))
31971 is_mulwiden = 1, op1 = XEXP (op1, 0);
31972 else if (CONST_INT_P (op1))
31973 {
31974 if (GET_CODE (op0) == SIGN_EXTEND)
31975 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
31976 == INTVAL (op1);
31977 else
31978 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
31979 }
31980
31981 if (is_mulwiden)
31982 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
31983 }
31984
31985 *total = (cost->mult_init[MODE_INDEX (mode)]
31986 + nbits * cost->mult_bit
31987 + rtx_cost (op0, outer_code, opno, speed)
31988 + rtx_cost (op1, outer_code, opno, speed));
31989
31990 return true;
31991 }
31992
31993 case DIV:
31994 case UDIV:
31995 case MOD:
31996 case UMOD:
31997 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31998 /* ??? SSE cost should be used here. */
31999 *total = cost->fdiv;
32000 else if (X87_FLOAT_MODE_P (mode))
32001 *total = cost->fdiv;
32002 else if (FLOAT_MODE_P (mode))
32003 /* ??? SSE vector cost should be used here. */
32004 *total = cost->fdiv;
32005 else
32006 *total = cost->divide[MODE_INDEX (mode)];
32007 return false;
32008
32009 case PLUS:
32010 if (GET_MODE_CLASS (mode) == MODE_INT
32011 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
32012 {
32013 if (GET_CODE (XEXP (x, 0)) == PLUS
32014 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
32015 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
32016 && CONSTANT_P (XEXP (x, 1)))
32017 {
32018 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
32019 if (val == 2 || val == 4 || val == 8)
32020 {
32021 *total = cost->lea;
32022 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
32023 outer_code, opno, speed);
32024 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
32025 outer_code, opno, speed);
32026 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32027 return true;
32028 }
32029 }
32030 else if (GET_CODE (XEXP (x, 0)) == MULT
32031 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
32032 {
32033 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
32034 if (val == 2 || val == 4 || val == 8)
32035 {
32036 *total = cost->lea;
32037 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
32038 outer_code, opno, speed);
32039 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32040 return true;
32041 }
32042 }
32043 else if (GET_CODE (XEXP (x, 0)) == PLUS)
32044 {
32045 *total = cost->lea;
32046 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
32047 outer_code, opno, speed);
32048 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
32049 outer_code, opno, speed);
32050 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
32051 return true;
32052 }
32053 }
32054 /* FALLTHRU */
32055
32056 case MINUS:
32057 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32058 {
32059 /* ??? SSE cost should be used here. */
32060 *total = cost->fadd;
32061 return false;
32062 }
32063 else if (X87_FLOAT_MODE_P (mode))
32064 {
32065 *total = cost->fadd;
32066 return false;
32067 }
32068 else if (FLOAT_MODE_P (mode))
32069 {
32070 /* ??? SSE vector cost should be used here. */
32071 *total = cost->fadd;
32072 return false;
32073 }
32074 /* FALLTHRU */
32075
32076 case AND:
32077 case IOR:
32078 case XOR:
32079 if (!TARGET_64BIT && mode == DImode)
32080 {
32081 *total = (cost->add * 2
32082 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
32083 << (GET_MODE (XEXP (x, 0)) != DImode))
32084 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
32085 << (GET_MODE (XEXP (x, 1)) != DImode)));
32086 return true;
32087 }
32088 /* FALLTHRU */
32089
32090 case NEG:
32091 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32092 {
32093 /* ??? SSE cost should be used here. */
32094 *total = cost->fchs;
32095 return false;
32096 }
32097 else if (X87_FLOAT_MODE_P (mode))
32098 {
32099 *total = cost->fchs;
32100 return false;
32101 }
32102 else if (FLOAT_MODE_P (mode))
32103 {
32104 /* ??? SSE vector cost should be used here. */
32105 *total = cost->fchs;
32106 return false;
32107 }
32108 /* FALLTHRU */
32109
32110 case NOT:
32111 if (!TARGET_64BIT && mode == DImode)
32112 *total = cost->add * 2;
32113 else
32114 *total = cost->add;
32115 return false;
32116
32117 case COMPARE:
32118 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
32119 && XEXP (XEXP (x, 0), 1) == const1_rtx
32120 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
32121 && XEXP (x, 1) == const0_rtx)
32122 {
32123 /* This kind of construct is implemented using test[bwl].
32124 Treat it as if we had an AND. */
32125 *total = (cost->add
32126 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
32127 + rtx_cost (const1_rtx, outer_code, opno, speed));
32128 return true;
32129 }
32130 return false;
32131
32132 case FLOAT_EXTEND:
32133 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
32134 *total = 0;
32135 return false;
32136
32137 case ABS:
32138 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32139 /* ??? SSE cost should be used here. */
32140 *total = cost->fabs;
32141 else if (X87_FLOAT_MODE_P (mode))
32142 *total = cost->fabs;
32143 else if (FLOAT_MODE_P (mode))
32144 /* ??? SSE vector cost should be used here. */
32145 *total = cost->fabs;
32146 return false;
32147
32148 case SQRT:
32149 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
32150 /* ??? SSE cost should be used here. */
32151 *total = cost->fsqrt;
32152 else if (X87_FLOAT_MODE_P (mode))
32153 *total = cost->fsqrt;
32154 else if (FLOAT_MODE_P (mode))
32155 /* ??? SSE vector cost should be used here. */
32156 *total = cost->fsqrt;
32157 return false;
32158
32159 case UNSPEC:
32160 if (XINT (x, 1) == UNSPEC_TP)
32161 *total = 0;
32162 return false;
32163
32164 case VEC_SELECT:
32165 case VEC_CONCAT:
32166 case VEC_MERGE:
32167 case VEC_DUPLICATE:
32168 /* ??? Assume all of these vector manipulation patterns are
32169 recognizable, in which case they all have pretty much the
32170 same cost. */
32171 *total = COSTS_N_INSNS (1);
32172 return true;
32173
32174 default:
32175 return false;
32176 }
32177 }
32178
32179 #if TARGET_MACHO
32180
32181 static int current_machopic_label_num;
32182
32183 /* Given a symbol name and its associated stub, write out the
32184 definition of the stub. */
32185
32186 void
32187 machopic_output_stub (FILE *file, const char *symb, const char *stub)
32188 {
32189 unsigned int length;
32190 char *binder_name, *symbol_name, lazy_ptr_name[32];
32191 int label = ++current_machopic_label_num;
32192
32193 /* For 64-bit we shouldn't get here. */
32194 gcc_assert (!TARGET_64BIT);
32195
32196 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
32197 symb = targetm.strip_name_encoding (symb);
32198
32199 length = strlen (stub);
32200 binder_name = XALLOCAVEC (char, length + 32);
32201 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
32202
32203 length = strlen (symb);
32204 symbol_name = XALLOCAVEC (char, length + 32);
32205 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
32206
32207 sprintf (lazy_ptr_name, "L%d$lz", label);
32208
32209 if (MACHOPIC_ATT_STUB)
32210 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
32211 else if (MACHOPIC_PURE)
32212 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
32213 else
32214 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
32215
32216 fprintf (file, "%s:\n", stub);
32217 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32218
32219 if (MACHOPIC_ATT_STUB)
32220 {
32221 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
32222 }
32223 else if (MACHOPIC_PURE)
32224 {
32225 /* PIC stub. */
32226 /* 25-byte PIC stub using "CALL get_pc_thunk". */
32227 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
32228 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
32229 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
32230 label, lazy_ptr_name, label);
32231 fprintf (file, "\tjmp\t*%%ecx\n");
32232 }
32233 else
32234 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
32235
32236 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
32237 it needs no stub-binding-helper. */
32238 if (MACHOPIC_ATT_STUB)
32239 return;
32240
32241 fprintf (file, "%s:\n", binder_name);
32242
32243 if (MACHOPIC_PURE)
32244 {
32245 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
32246 fprintf (file, "\tpushl\t%%ecx\n");
32247 }
32248 else
32249 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
32250
32251 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
32252
32253 /* N.B. Keep the correspondence of these
32254 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
32255 old-pic/new-pic/non-pic stubs; altering this will break
32256 compatibility with existing dylibs. */
32257 if (MACHOPIC_PURE)
32258 {
32259 /* 25-byte PIC stub using "CALL get_pc_thunk". */
32260 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
32261 }
32262 else
32263 /* 16-byte -mdynamic-no-pic stub. */
32264 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
32265
32266 fprintf (file, "%s:\n", lazy_ptr_name);
32267 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
32268 fprintf (file, ASM_LONG "%s\n", binder_name);
32269 }
32270 #endif /* TARGET_MACHO */
32271
32272 /* Order the registers for register allocator. */
32273
32274 void
32275 x86_order_regs_for_local_alloc (void)
32276 {
32277 int pos = 0;
32278 int i;
32279
32280 /* First allocate the local general purpose registers. */
32281 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32282 if (GENERAL_REGNO_P (i) && call_used_regs[i])
32283 reg_alloc_order [pos++] = i;
32284
32285 /* Global general purpose registers. */
32286 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
32287 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
32288 reg_alloc_order [pos++] = i;
32289
32290 /* x87 registers come first in case we are doing FP math
32291 using them. */
32292 if (!TARGET_SSE_MATH)
32293 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32294 reg_alloc_order [pos++] = i;
32295
32296 /* SSE registers. */
32297 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
32298 reg_alloc_order [pos++] = i;
32299 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
32300 reg_alloc_order [pos++] = i;
32301
32302 /* x87 registers. */
32303 if (TARGET_SSE_MATH)
32304 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
32305 reg_alloc_order [pos++] = i;
32306
32307 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
32308 reg_alloc_order [pos++] = i;
32309
32310 /* Initialize the rest of the array, as we do not allocate some registers
32311 at all. */
32312 while (pos < FIRST_PSEUDO_REGISTER)
32313 reg_alloc_order [pos++] = 0;
32314 }
32315
32316 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
32317 in struct attribute_spec handler. */
32318 static tree
32319 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
32320 tree args,
32321 int flags ATTRIBUTE_UNUSED,
32322 bool *no_add_attrs)
32323 {
32324 if (TREE_CODE (*node) != FUNCTION_TYPE
32325 && TREE_CODE (*node) != METHOD_TYPE
32326 && TREE_CODE (*node) != FIELD_DECL
32327 && TREE_CODE (*node) != TYPE_DECL)
32328 {
32329 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32330 name);
32331 *no_add_attrs = true;
32332 return NULL_TREE;
32333 }
32334 if (TARGET_64BIT)
32335 {
32336 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
32337 name);
32338 *no_add_attrs = true;
32339 return NULL_TREE;
32340 }
32341 if (is_attribute_p ("callee_pop_aggregate_return", name))
32342 {
32343 tree cst;
32344
32345 cst = TREE_VALUE (args);
32346 if (TREE_CODE (cst) != INTEGER_CST)
32347 {
32348 warning (OPT_Wattributes,
32349 "%qE attribute requires an integer constant argument",
32350 name);
32351 *no_add_attrs = true;
32352 }
32353 else if (compare_tree_int (cst, 0) != 0
32354 && compare_tree_int (cst, 1) != 0)
32355 {
32356 warning (OPT_Wattributes,
32357 "argument to %qE attribute is neither zero, nor one",
32358 name);
32359 *no_add_attrs = true;
32360 }
32361
32362 return NULL_TREE;
32363 }
32364
32365 return NULL_TREE;
32366 }
32367
32368 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
32369 struct attribute_spec.handler. */
32370 static tree
32371 ix86_handle_abi_attribute (tree *node, tree name,
32372 tree args ATTRIBUTE_UNUSED,
32373 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32374 {
32375 if (TREE_CODE (*node) != FUNCTION_TYPE
32376 && TREE_CODE (*node) != METHOD_TYPE
32377 && TREE_CODE (*node) != FIELD_DECL
32378 && TREE_CODE (*node) != TYPE_DECL)
32379 {
32380 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32381 name);
32382 *no_add_attrs = true;
32383 return NULL_TREE;
32384 }
32385
32386 /* The ms_abi and sysv_abi attributes are mutually exclusive. */
32387 if (is_attribute_p ("ms_abi", name))
32388 {
32389 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
32390 {
32391 error ("ms_abi and sysv_abi attributes are not compatible");
32392 }
32393
32394 return NULL_TREE;
32395 }
32396 else if (is_attribute_p ("sysv_abi", name))
32397 {
32398 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
32399 {
32400 error ("ms_abi and sysv_abi attributes are not compatible");
32401 }
32402
32403 return NULL_TREE;
32404 }
32405
32406 return NULL_TREE;
32407 }
32408
32409 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
32410 struct attribute_spec.handler. */
32411 static tree
32412 ix86_handle_struct_attribute (tree *node, tree name,
32413 tree args ATTRIBUTE_UNUSED,
32414 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32415 {
32416 tree *type = NULL;
32417 if (DECL_P (*node))
32418 {
32419 if (TREE_CODE (*node) == TYPE_DECL)
32420 type = &TREE_TYPE (*node);
32421 }
32422 else
32423 type = node;
32424
32425 if (!(type && (TREE_CODE (*type) == RECORD_TYPE
32426 || TREE_CODE (*type) == UNION_TYPE)))
32427 {
32428 warning (OPT_Wattributes, "%qE attribute ignored",
32429 name);
32430 *no_add_attrs = true;
32431 }
32432
32433 else if ((is_attribute_p ("ms_struct", name)
32434 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
32435 || ((is_attribute_p ("gcc_struct", name)
32436 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
32437 {
32438 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
32439 name);
32440 *no_add_attrs = true;
32441 }
32442
32443 return NULL_TREE;
32444 }
32445
32446 static tree
32447 ix86_handle_fndecl_attribute (tree *node, tree name,
32448 tree args ATTRIBUTE_UNUSED,
32449 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
32450 {
32451 if (TREE_CODE (*node) != FUNCTION_DECL)
32452 {
32453 warning (OPT_Wattributes, "%qE attribute only applies to functions",
32454 name);
32455 *no_add_attrs = true;
32456 }
32457 return NULL_TREE;
32458 }
32459
32460 static bool
32461 ix86_ms_bitfield_layout_p (const_tree record_type)
32462 {
32463 return ((TARGET_MS_BITFIELD_LAYOUT
32464 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
32465 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
32466 }
32467
32468 /* Returns an expression indicating where the this parameter is
32469 located on entry to the FUNCTION. */
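/* Illustrative examples: on 64-bit SysV the this pointer arrives in %rdi
   (%rsi when a hidden aggregate-return pointer takes the first register);
   for a 32-bit fastcall (non-stdarg) method it typically arrives in %ecx,
   or in %edx when the aggregate-return pointer occupies %ecx.  */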
32470
32471 static rtx
32472 x86_this_parameter (tree function)
32473 {
32474 tree type = TREE_TYPE (function);
32475 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
32476 int nregs;
32477
32478 if (TARGET_64BIT)
32479 {
32480 const int *parm_regs;
32481
32482 if (ix86_function_type_abi (type) == MS_ABI)
32483 parm_regs = x86_64_ms_abi_int_parameter_registers;
32484 else
32485 parm_regs = x86_64_int_parameter_registers;
32486 return gen_rtx_REG (DImode, parm_regs[aggr]);
32487 }
32488
32489 nregs = ix86_function_regparm (type, function);
32490
32491 if (nregs > 0 && !stdarg_p (type))
32492 {
32493 int regno;
32494 unsigned int ccvt = ix86_get_callcvt (type);
32495
32496 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
32497 regno = aggr ? DX_REG : CX_REG;
32498 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
32499 {
32500 regno = CX_REG;
32501 if (aggr)
32502 return gen_rtx_MEM (SImode,
32503 plus_constant (stack_pointer_rtx, 4));
32504 }
32505 else
32506 {
32507 regno = AX_REG;
32508 if (aggr)
32509 {
32510 regno = DX_REG;
32511 if (nregs == 1)
32512 return gen_rtx_MEM (SImode,
32513 plus_constant (stack_pointer_rtx, 4));
32514 }
32515 }
32516 return gen_rtx_REG (SImode, regno);
32517 }
32518
32519 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
32520 }
32521
32522 /* Determine whether x86_output_mi_thunk can succeed. */
32523
32524 static bool
32525 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
32526 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
32527 HOST_WIDE_INT vcall_offset, const_tree function)
32528 {
32529 /* 64-bit can handle anything. */
32530 if (TARGET_64BIT)
32531 return true;
32532
32533 /* For 32-bit, everything's fine if we have one free register. */
32534 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
32535 return true;
32536
32537 /* Need a free register for vcall_offset. */
32538 if (vcall_offset)
32539 return false;
32540
32541 /* Need a free register for GOT references. */
32542 if (flag_pic && !targetm.binds_local_p (function))
32543 return false;
32544
32545 /* Otherwise ok. */
32546 return true;
32547 }
32548
32549 /* Output the assembler code for a thunk function. THUNK_DECL is the
32550 declaration for the thunk function itself, FUNCTION is the decl for
32551 the target function. DELTA is an immediate constant offset to be
32552 added to THIS. If VCALL_OFFSET is nonzero, the word at
32553 *(*this + vcall_offset) should be added to THIS. */
32554
32555 static void
32556 x86_output_mi_thunk (FILE *file,
32557 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
32558 HOST_WIDE_INT vcall_offset, tree function)
32559 {
32560 rtx this_param = x86_this_parameter (function);
32561 rtx this_reg, tmp, fnaddr;
32562
32563 emit_note (NOTE_INSN_PROLOGUE_END);
32564
32565 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
32566 pull it in now and let DELTA benefit. */
32567 if (REG_P (this_param))
32568 this_reg = this_param;
32569 else if (vcall_offset)
32570 {
32571 /* Put the this parameter into %eax. */
32572 this_reg = gen_rtx_REG (Pmode, AX_REG);
32573 emit_move_insn (this_reg, this_param);
32574 }
32575 else
32576 this_reg = NULL_RTX;
32577
32578 /* Adjust the this parameter by a fixed constant. */
32579 if (delta)
32580 {
32581 rtx delta_rtx = GEN_INT (delta);
32582 rtx delta_dst = this_reg ? this_reg : this_param;
32583
32584 if (TARGET_64BIT)
32585 {
32586 if (!x86_64_general_operand (delta_rtx, Pmode))
32587 {
32588 tmp = gen_rtx_REG (Pmode, R10_REG);
32589 emit_move_insn (tmp, delta_rtx);
32590 delta_rtx = tmp;
32591 }
32592 }
32593
32594 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
32595 }
32596
32597 /* Adjust the this parameter by a value stored in the vtable. */
32598 if (vcall_offset)
32599 {
32600 rtx vcall_addr, vcall_mem, this_mem;
32601 unsigned int tmp_regno;
32602
32603 if (TARGET_64BIT)
32604 tmp_regno = R10_REG;
32605 else
32606 {
32607 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
32608 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
32609 tmp_regno = AX_REG;
32610 else
32611 tmp_regno = CX_REG;
32612 }
32613 tmp = gen_rtx_REG (Pmode, tmp_regno);
32614
32615 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
32616 if (Pmode != ptr_mode)
32617 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
32618 emit_move_insn (tmp, this_mem);
32619
32620 /* Adjust the this parameter. */
32621 vcall_addr = plus_constant (tmp, vcall_offset);
32622 if (TARGET_64BIT
32623 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
32624 {
32625 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
32626 emit_move_insn (tmp2, GEN_INT (vcall_offset));
32627 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
32628 }
32629
32630 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
32631 if (Pmode != ptr_mode)
32632 emit_insn (gen_addsi_1_zext (this_reg,
32633 gen_rtx_REG (ptr_mode,
32634 REGNO (this_reg)),
32635 vcall_mem));
32636 else
32637 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
32638 }
32639
32640 /* If necessary, drop THIS back to its stack slot. */
32641 if (this_reg && this_reg != this_param)
32642 emit_move_insn (this_param, this_reg);
32643
32644 fnaddr = XEXP (DECL_RTL (function), 0);
32645 if (TARGET_64BIT)
32646 {
32647 if (!flag_pic || targetm.binds_local_p (function)
32648 || cfun->machine->call_abi == MS_ABI)
32649 ;
32650 else
32651 {
32652 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
32653 tmp = gen_rtx_CONST (Pmode, tmp);
32654 fnaddr = gen_rtx_MEM (Pmode, tmp);
32655 }
32656 }
32657 else
32658 {
32659 if (!flag_pic || targetm.binds_local_p (function))
32660 ;
32661 #if TARGET_MACHO
32662 else if (TARGET_MACHO)
32663 {
32664 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
32665 fnaddr = XEXP (fnaddr, 0);
32666 }
32667 #endif /* TARGET_MACHO */
32668 else
32669 {
32670 tmp = gen_rtx_REG (Pmode, CX_REG);
32671 output_set_got (tmp, NULL_RTX);
32672
32673 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
32674 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
32675 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
32676 }
32677 }
32678
32679 /* Our sibling call patterns do not allow memories, because we have no
32680 predicate that can distinguish between frame and non-frame memory.
32681 For our purposes here, we can get away with (ab)using a jump pattern,
32682 because we're going to do no optimization. */
32683 if (MEM_P (fnaddr))
32684 emit_jump_insn (gen_indirect_jump (fnaddr));
32685 else
32686 {
32687 tmp = gen_rtx_MEM (QImode, fnaddr);
32688 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
32689 tmp = emit_call_insn (tmp);
32690 SIBLING_CALL_P (tmp) = 1;
32691 }
32692 emit_barrier ();
32693
32694 /* Emit just enough of rest_of_compilation to get the insns emitted.
32695 Note that use_thunk calls assemble_start_function et al. */
32696 tmp = get_insns ();
32697 insn_locators_alloc ();
32698 shorten_branches (tmp);
32699 final_start_function (tmp, file, 1);
32700 final (tmp, file, 1);
32701 final_end_function ();
32702 }
32703
32704 static void
32705 x86_file_start (void)
32706 {
32707 default_file_start ();
32708 #if TARGET_MACHO
32709 darwin_file_start ();
32710 #endif
32711 if (X86_FILE_START_VERSION_DIRECTIVE)
32712 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
32713 if (X86_FILE_START_FLTUSED)
32714 fputs ("\t.global\t__fltused\n", asm_out_file);
32715 if (ix86_asm_dialect == ASM_INTEL)
32716 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
32717 }
32718
32719 int
32720 x86_field_alignment (tree field, int computed)
32721 {
32722 enum machine_mode mode;
32723 tree type = TREE_TYPE (field);
32724
32725 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
32726 return computed;
32727 mode = TYPE_MODE (strip_array_types (type));
32728 if (mode == DFmode || mode == DCmode
32729 || GET_MODE_CLASS (mode) == MODE_INT
32730 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
32731 return MIN (32, computed);
32732 return computed;
32733 }
32734
32735 /* Output assembler code to FILE to increment profiler label # LABELNO
32736 for profiling a function entry. */
32737 void
32738 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
32739 {
32740 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
32741 : MCOUNT_NAME);
32742
32743 if (TARGET_64BIT)
32744 {
32745 #ifndef NO_PROFILE_COUNTERS
32746 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
32747 #endif
32748
32749 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
32750 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
32751 else
32752 fprintf (file, "\tcall\t%s\n", mcount_name);
32753 }
32754 else if (flag_pic)
32755 {
32756 #ifndef NO_PROFILE_COUNTERS
32757 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
32758 LPREFIX, labelno);
32759 #endif
32760 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
32761 }
32762 else
32763 {
32764 #ifndef NO_PROFILE_COUNTERS
32765 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
32766 LPREFIX, labelno);
32767 #endif
32768 fprintf (file, "\tcall\t%s\n", mcount_name);
32769 }
32770 }
32771
32772 /* We don't have exact information about the insn sizes, but we may assume
32773 quite safely that we are informed about all 1 byte insns and memory
32774 address sizes. This is enough to eliminate unnecessary padding in
32775 99% of cases. */
32776
32777 static int
32778 min_insn_size (rtx insn)
32779 {
32780 int l = 0, len;
32781
32782 if (!INSN_P (insn) || !active_insn_p (insn))
32783 return 0;
32784
32785 /* Discard the alignments we have emitted, and jump table data. */
32786 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
32787 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
32788 return 0;
32789 if (JUMP_TABLE_DATA_P (insn))
32790 return 0;
32791
32792 /* Important case - calls are always 5 bytes.
32793 It is common to have many calls in a row. */
32794 if (CALL_P (insn)
32795 && symbolic_reference_mentioned_p (PATTERN (insn))
32796 && !SIBLING_CALL_P (insn))
32797 return 5;
32798 len = get_attr_length (insn);
32799 if (len <= 1)
32800 return 1;
32801
32802 /* For normal instructions we rely on get_attr_length being exact,
32803 with a few exceptions. */
32804 if (!JUMP_P (insn))
32805 {
32806 enum attr_type type = get_attr_type (insn);
32807
32808 switch (type)
32809 {
32810 case TYPE_MULTI:
32811 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
32812 || asm_noperands (PATTERN (insn)) >= 0)
32813 return 0;
32814 break;
32815 case TYPE_OTHER:
32816 case TYPE_FCMP:
32817 break;
32818 default:
32819 /* Otherwise trust get_attr_length. */
32820 return len;
32821 }
32822
32823 l = get_attr_length_address (insn);
32824 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
32825 l = 4;
32826 }
32827 if (l)
32828 return 1+l;
32829 else
32830 return 2;
32831 }
32832
32833 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32834
32835 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
32836 16 byte window. */
32837
32838 static void
32839 ix86_avoid_jump_mispredicts (void)
32840 {
32841 rtx insn, start = get_insns ();
32842 int nbytes = 0, njumps = 0;
32843 int isjump = 0;
32844
32845 /* Look for all minimal intervals of instructions containing 4 jumps.
32846 The intervals are bounded by START and INSN. NBYTES is the total
32847 size of instructions in the interval including INSN and not including
32848 START. When NBYTES is smaller than 16 bytes, it is possible
32849 that the ends of START and INSN fall into the same 16 byte page.
32850
32851 The smallest offset in the page at which INSN can start is the case
32852 where START ends at offset 0. The offset of INSN is then
32853 NBYTES - sizeof (INSN). We add a p2align to the 16 byte window with
32854 max skip of 15 - NBYTES + sizeof (INSN). */
32855 for (insn = start; insn; insn = NEXT_INSN (insn))
32856 {
32857 int min_size;
32858
32859 if (LABEL_P (insn))
32860 {
32861 int align = label_to_alignment (insn);
32862 int max_skip = label_to_max_skip (insn);
32863
32864 if (max_skip > 15)
32865 max_skip = 15;
32866 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
32867 already in the current 16 byte page, because otherwise
32868 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
32869 bytes to reach 16 byte boundary. */
32870 if (align <= 0
32871 || (align <= 3 && max_skip != (1 << align) - 1))
32872 max_skip = 0;
32873 if (dump_file)
32874 fprintf (dump_file, "Label %i with max_skip %i\n",
32875 INSN_UID (insn), max_skip);
32876 if (max_skip)
32877 {
32878 while (nbytes + max_skip >= 16)
32879 {
32880 start = NEXT_INSN (start);
32881 if ((JUMP_P (start)
32882 && GET_CODE (PATTERN (start)) != ADDR_VEC
32883 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32884 || CALL_P (start))
32885 njumps--, isjump = 1;
32886 else
32887 isjump = 0;
32888 nbytes -= min_insn_size (start);
32889 }
32890 }
32891 continue;
32892 }
32893
32894 min_size = min_insn_size (insn);
32895 nbytes += min_size;
32896 if (dump_file)
32897 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
32898 INSN_UID (insn), min_size);
32899 if ((JUMP_P (insn)
32900 && GET_CODE (PATTERN (insn)) != ADDR_VEC
32901 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
32902 || CALL_P (insn))
32903 njumps++;
32904 else
32905 continue;
32906
32907 while (njumps > 3)
32908 {
32909 start = NEXT_INSN (start);
32910 if ((JUMP_P (start)
32911 && GET_CODE (PATTERN (start)) != ADDR_VEC
32912 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32913 || CALL_P (start))
32914 njumps--, isjump = 1;
32915 else
32916 isjump = 0;
32917 nbytes -= min_insn_size (start);
32918 }
32919 gcc_assert (njumps >= 0);
32920 if (dump_file)
32921 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
32922 INSN_UID (start), INSN_UID (insn), nbytes);
32923
32924 if (njumps == 3 && isjump && nbytes < 16)
32925 {
32926 int padsize = 15 - nbytes + min_insn_size (insn);
32927
32928 if (dump_file)
32929 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
32930 INSN_UID (insn), padsize);
32931 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
32932 }
32933 }
32934 }
32935 #endif
32936
32937 /* AMD Athlon works faster
32938 when RET is not the destination of a conditional jump or directly preceded
32939 by another jump instruction. We avoid the penalty by inserting a NOP just
32940 before the RET instruction in such cases. */
32941 static void
32942 ix86_pad_returns (void)
32943 {
32944 edge e;
32945 edge_iterator ei;
32946
32947 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32948 {
32949 basic_block bb = e->src;
32950 rtx ret = BB_END (bb);
32951 rtx prev;
32952 bool replace = false;
32953
32954 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
32955 || optimize_bb_for_size_p (bb))
32956 continue;
32957 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
32958 if (active_insn_p (prev) || LABEL_P (prev))
32959 break;
32960 if (prev && LABEL_P (prev))
32961 {
32962 edge e;
32963 edge_iterator ei;
32964
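	  /* The RET is directly preceded by a label; if that label is reached
	     by an executed edge other than a fallthru edge, the RET is a
	     branch target and needs to be replaced by the padded return.  */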
32965 FOR_EACH_EDGE (e, ei, bb->preds)
32966 if (EDGE_FREQUENCY (e) && e->src->index >= 0
32967 && !(e->flags & EDGE_FALLTHRU))
32968 replace = true;
32969 }
32970 if (!replace)
32971 {
32972 prev = prev_active_insn (ret);
32973 if (prev
32974 && ((JUMP_P (prev) && any_condjump_p (prev))
32975 || CALL_P (prev)))
32976 replace = true;
32977 /* Empty functions get a branch mispredict even when
32978 the jump destination is not visible to us. */
32979 if (!prev && !optimize_function_for_size_p (cfun))
32980 replace = true;
32981 }
32982 if (replace)
32983 {
32984 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
32985 delete_insn (ret);
32986 }
32987 }
32988 }
32989
32990 /* Count the minimum number of instructions in BB. Return 4 if the
32991 number of instructions >= 4. */
32992
32993 static int
32994 ix86_count_insn_bb (basic_block bb)
32995 {
32996 rtx insn;
32997 int insn_count = 0;
32998
32999 /* Count number of instructions in this block. Return 4 if the number
33000 of instructions >= 4. */
33001 FOR_BB_INSNS (bb, insn)
33002 {
33003 /* This can only happen in exit blocks. */
33004 if (JUMP_P (insn)
33005 && ANY_RETURN_P (PATTERN (insn)))
33006 break;
33007
33008 if (NONDEBUG_INSN_P (insn)
33009 && GET_CODE (PATTERN (insn)) != USE
33010 && GET_CODE (PATTERN (insn)) != CLOBBER)
33011 {
33012 insn_count++;
33013 if (insn_count >= 4)
33014 return insn_count;
33015 }
33016 }
33017
33018 return insn_count;
33019 }
33020
33021
33022 /* Count the minimum number of instructions in a code path through BB.
33023 Return 4 if the number of instructions >= 4. */
33024
33025 static int
33026 ix86_count_insn (basic_block bb)
33027 {
33028 edge e;
33029 edge_iterator ei;
33030 int min_prev_count;
33031
33032 /* Only bother counting instructions along paths with no
33033 more than 2 basic blocks between entry and exit. Given
33034 that BB has an edge to exit, determine if a predecessor
33035 of BB has an edge from entry. If so, compute the number
33036 of instructions in the predecessor block. If there
33037 happen to be multiple such blocks, compute the minimum. */
33038 min_prev_count = 4;
33039 FOR_EACH_EDGE (e, ei, bb->preds)
33040 {
33041 edge prev_e;
33042 edge_iterator prev_ei;
33043
33044 if (e->src == ENTRY_BLOCK_PTR)
33045 {
33046 min_prev_count = 0;
33047 break;
33048 }
33049 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
33050 {
33051 if (prev_e->src == ENTRY_BLOCK_PTR)
33052 {
33053 int count = ix86_count_insn_bb (e->src);
33054 if (count < min_prev_count)
33055 min_prev_count = count;
33056 break;
33057 }
33058 }
33059 }
33060
33061 if (min_prev_count < 4)
33062 min_prev_count += ix86_count_insn_bb (bb);
33063
33064 return min_prev_count;
33065 }
33066
33067 /* Pad short functions to 4 instructions. */
33068
33069 static void
33070 ix86_pad_short_function (void)
33071 {
33072 edge e;
33073 edge_iterator ei;
33074
33075 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
33076 {
33077 rtx ret = BB_END (e->src);
33078 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
33079 {
33080 int insn_count = ix86_count_insn (e->src);
33081
33082 /* Pad short function. */
33083 if (insn_count < 4)
33084 {
33085 rtx insn = ret;
33086
33087 /* Find epilogue. */
33088 while (insn
33089 && (!NOTE_P (insn)
33090 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
33091 insn = PREV_INSN (insn);
33092
33093 if (!insn)
33094 insn = ret;
33095
33096 /* Two NOPs count as one instruction. */
33097 insn_count = 2 * (4 - insn_count);
33098 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
33099 }
33100 }
33101 }
33102 }
33103
33104 /* Implement machine specific optimizations. We implement padding of returns
33105 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
33106 static void
33107 ix86_reorg (void)
33108 {
33109 /* We are freeing block_for_insn in the toplev to keep compatibility
33110 with old MDEP_REORGS that are not CFG based. Recompute it now. */
33111 compute_bb_for_insn ();
33112
33113 /* Run the vzeroupper optimization if needed. */
33114 if (TARGET_VZEROUPPER)
33115 move_or_delete_vzeroupper ();
33116
33117 if (optimize && optimize_function_for_speed_p (cfun))
33118 {
33119 if (TARGET_PAD_SHORT_FUNCTION)
33120 ix86_pad_short_function ();
33121 else if (TARGET_PAD_RETURNS)
33122 ix86_pad_returns ();
33123 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
33124 if (TARGET_FOUR_JUMP_LIMIT)
33125 ix86_avoid_jump_mispredicts ();
33126 #endif
33127 }
33128 }
33129
33130 /* Return nonzero when a QImode register that must be represented via a REX
33131 prefix is used. */
33132 bool
33133 x86_extended_QIreg_mentioned_p (rtx insn)
33134 {
33135 int i;
33136 extract_insn_cached (insn);
33137 for (i = 0; i < recog_data.n_operands; i++)
33138 if (REG_P (recog_data.operand[i])
33139 && REGNO (recog_data.operand[i]) > BX_REG)
33140 return true;
33141 return false;
33142 }
33143
33144 /* Return nonzero when P points to a register encoded via a REX prefix.
33145 Called via for_each_rtx. */
33146 static int
33147 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
33148 {
33149 unsigned int regno;
33150 if (!REG_P (*p))
33151 return 0;
33152 regno = REGNO (*p);
33153 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
33154 }
33155
33156 /* Return true when INSN mentions a register that must be encoded using a REX
33157 prefix. */
33158 bool
33159 x86_extended_reg_mentioned_p (rtx insn)
33160 {
33161 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
33162 extended_reg_mentioned_1, NULL);
33163 }
33164
33165 /* If profitable, negate (without causing overflow) integer constant
33166 of mode MODE at location LOC. Return true in this case. */
33167 bool
33168 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
33169 {
33170 HOST_WIDE_INT val;
33171
33172 if (!CONST_INT_P (*loc))
33173 return false;
33174
33175 switch (mode)
33176 {
33177 case DImode:
33178 /* DImode x86_64 constants must fit in 32 bits. */
33179 gcc_assert (x86_64_immediate_operand (*loc, mode));
33180
33181 mode = SImode;
33182 break;
33183
33184 case SImode:
33185 case HImode:
33186 case QImode:
33187 break;
33188
33189 default:
33190 gcc_unreachable ();
33191 }
33192
33193 /* Avoid overflows. */
33194 if (mode_signbit_p (mode, *loc))
33195 return false;
33196
33197 val = INTVAL (*loc);
33198
33199 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
33200 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
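  /* E.g. `addl $128,%eax' needs a 32-bit immediate, while the equivalent
     `subl $-128,%eax' fits in a sign-extended 8-bit immediate.  */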
33201 if ((val < 0 && val != -128)
33202 || val == 128)
33203 {
33204 *loc = GEN_INT (-val);
33205 return true;
33206 }
33207
33208 return false;
33209 }
33210
33211 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
33212 optabs would emit if we didn't have TFmode patterns. */
33213
33214 void
33215 x86_emit_floatuns (rtx operands[2])
33216 {
33217 rtx neglab, donelab, i0, i1, f0, in, out;
33218 enum machine_mode mode, inmode;
33219
33220 inmode = GET_MODE (operands[1]);
33221 gcc_assert (inmode == SImode || inmode == DImode);
33222
33223 out = operands[0];
33224 in = force_reg (inmode, operands[1]);
33225 mode = GET_MODE (out);
33226 neglab = gen_label_rtx ();
33227 donelab = gen_label_rtx ();
33228 f0 = gen_reg_rtx (mode);
33229
33230 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
33231
33232 expand_float (out, in, 0);
33233
33234 emit_jump_insn (gen_jump (donelab));
33235 emit_barrier ();
33236
33237 emit_label (neglab);
33238
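  /* Here IN has its sign bit set, so it cannot be converted directly as a
     signed value.  Compute i0 = (in >> 1) | (in & 1), which halves the value
     while preserving the low bit for correct rounding, convert that to FP,
     and double the result below (f0 + f0).  */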
33239 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
33240 1, OPTAB_DIRECT);
33241 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
33242 1, OPTAB_DIRECT);
33243 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
33244
33245 expand_float (f0, i0, 0);
33246
33247 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
33248
33249 emit_label (donelab);
33250 }
33251 \f
33252 /* Unlike AVX, AVX2 supports 32-byte integer vector operations,
33253 thus the longest vector we are faced with is V32QImode. */
33254 #define MAX_VECT_LEN 32
33255
33256 struct expand_vec_perm_d
33257 {
33258 rtx target, op0, op1;
33259 unsigned char perm[MAX_VECT_LEN];
33260 enum machine_mode vmode;
33261 unsigned char nelt;
33262 bool testing_p;
33263 };
33264
33265 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
33266 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
33267
33268 /* Get a vector mode of the same size as the original but with elements
33269 twice as wide. This is only guaranteed to apply to integral vectors. */
33270
33271 static inline enum machine_mode
33272 get_mode_wider_vector (enum machine_mode o)
33273 {
33274 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
33275 enum machine_mode n = GET_MODE_WIDER_MODE (o);
33276 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
33277 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
33278 return n;
33279 }
33280
33281 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33282 with all elements equal to VAR. Return true if successful. */
33283
33284 static bool
33285 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
33286 rtx target, rtx val)
33287 {
33288 bool ok;
33289
33290 switch (mode)
33291 {
33292 case V2SImode:
33293 case V2SFmode:
33294 if (!mmx_ok)
33295 return false;
33296 /* FALLTHRU */
33297
33298 case V4DFmode:
33299 case V4DImode:
33300 case V8SFmode:
33301 case V8SImode:
33302 case V2DFmode:
33303 case V2DImode:
33304 case V4SFmode:
33305 case V4SImode:
33306 {
33307 rtx insn, dup;
33308
33309 /* First attempt to recognize VAL as-is. */
33310 dup = gen_rtx_VEC_DUPLICATE (mode, val);
33311 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
33312 if (recog_memoized (insn) < 0)
33313 {
33314 rtx seq;
33315 /* If that fails, force VAL into a register. */
33316
33317 start_sequence ();
33318 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
33319 seq = get_insns ();
33320 end_sequence ();
33321 if (seq)
33322 emit_insn_before (seq, insn);
33323
33324 ok = recog_memoized (insn) >= 0;
33325 gcc_assert (ok);
33326 }
33327 }
33328 return true;
33329
33330 case V4HImode:
33331 if (!mmx_ok)
33332 return false;
33333 if (TARGET_SSE || TARGET_3DNOW_A)
33334 {
33335 rtx x;
33336
33337 val = gen_lowpart (SImode, val);
33338 x = gen_rtx_TRUNCATE (HImode, val);
33339 x = gen_rtx_VEC_DUPLICATE (mode, x);
33340 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33341 return true;
33342 }
33343 goto widen;
33344
33345 case V8QImode:
33346 if (!mmx_ok)
33347 return false;
33348 goto widen;
33349
33350 case V8HImode:
33351 if (TARGET_SSE2)
33352 {
33353 struct expand_vec_perm_d dperm;
33354 rtx tmp1, tmp2;
33355
33356 permute:
33357 memset (&dperm, 0, sizeof (dperm));
33358 dperm.target = target;
33359 dperm.vmode = mode;
33360 dperm.nelt = GET_MODE_NUNITS (mode);
33361 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
33362
33363 /* Extend to SImode using a paradoxical SUBREG. */
33364 tmp1 = gen_reg_rtx (SImode);
33365 emit_move_insn (tmp1, gen_lowpart (SImode, val));
33366
33367 /* Insert the SImode value as low element of a V4SImode vector. */
33368 tmp2 = gen_lowpart (V4SImode, dperm.op0);
33369 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
33370
33371 ok = (expand_vec_perm_1 (&dperm)
33372 || expand_vec_perm_broadcast_1 (&dperm));
33373 gcc_assert (ok);
33374 return ok;
33375 }
33376 goto widen;
33377
33378 case V16QImode:
33379 if (TARGET_SSE2)
33380 goto permute;
33381 goto widen;
33382
33383 widen:
33384 /* Replicate the value once into the next wider mode and recurse. */
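      /* E.g. a V8QImode splat of X becomes the HImode value (X << 8) | X
	 broadcast as a V4HImode vector.  */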
33385 {
33386 enum machine_mode smode, wsmode, wvmode;
33387 rtx x;
33388
33389 smode = GET_MODE_INNER (mode);
33390 wvmode = get_mode_wider_vector (mode);
33391 wsmode = GET_MODE_INNER (wvmode);
33392
33393 val = convert_modes (wsmode, smode, val, true);
33394 x = expand_simple_binop (wsmode, ASHIFT, val,
33395 GEN_INT (GET_MODE_BITSIZE (smode)),
33396 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33397 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
33398
33399 x = gen_lowpart (wvmode, target);
33400 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
33401 gcc_assert (ok);
33402 return ok;
33403 }
33404
33405 case V16HImode:
33406 case V32QImode:
33407 {
33408 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
33409 rtx x = gen_reg_rtx (hvmode);
33410
33411 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
33412 gcc_assert (ok);
33413
33414 x = gen_rtx_VEC_CONCAT (mode, x, x);
33415 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33416 }
33417 return true;
33418
33419 default:
33420 return false;
33421 }
33422 }
33423
33424 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33425 whose ONE_VAR element is VAR, and other elements are zero. Return true
33426 if successful. */
33427
33428 static bool
33429 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
33430 rtx target, rtx var, int one_var)
33431 {
33432 enum machine_mode vsimode;
33433 rtx new_target;
33434 rtx x, tmp;
33435 bool use_vector_set = false;
33436
33437 switch (mode)
33438 {
33439 case V2DImode:
33440 /* For SSE4.1, we normally use vector set. But if the second
33441 element is zero and inter-unit moves are OK, we use movq
33442 instead. */
33443 use_vector_set = (TARGET_64BIT
33444 && TARGET_SSE4_1
33445 && !(TARGET_INTER_UNIT_MOVES
33446 && one_var == 0));
33447 break;
33448 case V16QImode:
33449 case V4SImode:
33450 case V4SFmode:
33451 use_vector_set = TARGET_SSE4_1;
33452 break;
33453 case V8HImode:
33454 use_vector_set = TARGET_SSE2;
33455 break;
33456 case V4HImode:
33457 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
33458 break;
33459 case V32QImode:
33460 case V16HImode:
33461 case V8SImode:
33462 case V8SFmode:
33463 case V4DFmode:
33464 use_vector_set = TARGET_AVX;
33465 break;
33466 case V4DImode:
33467 /* Use ix86_expand_vector_set in 64bit mode only. */
33468 use_vector_set = TARGET_AVX && TARGET_64BIT;
33469 break;
33470 default:
33471 break;
33472 }
33473
33474 if (use_vector_set)
33475 {
33476 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
33477 var = force_reg (GET_MODE_INNER (mode), var);
33478 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33479 return true;
33480 }
33481
33482 switch (mode)
33483 {
33484 case V2SFmode:
33485 case V2SImode:
33486 if (!mmx_ok)
33487 return false;
33488 /* FALLTHRU */
33489
33490 case V2DFmode:
33491 case V2DImode:
33492 if (one_var != 0)
33493 return false;
33494 var = force_reg (GET_MODE_INNER (mode), var);
33495 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
33496 emit_insn (gen_rtx_SET (VOIDmode, target, x));
33497 return true;
33498
33499 case V4SFmode:
33500 case V4SImode:
33501 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
33502 new_target = gen_reg_rtx (mode);
33503 else
33504 new_target = target;
33505 var = force_reg (GET_MODE_INNER (mode), var);
33506 x = gen_rtx_VEC_DUPLICATE (mode, var);
33507 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
33508 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
33509 if (one_var != 0)
33510 {
33511 /* We need to shuffle the value to the correct position, so
33512 create a new pseudo to store the intermediate result. */
33513
33514 /* With SSE2, we can use the integer shuffle insns. */
33515 if (mode != V4SFmode && TARGET_SSE2)
33516 {
33517 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
33518 const1_rtx,
33519 GEN_INT (one_var == 1 ? 0 : 1),
33520 GEN_INT (one_var == 2 ? 0 : 1),
33521 GEN_INT (one_var == 3 ? 0 : 1)));
33522 if (target != new_target)
33523 emit_move_insn (target, new_target);
33524 return true;
33525 }
33526
33527 /* Otherwise convert the intermediate result to V4SFmode and
33528 use the SSE1 shuffle instructions. */
33529 if (mode != V4SFmode)
33530 {
33531 tmp = gen_reg_rtx (V4SFmode);
33532 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
33533 }
33534 else
33535 tmp = new_target;
33536
33537 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
33538 const1_rtx,
33539 GEN_INT (one_var == 1 ? 0 : 1),
33540 GEN_INT (one_var == 2 ? 0+4 : 1+4),
33541 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
33542
33543 if (mode != V4SFmode)
33544 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
33545 else if (tmp != target)
33546 emit_move_insn (target, tmp);
33547 }
33548 else if (target != new_target)
33549 emit_move_insn (target, new_target);
33550 return true;
33551
33552 case V8HImode:
33553 case V16QImode:
33554 vsimode = V4SImode;
33555 goto widen;
33556 case V4HImode:
33557 case V8QImode:
33558 if (!mmx_ok)
33559 return false;
33560 vsimode = V2SImode;
33561 goto widen;
33562 widen:
33563 if (one_var != 0)
33564 return false;
33565
33566 /* Zero extend the variable element to SImode and recurse. */
33567 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
33568
33569 x = gen_reg_rtx (vsimode);
33570 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
33571 var, one_var))
33572 gcc_unreachable ();
33573
33574 emit_move_insn (target, gen_lowpart (mode, x));
33575 return true;
33576
33577 default:
33578 return false;
33579 }
33580 }
33581
33582 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33583 consisting of the values in VALS. It is known that all elements
33584 except ONE_VAR are constants. Return true if successful. */
33585
33586 static bool
33587 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
33588 rtx target, rtx vals, int one_var)
33589 {
33590 rtx var = XVECEXP (vals, 0, one_var);
33591 enum machine_mode wmode;
33592 rtx const_vec, x;
33593
33594 const_vec = copy_rtx (vals);
33595 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
33596 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
33597
33598 switch (mode)
33599 {
33600 case V2DFmode:
33601 case V2DImode:
33602 case V2SFmode:
33603 case V2SImode:
33604 /* For the two element vectors, it's just as easy to use
33605 the general case. */
33606 return false;
33607
33608 case V4DImode:
33609 /* Use ix86_expand_vector_set in 64bit mode only. */
33610 if (!TARGET_64BIT)
33611 return false;
33612 case V4DFmode:
33613 case V8SFmode:
33614 case V8SImode:
33615 case V16HImode:
33616 case V32QImode:
33617 case V4SFmode:
33618 case V4SImode:
33619 case V8HImode:
33620 case V4HImode:
33621 break;
33622
33623 case V16QImode:
33624 if (TARGET_SSE4_1)
33625 break;
33626 wmode = V8HImode;
33627 goto widen;
33628 case V8QImode:
33629 wmode = V4HImode;
33630 goto widen;
33631 widen:
33632 /* There's no way to set one QImode entry easily. Combine
33633 the variable value with its adjacent constant value, and
33634 promote to an HImode set. */
33635 x = XVECEXP (vals, 0, one_var ^ 1);
33636 if (one_var & 1)
33637 {
33638 var = convert_modes (HImode, QImode, var, true);
33639 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
33640 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33641 x = GEN_INT (INTVAL (x) & 0xff);
33642 }
33643 else
33644 {
33645 var = convert_modes (HImode, QImode, var, true);
33646 x = gen_int_mode (INTVAL (x) << 8, HImode);
33647 }
33648 if (x != const0_rtx)
33649 var = expand_simple_binop (HImode, IOR, var, x, var,
33650 1, OPTAB_LIB_WIDEN);
33651
33652 x = gen_reg_rtx (wmode);
33653 emit_move_insn (x, gen_lowpart (wmode, const_vec));
33654 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
33655
33656 emit_move_insn (target, gen_lowpart (mode, x));
33657 return true;
33658
33659 default:
33660 return false;
33661 }
33662
33663 emit_move_insn (target, const_vec);
33664 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33665 return true;
33666 }
33667
33668 /* A subroutine of ix86_expand_vector_init_general. Use vector
33669 concatenate to handle the most general case: all values variable,
33670 and none identical. */
33671
33672 static void
33673 ix86_expand_vector_init_concat (enum machine_mode mode,
33674 rtx target, rtx *ops, int n)
33675 {
33676 enum machine_mode cmode, hmode = VOIDmode;
33677 rtx first[8], second[4];
33678 rtvec v;
33679 int i, j;
33680
33681 switch (n)
33682 {
33683 case 2:
33684 switch (mode)
33685 {
33686 case V8SImode:
33687 cmode = V4SImode;
33688 break;
33689 case V8SFmode:
33690 cmode = V4SFmode;
33691 break;
33692 case V4DImode:
33693 cmode = V2DImode;
33694 break;
33695 case V4DFmode:
33696 cmode = V2DFmode;
33697 break;
33698 case V4SImode:
33699 cmode = V2SImode;
33700 break;
33701 case V4SFmode:
33702 cmode = V2SFmode;
33703 break;
33704 case V2DImode:
33705 cmode = DImode;
33706 break;
33707 case V2SImode:
33708 cmode = SImode;
33709 break;
33710 case V2DFmode:
33711 cmode = DFmode;
33712 break;
33713 case V2SFmode:
33714 cmode = SFmode;
33715 break;
33716 default:
33717 gcc_unreachable ();
33718 }
33719
33720 if (!register_operand (ops[1], cmode))
33721 ops[1] = force_reg (cmode, ops[1]);
33722 if (!register_operand (ops[0], cmode))
33723 ops[0] = force_reg (cmode, ops[0]);
33724 emit_insn (gen_rtx_SET (VOIDmode, target,
33725 gen_rtx_VEC_CONCAT (mode, ops[0],
33726 ops[1])));
33727 break;
33728
33729 case 4:
33730 switch (mode)
33731 {
33732 case V4DImode:
33733 cmode = V2DImode;
33734 break;
33735 case V4DFmode:
33736 cmode = V2DFmode;
33737 break;
33738 case V4SImode:
33739 cmode = V2SImode;
33740 break;
33741 case V4SFmode:
33742 cmode = V2SFmode;
33743 break;
33744 default:
33745 gcc_unreachable ();
33746 }
33747 goto half;
33748
33749 case 8:
33750 switch (mode)
33751 {
33752 case V8SImode:
33753 cmode = V2SImode;
33754 hmode = V4SImode;
33755 break;
33756 case V8SFmode:
33757 cmode = V2SFmode;
33758 hmode = V4SFmode;
33759 break;
33760 default:
33761 gcc_unreachable ();
33762 }
33763 goto half;
33764
33765 half:
33766 /* FIXME: We process inputs backward to help RA. PR 36222. */
33767 i = n - 1;
33768 j = (n >> 1) - 1;
33769 for (; i > 0; i -= 2, j--)
33770 {
33771 first[j] = gen_reg_rtx (cmode);
33772 v = gen_rtvec (2, ops[i - 1], ops[i]);
33773 ix86_expand_vector_init (false, first[j],
33774 gen_rtx_PARALLEL (cmode, v));
33775 }
33776
33777 n >>= 1;
33778 if (n > 2)
33779 {
33780 gcc_assert (hmode != VOIDmode);
33781 for (i = j = 0; i < n; i += 2, j++)
33782 {
33783 second[j] = gen_reg_rtx (hmode);
33784 ix86_expand_vector_init_concat (hmode, second [j],
33785 &first [i], 2);
33786 }
33787 n >>= 1;
33788 ix86_expand_vector_init_concat (mode, target, second, n);
33789 }
33790 else
33791 ix86_expand_vector_init_concat (mode, target, first, n);
33792 break;
33793
33794 default:
33795 gcc_unreachable ();
33796 }
33797 }
33798
33799 /* A subroutine of ix86_expand_vector_init_general. Use vector
33800 interleave to handle the most general case: all values variable,
33801 and none identical. */
33802
33803 static void
33804 ix86_expand_vector_init_interleave (enum machine_mode mode,
33805 rtx target, rtx *ops, int n)
33806 {
33807 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
33808 int i, j;
33809 rtx op0, op1;
33810 rtx (*gen_load_even) (rtx, rtx, rtx);
33811 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
33812 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
33813
33814 switch (mode)
33815 {
33816 case V8HImode:
33817 gen_load_even = gen_vec_setv8hi;
33818 gen_interleave_first_low = gen_vec_interleave_lowv4si;
33819 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33820 inner_mode = HImode;
33821 first_imode = V4SImode;
33822 second_imode = V2DImode;
33823 third_imode = VOIDmode;
33824 break;
33825 case V16QImode:
33826 gen_load_even = gen_vec_setv16qi;
33827 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
33828 gen_interleave_second_low = gen_vec_interleave_lowv4si;
33829 inner_mode = QImode;
33830 first_imode = V8HImode;
33831 second_imode = V4SImode;
33832 third_imode = V2DImode;
33833 break;
33834 default:
33835 gcc_unreachable ();
33836 }
33837
33838 for (i = 0; i < n; i++)
33839 {
33840 /* Extend the odd element to SImode using a paradoxical SUBREG. */
33841 op0 = gen_reg_rtx (SImode);
33842 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
33843
33844 /* Insert the SImode value as low element of V4SImode vector. */
33845 op1 = gen_reg_rtx (V4SImode);
33846 op0 = gen_rtx_VEC_MERGE (V4SImode,
33847 gen_rtx_VEC_DUPLICATE (V4SImode,
33848 op0),
33849 CONST0_RTX (V4SImode),
33850 const1_rtx);
33851 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
33852
33853 /* Cast the V4SImode vector back to a vector in the original mode. */
33854 op0 = gen_reg_rtx (mode);
33855 emit_move_insn (op0, gen_lowpart (mode, op1));
33856
33857 /* Load even elements into the second position. */
33858 emit_insn (gen_load_even (op0,
33859 force_reg (inner_mode,
33860 ops [i + i + 1]),
33861 const1_rtx));
33862
33863 /* Cast vector to FIRST_IMODE vector. */
33864 ops[i] = gen_reg_rtx (first_imode);
33865 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
33866 }
33867
33868 /* Interleave low FIRST_IMODE vectors. */
33869 for (i = j = 0; i < n; i += 2, j++)
33870 {
33871 op0 = gen_reg_rtx (first_imode);
33872 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
33873
33874 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
33875 ops[j] = gen_reg_rtx (second_imode);
33876 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
33877 }
33878
33879 /* Interleave low SECOND_IMODE vectors. */
33880 switch (second_imode)
33881 {
33882 case V4SImode:
33883 for (i = j = 0; i < n / 2; i += 2, j++)
33884 {
33885 op0 = gen_reg_rtx (second_imode);
33886 emit_insn (gen_interleave_second_low (op0, ops[i],
33887 ops[i + 1]));
33888
33889 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
33890 vector. */
33891 ops[j] = gen_reg_rtx (third_imode);
33892 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
33893 }
33894 second_imode = V2DImode;
33895 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33896 /* FALLTHRU */
33897
33898 case V2DImode:
33899 op0 = gen_reg_rtx (second_imode);
33900 emit_insn (gen_interleave_second_low (op0, ops[0],
33901 ops[1]));
33902
33903 /* Cast the SECOND_IMODE vector back to a vector in the original
33904 mode. */
33905 emit_insn (gen_rtx_SET (VOIDmode, target,
33906 gen_lowpart (mode, op0)));
33907 break;
33908
33909 default:
33910 gcc_unreachable ();
33911 }
33912 }
33913
33914 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
33915 all values variable, and none identical. */
33916
33917 static void
33918 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
33919 rtx target, rtx vals)
33920 {
33921 rtx ops[32], op0, op1;
33922 enum machine_mode half_mode = VOIDmode;
33923 int n, i;
33924
33925 switch (mode)
33926 {
33927 case V2SFmode:
33928 case V2SImode:
33929 if (!mmx_ok && !TARGET_SSE)
33930 break;
33931 /* FALLTHRU */
33932
33933 case V8SFmode:
33934 case V8SImode:
33935 case V4DFmode:
33936 case V4DImode:
33937 case V4SFmode:
33938 case V4SImode:
33939 case V2DFmode:
33940 case V2DImode:
33941 n = GET_MODE_NUNITS (mode);
33942 for (i = 0; i < n; i++)
33943 ops[i] = XVECEXP (vals, 0, i);
33944 ix86_expand_vector_init_concat (mode, target, ops, n);
33945 return;
33946
33947 case V32QImode:
33948 half_mode = V16QImode;
33949 goto half;
33950
33951 case V16HImode:
33952 half_mode = V8HImode;
33953 goto half;
33954
33955 half:
33956 n = GET_MODE_NUNITS (mode);
33957 for (i = 0; i < n; i++)
33958 ops[i] = XVECEXP (vals, 0, i);
33959 op0 = gen_reg_rtx (half_mode);
33960 op1 = gen_reg_rtx (half_mode);
33961 ix86_expand_vector_init_interleave (half_mode, op0, ops,
33962 n >> 2);
33963 ix86_expand_vector_init_interleave (half_mode, op1,
33964 &ops [n >> 1], n >> 2);
33965 emit_insn (gen_rtx_SET (VOIDmode, target,
33966 gen_rtx_VEC_CONCAT (mode, op0, op1)));
33967 return;
33968
33969 case V16QImode:
33970 if (!TARGET_SSE4_1)
33971 break;
33972 /* FALLTHRU */
33973
33974 case V8HImode:
33975 if (!TARGET_SSE2)
33976 break;
33977
33978 /* Don't use ix86_expand_vector_init_interleave if we can't
33979 move from GPR to SSE register directly. */
33980 if (!TARGET_INTER_UNIT_MOVES)
33981 break;
33982
33983 n = GET_MODE_NUNITS (mode);
33984 for (i = 0; i < n; i++)
33985 ops[i] = XVECEXP (vals, 0, i);
33986 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
33987 return;
33988
33989 case V4HImode:
33990 case V8QImode:
33991 break;
33992
33993 default:
33994 gcc_unreachable ();
33995 }
33996
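  /* Fallback: pack the elements into word_mode values with shifts and IORs,
     then move the words into the vector register (directly for one or two
     words, or via a recursive V4SImode build for four).  */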
33997 {
33998 int i, j, n_elts, n_words, n_elt_per_word;
33999 enum machine_mode inner_mode;
34000 rtx words[4], shift;
34001
34002 inner_mode = GET_MODE_INNER (mode);
34003 n_elts = GET_MODE_NUNITS (mode);
34004 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
34005 n_elt_per_word = n_elts / n_words;
34006 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
34007
34008 for (i = 0; i < n_words; ++i)
34009 {
34010 rtx word = NULL_RTX;
34011
34012 for (j = 0; j < n_elt_per_word; ++j)
34013 {
34014 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
34015 elt = convert_modes (word_mode, inner_mode, elt, true);
34016
34017 if (j == 0)
34018 word = elt;
34019 else
34020 {
34021 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
34022 word, 1, OPTAB_LIB_WIDEN);
34023 word = expand_simple_binop (word_mode, IOR, word, elt,
34024 word, 1, OPTAB_LIB_WIDEN);
34025 }
34026 }
34027
34028 words[i] = word;
34029 }
34030
34031 if (n_words == 1)
34032 emit_move_insn (target, gen_lowpart (mode, words[0]));
34033 else if (n_words == 2)
34034 {
34035 rtx tmp = gen_reg_rtx (mode);
34036 emit_clobber (tmp);
34037 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
34038 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
34039 emit_move_insn (target, tmp);
34040 }
34041 else if (n_words == 4)
34042 {
34043 rtx tmp = gen_reg_rtx (V4SImode);
34044 gcc_assert (word_mode == SImode);
34045 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
34046 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
34047 emit_move_insn (target, gen_lowpart (mode, tmp));
34048 }
34049 else
34050 gcc_unreachable ();
34051 }
34052 }
34053
34054 /* Initialize vector TARGET via VALS. Suppress the use of MMX
34055 instructions unless MMX_OK is true. */
34056
34057 void
34058 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
34059 {
34060 enum machine_mode mode = GET_MODE (target);
34061 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34062 int n_elts = GET_MODE_NUNITS (mode);
34063 int n_var = 0, one_var = -1;
34064 bool all_same = true, all_const_zero = true;
34065 int i;
34066 rtx x;
34067
34068 for (i = 0; i < n_elts; ++i)
34069 {
34070 x = XVECEXP (vals, 0, i);
34071 if (!(CONST_INT_P (x)
34072 || GET_CODE (x) == CONST_DOUBLE
34073 || GET_CODE (x) == CONST_FIXED))
34074 n_var++, one_var = i;
34075 else if (x != CONST0_RTX (inner_mode))
34076 all_const_zero = false;
34077 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
34078 all_same = false;
34079 }
34080
34081 /* Constants are best loaded from the constant pool. */
34082 if (n_var == 0)
34083 {
34084 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
34085 return;
34086 }
34087
34088 /* If all values are identical, broadcast the value. */
34089 if (all_same
34090 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
34091 XVECEXP (vals, 0, 0)))
34092 return;
34093
34094 /* Values where only one field is non-constant are best loaded from
34095 the pool and overwritten via move later. */
34096 if (n_var == 1)
34097 {
34098 if (all_const_zero
34099 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
34100 XVECEXP (vals, 0, one_var),
34101 one_var))
34102 return;
34103
34104 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
34105 return;
34106 }
34107
34108 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
34109 }
34110
34111 void
34112 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
34113 {
34114 enum machine_mode mode = GET_MODE (target);
34115 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34116 enum machine_mode half_mode;
34117 bool use_vec_merge = false;
34118 rtx tmp;
34119 static rtx (*gen_extract[6][2]) (rtx, rtx)
34120 = {
34121 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
34122 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
34123 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
34124 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
34125 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
34126 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
34127 };
34128 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
34129 = {
34130 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
34131 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
34132 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
34133 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
34134 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
34135 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
34136 };
34137 int i, j, n;
34138
34139 switch (mode)
34140 {
34141 case V2SFmode:
34142 case V2SImode:
34143 if (mmx_ok)
34144 {
34145 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
34146 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
34147 if (elt == 0)
34148 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
34149 else
34150 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
34151 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34152 return;
34153 }
34154 break;
34155
34156 case V2DImode:
34157 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
34158 if (use_vec_merge)
34159 break;
34160
34161 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
34162 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
34163 if (elt == 0)
34164 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
34165 else
34166 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
34167 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34168 return;
34169
34170 case V2DFmode:
34171 {
34172 rtx op0, op1;
34173
34174 /* For the two element vectors, we implement a VEC_CONCAT with
34175 the extraction of the other element. */
34176
34177 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
34178 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
34179
34180 if (elt == 0)
34181 op0 = val, op1 = tmp;
34182 else
34183 op0 = tmp, op1 = val;
34184
34185 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
34186 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34187 }
34188 return;
34189
34190 case V4SFmode:
34191 use_vec_merge = TARGET_SSE4_1;
34192 if (use_vec_merge)
34193 break;
34194
34195 switch (elt)
34196 {
34197 case 0:
34198 use_vec_merge = true;
34199 break;
34200
34201 case 1:
34202 /* tmp = target = A B C D */
34203 tmp = copy_to_reg (target);
34204 /* target = A A B B */
34205 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
34206 /* target = X A B B */
34207 ix86_expand_vector_set (false, target, val, 0);
34208 /* target = A X C D */
34209 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34210 const1_rtx, const0_rtx,
34211 GEN_INT (2+4), GEN_INT (3+4)));
34212 return;
34213
34214 case 2:
34215 /* tmp = target = A B C D */
34216 tmp = copy_to_reg (target);
34217 /* tmp = X B C D */
34218 ix86_expand_vector_set (false, tmp, val, 0);
34219 /* target = A B X D */
34220 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34221 const0_rtx, const1_rtx,
34222 GEN_INT (0+4), GEN_INT (3+4)));
34223 return;
34224
34225 case 3:
34226 /* tmp = target = A B C D */
34227 tmp = copy_to_reg (target);
34228 /* tmp = X B C D */
34229 ix86_expand_vector_set (false, tmp, val, 0);
34230 /* target = A B C X */
34231 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
34232 const0_rtx, const1_rtx,
34233 GEN_INT (2+4), GEN_INT (0+4)));
34234 return;
34235
34236 default:
34237 gcc_unreachable ();
34238 }
34239 break;
34240
34241 case V4SImode:
34242 use_vec_merge = TARGET_SSE4_1;
34243 if (use_vec_merge)
34244 break;
34245
34246 /* Element 0 handled by vec_merge below. */
34247 if (elt == 0)
34248 {
34249 use_vec_merge = true;
34250 break;
34251 }
34252
34253 if (TARGET_SSE2)
34254 {
34255 /* With SSE2, use integer shuffles to swap element 0 and ELT,
34256 store into element 0, then shuffle them back. */
34257
34258 rtx order[4];
34259
34260 order[0] = GEN_INT (elt);
34261 order[1] = const1_rtx;
34262 order[2] = const2_rtx;
34263 order[3] = GEN_INT (3);
34264 order[elt] = const0_rtx;
34265
34266 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34267 order[1], order[2], order[3]));
34268
34269 ix86_expand_vector_set (false, target, val, 0);
34270
34271 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
34272 order[1], order[2], order[3]));
34273 }
34274 else
34275 {
34276 /* For SSE1, we have to reuse the V4SF code. */
34277 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
34278 gen_lowpart (SFmode, val), elt);
34279 }
34280 return;
34281
34282 case V8HImode:
34283 use_vec_merge = TARGET_SSE2;
34284 break;
34285 case V4HImode:
34286 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34287 break;
34288
34289 case V16QImode:
34290 use_vec_merge = TARGET_SSE4_1;
34291 break;
34292
34293 case V8QImode:
34294 break;
34295
34296 case V32QImode:
34297 half_mode = V16QImode;
34298 j = 0;
34299 n = 16;
34300 goto half;
34301
34302 case V16HImode:
34303 half_mode = V8HImode;
34304 j = 1;
34305 n = 8;
34306 goto half;
34307
34308 case V8SImode:
34309 half_mode = V4SImode;
34310 j = 2;
34311 n = 4;
34312 goto half;
34313
34314 case V4DImode:
34315 half_mode = V2DImode;
34316 j = 3;
34317 n = 2;
34318 goto half;
34319
34320 case V8SFmode:
34321 half_mode = V4SFmode;
34322 j = 4;
34323 n = 4;
34324 goto half;
34325
34326 case V4DFmode:
34327 half_mode = V2DFmode;
34328 j = 5;
34329 n = 2;
34330 goto half;
34331
34332 half:
34333 /* Compute offset. */
34334 i = elt / n;
34335 elt %= n;
34336
34337 gcc_assert (i <= 1);
34338
34339 /* Extract the half. */
34340 tmp = gen_reg_rtx (half_mode);
34341 emit_insn (gen_extract[j][i] (tmp, target));
34342
34343 /* Put val in tmp at elt. */
34344 ix86_expand_vector_set (false, tmp, val, elt);
34345
34346 /* Put it back. */
34347 emit_insn (gen_insert[j][i] (target, target, tmp));
34348 return;
34349
34350 default:
34351 break;
34352 }
34353
34354 if (use_vec_merge)
34355 {
34356 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
34357 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
34358 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34359 }
34360 else
34361 {
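      /* No direct insertion pattern is available: spill the vector to a
	 stack slot, overwrite the selected element in memory, and reload.  */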
34362 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
34363
34364 emit_move_insn (mem, target);
34365
34366 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34367 emit_move_insn (tmp, val);
34368
34369 emit_move_insn (target, mem);
34370 }
34371 }
34372
34373 void
34374 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
34375 {
34376 enum machine_mode mode = GET_MODE (vec);
34377 enum machine_mode inner_mode = GET_MODE_INNER (mode);
34378 bool use_vec_extr = false;
34379 rtx tmp;
34380
34381 switch (mode)
34382 {
34383 case V2SImode:
34384 case V2SFmode:
34385 if (!mmx_ok)
34386 break;
34387 /* FALLTHRU */
34388
34389 case V2DFmode:
34390 case V2DImode:
34391 use_vec_extr = true;
34392 break;
34393
34394 case V4SFmode:
34395 use_vec_extr = TARGET_SSE4_1;
34396 if (use_vec_extr)
34397 break;
34398
34399 switch (elt)
34400 {
34401 case 0:
34402 tmp = vec;
34403 break;
34404
34405 case 1:
34406 case 3:
34407 tmp = gen_reg_rtx (mode);
34408 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
34409 GEN_INT (elt), GEN_INT (elt),
34410 GEN_INT (elt+4), GEN_INT (elt+4)));
34411 break;
34412
34413 case 2:
34414 tmp = gen_reg_rtx (mode);
34415 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
34416 break;
34417
34418 default:
34419 gcc_unreachable ();
34420 }
34421 vec = tmp;
34422 use_vec_extr = true;
34423 elt = 0;
34424 break;
34425
34426 case V4SImode:
34427 use_vec_extr = TARGET_SSE4_1;
34428 if (use_vec_extr)
34429 break;
34430
34431 if (TARGET_SSE2)
34432 {
34433 switch (elt)
34434 {
34435 case 0:
34436 tmp = vec;
34437 break;
34438
34439 case 1:
34440 case 3:
34441 tmp = gen_reg_rtx (mode);
34442 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
34443 GEN_INT (elt), GEN_INT (elt),
34444 GEN_INT (elt), GEN_INT (elt)));
34445 break;
34446
34447 case 2:
34448 tmp = gen_reg_rtx (mode);
34449 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
34450 break;
34451
34452 default:
34453 gcc_unreachable ();
34454 }
34455 vec = tmp;
34456 use_vec_extr = true;
34457 elt = 0;
34458 }
34459 else
34460 {
34461 /* For SSE1, we have to reuse the V4SF code. */
34462 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
34463 gen_lowpart (V4SFmode, vec), elt);
34464 return;
34465 }
34466 break;
34467
34468 case V8HImode:
34469 use_vec_extr = TARGET_SSE2;
34470 break;
34471 case V4HImode:
34472 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
34473 break;
34474
34475 case V16QImode:
34476 use_vec_extr = TARGET_SSE4_1;
34477 break;
34478
34479 case V8SFmode:
34480 if (TARGET_AVX)
34481 {
34482 tmp = gen_reg_rtx (V4SFmode);
34483 if (elt < 4)
34484 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
34485 else
34486 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
34487 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34488 return;
34489 }
34490 break;
34491
34492 case V4DFmode:
34493 if (TARGET_AVX)
34494 {
34495 tmp = gen_reg_rtx (V2DFmode);
34496 if (elt < 2)
34497 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
34498 else
34499 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
34500 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34501 return;
34502 }
34503 break;
34504
34505 case V32QImode:
34506 if (TARGET_AVX)
34507 {
34508 tmp = gen_reg_rtx (V16QImode);
34509 if (elt < 16)
34510 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
34511 else
34512 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
34513 ix86_expand_vector_extract (false, target, tmp, elt & 15);
34514 return;
34515 }
34516 break;
34517
34518 case V16HImode:
34519 if (TARGET_AVX)
34520 {
34521 tmp = gen_reg_rtx (V8HImode);
34522 if (elt < 8)
34523 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
34524 else
34525 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
34526 ix86_expand_vector_extract (false, target, tmp, elt & 7);
34527 return;
34528 }
34529 break;
34530
34531 case V8SImode:
34532 if (TARGET_AVX)
34533 {
34534 tmp = gen_reg_rtx (V4SImode);
34535 if (elt < 4)
34536 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
34537 else
34538 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
34539 ix86_expand_vector_extract (false, target, tmp, elt & 3);
34540 return;
34541 }
34542 break;
34543
34544 case V4DImode:
34545 if (TARGET_AVX)
34546 {
34547 tmp = gen_reg_rtx (V2DImode);
34548 if (elt < 2)
34549 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
34550 else
34551 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
34552 ix86_expand_vector_extract (false, target, tmp, elt & 1);
34553 return;
34554 }
34555 break;
34556
34557 case V8QImode:
34558 /* ??? Could extract the appropriate HImode element and shift. */
34559 default:
34560 break;
34561 }
34562
34563 if (use_vec_extr)
34564 {
34565 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
34566 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
34567
34568 /* Let the rtl optimizers know about the zero extension performed. */
34569 if (inner_mode == QImode || inner_mode == HImode)
34570 {
34571 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
34572 target = gen_lowpart (SImode, target);
34573 }
34574
34575 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34576 }
34577 else
34578 {
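      /* No direct extraction pattern is available: spill the vector to a
	 stack slot and load the selected element from memory.  */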
34579 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
34580
34581 emit_move_insn (mem, vec);
34582
34583 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34584 emit_move_insn (target, tmp);
34585 }
34586 }
34587
34588 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
34589 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
34590 The upper bits of DEST are undefined, though they shouldn't cause
34591 exceptions (some bits from src or all zeros are ok). */
34592
34593 static void
34594 emit_reduc_half (rtx dest, rtx src, int i)
34595 {
34596 rtx tem;
34597 switch (GET_MODE (src))
34598 {
34599 case V4SFmode:
34600 if (i == 128)
34601 tem = gen_sse_movhlps (dest, src, src);
34602 else
34603 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
34604 GEN_INT (1 + 4), GEN_INT (1 + 4));
34605 break;
34606 case V2DFmode:
34607 tem = gen_vec_interleave_highv2df (dest, src, src);
34608 break;
34609 case V16QImode:
34610 case V8HImode:
34611 case V4SImode:
34612 case V2DImode:
34613 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
34614 gen_lowpart (V1TImode, src),
34615 GEN_INT (i / 2));
34616 break;
34617 case V8SFmode:
34618 if (i == 256)
34619 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
34620 else
34621 tem = gen_avx_shufps256 (dest, src, src,
34622 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
34623 break;
34624 case V4DFmode:
34625 if (i == 256)
34626 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
34627 else
34628 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
34629 break;
34630 case V32QImode:
34631 case V16HImode:
34632 case V8SImode:
34633 case V4DImode:
34634 if (i == 256)
34635 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
34636 gen_lowpart (V4DImode, src),
34637 gen_lowpart (V4DImode, src),
34638 const1_rtx);
34639 else
34640 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
34641 gen_lowpart (V2TImode, src),
34642 GEN_INT (i / 2));
34643 break;
34644 default:
34645 gcc_unreachable ();
34646 }
34647 emit_insn (tem);
34648 }
34649
34650 /* Expand a vector reduction. FN is the binary pattern to reduce;
34651 DEST is the destination; IN is the input vector. */
34652
34653 void
34654 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
34655 {
34656 rtx half, dst, vec = in;
34657 enum machine_mode mode = GET_MODE (in);
34658 int i;
34659
34660 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
34661 if (TARGET_SSE4_1
34662 && mode == V8HImode
34663 && fn == gen_uminv8hi3)
34664 {
34665 emit_insn (gen_sse4_1_phminposuw (dest, in));
34666 return;
34667 }
34668
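  /* Repeatedly fold the upper half of the remaining live elements onto the
     lower half with FN, halving the element count each iteration; element 0
     of DEST ends up holding the reduction of all elements of IN.  */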
34669 for (i = GET_MODE_BITSIZE (mode);
34670 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
34671 i >>= 1)
34672 {
34673 half = gen_reg_rtx (mode);
34674 emit_reduc_half (half, vec, i);
34675 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
34676 dst = dest;
34677 else
34678 dst = gen_reg_rtx (mode);
34679 emit_insn (fn (dst, half, vec));
34680 vec = dst;
34681 }
34682 }
34683 \f
34684 /* Target hook for scalar_mode_supported_p. */
34685 static bool
34686 ix86_scalar_mode_supported_p (enum machine_mode mode)
34687 {
34688 if (DECIMAL_FLOAT_MODE_P (mode))
34689 return default_decimal_float_supported_p ();
34690 else if (mode == TFmode)
34691 return true;
34692 else
34693 return default_scalar_mode_supported_p (mode);
34694 }
34695
34696 /* Implements target hook vector_mode_supported_p. */
34697 static bool
34698 ix86_vector_mode_supported_p (enum machine_mode mode)
34699 {
34700 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34701 return true;
34702 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34703 return true;
34704 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34705 return true;
34706 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
34707 return true;
34708 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
34709 return true;
34710 return false;
34711 }
34712
34713 /* Target hook for c_mode_for_suffix. */
34714 static enum machine_mode
34715 ix86_c_mode_for_suffix (char suffix)
34716 {
34717 if (suffix == 'q')
34718 return TFmode;
34719 if (suffix == 'w')
34720 return XFmode;
34721
34722 return VOIDmode;
34723 }
34724
34725 /* Worker function for TARGET_MD_ASM_CLOBBERS.
34726
34727 We do this in the new i386 backend to maintain source compatibility
34728 with the old cc0-based compiler. */
34729
34730 static tree
34731 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
34732 tree inputs ATTRIBUTE_UNUSED,
34733 tree clobbers)
34734 {
34735 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
34736 clobbers);
34737 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
34738 clobbers);
34739 return clobbers;
34740 }
34741
34742 /* Implements target vector targetm.asm.encode_section_info. */
34743
34744 static void ATTRIBUTE_UNUSED
34745 ix86_encode_section_info (tree decl, rtx rtl, int first)
34746 {
34747 default_encode_section_info (decl, rtl, first);
34748
34749 if (TREE_CODE (decl) == VAR_DECL
34750 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
34751 && ix86_in_large_data_p (decl))
34752 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
34753 }
34754
34755 /* Worker function for REVERSE_CONDITION. */
34756
34757 enum rtx_code
34758 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
34759 {
34760 return (mode != CCFPmode && mode != CCFPUmode
34761 ? reverse_condition (code)
34762 : reverse_condition_maybe_unordered (code));
34763 }
34764
34765 /* Output code to perform an x87 FP register move, from OPERANDS[1]
34766 to OPERANDS[0]. */
34767
34768 const char *
34769 output_387_reg_move (rtx insn, rtx *operands)
34770 {
34771 if (REG_P (operands[0]))
34772 {
34773 if (REG_P (operands[1])
34774 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34775 {
34776 if (REGNO (operands[0]) == FIRST_STACK_REG)
34777 return output_387_ffreep (operands, 0);
34778 return "fstp\t%y0";
34779 }
34780 if (STACK_TOP_P (operands[0]))
34781 return "fld%Z1\t%y1";
34782 return "fst\t%y0";
34783 }
34784 else if (MEM_P (operands[0]))
34785 {
34786 gcc_assert (REG_P (operands[1]));
34787 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34788 return "fstp%Z0\t%y0";
34789 else
34790 {
34791 /* There is no non-popping store to memory for XFmode.
34792 So if we need one, follow the store with a load. */
34793 if (GET_MODE (operands[0]) == XFmode)
34794 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
34795 else
34796 return "fst%Z0\t%y0";
34797 }
34798 }
34799 else
34800 gcc_unreachable();
34801 }
34802
34803 /* Output code to perform a conditional jump to LABEL if the C2 flag in
34804 the FP status register is set. */
34805
34806 void
34807 ix86_emit_fp_unordered_jump (rtx label)
34808 {
34809 rtx reg = gen_reg_rtx (HImode);
34810 rtx temp;
34811
34812 emit_insn (gen_x86_fnstsw_1 (reg));
34813
34814 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
34815 {
34816 emit_insn (gen_x86_sahf_1 (reg));
34817
34818 temp = gen_rtx_REG (CCmode, FLAGS_REG);
34819 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
34820 }
34821 else
34822 {
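      /* C2 is bit 10 of the FP status word, i.e. bit 2 of the high byte
	 stored by fnstsw, hence the 0x04 mask.  */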
34823 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
34824
34825 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
34826 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
34827 }
34828
34829 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
34830 gen_rtx_LABEL_REF (VOIDmode, label),
34831 pc_rtx);
34832 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
34833
34834 emit_jump_insn (temp);
34835 predict_jump (REG_BR_PROB_BASE * 10 / 100);
34836 }
34837
34838 /* Output code to perform a log1p XFmode calculation. */
34839
34840 void ix86_emit_i387_log1p (rtx op0, rtx op1)
34841 {
34842 rtx label1 = gen_label_rtx ();
34843 rtx label2 = gen_label_rtx ();
34844
34845 rtx tmp = gen_reg_rtx (XFmode);
34846 rtx tmp2 = gen_reg_rtx (XFmode);
34847 rtx test;
34848
34849 emit_insn (gen_absxf2 (tmp, op1));
34850 test = gen_rtx_GE (VOIDmode, tmp,
34851 CONST_DOUBLE_FROM_REAL_VALUE (
34852 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
34853 XFmode));
34854 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
34855
34856 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34857 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
34858 emit_jump (label2);
34859
34860 emit_label (label1);
34861 emit_move_insn (tmp, CONST1_RTX (XFmode));
34862 emit_insn (gen_addxf3 (tmp, op1, tmp));
34863 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34864 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
34865
34866 emit_label (label2);
34867 }
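
/* A standalone sketch (kept out of the build with #if 0) of the split the
   expander above implements: for |x| below 1 - sqrt(2)/2 the fyl2xp1
   instruction is used so that 1 + x is never formed, otherwise fyl2x is
   applied to 1 + x.  The libm calls below merely stand in for those x87
   instructions scaled by ln(2); the cutoff constant is the one used above.  */
#if 0
#include <math.h>

static long double
log1p_sketch (long double x)
{
  const long double cutoff = 0.29289321881345247561810596348408353L;

  if (fabsl (x) < cutoff)
    return log1pl (x);        /* fyl2xp1 path: accurate for small |x| */
  else
    return logl (1.0L + x);   /* fyl2x path: forming 1 + x is safe here */
}
#endif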
34868
34869 /* Emit code for round calculation. */
34870 void ix86_emit_i387_round (rtx op0, rtx op1)
34871 {
34872 enum machine_mode inmode = GET_MODE (op1);
34873 enum machine_mode outmode = GET_MODE (op0);
34874 rtx e1, e2, res, tmp, tmp1, half;
34875 rtx scratch = gen_reg_rtx (HImode);
34876 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
34877 rtx jump_label = gen_label_rtx ();
34878 rtx insn;
34879 rtx (*gen_abs) (rtx, rtx);
34880 rtx (*gen_neg) (rtx, rtx);
34881
34882 switch (inmode)
34883 {
34884 case SFmode:
34885 gen_abs = gen_abssf2;
34886 break;
34887 case DFmode:
34888 gen_abs = gen_absdf2;
34889 break;
34890 case XFmode:
34891 gen_abs = gen_absxf2;
34892 break;
34893 default:
34894 gcc_unreachable ();
34895 }
34896
34897 switch (outmode)
34898 {
34899 case SFmode:
34900 gen_neg = gen_negsf2;
34901 break;
34902 case DFmode:
34903 gen_neg = gen_negdf2;
34904 break;
34905 case XFmode:
34906 gen_neg = gen_negxf2;
34907 break;
34908 case HImode:
34909 gen_neg = gen_neghi2;
34910 break;
34911 case SImode:
34912 gen_neg = gen_negsi2;
34913 break;
34914 case DImode:
34915 gen_neg = gen_negdi2;
34916 break;
34917 default:
34918 gcc_unreachable ();
34919 }
34920
34921 e1 = gen_reg_rtx (inmode);
34922 e2 = gen_reg_rtx (inmode);
34923 res = gen_reg_rtx (outmode);
34924
34925 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
34926
34927 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
34928
34929 /* scratch = fxam(op1) */
34930 emit_insn (gen_rtx_SET (VOIDmode, scratch,
34931 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
34932 UNSPEC_FXAM)));
34933 /* e1 = fabs(op1) */
34934 emit_insn (gen_abs (e1, op1));
34935
34936 /* e2 = e1 + 0.5 */
34937 half = force_reg (inmode, half);
34938 emit_insn (gen_rtx_SET (VOIDmode, e2,
34939 gen_rtx_PLUS (inmode, e1, half)));
34940
34941 /* res = floor(e2) */
34942 if (inmode != XFmode)
34943 {
34944 tmp1 = gen_reg_rtx (XFmode);
34945
34946 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
34947 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
34948 }
34949 else
34950 tmp1 = e2;
34951
34952 switch (outmode)
34953 {
34954 case SFmode:
34955 case DFmode:
34956 {
34957 rtx tmp0 = gen_reg_rtx (XFmode);
34958
34959 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
34960
34961 emit_insn (gen_rtx_SET (VOIDmode, res,
34962 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
34963 UNSPEC_TRUNC_NOOP)));
34964 }
34965 break;
34966 case XFmode:
34967 emit_insn (gen_frndintxf2_floor (res, tmp1));
34968 break;
34969 case HImode:
34970 emit_insn (gen_lfloorxfhi2 (res, tmp1));
34971 break;
34972 case SImode:
34973 emit_insn (gen_lfloorxfsi2 (res, tmp1));
34974 break;
34975 case DImode:
34976 emit_insn (gen_lfloorxfdi2 (res, tmp1));
34977 break;
34978 default:
34979 gcc_unreachable ();
34980 }
34981
34982 /* flags = signbit(a) */
34983 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
34984
34985 /* if (flags) then res = -res */
34986 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
34987 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
34988 gen_rtx_LABEL_REF (VOIDmode, jump_label),
34989 pc_rtx);
34990 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34991 predict_jump (REG_BR_PROB_BASE * 50 / 100);
34992 JUMP_LABEL (insn) = jump_label;
34993
34994 emit_insn (gen_neg (res, res));
34995
34996 emit_label (jump_label);
34997 LABEL_NUSES (jump_label) = 1;
34998
34999 emit_move_insn (op0, res);
35000 }
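
/* A standalone sketch (kept out of the build with #if 0) of the scheme the
   expander above follows: round (a) = sgn (a) * floor (fabs (a) + 0.5).
   In the emitted code the sign comes from fxam and the floor from frndint
   with the rounding mode temporarily forced down; the libm calls below only
   stand in for that.  */
#if 0
#include <math.h>

static double
i387_round_sketch (double a)
{
  double r = floor (fabs (a) + 0.5);   /* magnitude, halves rounded away
                                          from zero */
  return signbit (a) ? -r : r;         /* re-apply the sign of A */
}
#endif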
35001
35002 /* Output code to perform a Newton-Raphson approximation of a single precision
35003 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
35004
35005 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
35006 {
35007 rtx x0, x1, e0, e1;
35008
35009 x0 = gen_reg_rtx (mode);
35010 e0 = gen_reg_rtx (mode);
35011 e1 = gen_reg_rtx (mode);
35012 x1 = gen_reg_rtx (mode);
35013
35014 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
35015
35016 b = force_reg (mode, b);
35017
35018 /* x0 = rcp(b) estimate */
35019 emit_insn (gen_rtx_SET (VOIDmode, x0,
35020 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
35021 UNSPEC_RCP)));
35022 /* e0 = x0 * b */
35023 emit_insn (gen_rtx_SET (VOIDmode, e0,
35024 gen_rtx_MULT (mode, x0, b)));
35025
35026 /* e0 = x0 * e0 */
35027 emit_insn (gen_rtx_SET (VOIDmode, e0,
35028 gen_rtx_MULT (mode, x0, e0)));
35029
35030 /* e1 = x0 + x0 */
35031 emit_insn (gen_rtx_SET (VOIDmode, e1,
35032 gen_rtx_PLUS (mode, x0, x0)));
35033
35034 /* x1 = e1 - e0 */
35035 emit_insn (gen_rtx_SET (VOIDmode, x1,
35036 gen_rtx_MINUS (mode, e1, e0)));
35037
35038 /* res = a * x1 */
35039 emit_insn (gen_rtx_SET (VOIDmode, res,
35040 gen_rtx_MULT (mode, a, x1)));
35041 }
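
/* A standalone sketch (kept out of the build with #if 0) of the
   Newton-Raphson step emitted above.  x0 stands for the roughly 12-bit
   rcpss estimate of 1/b (the exact division below is only a placeholder
   for it); one refinement x1 = x0 * (2 - b * x0) roughly doubles the
   number of correct bits before the final multiply by A.  */
#if 0
static float
swdiv_sketch (float a, float b)
{
  float x0 = 1.0f / b;       /* rcp (b) estimate (placeholder for rcpss) */
  float e0 = x0 * b;         /* e0 = b * x0 */
  float e1, x1;

  e0 = x0 * e0;              /* e0 = b * x0 * x0 */
  e1 = x0 + x0;              /* e1 = 2 * x0 */
  x1 = e1 - e0;              /* x1 = x0 * (2 - b * x0) */
  return a * x1;             /* a / b ~= a * x1 */
}
#endif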
35042
35043 /* Output code to perform a Newton-Raphson approximation of a
35044 single precision floating point [reciprocal] square root. */
35045
35046 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
35047 bool recip)
35048 {
35049 rtx x0, e0, e1, e2, e3, mthree, mhalf;
35050 REAL_VALUE_TYPE r;
35051
35052 x0 = gen_reg_rtx (mode);
35053 e0 = gen_reg_rtx (mode);
35054 e1 = gen_reg_rtx (mode);
35055 e2 = gen_reg_rtx (mode);
35056 e3 = gen_reg_rtx (mode);
35057
35058 real_from_integer (&r, VOIDmode, -3, -1, 0);
35059 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
35060
35061 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
35062 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
35063
35064 if (VECTOR_MODE_P (mode))
35065 {
35066 mthree = ix86_build_const_vector (mode, true, mthree);
35067 mhalf = ix86_build_const_vector (mode, true, mhalf);
35068 }
35069
35070 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
35071 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
35072
35073 a = force_reg (mode, a);
35074
35075 /* x0 = rsqrt(a) estimate */
35076 emit_insn (gen_rtx_SET (VOIDmode, x0,
35077 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
35078 UNSPEC_RSQRT)));
35079
35080 /* If a == 0.0, filter out the infinite rsqrt estimate to avoid a NaN for sqrt(0.0). */
35081 if (!recip)
35082 {
35083 rtx zero, mask;
35084
35085 zero = gen_reg_rtx (mode);
35086 mask = gen_reg_rtx (mode);
35087
35088 zero = force_reg (mode, CONST0_RTX(mode));
35089 emit_insn (gen_rtx_SET (VOIDmode, mask,
35090 gen_rtx_NE (mode, zero, a)));
35091
35092 emit_insn (gen_rtx_SET (VOIDmode, x0,
35093 gen_rtx_AND (mode, x0, mask)));
35094 }
35095
35096 /* e0 = x0 * a */
35097 emit_insn (gen_rtx_SET (VOIDmode, e0,
35098 gen_rtx_MULT (mode, x0, a)));
35099 /* e1 = e0 * x0 */
35100 emit_insn (gen_rtx_SET (VOIDmode, e1,
35101 gen_rtx_MULT (mode, e0, x0)));
35102
35103 /* e2 = e1 - 3. */
35104 mthree = force_reg (mode, mthree);
35105 emit_insn (gen_rtx_SET (VOIDmode, e2,
35106 gen_rtx_PLUS (mode, e1, mthree)));
35107
35108 mhalf = force_reg (mode, mhalf);
35109 if (recip)
35110 /* e3 = -.5 * x0 */
35111 emit_insn (gen_rtx_SET (VOIDmode, e3,
35112 gen_rtx_MULT (mode, x0, mhalf)));
35113 else
35114 /* e3 = -.5 * e0 */
35115 emit_insn (gen_rtx_SET (VOIDmode, e3,
35116 gen_rtx_MULT (mode, e0, mhalf)));
35117 /* ret = e2 * e3 */
35118 emit_insn (gen_rtx_SET (VOIDmode, res,
35119 gen_rtx_MULT (mode, e2, e3)));
35120 }
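
/* A standalone sketch (kept out of the build with #if 0) of the
   Newton-Raphson step emitted above, shown for the non-reciprocal case.
   x0 stands for the rsqrtss estimate of 1/sqrt(a); the exact expression
   below is only a placeholder for it, and the a == 0.0 masking done above
   is omitted here.  */
#if 0
#include <math.h>

static float
swsqrt_sketch (float a)
{
  float x0 = 1.0f / sqrtf (a);  /* rsqrt (a) estimate (placeholder) */
  float e0 = x0 * a;            /* e0 = a * x0 */
  float e1 = e0 * x0;           /* e1 = a * x0 * x0 */
  float e2 = e1 - 3.0f;         /* e2 = a * x0 * x0 - 3 */
  float e3 = e0 * -0.5f;        /* e3 = -0.5 * a * x0 */

  return e2 * e3;               /* sqrt (a) ~= -0.5 * a * x0 * (a*x0*x0 - 3) */
}
#endif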
35121
35122 #ifdef TARGET_SOLARIS
35123 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
35124
35125 static void
35126 i386_solaris_elf_named_section (const char *name, unsigned int flags,
35127 tree decl)
35128 {
35129 /* With Binutils 2.15, the "@unwind" marker must be specified on
35130 every occurrence of the ".eh_frame" section, not just the first
35131 one. */
35132 if (TARGET_64BIT
35133 && strcmp (name, ".eh_frame") == 0)
35134 {
35135 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
35136 flags & SECTION_WRITE ? "aw" : "a");
35137 return;
35138 }
35139
35140 #ifndef USE_GAS
35141 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
35142 {
35143 solaris_elf_asm_comdat_section (name, flags, decl);
35144 return;
35145 }
35146 #endif
35147
35148 default_elf_asm_named_section (name, flags, decl);
35149 }
35150 #endif /* TARGET_SOLARIS */
35151
35152 /* Return the mangling of TYPE if it is an extended fundamental type. */
35153
35154 static const char *
35155 ix86_mangle_type (const_tree type)
35156 {
35157 type = TYPE_MAIN_VARIANT (type);
35158
35159 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
35160 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
35161 return NULL;
35162
35163 switch (TYPE_MODE (type))
35164 {
35165 case TFmode:
35166 /* __float128 is "g". */
35167 return "g";
35168 case XFmode:
35169 /* "long double" or __float80 is "e". */
35170 return "e";
35171 default:
35172 return NULL;
35173 }
35174 }
35175
35176 /* For 32-bit code we can save PIC register setup by using
35177 __stack_chk_fail_local hidden function instead of calling
35178 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
35179 register, so it is better to call __stack_chk_fail directly. */
35180
35181 static tree ATTRIBUTE_UNUSED
35182 ix86_stack_protect_fail (void)
35183 {
35184 return TARGET_64BIT
35185 ? default_external_stack_protect_fail ()
35186 : default_hidden_stack_protect_fail ();
35187 }
35188
35189 /* Select a format to encode pointers in exception handling data. CODE
35190 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
35191 true if the symbol may be affected by dynamic relocations.
35192
35193 ??? All x86 object file formats are capable of representing this.
35194 After all, the relocation needed is the same as for the call insn.
35195 Whether or not a particular assembler allows us to enter such, I
35196 guess we'll have to see. */
35197 int
35198 asm_preferred_eh_data_format (int code, int global)
35199 {
35200 if (flag_pic)
35201 {
35202 int type = DW_EH_PE_sdata8;
35203 if (!TARGET_64BIT
35204 || ix86_cmodel == CM_SMALL_PIC
35205 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
35206 type = DW_EH_PE_sdata4;
35207 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
35208 }
35209 if (ix86_cmodel == CM_SMALL
35210 || (ix86_cmodel == CM_MEDIUM && code))
35211 return DW_EH_PE_udata4;
35212 return DW_EH_PE_absptr;
35213 }
35214 \f
35215 /* Expand copysign from SIGN to the positive value ABS_VALUE
35216 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
35217 the sign-bit. */
35218 static void
35219 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
35220 {
35221 enum machine_mode mode = GET_MODE (sign);
35222 rtx sgn = gen_reg_rtx (mode);
35223 if (mask == NULL_RTX)
35224 {
35225 enum machine_mode vmode;
35226
35227 if (mode == SFmode)
35228 vmode = V4SFmode;
35229 else if (mode == DFmode)
35230 vmode = V2DFmode;
35231 else
35232 vmode = mode;
35233
35234 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
35235 if (!VECTOR_MODE_P (mode))
35236 {
35237 /* We need to generate a scalar mode mask in this case. */
35238 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35239 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35240 mask = gen_reg_rtx (mode);
35241 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35242 }
35243 }
35244 else
35245 mask = gen_rtx_NOT (mode, mask);
35246 emit_insn (gen_rtx_SET (VOIDmode, sgn,
35247 gen_rtx_AND (mode, mask, sign)));
35248 emit_insn (gen_rtx_SET (VOIDmode, result,
35249 gen_rtx_IOR (mode, abs_value, sgn)));
35250 }
35251
35252 /* Expand fabs (OP0) and return a new rtx that holds the result. The
35253 mask for masking out the sign-bit is stored in *SMASK, if that is
35254 non-null. */
35255 static rtx
35256 ix86_expand_sse_fabs (rtx op0, rtx *smask)
35257 {
35258 enum machine_mode vmode, mode = GET_MODE (op0);
35259 rtx xa, mask;
35260
35261 xa = gen_reg_rtx (mode);
35262 if (mode == SFmode)
35263 vmode = V4SFmode;
35264 else if (mode == DFmode)
35265 vmode = V2DFmode;
35266 else
35267 vmode = mode;
35268 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
35269 if (!VECTOR_MODE_P (mode))
35270 {
35271 /* We need to generate a scalar mode mask in this case. */
35272 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
35273 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
35274 mask = gen_reg_rtx (mode);
35275 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
35276 }
35277 emit_insn (gen_rtx_SET (VOIDmode, xa,
35278 gen_rtx_AND (mode, op0, mask)));
35279
35280 if (smask)
35281 *smask = mask;
35282
35283 return xa;
35284 }
35285
35286 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
35287 swapping the operands if SWAP_OPERANDS is true. The expanded
35288 code is a forward jump to a newly created label in case the
35289 comparison is true. The generated label rtx is returned. */
35290 static rtx
35291 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
35292 bool swap_operands)
35293 {
35294 rtx label, tmp;
35295
35296 if (swap_operands)
35297 {
35298 tmp = op0;
35299 op0 = op1;
35300 op1 = tmp;
35301 }
35302
35303 label = gen_label_rtx ();
35304 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
35305 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35306 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
35307 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
35308 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
35309 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
35310 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
35311 JUMP_LABEL (tmp) = label;
35312
35313 return label;
35314 }
35315
35316 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
35317 using comparison code CODE. Operands are swapped for the comparison if
35318 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
35319 static rtx
35320 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
35321 bool swap_operands)
35322 {
35323 rtx (*insn)(rtx, rtx, rtx, rtx);
35324 enum machine_mode mode = GET_MODE (op0);
35325 rtx mask = gen_reg_rtx (mode);
35326
35327 if (swap_operands)
35328 {
35329 rtx tmp = op0;
35330 op0 = op1;
35331 op1 = tmp;
35332 }
35333
35334 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
35335
35336 emit_insn (insn (mask, op0, op1,
35337 gen_rtx_fmt_ee (code, mode, op0, op1)));
35338 return mask;
35339 }
35340
35341 /* Generate and return a rtx of mode MODE for 2**n where n is the number
35342 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
35343 static rtx
35344 ix86_gen_TWO52 (enum machine_mode mode)
35345 {
35346 REAL_VALUE_TYPE TWO52r;
35347 rtx TWO52;
35348
35349 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
35350 TWO52 = const_double_from_real_value (TWO52r, mode);
35351 TWO52 = force_reg (mode, TWO52);
35352
35353 return TWO52;
35354 }
35355
35356 /* Expand SSE sequence for computing lround from OP1 storing
35357 into OP0. */
35358 void
35359 ix86_expand_lround (rtx op0, rtx op1)
35360 {
35361 /* C code for the stuff we're doing below:
35362 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
35363 return (long)tmp;
35364 */
35365 enum machine_mode mode = GET_MODE (op1);
35366 const struct real_format *fmt;
35367 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35368 rtx adj;
35369
35370 /* load nextafter (0.5, 0.0) */
35371 fmt = REAL_MODE_FORMAT (mode);
35372 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35373 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35374
35375 /* adj = copysign (0.5, op1) */
35376 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
35377 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
35378
35379 /* adj = op1 + adj */
35380 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
35381
35382 /* op0 = (imode)adj */
35383 expand_fix (op0, adj, 0);
35384 }
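
/* A standalone sketch (kept out of the build with #if 0) of the expansion
   above: adding copysign (nextafter (0.5, 0.0), x) before the truncating
   float->integer conversion gives round-half-away-from-zero, while using
   a value one ulp below 0.5 avoids rounding an argument just below 0.5
   up to 1.  The emitted code loads that constant directly instead of
   calling nextafter.  */
#if 0
#include <math.h>

static long
lround_sketch (double x)
{
  double adj = copysign (nextafter (0.5, 0.0), x);  /* +/- (0.5 - 1 ulp) */

  return (long) (x + adj);      /* conversion truncates toward zero */
}
#endif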
35385
35386 /* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
35387 into OP0. */
35388 void
35389 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
35390 {
35391 /* C code for the stuff we're doing below (for do_floor):
35392 xi = (long)op1;
35393 xi -= (double)xi > op1 ? 1 : 0;
35394 return xi;
35395 */
35396 enum machine_mode fmode = GET_MODE (op1);
35397 enum machine_mode imode = GET_MODE (op0);
35398 rtx ireg, freg, label, tmp;
35399
35400 /* reg = (long)op1 */
35401 ireg = gen_reg_rtx (imode);
35402 expand_fix (ireg, op1, 0);
35403
35404 /* freg = (double)reg */
35405 freg = gen_reg_rtx (fmode);
35406 expand_float (freg, ireg, 0);
35407
35408 /* ireg = (freg > op1) ? ireg - 1 : ireg */
35409 label = ix86_expand_sse_compare_and_jump (UNLE,
35410 freg, op1, !do_floor);
35411 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
35412 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
35413 emit_move_insn (ireg, tmp);
35414
35415 emit_label (label);
35416 LABEL_NUSES (label) = 1;
35417
35418 emit_move_insn (op0, ireg);
35419 }
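
/* A standalone sketch (kept out of the build with #if 0) of the floor case
   above: convert to integer (truncating toward zero), convert back, and
   subtract 1 whenever the truncated value ended up above the input, i.e.
   for negative non-integral inputs.  The ceiling case adds 1 when the
   truncated value is below the input instead.  */
#if 0
static long
lfloor_sketch (double x)
{
  long xi = (long) x;           /* truncate toward zero */
  double xf = (double) xi;      /* back to floating point */

  if (xf > x)                   /* truncation went the wrong way */
    xi -= 1;
  return xi;
}
#endif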
35420
35421 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
35422 result in OPERAND0. */
35423 void
35424 ix86_expand_rint (rtx operand0, rtx operand1)
35425 {
35426 /* C code for the stuff we're doing below:
35427 xa = fabs (operand1);
35428 if (!isless (xa, 2**52))
35429 return operand1;
35430 xa = xa + 2**52 - 2**52;
35431 return copysign (xa, operand1);
35432 */
35433 enum machine_mode mode = GET_MODE (operand0);
35434 rtx res, xa, label, TWO52, mask;
35435
35436 res = gen_reg_rtx (mode);
35437 emit_move_insn (res, operand1);
35438
35439 /* xa = abs (operand1) */
35440 xa = ix86_expand_sse_fabs (res, &mask);
35441
35442 /* if (!isless (xa, TWO52)) goto label; */
35443 TWO52 = ix86_gen_TWO52 (mode);
35444 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35445
35446 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35447 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35448
35449 ix86_sse_copysign_to_positive (res, xa, res, mask);
35450
35451 emit_label (label);
35452 LABEL_NUSES (label) = 1;
35453
35454 emit_move_insn (operand0, res);
35455 }
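
/* A standalone sketch (kept out of the build with #if 0) of the TWO52
   trick used above, written for double: once |x| >= 2**52 every
   representable value is already an integer, so such inputs (and NaNs)
   are returned unchanged; otherwise adding and then subtracting 2**52
   discards the fraction bits in the current rounding mode.  Working on
   fabs (x) and copying the sign back keeps -0.0 and negative inputs
   correct.  */
#if 0
#include <math.h>

static double
rint_sketch (double x)
{
  const double two52 = 4503599627370496.0;   /* 2**52 */
  double xa = fabs (x);

  if (!(xa < two52))            /* already integral, or NaN */
    return x;
  xa = (xa + two52) - two52;    /* rounds to integer in the current mode */
  return copysign (xa, x);
}
#endif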
35456
35457 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35458 into OPERAND0. */
35459 void
35460 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
35461 {
35462 /* C code for the stuff we expand below.
35463 double xa = fabs (x), x2;
35464 if (!isless (xa, TWO52))
35465 return x;
35466 xa = xa + TWO52 - TWO52;
35467 x2 = copysign (xa, x);
35468 Compensate. Floor:
35469 if (x2 > x)
35470 x2 -= 1;
35471 Compensate. Ceil:
35472 if (x2 < x)
35473 x2 -= -1;
35474 return x2;
35475 */
35476 enum machine_mode mode = GET_MODE (operand0);
35477 rtx xa, TWO52, tmp, label, one, res, mask;
35478
35479 TWO52 = ix86_gen_TWO52 (mode);
35480
35481 /* Temporary for holding the result, initialized to the input
35482 operand to ease control flow. */
35483 res = gen_reg_rtx (mode);
35484 emit_move_insn (res, operand1);
35485
35486 /* xa = abs (operand1) */
35487 xa = ix86_expand_sse_fabs (res, &mask);
35488
35489 /* if (!isless (xa, TWO52)) goto label; */
35490 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35491
35492 /* xa = xa + TWO52 - TWO52; */
35493 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35494 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
35495
35496 /* xa = copysign (xa, operand1) */
35497 ix86_sse_copysign_to_positive (xa, xa, res, mask);
35498
35499 /* generate 1.0 or -1.0 */
35500 one = force_reg (mode,
35501 const_double_from_real_value (do_floor
35502 ? dconst1 : dconstm1, mode));
35503
35504 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35505 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35506 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35507 gen_rtx_AND (mode, one, tmp)));
35508 /* We always need to subtract here to preserve signed zero. */
35509 tmp = expand_simple_binop (mode, MINUS,
35510 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35511 emit_move_insn (res, tmp);
35512
35513 emit_label (label);
35514 LABEL_NUSES (label) = 1;
35515
35516 emit_move_insn (operand0, res);
35517 }
35518
35519 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
35520 into OPERAND0. */
35521 void
35522 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
35523 {
35524 /* C code for the stuff we expand below.
35525 double xa = fabs (x), x2;
35526 if (!isless (xa, TWO52))
35527 return x;
35528 x2 = (double)(long)x;
35529 Compensate. Floor:
35530 if (x2 > x)
35531 x2 -= 1;
35532 Compensate. Ceil:
35533 if (x2 < x)
35534 x2 += 1;
35535 if (HONOR_SIGNED_ZEROS (mode))
35536 return copysign (x2, x);
35537 return x2;
35538 */
35539 enum machine_mode mode = GET_MODE (operand0);
35540 rtx xa, xi, TWO52, tmp, label, one, res, mask;
35541
35542 TWO52 = ix86_gen_TWO52 (mode);
35543
35544 /* Temporary for holding the result, initialized to the input
35545 operand to ease control flow. */
35546 res = gen_reg_rtx (mode);
35547 emit_move_insn (res, operand1);
35548
35549 /* xa = abs (operand1) */
35550 xa = ix86_expand_sse_fabs (res, &mask);
35551
35552 /* if (!isless (xa, TWO52)) goto label; */
35553 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35554
35555 /* xa = (double)(long)x */
35556 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35557 expand_fix (xi, res, 0);
35558 expand_float (xa, xi, 0);
35559
35560 /* generate 1.0 */
35561 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35562
35563 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35564 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35565 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35566 gen_rtx_AND (mode, one, tmp)));
35567 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
35568 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35569 emit_move_insn (res, tmp);
35570
35571 if (HONOR_SIGNED_ZEROS (mode))
35572 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35573
35574 emit_label (label);
35575 LABEL_NUSES (label) = 1;
35576
35577 emit_move_insn (operand0, res);
35578 }
35579
35580 /* Expand SSE sequence for computing round from OPERAND1 storing
35581 into OPERAND0. The sequence works without relying on DImode truncation
35582 via cvttsd2siq, which is only available on 64-bit targets. */
35583 void
35584 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
35585 {
35586 /* C code for the stuff we expand below.
35587 double xa = fabs (x), xa2, x2;
35588 if (!isless (xa, TWO52))
35589 return x;
35590 Using the absolute value and copying back sign makes
35591 -0.0 -> -0.0 correct.
35592 xa2 = xa + TWO52 - TWO52;
35593 Compensate.
35594 dxa = xa2 - xa;
35595 if (dxa <= -0.5)
35596 xa2 += 1;
35597 else if (dxa > 0.5)
35598 xa2 -= 1;
35599 x2 = copysign (xa2, x);
35600 return x2;
35601 */
35602 enum machine_mode mode = GET_MODE (operand0);
35603 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
35604
35605 TWO52 = ix86_gen_TWO52 (mode);
35606
35607 /* Temporary for holding the result, initialized to the input
35608 operand to ease control flow. */
35609 res = gen_reg_rtx (mode);
35610 emit_move_insn (res, operand1);
35611
35612 /* xa = abs (operand1) */
35613 xa = ix86_expand_sse_fabs (res, &mask);
35614
35615 /* if (!isless (xa, TWO52)) goto label; */
35616 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35617
35618 /* xa2 = xa + TWO52 - TWO52; */
35619 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35620 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
35621
35622 /* dxa = xa2 - xa; */
35623 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
35624
35625 /* generate 0.5, 1.0 and -0.5 */
35626 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
35627 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
35628 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
35629 0, OPTAB_DIRECT);
35630
35631 /* Compensate. */
35632 tmp = gen_reg_rtx (mode);
35633 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
35634 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
35635 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35636 gen_rtx_AND (mode, one, tmp)));
35637 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35638 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
35639 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
35640 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35641 gen_rtx_AND (mode, one, tmp)));
35642 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35643
35644 /* res = copysign (xa2, operand1) */
35645 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
35646
35647 emit_label (label);
35648 LABEL_NUSES (label) = 1;
35649
35650 emit_move_insn (operand0, res);
35651 }
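
/* A standalone sketch (kept out of the build with #if 0) of the
   compensation above.  The TWO52 add/subtract rounds to nearest even, so
   the result can differ by one from round-half-away-from-zero; the
   difference between the rounded and the original magnitude tells which
   way to correct.  */
#if 0
#include <math.h>

static double
rounddf32_sketch (double x)
{
  const double two52 = 4503599627370496.0;   /* 2**52 */
  double xa = fabs (x);
  double xa2, dxa;

  if (!(xa < two52))            /* already integral, or NaN */
    return x;
  xa2 = (xa + two52) - two52;   /* round to nearest even */
  dxa = xa2 - xa;               /* rounding error, in [-0.5, 0.5] */
  if (dxa <= -0.5)              /* halfway case rounded down by RNE */
    xa2 += 1.0;
  else if (dxa > 0.5)           /* symmetric guard, mirroring the expander */
    xa2 -= 1.0;
  return copysign (xa2, x);     /* restore the sign, including -0.0 */
}
#endif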
35652
35653 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35654 into OPERAND0. */
35655 void
35656 ix86_expand_trunc (rtx operand0, rtx operand1)
35657 {
35658 /* C code for SSE variant we expand below.
35659 double xa = fabs (x), x2;
35660 if (!isless (xa, TWO52))
35661 return x;
35662 x2 = (double)(long)x;
35663 if (HONOR_SIGNED_ZEROS (mode))
35664 return copysign (x2, x);
35665 return x2;
35666 */
35667 enum machine_mode mode = GET_MODE (operand0);
35668 rtx xa, xi, TWO52, label, res, mask;
35669
35670 TWO52 = ix86_gen_TWO52 (mode);
35671
35672 /* Temporary for holding the result, initialized to the input
35673 operand to ease control flow. */
35674 res = gen_reg_rtx (mode);
35675 emit_move_insn (res, operand1);
35676
35677 /* xa = abs (operand1) */
35678 xa = ix86_expand_sse_fabs (res, &mask);
35679
35680 /* if (!isless (xa, TWO52)) goto label; */
35681 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35682
35683 /* x = (double)(long)x */
35684 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35685 expand_fix (xi, res, 0);
35686 expand_float (res, xi, 0);
35687
35688 if (HONOR_SIGNED_ZEROS (mode))
35689 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35690
35691 emit_label (label);
35692 LABEL_NUSES (label) = 1;
35693
35694 emit_move_insn (operand0, res);
35695 }
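
/* A standalone sketch (kept out of the build with #if 0) of the expansion
   above: truncation is the float->integer->float round trip, guarded by
   the 2**52 test because huge values are already integral and would not
   fit in the integer type.  The copysign step that preserves -0.0 when
   HONOR_SIGNED_ZEROS is omitted here.  */
#if 0
#include <math.h>

static double
trunc_sketch (double x)
{
  const double two52 = 4503599627370496.0;   /* 2**52 */

  if (!(fabs (x) < two52))      /* already integral, or NaN */
    return x;
  return (double) (long long) x;   /* truncate toward zero */
}
#endif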
35696
35697 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35698 into OPERAND0. */
35699 void
35700 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
35701 {
35702 enum machine_mode mode = GET_MODE (operand0);
35703 rtx xa, mask, TWO52, label, one, res, smask, tmp;
35704
35705 /* C code for SSE variant we expand below.
35706 double xa = fabs (x), xa2, x2;
35707 if (!isless (xa, TWO52))
35708 return x;
35709 xa2 = xa + TWO52 - TWO52;
35710 Compensate:
35711 if (xa2 > xa)
35712 xa2 -= 1.0;
35713 x2 = copysign (xa2, x);
35714 return x2;
35715 */
35716
35717 TWO52 = ix86_gen_TWO52 (mode);
35718
35719 /* Temporary for holding the result, initialized to the input
35720 operand to ease control flow. */
35721 res = gen_reg_rtx (mode);
35722 emit_move_insn (res, operand1);
35723
35724 /* xa = abs (operand1) */
35725 xa = ix86_expand_sse_fabs (res, &smask);
35726
35727 /* if (!isless (xa, TWO52)) goto label; */
35728 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35729
35730 /* res = xa + TWO52 - TWO52; */
35731 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35732 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
35733 emit_move_insn (res, tmp);
35734
35735 /* generate 1.0 */
35736 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35737
35738 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
35739 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
35740 emit_insn (gen_rtx_SET (VOIDmode, mask,
35741 gen_rtx_AND (mode, mask, one)));
35742 tmp = expand_simple_binop (mode, MINUS,
35743 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
35744 emit_move_insn (res, tmp);
35745
35746 /* res = copysign (res, operand1) */
35747 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
35748
35749 emit_label (label);
35750 LABEL_NUSES (label) = 1;
35751
35752 emit_move_insn (operand0, res);
35753 }
35754
35755 /* Expand SSE sequence for computing round from OPERAND1 storing
35756 into OPERAND0. */
35757 void
35758 ix86_expand_round (rtx operand0, rtx operand1)
35759 {
35760 /* C code for the stuff we're doing below:
35761 double xa = fabs (x);
35762 if (!isless (xa, TWO52))
35763 return x;
35764 xa = (double)(long)(xa + nextafter (0.5, 0.0));
35765 return copysign (xa, x);
35766 */
35767 enum machine_mode mode = GET_MODE (operand0);
35768 rtx res, TWO52, xa, label, xi, half, mask;
35769 const struct real_format *fmt;
35770 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35771
35772 /* Temporary for holding the result, initialized to the input
35773 operand to ease control flow. */
35774 res = gen_reg_rtx (mode);
35775 emit_move_insn (res, operand1);
35776
35777 TWO52 = ix86_gen_TWO52 (mode);
35778 xa = ix86_expand_sse_fabs (res, &mask);
35779 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35780
35781 /* load nextafter (0.5, 0.0) */
35782 fmt = REAL_MODE_FORMAT (mode);
35783 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35784 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35785
35786 /* xa = xa + 0.5 */
35787 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
35788 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
35789
35790 /* xa = (double)(int64_t)xa */
35791 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35792 expand_fix (xi, xa, 0);
35793 expand_float (xa, xi, 0);
35794
35795 /* res = copysign (xa, operand1) */
35796 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
35797
35798 emit_label (label);
35799 LABEL_NUSES (label) = 1;
35800
35801 emit_move_insn (operand0, res);
35802 }
35803
35804 /* Expand SSE sequence for computing round
35805 from OP1 storing into OP0 using sse4 round insn. */
35806 void
35807 ix86_expand_round_sse4 (rtx op0, rtx op1)
35808 {
35809 enum machine_mode mode = GET_MODE (op0);
35810 rtx e1, e2, res, half;
35811 const struct real_format *fmt;
35812 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35813 rtx (*gen_copysign) (rtx, rtx, rtx);
35814 rtx (*gen_round) (rtx, rtx, rtx);
35815
35816 switch (mode)
35817 {
35818 case SFmode:
35819 gen_copysign = gen_copysignsf3;
35820 gen_round = gen_sse4_1_roundsf2;
35821 break;
35822 case DFmode:
35823 gen_copysign = gen_copysigndf3;
35824 gen_round = gen_sse4_1_rounddf2;
35825 break;
35826 default:
35827 gcc_unreachable ();
35828 }
35829
35830 /* round (a) = trunc (a + copysign (0.5, a)) */
35831
35832 /* load nextafter (0.5, 0.0) */
35833 fmt = REAL_MODE_FORMAT (mode);
35834 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35835 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35836 half = const_double_from_real_value (pred_half, mode);
35837
35838 /* e1 = copysign (0.5, op1) */
35839 e1 = gen_reg_rtx (mode);
35840 emit_insn (gen_copysign (e1, half, op1));
35841
35842 /* e2 = op1 + e1 */
35843 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
35844
35845 /* res = trunc (e2) */
35846 res = gen_reg_rtx (mode);
35847 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
35848
35849 emit_move_insn (op0, res);
35850 }
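
/* A standalone sketch (kept out of the build with #if 0) of the SSE4.1
   expansion above: round (a) = trunc (a + copysign (0.5 - 1 ulp, a)),
   where the truncation is done by roundsd/roundss with the ROUND_TRUNC
   immediate; trunc () below only stands in for that instruction.  */
#if 0
#include <math.h>

static double
round_sse4_sketch (double a)
{
  double half = nextafter (0.5, 0.0);     /* 0.5 - 1 ulp */
  double e1 = copysign (half, a);         /* e1 = +/- (0.5 - 1 ulp) */
  double e2 = a + e1;                     /* e2 = a + e1 */

  return trunc (e2);                      /* roundsd e2, ROUND_TRUNC */
}
#endif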
35851 \f
35852
35853 /* Table of valid machine attributes. */
35854 static const struct attribute_spec ix86_attribute_table[] =
35855 {
35856 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
35857 affects_type_identity } */
35858 /* Stdcall attribute says callee is responsible for popping arguments
35859 if they are not variable. */
35860 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35861 true },
35862 /* Fastcall attribute says callee is responsible for popping arguments
35863 if they are not variable. */
35864 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35865 true },
35866 /* Thiscall attribute says callee is responsible for popping arguments
35867 if they are not variable. */
35868 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35869 true },
35870 /* Cdecl attribute says the callee is a normal C declaration */
35871 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35872 true },
35873 /* Regparm attribute specifies how many integer arguments are to be
35874 passed in registers. */
35875 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
35876 true },
35877 /* Sseregparm attribute says we are using x86_64 calling conventions
35878 for FP arguments. */
35879 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35880 true },
35881 /* The transactional memory builtins are implicitly regparm or fastcall
35882 depending on the ABI. Override the generic do-nothing attribute that
35883 these builtins were declared with. */
35884 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
35885 true },
35886 /* force_align_arg_pointer says this function realigns the stack at entry. */
35887 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
35888 false, true, true, ix86_handle_cconv_attribute, false },
35889 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
35890 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
35891 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
35892 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
35893 false },
35894 #endif
35895 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35896 false },
35897 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35898 false },
35899 #ifdef SUBTARGET_ATTRIBUTE_TABLE
35900 SUBTARGET_ATTRIBUTE_TABLE,
35901 #endif
35902 /* ms_abi and sysv_abi calling convention function attributes. */
35903 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35904 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35905 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
35906 false },
35907 { "callee_pop_aggregate_return", 1, 1, false, true, true,
35908 ix86_handle_callee_pop_aggregate_return, true },
35909 /* End element. */
35910 { NULL, 0, 0, false, false, false, NULL, false }
35911 };
35912
35913 /* Implement targetm.vectorize.builtin_vectorization_cost. */
35914 static int
35915 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
35916 tree vectype ATTRIBUTE_UNUSED,
35917 int misalign ATTRIBUTE_UNUSED)
35918 {
35919 switch (type_of_cost)
35920 {
35921 case scalar_stmt:
35922 return ix86_cost->scalar_stmt_cost;
35923
35924 case scalar_load:
35925 return ix86_cost->scalar_load_cost;
35926
35927 case scalar_store:
35928 return ix86_cost->scalar_store_cost;
35929
35930 case vector_stmt:
35931 return ix86_cost->vec_stmt_cost;
35932
35933 case vector_load:
35934 return ix86_cost->vec_align_load_cost;
35935
35936 case vector_store:
35937 return ix86_cost->vec_store_cost;
35938
35939 case vec_to_scalar:
35940 return ix86_cost->vec_to_scalar_cost;
35941
35942 case scalar_to_vec:
35943 return ix86_cost->scalar_to_vec_cost;
35944
35945 case unaligned_load:
35946 case unaligned_store:
35947 return ix86_cost->vec_unalign_load_cost;
35948
35949 case cond_branch_taken:
35950 return ix86_cost->cond_taken_branch_cost;
35951
35952 case cond_branch_not_taken:
35953 return ix86_cost->cond_not_taken_branch_cost;
35954
35955 case vec_perm:
35956 return 1;
35957
35958 default:
35959 gcc_unreachable ();
35960 }
35961 }
35962
35963
35964 /* Return a vector mode with twice as many elements as VMODE. */
35965 /* ??? Consider moving this to a table generated by genmodes.c. */
35966
35967 static enum machine_mode
35968 doublesize_vector_mode (enum machine_mode vmode)
35969 {
35970 switch (vmode)
35971 {
35972 case V2SFmode: return V4SFmode;
35973 case V1DImode: return V2DImode;
35974 case V2SImode: return V4SImode;
35975 case V4HImode: return V8HImode;
35976 case V8QImode: return V16QImode;
35977
35978 case V2DFmode: return V4DFmode;
35979 case V4SFmode: return V8SFmode;
35980 case V2DImode: return V4DImode;
35981 case V4SImode: return V8SImode;
35982 case V8HImode: return V16HImode;
35983 case V16QImode: return V32QImode;
35984
35985 case V4DFmode: return V8DFmode;
35986 case V8SFmode: return V16SFmode;
35987 case V4DImode: return V8DImode;
35988 case V8SImode: return V16SImode;
35989 case V16HImode: return V32HImode;
35990 case V32QImode: return V64QImode;
35991
35992 default:
35993 gcc_unreachable ();
35994 }
35995 }
35996
35997 /* Construct (set target (vec_select op0 (parallel perm))) and
35998 return true if that's a valid instruction in the active ISA. */
35999
36000 static bool
36001 expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
36002 {
36003 rtx rperm[MAX_VECT_LEN], x;
36004 unsigned i;
36005
36006 for (i = 0; i < nelt; ++i)
36007 rperm[i] = GEN_INT (perm[i]);
36008
36009 x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
36010 x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
36011 x = gen_rtx_SET (VOIDmode, target, x);
36012
36013 x = emit_insn (x);
36014 if (recog_memoized (x) < 0)
36015 {
36016 remove_insn (x);
36017 return false;
36018 }
36019 return true;
36020 }
36021
36022 /* Similar, but generate a vec_concat from op0 and op1 as well. */
36023
36024 static bool
36025 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
36026 const unsigned char *perm, unsigned nelt)
36027 {
36028 enum machine_mode v2mode;
36029 rtx x;
36030
36031 v2mode = doublesize_vector_mode (GET_MODE (op0));
36032 x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
36033 return expand_vselect (target, x, perm, nelt);
36034 }
36035
36036 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36037 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
36038
36039 static bool
36040 expand_vec_perm_blend (struct expand_vec_perm_d *d)
36041 {
36042 enum machine_mode vmode = d->vmode;
36043 unsigned i, mask, nelt = d->nelt;
36044 rtx target, op0, op1, x;
36045 rtx rperm[32], vperm;
36046
36047 if (d->op0 == d->op1)
36048 return false;
36049 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
36050 ;
36051 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
36052 ;
36053 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
36054 ;
36055 else
36056 return false;
36057
36058 /* This is a blend, not a permute. Elements must stay in their
36059 respective lanes. */
36060 for (i = 0; i < nelt; ++i)
36061 {
36062 unsigned e = d->perm[i];
36063 if (!(e == i || e == i + nelt))
36064 return false;
36065 }
36066
36067 if (d->testing_p)
36068 return true;
36069
36070 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
36071 decision should be extracted elsewhere, so that we only try that
36072 sequence once all budget==3 options have been tried. */
36073 target = d->target;
36074 op0 = d->op0;
36075 op1 = d->op1;
36076 mask = 0;
36077
36078 switch (vmode)
36079 {
36080 case V4DFmode:
36081 case V8SFmode:
36082 case V2DFmode:
36083 case V4SFmode:
36084 case V8HImode:
36085 case V8SImode:
36086 for (i = 0; i < nelt; ++i)
36087 mask |= (d->perm[i] >= nelt) << i;
36088 break;
36089
36090 case V2DImode:
36091 for (i = 0; i < 2; ++i)
36092 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
36093 vmode = V8HImode;
36094 goto do_subreg;
36095
36096 case V4SImode:
36097 for (i = 0; i < 4; ++i)
36098 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
36099 vmode = V8HImode;
36100 goto do_subreg;
36101
36102 case V16QImode:
36103 /* See if bytes move in pairs so we can use pblendw with
36104 an immediate argument, rather than pblendvb with a vector
36105 argument. */
36106 for (i = 0; i < 16; i += 2)
36107 if (d->perm[i] + 1 != d->perm[i + 1])
36108 {
36109 use_pblendvb:
36110 for (i = 0; i < nelt; ++i)
36111 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
36112
36113 finish_pblendvb:
36114 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
36115 vperm = force_reg (vmode, vperm);
36116
36117 if (GET_MODE_SIZE (vmode) == 16)
36118 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
36119 else
36120 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
36121 return true;
36122 }
36123
36124 for (i = 0; i < 8; ++i)
36125 mask |= (d->perm[i * 2] >= 16) << i;
36126 vmode = V8HImode;
36127 /* FALLTHRU */
36128
36129 do_subreg:
36130 target = gen_lowpart (vmode, target);
36131 op0 = gen_lowpart (vmode, op0);
36132 op1 = gen_lowpart (vmode, op1);
36133 break;
36134
36135 case V32QImode:
36136 /* See if bytes move in pairs. If not, vpblendvb must be used. */
36137 for (i = 0; i < 32; i += 2)
36138 if (d->perm[i] + 1 != d->perm[i + 1])
36139 goto use_pblendvb;
36140 /* See if bytes move in quadruplets. If yes, vpblendd
36141 with immediate can be used. */
36142 for (i = 0; i < 32; i += 4)
36143 if (d->perm[i] + 2 != d->perm[i + 2])
36144 break;
36145 if (i < 32)
36146 {
36147 /* See if bytes move the same in both lanes. If yes,
36148 vpblendw with immediate can be used. */
36149 for (i = 0; i < 16; i += 2)
36150 if (d->perm[i] + 16 != d->perm[i + 16])
36151 goto use_pblendvb;
36152
36153 /* Use vpblendw. */
36154 for (i = 0; i < 16; ++i)
36155 mask |= (d->perm[i * 2] >= 32) << i;
36156 vmode = V16HImode;
36157 goto do_subreg;
36158 }
36159
36160 /* Use vpblendd. */
36161 for (i = 0; i < 8; ++i)
36162 mask |= (d->perm[i * 4] >= 32) << i;
36163 vmode = V8SImode;
36164 goto do_subreg;
36165
36166 case V16HImode:
36167 /* See if words move in pairs. If yes, vpblendd can be used. */
36168 for (i = 0; i < 16; i += 2)
36169 if (d->perm[i] + 1 != d->perm[i + 1])
36170 break;
36171 if (i < 16)
36172 {
36173 /* See if words move the same in both lanes. If not,
36174 vpblendvb must be used. */
36175 for (i = 0; i < 8; i++)
36176 if (d->perm[i] + 8 != d->perm[i + 8])
36177 {
36178 /* Use vpblendvb. */
36179 for (i = 0; i < 32; ++i)
36180 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
36181
36182 vmode = V32QImode;
36183 nelt = 32;
36184 target = gen_lowpart (vmode, target);
36185 op0 = gen_lowpart (vmode, op0);
36186 op1 = gen_lowpart (vmode, op1);
36187 goto finish_pblendvb;
36188 }
36189
36190 /* Use vpblendw. */
36191 for (i = 0; i < 16; ++i)
36192 mask |= (d->perm[i] >= 16) << i;
36193 break;
36194 }
36195
36196 /* Use vpblendd. */
36197 for (i = 0; i < 8; ++i)
36198 mask |= (d->perm[i * 2] >= 16) << i;
36199 vmode = V8SImode;
36200 goto do_subreg;
36201
36202 case V4DImode:
36203 /* Use vpblendd. */
36204 for (i = 0; i < 4; ++i)
36205 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
36206 vmode = V8SImode;
36207 goto do_subreg;
36208
36209 default:
36210 gcc_unreachable ();
36211 }
36212
36213 /* This matches five different patterns with the different modes. */
36214 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
36215 x = gen_rtx_SET (VOIDmode, target, x);
36216 emit_insn (x);
36217
36218 return true;
36219 }
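
/* A worked example (kept out of the build with #if 0) of the immediate
   mask computed above for V8HImode: with the hypothetical permutation
   { 0, 9, 2, 11, 4, 13, 6, 15 }, elements 1, 3, 5 and 7 come from op1
   (perm[i] >= nelt), so the pblendw immediate is 0xaa.  */
#if 0
static unsigned
blend_mask_example (void)
{
  static const unsigned char perm[8] = { 0, 9, 2, 11, 4, 13, 6, 15 };
  unsigned mask = 0, i;

  for (i = 0; i < 8; ++i)
    mask |= (perm[i] >= 8) << i;   /* set bit i when element i is from op1 */
  return mask;                     /* 0xaa */
}
#endif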
36220
36221 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36222 in terms of the variable form of vpermilps.
36223
36224 Note that we will have already failed the immediate input vpermilps,
36225 which requires that the high and low part shuffle be identical; the
36226 variable form doesn't require that. */
36227
36228 static bool
36229 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
36230 {
36231 rtx rperm[8], vperm;
36232 unsigned i;
36233
36234 if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
36235 return false;
36236
36237 /* We can only permute within the 128-bit lane. */
36238 for (i = 0; i < 8; ++i)
36239 {
36240 unsigned e = d->perm[i];
36241 if (i < 4 ? e >= 4 : e < 4)
36242 return false;
36243 }
36244
36245 if (d->testing_p)
36246 return true;
36247
36248 for (i = 0; i < 8; ++i)
36249 {
36250 unsigned e = d->perm[i];
36251
36252 /* Within each 128-bit lane, the elements of op0 are numbered
36253 from 0 and the elements of op1 are numbered from 4. */
36254 if (e >= 8 + 4)
36255 e -= 8;
36256 else if (e >= 4)
36257 e -= 4;
36258
36259 rperm[i] = GEN_INT (e);
36260 }
36261
36262 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
36263 vperm = force_reg (V8SImode, vperm);
36264 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
36265
36266 return true;
36267 }
36268
36269 /* Return true if permutation D can be performed as VMODE permutation
36270 instead. */
36271
36272 static bool
36273 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
36274 {
36275 unsigned int i, j, chunk;
36276
36277 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
36278 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
36279 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
36280 return false;
36281
36282 if (GET_MODE_NUNITS (vmode) >= d->nelt)
36283 return true;
36284
36285 chunk = d->nelt / GET_MODE_NUNITS (vmode);
36286 for (i = 0; i < d->nelt; i += chunk)
36287 if (d->perm[i] & (chunk - 1))
36288 return false;
36289 else
36290 for (j = 1; j < chunk; ++j)
36291 if (d->perm[i] + j != d->perm[i + j])
36292 return false;
36293
36294 return true;
36295 }
36296
36297 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36298 in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128. */
36299
36300 static bool
36301 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
36302 {
36303 unsigned i, nelt, eltsz, mask;
36304 unsigned char perm[32];
36305 enum machine_mode vmode = V16QImode;
36306 rtx rperm[32], vperm, target, op0, op1;
36307
36308 nelt = d->nelt;
36309
36310 if (d->op0 != d->op1)
36311 {
36312 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
36313 {
36314 if (TARGET_AVX2
36315 && valid_perm_using_mode_p (V2TImode, d))
36316 {
36317 if (d->testing_p)
36318 return true;
36319
36320 /* Use vperm2i128 insn. The pattern uses
36321 V4DImode instead of V2TImode. */
36322 target = gen_lowpart (V4DImode, d->target);
36323 op0 = gen_lowpart (V4DImode, d->op0);
36324 op1 = gen_lowpart (V4DImode, d->op1);
36325 rperm[0]
36326 = GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
36327 || ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
36328 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
36329 return true;
36330 }
36331 return false;
36332 }
36333 }
36334 else
36335 {
36336 if (GET_MODE_SIZE (d->vmode) == 16)
36337 {
36338 if (!TARGET_SSSE3)
36339 return false;
36340 }
36341 else if (GET_MODE_SIZE (d->vmode) == 32)
36342 {
36343 if (!TARGET_AVX2)
36344 return false;
36345
36346 /* V4DImode should be already handled through
36347 expand_vselect by vpermq instruction. */
36348 gcc_assert (d->vmode != V4DImode);
36349
36350 vmode = V32QImode;
36351 if (d->vmode == V8SImode
36352 || d->vmode == V16HImode
36353 || d->vmode == V32QImode)
36354 {
36355 /* First see if vpermq can be used for
36356 V8SImode/V16HImode/V32QImode. */
36357 if (valid_perm_using_mode_p (V4DImode, d))
36358 {
36359 for (i = 0; i < 4; i++)
36360 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
36361 if (d->testing_p)
36362 return true;
36363 return expand_vselect (gen_lowpart (V4DImode, d->target),
36364 gen_lowpart (V4DImode, d->op0),
36365 perm, 4);
36366 }
36367
36368 /* Next see if vpermd can be used. */
36369 if (valid_perm_using_mode_p (V8SImode, d))
36370 vmode = V8SImode;
36371 }
36372
36373 if (vmode == V32QImode)
36374 {
36375 /* vpshufb only works within 128-bit lanes; it cannot
36376 shuffle bytes between the lanes. */
36377 for (i = 0; i < nelt; ++i)
36378 if ((d->perm[i] ^ i) & (nelt / 2))
36379 return false;
36380 }
36381 }
36382 else
36383 return false;
36384 }
36385
36386 if (d->testing_p)
36387 return true;
36388
36389 if (vmode == V8SImode)
36390 for (i = 0; i < 8; ++i)
36391 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
36392 else
36393 {
36394 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36395 if (d->op0 != d->op1)
36396 mask = 2 * nelt - 1;
36397 else if (vmode == V16QImode)
36398 mask = nelt - 1;
36399 else
36400 mask = nelt / 2 - 1;
36401
36402 for (i = 0; i < nelt; ++i)
36403 {
36404 unsigned j, e = d->perm[i] & mask;
36405 for (j = 0; j < eltsz; ++j)
36406 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
36407 }
36408 }
36409
36410 vperm = gen_rtx_CONST_VECTOR (vmode,
36411 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
36412 vperm = force_reg (vmode, vperm);
36413
36414 target = gen_lowpart (vmode, d->target);
36415 op0 = gen_lowpart (vmode, d->op0);
36416 if (d->op0 == d->op1)
36417 {
36418 if (vmode == V16QImode)
36419 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
36420 else if (vmode == V32QImode)
36421 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
36422 else
36423 emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
36424 }
36425 else
36426 {
36427 op1 = gen_lowpart (vmode, d->op1);
36428 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
36429 }
36430
36431 return true;
36432 }
36433
36434 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
36435 in a single instruction. */
36436
36437 static bool
36438 expand_vec_perm_1 (struct expand_vec_perm_d *d)
36439 {
36440 unsigned i, nelt = d->nelt;
36441 unsigned char perm2[MAX_VECT_LEN];
36442
36443 /* Check plain VEC_SELECT first, because AVX has instructions that could
36444 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
36445 input where SEL+CONCAT may not. */
36446 if (d->op0 == d->op1)
36447 {
36448 int mask = nelt - 1;
36449 bool identity_perm = true;
36450 bool broadcast_perm = true;
36451
36452 for (i = 0; i < nelt; i++)
36453 {
36454 perm2[i] = d->perm[i] & mask;
36455 if (perm2[i] != i)
36456 identity_perm = false;
36457 if (perm2[i])
36458 broadcast_perm = false;
36459 }
36460
36461 if (identity_perm)
36462 {
36463 if (!d->testing_p)
36464 emit_move_insn (d->target, d->op0);
36465 return true;
36466 }
36467 else if (broadcast_perm && TARGET_AVX2)
36468 {
36469 /* Use vpbroadcast{b,w,d}. */
36470 rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
36471 switch (d->vmode)
36472 {
36473 case V32QImode:
36474 op = gen_lowpart (V16QImode, op);
36475 gen = gen_avx2_pbroadcastv32qi;
36476 break;
36477 case V16HImode:
36478 op = gen_lowpart (V8HImode, op);
36479 gen = gen_avx2_pbroadcastv16hi;
36480 break;
36481 case V8SImode:
36482 op = gen_lowpart (V4SImode, op);
36483 gen = gen_avx2_pbroadcastv8si;
36484 break;
36485 case V16QImode:
36486 gen = gen_avx2_pbroadcastv16qi;
36487 break;
36488 case V8HImode:
36489 gen = gen_avx2_pbroadcastv8hi;
36490 break;
36491 /* For other modes prefer other shuffles this function creates. */
36492 default: break;
36493 }
36494 if (gen != NULL)
36495 {
36496 if (!d->testing_p)
36497 emit_insn (gen (d->target, op));
36498 return true;
36499 }
36500 }
36501
36502 if (expand_vselect (d->target, d->op0, perm2, nelt))
36503 return true;
36504
36505 /* There are plenty of patterns in sse.md that are written for
36506 SEL+CONCAT and are not replicated for a single op. Perhaps
36507 that should be changed, to avoid the nastiness here. */
36508
36509 /* Recognize interleave style patterns, which means incrementing
36510 every other permutation operand. */
36511 for (i = 0; i < nelt; i += 2)
36512 {
36513 perm2[i] = d->perm[i] & mask;
36514 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
36515 }
36516 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
36517 return true;
36518
36519 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
36520 if (nelt >= 4)
36521 {
36522 for (i = 0; i < nelt; i += 4)
36523 {
36524 perm2[i + 0] = d->perm[i + 0] & mask;
36525 perm2[i + 1] = d->perm[i + 1] & mask;
36526 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
36527 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
36528 }
36529
36530 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
36531 return true;
36532 }
36533 }
36534
36535 /* Finally, try the fully general two operand permute. */
36536 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
36537 return true;
36538
36539 /* Recognize interleave style patterns with reversed operands. */
36540 if (d->op0 != d->op1)
36541 {
36542 for (i = 0; i < nelt; ++i)
36543 {
36544 unsigned e = d->perm[i];
36545 if (e >= nelt)
36546 e -= nelt;
36547 else
36548 e += nelt;
36549 perm2[i] = e;
36550 }
36551
36552 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
36553 return true;
36554 }
36555
36556 /* Try the SSE4.1 blend variable merge instructions. */
36557 if (expand_vec_perm_blend (d))
36558 return true;
36559
36560 /* Try one of the AVX vpermil variable permutations. */
36561 if (expand_vec_perm_vpermil (d))
36562 return true;
36563
36564 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
36565 vpshufb, vpermd or vpermq variable permutation. */
36566 if (expand_vec_perm_pshufb (d))
36567 return true;
36568
36569 return false;
36570 }
36571
36572 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
36573 in terms of a pair of pshuflw + pshufhw instructions. */
36574
36575 static bool
36576 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
36577 {
36578 unsigned char perm2[MAX_VECT_LEN];
36579 unsigned i;
36580 bool ok;
36581
36582 if (d->vmode != V8HImode || d->op0 != d->op1)
36583 return false;
36584
36585 /* The two permutations only operate in 64-bit lanes. */
36586 for (i = 0; i < 4; ++i)
36587 if (d->perm[i] >= 4)
36588 return false;
36589 for (i = 4; i < 8; ++i)
36590 if (d->perm[i] < 4)
36591 return false;
36592
36593 if (d->testing_p)
36594 return true;
36595
36596 /* Emit the pshuflw. */
36597 memcpy (perm2, d->perm, 4);
36598 for (i = 4; i < 8; ++i)
36599 perm2[i] = i;
36600 ok = expand_vselect (d->target, d->op0, perm2, 8);
36601 gcc_assert (ok);
36602
36603 /* Emit the pshufhw. */
36604 memcpy (perm2 + 4, d->perm + 4, 4);
36605 for (i = 0; i < 4; ++i)
36606 perm2[i] = i;
36607 ok = expand_vselect (d->target, d->target, perm2, 8);
36608 gcc_assert (ok);
36609
36610 return true;
36611 }
36612
36613 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36614 the permutation using the SSSE3 palignr instruction. This succeeds
36615 when all of the elements in PERM fit within one vector and we merely
36616 need to shift them down so that a single vector permutation has a
36617 chance to succeed. */
36618
36619 static bool
36620 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
36621 {
36622 unsigned i, nelt = d->nelt;
36623 unsigned min, max;
36624 bool in_order, ok;
36625 rtx shift;
36626
36627 /* Even with AVX, palignr only operates on 128-bit vectors. */
36628 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36629 return false;
36630
36631 min = nelt, max = 0;
36632 for (i = 0; i < nelt; ++i)
36633 {
36634 unsigned e = d->perm[i];
36635 if (e < min)
36636 min = e;
36637 if (e > max)
36638 max = e;
36639 }
36640 if (min == 0 || max - min >= nelt)
36641 return false;
36642
36643 /* Given that we have SSSE3, we know we'll be able to implement the
36644 single operand permutation after the palignr with pshufb. */
36645 if (d->testing_p)
36646 return true;
36647
36648 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
36649 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
36650 gen_lowpart (TImode, d->op1),
36651 gen_lowpart (TImode, d->op0), shift));
36652
36653 d->op0 = d->op1 = d->target;
36654
36655 in_order = true;
36656 for (i = 0; i < nelt; ++i)
36657 {
36658 unsigned e = d->perm[i] - min;
36659 if (e != i)
36660 in_order = false;
36661 d->perm[i] = e;
36662 }
36663
36664 /* Test for the degenerate case where the alignment by itself
36665 produces the desired permutation. */
36666 if (in_order)
36667 return true;
36668
36669 ok = expand_vec_perm_1 (d);
36670 gcc_assert (ok);
36671
36672 return ok;
36673 }
36674
36675 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36676 a two vector permutation into a single vector permutation by using
36677 an interleave operation to merge the vectors. */
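/* The idea is to first gather all referenced elements of the two inputs
   into a single register with a punpck, shufps or vperm2f128 style
   shuffle (DREMAP below), and then let a one-register permutation
   (DFINAL) put them into the requested order. */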
36678
36679 static bool
36680 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
36681 {
36682 struct expand_vec_perm_d dremap, dfinal;
36683 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
36684 unsigned HOST_WIDE_INT contents;
36685 unsigned char remap[2 * MAX_VECT_LEN];
36686 rtx seq;
36687 bool ok, same_halves = false;
36688
36689 if (GET_MODE_SIZE (d->vmode) == 16)
36690 {
36691 if (d->op0 == d->op1)
36692 return false;
36693 }
36694 else if (GET_MODE_SIZE (d->vmode) == 32)
36695 {
36696 if (!TARGET_AVX)
36697 return false;
36698 /* For 32-byte modes allow even d->op0 == d->op1.
36699 The lack of cross-lane shuffling in some instructions
36700 might prevent a single insn shuffle. */
36701 }
36702 else
36703 return false;
36704
36705 /* Examine from whence the elements come. */
36706 contents = 0;
36707 for (i = 0; i < nelt; ++i)
36708 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
36709
36710 memset (remap, 0xff, sizeof (remap));
36711 dremap = *d;
36712
36713 if (GET_MODE_SIZE (d->vmode) == 16)
36714 {
36715 unsigned HOST_WIDE_INT h1, h2, h3, h4;
36716
36717 /* Split the two input vectors into 4 halves. */
36718 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
36719 h2 = h1 << nelt2;
36720 h3 = h2 << nelt2;
36721 h4 = h3 << nelt2;
36722
36723 /* If the elements are from the low halves, use interleave low; similarly
36724 for interleave high. If the elements are from mis-matched halves, we
36725 can use shufps for V4SF/V4SI or do a DImode shuffle. */
36726 if ((contents & (h1 | h3)) == contents)
36727 {
36728 /* punpckl* */
36729 for (i = 0; i < nelt2; ++i)
36730 {
36731 remap[i] = i * 2;
36732 remap[i + nelt] = i * 2 + 1;
36733 dremap.perm[i * 2] = i;
36734 dremap.perm[i * 2 + 1] = i + nelt;
36735 }
36736 if (!TARGET_SSE2 && d->vmode == V4SImode)
36737 dremap.vmode = V4SFmode;
36738 }
36739 else if ((contents & (h2 | h4)) == contents)
36740 {
36741 /* punpckh* */
36742 for (i = 0; i < nelt2; ++i)
36743 {
36744 remap[i + nelt2] = i * 2;
36745 remap[i + nelt + nelt2] = i * 2 + 1;
36746 dremap.perm[i * 2] = i + nelt2;
36747 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
36748 }
36749 if (!TARGET_SSE2 && d->vmode == V4SImode)
36750 dremap.vmode = V4SFmode;
36751 }
36752 else if ((contents & (h1 | h4)) == contents)
36753 {
36754 /* shufps */
36755 for (i = 0; i < nelt2; ++i)
36756 {
36757 remap[i] = i;
36758 remap[i + nelt + nelt2] = i + nelt2;
36759 dremap.perm[i] = i;
36760 dremap.perm[i + nelt2] = i + nelt + nelt2;
36761 }
36762 if (nelt != 4)
36763 {
36764 /* shufpd */
36765 dremap.vmode = V2DImode;
36766 dremap.nelt = 2;
36767 dremap.perm[0] = 0;
36768 dremap.perm[1] = 3;
36769 }
36770 }
36771 else if ((contents & (h2 | h3)) == contents)
36772 {
36773 /* shufps */
36774 for (i = 0; i < nelt2; ++i)
36775 {
36776 remap[i + nelt2] = i;
36777 remap[i + nelt] = i + nelt2;
36778 dremap.perm[i] = i + nelt2;
36779 dremap.perm[i + nelt2] = i + nelt;
36780 }
36781 if (nelt != 4)
36782 {
36783 /* shufpd */
36784 dremap.vmode = V2DImode;
36785 dremap.nelt = 2;
36786 dremap.perm[0] = 1;
36787 dremap.perm[1] = 2;
36788 }
36789 }
36790 else
36791 return false;
36792 }
36793 else
36794 {
36795 unsigned int nelt4 = nelt / 4, nzcnt = 0;
36796 unsigned HOST_WIDE_INT q[8];
36797 unsigned int nonzero_halves[4];
36798
36799 /* Split the two input vectors into 8 quarters. */
36800 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
36801 for (i = 1; i < 8; ++i)
36802 q[i] = q[0] << (nelt4 * i);
36803 for (i = 0; i < 4; ++i)
36804 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
36805 {
36806 nonzero_halves[nzcnt] = i;
36807 ++nzcnt;
36808 }
36809
36810 if (nzcnt == 1)
36811 {
36812 gcc_assert (d->op0 == d->op1);
36813 nonzero_halves[1] = nonzero_halves[0];
36814 same_halves = true;
36815 }
36816 else if (d->op0 == d->op1)
36817 {
36818 gcc_assert (nonzero_halves[0] == 0);
36819 gcc_assert (nonzero_halves[1] == 1);
36820 }
36821
36822 if (nzcnt <= 2)
36823 {
36824 if (d->perm[0] / nelt2 == nonzero_halves[1])
36825 {
36826 /* Attempt to increase the likelihood that dfinal
36827 shuffle will be intra-lane. */
36828 char tmph = nonzero_halves[0];
36829 nonzero_halves[0] = nonzero_halves[1];
36830 nonzero_halves[1] = tmph;
36831 }
36832
36833 /* vperm2f128 or vperm2i128. */
36834 for (i = 0; i < nelt2; ++i)
36835 {
36836 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
36837 remap[i + nonzero_halves[0] * nelt2] = i;
36838 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
36839 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
36840 }
36841
36842 if (d->vmode != V8SFmode
36843 && d->vmode != V4DFmode
36844 && d->vmode != V8SImode)
36845 {
36846 dremap.vmode = V8SImode;
36847 dremap.nelt = 8;
36848 for (i = 0; i < 4; ++i)
36849 {
36850 dremap.perm[i] = i + nonzero_halves[0] * 4;
36851 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
36852 }
36853 }
36854 }
36855 else if (d->op0 == d->op1)
36856 return false;
36857 else if (TARGET_AVX2
36858 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
36859 {
36860 /* vpunpckl* */
36861 for (i = 0; i < nelt4; ++i)
36862 {
36863 remap[i] = i * 2;
36864 remap[i + nelt] = i * 2 + 1;
36865 remap[i + nelt2] = i * 2 + nelt2;
36866 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
36867 dremap.perm[i * 2] = i;
36868 dremap.perm[i * 2 + 1] = i + nelt;
36869 dremap.perm[i * 2 + nelt2] = i + nelt2;
36870 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
36871 }
36872 }
36873 else if (TARGET_AVX2
36874 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
36875 {
36876 /* vpunpckh* */
36877 for (i = 0; i < nelt4; ++i)
36878 {
36879 remap[i + nelt4] = i * 2;
36880 remap[i + nelt + nelt4] = i * 2 + 1;
36881 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
36882 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
36883 dremap.perm[i * 2] = i + nelt4;
36884 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
36885 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
36886 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
36887 }
36888 }
36889 else
36890 return false;
36891 }
36892
36893 /* Use the remapping array set up above to move the elements from their
36894 swizzled locations into their final destinations. */
36895 dfinal = *d;
36896 for (i = 0; i < nelt; ++i)
36897 {
36898 unsigned e = remap[d->perm[i]];
36899 gcc_assert (e < nelt);
36900 /* If same_halves is true, both halves of the remapped vector are the
36901 same. Avoid cross-lane accesses if possible. */
36902 if (same_halves && i >= nelt2)
36903 {
36904 gcc_assert (e < nelt2);
36905 dfinal.perm[i] = e + nelt2;
36906 }
36907 else
36908 dfinal.perm[i] = e;
36909 }
36910 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
36911 dfinal.op1 = dfinal.op0;
36912 dremap.target = dfinal.op0;
36913
36914 /* Test if the final remap can be done with a single insn. For V4SFmode or
36915 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
36916 start_sequence ();
36917 ok = expand_vec_perm_1 (&dfinal);
36918 seq = get_insns ();
36919 end_sequence ();
36920
36921 if (!ok)
36922 return false;
36923
36924 if (d->testing_p)
36925 return true;
36926
36927 if (dremap.vmode != dfinal.vmode)
36928 {
36929 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
36930 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
36931 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
36932 }
36933
36934 ok = expand_vec_perm_1 (&dremap);
36935 gcc_assert (ok);
36936
36937 emit_insn (seq);
36938 return true;
36939 }
36940
36941 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36942 a single vector cross-lane permutation into vpermq followed
36943 by any of the single insn permutations. */
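/* The vpermq step gathers the at most two 64-bit chunks referenced by
   each half of the permutation into the corresponding 128-bit lane, so
   that the following single-insn permutation only needs intra-lane
   moves. */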
36944
36945 static bool
36946 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
36947 {
36948 struct expand_vec_perm_d dremap, dfinal;
36949 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
36950 unsigned contents[2];
36951 bool ok;
36952
36953 if (!(TARGET_AVX2
36954 && (d->vmode == V32QImode || d->vmode == V16HImode)
36955 && d->op0 == d->op1))
36956 return false;
36957
36958 contents[0] = 0;
36959 contents[1] = 0;
36960 for (i = 0; i < nelt2; ++i)
36961 {
36962 contents[0] |= 1u << (d->perm[i] / nelt4);
36963 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
36964 }
36965
36966 for (i = 0; i < 2; ++i)
36967 {
36968 unsigned int cnt = 0;
36969 for (j = 0; j < 4; ++j)
36970 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
36971 return false;
36972 }
36973
36974 if (d->testing_p)
36975 return true;
36976
36977 dremap = *d;
36978 dremap.vmode = V4DImode;
36979 dremap.nelt = 4;
36980 dremap.target = gen_reg_rtx (V4DImode);
36981 dremap.op0 = gen_lowpart (V4DImode, d->op0);
36982 dremap.op1 = dremap.op0;
36983 for (i = 0; i < 2; ++i)
36984 {
36985 unsigned int cnt = 0;
36986 for (j = 0; j < 4; ++j)
36987 if ((contents[i] & (1u << j)) != 0)
36988 dremap.perm[2 * i + cnt++] = j;
36989 for (; cnt < 2; ++cnt)
36990 dremap.perm[2 * i + cnt] = 0;
36991 }
36992
36993 dfinal = *d;
36994 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
36995 dfinal.op1 = dfinal.op0;
36996 for (i = 0, j = 0; i < nelt; ++i)
36997 {
36998 if (i == nelt2)
36999 j = 2;
37000 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
37001 if ((d->perm[i] / nelt4) == dremap.perm[j])
37002 ;
37003 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
37004 dfinal.perm[i] |= nelt4;
37005 else
37006 gcc_unreachable ();
37007 }
37008
37009 ok = expand_vec_perm_1 (&dremap);
37010 gcc_assert (ok);
37011
37012 ok = expand_vec_perm_1 (&dfinal);
37013 gcc_assert (ok);
37014
37015 return true;
37016 }
37017
37018 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
37019 a two vector permutation using 2 intra-lane interleave insns
37020 and cross-lane shuffle for 32-byte vectors. */
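/* This matches permutations of the form { k, k+N, k+1, k+1+N, ... },
   where N is the number of elements and k is either 0 or N/2, i.e.
   exactly the patterns produced by the interleave low and interleave
   high instructions on 32-byte vectors. */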
37021
37022 static bool
37023 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
37024 {
37025 unsigned i, nelt;
37026 rtx (*gen) (rtx, rtx, rtx);
37027
37028 if (d->op0 == d->op1)
37029 return false;
37030 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
37031 ;
37032 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
37033 ;
37034 else
37035 return false;
37036
37037 nelt = d->nelt;
37038 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
37039 return false;
37040 for (i = 0; i < nelt; i += 2)
37041 if (d->perm[i] != d->perm[0] + i / 2
37042 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
37043 return false;
37044
37045 if (d->testing_p)
37046 return true;
37047
37048 switch (d->vmode)
37049 {
37050 case V32QImode:
37051 if (d->perm[0])
37052 gen = gen_vec_interleave_highv32qi;
37053 else
37054 gen = gen_vec_interleave_lowv32qi;
37055 break;
37056 case V16HImode:
37057 if (d->perm[0])
37058 gen = gen_vec_interleave_highv16hi;
37059 else
37060 gen = gen_vec_interleave_lowv16hi;
37061 break;
37062 case V8SImode:
37063 if (d->perm[0])
37064 gen = gen_vec_interleave_highv8si;
37065 else
37066 gen = gen_vec_interleave_lowv8si;
37067 break;
37068 case V4DImode:
37069 if (d->perm[0])
37070 gen = gen_vec_interleave_highv4di;
37071 else
37072 gen = gen_vec_interleave_lowv4di;
37073 break;
37074 case V8SFmode:
37075 if (d->perm[0])
37076 gen = gen_vec_interleave_highv8sf;
37077 else
37078 gen = gen_vec_interleave_lowv8sf;
37079 break;
37080 case V4DFmode:
37081 if (d->perm[0])
37082 gen = gen_vec_interleave_highv4df;
37083 else
37084 gen = gen_vec_interleave_lowv4df;
37085 break;
37086 default:
37087 gcc_unreachable ();
37088 }
37089
37090 emit_insn (gen (d->target, d->op0, d->op1));
37091 return true;
37092 }
37093
37094 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
37095 permutation with two pshufb insns and an ior. We should have already
37096 failed all two instruction sequences. */
37097
37098 static bool
37099 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
37100 {
37101 rtx rperm[2][16], vperm, l, h, op, m128;
37102 unsigned int i, nelt, eltsz;
37103
37104 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
37105 return false;
37106 gcc_assert (d->op0 != d->op1);
37107
37108 nelt = d->nelt;
37109 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37110
37111 /* Generate two permutation masks. If the required element is within
37112 the given vector it is shuffled into the proper lane. If the required
37113 element is in the other vector, force a zero into the lane by setting
37114 bit 7 in the permutation mask. */
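/* For instance, extracting the even elements of a V8HImode pair, i.e.
   the permutation { 0 2 4 6 8 10 12 14 }, uses the byte mask
   { 0 1 4 5 8 9 12 13 } followed by eight -128 entries for op0, and
   eight -128 entries followed by { 0 1 4 5 8 9 12 13 } for op1. */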
37115 m128 = GEN_INT (-128);
37116 for (i = 0; i < nelt; ++i)
37117 {
37118 unsigned j, e = d->perm[i];
37119 unsigned which = (e >= nelt);
37120 if (e >= nelt)
37121 e -= nelt;
37122
37123 for (j = 0; j < eltsz; ++j)
37124 {
37125 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
37126 rperm[1-which][i*eltsz + j] = m128;
37127 }
37128 }
37129
37130 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
37131 vperm = force_reg (V16QImode, vperm);
37132
37133 l = gen_reg_rtx (V16QImode);
37134 op = gen_lowpart (V16QImode, d->op0);
37135 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
37136
37137 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
37138 vperm = force_reg (V16QImode, vperm);
37139
37140 h = gen_reg_rtx (V16QImode);
37141 op = gen_lowpart (V16QImode, d->op1);
37142 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
37143
37144 op = gen_lowpart (V16QImode, d->target);
37145 emit_insn (gen_iorv16qi3 (op, l, h));
37146
37147 return true;
37148 }
37149
37150 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
37151 with two vpshufb insns, vpermq and vpor. We should have already failed
37152 all two or three instruction sequences. */
37153
37154 static bool
37155 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
37156 {
37157 rtx rperm[2][32], vperm, l, h, hp, op, m128;
37158 unsigned int i, nelt, eltsz;
37159
37160 if (!TARGET_AVX2
37161 || d->op0 != d->op1
37162 || (d->vmode != V32QImode && d->vmode != V16HImode))
37163 return false;
37164
37165 if (d->testing_p)
37166 return true;
37167
37168 nelt = d->nelt;
37169 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37170
37171 /* Generate two permutation masks. If the required element is within
37172 the same lane, it is shuffled in. If the required element is from the
37173 other lane, force a zero by setting bit 7 in the permutation mask.
37174 In the other mask, non-negative entries appear only for elements
37175 requested from the other lane, and they are placed in the opposite
37176 lane so that the two V2TImode halves of the vpshufb result can simply
37177 be swapped. */
37178 m128 = GEN_INT (-128);
37179 for (i = 0; i < nelt; ++i)
37180 {
37181 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37182 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37183
37184 for (j = 0; j < eltsz; ++j)
37185 {
37186 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
37187 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
37188 }
37189 }
37190
37191 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
37192 vperm = force_reg (V32QImode, vperm);
37193
37194 h = gen_reg_rtx (V32QImode);
37195 op = gen_lowpart (V32QImode, d->op0);
37196 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
37197
37198 /* Swap the 128-bit lanes of h into hp. */
37199 hp = gen_reg_rtx (V4DImode);
37200 op = gen_lowpart (V4DImode, h);
37201 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
37202 const1_rtx));
37203
37204 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
37205 vperm = force_reg (V32QImode, vperm);
37206
37207 l = gen_reg_rtx (V32QImode);
37208 op = gen_lowpart (V32QImode, d->op0);
37209 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
37210
37211 op = gen_lowpart (V32QImode, d->target);
37212 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
37213
37214 return true;
37215 }
37216
37217 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
37218 and extract-odd permutations of two V32QImode or V16HImode operands
37219 with two vpshufb insns, vpor and vpermq. We should have already
37220 failed all two or three instruction sequences. */
37221
37222 static bool
37223 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
37224 {
37225 rtx rperm[2][32], vperm, l, h, ior, op, m128;
37226 unsigned int i, nelt, eltsz;
37227
37228 if (!TARGET_AVX2
37229 || d->op0 == d->op1
37230 || (d->vmode != V32QImode && d->vmode != V16HImode))
37231 return false;
37232
37233 for (i = 0; i < d->nelt; ++i)
37234 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
37235 return false;
37236
37237 if (d->testing_p)
37238 return true;
37239
37240 nelt = d->nelt;
37241 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37242
37243 /* Generate two permutation masks. In the first permutation mask
37244 the first quarter will contain indexes for the first half
37245 of the op0, the second quarter will contain bit 7 set, third quarter
37246 will contain indexes for the second half of the op0 and the
37247 last quarter bit 7 set. In the second permutation mask
37248 the first quarter will contain bit 7 set, the second quarter
37249 indexes for the first half of the op1, the third quarter bit 7 set
37250 and last quarter indexes for the second half of the op1.
37251 I.e. the first mask e.g. for V32QImode extract even will be:
37252 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
37253 (all values masked with 0xf except for -128) and second mask
37254 for extract even will be
37255 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
37256 m128 = GEN_INT (-128);
37257 for (i = 0; i < nelt; ++i)
37258 {
37259 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37260 unsigned which = d->perm[i] >= nelt;
37261 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
37262
37263 for (j = 0; j < eltsz; ++j)
37264 {
37265 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
37266 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
37267 }
37268 }
37269
37270 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
37271 vperm = force_reg (V32QImode, vperm);
37272
37273 l = gen_reg_rtx (V32QImode);
37274 op = gen_lowpart (V32QImode, d->op0);
37275 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
37276
37277 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
37278 vperm = force_reg (V32QImode, vperm);
37279
37280 h = gen_reg_rtx (V32QImode);
37281 op = gen_lowpart (V32QImode, d->op1);
37282 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
37283
37284 ior = gen_reg_rtx (V32QImode);
37285 emit_insn (gen_iorv32qi3 (ior, l, h));
37286
37287 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
37288 op = gen_lowpart (V4DImode, d->target);
37289 ior = gen_lowpart (V4DImode, ior);
37290 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
37291 const1_rtx, GEN_INT (3)));
37292
37293 return true;
37294 }
37295
37296 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
37297 and extract-odd permutations. */
37298
37299 static bool
37300 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
37301 {
37302 rtx t1, t2, t3;
37303
37304 switch (d->vmode)
37305 {
37306 case V4DFmode:
37307 t1 = gen_reg_rtx (V4DFmode);
37308 t2 = gen_reg_rtx (V4DFmode);
37309
37310 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
37311 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
37312 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
37313
37314 /* Now an unpck[lh]pd will produce the result required. */
37315 if (odd)
37316 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
37317 else
37318 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
37319 emit_insn (t3);
37320 break;
37321
37322 case V8SFmode:
37323 {
37324 int mask = odd ? 0xdd : 0x88;
37325
37326 t1 = gen_reg_rtx (V8SFmode);
37327 t2 = gen_reg_rtx (V8SFmode);
37328 t3 = gen_reg_rtx (V8SFmode);
37329
37330 /* Shuffle within the 128-bit lanes to produce:
37331 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
37332 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
37333 GEN_INT (mask)));
37334
37335 /* Shuffle the lanes around to produce:
37336 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
37337 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
37338 GEN_INT (0x3)));
37339
37340 /* Shuffle within the 128-bit lanes to produce:
37341 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
37342 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
37343
37344 /* Shuffle within the 128-bit lanes to produce:
37345 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
37346 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
37347
37348 /* Shuffle the lanes around to produce:
37349 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
37350 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
37351 GEN_INT (0x20)));
37352 }
37353 break;
37354
37355 case V2DFmode:
37356 case V4SFmode:
37357 case V2DImode:
37358 case V4SImode:
37359 /* These are always directly implementable by expand_vec_perm_1. */
37360 gcc_unreachable ();
37361
37362 case V8HImode:
37363 if (TARGET_SSSE3)
37364 return expand_vec_perm_pshufb2 (d);
37365 else
37366 {
37367 /* We need 2*log2(N)-1 operations to achieve odd/even
37368 with interleave. */
37369 t1 = gen_reg_rtx (V8HImode);
37370 t2 = gen_reg_rtx (V8HImode);
37371 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
37372 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
37373 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
37374 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
37375 if (odd)
37376 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
37377 else
37378 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
37379 emit_insn (t3);
37380 }
37381 break;
37382
37383 case V16QImode:
37384 if (TARGET_SSSE3)
37385 return expand_vec_perm_pshufb2 (d);
37386 else
37387 {
37388 t1 = gen_reg_rtx (V16QImode);
37389 t2 = gen_reg_rtx (V16QImode);
37390 t3 = gen_reg_rtx (V16QImode);
37391 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
37392 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
37393 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
37394 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
37395 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
37396 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
37397 if (odd)
37398 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
37399 else
37400 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
37401 emit_insn (t3);
37402 }
37403 break;
37404
37405 case V16HImode:
37406 case V32QImode:
37407 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
37408
37409 case V4DImode:
37410 if (!TARGET_AVX2)
37411 {
37412 struct expand_vec_perm_d d_copy = *d;
37413 d_copy.vmode = V4DFmode;
37414 d_copy.target = gen_lowpart (V4DFmode, d->target);
37415 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
37416 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
37417 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37418 }
37419
37420 t1 = gen_reg_rtx (V4DImode);
37421 t2 = gen_reg_rtx (V4DImode);
37422
37423 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
37424 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
37425 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
37426
37427 /* Now a vpunpck[lh]qdq will produce the result required. */
37428 if (odd)
37429 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
37430 else
37431 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
37432 emit_insn (t3);
37433 break;
37434
37435 case V8SImode:
37436 if (!TARGET_AVX2)
37437 {
37438 struct expand_vec_perm_d d_copy = *d;
37439 d_copy.vmode = V8SFmode;
37440 d_copy.target = gen_lowpart (V8SFmode, d->target);
37441 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
37442 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
37443 return expand_vec_perm_even_odd_1 (&d_copy, odd);
37444 }
37445
37446 t1 = gen_reg_rtx (V8SImode);
37447 t2 = gen_reg_rtx (V8SImode);
37448
37449 /* Shuffle the lanes around into
37450 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
37451 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
37452 gen_lowpart (V4DImode, d->op0),
37453 gen_lowpart (V4DImode, d->op1),
37454 GEN_INT (0x20)));
37455 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
37456 gen_lowpart (V4DImode, d->op0),
37457 gen_lowpart (V4DImode, d->op1),
37458 GEN_INT (0x31)));
37459
37460 /* Swap the 2nd and 3rd position in each lane into
37461 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
37462 emit_insn (gen_avx2_pshufdv3 (t1, t1,
37463 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37464 emit_insn (gen_avx2_pshufdv3 (t2, t2,
37465 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
37466
37467 /* Now a vpunpck[lh]qdq will produce
37468 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
37469 if (odd)
37470 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
37471 gen_lowpart (V4DImode, t1),
37472 gen_lowpart (V4DImode, t2));
37473 else
37474 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
37475 gen_lowpart (V4DImode, t1),
37476 gen_lowpart (V4DImode, t2));
37477 emit_insn (t3);
37478 break;
37479
37480 default:
37481 gcc_unreachable ();
37482 }
37483
37484 return true;
37485 }
37486
37487 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
37488 extract-even and extract-odd permutations. */
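/* I.e. permutations of the form { 0 2 4 ... } or { 1 3 5 ... }. */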
37489
37490 static bool
37491 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
37492 {
37493 unsigned i, odd, nelt = d->nelt;
37494
37495 odd = d->perm[0];
37496 if (odd != 0 && odd != 1)
37497 return false;
37498
37499 for (i = 1; i < nelt; ++i)
37500 if (d->perm[i] != 2 * i + odd)
37501 return false;
37502
37503 return expand_vec_perm_even_odd_1 (d, odd);
37504 }
37505
37506 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
37507 permutations. We assume that expand_vec_perm_1 has already failed. */
37508
37509 static bool
37510 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
37511 {
37512 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
37513 enum machine_mode vmode = d->vmode;
37514 unsigned char perm2[4];
37515 rtx op0 = d->op0;
37516 bool ok;
37517
37518 switch (vmode)
37519 {
37520 case V4DFmode:
37521 case V8SFmode:
37522 /* These are special-cased in sse.md so that we can optionally
37523 use the vbroadcast instruction. They expand to two insns
37524 if the input happens to be in a register. */
37525 gcc_unreachable ();
37526
37527 case V2DFmode:
37528 case V2DImode:
37529 case V4SFmode:
37530 case V4SImode:
37531 /* These are always implementable using standard shuffle patterns. */
37532 gcc_unreachable ();
37533
37534 case V8HImode:
37535 case V16QImode:
37536 /* These can be implemented via interleave. We save one insn by
37537 stopping once we have promoted to V4SImode and then use pshufd. */
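/* E.g. broadcasting element 5 of a V8HImode vector: one interleave high
   produces { 4 4 5 5 6 6 7 7 }, which viewed as V4SImode turns the value
   into a full SImode element that a pshufd replicating element 1 can
   then broadcast. */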
37538 do
37539 {
37540 optab otab = vec_interleave_low_optab;
37541
37542 if (elt >= nelt2)
37543 {
37544 otab = vec_interleave_high_optab;
37545 elt -= nelt2;
37546 }
37547 nelt2 /= 2;
37548
37549 op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT);
37550 vmode = get_mode_wider_vector (vmode);
37551 op0 = gen_lowpart (vmode, op0);
37552 }
37553 while (vmode != V4SImode);
37554
37555 memset (perm2, elt, 4);
37556 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
37557 gcc_assert (ok);
37558 return true;
37559
37560 case V32QImode:
37561 case V16HImode:
37562 case V8SImode:
37563 case V4DImode:
37564 /* For AVX2 broadcasts of the first element vpbroadcast* or
37565 vpermq should be used by expand_vec_perm_1. */
37566 gcc_assert (!TARGET_AVX2 || d->perm[0]);
37567 return false;
37568
37569 default:
37570 gcc_unreachable ();
37571 }
37572 }
37573
37574 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
37575 broadcast permutations. */
37576
37577 static bool
37578 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
37579 {
37580 unsigned i, elt, nelt = d->nelt;
37581
37582 if (d->op0 != d->op1)
37583 return false;
37584
37585 elt = d->perm[0];
37586 for (i = 1; i < nelt; ++i)
37587 if (d->perm[i] != elt)
37588 return false;
37589
37590 return expand_vec_perm_broadcast_1 (d);
37591 }
37592
37593 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
37594 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
37595 all the shorter instruction sequences. */
37596
37597 static bool
37598 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
37599 {
37600 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
37601 unsigned int i, nelt, eltsz;
37602 bool used[4];
37603
37604 if (!TARGET_AVX2
37605 || d->op0 == d->op1
37606 || (d->vmode != V32QImode && d->vmode != V16HImode))
37607 return false;
37608
37609 if (d->testing_p)
37610 return true;
37611
37612 nelt = d->nelt;
37613 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37614
37615 /* Generate 4 permutation masks. If the required element is within
37616 the same lane, it is shuffled in. If the required element is from the
37617 other lane, force a zero by setting bit 7 in the permutation mask.
37618 In the cross-lane masks, non-negative entries appear only for elements
37619 requested from the other lane, and they are placed in the opposite
37620 lane so that the two V2TImode halves of the vpshufb results can simply
37621 be swapped. */
37622 m128 = GEN_INT (-128);
37623 for (i = 0; i < 32; ++i)
37624 {
37625 rperm[0][i] = m128;
37626 rperm[1][i] = m128;
37627 rperm[2][i] = m128;
37628 rperm[3][i] = m128;
37629 }
37630 used[0] = false;
37631 used[1] = false;
37632 used[2] = false;
37633 used[3] = false;
37634 for (i = 0; i < nelt; ++i)
37635 {
37636 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37637 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37638 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
37639
37640 for (j = 0; j < eltsz; ++j)
37641 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
37642 used[which] = true;
37643 }
37644
37645 for (i = 0; i < 2; ++i)
37646 {
37647 if (!used[2 * i + 1])
37648 {
37649 h[i] = NULL_RTX;
37650 continue;
37651 }
37652 vperm = gen_rtx_CONST_VECTOR (V32QImode,
37653 gen_rtvec_v (32, rperm[2 * i + 1]));
37654 vperm = force_reg (V32QImode, vperm);
37655 h[i] = gen_reg_rtx (V32QImode);
37656 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37657 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
37658 }
37659
37660 /* Swap the 128-bit lanes of h[X]. */
37661 for (i = 0; i < 2; ++i)
37662 {
37663 if (h[i] == NULL_RTX)
37664 continue;
37665 op = gen_reg_rtx (V4DImode);
37666 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
37667 const2_rtx, GEN_INT (3), const0_rtx,
37668 const1_rtx));
37669 h[i] = gen_lowpart (V32QImode, op);
37670 }
37671
37672 for (i = 0; i < 2; ++i)
37673 {
37674 if (!used[2 * i])
37675 {
37676 l[i] = NULL_RTX;
37677 continue;
37678 }
37679 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
37680 vperm = force_reg (V32QImode, vperm);
37681 l[i] = gen_reg_rtx (V32QImode);
37682 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37683 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
37684 }
37685
37686 for (i = 0; i < 2; ++i)
37687 {
37688 if (h[i] && l[i])
37689 {
37690 op = gen_reg_rtx (V32QImode);
37691 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
37692 l[i] = op;
37693 }
37694 else if (h[i])
37695 l[i] = h[i];
37696 }
37697
37698 gcc_assert (l[0] && l[1]);
37699 op = gen_lowpart (V32QImode, d->target);
37700 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
37701 return true;
37702 }
37703
37704 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
37705 With all of the interface bits taken care of, perform the expansion
37706 in D and return true on success. */
37707
37708 static bool
37709 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
37710 {
37711 /* Try a single instruction expansion. */
37712 if (expand_vec_perm_1 (d))
37713 return true;
37714
37715 /* Try sequences of two instructions. */
37716
37717 if (expand_vec_perm_pshuflw_pshufhw (d))
37718 return true;
37719
37720 if (expand_vec_perm_palignr (d))
37721 return true;
37722
37723 if (expand_vec_perm_interleave2 (d))
37724 return true;
37725
37726 if (expand_vec_perm_broadcast (d))
37727 return true;
37728
37729 if (expand_vec_perm_vpermq_perm_1 (d))
37730 return true;
37731
37732 /* Try sequences of three instructions. */
37733
37734 if (expand_vec_perm_pshufb2 (d))
37735 return true;
37736
37737 if (expand_vec_perm_interleave3 (d))
37738 return true;
37739
37740 /* Try sequences of four instructions. */
37741
37742 if (expand_vec_perm_vpshufb2_vpermq (d))
37743 return true;
37744
37745 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
37746 return true;
37747
37748 /* ??? Look for narrow permutations whose element orderings would
37749 allow the promotion to a wider mode. */
37750
37751 /* ??? Look for sequences of interleave or a wider permute that place
37752 the data into the correct lanes for a half-vector shuffle like
37753 pshuf[lh]w or vpermilps. */
37754
37755 /* ??? Look for sequences of interleave that produce the desired results.
37756 The combinatorics of punpck[lh] get pretty ugly... */
37757
37758 if (expand_vec_perm_even_odd (d))
37759 return true;
37760
37761 /* Even longer sequences. */
37762 if (expand_vec_perm_vpshufb4_vpermq2 (d))
37763 return true;
37764
37765 return false;
37766 }
37767
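/* Expand a constant vector permutation. OPERANDS[0] is the target,
   OPERANDS[1] and OPERANDS[2] are the two input vectors and OPERANDS[3]
   is the constant selector vector. Return true if the permutation could
   be expanded. */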
37768 bool
37769 ix86_expand_vec_perm_const (rtx operands[4])
37770 {
37771 struct expand_vec_perm_d d;
37772 unsigned char perm[MAX_VECT_LEN];
37773 int i, nelt, which;
37774 rtx sel;
37775
37776 d.target = operands[0];
37777 d.op0 = operands[1];
37778 d.op1 = operands[2];
37779 sel = operands[3];
37780
37781 d.vmode = GET_MODE (d.target);
37782 gcc_assert (VECTOR_MODE_P (d.vmode));
37783 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37784 d.testing_p = false;
37785
37786 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
37787 gcc_assert (XVECLEN (sel, 0) == nelt);
37788 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
37789
37790 for (i = which = 0; i < nelt; ++i)
37791 {
37792 rtx e = XVECEXP (sel, 0, i);
37793 int ei = INTVAL (e) & (2 * nelt - 1);
37794
37795 which |= (ei < nelt ? 1 : 2);
37796 d.perm[i] = ei;
37797 perm[i] = ei;
37798 }
37799
37800 switch (which)
37801 {
37802 default:
37803 gcc_unreachable ();
37804
37805 case 3:
37806 if (!rtx_equal_p (d.op0, d.op1))
37807 break;
37808
37809 /* The elements of PERM do not suggest that only the first operand
37810 is used, but both operands are identical. Allow easier matching
37811 of the permutation by folding the permutation into the single
37812 input vector. */
37813 for (i = 0; i < nelt; ++i)
37814 if (d.perm[i] >= nelt)
37815 d.perm[i] -= nelt;
37816 /* FALLTHRU */
37817
37818 case 1:
37819 d.op1 = d.op0;
37820 break;
37821
37822 case 2:
37823 for (i = 0; i < nelt; ++i)
37824 d.perm[i] -= nelt;
37825 d.op0 = d.op1;
37826 break;
37827 }
37828
37829 if (ix86_expand_vec_perm_const_1 (&d))
37830 return true;
37831
37832 /* If the mask says both arguments are needed, but they are the same,
37833 the above tried to expand with d.op0 == d.op1. If that didn't work,
37834 retry with d.op0 != d.op1 as that is what testing has been done with. */
37835 if (which == 3 && d.op0 == d.op1)
37836 {
37837 rtx seq;
37838 bool ok;
37839
37840 memcpy (d.perm, perm, sizeof (perm));
37841 d.op1 = gen_reg_rtx (d.vmode);
37842 start_sequence ();
37843 ok = ix86_expand_vec_perm_const_1 (&d);
37844 seq = get_insns ();
37845 end_sequence ();
37846 if (ok)
37847 {
37848 emit_move_insn (d.op1, d.op0);
37849 emit_insn (seq);
37850 return true;
37851 }
37852 }
37853
37854 return false;
37855 }
37856
37857 /* Implement targetm.vectorize.vec_perm_const_ok. */
37858
37859 static bool
37860 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
37861 const unsigned char *sel)
37862 {
37863 struct expand_vec_perm_d d;
37864 unsigned int i, nelt, which;
37865 bool ret, one_vec;
37866
37867 d.vmode = vmode;
37868 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37869 d.testing_p = true;
37870
37871 /* Given sufficient ISA support we can just return true here
37872 for selected vector modes. */
37873 if (GET_MODE_SIZE (d.vmode) == 16)
37874 {
37875 /* All implementable with a single vpperm insn. */
37876 if (TARGET_XOP)
37877 return true;
37878 /* All implementable with 2 pshufb + 1 ior. */
37879 if (TARGET_SSSE3)
37880 return true;
37881 /* All implementable with shufpd or unpck[lh]pd. */
37882 if (d.nelt == 2)
37883 return true;
37884 }
37885
37886 /* Extract the values from the vector CST into the permutation
37887 array in D. */
37888 memcpy (d.perm, sel, nelt);
37889 for (i = which = 0; i < nelt; ++i)
37890 {
37891 unsigned char e = d.perm[i];
37892 gcc_assert (e < 2 * nelt);
37893 which |= (e < nelt ? 1 : 2);
37894 }
37895
37896 /* For all elements from second vector, fold the elements to first. */
37897 if (which == 2)
37898 for (i = 0; i < nelt; ++i)
37899 d.perm[i] -= nelt;
37900
37901 /* Check whether the mask can be applied to the vector type. */
37902 one_vec = (which != 3);
37903
37904 /* Implementable with shufps or pshufd. */
37905 if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
37906 return true;
37907
37908 /* Otherwise we have to go through the motions and see if we can
37909 figure out how to generate the requested permutation. */
37910 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
37911 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
37912 if (!one_vec)
37913 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
37914
37915 start_sequence ();
37916 ret = ix86_expand_vec_perm_const_1 (&d);
37917 end_sequence ();
37918
37919 return ret;
37920 }
37921
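/* Extract the even (ODD == 0) or odd (ODD == 1) elements of OP0 and OP1
   into TARG. */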
37922 void
37923 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
37924 {
37925 struct expand_vec_perm_d d;
37926 unsigned i, nelt;
37927
37928 d.target = targ;
37929 d.op0 = op0;
37930 d.op1 = op1;
37931 d.vmode = GET_MODE (targ);
37932 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37933 d.testing_p = false;
37934
37935 for (i = 0; i < nelt; ++i)
37936 d.perm[i] = i * 2 + odd;
37937
37938 /* We'll either be able to implement the permutation directly... */
37939 if (expand_vec_perm_1 (&d))
37940 return;
37941
37942 /* ... or we use the special-case patterns. */
37943 expand_vec_perm_even_odd_1 (&d, odd);
37944 }
37945
37946 /* Expand an insert into a vector register through pinsr insn.
37947 Return true if successful. */
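/* As a hypothetical example, inserting a 16-bit value at bit position 32
   of a V8HImode destination arrives here with SIZE == 16 and POS == 32,
   so the element index is POS / SIZE == 2 and the pinsrw below is
   emitted with the selector GEN_INT (1 << 2). */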
37948
37949 bool
37950 ix86_expand_pinsr (rtx *operands)
37951 {
37952 rtx dst = operands[0];
37953 rtx src = operands[3];
37954
37955 unsigned int size = INTVAL (operands[1]);
37956 unsigned int pos = INTVAL (operands[2]);
37957
37958 if (GET_CODE (dst) == SUBREG)
37959 {
37960 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
37961 dst = SUBREG_REG (dst);
37962 }
37963
37964 if (GET_CODE (src) == SUBREG)
37965 src = SUBREG_REG (src);
37966
37967 switch (GET_MODE (dst))
37968 {
37969 case V16QImode:
37970 case V8HImode:
37971 case V4SImode:
37972 case V2DImode:
37973 {
37974 enum machine_mode srcmode, dstmode;
37975 rtx (*pinsr)(rtx, rtx, rtx, rtx);
37976
37977 srcmode = mode_for_size (size, MODE_INT, 0);
37978
37979 switch (srcmode)
37980 {
37981 case QImode:
37982 if (!TARGET_SSE4_1)
37983 return false;
37984 dstmode = V16QImode;
37985 pinsr = gen_sse4_1_pinsrb;
37986 break;
37987
37988 case HImode:
37989 if (!TARGET_SSE2)
37990 return false;
37991 dstmode = V8HImode;
37992 pinsr = gen_sse2_pinsrw;
37993 break;
37994
37995 case SImode:
37996 if (!TARGET_SSE4_1)
37997 return false;
37998 dstmode = V4SImode;
37999 pinsr = gen_sse4_1_pinsrd;
38000 break;
38001
38002 case DImode:
38003 gcc_assert (TARGET_64BIT);
38004 if (!TARGET_SSE4_1)
38005 return false;
38006 dstmode = V2DImode;
38007 pinsr = gen_sse4_1_pinsrq;
38008 break;
38009
38010 default:
38011 return false;
38012 }
38013
38014 dst = gen_lowpart (dstmode, dst);
38015 src = gen_lowpart (srcmode, src);
38016
38017 pos /= size;
38018
38019 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
38020 return true;
38021 }
38022
38023 default:
38024 return false;
38025 }
38026 }
38027 \f
38028 /* This function returns the calling ABI specific va_list type node
38029 for the given FNDECL. */
38030
38031 static tree
38032 ix86_fn_abi_va_list (tree fndecl)
38033 {
38034 if (!TARGET_64BIT)
38035 return va_list_type_node;
38036 gcc_assert (fndecl != NULL_TREE);
38037
38038 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
38039 return ms_va_list_type_node;
38040 else
38041 return sysv_va_list_type_node;
38042 }
38043
38044 /* Returns the canonical va_list type specified by TYPE. If there
38045 is no valid TYPE provided, it returns NULL_TREE. */
38046
38047 static tree
38048 ix86_canonical_va_list_type (tree type)
38049 {
38050 tree wtype, htype;
38051
38052 /* Resolve references and pointers to va_list type. */
38053 if (TREE_CODE (type) == MEM_REF)
38054 type = TREE_TYPE (type);
38055 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
38056 type = TREE_TYPE (type);
38057 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
38058 type = TREE_TYPE (type);
38059
38060 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
38061 {
38062 wtype = va_list_type_node;
38063 gcc_assert (wtype != NULL_TREE);
38064 htype = type;
38065 if (TREE_CODE (wtype) == ARRAY_TYPE)
38066 {
38067 /* If va_list is an array type, the argument may have decayed
38068 to a pointer type, e.g. by being passed to another function.
38069 In that case, unwrap both types so that we can compare the
38070 underlying records. */
38071 if (TREE_CODE (htype) == ARRAY_TYPE
38072 || POINTER_TYPE_P (htype))
38073 {
38074 wtype = TREE_TYPE (wtype);
38075 htype = TREE_TYPE (htype);
38076 }
38077 }
38078 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
38079 return va_list_type_node;
38080 wtype = sysv_va_list_type_node;
38081 gcc_assert (wtype != NULL_TREE);
38082 htype = type;
38083 if (TREE_CODE (wtype) == ARRAY_TYPE)
38084 {
38085 /* If va_list is an array type, the argument may have decayed
38086 to a pointer type, e.g. by being passed to another function.
38087 In that case, unwrap both types so that we can compare the
38088 underlying records. */
38089 if (TREE_CODE (htype) == ARRAY_TYPE
38090 || POINTER_TYPE_P (htype))
38091 {
38092 wtype = TREE_TYPE (wtype);
38093 htype = TREE_TYPE (htype);
38094 }
38095 }
38096 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
38097 return sysv_va_list_type_node;
38098 wtype = ms_va_list_type_node;
38099 gcc_assert (wtype != NULL_TREE);
38100 htype = type;
38101 if (TREE_CODE (wtype) == ARRAY_TYPE)
38102 {
38103 /* If va_list is an array type, the argument may have decayed
38104 to a pointer type, e.g. by being passed to another function.
38105 In that case, unwrap both types so that we can compare the
38106 underlying records. */
38107 if (TREE_CODE (htype) == ARRAY_TYPE
38108 || POINTER_TYPE_P (htype))
38109 {
38110 wtype = TREE_TYPE (wtype);
38111 htype = TREE_TYPE (htype);
38112 }
38113 }
38114 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
38115 return ms_va_list_type_node;
38116 return NULL_TREE;
38117 }
38118 return std_canonical_va_list_type (type);
38119 }
38120
38121 /* Iterate through the target-specific builtin types for va_list.
38122 IDX denotes the iterator, *PTREE is set to the result type of
38123 the va_list builtin, and *PNAME to its internal type.
38124 Returns zero if there is no element for this index, otherwise
38125 IDX should be increased upon the next call.
38126 Note, do not iterate a base builtin's name like __builtin_va_list.
38127 Used from c_common_nodes_and_builtins. */
38128
38129 static int
38130 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
38131 {
38132 if (TARGET_64BIT)
38133 {
38134 switch (idx)
38135 {
38136 default:
38137 break;
38138
38139 case 0:
38140 *ptree = ms_va_list_type_node;
38141 *pname = "__builtin_ms_va_list";
38142 return 1;
38143
38144 case 1:
38145 *ptree = sysv_va_list_type_node;
38146 *pname = "__builtin_sysv_va_list";
38147 return 1;
38148 }
38149 }
38150
38151 return 0;
38152 }
38153
38154 #undef TARGET_SCHED_DISPATCH
38155 #define TARGET_SCHED_DISPATCH has_dispatch
38156 #undef TARGET_SCHED_DISPATCH_DO
38157 #define TARGET_SCHED_DISPATCH_DO do_dispatch
38158 #undef TARGET_SCHED_REASSOCIATION_WIDTH
38159 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
38160
38161 /* The size of the dispatch window is the total number of bytes of
38162 object code allowed in a window. */
38163 #define DISPATCH_WINDOW_SIZE 16
38164
38165 /* Number of dispatch windows considered for scheduling. */
38166 #define MAX_DISPATCH_WINDOWS 3
38167
38168 /* Maximum number of instructions in a window. */
38169 #define MAX_INSN 4
38170
38171 /* Maximum number of immediate operands in a window. */
38172 #define MAX_IMM 4
38173
38174 /* Maximum number of immediate bits allowed in a window. */
38175 #define MAX_IMM_SIZE 128
38176
38177 /* Maximum number of 32 bit immediates allowed in a window. */
38178 #define MAX_IMM_32 4
38179
38180 /* Maximum number of 64 bit immediates allowed in a window. */
38181 #define MAX_IMM_64 2
38182
38183 /* Maximum total of loads or prefetches allowed in a window. */
38184 #define MAX_LOAD 2
38185
38186 /* Maximum total of stores allowed in a window. */
38187 #define MAX_STORE 1
38188
38189 #undef BIG
38190 #define BIG 100
38191
38192
38193 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
38194 enum dispatch_group {
38195 disp_no_group = 0,
38196 disp_load,
38197 disp_store,
38198 disp_load_store,
38199 disp_prefetch,
38200 disp_imm,
38201 disp_imm_32,
38202 disp_imm_64,
38203 disp_branch,
38204 disp_cmp,
38205 disp_jcc,
38206 disp_last
38207 };
38208
38209 /* Number of allowable groups in a dispatch window. It is an array
38210 indexed by dispatch_group enum. 100 is used as a big number,
38211 because the number of these kinds of operations does not have any
38212 effect in a dispatch window, but we need them for other reasons in
38213 the table. */
38214 static unsigned int num_allowable_groups[disp_last] = {
38215 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
38216 };
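/* I.e. at most 2 loads, 1 store, 1 load-store, 2 prefetches, 4
   immediates, 4 32-bit immediates, 2 64-bit immediates and 1 branch per
   window; the BIG entries for compares and jumps mean their count alone
   never limits a window. */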
38217
38218 char group_name[disp_last + 1][16] = {
38219 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
38220 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
38221 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
38222 };
38223
38224 /* Instruction path. */
38225 enum insn_path {
38226 no_path = 0,
38227 path_single, /* Single micro op. */
38228 path_double, /* Double micro op. */
38229 path_multi, /* Instructions with more than 2 micro ops. */
38230 last_path
38231 };
38232
38233 /* sched_insn_info defines a window to the instructions scheduled in
38234 the basic block. It contains a pointer to the insn_info table and
38235 the instruction scheduled.
38236
38237 Windows are allocated for each basic block and are linked
38238 together. */
38239 typedef struct sched_insn_info_s {
38240 rtx insn;
38241 enum dispatch_group group;
38242 enum insn_path path;
38243 int byte_len;
38244 int imm_bytes;
38245 } sched_insn_info;
38246
38247 /* Linked list of dispatch windows. This is a two-way list of
38248 dispatch windows of a basic block. It contains information about
38249 the number of uops in the window and the total number of
38250 instructions and of bytes in the object code for this dispatch
38251 window. */
38252 typedef struct dispatch_windows_s {
38253 int num_insn; /* Number of insn in the window. */
38254 int num_uops; /* Number of uops in the window. */
38255 int window_size; /* Number of bytes in the window. */
38256 int window_num; /* Window number, either 0 or 1. */
38257 int num_imm; /* Number of immediates in an insn. */
38258 int num_imm_32; /* Number of 32 bit immediates in an insn. */
38259 int num_imm_64; /* Number of 64 bit immediates in an insn. */
38260 int imm_size; /* Total immediates in the window. */
38261 int num_loads; /* Total memory loads in the window. */
38262 int num_stores; /* Total memory stores in the window. */
38263 int violation; /* Violation exists in window. */
38264 sched_insn_info *window; /* Pointer to the window. */
38265 struct dispatch_windows_s *next;
38266 struct dispatch_windows_s *prev;
38267 } dispatch_windows;
38268
38269 /* Immediate values used in an insn. */
38270 typedef struct imm_info_s
38271 {
38272 int imm;
38273 int imm32;
38274 int imm64;
38275 } imm_info;
38276
38277 static dispatch_windows *dispatch_window_list;
38278 static dispatch_windows *dispatch_window_list1;
38279
38280 /* Get dispatch group of insn. */
38281
38282 static enum dispatch_group
38283 get_mem_group (rtx insn)
38284 {
38285 enum attr_memory memory;
38286
38287 if (INSN_CODE (insn) < 0)
38288 return disp_no_group;
38289 memory = get_attr_memory (insn);
38290 if (memory == MEMORY_STORE)
38291 return disp_store;
38292
38293 if (memory == MEMORY_LOAD)
38294 return disp_load;
38295
38296 if (memory == MEMORY_BOTH)
38297 return disp_load_store;
38298
38299 return disp_no_group;
38300 }
38301
38302 /* Return true if insn is a compare instruction. */
38303
38304 static bool
38305 is_cmp (rtx insn)
38306 {
38307 enum attr_type type;
38308
38309 type = get_attr_type (insn);
38310 return (type == TYPE_TEST
38311 || type == TYPE_ICMP
38312 || type == TYPE_FCMP
38313 || GET_CODE (PATTERN (insn)) == COMPARE);
38314 }
38315
38316 /* Return true if a dispatch violation encountered. */
38317
38318 static bool
38319 dispatch_violation (void)
38320 {
38321 if (dispatch_window_list->next)
38322 return dispatch_window_list->next->violation;
38323 return dispatch_window_list->violation;
38324 }
38325
38326 /* Return true if insn is a branch instruction. */
38327
38328 static bool
38329 is_branch (rtx insn)
38330 {
38331 return (CALL_P (insn) || JUMP_P (insn));
38332 }
38333
38334 /* Return true if insn is a prefetch instruction. */
38335
38336 static bool
38337 is_prefetch (rtx insn)
38338 {
38339 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
38340 }
38341
38342 /* This function initializes a dispatch window and the list container holding a
38343 pointer to the window. */
38344
38345 static void
38346 init_window (int window_num)
38347 {
38348 int i;
38349 dispatch_windows *new_list;
38350
38351 if (window_num == 0)
38352 new_list = dispatch_window_list;
38353 else
38354 new_list = dispatch_window_list1;
38355
38356 new_list->num_insn = 0;
38357 new_list->num_uops = 0;
38358 new_list->window_size = 0;
38359 new_list->next = NULL;
38360 new_list->prev = NULL;
38361 new_list->window_num = window_num;
38362 new_list->num_imm = 0;
38363 new_list->num_imm_32 = 0;
38364 new_list->num_imm_64 = 0;
38365 new_list->imm_size = 0;
38366 new_list->num_loads = 0;
38367 new_list->num_stores = 0;
38368 new_list->violation = false;
38369
38370 for (i = 0; i < MAX_INSN; i++)
38371 {
38372 new_list->window[i].insn = NULL;
38373 new_list->window[i].group = disp_no_group;
38374 new_list->window[i].path = no_path;
38375 new_list->window[i].byte_len = 0;
38376 new_list->window[i].imm_bytes = 0;
38377 }
38378 return;
38379 }
38380
38381 /* This function allocates and initializes a dispatch window and the
38382 list container holding a pointer to the window. */
38383
38384 static dispatch_windows *
38385 allocate_window (void)
38386 {
38387 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
38388 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
38389
38390 return new_list;
38391 }
38392
38393 /* This routine initializes the dispatch scheduling information. It
38394 initiates building dispatch scheduler tables and constructs the
38395 first dispatch window. */
38396
38397 static void
38398 init_dispatch_sched (void)
38399 {
38400 /* Allocate a dispatch list and a window. */
38401 dispatch_window_list = allocate_window ();
38402 dispatch_window_list1 = allocate_window ();
38403 init_window (0);
38404 init_window (1);
38405 }
38406
38407 /* This function returns true if a branch is detected. The end of a basic
38408 block need not be a branch, but here we assume only branches end a
38409 window. */
38410
38411 static bool
38412 is_end_basic_block (enum dispatch_group group)
38413 {
38414 return group == disp_branch;
38415 }
38416
38417 /* This function is called when the end of a window processing is reached. */
38418
38419 static void
38420 process_end_window (void)
38421 {
38422 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
38423 if (dispatch_window_list->next)
38424 {
38425 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
38426 gcc_assert (dispatch_window_list->window_size
38427 + dispatch_window_list1->window_size <= 48);
38428 init_window (1);
38429 }
38430 init_window (0);
38431 }
38432
38433 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
38434 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
38435 for 48 bytes of instructions. Note that these windows are not dispatch
38436 windows whose sizes are DISPATCH_WINDOW_SIZE. */
38437
38438 static dispatch_windows *
38439 allocate_next_window (int window_num)
38440 {
38441 if (window_num == 0)
38442 {
38443 if (dispatch_window_list->next)
38444 init_window (1);
38445 init_window (0);
38446 return dispatch_window_list;
38447 }
38448
38449 dispatch_window_list->next = dispatch_window_list1;
38450 dispatch_window_list1->prev = dispatch_window_list;
38451
38452 return dispatch_window_list1;
38453 }
38454
38455 /* Increment the number of immediate operands of an instruction. */
38456
38457 static int
38458 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
38459 {
38460 if (*in_rtx == 0)
38461 return 0;
38462
38463 switch (GET_CODE (*in_rtx))
38464 {
38465 case CONST:
38466 case SYMBOL_REF:
38467 case CONST_INT:
38468 (imm_values->imm)++;
38469 if (x86_64_immediate_operand (*in_rtx, SImode))
38470 (imm_values->imm32)++;
38471 else
38472 (imm_values->imm64)++;
38473 break;
38474
38475 case CONST_DOUBLE:
38476 (imm_values->imm)++;
38477 (imm_values->imm64)++;
38478 break;
38479
38480 case CODE_LABEL:
38481 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
38482 {
38483 (imm_values->imm)++;
38484 (imm_values->imm32)++;
38485 }
38486 break;
38487
38488 default:
38489 break;
38490 }
38491
38492 return 0;
38493 }
38494
38495 /* Compute number of immediate operands of an instruction. */
38496
38497 static void
38498 find_constant (rtx in_rtx, imm_info *imm_values)
38499 {
38500 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
38501 (rtx_function) find_constant_1, (void *) imm_values);
38502 }
38503
38504 /* Return total size of immediate operands of an instruction along with number
38505 of corresponding immediate operands. It initializes its parameters to zero
38506 before calling FIND_CONSTANT.
38507 INSN is the input instruction. IMM is the total of immediates.
38508 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
38509 bit immediates. */
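/* For instance, an insn with one SImode-representable immediate and one
   64-bit immediate would set *IMM to 2, *IMM32 to 1 and *IMM64 to 1,
   and would return 4 + 8 == 12 bytes. */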
38510
38511 static int
38512 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
38513 {
38514 imm_info imm_values = {0, 0, 0};
38515
38516 find_constant (insn, &imm_values);
38517 *imm = imm_values.imm;
38518 *imm32 = imm_values.imm32;
38519 *imm64 = imm_values.imm64;
38520 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
38521 }
38522
38523 /* This function indicates whether an instruction has any immediate
38524 operand. */
38525
38526 static bool
38527 has_immediate (rtx insn)
38528 {
38529 int num_imm_operand;
38530 int num_imm32_operand;
38531 int num_imm64_operand;
38532
38533 if (insn)
38534 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38535 &num_imm64_operand);
38536 return false;
38537 }
38538
38539 /* Return single or double path for instructions. */
38540
38541 static enum insn_path
38542 get_insn_path (rtx insn)
38543 {
38544 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
38545
38546 if ((int)path == 0)
38547 return path_single;
38548
38549 if ((int)path == 1)
38550 return path_double;
38551
38552 return path_multi;
38553 }
38554
38555 /* Return insn dispatch group. */
38556
38557 static enum dispatch_group
38558 get_insn_group (rtx insn)
38559 {
38560 enum dispatch_group group = get_mem_group (insn);
38561 if (group)
38562 return group;
38563
38564 if (is_branch (insn))
38565 return disp_branch;
38566
38567 if (is_cmp (insn))
38568 return disp_cmp;
38569
38570 if (has_immediate (insn))
38571 return disp_imm;
38572
38573 if (is_prefetch (insn))
38574 return disp_prefetch;
38575
38576 return disp_no_group;
38577 }
38578
38579 /* Return 0 if INSN is not group restricted, 1 if it fits the limits of
38580 its group in dispatch window WINDOW_LIST, or BIG if it does not. */
38581
38582 static int
38583 count_num_restricted (rtx insn, dispatch_windows *window_list)
38584 {
38585 enum dispatch_group group = get_insn_group (insn);
38586 int imm_size;
38587 int num_imm_operand;
38588 int num_imm32_operand;
38589 int num_imm64_operand;
38590
38591 if (group == disp_no_group)
38592 return 0;
38593
38594 if (group == disp_imm)
38595 {
38596 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38597 &num_imm64_operand);
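      /* A 64-bit immediate occupies two 32-bit immediate slots, which is why
         num_imm_64 is scaled by 2 in the MAX_IMM_32 checks below.  */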
38598 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
38599 || num_imm_operand + window_list->num_imm > MAX_IMM
38600 || (num_imm32_operand > 0
38601 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
38602 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
38603 || (num_imm64_operand > 0
38604 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
38605 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
38606 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
38607 && num_imm64_operand > 0
38608 && ((window_list->num_imm_64 > 0
38609 && window_list->num_insn >= 2)
38610 || window_list->num_insn >= 3)))
38611 return BIG;
38612
38613 return 1;
38614 }
38615
38616 if ((group == disp_load_store
38617 && (window_list->num_loads >= MAX_LOAD
38618 || window_list->num_stores >= MAX_STORE))
38619 || ((group == disp_load
38620 || group == disp_prefetch)
38621 && window_list->num_loads >= MAX_LOAD)
38622 || (group == disp_store
38623 && window_list->num_stores >= MAX_STORE))
38624 return BIG;
38625
38626 return 1;
38627 }
38628
38629 /* Return true if INSN satisfies the dispatch rules of the last
38630 scheduled dispatch window. */
38631
38632 static bool
38633 fits_dispatch_window (rtx insn)
38634 {
38635 dispatch_windows *window_list = dispatch_window_list;
38636 dispatch_windows *window_list_next = dispatch_window_list->next;
38637 unsigned int num_restrict;
38638 enum dispatch_group group = get_insn_group (insn);
38639 enum insn_path path = get_insn_path (insn);
38640 int sum;
38641
38642 /* Make disp_cmp and disp_jcc get scheduled as late as possible.
38643 These instructions should be given the lowest priority in the
38644 Haifa scheduler to make sure they are scheduled in the same
38645 dispatch window as the reference to them. */
38646 if (group == disp_jcc || group == disp_cmp)
38647 return false;
38648
38649 /* Check nonrestricted. */
38650 if (group == disp_no_group || group == disp_branch)
38651 return true;
38652
38653 /* Get last dispatch window. */
38654 if (window_list_next)
38655 window_list = window_list_next;
38656
38657 if (window_list->window_num == 1)
38658 {
38659 sum = window_list->prev->window_size + window_list->window_size;
38660
38661 if (sum == 32
38662 || (min_insn_size (insn) + sum) >= 48)
38663 /* Window 1 is full. Go for next window. */
38664 return true;
38665 }
38666
38667 num_restrict = count_num_restricted (insn, window_list);
38668
38669 if (num_restrict > num_allowable_groups[group])
38670 return false;
38671
38672 /* See if it fits in the first window. */
38673 if (window_list->window_num == 0)
38674 {
38675 /* The first window should have only single and double path
38676 uops. */
38677 if (path == path_double
38678 && (window_list->num_uops + 2) > MAX_INSN)
38679 return false;
38680 else if (path != path_single)
38681 return false;
38682 }
38683 return true;
38684 }
38685
38686 /* Add an instruction INSN with NUM_UOPS micro-operations to the
38687 dispatch window WINDOW_LIST. */
38688
38689 static void
38690 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
38691 {
38692 int byte_len = min_insn_size (insn);
38693 int num_insn = window_list->num_insn;
38694 int imm_size;
38695 sched_insn_info *window = window_list->window;
38696 enum dispatch_group group = get_insn_group (insn);
38697 enum insn_path path = get_insn_path (insn);
38698 int num_imm_operand;
38699 int num_imm32_operand;
38700 int num_imm64_operand;
38701
38702 if (!window_list->violation && group != disp_cmp
38703 && !fits_dispatch_window (insn))
38704 window_list->violation = true;
38705
38706 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38707 &num_imm64_operand);
38708
38709 /* Initialize window with new instruction. */
38710 window[num_insn].insn = insn;
38711 window[num_insn].byte_len = byte_len;
38712 window[num_insn].group = group;
38713 window[num_insn].path = path;
38714 window[num_insn].imm_bytes = imm_size;
38715
38716 window_list->window_size += byte_len;
38717 window_list->num_insn = num_insn + 1;
38718 window_list->num_uops = window_list->num_uops + num_uops;
38719 window_list->imm_size += imm_size;
38720 window_list->num_imm += num_imm_operand;
38721 window_list->num_imm_32 += num_imm32_operand;
38722 window_list->num_imm_64 += num_imm64_operand;
38723
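  /* Update the window's load/store counts; a load-store instruction
     counts as both a load and a store.  */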
38724 if (group == disp_store)
38725 window_list->num_stores += 1;
38726 else if (group == disp_load
38727 || group == disp_prefetch)
38728 window_list->num_loads += 1;
38729 else if (group == disp_load_store)
38730 {
38731 window_list->num_stores += 1;
38732 window_list->num_loads += 1;
38733 }
38734 }
38735
38736 /* Add a scheduled instruction, INSN, to the current dispatch window.
38737 If the total byte count or the number of instructions in the window
38738 exceeds the allowable limit, a new window is allocated. */
38739
38740 static void
38741 add_to_dispatch_window (rtx insn)
38742 {
38743 int byte_len;
38744 dispatch_windows *window_list;
38745 dispatch_windows *next_list;
38746 dispatch_windows *window0_list;
38747 enum insn_path path;
38748 enum dispatch_group insn_group;
38749 bool insn_fits;
38750 int num_insn;
38751 int num_uops;
38752 int window_num;
38753 int insn_num_uops;
38754 int sum;
38755
38756 if (INSN_CODE (insn) < 0)
38757 return;
38758
38759 byte_len = min_insn_size (insn);
38760 window_list = dispatch_window_list;
38761 next_list = window_list->next;
38762 path = get_insn_path (insn);
38763 insn_group = get_insn_group (insn);
38764
38765 /* Get the last dispatch window. */
38766 if (next_list)
38767 window_list = dispatch_window_list->next;
38768
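  /* Number of uops INSN contributes: one for a single-path insn, two for
     a double-path insn; otherwise the path value itself is used.  */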
38769 if (path == path_single)
38770 insn_num_uops = 1;
38771 else if (path == path_double)
38772 insn_num_uops = 2;
38773 else
38774 insn_num_uops = (int) path;
38775
38776 /* If the current window is full, get a new window.
38777 Window number zero is full if MAX_INSN uops are scheduled in it.
38778 Window number one is full if the sum of window zero's bytes and
38779 window one's bytes is 32, if adding the new instruction's bytes
38780 to that sum makes it 48 or more, or if it already has MAX_INSN
38781 instructions in it. */
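  /* Illustrative example: if window 0 already holds 20 bytes and window 1
     holds 12 bytes, their sum is 32, so window 1 is treated as full:
     process_end_window is called and the insn is added starting again
     from window 0.  */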
38782 num_insn = window_list->num_insn;
38783 num_uops = window_list->num_uops;
38784 window_num = window_list->window_num;
38785 insn_fits = fits_dispatch_window (insn);
38786
38787 if (num_insn >= MAX_INSN
38788 || num_uops + insn_num_uops > MAX_INSN
38789 || !(insn_fits))
38790 {
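      /* The current window cannot accept INSN; switch to the other window
         (0 <-> 1) and allocate it.  */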
38791 window_num = ~window_num & 1;
38792 window_list = allocate_next_window (window_num);
38793 }
38794
38795 if (window_num == 0)
38796 {
38797 add_insn_window (insn, window_list, insn_num_uops);
38798 if (window_list->num_insn >= MAX_INSN
38799 && insn_group == disp_branch)
38800 {
38801 process_end_window ();
38802 return;
38803 }
38804 }
38805 else if (window_num == 1)
38806 {
38807 window0_list = window_list->prev;
38808 sum = window0_list->window_size + window_list->window_size;
38809 if (sum == 32
38810 || (byte_len + sum) >= 48)
38811 {
38812 process_end_window ();
38813 window_list = dispatch_window_list;
38814 }
38815
38816 add_insn_window (insn, window_list, insn_num_uops);
38817 }
38818 else
38819 gcc_unreachable ();
38820
38821 if (is_end_basic_block (insn_group))
38822 {
38823 /* End of basic block is reached; do end-basic-block processing. */
38824 process_end_window ();
38825 return;
38826 }
38827 }
38828
38829 /* Print the dispatch window, WINDOW_NUM, to FILE. */
38830
38831 DEBUG_FUNCTION static void
38832 debug_dispatch_window_file (FILE *file, int window_num)
38833 {
38834 dispatch_windows *list;
38835 int i;
38836
38837 if (window_num == 0)
38838 list = dispatch_window_list;
38839 else
38840 list = dispatch_window_list1;
38841
38842 fprintf (file, "Window #%d:\n", list->window_num);
38843 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
38844 list->num_insn, list->num_uops, list->window_size);
38845 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38846 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
38847
38848 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
38849 list->num_stores);
38850 fprintf (file, " insn info:\n");
38851
38852 for (i = 0; i < MAX_INSN; i++)
38853 {
38854 if (!list->window[i].insn)
38855 break;
38856 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
38857 i, group_name[list->window[i].group],
38858 i, (void *)list->window[i].insn,
38859 i, list->window[i].path,
38860 i, list->window[i].byte_len,
38861 i, list->window[i].imm_bytes);
38862 }
38863 }
38864
38865 /* Print to stdout a dispatch window. */
38866
38867 DEBUG_FUNCTION void
38868 debug_dispatch_window (int window_num)
38869 {
38870 debug_dispatch_window_file (stdout, window_num);
38871 }
38872
38873 /* Print INSN dispatch information to FILE. */
38874
38875 DEBUG_FUNCTION static void
38876 debug_insn_dispatch_info_file (FILE *file, rtx insn)
38877 {
38878 int byte_len;
38879 enum insn_path path;
38880 enum dispatch_group group;
38881 int imm_size;
38882 int num_imm_operand;
38883 int num_imm32_operand;
38884 int num_imm64_operand;
38885
38886 if (INSN_CODE (insn) < 0)
38887 return;
38888
38889 byte_len = min_insn_size (insn);
38890 path = get_insn_path (insn);
38891 group = get_insn_group (insn);
38892 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38893 &num_imm64_operand);
38894
38895 fprintf (file, " insn info:\n");
38896 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
38897 group_name[group], path, byte_len);
38898 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38899 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
38900 }
38901
38902 /* Print to STDOUT the status of the ready list with respect to
38903 dispatch windows. */
38904
38905 DEBUG_FUNCTION void
38906 debug_ready_dispatch (void)
38907 {
38908 int i;
38909 int no_ready = number_in_ready ();
38910
38911 fprintf (stdout, "Number of ready: %d\n", no_ready);
38912
38913 for (i = 0; i < no_ready; i++)
38914 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
38915 }
38916
38917 /* This routine is the driver of the dispatch scheduler. */
38918
38919 static void
38920 do_dispatch (rtx insn, int mode)
38921 {
38922 if (mode == DISPATCH_INIT)
38923 init_dispatch_sched ();
38924 else if (mode == ADD_TO_DISPATCH_WINDOW)
38925 add_to_dispatch_window (insn);
38926 }
38927
38928 /* Answer the dispatch scheduling query ACTION for INSN; FALSE if dispatch scheduling is unsupported. */
38929
38930 static bool
38931 has_dispatch (rtx insn, int action)
38932 {
38933 if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
38934 && flag_dispatch_scheduler)
38935 switch (action)
38936 {
38937 default:
38938 return false;
38939
38940 case IS_DISPATCH_ON:
38941 return true;
38943
38944 case IS_CMP:
38945 return is_cmp (insn);
38946
38947 case DISPATCH_VIOLATION:
38948 return dispatch_violation ();
38949
38950 case FITS_DISPATCH_WINDOW:
38951 return fits_dispatch_window (insn);
38952 }
38953
38954 return false;
38955 }
38956
38957 /* Implementation of reassociation_width target hook used by
38958 reassoc phase to identify parallelism level in reassociated
38959 tree. The statement's tree_code is passed in OPC. The type of the
38960 arguments is passed in MODE.
38961
38962 Currently parallel reassociation is enabled for Atom
38963 processors only and we set reassociation width to be 2
38964 because Atom may issue up to 2 instructions per cycle.
38965
38966 The return value should be adjusted if parallel reassociation is
38967 enabled for other processors. */
38968
38969 static int
38970 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
38971 enum machine_mode mode)
38972 {
38973 int res = 1;
38974
38975 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
38976 res = 2;
38977 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
38978 res = 2;
38979
38980 return res;
38981 }
38982
38983 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
38984 place emms and femms instructions. */
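/* Illustrative example for ix86_preferred_simd_mode below: with AVX enabled
   and 256-bit vectors preferred (TARGET_PREFER_AVX128 not set), SImode data
   is vectorized in V8SImode; with SSE only, V4SImode is used.  */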
38985
38986 static enum machine_mode
38987 ix86_preferred_simd_mode (enum machine_mode mode)
38988 {
38989 if (!TARGET_SSE)
38990 return word_mode;
38991
38992 switch (mode)
38993 {
38994 case QImode:
38995 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
38996 case HImode:
38997 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
38998 case SImode:
38999 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
39000 case DImode:
39001 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
39002
39003 case SFmode:
39004 if (TARGET_AVX && !TARGET_PREFER_AVX128)
39005 return V8SFmode;
39006 else
39007 return V4SFmode;
39008
39009 case DFmode:
39010 if (!TARGET_VECTORIZE_DOUBLE)
39011 return word_mode;
39012 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
39013 return V4DFmode;
39014 else if (TARGET_SSE2)
39015 return V2DFmode;
39016 /* FALLTHRU */
39017
39018 default:
39019 return word_mode;
39020 }
39021 }
39022
39023 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
39024 vectors. */
39025
39026 static unsigned int
39027 ix86_autovectorize_vector_sizes (void)
39028 {
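  /* The value is a bit mask of vector sizes in bytes to try: 32 | 16 allows
     both 256-bit and 128-bit vectors, while 0 means only the preferred SIMD
     mode is used.  */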
39029 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
39030 }
39031
39032 /* Initialize the GCC target structure. */
39033 #undef TARGET_RETURN_IN_MEMORY
39034 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
39035
39036 #undef TARGET_LEGITIMIZE_ADDRESS
39037 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
39038
39039 #undef TARGET_ATTRIBUTE_TABLE
39040 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
39041 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39042 # undef TARGET_MERGE_DECL_ATTRIBUTES
39043 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
39044 #endif
39045
39046 #undef TARGET_COMP_TYPE_ATTRIBUTES
39047 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
39048
39049 #undef TARGET_INIT_BUILTINS
39050 #define TARGET_INIT_BUILTINS ix86_init_builtins
39051 #undef TARGET_BUILTIN_DECL
39052 #define TARGET_BUILTIN_DECL ix86_builtin_decl
39053 #undef TARGET_EXPAND_BUILTIN
39054 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
39055
39056 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
39057 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
39058 ix86_builtin_vectorized_function
39059
39060 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
39061 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
39062
39063 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
39064 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
39065
39066 #undef TARGET_VECTORIZE_BUILTIN_GATHER
39067 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
39068
39069 #undef TARGET_BUILTIN_RECIPROCAL
39070 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
39071
39072 #undef TARGET_ASM_FUNCTION_EPILOGUE
39073 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
39074
39075 #undef TARGET_ENCODE_SECTION_INFO
39076 #ifndef SUBTARGET_ENCODE_SECTION_INFO
39077 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
39078 #else
39079 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
39080 #endif
39081
39082 #undef TARGET_ASM_OPEN_PAREN
39083 #define TARGET_ASM_OPEN_PAREN ""
39084 #undef TARGET_ASM_CLOSE_PAREN
39085 #define TARGET_ASM_CLOSE_PAREN ""
39086
39087 #undef TARGET_ASM_BYTE_OP
39088 #define TARGET_ASM_BYTE_OP ASM_BYTE
39089
39090 #undef TARGET_ASM_ALIGNED_HI_OP
39091 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
39092 #undef TARGET_ASM_ALIGNED_SI_OP
39093 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
39094 #ifdef ASM_QUAD
39095 #undef TARGET_ASM_ALIGNED_DI_OP
39096 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
39097 #endif
39098
39099 #undef TARGET_PROFILE_BEFORE_PROLOGUE
39100 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
39101
39102 #undef TARGET_ASM_UNALIGNED_HI_OP
39103 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
39104 #undef TARGET_ASM_UNALIGNED_SI_OP
39105 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
39106 #undef TARGET_ASM_UNALIGNED_DI_OP
39107 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
39108
39109 #undef TARGET_PRINT_OPERAND
39110 #define TARGET_PRINT_OPERAND ix86_print_operand
39111 #undef TARGET_PRINT_OPERAND_ADDRESS
39112 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
39113 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
39114 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
39115 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
39116 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
39117
39118 #undef TARGET_SCHED_INIT_GLOBAL
39119 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
39120 #undef TARGET_SCHED_ADJUST_COST
39121 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
39122 #undef TARGET_SCHED_ISSUE_RATE
39123 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
39124 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
39125 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
39126 ia32_multipass_dfa_lookahead
39127
39128 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
39129 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
39130
39131 #ifdef HAVE_AS_TLS
39132 #undef TARGET_HAVE_TLS
39133 #define TARGET_HAVE_TLS true
39134 #endif
39135 #undef TARGET_CANNOT_FORCE_CONST_MEM
39136 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
39137 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
39138 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
39139
39140 #undef TARGET_DELEGITIMIZE_ADDRESS
39141 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
39142
39143 #undef TARGET_MS_BITFIELD_LAYOUT_P
39144 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
39145
39146 #if TARGET_MACHO
39147 #undef TARGET_BINDS_LOCAL_P
39148 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
39149 #endif
39150 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39151 #undef TARGET_BINDS_LOCAL_P
39152 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
39153 #endif
39154
39155 #undef TARGET_ASM_OUTPUT_MI_THUNK
39156 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
39157 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
39158 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
39159
39160 #undef TARGET_ASM_FILE_START
39161 #define TARGET_ASM_FILE_START x86_file_start
39162
39163 #undef TARGET_OPTION_OVERRIDE
39164 #define TARGET_OPTION_OVERRIDE ix86_option_override
39165
39166 #undef TARGET_REGISTER_MOVE_COST
39167 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
39168 #undef TARGET_MEMORY_MOVE_COST
39169 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
39170 #undef TARGET_RTX_COSTS
39171 #define TARGET_RTX_COSTS ix86_rtx_costs
39172 #undef TARGET_ADDRESS_COST
39173 #define TARGET_ADDRESS_COST ix86_address_cost
39174
39175 #undef TARGET_FIXED_CONDITION_CODE_REGS
39176 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
39177 #undef TARGET_CC_MODES_COMPATIBLE
39178 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
39179
39180 #undef TARGET_MACHINE_DEPENDENT_REORG
39181 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
39182
39183 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
39184 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
39185
39186 #undef TARGET_BUILD_BUILTIN_VA_LIST
39187 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
39188
39189 #undef TARGET_ENUM_VA_LIST_P
39190 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
39191
39192 #undef TARGET_FN_ABI_VA_LIST
39193 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
39194
39195 #undef TARGET_CANONICAL_VA_LIST_TYPE
39196 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
39197
39198 #undef TARGET_EXPAND_BUILTIN_VA_START
39199 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
39200
39201 #undef TARGET_MD_ASM_CLOBBERS
39202 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
39203
39204 #undef TARGET_PROMOTE_PROTOTYPES
39205 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
39206 #undef TARGET_STRUCT_VALUE_RTX
39207 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
39208 #undef TARGET_SETUP_INCOMING_VARARGS
39209 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
39210 #undef TARGET_MUST_PASS_IN_STACK
39211 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
39212 #undef TARGET_FUNCTION_ARG_ADVANCE
39213 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
39214 #undef TARGET_FUNCTION_ARG
39215 #define TARGET_FUNCTION_ARG ix86_function_arg
39216 #undef TARGET_FUNCTION_ARG_BOUNDARY
39217 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
39218 #undef TARGET_PASS_BY_REFERENCE
39219 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
39220 #undef TARGET_INTERNAL_ARG_POINTER
39221 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
39222 #undef TARGET_UPDATE_STACK_BOUNDARY
39223 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
39224 #undef TARGET_GET_DRAP_RTX
39225 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
39226 #undef TARGET_STRICT_ARGUMENT_NAMING
39227 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
39228 #undef TARGET_STATIC_CHAIN
39229 #define TARGET_STATIC_CHAIN ix86_static_chain
39230 #undef TARGET_TRAMPOLINE_INIT
39231 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
39232 #undef TARGET_RETURN_POPS_ARGS
39233 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
39234
39235 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
39236 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
39237
39238 #undef TARGET_SCALAR_MODE_SUPPORTED_P
39239 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
39240
39241 #undef TARGET_VECTOR_MODE_SUPPORTED_P
39242 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
39243
39244 #undef TARGET_C_MODE_FOR_SUFFIX
39245 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
39246
39247 #ifdef HAVE_AS_TLS
39248 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
39249 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
39250 #endif
39251
39252 #ifdef SUBTARGET_INSERT_ATTRIBUTES
39253 #undef TARGET_INSERT_ATTRIBUTES
39254 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
39255 #endif
39256
39257 #undef TARGET_MANGLE_TYPE
39258 #define TARGET_MANGLE_TYPE ix86_mangle_type
39259
39260 #ifndef TARGET_MACHO
39261 #undef TARGET_STACK_PROTECT_FAIL
39262 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
39263 #endif
39264
39265 #undef TARGET_FUNCTION_VALUE
39266 #define TARGET_FUNCTION_VALUE ix86_function_value
39267
39268 #undef TARGET_FUNCTION_VALUE_REGNO_P
39269 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
39270
39271 #undef TARGET_PROMOTE_FUNCTION_MODE
39272 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
39273
39274 #undef TARGET_SECONDARY_RELOAD
39275 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
39276
39277 #undef TARGET_CLASS_MAX_NREGS
39278 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
39279
39280 #undef TARGET_PREFERRED_RELOAD_CLASS
39281 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
39282 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
39283 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
39284 #undef TARGET_CLASS_LIKELY_SPILLED_P
39285 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
39286
39287 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
39288 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
39289 ix86_builtin_vectorization_cost
39290 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
39291 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
39292 ix86_vectorize_vec_perm_const_ok
39293 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
39294 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
39295 ix86_preferred_simd_mode
39296 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
39297 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
39298 ix86_autovectorize_vector_sizes
39299
39300 #undef TARGET_SET_CURRENT_FUNCTION
39301 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
39302
39303 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
39304 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
39305
39306 #undef TARGET_OPTION_SAVE
39307 #define TARGET_OPTION_SAVE ix86_function_specific_save
39308
39309 #undef TARGET_OPTION_RESTORE
39310 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
39311
39312 #undef TARGET_OPTION_PRINT
39313 #define TARGET_OPTION_PRINT ix86_function_specific_print
39314
39315 #undef TARGET_CAN_INLINE_P
39316 #define TARGET_CAN_INLINE_P ix86_can_inline_p
39317
39318 #undef TARGET_EXPAND_TO_RTL_HOOK
39319 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
39320
39321 #undef TARGET_LEGITIMATE_ADDRESS_P
39322 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
39323
39324 #undef TARGET_LEGITIMATE_CONSTANT_P
39325 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
39326
39327 #undef TARGET_FRAME_POINTER_REQUIRED
39328 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
39329
39330 #undef TARGET_CAN_ELIMINATE
39331 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
39332
39333 #undef TARGET_EXTRA_LIVE_ON_ENTRY
39334 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
39335
39336 #undef TARGET_ASM_CODE_END
39337 #define TARGET_ASM_CODE_END ix86_code_end
39338
39339 #undef TARGET_CONDITIONAL_REGISTER_USAGE
39340 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
39341
39342 #if TARGET_MACHO
39343 #undef TARGET_INIT_LIBFUNCS
39344 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
39345 #endif
39346
39347 struct gcc_target targetm = TARGET_INITIALIZER;
39348 \f
39349 #include "gt-i386.h"