1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "cgraph.h"
51 #include "gimple.h"
52 #include "dwarf2.h"
53 #include "df.h"
54 #include "tm-constrs.h"
55 #include "params.h"
56 #include "cselib.h"
57 #include "debug.h"
58 #include "sched-int.h"
59 #include "sbitmap.h"
60 #include "fibheap.h"
61 #include "opts.h"
62 #include "diagnostic.h"
63
64 enum upper_128bits_state
65 {
66 unknown = 0,
67 unused,
68 used
69 };
70
71 typedef struct block_info_def
72 {
73 /* State of the upper 128bits of AVX registers at exit. */
74 enum upper_128bits_state state;
75 /* TRUE if state of the upper 128bits of AVX registers is unchanged
76 in this block. */
77 bool unchanged;
78 /* TRUE if block has been processed. */
79 bool processed;
80 /* TRUE if block has been scanned. */
81 bool scanned;
82 /* Previous state of the upper 128bits of AVX registers at entry. */
83 enum upper_128bits_state prev;
84 } *block_info;
85
86 #define BLOCK_INFO(B) ((block_info) (B)->aux)
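/* Usage sketch (both calls appear later in this file): the aux field read by
   BLOCK_INFO is allocated per basic block before the vzeroupper scan and then
   caches the dataflow state, roughly:

     alloc_aux_for_blocks (sizeof (struct block_info_def));
     ...
     BLOCK_INFO (bb)->processed = true;
     ...
     free_aux_for_blocks ();
 */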
87
88 enum call_avx256_state
89 {
90 /* Callee returns 256bit AVX register. */
91 callee_return_avx256 = -1,
92 /* Callee returns and passes 256bit AVX register. */
93 callee_return_pass_avx256,
94 /* Callee passes 256bit AVX register. */
95 callee_pass_avx256,
 96       /* Callee neither returns nor passes a 256bit AVX register, or no
 97          256bit AVX register in the function return. */
98 call_no_avx256,
99 /* vzeroupper intrinsic. */
100 vzeroupper_intrinsic
101 };
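/* Illustration (a sketch; the vzeroupper pattern itself is defined in the
   machine description, not here): the value above is emitted as operand 0 of
   the UNSPEC_VOLATILE vzeroupper pattern and read back during the scan below
   with

     avx256 = INTVAL (XVECEXP (pat, 0, 0));

   so callee_return_avx256, call_no_avx256 etc. are simply integer tags
   attached to each vzeroupper insn.  */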
102
103 /* Check if a 256bit AVX register is referenced in stores. */
104
105 static void
106 check_avx256_stores (rtx dest, const_rtx set, void *data)
107 {
108 if ((REG_P (dest)
109 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
110 || (GET_CODE (set) == SET
111 && REG_P (SET_SRC (set))
112 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
113 {
114 enum upper_128bits_state *state
115 = (enum upper_128bits_state *) data;
116 *state = used;
117 }
118 }
119
120 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
121 in basic block BB. Delete it if upper 128bit AVX registers are
122 unused. If it isn't deleted, move it to just before a jump insn.
123
124 STATE is state of the upper 128bits of AVX registers at entry. */
125
126 static void
127 move_or_delete_vzeroupper_2 (basic_block bb,
128 enum upper_128bits_state state)
129 {
130 rtx insn, bb_end;
131 rtx vzeroupper_insn = NULL_RTX;
132 rtx pat;
133 int avx256;
134 bool unchanged;
135
136 if (BLOCK_INFO (bb)->unchanged)
137 {
138 if (dump_file)
139 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
140 bb->index, state);
141
142 BLOCK_INFO (bb)->state = state;
143 return;
144 }
145
146 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
147 {
148 if (dump_file)
149 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
150 bb->index, BLOCK_INFO (bb)->state);
151 return;
152 }
153
154 BLOCK_INFO (bb)->prev = state;
155
156 if (dump_file)
157 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
158 bb->index, state);
159
160 unchanged = true;
161
162 /* BB_END changes when it is deleted. */
163 bb_end = BB_END (bb);
164 insn = BB_HEAD (bb);
165 while (insn != bb_end)
166 {
167 insn = NEXT_INSN (insn);
168
169 if (!NONDEBUG_INSN_P (insn))
170 continue;
171
172 /* Move vzeroupper before jump/call. */
173 if (JUMP_P (insn) || CALL_P (insn))
174 {
175 if (!vzeroupper_insn)
176 continue;
177
178 if (PREV_INSN (insn) != vzeroupper_insn)
179 {
180 if (dump_file)
181 {
182 fprintf (dump_file, "Move vzeroupper after:\n");
183 print_rtl_single (dump_file, PREV_INSN (insn));
184 fprintf (dump_file, "before:\n");
185 print_rtl_single (dump_file, insn);
186 }
187 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
188 PREV_INSN (insn));
189 }
190 vzeroupper_insn = NULL_RTX;
191 continue;
192 }
193
194 pat = PATTERN (insn);
195
196 /* Check insn for vzeroupper intrinsic. */
197 if (GET_CODE (pat) == UNSPEC_VOLATILE
198 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
199 {
200 if (dump_file)
201 {
202 /* Found vzeroupper intrinsic. */
203 fprintf (dump_file, "Found vzeroupper:\n");
204 print_rtl_single (dump_file, insn);
205 }
206 }
207 else
208 {
209 /* Check insn for vzeroall intrinsic. */
210 if (GET_CODE (pat) == PARALLEL
211 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
212 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
213 {
214 state = unused;
215 unchanged = false;
216
217 /* Delete pending vzeroupper insertion. */
218 if (vzeroupper_insn)
219 {
220 delete_insn (vzeroupper_insn);
221 vzeroupper_insn = NULL_RTX;
222 }
223 }
224 else if (state != used)
225 {
226 note_stores (pat, check_avx256_stores, &state);
227 if (state == used)
228 unchanged = false;
229 }
230 continue;
231 }
232
233 /* Process vzeroupper intrinsic. */
234 avx256 = INTVAL (XVECEXP (pat, 0, 0));
235
236 if (state == unused)
237 {
 238 	      /* Since the upper 128bits are cleared, a 256bit AVX register
 239 		 cannot be passed to the callee.  We only need to check
 240 		 whether the callee returns a 256bit AVX register. */
241 if (avx256 == callee_return_avx256)
242 {
243 state = used;
244 unchanged = false;
245 }
246
247 /* Remove unnecessary vzeroupper since upper 128bits are
248 cleared. */
249 if (dump_file)
250 {
251 fprintf (dump_file, "Delete redundant vzeroupper:\n");
252 print_rtl_single (dump_file, insn);
253 }
254 delete_insn (insn);
255 }
256 else
257 {
258 /* Set state to UNUSED if callee doesn't return 256bit AVX
259 register. */
260 if (avx256 != callee_return_pass_avx256)
261 state = unused;
262
263 if (avx256 == callee_return_pass_avx256
264 || avx256 == callee_pass_avx256)
265 {
 266 	      /* Must remove the vzeroupper since arguments are passed to the
 267 		 callee in 256bit AVX registers. */
268 if (dump_file)
269 {
270 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
271 print_rtl_single (dump_file, insn);
272 }
273 delete_insn (insn);
274 }
275 else
276 {
277 vzeroupper_insn = insn;
278 unchanged = false;
279 }
280 }
281 }
282
283 BLOCK_INFO (bb)->state = state;
284 BLOCK_INFO (bb)->unchanged = unchanged;
285 BLOCK_INFO (bb)->scanned = true;
286
287 if (dump_file)
288 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
289 bb->index, unchanged ? "unchanged" : "changed",
290 state);
291 }
292
293 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
294 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
295 as USED if UNKNOWN_IS_UNUSED is true. Return TRUE if the exit
296 state is changed. */
297
298 static bool
299 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
300 {
301 edge e;
302 edge_iterator ei;
303 enum upper_128bits_state state, old_state, new_state;
304 bool seen_unknown;
305
306 if (dump_file)
307 fprintf (dump_file, " Process [bb %i]: status: %d\n",
308 block->index, BLOCK_INFO (block)->processed);
309
310 if (BLOCK_INFO (block)->processed)
311 return false;
312
313 state = unused;
314
315 /* Check all predecessor edges of this block. */
316 seen_unknown = false;
317 FOR_EACH_EDGE (e, ei, block->preds)
318 {
319 if (e->src == block)
320 continue;
321 switch (BLOCK_INFO (e->src)->state)
322 {
323 case unknown:
324 if (!unknown_is_unused)
325 seen_unknown = true;
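	  /* FALLTHRU */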
326 case unused:
327 break;
328 case used:
329 state = used;
330 goto done;
331 }
332 }
333
334 if (seen_unknown)
335 state = unknown;
336
337 done:
338 old_state = BLOCK_INFO (block)->state;
339 move_or_delete_vzeroupper_2 (block, state);
340 new_state = BLOCK_INFO (block)->state;
341
342 if (state != unknown || new_state == used)
343 BLOCK_INFO (block)->processed = true;
344
345 /* Need to rescan if the upper 128bits of AVX registers are changed
346 to USED at exit. */
347 if (new_state != old_state)
348 {
349 if (new_state == used)
350 cfun->machine->rescan_vzeroupper_p = 1;
351 return true;
352 }
353 else
354 return false;
355 }
356
357 /* Go through the instruction stream looking for vzeroupper. Delete
358 it if upper 128bit AVX registers are unused. If it isn't deleted,
359 move it to just before a jump insn. */
360
361 static void
362 move_or_delete_vzeroupper (void)
363 {
364 edge e;
365 edge_iterator ei;
366 basic_block bb;
367 fibheap_t worklist, pending, fibheap_swap;
368 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
369 int *bb_order;
370 int *rc_order;
371 int i;
372
373 /* Set up block info for each basic block. */
374 alloc_aux_for_blocks (sizeof (struct block_info_def));
375
376 /* Process outgoing edges of entry point. */
377 if (dump_file)
378 fprintf (dump_file, "Process outgoing edges of entry point\n");
379
380 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
381 {
382 move_or_delete_vzeroupper_2 (e->dest,
383 cfun->machine->caller_pass_avx256_p
384 ? used : unused);
385 BLOCK_INFO (e->dest)->processed = true;
386 }
387
388 /* Compute reverse completion order of depth first search of the CFG
389 so that the data-flow runs faster. */
390 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
391 bb_order = XNEWVEC (int, last_basic_block);
392 pre_and_rev_post_order_compute (NULL, rc_order, false);
393 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
394 bb_order[rc_order[i]] = i;
395 free (rc_order);
396
397 worklist = fibheap_new ();
398 pending = fibheap_new ();
399 visited = sbitmap_alloc (last_basic_block);
400 in_worklist = sbitmap_alloc (last_basic_block);
401 in_pending = sbitmap_alloc (last_basic_block);
402 sbitmap_zero (in_worklist);
403
404 /* Don't check outgoing edges of entry point. */
405 sbitmap_ones (in_pending);
406 FOR_EACH_BB (bb)
407 if (BLOCK_INFO (bb)->processed)
408 RESET_BIT (in_pending, bb->index);
409 else
410 {
411 move_or_delete_vzeroupper_1 (bb, false);
412 fibheap_insert (pending, bb_order[bb->index], bb);
413 }
414
415 if (dump_file)
416 fprintf (dump_file, "Check remaining basic blocks\n");
417
418 while (!fibheap_empty (pending))
419 {
420 fibheap_swap = pending;
421 pending = worklist;
422 worklist = fibheap_swap;
423 sbitmap_swap = in_pending;
424 in_pending = in_worklist;
425 in_worklist = sbitmap_swap;
426
427 sbitmap_zero (visited);
428
429 cfun->machine->rescan_vzeroupper_p = 0;
430
431 while (!fibheap_empty (worklist))
432 {
433 bb = (basic_block) fibheap_extract_min (worklist);
434 RESET_BIT (in_worklist, bb->index);
435 gcc_assert (!TEST_BIT (visited, bb->index));
436 if (!TEST_BIT (visited, bb->index))
437 {
438 edge_iterator ei;
439
440 SET_BIT (visited, bb->index);
441
442 if (move_or_delete_vzeroupper_1 (bb, false))
443 FOR_EACH_EDGE (e, ei, bb->succs)
444 {
445 if (e->dest == EXIT_BLOCK_PTR
446 || BLOCK_INFO (e->dest)->processed)
447 continue;
448
449 if (TEST_BIT (visited, e->dest->index))
450 {
451 if (!TEST_BIT (in_pending, e->dest->index))
452 {
453 /* Send E->DEST to next round. */
454 SET_BIT (in_pending, e->dest->index);
455 fibheap_insert (pending,
456 bb_order[e->dest->index],
457 e->dest);
458 }
459 }
460 else if (!TEST_BIT (in_worklist, e->dest->index))
461 {
462 /* Add E->DEST to current round. */
463 SET_BIT (in_worklist, e->dest->index);
464 fibheap_insert (worklist, bb_order[e->dest->index],
465 e->dest);
466 }
467 }
468 }
469 }
470
471 if (!cfun->machine->rescan_vzeroupper_p)
472 break;
473 }
474
475 free (bb_order);
476 fibheap_delete (worklist);
477 fibheap_delete (pending);
478 sbitmap_free (visited);
479 sbitmap_free (in_worklist);
480 sbitmap_free (in_pending);
481
482 if (dump_file)
483 fprintf (dump_file, "Process remaining basic blocks\n");
484
485 FOR_EACH_BB (bb)
486 move_or_delete_vzeroupper_1 (bb, true);
487
488 free_aux_for_blocks ();
489 }
490
491 static rtx legitimize_dllimport_symbol (rtx, bool);
492
493 #ifndef CHECK_STACK_LIMIT
494 #define CHECK_STACK_LIMIT (-1)
495 #endif
496
497 /* Return index of given mode in mult and division cost tables. */
498 #define MODE_INDEX(mode) \
499 ((mode) == QImode ? 0 \
500 : (mode) == HImode ? 1 \
501 : (mode) == SImode ? 2 \
502 : (mode) == DImode ? 3 \
503 : 4)
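/* Example (a sketch; mult_init and divide are the per-mode cost arrays
   declared for struct processor_costs in i386.h):

     ix86_cost->mult_init[MODE_INDEX (SImode)];   -- SImode multiply start cost
     ix86_cost->divide[MODE_INDEX (DImode)];      -- DImode divide/mod cost

   Modes other than QI/HI/SI/DImode all map to slot 4, the "other" entry.  */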
504
505 /* Processor costs (relative to an add) */
506 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
507 #define COSTS_N_BYTES(N) ((N) * 2)
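/* Worked example under the assumption above: with COSTS_N_INSNS (N) == (N) * 4
   and a 2-byte add, COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), i.e. in the
   size table an instruction the size of one add is charged the same as a
   single instruction is on the speed scale.  */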
508
509 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
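/* Reading guide for the memcpy/memset strategy entries below (a sketch,
   assuming the stringop_algs layout declared in i386.h): an entry such as

     {libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}

   means "use a libcall when the block size is unknown at compile time; for
   known sizes up to 256 bytes use rep movsl; for anything larger (max == -1
   ends the table) fall back to a libcall".  The second entry in each pair is
   used for 64bit code, which is why 32bit-only tunings below put
   DUMMY_STRINGOP_ALGS there.  */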
510
511 const
512 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
513 COSTS_N_BYTES (2), /* cost of an add instruction */
514 COSTS_N_BYTES (3), /* cost of a lea instruction */
515 COSTS_N_BYTES (2), /* variable shift costs */
516 COSTS_N_BYTES (3), /* constant shift costs */
517 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
518 COSTS_N_BYTES (3), /* HI */
519 COSTS_N_BYTES (3), /* SI */
520 COSTS_N_BYTES (3), /* DI */
521 COSTS_N_BYTES (5)}, /* other */
522 0, /* cost of multiply per each bit set */
523 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
524 COSTS_N_BYTES (3), /* HI */
525 COSTS_N_BYTES (3), /* SI */
526 COSTS_N_BYTES (3), /* DI */
527 COSTS_N_BYTES (5)}, /* other */
528 COSTS_N_BYTES (3), /* cost of movsx */
529 COSTS_N_BYTES (3), /* cost of movzx */
530 0, /* "large" insn */
531 2, /* MOVE_RATIO */
532 2, /* cost for loading QImode using movzbl */
533 {2, 2, 2}, /* cost of loading integer registers
534 in QImode, HImode and SImode.
535 Relative to reg-reg move (2). */
536 {2, 2, 2}, /* cost of storing integer registers */
537 2, /* cost of reg,reg fld/fst */
538 {2, 2, 2}, /* cost of loading fp registers
539 in SFmode, DFmode and XFmode */
540 {2, 2, 2}, /* cost of storing fp registers
541 in SFmode, DFmode and XFmode */
542 3, /* cost of moving MMX register */
543 {3, 3}, /* cost of loading MMX registers
544 in SImode and DImode */
545 {3, 3}, /* cost of storing MMX registers
546 in SImode and DImode */
547 3, /* cost of moving SSE register */
548 {3, 3, 3}, /* cost of loading SSE registers
549 in SImode, DImode and TImode */
550 {3, 3, 3}, /* cost of storing SSE registers
551 in SImode, DImode and TImode */
552 3, /* MMX or SSE register to integer */
553 0, /* size of l1 cache */
554 0, /* size of l2 cache */
555 0, /* size of prefetch block */
556 0, /* number of parallel prefetches */
557 2, /* Branch cost */
558 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
559 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
560 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
561 COSTS_N_BYTES (2), /* cost of FABS instruction. */
562 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
563 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
564 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
565 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 1, /* scalar_stmt_cost. */
569 1, /* scalar load_cost. */
570 1, /* scalar_store_cost. */
571 1, /* vec_stmt_cost. */
572 1, /* vec_to_scalar_cost. */
573 1, /* scalar_to_vec_cost. */
574 1, /* vec_align_load_cost. */
575 1, /* vec_unalign_load_cost. */
576 1, /* vec_store_cost. */
577 1, /* cond_taken_branch_cost. */
578 1, /* cond_not_taken_branch_cost. */
579 };
580
581 /* Processor costs (relative to an add) */
582 static const
583 struct processor_costs i386_cost = { /* 386 specific costs */
584 COSTS_N_INSNS (1), /* cost of an add instruction */
585 COSTS_N_INSNS (1), /* cost of a lea instruction */
586 COSTS_N_INSNS (3), /* variable shift costs */
587 COSTS_N_INSNS (2), /* constant shift costs */
588 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
589 COSTS_N_INSNS (6), /* HI */
590 COSTS_N_INSNS (6), /* SI */
591 COSTS_N_INSNS (6), /* DI */
592 COSTS_N_INSNS (6)}, /* other */
593 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
594 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
595 COSTS_N_INSNS (23), /* HI */
596 COSTS_N_INSNS (23), /* SI */
597 COSTS_N_INSNS (23), /* DI */
598 COSTS_N_INSNS (23)}, /* other */
599 COSTS_N_INSNS (3), /* cost of movsx */
600 COSTS_N_INSNS (2), /* cost of movzx */
601 15, /* "large" insn */
602 3, /* MOVE_RATIO */
603 4, /* cost for loading QImode using movzbl */
604 {2, 4, 2}, /* cost of loading integer registers
605 in QImode, HImode and SImode.
606 Relative to reg-reg move (2). */
607 {2, 4, 2}, /* cost of storing integer registers */
608 2, /* cost of reg,reg fld/fst */
609 {8, 8, 8}, /* cost of loading fp registers
610 in SFmode, DFmode and XFmode */
611 {8, 8, 8}, /* cost of storing fp registers
612 in SFmode, DFmode and XFmode */
613 2, /* cost of moving MMX register */
614 {4, 8}, /* cost of loading MMX registers
615 in SImode and DImode */
616 {4, 8}, /* cost of storing MMX registers
617 in SImode and DImode */
618 2, /* cost of moving SSE register */
619 {4, 8, 16}, /* cost of loading SSE registers
620 in SImode, DImode and TImode */
621 {4, 8, 16}, /* cost of storing SSE registers
622 in SImode, DImode and TImode */
623 3, /* MMX or SSE register to integer */
624 0, /* size of l1 cache */
625 0, /* size of l2 cache */
626 0, /* size of prefetch block */
627 0, /* number of parallel prefetches */
628 1, /* Branch cost */
629 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
630 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
631 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
632 COSTS_N_INSNS (22), /* cost of FABS instruction. */
633 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
634 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
635 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
636 DUMMY_STRINGOP_ALGS},
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 1, /* scalar_stmt_cost. */
640 1, /* scalar load_cost. */
641 1, /* scalar_store_cost. */
642 1, /* vec_stmt_cost. */
643 1, /* vec_to_scalar_cost. */
644 1, /* scalar_to_vec_cost. */
645 1, /* vec_align_load_cost. */
646 2, /* vec_unalign_load_cost. */
647 1, /* vec_store_cost. */
648 3, /* cond_taken_branch_cost. */
649 1, /* cond_not_taken_branch_cost. */
650 };
651
652 static const
653 struct processor_costs i486_cost = { /* 486 specific costs */
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (1), /* cost of a lea instruction */
656 COSTS_N_INSNS (3), /* variable shift costs */
657 COSTS_N_INSNS (2), /* constant shift costs */
658 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (12), /* HI */
660 COSTS_N_INSNS (12), /* SI */
661 COSTS_N_INSNS (12), /* DI */
662 COSTS_N_INSNS (12)}, /* other */
663 1, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (40), /* HI */
666 COSTS_N_INSNS (40), /* SI */
667 COSTS_N_INSNS (40), /* DI */
668 COSTS_N_INSNS (40)}, /* other */
669 COSTS_N_INSNS (3), /* cost of movsx */
670 COSTS_N_INSNS (2), /* cost of movzx */
671 15, /* "large" insn */
672 3, /* MOVE_RATIO */
673 4, /* cost for loading QImode using movzbl */
674 {2, 4, 2}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {2, 4, 2}, /* cost of storing integer registers */
678 2, /* cost of reg,reg fld/fst */
679 {8, 8, 8}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {8, 8, 8}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {4, 8}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {4, 8}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {4, 8, 16}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {4, 8, 16}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 3, /* MMX or SSE register to integer */
694 4, /* size of l1 cache. 486 has 8kB cache
695 shared for code and data, so 4kB is
696 not really precise. */
697 4, /* size of l2 cache */
698 0, /* size of prefetch block */
699 0, /* number of parallel prefetches */
700 1, /* Branch cost */
701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
702 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
703 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
704 COSTS_N_INSNS (3), /* cost of FABS instruction. */
705 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
706 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
707 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
708 DUMMY_STRINGOP_ALGS},
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 1, /* scalar_stmt_cost. */
712 1, /* scalar load_cost. */
713 1, /* scalar_store_cost. */
714 1, /* vec_stmt_cost. */
715 1, /* vec_to_scalar_cost. */
716 1, /* scalar_to_vec_cost. */
717 1, /* vec_align_load_cost. */
718 2, /* vec_unalign_load_cost. */
719 1, /* vec_store_cost. */
720 3, /* cond_taken_branch_cost. */
721 1, /* cond_not_taken_branch_cost. */
722 };
723
724 static const
725 struct processor_costs pentium_cost = {
726 COSTS_N_INSNS (1), /* cost of an add instruction */
727 COSTS_N_INSNS (1), /* cost of a lea instruction */
728 COSTS_N_INSNS (4), /* variable shift costs */
729 COSTS_N_INSNS (1), /* constant shift costs */
730 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
731 COSTS_N_INSNS (11), /* HI */
732 COSTS_N_INSNS (11), /* SI */
733 COSTS_N_INSNS (11), /* DI */
734 COSTS_N_INSNS (11)}, /* other */
735 0, /* cost of multiply per each bit set */
736 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
737 COSTS_N_INSNS (25), /* HI */
738 COSTS_N_INSNS (25), /* SI */
739 COSTS_N_INSNS (25), /* DI */
740 COSTS_N_INSNS (25)}, /* other */
741 COSTS_N_INSNS (3), /* cost of movsx */
742 COSTS_N_INSNS (2), /* cost of movzx */
743 8, /* "large" insn */
744 6, /* MOVE_RATIO */
745 6, /* cost for loading QImode using movzbl */
746 {2, 4, 2}, /* cost of loading integer registers
747 in QImode, HImode and SImode.
748 Relative to reg-reg move (2). */
749 {2, 4, 2}, /* cost of storing integer registers */
750 2, /* cost of reg,reg fld/fst */
751 {2, 2, 6}, /* cost of loading fp registers
752 in SFmode, DFmode and XFmode */
753 {4, 4, 6}, /* cost of storing fp registers
754 in SFmode, DFmode and XFmode */
755 8, /* cost of moving MMX register */
756 {8, 8}, /* cost of loading MMX registers
757 in SImode and DImode */
758 {8, 8}, /* cost of storing MMX registers
759 in SImode and DImode */
760 2, /* cost of moving SSE register */
761 {4, 8, 16}, /* cost of loading SSE registers
762 in SImode, DImode and TImode */
763 {4, 8, 16}, /* cost of storing SSE registers
764 in SImode, DImode and TImode */
765 3, /* MMX or SSE register to integer */
766 8, /* size of l1 cache. */
767 8, /* size of l2 cache */
768 0, /* size of prefetch block */
769 0, /* number of parallel prefetches */
770 2, /* Branch cost */
771 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
772 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
773 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
774 COSTS_N_INSNS (1), /* cost of FABS instruction. */
775 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
776 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
777 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
778 DUMMY_STRINGOP_ALGS},
779 {{libcall, {{-1, rep_prefix_4_byte}}},
780 DUMMY_STRINGOP_ALGS},
781 1, /* scalar_stmt_cost. */
782 1, /* scalar load_cost. */
783 1, /* scalar_store_cost. */
784 1, /* vec_stmt_cost. */
785 1, /* vec_to_scalar_cost. */
786 1, /* scalar_to_vec_cost. */
787 1, /* vec_align_load_cost. */
788 2, /* vec_unalign_load_cost. */
789 1, /* vec_store_cost. */
790 3, /* cond_taken_branch_cost. */
791 1, /* cond_not_taken_branch_cost. */
792 };
793
794 static const
795 struct processor_costs pentiumpro_cost = {
796 COSTS_N_INSNS (1), /* cost of an add instruction */
797 COSTS_N_INSNS (1), /* cost of a lea instruction */
798 COSTS_N_INSNS (1), /* variable shift costs */
799 COSTS_N_INSNS (1), /* constant shift costs */
800 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
801 COSTS_N_INSNS (4), /* HI */
802 COSTS_N_INSNS (4), /* SI */
803 COSTS_N_INSNS (4), /* DI */
804 COSTS_N_INSNS (4)}, /* other */
805 0, /* cost of multiply per each bit set */
806 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
807 COSTS_N_INSNS (17), /* HI */
808 COSTS_N_INSNS (17), /* SI */
809 COSTS_N_INSNS (17), /* DI */
810 COSTS_N_INSNS (17)}, /* other */
811 COSTS_N_INSNS (1), /* cost of movsx */
812 COSTS_N_INSNS (1), /* cost of movzx */
813 8, /* "large" insn */
814 6, /* MOVE_RATIO */
815 2, /* cost for loading QImode using movzbl */
816 {4, 4, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 {2, 2, 2}, /* cost of storing integer registers */
820 2, /* cost of reg,reg fld/fst */
821 {2, 2, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 {4, 4, 6}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 2, /* cost of moving MMX register */
826 {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 2, /* cost of moving SSE register */
831 {2, 2, 8}, /* cost of loading SSE registers
832 in SImode, DImode and TImode */
833 {2, 2, 8}, /* cost of storing SSE registers
834 in SImode, DImode and TImode */
835 3, /* MMX or SSE register to integer */
836 8, /* size of l1 cache. */
837 256, /* size of l2 cache */
838 32, /* size of prefetch block */
839 6, /* number of parallel prefetches */
840 2, /* Branch cost */
841 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
842 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
843 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
844 COSTS_N_INSNS (2), /* cost of FABS instruction. */
845 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
846 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
 847   /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 848      (we ensure the alignment).  For small blocks an inline loop is still a
 849      noticeable win, for bigger blocks either rep movsl or rep movsb is the
 850      way to go.  Rep movsb apparently has a more expensive startup time in
 851      the CPU, but after 4K the difference is down in the noise. */
852 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
853 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
854 DUMMY_STRINGOP_ALGS},
855 {{rep_prefix_4_byte, {{1024, unrolled_loop},
856 {8192, rep_prefix_4_byte}, {-1, libcall}}},
857 DUMMY_STRINGOP_ALGS},
858 1, /* scalar_stmt_cost. */
859 1, /* scalar load_cost. */
860 1, /* scalar_store_cost. */
861 1, /* vec_stmt_cost. */
862 1, /* vec_to_scalar_cost. */
863 1, /* scalar_to_vec_cost. */
864 1, /* vec_align_load_cost. */
865 2, /* vec_unalign_load_cost. */
866 1, /* vec_store_cost. */
867 3, /* cond_taken_branch_cost. */
868 1, /* cond_not_taken_branch_cost. */
869 };
870
871 static const
872 struct processor_costs geode_cost = {
873 COSTS_N_INSNS (1), /* cost of an add instruction */
874 COSTS_N_INSNS (1), /* cost of a lea instruction */
875 COSTS_N_INSNS (2), /* variable shift costs */
876 COSTS_N_INSNS (1), /* constant shift costs */
877 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
878 COSTS_N_INSNS (4), /* HI */
879 COSTS_N_INSNS (7), /* SI */
880 COSTS_N_INSNS (7), /* DI */
881 COSTS_N_INSNS (7)}, /* other */
882 0, /* cost of multiply per each bit set */
883 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
884 COSTS_N_INSNS (23), /* HI */
885 COSTS_N_INSNS (39), /* SI */
886 COSTS_N_INSNS (39), /* DI */
887 COSTS_N_INSNS (39)}, /* other */
888 COSTS_N_INSNS (1), /* cost of movsx */
889 COSTS_N_INSNS (1), /* cost of movzx */
890 8, /* "large" insn */
891 4, /* MOVE_RATIO */
892 1, /* cost for loading QImode using movzbl */
893 {1, 1, 1}, /* cost of loading integer registers
894 in QImode, HImode and SImode.
895 Relative to reg-reg move (2). */
896 {1, 1, 1}, /* cost of storing integer registers */
897 1, /* cost of reg,reg fld/fst */
898 {1, 1, 1}, /* cost of loading fp registers
899 in SFmode, DFmode and XFmode */
900 {4, 6, 6}, /* cost of storing fp registers
901 in SFmode, DFmode and XFmode */
902
903 1, /* cost of moving MMX register */
904 {1, 1}, /* cost of loading MMX registers
905 in SImode and DImode */
906 {1, 1}, /* cost of storing MMX registers
907 in SImode and DImode */
908 1, /* cost of moving SSE register */
909 {1, 1, 1}, /* cost of loading SSE registers
910 in SImode, DImode and TImode */
911 {1, 1, 1}, /* cost of storing SSE registers
912 in SImode, DImode and TImode */
913 1, /* MMX or SSE register to integer */
914 64, /* size of l1 cache. */
915 128, /* size of l2 cache. */
916 32, /* size of prefetch block */
917 1, /* number of parallel prefetches */
918 1, /* Branch cost */
919 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
920 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
921 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
922 COSTS_N_INSNS (1), /* cost of FABS instruction. */
923 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
924 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
925 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
926 DUMMY_STRINGOP_ALGS},
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 1, /* scalar_stmt_cost. */
930 1, /* scalar load_cost. */
931 1, /* scalar_store_cost. */
932 1, /* vec_stmt_cost. */
933 1, /* vec_to_scalar_cost. */
934 1, /* scalar_to_vec_cost. */
935 1, /* vec_align_load_cost. */
936 2, /* vec_unalign_load_cost. */
937 1, /* vec_store_cost. */
938 3, /* cond_taken_branch_cost. */
939 1, /* cond_not_taken_branch_cost. */
940 };
941
942 static const
943 struct processor_costs k6_cost = {
944 COSTS_N_INSNS (1), /* cost of an add instruction */
945 COSTS_N_INSNS (2), /* cost of a lea instruction */
946 COSTS_N_INSNS (1), /* variable shift costs */
947 COSTS_N_INSNS (1), /* constant shift costs */
948 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
949 COSTS_N_INSNS (3), /* HI */
950 COSTS_N_INSNS (3), /* SI */
951 COSTS_N_INSNS (3), /* DI */
952 COSTS_N_INSNS (3)}, /* other */
953 0, /* cost of multiply per each bit set */
954 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
955 COSTS_N_INSNS (18), /* HI */
956 COSTS_N_INSNS (18), /* SI */
957 COSTS_N_INSNS (18), /* DI */
958 COSTS_N_INSNS (18)}, /* other */
959 COSTS_N_INSNS (2), /* cost of movsx */
960 COSTS_N_INSNS (2), /* cost of movzx */
961 8, /* "large" insn */
962 4, /* MOVE_RATIO */
963 3, /* cost for loading QImode using movzbl */
964 {4, 5, 4}, /* cost of loading integer registers
965 in QImode, HImode and SImode.
966 Relative to reg-reg move (2). */
967 {2, 3, 2}, /* cost of storing integer registers */
968 4, /* cost of reg,reg fld/fst */
969 {6, 6, 6}, /* cost of loading fp registers
970 in SFmode, DFmode and XFmode */
971 {4, 4, 4}, /* cost of storing fp registers
972 in SFmode, DFmode and XFmode */
973 2, /* cost of moving MMX register */
974 {2, 2}, /* cost of loading MMX registers
975 in SImode and DImode */
976 {2, 2}, /* cost of storing MMX registers
977 in SImode and DImode */
978 2, /* cost of moving SSE register */
979 {2, 2, 8}, /* cost of loading SSE registers
980 in SImode, DImode and TImode */
981 {2, 2, 8}, /* cost of storing SSE registers
982 in SImode, DImode and TImode */
983 6, /* MMX or SSE register to integer */
984 32, /* size of l1 cache. */
985 32, /* size of l2 cache. Some models
986 have integrated l2 cache, but
987 optimizing for k6 is not important
988 enough to worry about that. */
989 32, /* size of prefetch block */
990 1, /* number of parallel prefetches */
991 1, /* Branch cost */
992 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
993 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
994 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
995 COSTS_N_INSNS (2), /* cost of FABS instruction. */
996 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
997 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
998 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
999 DUMMY_STRINGOP_ALGS},
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 1, /* scalar_stmt_cost. */
1003 1, /* scalar load_cost. */
1004 1, /* scalar_store_cost. */
1005 1, /* vec_stmt_cost. */
1006 1, /* vec_to_scalar_cost. */
1007 1, /* scalar_to_vec_cost. */
1008 1, /* vec_align_load_cost. */
1009 2, /* vec_unalign_load_cost. */
1010 1, /* vec_store_cost. */
1011 3, /* cond_taken_branch_cost. */
1012 1, /* cond_not_taken_branch_cost. */
1013 };
1014
1015 static const
1016 struct processor_costs athlon_cost = {
1017 COSTS_N_INSNS (1), /* cost of an add instruction */
1018 COSTS_N_INSNS (2), /* cost of a lea instruction */
1019 COSTS_N_INSNS (1), /* variable shift costs */
1020 COSTS_N_INSNS (1), /* constant shift costs */
1021 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1022 COSTS_N_INSNS (5), /* HI */
1023 COSTS_N_INSNS (5), /* SI */
1024 COSTS_N_INSNS (5), /* DI */
1025 COSTS_N_INSNS (5)}, /* other */
1026 0, /* cost of multiply per each bit set */
1027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1028 COSTS_N_INSNS (26), /* HI */
1029 COSTS_N_INSNS (42), /* SI */
1030 COSTS_N_INSNS (74), /* DI */
1031 COSTS_N_INSNS (74)}, /* other */
1032 COSTS_N_INSNS (1), /* cost of movsx */
1033 COSTS_N_INSNS (1), /* cost of movzx */
1034 8, /* "large" insn */
1035 9, /* MOVE_RATIO */
1036 4, /* cost for loading QImode using movzbl */
1037 {3, 4, 3}, /* cost of loading integer registers
1038 in QImode, HImode and SImode.
1039 Relative to reg-reg move (2). */
1040 {3, 4, 3}, /* cost of storing integer registers */
1041 4, /* cost of reg,reg fld/fst */
1042 {4, 4, 12}, /* cost of loading fp registers
1043 in SFmode, DFmode and XFmode */
1044 {6, 6, 8}, /* cost of storing fp registers
1045 in SFmode, DFmode and XFmode */
1046 2, /* cost of moving MMX register */
1047 {4, 4}, /* cost of loading MMX registers
1048 in SImode and DImode */
1049 {4, 4}, /* cost of storing MMX registers
1050 in SImode and DImode */
1051 2, /* cost of moving SSE register */
1052 {4, 4, 6}, /* cost of loading SSE registers
1053 in SImode, DImode and TImode */
1054 {4, 4, 5}, /* cost of storing SSE registers
1055 in SImode, DImode and TImode */
1056 5, /* MMX or SSE register to integer */
1057 64, /* size of l1 cache. */
1058 256, /* size of l2 cache. */
1059 64, /* size of prefetch block */
1060 6, /* number of parallel prefetches */
1061 5, /* Branch cost */
1062 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1063 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1064 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1065 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1066 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1067 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1068 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1069 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1070 128 bytes for memset. */
1071 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1072 DUMMY_STRINGOP_ALGS},
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 1, /* scalar_stmt_cost. */
1076 1, /* scalar load_cost. */
1077 1, /* scalar_store_cost. */
1078 1, /* vec_stmt_cost. */
1079 1, /* vec_to_scalar_cost. */
1080 1, /* scalar_to_vec_cost. */
1081 1, /* vec_align_load_cost. */
1082 2, /* vec_unalign_load_cost. */
1083 1, /* vec_store_cost. */
1084 3, /* cond_taken_branch_cost. */
1085 1, /* cond_not_taken_branch_cost. */
1086 };
1087
1088 static const
1089 struct processor_costs k8_cost = {
1090 COSTS_N_INSNS (1), /* cost of an add instruction */
1091 COSTS_N_INSNS (2), /* cost of a lea instruction */
1092 COSTS_N_INSNS (1), /* variable shift costs */
1093 COSTS_N_INSNS (1), /* constant shift costs */
1094 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1095 COSTS_N_INSNS (4), /* HI */
1096 COSTS_N_INSNS (3), /* SI */
1097 COSTS_N_INSNS (4), /* DI */
1098 COSTS_N_INSNS (5)}, /* other */
1099 0, /* cost of multiply per each bit set */
1100 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1101 COSTS_N_INSNS (26), /* HI */
1102 COSTS_N_INSNS (42), /* SI */
1103 COSTS_N_INSNS (74), /* DI */
1104 COSTS_N_INSNS (74)}, /* other */
1105 COSTS_N_INSNS (1), /* cost of movsx */
1106 COSTS_N_INSNS (1), /* cost of movzx */
1107 8, /* "large" insn */
1108 9, /* MOVE_RATIO */
1109 4, /* cost for loading QImode using movzbl */
1110 {3, 4, 3}, /* cost of loading integer registers
1111 in QImode, HImode and SImode.
1112 Relative to reg-reg move (2). */
1113 {3, 4, 3}, /* cost of storing integer registers */
1114 4, /* cost of reg,reg fld/fst */
1115 {4, 4, 12}, /* cost of loading fp registers
1116 in SFmode, DFmode and XFmode */
1117 {6, 6, 8}, /* cost of storing fp registers
1118 in SFmode, DFmode and XFmode */
1119 2, /* cost of moving MMX register */
1120 {3, 3}, /* cost of loading MMX registers
1121 in SImode and DImode */
1122 {4, 4}, /* cost of storing MMX registers
1123 in SImode and DImode */
1124 2, /* cost of moving SSE register */
1125 {4, 3, 6}, /* cost of loading SSE registers
1126 in SImode, DImode and TImode */
1127 {4, 4, 5}, /* cost of storing SSE registers
1128 in SImode, DImode and TImode */
1129 5, /* MMX or SSE register to integer */
1130 64, /* size of l1 cache. */
1131 512, /* size of l2 cache. */
1132 64, /* size of prefetch block */
 1133   /* New AMD processors never drop prefetches; if they cannot be performed
 1134      immediately, they are queued.  We set the number of simultaneous
 1135      prefetches to a large constant to reflect this (it is probably not a
 1136      good idea to leave the number of prefetches completely unlimited, as
 1137      their execution also takes some time). */
1138 100, /* number of parallel prefetches */
1139 3, /* Branch cost */
1140 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1141 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1142 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1143 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1144 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1145 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
 1146   /* K8 has an optimized REP instruction for medium-sized blocks, but for
 1147      very small blocks it is better to use a loop.  For large blocks, a
 1148      libcall can do non-temporal accesses and beat inline code considerably. */
1149 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1150 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1151 {{libcall, {{8, loop}, {24, unrolled_loop},
1152 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1153 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1154 4, /* scalar_stmt_cost. */
1155 2, /* scalar load_cost. */
1156 2, /* scalar_store_cost. */
1157 5, /* vec_stmt_cost. */
1158 0, /* vec_to_scalar_cost. */
1159 2, /* scalar_to_vec_cost. */
1160 2, /* vec_align_load_cost. */
1161 3, /* vec_unalign_load_cost. */
1162 3, /* vec_store_cost. */
1163 3, /* cond_taken_branch_cost. */
1164 2, /* cond_not_taken_branch_cost. */
1165 };
1166
1167 struct processor_costs amdfam10_cost = {
1168 COSTS_N_INSNS (1), /* cost of an add instruction */
1169 COSTS_N_INSNS (2), /* cost of a lea instruction */
1170 COSTS_N_INSNS (1), /* variable shift costs */
1171 COSTS_N_INSNS (1), /* constant shift costs */
1172 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1173 COSTS_N_INSNS (4), /* HI */
1174 COSTS_N_INSNS (3), /* SI */
1175 COSTS_N_INSNS (4), /* DI */
1176 COSTS_N_INSNS (5)}, /* other */
1177 0, /* cost of multiply per each bit set */
1178 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1179 COSTS_N_INSNS (35), /* HI */
1180 COSTS_N_INSNS (51), /* SI */
1181 COSTS_N_INSNS (83), /* DI */
1182 COSTS_N_INSNS (83)}, /* other */
1183 COSTS_N_INSNS (1), /* cost of movsx */
1184 COSTS_N_INSNS (1), /* cost of movzx */
1185 8, /* "large" insn */
1186 9, /* MOVE_RATIO */
1187 4, /* cost for loading QImode using movzbl */
1188 {3, 4, 3}, /* cost of loading integer registers
1189 in QImode, HImode and SImode.
1190 Relative to reg-reg move (2). */
1191 {3, 4, 3}, /* cost of storing integer registers */
1192 4, /* cost of reg,reg fld/fst */
1193 {4, 4, 12}, /* cost of loading fp registers
1194 in SFmode, DFmode and XFmode */
1195 {6, 6, 8}, /* cost of storing fp registers
1196 in SFmode, DFmode and XFmode */
1197 2, /* cost of moving MMX register */
1198 {3, 3}, /* cost of loading MMX registers
1199 in SImode and DImode */
1200 {4, 4}, /* cost of storing MMX registers
1201 in SImode and DImode */
1202 2, /* cost of moving SSE register */
1203 {4, 4, 3}, /* cost of loading SSE registers
1204 in SImode, DImode and TImode */
1205 {4, 4, 5}, /* cost of storing SSE registers
1206 in SImode, DImode and TImode */
1207 3, /* MMX or SSE register to integer */
1208 /* On K8:
1209 MOVD reg64, xmmreg Double FSTORE 4
1210 MOVD reg32, xmmreg Double FSTORE 4
1211 On AMDFAM10:
1212 MOVD reg64, xmmreg Double FADD 3
1213 1/1 1/1
1214 MOVD reg32, xmmreg Double FADD 3
1215 1/1 1/1 */
1216 64, /* size of l1 cache. */
1217 512, /* size of l2 cache. */
1218 64, /* size of prefetch block */
 1219   /* New AMD processors never drop prefetches; if they cannot be performed
 1220      immediately, they are queued.  We set the number of simultaneous
 1221      prefetches to a large constant to reflect this (it is probably not a
 1222      good idea to leave the number of prefetches completely unlimited, as
 1223      their execution also takes some time). */
1224 100, /* number of parallel prefetches */
1225 2, /* Branch cost */
1226 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1227 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1228 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1229 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1230 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1231 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1232
 1233   /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but
 1234      for very small blocks it is better to use a loop.  For large blocks, a
 1235      libcall can do non-temporal accesses and beat inline code considerably. */
1236 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1237 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1238 {{libcall, {{8, loop}, {24, unrolled_loop},
1239 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1240 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1241 4, /* scalar_stmt_cost. */
1242 2, /* scalar load_cost. */
1243 2, /* scalar_store_cost. */
1244 6, /* vec_stmt_cost. */
1245 0, /* vec_to_scalar_cost. */
1246 2, /* scalar_to_vec_cost. */
1247 2, /* vec_align_load_cost. */
1248 2, /* vec_unalign_load_cost. */
1249 2, /* vec_store_cost. */
1250 2, /* cond_taken_branch_cost. */
1251 1, /* cond_not_taken_branch_cost. */
1252 };
1253
1254 struct processor_costs bdver1_cost = {
1255 COSTS_N_INSNS (1), /* cost of an add instruction */
1256 COSTS_N_INSNS (1), /* cost of a lea instruction */
1257 COSTS_N_INSNS (1), /* variable shift costs */
1258 COSTS_N_INSNS (1), /* constant shift costs */
1259 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1260 COSTS_N_INSNS (4), /* HI */
1261 COSTS_N_INSNS (4), /* SI */
1262 COSTS_N_INSNS (6), /* DI */
1263 COSTS_N_INSNS (6)}, /* other */
1264 0, /* cost of multiply per each bit set */
1265 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1266 COSTS_N_INSNS (35), /* HI */
1267 COSTS_N_INSNS (51), /* SI */
1268 COSTS_N_INSNS (83), /* DI */
1269 COSTS_N_INSNS (83)}, /* other */
1270 COSTS_N_INSNS (1), /* cost of movsx */
1271 COSTS_N_INSNS (1), /* cost of movzx */
1272 8, /* "large" insn */
1273 9, /* MOVE_RATIO */
1274 4, /* cost for loading QImode using movzbl */
1275 {5, 5, 4}, /* cost of loading integer registers
1276 in QImode, HImode and SImode.
1277 Relative to reg-reg move (2). */
1278 {4, 4, 4}, /* cost of storing integer registers */
1279 2, /* cost of reg,reg fld/fst */
1280 {5, 5, 12}, /* cost of loading fp registers
1281 in SFmode, DFmode and XFmode */
1282 {4, 4, 8}, /* cost of storing fp registers
1283 in SFmode, DFmode and XFmode */
1284 2, /* cost of moving MMX register */
1285 {4, 4}, /* cost of loading MMX registers
1286 in SImode and DImode */
1287 {4, 4}, /* cost of storing MMX registers
1288 in SImode and DImode */
1289 2, /* cost of moving SSE register */
1290 {4, 4, 4}, /* cost of loading SSE registers
1291 in SImode, DImode and TImode */
1292 {4, 4, 4}, /* cost of storing SSE registers
1293 in SImode, DImode and TImode */
1294 2, /* MMX or SSE register to integer */
1295 /* On K8:
1296 MOVD reg64, xmmreg Double FSTORE 4
1297 MOVD reg32, xmmreg Double FSTORE 4
1298 On AMDFAM10:
1299 MOVD reg64, xmmreg Double FADD 3
1300 1/1 1/1
1301 MOVD reg32, xmmreg Double FADD 3
1302 1/1 1/1 */
1303 16, /* size of l1 cache. */
1304 2048, /* size of l2 cache. */
1305 64, /* size of prefetch block */
 1306   /* New AMD processors never drop prefetches; if they cannot be performed
 1307      immediately, they are queued.  We set the number of simultaneous
 1308      prefetches to a large constant to reflect this (it is probably not a
 1309      good idea to leave the number of prefetches completely unlimited, as
 1310      their execution also takes some time). */
1311 100, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1319
 1320   /* BDVER1 has an optimized REP instruction for medium-sized blocks, but
 1321      for very small blocks it is better to use a loop.  For large blocks, a
 1322      libcall can do non-temporal accesses and beat inline code considerably. */
1323 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1324 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1325 {{libcall, {{8, loop}, {24, unrolled_loop},
1326 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1327 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1328 6, /* scalar_stmt_cost. */
1329 4, /* scalar load_cost. */
1330 4, /* scalar_store_cost. */
1331 6, /* vec_stmt_cost. */
1332 0, /* vec_to_scalar_cost. */
1333 2, /* scalar_to_vec_cost. */
1334 4, /* vec_align_load_cost. */
1335 4, /* vec_unalign_load_cost. */
1336 4, /* vec_store_cost. */
1337 2, /* cond_taken_branch_cost. */
1338 1, /* cond_not_taken_branch_cost. */
1339 };
1340
1341 struct processor_costs bdver2_cost = {
1342 COSTS_N_INSNS (1), /* cost of an add instruction */
1343 COSTS_N_INSNS (1), /* cost of a lea instruction */
1344 COSTS_N_INSNS (1), /* variable shift costs */
1345 COSTS_N_INSNS (1), /* constant shift costs */
1346 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1347 COSTS_N_INSNS (4), /* HI */
1348 COSTS_N_INSNS (4), /* SI */
1349 COSTS_N_INSNS (6), /* DI */
1350 COSTS_N_INSNS (6)}, /* other */
1351 0, /* cost of multiply per each bit set */
1352 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1353 COSTS_N_INSNS (35), /* HI */
1354 COSTS_N_INSNS (51), /* SI */
1355 COSTS_N_INSNS (83), /* DI */
1356 COSTS_N_INSNS (83)}, /* other */
1357 COSTS_N_INSNS (1), /* cost of movsx */
1358 COSTS_N_INSNS (1), /* cost of movzx */
1359 8, /* "large" insn */
1360 9, /* MOVE_RATIO */
1361 4, /* cost for loading QImode using movzbl */
1362 {5, 5, 4}, /* cost of loading integer registers
1363 in QImode, HImode and SImode.
1364 Relative to reg-reg move (2). */
1365 {4, 4, 4}, /* cost of storing integer registers */
1366 2, /* cost of reg,reg fld/fst */
1367 {5, 5, 12}, /* cost of loading fp registers
1368 in SFmode, DFmode and XFmode */
1369 {4, 4, 8}, /* cost of storing fp registers
1370 in SFmode, DFmode and XFmode */
1371 2, /* cost of moving MMX register */
1372 {4, 4}, /* cost of loading MMX registers
1373 in SImode and DImode */
1374 {4, 4}, /* cost of storing MMX registers
1375 in SImode and DImode */
1376 2, /* cost of moving SSE register */
1377 {4, 4, 4}, /* cost of loading SSE registers
1378 in SImode, DImode and TImode */
1379 {4, 4, 4}, /* cost of storing SSE registers
1380 in SImode, DImode and TImode */
1381 2, /* MMX or SSE register to integer */
1382 /* On K8:
1383 MOVD reg64, xmmreg Double FSTORE 4
1384 MOVD reg32, xmmreg Double FSTORE 4
1385 On AMDFAM10:
1386 MOVD reg64, xmmreg Double FADD 3
1387 1/1 1/1
1388 MOVD reg32, xmmreg Double FADD 3
1389 1/1 1/1 */
1390 16, /* size of l1 cache. */
1391 2048, /* size of l2 cache. */
1392 64, /* size of prefetch block */
 1393   /* New AMD processors never drop prefetches; if they cannot be performed
 1394      immediately, they are queued.  We set the number of simultaneous
 1395      prefetches to a large constant to reflect this (it is probably not a
 1396      good idea to leave the number of prefetches completely unlimited, as
 1397      their execution also takes some time). */
1398 100, /* number of parallel prefetches */
1399 2, /* Branch cost */
1400 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1401 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1402 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1403 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1404 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1405 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1406
 1407   /* BDVER2 has an optimized REP instruction for medium-sized blocks, but
 1408      for very small blocks it is better to use a loop.  For large blocks, a
 1409      libcall can do non-temporal accesses and beat inline code considerably. */
1410 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1411 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1412 {{libcall, {{8, loop}, {24, unrolled_loop},
1413 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1414 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1415 6, /* scalar_stmt_cost. */
1416 4, /* scalar load_cost. */
1417 4, /* scalar_store_cost. */
1418 6, /* vec_stmt_cost. */
1419 0, /* vec_to_scalar_cost. */
1420 2, /* scalar_to_vec_cost. */
1421 4, /* vec_align_load_cost. */
1422 4, /* vec_unalign_load_cost. */
1423 4, /* vec_store_cost. */
1424 2, /* cond_taken_branch_cost. */
1425 1, /* cond_not_taken_branch_cost. */
1426 };
1427
1428 struct processor_costs btver1_cost = {
1429 COSTS_N_INSNS (1), /* cost of an add instruction */
1430 COSTS_N_INSNS (2), /* cost of a lea instruction */
1431 COSTS_N_INSNS (1), /* variable shift costs */
1432 COSTS_N_INSNS (1), /* constant shift costs */
1433 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1434 COSTS_N_INSNS (4), /* HI */
1435 COSTS_N_INSNS (3), /* SI */
1436 COSTS_N_INSNS (4), /* DI */
1437 COSTS_N_INSNS (5)}, /* other */
1438 0, /* cost of multiply per each bit set */
1439 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1440 COSTS_N_INSNS (35), /* HI */
1441 COSTS_N_INSNS (51), /* SI */
1442 COSTS_N_INSNS (83), /* DI */
1443 COSTS_N_INSNS (83)}, /* other */
1444 COSTS_N_INSNS (1), /* cost of movsx */
1445 COSTS_N_INSNS (1), /* cost of movzx */
1446 8, /* "large" insn */
1447 9, /* MOVE_RATIO */
1448 4, /* cost for loading QImode using movzbl */
1449 {3, 4, 3}, /* cost of loading integer registers
1450 in QImode, HImode and SImode.
1451 Relative to reg-reg move (2). */
1452 {3, 4, 3}, /* cost of storing integer registers */
1453 4, /* cost of reg,reg fld/fst */
1454 {4, 4, 12}, /* cost of loading fp registers
1455 in SFmode, DFmode and XFmode */
1456 {6, 6, 8}, /* cost of storing fp registers
1457 in SFmode, DFmode and XFmode */
1458 2, /* cost of moving MMX register */
1459 {3, 3}, /* cost of loading MMX registers
1460 in SImode and DImode */
1461 {4, 4}, /* cost of storing MMX registers
1462 in SImode and DImode */
1463 2, /* cost of moving SSE register */
1464 {4, 4, 3}, /* cost of loading SSE registers
1465 in SImode, DImode and TImode */
1466 {4, 4, 5}, /* cost of storing SSE registers
1467 in SImode, DImode and TImode */
1468 3, /* MMX or SSE register to integer */
1469 /* On K8:
1470 MOVD reg64, xmmreg Double FSTORE 4
1471 MOVD reg32, xmmreg Double FSTORE 4
1472 On AMDFAM10:
1473 MOVD reg64, xmmreg Double FADD 3
1474 1/1 1/1
1475 MOVD reg32, xmmreg Double FADD 3
1476 1/1 1/1 */
1477 32, /* size of l1 cache. */
1478 512, /* size of l2 cache. */
1479 64, /* size of prefetch block */
1480 100, /* number of parallel prefetches */
1481 2, /* Branch cost */
1482 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1483 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1484 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1485 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1486 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1487 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1488
 1489   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but
 1490      for very small blocks it is better to use a loop.  For large blocks, a
 1491      libcall can do non-temporal accesses and beat inline code considerably. */
1492 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1493 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1494 {{libcall, {{8, loop}, {24, unrolled_loop},
1495 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1496 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1497 4, /* scalar_stmt_cost. */
1498 2, /* scalar load_cost. */
1499 2, /* scalar_store_cost. */
1500 6, /* vec_stmt_cost. */
1501 0, /* vec_to_scalar_cost. */
1502 2, /* scalar_to_vec_cost. */
1503 2, /* vec_align_load_cost. */
1504 2, /* vec_unalign_load_cost. */
1505 2, /* vec_store_cost. */
1506 2, /* cond_taken_branch_cost. */
1507 1, /* cond_not_taken_branch_cost. */
1508 };
1509
1510 static const
1511 struct processor_costs pentium4_cost = {
1512 COSTS_N_INSNS (1), /* cost of an add instruction */
1513 COSTS_N_INSNS (3), /* cost of a lea instruction */
1514 COSTS_N_INSNS (4), /* variable shift costs */
1515 COSTS_N_INSNS (4), /* constant shift costs */
1516 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1517 COSTS_N_INSNS (15), /* HI */
1518 COSTS_N_INSNS (15), /* SI */
1519 COSTS_N_INSNS (15), /* DI */
1520 COSTS_N_INSNS (15)}, /* other */
1521 0, /* cost of multiply per each bit set */
1522 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1523 COSTS_N_INSNS (56), /* HI */
1524 COSTS_N_INSNS (56), /* SI */
1525 COSTS_N_INSNS (56), /* DI */
1526 COSTS_N_INSNS (56)}, /* other */
1527 COSTS_N_INSNS (1), /* cost of movsx */
1528 COSTS_N_INSNS (1), /* cost of movzx */
1529 16, /* "large" insn */
1530 6, /* MOVE_RATIO */
1531 2, /* cost for loading QImode using movzbl */
1532 {4, 5, 4}, /* cost of loading integer registers
1533 in QImode, HImode and SImode.
1534 Relative to reg-reg move (2). */
1535 {2, 3, 2}, /* cost of storing integer registers */
1536 2, /* cost of reg,reg fld/fst */
1537 {2, 2, 6}, /* cost of loading fp registers
1538 in SFmode, DFmode and XFmode */
1539 {4, 4, 6}, /* cost of storing fp registers
1540 in SFmode, DFmode and XFmode */
1541 2, /* cost of moving MMX register */
1542 {2, 2}, /* cost of loading MMX registers
1543 in SImode and DImode */
1544 {2, 2}, /* cost of storing MMX registers
1545 in SImode and DImode */
1546 12, /* cost of moving SSE register */
1547 {12, 12, 12}, /* cost of loading SSE registers
1548 in SImode, DImode and TImode */
1549 {2, 2, 8}, /* cost of storing SSE registers
1550 in SImode, DImode and TImode */
1551 10, /* MMX or SSE register to integer */
1552 8, /* size of l1 cache. */
1553 256, /* size of l2 cache. */
1554 64, /* size of prefetch block */
1555 6, /* number of parallel prefetches */
1556 2, /* Branch cost */
1557 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1558 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1559 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1560 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1561 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1562 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1563 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1564 DUMMY_STRINGOP_ALGS},
1565 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1566 {-1, libcall}}},
1567 DUMMY_STRINGOP_ALGS},
1568 1, /* scalar_stmt_cost. */
1569 1, /* scalar load_cost. */
1570 1, /* scalar_store_cost. */
1571 1, /* vec_stmt_cost. */
1572 1, /* vec_to_scalar_cost. */
1573 1, /* scalar_to_vec_cost. */
1574 1, /* vec_align_load_cost. */
1575 2, /* vec_unalign_load_cost. */
1576 1, /* vec_store_cost. */
1577 3, /* cond_taken_branch_cost. */
1578 1, /* cond_not_taken_branch_cost. */
1579 };
1580
1581 static const
1582 struct processor_costs nocona_cost = {
1583 COSTS_N_INSNS (1), /* cost of an add instruction */
1584 COSTS_N_INSNS (1), /* cost of a lea instruction */
1585 COSTS_N_INSNS (1), /* variable shift costs */
1586 COSTS_N_INSNS (1), /* constant shift costs */
1587 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1588 COSTS_N_INSNS (10), /* HI */
1589 COSTS_N_INSNS (10), /* SI */
1590 COSTS_N_INSNS (10), /* DI */
1591 COSTS_N_INSNS (10)}, /* other */
1592 0, /* cost of multiply per each bit set */
1593 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1594 COSTS_N_INSNS (66), /* HI */
1595 COSTS_N_INSNS (66), /* SI */
1596 COSTS_N_INSNS (66), /* DI */
1597 COSTS_N_INSNS (66)}, /* other */
1598 COSTS_N_INSNS (1), /* cost of movsx */
1599 COSTS_N_INSNS (1), /* cost of movzx */
1600 16, /* "large" insn */
1601 17, /* MOVE_RATIO */
1602 4, /* cost for loading QImode using movzbl */
1603 {4, 4, 4}, /* cost of loading integer registers
1604 in QImode, HImode and SImode.
1605 Relative to reg-reg move (2). */
1606 {4, 4, 4}, /* cost of storing integer registers */
1607 3, /* cost of reg,reg fld/fst */
1608 {12, 12, 12}, /* cost of loading fp registers
1609 in SFmode, DFmode and XFmode */
1610 {4, 4, 4}, /* cost of storing fp registers
1611 in SFmode, DFmode and XFmode */
1612 6, /* cost of moving MMX register */
1613 {12, 12}, /* cost of loading MMX registers
1614 in SImode and DImode */
1615 {12, 12}, /* cost of storing MMX registers
1616 in SImode and DImode */
1617 6, /* cost of moving SSE register */
1618 {12, 12, 12}, /* cost of loading SSE registers
1619 in SImode, DImode and TImode */
1620 {12, 12, 12}, /* cost of storing SSE registers
1621 in SImode, DImode and TImode */
1622 8, /* MMX or SSE register to integer */
1623 8, /* size of l1 cache. */
1624 1024, /* size of l2 cache. */
1625 128, /* size of prefetch block */
1626 8, /* number of parallel prefetches */
1627 1, /* Branch cost */
1628 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1629 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1630 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1631 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1632 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1633 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1634 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1635 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1636 {100000, unrolled_loop}, {-1, libcall}}}},
1637 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1638 {-1, libcall}}},
1639 {libcall, {{24, loop}, {64, unrolled_loop},
1640 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1641 1, /* scalar_stmt_cost. */
1642 1, /* scalar load_cost. */
1643 1, /* scalar_store_cost. */
1644 1, /* vec_stmt_cost. */
1645 1, /* vec_to_scalar_cost. */
1646 1, /* scalar_to_vec_cost. */
1647 1, /* vec_align_load_cost. */
1648 2, /* vec_unalign_load_cost. */
1649 1, /* vec_store_cost. */
1650 3, /* cond_taken_branch_cost. */
1651 1, /* cond_not_taken_branch_cost. */
1652 };
1653
1654 static const
1655 struct processor_costs atom_cost = {
1656 COSTS_N_INSNS (1), /* cost of an add instruction */
1657 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1658 COSTS_N_INSNS (1), /* variable shift costs */
1659 COSTS_N_INSNS (1), /* constant shift costs */
1660 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1661 COSTS_N_INSNS (4), /* HI */
1662 COSTS_N_INSNS (3), /* SI */
1663 COSTS_N_INSNS (4), /* DI */
1664 COSTS_N_INSNS (2)}, /* other */
1665 0, /* cost of multiply per each bit set */
1666 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1667 COSTS_N_INSNS (26), /* HI */
1668 COSTS_N_INSNS (42), /* SI */
1669 COSTS_N_INSNS (74), /* DI */
1670 COSTS_N_INSNS (74)}, /* other */
1671 COSTS_N_INSNS (1), /* cost of movsx */
1672 COSTS_N_INSNS (1), /* cost of movzx */
1673 8, /* "large" insn */
1674 17, /* MOVE_RATIO */
1675 4, /* cost for loading QImode using movzbl */
1676 {4, 4, 4}, /* cost of loading integer registers
1677 in QImode, HImode and SImode.
1678 Relative to reg-reg move (2). */
1679 {4, 4, 4}, /* cost of storing integer registers */
1680 4, /* cost of reg,reg fld/fst */
1681 {12, 12, 12}, /* cost of loading fp registers
1682 in SFmode, DFmode and XFmode */
1683 {6, 6, 8}, /* cost of storing fp registers
1684 in SFmode, DFmode and XFmode */
1685 2, /* cost of moving MMX register */
1686 {8, 8}, /* cost of loading MMX registers
1687 in SImode and DImode */
1688 {8, 8}, /* cost of storing MMX registers
1689 in SImode and DImode */
1690 2, /* cost of moving SSE register */
1691 {8, 8, 8}, /* cost of loading SSE registers
1692 in SImode, DImode and TImode */
1693 {8, 8, 8}, /* cost of storing SSE registers
1694 in SImode, DImode and TImode */
1695 5, /* MMX or SSE register to integer */
1696 32, /* size of l1 cache. */
1697 256, /* size of l2 cache. */
1698 64, /* size of prefetch block */
1699 6, /* number of parallel prefetches */
1700 3, /* Branch cost */
1701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1702 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1703 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1704 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1705 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1706 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1707 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1708 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1709 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1710 {{libcall, {{8, loop}, {15, unrolled_loop},
1711 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1712 {libcall, {{24, loop}, {32, unrolled_loop},
1713 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1714 1, /* scalar_stmt_cost. */
1715 1, /* scalar load_cost. */
1716 1, /* scalar_store_cost. */
1717 1, /* vec_stmt_cost. */
1718 1, /* vec_to_scalar_cost. */
1719 1, /* scalar_to_vec_cost. */
1720 1, /* vec_align_load_cost. */
1721 2, /* vec_unalign_load_cost. */
1722 1, /* vec_store_cost. */
1723 3, /* cond_taken_branch_cost. */
1724 1, /* cond_not_taken_branch_cost. */
1725 };
1726
1727 /* Generic64 should produce code tuned for Nocona and K8. */
1728 static const
1729 struct processor_costs generic64_cost = {
1730 COSTS_N_INSNS (1), /* cost of an add instruction */
1731 /* On all chips taken into consideration, lea is 2 cycles or more. With
1732 this cost, however, our current implementation of synth_mult results in
1733 the use of unnecessary temporary registers, causing a regression on
1734 several SPECfp benchmarks. */
1735 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1736 COSTS_N_INSNS (1), /* variable shift costs */
1737 COSTS_N_INSNS (1), /* constant shift costs */
1738 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1739 COSTS_N_INSNS (4), /* HI */
1740 COSTS_N_INSNS (3), /* SI */
1741 COSTS_N_INSNS (4), /* DI */
1742 COSTS_N_INSNS (2)}, /* other */
1743 0, /* cost of multiply per each bit set */
1744 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1745 COSTS_N_INSNS (26), /* HI */
1746 COSTS_N_INSNS (42), /* SI */
1747 COSTS_N_INSNS (74), /* DI */
1748 COSTS_N_INSNS (74)}, /* other */
1749 COSTS_N_INSNS (1), /* cost of movsx */
1750 COSTS_N_INSNS (1), /* cost of movzx */
1751 8, /* "large" insn */
1752 17, /* MOVE_RATIO */
1753 4, /* cost for loading QImode using movzbl */
1754 {4, 4, 4}, /* cost of loading integer registers
1755 in QImode, HImode and SImode.
1756 Relative to reg-reg move (2). */
1757 {4, 4, 4}, /* cost of storing integer registers */
1758 4, /* cost of reg,reg fld/fst */
1759 {12, 12, 12}, /* cost of loading fp registers
1760 in SFmode, DFmode and XFmode */
1761 {6, 6, 8}, /* cost of storing fp registers
1762 in SFmode, DFmode and XFmode */
1763 2, /* cost of moving MMX register */
1764 {8, 8}, /* cost of loading MMX registers
1765 in SImode and DImode */
1766 {8, 8}, /* cost of storing MMX registers
1767 in SImode and DImode */
1768 2, /* cost of moving SSE register */
1769 {8, 8, 8}, /* cost of loading SSE registers
1770 in SImode, DImode and TImode */
1771 {8, 8, 8}, /* cost of storing SSE registers
1772 in SImode, DImode and TImode */
1773 5, /* MMX or SSE register to integer */
1774 32, /* size of l1 cache. */
1775 512, /* size of l2 cache. */
1776 64, /* size of prefetch block */
1777 6, /* number of parallel prefetches */
1778 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1779 value is increased to the perhaps more appropriate value of 5. */
1780 3, /* Branch cost */
1781 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1782 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1783 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1784 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1785 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1786 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1787 {DUMMY_STRINGOP_ALGS,
1788 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1789 {DUMMY_STRINGOP_ALGS,
1790 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 1, /* scalar_stmt_cost. */
1792 1, /* scalar load_cost. */
1793 1, /* scalar_store_cost. */
1794 1, /* vec_stmt_cost. */
1795 1, /* vec_to_scalar_cost. */
1796 1, /* scalar_to_vec_cost. */
1797 1, /* vec_align_load_cost. */
1798 2, /* vec_unalign_load_cost. */
1799 1, /* vec_store_cost. */
1800 3, /* cond_taken_branch_cost. */
1801 1, /* cond_not_taken_branch_cost. */
1802 };
1803
1804 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1805 Athlon and K8. */
1806 static const
1807 struct processor_costs generic32_cost = {
1808 COSTS_N_INSNS (1), /* cost of an add instruction */
1809 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1810 COSTS_N_INSNS (1), /* variable shift costs */
1811 COSTS_N_INSNS (1), /* constant shift costs */
1812 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1813 COSTS_N_INSNS (4), /* HI */
1814 COSTS_N_INSNS (3), /* SI */
1815 COSTS_N_INSNS (4), /* DI */
1816 COSTS_N_INSNS (2)}, /* other */
1817 0, /* cost of multiply per each bit set */
1818 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1819 COSTS_N_INSNS (26), /* HI */
1820 COSTS_N_INSNS (42), /* SI */
1821 COSTS_N_INSNS (74), /* DI */
1822 COSTS_N_INSNS (74)}, /* other */
1823 COSTS_N_INSNS (1), /* cost of movsx */
1824 COSTS_N_INSNS (1), /* cost of movzx */
1825 8, /* "large" insn */
1826 17, /* MOVE_RATIO */
1827 4, /* cost for loading QImode using movzbl */
1828 {4, 4, 4}, /* cost of loading integer registers
1829 in QImode, HImode and SImode.
1830 Relative to reg-reg move (2). */
1831 {4, 4, 4}, /* cost of storing integer registers */
1832 4, /* cost of reg,reg fld/fst */
1833 {12, 12, 12}, /* cost of loading fp registers
1834 in SFmode, DFmode and XFmode */
1835 {6, 6, 8}, /* cost of storing fp registers
1836 in SFmode, DFmode and XFmode */
1837 2, /* cost of moving MMX register */
1838 {8, 8}, /* cost of loading MMX registers
1839 in SImode and DImode */
1840 {8, 8}, /* cost of storing MMX registers
1841 in SImode and DImode */
1842 2, /* cost of moving SSE register */
1843 {8, 8, 8}, /* cost of loading SSE registers
1844 in SImode, DImode and TImode */
1845 {8, 8, 8}, /* cost of storing SSE registers
1846 in SImode, DImode and TImode */
1847 5, /* MMX or SSE register to integer */
1848 32, /* size of l1 cache. */
1849 256, /* size of l2 cache. */
1850 64, /* size of prefetch block */
1851 6, /* number of parallel prefetches */
1852 3, /* Branch cost */
1853 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1854 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1855 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1856 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1857 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1858 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1859 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1860 DUMMY_STRINGOP_ALGS},
1861 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1862 DUMMY_STRINGOP_ALGS},
1863 1, /* scalar_stmt_cost. */
1864 1, /* scalar load_cost. */
1865 1, /* scalar_store_cost. */
1866 1, /* vec_stmt_cost. */
1867 1, /* vec_to_scalar_cost. */
1868 1, /* scalar_to_vec_cost. */
1869 1, /* vec_align_load_cost. */
1870 2, /* vec_unalign_load_cost. */
1871 1, /* vec_store_cost. */
1872 3, /* cond_taken_branch_cost. */
1873 1, /* cond_not_taken_branch_cost. */
1874 };
1875
1876 const struct processor_costs *ix86_cost = &pentium_cost;
1877
1878 /* Processor feature/optimization bitmasks. */
1879 #define m_386 (1<<PROCESSOR_I386)
1880 #define m_486 (1<<PROCESSOR_I486)
1881 #define m_PENT (1<<PROCESSOR_PENTIUM)
1882 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1883 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1884 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1885 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1886 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1887 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1888 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1889 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1890 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1891 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1892 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1893 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1894 #define m_ATOM (1<<PROCESSOR_ATOM)
1895
1896 #define m_GEODE (1<<PROCESSOR_GEODE)
1897 #define m_K6 (1<<PROCESSOR_K6)
1898 #define m_K6_GEODE (m_K6 | m_GEODE)
1899 #define m_K8 (1<<PROCESSOR_K8)
1900 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1901 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1902 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1903 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1904 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1905 #define m_BDVER (m_BDVER1 | m_BDVER2)
1906 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1907 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1908
1909 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1910 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1911
1912 /* Generic instruction choice should be a common subset of the supported
1913 CPUs (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1914 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1915
1916 /* Feature tests against the various tunings. */
1917 unsigned char ix86_tune_features[X86_TUNE_LAST];
1918
1919 /* Feature tests against the various tunings used to create ix86_tune_features
1920 based on the processor mask. */
1921 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1922 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1923 negatively, so enabling it for Generic64 seems like a good code size
1924 tradeoff. We can't enable it for 32bit generic because it does not
1925 work well with PPro based chips. */
1926 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1927
1928 /* X86_TUNE_PUSH_MEMORY */
1929 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1930
1931 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1932 m_486 | m_PENT,
1933
1934 /* X86_TUNE_UNROLL_STRLEN */
1935 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1936
1937 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put into P4 based
1938 on simulation results. But after P4 was made, no performance benefit
1939 was observed with branch hints; they also increase the code size.
1940 As a result, icc never generates branch hints. */
1941 0,
1942
1943 /* X86_TUNE_DOUBLE_WITH_ADD */
1944 ~m_386,
1945
1946 /* X86_TUNE_USE_SAHF */
1947 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1948
1949 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1950 partial dependencies. */
1951 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1952
1953 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1954 register stalls on the Generic32 compilation setting as well. However,
1955 in the current implementation the partial register stalls are not
1956 eliminated very well - they can be introduced via subregs synthesized by
1957 combine and can happen in caller/callee saving sequences. Because this
1958 option pays back little on PPro based chips and conflicts with the partial
1959 reg dependencies used by Athlon/P4 based chips, it is better to leave it
1960 off for generic32 for now. */
1961 m_PPRO,
1962
1963 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1964 m_CORE2I7 | m_GENERIC,
1965
1966 /* X86_TUNE_USE_HIMODE_FIOP */
1967 m_386 | m_486 | m_K6_GEODE,
1968
1969 /* X86_TUNE_USE_SIMODE_FIOP */
1970 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1971
1972 /* X86_TUNE_USE_MOV0 */
1973 m_K6,
1974
1975 /* X86_TUNE_USE_CLTD */
1976 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1977
1978 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1979 m_PENT4,
1980
1981 /* X86_TUNE_SPLIT_LONG_MOVES */
1982 m_PPRO,
1983
1984 /* X86_TUNE_READ_MODIFY_WRITE */
1985 ~m_PENT,
1986
1987 /* X86_TUNE_READ_MODIFY */
1988 ~(m_PENT | m_PPRO),
1989
1990 /* X86_TUNE_PROMOTE_QIMODE */
1991 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1992
1993 /* X86_TUNE_FAST_PREFIX */
1994 ~(m_386 | m_486 | m_PENT),
1995
1996 /* X86_TUNE_SINGLE_STRINGOP */
1997 m_386 | m_P4_NOCONA,
1998
1999 /* X86_TUNE_QIMODE_MATH */
2000 ~0,
2001
2002 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2003 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL, this option
2004 might be considered for Generic32 if our scheme for avoiding partial
2005 stalls were more effective. */
2006 ~m_PPRO,
2007
2008 /* X86_TUNE_PROMOTE_QI_REGS */
2009 0,
2010
2011 /* X86_TUNE_PROMOTE_HI_REGS */
2012 m_PPRO,
2013
2014 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2015 over esp addition. */
2016 m_386 | m_486 | m_PENT | m_PPRO,
2017
2018 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2019 over esp addition. */
2020 m_PENT,
2021
2022 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2023 over esp subtraction. */
2024 m_386 | m_486 | m_PENT | m_K6_GEODE,
2025
2026 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
2027 over esp subtraction. */
2028 m_PENT | m_K6_GEODE,
2029
2030 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2031 for DFmode copies. */
2032 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2033
2034 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2035 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2036
2037 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2038 conflict here between PPro/Pentium4 based chips that treat 128bit
2039 SSE registers as single units and K8 based chips that divide SSE
2040 registers into two 64bit halves. This knob promotes all store destinations
2041 to 128bit to allow register renaming on 128bit SSE units, but usually
2042 results in one extra micro-op on 64bit SSE units. Experimental results
2043 show that disabling this option on P4 brings a SPECfp regression of over
2044 20%, while enabling it on K8 brings a regression of roughly 2.4% that can
2045 be partly masked by careful scheduling of moves. */
2046 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2047
2048 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2049 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2050
2051 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2052 m_COREI7 | m_BDVER,
2053
2054 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2055 m_BDVER,
2056
2057 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2058 are resolved on SSE register parts instead of whole registers, so we may
2059 maintain just the lower part of scalar values in the proper format, leaving
2060 the upper part undefined. */
2061 m_ATHLON_K8,
2062
2063 /* X86_TUNE_SSE_TYPELESS_STORES */
2064 m_AMD_MULTIPLE,
2065
2066 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2067 m_PPRO | m_P4_NOCONA,
2068
2069 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2070 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2071
2072 /* X86_TUNE_PROLOGUE_USING_MOVE */
2073 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2074
2075 /* X86_TUNE_EPILOGUE_USING_MOVE */
2076 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2077
2078 /* X86_TUNE_SHIFT1 */
2079 ~m_486,
2080
2081 /* X86_TUNE_USE_FFREEP */
2082 m_AMD_MULTIPLE,
2083
2084 /* X86_TUNE_INTER_UNIT_MOVES */
2085 ~(m_AMD_MULTIPLE | m_GENERIC),
2086
2087 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2088 ~(m_AMDFAM10 | m_BDVER),
2089
2090 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2091 than 4 branch instructions in the 16 byte window. */
2092 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2093
2094 /* X86_TUNE_SCHEDULE */
2095 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2096
2097 /* X86_TUNE_USE_BT */
2098 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2099
2100 /* X86_TUNE_USE_INCDEC */
2101 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2102
2103 /* X86_TUNE_PAD_RETURNS */
2104 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2105
2106 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
2107 m_ATOM,
2108
2109 /* X86_TUNE_EXT_80387_CONSTANTS */
2110 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2111
2112 /* X86_TUNE_SHORTEN_X87_SSE */
2113 ~m_K8,
2114
2115 /* X86_TUNE_AVOID_VECTOR_DECODE */
2116 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2117
2118 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
2119 and SImode multiply, but 386 and 486 do HImode multiply faster. */
2120 ~(m_386 | m_486),
2121
2122 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2123 vector path on AMD machines. */
2124 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2125
2126 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2127 machines. */
2128 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2129
2130 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2131 than a MOV. */
2132 m_PENT,
2133
2134 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2135 but one byte longer. */
2136 m_PENT,
2137
2138 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2139 operand that cannot be represented using a modRM byte. The XOR
2140 replacement is long decoded, so this split helps here as well. */
2141 m_K6,
2142
2143 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2144 from FP to FP. */
2145 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2146
2147 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2148 from integer to FP. */
2149 m_AMDFAM10,
2150
2151 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2152 with a subsequent conditional jump instruction into a single
2153 compare-and-branch uop. */
2154 m_BDVER,
2155
2156 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2157 will impact LEA instruction selection. */
2158 m_ATOM,
2159
2160 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2161 instructions. */
2162 ~m_ATOM,
2163
2164 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2165 at -O3. For the moment, the prefetching seems badly tuned for Intel
2166 chips. */
2167 m_K6_GEODE | m_AMD_MULTIPLE,
2168
2169 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2170 the auto-vectorizer. */
2171 m_BDVER,
2172
2173 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2174 during reassociation of integer computation. */
2175 m_ATOM,
2176
2177 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2178 during reassociation of fp computation. */
2179 m_ATOM
2180 };
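
/* A sketch (assuming the logic in ix86_option_override_internal later in
   this file is unchanged) of how the table above is consumed: the active
   tuning is turned into a single-bit mask and each entry is reduced to a
   boolean, roughly

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; i++)
       ix86_tune_features[i]
	 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   The TARGET_* tuning macros in i386.h then index ix86_tune_features.  */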
2181
2182 /* Feature tests against the various architecture variations. */
2183 unsigned char ix86_arch_features[X86_ARCH_LAST];
2184
2185 /* Feature tests against the various architecture variations, used to create
2186 ix86_arch_features based on the processor mask. */
2187 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2188 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
2189 ~(m_386 | m_486 | m_PENT | m_K6),
2190
2191 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2192 ~m_386,
2193
2194 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2195 ~(m_386 | m_486),
2196
2197 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2198 ~m_386,
2199
2200 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2201 ~m_386,
2202 };
2203
2204 static const unsigned int x86_accumulate_outgoing_args
2205 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2206
2207 static const unsigned int x86_arch_always_fancy_math_387
2208 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2209
2210 static const unsigned int x86_avx256_split_unaligned_load
2211 = m_COREI7 | m_GENERIC;
2212
2213 static const unsigned int x86_avx256_split_unaligned_store
2214 = m_COREI7 | m_BDVER | m_GENERIC;
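
/* These per-feature processor masks are tested against the tuning mask in
   ix86_option_override_internal; a sketch, assuming that code is unchanged:

     if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
	 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
       target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;

   i.e. the split is enabled by default for the listed CPUs unless the user
   set the corresponding -m option explicitly.  */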
2215
2216 /* In case the average insn count for a single function invocation is
2217 lower than this constant, emit fast (but longer) prologue and
2218 epilogue code. */
2219 #define FAST_PROLOGUE_INSN_COUNT 20
2220
2221 /* Names for 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
2222 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2223 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2224 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2225
2226 /* Array of the smallest class containing reg number REGNO, indexed by
2227 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2228
2229 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2230 {
2231 /* ax, dx, cx, bx */
2232 AREG, DREG, CREG, BREG,
2233 /* si, di, bp, sp */
2234 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2235 /* FP registers */
2236 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2237 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2238 /* arg pointer */
2239 NON_Q_REGS,
2240 /* flags, fpsr, fpcr, frame */
2241 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2242 /* SSE registers */
2243 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2244 SSE_REGS, SSE_REGS,
2245 /* MMX registers */
2246 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2247 MMX_REGS, MMX_REGS,
2248 /* REX registers */
2249 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2250 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251 /* SSE REX registers */
2252 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2253 SSE_REGS, SSE_REGS,
2254 };
2255
2256 /* The "default" register map used in 32bit mode. */
2257
2258 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2259 {
2260 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2261 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2262 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2263 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2264 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2265 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2266 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2267 };
2268
2269 /* The "default" register map used in 64bit mode. */
2270
2271 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2272 {
2273 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2274 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2275 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2276 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2277 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2278 8,9,10,11,12,13,14,15, /* extended integer registers */
2279 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2280 };
2281
2282 /* Define the register numbers to be used in Dwarf debugging information.
2283 The SVR4 reference port C compiler uses the following register numbers
2284 in its Dwarf output code:
2285 0 for %eax (gcc regno = 0)
2286 1 for %ecx (gcc regno = 2)
2287 2 for %edx (gcc regno = 1)
2288 3 for %ebx (gcc regno = 3)
2289 4 for %esp (gcc regno = 7)
2290 5 for %ebp (gcc regno = 6)
2291 6 for %esi (gcc regno = 4)
2292 7 for %edi (gcc regno = 5)
2293 The following three DWARF register numbers are never generated by
2294 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2295 believes these numbers have these meanings.
2296 8 for %eip (no gcc equivalent)
2297 9 for %eflags (gcc regno = 17)
2298 10 for %trapno (no gcc equivalent)
2299 It is not at all clear how we should number the FP stack registers
2300 for the x86 architecture. If the version of SDB on x86/svr4 were
2301 a bit less brain dead with respect to floating-point then we would
2302 have a precedent to follow with respect to DWARF register numbers
2303 for x86 FP registers, but the SDB on x86/svr4 is so completely
2304 broken with respect to FP registers that it is hardly worth thinking
2305 of it as something to strive for compatibility with.
2306 The version of x86/svr4 SDB I have at the moment does (partially)
2307 seem to believe that DWARF register number 11 is associated with
2308 the x86 register %st(0), but that's about all. Higher DWARF
2309 register numbers don't seem to be associated with anything in
2310 particular, and even for DWARF regno 11, SDB only seems to under-
2311 stand that it should say that a variable lives in %st(0) (when
2312 asked via an `=' command) if we said it was in DWARF regno 11,
2313 but SDB still prints garbage when asked for the value of the
2314 variable in question (via a `/' command).
2315 (Also note that the labels SDB prints for various FP stack regs
2316 when doing an `x' command are all wrong.)
2317 Note that these problems generally don't affect the native SVR4
2318 C compiler because it doesn't allow the use of -O with -g and
2319 because when it is *not* optimizing, it allocates a memory
2320 location for each floating-point variable, and the memory
2321 location is what gets described in the DWARF AT_location
2322 attribute for the variable in question.
2323 Regardless of the severe mental illness of the x86/svr4 SDB, we
2324 do something sensible here and we use the following DWARF
2325 register numbers. Note that these are all stack-top-relative
2326 numbers.
2327 11 for %st(0) (gcc regno = 8)
2328 12 for %st(1) (gcc regno = 9)
2329 13 for %st(2) (gcc regno = 10)
2330 14 for %st(3) (gcc regno = 11)
2331 15 for %st(4) (gcc regno = 12)
2332 16 for %st(5) (gcc regno = 13)
2333 17 for %st(6) (gcc regno = 14)
2334 18 for %st(7) (gcc regno = 15)
2335 */
2336 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2337 {
2338 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2339 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2340 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2341 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2342 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2343 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2344 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2345 };
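
/* For example, %ebp is gcc regno 6 (see the general register order above),
   and svr4_dbx_register_map[6] is 5, matching the "5 for %ebp" line in the
   numbering comment; %esp (gcc regno 7) likewise maps to DWARF register 4.  */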
2346
2347 /* Define parameter passing and return registers. */
2348
2349 static int const x86_64_int_parameter_registers[6] =
2350 {
2351 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2352 };
2353
2354 static int const x86_64_ms_abi_int_parameter_registers[4] =
2355 {
2356 CX_REG, DX_REG, R8_REG, R9_REG
2357 };
2358
2359 static int const x86_64_int_return_registers[4] =
2360 {
2361 AX_REG, DX_REG, DI_REG, SI_REG
2362 };
2363
2364 /* Define the structure for the machine field in struct function. */
2365
2366 struct GTY(()) stack_local_entry {
2367 unsigned short mode;
2368 unsigned short n;
2369 rtx rtl;
2370 struct stack_local_entry *next;
2371 };
2372
2373 /* Structure describing stack frame layout.
2374 Stack grows downward:
2375
2376 [arguments]
2377 <- ARG_POINTER
2378 saved pc
2379
2380 saved static chain if ix86_static_chain_on_stack
2381
2382 saved frame pointer if frame_pointer_needed
2383 <- HARD_FRAME_POINTER
2384 [saved regs]
2385 <- regs_save_offset
2386 [padding0]
2387
2388 [saved SSE regs]
2389 <- sse_regs_save_offset
2390 [padding1] |
2391 | <- FRAME_POINTER
2392 [va_arg registers] |
2393 |
2394 [frame] |
2395 |
2396 [padding2] | = to_allocate
2397 <- STACK_POINTER
2398 */
2399 struct ix86_frame
2400 {
2401 int nsseregs;
2402 int nregs;
2403 int va_arg_size;
2404 int red_zone_size;
2405 int outgoing_arguments_size;
2406 HOST_WIDE_INT frame;
2407
2408 /* The offsets relative to ARG_POINTER. */
2409 HOST_WIDE_INT frame_pointer_offset;
2410 HOST_WIDE_INT hard_frame_pointer_offset;
2411 HOST_WIDE_INT stack_pointer_offset;
2412 HOST_WIDE_INT hfp_save_offset;
2413 HOST_WIDE_INT reg_save_offset;
2414 HOST_WIDE_INT sse_reg_save_offset;
2415
2416 /* When save_regs_using_mov is set, emit prologue using
2417 move instead of push instructions. */
2418 bool save_regs_using_mov;
2419 };
2420
2421 /* Which cpu are we scheduling for. */
2422 enum attr_cpu ix86_schedule;
2423
2424 /* Which cpu are we optimizing for. */
2425 enum processor_type ix86_tune;
2426
2427 /* Which instruction set architecture to use. */
2428 enum processor_type ix86_arch;
2429
2430 /* True if the SSE prefetch instruction is not a NOP. */
2431 int x86_prefetch_sse;
2432
2433 /* -mstackrealign option */
2434 static const char ix86_force_align_arg_pointer_string[]
2435 = "force_align_arg_pointer";
2436
2437 static rtx (*ix86_gen_leave) (void);
2438 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2439 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2440 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2441 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2442 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2443 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2445 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2446 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2447
2448 /* Preferred alignment for stack boundary in bits. */
2449 unsigned int ix86_preferred_stack_boundary;
2450
2451 /* Alignment for incoming stack boundary in bits specified at
2452 command line. */
2453 static unsigned int ix86_user_incoming_stack_boundary;
2454
2455 /* Default alignment for incoming stack boundary in bits. */
2456 static unsigned int ix86_default_incoming_stack_boundary;
2457
2458 /* Alignment for incoming stack boundary in bits. */
2459 unsigned int ix86_incoming_stack_boundary;
2460
2461 /* Calling abi specific va_list type nodes. */
2462 static GTY(()) tree sysv_va_list_type_node;
2463 static GTY(()) tree ms_va_list_type_node;
2464
2465 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2466 char internal_label_prefix[16];
2467 int internal_label_prefix_len;
2468
2469 /* Fence to use after loop using movnt. */
2470 tree x86_mfence;
2471
2472 /* Register class used for passing a given 64bit part of the argument.
2473 These represent classes as documented by the psABI, with the exception
2474 of the SSESF and SSEDF classes, which are basically the SSE class; gcc just
2475 uses SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
2476 
2477 Similarly, we play games with INTEGERSI_CLASS to use cheaper SImode moves
2478 whenever possible (the upper half does contain padding). */
2479 enum x86_64_reg_class
2480 {
2481 X86_64_NO_CLASS,
2482 X86_64_INTEGER_CLASS,
2483 X86_64_INTEGERSI_CLASS,
2484 X86_64_SSE_CLASS,
2485 X86_64_SSESF_CLASS,
2486 X86_64_SSEDF_CLASS,
2487 X86_64_SSEUP_CLASS,
2488 X86_64_X87_CLASS,
2489 X86_64_X87UP_CLASS,
2490 X86_64_COMPLEX_X87_CLASS,
2491 X86_64_MEMORY_CLASS
2492 };
2493
2494 #define MAX_CLASSES 4
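
/* As an illustrative (hypothetical) example of the classification above: a
   value of type struct { double d; int i; } spans two 64bit parts; under the
   psABI rules the first would typically classify as X86_64_SSEDF_CLASS and
   the second as X86_64_INTEGERSI_CLASS, so it is passed in one SSE and one
   integer register.  MAX_CLASSES bounds the number of such 64bit parts a
   single argument may have.  */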
2495
2496 /* Table of constants used by fldpi, fldln2, etc.... */
2497 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2498 static bool ext_80387_constants_init = 0;
2499
2500 \f
2501 static struct machine_function * ix86_init_machine_status (void);
2502 static rtx ix86_function_value (const_tree, const_tree, bool);
2503 static bool ix86_function_value_regno_p (const unsigned int);
2504 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2505 const_tree);
2506 static rtx ix86_static_chain (const_tree, bool);
2507 static int ix86_function_regparm (const_tree, const_tree);
2508 static void ix86_compute_frame_layout (struct ix86_frame *);
2509 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2510 rtx, rtx, int);
2511 static void ix86_add_new_builtins (HOST_WIDE_INT);
2512 static tree ix86_canonical_va_list_type (tree);
2513 static void predict_jump (int);
2514 static unsigned int split_stack_prologue_scratch_regno (void);
2515 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2516
2517 enum ix86_function_specific_strings
2518 {
2519 IX86_FUNCTION_SPECIFIC_ARCH,
2520 IX86_FUNCTION_SPECIFIC_TUNE,
2521 IX86_FUNCTION_SPECIFIC_MAX
2522 };
2523
2524 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2525 const char *, enum fpmath_unit, bool);
2526 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2527 static void ix86_function_specific_save (struct cl_target_option *);
2528 static void ix86_function_specific_restore (struct cl_target_option *);
2529 static void ix86_function_specific_print (FILE *, int,
2530 struct cl_target_option *);
2531 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2532 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2533 struct gcc_options *);
2534 static bool ix86_can_inline_p (tree, tree);
2535 static void ix86_set_current_function (tree);
2536 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2537
2538 static enum calling_abi ix86_function_abi (const_tree);
2539
2540 \f
2541 #ifndef SUBTARGET32_DEFAULT_CPU
2542 #define SUBTARGET32_DEFAULT_CPU "i386"
2543 #endif
2544
2545 /* The svr4 ABI for the i386 says that records and unions are returned
2546 in memory. */
2547 #ifndef DEFAULT_PCC_STRUCT_RETURN
2548 #define DEFAULT_PCC_STRUCT_RETURN 1
2549 #endif
2550
2551 /* Whether -mtune= or -march= were specified */
2552 static int ix86_tune_defaulted;
2553 static int ix86_arch_specified;
2554
2555 /* Vectorization library interface and handlers. */
2556 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2557
2558 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2559 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2560
2561 /* Processor target table, indexed by processor number */
2562 struct ptt
2563 {
2564 const struct processor_costs *cost; /* Processor costs */
2565 const int align_loop; /* Default alignments. */
2566 const int align_loop_max_skip;
2567 const int align_jump;
2568 const int align_jump_max_skip;
2569 const int align_func;
2570 };
2571
2572 static const struct ptt processor_target_table[PROCESSOR_max] =
2573 {
2574 {&i386_cost, 4, 3, 4, 3, 4},
2575 {&i486_cost, 16, 15, 16, 15, 16},
2576 {&pentium_cost, 16, 7, 16, 7, 16},
2577 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2578 {&geode_cost, 0, 0, 0, 0, 0},
2579 {&k6_cost, 32, 7, 32, 7, 32},
2580 {&athlon_cost, 16, 7, 16, 7, 16},
2581 {&pentium4_cost, 0, 0, 0, 0, 0},
2582 {&k8_cost, 16, 7, 16, 7, 16},
2583 {&nocona_cost, 0, 0, 0, 0, 0},
2584 /* Core 2 32-bit. */
2585 {&generic32_cost, 16, 10, 16, 10, 16},
2586 /* Core 2 64-bit. */
2587 {&generic64_cost, 16, 10, 16, 10, 16},
2588 /* Core i7 32-bit. */
2589 {&generic32_cost, 16, 10, 16, 10, 16},
2590 /* Core i7 64-bit. */
2591 {&generic64_cost, 16, 10, 16, 10, 16},
2592 {&generic32_cost, 16, 7, 16, 7, 16},
2593 {&generic64_cost, 16, 10, 16, 10, 16},
2594 {&amdfam10_cost, 32, 24, 32, 7, 32},
2595 {&bdver1_cost, 32, 24, 32, 7, 32},
2596 {&bdver2_cost, 32, 24, 32, 7, 32},
2597 {&btver1_cost, 32, 24, 32, 7, 32},
2598 {&atom_cost, 16, 15, 16, 7, 16}
2599 };
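
/* Reading an entry of the table above against struct ptt: for instance the
   final row (the atom entry) selects atom_cost and requests loops aligned to
   16 bytes with a max skip of 15, jumps aligned to 16 bytes with a max skip
   of 7, and functions aligned to 16 bytes.  */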
2600
2601 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2602 {
2603 "generic",
2604 "i386",
2605 "i486",
2606 "pentium",
2607 "pentium-mmx",
2608 "pentiumpro",
2609 "pentium2",
2610 "pentium3",
2611 "pentium4",
2612 "pentium-m",
2613 "prescott",
2614 "nocona",
2615 "core2",
2616 "corei7",
2617 "atom",
2618 "geode",
2619 "k6",
2620 "k6-2",
2621 "k6-3",
2622 "athlon",
2623 "athlon-4",
2624 "k8",
2625 "amdfam10",
2626 "bdver1",
2627 "bdver2",
2628 "btver1"
2629 };
2630 \f
2631 /* Return true if a red-zone is in use. */
2632
2633 static inline bool
2634 ix86_using_red_zone (void)
2635 {
2636 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2637 }
2638 \f
2639 /* Return a string that documents the current -m options. The caller is
2640 responsible for freeing the string. */
2641
2642 static char *
2643 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2644 const char *tune, enum fpmath_unit fpmath,
2645 bool add_nl_p)
2646 {
2647 struct ix86_target_opts
2648 {
2649 const char *option; /* option string */
2650 HOST_WIDE_INT mask; /* isa mask options */
2651 };
2652
2653 /* This table is ordered so that options like -msse4.2 that imply
2654 preceding options are matched first. */
2655 static struct ix86_target_opts isa_opts[] =
2656 {
2657 { "-m64", OPTION_MASK_ISA_64BIT },
2658 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2659 { "-mfma", OPTION_MASK_ISA_FMA },
2660 { "-mxop", OPTION_MASK_ISA_XOP },
2661 { "-mlwp", OPTION_MASK_ISA_LWP },
2662 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2663 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2664 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2665 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2666 { "-msse3", OPTION_MASK_ISA_SSE3 },
2667 { "-msse2", OPTION_MASK_ISA_SSE2 },
2668 { "-msse", OPTION_MASK_ISA_SSE },
2669 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2670 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2671 { "-mmmx", OPTION_MASK_ISA_MMX },
2672 { "-mabm", OPTION_MASK_ISA_ABM },
2673 { "-mbmi", OPTION_MASK_ISA_BMI },
2674 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2675 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2676 { "-mtbm", OPTION_MASK_ISA_TBM },
2677 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2678 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2679 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2680 { "-maes", OPTION_MASK_ISA_AES },
2681 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2682 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2683 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2684 { "-mf16c", OPTION_MASK_ISA_F16C },
2685 };
2686
2687 /* Flag options. */
2688 static struct ix86_target_opts flag_opts[] =
2689 {
2690 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2691 { "-m80387", MASK_80387 },
2692 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2693 { "-malign-double", MASK_ALIGN_DOUBLE },
2694 { "-mcld", MASK_CLD },
2695 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2696 { "-mieee-fp", MASK_IEEE_FP },
2697 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2698 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2699 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2700 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2701 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2702 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2703 { "-mno-red-zone", MASK_NO_RED_ZONE },
2704 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2705 { "-mrecip", MASK_RECIP },
2706 { "-mrtd", MASK_RTD },
2707 { "-msseregparm", MASK_SSEREGPARM },
2708 { "-mstack-arg-probe", MASK_STACK_PROBE },
2709 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2710 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2711 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2712 { "-mvzeroupper", MASK_VZEROUPPER },
2713 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2714 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2715 { "-mprefer-avx128", MASK_PREFER_AVX128},
2716 };
2717
2718 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2719
2720 char isa_other[40];
2721 char target_other[40];
2722 unsigned num = 0;
2723 unsigned i, j;
2724 char *ret;
2725 char *ptr;
2726 size_t len;
2727 size_t line_len;
2728 size_t sep_len;
2729
2730 memset (opts, '\0', sizeof (opts));
2731
2732 /* Add -march= option. */
2733 if (arch)
2734 {
2735 opts[num][0] = "-march=";
2736 opts[num++][1] = arch;
2737 }
2738
2739 /* Add -mtune= option. */
2740 if (tune)
2741 {
2742 opts[num][0] = "-mtune=";
2743 opts[num++][1] = tune;
2744 }
2745
2746 /* Pick out the options in isa options. */
2747 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2748 {
2749 if ((isa & isa_opts[i].mask) != 0)
2750 {
2751 opts[num++][0] = isa_opts[i].option;
2752 isa &= ~ isa_opts[i].mask;
2753 }
2754 }
2755
2756 if (isa && add_nl_p)
2757 {
2758 opts[num++][0] = isa_other;
2759 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2760 isa);
2761 }
2762
2763 /* Add flag options. */
2764 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2765 {
2766 if ((flags & flag_opts[i].mask) != 0)
2767 {
2768 opts[num++][0] = flag_opts[i].option;
2769 flags &= ~ flag_opts[i].mask;
2770 }
2771 }
2772
2773 if (flags && add_nl_p)
2774 {
2775 opts[num++][0] = target_other;
2776 sprintf (target_other, "(other flags: %#x)", flags);
2777 }
2778
2779 /* Add -fpmath= option. */
2780 if (fpmath)
2781 {
2782 opts[num][0] = "-mfpmath=";
2783 switch ((int) fpmath)
2784 {
2785 case FPMATH_387:
2786 opts[num++][1] = "387";
2787 break;
2788
2789 case FPMATH_SSE:
2790 opts[num++][1] = "sse";
2791 break;
2792
2793 case FPMATH_387 | FPMATH_SSE:
2794 opts[num++][1] = "sse+387";
2795 break;
2796
2797 default:
2798 gcc_unreachable ();
2799 }
2800 }
2801
2802 /* Any options? */
2803 if (num == 0)
2804 return NULL;
2805
2806 gcc_assert (num < ARRAY_SIZE (opts));
2807
2808 /* Size the string. */
2809 len = 0;
2810 sep_len = (add_nl_p) ? 3 : 1;
2811 for (i = 0; i < num; i++)
2812 {
2813 len += sep_len;
2814 for (j = 0; j < 2; j++)
2815 if (opts[i][j])
2816 len += strlen (opts[i][j]);
2817 }
2818
2819 /* Build the string. */
2820 ret = ptr = (char *) xmalloc (len);
2821 line_len = 0;
2822
2823 for (i = 0; i < num; i++)
2824 {
2825 size_t len2[2];
2826
2827 for (j = 0; j < 2; j++)
2828 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2829
2830 if (i != 0)
2831 {
2832 *ptr++ = ' ';
2833 line_len++;
2834
2835 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2836 {
2837 *ptr++ = '\\';
2838 *ptr++ = '\n';
2839 line_len = 0;
2840 }
2841 }
2842
2843 for (j = 0; j < 2; j++)
2844 if (opts[i][j])
2845 {
2846 memcpy (ptr, opts[i][j], len2[j]);
2847 ptr += len2[j];
2848 line_len += len2[j];
2849 }
2850 }
2851
2852 *ptr = '\0';
2853 gcc_assert (ret + len >= ptr);
2854
2855 return ret;
2856 }
2857
2858 /* Return true if profiling code should be emitted before the
2859 prologue, and false otherwise.
2860 Note: for x86, the unsupported "hotfix" combination is rejected with a diagnostic. */
2861 static bool
2862 ix86_profile_before_prologue (void)
2863 {
2864 return flag_fentry != 0;
2865 }
2866
2867 /* Function that is callable from the debugger to print the current
2868 options. */
2869 void
2870 ix86_debug_options (void)
2871 {
2872 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2873 ix86_arch_string, ix86_tune_string,
2874 ix86_fpmath, true);
2875
2876 if (opts)
2877 {
2878 fprintf (stderr, "%s\n\n", opts);
2879 free (opts);
2880 }
2881 else
2882 fputs ("<no options>\n\n", stderr);
2883
2884 return;
2885 }
2886 \f
2887 /* Override various settings based on options. If MAIN_ARGS_P, the
2888 options are from the command line, otherwise they are from
2889 attributes. */
2890
2891 static void
2892 ix86_option_override_internal (bool main_args_p)
2893 {
2894 int i;
2895 unsigned int ix86_arch_mask, ix86_tune_mask;
2896 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2897 const char *prefix;
2898 const char *suffix;
2899 const char *sw;
2900
2901 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2902 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2903 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2904 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2905 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2906 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2907 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2908 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2909 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2910 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2911 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2912 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2913 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2914 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2915 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2916 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2917 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2918 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2919 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2920 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2921 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2922 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2923 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2924 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2925 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2926 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2927 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2928 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2929 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2930 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2931 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2932 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2933 /* If this reaches 64, we need to widen the flags field of struct pta below. */
2934
2935 static struct pta
2936 {
2937 const char *const name; /* processor name or nickname. */
2938 const enum processor_type processor;
2939 const enum attr_cpu schedule;
2940 const unsigned HOST_WIDE_INT flags;
2941 }
2942 const processor_alias_table[] =
2943 {
2944 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2945 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2946 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2947 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2948 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2949 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2950 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2951 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2952 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2953 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2954 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2955 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2956 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2957 PTA_MMX | PTA_SSE},
2958 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2959 PTA_MMX | PTA_SSE},
2960 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2961 PTA_MMX | PTA_SSE | PTA_SSE2},
2962 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2963 PTA_MMX |PTA_SSE | PTA_SSE2},
2964 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2965 PTA_MMX | PTA_SSE | PTA_SSE2},
2966 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2967 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2968 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2969 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2970 | PTA_CX16 | PTA_NO_SAHF},
2971 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2972 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2973 | PTA_SSSE3 | PTA_CX16},
2974 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2975 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2976 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
2977 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2978 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2979 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2980 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2981 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2982 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2983 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2984 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2985 | PTA_RDRND | PTA_F16C},
2986 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2987 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2988 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2989 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2990 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2991 | PTA_FMA | PTA_MOVBE},
2992 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2993 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2994 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
2995 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2996 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A |PTA_PREFETCH_SSE},
2997 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2998 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2999 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3000 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3001 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3002 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3003 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3004 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3005 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3006 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3007 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3008 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3009 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3010 {"x86-64", PROCESSOR_K8, CPU_K8,
3011 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3012 {"k8", PROCESSOR_K8, CPU_K8,
3013 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3014 | PTA_SSE2 | PTA_NO_SAHF},
3015 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3016 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3017 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3018 {"opteron", PROCESSOR_K8, CPU_K8,
3019 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3020 | PTA_SSE2 | PTA_NO_SAHF},
3021 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3022 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3023 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3024 {"athlon64", PROCESSOR_K8, CPU_K8,
3025 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3026 | PTA_SSE2 | PTA_NO_SAHF},
3027 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3028 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3029 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3030 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3031 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3032 | PTA_SSE2 | PTA_NO_SAHF},
3033 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3034 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3035 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3036 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3037 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3038 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3039 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3040 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3041 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3042 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3043 | PTA_XOP | PTA_LWP},
3044 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3045 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3046 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3047 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3048 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3049 | PTA_FMA},
3050 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3051 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3052 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16},
3053 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3054 0 /* flags are only used for -march switch. */ },
3055 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3056 PTA_64BIT /* flags are only used for -march switch. */ },
3057 };
3058
3059 /* -mrecip options. */
3060 static struct
3061 {
3062 const char *string; /* option name */
3063 unsigned int mask; /* mask bits to set */
3064 }
3065 const recip_options[] =
3066 {
3067 { "all", RECIP_MASK_ALL },
3068 { "none", RECIP_MASK_NONE },
3069 { "div", RECIP_MASK_DIV },
3070 { "sqrt", RECIP_MASK_SQRT },
3071 { "vec-div", RECIP_MASK_VEC_DIV },
3072 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3073 };
3074
3075 int const pta_size = ARRAY_SIZE (processor_alias_table);
3076
3077 /* Set up prefix/suffix so the error messages refer to either the command
3078 line argument, or the attribute(target). */
3079 if (main_args_p)
3080 {
3081 prefix = "-m";
3082 suffix = "";
3083 sw = "switch";
3084 }
3085 else
3086 {
3087 prefix = "option(\"";
3088 suffix = "\")";
3089 sw = "attribute";
3090 }
3091
3092 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3093 SUBTARGET_OVERRIDE_OPTIONS;
3094 #endif
3095
3096 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3097 SUBSUBTARGET_OVERRIDE_OPTIONS;
3098 #endif
3099
3100 if (TARGET_X32)
3101 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3102
3103 /* -fPIC is the default for 64-bit Darwin (x86_64). */
3104 if (TARGET_MACHO && TARGET_64BIT)
3105 flag_pic = 2;
3106
3107 /* Need to check -mtune=generic first. */
3108 if (ix86_tune_string)
3109 {
3110 if (!strcmp (ix86_tune_string, "generic")
3111 || !strcmp (ix86_tune_string, "i686")
3112 /* As special support for cross compilers we read -mtune=native
3113 as -mtune=generic. With native compilers we won't see the
3114 -mtune=native, as it was changed by the driver. */
3115 || !strcmp (ix86_tune_string, "native"))
3116 {
3117 if (TARGET_64BIT)
3118 ix86_tune_string = "generic64";
3119 else
3120 ix86_tune_string = "generic32";
3121 }
3122 /* If this call is for setting the option attribute, allow the
3123 generic32/generic64 that was previously set. */
3124 else if (!main_args_p
3125 && (!strcmp (ix86_tune_string, "generic32")
3126 || !strcmp (ix86_tune_string, "generic64")))
3127 ;
3128 else if (!strncmp (ix86_tune_string, "generic", 7))
3129 error ("bad value (%s) for %stune=%s %s",
3130 ix86_tune_string, prefix, suffix, sw);
3131 else if (!strcmp (ix86_tune_string, "x86-64"))
3132 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3133 "%stune=k8%s or %stune=generic%s instead as appropriate",
3134 prefix, suffix, prefix, suffix, prefix, suffix);
3135 }
3136 else
3137 {
3138 if (ix86_arch_string)
3139 ix86_tune_string = ix86_arch_string;
3140 if (!ix86_tune_string)
3141 {
3142 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3143 ix86_tune_defaulted = 1;
3144 }
3145
3146 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3147 need to use a sensible tune option. */
3148 if (!strcmp (ix86_tune_string, "generic")
3149 || !strcmp (ix86_tune_string, "x86-64")
3150 || !strcmp (ix86_tune_string, "i686"))
3151 {
3152 if (TARGET_64BIT)
3153 ix86_tune_string = "generic64";
3154 else
3155 ix86_tune_string = "generic32";
3156 }
3157 }
3158
3159 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3160 {
3161 /* rep; movq isn't available in 32-bit code. */
3162 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3163 ix86_stringop_alg = no_stringop;
3164 }
3165
3166 if (!ix86_arch_string)
3167 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3168 else
3169 ix86_arch_specified = 1;
3170
3171 if (!global_options_set.x_ix86_abi)
3172 ix86_abi = DEFAULT_ABI;
3173
3174 if (global_options_set.x_ix86_cmodel)
3175 {
3176 switch (ix86_cmodel)
3177 {
3178 case CM_SMALL:
3179 case CM_SMALL_PIC:
3180 if (flag_pic)
3181 ix86_cmodel = CM_SMALL_PIC;
3182 if (!TARGET_64BIT)
3183 error ("code model %qs not supported in the %s bit mode",
3184 "small", "32");
3185 break;
3186
3187 case CM_MEDIUM:
3188 case CM_MEDIUM_PIC:
3189 if (flag_pic)
3190 ix86_cmodel = CM_MEDIUM_PIC;
3191 if (!TARGET_64BIT)
3192 error ("code model %qs not supported in the %s bit mode",
3193 "medium", "32");
3194 else if (TARGET_X32)
3195 error ("code model %qs not supported in x32 mode",
3196 "medium");
3197 break;
3198
3199 case CM_LARGE:
3200 case CM_LARGE_PIC:
3201 if (flag_pic)
3202 ix86_cmodel = CM_LARGE_PIC;
3203 if (!TARGET_64BIT)
3204 error ("code model %qs not supported in the %s bit mode",
3205 "large", "32");
3206 else if (TARGET_X32)
3207 error ("code model %qs not supported in x32 mode",
3208 "medium");
3209 break;
3210
3211 case CM_32:
3212 if (flag_pic)
3213 error ("code model %s does not support PIC mode", "32");
3214 if (TARGET_64BIT)
3215 error ("code model %qs not supported in the %s bit mode",
3216 "32", "64");
3217 break;
3218
3219 case CM_KERNEL:
3220 if (flag_pic)
3221 {
3222 error ("code model %s does not support PIC mode", "kernel");
3223 ix86_cmodel = CM_32;
3224 }
3225 if (!TARGET_64BIT)
3226 error ("code model %qs not supported in the %s bit mode",
3227 "kernel", "32");
3228 break;
3229
3230 default:
3231 gcc_unreachable ();
3232 }
3233 }
3234 else
3235 {
3236 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3237 use of rip-relative addressing. This eliminates fixups that
3238 would otherwise be needed if this object is to be placed in a
3239 DLL, and is essentially just as efficient as direct addressing. */
3240 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3241 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3242 else if (TARGET_64BIT)
3243 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3244 else
3245 ix86_cmodel = CM_32;
3246 }
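  /* Illustrative summary (added for clarity, not part of the original code):
     with no explicit -mcmodel, a plain 64-bit SysV compile ends up with
     CM_SMALL (or CM_SMALL_PIC under -fpic/-fPIC), a 64-bit MS-ABI compile
     forces CM_SMALL_PIC, and every 32-bit compile uses CM_32.  */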
3247 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3248 {
3249 error ("-masm=intel not supported in this configuration");
3250 ix86_asm_dialect = ASM_ATT;
3251 }
3252 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3253 sorry ("%i-bit mode not compiled in",
3254 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3255
3256 for (i = 0; i < pta_size; i++)
3257 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3258 {
3259 ix86_schedule = processor_alias_table[i].schedule;
3260 ix86_arch = processor_alias_table[i].processor;
3261 /* Default cpu tuning to the architecture. */
3262 ix86_tune = ix86_arch;
3263
3264 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3265 error ("CPU you selected does not support x86-64 "
3266 "instruction set");
3267
3268 if (processor_alias_table[i].flags & PTA_MMX
3269 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3270 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3271 if (processor_alias_table[i].flags & PTA_3DNOW
3272 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3273 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3274 if (processor_alias_table[i].flags & PTA_3DNOW_A
3275 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3276 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3277 if (processor_alias_table[i].flags & PTA_SSE
3278 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3279 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3280 if (processor_alias_table[i].flags & PTA_SSE2
3281 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3282 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3283 if (processor_alias_table[i].flags & PTA_SSE3
3284 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3285 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3286 if (processor_alias_table[i].flags & PTA_SSSE3
3287 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3288 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3289 if (processor_alias_table[i].flags & PTA_SSE4_1
3290 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3291 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3292 if (processor_alias_table[i].flags & PTA_SSE4_2
3293 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3294 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3295 if (processor_alias_table[i].flags & PTA_AVX
3296 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3297 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3298 if (processor_alias_table[i].flags & PTA_AVX2
3299 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3300 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3301 if (processor_alias_table[i].flags & PTA_FMA
3302 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3303 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3304 if (processor_alias_table[i].flags & PTA_SSE4A
3305 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3306 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3307 if (processor_alias_table[i].flags & PTA_FMA4
3308 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3309 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3310 if (processor_alias_table[i].flags & PTA_XOP
3311 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3312 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3313 if (processor_alias_table[i].flags & PTA_LWP
3314 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3315 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3316 if (processor_alias_table[i].flags & PTA_ABM
3317 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3318 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3319 if (processor_alias_table[i].flags & PTA_BMI
3320 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3321 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3322 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3323 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3324 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3325 if (processor_alias_table[i].flags & PTA_TBM
3326 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3327 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3328 if (processor_alias_table[i].flags & PTA_BMI2
3329 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3330 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3331 if (processor_alias_table[i].flags & PTA_CX16
3332 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3333 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3334 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3335 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3336 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3337 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3338 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3339 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3340 if (processor_alias_table[i].flags & PTA_MOVBE
3341 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3342 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3343 if (processor_alias_table[i].flags & PTA_AES
3344 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3345 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3346 if (processor_alias_table[i].flags & PTA_PCLMUL
3347 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3348 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3349 if (processor_alias_table[i].flags & PTA_FSGSBASE
3350 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3351 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3352 if (processor_alias_table[i].flags & PTA_RDRND
3353 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3354 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3355 if (processor_alias_table[i].flags & PTA_F16C
3356 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3357 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3358 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3359 x86_prefetch_sse = true;
3360
3361 break;
3362 }
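  /* Worked example (added for illustration): -march=amdfam10 matches the
     table entry above carrying PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
     | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM, so this loop
     turns on MMX, 3DNow!, SSE, SSE2, SSE3, SSE4A, CX16 and ABM -- and, via
     the ABM bit, POPCNT and LZCNT as well -- unless the user set those ISA
     flags explicitly on the command line.  */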
3363
3364 if (!strcmp (ix86_arch_string, "generic"))
3365 error ("generic CPU can be used only for %stune=%s %s",
3366 prefix, suffix, sw);
3367 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3368 error ("bad value (%s) for %sarch=%s %s",
3369 ix86_arch_string, prefix, suffix, sw);
3370
3371 ix86_arch_mask = 1u << ix86_arch;
3372 for (i = 0; i < X86_ARCH_LAST; ++i)
3373 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3374
3375 for (i = 0; i < pta_size; i++)
3376 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3377 {
3378 ix86_schedule = processor_alias_table[i].schedule;
3379 ix86_tune = processor_alias_table[i].processor;
3380 if (TARGET_64BIT)
3381 {
3382 if (!(processor_alias_table[i].flags & PTA_64BIT))
3383 {
3384 if (ix86_tune_defaulted)
3385 {
3386 ix86_tune_string = "x86-64";
3387 for (i = 0; i < pta_size; i++)
3388 if (! strcmp (ix86_tune_string,
3389 processor_alias_table[i].name))
3390 break;
3391 ix86_schedule = processor_alias_table[i].schedule;
3392 ix86_tune = processor_alias_table[i].processor;
3393 }
3394 else
3395 error ("CPU you selected does not support x86-64 "
3396 "instruction set");
3397 }
3398 }
3399 else
3400 {
3401 /* Adjust tuning when compiling for 32-bit ABI. */
3402 switch (ix86_tune)
3403 {
3404 case PROCESSOR_GENERIC64:
3405 ix86_tune = PROCESSOR_GENERIC32;
3406 ix86_schedule = CPU_PENTIUMPRO;
3407 break;
3408
3409 case PROCESSOR_CORE2_64:
3410 ix86_tune = PROCESSOR_CORE2_32;
3411 break;
3412
3413 case PROCESSOR_COREI7_64:
3414 ix86_tune = PROCESSOR_COREI7_32;
3415 break;
3416
3417 default:
3418 break;
3419 }
3420 }
3421 /* Intel CPUs have always interpreted SSE prefetch instructions as
3422 NOPs; so, we can enable SSE prefetch instructions even when
3423 -mtune (rather than -march) points us to a processor that has them.
3424 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3425 higher processors. */
3426 if (TARGET_CMOVE
3427 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3428 x86_prefetch_sse = true;
3429 break;
3430 }
3431
3432 if (ix86_tune_specified && i == pta_size)
3433 error ("bad value (%s) for %stune=%s %s",
3434 ix86_tune_string, prefix, suffix, sw);
3435
3436 ix86_tune_mask = 1u << ix86_tune;
3437 for (i = 0; i < X86_TUNE_LAST; ++i)
3438 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3439
3440 #ifndef USE_IX86_FRAME_POINTER
3441 #define USE_IX86_FRAME_POINTER 0
3442 #endif
3443
3444 #ifndef USE_X86_64_FRAME_POINTER
3445 #define USE_X86_64_FRAME_POINTER 0
3446 #endif
3447
3448 /* Set the default values for switches whose default depends on TARGET_64BIT
3449 in case they weren't overwritten by command line options. */
3450 if (TARGET_64BIT)
3451 {
3452 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3453 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3454 if (flag_asynchronous_unwind_tables == 2)
3455 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3456 if (flag_pcc_struct_return == 2)
3457 flag_pcc_struct_return = 0;
3458 }
3459 else
3460 {
3461 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3462 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3463 if (flag_asynchronous_unwind_tables == 2)
3464 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3465 if (flag_pcc_struct_return == 2)
3466 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3467 }
3468
3469 if (optimize_size)
3470 ix86_cost = &ix86_size_cost;
3471 else
3472 ix86_cost = processor_target_table[ix86_tune].cost;
3473
3474 /* Arrange to set up i386_stack_locals for all functions. */
3475 init_machine_status = ix86_init_machine_status;
3476
3477 /* Validate -mregparm= value. */
3478 if (global_options_set.x_ix86_regparm)
3479 {
3480 if (TARGET_64BIT)
3481 warning (0, "-mregparm is ignored in 64-bit mode");
3482 if (ix86_regparm > REGPARM_MAX)
3483 {
3484 error ("-mregparm=%d is not between 0 and %d",
3485 ix86_regparm, REGPARM_MAX);
3486 ix86_regparm = 0;
3487 }
3488 }
3489 if (TARGET_64BIT)
3490 ix86_regparm = REGPARM_MAX;
3491
3492 /* Default align_* from the processor table. */
3493 if (align_loops == 0)
3494 {
3495 align_loops = processor_target_table[ix86_tune].align_loop;
3496 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3497 }
3498 if (align_jumps == 0)
3499 {
3500 align_jumps = processor_target_table[ix86_tune].align_jump;
3501 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3502 }
3503 if (align_functions == 0)
3504 {
3505 align_functions = processor_target_table[ix86_tune].align_func;
3506 }
3507
3508 /* Provide default for -mbranch-cost= value. */
3509 if (!global_options_set.x_ix86_branch_cost)
3510 ix86_branch_cost = ix86_cost->branch_cost;
3511
3512 if (TARGET_64BIT)
3513 {
3514 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3515
3516 /* Enable by default the SSE and MMX builtins. Do allow the user to
3517 explicitly disable any of these. In particular, disabling SSE and
3518 MMX for kernel code is extremely useful. */
3519 if (!ix86_arch_specified)
3520 ix86_isa_flags
3521 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3522 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3523
3524 if (TARGET_RTD)
3525 warning (0, "%srtd%s is ignored in 64-bit mode", prefix, suffix);
3526 }
3527 else
3528 {
3529 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3530
3531 if (!ix86_arch_specified)
3532 ix86_isa_flags
3533 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3534
3535 /* The i386 ABI does not specify a red zone.  It still makes sense to use it
3536 when the programmer takes care to keep the stack from being destroyed. */
3537 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3538 target_flags |= MASK_NO_RED_ZONE;
3539 }
3540
3541 /* Keep nonleaf frame pointers. */
3542 if (flag_omit_frame_pointer)
3543 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3544 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3545 flag_omit_frame_pointer = 1;
3546
3547 /* If we're doing fast math, we don't care about comparison order
3548 wrt NaNs. This lets us use a shorter comparison sequence. */
3549 if (flag_finite_math_only)
3550 target_flags &= ~MASK_IEEE_FP;
3551
3552 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3553 since the insns won't need emulation. */
3554 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3555 target_flags &= ~MASK_NO_FANCY_MATH_387;
3556
3557 /* Likewise, if the target doesn't have a 387, or we've specified
3558 software floating point, don't use 387 inline intrinsics. */
3559 if (!TARGET_80387)
3560 target_flags |= MASK_NO_FANCY_MATH_387;
3561
3562 /* Turn on MMX builtins for -msse. */
3563 if (TARGET_SSE)
3564 {
3565 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3566 x86_prefetch_sse = true;
3567 }
3568
3569 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3570 if (TARGET_SSE4_2 || TARGET_ABM)
3571 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3572
3573 /* Turn on lzcnt instruction for -mabm. */
3574 if (TARGET_ABM)
3575 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3576
3577 /* Validate -mpreferred-stack-boundary= value or default it to
3578 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3579 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3580 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3581 {
3582 int min = (TARGET_64BIT ? 4 : 2);
3583 int max = (TARGET_SEH ? 4 : 12);
3584
3585 if (ix86_preferred_stack_boundary_arg < min
3586 || ix86_preferred_stack_boundary_arg > max)
3587 {
3588 if (min == max)
3589 error ("-mpreferred-stack-boundary is not supported "
3590 "for this target");
3591 else
3592 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3593 ix86_preferred_stack_boundary_arg, min, max);
3594 }
3595 else
3596 ix86_preferred_stack_boundary
3597 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3598 }
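  /* Example (illustrative only): -mpreferred-stack-boundary=4 yields
     (1 << 4) * BITS_PER_UNIT = 128 bits, i.e. 16-byte stack alignment,
     which is why the 64-bit minimum of 4 corresponds to the psABI's
     16-byte requirement.  */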
3599
3600 /* Set the default value for -mstackrealign. */
3601 if (ix86_force_align_arg_pointer == -1)
3602 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3603
3604 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3605
3606 /* Validate -mincoming-stack-boundary= value or default it to
3607 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3608 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3609 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3610 {
3611 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3612 || ix86_incoming_stack_boundary_arg > 12)
3613 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3614 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3615 else
3616 {
3617 ix86_user_incoming_stack_boundary
3618 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3619 ix86_incoming_stack_boundary
3620 = ix86_user_incoming_stack_boundary;
3621 }
3622 }
3623
3624 /* Accept -msseregparm only if at least SSE support is enabled. */
3625 if (TARGET_SSEREGPARM
3626 && ! TARGET_SSE)
3627 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3628
3629 if (global_options_set.x_ix86_fpmath)
3630 {
3631 if (ix86_fpmath & FPMATH_SSE)
3632 {
3633 if (!TARGET_SSE)
3634 {
3635 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3636 ix86_fpmath = FPMATH_387;
3637 }
3638 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3639 {
3640 warning (0, "387 instruction set disabled, using SSE arithmetics");
3641 ix86_fpmath = FPMATH_SSE;
3642 }
3643 }
3644 }
3645 else
3646 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3647
3648 /* If the i387 is disabled, then do not return values in it. */
3649 if (!TARGET_80387)
3650 target_flags &= ~MASK_FLOAT_RETURNS;
3651
3652 /* Use external vectorized library in vectorizing intrinsics. */
3653 if (global_options_set.x_ix86_veclibabi_type)
3654 switch (ix86_veclibabi_type)
3655 {
3656 case ix86_veclibabi_type_svml:
3657 ix86_veclib_handler = ix86_veclibabi_svml;
3658 break;
3659
3660 case ix86_veclibabi_type_acml:
3661 ix86_veclib_handler = ix86_veclibabi_acml;
3662 break;
3663
3664 default:
3665 gcc_unreachable ();
3666 }
3667
3668 if ((!USE_IX86_FRAME_POINTER
3669 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3670 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3671 && !optimize_size)
3672 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3673
3674 /* ??? Unwind info is not correct around the CFG unless either a frame
3675 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3676 unwind info generation to be aware of the CFG and propagating states
3677 around edges. */
3678 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3679 || flag_exceptions || flag_non_call_exceptions)
3680 && flag_omit_frame_pointer
3681 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3682 {
3683 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3684 warning (0, "unwind tables currently require either a frame pointer "
3685 "or %saccumulate-outgoing-args%s for correctness",
3686 prefix, suffix);
3687 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3688 }
3689
3690 /* If stack probes are required, the space used for large function
3691 arguments on the stack must also be probed, so enable
3692 -maccumulate-outgoing-args so this happens in the prologue. */
3693 if (TARGET_STACK_PROBE
3694 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3695 {
3696 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3697 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3698 "for correctness", prefix, suffix);
3699 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3700 }
3701
3702 /* For sane SSE instruction set generation we need the fcomi instruction.
3703 It is safe to enable all CMOVE instructions.  Also, the RDRAND intrinsic
3704 expands to a sequence that includes a conditional move. */
3705 if (TARGET_SSE || TARGET_RDRND)
3706 TARGET_CMOVE = 1;
3707
3708 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3709 {
3710 char *p;
3711 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3712 p = strchr (internal_label_prefix, 'X');
3713 internal_label_prefix_len = p - internal_label_prefix;
3714 *p = '\0';
3715 }
3716
3717 /* When the scheduling description is not available, disable the scheduler
3718 pass so it won't slow down compilation or make x87 code slower. */
3719 if (!TARGET_SCHEDULE)
3720 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3721
3722 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3723 ix86_cost->simultaneous_prefetches,
3724 global_options.x_param_values,
3725 global_options_set.x_param_values);
3726 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
3727 global_options.x_param_values,
3728 global_options_set.x_param_values);
3729 maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
3730 global_options.x_param_values,
3731 global_options_set.x_param_values);
3732 maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
3733 global_options.x_param_values,
3734 global_options_set.x_param_values);
3735
3736 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3737 if (flag_prefetch_loop_arrays < 0
3738 && HAVE_prefetch
3739 && optimize >= 3
3740 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3741 flag_prefetch_loop_arrays = 1;
3742
3743 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3744 can be optimized to ap = __builtin_next_arg (0). */
3745 if (!TARGET_64BIT && !flag_split_stack)
3746 targetm.expand_builtin_va_start = NULL;
3747
3748 if (TARGET_64BIT)
3749 {
3750 ix86_gen_leave = gen_leave_rex64;
3751 ix86_gen_add3 = gen_adddi3;
3752 ix86_gen_sub3 = gen_subdi3;
3753 ix86_gen_sub3_carry = gen_subdi3_carry;
3754 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3755 ix86_gen_monitor = gen_sse3_monitor64;
3756 ix86_gen_andsp = gen_anddi3;
3757 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3758 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3759 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3760 }
3761 else
3762 {
3763 ix86_gen_leave = gen_leave;
3764 ix86_gen_add3 = gen_addsi3;
3765 ix86_gen_sub3 = gen_subsi3;
3766 ix86_gen_sub3_carry = gen_subsi3_carry;
3767 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3768 ix86_gen_monitor = gen_sse3_monitor;
3769 ix86_gen_andsp = gen_andsi3;
3770 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3771 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3772 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3773 }
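  /* The ix86_gen_* hooks chosen above let the rest of the back end emit
     word-size-agnostic RTL; e.g. (illustrative, hedged sketch) a stack
     adjustment can be written once as
       emit_insn (ix86_gen_sub3 (stack_pointer_rtx, stack_pointer_rtx,
                                 GEN_INT (16)));
     and will expand to subsi3 or subdi3 depending on TARGET_64BIT.  */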
3774
3775 #ifdef USE_IX86_CLD
3776 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3777 if (!TARGET_64BIT)
3778 target_flags |= MASK_CLD & ~target_flags_explicit;
3779 #endif
3780
3781 if (!TARGET_64BIT && flag_pic)
3782 {
3783 if (flag_fentry > 0)
3784 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3785 "with -fpic");
3786 flag_fentry = 0;
3787 }
3788 else if (TARGET_SEH)
3789 {
3790 if (flag_fentry == 0)
3791 sorry ("-mno-fentry isn%'t compatible with SEH");
3792 flag_fentry = 1;
3793 }
3794 else if (flag_fentry < 0)
3795 {
3796 #if defined(PROFILE_BEFORE_PROLOGUE)
3797 flag_fentry = 1;
3798 #else
3799 flag_fentry = 0;
3800 #endif
3801 }
3802
3803 if (TARGET_AVX)
3804 {
3805 /* When not optimizing for size, enable vzeroupper optimization for
3806 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3807 AVX unaligned load/store. */
3808 if (!optimize_size)
3809 {
3810 if (flag_expensive_optimizations
3811 && !(target_flags_explicit & MASK_VZEROUPPER))
3812 target_flags |= MASK_VZEROUPPER;
3813 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3814 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3815 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3816 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3817 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3818 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3819 /* Enable 128-bit AVX instruction generation for the auto-vectorizer. */
3820 if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
3821 target_flags |= MASK_PREFER_AVX128;
3822 }
3823 }
3824 else
3825 {
3826 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3827 target_flags &= ~MASK_VZEROUPPER;
3828 }
3829
3830 if (ix86_recip_name)
3831 {
3832 char *p = ASTRDUP (ix86_recip_name);
3833 char *q;
3834 unsigned int mask, i;
3835 bool invert;
3836
3837 while ((q = strtok (p, ",")) != NULL)
3838 {
3839 p = NULL;
3840 if (*q == '!')
3841 {
3842 invert = true;
3843 q++;
3844 }
3845 else
3846 invert = false;
3847
3848 if (!strcmp (q, "default"))
3849 mask = RECIP_MASK_ALL;
3850 else
3851 {
3852 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3853 if (!strcmp (q, recip_options[i].string))
3854 {
3855 mask = recip_options[i].mask;
3856 break;
3857 }
3858
3859 if (i == ARRAY_SIZE (recip_options))
3860 {
3861 error ("unknown option for -mrecip=%s", q);
3862 invert = false;
3863 mask = RECIP_MASK_NONE;
3864 }
3865 }
3866
3867 recip_mask_explicit |= mask;
3868 if (invert)
3869 recip_mask &= ~mask;
3870 else
3871 recip_mask |= mask;
3872 }
3873 }
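  /* Example (added for illustration): -mrecip=all,!sqrt first sets every bit
     of RECIP_MASK_ALL and then clears RECIP_MASK_SQRT, i.e. it enables every
     reciprocal approximation except the scalar square root.  */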
3874
3875 if (TARGET_RECIP)
3876 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3877 else if (target_flags_explicit & MASK_RECIP)
3878 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3879
3880 /* Save the initial options in case the user does function specific
3881 options. */
3882 if (main_args_p)
3883 target_option_default_node = target_option_current_node
3884 = build_target_option_node ();
3885 }
3886
3887 /* Return TRUE if VAL is passed in a register with a 256-bit AVX mode. */
3888
3889 static bool
3890 function_pass_avx256_p (const_rtx val)
3891 {
3892 if (!val)
3893 return false;
3894
3895 if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
3896 return true;
3897
3898 if (GET_CODE (val) == PARALLEL)
3899 {
3900 int i;
3901 rtx r;
3902
3903 for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
3904 {
3905 r = XVECEXP (val, 0, i);
3906 if (GET_CODE (r) == EXPR_LIST
3907 && XEXP (r, 0)
3908 && REG_P (XEXP (r, 0))
3909 && (GET_MODE (XEXP (r, 0)) == OImode
3910 || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
3911 return true;
3912 }
3913 }
3914
3915 return false;
3916 }
3917
3918 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3919
3920 static void
3921 ix86_option_override (void)
3922 {
3923 ix86_option_override_internal (true);
3924 }
3925
3926 /* Update register usage after having seen the compiler flags. */
3927
3928 static void
3929 ix86_conditional_register_usage (void)
3930 {
3931 int i;
3932 unsigned int j;
3933
3934 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3935 {
3936 if (fixed_regs[i] > 1)
3937 fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
3938 if (call_used_regs[i] > 1)
3939 call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
3940 }
3941
3942 /* The PIC register, if it exists, is fixed. */
3943 j = PIC_OFFSET_TABLE_REGNUM;
3944 if (j != INVALID_REGNUM)
3945 fixed_regs[j] = call_used_regs[j] = 1;
3946
3947 /* The 64-bit MS_ABI changes the set of call-used registers. */
3948 if (TARGET_64BIT_MS_ABI)
3949 {
3950 call_used_regs[SI_REG] = 0;
3951 call_used_regs[DI_REG] = 0;
3952 call_used_regs[XMM6_REG] = 0;
3953 call_used_regs[XMM7_REG] = 0;
3954 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3955 call_used_regs[i] = 0;
3956 }
3957
3958 /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
3959 other call-clobbered regs for 64-bit. */
3960 if (TARGET_64BIT)
3961 {
3962 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3963
3964 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3965 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3966 && call_used_regs[i])
3967 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3968 }
3969
3970 /* If MMX is disabled, squash the registers. */
3971 if (! TARGET_MMX)
3972 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3973 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3974 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3975
3976 /* If SSE is disabled, squash the registers. */
3977 if (! TARGET_SSE)
3978 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3979 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3980 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3981
3982 /* If the FPU is disabled, squash the registers. */
3983 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3984 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3985 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3986 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3987
3988 /* If 32-bit, squash the 64-bit registers. */
3989 if (! TARGET_64BIT)
3990 {
3991 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3992 reg_names[i] = "";
3993 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3994 reg_names[i] = "";
3995 }
3996 }
3997
3998 \f
3999 /* Save the current options */
4000
4001 static void
4002 ix86_function_specific_save (struct cl_target_option *ptr)
4003 {
4004 ptr->arch = ix86_arch;
4005 ptr->schedule = ix86_schedule;
4006 ptr->tune = ix86_tune;
4007 ptr->branch_cost = ix86_branch_cost;
4008 ptr->tune_defaulted = ix86_tune_defaulted;
4009 ptr->arch_specified = ix86_arch_specified;
4010 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
4011 ptr->ix86_target_flags_explicit = target_flags_explicit;
4012 ptr->x_recip_mask_explicit = recip_mask_explicit;
4013
4014 /* The fields are char but the variables are not; make sure the
4015 values fit in the fields. */
4016 gcc_assert (ptr->arch == ix86_arch);
4017 gcc_assert (ptr->schedule == ix86_schedule);
4018 gcc_assert (ptr->tune == ix86_tune);
4019 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4020 }
4021
4022 /* Restore the current options */
4023
4024 static void
4025 ix86_function_specific_restore (struct cl_target_option *ptr)
4026 {
4027 enum processor_type old_tune = ix86_tune;
4028 enum processor_type old_arch = ix86_arch;
4029 unsigned int ix86_arch_mask, ix86_tune_mask;
4030 int i;
4031
4032 ix86_arch = (enum processor_type) ptr->arch;
4033 ix86_schedule = (enum attr_cpu) ptr->schedule;
4034 ix86_tune = (enum processor_type) ptr->tune;
4035 ix86_branch_cost = ptr->branch_cost;
4036 ix86_tune_defaulted = ptr->tune_defaulted;
4037 ix86_arch_specified = ptr->arch_specified;
4038 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4039 target_flags_explicit = ptr->ix86_target_flags_explicit;
4040 recip_mask_explicit = ptr->x_recip_mask_explicit;
4041
4042 /* Recreate the arch feature tests if the arch changed */
4043 if (old_arch != ix86_arch)
4044 {
4045 ix86_arch_mask = 1u << ix86_arch;
4046 for (i = 0; i < X86_ARCH_LAST; ++i)
4047 ix86_arch_features[i]
4048 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4049 }
4050
4051 /* Recreate the tune optimization tests */
4052 if (old_tune != ix86_tune)
4053 {
4054 ix86_tune_mask = 1u << ix86_tune;
4055 for (i = 0; i < X86_TUNE_LAST; ++i)
4056 ix86_tune_features[i]
4057 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4058 }
4059 }
4060
4061 /* Print the current options */
4062
4063 static void
4064 ix86_function_specific_print (FILE *file, int indent,
4065 struct cl_target_option *ptr)
4066 {
4067 char *target_string
4068 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4069 NULL, NULL, ptr->x_ix86_fpmath, false);
4070
4071 fprintf (file, "%*sarch = %d (%s)\n",
4072 indent, "",
4073 ptr->arch,
4074 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4075 ? cpu_names[ptr->arch]
4076 : "<unknown>"));
4077
4078 fprintf (file, "%*stune = %d (%s)\n",
4079 indent, "",
4080 ptr->tune,
4081 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4082 ? cpu_names[ptr->tune]
4083 : "<unknown>"));
4084
4085 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4086
4087 if (target_string)
4088 {
4089 fprintf (file, "%*s%s\n", indent, "", target_string);
4090 free (target_string);
4091 }
4092 }
4093
4094 \f
4095 /* Inner function to process the attribute((target(...))), take an argument and
4096 set the current options from the argument. If we have a list, recursively go
4097 over the list. */
4098
4099 static bool
4100 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4101 struct gcc_options *enum_opts_set)
4102 {
4103 char *next_optstr;
4104 bool ret = true;
4105
4106 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4107 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4108 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4109 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4110 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4111
4112 enum ix86_opt_type
4113 {
4114 ix86_opt_unknown,
4115 ix86_opt_yes,
4116 ix86_opt_no,
4117 ix86_opt_str,
4118 ix86_opt_enum,
4119 ix86_opt_isa
4120 };
4121
4122 static const struct
4123 {
4124 const char *string;
4125 size_t len;
4126 enum ix86_opt_type type;
4127 int opt;
4128 int mask;
4129 } attrs[] = {
4130 /* isa options */
4131 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4132 IX86_ATTR_ISA ("abm", OPT_mabm),
4133 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4134 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4135 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4136 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4137 IX86_ATTR_ISA ("aes", OPT_maes),
4138 IX86_ATTR_ISA ("avx", OPT_mavx),
4139 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4140 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4141 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4142 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4143 IX86_ATTR_ISA ("sse", OPT_msse),
4144 IX86_ATTR_ISA ("sse2", OPT_msse2),
4145 IX86_ATTR_ISA ("sse3", OPT_msse3),
4146 IX86_ATTR_ISA ("sse4", OPT_msse4),
4147 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4148 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4149 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4150 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4151 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4152 IX86_ATTR_ISA ("fma", OPT_mfma),
4153 IX86_ATTR_ISA ("xop", OPT_mxop),
4154 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4155 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4156 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4157 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4158
4159 /* enum options */
4160 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4161
4162 /* string options */
4163 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4164 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4165
4166 /* flag options */
4167 IX86_ATTR_YES ("cld",
4168 OPT_mcld,
4169 MASK_CLD),
4170
4171 IX86_ATTR_NO ("fancy-math-387",
4172 OPT_mfancy_math_387,
4173 MASK_NO_FANCY_MATH_387),
4174
4175 IX86_ATTR_YES ("ieee-fp",
4176 OPT_mieee_fp,
4177 MASK_IEEE_FP),
4178
4179 IX86_ATTR_YES ("inline-all-stringops",
4180 OPT_minline_all_stringops,
4181 MASK_INLINE_ALL_STRINGOPS),
4182
4183 IX86_ATTR_YES ("inline-stringops-dynamically",
4184 OPT_minline_stringops_dynamically,
4185 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4186
4187 IX86_ATTR_NO ("align-stringops",
4188 OPT_mno_align_stringops,
4189 MASK_NO_ALIGN_STRINGOPS),
4190
4191 IX86_ATTR_YES ("recip",
4192 OPT_mrecip,
4193 MASK_RECIP),
4194
4195 };
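  /* For example (illustrative only), a declaration such as
       int foo (void) __attribute__((target("sse4.2,no-avx,arch=core2,fpmath=sse")));
     is split on commas below: "sse4.2" and "no-avx" hit ISA entries,
     "arch=" is a string option and "fpmath=" an enum option from the
     table above.  */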
4196
4197 /* If this is a list, recurse to get the options. */
4198 if (TREE_CODE (args) == TREE_LIST)
4199 {
4200 bool ret = true;
4201
4202 for (; args; args = TREE_CHAIN (args))
4203 if (TREE_VALUE (args)
4204 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4205 p_strings, enum_opts_set))
4206 ret = false;
4207
4208 return ret;
4209 }
4210
4211 else if (TREE_CODE (args) != STRING_CST)
4212 gcc_unreachable ();
4213
4214 /* Handle multiple arguments separated by commas. */
4215 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4216
4217 while (next_optstr && *next_optstr != '\0')
4218 {
4219 char *p = next_optstr;
4220 char *orig_p = p;
4221 char *comma = strchr (next_optstr, ',');
4222 const char *opt_string;
4223 size_t len, opt_len;
4224 int opt;
4225 bool opt_set_p;
4226 char ch;
4227 unsigned i;
4228 enum ix86_opt_type type = ix86_opt_unknown;
4229 int mask = 0;
4230
4231 if (comma)
4232 {
4233 *comma = '\0';
4234 len = comma - next_optstr;
4235 next_optstr = comma + 1;
4236 }
4237 else
4238 {
4239 len = strlen (p);
4240 next_optstr = NULL;
4241 }
4242
4243 /* Recognize no-xxx. */
4244 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4245 {
4246 opt_set_p = false;
4247 p += 3;
4248 len -= 3;
4249 }
4250 else
4251 opt_set_p = true;
4252
4253 /* Find the option. */
4254 ch = *p;
4255 opt = N_OPTS;
4256 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4257 {
4258 type = attrs[i].type;
4259 opt_len = attrs[i].len;
4260 if (ch == attrs[i].string[0]
4261 && ((type != ix86_opt_str && type != ix86_opt_enum)
4262 ? len == opt_len
4263 : len > opt_len)
4264 && memcmp (p, attrs[i].string, opt_len) == 0)
4265 {
4266 opt = attrs[i].opt;
4267 mask = attrs[i].mask;
4268 opt_string = attrs[i].string;
4269 break;
4270 }
4271 }
4272
4273 /* Process the option. */
4274 if (opt == N_OPTS)
4275 {
4276 error ("attribute(target(\"%s\")) is unknown", orig_p);
4277 ret = false;
4278 }
4279
4280 else if (type == ix86_opt_isa)
4281 {
4282 struct cl_decoded_option decoded;
4283
4284 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4285 ix86_handle_option (&global_options, &global_options_set,
4286 &decoded, input_location);
4287 }
4288
4289 else if (type == ix86_opt_yes || type == ix86_opt_no)
4290 {
4291 if (type == ix86_opt_no)
4292 opt_set_p = !opt_set_p;
4293
4294 if (opt_set_p)
4295 target_flags |= mask;
4296 else
4297 target_flags &= ~mask;
4298 }
4299
4300 else if (type == ix86_opt_str)
4301 {
4302 if (p_strings[opt])
4303 {
4304 error ("option(\"%s\") was already specified", opt_string);
4305 ret = false;
4306 }
4307 else
4308 p_strings[opt] = xstrdup (p + opt_len);
4309 }
4310
4311 else if (type == ix86_opt_enum)
4312 {
4313 bool arg_ok;
4314 int value;
4315
4316 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4317 if (arg_ok)
4318 set_option (&global_options, enum_opts_set, opt, value,
4319 p + opt_len, DK_UNSPECIFIED, input_location,
4320 global_dc);
4321 else
4322 {
4323 error ("attribute(target(\"%s\")) is unknown", orig_p);
4324 ret = false;
4325 }
4326 }
4327
4328 else
4329 gcc_unreachable ();
4330 }
4331
4332 return ret;
4333 }
4334
4335 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4336
4337 tree
4338 ix86_valid_target_attribute_tree (tree args)
4339 {
4340 const char *orig_arch_string = ix86_arch_string;
4341 const char *orig_tune_string = ix86_tune_string;
4342 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4343 int orig_tune_defaulted = ix86_tune_defaulted;
4344 int orig_arch_specified = ix86_arch_specified;
4345 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4346 tree t = NULL_TREE;
4347 int i;
4348 struct cl_target_option *def
4349 = TREE_TARGET_OPTION (target_option_default_node);
4350 struct gcc_options enum_opts_set;
4351
4352 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4353
4354 /* Process each of the options on the chain. */
4355 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4356 &enum_opts_set))
4357 return NULL_TREE;
4358
4359 /* If the changed options are different from the default, rerun
4360 ix86_option_override_internal, and then save the options away.
4361 The string options are attribute options, and will be undone
4362 when we copy the save structure. */
4363 if (ix86_isa_flags != def->x_ix86_isa_flags
4364 || target_flags != def->x_target_flags
4365 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4366 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4367 || enum_opts_set.x_ix86_fpmath)
4368 {
4369 /* If we are using the default tune= or arch=, undo the string assigned,
4370 and use the default. */
4371 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4372 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4373 else if (!orig_arch_specified)
4374 ix86_arch_string = NULL;
4375
4376 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4377 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4378 else if (orig_tune_defaulted)
4379 ix86_tune_string = NULL;
4380
4381 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4382 if (enum_opts_set.x_ix86_fpmath)
4383 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4384 else if (!TARGET_64BIT && TARGET_SSE)
4385 {
4386 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4387 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4388 }
4389
4390 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4391 ix86_option_override_internal (false);
4392
4393 /* Add any builtin functions with the new isa if any. */
4394 ix86_add_new_builtins (ix86_isa_flags);
4395
4396 /* Save the current options unless we are validating options for
4397 #pragma. */
4398 t = build_target_option_node ();
4399
4400 ix86_arch_string = orig_arch_string;
4401 ix86_tune_string = orig_tune_string;
4402 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4403
4404 /* Free up memory allocated to hold the strings */
4405 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4406 free (option_strings[i]);
4407 }
4408
4409 return t;
4410 }
4411
4412 /* Hook to validate attribute((target("string"))). */
4413
4414 static bool
4415 ix86_valid_target_attribute_p (tree fndecl,
4416 tree ARG_UNUSED (name),
4417 tree args,
4418 int ARG_UNUSED (flags))
4419 {
4420 struct cl_target_option cur_target;
4421 bool ret = true;
4422 tree old_optimize = build_optimization_node ();
4423 tree new_target, new_optimize;
4424 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4425
4426 /* If the function changed the optimization levels as well as setting target
4427 options, start with the optimizations specified. */
4428 if (func_optimize && func_optimize != old_optimize)
4429 cl_optimization_restore (&global_options,
4430 TREE_OPTIMIZATION (func_optimize));
4431
4432 /* The target attributes may also change some optimization flags, so update
4433 the optimization options if necessary. */
4434 cl_target_option_save (&cur_target, &global_options);
4435 new_target = ix86_valid_target_attribute_tree (args);
4436 new_optimize = build_optimization_node ();
4437
4438 if (!new_target)
4439 ret = false;
4440
4441 else if (fndecl)
4442 {
4443 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4444
4445 if (old_optimize != new_optimize)
4446 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4447 }
4448
4449 cl_target_option_restore (&global_options, &cur_target);
4450
4451 if (old_optimize != new_optimize)
4452 cl_optimization_restore (&global_options,
4453 TREE_OPTIMIZATION (old_optimize));
4454
4455 return ret;
4456 }
4457
4458 \f
4459 /* Hook to determine if one function can safely inline another. */
4460
4461 static bool
4462 ix86_can_inline_p (tree caller, tree callee)
4463 {
4464 bool ret = false;
4465 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4466 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4467
4468 /* If callee has no option attributes, then it is ok to inline. */
4469 if (!callee_tree)
4470 ret = true;
4471
4472 /* If caller has no option attributes, but callee does then it is not ok to
4473 inline. */
4474 else if (!caller_tree)
4475 ret = false;
4476
4477 else
4478 {
4479 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4480 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4481
4482 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
4483 function can inline an SSE2 function, but an SSE2 function can't inline an SSE4
4484 function. */
4485 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4486 != callee_opts->x_ix86_isa_flags)
4487 ret = false;
4488
4489 /* See if we have the same non-isa options. */
4490 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4491 ret = false;
4492
4493 /* See if arch, tune, etc. are the same. */
4494 else if (caller_opts->arch != callee_opts->arch)
4495 ret = false;
4496
4497 else if (caller_opts->tune != callee_opts->tune)
4498 ret = false;
4499
4500 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4501 ret = false;
4502
4503 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4504 ret = false;
4505
4506 else
4507 ret = true;
4508 }
4509
4510 return ret;
4511 }
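/* Example (not part of the original source): a caller compiled with
   attribute((target("avx"))) may inline a callee compiled with
   attribute((target("sse2"))), because the callee's ISA flags are a subset
   of the caller's; the opposite direction is rejected above.  */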
4512
4513 \f
4514 /* Remember the last target of ix86_set_current_function. */
4515 static GTY(()) tree ix86_previous_fndecl;
4516
4517 /* Establish appropriate back-end context for processing the function
4518 FNDECL. The argument might be NULL to indicate processing at top
4519 level, outside of any function scope. */
4520 static void
4521 ix86_set_current_function (tree fndecl)
4522 {
4523 /* Only change the context if the function changes. This hook is called
4524 several times in the course of compiling a function, and we don't want to
4525 slow things down too much or call target_reinit when it isn't safe. */
4526 if (fndecl && fndecl != ix86_previous_fndecl)
4527 {
4528 tree old_tree = (ix86_previous_fndecl
4529 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4530 : NULL_TREE);
4531
4532 tree new_tree = (fndecl
4533 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4534 : NULL_TREE);
4535
4536 ix86_previous_fndecl = fndecl;
4537 if (old_tree == new_tree)
4538 ;
4539
4540 else if (new_tree)
4541 {
4542 cl_target_option_restore (&global_options,
4543 TREE_TARGET_OPTION (new_tree));
4544 target_reinit ();
4545 }
4546
4547 else if (old_tree)
4548 {
4549 struct cl_target_option *def
4550 = TREE_TARGET_OPTION (target_option_current_node);
4551
4552 cl_target_option_restore (&global_options, def);
4553 target_reinit ();
4554 }
4555 }
4556 }
4557
4558 \f
4559 /* Return true if this goes in large data/bss. */
4560
4561 static bool
4562 ix86_in_large_data_p (tree exp)
4563 {
4564 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4565 return false;
4566
4567 /* Functions are never large data. */
4568 if (TREE_CODE (exp) == FUNCTION_DECL)
4569 return false;
4570
4571 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4572 {
4573 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4574 if (strcmp (section, ".ldata") == 0
4575 || strcmp (section, ".lbss") == 0)
4576 return true;
4577 return false;
4578 }
4579 else
4580 {
4581 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4582
4583 /* If this is an incomplete type with size 0, then we can't put it
4584 in data because it might be too big when completed. */
4585 if (!size || size > ix86_section_threshold)
4586 return true;
4587 }
4588
4589 return false;
4590 }
4591
4592 /* Switch to the appropriate section for output of DECL.
4593 DECL is either a `VAR_DECL' node or a constant of some sort.
4594 RELOC indicates whether forming the initial value of DECL requires
4595 link-time relocations. */
4596
4597 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4598 ATTRIBUTE_UNUSED;
4599
4600 static section *
4601 x86_64_elf_select_section (tree decl, int reloc,
4602 unsigned HOST_WIDE_INT align)
4603 {
4604 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4605 && ix86_in_large_data_p (decl))
4606 {
4607 const char *sname = NULL;
4608 unsigned int flags = SECTION_WRITE;
4609 switch (categorize_decl_for_section (decl, reloc))
4610 {
4611 case SECCAT_DATA:
4612 sname = ".ldata";
4613 break;
4614 case SECCAT_DATA_REL:
4615 sname = ".ldata.rel";
4616 break;
4617 case SECCAT_DATA_REL_LOCAL:
4618 sname = ".ldata.rel.local";
4619 break;
4620 case SECCAT_DATA_REL_RO:
4621 sname = ".ldata.rel.ro";
4622 break;
4623 case SECCAT_DATA_REL_RO_LOCAL:
4624 sname = ".ldata.rel.ro.local";
4625 break;
4626 case SECCAT_BSS:
4627 sname = ".lbss";
4628 flags |= SECTION_BSS;
4629 break;
4630 case SECCAT_RODATA:
4631 case SECCAT_RODATA_MERGE_STR:
4632 case SECCAT_RODATA_MERGE_STR_INIT:
4633 case SECCAT_RODATA_MERGE_CONST:
4634 sname = ".lrodata";
4635 flags = 0;
4636 break;
4637 case SECCAT_SRODATA:
4638 case SECCAT_SDATA:
4639 case SECCAT_SBSS:
4640 gcc_unreachable ();
4641 case SECCAT_TEXT:
4642 case SECCAT_TDATA:
4643 case SECCAT_TBSS:
4644 /* We don't split these for the medium model.  Place them into
4645 the default sections and hope for the best. */
4646 break;
4647 }
4648 if (sname)
4649 {
4650 /* We might get called with string constants, but get_named_section
4651 doesn't like them as they are not DECLs. Also, we need to set
4652 flags in that case. */
4653 if (!DECL_P (decl))
4654 return get_section (sname, flags, NULL);
4655 return get_named_section (decl, sname, reloc);
4656 }
4657 }
4658 return default_elf_select_section (decl, reloc, align);
4659 }
4660
4661 /* Build up a unique section name, expressed as a
4662 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4663 RELOC indicates whether the initial value of EXP requires
4664 link-time relocations. */
4665
4666 static void ATTRIBUTE_UNUSED
4667 x86_64_elf_unique_section (tree decl, int reloc)
4668 {
4669 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4670 && ix86_in_large_data_p (decl))
4671 {
4672 const char *prefix = NULL;
4673 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4674 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4675
4676 switch (categorize_decl_for_section (decl, reloc))
4677 {
4678 case SECCAT_DATA:
4679 case SECCAT_DATA_REL:
4680 case SECCAT_DATA_REL_LOCAL:
4681 case SECCAT_DATA_REL_RO:
4682 case SECCAT_DATA_REL_RO_LOCAL:
4683 prefix = one_only ? ".ld" : ".ldata";
4684 break;
4685 case SECCAT_BSS:
4686 prefix = one_only ? ".lb" : ".lbss";
4687 break;
4688 case SECCAT_RODATA:
4689 case SECCAT_RODATA_MERGE_STR:
4690 case SECCAT_RODATA_MERGE_STR_INIT:
4691 case SECCAT_RODATA_MERGE_CONST:
4692 prefix = one_only ? ".lr" : ".lrodata";
4693 break;
4694 case SECCAT_SRODATA:
4695 case SECCAT_SDATA:
4696 case SECCAT_SBSS:
4697 gcc_unreachable ();
4698 case SECCAT_TEXT:
4699 case SECCAT_TDATA:
4700 case SECCAT_TBSS:
4701 /* We don't split these for the medium model.  Place them into
4702 the default sections and hope for the best. */
4703 break;
4704 }
4705 if (prefix)
4706 {
4707 const char *name, *linkonce;
4708 char *string;
4709
4710 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4711 name = targetm.strip_name_encoding (name);
4712
4713 /* If we're using one_only, then there needs to be a .gnu.linkonce
4714 prefix to the section name. */
4715 linkonce = one_only ? ".gnu.linkonce" : "";
4716
4717 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4718
4719 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4720 return;
4721 }
4722 }
4723 default_unique_section (decl, reloc);
4724 }
4725
4726 #ifdef COMMON_ASM_OP
4727 /* This says how to output assembler code to declare an
4728 uninitialized external linkage data object.
4729
4730 For medium model x86-64 we need to use the .largecomm directive for
4731 large objects. */
4732 void
4733 x86_elf_aligned_common (FILE *file,
4734 const char *name, unsigned HOST_WIDE_INT size,
4735 int align)
4736 {
4737 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4738 && size > (unsigned int)ix86_section_threshold)
4739 fputs (".largecomm\t", file);
4740 else
4741 fputs (COMMON_ASM_OP, file);
4742 assemble_name (file, name);
4743 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4744 size, align / BITS_PER_UNIT);
4745 }
4746 #endif
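/* Example of the output produced above (illustrative, with a hypothetical
   symbol name): a 4 MB object under -mcmodel=medium would be emitted as
     .largecomm	big_buf,4194304,32
   while smaller objects keep using the plain .comm directive.  */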
4747
4748 /* Utility function for targets to use in implementing
4749 ASM_OUTPUT_ALIGNED_BSS. */
4750
4751 void
4752 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4753 const char *name, unsigned HOST_WIDE_INT size,
4754 int align)
4755 {
4756 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4757 && size > (unsigned int)ix86_section_threshold)
4758 switch_to_section (get_named_section (decl, ".lbss", 0));
4759 else
4760 switch_to_section (bss_section);
4761 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4762 #ifdef ASM_DECLARE_OBJECT_NAME
4763 last_assemble_variable_decl = decl;
4764 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4765 #else
4766 /* The standard thing is to just output a label for the object. */
4767 ASM_OUTPUT_LABEL (file, name);
4768 #endif /* ASM_DECLARE_OBJECT_NAME */
4769 ASM_OUTPUT_SKIP (file, size ? size : 1);
4770 }
4771 \f
4772 /* Decide whether we must probe the stack before any space allocation
4773 on this target. It's essentially TARGET_STACK_PROBE except when
4774 -fstack-check causes the stack to be already probed differently. */
4775
4776 bool
4777 ix86_target_stack_probe (void)
4778 {
4779 /* Do not probe the stack twice if static stack checking is enabled. */
4780 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4781 return false;
4782
4783 return TARGET_STACK_PROBE;
4784 }
4785 \f
4786 /* Decide whether we can make a sibling call to a function. DECL is the
4787 declaration of the function being targeted by the call and EXP is the
4788 CALL_EXPR representing the call. */
4789
4790 static bool
4791 ix86_function_ok_for_sibcall (tree decl, tree exp)
4792 {
4793 tree type, decl_or_type;
4794 rtx a, b;
4795
4796 /* If we are generating position-independent code, we cannot sibcall
4797 optimize any indirect call, or a direct call to a global function,
4798 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4799 if (!TARGET_MACHO
4800 && !TARGET_64BIT
4801 && flag_pic
4802 && (!decl || !targetm.binds_local_p (decl)))
4803 return false;
4804
4805 /* If we need to align the outgoing stack, then sibcalling would
4806 unalign the stack, which may break the called function. */
4807 if (ix86_minimum_incoming_stack_boundary (true)
4808 < PREFERRED_STACK_BOUNDARY)
4809 return false;
4810
4811 if (decl)
4812 {
4813 decl_or_type = decl;
4814 type = TREE_TYPE (decl);
4815 }
4816 else
4817 {
4818 /* We're looking at the CALL_EXPR, we need the type of the function. */
4819 type = CALL_EXPR_FN (exp); /* pointer expression */
4820 type = TREE_TYPE (type); /* pointer type */
4821 type = TREE_TYPE (type); /* function type */
4822 decl_or_type = type;
4823 }
4824
4825 /* Check that the return value locations are the same. Like
4826 if we are returning floats on the 80387 register stack, we cannot
4827 make a sibcall from a function that doesn't return a float to a
4828 function that does or, conversely, from a function that does return
4829 a float to a function that doesn't; the necessary stack adjustment
4830 would not be executed. This is also the place we notice
4831 differences in the return value ABI. Note that it is ok for one
4832 of the functions to have void return type as long as the return
4833 value of the other is passed in a register. */
4834 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4835 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4836 cfun->decl, false);
4837 if (STACK_REG_P (a) || STACK_REG_P (b))
4838 {
4839 if (!rtx_equal_p (a, b))
4840 return false;
4841 }
4842 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4843 {
4844 /* Disable sibcall if we need to generate vzeroupper after
4845 callee returns. */
4846 if (TARGET_VZEROUPPER
4847 && cfun->machine->callee_return_avx256_p
4848 && !cfun->machine->caller_return_avx256_p)
4849 return false;
4850 }
4851 else if (!rtx_equal_p (a, b))
4852 return false;
4853
4854 if (TARGET_64BIT)
4855 {
4856 /* The SYSV ABI has more call-clobbered registers;
4857 disallow sibcalls from MS to SYSV. */
4858 if (cfun->machine->call_abi == MS_ABI
4859 && ix86_function_type_abi (type) == SYSV_ABI)
4860 return false;
4861 }
4862 else
4863 {
4864 /* If this call is indirect, we'll need to be able to use a
4865 call-clobbered register for the address of the target function.
4866 Make sure that all such registers are not used for passing
4867 parameters. Note that DLLIMPORT functions are indirect. */
4868 if (!decl
4869 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4870 {
4871 if (ix86_function_regparm (type, NULL) >= 3)
4872 {
4873 /* ??? Need to count the actual number of registers to be used,
4874 not the possible number of registers. Fix later. */
4875 return false;
4876 }
4877 }
4878 }
4879
4880 /* Otherwise okay. That also includes certain types of indirect calls. */
4881 return true;
4882 }
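/* A hedged example of the return-value check above (hypothetical user code,
   compiled with -m32 where floats return on the 80387 stack):

       double g (void);
       int f (void) { return (int) g (); }

   f cannot sibcall g: g leaves its result in %st(0), and the fp-stack
   adjustment that f would normally perform after the call is skipped by a
   jump-style tail call.  */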
4883
4884 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4885 and "sseregparm" calling convention attributes;
4886 arguments as in struct attribute_spec.handler. */
4887
4888 static tree
4889 ix86_handle_cconv_attribute (tree *node, tree name,
4890 tree args,
4891 int flags ATTRIBUTE_UNUSED,
4892 bool *no_add_attrs)
4893 {
4894 if (TREE_CODE (*node) != FUNCTION_TYPE
4895 && TREE_CODE (*node) != METHOD_TYPE
4896 && TREE_CODE (*node) != FIELD_DECL
4897 && TREE_CODE (*node) != TYPE_DECL)
4898 {
4899 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4900 name);
4901 *no_add_attrs = true;
4902 return NULL_TREE;
4903 }
4904
4905 /* Can combine regparm with all attributes but fastcall, and thiscall. */
4906 if (is_attribute_p ("regparm", name))
4907 {
4908 tree cst;
4909
4910 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4911 {
4912 error ("fastcall and regparm attributes are not compatible");
4913 }
4914
4915 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4916 {
4917 error ("regparm and thiscall attributes are not compatible");
4918 }
4919
4920 cst = TREE_VALUE (args);
4921 if (TREE_CODE (cst) != INTEGER_CST)
4922 {
4923 warning (OPT_Wattributes,
4924 "%qE attribute requires an integer constant argument",
4925 name);
4926 *no_add_attrs = true;
4927 }
4928 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4929 {
4930 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4931 name, REGPARM_MAX);
4932 *no_add_attrs = true;
4933 }
4934
4935 return NULL_TREE;
4936 }
4937
4938 if (TARGET_64BIT)
4939 {
4940 /* Do not warn when emulating the MS ABI. */
4941 if ((TREE_CODE (*node) != FUNCTION_TYPE
4942 && TREE_CODE (*node) != METHOD_TYPE)
4943 || ix86_function_type_abi (*node) != MS_ABI)
4944 warning (OPT_Wattributes, "%qE attribute ignored",
4945 name);
4946 *no_add_attrs = true;
4947 return NULL_TREE;
4948 }
4949
4950 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4951 if (is_attribute_p ("fastcall", name))
4952 {
4953 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4954 {
4955 error ("fastcall and cdecl attributes are not compatible");
4956 }
4957 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4958 {
4959 error ("fastcall and stdcall attributes are not compatible");
4960 }
4961 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4962 {
4963 error ("fastcall and regparm attributes are not compatible");
4964 }
4965 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4966 {
4967 error ("fastcall and thiscall attributes are not compatible");
4968 }
4969 }
4970
4971 /* Can combine stdcall with fastcall (redundant), regparm and
4972 sseregparm. */
4973 else if (is_attribute_p ("stdcall", name))
4974 {
4975 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4976 {
4977 error ("stdcall and cdecl attributes are not compatible");
4978 }
4979 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4980 {
4981 error ("stdcall and fastcall attributes are not compatible");
4982 }
4983 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4984 {
4985 error ("stdcall and thiscall attributes are not compatible");
4986 }
4987 }
4988
4989 /* Can combine cdecl with regparm and sseregparm. */
4990 else if (is_attribute_p ("cdecl", name))
4991 {
4992 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4993 {
4994 error ("stdcall and cdecl attributes are not compatible");
4995 }
4996 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4997 {
4998 error ("fastcall and cdecl attributes are not compatible");
4999 }
5000 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5001 {
5002 error ("cdecl and thiscall attributes are not compatible");
5003 }
5004 }
5005 else if (is_attribute_p ("thiscall", name))
5006 {
5007 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5008 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5009 name);
5010 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5011 {
5012 error ("stdcall and thiscall attributes are not compatible");
5013 }
5014 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5015 {
5016 error ("fastcall and thiscall attributes are not compatible");
5017 }
5018 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5019 {
5020 error ("cdecl and thiscall attributes are not compatible");
5021 }
5022 }
5023
5024 /* Can combine sseregparm with all attributes. */
5025
5026 return NULL_TREE;
5027 }
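/* Illustrative only (hypothetical user declarations, not part of the
   compiler): the attributes handled above appear in user code as

       int __attribute__ ((regparm (3))) f3 (int, int, int);
       int __attribute__ ((fastcall))    ff (int, int);
       int __attribute__ ((stdcall))     fs (int);

   and the handler diagnoses contradictory combinations such as fastcall
   together with regparm or thiscall.  */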
5028
5029 /* The transactional memory builtins are implicitly regparm or fastcall
5030 depending on the ABI. Override the generic do-nothing attribute that
5031 these builtins were declared with, and replace it with one of the two
5032 attributes that we expect elsewhere. */
5033
5034 static tree
5035 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5036 tree args ATTRIBUTE_UNUSED,
5037 int flags ATTRIBUTE_UNUSED,
5038 bool *no_add_attrs)
5039 {
5040 tree alt;
5041
5042 /* In no case do we want to add the placeholder attribute. */
5043 *no_add_attrs = true;
5044
5045 /* The 64-bit ABI is unchanged for transactional memory. */
5046 if (TARGET_64BIT)
5047 return NULL_TREE;
5048
5049 /* ??? Is there a better way to validate 32-bit Windows? We have
5050 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5051 if (CHECK_STACK_LIMIT > 0)
5052 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5053 else
5054 {
5055 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5056 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5057 }
5058 decl_attributes (node, alt, flags);
5059
5060 return NULL_TREE;
5061 }
5062
5063 /* This function determines from TYPE the calling-convention. */
5064
5065 unsigned int
5066 ix86_get_callcvt (const_tree type)
5067 {
5068 unsigned int ret = 0;
5069 bool is_stdarg;
5070 tree attrs;
5071
5072 if (TARGET_64BIT)
5073 return IX86_CALLCVT_CDECL;
5074
5075 attrs = TYPE_ATTRIBUTES (type);
5076 if (attrs != NULL_TREE)
5077 {
5078 if (lookup_attribute ("cdecl", attrs))
5079 ret |= IX86_CALLCVT_CDECL;
5080 else if (lookup_attribute ("stdcall", attrs))
5081 ret |= IX86_CALLCVT_STDCALL;
5082 else if (lookup_attribute ("fastcall", attrs))
5083 ret |= IX86_CALLCVT_FASTCALL;
5084 else if (lookup_attribute ("thiscall", attrs))
5085 ret |= IX86_CALLCVT_THISCALL;
5086
5087 /* Regparm isn't allowed for thiscall and fastcall. */
5088 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5089 {
5090 if (lookup_attribute ("regparm", attrs))
5091 ret |= IX86_CALLCVT_REGPARM;
5092 if (lookup_attribute ("sseregparm", attrs))
5093 ret |= IX86_CALLCVT_SSEREGPARM;
5094 }
5095
5096 if (IX86_BASE_CALLCVT(ret) != 0)
5097 return ret;
5098 }
5099
5100 is_stdarg = stdarg_p (type);
5101 if (TARGET_RTD && !is_stdarg)
5102 return IX86_CALLCVT_STDCALL | ret;
5103
5104 if (ret != 0
5105 || is_stdarg
5106 || TREE_CODE (type) != METHOD_TYPE
5107 || ix86_function_type_abi (type) != MS_ABI)
5108 return IX86_CALLCVT_CDECL | ret;
5109
5110 return IX86_CALLCVT_THISCALL;
5111 }
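/* A sketch of the mapping computed above for 32-bit code without -mrtd:
   no attribute or "cdecl" yields IX86_CALLCVT_CDECL; "stdcall", "fastcall"
   and "thiscall" yield the corresponding IX86_CALLCVT_* flag; "regparm" and
   "sseregparm" are or-ed in as modifiers on top of IX86_CALLCVT_CDECL; and
   an unadorned METHOD_TYPE with the MS ABI defaults to
   IX86_CALLCVT_THISCALL.  */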
5112
5113 /* Return 0 if the attributes for two types are incompatible, 1 if they
5114 are compatible, and 2 if they are nearly compatible (which causes a
5115 warning to be generated). */
5116
5117 static int
5118 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5119 {
5120 unsigned int ccvt1, ccvt2;
5121
5122 if (TREE_CODE (type1) != FUNCTION_TYPE
5123 && TREE_CODE (type1) != METHOD_TYPE)
5124 return 1;
5125
5126 ccvt1 = ix86_get_callcvt (type1);
5127 ccvt2 = ix86_get_callcvt (type2);
5128 if (ccvt1 != ccvt2)
5129 return 0;
5130 if (ix86_function_regparm (type1, NULL)
5131 != ix86_function_regparm (type2, NULL))
5132 return 0;
5133
5134 return 1;
5135 }
5136 \f
5137 /* Return the regparm value for a function with the indicated TYPE and DECL.
5138 DECL may be NULL when calling function indirectly
5139 or considering a libcall. */
5140
5141 static int
5142 ix86_function_regparm (const_tree type, const_tree decl)
5143 {
5144 tree attr;
5145 int regparm;
5146 unsigned int ccvt;
5147
5148 if (TARGET_64BIT)
5149 return (ix86_function_type_abi (type) == SYSV_ABI
5150 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5151 ccvt = ix86_get_callcvt (type);
5152 regparm = ix86_regparm;
5153
5154 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5155 {
5156 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5157 if (attr)
5158 {
5159 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5160 return regparm;
5161 }
5162 }
5163 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5164 return 2;
5165 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5166 return 1;
5167
5168 /* Use register calling convention for local functions when possible. */
5169 if (decl
5170 && TREE_CODE (decl) == FUNCTION_DECL
5171 && optimize
5172 && !(profile_flag && !flag_fentry))
5173 {
5174 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5175 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5176 if (i && i->local && i->can_change_signature)
5177 {
5178 int local_regparm, globals = 0, regno;
5179
5180 /* Make sure no regparm register is taken by a
5181 fixed register variable. */
5182 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5183 if (fixed_regs[local_regparm])
5184 break;
5185
5186 /* We don't want to use regparm(3) for nested functions as
5187 these use a static chain pointer in the third argument. */
5188 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5189 local_regparm = 2;
5190
5191 /* In 32-bit mode save a register for the split stack. */
5192 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5193 local_regparm = 2;
5194
5195 /* Each fixed register usage increases register pressure,
5196 so fewer registers should be used for argument passing.
5197 This functionality can be overridden by an explicit
5198 regparm value. */
5199 for (regno = 0; regno <= DI_REG; regno++)
5200 if (fixed_regs[regno])
5201 globals++;
5202
5203 local_regparm
5204 = globals < local_regparm ? local_regparm - globals : 0;
5205
5206 if (local_regparm > regparm)
5207 regparm = local_regparm;
5208 }
5209 }
5210
5211 return regparm;
5212 }
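/* Illustrative only: with the 32-bit ABI,

       int __attribute__ ((regparm (3))) add (int a, int b, int c);

   passes a, b and c in %eax, %edx and %ecx respectively; the code above can
   pick a similar convention automatically for local functions whose
   signature it is free to change.  */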
5213
5214 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5215 DFmode (2) arguments in SSE registers for a function with the
5216 indicated TYPE and DECL. DECL may be NULL when calling function
5217 indirectly or considering a libcall. Otherwise return 0. */
5218
5219 static int
5220 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5221 {
5222 gcc_assert (!TARGET_64BIT);
5223
5224 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5225 by the sseregparm attribute. */
5226 if (TARGET_SSEREGPARM
5227 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5228 {
5229 if (!TARGET_SSE)
5230 {
5231 if (warn)
5232 {
5233 if (decl)
5234 error ("calling %qD with attribute sseregparm without "
5235 "SSE/SSE2 enabled", decl);
5236 else
5237 error ("calling %qT with attribute sseregparm without "
5238 "SSE/SSE2 enabled", type);
5239 }
5240 return 0;
5241 }
5242
5243 return 2;
5244 }
5245
5246 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5247 (and DFmode for SSE2) arguments in SSE registers. */
5248 if (decl && TARGET_SSE_MATH && optimize
5249 && !(profile_flag && !flag_fentry))
5250 {
5251 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5252 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5253 if (i && i->local && i->can_change_signature)
5254 return TARGET_SSE2 ? 2 : 1;
5255 }
5256
5257 return 0;
5258 }
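/* Illustrative only: in 32-bit code with SSE enabled,

       double __attribute__ ((sseregparm)) scale (double x, double y);

   receives x and y in %xmm0 and %xmm1 instead of on the stack; without SSE
   the attribute triggers the error issued above.  */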
5259
5260 /* Return true if EAX is live at the start of the function. Used by
5261 ix86_expand_prologue to determine if we need special help before
5262 calling allocate_stack_worker. */
5263
5264 static bool
5265 ix86_eax_live_at_start_p (void)
5266 {
5267 /* Cheat. Don't bother working forward from ix86_function_regparm
5268 to the function type to whether an actual argument is located in
5269 eax. Instead just look at cfg info, which is still close enough
5270 to correct at this point. This gives false positives for broken
5271 functions that might use uninitialized data that happens to be
5272 allocated in eax, but who cares? */
5273 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5274 }
5275
5276 static bool
5277 ix86_keep_aggregate_return_pointer (tree fntype)
5278 {
5279 tree attr;
5280
5281 if (!TARGET_64BIT)
5282 {
5283 attr = lookup_attribute ("callee_pop_aggregate_return",
5284 TYPE_ATTRIBUTES (fntype));
5285 if (attr)
5286 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5287
5288 /* For 32-bit MS-ABI the default is to keep aggregate
5289 return pointer. */
5290 if (ix86_function_type_abi (fntype) == MS_ABI)
5291 return true;
5292 }
5293 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5294 }
5295
5296 /* Value is the number of bytes of arguments automatically
5297 popped when returning from a subroutine call.
5298 FUNDECL is the declaration node of the function (as a tree),
5299 FUNTYPE is the data type of the function (as a tree),
5300 or for a library call it is an identifier node for the subroutine name.
5301 SIZE is the number of bytes of arguments passed on the stack.
5302
5303 On the 80386, the RTD insn may be used to pop them if the number
5304 of args is fixed, but if the number is variable then the caller
5305 must pop them all. RTD can't be used for library calls now
5306 because the library is compiled with the Unix compiler.
5307 Use of RTD is a selectable option, since it is incompatible with
5308 standard Unix calling sequences. If the option is not selected,
5309 the caller must always pop the args.
5310
5311 The attribute stdcall is equivalent to RTD on a per module basis. */
5312
5313 static int
5314 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5315 {
5316 unsigned int ccvt;
5317
5318 /* None of the 64-bit ABIs pop arguments. */
5319 if (TARGET_64BIT)
5320 return 0;
5321
5322 ccvt = ix86_get_callcvt (funtype);
5323
5324 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5325 | IX86_CALLCVT_THISCALL)) != 0
5326 && ! stdarg_p (funtype))
5327 return size;
5328
5329 /* Lose any fake structure return argument if it is passed on the stack. */
5330 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5331 && !ix86_keep_aggregate_return_pointer (funtype))
5332 {
5333 int nregs = ix86_function_regparm (funtype, fundecl);
5334 if (nregs == 0)
5335 return GET_MODE_SIZE (Pmode);
5336 }
5337
5338 return 0;
5339 }
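/* Illustrative only: for a 32-bit stdcall function such as

       int __attribute__ ((stdcall)) f (int a, int b, int c);

   SIZE is 12, so the callee returns with "ret $12" and the caller leaves
   %esp alone, whereas a cdecl callee uses a plain "ret" and the caller pops
   the 12 bytes itself.  */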
5340 \f
5341 /* Argument support functions. */
5342
5343 /* Return true when register may be used to pass function parameters. */
5344 bool
5345 ix86_function_arg_regno_p (int regno)
5346 {
5347 int i;
5348 const int *parm_regs;
5349
5350 if (!TARGET_64BIT)
5351 {
5352 if (TARGET_MACHO)
5353 return (regno < REGPARM_MAX
5354 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5355 else
5356 return (regno < REGPARM_MAX
5357 || (TARGET_MMX && MMX_REGNO_P (regno)
5358 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5359 || (TARGET_SSE && SSE_REGNO_P (regno)
5360 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5361 }
5362
5363 if (TARGET_MACHO)
5364 {
5365 if (SSE_REGNO_P (regno) && TARGET_SSE)
5366 return true;
5367 }
5368 else
5369 {
5370 if (TARGET_SSE && SSE_REGNO_P (regno)
5371 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5372 return true;
5373 }
5374
5375 /* TODO: The function should depend on current function ABI but
5376 builtins.c would need updating then. Therefore we use the
5377 default ABI. */
5378
5379 /* RAX is used as hidden argument to va_arg functions. */
5380 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5381 return true;
5382
5383 if (ix86_abi == MS_ABI)
5384 parm_regs = x86_64_ms_abi_int_parameter_registers;
5385 else
5386 parm_regs = x86_64_int_parameter_registers;
5387 for (i = 0; i < (ix86_abi == MS_ABI
5388 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5389 if (regno == parm_regs[i])
5390 return true;
5391 return false;
5392 }
5393
5394 /* Return if we do not know how to pass TYPE solely in registers. */
5395
5396 static bool
5397 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5398 {
5399 if (must_pass_in_stack_var_size_or_pad (mode, type))
5400 return true;
5401
5402 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5403 The layout_type routine is crafty and tries to trick us into passing
5404 currently unsupported vector types on the stack by using TImode. */
5405 return (!TARGET_64BIT && mode == TImode
5406 && type && TREE_CODE (type) != VECTOR_TYPE);
5407 }
5408
5409 /* Return the size, in bytes, of the area reserved for arguments passed
5410 in registers for the function represented by FNDECL, depending on the
5411 ABI in use. */
5412 int
5413 ix86_reg_parm_stack_space (const_tree fndecl)
5414 {
5415 enum calling_abi call_abi = SYSV_ABI;
5416 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5417 call_abi = ix86_function_abi (fndecl);
5418 else
5419 call_abi = ix86_function_type_abi (fndecl);
5420 if (TARGET_64BIT && call_abi == MS_ABI)
5421 return 32;
5422 return 0;
5423 }
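/* Illustrative only: under the 64-bit MS ABI the caller always reserves a
   32-byte "shadow" area above the return address, even for a call like
   f (1, 2) whose arguments travel in %rcx and %rdx, so 32 is returned; the
   SysV ABI reserves no such area and 0 is returned.  */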
5424
5425 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5426 call abi used. */
5427 enum calling_abi
5428 ix86_function_type_abi (const_tree fntype)
5429 {
5430 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5431 {
5432 enum calling_abi abi = ix86_abi;
5433 if (abi == SYSV_ABI)
5434 {
5435 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5436 abi = MS_ABI;
5437 }
5438 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5439 abi = SYSV_ABI;
5440 return abi;
5441 }
5442 return ix86_abi;
5443 }
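/* Illustrative only (hypothetical declarations): on a SysV default target
   the per-function ABI can be overridden with

       void __attribute__ ((ms_abi))   win_callback (void *);
       void __attribute__ ((sysv_abi)) nix_callback (void *);

   which is exactly what the attribute lookups above detect.  */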
5444
5445 static bool
5446 ix86_function_ms_hook_prologue (const_tree fn)
5447 {
5448 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5449 {
5450 if (decl_function_context (fn) != NULL_TREE)
5451 error_at (DECL_SOURCE_LOCATION (fn),
5452 "ms_hook_prologue is not compatible with nested function");
5453 else
5454 return true;
5455 }
5456 return false;
5457 }
5458
5459 static enum calling_abi
5460 ix86_function_abi (const_tree fndecl)
5461 {
5462 if (! fndecl)
5463 return ix86_abi;
5464 return ix86_function_type_abi (TREE_TYPE (fndecl));
5465 }
5466
5467 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5468 call abi used. */
5469 enum calling_abi
5470 ix86_cfun_abi (void)
5471 {
5472 if (! cfun)
5473 return ix86_abi;
5474 return cfun->machine->call_abi;
5475 }
5476
5477 /* Write the extra assembler code needed to declare a function properly. */
5478
5479 void
5480 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5481 tree decl)
5482 {
5483 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5484
5485 if (is_ms_hook)
5486 {
5487 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5488 unsigned int filler_cc = 0xcccccccc;
5489
5490 for (i = 0; i < filler_count; i += 4)
5491 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5492 }
5493
5494 #ifdef SUBTARGET_ASM_UNWIND_INIT
5495 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5496 #endif
5497
5498 ASM_OUTPUT_LABEL (asm_out_file, fname);
5499
5500 /* Output magic byte marker, if hot-patch attribute is set. */
5501 if (is_ms_hook)
5502 {
5503 if (TARGET_64BIT)
5504 {
5505 /* leaq [%rsp + 0], %rsp */
5506 asm_fprintf (asm_out_file, ASM_BYTE
5507 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5508 }
5509 else
5510 {
5511 /* movl.s %edi, %edi
5512 push %ebp
5513 movl.s %esp, %ebp */
5514 asm_fprintf (asm_out_file, ASM_BYTE
5515 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5516 }
5517 }
5518 }
5519
5520 /* regclass.c */
5521 extern void init_regs (void);
5522
5523 /* Implementation of the call ABI switching target hook. The call
5524 register sets specific to FNDECL are selected. See also
5525 ix86_conditional_register_usage for more details. */
5526 void
5527 ix86_call_abi_override (const_tree fndecl)
5528 {
5529 if (fndecl == NULL_TREE)
5530 cfun->machine->call_abi = ix86_abi;
5531 else
5532 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5533 }
5534
5535 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5536 Avoid the expensive re-initialization of init_regs each time we switch
5537 function context, since it is needed only during RTL expansion. */
5538 static void
5539 ix86_maybe_switch_abi (void)
5540 {
5541 if (TARGET_64BIT &&
5542 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5543 reinit_regs ();
5544 }
5545
5546 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5547 for a call to a function whose data type is FNTYPE.
5548 For a library call, FNTYPE is 0. */
5549
5550 void
5551 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5552 tree fntype, /* tree ptr for function decl */
5553 rtx libname, /* SYMBOL_REF of library name or 0 */
5554 tree fndecl,
5555 int caller)
5556 {
5557 struct cgraph_local_info *i;
5558 tree fnret_type;
5559
5560 memset (cum, 0, sizeof (*cum));
5561
5562 /* Initialize for the current callee. */
5563 if (caller)
5564 {
5565 cfun->machine->callee_pass_avx256_p = false;
5566 cfun->machine->callee_return_avx256_p = false;
5567 }
5568
5569 if (fndecl)
5570 {
5571 i = cgraph_local_info (fndecl);
5572 cum->call_abi = ix86_function_abi (fndecl);
5573 fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
5574 }
5575 else
5576 {
5577 i = NULL;
5578 cum->call_abi = ix86_function_type_abi (fntype);
5579 if (fntype)
5580 fnret_type = TREE_TYPE (fntype);
5581 else
5582 fnret_type = NULL;
5583 }
5584
5585 if (TARGET_VZEROUPPER && fnret_type)
5586 {
5587 rtx fnret_value = ix86_function_value (fnret_type, fntype,
5588 false);
5589 if (function_pass_avx256_p (fnret_value))
5590 {
5591 /* The return value of this function uses 256bit AVX modes. */
5592 if (caller)
5593 cfun->machine->callee_return_avx256_p = true;
5594 else
5595 cfun->machine->caller_return_avx256_p = true;
5596 }
5597 }
5598
5599 cum->caller = caller;
5600
5601 /* Set up the number of registers to use for passing arguments. */
5602
5603 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5604 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5605 "or subtarget optimization implying it");
5606 cum->nregs = ix86_regparm;
5607 if (TARGET_64BIT)
5608 {
5609 cum->nregs = (cum->call_abi == SYSV_ABI
5610 ? X86_64_REGPARM_MAX
5611 : X86_64_MS_REGPARM_MAX);
5612 }
5613 if (TARGET_SSE)
5614 {
5615 cum->sse_nregs = SSE_REGPARM_MAX;
5616 if (TARGET_64BIT)
5617 {
5618 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5619 ? X86_64_SSE_REGPARM_MAX
5620 : X86_64_MS_SSE_REGPARM_MAX);
5621 }
5622 }
5623 if (TARGET_MMX)
5624 cum->mmx_nregs = MMX_REGPARM_MAX;
5625 cum->warn_avx = true;
5626 cum->warn_sse = true;
5627 cum->warn_mmx = true;
5628
5629 /* Because the types might mismatch between caller and callee, we need to
5630 use the actual type of the function for local calls.
5631 FIXME: cgraph_analyze can be told to actually record whether a function
5632 uses va_start, so for local functions maybe_vaarg can be made more
5633 aggressive, helping K&R code.
5634 FIXME: once the type system is fixed, we won't need this code anymore. */
5635 if (i && i->local && i->can_change_signature)
5636 fntype = TREE_TYPE (fndecl);
5637 cum->maybe_vaarg = (fntype
5638 ? (!prototype_p (fntype) || stdarg_p (fntype))
5639 : !libname);
5640
5641 if (!TARGET_64BIT)
5642 {
5643 /* If there are variable arguments, then we won't pass anything
5644 in registers in 32-bit mode. */
5645 if (stdarg_p (fntype))
5646 {
5647 cum->nregs = 0;
5648 cum->sse_nregs = 0;
5649 cum->mmx_nregs = 0;
5650 cum->warn_avx = 0;
5651 cum->warn_sse = 0;
5652 cum->warn_mmx = 0;
5653 return;
5654 }
5655
5656 /* Use ecx and edx registers if function has fastcall attribute,
5657 else look for regparm information. */
5658 if (fntype)
5659 {
5660 unsigned int ccvt = ix86_get_callcvt (fntype);
5661 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5662 {
5663 cum->nregs = 1;
5664 cum->fastcall = 1; /* Same first register as in fastcall. */
5665 }
5666 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5667 {
5668 cum->nregs = 2;
5669 cum->fastcall = 1;
5670 }
5671 else
5672 cum->nregs = ix86_function_regparm (fntype, fndecl);
5673 }
5674
5675 /* Set up the number of SSE registers used for passing SFmode
5676 and DFmode arguments. Warn for mismatching ABI. */
5677 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5678 }
5679 }
5680
5681 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5682 But in the case of vector types, it is some vector mode.
5683
5684 When we have only some of our vector isa extensions enabled, then there
5685 are some modes for which vector_mode_supported_p is false. For these
5686 modes, the generic vector support in gcc will choose some non-vector mode
5687 in order to implement the type. By computing the natural mode, we'll
5688 select the proper ABI location for the operand and not depend on whatever
5689 the middle-end decides to do with these vector types.
5690
5691 The middle-end can't deal with vector types larger than 16 bytes. In
5692 that case, we return the original mode and warn about the ABI change if
5693 CUM isn't NULL. */
5694
5695 static enum machine_mode
5696 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5697 {
5698 enum machine_mode mode = TYPE_MODE (type);
5699
5700 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5701 {
5702 HOST_WIDE_INT size = int_size_in_bytes (type);
5703 if ((size == 8 || size == 16 || size == 32)
5704 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5705 && TYPE_VECTOR_SUBPARTS (type) > 1)
5706 {
5707 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5708
5709 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5710 mode = MIN_MODE_VECTOR_FLOAT;
5711 else
5712 mode = MIN_MODE_VECTOR_INT;
5713
5714 /* Get the mode which has this inner mode and number of units. */
5715 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5716 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5717 && GET_MODE_INNER (mode) == innermode)
5718 {
5719 if (size == 32 && !TARGET_AVX)
5720 {
5721 static bool warnedavx;
5722
5723 if (cum
5724 && !warnedavx
5725 && cum->warn_avx)
5726 {
5727 warnedavx = true;
5728 warning (0, "AVX vector argument without AVX "
5729 "enabled changes the ABI");
5730 }
5731 return TYPE_MODE (type);
5732 }
5733 else
5734 return mode;
5735 }
5736
5737 gcc_unreachable ();
5738 }
5739 }
5740
5741 return mode;
5742 }
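/* Illustrative only: for the hypothetical type

       typedef int v4si __attribute__ ((vector_size (16)));

   TYPE_MODE may fall back to a non-vector mode when SSE is disabled, but
   the function above still reports the natural mode V4SImode, so the
   argument is classified (and the ABI-change warning issued) consistently
   regardless of which ISA extensions are enabled.  */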
5743
5744 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5745 this may not agree with the mode that the type system has chosen for the
5746 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5747 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5748
5749 static rtx
5750 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5751 unsigned int regno)
5752 {
5753 rtx tmp;
5754
5755 if (orig_mode != BLKmode)
5756 tmp = gen_rtx_REG (orig_mode, regno);
5757 else
5758 {
5759 tmp = gen_rtx_REG (mode, regno);
5760 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5761 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5762 }
5763
5764 return tmp;
5765 }
5766
5767 /* x86-64 register passing implementation. See the x86-64 ABI for details.
5768 The goal of this code is to classify each eightbyte of an incoming argument
5769 by register class and assign registers accordingly. */
5770
5771 /* Return the union class of CLASS1 and CLASS2.
5772 See the x86-64 PS ABI for details. */
5773
5774 static enum x86_64_reg_class
5775 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5776 {
5777 /* Rule #1: If both classes are equal, this is the resulting class. */
5778 if (class1 == class2)
5779 return class1;
5780
5781 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5782 the other class. */
5783 if (class1 == X86_64_NO_CLASS)
5784 return class2;
5785 if (class2 == X86_64_NO_CLASS)
5786 return class1;
5787
5788 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5789 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5790 return X86_64_MEMORY_CLASS;
5791
5792 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5793 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5794 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5795 return X86_64_INTEGERSI_CLASS;
5796 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5797 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5798 return X86_64_INTEGER_CLASS;
5799
5800 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5801 MEMORY is used. */
5802 if (class1 == X86_64_X87_CLASS
5803 || class1 == X86_64_X87UP_CLASS
5804 || class1 == X86_64_COMPLEX_X87_CLASS
5805 || class2 == X86_64_X87_CLASS
5806 || class2 == X86_64_X87UP_CLASS
5807 || class2 == X86_64_COMPLEX_X87_CLASS)
5808 return X86_64_MEMORY_CLASS;
5809
5810 /* Rule #6: Otherwise class SSE is used. */
5811 return X86_64_SSE_CLASS;
5812 }
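/* A worked example of the rules above: merging X86_64_INTEGERSI_CLASS with
   X86_64_SSESF_CLASS yields X86_64_INTEGERSI_CLASS (rule #4), so a union of
   int and float occupies one eightbyte passed in an integer register.  */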
5813
5814 /* Classify the argument of type TYPE and mode MODE.
5815 CLASSES will be filled by the register class used to pass each word
5816 of the operand. The number of words is returned. In case the parameter
5817 should be passed in memory, 0 is returned. As a special case for zero
5818 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5819
5820 BIT_OFFSET is used internally for handling records and specifies the
5821 offset in bits modulo 256 to avoid overflow cases.
5822
5823 See the x86-64 PS ABI for details.
5824 */
5825
5826 static int
5827 classify_argument (enum machine_mode mode, const_tree type,
5828 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5829 {
5830 HOST_WIDE_INT bytes =
5831 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5832 int words
5833 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5834
5835 /* Variable sized entities are always passed/returned in memory. */
5836 if (bytes < 0)
5837 return 0;
5838
5839 if (mode != VOIDmode
5840 && targetm.calls.must_pass_in_stack (mode, type))
5841 return 0;
5842
5843 if (type && AGGREGATE_TYPE_P (type))
5844 {
5845 int i;
5846 tree field;
5847 enum x86_64_reg_class subclasses[MAX_CLASSES];
5848
5849 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5850 if (bytes > 32)
5851 return 0;
5852
5853 for (i = 0; i < words; i++)
5854 classes[i] = X86_64_NO_CLASS;
5855
5856 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
5857 signal the memory class, so handle this as a special case. */
5858 if (!words)
5859 {
5860 classes[0] = X86_64_NO_CLASS;
5861 return 1;
5862 }
5863
5864 /* Classify each field of record and merge classes. */
5865 switch (TREE_CODE (type))
5866 {
5867 case RECORD_TYPE:
5868 /* And now merge the fields of structure. */
5869 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5870 {
5871 if (TREE_CODE (field) == FIELD_DECL)
5872 {
5873 int num;
5874
5875 if (TREE_TYPE (field) == error_mark_node)
5876 continue;
5877
5878 /* Bitfields are always classified as integer. Handle them
5879 early, since later code would consider them to be
5880 misaligned integers. */
5881 if (DECL_BIT_FIELD (field))
5882 {
5883 for (i = (int_bit_position (field)
5884 + (bit_offset % 64)) / 8 / 8;
5885 i < ((int_bit_position (field) + (bit_offset % 64))
5886 + tree_low_cst (DECL_SIZE (field), 0)
5887 + 63) / 8 / 8; i++)
5888 classes[i] =
5889 merge_classes (X86_64_INTEGER_CLASS,
5890 classes[i]);
5891 }
5892 else
5893 {
5894 int pos;
5895
5896 type = TREE_TYPE (field);
5897
5898 /* Flexible array member is ignored. */
5899 if (TYPE_MODE (type) == BLKmode
5900 && TREE_CODE (type) == ARRAY_TYPE
5901 && TYPE_SIZE (type) == NULL_TREE
5902 && TYPE_DOMAIN (type) != NULL_TREE
5903 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5904 == NULL_TREE))
5905 {
5906 static bool warned;
5907
5908 if (!warned && warn_psabi)
5909 {
5910 warned = true;
5911 inform (input_location,
5912 "the ABI of passing struct with"
5913 " a flexible array member has"
5914 " changed in GCC 4.4");
5915 }
5916 continue;
5917 }
5918 num = classify_argument (TYPE_MODE (type), type,
5919 subclasses,
5920 (int_bit_position (field)
5921 + bit_offset) % 256);
5922 if (!num)
5923 return 0;
5924 pos = (int_bit_position (field)
5925 + (bit_offset % 64)) / 8 / 8;
5926 for (i = 0; i < num && (i + pos) < words; i++)
5927 classes[i + pos] =
5928 merge_classes (subclasses[i], classes[i + pos]);
5929 }
5930 }
5931 }
5932 break;
5933
5934 case ARRAY_TYPE:
5935 /* Arrays are handled as small records. */
5936 {
5937 int num;
5938 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5939 TREE_TYPE (type), subclasses, bit_offset);
5940 if (!num)
5941 return 0;
5942
5943 /* The partial classes are now full classes. */
5944 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5945 subclasses[0] = X86_64_SSE_CLASS;
5946 if (subclasses[0] == X86_64_INTEGERSI_CLASS
5947 && !((bit_offset % 64) == 0 && bytes == 4))
5948 subclasses[0] = X86_64_INTEGER_CLASS;
5949
5950 for (i = 0; i < words; i++)
5951 classes[i] = subclasses[i % num];
5952
5953 break;
5954 }
5955 case UNION_TYPE:
5956 case QUAL_UNION_TYPE:
5957 /* Unions are similar to RECORD_TYPE but the offset is
5958 always 0. */
5959 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5960 {
5961 if (TREE_CODE (field) == FIELD_DECL)
5962 {
5963 int num;
5964
5965 if (TREE_TYPE (field) == error_mark_node)
5966 continue;
5967
5968 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
5969 TREE_TYPE (field), subclasses,
5970 bit_offset);
5971 if (!num)
5972 return 0;
5973 for (i = 0; i < num; i++)
5974 classes[i] = merge_classes (subclasses[i], classes[i]);
5975 }
5976 }
5977 break;
5978
5979 default:
5980 gcc_unreachable ();
5981 }
5982
5983 if (words > 2)
5984 {
5985 /* When size > 16 bytes, if the first one isn't
5986 X86_64_SSE_CLASS or any of the others isn't
5987 X86_64_SSEUP_CLASS, everything should be passed in
5988 memory. */
5989 if (classes[0] != X86_64_SSE_CLASS)
5990 return 0;
5991
5992 for (i = 1; i < words; i++)
5993 if (classes[i] != X86_64_SSEUP_CLASS)
5994 return 0;
5995 }
5996
5997 /* Final merger cleanup. */
5998 for (i = 0; i < words; i++)
5999 {
6000 /* If one class is MEMORY, everything should be passed in
6001 memory. */
6002 if (classes[i] == X86_64_MEMORY_CLASS)
6003 return 0;
6004
6005 /* The X86_64_SSEUP_CLASS should be always preceded by
6006 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6007 if (classes[i] == X86_64_SSEUP_CLASS
6008 && classes[i - 1] != X86_64_SSE_CLASS
6009 && classes[i - 1] != X86_64_SSEUP_CLASS)
6010 {
6011 /* The first one should never be X86_64_SSEUP_CLASS. */
6012 gcc_assert (i != 0);
6013 classes[i] = X86_64_SSE_CLASS;
6014 }
6015
6016 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6017 everything should be passed in memory. */
6018 if (classes[i] == X86_64_X87UP_CLASS
6019 && (classes[i - 1] != X86_64_X87_CLASS))
6020 {
6021 static bool warned;
6022
6023 /* The first one should never be X86_64_X87UP_CLASS. */
6024 gcc_assert (i != 0);
6025 if (!warned && warn_psabi)
6026 {
6027 warned = true;
6028 inform (input_location,
6029 "the ABI of passing union with long double"
6030 " has changed in GCC 4.4");
6031 }
6032 return 0;
6033 }
6034 }
6035 return words;
6036 }
6037
6038 /* Compute the alignment needed. We align all types to natural boundaries
6039 with the exception of XFmode, which is aligned to 64 bits. */
6040 if (mode != VOIDmode && mode != BLKmode)
6041 {
6042 int mode_alignment = GET_MODE_BITSIZE (mode);
6043
6044 if (mode == XFmode)
6045 mode_alignment = 128;
6046 else if (mode == XCmode)
6047 mode_alignment = 256;
6048 if (COMPLEX_MODE_P (mode))
6049 mode_alignment /= 2;
6050 /* Misaligned fields are always returned in memory. */
6051 if (bit_offset % mode_alignment)
6052 return 0;
6053 }
6054
6055 /* For V1xx modes, just use the base mode. */
6056 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6057 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6058 mode = GET_MODE_INNER (mode);
6059
6060 /* Classification of atomic types. */
6061 switch (mode)
6062 {
6063 case SDmode:
6064 case DDmode:
6065 classes[0] = X86_64_SSE_CLASS;
6066 return 1;
6067 case TDmode:
6068 classes[0] = X86_64_SSE_CLASS;
6069 classes[1] = X86_64_SSEUP_CLASS;
6070 return 2;
6071 case DImode:
6072 case SImode:
6073 case HImode:
6074 case QImode:
6075 case CSImode:
6076 case CHImode:
6077 case CQImode:
6078 {
6079 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6080
6081 if (size <= 32)
6082 {
6083 classes[0] = X86_64_INTEGERSI_CLASS;
6084 return 1;
6085 }
6086 else if (size <= 64)
6087 {
6088 classes[0] = X86_64_INTEGER_CLASS;
6089 return 1;
6090 }
6091 else if (size <= 64+32)
6092 {
6093 classes[0] = X86_64_INTEGER_CLASS;
6094 classes[1] = X86_64_INTEGERSI_CLASS;
6095 return 2;
6096 }
6097 else if (size <= 64+64)
6098 {
6099 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6100 return 2;
6101 }
6102 else
6103 gcc_unreachable ();
6104 }
6105 case CDImode:
6106 case TImode:
6107 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6108 return 2;
6109 case COImode:
6110 case OImode:
6111 /* OImode shouldn't be used directly. */
6112 gcc_unreachable ();
6113 case CTImode:
6114 return 0;
6115 case SFmode:
6116 if (!(bit_offset % 64))
6117 classes[0] = X86_64_SSESF_CLASS;
6118 else
6119 classes[0] = X86_64_SSE_CLASS;
6120 return 1;
6121 case DFmode:
6122 classes[0] = X86_64_SSEDF_CLASS;
6123 return 1;
6124 case XFmode:
6125 classes[0] = X86_64_X87_CLASS;
6126 classes[1] = X86_64_X87UP_CLASS;
6127 return 2;
6128 case TFmode:
6129 classes[0] = X86_64_SSE_CLASS;
6130 classes[1] = X86_64_SSEUP_CLASS;
6131 return 2;
6132 case SCmode:
6133 classes[0] = X86_64_SSE_CLASS;
6134 if (!(bit_offset % 64))
6135 return 1;
6136 else
6137 {
6138 static bool warned;
6139
6140 if (!warned && warn_psabi)
6141 {
6142 warned = true;
6143 inform (input_location,
6144 "the ABI of passing structure with complex float"
6145 " member has changed in GCC 4.4");
6146 }
6147 classes[1] = X86_64_SSESF_CLASS;
6148 return 2;
6149 }
6150 case DCmode:
6151 classes[0] = X86_64_SSEDF_CLASS;
6152 classes[1] = X86_64_SSEDF_CLASS;
6153 return 2;
6154 case XCmode:
6155 classes[0] = X86_64_COMPLEX_X87_CLASS;
6156 return 1;
6157 case TCmode:
6158 /* This mode is larger than 16 bytes. */
6159 return 0;
6160 case V8SFmode:
6161 case V8SImode:
6162 case V32QImode:
6163 case V16HImode:
6164 case V4DFmode:
6165 case V4DImode:
6166 classes[0] = X86_64_SSE_CLASS;
6167 classes[1] = X86_64_SSEUP_CLASS;
6168 classes[2] = X86_64_SSEUP_CLASS;
6169 classes[3] = X86_64_SSEUP_CLASS;
6170 return 4;
6171 case V4SFmode:
6172 case V4SImode:
6173 case V16QImode:
6174 case V8HImode:
6175 case V2DFmode:
6176 case V2DImode:
6177 classes[0] = X86_64_SSE_CLASS;
6178 classes[1] = X86_64_SSEUP_CLASS;
6179 return 2;
6180 case V1TImode:
6181 case V1DImode:
6182 case V2SFmode:
6183 case V2SImode:
6184 case V4HImode:
6185 case V8QImode:
6186 classes[0] = X86_64_SSE_CLASS;
6187 return 1;
6188 case BLKmode:
6189 case VOIDmode:
6190 return 0;
6191 default:
6192 gcc_assert (VECTOR_MODE_P (mode));
6193
6194 if (bytes > 16)
6195 return 0;
6196
6197 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6198
6199 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6200 classes[0] = X86_64_INTEGERSI_CLASS;
6201 else
6202 classes[0] = X86_64_INTEGER_CLASS;
6203 classes[1] = X86_64_INTEGER_CLASS;
6204 return 1 + (bytes > 8);
6205 }
6206 }
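/* A worked example of the classification above for the SysV x86-64 ABI:

       struct s { double d; int i; };

   spans two eightbytes; the first is classified X86_64_SSEDF_CLASS (the
   double) and the second X86_64_INTEGERSI_CLASS (the int), so d travels in
   an SSE register and i in an integer register.  A struct larger than 16
   bytes that is not entirely SSE/SSEUP is instead forced into memory, as
   checked above.  */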
6207
6208 /* Examine the argument and set the number of registers required in each
6209 class. Return 0 iff the parameter should be passed in memory. */
6210 static int
6211 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6212 int *int_nregs, int *sse_nregs)
6213 {
6214 enum x86_64_reg_class regclass[MAX_CLASSES];
6215 int n = classify_argument (mode, type, regclass, 0);
6216
6217 *int_nregs = 0;
6218 *sse_nregs = 0;
6219 if (!n)
6220 return 0;
6221 for (n--; n >= 0; n--)
6222 switch (regclass[n])
6223 {
6224 case X86_64_INTEGER_CLASS:
6225 case X86_64_INTEGERSI_CLASS:
6226 (*int_nregs)++;
6227 break;
6228 case X86_64_SSE_CLASS:
6229 case X86_64_SSESF_CLASS:
6230 case X86_64_SSEDF_CLASS:
6231 (*sse_nregs)++;
6232 break;
6233 case X86_64_NO_CLASS:
6234 case X86_64_SSEUP_CLASS:
6235 break;
6236 case X86_64_X87_CLASS:
6237 case X86_64_X87UP_CLASS:
6238 if (!in_return)
6239 return 0;
6240 break;
6241 case X86_64_COMPLEX_X87_CLASS:
6242 return in_return ? 2 : 0;
6243 case X86_64_MEMORY_CLASS:
6244 gcc_unreachable ();
6245 }
6246 return 1;
6247 }
6248
6249 /* Construct container for the argument used by GCC interface. See
6250 FUNCTION_ARG for the detailed description. */
6251
6252 static rtx
6253 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6254 const_tree type, int in_return, int nintregs, int nsseregs,
6255 const int *intreg, int sse_regno)
6256 {
6257 /* The following variables hold the static issued_error state. */
6258 static bool issued_sse_arg_error;
6259 static bool issued_sse_ret_error;
6260 static bool issued_x87_ret_error;
6261
6262 enum machine_mode tmpmode;
6263 int bytes =
6264 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6265 enum x86_64_reg_class regclass[MAX_CLASSES];
6266 int n;
6267 int i;
6268 int nexps = 0;
6269 int needed_sseregs, needed_intregs;
6270 rtx exp[MAX_CLASSES];
6271 rtx ret;
6272
6273 n = classify_argument (mode, type, regclass, 0);
6274 if (!n)
6275 return NULL;
6276 if (!examine_argument (mode, type, in_return, &needed_intregs,
6277 &needed_sseregs))
6278 return NULL;
6279 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6280 return NULL;
6281
6282 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6283 some less clueful developer tries to use floating-point anyway. */
6284 if (needed_sseregs && !TARGET_SSE)
6285 {
6286 if (in_return)
6287 {
6288 if (!issued_sse_ret_error)
6289 {
6290 error ("SSE register return with SSE disabled");
6291 issued_sse_ret_error = true;
6292 }
6293 }
6294 else if (!issued_sse_arg_error)
6295 {
6296 error ("SSE register argument with SSE disabled");
6297 issued_sse_arg_error = true;
6298 }
6299 return NULL;
6300 }
6301
6302 /* Likewise, error if the ABI requires us to return values in the
6303 x87 registers and the user specified -mno-80387. */
6304 if (!TARGET_80387 && in_return)
6305 for (i = 0; i < n; i++)
6306 if (regclass[i] == X86_64_X87_CLASS
6307 || regclass[i] == X86_64_X87UP_CLASS
6308 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6309 {
6310 if (!issued_x87_ret_error)
6311 {
6312 error ("x87 register return with x87 disabled");
6313 issued_x87_ret_error = true;
6314 }
6315 return NULL;
6316 }
6317
6318 /* First construct the simple cases. Avoid SCmode, since we want to use
6319 a single register to pass this type. */
6320 if (n == 1 && mode != SCmode)
6321 switch (regclass[0])
6322 {
6323 case X86_64_INTEGER_CLASS:
6324 case X86_64_INTEGERSI_CLASS:
6325 return gen_rtx_REG (mode, intreg[0]);
6326 case X86_64_SSE_CLASS:
6327 case X86_64_SSESF_CLASS:
6328 case X86_64_SSEDF_CLASS:
6329 if (mode != BLKmode)
6330 return gen_reg_or_parallel (mode, orig_mode,
6331 SSE_REGNO (sse_regno));
6332 break;
6333 case X86_64_X87_CLASS:
6334 case X86_64_COMPLEX_X87_CLASS:
6335 return gen_rtx_REG (mode, FIRST_STACK_REG);
6336 case X86_64_NO_CLASS:
6337 /* Zero sized array, struct or class. */
6338 return NULL;
6339 default:
6340 gcc_unreachable ();
6341 }
6342 if (n == 2
6343 && regclass[0] == X86_64_SSE_CLASS
6344 && regclass[1] == X86_64_SSEUP_CLASS
6345 && mode != BLKmode)
6346 return gen_reg_or_parallel (mode, orig_mode,
6347 SSE_REGNO (sse_regno));
6348 if (n == 4
6349 && regclass[0] == X86_64_SSE_CLASS
6350 && regclass[1] == X86_64_SSEUP_CLASS
6351 && regclass[2] == X86_64_SSEUP_CLASS
6352 && regclass[3] == X86_64_SSEUP_CLASS
6353 && mode != BLKmode)
6354 return gen_reg_or_parallel (mode, orig_mode,
6355 SSE_REGNO (sse_regno));
6356 if (n == 2
6357 && regclass[0] == X86_64_X87_CLASS
6358 && regclass[1] == X86_64_X87UP_CLASS)
6359 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6360
6361 if (n == 2
6362 && regclass[0] == X86_64_INTEGER_CLASS
6363 && regclass[1] == X86_64_INTEGER_CLASS
6364 && (mode == CDImode || mode == TImode || mode == TFmode)
6365 && intreg[0] + 1 == intreg[1])
6366 return gen_rtx_REG (mode, intreg[0]);
6367
6368 /* Otherwise figure out the entries of the PARALLEL. */
6369 for (i = 0; i < n; i++)
6370 {
6371 int pos;
6372
6373 switch (regclass[i])
6374 {
6375 case X86_64_NO_CLASS:
6376 break;
6377 case X86_64_INTEGER_CLASS:
6378 case X86_64_INTEGERSI_CLASS:
6379 /* Merge TImodes on aligned occasions here too. */
6380 if (i * 8 + 8 > bytes)
6381 tmpmode
6382 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6383 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6384 tmpmode = SImode;
6385 else
6386 tmpmode = DImode;
6387 /* We've requested 24 bytes for which we
6388 don't have a mode. Use DImode. */
6389 if (tmpmode == BLKmode)
6390 tmpmode = DImode;
6391 exp [nexps++]
6392 = gen_rtx_EXPR_LIST (VOIDmode,
6393 gen_rtx_REG (tmpmode, *intreg),
6394 GEN_INT (i*8));
6395 intreg++;
6396 break;
6397 case X86_64_SSESF_CLASS:
6398 exp [nexps++]
6399 = gen_rtx_EXPR_LIST (VOIDmode,
6400 gen_rtx_REG (SFmode,
6401 SSE_REGNO (sse_regno)),
6402 GEN_INT (i*8));
6403 sse_regno++;
6404 break;
6405 case X86_64_SSEDF_CLASS:
6406 exp [nexps++]
6407 = gen_rtx_EXPR_LIST (VOIDmode,
6408 gen_rtx_REG (DFmode,
6409 SSE_REGNO (sse_regno)),
6410 GEN_INT (i*8));
6411 sse_regno++;
6412 break;
6413 case X86_64_SSE_CLASS:
6414 pos = i;
6415 switch (n)
6416 {
6417 case 1:
6418 tmpmode = DImode;
6419 break;
6420 case 2:
6421 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6422 {
6423 tmpmode = TImode;
6424 i++;
6425 }
6426 else
6427 tmpmode = DImode;
6428 break;
6429 case 4:
6430 gcc_assert (i == 0
6431 && regclass[1] == X86_64_SSEUP_CLASS
6432 && regclass[2] == X86_64_SSEUP_CLASS
6433 && regclass[3] == X86_64_SSEUP_CLASS);
6434 tmpmode = OImode;
6435 i += 3;
6436 break;
6437 default:
6438 gcc_unreachable ();
6439 }
6440 exp [nexps++]
6441 = gen_rtx_EXPR_LIST (VOIDmode,
6442 gen_rtx_REG (tmpmode,
6443 SSE_REGNO (sse_regno)),
6444 GEN_INT (pos*8));
6445 sse_regno++;
6446 break;
6447 default:
6448 gcc_unreachable ();
6449 }
6450 }
6451
6452 /* Empty aligned struct, union or class. */
6453 if (nexps == 0)
6454 return NULL;
6455
6456 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6457 for (i = 0; i < nexps; i++)
6458 XVECEXP (ret, 0, i) = exp [i];
6459 return ret;
6460 }
6461
6462 /* Update the data in CUM to advance over an argument of mode MODE
6463 and data type TYPE. (TYPE is null for libcalls where that information
6464 may not be available.) */
6465
6466 static void
6467 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6468 const_tree type, HOST_WIDE_INT bytes,
6469 HOST_WIDE_INT words)
6470 {
6471 switch (mode)
6472 {
6473 default:
6474 break;
6475
6476 case BLKmode:
6477 if (bytes < 0)
6478 break;
6479 /* FALLTHRU */
6480
6481 case DImode:
6482 case SImode:
6483 case HImode:
6484 case QImode:
6485 cum->words += words;
6486 cum->nregs -= words;
6487 cum->regno += words;
6488
6489 if (cum->nregs <= 0)
6490 {
6491 cum->nregs = 0;
6492 cum->regno = 0;
6493 }
6494 break;
6495
6496 case OImode:
6497 /* OImode shouldn't be used directly. */
6498 gcc_unreachable ();
6499
6500 case DFmode:
6501 if (cum->float_in_sse < 2)
6502 break;
6503 case SFmode:
6504 if (cum->float_in_sse < 1)
6505 break;
6506 /* FALLTHRU */
6507
6508 case V8SFmode:
6509 case V8SImode:
6510 case V32QImode:
6511 case V16HImode:
6512 case V4DFmode:
6513 case V4DImode:
6514 case TImode:
6515 case V16QImode:
6516 case V8HImode:
6517 case V4SImode:
6518 case V2DImode:
6519 case V4SFmode:
6520 case V2DFmode:
6521 if (!type || !AGGREGATE_TYPE_P (type))
6522 {
6523 cum->sse_words += words;
6524 cum->sse_nregs -= 1;
6525 cum->sse_regno += 1;
6526 if (cum->sse_nregs <= 0)
6527 {
6528 cum->sse_nregs = 0;
6529 cum->sse_regno = 0;
6530 }
6531 }
6532 break;
6533
6534 case V8QImode:
6535 case V4HImode:
6536 case V2SImode:
6537 case V2SFmode:
6538 case V1TImode:
6539 case V1DImode:
6540 if (!type || !AGGREGATE_TYPE_P (type))
6541 {
6542 cum->mmx_words += words;
6543 cum->mmx_nregs -= 1;
6544 cum->mmx_regno += 1;
6545 if (cum->mmx_nregs <= 0)
6546 {
6547 cum->mmx_nregs = 0;
6548 cum->mmx_regno = 0;
6549 }
6550 }
6551 break;
6552 }
6553 }
6554
6555 static void
6556 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6557 const_tree type, HOST_WIDE_INT words, bool named)
6558 {
6559 int int_nregs, sse_nregs;
6560
6561 /* Unnamed 256bit vector mode parameters are passed on stack. */
6562 if (!named && VALID_AVX256_REG_MODE (mode))
6563 return;
6564
6565 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6566 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6567 {
6568 cum->nregs -= int_nregs;
6569 cum->sse_nregs -= sse_nregs;
6570 cum->regno += int_nregs;
6571 cum->sse_regno += sse_nregs;
6572 }
6573 else
6574 {
6575 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6576 cum->words = (cum->words + align - 1) & ~(align - 1);
6577 cum->words += words;
6578 }
6579 }
6580
6581 static void
6582 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6583 HOST_WIDE_INT words)
6584 {
6585 /* Otherwise, this should be passed indirect. */
6586 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6587
6588 cum->words += words;
6589 if (cum->nregs > 0)
6590 {
6591 cum->nregs -= 1;
6592 cum->regno += 1;
6593 }
6594 }
6595
6596 /* Update the data in CUM to advance over an argument of mode MODE and
6597 data type TYPE. (TYPE is null for libcalls where that information
6598 may not be available.) */
6599
6600 static void
6601 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6602 const_tree type, bool named)
6603 {
6604 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6605 HOST_WIDE_INT bytes, words;
6606
6607 if (mode == BLKmode)
6608 bytes = int_size_in_bytes (type);
6609 else
6610 bytes = GET_MODE_SIZE (mode);
6611 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6612
6613 if (type)
6614 mode = type_natural_mode (type, NULL);
6615
6616 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6617 function_arg_advance_ms_64 (cum, bytes, words);
6618 else if (TARGET_64BIT)
6619 function_arg_advance_64 (cum, mode, type, words, named);
6620 else
6621 function_arg_advance_32 (cum, mode, type, bytes, words);
6622 }
6623
6624 /* Define where to put the arguments to a function.
6625 Value is zero to push the argument on the stack,
6626 or a hard register in which to store the argument.
6627
6628 MODE is the argument's machine mode.
6629 TYPE is the data type of the argument (as a tree).
6630 This is null for libcalls where that information may
6631 not be available.
6632 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6633 the preceding args and about the function being called.
6634 NAMED is nonzero if this argument is a named parameter
6635 (otherwise it is an extra parameter matching an ellipsis). */
6636
6637 static rtx
6638 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6639 enum machine_mode orig_mode, const_tree type,
6640 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6641 {
6642 static bool warnedsse, warnedmmx;
6643
6644 /* Avoid the AL settings for the Unix64 ABI. */
6645 if (mode == VOIDmode)
6646 return constm1_rtx;
6647
6648 switch (mode)
6649 {
6650 default:
6651 break;
6652
6653 case BLKmode:
6654 if (bytes < 0)
6655 break;
6656 /* FALLTHRU */
6657 case DImode:
6658 case SImode:
6659 case HImode:
6660 case QImode:
6661 if (words <= cum->nregs)
6662 {
6663 int regno = cum->regno;
6664
6665 /* Fastcall allocates the first two DWORD (SImode) or
6666 smaller arguments to ECX and EDX if they aren't
6667 aggregate types. */
6668 if (cum->fastcall)
6669 {
6670 if (mode == BLKmode
6671 || mode == DImode
6672 || (type && AGGREGATE_TYPE_P (type)))
6673 break;
6674
6675 /* ECX not EAX is the first allocated register. */
6676 if (regno == AX_REG)
6677 regno = CX_REG;
6678 }
6679 return gen_rtx_REG (mode, regno);
6680 }
6681 break;
6682
6683 case DFmode:
6684 if (cum->float_in_sse < 2)
6685 break;
6686 case SFmode:
6687 if (cum->float_in_sse < 1)
6688 break;
6689 /* FALLTHRU */
6690 case TImode:
6691 /* In 32bit, we pass TImode in xmm registers. */
6692 case V16QImode:
6693 case V8HImode:
6694 case V4SImode:
6695 case V2DImode:
6696 case V4SFmode:
6697 case V2DFmode:
6698 if (!type || !AGGREGATE_TYPE_P (type))
6699 {
6700 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6701 {
6702 warnedsse = true;
6703 warning (0, "SSE vector argument without SSE enabled "
6704 "changes the ABI");
6705 }
6706 if (cum->sse_nregs)
6707 return gen_reg_or_parallel (mode, orig_mode,
6708 cum->sse_regno + FIRST_SSE_REG);
6709 }
6710 break;
6711
6712 case OImode:
6713 /* OImode shouldn't be used directly. */
6714 gcc_unreachable ();
6715
6716 case V8SFmode:
6717 case V8SImode:
6718 case V32QImode:
6719 case V16HImode:
6720 case V4DFmode:
6721 case V4DImode:
6722 if (!type || !AGGREGATE_TYPE_P (type))
6723 {
6724 if (cum->sse_nregs)
6725 return gen_reg_or_parallel (mode, orig_mode,
6726 cum->sse_regno + FIRST_SSE_REG);
6727 }
6728 break;
6729
6730 case V8QImode:
6731 case V4HImode:
6732 case V2SImode:
6733 case V2SFmode:
6734 case V1TImode:
6735 case V1DImode:
6736 if (!type || !AGGREGATE_TYPE_P (type))
6737 {
6738 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6739 {
6740 warnedmmx = true;
6741 warning (0, "MMX vector argument without MMX enabled "
6742 "changes the ABI");
6743 }
6744 if (cum->mmx_nregs)
6745 return gen_reg_or_parallel (mode, orig_mode,
6746 cum->mmx_regno + FIRST_MMX_REG);
6747 }
6748 break;
6749 }
6750
6751 return NULL_RTX;
6752 }
6753
6754 static rtx
6755 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6756 enum machine_mode orig_mode, const_tree type, bool named)
6757 {
6758 /* Handle a hidden AL argument containing number of registers
6759 for varargs x86-64 functions. */
6760 if (mode == VOIDmode)
6761 return GEN_INT (cum->maybe_vaarg
6762 ? (cum->sse_nregs < 0
6763 ? X86_64_SSE_REGPARM_MAX
6764 : cum->sse_regno)
6765 : -1);
6766
6767 switch (mode)
6768 {
6769 default:
6770 break;
6771
6772 case V8SFmode:
6773 case V8SImode:
6774 case V32QImode:
6775 case V16HImode:
6776 case V4DFmode:
6777 case V4DImode:
6778 /* Unnamed 256bit vector mode parameters are passed on stack. */
6779 if (!named)
6780 return NULL;
6781 break;
6782 }
6783
6784 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6785 cum->sse_nregs,
6786 &x86_64_int_parameter_registers [cum->regno],
6787 cum->sse_regno);
6788 }
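/* Illustrative only: for a SysV x86-64 varargs call such as

       printf ("%d %f", 1, 2.0);

   the VOIDmode case above yields the number of SSE registers used by the
   call (here 1); the call expander loads that value into %al so the
   callee's va_start setup knows how many XMM registers to spill.  */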
6789
6790 static rtx
6791 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6792 enum machine_mode orig_mode, bool named,
6793 HOST_WIDE_INT bytes)
6794 {
6795 unsigned int regno;
6796
 6797 	  /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
 6798 	     We use the value -2 to specify that the current function call is MS ABI.  */
6799 if (mode == VOIDmode)
6800 return GEN_INT (-2);
6801
6802 /* If we've run out of registers, it goes on the stack. */
6803 if (cum->nregs == 0)
6804 return NULL_RTX;
6805
6806 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6807
6808 /* Only floating point modes are passed in anything but integer regs. */
6809 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6810 {
6811 if (named)
6812 regno = cum->regno + FIRST_SSE_REG;
6813 else
6814 {
6815 rtx t1, t2;
6816
6817 /* Unnamed floating parameters are passed in both the
6818 SSE and integer registers. */
6819 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6820 t2 = gen_rtx_REG (mode, regno);
6821 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6822 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6823 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6824 }
6825 }
 6826 	  /* Handle aggregate types passed in registers.  */
6827 if (orig_mode == BLKmode)
6828 {
6829 if (bytes > 0 && bytes <= 8)
6830 mode = (bytes > 4 ? DImode : SImode);
6831 if (mode == BLKmode)
6832 mode = DImode;
6833 }
6834
6835 return gen_reg_or_parallel (mode, orig_mode, regno);
6836 }
6837
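/* Sketch of the unnamed-float rule above (illustration only): in an MS-ABI
   variadic call such as printf ("%f", 3.0), the double is the second
   argument, so the PARALLEL built above makes the caller place it in both
   %xmm1 and %rdx; the callee can then spill the integer register to the
   shadow area and read the value back with va_arg.  */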
6838 /* Return where to put the arguments to a function.
6839 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6840
6841 MODE is the argument's machine mode. TYPE is the data type of the
6842 argument. It is null for libcalls where that information may not be
6843 available. CUM gives information about the preceding args and about
6844 the function being called. NAMED is nonzero if this argument is a
6845 named parameter (otherwise it is an extra parameter matching an
6846 ellipsis). */
6847
6848 static rtx
6849 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6850 const_tree type, bool named)
6851 {
6852 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6853 enum machine_mode mode = omode;
6854 HOST_WIDE_INT bytes, words;
6855 rtx arg;
6856
6857 if (mode == BLKmode)
6858 bytes = int_size_in_bytes (type);
6859 else
6860 bytes = GET_MODE_SIZE (mode);
6861 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6862
6863 /* To simplify the code below, represent vector types with a vector mode
6864 even if MMX/SSE are not active. */
6865 if (type && TREE_CODE (type) == VECTOR_TYPE)
6866 mode = type_natural_mode (type, cum);
6867
6868 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6869 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6870 else if (TARGET_64BIT)
6871 arg = function_arg_64 (cum, mode, omode, type, named);
6872 else
6873 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6874
6875 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6876 {
6877 /* This argument uses 256bit AVX modes. */
6878 if (cum->caller)
6879 cfun->machine->callee_pass_avx256_p = true;
6880 else
6881 cfun->machine->caller_pass_avx256_p = true;
6882 }
6883
6884 return arg;
6885 }
6886
6887 /* A C expression that indicates when an argument must be passed by
6888 reference. If nonzero for an argument, a copy of that argument is
6889 made in memory and a pointer to the argument is passed instead of
6890 the argument itself. The pointer is passed in whatever way is
6891 appropriate for passing a pointer to that type. */
6892
6893 static bool
6894 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6895 enum machine_mode mode ATTRIBUTE_UNUSED,
6896 const_tree type, bool named ATTRIBUTE_UNUSED)
6897 {
6898 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6899
6900 /* See Windows x64 Software Convention. */
6901 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6902 {
6903 int msize = (int) GET_MODE_SIZE (mode);
6904 if (type)
6905 {
6906 /* Arrays are passed by reference. */
6907 if (TREE_CODE (type) == ARRAY_TYPE)
6908 return true;
6909
6910 if (AGGREGATE_TYPE_P (type))
6911 {
6912 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6913 are passed by reference. */
6914 msize = int_size_in_bytes (type);
6915 }
6916 }
6917
 6918 	      /* __m128, and any other size not in {1, 2, 4, 8}, is passed by reference.  */
6919 switch (msize) {
6920 case 1: case 2: case 4: case 8:
6921 break;
6922 default:
6923 return true;
6924 }
6925 }
6926 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
 6927 	    return true;
6928
 6929 	  return false;
6930 }
6931
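/* Illustrative examples of the MS-ABI rules above (hypothetical types):
   struct S8  { char c[8];  } has size 8 and is passed by value in a
   register, whereas struct S12 { char c[12]; }, any array type, and
   __m128 are passed by reference, i.e. the caller makes a copy and
   passes its address.  */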
6932 /* Return true when TYPE should be 128bit aligned for 32bit argument
6933 passing ABI. XXX: This function is obsolete and is only used for
6934 checking psABI compatibility with previous versions of GCC. */
6935
6936 static bool
6937 ix86_compat_aligned_value_p (const_tree type)
6938 {
6939 enum machine_mode mode = TYPE_MODE (type);
6940 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6941 || mode == TDmode
6942 || mode == TFmode
6943 || mode == TCmode)
6944 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6945 return true;
6946 if (TYPE_ALIGN (type) < 128)
6947 return false;
6948
6949 if (AGGREGATE_TYPE_P (type))
6950 {
6951 /* Walk the aggregates recursively. */
6952 switch (TREE_CODE (type))
6953 {
6954 case RECORD_TYPE:
6955 case UNION_TYPE:
6956 case QUAL_UNION_TYPE:
6957 {
6958 tree field;
6959
6960 /* Walk all the structure fields. */
6961 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6962 {
6963 if (TREE_CODE (field) == FIELD_DECL
6964 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
6965 return true;
6966 }
6967 break;
6968 }
6969
6970 case ARRAY_TYPE:
 6971 	  /* Just in case some languages pass arrays by value.  */
6972 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
6973 return true;
6974 break;
6975
6976 default:
6977 gcc_unreachable ();
6978 }
6979 }
6980 return false;
6981 }
6982
6983 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
6984 XXX: This function is obsolete and is only used for checking psABI
6985 compatibility with previous versions of GCC. */
6986
6987 static unsigned int
6988 ix86_compat_function_arg_boundary (enum machine_mode mode,
6989 const_tree type, unsigned int align)
6990 {
6991 /* In 32bit, only _Decimal128 and __float128 are aligned to their
6992 natural boundaries. */
6993 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
6994 {
6995 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
6996 make an exception for SSE modes since these require 128bit
6997 alignment.
6998
6999 The handling here differs from field_alignment. ICC aligns MMX
7000 arguments to 4 byte boundaries, while structure fields are aligned
7001 to 8 byte boundaries. */
7002 if (!type)
7003 {
7004 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7005 align = PARM_BOUNDARY;
7006 }
7007 else
7008 {
7009 if (!ix86_compat_aligned_value_p (type))
7010 align = PARM_BOUNDARY;
7011 }
7012 }
7013 if (align > BIGGEST_ALIGNMENT)
7014 align = BIGGEST_ALIGNMENT;
7015 return align;
7016 }
7017
7018 /* Return true when TYPE should be 128bit aligned for 32bit argument
7019 passing ABI. */
7020
7021 static bool
7022 ix86_contains_aligned_value_p (const_tree type)
7023 {
7024 enum machine_mode mode = TYPE_MODE (type);
7025
7026 if (mode == XFmode || mode == XCmode)
7027 return false;
7028
7029 if (TYPE_ALIGN (type) < 128)
7030 return false;
7031
7032 if (AGGREGATE_TYPE_P (type))
7033 {
7034 /* Walk the aggregates recursively. */
7035 switch (TREE_CODE (type))
7036 {
7037 case RECORD_TYPE:
7038 case UNION_TYPE:
7039 case QUAL_UNION_TYPE:
7040 {
7041 tree field;
7042
7043 /* Walk all the structure fields. */
7044 for (field = TYPE_FIELDS (type);
7045 field;
7046 field = DECL_CHAIN (field))
7047 {
7048 if (TREE_CODE (field) == FIELD_DECL
7049 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7050 return true;
7051 }
7052 break;
7053 }
7054
7055 case ARRAY_TYPE:
 7056 	  /* Just in case some languages pass arrays by value.  */
7057 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7058 return true;
7059 break;
7060
7061 default:
7062 gcc_unreachable ();
7063 }
7064 }
7065 else
7066 return TYPE_ALIGN (type) >= 128;
7067
7068 return false;
7069 }
7070
7071 /* Gives the alignment boundary, in bits, of an argument with the
7072 specified mode and type. */
7073
7074 static unsigned int
7075 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7076 {
7077 unsigned int align;
7078 if (type)
7079 {
 7080 	      /* Since the main variant type is used for the call, convert the
 7081 		 type to its main variant.  */
7082 type = TYPE_MAIN_VARIANT (type);
7083 align = TYPE_ALIGN (type);
7084 }
7085 else
7086 align = GET_MODE_ALIGNMENT (mode);
7087 if (align < PARM_BOUNDARY)
7088 align = PARM_BOUNDARY;
7089 else
7090 {
7091 static bool warned;
7092 unsigned int saved_align = align;
7093
7094 if (!TARGET_64BIT)
7095 {
7096 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7097 if (!type)
7098 {
7099 if (mode == XFmode || mode == XCmode)
7100 align = PARM_BOUNDARY;
7101 }
7102 else if (!ix86_contains_aligned_value_p (type))
7103 align = PARM_BOUNDARY;
7104
7105 if (align < 128)
7106 align = PARM_BOUNDARY;
7107 }
7108
7109 if (warn_psabi
7110 && !warned
7111 && align != ix86_compat_function_arg_boundary (mode, type,
7112 saved_align))
7113 {
7114 warned = true;
7115 inform (input_location,
7116 "The ABI for passing parameters with %d-byte"
7117 " alignment has changed in GCC 4.6",
7118 align / BITS_PER_UNIT);
7119 }
7120 }
7121
7122 return align;
7123 }
7124
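/* Example of the GCC 4.6 psABI change noted above (hypothetical typedef,
   for illustration only): on a 32-bit target an argument declared as

       typedef int aligned_int __attribute__ ((aligned (16)));

   gets 128-bit alignment from this function, while the compatibility
   routine above computes PARM_BOUNDARY as older releases did; when the
   two differ, the inform call fires once per compilation unless psABI
   warnings are disabled with -Wno-psabi.  */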
 7125 /* Return true if REGNO is a possible register number for a function value.  */
7126
7127 static bool
7128 ix86_function_value_regno_p (const unsigned int regno)
7129 {
7130 switch (regno)
7131 {
7132 case AX_REG:
7133 return true;
7134
7135 case FIRST_FLOAT_REG:
7136 /* TODO: The function should depend on current function ABI but
7137 builtins.c would need updating then. Therefore we use the
7138 default ABI. */
7139 if (TARGET_64BIT && ix86_abi == MS_ABI)
7140 return false;
7141 return TARGET_FLOAT_RETURNS_IN_80387;
7142
7143 case FIRST_SSE_REG:
7144 return TARGET_SSE;
7145
7146 case FIRST_MMX_REG:
7147 if (TARGET_MACHO || TARGET_64BIT)
7148 return false;
7149 return TARGET_MMX;
7150 }
7151
7152 return false;
7153 }
7154
7155 /* Define how to find the value returned by a function.
7156 VALTYPE is the data type of the value (as a tree).
7157 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7158 otherwise, FUNC is 0. */
7159
7160 static rtx
7161 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7162 const_tree fntype, const_tree fn)
7163 {
7164 unsigned int regno;
7165
7166 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7167 we normally prevent this case when mmx is not available. However
7168 some ABIs may require the result to be returned like DImode. */
7169 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7170 regno = FIRST_MMX_REG;
7171
7172 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7173 we prevent this case when sse is not available. However some ABIs
7174 may require the result to be returned like integer TImode. */
7175 else if (mode == TImode
7176 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7177 regno = FIRST_SSE_REG;
7178
7179 /* 32-byte vector modes in %ymm0. */
7180 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7181 regno = FIRST_SSE_REG;
7182
7183 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7184 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7185 regno = FIRST_FLOAT_REG;
7186 else
7187 /* Most things go in %eax. */
7188 regno = AX_REG;
7189
7190 /* Override FP return register with %xmm0 for local functions when
7191 SSE math is enabled or for functions with sseregparm attribute. */
7192 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7193 {
7194 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7195 if ((sse_level >= 1 && mode == SFmode)
7196 || (sse_level == 2 && mode == DFmode))
7197 regno = FIRST_SSE_REG;
7198 }
7199
7200 /* OImode shouldn't be used directly. */
7201 gcc_assert (mode != OImode);
7202
7203 return gen_rtx_REG (orig_mode, regno);
7204 }
7205
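/* Illustration of the 32-bit rules above (sketch): with the defaults a
   plain "double f (void)" returns in %st(0), a function carrying the
   sseregparm attribute (or a local function compiled with SSE math)
   returns the same value in %xmm0, and __m128 values come back in %xmm0
   whenever SSE is enabled.  */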
7206 static rtx
7207 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7208 const_tree valtype)
7209 {
7210 rtx ret;
7211
7212 /* Handle libcalls, which don't provide a type node. */
7213 if (valtype == NULL)
7214 {
7215 unsigned int regno;
7216
7217 switch (mode)
7218 {
7219 case SFmode:
7220 case SCmode:
7221 case DFmode:
7222 case DCmode:
7223 case TFmode:
7224 case SDmode:
7225 case DDmode:
7226 case TDmode:
7227 regno = FIRST_SSE_REG;
7228 break;
7229 case XFmode:
7230 case XCmode:
7231 regno = FIRST_FLOAT_REG;
7232 break;
7233 case TCmode:
7234 return NULL;
7235 default:
7236 regno = AX_REG;
7237 }
7238
7239 return gen_rtx_REG (mode, regno);
7240 }
7241 else if (POINTER_TYPE_P (valtype))
7242 {
7243 /* Pointers are always returned in Pmode. */
7244 mode = Pmode;
7245 }
7246
7247 ret = construct_container (mode, orig_mode, valtype, 1,
7248 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7249 x86_64_int_return_registers, 0);
7250
 7251   /* For zero-sized structures, construct_container returns NULL, but we
 7252      need to keep the rest of the compiler happy by returning a meaningful value.  */
7253 if (!ret)
7254 ret = gen_rtx_REG (orig_mode, AX_REG);
7255
7256 return ret;
7257 }
7258
7259 static rtx
7260 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7261 {
7262 unsigned int regno = AX_REG;
7263
7264 if (TARGET_SSE)
7265 {
7266 switch (GET_MODE_SIZE (mode))
7267 {
7268 case 16:
 7269 	  if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7270 && !COMPLEX_MODE_P (mode))
7271 regno = FIRST_SSE_REG;
7272 break;
7273 case 8:
7274 case 4:
7275 if (mode == SFmode || mode == DFmode)
7276 regno = FIRST_SSE_REG;
7277 break;
7278 default:
7279 break;
7280 }
7281 }
7282 return gen_rtx_REG (orig_mode, regno);
7283 }
7284
7285 static rtx
7286 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7287 enum machine_mode orig_mode, enum machine_mode mode)
7288 {
7289 const_tree fn, fntype;
7290
7291 fn = NULL_TREE;
7292 if (fntype_or_decl && DECL_P (fntype_or_decl))
7293 fn = fntype_or_decl;
7294 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7295
7296 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7297 return function_value_ms_64 (orig_mode, mode);
7298 else if (TARGET_64BIT)
7299 return function_value_64 (orig_mode, mode, valtype);
7300 else
7301 return function_value_32 (orig_mode, mode, fntype, fn);
7302 }
7303
7304 static rtx
7305 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7306 bool outgoing ATTRIBUTE_UNUSED)
7307 {
7308 enum machine_mode mode, orig_mode;
7309
7310 orig_mode = TYPE_MODE (valtype);
7311 mode = type_natural_mode (valtype, NULL);
7312 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7313 }
7314
7315 /* Pointer function arguments and return values are promoted to Pmode. */
7316
7317 static enum machine_mode
7318 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7319 int *punsignedp, const_tree fntype,
7320 int for_return)
7321 {
7322 if (type != NULL_TREE && POINTER_TYPE_P (type))
7323 {
7324 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7325 return Pmode;
7326 }
7327 return default_promote_function_mode (type, mode, punsignedp, fntype,
7328 for_return);
7329 }
7330
7331 rtx
7332 ix86_libcall_value (enum machine_mode mode)
7333 {
7334 return ix86_function_value_1 (NULL, NULL, mode, mode);
7335 }
7336
7337 /* Return true iff type is returned in memory. */
7338
7339 static bool ATTRIBUTE_UNUSED
7340 return_in_memory_32 (const_tree type, enum machine_mode mode)
7341 {
7342 HOST_WIDE_INT size;
7343
7344 if (mode == BLKmode)
7345 return true;
7346
7347 size = int_size_in_bytes (type);
7348
7349 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7350 return false;
7351
7352 if (VECTOR_MODE_P (mode) || mode == TImode)
7353 {
7354 /* User-created vectors small enough to fit in EAX. */
7355 if (size < 8)
7356 return false;
7357
7358 /* MMX/3dNow values are returned in MM0,
 7359 	 except when it doesn't exist or the ABI prescribes otherwise.  */
7360 if (size == 8)
7361 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7362
7363 /* SSE values are returned in XMM0, except when it doesn't exist. */
7364 if (size == 16)
7365 return !TARGET_SSE;
7366
7367 /* AVX values are returned in YMM0, except when it doesn't exist. */
7368 if (size == 32)
7369 return !TARGET_AVX;
7370 }
7371
7372 if (mode == XFmode)
7373 return false;
7374
7375 if (size > 12)
7376 return true;
7377
7378 /* OImode shouldn't be used directly. */
7379 gcc_assert (mode != OImode);
7380
7381 return false;
7382 }
7383
7384 static bool ATTRIBUTE_UNUSED
7385 return_in_memory_64 (const_tree type, enum machine_mode mode)
7386 {
7387 int needed_intregs, needed_sseregs;
7388 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7389 }
7390
7391 static bool ATTRIBUTE_UNUSED
7392 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7393 {
7394 HOST_WIDE_INT size = int_size_in_bytes (type);
7395
7396 /* __m128 is returned in xmm0. */
7397 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7398 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7399 return false;
7400
 7401   /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes.  */
7402 return size != 1 && size != 2 && size != 4 && size != 8;
7403 }
7404
7405 static bool
7406 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7407 {
7408 #ifdef SUBTARGET_RETURN_IN_MEMORY
7409 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7410 #else
7411 const enum machine_mode mode = type_natural_mode (type, NULL);
7412
7413 if (TARGET_64BIT)
7414 {
7415 if (ix86_function_type_abi (fntype) == MS_ABI)
7416 return return_in_memory_ms_64 (type, mode);
7417 else
7418 return return_in_memory_64 (type, mode);
7419 }
7420 else
7421 return return_in_memory_32 (type, mode);
7422 #endif
7423 }
7424
7425 /* When returning SSE vector types, we have a choice of either
 7426    (1) being ABI incompatible with a -march switch, or
7427 (2) generating an error.
7428 Given no good solution, I think the safest thing is one warning.
7429 The user won't be able to use -Werror, but....
7430
7431 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7432 called in response to actually generating a caller or callee that
7433 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7434 via aggregate_value_p for general type probing from tree-ssa. */
7435
7436 static rtx
7437 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7438 {
7439 static bool warnedsse, warnedmmx;
7440
7441 if (!TARGET_64BIT && type)
7442 {
7443 /* Look at the return type of the function, not the function type. */
7444 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7445
7446 if (!TARGET_SSE && !warnedsse)
7447 {
7448 if (mode == TImode
7449 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7450 {
7451 warnedsse = true;
7452 warning (0, "SSE vector return without SSE enabled "
7453 "changes the ABI");
7454 }
7455 }
7456
7457 if (!TARGET_MMX && !warnedmmx)
7458 {
7459 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7460 {
7461 warnedmmx = true;
7462 warning (0, "MMX vector return without MMX enabled "
7463 "changes the ABI");
7464 }
7465 }
7466 }
7467
7468 return NULL;
7469 }
7470
7471 \f
7472 /* Create the va_list data type. */
7473
 7474 /* Return the calling-convention-specific va_list data type.
7475 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7476
7477 static tree
7478 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7479 {
7480 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7481
7482 /* For i386 we use plain pointer to argument area. */
7483 if (!TARGET_64BIT || abi == MS_ABI)
7484 return build_pointer_type (char_type_node);
7485
7486 record = lang_hooks.types.make_type (RECORD_TYPE);
7487 type_decl = build_decl (BUILTINS_LOCATION,
7488 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7489
7490 f_gpr = build_decl (BUILTINS_LOCATION,
7491 FIELD_DECL, get_identifier ("gp_offset"),
7492 unsigned_type_node);
7493 f_fpr = build_decl (BUILTINS_LOCATION,
7494 FIELD_DECL, get_identifier ("fp_offset"),
7495 unsigned_type_node);
7496 f_ovf = build_decl (BUILTINS_LOCATION,
7497 FIELD_DECL, get_identifier ("overflow_arg_area"),
7498 ptr_type_node);
7499 f_sav = build_decl (BUILTINS_LOCATION,
7500 FIELD_DECL, get_identifier ("reg_save_area"),
7501 ptr_type_node);
7502
7503 va_list_gpr_counter_field = f_gpr;
7504 va_list_fpr_counter_field = f_fpr;
7505
7506 DECL_FIELD_CONTEXT (f_gpr) = record;
7507 DECL_FIELD_CONTEXT (f_fpr) = record;
7508 DECL_FIELD_CONTEXT (f_ovf) = record;
7509 DECL_FIELD_CONTEXT (f_sav) = record;
7510
7511 TYPE_STUB_DECL (record) = type_decl;
7512 TYPE_NAME (record) = type_decl;
7513 TYPE_FIELDS (record) = f_gpr;
7514 DECL_CHAIN (f_gpr) = f_fpr;
7515 DECL_CHAIN (f_fpr) = f_ovf;
7516 DECL_CHAIN (f_ovf) = f_sav;
7517
7518 layout_type (record);
7519
7520 /* The correct type is an array type of one element. */
7521 return build_array_type (record, build_index_type (size_zero_node));
7522 }
7523
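/* For reference, the record built above matches the familiar SysV x86-64
   va_list layout (illustrative sketch, annotations in parentheses):

       typedef struct {
	 unsigned int gp_offset;	(0..48, next GP register slot)
	 unsigned int fp_offset;	(48..176, next SSE register slot)
	 void *overflow_arg_area;	(next stack argument)
	 void *reg_save_area;		(base of the register save area)
       } __va_list_tag;
       typedef __va_list_tag va_list[1];

   ix86_va_start below initializes gp_offset to N_GPR * 8, fp_offset to
   N_FPR * 16 + 48, and points reg_save_area at the area laid out by
   setup_incoming_varargs_64.  */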
 7524 /* Set up the builtin va_list data type and, for 64-bit targets, the
 7525    additional calling-convention-specific va_list data types.  */
7526
7527 static tree
7528 ix86_build_builtin_va_list (void)
7529 {
7530 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7531
7532 /* Initialize abi specific va_list builtin types. */
7533 if (TARGET_64BIT)
7534 {
7535 tree t;
7536 if (ix86_abi == MS_ABI)
7537 {
7538 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7539 if (TREE_CODE (t) != RECORD_TYPE)
7540 t = build_variant_type_copy (t);
7541 sysv_va_list_type_node = t;
7542 }
7543 else
7544 {
7545 t = ret;
7546 if (TREE_CODE (t) != RECORD_TYPE)
7547 t = build_variant_type_copy (t);
7548 sysv_va_list_type_node = t;
7549 }
7550 if (ix86_abi != MS_ABI)
7551 {
7552 t = ix86_build_builtin_va_list_abi (MS_ABI);
7553 if (TREE_CODE (t) != RECORD_TYPE)
7554 t = build_variant_type_copy (t);
7555 ms_va_list_type_node = t;
7556 }
7557 else
7558 {
7559 t = ret;
7560 if (TREE_CODE (t) != RECORD_TYPE)
7561 t = build_variant_type_copy (t);
7562 ms_va_list_type_node = t;
7563 }
7564 }
7565
7566 return ret;
7567 }
7568
7569 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7570
7571 static void
7572 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7573 {
7574 rtx save_area, mem;
7575 alias_set_type set;
7576 int i, max;
7577
7578 /* GPR size of varargs save area. */
7579 if (cfun->va_list_gpr_size)
7580 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7581 else
7582 ix86_varargs_gpr_size = 0;
7583
7584 /* FPR size of varargs save area. We don't need it if we don't pass
7585 anything in SSE registers. */
7586 if (TARGET_SSE && cfun->va_list_fpr_size)
7587 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7588 else
7589 ix86_varargs_fpr_size = 0;
7590
7591 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7592 return;
7593
7594 save_area = frame_pointer_rtx;
7595 set = get_varargs_alias_set ();
7596
7597 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7598 if (max > X86_64_REGPARM_MAX)
7599 max = X86_64_REGPARM_MAX;
7600
7601 for (i = cum->regno; i < max; i++)
7602 {
7603 mem = gen_rtx_MEM (Pmode,
7604 plus_constant (save_area, i * UNITS_PER_WORD));
7605 MEM_NOTRAP_P (mem) = 1;
7606 set_mem_alias_set (mem, set);
7607 emit_move_insn (mem, gen_rtx_REG (Pmode,
7608 x86_64_int_parameter_registers[i]));
7609 }
7610
7611 if (ix86_varargs_fpr_size)
7612 {
7613 enum machine_mode smode;
7614 rtx label, test;
7615
7616 /* Now emit code to save SSE registers. The AX parameter contains number
7617 of SSE parameter registers used to call this function, though all we
7618 actually check here is the zero/non-zero status. */
7619
7620 label = gen_label_rtx ();
7621 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7622 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7623 label));
7624
7625 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7626 we used movdqa (i.e. TImode) instead? Perhaps even better would
7627 be if we could determine the real mode of the data, via a hook
7628 into pass_stdarg. Ignore all that for now. */
7629 smode = V4SFmode;
7630 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7631 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7632
7633 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7634 if (max > X86_64_SSE_REGPARM_MAX)
7635 max = X86_64_SSE_REGPARM_MAX;
7636
7637 for (i = cum->sse_regno; i < max; ++i)
7638 {
7639 mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7640 mem = gen_rtx_MEM (smode, mem);
7641 MEM_NOTRAP_P (mem) = 1;
7642 set_mem_alias_set (mem, set);
7643 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7644
7645 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7646 }
7647
7648 emit_label (label);
7649 }
7650 }
7651
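/* A sketch of the register save area written above, assuming the SysV
   maxima of 6 integer and 8 SSE argument registers:

       offset   0 ..  47	%rdi, %rsi, %rdx, %rcx, %r8, %r9 (8 bytes each)
       offset  48 .. 175	%xmm0 .. %xmm7 (16 bytes each), stored only
				when %al is non-zero at function entry

   ix86_gimplify_va_arg below indexes this area with the gp_offset and
   fp_offset fields of the va_list.  */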
7652 static void
7653 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7654 {
7655 alias_set_type set = get_varargs_alias_set ();
7656 int i;
7657
 7658   /* Reset to zero, as there might have been a SysV va_arg used
7659 before. */
7660 ix86_varargs_gpr_size = 0;
7661 ix86_varargs_fpr_size = 0;
7662
7663 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7664 {
7665 rtx reg, mem;
7666
7667 mem = gen_rtx_MEM (Pmode,
7668 plus_constant (virtual_incoming_args_rtx,
7669 i * UNITS_PER_WORD));
7670 MEM_NOTRAP_P (mem) = 1;
7671 set_mem_alias_set (mem, set);
7672
7673 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7674 emit_move_insn (mem, reg);
7675 }
7676 }
7677
7678 static void
7679 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7680 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7681 int no_rtl)
7682 {
7683 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7684 CUMULATIVE_ARGS next_cum;
7685 tree fntype;
7686
 7687   /* This argument doesn't appear to be used anymore, which is good,
 7688      because the old code here didn't suppress rtl generation.  */
7689 gcc_assert (!no_rtl);
7690
7691 if (!TARGET_64BIT)
7692 return;
7693
7694 fntype = TREE_TYPE (current_function_decl);
7695
7696 /* For varargs, we do not want to skip the dummy va_dcl argument.
7697 For stdargs, we do want to skip the last named argument. */
7698 next_cum = *cum;
7699 if (stdarg_p (fntype))
7700 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7701 true);
7702
7703 if (cum->call_abi == MS_ABI)
7704 setup_incoming_varargs_ms_64 (&next_cum);
7705 else
7706 setup_incoming_varargs_64 (&next_cum);
7707 }
7708
 7709 /* Return true if TYPE is a va_list that is represented as a plain char pointer.  */
7710
7711 static bool
7712 is_va_list_char_pointer (tree type)
7713 {
7714 tree canonic;
7715
7716 /* For 32-bit it is always true. */
7717 if (!TARGET_64BIT)
7718 return true;
7719 canonic = ix86_canonical_va_list_type (type);
7720 return (canonic == ms_va_list_type_node
7721 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7722 }
7723
7724 /* Implement va_start. */
7725
7726 static void
7727 ix86_va_start (tree valist, rtx nextarg)
7728 {
7729 HOST_WIDE_INT words, n_gpr, n_fpr;
7730 tree f_gpr, f_fpr, f_ovf, f_sav;
7731 tree gpr, fpr, ovf, sav, t;
7732 tree type;
7733 rtx ovf_rtx;
7734
7735 if (flag_split_stack
7736 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7737 {
7738 unsigned int scratch_regno;
7739
7740 /* When we are splitting the stack, we can't refer to the stack
7741 arguments using internal_arg_pointer, because they may be on
7742 the old stack. The split stack prologue will arrange to
7743 leave a pointer to the old stack arguments in a scratch
7744 register, which we here copy to a pseudo-register. The split
7745 stack prologue can't set the pseudo-register directly because
7746 it (the prologue) runs before any registers have been saved. */
7747
7748 scratch_regno = split_stack_prologue_scratch_regno ();
7749 if (scratch_regno != INVALID_REGNUM)
7750 {
7751 rtx reg, seq;
7752
7753 reg = gen_reg_rtx (Pmode);
7754 cfun->machine->split_stack_varargs_pointer = reg;
7755
7756 start_sequence ();
7757 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7758 seq = get_insns ();
7759 end_sequence ();
7760
7761 push_topmost_sequence ();
7762 emit_insn_after (seq, entry_of_function ());
7763 pop_topmost_sequence ();
7764 }
7765 }
7766
 7767   /* Only the 64-bit target needs something special.  */
7768 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7769 {
7770 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7771 std_expand_builtin_va_start (valist, nextarg);
7772 else
7773 {
7774 rtx va_r, next;
7775
7776 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7777 next = expand_binop (ptr_mode, add_optab,
7778 cfun->machine->split_stack_varargs_pointer,
7779 crtl->args.arg_offset_rtx,
7780 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7781 convert_move (va_r, next, 0);
7782 }
7783 return;
7784 }
7785
7786 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7787 f_fpr = DECL_CHAIN (f_gpr);
7788 f_ovf = DECL_CHAIN (f_fpr);
7789 f_sav = DECL_CHAIN (f_ovf);
7790
7791 valist = build_simple_mem_ref (valist);
7792 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7793 /* The following should be folded into the MEM_REF offset. */
7794 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7795 f_gpr, NULL_TREE);
7796 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7797 f_fpr, NULL_TREE);
7798 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7799 f_ovf, NULL_TREE);
7800 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7801 f_sav, NULL_TREE);
7802
7803 /* Count number of gp and fp argument registers used. */
7804 words = crtl->args.info.words;
7805 n_gpr = crtl->args.info.regno;
7806 n_fpr = crtl->args.info.sse_regno;
7807
7808 if (cfun->va_list_gpr_size)
7809 {
7810 type = TREE_TYPE (gpr);
7811 t = build2 (MODIFY_EXPR, type,
7812 gpr, build_int_cst (type, n_gpr * 8));
7813 TREE_SIDE_EFFECTS (t) = 1;
7814 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7815 }
7816
7817 if (TARGET_SSE && cfun->va_list_fpr_size)
7818 {
7819 type = TREE_TYPE (fpr);
7820 t = build2 (MODIFY_EXPR, type, fpr,
7821 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7822 TREE_SIDE_EFFECTS (t) = 1;
7823 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7824 }
7825
7826 /* Find the overflow area. */
7827 type = TREE_TYPE (ovf);
7828 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7829 ovf_rtx = crtl->args.internal_arg_pointer;
7830 else
7831 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7832 t = make_tree (type, ovf_rtx);
7833 if (words != 0)
7834 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7835 t = build2 (MODIFY_EXPR, type, ovf, t);
7836 TREE_SIDE_EFFECTS (t) = 1;
7837 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7838
7839 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7840 {
7841 /* Find the register save area.
 7842 	 The function prologue saves it right above the stack frame.  */
7843 type = TREE_TYPE (sav);
7844 t = make_tree (type, frame_pointer_rtx);
7845 if (!ix86_varargs_gpr_size)
7846 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7847 t = build2 (MODIFY_EXPR, type, sav, t);
7848 TREE_SIDE_EFFECTS (t) = 1;
7849 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7850 }
7851 }
7852
7853 /* Implement va_arg. */
7854
7855 static tree
7856 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7857 gimple_seq *post_p)
7858 {
7859 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7860 tree f_gpr, f_fpr, f_ovf, f_sav;
7861 tree gpr, fpr, ovf, sav, t;
7862 int size, rsize;
7863 tree lab_false, lab_over = NULL_TREE;
7864 tree addr, t2;
7865 rtx container;
7866 int indirect_p = 0;
7867 tree ptrtype;
7868 enum machine_mode nat_mode;
7869 unsigned int arg_boundary;
7870
 7871   /* Only the 64-bit target needs something special.  */
7872 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7873 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7874
7875 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7876 f_fpr = DECL_CHAIN (f_gpr);
7877 f_ovf = DECL_CHAIN (f_fpr);
7878 f_sav = DECL_CHAIN (f_ovf);
7879
7880 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7881 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7882 valist = build_va_arg_indirect_ref (valist);
7883 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7884 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7885 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7886
7887 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7888 if (indirect_p)
7889 type = build_pointer_type (type);
7890 size = int_size_in_bytes (type);
7891 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7892
7893 nat_mode = type_natural_mode (type, NULL);
7894 switch (nat_mode)
7895 {
7896 case V8SFmode:
7897 case V8SImode:
7898 case V32QImode:
7899 case V16HImode:
7900 case V4DFmode:
7901 case V4DImode:
7902 /* Unnamed 256bit vector mode parameters are passed on stack. */
7903 if (!TARGET_64BIT_MS_ABI)
7904 {
7905 container = NULL;
7906 break;
7907 }
7908
7909 default:
7910 container = construct_container (nat_mode, TYPE_MODE (type),
7911 type, 0, X86_64_REGPARM_MAX,
7912 X86_64_SSE_REGPARM_MAX, intreg,
7913 0);
7914 break;
7915 }
7916
7917 /* Pull the value out of the saved registers. */
7918
7919 addr = create_tmp_var (ptr_type_node, "addr");
7920
7921 if (container)
7922 {
7923 int needed_intregs, needed_sseregs;
7924 bool need_temp;
7925 tree int_addr, sse_addr;
7926
7927 lab_false = create_artificial_label (UNKNOWN_LOCATION);
7928 lab_over = create_artificial_label (UNKNOWN_LOCATION);
7929
7930 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7931
7932 need_temp = (!REG_P (container)
7933 && ((needed_intregs && TYPE_ALIGN (type) > 64)
7934 || TYPE_ALIGN (type) > 128));
7935
 7936       /* If we are passing a structure, verify that it occupies a consecutive
 7937 	 block of the register save area.  If not, we need to do moves.  */
7938 if (!need_temp && !REG_P (container))
7939 {
 7940 	  /* Verify that all registers are strictly consecutive.  */
7941 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7942 {
7943 int i;
7944
7945 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7946 {
7947 rtx slot = XVECEXP (container, 0, i);
7948 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7949 || INTVAL (XEXP (slot, 1)) != i * 16)
7950 need_temp = 1;
7951 }
7952 }
7953 else
7954 {
7955 int i;
7956
7957 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7958 {
7959 rtx slot = XVECEXP (container, 0, i);
7960 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
7961 || INTVAL (XEXP (slot, 1)) != i * 8)
7962 need_temp = 1;
7963 }
7964 }
7965 }
7966 if (!need_temp)
7967 {
7968 int_addr = addr;
7969 sse_addr = addr;
7970 }
7971 else
7972 {
7973 int_addr = create_tmp_var (ptr_type_node, "int_addr");
7974 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
7975 }
7976
7977 /* First ensure that we fit completely in registers. */
7978 if (needed_intregs)
7979 {
7980 t = build_int_cst (TREE_TYPE (gpr),
7981 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
7982 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
7983 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7984 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7985 gimplify_and_add (t, pre_p);
7986 }
7987 if (needed_sseregs)
7988 {
7989 t = build_int_cst (TREE_TYPE (fpr),
7990 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
7991 + X86_64_REGPARM_MAX * 8);
7992 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
7993 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7994 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7995 gimplify_and_add (t, pre_p);
7996 }
7997
7998 /* Compute index to start of area used for integer regs. */
7999 if (needed_intregs)
8000 {
8001 /* int_addr = gpr + sav; */
8002 t = fold_build_pointer_plus (sav, gpr);
8003 gimplify_assign (int_addr, t, pre_p);
8004 }
8005 if (needed_sseregs)
8006 {
8007 /* sse_addr = fpr + sav; */
8008 t = fold_build_pointer_plus (sav, fpr);
8009 gimplify_assign (sse_addr, t, pre_p);
8010 }
8011 if (need_temp)
8012 {
8013 int i, prev_size = 0;
8014 tree temp = create_tmp_var (type, "va_arg_tmp");
8015
8016 /* addr = &temp; */
8017 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8018 gimplify_assign (addr, t, pre_p);
8019
8020 for (i = 0; i < XVECLEN (container, 0); i++)
8021 {
8022 rtx slot = XVECEXP (container, 0, i);
8023 rtx reg = XEXP (slot, 0);
8024 enum machine_mode mode = GET_MODE (reg);
8025 tree piece_type;
8026 tree addr_type;
8027 tree daddr_type;
8028 tree src_addr, src;
8029 int src_offset;
8030 tree dest_addr, dest;
8031 int cur_size = GET_MODE_SIZE (mode);
8032
8033 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8034 prev_size = INTVAL (XEXP (slot, 1));
8035 if (prev_size + cur_size > size)
8036 {
8037 cur_size = size - prev_size;
8038 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8039 if (mode == BLKmode)
8040 mode = QImode;
8041 }
8042 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8043 if (mode == GET_MODE (reg))
8044 addr_type = build_pointer_type (piece_type);
8045 else
8046 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8047 true);
8048 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8049 true);
8050
8051 if (SSE_REGNO_P (REGNO (reg)))
8052 {
8053 src_addr = sse_addr;
8054 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8055 }
8056 else
8057 {
8058 src_addr = int_addr;
8059 src_offset = REGNO (reg) * 8;
8060 }
8061 src_addr = fold_convert (addr_type, src_addr);
8062 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8063
8064 dest_addr = fold_convert (daddr_type, addr);
8065 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8066 if (cur_size == GET_MODE_SIZE (mode))
8067 {
8068 src = build_va_arg_indirect_ref (src_addr);
8069 dest = build_va_arg_indirect_ref (dest_addr);
8070
8071 gimplify_assign (dest, src, pre_p);
8072 }
8073 else
8074 {
8075 tree copy
8076 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8077 3, dest_addr, src_addr,
8078 size_int (cur_size));
8079 gimplify_and_add (copy, pre_p);
8080 }
8081 prev_size += cur_size;
8082 }
8083 }
8084
8085 if (needed_intregs)
8086 {
8087 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8088 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8089 gimplify_assign (gpr, t, pre_p);
8090 }
8091
8092 if (needed_sseregs)
8093 {
8094 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8095 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8096 gimplify_assign (fpr, t, pre_p);
8097 }
8098
8099 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8100
8101 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8102 }
8103
8104 /* ... otherwise out of the overflow area. */
8105
 8106   /* When we align a parameter on the stack for the caller, if its
 8107      alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will only be
 8108      aligned to MAX_SUPPORTED_STACK_ALIGNMENT.  Match the callee here
 8109      with the caller.  */
8110 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8111 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8112 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8113
8114 /* Care for on-stack alignment if needed. */
8115 if (arg_boundary <= 64 || size == 0)
8116 t = ovf;
8117 else
8118 {
8119 HOST_WIDE_INT align = arg_boundary / 8;
8120 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8121 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8122 build_int_cst (TREE_TYPE (t), -align));
8123 }
8124
8125 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8126 gimplify_assign (addr, t, pre_p);
8127
8128 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8129 gimplify_assign (unshare_expr (ovf), t, pre_p);
8130
8131 if (container)
8132 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8133
8134 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8135 addr = fold_convert (ptrtype, addr);
8136
8137 if (indirect_p)
8138 addr = build_va_arg_indirect_ref (addr);
8139 return build_va_arg_indirect_ref (addr);
8140 }
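
/* The GIMPLE emitted above corresponds roughly to this pseudo-code for a
   type needing a single integer register (sketch only):

       if (ap->gp_offset >= 48)
	 goto overflow;
       addr = ap->reg_save_area + ap->gp_offset;
       ap->gp_offset += 8;
       goto done;
     overflow:
       addr = ap->overflow_arg_area;	(aligned first if the type needs it)
       ap->overflow_arg_area += rounded-up size;
     done:
       result = *(T *) addr;
   */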
8141 \f
8142 /* Return true if OPNUM's MEM should be matched
8143 in movabs* patterns. */
8144
8145 bool
8146 ix86_check_movabs (rtx insn, int opnum)
8147 {
8148 rtx set, mem;
8149
8150 set = PATTERN (insn);
8151 if (GET_CODE (set) == PARALLEL)
8152 set = XVECEXP (set, 0, 0);
8153 gcc_assert (GET_CODE (set) == SET);
8154 mem = XEXP (set, opnum);
8155 while (GET_CODE (mem) == SUBREG)
8156 mem = SUBREG_REG (mem);
8157 gcc_assert (MEM_P (mem));
8158 return volatile_ok || !MEM_VOLATILE_P (mem);
8159 }
8160 \f
8161 /* Initialize the table of extra 80387 mathematical constants. */
8162
8163 static void
8164 init_ext_80387_constants (void)
8165 {
8166 static const char * cst[5] =
8167 {
8168 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8169 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8170 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8171 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8172 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8173 };
8174 int i;
8175
8176 for (i = 0; i < 5; i++)
8177 {
8178 real_from_string (&ext_80387_constants_table[i], cst[i]);
8179 /* Ensure each constant is rounded to XFmode precision. */
8180 real_convert (&ext_80387_constants_table[i],
8181 XFmode, &ext_80387_constants_table[i]);
8182 }
8183
8184 ext_80387_constants_init = 1;
8185 }
8186
8187 /* Return non-zero if the constant is something that
8188 can be loaded with a special instruction. */
8189
8190 int
8191 standard_80387_constant_p (rtx x)
8192 {
8193 enum machine_mode mode = GET_MODE (x);
8194
8195 REAL_VALUE_TYPE r;
8196
8197 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8198 return -1;
8199
8200 if (x == CONST0_RTX (mode))
8201 return 1;
8202 if (x == CONST1_RTX (mode))
8203 return 2;
8204
8205 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8206
8207 /* For XFmode constants, try to find a special 80387 instruction when
8208 optimizing for size or on those CPUs that benefit from them. */
8209 if (mode == XFmode
8210 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8211 {
8212 int i;
8213
8214 if (! ext_80387_constants_init)
8215 init_ext_80387_constants ();
8216
8217 for (i = 0; i < 5; i++)
8218 if (real_identical (&r, &ext_80387_constants_table[i]))
8219 return i + 3;
8220 }
8221
8222 /* Load of the constant -0.0 or -1.0 will be split as
8223 fldz;fchs or fld1;fchs sequence. */
8224 if (real_isnegzero (&r))
8225 return 8;
8226 if (real_identical (&r, &dconstm1))
8227 return 9;
8228
8229 return 0;
8230 }
8231
8232 /* Return the opcode of the special instruction to be used to load
8233 the constant X. */
8234
8235 const char *
8236 standard_80387_constant_opcode (rtx x)
8237 {
8238 switch (standard_80387_constant_p (x))
8239 {
8240 case 1:
8241 return "fldz";
8242 case 2:
8243 return "fld1";
8244 case 3:
8245 return "fldlg2";
8246 case 4:
8247 return "fldln2";
8248 case 5:
8249 return "fldl2e";
8250 case 6:
8251 return "fldl2t";
8252 case 7:
8253 return "fldpi";
8254 case 8:
8255 case 9:
8256 return "#";
8257 default:
8258 gcc_unreachable ();
8259 }
8260 }
8261
8262 /* Return the CONST_DOUBLE representing the 80387 constant that is
8263 loaded by the specified special instruction. The argument IDX
8264 matches the return value from standard_80387_constant_p. */
8265
8266 rtx
8267 standard_80387_constant_rtx (int idx)
8268 {
8269 int i;
8270
8271 if (! ext_80387_constants_init)
8272 init_ext_80387_constants ();
8273
8274 switch (idx)
8275 {
8276 case 3:
8277 case 4:
8278 case 5:
8279 case 6:
8280 case 7:
8281 i = idx - 3;
8282 break;
8283
8284 default:
8285 gcc_unreachable ();
8286 }
8287
8288 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8289 XFmode);
8290 }
8291
 8292 /* Return 1 if X is all zeros and 2 if X is all ones
 8293    in a supported SSE/AVX vector mode.  */
8294
8295 int
8296 standard_sse_constant_p (rtx x)
8297 {
8298 enum machine_mode mode = GET_MODE (x);
8299
8300 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8301 return 1;
8302 if (vector_all_ones_operand (x, mode))
8303 switch (mode)
8304 {
8305 case V16QImode:
8306 case V8HImode:
8307 case V4SImode:
8308 case V2DImode:
8309 if (TARGET_SSE2)
8310 return 2;
8311 case V32QImode:
8312 case V16HImode:
8313 case V8SImode:
8314 case V4DImode:
8315 if (TARGET_AVX2)
8316 return 2;
8317 default:
8318 break;
8319 }
8320
8321 return 0;
8322 }
8323
8324 /* Return the opcode of the special instruction to be used to load
8325 the constant X. */
8326
8327 const char *
8328 standard_sse_constant_opcode (rtx insn, rtx x)
8329 {
8330 switch (standard_sse_constant_p (x))
8331 {
8332 case 1:
8333 switch (get_attr_mode (insn))
8334 {
8335 case MODE_TI:
8336 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8337 return "%vpxor\t%0, %d0";
8338 case MODE_V2DF:
8339 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8340 return "%vxorpd\t%0, %d0";
8341 case MODE_V4SF:
8342 return "%vxorps\t%0, %d0";
8343
8344 case MODE_OI:
8345 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8346 return "vpxor\t%x0, %x0, %x0";
8347 case MODE_V4DF:
8348 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8349 return "vxorpd\t%x0, %x0, %x0";
8350 case MODE_V8SF:
8351 return "vxorps\t%x0, %x0, %x0";
8352
8353 default:
8354 break;
8355 }
8356
8357 case 2:
8358 if (TARGET_AVX)
8359 return "vpcmpeqd\t%0, %0, %0";
8360 else
8361 return "pcmpeqd\t%0, %0";
8362
8363 default:
8364 break;
8365 }
8366 gcc_unreachable ();
8367 }
8368
 8369 /* Return true if OP contains a symbol reference.  */
8370
8371 bool
8372 symbolic_reference_mentioned_p (rtx op)
8373 {
8374 const char *fmt;
8375 int i;
8376
8377 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8378 return true;
8379
8380 fmt = GET_RTX_FORMAT (GET_CODE (op));
8381 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8382 {
8383 if (fmt[i] == 'E')
8384 {
8385 int j;
8386
8387 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8388 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8389 return true;
8390 }
8391
8392 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8393 return true;
8394 }
8395
8396 return false;
8397 }
8398
8399 /* Return true if it is appropriate to emit `ret' instructions in the
8400 body of a function. Do this only if the epilogue is simple, needing a
8401 couple of insns. Prior to reloading, we can't tell how many registers
8402 must be saved, so return false then. Return false if there is no frame
8403 marker to de-allocate. */
8404
8405 bool
8406 ix86_can_use_return_insn_p (void)
8407 {
8408 struct ix86_frame frame;
8409
8410 if (! reload_completed || frame_pointer_needed)
 8411     return false;
8412
8413 /* Don't allow more than 32k pop, since that's all we can do
8414 with one instruction. */
8415 if (crtl->args.pops_args && crtl->args.size >= 32768)
 8416     return false;
8417
8418 ix86_compute_frame_layout (&frame);
8419 return (frame.stack_pointer_offset == UNITS_PER_WORD
8420 && (frame.nregs + frame.nsseregs) == 0);
8421 }
8422 \f
8423 /* Value should be nonzero if functions must have frame pointers.
8424 Zero means the frame pointer need not be set up (and parms may
8425 be accessed via the stack pointer) in functions that seem suitable. */
8426
8427 static bool
8428 ix86_frame_pointer_required (void)
8429 {
8430 /* If we accessed previous frames, then the generated code expects
8431 to be able to access the saved ebp value in our frame. */
8432 if (cfun->machine->accesses_prev_frame)
8433 return true;
8434
 8435   /* Several x86 OSes need a frame pointer for other reasons,
8436 usually pertaining to setjmp. */
8437 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8438 return true;
8439
 8440   /* For older 32-bit runtimes, setjmp requires a valid frame pointer.  */
8441 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8442 return true;
8443
8444 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8445 turns off the frame pointer by default. Turn it back on now if
8446 we've not got a leaf function. */
8447 if (TARGET_OMIT_LEAF_FRAME_POINTER
8448 && (!current_function_is_leaf
8449 || ix86_current_function_calls_tls_descriptor))
8450 return true;
8451
8452 if (crtl->profile && !flag_fentry)
8453 return true;
8454
8455 return false;
8456 }
8457
8458 /* Record that the current function accesses previous call frames. */
8459
8460 void
8461 ix86_setup_frame_addresses (void)
8462 {
8463 cfun->machine->accesses_prev_frame = 1;
8464 }
8465 \f
8466 #ifndef USE_HIDDEN_LINKONCE
8467 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8468 # define USE_HIDDEN_LINKONCE 1
8469 # else
8470 # define USE_HIDDEN_LINKONCE 0
8471 # endif
8472 #endif
8473
8474 static int pic_labels_used;
8475
8476 /* Fills in the label name that should be used for a pc thunk for
8477 the given register. */
8478
8479 static void
8480 get_pc_thunk_name (char name[32], unsigned int regno)
8481 {
8482 gcc_assert (!TARGET_64BIT);
8483
8484 if (USE_HIDDEN_LINKONCE)
8485 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8486 else
8487 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8488 }
8489
8490
 8491 /* This function generates the -fpic pc thunks: each one loads its
 8492    register with the return address of the caller and then returns.  */
8493
8494 static void
8495 ix86_code_end (void)
8496 {
8497 rtx xops[2];
8498 int regno;
8499
8500 for (regno = AX_REG; regno <= SP_REG; regno++)
8501 {
8502 char name[32];
8503 tree decl;
8504
8505 if (!(pic_labels_used & (1 << regno)))
8506 continue;
8507
8508 get_pc_thunk_name (name, regno);
8509
8510 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8511 get_identifier (name),
8512 build_function_type_list (void_type_node, NULL_TREE));
8513 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8514 NULL_TREE, void_type_node);
8515 TREE_PUBLIC (decl) = 1;
8516 TREE_STATIC (decl) = 1;
8517
8518 #if TARGET_MACHO
8519 if (TARGET_MACHO)
8520 {
8521 switch_to_section (darwin_sections[text_coal_section]);
8522 fputs ("\t.weak_definition\t", asm_out_file);
8523 assemble_name (asm_out_file, name);
8524 fputs ("\n\t.private_extern\t", asm_out_file);
8525 assemble_name (asm_out_file, name);
8526 putc ('\n', asm_out_file);
8527 ASM_OUTPUT_LABEL (asm_out_file, name);
8528 DECL_WEAK (decl) = 1;
8529 }
8530 else
8531 #endif
8532 if (USE_HIDDEN_LINKONCE)
8533 {
8534 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8535
8536 targetm.asm_out.unique_section (decl, 0);
8537 switch_to_section (get_named_section (decl, NULL, 0));
8538
8539 targetm.asm_out.globalize_label (asm_out_file, name);
8540 fputs ("\t.hidden\t", asm_out_file);
8541 assemble_name (asm_out_file, name);
8542 putc ('\n', asm_out_file);
8543 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8544 }
8545 else
8546 {
8547 switch_to_section (text_section);
8548 ASM_OUTPUT_LABEL (asm_out_file, name);
8549 }
8550
8551 DECL_INITIAL (decl) = make_node (BLOCK);
8552 current_function_decl = decl;
8553 init_function_start (decl);
8554 first_function_block_is_cold = false;
8555 /* Make sure unwind info is emitted for the thunk if needed. */
8556 final_start_function (emit_barrier (), asm_out_file, 1);
8557
8558 /* Pad stack IP move with 4 instructions (two NOPs count
8559 as one instruction). */
8560 if (TARGET_PAD_SHORT_FUNCTION)
8561 {
8562 int i = 8;
8563
8564 while (i--)
8565 fputs ("\tnop\n", asm_out_file);
8566 }
8567
8568 xops[0] = gen_rtx_REG (Pmode, regno);
8569 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8570 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8571 fputs ("\tret\n", asm_out_file);
8572 final_end_function ();
8573 init_insn_lengths ();
8574 free_after_compilation (cfun);
8575 set_cfun (NULL);
8576 current_function_decl = NULL;
8577 }
8578
8579 if (flag_split_stack)
8580 file_end_indicate_split_stack ();
8581 }
8582
8583 /* Emit code for the SET_GOT patterns. */
8584
8585 const char *
8586 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8587 {
8588 rtx xops[3];
8589
8590 xops[0] = dest;
8591
8592 if (TARGET_VXWORKS_RTP && flag_pic)
8593 {
8594 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8595 xops[2] = gen_rtx_MEM (Pmode,
8596 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8597 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8598
8599 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8600 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8601 an unadorned address. */
8602 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8603 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8604 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8605 return "";
8606 }
8607
8608 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8609
8610 if (!flag_pic)
8611 {
8612 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8613
8614 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8615
8616 #if TARGET_MACHO
8617 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8618 is what will be referenced by the Mach-O PIC subsystem. */
8619 if (!label)
8620 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8621 #endif
8622
8623 targetm.asm_out.internal_label (asm_out_file, "L",
8624 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8625 }
8626 else
8627 {
8628 char name[32];
8629 get_pc_thunk_name (name, REGNO (dest));
8630 pic_labels_used |= 1 << REGNO (dest);
8631
8632 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8633 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8634 output_asm_insn ("call\t%X2", xops);
8635 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8636 is what will be referenced by the Mach-O PIC subsystem. */
8637 #if TARGET_MACHO
8638 if (!label)
8639 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8640 else
8641 targetm.asm_out.internal_label (asm_out_file, "L",
8642 CODE_LABEL_NUMBER (label));
8643 #endif
8644 }
8645
8646 if (!TARGET_MACHO)
8647 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8648
8649 return "";
8650 }
8651
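/* Putting the pieces together (sketch): with the thunks emitted by
   ix86_code_end above, the -fpic sequence produced here for %ebx is

       call	__x86.get_pc_thunk.bx
       addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   where the thunk body is simply

       __x86.get_pc_thunk.bx:
	   movl	(%esp), %ebx
	   ret

   leaving the address of the GOT in the PIC register.  */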
 8652 /* Generate a "push" pattern for input ARG.  */
8653
8654 static rtx
8655 gen_push (rtx arg)
8656 {
8657 struct machine_function *m = cfun->machine;
8658
8659 if (m->fs.cfa_reg == stack_pointer_rtx)
8660 m->fs.cfa_offset += UNITS_PER_WORD;
8661 m->fs.sp_offset += UNITS_PER_WORD;
8662
8663 return gen_rtx_SET (VOIDmode,
8664 gen_rtx_MEM (Pmode,
8665 gen_rtx_PRE_DEC (Pmode,
8666 stack_pointer_rtx)),
8667 arg);
8668 }
8669
 8670 /* Generate a "pop" pattern for input ARG.  */
8671
8672 static rtx
8673 gen_pop (rtx arg)
8674 {
8675 return gen_rtx_SET (VOIDmode,
8676 arg,
8677 gen_rtx_MEM (Pmode,
8678 gen_rtx_POST_INC (Pmode,
8679 stack_pointer_rtx)));
8680 }
8681
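/* For illustration, on a 64-bit target gen_push (reg) above yields

       (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI ...))

   and gen_pop the mirror image with post_inc; the move patterns match
   these as real push/pop instructions.  gen_push also advances the
   recorded cfa/sp offsets by UNITS_PER_WORD so the unwind info stays
   consistent.  */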
 8682 /* Return the number of an unused call-clobbered register that is available
 8683    for the entire function, or INVALID_REGNUM if there is none.  */
8684
8685 static unsigned int
8686 ix86_select_alt_pic_regnum (void)
8687 {
8688 if (current_function_is_leaf
8689 && !crtl->profile
8690 && !ix86_current_function_calls_tls_descriptor)
8691 {
8692 int i, drap;
8693 /* Can't use the same register for both PIC and DRAP. */
8694 if (crtl->drap_reg)
8695 drap = REGNO (crtl->drap_reg);
8696 else
8697 drap = -1;
8698 for (i = 2; i >= 0; --i)
8699 if (i != drap && !df_regs_ever_live_p (i))
8700 return i;
8701 }
8702
8703 return INVALID_REGNUM;
8704 }
8705
8706 /* Return TRUE if we need to save REGNO. */
8707
8708 static bool
8709 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8710 {
8711 if (pic_offset_table_rtx
8712 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8713 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8714 || crtl->profile
8715 || crtl->calls_eh_return
8716 || crtl->uses_const_pool))
8717 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8718
8719 if (crtl->calls_eh_return && maybe_eh_return)
8720 {
8721 unsigned i;
8722 for (i = 0; ; i++)
8723 {
8724 unsigned test = EH_RETURN_DATA_REGNO (i);
8725 if (test == INVALID_REGNUM)
8726 break;
8727 if (test == regno)
8728 return true;
8729 }
8730 }
8731
8732 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8733 return true;
8734
8735 return (df_regs_ever_live_p (regno)
8736 && !call_used_regs[regno]
8737 && !fixed_regs[regno]
8738 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8739 }
8740
 8741 /* Return the number of saved general purpose registers.  */
8742
8743 static int
8744 ix86_nsaved_regs (void)
8745 {
8746 int nregs = 0;
8747 int regno;
8748
8749 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8750 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8751 nregs ++;
8752 return nregs;
8753 }
8754
8755 /* Return the number of saved SSE registers. */
8756
8757 static int
8758 ix86_nsaved_sseregs (void)
8759 {
8760 int nregs = 0;
8761 int regno;
8762
8763 if (!TARGET_64BIT_MS_ABI)
8764 return 0;
8765 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8766 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8767 nregs ++;
8768 return nregs;
8769 }
8770
8771 /* Given FROM and TO register numbers, say whether this elimination is
8772 allowed. If stack alignment is needed, we can only replace argument
8773 pointer with hard frame pointer, or replace frame pointer with stack
8774 pointer. Otherwise, frame pointer elimination is automatically
8775 handled and all other eliminations are valid. */
8776
8777 static bool
8778 ix86_can_eliminate (const int from, const int to)
8779 {
8780 if (stack_realign_fp)
8781 return ((from == ARG_POINTER_REGNUM
8782 && to == HARD_FRAME_POINTER_REGNUM)
8783 || (from == FRAME_POINTER_REGNUM
8784 && to == STACK_POINTER_REGNUM));
8785 else
8786 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8787 }
8788
8789 /* Return the offset between two registers, one to be eliminated, and the other
8790 its replacement, at the start of a routine. */
8791
8792 HOST_WIDE_INT
8793 ix86_initial_elimination_offset (int from, int to)
8794 {
8795 struct ix86_frame frame;
8796 ix86_compute_frame_layout (&frame);
8797
8798 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8799 return frame.hard_frame_pointer_offset;
8800 else if (from == FRAME_POINTER_REGNUM
8801 && to == HARD_FRAME_POINTER_REGNUM)
8802 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8803 else
8804 {
8805 gcc_assert (to == STACK_POINTER_REGNUM);
8806
8807 if (from == ARG_POINTER_REGNUM)
8808 return frame.stack_pointer_offset;
8809
8810 gcc_assert (from == FRAME_POINTER_REGNUM);
8811 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8812 }
8813 }
8814
8815 /* In a dynamically-aligned function, we can't know the offset from
8816 stack pointer to frame pointer, so we must ensure that setjmp
8817 eliminates fp against the hard fp (%ebp) rather than trying to
8818 index from %esp up to the top of the frame across a gap that is
8819 of unknown (at compile-time) size. */
8820 static rtx
8821 ix86_builtin_setjmp_frame_value (void)
8822 {
8823 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8824 }
8825
8826 /* When using -fsplit-stack, the allocation routines set a field in
8827 the TCB to the bottom of the stack plus this much space, measured
8828 in bytes. */
8829
8830 #define SPLIT_STACK_AVAILABLE 256
8831
8832 /* Fill structure ix86_frame describing the frame of the currently compiled function. */
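/* For orientation, the offsets computed below describe a frame laid out
   roughly as follows, going downward from the incoming stack pointer
   (a sketch only; several areas are absent depending on the function):

	return address
	pushed static chain (if ix86_static_chain_on_stack)
	saved frame pointer (if frame_pointer_needed)	<- hard_frame_pointer_offset
	integer register save area			<- reg_save_offset
	SSE register save area (16-byte aligned)	<- sse_reg_save_offset
	va_arg register save area
	local variables					<- frame_pointer_offset
	outgoing arguments area
							<- stack_pointer_offset  */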
8833
8834 static void
8835 ix86_compute_frame_layout (struct ix86_frame *frame)
8836 {
8837 unsigned int stack_alignment_needed;
8838 HOST_WIDE_INT offset;
8839 unsigned int preferred_alignment;
8840 HOST_WIDE_INT size = get_frame_size ();
8841 HOST_WIDE_INT to_allocate;
8842
8843 frame->nregs = ix86_nsaved_regs ();
8844 frame->nsseregs = ix86_nsaved_sseregs ();
8845
8846 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8847 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8848
8849 /* The 64-bit MS ABI seems to require stack alignment to always be 16, except
8850 for function prologues and leaf functions. */
8851 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8852 && (!current_function_is_leaf || cfun->calls_alloca != 0
8853 || ix86_current_function_calls_tls_descriptor))
8854 {
8855 preferred_alignment = 16;
8856 stack_alignment_needed = 16;
8857 crtl->preferred_stack_boundary = 128;
8858 crtl->stack_alignment_needed = 128;
8859 }
8860
8861 gcc_assert (!size || stack_alignment_needed);
8862 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8863 gcc_assert (preferred_alignment <= stack_alignment_needed);
8864
8865 /* For SEH we have to limit the amount of code movement into the prologue.
8866 At present we do this via a BLOCKAGE, at which point there's very little
8867 scheduling that can be done, which means that there's very little point
8868 in doing anything except PUSHs. */
8869 if (TARGET_SEH)
8870 cfun->machine->use_fast_prologue_epilogue = false;
8871
8872 /* During reload iterations the number of saved registers can change.
8873 Recompute the value as needed. Do not recompute when the number of registers
8874 didn't change, as reload calls this function multiple times and does not
8875 expect the decision to change within a single iteration. */
8876 else if (!optimize_function_for_size_p (cfun)
8877 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8878 {
8879 int count = frame->nregs;
8880 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8881
8882 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8883
8884 /* The fast prologue uses moves instead of pushes to save registers. This
8885 is significantly longer, but it also executes faster, as modern hardware
8886 can execute the moves in parallel but cannot do so for push/pop.
8887
8888 Be careful about choosing which prologue to emit: when the function takes
8889 many instructions to execute, we may as well use the slow version, and
8890 likewise when the function is known to be outside a hot spot (this is known
8891 with feedback only). Weight the size of the function by the number of
8892 registers to save, as it is cheap to use one or two push instructions but
8893 very slow to use many of them. */
8894 if (count)
8895 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8896 if (node->frequency < NODE_FREQUENCY_NORMAL
8897 || (flag_branch_probabilities
8898 && node->frequency < NODE_FREQUENCY_HOT))
8899 cfun->machine->use_fast_prologue_epilogue = false;
8900 else
8901 cfun->machine->use_fast_prologue_epilogue
8902 = !expensive_function_p (count);
8903 }
8904
8905 frame->save_regs_using_mov
8906 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8907 /* If static stack checking is enabled and done with probes,
8908 the registers need to be saved before allocating the frame. */
8909 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8910
8911 /* Skip return address. */
8912 offset = UNITS_PER_WORD;
8913
8914 /* Skip pushed static chain. */
8915 if (ix86_static_chain_on_stack)
8916 offset += UNITS_PER_WORD;
8917
8918 /* Skip saved base pointer. */
8919 if (frame_pointer_needed)
8920 offset += UNITS_PER_WORD;
8921 frame->hfp_save_offset = offset;
8922
8923 /* The traditional frame pointer location is at the top of the frame. */
8924 frame->hard_frame_pointer_offset = offset;
8925
8926 /* Register save area */
8927 offset += frame->nregs * UNITS_PER_WORD;
8928 frame->reg_save_offset = offset;
8929
8930 /* Align and set SSE register save area. */
8931 if (frame->nsseregs)
8932 {
8933 /* The only ABI that has saved SSE registers (Win64) also has a
8934 16-byte aligned default stack, and thus we don't need to be
8935 within the re-aligned local stack frame to save them. */
8936 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
8937 offset = (offset + 16 - 1) & -16;
8938 offset += frame->nsseregs * 16;
8939 }
8940 frame->sse_reg_save_offset = offset;
8941
8942 /* The re-aligned stack starts here. Values before this point are not
8943 directly comparable with values below this point. In order to make
8944 sure that no value happens to be the same before and after, force
8945 the alignment computation below to add a non-zero value. */
8946 if (stack_realign_fp)
8947 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8948
8949 /* Va-arg area */
8950 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8951 offset += frame->va_arg_size;
8952
8953 /* Align start of frame for local function. */
8954 if (stack_realign_fp
8955 || offset != frame->sse_reg_save_offset
8956 || size != 0
8957 || !current_function_is_leaf
8958 || cfun->calls_alloca
8959 || ix86_current_function_calls_tls_descriptor)
8960 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
8961
8962 /* Frame pointer points here. */
8963 frame->frame_pointer_offset = offset;
8964
8965 offset += size;
8966
8967 /* Add the outgoing arguments area. It can be skipped if we eliminated
8968 all the function calls as dead code.
8969 Skipping is however impossible when the function calls alloca, as the
8970 alloca expander assumes that the last crtl->outgoing_args_size bytes
8971 of the stack frame are unused. */
8972 if (ACCUMULATE_OUTGOING_ARGS
8973 && (!current_function_is_leaf || cfun->calls_alloca
8974 || ix86_current_function_calls_tls_descriptor))
8975 {
8976 offset += crtl->outgoing_args_size;
8977 frame->outgoing_arguments_size = crtl->outgoing_args_size;
8978 }
8979 else
8980 frame->outgoing_arguments_size = 0;
8981
8982 /* Align stack boundary. Only needed if we're calling another function
8983 or using alloca. */
8984 if (!current_function_is_leaf || cfun->calls_alloca
8985 || ix86_current_function_calls_tls_descriptor)
8986 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
8987
8988 /* We've reached end of stack frame. */
8989 frame->stack_pointer_offset = offset;
8990
8991 /* Size prologue needs to allocate. */
8992 to_allocate = offset - frame->sse_reg_save_offset;
8993
8994 if ((!to_allocate && frame->nregs <= 1)
8995 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
8996 frame->save_regs_using_mov = false;
8997
8998 if (ix86_using_red_zone ()
8999 && current_function_sp_is_unchanging
9000 && current_function_is_leaf
9001 && !ix86_current_function_calls_tls_descriptor)
9002 {
9003 frame->red_zone_size = to_allocate;
9004 if (frame->save_regs_using_mov)
9005 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9006 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9007 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9008 }
9009 else
9010 frame->red_zone_size = 0;
9011 frame->stack_pointer_offset -= frame->red_zone_size;
9012
9013 /* The SEH frame pointer location is near the bottom of the frame.
9014 This is enforced by the fact that the difference between the
9015 stack pointer and the frame pointer is limited to 240 bytes in
9016 the unwind data structure. */
9017 if (TARGET_SEH)
9018 {
9019 HOST_WIDE_INT diff;
9020
9021 /* If we can leave the frame pointer where it is, do so. */
9022 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9023 if (diff > 240 || (diff & 15) != 0)
9024 {
9025 /* Ideally we'd determine what portion of the local stack frame
9026 (within the constraint of the lowest 240) is most heavily used.
9027 But without that complication, simply bias the frame pointer
9028 by 128 bytes so as to maximize the amount of the local stack
9029 frame that is addressable with 8-bit offsets. */
9030 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9031 }
9032 }
9033 }
9034
9035 /* This is semi-inlined memory_address_length, but simplified
9036 since we know that we're always dealing with reg+offset, and
9037 to avoid having to create and discard all that rtl. */
9038
9039 static inline int
9040 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9041 {
9042 int len = 4;
9043
9044 if (offset == 0)
9045 {
9046 /* EBP and R13 cannot be encoded without an offset. */
9047 len = (regno == BP_REG || regno == R13_REG);
9048 }
9049 else if (IN_RANGE (offset, -128, 127))
9050 len = 1;
9051
9052 /* ESP and R12 must be encoded with a SIB byte. */
9053 if (regno == SP_REG || regno == R12_REG)
9054 len++;
9055
9056 return len;
9057 }
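/* A few worked examples of the extra address bytes counted above
   (32-bit register names, purely illustrative):
	0(%eax)     -> 0   no displacement needed
	0(%ebp)     -> 1   EBP always needs at least a disp8
	8(%esp)     -> 2   disp8 plus the mandatory SIB byte
	1024(%ebx)  -> 4   disp32  */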
9058
9059 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9060 The valid base registers are taken from CFUN->MACHINE->FS. */
9061
9062 static rtx
9063 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9064 {
9065 const struct machine_function *m = cfun->machine;
9066 rtx base_reg = NULL;
9067 HOST_WIDE_INT base_offset = 0;
9068
9069 if (m->use_fast_prologue_epilogue)
9070 {
9071 /* Choose the base register most likely to allow the most scheduling
9072 opportunities. Generally FP is valid throughout the function,
9073 while DRAP must be reloaded within the epilogue. But prefer either
9074 over SP, whose addresses have an increased encoding size. */
9075
9076 if (m->fs.fp_valid)
9077 {
9078 base_reg = hard_frame_pointer_rtx;
9079 base_offset = m->fs.fp_offset - cfa_offset;
9080 }
9081 else if (m->fs.drap_valid)
9082 {
9083 base_reg = crtl->drap_reg;
9084 base_offset = 0 - cfa_offset;
9085 }
9086 else if (m->fs.sp_valid)
9087 {
9088 base_reg = stack_pointer_rtx;
9089 base_offset = m->fs.sp_offset - cfa_offset;
9090 }
9091 }
9092 else
9093 {
9094 HOST_WIDE_INT toffset;
9095 int len = 16, tlen;
9096
9097 /* Choose the base register with the smallest address encoding.
9098 With a tie, choose FP > DRAP > SP. */
9099 if (m->fs.sp_valid)
9100 {
9101 base_reg = stack_pointer_rtx;
9102 base_offset = m->fs.sp_offset - cfa_offset;
9103 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9104 }
9105 if (m->fs.drap_valid)
9106 {
9107 toffset = 0 - cfa_offset;
9108 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9109 if (tlen <= len)
9110 {
9111 base_reg = crtl->drap_reg;
9112 base_offset = toffset;
9113 len = tlen;
9114 }
9115 }
9116 if (m->fs.fp_valid)
9117 {
9118 toffset = m->fs.fp_offset - cfa_offset;
9119 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9120 if (tlen <= len)
9121 {
9122 base_reg = hard_frame_pointer_rtx;
9123 base_offset = toffset;
9124 len = tlen;
9125 }
9126 }
9127 }
9128 gcc_assert (base_reg != NULL);
9129
9130 return plus_constant (base_reg, base_offset);
9131 }
9132
9133 /* Emit code to save registers in the prologue. */
9134
9135 static void
9136 ix86_emit_save_regs (void)
9137 {
9138 unsigned int regno;
9139 rtx insn;
9140
9141 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9142 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9143 {
9144 insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
9145 RTX_FRAME_RELATED_P (insn) = 1;
9146 }
9147 }
9148
9149 /* Emit a single register save at CFA - CFA_OFFSET. */
9150
9151 static void
9152 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9153 HOST_WIDE_INT cfa_offset)
9154 {
9155 struct machine_function *m = cfun->machine;
9156 rtx reg = gen_rtx_REG (mode, regno);
9157 rtx mem, addr, base, insn;
9158
9159 addr = choose_baseaddr (cfa_offset);
9160 mem = gen_frame_mem (mode, addr);
9161
9162 /* For SSE saves, we need to indicate the 128-bit alignment. */
9163 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9164
9165 insn = emit_move_insn (mem, reg);
9166 RTX_FRAME_RELATED_P (insn) = 1;
9167
9168 base = addr;
9169 if (GET_CODE (base) == PLUS)
9170 base = XEXP (base, 0);
9171 gcc_checking_assert (REG_P (base));
9172
9173 /* When saving registers into a re-aligned local stack frame, avoid
9174 any tricky guessing by dwarf2out. */
9175 if (m->fs.realigned)
9176 {
9177 gcc_checking_assert (stack_realign_drap);
9178
9179 if (regno == REGNO (crtl->drap_reg))
9180 {
9181 /* A bit of a hack. We force the DRAP register to be saved in
9182 the re-aligned stack frame, which provides us with a copy
9183 of the CFA that will last past the prologue. Install it. */
9184 gcc_checking_assert (cfun->machine->fs.fp_valid);
9185 addr = plus_constant (hard_frame_pointer_rtx,
9186 cfun->machine->fs.fp_offset - cfa_offset);
9187 mem = gen_rtx_MEM (mode, addr);
9188 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9189 }
9190 else
9191 {
9192 /* The frame pointer is a stable reference within the
9193 aligned frame. Use it. */
9194 gcc_checking_assert (cfun->machine->fs.fp_valid);
9195 addr = plus_constant (hard_frame_pointer_rtx,
9196 cfun->machine->fs.fp_offset - cfa_offset);
9197 mem = gen_rtx_MEM (mode, addr);
9198 add_reg_note (insn, REG_CFA_EXPRESSION,
9199 gen_rtx_SET (VOIDmode, mem, reg));
9200 }
9201 }
9202
9203 /* The memory may not be relative to the current CFA register,
9204 which means that we may need to generate a new pattern for
9205 use by the unwind info. */
9206 else if (base != m->fs.cfa_reg)
9207 {
9208 addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9209 mem = gen_rtx_MEM (mode, addr);
9210 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9211 }
9212 }
9213
9214 /* Emit code to save registers using MOV insns.
9215 First register is stored at CFA - CFA_OFFSET. */
9216 static void
9217 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9218 {
9219 unsigned int regno;
9220
9221 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9222 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9223 {
9224 ix86_emit_save_reg_using_mov (Pmode, regno, cfa_offset);
9225 cfa_offset -= UNITS_PER_WORD;
9226 }
9227 }
9228
9229 /* Emit code to save SSE registers using MOV insns.
9230 First register is stored at CFA - CFA_OFFSET. */
9231 static void
9232 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9233 {
9234 unsigned int regno;
9235
9236 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9237 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9238 {
9239 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9240 cfa_offset -= 16;
9241 }
9242 }
9243
9244 static GTY(()) rtx queued_cfa_restores;
9245
9246 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9247 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9248 Don't add the note if the previously saved value will be left untouched
9249 within the stack red zone until return, as unwinders can find the same value
9250 in the register and on the stack. */
9251
9252 static void
9253 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9254 {
9255 if (!crtl->shrink_wrapped
9256 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9257 return;
9258
9259 if (insn)
9260 {
9261 add_reg_note (insn, REG_CFA_RESTORE, reg);
9262 RTX_FRAME_RELATED_P (insn) = 1;
9263 }
9264 else
9265 queued_cfa_restores
9266 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9267 }
9268
9269 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9270
9271 static void
9272 ix86_add_queued_cfa_restore_notes (rtx insn)
9273 {
9274 rtx last;
9275 if (!queued_cfa_restores)
9276 return;
9277 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9278 ;
9279 XEXP (last, 1) = REG_NOTES (insn);
9280 REG_NOTES (insn) = queued_cfa_restores;
9281 queued_cfa_restores = NULL_RTX;
9282 RTX_FRAME_RELATED_P (insn) = 1;
9283 }
9284
9285 /* Expand prologue or epilogue stack adjustment.
9286 The pattern exists to put a dependency on all ebp-based memory accesses.
9287 STYLE should be negative if instructions should be marked as frame related,
9288 zero if the %r11 register is live and cannot be freely used, and positive
9289 otherwise. */
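/* A typical use, as in ix86_expand_prologue further below, allocates the
   local frame with something like

	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
				   GEN_INT (-allocate), -1,
				   m->fs.cfa_reg == stack_pointer_rtx);

   i.e. a frame-related subtraction from the stack pointer that also
   updates the CFA while the stack pointer is still the CFA register.  */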
9290
9291 static void
9292 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9293 int style, bool set_cfa)
9294 {
9295 struct machine_function *m = cfun->machine;
9296 rtx insn;
9297 bool add_frame_related_expr = false;
9298
9299 if (Pmode == SImode)
9300 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9301 else if (x86_64_immediate_operand (offset, DImode))
9302 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9303 else
9304 {
9305 rtx tmp;
9306 /* r11 is used by indirect sibcall return as well, set before the
9307 epilogue and used after the epilogue. */
9308 if (style)
9309 tmp = gen_rtx_REG (DImode, R11_REG);
9310 else
9311 {
9312 gcc_assert (src != hard_frame_pointer_rtx
9313 && dest != hard_frame_pointer_rtx);
9314 tmp = hard_frame_pointer_rtx;
9315 }
9316 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9317 if (style < 0)
9318 add_frame_related_expr = true;
9319
9320 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9321 }
9322
9323 insn = emit_insn (insn);
9324 if (style >= 0)
9325 ix86_add_queued_cfa_restore_notes (insn);
9326
9327 if (set_cfa)
9328 {
9329 rtx r;
9330
9331 gcc_assert (m->fs.cfa_reg == src);
9332 m->fs.cfa_offset += INTVAL (offset);
9333 m->fs.cfa_reg = dest;
9334
9335 r = gen_rtx_PLUS (Pmode, src, offset);
9336 r = gen_rtx_SET (VOIDmode, dest, r);
9337 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9338 RTX_FRAME_RELATED_P (insn) = 1;
9339 }
9340 else if (style < 0)
9341 {
9342 RTX_FRAME_RELATED_P (insn) = 1;
9343 if (add_frame_related_expr)
9344 {
9345 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9346 r = gen_rtx_SET (VOIDmode, dest, r);
9347 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9348 }
9349 }
9350
9351 if (dest == stack_pointer_rtx)
9352 {
9353 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9354 bool valid = m->fs.sp_valid;
9355
9356 if (src == hard_frame_pointer_rtx)
9357 {
9358 valid = m->fs.fp_valid;
9359 ooffset = m->fs.fp_offset;
9360 }
9361 else if (src == crtl->drap_reg)
9362 {
9363 valid = m->fs.drap_valid;
9364 ooffset = 0;
9365 }
9366 else
9367 {
9368 /* Else there are two possibilities: SP itself, which we set
9369 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9370 taken care of by hand along the eh_return path. */
9371 gcc_checking_assert (src == stack_pointer_rtx
9372 || offset == const0_rtx);
9373 }
9374
9375 m->fs.sp_offset = ooffset - INTVAL (offset);
9376 m->fs.sp_valid = valid;
9377 }
9378 }
9379
9380 /* Find an available register to be used as the dynamic realign argument
9381 pointer register. Such a register will be written in the prologue and
9382 used at the beginning of the body, so it must not be
9383 1. a parameter passing register.
9384 2. the GOT pointer.
9385 We reuse the static-chain register if it is available. Otherwise, we
9386 use DI for i386 and R13 for x86-64. We chose R13 since it has
9387 shorter encoding.
9388
9389 Return: the regno of the chosen register. */
9390
9391 static unsigned int
9392 find_drap_reg (void)
9393 {
9394 tree decl = cfun->decl;
9395
9396 if (TARGET_64BIT)
9397 {
9398 /* Use R13 for a nested function or a function that needs a static chain.
9399 Since a function with a tail call may use any caller-saved
9400 register in the epilogue, DRAP must not use a caller-saved
9401 register in that case. */
9402 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9403 return R13_REG;
9404
9405 return R10_REG;
9406 }
9407 else
9408 {
9409 /* Use DI for a nested function or a function that needs a static chain.
9410 Since a function with a tail call may use any caller-saved
9411 register in the epilogue, DRAP must not use a caller-saved
9412 register in that case. */
9413 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9414 return DI_REG;
9415
9416 /* Reuse static chain register if it isn't used for parameter
9417 passing. */
9418 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9419 {
9420 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9421 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9422 return CX_REG;
9423 }
9424 return DI_REG;
9425 }
9426 }
9427
9428 /* Return minimum incoming stack alignment. */
9429
9430 static unsigned int
9431 ix86_minimum_incoming_stack_boundary (bool sibcall)
9432 {
9433 unsigned int incoming_stack_boundary;
9434
9435 /* Prefer the one specified at command line. */
9436 if (ix86_user_incoming_stack_boundary)
9437 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9438 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9439 if -mstackrealign is used, this isn't a sibcall check, and the
9440 estimated stack alignment is 128 bits. */
9441 else if (!sibcall
9442 && !TARGET_64BIT
9443 && ix86_force_align_arg_pointer
9444 && crtl->stack_alignment_estimated == 128)
9445 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9446 else
9447 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9448
9449 /* Incoming stack alignment can be changed on individual functions
9450 via force_align_arg_pointer attribute. We use the smallest
9451 incoming stack boundary. */
9452 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9453 && lookup_attribute (ix86_force_align_arg_pointer_string,
9454 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9455 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9456
9457 /* The incoming stack frame has to be aligned at least at
9458 parm_stack_boundary. */
9459 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9460 incoming_stack_boundary = crtl->parm_stack_boundary;
9461
9462 /* The stack at the entry of main is aligned by the runtime. We use the
9463 smallest incoming stack boundary. */
9464 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9465 && DECL_NAME (current_function_decl)
9466 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9467 && DECL_FILE_SCOPE_P (current_function_decl))
9468 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9469
9470 return incoming_stack_boundary;
9471 }
9472
9473 /* Update incoming stack boundary and estimated stack alignment. */
9474
9475 static void
9476 ix86_update_stack_boundary (void)
9477 {
9478 ix86_incoming_stack_boundary
9479 = ix86_minimum_incoming_stack_boundary (false);
9480
9481 /* x86_64 varargs need 16-byte stack alignment for the register save
9482 area. */
9483 if (TARGET_64BIT
9484 && cfun->stdarg
9485 && crtl->stack_alignment_estimated < 128)
9486 crtl->stack_alignment_estimated = 128;
9487 }
9488
9489 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9490 needed or an rtx for DRAP otherwise. */
9491
9492 static rtx
9493 ix86_get_drap_rtx (void)
9494 {
9495 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9496 crtl->need_drap = true;
9497
9498 if (stack_realign_drap)
9499 {
9500 /* Assign DRAP to vDRAP and return vDRAP. */
9501 unsigned int regno = find_drap_reg ();
9502 rtx drap_vreg;
9503 rtx arg_ptr;
9504 rtx seq, insn;
9505
9506 arg_ptr = gen_rtx_REG (Pmode, regno);
9507 crtl->drap_reg = arg_ptr;
9508
9509 start_sequence ();
9510 drap_vreg = copy_to_reg (arg_ptr);
9511 seq = get_insns ();
9512 end_sequence ();
9513
9514 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9515 if (!optimize)
9516 {
9517 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9518 RTX_FRAME_RELATED_P (insn) = 1;
9519 }
9520 return drap_vreg;
9521 }
9522 else
9523 return NULL;
9524 }
9525
9526 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9527
9528 static rtx
9529 ix86_internal_arg_pointer (void)
9530 {
9531 return virtual_incoming_args_rtx;
9532 }
9533
9534 struct scratch_reg {
9535 rtx reg;
9536 bool saved;
9537 };
9538
9539 /* Return a short-lived scratch register for use on function entry.
9540 In 32-bit mode, it is valid only after the registers are saved
9541 in the prologue. This register must be released by means of
9542 release_scratch_register_on_entry once it is dead. */
9543
9544 static void
9545 get_scratch_register_on_entry (struct scratch_reg *sr)
9546 {
9547 int regno;
9548
9549 sr->saved = false;
9550
9551 if (TARGET_64BIT)
9552 {
9553 /* We always use R11 in 64-bit mode. */
9554 regno = R11_REG;
9555 }
9556 else
9557 {
9558 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9559 bool fastcall_p
9560 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9561 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9562 int regparm = ix86_function_regparm (fntype, decl);
9563 int drap_regno
9564 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9565
9566 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9567 for the static chain register. */
9568 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9569 && drap_regno != AX_REG)
9570 regno = AX_REG;
9571 else if (regparm < 2 && drap_regno != DX_REG)
9572 regno = DX_REG;
9573 /* ecx is the static chain register. */
9574 else if (regparm < 3 && !fastcall_p && !static_chain_p
9575 && drap_regno != CX_REG)
9576 regno = CX_REG;
9577 else if (ix86_save_reg (BX_REG, true))
9578 regno = BX_REG;
9579 /* esi is the static chain register. */
9580 else if (!(regparm == 3 && static_chain_p)
9581 && ix86_save_reg (SI_REG, true))
9582 regno = SI_REG;
9583 else if (ix86_save_reg (DI_REG, true))
9584 regno = DI_REG;
9585 else
9586 {
9587 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9588 sr->saved = true;
9589 }
9590 }
9591
9592 sr->reg = gen_rtx_REG (Pmode, regno);
9593 if (sr->saved)
9594 {
9595 rtx insn = emit_insn (gen_push (sr->reg));
9596 RTX_FRAME_RELATED_P (insn) = 1;
9597 }
9598 }
9599
9600 /* Release a scratch register obtained from the preceding function. */
9601
9602 static void
9603 release_scratch_register_on_entry (struct scratch_reg *sr)
9604 {
9605 if (sr->saved)
9606 {
9607 rtx x, insn = emit_insn (gen_pop (sr->reg));
9608
9609 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9610 RTX_FRAME_RELATED_P (insn) = 1;
9611 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9612 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9613 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9614 }
9615 }
9616
9617 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
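/* With the usual default of 12 for STACK_CHECK_PROBE_INTERVAL_EXP this
   is 4096 bytes, i.e. one probe per 4 KB page.  */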
9618
9619 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9620
9621 static void
9622 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9623 {
9624 /* We skip the probe for the first interval + a small dope of 4 words and
9625 probe that many bytes past the specified size to maintain a protection
9626 area at the bottom of the stack. */
9627 const int dope = 4 * UNITS_PER_WORD;
9628 rtx size_rtx = GEN_INT (size), last;
9629
9630 /* See if we have a constant small number of probes to generate. If so,
9631 that's the easy case. The run-time loop is made up of 11 insns in the
9632 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9633 for n # of intervals. */
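/* For example, at the threshold of 5 intervals used below, the unrolled
   sequence costs at most 3 + 2*(5-1) = 11 insns, i.e. no more than the
   run-time loop, which is why such sizes are expanded inline.  */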
9634 if (size <= 5 * PROBE_INTERVAL)
9635 {
9636 HOST_WIDE_INT i, adjust;
9637 bool first_probe = true;
9638
9639 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9640 values of N from 1 until it exceeds SIZE. If only one probe is
9641 needed, this will not generate any code. Then adjust and probe
9642 to PROBE_INTERVAL + SIZE. */
9643 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9644 {
9645 if (first_probe)
9646 {
9647 adjust = 2 * PROBE_INTERVAL + dope;
9648 first_probe = false;
9649 }
9650 else
9651 adjust = PROBE_INTERVAL;
9652
9653 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9654 plus_constant (stack_pointer_rtx, -adjust)));
9655 emit_stack_probe (stack_pointer_rtx);
9656 }
9657
9658 if (first_probe)
9659 adjust = size + PROBE_INTERVAL + dope;
9660 else
9661 adjust = size + PROBE_INTERVAL - i;
9662
9663 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9664 plus_constant (stack_pointer_rtx, -adjust)));
9665 emit_stack_probe (stack_pointer_rtx);
9666
9667 /* Adjust back to account for the additional first interval. */
9668 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9669 plus_constant (stack_pointer_rtx,
9670 PROBE_INTERVAL + dope)));
9671 }
9672
9673 /* Otherwise, do the same as above, but in a loop. Note that we must be
9674 extra careful with variables wrapping around because we might be at
9675 the very top (or the very bottom) of the address space and we have
9676 to be able to handle this case properly; in particular, we use an
9677 equality test for the loop condition. */
9678 else
9679 {
9680 HOST_WIDE_INT rounded_size;
9681 struct scratch_reg sr;
9682
9683 get_scratch_register_on_entry (&sr);
9684
9685
9686 /* Step 1: round SIZE to the previous multiple of the interval. */
9687
9688 rounded_size = size & -PROBE_INTERVAL;
9689
9690
9691 /* Step 2: compute initial and final value of the loop counter. */
9692
9693 /* SP = SP_0 + PROBE_INTERVAL. */
9694 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9695 plus_constant (stack_pointer_rtx,
9696 - (PROBE_INTERVAL + dope))));
9697
9698 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9699 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9700 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9701 gen_rtx_PLUS (Pmode, sr.reg,
9702 stack_pointer_rtx)));
9703
9704
9705 /* Step 3: the loop
9706
9707 while (SP != LAST_ADDR)
9708 {
9709 SP = SP + PROBE_INTERVAL
9710 probe at SP
9711 }
9712
9713 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9714 values of N from 1 until it is equal to ROUNDED_SIZE. */
9715
9716 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9717
9718
9719 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9720 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9721
9722 if (size != rounded_size)
9723 {
9724 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9725 plus_constant (stack_pointer_rtx,
9726 rounded_size - size)));
9727 emit_stack_probe (stack_pointer_rtx);
9728 }
9729
9730 /* Adjust back to account for the additional first interval. */
9731 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9732 plus_constant (stack_pointer_rtx,
9733 PROBE_INTERVAL + dope)));
9734
9735 release_scratch_register_on_entry (&sr);
9736 }
9737
9738 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9739
9740 /* Even if the stack pointer isn't the CFA register, we need to correctly
9741 describe the adjustments made to it, in particular differentiate the
9742 frame-related ones from the frame-unrelated ones. */
9743 if (size > 0)
9744 {
9745 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9746 XVECEXP (expr, 0, 0)
9747 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9748 plus_constant (stack_pointer_rtx, -size));
9749 XVECEXP (expr, 0, 1)
9750 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9751 plus_constant (stack_pointer_rtx,
9752 PROBE_INTERVAL + dope + size));
9753 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9754 RTX_FRAME_RELATED_P (last) = 1;
9755
9756 cfun->machine->fs.sp_offset += size;
9757 }
9758
9759 /* Make sure nothing is scheduled before we are done. */
9760 emit_insn (gen_blockage ());
9761 }
9762
9763 /* Adjust the stack pointer up to REG while probing it. */
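/* The loop emitted below looks roughly like this (AT&T syntax, 32-bit,
   with %eax standing in for the scratch register holding LAST_ADDR and
   4096 for PROBE_INTERVAL; registers and label names vary):

	.LPSRL0:
		cmpl	%eax, %esp
		je	.LPSRE0
		subl	$4096, %esp
		orl	$0, (%esp)
		jmp	.LPSRL0
	.LPSRE0:  */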
9764
9765 const char *
9766 output_adjust_stack_and_probe (rtx reg)
9767 {
9768 static int labelno = 0;
9769 char loop_lab[32], end_lab[32];
9770 rtx xops[2];
9771
9772 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9773 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9774
9775 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9776
9777 /* Jump to END_LAB if SP == LAST_ADDR. */
9778 xops[0] = stack_pointer_rtx;
9779 xops[1] = reg;
9780 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9781 fputs ("\tje\t", asm_out_file);
9782 assemble_name_raw (asm_out_file, end_lab);
9783 fputc ('\n', asm_out_file);
9784
9785 /* SP = SP + PROBE_INTERVAL. */
9786 xops[1] = GEN_INT (PROBE_INTERVAL);
9787 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9788
9789 /* Probe at SP. */
9790 xops[1] = const0_rtx;
9791 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9792
9793 fprintf (asm_out_file, "\tjmp\t");
9794 assemble_name_raw (asm_out_file, loop_lab);
9795 fputc ('\n', asm_out_file);
9796
9797 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9798
9799 return "";
9800 }
9801
9802 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9803 inclusive. These are offsets from the current stack pointer. */
9804
9805 static void
9806 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9807 {
9808 /* See if we have a constant small number of probes to generate. If so,
9809 that's the easy case. The run-time loop is made up of 7 insns in the
9810 generic case while the compile-time loop is made up of n insns for n #
9811 of intervals. */
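/* For example, at the threshold of 7 intervals used below, the unrolled
   sequence is 7 probe insns, matching the cost of the run-time loop,
   which is why such sizes are expanded inline.  */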
9812 if (size <= 7 * PROBE_INTERVAL)
9813 {
9814 HOST_WIDE_INT i;
9815
9816 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9817 it exceeds SIZE. If only one probe is needed, this will not
9818 generate any code. Then probe at FIRST + SIZE. */
9819 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9820 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9821
9822 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9823 }
9824
9825 /* Otherwise, do the same as above, but in a loop. Note that we must be
9826 extra careful with variables wrapping around because we might be at
9827 the very top (or the very bottom) of the address space and we have
9828 to be able to handle this case properly; in particular, we use an
9829 equality test for the loop condition. */
9830 else
9831 {
9832 HOST_WIDE_INT rounded_size, last;
9833 struct scratch_reg sr;
9834
9835 get_scratch_register_on_entry (&sr);
9836
9837
9838 /* Step 1: round SIZE to the previous multiple of the interval. */
9839
9840 rounded_size = size & -PROBE_INTERVAL;
9841
9842
9843 /* Step 2: compute initial and final value of the loop counter. */
9844
9845 /* TEST_OFFSET = FIRST. */
9846 emit_move_insn (sr.reg, GEN_INT (-first));
9847
9848 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9849 last = first + rounded_size;
9850
9851
9852 /* Step 3: the loop
9853
9854 while (TEST_ADDR != LAST_ADDR)
9855 {
9856 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9857 probe at TEST_ADDR
9858 }
9859
9860 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9861 until it is equal to ROUNDED_SIZE. */
9862
9863 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9864
9865
9866 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9867 that SIZE is equal to ROUNDED_SIZE. */
9868
9869 if (size != rounded_size)
9870 emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
9871 stack_pointer_rtx,
9872 sr.reg),
9873 rounded_size - size));
9874
9875 release_scratch_register_on_entry (&sr);
9876 }
9877
9878 /* Make sure nothing is scheduled before we are done. */
9879 emit_insn (gen_blockage ());
9880 }
9881
9882 /* Probe a range of stack addresses from REG to END, inclusive. These are
9883 offsets from the current stack pointer. */
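/* The loop emitted below is similar to output_adjust_stack_and_probe
   above, except that the stack pointer itself is not moved; the scratch
   register holds a (negative) probe offset from it. Roughly, in AT&T
   syntax with %eax as the scratch register, LAST as the final offset
   operand and 4096 for PROBE_INTERVAL:

	.LPSRL1:
		cmpl	LAST, %eax
		je	.LPSRE1
		subl	$4096, %eax
		orl	$0, (%esp,%eax)
		jmp	.LPSRL1
	.LPSRE1:  */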
9884
9885 const char *
9886 output_probe_stack_range (rtx reg, rtx end)
9887 {
9888 static int labelno = 0;
9889 char loop_lab[32], end_lab[32];
9890 rtx xops[3];
9891
9892 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9893 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9894
9895 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9896
9897 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
9898 xops[0] = reg;
9899 xops[1] = end;
9900 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9901 fputs ("\tje\t", asm_out_file);
9902 assemble_name_raw (asm_out_file, end_lab);
9903 fputc ('\n', asm_out_file);
9904
9905 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
9906 xops[1] = GEN_INT (PROBE_INTERVAL);
9907 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9908
9909 /* Probe at TEST_ADDR. */
9910 xops[0] = stack_pointer_rtx;
9911 xops[1] = reg;
9912 xops[2] = const0_rtx;
9913 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9914
9915 fprintf (asm_out_file, "\tjmp\t");
9916 assemble_name_raw (asm_out_file, loop_lab);
9917 fputc ('\n', asm_out_file);
9918
9919 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9920
9921 return "";
9922 }
9923
9924 /* Finalize the stack_realign_needed flag, which guides generation of the
9925 prologue/epilogue in the correct form. */
9926 static void
9927 ix86_finalize_stack_realign_flags (void)
9928 {
9929 /* Check whether stack realignment is really needed after reload, and
9930 store the result in cfun. */
9931 unsigned int incoming_stack_boundary
9932 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9933 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9934 unsigned int stack_realign = (incoming_stack_boundary
9935 < (current_function_is_leaf
9936 ? crtl->max_used_stack_slot_alignment
9937 : crtl->stack_alignment_needed));
9938
9939 if (crtl->stack_realign_finalized)
9940 {
9941 /* After stack_realign_needed is finalized, we can no longer
9942 change it. */
9943 gcc_assert (crtl->stack_realign_needed == stack_realign);
9944 return;
9945 }
9946
9947 /* If the only reason for frame_pointer_needed is that we conservatively
9948 assumed stack realignment might be needed, but in the end nothing that
9949 needed the stack alignment had been spilled, clear frame_pointer_needed
9950 and say we don't need stack realignment. */
9951 if (stack_realign
9952 && !crtl->need_drap
9953 && frame_pointer_needed
9954 && current_function_is_leaf
9955 && flag_omit_frame_pointer
9956 && current_function_sp_is_unchanging
9957 && !ix86_current_function_calls_tls_descriptor
9958 && !crtl->accesses_prior_frames
9959 && !cfun->calls_alloca
9960 && !crtl->calls_eh_return
9961 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
9962 && !ix86_frame_pointer_required ()
9963 && get_frame_size () == 0
9964 && ix86_nsaved_sseregs () == 0
9965 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
9966 {
9967 HARD_REG_SET set_up_by_prologue, prologue_used;
9968 basic_block bb;
9969
9970 CLEAR_HARD_REG_SET (prologue_used);
9971 CLEAR_HARD_REG_SET (set_up_by_prologue);
9972 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
9973 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
9974 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
9975 HARD_FRAME_POINTER_REGNUM);
9976 FOR_EACH_BB (bb)
9977 {
9978 rtx insn;
9979 FOR_BB_INSNS (bb, insn)
9980 if (NONDEBUG_INSN_P (insn)
9981 && requires_stack_frame_p (insn, prologue_used,
9982 set_up_by_prologue))
9983 {
9984 crtl->stack_realign_needed = stack_realign;
9985 crtl->stack_realign_finalized = true;
9986 return;
9987 }
9988 }
9989
9990 frame_pointer_needed = false;
9991 stack_realign = false;
9992 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
9993 crtl->stack_alignment_needed = incoming_stack_boundary;
9994 crtl->stack_alignment_estimated = incoming_stack_boundary;
9995 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
9996 crtl->preferred_stack_boundary = incoming_stack_boundary;
9997 df_finish_pass (true);
9998 df_scan_alloc (NULL);
9999 df_scan_blocks ();
10000 df_compute_regs_ever_live (true);
10001 df_analyze ();
10002 }
10003
10004 crtl->stack_realign_needed = stack_realign;
10005 crtl->stack_realign_finalized = true;
10006 }
10007
10008 /* Expand the prologue into a bunch of separate insns. */
10009
10010 void
10011 ix86_expand_prologue (void)
10012 {
10013 struct machine_function *m = cfun->machine;
10014 rtx insn, t;
10015 bool pic_reg_used;
10016 struct ix86_frame frame;
10017 HOST_WIDE_INT allocate;
10018 bool int_registers_saved;
10019
10020 ix86_finalize_stack_realign_flags ();
10021
10022 /* DRAP should not coexist with stack_realign_fp */
10023 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10024
10025 memset (&m->fs, 0, sizeof (m->fs));
10026
10027 /* Initialize CFA state for before the prologue. */
10028 m->fs.cfa_reg = stack_pointer_rtx;
10029 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10030
10031 /* Track the SP offset to the CFA. We continue tracking this after we've
10032 swapped the CFA register away from SP. In the case of re-alignment
10033 this is fudged; we're interested in offsets within the local frame. */
10034 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10035 m->fs.sp_valid = true;
10036
10037 ix86_compute_frame_layout (&frame);
10038
10039 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10040 {
10041 /* We should have already generated an error for any use of
10042 ms_hook on a nested function. */
10043 gcc_checking_assert (!ix86_static_chain_on_stack);
10044
10045 /* Check whether profiling is active and whether we shall use the
10046 profile-before-prologue variant. If so, sorry. */
10047 if (crtl->profile && flag_fentry != 0)
10048 sorry ("ms_hook_prologue attribute isn%'t compatible "
10049 "with -mfentry for 32-bit");
10050
10051 /* In ix86_asm_output_function_label we emitted:
10052 8b ff movl.s %edi,%edi
10053 55 push %ebp
10054 8b ec movl.s %esp,%ebp
10055
10056 This matches the hookable function prologue in Win32 API
10057 functions in Microsoft Windows XP Service Pack 2 and newer.
10058 Wine uses this to enable Windows apps to hook the Win32 API
10059 functions provided by Wine.
10060
10061 What that means is that we've already set up the frame pointer. */
10062
10063 if (frame_pointer_needed
10064 && !(crtl->drap_reg && crtl->stack_realign_needed))
10065 {
10066 rtx push, mov;
10067
10068 /* We've decided to use the frame pointer already set up.
10069 Describe this to the unwinder by pretending that both
10070 push and mov insns happen right here.
10071
10072 Putting the unwind info here at the end of the ms_hook
10073 is done so that we can make absolutely certain we get
10074 the required byte sequence at the start of the function,
10075 rather than relying on an assembler that can produce
10076 the exact encoding required.
10077
10078 However it does mean (in the unpatched case) that we have
10079 a 1 insn window where the asynchronous unwind info is
10080 incorrect. However, if we placed the unwind info at
10081 its correct location we would have incorrect unwind info
10082 in the patched case. Which is probably all moot since
10083 I don't expect Wine generates dwarf2 unwind info for the
10084 system libraries that use this feature. */
10085
10086 insn = emit_insn (gen_blockage ());
10087
10088 push = gen_push (hard_frame_pointer_rtx);
10089 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10090 stack_pointer_rtx);
10091 RTX_FRAME_RELATED_P (push) = 1;
10092 RTX_FRAME_RELATED_P (mov) = 1;
10093
10094 RTX_FRAME_RELATED_P (insn) = 1;
10095 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10096 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10097
10098 /* Note that gen_push incremented m->fs.cfa_offset, even
10099 though we didn't emit the push insn here. */
10100 m->fs.cfa_reg = hard_frame_pointer_rtx;
10101 m->fs.fp_offset = m->fs.cfa_offset;
10102 m->fs.fp_valid = true;
10103 }
10104 else
10105 {
10106 /* The frame pointer is not needed so pop %ebp again.
10107 This leaves us with a pristine state. */
10108 emit_insn (gen_pop (hard_frame_pointer_rtx));
10109 }
10110 }
10111
10112 /* The first insn of a function that accepts its static chain on the
10113 stack is to push the register that would be filled in by a direct
10114 call. This insn will be skipped by the trampoline. */
10115 else if (ix86_static_chain_on_stack)
10116 {
10117 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10118 emit_insn (gen_blockage ());
10119
10120 /* We don't want to interpret this push insn as a register save,
10121 only as a stack adjustment. The real copy of the register as
10122 a save will be done later, if needed. */
10123 t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
10124 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10125 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10126 RTX_FRAME_RELATED_P (insn) = 1;
10127 }
10128
10129 /* Emit prologue code to adjust the stack alignment and set up DRAP, in
10130 case DRAP is needed and stack realignment is really needed after reload. */
10131 if (stack_realign_drap)
10132 {
10133 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10134
10135 /* Only need to push parameter pointer reg if it is caller saved. */
10136 if (!call_used_regs[REGNO (crtl->drap_reg)])
10137 {
10138 /* Push arg pointer reg */
10139 insn = emit_insn (gen_push (crtl->drap_reg));
10140 RTX_FRAME_RELATED_P (insn) = 1;
10141 }
10142
10143 /* Grab the argument pointer. */
10144 t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
10145 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10146 RTX_FRAME_RELATED_P (insn) = 1;
10147 m->fs.cfa_reg = crtl->drap_reg;
10148 m->fs.cfa_offset = 0;
10149
10150 /* Align the stack. */
10151 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10152 stack_pointer_rtx,
10153 GEN_INT (-align_bytes)));
10154 RTX_FRAME_RELATED_P (insn) = 1;
10155
10156 /* Replicate the return address on the stack so that return
10157 address can be reached via (argp - 1) slot. This is needed
10158 to implement macro RETURN_ADDR_RTX and intrinsic function
10159 expand_builtin_return_addr etc. */
10160 t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
10161 t = gen_frame_mem (Pmode, t);
10162 insn = emit_insn (gen_push (t));
10163 RTX_FRAME_RELATED_P (insn) = 1;
10164
10165 /* For the purposes of frame and register save area addressing,
10166 we've started over with a new frame. */
10167 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10168 m->fs.realigned = true;
10169 }
10170
10171 if (frame_pointer_needed && !m->fs.fp_valid)
10172 {
10173 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10174 slower on all targets. Also sdb doesn't like it. */
10175 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10176 RTX_FRAME_RELATED_P (insn) = 1;
10177
10178 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10179 {
10180 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10181 RTX_FRAME_RELATED_P (insn) = 1;
10182
10183 if (m->fs.cfa_reg == stack_pointer_rtx)
10184 m->fs.cfa_reg = hard_frame_pointer_rtx;
10185 m->fs.fp_offset = m->fs.sp_offset;
10186 m->fs.fp_valid = true;
10187 }
10188 }
10189
10190 int_registers_saved = (frame.nregs == 0);
10191
10192 if (!int_registers_saved)
10193 {
10194 /* If saving registers via PUSH, do so now. */
10195 if (!frame.save_regs_using_mov)
10196 {
10197 ix86_emit_save_regs ();
10198 int_registers_saved = true;
10199 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10200 }
10201
10202 /* When using the red zone we may start register saving before allocating
10203 the stack frame, saving one cycle of the prologue. However, avoid
10204 doing this if we have to probe the stack; at least on x86_64 the
10205 stack probe can turn into a call that clobbers a red zone location. */
10206 else if (ix86_using_red_zone ()
10207 && (! TARGET_STACK_PROBE
10208 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10209 {
10210 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10211 int_registers_saved = true;
10212 }
10213 }
10214
10215 if (stack_realign_fp)
10216 {
10217 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10218 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10219
10220 /* The computation of the size of the re-aligned stack frame means
10221 that we must allocate the size of the register save area before
10222 performing the actual alignment. Otherwise we cannot guarantee
10223 that there's enough storage above the realignment point. */
10224 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10225 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10226 GEN_INT (m->fs.sp_offset
10227 - frame.sse_reg_save_offset),
10228 -1, false);
10229
10230 /* Align the stack. */
10231 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10232 stack_pointer_rtx,
10233 GEN_INT (-align_bytes)));
10234
10235 /* For the purposes of register save area addressing, the stack
10236 pointer is no longer valid. As for the value of sp_offset,
10237 see ix86_compute_frame_layout, which we need to match in order
10238 to pass verification of stack_pointer_offset at the end. */
10239 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10240 m->fs.sp_valid = false;
10241 }
10242
10243 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10244
10245 if (flag_stack_usage_info)
10246 {
10247 /* We start to count from ARG_POINTER. */
10248 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10249
10250 /* If it was realigned, take into account the fake frame. */
10251 if (stack_realign_drap)
10252 {
10253 if (ix86_static_chain_on_stack)
10254 stack_size += UNITS_PER_WORD;
10255
10256 if (!call_used_regs[REGNO (crtl->drap_reg)])
10257 stack_size += UNITS_PER_WORD;
10258
10259 /* This over-estimates by 1 minimal-stack-alignment-unit but
10260 mitigates that by counting in the new return address slot. */
10261 current_function_dynamic_stack_size
10262 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10263 }
10264
10265 current_function_static_stack_size = stack_size;
10266 }
10267
10268 /* The stack has already been decremented by the instruction calling us
10269 so probe if the size is non-negative to preserve the protection area. */
10270 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10271 {
10272 /* We expect the registers to be saved when probes are used. */
10273 gcc_assert (int_registers_saved);
10274
10275 if (STACK_CHECK_MOVING_SP)
10276 {
10277 ix86_adjust_stack_and_probe (allocate);
10278 allocate = 0;
10279 }
10280 else
10281 {
10282 HOST_WIDE_INT size = allocate;
10283
10284 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10285 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10286
10287 if (TARGET_STACK_PROBE)
10288 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10289 else
10290 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10291 }
10292 }
10293
10294 if (allocate == 0)
10295 ;
10296 else if (!ix86_target_stack_probe ()
10297 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10298 {
10299 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10300 GEN_INT (-allocate), -1,
10301 m->fs.cfa_reg == stack_pointer_rtx);
10302 }
10303 else
10304 {
10305 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10306 rtx r10 = NULL;
10307 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10308
10309 bool eax_live = false;
10310 bool r10_live = false;
10311
10312 if (TARGET_64BIT)
10313 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10314 if (!TARGET_64BIT_MS_ABI)
10315 eax_live = ix86_eax_live_at_start_p ();
10316
10317 if (eax_live)
10318 {
10319 emit_insn (gen_push (eax));
10320 allocate -= UNITS_PER_WORD;
10321 }
10322 if (r10_live)
10323 {
10324 r10 = gen_rtx_REG (Pmode, R10_REG);
10325 emit_insn (gen_push (r10));
10326 allocate -= UNITS_PER_WORD;
10327 }
10328
10329 emit_move_insn (eax, GEN_INT (allocate));
10330 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10331
10332 /* Use the fact that AX still contains ALLOCATE. */
10333 adjust_stack_insn = (Pmode == DImode
10334 ? gen_pro_epilogue_adjust_stack_di_sub
10335 : gen_pro_epilogue_adjust_stack_si_sub);
10336
10337 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10338 stack_pointer_rtx, eax));
10339
10340 /* Note that SEH directives need to continue tracking the stack
10341 pointer even after the frame pointer has been set up. */
10342 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10343 {
10344 if (m->fs.cfa_reg == stack_pointer_rtx)
10345 m->fs.cfa_offset += allocate;
10346
10347 RTX_FRAME_RELATED_P (insn) = 1;
10348 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10349 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10350 plus_constant (stack_pointer_rtx,
10351 -allocate)));
10352 }
10353 m->fs.sp_offset += allocate;
10354
10355 if (r10_live && eax_live)
10356 {
10357 t = choose_baseaddr (m->fs.sp_offset - allocate);
10358 emit_move_insn (r10, gen_frame_mem (Pmode, t));
10359 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10360 emit_move_insn (eax, gen_frame_mem (Pmode, t));
10361 }
10362 else if (eax_live || r10_live)
10363 {
10364 t = choose_baseaddr (m->fs.sp_offset - allocate);
10365 emit_move_insn ((eax_live ? eax : r10), gen_frame_mem (Pmode, t));
10366 }
10367 }
10368 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10369
10370 /* If we haven't already set up the frame pointer, do so now. */
10371 if (frame_pointer_needed && !m->fs.fp_valid)
10372 {
10373 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10374 GEN_INT (frame.stack_pointer_offset
10375 - frame.hard_frame_pointer_offset));
10376 insn = emit_insn (insn);
10377 RTX_FRAME_RELATED_P (insn) = 1;
10378 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10379
10380 if (m->fs.cfa_reg == stack_pointer_rtx)
10381 m->fs.cfa_reg = hard_frame_pointer_rtx;
10382 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10383 m->fs.fp_valid = true;
10384 }
10385
10386 if (!int_registers_saved)
10387 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10388 if (frame.nsseregs)
10389 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10390
10391 pic_reg_used = false;
10392 if (pic_offset_table_rtx
10393 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10394 || crtl->profile))
10395 {
10396 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10397
10398 if (alt_pic_reg_used != INVALID_REGNUM)
10399 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10400
10401 pic_reg_used = true;
10402 }
10403
10404 if (pic_reg_used)
10405 {
10406 if (TARGET_64BIT)
10407 {
10408 if (ix86_cmodel == CM_LARGE_PIC)
10409 {
10410 rtx tmp_reg = gen_rtx_REG (DImode, R11_REG);
10411 rtx label = gen_label_rtx ();
10412 emit_label (label);
10413 LABEL_PRESERVE_P (label) = 1;
10414 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10415 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label));
10416 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10417 insn = emit_insn (gen_adddi3 (pic_offset_table_rtx,
10418 pic_offset_table_rtx, tmp_reg));
10419 }
10420 else
10421 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10422 }
10423 else
10424 {
10425 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10426 RTX_FRAME_RELATED_P (insn) = 1;
10427 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10428 }
10429 }
10430
10431 /* In the pic_reg_used case, make sure that the got load isn't deleted
10432 when mcount needs it. A blockage to avoid call movement across the
10433 mcount call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10434 note. */
10435 if (crtl->profile && !flag_fentry && pic_reg_used)
10436 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10437
10438 if (crtl->drap_reg && !crtl->stack_realign_needed)
10439 {
10440 /* vDRAP is set up, but after reload it turns out that stack
10441 realignment isn't necessary; here we emit the prologue to set up
10442 DRAP without the stack realignment adjustment. */
10443 t = choose_baseaddr (0);
10444 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10445 }
10446
10447 /* Prevent instructions from being scheduled into the register save push
10448 sequence when access to the red-zone area is done through the frame
10449 pointer. The offset between the frame pointer and the stack pointer is
10450 calculated relative to the value of the stack pointer at the end of the
10451 function prologue, and moving instructions that access the red-zone area
10452 via the frame pointer inside the push sequence violates this assumption. */
10453 if (frame_pointer_needed && frame.red_zone_size)
10454 emit_insn (gen_memory_blockage ());
10455
10456 /* Emit cld instruction if stringops are used in the function. */
10457 if (TARGET_CLD && ix86_current_function_needs_cld)
10458 emit_insn (gen_cld ());
10459
10460 /* SEH requires that the prologue end within 256 bytes of the start of
10461 the function. Prevent instruction schedules that would extend that.
10462 Further, prevent alloca modifications to the stack pointer from being
10463 combined with prologue modifications. */
10464 if (TARGET_SEH)
10465 emit_insn (gen_prologue_use (stack_pointer_rtx));
10466 }
10467
10468 /* Emit code to restore REG using a POP insn. */
10469
10470 static void
10471 ix86_emit_restore_reg_using_pop (rtx reg)
10472 {
10473 struct machine_function *m = cfun->machine;
10474 rtx insn = emit_insn (gen_pop (reg));
10475
10476 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10477 m->fs.sp_offset -= UNITS_PER_WORD;
10478
10479 if (m->fs.cfa_reg == crtl->drap_reg
10480 && REGNO (reg) == REGNO (crtl->drap_reg))
10481 {
10482 /* Previously we'd represented the CFA as an expression
10483 like *(%ebp - 8). We've just popped that value from
10484 the stack, which means we need to reset the CFA to
10485 the drap register. This will remain until we restore
10486 the stack pointer. */
10487 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10488 RTX_FRAME_RELATED_P (insn) = 1;
10489
10490 /* This means that the DRAP register is valid for addressing too. */
10491 m->fs.drap_valid = true;
10492 return;
10493 }
10494
10495 if (m->fs.cfa_reg == stack_pointer_rtx)
10496 {
10497 rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10498 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10499 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10500 RTX_FRAME_RELATED_P (insn) = 1;
10501
10502 m->fs.cfa_offset -= UNITS_PER_WORD;
10503 }
10504
10505 /* When the frame pointer is the CFA, and we pop it, we are
10506 swapping back to the stack pointer as the CFA. This happens
10507 for stack frames that don't allocate other data, so we assume
10508 the stack pointer is now pointing at the return address, i.e.
10509 the function entry state, which makes the offset 1 word. */
10510 if (reg == hard_frame_pointer_rtx)
10511 {
10512 m->fs.fp_valid = false;
10513 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10514 {
10515 m->fs.cfa_reg = stack_pointer_rtx;
10516 m->fs.cfa_offset -= UNITS_PER_WORD;
10517
10518 add_reg_note (insn, REG_CFA_DEF_CFA,
10519 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10520 GEN_INT (m->fs.cfa_offset)));
10521 RTX_FRAME_RELATED_P (insn) = 1;
10522 }
10523 }
10524 }
10525
10526 /* Emit code to restore saved registers using POP insns. */
10527
10528 static void
10529 ix86_emit_restore_regs_using_pop (void)
10530 {
10531 unsigned int regno;
10532
10533 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10534 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10535 ix86_emit_restore_reg_using_pop (gen_rtx_REG (Pmode, regno));
10536 }
10537
10538 /* Emit code and notes for the LEAVE instruction. */
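/* Recall that leave is equivalent to
	mov	%ebp, %esp
	pop	%ebp
   (or the 64-bit equivalents), so the new stack pointer ends up one word
   above the saved frame-pointer slot; this is why sp_offset below becomes
   fp_offset minus UNITS_PER_WORD.  */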
10539
10540 static void
10541 ix86_emit_leave (void)
10542 {
10543 struct machine_function *m = cfun->machine;
10544 rtx insn = emit_insn (ix86_gen_leave ());
10545
10546 ix86_add_queued_cfa_restore_notes (insn);
10547
10548 gcc_assert (m->fs.fp_valid);
10549 m->fs.sp_valid = true;
10550 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10551 m->fs.fp_valid = false;
10552
10553 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10554 {
10555 m->fs.cfa_reg = stack_pointer_rtx;
10556 m->fs.cfa_offset = m->fs.sp_offset;
10557
10558 add_reg_note (insn, REG_CFA_DEF_CFA,
10559 plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10560 RTX_FRAME_RELATED_P (insn) = 1;
10561 }
10562 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10563 m->fs.fp_offset);
10564 }
10565
10566 /* Emit code to restore saved registers using MOV insns.
10567 First register is restored from CFA - CFA_OFFSET. */
10568 static void
10569 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10570 bool maybe_eh_return)
10571 {
10572 struct machine_function *m = cfun->machine;
10573 unsigned int regno;
10574
10575 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10576 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10577 {
10578 rtx reg = gen_rtx_REG (Pmode, regno);
10579 rtx insn, mem;
10580
10581 mem = choose_baseaddr (cfa_offset);
10582 mem = gen_frame_mem (Pmode, mem);
10583 insn = emit_move_insn (reg, mem);
10584
10585 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10586 {
10587 /* Previously we'd represented the CFA as an expression
10588 like *(%ebp - 8). We've just popped that value from
10589 the stack, which means we need to reset the CFA to
10590 the drap register. This will remain until we restore
10591 the stack pointer. */
10592 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10593 RTX_FRAME_RELATED_P (insn) = 1;
10594
10595 /* This means that the DRAP register is valid for addressing. */
10596 m->fs.drap_valid = true;
10597 }
10598 else
10599 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10600
10601 cfa_offset -= UNITS_PER_WORD;
10602 }
10603 }
10604
10605 /* Emit code to restore saved SSE registers using MOV insns.
10606 First register is restored from CFA - CFA_OFFSET. */
10607 static void
10608 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10609 bool maybe_eh_return)
10610 {
10611 unsigned int regno;
10612
10613 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10614 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10615 {
10616 rtx reg = gen_rtx_REG (V4SFmode, regno);
10617 rtx mem;
10618
10619 mem = choose_baseaddr (cfa_offset);
10620 mem = gen_rtx_MEM (V4SFmode, mem);
10621 set_mem_align (mem, 128);
10622 emit_move_insn (reg, mem);
10623
10624 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10625
10626 cfa_offset -= 16;
10627 }
10628 }
10629
10630 /* Emit vzeroupper if needed. */
10631
10632 void
10633 ix86_maybe_emit_epilogue_vzeroupper (void)
10634 {
10635 if (TARGET_VZEROUPPER
10636 && !TREE_THIS_VOLATILE (cfun->decl)
10637 && !cfun->machine->caller_return_avx256_p)
10638 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10639 }
10640
10641 /* Restore function stack, frame, and registers. */
10642
10643 void
10644 ix86_expand_epilogue (int style)
10645 {
10646 struct machine_function *m = cfun->machine;
10647 struct machine_frame_state frame_state_save = m->fs;
10648 struct ix86_frame frame;
10649 bool restore_regs_via_mov;
10650 bool using_drap;
10651
10652 ix86_finalize_stack_realign_flags ();
10653 ix86_compute_frame_layout (&frame);
10654
10655 m->fs.sp_valid = (!frame_pointer_needed
10656 || (current_function_sp_is_unchanging
10657 && !stack_realign_fp));
10658 gcc_assert (!m->fs.sp_valid
10659 || m->fs.sp_offset == frame.stack_pointer_offset);
10660
10661 /* The FP must be valid if the frame pointer is present. */
10662 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10663 gcc_assert (!m->fs.fp_valid
10664 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10665
10666 /* We must have *some* valid pointer to the stack frame. */
10667 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10668
10669 /* The DRAP is never valid at this point. */
10670 gcc_assert (!m->fs.drap_valid);
10671
10672 /* See the comment about red zone and frame
10673 pointer usage in ix86_expand_prologue. */
10674 if (frame_pointer_needed && frame.red_zone_size)
10675 emit_insn (gen_memory_blockage ());
10676
10677 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10678 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10679
10680 /* Determine the CFA offset of the end of the red-zone. */
10681 m->fs.red_zone_offset = 0;
10682 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10683 {
10684 /* The red-zone begins below the return address. */
10685 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10686
10687 /* When the register save area is in the aligned portion of
10688 the stack, determine the maximum runtime displacement that
10689 matches up with the aligned frame. */
10690 if (stack_realign_drap)
10691 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10692 + UNITS_PER_WORD);
10693 }
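  /* For example, on a 64-bit target without DRAP realignment this
     typically yields red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD
     = 128 + 8 = 136, i.e. the red zone ends 136 bytes below the CFA.  */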
10694
10695 /* Special care must be taken for the normal return case of a function
10696 using eh_return: the eax and edx registers are marked as saved, but
10697 not restored along this path. Adjust the save location to match. */
10698 if (crtl->calls_eh_return && style != 2)
10699 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10700
10701 /* EH_RETURN requires the use of moves to function properly. */
10702 if (crtl->calls_eh_return)
10703 restore_regs_via_mov = true;
10704 /* SEH requires the use of pops to identify the epilogue. */
10705 else if (TARGET_SEH)
10706 restore_regs_via_mov = false;
10707 /* If we're only restoring one register and sp is not valid, then
10708 use a move instruction to restore the register, since it's
10709 less work than reloading sp and popping the register. */
10710 else if (!m->fs.sp_valid && frame.nregs <= 1)
10711 restore_regs_via_mov = true;
10712 else if (TARGET_EPILOGUE_USING_MOVE
10713 && cfun->machine->use_fast_prologue_epilogue
10714 && (frame.nregs > 1
10715 || m->fs.sp_offset != frame.reg_save_offset))
10716 restore_regs_via_mov = true;
10717 else if (frame_pointer_needed
10718 && !frame.nregs
10719 && m->fs.sp_offset != frame.reg_save_offset)
10720 restore_regs_via_mov = true;
10721 else if (frame_pointer_needed
10722 && TARGET_USE_LEAVE
10723 && cfun->machine->use_fast_prologue_epilogue
10724 && frame.nregs == 1)
10725 restore_regs_via_mov = true;
10726 else
10727 restore_regs_via_mov = false;
10728
10729 if (restore_regs_via_mov || frame.nsseregs)
10730 {
10731 /* Ensure that the entire register save area is addressable via
10732 the stack pointer, if we will restore via sp. */
10733 if (TARGET_64BIT
10734 && m->fs.sp_offset > 0x7fffffff
10735 && !(m->fs.fp_valid || m->fs.drap_valid)
10736 && (frame.nsseregs + frame.nregs) != 0)
10737 {
10738 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10739 GEN_INT (m->fs.sp_offset
10740 - frame.sse_reg_save_offset),
10741 style,
10742 m->fs.cfa_reg == stack_pointer_rtx);
10743 }
10744 }
10745
10746 /* If there are any SSE registers to restore, then we have to do it
10747 via moves, since there's obviously no pop for SSE regs. */
10748 if (frame.nsseregs)
10749 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10750 style == 2);
10751
10752 if (restore_regs_via_mov)
10753 {
10754 rtx t;
10755
10756 if (frame.nregs)
10757 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10758
10759 /* eh_return epilogues need %ecx added to the stack pointer. */
10760 if (style == 2)
10761 {
10762 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10763
10764 /* Stack align doesn't work with eh_return. */
10765 gcc_assert (!stack_realign_drap);
10766 /* Neither do regparm nested functions. */
10767 gcc_assert (!ix86_static_chain_on_stack);
10768
10769 if (frame_pointer_needed)
10770 {
10771 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10772 t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10773 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10774
10775 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10776 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10777
10778 /* Note that we use SA as a temporary CFA, as the return
10779 address is at the proper place relative to it. We
10780 pretend this happens at the FP restore insn because
10781 prior to this insn the FP would be stored at the wrong
10782 offset relative to SA, and after this insn we have no
10783 other reasonable register to use for the CFA. We don't
10784 bother resetting the CFA to the SP for the duration of
10785 the return insn. */
10786 add_reg_note (insn, REG_CFA_DEF_CFA,
10787 plus_constant (sa, UNITS_PER_WORD));
10788 ix86_add_queued_cfa_restore_notes (insn);
10789 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10790 RTX_FRAME_RELATED_P (insn) = 1;
10791
10792 m->fs.cfa_reg = sa;
10793 m->fs.cfa_offset = UNITS_PER_WORD;
10794 m->fs.fp_valid = false;
10795
10796 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10797 const0_rtx, style, false);
10798 }
10799 else
10800 {
10801 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10802 t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10803 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10804 ix86_add_queued_cfa_restore_notes (insn);
10805
10806 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10807 if (m->fs.cfa_offset != UNITS_PER_WORD)
10808 {
10809 m->fs.cfa_offset = UNITS_PER_WORD;
10810 add_reg_note (insn, REG_CFA_DEF_CFA,
10811 plus_constant (stack_pointer_rtx,
10812 UNITS_PER_WORD));
10813 RTX_FRAME_RELATED_P (insn) = 1;
10814 }
10815 }
10816 m->fs.sp_offset = UNITS_PER_WORD;
10817 m->fs.sp_valid = true;
10818 }
10819 }
10820 else
10821 {
10822 /* SEH requires that the function end with (1) a stack adjustment
10823 if necessary, (2) a sequence of pops, and (3) a return or
10824 jump instruction. Prevent insns from the function body from
10825 being scheduled into this sequence. */
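      /* A conforming SEH epilogue therefore looks roughly like
	     addq	$FRAME, %rsp
	     popq	%rsi
	     popq	%rdi
	     ret
	 with nothing from the function body scheduled in between.  */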
10826 if (TARGET_SEH)
10827 {
10828 /* Prevent a catch region from being adjacent to the standard
10829 epilogue sequence. Unfortunately neither crtl->uses_eh_lsda nor
10830 several other flags that would be interesting to test are
10831 set up yet. */
10832 if (flag_non_call_exceptions)
10833 emit_insn (gen_nops (const1_rtx));
10834 else
10835 emit_insn (gen_blockage ());
10836 }
10837
10838 /* First step is to deallocate the stack frame so that we can
10839 pop the registers. */
10840 if (!m->fs.sp_valid)
10841 {
10842 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10843 GEN_INT (m->fs.fp_offset
10844 - frame.reg_save_offset),
10845 style, false);
10846 }
10847 else if (m->fs.sp_offset != frame.reg_save_offset)
10848 {
10849 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10850 GEN_INT (m->fs.sp_offset
10851 - frame.reg_save_offset),
10852 style,
10853 m->fs.cfa_reg == stack_pointer_rtx);
10854 }
10855
10856 ix86_emit_restore_regs_using_pop ();
10857 }
10858
10859 /* If we used a frame pointer and haven't already got rid of it,
10860 then do so now. */
10861 if (m->fs.fp_valid)
10862 {
10863 /* If the stack pointer is valid and pointing at the frame
10864 pointer store address, then we only need a pop. */
10865 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10866 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10867 /* Leave results in shorter dependency chains on CPUs that are
10868 able to grok it fast. */
10869 else if (TARGET_USE_LEAVE
10870 || optimize_function_for_size_p (cfun)
10871 || !cfun->machine->use_fast_prologue_epilogue)
10872 ix86_emit_leave ();
10873 else
10874 {
10875 pro_epilogue_adjust_stack (stack_pointer_rtx,
10876 hard_frame_pointer_rtx,
10877 const0_rtx, style, !using_drap);
10878 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10879 }
10880 }
10881
10882 if (using_drap)
10883 {
10884 int param_ptr_offset = UNITS_PER_WORD;
10885 rtx insn;
10886
10887 gcc_assert (stack_realign_drap);
10888
10889 if (ix86_static_chain_on_stack)
10890 param_ptr_offset += UNITS_PER_WORD;
10891 if (!call_used_regs[REGNO (crtl->drap_reg)])
10892 param_ptr_offset += UNITS_PER_WORD;
10893
10894 insn = emit_insn (gen_rtx_SET
10895 (VOIDmode, stack_pointer_rtx,
10896 gen_rtx_PLUS (Pmode,
10897 crtl->drap_reg,
10898 GEN_INT (-param_ptr_offset))));
10899 m->fs.cfa_reg = stack_pointer_rtx;
10900 m->fs.cfa_offset = param_ptr_offset;
10901 m->fs.sp_offset = param_ptr_offset;
10902 m->fs.realigned = false;
10903
10904 add_reg_note (insn, REG_CFA_DEF_CFA,
10905 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10906 GEN_INT (param_ptr_offset)));
10907 RTX_FRAME_RELATED_P (insn) = 1;
10908
10909 if (!call_used_regs[REGNO (crtl->drap_reg)])
10910 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10911 }
10912
10913 /* At this point the stack pointer must be valid, and we must have
10914 restored all of the registers. We may not have deallocated the
10915 entire stack frame. We've delayed this until now because it may
10916 be possible to merge the local stack deallocation with the
10917 deallocation forced by ix86_static_chain_on_stack. */
10918 gcc_assert (m->fs.sp_valid);
10919 gcc_assert (!m->fs.fp_valid);
10920 gcc_assert (!m->fs.realigned);
10921 if (m->fs.sp_offset != UNITS_PER_WORD)
10922 {
10923 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10924 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10925 style, true);
10926 }
10927 else
10928 ix86_add_queued_cfa_restore_notes (get_last_insn ());
10929
10930 /* Sibcall epilogues don't want a return instruction. */
10931 if (style == 0)
10932 {
10933 m->fs = frame_state_save;
10934 return;
10935 }
10936
10937 /* Emit vzeroupper if needed. */
10938 ix86_maybe_emit_epilogue_vzeroupper ();
10939
10940 if (crtl->args.pops_args && crtl->args.size)
10941 {
10942 rtx popc = GEN_INT (crtl->args.pops_args);
10943
10944 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
10945 address, do an explicit add, and jump indirectly to the caller. */
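      /* Roughly, instead of a single "ret $N" (whose immediate is only
	 16 bits wide), the emitted epilogue becomes
	     popl	%ecx
	     addl	$N, %esp
	     jmp	*%ecx
	 as implemented just below.  */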
10946
10947 if (crtl->args.pops_args >= 65536)
10948 {
10949 rtx ecx = gen_rtx_REG (SImode, CX_REG);
10950 rtx insn;
10951
10952 /* There is no "pascal" calling convention in any 64bit ABI. */
10953 gcc_assert (!TARGET_64BIT);
10954
10955 insn = emit_insn (gen_pop (ecx));
10956 m->fs.cfa_offset -= UNITS_PER_WORD;
10957 m->fs.sp_offset -= UNITS_PER_WORD;
10958
10959 add_reg_note (insn, REG_CFA_ADJUST_CFA,
10960 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
10961 add_reg_note (insn, REG_CFA_REGISTER,
10962 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
10963 RTX_FRAME_RELATED_P (insn) = 1;
10964
10965 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10966 popc, -1, true);
10967 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
10968 }
10969 else
10970 emit_jump_insn (gen_simple_return_pop_internal (popc));
10971 }
10972 else
10973 emit_jump_insn (gen_simple_return_internal ());
10974
10975 /* Restore the state back to the state from the prologue,
10976 so that it's correct for the next epilogue. */
10977 m->fs = frame_state_save;
10978 }
10979
10980 /* Reset from the function's potential modifications. */
10981
10982 static void
10983 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
10984 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
10985 {
10986 if (pic_offset_table_rtx)
10987 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
10988 #if TARGET_MACHO
10989 /* Mach-O doesn't support labels at the end of objects, so if
10990 it looks like we might want one, insert a NOP. */
10991 {
10992 rtx insn = get_last_insn ();
10993 rtx deleted_debug_label = NULL_RTX;
10994 while (insn
10995 && NOTE_P (insn)
10996 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
10997 {
10998 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
10999 notes only, instead set their CODE_LABEL_NUMBER to -1,
11000 otherwise there would be code generation differences
11001 in between -g and -g0. */
11002 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11003 deleted_debug_label = insn;
11004 insn = PREV_INSN (insn);
11005 }
11006 if (insn
11007 && (LABEL_P (insn)
11008 || (NOTE_P (insn)
11009 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11010 fputs ("\tnop\n", file);
11011 else if (deleted_debug_label)
11012 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11013 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11014 CODE_LABEL_NUMBER (insn) = -1;
11015 }
11016 #endif
11017
11018 }
11019
11020 /* Return a scratch register to use in the split stack prologue. The
11021 split stack prologue is used for -fsplit-stack. It consists of the first
11022 instructions in the function, emitted even before the regular prologue.
11023 The scratch register can be any caller-saved register which is not
11024 used for parameters or for the static chain. */
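/* Roughly, that works out below to %r11 in 64-bit mode; in 32-bit mode,
   %eax for fastcall functions, otherwise %ecx, falling back to %edx when
   the function has a static chain and few enough register parameters
   are in use.  */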
11025
11026 static unsigned int
11027 split_stack_prologue_scratch_regno (void)
11028 {
11029 if (TARGET_64BIT)
11030 return R11_REG;
11031 else
11032 {
11033 bool is_fastcall;
11034 int regparm;
11035
11036 is_fastcall = (lookup_attribute ("fastcall",
11037 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11038 != NULL);
11039 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11040
11041 if (is_fastcall)
11042 {
11043 if (DECL_STATIC_CHAIN (cfun->decl))
11044 {
11045 sorry ("-fsplit-stack does not support fastcall with "
11046 "nested function");
11047 return INVALID_REGNUM;
11048 }
11049 return AX_REG;
11050 }
11051 else if (regparm < 3)
11052 {
11053 if (!DECL_STATIC_CHAIN (cfun->decl))
11054 return CX_REG;
11055 else
11056 {
11057 if (regparm >= 2)
11058 {
11059 sorry ("-fsplit-stack does not support 2 register "
11060 "parameters for a nested function");
11061 return INVALID_REGNUM;
11062 }
11063 return DX_REG;
11064 }
11065 }
11066 else
11067 {
11068 /* FIXME: We could make this work by pushing a register
11069 around the addition and comparison. */
11070 sorry ("-fsplit-stack does not support 3 register parameters");
11071 return INVALID_REGNUM;
11072 }
11073 }
11074 }
11075
11076 /* A SYMBOL_REF for the function which allocates new stack space for
11077 -fsplit-stack. */
11078
11079 static GTY(()) rtx split_stack_fn;
11080
11081 /* A SYMBOL_REF for the more stack function when using the large
11082 model. */
11083
11084 static GTY(()) rtx split_stack_fn_large;
11085
11086 /* Handle -fsplit-stack. These are the first instructions in the
11087 function, even before the regular prologue. */
11088
11089 void
11090 ix86_expand_split_stack_prologue (void)
11091 {
11092 struct ix86_frame frame;
11093 HOST_WIDE_INT allocate;
11094 unsigned HOST_WIDE_INT args_size;
11095 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11096 rtx scratch_reg = NULL_RTX;
11097 rtx varargs_label = NULL_RTX;
11098 rtx fn;
11099
11100 gcc_assert (flag_split_stack && reload_completed);
11101
11102 ix86_finalize_stack_realign_flags ();
11103 ix86_compute_frame_layout (&frame);
11104 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11105
11106 /* This is the label we will branch to if we have enough stack
11107 space. We expect the basic block reordering pass to reverse this
11108 branch if optimizing, so that we branch in the unlikely case. */
11109 label = gen_label_rtx ();
11110
11111 /* We need to compare the stack pointer minus the frame size with
11112 the stack boundary in the TCB. The stack boundary always gives
11113 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11114 can compare directly. Otherwise we need to do an addition. */
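  /* As a sketch (assuming a 64-bit glibc-style target), the check emitted
     below looks roughly like
	 leaq	-FRAME(%rsp), %r11	# only for large frames
	 cmpq	%fs:<stack-limit slot>, %r11
	 jae	.Lenough_stack
     where the %fs offset of the limit slot comes from the TCB layout.  */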
11115
11116 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11117 UNSPEC_STACK_CHECK);
11118 limit = gen_rtx_CONST (Pmode, limit);
11119 limit = gen_rtx_MEM (Pmode, limit);
11120 if (allocate < SPLIT_STACK_AVAILABLE)
11121 current = stack_pointer_rtx;
11122 else
11123 {
11124 unsigned int scratch_regno;
11125 rtx offset;
11126
11127 /* We need a scratch register to hold the stack pointer minus
11128 the required frame size. Since this is the very start of the
11129 function, the scratch register can be any caller-saved
11130 register which is not used for parameters. */
11131 offset = GEN_INT (- allocate);
11132 scratch_regno = split_stack_prologue_scratch_regno ();
11133 if (scratch_regno == INVALID_REGNUM)
11134 return;
11135 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11136 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11137 {
11138 /* We don't use ix86_gen_add3 in this case because it will
11139 want to split to lea, but when not optimizing the insn
11140 will not be split after this point. */
11141 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11142 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11143 offset)));
11144 }
11145 else
11146 {
11147 emit_move_insn (scratch_reg, offset);
11148 emit_insn (gen_adddi3 (scratch_reg, scratch_reg,
11149 stack_pointer_rtx));
11150 }
11151 current = scratch_reg;
11152 }
11153
11154 ix86_expand_branch (GEU, current, limit, label);
11155 jump_insn = get_last_insn ();
11156 JUMP_LABEL (jump_insn) = label;
11157
11158 /* Mark the jump as very likely to be taken. */
11159 add_reg_note (jump_insn, REG_BR_PROB,
11160 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11161
11162 if (split_stack_fn == NULL_RTX)
11163 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11164 fn = split_stack_fn;
11165
11166 /* Get more stack space. We pass in the desired stack space and the
11167 size of the arguments to copy to the new stack. In 32-bit mode
11168 we push the parameters; __morestack will return on a new stack
11169 anyhow. In 64-bit mode we pass the parameters in r10 and
11170 r11. */
11171 allocate_rtx = GEN_INT (allocate);
11172 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11173 call_fusage = NULL_RTX;
11174 if (TARGET_64BIT)
11175 {
11176 rtx reg10, reg11;
11177
11178 reg10 = gen_rtx_REG (Pmode, R10_REG);
11179 reg11 = gen_rtx_REG (Pmode, R11_REG);
11180
11181 /* If this function uses a static chain, it will be in %r10.
11182 Preserve it across the call to __morestack. */
11183 if (DECL_STATIC_CHAIN (cfun->decl))
11184 {
11185 rtx rax;
11186
11187 rax = gen_rtx_REG (Pmode, AX_REG);
11188 emit_move_insn (rax, reg10);
11189 use_reg (&call_fusage, rax);
11190 }
11191
11192 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11193 {
11194 HOST_WIDE_INT argval;
11195
11196 /* When using the large model we need to load the address
11197 into a register, and we've run out of registers. So we
11198 switch to a different calling convention, and we call a
11199 different function: __morestack_large. We pass the
11200 argument size in the upper 32 bits of r10 and pass the
11201 frame size in the lower 32 bits. */
11202 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11203 gcc_assert ((args_size & 0xffffffff) == args_size);
11204
11205 if (split_stack_fn_large == NULL_RTX)
11206 split_stack_fn_large =
11207 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11208
11209 if (ix86_cmodel == CM_LARGE_PIC)
11210 {
11211 rtx label, x;
11212
11213 label = gen_label_rtx ();
11214 emit_label (label);
11215 LABEL_PRESERVE_P (label) = 1;
11216 emit_insn (gen_set_rip_rex64 (reg10, label));
11217 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11218 emit_insn (gen_adddi3 (reg10, reg10, reg11));
11219 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11220 UNSPEC_GOT);
11221 x = gen_rtx_CONST (Pmode, x);
11222 emit_move_insn (reg11, x);
11223 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11224 x = gen_const_mem (Pmode, x);
11225 emit_move_insn (reg11, x);
11226 }
11227 else
11228 emit_move_insn (reg11, split_stack_fn_large);
11229
11230 fn = reg11;
11231
11232 argval = ((args_size << 16) << 16) + allocate;
11233 emit_move_insn (reg10, GEN_INT (argval));
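	  /* For example, args_size == 0x20 and allocate == 0x100 give
	     argval == 0x0000002000000100: %r10 carries the argument size
	     in its upper half and the frame size in its lower half.  */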
11234 }
11235 else
11236 {
11237 emit_move_insn (reg10, allocate_rtx);
11238 emit_move_insn (reg11, GEN_INT (args_size));
11239 use_reg (&call_fusage, reg11);
11240 }
11241
11242 use_reg (&call_fusage, reg10);
11243 }
11244 else
11245 {
11246 emit_insn (gen_push (GEN_INT (args_size)));
11247 emit_insn (gen_push (allocate_rtx));
11248 }
11249 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11250 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11251 NULL_RTX, false);
11252 add_function_usage_to (call_insn, call_fusage);
11253
11254 /* In order to make call/return prediction work right, we now need
11255 to execute a return instruction. See
11256 libgcc/config/i386/morestack.S for the details on how this works.
11257
11258 For flow purposes gcc must not see this as a return
11259 instruction--we need control flow to continue at the subsequent
11260 label. Therefore, we use an unspec. */
11261 gcc_assert (crtl->args.pops_args < 65536);
11262 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11263
11264 /* If we are in 64-bit mode and this function uses a static chain,
11265 we saved %r10 in %rax before calling __morestack. */
11266 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11267 emit_move_insn (gen_rtx_REG (Pmode, R10_REG),
11268 gen_rtx_REG (Pmode, AX_REG));
11269
11270 /* If this function calls va_start, we need to store a pointer to
11271 the arguments on the old stack, because they may not have been
11272 all copied to the new stack. At this point the old stack can be
11273 found at the frame pointer value used by __morestack, because
11274 __morestack has set that up before calling back to us. Here we
11275 store that pointer in a scratch register, and in
11276 ix86_expand_prologue we store the scratch register in a stack
11277 slot. */
11278 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11279 {
11280 unsigned int scratch_regno;
11281 rtx frame_reg;
11282 int words;
11283
11284 scratch_regno = split_stack_prologue_scratch_regno ();
11285 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11286 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11287
11288 /* 64-bit:
11289 fp -> old fp value
11290 return address within this function
11291 return address of caller of this function
11292 stack arguments
11293 So we add three words to get to the stack arguments.
11294
11295 32-bit:
11296 fp -> old fp value
11297 return address within this function
11298 first argument to __morestack
11299 second argument to __morestack
11300 return address of caller of this function
11301 stack arguments
11302 So we add five words to get to the stack arguments.
11303 */
11304 words = TARGET_64BIT ? 3 : 5;
11305 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11306 gen_rtx_PLUS (Pmode, frame_reg,
11307 GEN_INT (words * UNITS_PER_WORD))));
11308
11309 varargs_label = gen_label_rtx ();
11310 emit_jump_insn (gen_jump (varargs_label));
11311 JUMP_LABEL (get_last_insn ()) = varargs_label;
11312
11313 emit_barrier ();
11314 }
11315
11316 emit_label (label);
11317 LABEL_NUSES (label) = 1;
11318
11319 /* If this function calls va_start, we now have to set the scratch
11320 register for the case where we do not call __morestack. In this
11321 case we need to set it based on the stack pointer. */
11322 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11323 {
11324 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11325 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11326 GEN_INT (UNITS_PER_WORD))));
11327
11328 emit_label (varargs_label);
11329 LABEL_NUSES (varargs_label) = 1;
11330 }
11331 }
11332
11333 /* We may have to tell the dataflow pass that the split stack prologue
11334 is initializing a scratch register. */
11335
11336 static void
11337 ix86_live_on_entry (bitmap regs)
11338 {
11339 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11340 {
11341 gcc_assert (flag_split_stack);
11342 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11343 }
11344 }
11345 \f
11346 /* Determine if OP is a suitable SUBREG RTX for an address. */
11347
11348 static bool
11349 ix86_address_subreg_operand (rtx op)
11350 {
11351 enum machine_mode mode;
11352
11353 if (!REG_P (op))
11354 return false;
11355
11356 mode = GET_MODE (op);
11357
11358 if (GET_MODE_CLASS (mode) != MODE_INT)
11359 return false;
11360
11361 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11362 failures when the register is one word out of a two word structure. */
11363 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11364 return false;
11365
11366 /* Allow only SUBREGs of non-eliminable hard registers. */
11367 return register_no_elim_operand (op, mode);
11368 }
11369
11370 /* Extract the parts of an RTL expression that is a valid memory address
11371 for an instruction. Return 0 if the structure of the address is
11372 grossly off. Return -1 if the address contains an ASHIFT, so it is not
11373 strictly valid, but is still used for computing the length of an lea insn. */
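/* For illustration, an address such as 4(%eax,%ebx,2), i.e.
	(plus (plus (mult (reg ebx) (const_int 2)) (reg eax)) (const_int 4))
   decomposes into base = %eax, index = %ebx, scale = 2, disp = 4.  */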
11374
11375 int
11376 ix86_decompose_address (rtx addr, struct ix86_address *out)
11377 {
11378 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11379 rtx base_reg, index_reg;
11380 HOST_WIDE_INT scale = 1;
11381 rtx scale_rtx = NULL_RTX;
11382 rtx tmp;
11383 int retval = 1;
11384 enum ix86_address_seg seg = SEG_DEFAULT;
11385
11386 /* Allow zero-extended SImode addresses;
11387 they will be emitted with the addr32 prefix. */
11388 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11389 {
11390 if (GET_CODE (addr) == ZERO_EXTEND
11391 && GET_MODE (XEXP (addr, 0)) == SImode)
11392 addr = XEXP (addr, 0);
11393 else if (GET_CODE (addr) == AND
11394 && const_32bit_mask (XEXP (addr, 1), DImode))
11395 {
11396 addr = XEXP (addr, 0);
11397
11398 /* Strip subreg. */
11399 if (GET_CODE (addr) == SUBREG
11400 && GET_MODE (SUBREG_REG (addr)) == SImode)
11401 addr = SUBREG_REG (addr);
11402 }
11403 }
11404
11405 if (REG_P (addr))
11406 base = addr;
11407 else if (GET_CODE (addr) == SUBREG)
11408 {
11409 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11410 base = addr;
11411 else
11412 return 0;
11413 }
11414 else if (GET_CODE (addr) == PLUS)
11415 {
11416 rtx addends[4], op;
11417 int n = 0, i;
11418
11419 op = addr;
11420 do
11421 {
11422 if (n >= 4)
11423 return 0;
11424 addends[n++] = XEXP (op, 1);
11425 op = XEXP (op, 0);
11426 }
11427 while (GET_CODE (op) == PLUS);
11428 if (n >= 4)
11429 return 0;
11430 addends[n] = op;
11431
11432 for (i = n; i >= 0; --i)
11433 {
11434 op = addends[i];
11435 switch (GET_CODE (op))
11436 {
11437 case MULT:
11438 if (index)
11439 return 0;
11440 index = XEXP (op, 0);
11441 scale_rtx = XEXP (op, 1);
11442 break;
11443
11444 case ASHIFT:
11445 if (index)
11446 return 0;
11447 index = XEXP (op, 0);
11448 tmp = XEXP (op, 1);
11449 if (!CONST_INT_P (tmp))
11450 return 0;
11451 scale = INTVAL (tmp);
11452 if ((unsigned HOST_WIDE_INT) scale > 3)
11453 return 0;
11454 scale = 1 << scale;
11455 break;
11456
11457 case UNSPEC:
11458 if (XINT (op, 1) == UNSPEC_TP
11459 && TARGET_TLS_DIRECT_SEG_REFS
11460 && seg == SEG_DEFAULT)
11461 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11462 else
11463 return 0;
11464 break;
11465
11466 case SUBREG:
11467 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11468 return 0;
11469 /* FALLTHRU */
11470
11471 case REG:
11472 if (!base)
11473 base = op;
11474 else if (!index)
11475 index = op;
11476 else
11477 return 0;
11478 break;
11479
11480 case CONST:
11481 case CONST_INT:
11482 case SYMBOL_REF:
11483 case LABEL_REF:
11484 if (disp)
11485 return 0;
11486 disp = op;
11487 break;
11488
11489 default:
11490 return 0;
11491 }
11492 }
11493 }
11494 else if (GET_CODE (addr) == MULT)
11495 {
11496 index = XEXP (addr, 0); /* index*scale */
11497 scale_rtx = XEXP (addr, 1);
11498 }
11499 else if (GET_CODE (addr) == ASHIFT)
11500 {
11501 /* We're called for lea too, which implements ashift on occasion. */
11502 index = XEXP (addr, 0);
11503 tmp = XEXP (addr, 1);
11504 if (!CONST_INT_P (tmp))
11505 return 0;
11506 scale = INTVAL (tmp);
11507 if ((unsigned HOST_WIDE_INT) scale > 3)
11508 return 0;
11509 scale = 1 << scale;
11510 retval = -1;
11511 }
11512 else
11513 disp = addr; /* displacement */
11514
11515 if (index)
11516 {
11517 if (REG_P (index))
11518 ;
11519 else if (GET_CODE (index) == SUBREG
11520 && ix86_address_subreg_operand (SUBREG_REG (index)))
11521 ;
11522 else
11523 return 0;
11524 }
11525
11526 /* Extract the integral value of scale. */
11527 if (scale_rtx)
11528 {
11529 if (!CONST_INT_P (scale_rtx))
11530 return 0;
11531 scale = INTVAL (scale_rtx);
11532 }
11533
11534 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11535 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11536
11537 /* Avoid useless 0 displacement. */
11538 if (disp == const0_rtx && (base || index))
11539 disp = NULL_RTX;
11540
11541 /* Allow the arg pointer and stack pointer as index if there is no scaling. */
11542 if (base_reg && index_reg && scale == 1
11543 && (index_reg == arg_pointer_rtx
11544 || index_reg == frame_pointer_rtx
11545 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11546 {
11547 rtx tmp;
11548 tmp = base, base = index, index = tmp;
11549 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11550 }
11551
11552 /* Special case: %ebp cannot be encoded as a base without a displacement.
11553 Similarly %r13. */
11554 if (!disp
11555 && base_reg
11556 && (base_reg == hard_frame_pointer_rtx
11557 || base_reg == frame_pointer_rtx
11558 || base_reg == arg_pointer_rtx
11559 || (REG_P (base_reg)
11560 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11561 || REGNO (base_reg) == R13_REG))))
11562 disp = const0_rtx;
11563
11564 /* Special case: on K6, [%esi] causes the instruction to be vector decoded.
11565 Avoid this by transforming to [%esi+0].
11566 Reload calls address legitimization without cfun defined, so we need
11567 to test cfun for being non-NULL. */
11568 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11569 && base_reg && !index_reg && !disp
11570 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11571 disp = const0_rtx;
11572
11573 /* Special case: encode reg+reg instead of reg*2. */
11574 if (!base && index && scale == 2)
11575 base = index, base_reg = index_reg, scale = 1;
11576
11577 /* Special case: scaling cannot be encoded without base or displacement. */
11578 if (!base && !disp && index && scale != 1)
11579 disp = const0_rtx;
11580
11581 out->base = base;
11582 out->index = index;
11583 out->disp = disp;
11584 out->scale = scale;
11585 out->seg = seg;
11586
11587 return retval;
11588 }
11589 \f
11590 /* Return the cost of the memory address x.
11591 For i386, it is better to use a complex address than let gcc copy
11592 the address into a reg and make a new pseudo. But not if the address
11593 requires two regs - that would mean more pseudos with longer
11594 lifetimes. */
11595 static int
11596 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11597 {
11598 struct ix86_address parts;
11599 int cost = 1;
11600 int ok = ix86_decompose_address (x, &parts);
11601
11602 gcc_assert (ok);
11603
11604 if (parts.base && GET_CODE (parts.base) == SUBREG)
11605 parts.base = SUBREG_REG (parts.base);
11606 if (parts.index && GET_CODE (parts.index) == SUBREG)
11607 parts.index = SUBREG_REG (parts.index);
11608
11609 /* Attempt to minimize number of registers in the address. */
11610 if ((parts.base
11611 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11612 || (parts.index
11613 && (!REG_P (parts.index)
11614 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11615 cost++;
11616
11617 if (parts.base
11618 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11619 && parts.index
11620 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11621 && parts.base != parts.index)
11622 cost++;
11623
11624 /* The AMD-K6 doesn't like addresses with the ModR/M byte set to 00_xxx_100b,
11625 since its predecode logic can't detect the length of instructions
11626 and they degenerate to vector decoding. Increase the cost of such
11627 addresses here. The penalty is at least 2 cycles. It may be worthwhile
11628 to split such addresses or even refuse them altogether.
11629
11630 Following addressing modes are affected:
11631 [base+scale*index]
11632 [scale*index+disp]
11633 [base+index]
11634
11635 The first and last case may be avoidable by explicitly coding the zero into
11636 the memory address, but I don't have an AMD-K6 machine handy to check this
11637 theory. */
11638
11639 if (TARGET_K6
11640 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11641 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11642 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11643 cost += 10;
11644
11645 return cost;
11646 }
11647 \f
11648 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
11649 this is used to form addresses to local data when -fPIC is in
11650 use. */
11651
11652 static bool
11653 darwin_local_data_pic (rtx disp)
11654 {
11655 return (GET_CODE (disp) == UNSPEC
11656 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11657 }
11658
11659 /* Determine if a given RTX is a valid constant. We already know this
11660 satisfies CONSTANT_P. */
11661
11662 static bool
11663 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11664 {
11665 switch (GET_CODE (x))
11666 {
11667 case CONST:
11668 x = XEXP (x, 0);
11669
11670 if (GET_CODE (x) == PLUS)
11671 {
11672 if (!CONST_INT_P (XEXP (x, 1)))
11673 return false;
11674 x = XEXP (x, 0);
11675 }
11676
11677 if (TARGET_MACHO && darwin_local_data_pic (x))
11678 return true;
11679
11680 /* Only some unspecs are valid as "constants". */
11681 if (GET_CODE (x) == UNSPEC)
11682 switch (XINT (x, 1))
11683 {
11684 case UNSPEC_GOT:
11685 case UNSPEC_GOTOFF:
11686 case UNSPEC_PLTOFF:
11687 return TARGET_64BIT;
11688 case UNSPEC_TPOFF:
11689 case UNSPEC_NTPOFF:
11690 x = XVECEXP (x, 0, 0);
11691 return (GET_CODE (x) == SYMBOL_REF
11692 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11693 case UNSPEC_DTPOFF:
11694 x = XVECEXP (x, 0, 0);
11695 return (GET_CODE (x) == SYMBOL_REF
11696 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11697 default:
11698 return false;
11699 }
11700
11701 /* We must have drilled down to a symbol. */
11702 if (GET_CODE (x) == LABEL_REF)
11703 return true;
11704 if (GET_CODE (x) != SYMBOL_REF)
11705 return false;
11706 /* FALLTHRU */
11707
11708 case SYMBOL_REF:
11709 /* TLS symbols are never valid. */
11710 if (SYMBOL_REF_TLS_MODEL (x))
11711 return false;
11712
11713 /* DLLIMPORT symbols are never valid. */
11714 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11715 && SYMBOL_REF_DLLIMPORT_P (x))
11716 return false;
11717
11718 #if TARGET_MACHO
11719 /* mdynamic-no-pic */
11720 if (MACHO_DYNAMIC_NO_PIC_P)
11721 return machopic_symbol_defined_p (x);
11722 #endif
11723 break;
11724
11725 case CONST_DOUBLE:
11726 if (GET_MODE (x) == TImode
11727 && x != CONST0_RTX (TImode)
11728 && !TARGET_64BIT)
11729 return false;
11730 break;
11731
11732 case CONST_VECTOR:
11733 if (!standard_sse_constant_p (x))
11734 return false;
11735
11736 default:
11737 break;
11738 }
11739
11740 /* Otherwise we handle everything else in the move patterns. */
11741 return true;
11742 }
11743
11744 /* Determine if it's legal to put X into the constant pool. This
11745 is not possible for the address of thread-local symbols, which
11746 is checked above. */
11747
11748 static bool
11749 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11750 {
11751 /* We can always put integral constants and vectors in memory. */
11752 switch (GET_CODE (x))
11753 {
11754 case CONST_INT:
11755 case CONST_DOUBLE:
11756 case CONST_VECTOR:
11757 return false;
11758
11759 default:
11760 break;
11761 }
11762 return !ix86_legitimate_constant_p (mode, x);
11763 }
11764
11765
11766 /* Nonzero if the constant value X is a legitimate general operand
11767 when generating PIC code. It is given that flag_pic is on and
11768 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11769
11770 bool
11771 legitimate_pic_operand_p (rtx x)
11772 {
11773 rtx inner;
11774
11775 switch (GET_CODE (x))
11776 {
11777 case CONST:
11778 inner = XEXP (x, 0);
11779 if (GET_CODE (inner) == PLUS
11780 && CONST_INT_P (XEXP (inner, 1)))
11781 inner = XEXP (inner, 0);
11782
11783 /* Only some unspecs are valid as "constants". */
11784 if (GET_CODE (inner) == UNSPEC)
11785 switch (XINT (inner, 1))
11786 {
11787 case UNSPEC_GOT:
11788 case UNSPEC_GOTOFF:
11789 case UNSPEC_PLTOFF:
11790 return TARGET_64BIT;
11791 case UNSPEC_TPOFF:
11792 x = XVECEXP (inner, 0, 0);
11793 return (GET_CODE (x) == SYMBOL_REF
11794 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11795 case UNSPEC_MACHOPIC_OFFSET:
11796 return legitimate_pic_address_disp_p (x);
11797 default:
11798 return false;
11799 }
11800 /* FALLTHRU */
11801
11802 case SYMBOL_REF:
11803 case LABEL_REF:
11804 return legitimate_pic_address_disp_p (x);
11805
11806 default:
11807 return true;
11808 }
11809 }
11810
11811 /* Determine if a given CONST RTX is a valid memory displacement
11812 in PIC mode. */
11813
11814 bool
11815 legitimate_pic_address_disp_p (rtx disp)
11816 {
11817 bool saw_plus;
11818
11819 /* In 64bit mode we can allow direct addresses of symbols and labels
11820 when they are not dynamic symbols. */
11821 if (TARGET_64BIT)
11822 {
11823 rtx op0 = disp, op1;
11824
11825 switch (GET_CODE (disp))
11826 {
11827 case LABEL_REF:
11828 return true;
11829
11830 case CONST:
11831 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11832 break;
11833 op0 = XEXP (XEXP (disp, 0), 0);
11834 op1 = XEXP (XEXP (disp, 0), 1);
11835 if (!CONST_INT_P (op1)
11836 || INTVAL (op1) >= 16*1024*1024
11837 || INTVAL (op1) < -16*1024*1024)
11838 break;
11839 if (GET_CODE (op0) == LABEL_REF)
11840 return true;
11841 if (GET_CODE (op0) == CONST
11842 && GET_CODE (XEXP (op0, 0)) == UNSPEC
11843 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
11844 return true;
11845 if (GET_CODE (op0) == UNSPEC
11846 && XINT (op0, 1) == UNSPEC_PCREL)
11847 return true;
11848 if (GET_CODE (op0) != SYMBOL_REF)
11849 break;
11850 /* FALLTHRU */
11851
11852 case SYMBOL_REF:
11853 /* TLS references should always be enclosed in UNSPEC. */
11854 if (SYMBOL_REF_TLS_MODEL (op0))
11855 return false;
11856 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11857 && ix86_cmodel != CM_LARGE_PIC)
11858 return true;
11859 break;
11860
11861 default:
11862 break;
11863 }
11864 }
11865 if (GET_CODE (disp) != CONST)
11866 return false;
11867 disp = XEXP (disp, 0);
11868
11869 if (TARGET_64BIT)
11870 {
11871 /* It is unsafe to allow PLUS expressions; this limits the allowed
11872 distance of GOT table references. We should not need these anyway. */
11873 if (GET_CODE (disp) != UNSPEC
11874 || (XINT (disp, 1) != UNSPEC_GOTPCREL
11875 && XINT (disp, 1) != UNSPEC_GOTOFF
11876 && XINT (disp, 1) != UNSPEC_PCREL
11877 && XINT (disp, 1) != UNSPEC_PLTOFF))
11878 return false;
11879
11880 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11881 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11882 return false;
11883 return true;
11884 }
11885
11886 saw_plus = false;
11887 if (GET_CODE (disp) == PLUS)
11888 {
11889 if (!CONST_INT_P (XEXP (disp, 1)))
11890 return false;
11891 disp = XEXP (disp, 0);
11892 saw_plus = true;
11893 }
11894
11895 if (TARGET_MACHO && darwin_local_data_pic (disp))
11896 return true;
11897
11898 if (GET_CODE (disp) != UNSPEC)
11899 return false;
11900
11901 switch (XINT (disp, 1))
11902 {
11903 case UNSPEC_GOT:
11904 if (saw_plus)
11905 return false;
11906 /* We need to check for both symbols and labels because VxWorks loads
11907 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
11908 details. */
11909 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11910 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
11911 case UNSPEC_GOTOFF:
11912 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
11913 While the ABI also specifies a 32bit relocation, we don't produce it in
11914 the small PIC model at all. */
11915 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11916 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
11917 && !TARGET_64BIT)
11918 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
11919 return false;
11920 case UNSPEC_GOTTPOFF:
11921 case UNSPEC_GOTNTPOFF:
11922 case UNSPEC_INDNTPOFF:
11923 if (saw_plus)
11924 return false;
11925 disp = XVECEXP (disp, 0, 0);
11926 return (GET_CODE (disp) == SYMBOL_REF
11927 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
11928 case UNSPEC_NTPOFF:
11929 disp = XVECEXP (disp, 0, 0);
11930 return (GET_CODE (disp) == SYMBOL_REF
11931 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
11932 case UNSPEC_DTPOFF:
11933 disp = XVECEXP (disp, 0, 0);
11934 return (GET_CODE (disp) == SYMBOL_REF
11935 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
11936 }
11937
11938 return false;
11939 }
11940
11941 /* Recognizes RTL expressions that are valid memory addresses for an
11942 instruction. The MODE argument is the machine mode for the MEM
11943 expression that wants to use this address.
11944
11945 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
11946 convert common non-canonical forms to canonical form so that they will
11947 be recognized. */
11948
11949 static bool
11950 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
11951 rtx addr, bool strict)
11952 {
11953 struct ix86_address parts;
11954 rtx base, index, disp;
11955 HOST_WIDE_INT scale;
11956
11957 /* Since constant addresses in x32 are sign-extended to 64bit,
11958 we have to prevent addresses from 0x80000000 to 0xffffffff. */
11959 if (TARGET_X32
11960 && CONST_INT_P (addr)
11961 && INTVAL (addr) < 0)
11962 return false;
11963
11964 if (ix86_decompose_address (addr, &parts) <= 0)
11965 /* Decomposition failed. */
11966 return false;
11967
11968 base = parts.base;
11969 index = parts.index;
11970 disp = parts.disp;
11971 scale = parts.scale;
11972
11973 /* Validate base register. */
11974 if (base)
11975 {
11976 rtx reg;
11977
11978 if (REG_P (base))
11979 reg = base;
11980 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
11981 reg = SUBREG_REG (base);
11982 else
11983 /* Base is not a register. */
11984 return false;
11985
11986 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
11987 return false;
11988
11989 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
11990 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
11991 /* Base is not valid. */
11992 return false;
11993 }
11994
11995 /* Validate index register. */
11996 if (index)
11997 {
11998 rtx reg;
11999
12000 if (REG_P (index))
12001 reg = index;
12002 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12003 reg = SUBREG_REG (index);
12004 else
12005 /* Index is not a register. */
12006 return false;
12007
12008 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12009 return false;
12010
12011 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12012 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12013 /* Index is not valid. */
12014 return false;
12015 }
12016
12017 /* Index and base should have the same mode. */
12018 if (base && index
12019 && GET_MODE (base) != GET_MODE (index))
12020 return false;
12021
12022 /* Validate scale factor. */
12023 if (scale != 1)
12024 {
12025 if (!index)
12026 /* Scale without index. */
12027 return false;
12028
12029 if (scale != 2 && scale != 4 && scale != 8)
12030 /* Scale is not a valid multiplier. */
12031 return false;
12032 }
12033
12034 /* Validate displacement. */
12035 if (disp)
12036 {
12037 if (GET_CODE (disp) == CONST
12038 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12039 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12040 switch (XINT (XEXP (disp, 0), 1))
12041 {
12042 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
12043 used. While the ABI also specifies 32bit relocations, we don't produce
12044 them at all and use IP-relative addressing instead. */
12045 case UNSPEC_GOT:
12046 case UNSPEC_GOTOFF:
12047 gcc_assert (flag_pic);
12048 if (!TARGET_64BIT)
12049 goto is_legitimate_pic;
12050
12051 /* 64bit address unspec. */
12052 return false;
12053
12054 case UNSPEC_GOTPCREL:
12055 case UNSPEC_PCREL:
12056 gcc_assert (flag_pic);
12057 goto is_legitimate_pic;
12058
12059 case UNSPEC_GOTTPOFF:
12060 case UNSPEC_GOTNTPOFF:
12061 case UNSPEC_INDNTPOFF:
12062 case UNSPEC_NTPOFF:
12063 case UNSPEC_DTPOFF:
12064 break;
12065
12066 case UNSPEC_STACK_CHECK:
12067 gcc_assert (flag_split_stack);
12068 break;
12069
12070 default:
12071 /* Invalid address unspec. */
12072 return false;
12073 }
12074
12075 else if (SYMBOLIC_CONST (disp)
12076 && (flag_pic
12077 || (TARGET_MACHO
12078 #if TARGET_MACHO
12079 && MACHOPIC_INDIRECT
12080 && !machopic_operand_p (disp)
12081 #endif
12082 )))
12083 {
12084
12085 is_legitimate_pic:
12086 if (TARGET_64BIT && (index || base))
12087 {
12088 /* foo@dtpoff(%rX) is ok. */
12089 if (GET_CODE (disp) != CONST
12090 || GET_CODE (XEXP (disp, 0)) != PLUS
12091 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12092 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12093 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12094 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12095 /* Non-constant pic memory reference. */
12096 return false;
12097 }
12098 else if ((!TARGET_MACHO || flag_pic)
12099 && ! legitimate_pic_address_disp_p (disp))
12100 /* Displacement is an invalid pic construct. */
12101 return false;
12102 #if TARGET_MACHO
12103 else if (MACHO_DYNAMIC_NO_PIC_P
12104 && !ix86_legitimate_constant_p (Pmode, disp))
12105 /* Displacement must be referenced via non_lazy_pointer. */
12106 return false;
12107 #endif
12108
12109 /* This code used to verify that a symbolic pic displacement
12110 includes the pic_offset_table_rtx register.
12111
12112 While this is a good idea, unfortunately these constructs may
12113 be created by "adds using lea" optimization for incorrect
12114 code like:
12115
12116 int a;
12117 int foo(int i)
12118 {
12119 return *(&a+i);
12120 }
12121
12122 This code is nonsensical, but results in addressing the
12123 GOT table with a pic_offset_table_rtx base. We can't
12124 just refuse it easily, since it gets matched by the
12125 "addsi3" pattern, which later gets split to lea when the
12126 output register differs from the input. While this
12127 could be handled by a separate addsi pattern for this case
12128 that never results in lea, disabling this test seems to be
12129 the easier and correct fix for the crash. */
12130 }
12131 else if (GET_CODE (disp) != LABEL_REF
12132 && !CONST_INT_P (disp)
12133 && (GET_CODE (disp) != CONST
12134 || !ix86_legitimate_constant_p (Pmode, disp))
12135 && (GET_CODE (disp) != SYMBOL_REF
12136 || !ix86_legitimate_constant_p (Pmode, disp)))
12137 /* Displacement is not constant. */
12138 return false;
12139 else if (TARGET_64BIT
12140 && !x86_64_immediate_operand (disp, VOIDmode))
12141 /* Displacement is out of range. */
12142 return false;
12143 }
12144
12145 /* Everything looks valid. */
12146 return true;
12147 }
12148
12149 /* Determine if a given RTX is a valid constant address. */
12150
12151 bool
12152 constant_address_p (rtx x)
12153 {
12154 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12155 }
12156 \f
12157 /* Return a unique alias set for the GOT. */
12158
12159 static alias_set_type
12160 ix86_GOT_alias_set (void)
12161 {
12162 static alias_set_type set = -1;
12163 if (set == -1)
12164 set = new_alias_set ();
12165 return set;
12166 }
12167
12168 /* Return a legitimate reference for ORIG (an address) using the
12169 register REG. If REG is 0, a new pseudo is generated.
12170
12171 There are two types of references that must be handled:
12172
12173 1. Global data references must load the address from the GOT, via
12174 the PIC reg. An insn is emitted to do this load, and the reg is
12175 returned.
12176
12177 2. Static data references, constant pool addresses, and code labels
12178 compute the address as an offset from the GOT, whose base is in
12179 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12180 differentiate them from global data objects. The returned
12181 address is the PIC reg + an unspec constant.
12182
12183 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12184 reg also appears in the address. */
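/* For illustration only (not emitted literally by this function): on ia32
   with -fpic, case 1 typically ends up as a GOT load such as
       movl   foo@GOT(%ebx), %eax
   while case 2 becomes a GOT-relative address computation such as
       leal   bar@GOTOFF(%ebx), %eax
   The exact sequences depend on the target and code model; this is only a
   sketch of the two shapes described above.  */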
12185
12186 static rtx
12187 legitimize_pic_address (rtx orig, rtx reg)
12188 {
12189 rtx addr = orig;
12190 rtx new_rtx = orig;
12191 rtx base;
12192
12193 #if TARGET_MACHO
12194 if (TARGET_MACHO && !TARGET_64BIT)
12195 {
12196 if (reg == 0)
12197 reg = gen_reg_rtx (Pmode);
12198 /* Use the generic Mach-O PIC machinery. */
12199 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12200 }
12201 #endif
12202
12203 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12204 new_rtx = addr;
12205 else if (TARGET_64BIT
12206 && ix86_cmodel != CM_SMALL_PIC
12207 && gotoff_operand (addr, Pmode))
12208 {
12209 rtx tmpreg;
12210 /* This symbol may be referenced via a displacement from the PIC
12211 base address (@GOTOFF). */
12212
12213 if (reload_in_progress)
12214 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12215 if (GET_CODE (addr) == CONST)
12216 addr = XEXP (addr, 0);
12217 if (GET_CODE (addr) == PLUS)
12218 {
12219 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12220 UNSPEC_GOTOFF);
12221 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12222 }
12223 else
12224 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12225 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12226 if (!reg)
12227 tmpreg = gen_reg_rtx (Pmode);
12228 else
12229 tmpreg = reg;
12230 emit_move_insn (tmpreg, new_rtx);
12231
12232 if (reg != 0)
12233 {
12234 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12235 tmpreg, 1, OPTAB_DIRECT);
12236 new_rtx = reg;
12237 }
12238 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12239 }
12240 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12241 {
12242 /* This symbol may be referenced via a displacement from the PIC
12243 base address (@GOTOFF). */
12244
12245 if (reload_in_progress)
12246 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12247 if (GET_CODE (addr) == CONST)
12248 addr = XEXP (addr, 0);
12249 if (GET_CODE (addr) == PLUS)
12250 {
12251 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12252 UNSPEC_GOTOFF);
12253 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12254 }
12255 else
12256 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12257 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12258 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
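      /* At this point new_rtx is roughly
	   (plus (reg pic_offset_table) (const (unspec [ADDR] UNSPEC_GOTOFF)))
	 which output_pic_addr_const later renders with an @GOTOFF suffix.  */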
12259
12260 if (reg != 0)
12261 {
12262 emit_move_insn (reg, new_rtx);
12263 new_rtx = reg;
12264 }
12265 }
12266 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12267 /* We can't use @GOTOFF for text labels on VxWorks;
12268 see gotoff_operand. */
12269 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12270 {
12271 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12272 {
12273 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12274 return legitimize_dllimport_symbol (addr, true);
12275 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12276 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12277 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12278 {
12279 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12280 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12281 }
12282 }
12283
12284 /* For x64 PE-COFF there is no GOT table, so we use the address
12285 directly. */
12286 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12287 {
12288 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12289 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12290
12291 if (reg == 0)
12292 reg = gen_reg_rtx (Pmode);
12293 emit_move_insn (reg, new_rtx);
12294 new_rtx = reg;
12295 }
12296 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12297 {
12298 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12299 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12300 new_rtx = gen_const_mem (Pmode, new_rtx);
12301 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12302
12303 if (reg == 0)
12304 reg = gen_reg_rtx (Pmode);
12305 /* Use gen_movsi directly, otherwise the address is loaded
12306 into a register for CSE. We don't want to CSE these addresses;
12307 instead we CSE addresses from the GOT table, so skip this. */
12308 emit_insn (gen_movsi (reg, new_rtx));
12309 new_rtx = reg;
12310 }
12311 else
12312 {
12313 /* This symbol must be referenced via a load from the
12314 Global Offset Table (@GOT). */
12315
12316 if (reload_in_progress)
12317 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12318 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12319 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12320 if (TARGET_64BIT)
12321 new_rtx = force_reg (Pmode, new_rtx);
12322 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12323 new_rtx = gen_const_mem (Pmode, new_rtx);
12324 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
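	  /* The reference is now roughly
	       (mem (plus (reg pic_offset_table)
			  (const (unspec [ADDR] UNSPEC_GOT))))
	     i.e. the classic load of the symbol's address from its GOT slot,
	     rendered with an @GOT suffix by output_pic_addr_const.  */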
12325
12326 if (reg == 0)
12327 reg = gen_reg_rtx (Pmode);
12328 emit_move_insn (reg, new_rtx);
12329 new_rtx = reg;
12330 }
12331 }
12332 else
12333 {
12334 if (CONST_INT_P (addr)
12335 && !x86_64_immediate_operand (addr, VOIDmode))
12336 {
12337 if (reg)
12338 {
12339 emit_move_insn (reg, addr);
12340 new_rtx = reg;
12341 }
12342 else
12343 new_rtx = force_reg (Pmode, addr);
12344 }
12345 else if (GET_CODE (addr) == CONST)
12346 {
12347 addr = XEXP (addr, 0);
12348
12349 /* We must match stuff we generated before. Assume the only
12350 unspecs that can get here are ours. Not that we could do
12351 anything with them anyway.... */
12352 if (GET_CODE (addr) == UNSPEC
12353 || (GET_CODE (addr) == PLUS
12354 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12355 return orig;
12356 gcc_assert (GET_CODE (addr) == PLUS);
12357 }
12358 if (GET_CODE (addr) == PLUS)
12359 {
12360 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12361
12362 /* Check first to see if this is a constant offset from a @GOTOFF
12363 symbol reference. */
12364 if (gotoff_operand (op0, Pmode)
12365 && CONST_INT_P (op1))
12366 {
12367 if (!TARGET_64BIT)
12368 {
12369 if (reload_in_progress)
12370 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12371 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12372 UNSPEC_GOTOFF);
12373 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12374 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12375 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12376
12377 if (reg != 0)
12378 {
12379 emit_move_insn (reg, new_rtx);
12380 new_rtx = reg;
12381 }
12382 }
12383 else
12384 {
12385 if (INTVAL (op1) < -16*1024*1024
12386 || INTVAL (op1) >= 16*1024*1024)
12387 {
12388 if (!x86_64_immediate_operand (op1, Pmode))
12389 op1 = force_reg (Pmode, op1);
12390 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12391 }
12392 }
12393 }
12394 else
12395 {
12396 base = legitimize_pic_address (XEXP (addr, 0), reg);
12397 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12398 base == reg ? NULL_RTX : reg);
12399
12400 if (CONST_INT_P (new_rtx))
12401 new_rtx = plus_constant (base, INTVAL (new_rtx));
12402 else
12403 {
12404 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12405 {
12406 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12407 new_rtx = XEXP (new_rtx, 1);
12408 }
12409 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12410 }
12411 }
12412 }
12413 }
12414 return new_rtx;
12415 }
12416 \f
12417 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12418
12419 static rtx
12420 get_thread_pointer (bool to_reg)
12421 {
12422 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
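  /* UNSPEC_TP stands for the TLS thread pointer; on typical GNU/Linux
     targets it is ultimately addressed through the %fs (64-bit) or %gs
     (32-bit) segment base (see the '@' case in ix86_print_operand).  */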
12423
12424 if (GET_MODE (tp) != Pmode)
12425 tp = convert_to_mode (Pmode, tp, 1);
12426
12427 if (to_reg)
12428 tp = copy_addr_to_reg (tp);
12429
12430 return tp;
12431 }
12432
12433 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12434
12435 static GTY(()) rtx ix86_tls_symbol;
12436
12437 static rtx
12438 ix86_tls_get_addr (void)
12439 {
12440 if (!ix86_tls_symbol)
12441 {
12442 const char *sym
12443 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12444 ? "___tls_get_addr" : "__tls_get_addr");
12445
12446 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12447 }
12448
12449 return ix86_tls_symbol;
12450 }
12451
12452 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12453
12454 static GTY(()) rtx ix86_tls_module_base_symbol;
12455
12456 rtx
12457 ix86_tls_module_base (void)
12458 {
12459 if (!ix86_tls_module_base_symbol)
12460 {
12461 ix86_tls_module_base_symbol
12462 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12463
12464 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12465 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12466 }
12467
12468 return ix86_tls_module_base_symbol;
12469 }
12470
12471 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12472 false if we expect this to be used for a memory address and true if
12473 we expect to load the address into a register. */
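/* Very roughly, the four models handled below correspond to these access
   patterns (shown schematically, not as the exact insns this function
   emits):

     global-dynamic:  call __tls_get_addr with x@tlsgd (GNU2 TLS instead
                      uses the tls_dynamic_gnu2_* descriptor patterns)
     local-dynamic:   one __tls_get_addr call for the module base, then
                      x@dtpoff offsets from that base
     initial-exec:    load x@gottpoff from the GOT and combine it with
                      the thread pointer
     local-exec:      x@tpoff / x@ntpoff combined directly with the
                      thread pointer

   See the UNSPEC_* handling in output_pic_addr_const for the suffixes.  */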
12474
12475 static rtx
12476 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12477 {
12478 rtx dest, base, off;
12479 rtx pic = NULL_RTX, tp = NULL_RTX;
12480 int type;
12481
12482 switch (model)
12483 {
12484 case TLS_MODEL_GLOBAL_DYNAMIC:
12485 dest = gen_reg_rtx (Pmode);
12486
12487 if (!TARGET_64BIT)
12488 {
12489 if (flag_pic)
12490 pic = pic_offset_table_rtx;
12491 else
12492 {
12493 pic = gen_reg_rtx (Pmode);
12494 emit_insn (gen_set_got (pic));
12495 }
12496 }
12497
12498 if (TARGET_GNU2_TLS)
12499 {
12500 if (TARGET_64BIT)
12501 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12502 else
12503 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12504
12505 tp = get_thread_pointer (true);
12506 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12507
12508 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12509 }
12510 else
12511 {
12512 rtx caddr = ix86_tls_get_addr ();
12513
12514 if (TARGET_64BIT)
12515 {
12516 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12517
12518 start_sequence ();
12519 emit_call_insn (gen_tls_global_dynamic_64 (rax, x, caddr));
12520 insns = get_insns ();
12521 end_sequence ();
12522
12523 RTL_CONST_CALL_P (insns) = 1;
12524 emit_libcall_block (insns, dest, rax, x);
12525 }
12526 else
12527 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12528 }
12529 break;
12530
12531 case TLS_MODEL_LOCAL_DYNAMIC:
12532 base = gen_reg_rtx (Pmode);
12533
12534 if (!TARGET_64BIT)
12535 {
12536 if (flag_pic)
12537 pic = pic_offset_table_rtx;
12538 else
12539 {
12540 pic = gen_reg_rtx (Pmode);
12541 emit_insn (gen_set_got (pic));
12542 }
12543 }
12544
12545 if (TARGET_GNU2_TLS)
12546 {
12547 rtx tmp = ix86_tls_module_base ();
12548
12549 if (TARGET_64BIT)
12550 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12551 else
12552 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12553
12554 tp = get_thread_pointer (true);
12555 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12556 gen_rtx_MINUS (Pmode, tmp, tp));
12557 }
12558 else
12559 {
12560 rtx caddr = ix86_tls_get_addr ();
12561
12562 if (TARGET_64BIT)
12563 {
12564 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12565
12566 start_sequence ();
12567 emit_call_insn (gen_tls_local_dynamic_base_64 (rax, caddr));
12568 insns = get_insns ();
12569 end_sequence ();
12570
12571 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12572 share the LD_BASE result with other LD model accesses. */
12573 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12574 UNSPEC_TLS_LD_BASE);
12575
12576 RTL_CONST_CALL_P (insns) = 1;
12577 emit_libcall_block (insns, base, rax, eqv);
12578 }
12579 else
12580 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12581 }
12582
12583 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12584 off = gen_rtx_CONST (Pmode, off);
12585
12586 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12587
12588 if (TARGET_GNU2_TLS)
12589 {
12590 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12591
12592 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12593 }
12594 break;
12595
12596 case TLS_MODEL_INITIAL_EXEC:
12597 if (TARGET_64BIT)
12598 {
12599 if (TARGET_SUN_TLS)
12600 {
12601 /* The Sun linker took the AMD64 TLS spec literally
12602 and can only handle %rax as the destination of the
12603 initial-exec code sequence. */
12604
12605 dest = gen_reg_rtx (Pmode);
12606 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12607 return dest;
12608 }
12609
12610 pic = NULL;
12611 type = UNSPEC_GOTNTPOFF;
12612 }
12613 else if (flag_pic)
12614 {
12615 if (reload_in_progress)
12616 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12617 pic = pic_offset_table_rtx;
12618 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12619 }
12620 else if (!TARGET_ANY_GNU_TLS)
12621 {
12622 pic = gen_reg_rtx (Pmode);
12623 emit_insn (gen_set_got (pic));
12624 type = UNSPEC_GOTTPOFF;
12625 }
12626 else
12627 {
12628 pic = NULL;
12629 type = UNSPEC_INDNTPOFF;
12630 }
12631
12632 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
12633 off = gen_rtx_CONST (Pmode, off);
12634 if (pic)
12635 off = gen_rtx_PLUS (Pmode, pic, off);
12636 off = gen_const_mem (Pmode, off);
12637 set_mem_alias_set (off, ix86_GOT_alias_set ());
12638
12639 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12640 {
12641 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12642 off = force_reg (Pmode, off);
12643 return gen_rtx_PLUS (Pmode, base, off);
12644 }
12645 else
12646 {
12647 base = get_thread_pointer (true);
12648 dest = gen_reg_rtx (Pmode);
12649 emit_insn (gen_subsi3 (dest, base, off));
12650 }
12651 break;
12652
12653 case TLS_MODEL_LOCAL_EXEC:
12654 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12655 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12656 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12657 off = gen_rtx_CONST (Pmode, off);
12658
12659 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12660 {
12661 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12662 return gen_rtx_PLUS (Pmode, base, off);
12663 }
12664 else
12665 {
12666 base = get_thread_pointer (true);
12667 dest = gen_reg_rtx (Pmode);
12668 emit_insn (gen_subsi3 (dest, base, off));
12669 }
12670 break;
12671
12672 default:
12673 gcc_unreachable ();
12674 }
12675
12676 return dest;
12677 }
12678
12679 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12680 to symbol DECL. */
12681
12682 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12683 htab_t dllimport_map;
12684
12685 static tree
12686 get_dllimport_decl (tree decl)
12687 {
12688 struct tree_map *h, in;
12689 void **loc;
12690 const char *name;
12691 const char *prefix;
12692 size_t namelen, prefixlen;
12693 char *imp_name;
12694 tree to;
12695 rtx rtl;
12696
12697 if (!dllimport_map)
12698 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12699
12700 in.hash = htab_hash_pointer (decl);
12701 in.base.from = decl;
12702 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12703 h = (struct tree_map *) *loc;
12704 if (h)
12705 return h->to;
12706
12707 *loc = h = ggc_alloc_tree_map ();
12708 h->hash = in.hash;
12709 h->base.from = decl;
12710 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12711 VAR_DECL, NULL, ptr_type_node);
12712 DECL_ARTIFICIAL (to) = 1;
12713 DECL_IGNORED_P (to) = 1;
12714 DECL_EXTERNAL (to) = 1;
12715 TREE_READONLY (to) = 1;
12716
12717 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12718 name = targetm.strip_name_encoding (name);
12719 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12720 ? "*__imp_" : "*__imp__";
12721 namelen = strlen (name);
12722 prefixlen = strlen (prefix);
12723 imp_name = (char *) alloca (namelen + prefixlen + 1);
12724 memcpy (imp_name, prefix, prefixlen);
12725 memcpy (imp_name + prefixlen, name, namelen + 1);
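  /* For example (assuming an ordinary symbol and a nonempty "_"
     user_label_prefix, as on 32-bit mingw), a decl named "foo" ends up
     here with imp_name "*__imp__foo"; the leading '*' suppresses the
     user label prefix, so the assembler-level name is __imp__foo.  */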
12726
12727 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12728 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12729 SET_SYMBOL_REF_DECL (rtl, to);
12730 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12731
12732 rtl = gen_const_mem (Pmode, rtl);
12733 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12734
12735 SET_DECL_RTL (to, rtl);
12736 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12737
12738 return to;
12739 }
12740
12741 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12742 true if we require the result be a register. */
12743
12744 static rtx
12745 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12746 {
12747 tree imp_decl;
12748 rtx x;
12749
12750 gcc_assert (SYMBOL_REF_DECL (symbol));
12751 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12752
12753 x = DECL_RTL (imp_decl);
12754 if (want_reg)
12755 x = force_reg (Pmode, x);
12756 return x;
12757 }
12758
12759 /* Try machine-dependent ways of modifying an illegitimate address
12760 to be legitimate. If we find one, return the new, valid address.
12761 This macro is used in only one place: `memory_address' in explow.c.
12762
12763 OLDX is the address as it was before break_out_memory_refs was called.
12764 In some cases it is useful to look at this to decide what needs to be done.
12765
12766 It is always safe for this macro to do nothing. It exists to recognize
12767 opportunities to optimize the output.
12768
12769 For the 80386, we handle X+REG by loading X into a register R and
12770 using R+REG. R will go in a general reg and indexing will be used.
12771 However, if REG is a broken-out memory address or multiplication,
12772 nothing needs to be done because REG can certainly go in a general reg.
12773
12774 When -fpic is used, special handling is needed for symbolic references.
12775 See comments by legitimize_pic_address in i386.c for details. */
12776
12777 static rtx
12778 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12779 enum machine_mode mode)
12780 {
12781 int changed = 0;
12782 unsigned log;
12783
12784 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12785 if (log)
12786 return legitimize_tls_address (x, (enum tls_model) log, false);
12787 if (GET_CODE (x) == CONST
12788 && GET_CODE (XEXP (x, 0)) == PLUS
12789 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12790 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12791 {
12792 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12793 (enum tls_model) log, false);
12794 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12795 }
12796
12797 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12798 {
12799 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12800 return legitimize_dllimport_symbol (x, true);
12801 if (GET_CODE (x) == CONST
12802 && GET_CODE (XEXP (x, 0)) == PLUS
12803 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12804 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12805 {
12806 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12807 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12808 }
12809 }
12810
12811 if (flag_pic && SYMBOLIC_CONST (x))
12812 return legitimize_pic_address (x, 0);
12813
12814 #if TARGET_MACHO
12815 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12816 return machopic_indirect_data_reference (x, 0);
12817 #endif
12818
12819 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12820 if (GET_CODE (x) == ASHIFT
12821 && CONST_INT_P (XEXP (x, 1))
12822 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
12823 {
12824 changed = 1;
12825 log = INTVAL (XEXP (x, 1));
12826 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
12827 GEN_INT (1 << log));
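      /* E.g. (ashift (reg) (const_int 3)) has now become
	 (mult (reg) (const_int 8)).  */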
12828 }
12829
12830 if (GET_CODE (x) == PLUS)
12831 {
12832 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12833
12834 if (GET_CODE (XEXP (x, 0)) == ASHIFT
12835 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12836 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
12837 {
12838 changed = 1;
12839 log = INTVAL (XEXP (XEXP (x, 0), 1));
12840 XEXP (x, 0) = gen_rtx_MULT (Pmode,
12841 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
12842 GEN_INT (1 << log));
12843 }
12844
12845 if (GET_CODE (XEXP (x, 1)) == ASHIFT
12846 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
12847 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
12848 {
12849 changed = 1;
12850 log = INTVAL (XEXP (XEXP (x, 1), 1));
12851 XEXP (x, 1) = gen_rtx_MULT (Pmode,
12852 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
12853 GEN_INT (1 << log));
12854 }
12855
12856 /* Put multiply first if it isn't already. */
12857 if (GET_CODE (XEXP (x, 1)) == MULT)
12858 {
12859 rtx tmp = XEXP (x, 0);
12860 XEXP (x, 0) = XEXP (x, 1);
12861 XEXP (x, 1) = tmp;
12862 changed = 1;
12863 }
12864
12865 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
12866 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
12867 created by virtual register instantiation, register elimination, and
12868 similar optimizations. */
12869 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
12870 {
12871 changed = 1;
12872 x = gen_rtx_PLUS (Pmode,
12873 gen_rtx_PLUS (Pmode, XEXP (x, 0),
12874 XEXP (XEXP (x, 1), 0)),
12875 XEXP (XEXP (x, 1), 1));
12876 }
12877
12878 /* Canonicalize
12879 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
12880 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
12881 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
12882 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12883 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
12884 && CONSTANT_P (XEXP (x, 1)))
12885 {
12886 rtx constant;
12887 rtx other = NULL_RTX;
12888
12889 if (CONST_INT_P (XEXP (x, 1)))
12890 {
12891 constant = XEXP (x, 1);
12892 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
12893 }
12894 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
12895 {
12896 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
12897 other = XEXP (x, 1);
12898 }
12899 else
12900 constant = 0;
12901
12902 if (constant)
12903 {
12904 changed = 1;
12905 x = gen_rtx_PLUS (Pmode,
12906 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
12907 XEXP (XEXP (XEXP (x, 0), 1), 0)),
12908 plus_constant (other, INTVAL (constant)));
12909 }
12910 }
12911
12912 if (changed && ix86_legitimate_address_p (mode, x, false))
12913 return x;
12914
12915 if (GET_CODE (XEXP (x, 0)) == MULT)
12916 {
12917 changed = 1;
12918 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
12919 }
12920
12921 if (GET_CODE (XEXP (x, 1)) == MULT)
12922 {
12923 changed = 1;
12924 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
12925 }
12926
12927 if (changed
12928 && REG_P (XEXP (x, 1))
12929 && REG_P (XEXP (x, 0)))
12930 return x;
12931
12932 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
12933 {
12934 changed = 1;
12935 x = legitimize_pic_address (x, 0);
12936 }
12937
12938 if (changed && ix86_legitimate_address_p (mode, x, false))
12939 return x;
12940
12941 if (REG_P (XEXP (x, 0)))
12942 {
12943 rtx temp = gen_reg_rtx (Pmode);
12944 rtx val = force_operand (XEXP (x, 1), temp);
12945 if (val != temp)
12946 {
12947 if (GET_MODE (val) != Pmode)
12948 val = convert_to_mode (Pmode, val, 1);
12949 emit_move_insn (temp, val);
12950 }
12951
12952 XEXP (x, 1) = temp;
12953 return x;
12954 }
12955
12956 else if (REG_P (XEXP (x, 1)))
12957 {
12958 rtx temp = gen_reg_rtx (Pmode);
12959 rtx val = force_operand (XEXP (x, 0), temp);
12960 if (val != temp)
12961 {
12962 if (GET_MODE (val) != Pmode)
12963 val = convert_to_mode (Pmode, val, 1);
12964 emit_move_insn (temp, val);
12965 }
12966
12967 XEXP (x, 0) = temp;
12968 return x;
12969 }
12970 }
12971
12972 return x;
12973 }
12974 \f
12975 /* Print an integer constant expression in assembler syntax. Addition
12976 and subtraction are the only arithmetic that may appear in these
12977 expressions. FILE is the stdio stream to write to, X is the rtx, and
12978 CODE is the operand print code from the output string. */
12979
12980 static void
12981 output_pic_addr_const (FILE *file, rtx x, int code)
12982 {
12983 char buf[256];
12984
12985 switch (GET_CODE (x))
12986 {
12987 case PC:
12988 gcc_assert (flag_pic);
12989 putc ('.', file);
12990 break;
12991
12992 case SYMBOL_REF:
12993 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
12994 output_addr_const (file, x);
12995 else
12996 {
12997 const char *name = XSTR (x, 0);
12998
12999 /* Mark the decl as referenced so that cgraph will
13000 output the function. */
13001 if (SYMBOL_REF_DECL (x))
13002 mark_decl_referenced (SYMBOL_REF_DECL (x));
13003
13004 #if TARGET_MACHO
13005 if (MACHOPIC_INDIRECT
13006 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13007 name = machopic_indirection_name (x, /*stub_p=*/true);
13008 #endif
13009 assemble_name (file, name);
13010 }
13011 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13012 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13013 fputs ("@PLT", file);
13014 break;
13015
13016 case LABEL_REF:
13017 x = XEXP (x, 0);
13018 /* FALLTHRU */
13019 case CODE_LABEL:
13020 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13021 assemble_name (asm_out_file, buf);
13022 break;
13023
13024 case CONST_INT:
13025 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13026 break;
13027
13028 case CONST:
13029 /* This used to output parentheses around the expression,
13030 but that does not work on the 386 (either ATT or BSD assembler). */
13031 output_pic_addr_const (file, XEXP (x, 0), code);
13032 break;
13033
13034 case CONST_DOUBLE:
13035 if (GET_MODE (x) == VOIDmode)
13036 {
13037 /* We can use %d if the number is <32 bits and positive. */
13038 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13039 fprintf (file, "0x%lx%08lx",
13040 (unsigned long) CONST_DOUBLE_HIGH (x),
13041 (unsigned long) CONST_DOUBLE_LOW (x));
13042 else
13043 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13044 }
13045 else
13046 /* We can't handle floating point constants;
13047 TARGET_PRINT_OPERAND must handle them. */
13048 output_operand_lossage ("floating constant misused");
13049 break;
13050
13051 case PLUS:
13052 /* Some assemblers need integer constants to appear first. */
13053 if (CONST_INT_P (XEXP (x, 0)))
13054 {
13055 output_pic_addr_const (file, XEXP (x, 0), code);
13056 putc ('+', file);
13057 output_pic_addr_const (file, XEXP (x, 1), code);
13058 }
13059 else
13060 {
13061 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13062 output_pic_addr_const (file, XEXP (x, 1), code);
13063 putc ('+', file);
13064 output_pic_addr_const (file, XEXP (x, 0), code);
13065 }
13066 break;
13067
13068 case MINUS:
13069 if (!TARGET_MACHO)
13070 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13071 output_pic_addr_const (file, XEXP (x, 0), code);
13072 putc ('-', file);
13073 output_pic_addr_const (file, XEXP (x, 1), code);
13074 if (!TARGET_MACHO)
13075 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13076 break;
13077
13078 case UNSPEC:
13079 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13080 {
13081 bool f = i386_asm_output_addr_const_extra (file, x);
13082 gcc_assert (f);
13083 break;
13084 }
13085
13086 gcc_assert (XVECLEN (x, 0) == 1);
13087 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13088 switch (XINT (x, 1))
13089 {
13090 case UNSPEC_GOT:
13091 fputs ("@GOT", file);
13092 break;
13093 case UNSPEC_GOTOFF:
13094 fputs ("@GOTOFF", file);
13095 break;
13096 case UNSPEC_PLTOFF:
13097 fputs ("@PLTOFF", file);
13098 break;
13099 case UNSPEC_PCREL:
13100 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13101 "(%rip)" : "[rip]", file);
13102 break;
13103 case UNSPEC_GOTPCREL:
13104 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13105 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13106 break;
13107 case UNSPEC_GOTTPOFF:
13108 /* FIXME: This might be @TPOFF in Sun ld too. */
13109 fputs ("@gottpoff", file);
13110 break;
13111 case UNSPEC_TPOFF:
13112 fputs ("@tpoff", file);
13113 break;
13114 case UNSPEC_NTPOFF:
13115 if (TARGET_64BIT)
13116 fputs ("@tpoff", file);
13117 else
13118 fputs ("@ntpoff", file);
13119 break;
13120 case UNSPEC_DTPOFF:
13121 fputs ("@dtpoff", file);
13122 break;
13123 case UNSPEC_GOTNTPOFF:
13124 if (TARGET_64BIT)
13125 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13126 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13127 else
13128 fputs ("@gotntpoff", file);
13129 break;
13130 case UNSPEC_INDNTPOFF:
13131 fputs ("@indntpoff", file);
13132 break;
13133 #if TARGET_MACHO
13134 case UNSPEC_MACHOPIC_OFFSET:
13135 putc ('-', file);
13136 machopic_output_function_base_name (file);
13137 break;
13138 #endif
13139 default:
13140 output_operand_lossage ("invalid UNSPEC as operand");
13141 break;
13142 }
13143 break;
13144
13145 default:
13146 output_operand_lossage ("invalid expression as operand");
13147 }
13148 }
13149
13150 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13151 We need to emit DTP-relative relocations. */
13152
13153 static void ATTRIBUTE_UNUSED
13154 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13155 {
13156 fputs (ASM_LONG, file);
13157 output_addr_const (file, x);
13158 fputs ("@dtpoff", file);
13159 switch (size)
13160 {
13161 case 4:
13162 break;
13163 case 8:
13164 fputs (", 0", file);
13165 break;
13166 default:
13167 gcc_unreachable ();
13168 }
13169 }
13170
13171 /* Return true if X is a representation of the PIC register. This copes
13172 with calls from ix86_find_base_term, where the register might have
13173 been replaced by a cselib value. */
13174
13175 static bool
13176 ix86_pic_register_p (rtx x)
13177 {
13178 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13179 return (pic_offset_table_rtx
13180 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13181 else
13182 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13183 }
13184
13185 /* Helper function for ix86_delegitimize_address.
13186 Attempt to delegitimize TLS local-exec accesses. */
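/* E.g. a %fs:/%gs:-based access whose displacement is
   (const (unspec [x] UNSPEC_NTPOFF)) is mapped back to the SYMBOL_REF x,
   plus whatever base/index was part of the original address.  */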
13187
13188 static rtx
13189 ix86_delegitimize_tls_address (rtx orig_x)
13190 {
13191 rtx x = orig_x, unspec;
13192 struct ix86_address addr;
13193
13194 if (!TARGET_TLS_DIRECT_SEG_REFS)
13195 return orig_x;
13196 if (MEM_P (x))
13197 x = XEXP (x, 0);
13198 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13199 return orig_x;
13200 if (ix86_decompose_address (x, &addr) == 0
13201 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13202 || addr.disp == NULL_RTX
13203 || GET_CODE (addr.disp) != CONST)
13204 return orig_x;
13205 unspec = XEXP (addr.disp, 0);
13206 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13207 unspec = XEXP (unspec, 0);
13208 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13209 return orig_x;
13210 x = XVECEXP (unspec, 0, 0);
13211 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13212 if (unspec != XEXP (addr.disp, 0))
13213 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13214 if (addr.index)
13215 {
13216 rtx idx = addr.index;
13217 if (addr.scale != 1)
13218 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13219 x = gen_rtx_PLUS (Pmode, idx, x);
13220 }
13221 if (addr.base)
13222 x = gen_rtx_PLUS (Pmode, addr.base, x);
13223 if (MEM_P (orig_x))
13224 x = replace_equiv_address_nv (orig_x, x);
13225 return x;
13226 }
13227
13228 /* In the name of slightly smaller debug output, and to cater to
13229 general assembler lossage, recognize PIC+GOTOFF and turn it back
13230 into a direct symbol reference.
13231
13232 On Darwin, this is necessary to avoid a crash, because Darwin
13233 has a different PIC label for each routine but the DWARF debugging
13234 information is not associated with any particular routine, so it's
13235 necessary to remove references to the PIC label from RTL stored by
13236 the DWARF output code. */
13237
13238 static rtx
13239 ix86_delegitimize_address (rtx x)
13240 {
13241 rtx orig_x = delegitimize_mem_from_attrs (x);
13242 /* addend is NULL or some rtx if x is something+GOTOFF where
13243 something doesn't include the PIC register. */
13244 rtx addend = NULL_RTX;
13245 /* reg_addend is NULL or a multiple of some register. */
13246 rtx reg_addend = NULL_RTX;
13247 /* const_addend is NULL or a const_int. */
13248 rtx const_addend = NULL_RTX;
13249 /* This is the result, or NULL. */
13250 rtx result = NULL_RTX;
13251
13252 x = orig_x;
13253
13254 if (MEM_P (x))
13255 x = XEXP (x, 0);
13256
13257 if (TARGET_64BIT)
13258 {
13259 if (GET_CODE (x) == CONST
13260 && GET_CODE (XEXP (x, 0)) == PLUS
13261 && GET_MODE (XEXP (x, 0)) == Pmode
13262 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13263 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13264 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13265 {
13266 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13267 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13268 if (MEM_P (orig_x))
13269 x = replace_equiv_address_nv (orig_x, x);
13270 return x;
13271 }
13272 if (GET_CODE (x) != CONST
13273 || GET_CODE (XEXP (x, 0)) != UNSPEC
13274 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13275 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13276 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13277 return ix86_delegitimize_tls_address (orig_x);
13278 x = XVECEXP (XEXP (x, 0), 0, 0);
13279 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13280 {
13281 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13282 GET_MODE (x), 0);
13283 if (x == NULL_RTX)
13284 return orig_x;
13285 }
13286 return x;
13287 }
13288
13289 if (GET_CODE (x) != PLUS
13290 || GET_CODE (XEXP (x, 1)) != CONST)
13291 return ix86_delegitimize_tls_address (orig_x);
13292
13293 if (ix86_pic_register_p (XEXP (x, 0)))
13294 /* %ebx + GOT/GOTOFF */
13295 ;
13296 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13297 {
13298 /* %ebx + %reg * scale + GOT/GOTOFF */
13299 reg_addend = XEXP (x, 0);
13300 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13301 reg_addend = XEXP (reg_addend, 1);
13302 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13303 reg_addend = XEXP (reg_addend, 0);
13304 else
13305 {
13306 reg_addend = NULL_RTX;
13307 addend = XEXP (x, 0);
13308 }
13309 }
13310 else
13311 addend = XEXP (x, 0);
13312
13313 x = XEXP (XEXP (x, 1), 0);
13314 if (GET_CODE (x) == PLUS
13315 && CONST_INT_P (XEXP (x, 1)))
13316 {
13317 const_addend = XEXP (x, 1);
13318 x = XEXP (x, 0);
13319 }
13320
13321 if (GET_CODE (x) == UNSPEC
13322 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13323 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13324 result = XVECEXP (x, 0, 0);
13325
13326 if (TARGET_MACHO && darwin_local_data_pic (x)
13327 && !MEM_P (orig_x))
13328 result = XVECEXP (x, 0, 0);
13329
13330 if (! result)
13331 return ix86_delegitimize_tls_address (orig_x);
13332
13333 if (const_addend)
13334 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13335 if (reg_addend)
13336 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13337 if (addend)
13338 {
13339 /* If the rest of the original X doesn't involve the PIC register, add
13340 the addend and subtract pic_offset_table_rtx. This can happen e.g.
13341 for code like:
13342 leal (%ebx, %ecx, 4), %ecx
13343 ...
13344 movl foo@GOTOFF(%ecx), %edx
13345 in which case we return (%ecx - %ebx) + foo. */
13346 if (pic_offset_table_rtx)
13347 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13348 pic_offset_table_rtx),
13349 result);
13350 else
13351 return orig_x;
13352 }
13353 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13354 {
13355 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13356 if (result == NULL_RTX)
13357 return orig_x;
13358 }
13359 return result;
13360 }
13361
13362 /* If X is a machine specific address (i.e. a symbol or label being
13363 referenced as a displacement from the GOT implemented using an
13364 UNSPEC), then return the base term. Otherwise return X. */
13365
13366 rtx
13367 ix86_find_base_term (rtx x)
13368 {
13369 rtx term;
13370
13371 if (TARGET_64BIT)
13372 {
13373 if (GET_CODE (x) != CONST)
13374 return x;
13375 term = XEXP (x, 0);
13376 if (GET_CODE (term) == PLUS
13377 && (CONST_INT_P (XEXP (term, 1))
13378 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13379 term = XEXP (term, 0);
13380 if (GET_CODE (term) != UNSPEC
13381 || (XINT (term, 1) != UNSPEC_GOTPCREL
13382 && XINT (term, 1) != UNSPEC_PCREL))
13383 return x;
13384
13385 return XVECEXP (term, 0, 0);
13386 }
13387
13388 return ix86_delegitimize_address (x);
13389 }
13390 \f
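/* Print to FILE the condition-code suffix for CODE in MODE: e.g. "e" for
   EQ and "ne" for NE in plain CCmode, so that a template such as
   "j%C1\t%l0" comes out as "je"/"jne".  REVERSE reverses the condition;
   FP selects the fcmov-style spellings where they differ.  */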
13391 static void
13392 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13393 int fp, FILE *file)
13394 {
13395 const char *suffix;
13396
13397 if (mode == CCFPmode || mode == CCFPUmode)
13398 {
13399 code = ix86_fp_compare_code_to_integer (code);
13400 mode = CCmode;
13401 }
13402 if (reverse)
13403 code = reverse_condition (code);
13404
13405 switch (code)
13406 {
13407 case EQ:
13408 switch (mode)
13409 {
13410 case CCAmode:
13411 suffix = "a";
13412 break;
13413
13414 case CCCmode:
13415 suffix = "c";
13416 break;
13417
13418 case CCOmode:
13419 suffix = "o";
13420 break;
13421
13422 case CCSmode:
13423 suffix = "s";
13424 break;
13425
13426 default:
13427 suffix = "e";
13428 }
13429 break;
13430 case NE:
13431 switch (mode)
13432 {
13433 case CCAmode:
13434 suffix = "na";
13435 break;
13436
13437 case CCCmode:
13438 suffix = "nc";
13439 break;
13440
13441 case CCOmode:
13442 suffix = "no";
13443 break;
13444
13445 case CCSmode:
13446 suffix = "ns";
13447 break;
13448
13449 default:
13450 suffix = "ne";
13451 }
13452 break;
13453 case GT:
13454 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13455 suffix = "g";
13456 break;
13457 case GTU:
13458 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13459 Those same assemblers have the same but opposite lossage on cmov. */
13460 if (mode == CCmode)
13461 suffix = fp ? "nbe" : "a";
13462 else if (mode == CCCmode)
13463 suffix = "b";
13464 else
13465 gcc_unreachable ();
13466 break;
13467 case LT:
13468 switch (mode)
13469 {
13470 case CCNOmode:
13471 case CCGOCmode:
13472 suffix = "s";
13473 break;
13474
13475 case CCmode:
13476 case CCGCmode:
13477 suffix = "l";
13478 break;
13479
13480 default:
13481 gcc_unreachable ();
13482 }
13483 break;
13484 case LTU:
13485 gcc_assert (mode == CCmode || mode == CCCmode);
13486 suffix = "b";
13487 break;
13488 case GE:
13489 switch (mode)
13490 {
13491 case CCNOmode:
13492 case CCGOCmode:
13493 suffix = "ns";
13494 break;
13495
13496 case CCmode:
13497 case CCGCmode:
13498 suffix = "ge";
13499 break;
13500
13501 default:
13502 gcc_unreachable ();
13503 }
13504 break;
13505 case GEU:
13506 /* ??? As above. */
13507 gcc_assert (mode == CCmode || mode == CCCmode);
13508 suffix = fp ? "nb" : "ae";
13509 break;
13510 case LE:
13511 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13512 suffix = "le";
13513 break;
13514 case LEU:
13515 /* ??? As above. */
13516 if (mode == CCmode)
13517 suffix = "be";
13518 else if (mode == CCCmode)
13519 suffix = fp ? "nb" : "ae";
13520 else
13521 gcc_unreachable ();
13522 break;
13523 case UNORDERED:
13524 suffix = fp ? "u" : "p";
13525 break;
13526 case ORDERED:
13527 suffix = fp ? "nu" : "np";
13528 break;
13529 default:
13530 gcc_unreachable ();
13531 }
13532 fputs (suffix, file);
13533 }
13534
13535 /* Print the name of register X to FILE based on its machine mode and number.
13536 If CODE is 'w', pretend the mode is HImode.
13537 If CODE is 'b', pretend the mode is QImode.
13538 If CODE is 'k', pretend the mode is SImode.
13539 If CODE is 'q', pretend the mode is DImode.
13540 If CODE is 'x', pretend the mode is V4SFmode.
13541 If CODE is 't', pretend the mode is V8SFmode.
13542 If CODE is 'h', pretend the reg is the 'high' byte register.
13543 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
13544 If CODE is 'd', duplicate the operand for AVX instruction.
13545 */
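/* For instance, given the hard register %rax: code 'q' prints "rax",
   'k' prints "eax", 'w' prints "ax", 'b' prints "al" and 'h' prints "ah"
   (with a leading '%' in AT&T syntax).  */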
13546
13547 void
13548 print_reg (rtx x, int code, FILE *file)
13549 {
13550 const char *reg;
13551 bool duplicated = code == 'd' && TARGET_AVX;
13552
13553 gcc_assert (x == pc_rtx
13554 || (REGNO (x) != ARG_POINTER_REGNUM
13555 && REGNO (x) != FRAME_POINTER_REGNUM
13556 && REGNO (x) != FLAGS_REG
13557 && REGNO (x) != FPSR_REG
13558 && REGNO (x) != FPCR_REG));
13559
13560 if (ASSEMBLER_DIALECT == ASM_ATT)
13561 putc ('%', file);
13562
13563 if (x == pc_rtx)
13564 {
13565 gcc_assert (TARGET_64BIT);
13566 fputs ("rip", file);
13567 return;
13568 }
13569
13570 if (code == 'w' || MMX_REG_P (x))
13571 code = 2;
13572 else if (code == 'b')
13573 code = 1;
13574 else if (code == 'k')
13575 code = 4;
13576 else if (code == 'q')
13577 code = 8;
13578 else if (code == 'y')
13579 code = 3;
13580 else if (code == 'h')
13581 code = 0;
13582 else if (code == 'x')
13583 code = 16;
13584 else if (code == 't')
13585 code = 32;
13586 else
13587 code = GET_MODE_SIZE (GET_MODE (x));
13588
13589 /* Irritatingly, AMD extended registers use a different naming convention
13590 from the normal registers: "r%d[bwd]". */
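  /* E.g. %r10 comes out here as "r10" ('q'), "r10d" ('k'), "r10w" ('w')
     or "r10b" ('b').  */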
13591 if (REX_INT_REG_P (x))
13592 {
13593 gcc_assert (TARGET_64BIT);
13594 putc ('r', file);
13595 fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
13596 switch (code)
13597 {
13598 case 0:
13599 error ("extended registers have no high halves");
13600 break;
13601 case 1:
13602 putc ('b', file);
13603 break;
13604 case 2:
13605 putc ('w', file);
13606 break;
13607 case 4:
13608 putc ('d', file);
13609 break;
13610 case 8:
13611 /* no suffix */
13612 break;
13613 default:
13614 error ("unsupported operand size for extended register");
13615 break;
13616 }
13617 return;
13618 }
13619
13620 reg = NULL;
13621 switch (code)
13622 {
13623 case 3:
13624 if (STACK_TOP_P (x))
13625 {
13626 reg = "st(0)";
13627 break;
13628 }
13629 /* FALLTHRU */
13630 case 8:
13631 case 4:
13632 case 12:
13633 if (! ANY_FP_REG_P (x))
13634 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13635 /* FALLTHRU */
13636 case 16:
13637 case 2:
13638 normal:
13639 reg = hi_reg_name[REGNO (x)];
13640 break;
13641 case 1:
13642 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13643 goto normal;
13644 reg = qi_reg_name[REGNO (x)];
13645 break;
13646 case 0:
13647 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13648 goto normal;
13649 reg = qi_high_reg_name[REGNO (x)];
13650 break;
13651 case 32:
13652 if (SSE_REG_P (x))
13653 {
13654 gcc_assert (!duplicated);
13655 putc ('y', file);
13656 fputs (hi_reg_name[REGNO (x)] + 1, file);
13657 return;
13658 }
13659 break;
13660 default:
13661 gcc_unreachable ();
13662 }
13663
13664 fputs (reg, file);
13665 if (duplicated)
13666 {
13667 if (ASSEMBLER_DIALECT == ASM_ATT)
13668 fprintf (file, ", %%%s", reg);
13669 else
13670 fprintf (file, ", %s", reg);
13671 }
13672 }
13673
13674 /* Locate some local-dynamic symbol still in use by this function
13675 so that we can print its name in some tls_local_dynamic_base
13676 pattern. */
13677
13678 static int
13679 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13680 {
13681 rtx x = *px;
13682
13683 if (GET_CODE (x) == SYMBOL_REF
13684 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13685 {
13686 cfun->machine->some_ld_name = XSTR (x, 0);
13687 return 1;
13688 }
13689
13690 return 0;
13691 }
13692
13693 static const char *
13694 get_some_local_dynamic_name (void)
13695 {
13696 rtx insn;
13697
13698 if (cfun->machine->some_ld_name)
13699 return cfun->machine->some_ld_name;
13700
13701 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13702 if (NONDEBUG_INSN_P (insn)
13703 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13704 return cfun->machine->some_ld_name;
13705
13706 return NULL;
13707 }
13708
13709 /* Meaning of CODE:
13710 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13711 C -- print opcode suffix for set/cmov insn.
13712 c -- like C, but print reversed condition
13713 F,f -- likewise, but for floating-point.
13714 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13715 otherwise nothing
13716 R -- print the prefix for register names.
13717 z -- print the opcode suffix for the size of the current operand.
13718 Z -- likewise, with special suffixes for x87 instructions.
13719 * -- print a star (in certain assembler syntax)
13720 A -- print an absolute memory reference.
13721 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13722 s -- print a shift double count, followed by the assembler's argument
13723 delimiter.
13724 b -- print the QImode name of the register for the indicated operand.
13725 %b0 would print %al if operands[0] is reg 0.
13726 w -- likewise, print the HImode name of the register.
13727 k -- likewise, print the SImode name of the register.
13728 q -- likewise, print the DImode name of the register.
13729 x -- likewise, print the V4SFmode name of the register.
13730 t -- likewise, print the V8SFmode name of the register.
13731 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13732 y -- print "st(0)" instead of "st" as a register.
13733 d -- print duplicated register operand for AVX instruction.
13734 D -- print condition for SSE cmp instruction.
13735 P -- if PIC, print an @PLT suffix.
13736 p -- print raw symbol name.
13737 X -- don't print any sort of PIC '@' suffix for a symbol.
13738 & -- print some in-use local-dynamic symbol name.
13739 H -- print a memory address offset by 8; used for sse high-parts
13740 Y -- print condition for XOP pcom* instruction.
13741 + -- print a branch hint as 'cs' or 'ds' prefix
13742 ; -- print a semicolon (after prefixes due to bug in older gas).
13743 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13744 @ -- print the segment register of a thread base pointer load
13745 */
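/* As a rough illustration: with operands[0] an SImode register such as
   %eax, "%z0" in an output template contributes the "l" size suffix and
   "%k0" prints "eax", so a hypothetical "add%z0\t%1, %k0" template comes
   out as something like "addl\t$1, %eax" in AT&T syntax.  */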
13746
13747 void
13748 ix86_print_operand (FILE *file, rtx x, int code)
13749 {
13750 if (code)
13751 {
13752 switch (code)
13753 {
13754 case '*':
13755 if (ASSEMBLER_DIALECT == ASM_ATT)
13756 putc ('*', file);
13757 return;
13758
13759 case '&':
13760 {
13761 const char *name = get_some_local_dynamic_name ();
13762 if (name == NULL)
13763 output_operand_lossage ("'%%&' used without any "
13764 "local dynamic TLS references");
13765 else
13766 assemble_name (file, name);
13767 return;
13768 }
13769
13770 case 'A':
13771 switch (ASSEMBLER_DIALECT)
13772 {
13773 case ASM_ATT:
13774 putc ('*', file);
13775 break;
13776
13777 case ASM_INTEL:
13778 /* Intel syntax. For absolute addresses, registers should not
13779 be surrounded by brackets. */
13780 if (!REG_P (x))
13781 {
13782 putc ('[', file);
13783 ix86_print_operand (file, x, 0);
13784 putc (']', file);
13785 return;
13786 }
13787 break;
13788
13789 default:
13790 gcc_unreachable ();
13791 }
13792
13793 ix86_print_operand (file, x, 0);
13794 return;
13795
13796
13797 case 'L':
13798 if (ASSEMBLER_DIALECT == ASM_ATT)
13799 putc ('l', file);
13800 return;
13801
13802 case 'W':
13803 if (ASSEMBLER_DIALECT == ASM_ATT)
13804 putc ('w', file);
13805 return;
13806
13807 case 'B':
13808 if (ASSEMBLER_DIALECT == ASM_ATT)
13809 putc ('b', file);
13810 return;
13811
13812 case 'Q':
13813 if (ASSEMBLER_DIALECT == ASM_ATT)
13814 putc ('l', file);
13815 return;
13816
13817 case 'S':
13818 if (ASSEMBLER_DIALECT == ASM_ATT)
13819 putc ('s', file);
13820 return;
13821
13822 case 'T':
13823 if (ASSEMBLER_DIALECT == ASM_ATT)
13824 putc ('t', file);
13825 return;
13826
13827 case 'z':
13828 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13829 {
13830 /* Opcodes don't get size suffixes when using Intel syntax. */
13831 if (ASSEMBLER_DIALECT == ASM_INTEL)
13832 return;
13833
13834 switch (GET_MODE_SIZE (GET_MODE (x)))
13835 {
13836 case 1:
13837 putc ('b', file);
13838 return;
13839
13840 case 2:
13841 putc ('w', file);
13842 return;
13843
13844 case 4:
13845 putc ('l', file);
13846 return;
13847
13848 case 8:
13849 putc ('q', file);
13850 return;
13851
13852 default:
13853 output_operand_lossage
13854 ("invalid operand size for operand code '%c'", code);
13855 return;
13856 }
13857 }
13858
13859 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13860 warning
13861 (0, "non-integer operand used with operand code '%c'", code);
13862 /* FALLTHRU */
13863
13864 case 'Z':
13865 /* 387 opcodes don't get size suffixes when using Intel syntax. */
13866 if (ASSEMBLER_DIALECT == ASM_INTEL)
13867 return;
13868
13869 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13870 {
13871 switch (GET_MODE_SIZE (GET_MODE (x)))
13872 {
13873 case 2:
13874 #ifdef HAVE_AS_IX86_FILDS
13875 putc ('s', file);
13876 #endif
13877 return;
13878
13879 case 4:
13880 putc ('l', file);
13881 return;
13882
13883 case 8:
13884 #ifdef HAVE_AS_IX86_FILDQ
13885 putc ('q', file);
13886 #else
13887 fputs ("ll", file);
13888 #endif
13889 return;
13890
13891 default:
13892 break;
13893 }
13894 }
13895 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13896 {
13897 /* 387 opcodes don't get size suffixes
13898 if the operands are registers. */
13899 if (STACK_REG_P (x))
13900 return;
13901
13902 switch (GET_MODE_SIZE (GET_MODE (x)))
13903 {
13904 case 4:
13905 putc ('s', file);
13906 return;
13907
13908 case 8:
13909 putc ('l', file);
13910 return;
13911
13912 case 12:
13913 case 16:
13914 putc ('t', file);
13915 return;
13916
13917 default:
13918 break;
13919 }
13920 }
13921 else
13922 {
13923 output_operand_lossage
13924 ("invalid operand type used with operand code '%c'", code);
13925 return;
13926 }
13927
13928 output_operand_lossage
13929 ("invalid operand size for operand code '%c'", code);
13930 return;
13931
13932 case 'd':
13933 case 'b':
13934 case 'w':
13935 case 'k':
13936 case 'q':
13937 case 'h':
13938 case 't':
13939 case 'y':
13940 case 'x':
13941 case 'X':
13942 case 'P':
13943 case 'p':
13944 break;
13945
13946 case 's':
13947 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
13948 {
13949 ix86_print_operand (file, x, 0);
13950 fputs (", ", file);
13951 }
13952 return;
13953
13954 case 'D':
13955 /* A little bit of braindamage here. The SSE compare instructions
13956 use completely different names for the comparisons than the
13957 fp conditional moves do. */
13958 if (TARGET_AVX)
13959 {
13960 switch (GET_CODE (x))
13961 {
13962 case EQ:
13963 fputs ("eq", file);
13964 break;
13965 case UNEQ:
13966 fputs ("eq_us", file);
13967 break;
13968 case LT:
13969 fputs ("lt", file);
13970 break;
13971 case UNLT:
13972 fputs ("nge", file);
13973 break;
13974 case LE:
13975 fputs ("le", file);
13976 break;
13977 case UNLE:
13978 fputs ("ngt", file);
13979 break;
13980 case UNORDERED:
13981 fputs ("unord", file);
13982 break;
13983 case NE:
13984 fputs ("neq", file);
13985 break;
13986 case LTGT:
13987 fputs ("neq_oq", file);
13988 break;
13989 case GE:
13990 fputs ("ge", file);
13991 break;
13992 case UNGE:
13993 fputs ("nlt", file);
13994 break;
13995 case GT:
13996 fputs ("gt", file);
13997 break;
13998 case UNGT:
13999 fputs ("nle", file);
14000 break;
14001 case ORDERED:
14002 fputs ("ord", file);
14003 break;
14004 default:
14005 output_operand_lossage ("operand is not a condition code, "
14006 "invalid operand code 'D'");
14007 return;
14008 }
14009 }
14010 else
14011 {
14012 switch (GET_CODE (x))
14013 {
14014 case EQ:
14015 case UNEQ:
14016 fputs ("eq", file);
14017 break;
14018 case LT:
14019 case UNLT:
14020 fputs ("lt", file);
14021 break;
14022 case LE:
14023 case UNLE:
14024 fputs ("le", file);
14025 break;
14026 case UNORDERED:
14027 fputs ("unord", file);
14028 break;
14029 case NE:
14030 case LTGT:
14031 fputs ("neq", file);
14032 break;
14033 case UNGE:
14034 case GE:
14035 fputs ("nlt", file);
14036 break;
14037 case UNGT:
14038 case GT:
14039 fputs ("nle", file);
14040 break;
14041 case ORDERED:
14042 fputs ("ord", file);
14043 break;
14044 default:
14045 output_operand_lossage ("operand is not a condition code, "
14046 "invalid operand code 'D'");
14047 return;
14048 }
14049 }
14050 return;
14051 case 'O':
14052 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14053 if (ASSEMBLER_DIALECT == ASM_ATT)
14054 {
14055 switch (GET_MODE (x))
14056 {
14057 case HImode: putc ('w', file); break;
14058 case SImode:
14059 case SFmode: putc ('l', file); break;
14060 case DImode:
14061 case DFmode: putc ('q', file); break;
14062 default: gcc_unreachable ();
14063 }
14064 putc ('.', file);
14065 }
14066 #endif
14067 return;
14068 case 'C':
14069 if (!COMPARISON_P (x))
14070 {
14071 output_operand_lossage ("operand is neither a constant nor a "
14072 "condition code, invalid operand code "
14073 "'C'");
14074 return;
14075 }
14076 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
14077 return;
14078 case 'F':
14079 if (!COMPARISON_P (x))
14080 {
14081 output_operand_lossage ("operand is neither a constant nor a "
14082 "condition code, invalid operand code "
14083 "'F'");
14084 return;
14085 }
14086 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14087 if (ASSEMBLER_DIALECT == ASM_ATT)
14088 putc ('.', file);
14089 #endif
14090 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
14091 return;
14092
14093 /* Like above, but reverse condition */
14094 case 'c':
14095 /* Check to see if argument to %c is really a constant
14096 and not a condition code which needs to be reversed. */
14097 if (!COMPARISON_P (x))
14098 {
14099 output_operand_lossage ("operand is neither a constant nor a "
14100 "condition code, invalid operand "
14101 "code 'c'");
14102 return;
14103 }
14104 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
14105 return;
14106 case 'f':
14107 if (!COMPARISON_P (x))
14108 {
14109 output_operand_lossage ("operand is neither a constant nor a "
14110 "condition code, invalid operand "
14111 "code 'f'");
14112 return;
14113 }
14114 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14115 if (ASSEMBLER_DIALECT == ASM_ATT)
14116 putc ('.', file);
14117 #endif
14118 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
14119 return;
14120
14121 case 'H':
14122 if (!offsettable_memref_p (x))
14123 {
14124 output_operand_lossage ("operand is not an offsettable memory "
14125 "reference, invalid operand "
14126 "code 'H'");
14127 return;
14128 }
14129 /* It doesn't actually matter what mode we use here, as we're
14130 only going to use this for printing. */
14131 x = adjust_address_nv (x, DImode, 8);
14132 break;
14133
14134 case '+':
14135 {
14136 rtx x;
14137
14138 if (!optimize
14139 || optimize_function_for_size_p (cfun)
14140 || !TARGET_BRANCH_PREDICTION_HINTS)
14141 return;
14142
14143 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14144 if (x)
14145 {
14146 int pred_val = INTVAL (XEXP (x, 0));
14147
14148 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14149 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14150 {
14151 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14152 bool cputaken
14153 = final_forward_branch_p (current_output_insn) == 0;
14154
14155 /* Emit hints only when the default branch prediction
14156 heuristics would fail. */
14157 if (taken != cputaken)
14158 {
14159 /* We use 3e (DS) prefix for taken branches and
14160 2e (CS) prefix for not taken branches. */
14161 if (taken)
14162 fputs ("ds ; ", file);
14163 else
14164 fputs ("cs ; ", file);
14165 }
14166 }
14167 }
14168 return;
14169 }
14170
14171 case 'Y':
14172 switch (GET_CODE (x))
14173 {
14174 case NE:
14175 fputs ("neq", file);
14176 break;
14177 case EQ:
14178 fputs ("eq", file);
14179 break;
14180 case GE:
14181 case GEU:
14182 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14183 break;
14184 case GT:
14185 case GTU:
14186 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14187 break;
14188 case LE:
14189 case LEU:
14190 fputs ("le", file);
14191 break;
14192 case LT:
14193 case LTU:
14194 fputs ("lt", file);
14195 break;
14196 case UNORDERED:
14197 fputs ("unord", file);
14198 break;
14199 case ORDERED:
14200 fputs ("ord", file);
14201 break;
14202 case UNEQ:
14203 fputs ("ueq", file);
14204 break;
14205 case UNGE:
14206 fputs ("nlt", file);
14207 break;
14208 case UNGT:
14209 fputs ("nle", file);
14210 break;
14211 case UNLE:
14212 fputs ("ule", file);
14213 break;
14214 case UNLT:
14215 fputs ("ult", file);
14216 break;
14217 case LTGT:
14218 fputs ("une", file);
14219 break;
14220 default:
14221 output_operand_lossage ("operand is not a condition code, "
14222 "invalid operand code 'Y'");
14223 return;
14224 }
14225 return;
14226
14227 case ';':
14228 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14229 putc (';', file);
14230 #endif
14231 return;
14232
14233 case '@':
14234 if (ASSEMBLER_DIALECT == ASM_ATT)
14235 putc ('%', file);
14236
14237 /* The kernel uses a different segment register for performance
14238 reasons; a system call would not have to trash the userspace
14239 segment register, which would be expensive. */
14240 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14241 fputs ("fs", file);
14242 else
14243 fputs ("gs", file);
14244 return;
14245
14246 case '~':
14247 putc (TARGET_AVX2 ? 'i' : 'f', file);
14248 return;
14249
14250 default:
14251 output_operand_lossage ("invalid operand code '%c'", code);
14252 }
14253 }
14254
14255 if (REG_P (x))
14256 print_reg (x, code, file);
14257
14258 else if (MEM_P (x))
14259 {
14260 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14261 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14262 && GET_MODE (x) != BLKmode)
14263 {
14264 const char * size;
14265 switch (GET_MODE_SIZE (GET_MODE (x)))
14266 {
14267 case 1: size = "BYTE"; break;
14268 case 2: size = "WORD"; break;
14269 case 4: size = "DWORD"; break;
14270 case 8: size = "QWORD"; break;
14271 case 12: size = "TBYTE"; break;
14272 case 16:
14273 if (GET_MODE (x) == XFmode)
14274 size = "TBYTE";
14275 else
14276 size = "XMMWORD";
14277 break;
14278 case 32: size = "YMMWORD"; break;
14279 default:
14280 gcc_unreachable ();
14281 }
14282
14283 /* Check for explicit size override (codes 'b', 'w', 'k',
14284 'q' and 'x') */
14285 if (code == 'b')
14286 size = "BYTE";
14287 else if (code == 'w')
14288 size = "WORD";
14289 else if (code == 'k')
14290 size = "DWORD";
14291 else if (code == 'q')
14292 size = "QWORD";
14293 else if (code == 'x')
14294 size = "XMMWORD";
14295
14296 fputs (size, file);
14297 fputs (" PTR ", file);
14298 }
14299
14300 x = XEXP (x, 0);
14301 /* Avoid (%rip) for call operands. */
14302 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14303 && !CONST_INT_P (x))
14304 output_addr_const (file, x);
14305 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14306 output_operand_lossage ("invalid constraints for operand");
14307 else
14308 output_address (x);
14309 }
14310
14311 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14312 {
14313 REAL_VALUE_TYPE r;
14314 long l;
14315
14316 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14317 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14318
14319 if (ASSEMBLER_DIALECT == ASM_ATT)
14320 putc ('$', file);
14321 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14322 if (code == 'q')
14323 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14324 else
14325 fprintf (file, "0x%08x", (unsigned int) l);
14326 }
14327
14328 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14329 {
14330 REAL_VALUE_TYPE r;
14331 long l[2];
14332
14333 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14334 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14335
14336 if (ASSEMBLER_DIALECT == ASM_ATT)
14337 putc ('$', file);
14338 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14339 }
14340
14341 /* These float cases don't actually occur as immediate operands. */
14342 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14343 {
14344 char dstr[30];
14345
14346 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14347 fputs (dstr, file);
14348 }
14349
14350 else
14351 {
14352 /* We have patterns that allow zero sets of memory, for instance.
14353 In 64-bit mode, we should probably support all 8-byte vectors,
14354 since we can in fact encode that into an immediate. */
14355 if (GET_CODE (x) == CONST_VECTOR)
14356 {
14357 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14358 x = const0_rtx;
14359 }
14360
14361 if (code != 'P' && code != 'p')
14362 {
14363 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14364 {
14365 if (ASSEMBLER_DIALECT == ASM_ATT)
14366 putc ('$', file);
14367 }
14368 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14369 || GET_CODE (x) == LABEL_REF)
14370 {
14371 if (ASSEMBLER_DIALECT == ASM_ATT)
14372 putc ('$', file);
14373 else
14374 fputs ("OFFSET FLAT:", file);
14375 }
14376 }
14377 if (CONST_INT_P (x))
14378 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14379 else if (flag_pic || MACHOPIC_INDIRECT)
14380 output_pic_addr_const (file, x, code);
14381 else
14382 output_addr_const (file, x);
14383 }
14384 }
14385
14386 static bool
14387 ix86_print_operand_punct_valid_p (unsigned char code)
14388 {
14389 return (code == '@' || code == '*' || code == '+'
14390 || code == '&' || code == ';' || code == '~');
14391 }
14392 \f
14393 /* Print a memory operand whose address is ADDR. */
14394
14395 static void
14396 ix86_print_operand_address (FILE *file, rtx addr)
14397 {
14398 struct ix86_address parts;
14399 rtx base, index, disp;
14400 int scale;
14401 int ok;
14402 bool vsib = false;
14403
14404 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14405 {
14406 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14407 gcc_assert (parts.index == NULL_RTX);
14408 parts.index = XVECEXP (addr, 0, 1);
14409 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14410 addr = XVECEXP (addr, 0, 0);
14411 vsib = true;
14412 }
14413 else
14414 ok = ix86_decompose_address (addr, &parts);
14415
14416 gcc_assert (ok);
14417
14418 if (parts.base && GET_CODE (parts.base) == SUBREG)
14419 {
14420 rtx tmp = SUBREG_REG (parts.base);
14421 parts.base = simplify_subreg (GET_MODE (parts.base),
14422 tmp, GET_MODE (tmp), 0);
14423 }
14424
14425 if (parts.index && GET_CODE (parts.index) == SUBREG)
14426 {
14427 rtx tmp = SUBREG_REG (parts.index);
14428 parts.index = simplify_subreg (GET_MODE (parts.index),
14429 tmp, GET_MODE (tmp), 0);
14430 }
14431
14432 base = parts.base;
14433 index = parts.index;
14434 disp = parts.disp;
14435 scale = parts.scale;
14436
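/* As a rough illustration of the output below (operand values are
   hypothetical): an address with base %rax, index %rbx, scale 4 and
   displacement 8 would normally print as "8(%rax,%rbx,4)" in AT&T
   syntax and as "[rax+8+rbx*4]" in Intel syntax, possibly preceded
   by an fs:/gs: segment override.  */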
14437 switch (parts.seg)
14438 {
14439 case SEG_DEFAULT:
14440 break;
14441 case SEG_FS:
14442 case SEG_GS:
14443 if (ASSEMBLER_DIALECT == ASM_ATT)
14444 putc ('%', file);
14445 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14446 break;
14447 default:
14448 gcc_unreachable ();
14449 }
14450
14451 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14452 if (TARGET_64BIT && !base && !index)
14453 {
14454 rtx symbol = disp;
14455
14456 if (GET_CODE (disp) == CONST
14457 && GET_CODE (XEXP (disp, 0)) == PLUS
14458 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14459 symbol = XEXP (XEXP (disp, 0), 0);
14460
14461 if (GET_CODE (symbol) == LABEL_REF
14462 || (GET_CODE (symbol) == SYMBOL_REF
14463 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14464 base = pc_rtx;
14465 }
14466 if (!base && !index)
14467 {
14468 /* A displacement-only address requires special attention. */
14469
14470 if (CONST_INT_P (disp))
14471 {
14472 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14473 fputs ("ds:", file);
14474 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14475 }
14476 else if (flag_pic)
14477 output_pic_addr_const (file, disp, 0);
14478 else
14479 output_addr_const (file, disp);
14480 }
14481 else
14482 {
14483 int code = 0;
14484
14485 /* Print SImode registers for zero-extended addresses to force
14486 addr32 prefix. Otherwise print DImode registers to avoid it. */
14487 if (TARGET_64BIT && GET_MODE (addr) == DImode)
14488 code = ((GET_CODE (addr) == ZERO_EXTEND
14489 || GET_CODE (addr) == AND)
14490 ? 'l'
14491 : 'q');
14492
14493 if (ASSEMBLER_DIALECT == ASM_ATT)
14494 {
14495 if (disp)
14496 {
14497 if (flag_pic)
14498 output_pic_addr_const (file, disp, 0);
14499 else if (GET_CODE (disp) == LABEL_REF)
14500 output_asm_label (disp);
14501 else
14502 output_addr_const (file, disp);
14503 }
14504
14505 putc ('(', file);
14506 if (base)
14507 print_reg (base, code, file);
14508 if (index)
14509 {
14510 putc (',', file);
14511 print_reg (index, vsib ? 0 : code, file);
14512 if (scale != 1 || vsib)
14513 fprintf (file, ",%d", scale);
14514 }
14515 putc (')', file);
14516 }
14517 else
14518 {
14519 rtx offset = NULL_RTX;
14520
14521 if (disp)
14522 {
14523 /* Pull out the offset of a symbol; print any symbol itself. */
14524 if (GET_CODE (disp) == CONST
14525 && GET_CODE (XEXP (disp, 0)) == PLUS
14526 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14527 {
14528 offset = XEXP (XEXP (disp, 0), 1);
14529 disp = gen_rtx_CONST (VOIDmode,
14530 XEXP (XEXP (disp, 0), 0));
14531 }
14532
14533 if (flag_pic)
14534 output_pic_addr_const (file, disp, 0);
14535 else if (GET_CODE (disp) == LABEL_REF)
14536 output_asm_label (disp);
14537 else if (CONST_INT_P (disp))
14538 offset = disp;
14539 else
14540 output_addr_const (file, disp);
14541 }
14542
14543 putc ('[', file);
14544 if (base)
14545 {
14546 print_reg (base, code, file);
14547 if (offset)
14548 {
14549 if (INTVAL (offset) >= 0)
14550 putc ('+', file);
14551 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14552 }
14553 }
14554 else if (offset)
14555 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14556 else
14557 putc ('0', file);
14558
14559 if (index)
14560 {
14561 putc ('+', file);
14562 print_reg (index, vsib ? 0 : code, file);
14563 if (scale != 1 || vsib)
14564 fprintf (file, "*%d", scale);
14565 }
14566 putc (']', file);
14567 }
14568 }
14569 }
14570
14571 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14572
14573 static bool
14574 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14575 {
14576 rtx op;
14577
14578 if (GET_CODE (x) != UNSPEC)
14579 return false;
14580
14581 op = XVECEXP (x, 0, 0);
14582 switch (XINT (x, 1))
14583 {
14584 case UNSPEC_GOTTPOFF:
14585 output_addr_const (file, op);
14586 /* FIXME: This might be @TPOFF in Sun ld. */
14587 fputs ("@gottpoff", file);
14588 break;
14589 case UNSPEC_TPOFF:
14590 output_addr_const (file, op);
14591 fputs ("@tpoff", file);
14592 break;
14593 case UNSPEC_NTPOFF:
14594 output_addr_const (file, op);
14595 if (TARGET_64BIT)
14596 fputs ("@tpoff", file);
14597 else
14598 fputs ("@ntpoff", file);
14599 break;
14600 case UNSPEC_DTPOFF:
14601 output_addr_const (file, op);
14602 fputs ("@dtpoff", file);
14603 break;
14604 case UNSPEC_GOTNTPOFF:
14605 output_addr_const (file, op);
14606 if (TARGET_64BIT)
14607 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14608 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14609 else
14610 fputs ("@gotntpoff", file);
14611 break;
14612 case UNSPEC_INDNTPOFF:
14613 output_addr_const (file, op);
14614 fputs ("@indntpoff", file);
14615 break;
14616 #if TARGET_MACHO
14617 case UNSPEC_MACHOPIC_OFFSET:
14618 output_addr_const (file, op);
14619 putc ('-', file);
14620 machopic_output_function_base_name (file);
14621 break;
14622 #endif
14623
14624 case UNSPEC_STACK_CHECK:
14625 {
14626 int offset;
14627
14628 gcc_assert (flag_split_stack);
14629
14630 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14631 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14632 #else
14633 gcc_unreachable ();
14634 #endif
14635
14636 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14637 }
14638 break;
14639
14640 default:
14641 return false;
14642 }
14643
14644 return true;
14645 }
14646 \f
14647 /* Split one or more double-mode RTL references into pairs of half-mode
14648 references. The RTL can be REG, offsettable MEM, integer constant, or
14649 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14650 split and "num" is its length. lo_half and hi_half are output arrays
14651 that parallel "operands". */
14652
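/* For example (a sketch, not taken from the sources): splitting a
   DImode MEM yields two SImode MEMs at byte offsets 0 and 4,

     operands[0] = (mem:DI addr)
       => lo_half[0] = (mem:SI addr), hi_half[0] = (mem:SI addr+4)

   while REGs and constants go through simplify_gen_subreg instead.  */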
14653 void
14654 split_double_mode (enum machine_mode mode, rtx operands[],
14655 int num, rtx lo_half[], rtx hi_half[])
14656 {
14657 enum machine_mode half_mode;
14658 unsigned int byte;
14659
14660 switch (mode)
14661 {
14662 case TImode:
14663 half_mode = DImode;
14664 break;
14665 case DImode:
14666 half_mode = SImode;
14667 break;
14668 default:
14669 gcc_unreachable ();
14670 }
14671
14672 byte = GET_MODE_SIZE (half_mode);
14673
14674 while (num--)
14675 {
14676 rtx op = operands[num];
14677
14678 /* simplify_subreg refuses to split volatile memory addresses,
14679 but we still have to handle them. */
14680 if (MEM_P (op))
14681 {
14682 lo_half[num] = adjust_address (op, half_mode, 0);
14683 hi_half[num] = adjust_address (op, half_mode, byte);
14684 }
14685 else
14686 {
14687 lo_half[num] = simplify_gen_subreg (half_mode, op,
14688 GET_MODE (op) == VOIDmode
14689 ? mode : GET_MODE (op), 0);
14690 hi_half[num] = simplify_gen_subreg (half_mode, op,
14691 GET_MODE (op) == VOIDmode
14692 ? mode : GET_MODE (op), byte);
14693 }
14694 }
14695 }
14696 \f
14697 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14698 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14699 is the expression of the binary operation. The output may either be
14700 emitted here, or returned to the caller, like all output_* functions.
14701
14702 There is no guarantee that the operands are the same mode, as they
14703 might be within FLOAT or FLOAT_EXTEND expressions. */
14704
14705 #ifndef SYSV386_COMPAT
14706 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14707 wants to fix the assemblers because that causes incompatibility
14708 with gcc. No-one wants to fix gcc because that causes
14709 incompatibility with assemblers... You can use the option of
14710 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14711 #define SYSV386_COMPAT 1
14712 #endif
14713
14714 const char *
14715 output_387_binary_op (rtx insn, rtx *operands)
14716 {
14717 static char buf[40];
14718 const char *p;
14719 const char *ssep;
14720 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14721
14722 #ifdef ENABLE_CHECKING
14723 /* Even if we do not want to check the inputs, this documents input
14724 constraints, which helps in understanding the following code. */
14725 if (STACK_REG_P (operands[0])
14726 && ((REG_P (operands[1])
14727 && REGNO (operands[0]) == REGNO (operands[1])
14728 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14729 || (REG_P (operands[2])
14730 && REGNO (operands[0]) == REGNO (operands[2])
14731 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14732 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14733 ; /* ok */
14734 else
14735 gcc_assert (is_sse);
14736 #endif
14737
14738 switch (GET_CODE (operands[3]))
14739 {
14740 case PLUS:
14741 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14742 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14743 p = "fiadd";
14744 else
14745 p = "fadd";
14746 ssep = "vadd";
14747 break;
14748
14749 case MINUS:
14750 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14751 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14752 p = "fisub";
14753 else
14754 p = "fsub";
14755 ssep = "vsub";
14756 break;
14757
14758 case MULT:
14759 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14760 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14761 p = "fimul";
14762 else
14763 p = "fmul";
14764 ssep = "vmul";
14765 break;
14766
14767 case DIV:
14768 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14769 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14770 p = "fidiv";
14771 else
14772 p = "fdiv";
14773 ssep = "vdiv";
14774 break;
14775
14776 default:
14777 gcc_unreachable ();
14778 }
14779
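/* Illustrative examples of the templates built below (relying on the
   usual %-operand substitution done later by final): for an SFmode
   PLUS, the AVX path yields "vaddss\t{%2, %1, %0|%0, %1, %2}" while
   the non-AVX SSE path yields "addss\t{%2, %0|%0, %2}"; the braces
   select between the AT&T and Intel assembler dialects.  */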
14780 if (is_sse)
14781 {
14782 if (TARGET_AVX)
14783 {
14784 strcpy (buf, ssep);
14785 if (GET_MODE (operands[0]) == SFmode)
14786 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14787 else
14788 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14789 }
14790 else
14791 {
14792 strcpy (buf, ssep + 1);
14793 if (GET_MODE (operands[0]) == SFmode)
14794 strcat (buf, "ss\t{%2, %0|%0, %2}");
14795 else
14796 strcat (buf, "sd\t{%2, %0|%0, %2}");
14797 }
14798 return buf;
14799 }
14800 strcpy (buf, p);
14801
14802 switch (GET_CODE (operands[3]))
14803 {
14804 case MULT:
14805 case PLUS:
14806 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
14807 {
14808 rtx temp = operands[2];
14809 operands[2] = operands[1];
14810 operands[1] = temp;
14811 }
14812
14813 /* We now know that operands[0] == operands[1]. */
14814
14815 if (MEM_P (operands[2]))
14816 {
14817 p = "%Z2\t%2";
14818 break;
14819 }
14820
14821 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14822 {
14823 if (STACK_TOP_P (operands[0]))
14824 /* How is it that we are storing to a dead operand[2]?
14825 Well, presumably operands[1] is dead too. We can't
14826 store the result to st(0) as st(0) gets popped on this
14827 instruction. Instead store to operands[2] (which I
14828 think has to be st(1)). st(1) will be popped later.
14829 gcc <= 2.8.1 didn't have this check and generated
14830 assembly code that the Unixware assembler rejected. */
14831 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14832 else
14833 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14834 break;
14835 }
14836
14837 if (STACK_TOP_P (operands[0]))
14838 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14839 else
14840 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14841 break;
14842
14843 case MINUS:
14844 case DIV:
14845 if (MEM_P (operands[1]))
14846 {
14847 p = "r%Z1\t%1";
14848 break;
14849 }
14850
14851 if (MEM_P (operands[2]))
14852 {
14853 p = "%Z2\t%2";
14854 break;
14855 }
14856
14857 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14858 {
14859 #if SYSV386_COMPAT
14860 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
14861 derived assemblers, confusingly reverse the direction of
14862 the operation for fsub{r} and fdiv{r} when the
14863 destination register is not st(0). The Intel assembler
14864 doesn't have this brain damage. Read !SYSV386_COMPAT to
14865 figure out what the hardware really does. */
14866 if (STACK_TOP_P (operands[0]))
14867 p = "{p\t%0, %2|rp\t%2, %0}";
14868 else
14869 p = "{rp\t%2, %0|p\t%0, %2}";
14870 #else
14871 if (STACK_TOP_P (operands[0]))
14872 /* As above for fmul/fadd, we can't store to st(0). */
14873 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14874 else
14875 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14876 #endif
14877 break;
14878 }
14879
14880 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
14881 {
14882 #if SYSV386_COMPAT
14883 if (STACK_TOP_P (operands[0]))
14884 p = "{rp\t%0, %1|p\t%1, %0}";
14885 else
14886 p = "{p\t%1, %0|rp\t%0, %1}";
14887 #else
14888 if (STACK_TOP_P (operands[0]))
14889 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
14890 else
14891 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
14892 #endif
14893 break;
14894 }
14895
14896 if (STACK_TOP_P (operands[0]))
14897 {
14898 if (STACK_TOP_P (operands[1]))
14899 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14900 else
14901 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
14902 break;
14903 }
14904 else if (STACK_TOP_P (operands[1]))
14905 {
14906 #if SYSV386_COMPAT
14907 p = "{\t%1, %0|r\t%0, %1}";
14908 #else
14909 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
14910 #endif
14911 }
14912 else
14913 {
14914 #if SYSV386_COMPAT
14915 p = "{r\t%2, %0|\t%0, %2}";
14916 #else
14917 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14918 #endif
14919 }
14920 break;
14921
14922 default:
14923 gcc_unreachable ();
14924 }
14925
14926 strcat (buf, p);
14927 return buf;
14928 }
14929
14930 /* Return needed mode for entity in optimize_mode_switching pass. */
14931
14932 int
14933 ix86_mode_needed (int entity, rtx insn)
14934 {
14935 enum attr_i387_cw mode;
14936
14937 /* The mode UNINITIALIZED is used to store control word after a
14938 function call or ASM pattern. The mode ANY specify that function
14939 has no requirements on the control word and make no changes in the
14940 bits we are interested in. */
14941
14942 if (CALL_P (insn)
14943 || (NONJUMP_INSN_P (insn)
14944 && (asm_noperands (PATTERN (insn)) >= 0
14945 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
14946 return I387_CW_UNINITIALIZED;
14947
14948 if (recog_memoized (insn) < 0)
14949 return I387_CW_ANY;
14950
14951 mode = get_attr_i387_cw (insn);
14952
14953 switch (entity)
14954 {
14955 case I387_TRUNC:
14956 if (mode == I387_CW_TRUNC)
14957 return mode;
14958 break;
14959
14960 case I387_FLOOR:
14961 if (mode == I387_CW_FLOOR)
14962 return mode;
14963 break;
14964
14965 case I387_CEIL:
14966 if (mode == I387_CW_CEIL)
14967 return mode;
14968 break;
14969
14970 case I387_MASK_PM:
14971 if (mode == I387_CW_MASK_PM)
14972 return mode;
14973 break;
14974
14975 default:
14976 gcc_unreachable ();
14977 }
14978
14979 return I387_CW_ANY;
14980 }
14981
14982 /* Output code to initialize control word copies used by trunc?f?i and
14983 rounding patterns. The current control word is saved in SLOT_CW_STORED,
14984 and a copy adjusted for MODE is stored in the stack slot for MODE. */
14985
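/* For reference, the x87 control word layout relied on below: bits 11:10
   are the rounding control (00 = to nearest, 01 = down, 10 = up,
   11 = toward zero/truncate) and bit 5 is the precision exception mask,
   hence the 0x0c00, 0x0400, 0x0800 and 0x0020 constants used in this
   function.  */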
14986 void
14987 emit_i387_cw_initialization (int mode)
14988 {
14989 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
14990 rtx new_mode;
14991
14992 enum ix86_stack_slot slot;
14993
14994 rtx reg = gen_reg_rtx (HImode);
14995
14996 emit_insn (gen_x86_fnstcw_1 (stored_mode));
14997 emit_move_insn (reg, copy_rtx (stored_mode));
14998
14999 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15000 || optimize_function_for_size_p (cfun))
15001 {
15002 switch (mode)
15003 {
15004 case I387_CW_TRUNC:
15005 /* round toward zero (truncate) */
15006 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15007 slot = SLOT_CW_TRUNC;
15008 break;
15009
15010 case I387_CW_FLOOR:
15011 /* round down toward -oo */
15012 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15013 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15014 slot = SLOT_CW_FLOOR;
15015 break;
15016
15017 case I387_CW_CEIL:
15018 /* round up toward +oo */
15019 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15020 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15021 slot = SLOT_CW_CEIL;
15022 break;
15023
15024 case I387_CW_MASK_PM:
15025 /* mask precision exception for nearbyint() */
15026 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15027 slot = SLOT_CW_MASK_PM;
15028 break;
15029
15030 default:
15031 gcc_unreachable ();
15032 }
15033 }
15034 else
15035 {
15036 switch (mode)
15037 {
15038 case I387_CW_TRUNC:
15039 /* round toward zero (truncate) */
15040 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15041 slot = SLOT_CW_TRUNC;
15042 break;
15043
15044 case I387_CW_FLOOR:
15045 /* round down toward -oo */
15046 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15047 slot = SLOT_CW_FLOOR;
15048 break;
15049
15050 case I387_CW_CEIL:
15051 /* round up toward +oo */
15052 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15053 slot = SLOT_CW_CEIL;
15054 break;
15055
15056 case I387_CW_MASK_PM:
15057 /* mask precision exception for nearbyint() */
15058 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15059 slot = SLOT_CW_MASK_PM;
15060 break;
15061
15062 default:
15063 gcc_unreachable ();
15064 }
15065 }
15066
15067 gcc_assert (slot < MAX_386_STACK_LOCALS);
15068
15069 new_mode = assign_386_stack_local (HImode, slot);
15070 emit_move_insn (new_mode, reg);
15071 }
15072
15073 /* Output code for INSN to convert a float to a signed int. OPERANDS
15074 are the insn operands. The output may be [HSD]Imode and the input
15075 operand may be [SDX]Fmode. */
15076
15077 const char *
15078 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15079 {
15080 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15081 int dimode_p = GET_MODE (operands[0]) == DImode;
15082 int round_mode = get_attr_i387_cw (insn);
15083
15084 /* Jump through a hoop or two for DImode, since the hardware has no
15085 non-popping instruction. We used to do this a different way, but
15086 that was somewhat fragile and broke with post-reload splitters. */
15087 if ((dimode_p || fisttp) && !stack_top_dies)
15088 output_asm_insn ("fld\t%y1", operands);
15089
15090 gcc_assert (STACK_TOP_P (operands[1]));
15091 gcc_assert (MEM_P (operands[0]));
15092 gcc_assert (GET_MODE (operands[1]) != TFmode);
15093
15094 if (fisttp)
15095 output_asm_insn ("fisttp%Z0\t%0", operands);
15096 else
15097 {
15098 if (round_mode != I387_CW_ANY)
15099 output_asm_insn ("fldcw\t%3", operands);
15100 if (stack_top_dies || dimode_p)
15101 output_asm_insn ("fistp%Z0\t%0", operands);
15102 else
15103 output_asm_insn ("fist%Z0\t%0", operands);
15104 if (round_mode != I387_CW_ANY)
15105 output_asm_insn ("fldcw\t%2", operands);
15106 }
15107
15108 return "";
15109 }
15110
15111 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15112 have the values zero or one, indicates the ffreep insn's operand
15113 from the OPERANDS array. */
15114
15115 static const char *
15116 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15117 {
15118 if (TARGET_USE_FFREEP)
15119 #ifdef HAVE_AS_IX86_FFREEP
15120 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15121 #else
15122 {
15123 static char retval[32];
15124 int regno = REGNO (operands[opno]);
15125
15126 gcc_assert (FP_REGNO_P (regno));
15127
15128 regno -= FIRST_STACK_REG;
15129
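/* ffreep %st(N) encodes as the two bytes 0xDF 0xC0+N.  Emitting the
   16-bit value 0xC?DF with ASM_SHORT stores the low byte (0xDF) first
   on little-endian x86, producing exactly that byte sequence for
   assemblers that lack the ffreep mnemonic.  */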
15130 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15131 return retval;
15132 }
15133 #endif
15134
15135 return opno ? "fstp\t%y1" : "fstp\t%y0";
15136 }
15137
15138
15139 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15140 should be used. UNORDERED_P is true when fucom should be used. */
15141
15142 const char *
15143 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15144 {
15145 int stack_top_dies;
15146 rtx cmp_op0, cmp_op1;
15147 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15148
15149 if (eflags_p)
15150 {
15151 cmp_op0 = operands[0];
15152 cmp_op1 = operands[1];
15153 }
15154 else
15155 {
15156 cmp_op0 = operands[1];
15157 cmp_op1 = operands[2];
15158 }
15159
15160 if (is_sse)
15161 {
15162 if (GET_MODE (operands[0]) == SFmode)
15163 if (unordered_p)
15164 return "%vucomiss\t{%1, %0|%0, %1}";
15165 else
15166 return "%vcomiss\t{%1, %0|%0, %1}";
15167 else
15168 if (unordered_p)
15169 return "%vucomisd\t{%1, %0|%0, %1}";
15170 else
15171 return "%vcomisd\t{%1, %0|%0, %1}";
15172 }
15173
15174 gcc_assert (STACK_TOP_P (cmp_op0));
15175
15176 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15177
15178 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15179 {
15180 if (stack_top_dies)
15181 {
15182 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15183 return output_387_ffreep (operands, 1);
15184 }
15185 else
15186 return "ftst\n\tfnstsw\t%0";
15187 }
15188
15189 if (STACK_REG_P (cmp_op1)
15190 && stack_top_dies
15191 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15192 && REGNO (cmp_op1) != FIRST_STACK_REG)
15193 {
15194 /* If both the top of the 387 stack and the other operand (also a
15195 stack register) die, then this must be a `fcompp' float
15196 compare. */
15197
15198 if (eflags_p)
15199 {
15200 /* There is no double popping fcomi variant. Fortunately,
15201 eflags is immune from the fstp's cc clobbering. */
15202 if (unordered_p)
15203 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15204 else
15205 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15206 return output_387_ffreep (operands, 0);
15207 }
15208 else
15209 {
15210 if (unordered_p)
15211 return "fucompp\n\tfnstsw\t%0";
15212 else
15213 return "fcompp\n\tfnstsw\t%0";
15214 }
15215 }
15216 else
15217 {
15218 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15219
15220 static const char * const alt[16] =
15221 {
15222 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15223 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15224 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15225 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15226
15227 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15228 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15229 NULL,
15230 NULL,
15231
15232 "fcomi\t{%y1, %0|%0, %y1}",
15233 "fcomip\t{%y1, %0|%0, %y1}",
15234 "fucomi\t{%y1, %0|%0, %y1}",
15235 "fucomip\t{%y1, %0|%0, %y1}",
15236
15237 NULL,
15238 NULL,
15239 NULL,
15240 NULL
15241 };
15242
15243 int mask;
15244 const char *ret;
15245
15246 mask = eflags_p << 3;
15247 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15248 mask |= unordered_p << 1;
15249 mask |= stack_top_dies;
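/* For instance, an fcomi-style compare (eflags_p = 1) against a
   stack register (not MODE_INT), ordered (unordered_p = 0), with the
   stack top dying, gives mask = 0b1001 = 9 and selects
   "fcomip\t{%y1, %0|%0, %y1}" from the table above.  */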
15250
15251 gcc_assert (mask < 16);
15252 ret = alt[mask];
15253 gcc_assert (ret);
15254
15255 return ret;
15256 }
15257 }
15258
15259 void
15260 ix86_output_addr_vec_elt (FILE *file, int value)
15261 {
15262 const char *directive = ASM_LONG;
15263
15264 #ifdef ASM_QUAD
15265 if (TARGET_LP64)
15266 directive = ASM_QUAD;
15267 #else
15268 gcc_assert (!TARGET_64BIT);
15269 #endif
15270
15271 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15272 }
15273
15274 void
15275 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15276 {
15277 const char *directive = ASM_LONG;
15278
15279 #ifdef ASM_QUAD
15280 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15281 directive = ASM_QUAD;
15282 #else
15283 gcc_assert (!TARGET_64BIT);
15284 #endif
15285 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15286 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15287 fprintf (file, "%s%s%d-%s%d\n",
15288 directive, LPREFIX, value, LPREFIX, rel);
15289 else if (HAVE_AS_GOTOFF_IN_DATA)
15290 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15291 #if TARGET_MACHO
15292 else if (TARGET_MACHO)
15293 {
15294 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15295 machopic_output_function_base_name (file);
15296 putc ('\n', file);
15297 }
15298 #endif
15299 else
15300 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15301 GOT_SYMBOL_NAME, LPREFIX, value);
15302 }
15303 \f
15304 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15305 for the target. */
15306
15307 void
15308 ix86_expand_clear (rtx dest)
15309 {
15310 rtx tmp;
15311
15312 /* We play register width games, which are only valid after reload. */
15313 gcc_assert (reload_completed);
15314
15315 /* Avoid HImode and its attendant prefix byte. */
15316 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15317 dest = gen_rtx_REG (SImode, REGNO (dest));
15318 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15319
15320 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15321 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15322 {
15323 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15324 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15325 }
15326
15327 emit_insn (tmp);
15328 }
15329
15330 /* X is an unchanging MEM. If it is a constant pool reference, return
15331 the constant pool rtx, else NULL. */
15332
15333 rtx
15334 maybe_get_pool_constant (rtx x)
15335 {
15336 x = ix86_delegitimize_address (XEXP (x, 0));
15337
15338 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15339 return get_pool_constant (x);
15340
15341 return NULL_RTX;
15342 }
15343
15344 void
15345 ix86_expand_move (enum machine_mode mode, rtx operands[])
15346 {
15347 rtx op0, op1;
15348 enum tls_model model;
15349
15350 op0 = operands[0];
15351 op1 = operands[1];
15352
15353 if (GET_CODE (op1) == SYMBOL_REF)
15354 {
15355 model = SYMBOL_REF_TLS_MODEL (op1);
15356 if (model)
15357 {
15358 op1 = legitimize_tls_address (op1, model, true);
15359 op1 = force_operand (op1, op0);
15360 if (op1 == op0)
15361 return;
15362 if (GET_MODE (op1) != mode)
15363 op1 = convert_to_mode (mode, op1, 1);
15364 }
15365 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15366 && SYMBOL_REF_DLLIMPORT_P (op1))
15367 op1 = legitimize_dllimport_symbol (op1, false);
15368 }
15369 else if (GET_CODE (op1) == CONST
15370 && GET_CODE (XEXP (op1, 0)) == PLUS
15371 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15372 {
15373 rtx addend = XEXP (XEXP (op1, 0), 1);
15374 rtx symbol = XEXP (XEXP (op1, 0), 0);
15375 rtx tmp = NULL;
15376
15377 model = SYMBOL_REF_TLS_MODEL (symbol);
15378 if (model)
15379 tmp = legitimize_tls_address (symbol, model, true);
15380 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15381 && SYMBOL_REF_DLLIMPORT_P (symbol))
15382 tmp = legitimize_dllimport_symbol (symbol, true);
15383
15384 if (tmp)
15385 {
15386 tmp = force_operand (tmp, NULL);
15387 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15388 op0, 1, OPTAB_DIRECT);
15389 if (tmp == op0)
15390 return;
15391 if (GET_MODE (tmp) != mode)
15392 op1 = convert_to_mode (mode, tmp, 1);
15393 }
15394 }
15395
15396 if ((flag_pic || MACHOPIC_INDIRECT)
15397 && symbolic_operand (op1, mode))
15398 {
15399 if (TARGET_MACHO && !TARGET_64BIT)
15400 {
15401 #if TARGET_MACHO
15402 /* dynamic-no-pic */
15403 if (MACHOPIC_INDIRECT)
15404 {
15405 rtx temp = ((reload_in_progress
15406 || ((op0 && REG_P (op0))
15407 && mode == Pmode))
15408 ? op0 : gen_reg_rtx (Pmode));
15409 op1 = machopic_indirect_data_reference (op1, temp);
15410 if (MACHOPIC_PURE)
15411 op1 = machopic_legitimize_pic_address (op1, mode,
15412 temp == op1 ? 0 : temp);
15413 }
15414 if (op0 != op1 && GET_CODE (op0) != MEM)
15415 {
15416 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15417 emit_insn (insn);
15418 return;
15419 }
15420 if (GET_CODE (op0) == MEM)
15421 op1 = force_reg (Pmode, op1);
15422 else
15423 {
15424 rtx temp = op0;
15425 if (GET_CODE (temp) != REG)
15426 temp = gen_reg_rtx (Pmode);
15427 temp = legitimize_pic_address (op1, temp);
15428 if (temp == op0)
15429 return;
15430 op1 = temp;
15431 }
15432 /* dynamic-no-pic */
15433 #endif
15434 }
15435 else
15436 {
15437 if (MEM_P (op0))
15438 op1 = force_reg (mode, op1);
15439 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15440 {
15441 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15442 op1 = legitimize_pic_address (op1, reg);
15443 if (op0 == op1)
15444 return;
15445 if (GET_MODE (op1) != mode)
15446 op1 = convert_to_mode (mode, op1, 1);
15447 }
15448 }
15449 }
15450 else
15451 {
15452 if (MEM_P (op0)
15453 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15454 || !push_operand (op0, mode))
15455 && MEM_P (op1))
15456 op1 = force_reg (mode, op1);
15457
15458 if (push_operand (op0, mode)
15459 && ! general_no_elim_operand (op1, mode))
15460 op1 = copy_to_mode_reg (mode, op1);
15461
15462 /* Force large constants in 64bit compilation into register
15463 to get them CSEed. */
15464 if (can_create_pseudo_p ()
15465 && (mode == DImode) && TARGET_64BIT
15466 && immediate_operand (op1, mode)
15467 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15468 && !register_operand (op0, mode)
15469 && optimize)
15470 op1 = copy_to_mode_reg (mode, op1);
15471
15472 if (can_create_pseudo_p ()
15473 && FLOAT_MODE_P (mode)
15474 && GET_CODE (op1) == CONST_DOUBLE)
15475 {
15476 /* If we are loading a floating point constant to a register,
15477 force the value to memory now, since we'll get better code
15478 out the back end. */
15479
15480 op1 = validize_mem (force_const_mem (mode, op1));
15481 if (!register_operand (op0, mode))
15482 {
15483 rtx temp = gen_reg_rtx (mode);
15484 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15485 emit_move_insn (op0, temp);
15486 return;
15487 }
15488 }
15489 }
15490
15491 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15492 }
15493
15494 void
15495 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15496 {
15497 rtx op0 = operands[0], op1 = operands[1];
15498 unsigned int align = GET_MODE_ALIGNMENT (mode);
15499
15500 /* Force constants other than zero into memory. We do not know how
15501 the instructions used to build constants modify the upper 64 bits
15502 of the register; once we have that information we may be able
15503 to handle some of them more efficiently. */
15504 if (can_create_pseudo_p ()
15505 && register_operand (op0, mode)
15506 && (CONSTANT_P (op1)
15507 || (GET_CODE (op1) == SUBREG
15508 && CONSTANT_P (SUBREG_REG (op1))))
15509 && !standard_sse_constant_p (op1))
15510 op1 = validize_mem (force_const_mem (mode, op1));
15511
15512 /* We need to check memory alignment for SSE mode since attribute
15513 can make operands unaligned. */
15514 if (can_create_pseudo_p ()
15515 && SSE_REG_MODE_P (mode)
15516 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15517 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15518 {
15519 rtx tmp[2];
15520
15521 /* ix86_expand_vector_move_misalign() does not like constants ... */
15522 if (CONSTANT_P (op1)
15523 || (GET_CODE (op1) == SUBREG
15524 && CONSTANT_P (SUBREG_REG (op1))))
15525 op1 = validize_mem (force_const_mem (mode, op1));
15526
15527 /* ... nor both arguments in memory. */
15528 if (!register_operand (op0, mode)
15529 && !register_operand (op1, mode))
15530 op1 = force_reg (mode, op1);
15531
15532 tmp[0] = op0; tmp[1] = op1;
15533 ix86_expand_vector_move_misalign (mode, tmp);
15534 return;
15535 }
15536
15537 /* Make operand1 a register if it isn't already. */
15538 if (can_create_pseudo_p ()
15539 && !register_operand (op0, mode)
15540 && !register_operand (op1, mode))
15541 {
15542 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15543 return;
15544 }
15545
15546 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15547 }
15548
15549 /* Split 32-byte AVX unaligned load and store if needed. */
15550
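/* Roughly: when the corresponding TARGET_AVX256_SPLIT_UNALIGNED_* tuning
   flag is set, a misaligned 256-bit load is emitted as two 128-bit loads
   whose results are combined with VEC_CONCAT, and a misaligned store as
   two 128-bit vextractf128 stores; otherwise a single unaligned 256-bit
   move is used.  */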
15551 static void
15552 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15553 {
15554 rtx m;
15555 rtx (*extract) (rtx, rtx, rtx);
15556 rtx (*move_unaligned) (rtx, rtx);
15557 enum machine_mode mode;
15558
15559 switch (GET_MODE (op0))
15560 {
15561 default:
15562 gcc_unreachable ();
15563 case V32QImode:
15564 extract = gen_avx_vextractf128v32qi;
15565 move_unaligned = gen_avx_movdqu256;
15566 mode = V16QImode;
15567 break;
15568 case V8SFmode:
15569 extract = gen_avx_vextractf128v8sf;
15570 move_unaligned = gen_avx_movups256;
15571 mode = V4SFmode;
15572 break;
15573 case V4DFmode:
15574 extract = gen_avx_vextractf128v4df;
15575 move_unaligned = gen_avx_movupd256;
15576 mode = V2DFmode;
15577 break;
15578 }
15579
15580 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15581 {
15582 rtx r = gen_reg_rtx (mode);
15583 m = adjust_address (op1, mode, 0);
15584 emit_move_insn (r, m);
15585 m = adjust_address (op1, mode, 16);
15586 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15587 emit_move_insn (op0, r);
15588 }
15589 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15590 {
15591 m = adjust_address (op0, mode, 0);
15592 emit_insn (extract (m, op1, const0_rtx));
15593 m = adjust_address (op0, mode, 16);
15594 emit_insn (extract (m, op1, const1_rtx));
15595 }
15596 else
15597 emit_insn (move_unaligned (op0, op1));
15598 }
15599
15600 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15601 straight to ix86_expand_vector_move. */
15602 /* Code generation for scalar reg-reg moves of single and double precision data:
15603 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
15604 movaps reg, reg
15605 else
15606 movss reg, reg
15607 if (x86_sse_partial_reg_dependency == true)
15608 movapd reg, reg
15609 else
15610 movsd reg, reg
15611
15612 Code generation for scalar loads of double precision data:
15613 if (x86_sse_split_regs == true)
15614 movlpd mem, reg (gas syntax)
15615 else
15616 movsd mem, reg
15617
15618 Code generation for unaligned packed loads of single precision data
15619 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15620 if (x86_sse_unaligned_move_optimal)
15621 movups mem, reg
15622
15623 if (x86_sse_partial_reg_dependency == true)
15624 {
15625 xorps reg, reg
15626 movlps mem, reg
15627 movhps mem+8, reg
15628 }
15629 else
15630 {
15631 movlps mem, reg
15632 movhps mem+8, reg
15633 }
15634
15635 Code generation for unaligned packed loads of double precision data
15636 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15637 if (x86_sse_unaligned_move_optimal)
15638 movupd mem, reg
15639
15640 if (x86_sse_split_regs == true)
15641 {
15642 movlpd mem, reg
15643 movhpd mem+8, reg
15644 }
15645 else
15646 {
15647 movsd mem, reg
15648 movhpd mem+8, reg
15649 }
15650 */
15651
15652 void
15653 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15654 {
15655 rtx op0, op1, m;
15656
15657 op0 = operands[0];
15658 op1 = operands[1];
15659
15660 if (TARGET_AVX)
15661 {
15662 switch (GET_MODE_CLASS (mode))
15663 {
15664 case MODE_VECTOR_INT:
15665 case MODE_INT:
15666 switch (GET_MODE_SIZE (mode))
15667 {
15668 case 16:
15669 /* If we're optimizing for size, movups is the smallest. */
15670 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15671 {
15672 op0 = gen_lowpart (V4SFmode, op0);
15673 op1 = gen_lowpart (V4SFmode, op1);
15674 emit_insn (gen_sse_movups (op0, op1));
15675 return;
15676 }
15677 op0 = gen_lowpart (V16QImode, op0);
15678 op1 = gen_lowpart (V16QImode, op1);
15679 emit_insn (gen_sse2_movdqu (op0, op1));
15680 break;
15681 case 32:
15682 op0 = gen_lowpart (V32QImode, op0);
15683 op1 = gen_lowpart (V32QImode, op1);
15684 ix86_avx256_split_vector_move_misalign (op0, op1);
15685 break;
15686 default:
15687 gcc_unreachable ();
15688 }
15689 break;
15690 case MODE_VECTOR_FLOAT:
15691 op0 = gen_lowpart (mode, op0);
15692 op1 = gen_lowpart (mode, op1);
15693
15694 switch (mode)
15695 {
15696 case V4SFmode:
15697 emit_insn (gen_sse_movups (op0, op1));
15698 break;
15699 case V8SFmode:
15700 ix86_avx256_split_vector_move_misalign (op0, op1);
15701 break;
15702 case V2DFmode:
15703 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15704 {
15705 op0 = gen_lowpart (V4SFmode, op0);
15706 op1 = gen_lowpart (V4SFmode, op1);
15707 emit_insn (gen_sse_movups (op0, op1));
15708 return;
15709 }
15710 emit_insn (gen_sse2_movupd (op0, op1));
15711 break;
15712 case V4DFmode:
15713 ix86_avx256_split_vector_move_misalign (op0, op1);
15714 break;
15715 default:
15716 gcc_unreachable ();
15717 }
15718 break;
15719
15720 default:
15721 gcc_unreachable ();
15722 }
15723
15724 return;
15725 }
15726
15727 if (MEM_P (op1))
15728 {
15729 /* If we're optimizing for size, movups is the smallest. */
15730 if (optimize_insn_for_size_p ()
15731 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15732 {
15733 op0 = gen_lowpart (V4SFmode, op0);
15734 op1 = gen_lowpart (V4SFmode, op1);
15735 emit_insn (gen_sse_movups (op0, op1));
15736 return;
15737 }
15738
15739 /* ??? If we have typed data, then it would appear that using
15740 movdqu is the only way to get unaligned data loaded with
15741 integer type. */
15742 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15743 {
15744 op0 = gen_lowpart (V16QImode, op0);
15745 op1 = gen_lowpart (V16QImode, op1);
15746 emit_insn (gen_sse2_movdqu (op0, op1));
15747 return;
15748 }
15749
15750 if (TARGET_SSE2 && mode == V2DFmode)
15751 {
15752 rtx zero;
15753
15754 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15755 {
15756 op0 = gen_lowpart (V2DFmode, op0);
15757 op1 = gen_lowpart (V2DFmode, op1);
15758 emit_insn (gen_sse2_movupd (op0, op1));
15759 return;
15760 }
15761
15762 /* When SSE registers are split into halves, we can avoid
15763 writing to the top half twice. */
15764 if (TARGET_SSE_SPLIT_REGS)
15765 {
15766 emit_clobber (op0);
15767 zero = op0;
15768 }
15769 else
15770 {
15771 /* ??? Not sure about the best option for the Intel chips.
15772 The following would seem to satisfy; the register is
15773 entirely cleared, breaking the dependency chain. We
15774 then store to the upper half, with a dependency depth
15775 of one. A rumor has it that Intel recommends two movsd
15776 followed by an unpacklpd, but this is unconfirmed. And
15777 given that the dependency depth of the unpacklpd would
15778 still be one, I'm not sure why this would be better. */
15779 zero = CONST0_RTX (V2DFmode);
15780 }
15781
15782 m = adjust_address (op1, DFmode, 0);
15783 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15784 m = adjust_address (op1, DFmode, 8);
15785 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15786 }
15787 else
15788 {
15789 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15790 {
15791 op0 = gen_lowpart (V4SFmode, op0);
15792 op1 = gen_lowpart (V4SFmode, op1);
15793 emit_insn (gen_sse_movups (op0, op1));
15794 return;
15795 }
15796
15797 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15798 emit_move_insn (op0, CONST0_RTX (mode));
15799 else
15800 emit_clobber (op0);
15801
15802 if (mode != V4SFmode)
15803 op0 = gen_lowpart (V4SFmode, op0);
15804 m = adjust_address (op1, V2SFmode, 0);
15805 emit_insn (gen_sse_loadlps (op0, op0, m));
15806 m = adjust_address (op1, V2SFmode, 8);
15807 emit_insn (gen_sse_loadhps (op0, op0, m));
15808 }
15809 }
15810 else if (MEM_P (op0))
15811 {
15812 /* If we're optimizing for size, movups is the smallest. */
15813 if (optimize_insn_for_size_p ()
15814 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15815 {
15816 op0 = gen_lowpart (V4SFmode, op0);
15817 op1 = gen_lowpart (V4SFmode, op1);
15818 emit_insn (gen_sse_movups (op0, op1));
15819 return;
15820 }
15821
15822 /* ??? Similar to above, only less clear because of quote
15823 typeless stores unquote. */
15824 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
15825 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15826 {
15827 op0 = gen_lowpart (V16QImode, op0);
15828 op1 = gen_lowpart (V16QImode, op1);
15829 emit_insn (gen_sse2_movdqu (op0, op1));
15830 return;
15831 }
15832
15833 if (TARGET_SSE2 && mode == V2DFmode)
15834 {
15835 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15836 {
15837 op0 = gen_lowpart (V2DFmode, op0);
15838 op1 = gen_lowpart (V2DFmode, op1);
15839 emit_insn (gen_sse2_movupd (op0, op1));
15840 }
15841 else
15842 {
15843 m = adjust_address (op0, DFmode, 0);
15844 emit_insn (gen_sse2_storelpd (m, op1));
15845 m = adjust_address (op0, DFmode, 8);
15846 emit_insn (gen_sse2_storehpd (m, op1));
15847 }
15848 }
15849 else
15850 {
15851 if (mode != V4SFmode)
15852 op1 = gen_lowpart (V4SFmode, op1);
15853
15854 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15855 {
15856 op0 = gen_lowpart (V4SFmode, op0);
15857 emit_insn (gen_sse_movups (op0, op1));
15858 }
15859 else
15860 {
15861 m = adjust_address (op0, V2SFmode, 0);
15862 emit_insn (gen_sse_storelps (m, op1));
15863 m = adjust_address (op0, V2SFmode, 8);
15864 emit_insn (gen_sse_storehps (m, op1));
15865 }
15866 }
15867 }
15868 else
15869 gcc_unreachable ();
15870 }
15871
15872 /* Expand a push in MODE. This is some mode for which we do not support
15873 proper push instructions, at least from the registers that we expect
15874 the value to live in. */
15875
15876 void
15877 ix86_expand_push (enum machine_mode mode, rtx x)
15878 {
15879 rtx tmp;
15880
15881 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
15882 GEN_INT (-GET_MODE_SIZE (mode)),
15883 stack_pointer_rtx, 1, OPTAB_DIRECT);
15884 if (tmp != stack_pointer_rtx)
15885 emit_move_insn (stack_pointer_rtx, tmp);
15886
15887 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
15888
15889 /* When we push an operand onto the stack, it has to be aligned at least
15890 at the function argument boundary. However, since we don't have
15891 the argument type, we can't determine the actual argument
15892 boundary. */
15893 emit_move_insn (tmp, x);
15894 }
15895
15896 /* Helper function of ix86_fixup_binary_operands to canonicalize
15897 operand order. Returns true if the operands should be swapped. */
15898
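/* For example (hypothetical operands, for illustration only): for a
   commutative PLUS where operands[0] and operands[2] are the same
   register but operands[1] is a MEM, swapping makes the first source
   match the destination, as the two-address x86 forms require.  */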
15899 static bool
15900 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
15901 rtx operands[])
15902 {
15903 rtx dst = operands[0];
15904 rtx src1 = operands[1];
15905 rtx src2 = operands[2];
15906
15907 /* If the operation is not commutative, we can't do anything. */
15908 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
15909 return false;
15910
15911 /* Highest priority is that src1 should match dst. */
15912 if (rtx_equal_p (dst, src1))
15913 return false;
15914 if (rtx_equal_p (dst, src2))
15915 return true;
15916
15917 /* Next highest priority is that immediate constants come second. */
15918 if (immediate_operand (src2, mode))
15919 return false;
15920 if (immediate_operand (src1, mode))
15921 return true;
15922
15923 /* Lowest priority is that memory references should come second. */
15924 if (MEM_P (src2))
15925 return false;
15926 if (MEM_P (src1))
15927 return true;
15928
15929 return false;
15930 }
15931
15932
15933 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
15934 destination to use for the operation. If different from the true
15935 destination in operands[0], a copy operation will be required. */
15936
15937 rtx
15938 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
15939 rtx operands[])
15940 {
15941 rtx dst = operands[0];
15942 rtx src1 = operands[1];
15943 rtx src2 = operands[2];
15944
15945 /* Canonicalize operand order. */
15946 if (ix86_swap_binary_operands_p (code, mode, operands))
15947 {
15948 rtx temp;
15949
15950 /* It is invalid to swap operands of different modes. */
15951 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
15952
15953 temp = src1;
15954 src1 = src2;
15955 src2 = temp;
15956 }
15957
15958 /* Both source operands cannot be in memory. */
15959 if (MEM_P (src1) && MEM_P (src2))
15960 {
15961 /* Optimization: Only read from memory once. */
15962 if (rtx_equal_p (src1, src2))
15963 {
15964 src2 = force_reg (mode, src2);
15965 src1 = src2;
15966 }
15967 else
15968 src2 = force_reg (mode, src2);
15969 }
15970
15971 /* If the destination is memory, and we do not have matching source
15972 operands, do things in registers. */
15973 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
15974 dst = gen_reg_rtx (mode);
15975
15976 /* Source 1 cannot be a constant. */
15977 if (CONSTANT_P (src1))
15978 src1 = force_reg (mode, src1);
15979
15980 /* Source 1 cannot be a non-matching memory. */
15981 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
15982 src1 = force_reg (mode, src1);
15983
15984 /* Improve address combine. */
15985 if (code == PLUS
15986 && GET_MODE_CLASS (mode) == MODE_INT
15987 && MEM_P (src2))
15988 src2 = force_reg (mode, src2);
15989
15990 operands[1] = src1;
15991 operands[2] = src2;
15992 return dst;
15993 }
15994
15995 /* Similarly, but assume that the destination has already been
15996 set up properly. */
15997
15998 void
15999 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16000 enum machine_mode mode, rtx operands[])
16001 {
16002 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16003 gcc_assert (dst == operands[0]);
16004 }
16005
16006 /* Attempt to expand a binary operator. Make the expansion closer to the
16007 actual machine, than just general_operand, which will allow 3 separate
16008 memory references (one output, two input) in a single insn. */
16009
16010 void
16011 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16012 rtx operands[])
16013 {
16014 rtx src1, src2, dst, op, clob;
16015
16016 dst = ix86_fixup_binary_operands (code, mode, operands);
16017 src1 = operands[1];
16018 src2 = operands[2];
16019
16020 /* Emit the instruction. */
16021
16022 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16023 if (reload_in_progress)
16024 {
16025 /* Reload doesn't know about the flags register, and doesn't know that
16026 it doesn't want to clobber it. We can only do this with PLUS. */
16027 gcc_assert (code == PLUS);
16028 emit_insn (op);
16029 }
16030 else if (reload_completed
16031 && code == PLUS
16032 && !rtx_equal_p (dst, src1))
16033 {
16034 /* This is going to be an LEA; avoid splitting it later. */
16035 emit_insn (op);
16036 }
16037 else
16038 {
16039 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16040 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16041 }
16042
16043 /* Fix up the destination if needed. */
16044 if (dst != operands[0])
16045 emit_move_insn (operands[0], dst);
16046 }
16047
16048 /* Return TRUE or FALSE depending on whether the binary operator meets the
16049 appropriate constraints. */
16050
16051 bool
16052 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16053 rtx operands[3])
16054 {
16055 rtx dst = operands[0];
16056 rtx src1 = operands[1];
16057 rtx src2 = operands[2];
16058
16059 /* Both source operands cannot be in memory. */
16060 if (MEM_P (src1) && MEM_P (src2))
16061 return false;
16062
16063 /* Canonicalize operand order for commutative operators. */
16064 if (ix86_swap_binary_operands_p (code, mode, operands))
16065 {
16066 rtx temp = src1;
16067 src1 = src2;
16068 src2 = temp;
16069 }
16070
16071 /* If the destination is memory, we must have a matching source operand. */
16072 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16073 return false;
16074
16075 /* Source 1 cannot be a constant. */
16076 if (CONSTANT_P (src1))
16077 return false;
16078
16079 /* Source 1 cannot be a non-matching memory. */
16080 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16081 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16082 return (code == AND
16083 && (mode == HImode
16084 || mode == SImode
16085 || (TARGET_64BIT && mode == DImode))
16086 && satisfies_constraint_L (src2));
16087
16088 return true;
16089 }
16090
16091 /* Attempt to expand a unary operator. Make the expansion closer to the
16092 actual machine, than just general_operand, which will allow 2 separate
16093 memory references (one output, one input) in a single insn. */
16094
16095 void
16096 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16097 rtx operands[])
16098 {
16099 int matching_memory;
16100 rtx src, dst, op, clob;
16101
16102 dst = operands[0];
16103 src = operands[1];
16104
16105 /* If the destination is memory, and we do not have matching source
16106 operands, do things in registers. */
16107 matching_memory = 0;
16108 if (MEM_P (dst))
16109 {
16110 if (rtx_equal_p (dst, src))
16111 matching_memory = 1;
16112 else
16113 dst = gen_reg_rtx (mode);
16114 }
16115
16116 /* When source operand is memory, destination must match. */
16117 if (MEM_P (src) && !matching_memory)
16118 src = force_reg (mode, src);
16119
16120 /* Emit the instruction. */
16121
16122 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16123 if (reload_in_progress || code == NOT)
16124 {
16125 /* Reload doesn't know about the flags register, and doesn't know that
16126 it doesn't want to clobber it. */
16127 gcc_assert (code == NOT);
16128 emit_insn (op);
16129 }
16130 else
16131 {
16132 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16133 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16134 }
16135
16136 /* Fix up the destination if needed. */
16137 if (dst != operands[0])
16138 emit_move_insn (operands[0], dst);
16139 }
16140
16141 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16142 divisor are within the range [0-255]. */
16143
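/* A sketch of what this expands to (labels and register names below are
   illustrative only, not taken from the sources):

	scratch = dividend | divisor
	test	scratch, -0x100		; any bit above the low 8 set?
	je	.Lqimode
	<full-width signed/unsigned divmod>
	jmp	.Lend
     .Lqimode:
	<8-bit unsigned divide via AX: quotient in AL, remainder in AH>
     .Lend:  */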
16144 void
16145 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16146 bool signed_p)
16147 {
16148 rtx end_label, qimode_label;
16149 rtx insn, div, mod;
16150 rtx scratch, tmp0, tmp1, tmp2;
16151 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16152 rtx (*gen_zero_extend) (rtx, rtx);
16153 rtx (*gen_test_ccno_1) (rtx, rtx);
16154
16155 switch (mode)
16156 {
16157 case SImode:
16158 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16159 gen_test_ccno_1 = gen_testsi_ccno_1;
16160 gen_zero_extend = gen_zero_extendqisi2;
16161 break;
16162 case DImode:
16163 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16164 gen_test_ccno_1 = gen_testdi_ccno_1;
16165 gen_zero_extend = gen_zero_extendqidi2;
16166 break;
16167 default:
16168 gcc_unreachable ();
16169 }
16170
16171 end_label = gen_label_rtx ();
16172 qimode_label = gen_label_rtx ();
16173
16174 scratch = gen_reg_rtx (mode);
16175
16176 /* Use 8bit unsigned divmod if dividend and divisor are within
16177 the range [0-255]. */
16178 emit_move_insn (scratch, operands[2]);
16179 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16180 scratch, 1, OPTAB_DIRECT);
16181 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16182 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16183 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16184 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16185 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16186 pc_rtx);
16187 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16188 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16189 JUMP_LABEL (insn) = qimode_label;
16190
16191 /* Generate original signed/unsigned divmod. */
16192 div = gen_divmod4_1 (operands[0], operands[1],
16193 operands[2], operands[3]);
16194 emit_insn (div);
16195
16196 /* Branch to the end. */
16197 emit_jump_insn (gen_jump (end_label));
16198 emit_barrier ();
16199
16200 /* Generate 8bit unsigned divide. */
16201 emit_label (qimode_label);
16202 /* Don't use operands[0] for result of 8bit divide since not all
16203 registers support QImode ZERO_EXTRACT. */
16204 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16205 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16206 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16207 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16208
16209 if (signed_p)
16210 {
16211 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16212 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16213 }
16214 else
16215 {
16216 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16217 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16218 }
16219
16220 /* Extract remainder from AH. */
16221 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16222 if (REG_P (operands[1]))
16223 insn = emit_move_insn (operands[1], tmp1);
16224 else
16225 {
16226 /* Need a new scratch register since the old one has result
16227 of 8bit divide. */
16228 scratch = gen_reg_rtx (mode);
16229 emit_move_insn (scratch, tmp1);
16230 insn = emit_move_insn (operands[1], scratch);
16231 }
16232 set_unique_reg_note (insn, REG_EQUAL, mod);
16233
16234 /* Zero extend quotient from AL. */
16235 tmp1 = gen_lowpart (QImode, tmp0);
16236 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16237 set_unique_reg_note (insn, REG_EQUAL, div);
16238
16239 emit_label (end_label);
16240 }
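/* Illustration (a sketch, not taken from the original sources): for SImode
   the expansion above corresponds roughly to the following flow, where
   op0/op1 are the quotient/remainder destinations and op2/op3 are the
   dividend/divisor:

	mov	op2, scratch
	or	op3, scratch
	test	$0xffffff00, scratch	; any bit above bit 7 set?
	je	.Lqimode		; no - both operands are in [0-255]
	<full 32-bit (i)div>		; quotient/remainder into op0/op1
	jmp	.Lend
   .Lqimode:
	<8-bit unsigned divide>		; AL = quotient, AH = remainder
	<op1 = zero-extended AH, op0 = zero-extended AL>
   .Lend:

   Both paths are predicted equally likely (50%) above.  */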
16241
16242 #define LEA_MAX_STALL (3)
16243 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16244
16245 /* Increase given DISTANCE in half-cycles according to
16246 dependencies between PREV and NEXT instructions.
16247 Add 1 half-cycle if there is no dependency and
16248 go to the next cycle if there is a dependency. */
16249
16250 static unsigned int
16251 increase_distance (rtx prev, rtx next, unsigned int distance)
16252 {
16253 df_ref *use_rec;
16254 df_ref *def_rec;
16255
16256 if (!prev || !next)
16257 return distance + (distance & 1) + 2;
16258
16259 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16260 return distance + 1;
16261
16262 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16263 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16264 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16265 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16266 return distance + (distance & 1) + 2;
16267
16268 return distance + 1;
16269 }
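/* Worked example (illustrative, not from the original sources): with
   DISTANCE == 3 half-cycles, a dependency between PREV and NEXT (or a
   missing PREV or NEXT) yields 3 + (3 & 1) + 2 == 6, i.e. the count is
   rounded up to the next full cycle and one extra cycle is added; with
   no dependency the result is simply 3 + 1 == 4.  */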
16270
16271 /* Function checks if instruction INSN defines register number
16272 REGNO1 or REGNO2. */
16273
16274 static bool
16275 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16276 rtx insn)
16277 {
16278 df_ref *def_rec;
16279
16280 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16281 if (DF_REF_REG_DEF_P (*def_rec)
16282 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16283 && (regno1 == DF_REF_REGNO (*def_rec)
16284 || regno2 == DF_REF_REGNO (*def_rec)))
16285 {
16286 return true;
16287 }
16288
16289 return false;
16290 }
16291
16292 /* Function checks if instruction INSN uses register number
16293 REGNO as a part of address expression. */
16294
16295 static bool
16296 insn_uses_reg_mem (unsigned int regno, rtx insn)
16297 {
16298 df_ref *use_rec;
16299
16300 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16301 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16302 return true;
16303
16304 return false;
16305 }
16306
16307 /* Search backward for non-agu definition of register number REGNO1
16308 or register number REGNO2 in basic block starting from instruction
16309 START up to head of basic block or instruction INSN.
16310
16311 Function puts true value into *FOUND var if definition was found
16312 and false otherwise.
16313
16314 Distance in half-cycles between START and found instruction or head
16315 of BB is added to DISTANCE and returned. */
16316
16317 static int
16318 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16319 rtx insn, int distance,
16320 rtx start, bool *found)
16321 {
16322 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16323 rtx prev = start;
16324 rtx next = NULL;
16325
16326 *found = false;
16327
16328 while (prev
16329 && prev != insn
16330 && distance < LEA_SEARCH_THRESHOLD)
16331 {
16332 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16333 {
16334 distance = increase_distance (prev, next, distance);
16335 if (insn_defines_reg (regno1, regno2, prev))
16336 {
16337 if (recog_memoized (prev) < 0
16338 || get_attr_type (prev) != TYPE_LEA)
16339 {
16340 *found = true;
16341 return distance;
16342 }
16343 }
16344
16345 next = prev;
16346 }
16347 if (prev == BB_HEAD (bb))
16348 break;
16349
16350 prev = PREV_INSN (prev);
16351 }
16352
16353 return distance;
16354 }
16355
16356 /* Search backward for non-agu definition of register number REGNO1
16357 or register number REGNO2 in INSN's basic block until
16358 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16359 2. Reach neighbour BBs boundary, or
16360 3. Reach agu definition.
16361 Returns the distance between the non-agu definition point and INSN.
16362 If no definition point, returns -1. */
16363
16364 static int
16365 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16366 rtx insn)
16367 {
16368 basic_block bb = BLOCK_FOR_INSN (insn);
16369 int distance = 0;
16370 bool found = false;
16371
16372 if (insn != BB_HEAD (bb))
16373 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16374 distance, PREV_INSN (insn),
16375 &found);
16376
16377 if (!found && distance < LEA_SEARCH_THRESHOLD)
16378 {
16379 edge e;
16380 edge_iterator ei;
16381 bool simple_loop = false;
16382
16383 FOR_EACH_EDGE (e, ei, bb->preds)
16384 if (e->src == bb)
16385 {
16386 simple_loop = true;
16387 break;
16388 }
16389
16390 if (simple_loop)
16391 distance = distance_non_agu_define_in_bb (regno1, regno2,
16392 insn, distance,
16393 BB_END (bb), &found);
16394 else
16395 {
16396 int shortest_dist = -1;
16397 bool found_in_bb = false;
16398
16399 FOR_EACH_EDGE (e, ei, bb->preds)
16400 {
16401 int bb_dist
16402 = distance_non_agu_define_in_bb (regno1, regno2,
16403 insn, distance,
16404 BB_END (e->src),
16405 &found_in_bb);
16406 if (found_in_bb)
16407 {
16408 if (shortest_dist < 0)
16409 shortest_dist = bb_dist;
16410 else if (bb_dist > 0)
16411 shortest_dist = MIN (bb_dist, shortest_dist);
16412
16413 found = true;
16414 }
16415 }
16416
16417 distance = shortest_dist;
16418 }
16419 }
16420
16421 /* get_attr_type may modify recog data. We want to make sure
16422 that recog data is valid for instruction INSN, on which
16423 distance_non_agu_define is called. INSN is unchanged here. */
16424 extract_insn_cached (insn);
16425
16426 if (!found)
16427 return -1;
16428
16429 return distance >> 1;
16430 }
16431
16432 /* Return the distance in half-cycles between INSN and the next
16433 insn that uses register number REGNO in memory address added
16434 to DISTANCE. Return -1 if REGNO is set.
16435
16436 Put true value into *FOUND if register usage was found and
16437 false otherwise.
16438 Put true value into *REDEFINED if register redefinition was
16439 found and false otherwise. */
16440
16441 static int
16442 distance_agu_use_in_bb (unsigned int regno,
16443 rtx insn, int distance, rtx start,
16444 bool *found, bool *redefined)
16445 {
16446 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16447 rtx next = start;
16448 rtx prev = NULL;
16449
16450 *found = false;
16451 *redefined = false;
16452
16453 while (next
16454 && next != insn
16455 && distance < LEA_SEARCH_THRESHOLD)
16456 {
16457 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16458 {
16459 distance = increase_distance (prev, next, distance);
16460 if (insn_uses_reg_mem (regno, next))
16461 {
16462 /* Return DISTANCE if OP0 is used in memory
16463 address in NEXT. */
16464 *found = true;
16465 return distance;
16466 }
16467
16468 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16469 {
16470 /* Return -1 if OP0 is set in NEXT. */
16471 *redefined = true;
16472 return -1;
16473 }
16474
16475 prev = next;
16476 }
16477
16478 if (next == BB_END (bb))
16479 break;
16480
16481 next = NEXT_INSN (next);
16482 }
16483
16484 return distance;
16485 }
16486
16487 /* Return the distance between INSN and the next insn that uses
16488 register number REGNO0 in a memory address. Return -1 if no such
16489 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
16490
16491 static int
16492 distance_agu_use (unsigned int regno0, rtx insn)
16493 {
16494 basic_block bb = BLOCK_FOR_INSN (insn);
16495 int distance = 0;
16496 bool found = false;
16497 bool redefined = false;
16498
16499 if (insn != BB_END (bb))
16500 distance = distance_agu_use_in_bb (regno0, insn, distance,
16501 NEXT_INSN (insn),
16502 &found, &redefined);
16503
16504 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16505 {
16506 edge e;
16507 edge_iterator ei;
16508 bool simple_loop = false;
16509
16510 FOR_EACH_EDGE (e, ei, bb->succs)
16511 if (e->dest == bb)
16512 {
16513 simple_loop = true;
16514 break;
16515 }
16516
16517 if (simple_loop)
16518 distance = distance_agu_use_in_bb (regno0, insn,
16519 distance, BB_HEAD (bb),
16520 &found, &redefined);
16521 else
16522 {
16523 int shortest_dist = -1;
16524 bool found_in_bb = false;
16525 bool redefined_in_bb = false;
16526
16527 FOR_EACH_EDGE (e, ei, bb->succs)
16528 {
16529 int bb_dist
16530 = distance_agu_use_in_bb (regno0, insn,
16531 distance, BB_HEAD (e->dest),
16532 &found_in_bb, &redefined_in_bb);
16533 if (found_in_bb)
16534 {
16535 if (shortest_dist < 0)
16536 shortest_dist = bb_dist;
16537 else if (bb_dist > 0)
16538 shortest_dist = MIN (bb_dist, shortest_dist);
16539
16540 found = true;
16541 }
16542 }
16543
16544 distance = shortest_dist;
16545 }
16546 }
16547
16548 if (!found || redefined)
16549 return -1;
16550
16551 return distance >> 1;
16552 }
16553
16554 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16555 there is a dilemma of choosing LEA or ADD.
16556 Negative value: ADD is more preferred than LEA
16557 Zero: Neutral
16558 Positive value: LEA is more preferred than ADD. */
16559 #define IX86_LEA_PRIORITY 0
16560
16561 /* Return true if using lea INSN has a performance advantage
16562 over a sequence of instructions. The instruction sequence has
16563 SPLIT_COST cycles higher latency than the lea latency. */
16564
16565 bool
16566 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
16567 unsigned int regno2, unsigned int split_cost)
16568 {
16569 int dist_define, dist_use;
16570
16571 dist_define = distance_non_agu_define (regno1, regno2, insn);
16572 dist_use = distance_agu_use (regno0, insn);
16573
16574 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
16575 {
16576 /* If there is no non-AGU operand definition, no AGU
16577 operand usage and the split cost is 0, then both the lea
16578 and non-lea variants have the same priority. Currently
16579 we prefer lea for 64-bit code and non-lea for 32-bit
16580 code. */
16581 if (dist_use < 0 && split_cost == 0)
16582 return TARGET_64BIT || IX86_LEA_PRIORITY;
16583 else
16584 return true;
16585 }
16586
16587 /* With a longer definition distance, lea is more preferable.
16588 Here we adjust it to take into account the splitting cost and
16589 lea priority. */
16590 dist_define += split_cost + IX86_LEA_PRIORITY;
16591
16592 /* If there is no use in a memory address then we just check
16593 that split cost does not exceed AGU stall. */
16594 if (dist_use < 0)
16595 return dist_define >= LEA_MAX_STALL;
16596
16597 /* If this insn has both backward non-agu dependence and forward
16598 agu dependence, the one with short distance takes effect. */
16599 return dist_define >= dist_use;
16600 }
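/* Worked example (illustrative): with IX86_LEA_PRIORITY == 0, a non-AGU
   definition 1 cycle back (dist_define == 1), an AGU use 3 cycles ahead
   (dist_use == 3) and SPLIT_COST == 1, the adjusted definition distance
   is 1 + 1 + 0 == 2 < 3, so the function returns false and the caller
   will prefer the split sequence over the lea.  Conversely, with
   dist_define == 2, dist_use == 1 and SPLIT_COST == 0 it returns true
   and the lea is kept.  */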
16601
16602 /* Return true if it is legal to clobber flags by INSN and
16603 false otherwise. */
16604
16605 static bool
16606 ix86_ok_to_clobber_flags (rtx insn)
16607 {
16608 basic_block bb = BLOCK_FOR_INSN (insn);
16609 df_ref *use;
16610 bitmap live;
16611
16612 while (insn)
16613 {
16614 if (NONDEBUG_INSN_P (insn))
16615 {
16616 for (use = DF_INSN_USES (insn); *use; use++)
16617 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
16618 return false;
16619
16620 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
16621 return true;
16622 }
16623
16624 if (insn == BB_END (bb))
16625 break;
16626
16627 insn = NEXT_INSN (insn);
16628 }
16629
16630 live = df_get_live_out (bb);
16631 return !REGNO_REG_SET_P (live, FLAGS_REG);
16632 }
16633
16634 /* Return true if we need to split op0 = op1 + op2 into a sequence of
16635 move and add to avoid AGU stalls. */
16636
16637 bool
16638 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
16639 {
16640 unsigned int regno0 = true_regnum (operands[0]);
16641 unsigned int regno1 = true_regnum (operands[1]);
16642 unsigned int regno2 = true_regnum (operands[2]);
16643
16644 /* Check if we need to optimize. */
16645 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16646 return false;
16647
16648 /* Check it is correct to split here. */
16649 if (!ix86_ok_to_clobber_flags (insn))
16650 return false;
16651
16652 /* We need to split only adds with a non-destructive
16653 destination operand. */
16654 if (regno0 == regno1 || regno0 == regno2)
16655 return false;
16656 else
16657 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
16658 }
16659
16660 /* Return true if we should emit lea instruction instead of mov
16661 instruction. */
16662
16663 bool
16664 ix86_use_lea_for_mov (rtx insn, rtx operands[])
16665 {
16666 unsigned int regno0;
16667 unsigned int regno1;
16668
16669 /* Check if we need to optimize. */
16670 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16671 return false;
16672
16673 /* Use lea for reg to reg moves only. */
16674 if (!REG_P (operands[0]) || !REG_P (operands[1]))
16675 return false;
16676
16677 regno0 = true_regnum (operands[0]);
16678 regno1 = true_regnum (operands[1]);
16679
16680 return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
16681 }
16682
16683 /* Return true if we need to split lea into a sequence of
16684 instructions to avoid AGU stalls. */
16685
16686 bool
16687 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
16688 {
16689 unsigned int regno0 = true_regnum (operands[0]);
16690 unsigned int regno1 = -1;
16691 unsigned int regno2 = -1;
16692 unsigned int split_cost = 0;
16693 struct ix86_address parts;
16694 int ok;
16695
16696 /* Check we need to optimize. */
16697 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16698 return false;
16699
16700 /* Check it is correct to split here. */
16701 if (!ix86_ok_to_clobber_flags (insn))
16702 return false;
16703
16704 ok = ix86_decompose_address (operands[1], &parts);
16705 gcc_assert (ok);
16706
16707 /* We should not split into add if a non-legitimate pic
16708 operand is used as the displacement. */
16709 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
16710 return false;
16711
16712 if (parts.base)
16713 regno1 = true_regnum (parts.base);
16714 if (parts.index)
16715 regno2 = true_regnum (parts.index);
16716
16717 /* Compute how many cycles we will add to the execution time
16718 if we split the lea into a sequence of instructions. */
16719 if (parts.base || parts.index)
16720 {
16721 /* Have to use a mov instruction if the non-destructive
16722 destination form is used. */
16723 if (regno1 != regno0 && regno2 != regno0)
16724 split_cost += 1;
16725
16726 /* Have to add index to base if both exist. */
16727 if (parts.base && parts.index)
16728 split_cost += 1;
16729
16730 /* Have to use shift and adds if scale is 2 or greater. */
16731 if (parts.scale > 1)
16732 {
16733 if (regno0 != regno1)
16734 split_cost += 1;
16735 else if (regno2 == regno0)
16736 split_cost += 4;
16737 else
16738 split_cost += parts.scale;
16739 }
16740
16741 /* Have to use add instruction with immediate if
16742 disp is non zero. */
16743 if (parts.disp && parts.disp != const0_rtx)
16744 split_cost += 1;
16745
16746 /* Subtract the price of lea. */
16747 split_cost -= 1;
16748 }
16749
16750 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
16751 }
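/* Worked cost example (illustrative): for a lea computing
   base + index*4 + disp into a register distinct from both base and
   index, the code above accumulates 1 (extra mov) + 1 (add of base and
   index) + 1 (shift for the scale) + 1 (add of the displacement) - 1
   (the lea itself) == 3, and ix86_lea_outperforms is then asked whether
   a 3-cycle-longer ALU sequence still beats the potential AGU stall.  */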
16752
16753 /* Emit the x86 binary operation CODE in mode MODE, where the first operand
16754 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
16755
16756 static void
16757 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
16758 rtx dst, rtx src)
16759 {
16760 rtx op, clob;
16761
16762 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
16763 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16764
16765 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16766 }
16767
16768 /* Split lea instructions into a sequence of instructions
16769 which are executed on the ALU to avoid AGU stalls.
16770 It is assumed that it is allowed to clobber the flags register
16771 at the lea position. */
16772
16773 extern void
16774 ix86_split_lea_for_addr (rtx operands[], enum machine_mode mode)
16775 {
16776 unsigned int regno0 = true_regnum (operands[0]);
16777 unsigned int regno1 = INVALID_REGNUM;
16778 unsigned int regno2 = INVALID_REGNUM;
16779 struct ix86_address parts;
16780 rtx tmp;
16781 int ok, adds;
16782
16783 ok = ix86_decompose_address (operands[1], &parts);
16784 gcc_assert (ok);
16785
16786 if (parts.base)
16787 {
16788 if (GET_MODE (parts.base) != mode)
16789 parts.base = gen_rtx_SUBREG (mode, parts.base, 0);
16790 regno1 = true_regnum (parts.base);
16791 }
16792
16793 if (parts.index)
16794 {
16795 if (GET_MODE (parts.index) != mode)
16796 parts.index = gen_rtx_SUBREG (mode, parts.index, 0);
16797 regno2 = true_regnum (parts.index);
16798 }
16799
16800 if (parts.scale > 1)
16801 {
16802 /* Case r1 = r1 + ... */
16803 if (regno1 == regno0)
16804 {
16805 /* If we have a case r1 = r1 + C * r1 then we
16806 would have to use multiplication, which is very
16807 expensive. Assume the cost model is wrong if we
16808 get such a case here. */
16809 gcc_assert (regno2 != regno0);
16810
16811 for (adds = parts.scale; adds > 0; adds--)
16812 ix86_emit_binop (PLUS, mode, operands[0], parts.index);
16813 }
16814 else
16815 {
16816 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
16817 if (regno0 != regno2)
16818 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16819
16820 /* Use shift for scaling. */
16821 ix86_emit_binop (ASHIFT, mode, operands[0],
16822 GEN_INT (exact_log2 (parts.scale)));
16823
16824 if (parts.base)
16825 ix86_emit_binop (PLUS, mode, operands[0], parts.base);
16826
16827 if (parts.disp && parts.disp != const0_rtx)
16828 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16829 }
16830 }
16831 else if (!parts.base && !parts.index)
16832 {
16833 gcc_assert (parts.disp);
16834 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.disp));
16835 }
16836 else
16837 {
16838 if (!parts.base)
16839 {
16840 if (regno0 != regno2)
16841 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index));
16842 }
16843 else if (!parts.index)
16844 {
16845 if (regno0 != regno1)
16846 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16847 }
16848 else
16849 {
16850 if (regno0 == regno1)
16851 tmp = parts.index;
16852 else if (regno0 == regno2)
16853 tmp = parts.base;
16854 else
16855 {
16856 emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base));
16857 tmp = parts.index;
16858 }
16859
16860 ix86_emit_binop (PLUS, mode, operands[0], tmp);
16861 }
16862
16863 if (parts.disp && parts.disp != const0_rtx)
16864 ix86_emit_binop (PLUS, mode, operands[0], parts.disp);
16865 }
16866 }
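/* Illustration (a sketch, not taken from the original sources): for an
   SImode lea of the form dst = base + index*4 + disp, with dst distinct
   from base and index, the code above emits roughly:

	mov	index, dst
	sal	$2, dst
	add	base, dst
	add	disp, dst

   i.e. four ALU instructions replacing the single lea.  */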
16867
16868 /* Return true if it is ok to optimize an ADD operation to LEA
16869 operation to avoid flag register consumption. For most processors,
16870 ADD is faster than LEA. For processors like ATOM, if the
16871 destination register of LEA holds an actual address which will be
16872 used soon, LEA is better; otherwise ADD is better. */
16873
16874 bool
16875 ix86_lea_for_add_ok (rtx insn, rtx operands[])
16876 {
16877 unsigned int regno0 = true_regnum (operands[0]);
16878 unsigned int regno1 = true_regnum (operands[1]);
16879 unsigned int regno2 = true_regnum (operands[2]);
16880
16881 /* If a = b + c, (a!=b && a!=c), must use lea form. */
16882 if (regno0 != regno1 && regno0 != regno2)
16883 return true;
16884
16885 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16886 return false;
16887
16888 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
16889 }
16890
16891 /* Return true if destination reg of SET_BODY is shift count of
16892 USE_BODY. */
16893
16894 static bool
16895 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
16896 {
16897 rtx set_dest;
16898 rtx shift_rtx;
16899 int i;
16900
16901 /* Retrieve destination of SET_BODY. */
16902 switch (GET_CODE (set_body))
16903 {
16904 case SET:
16905 set_dest = SET_DEST (set_body);
16906 if (!set_dest || !REG_P (set_dest))
16907 return false;
16908 break;
16909 case PARALLEL:
16910 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
16911 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
16912 use_body))
16913 return true;
16914 default:
16915 return false;
16916 break;
16917 }
16918
16919 /* Retrieve shift count of USE_BODY. */
16920 switch (GET_CODE (use_body))
16921 {
16922 case SET:
16923 shift_rtx = XEXP (use_body, 1);
16924 break;
16925 case PARALLEL:
16926 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
16927 if (ix86_dep_by_shift_count_body (set_body,
16928 XVECEXP (use_body, 0, i)))
16929 return true;
16930 default:
16931 return false;
16932 break;
16933 }
16934
16935 if (shift_rtx
16936 && (GET_CODE (shift_rtx) == ASHIFT
16937 || GET_CODE (shift_rtx) == LSHIFTRT
16938 || GET_CODE (shift_rtx) == ASHIFTRT
16939 || GET_CODE (shift_rtx) == ROTATE
16940 || GET_CODE (shift_rtx) == ROTATERT))
16941 {
16942 rtx shift_count = XEXP (shift_rtx, 1);
16943
16944 /* Return true if shift count is dest of SET_BODY. */
16945 if (REG_P (shift_count)
16946 && true_regnum (set_dest) == true_regnum (shift_count))
16947 return true;
16948 }
16949
16950 return false;
16951 }
16952
16953 /* Return true if destination reg of SET_INSN is shift count of
16954 USE_INSN. */
16955
16956 bool
16957 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
16958 {
16959 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
16960 PATTERN (use_insn));
16961 }
16962
16963 /* Return TRUE or FALSE depending on whether the unary operator meets the
16964 appropriate constraints. */
16965
16966 bool
16967 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
16968 enum machine_mode mode ATTRIBUTE_UNUSED,
16969 rtx operands[2] ATTRIBUTE_UNUSED)
16970 {
16971 /* If one of operands is memory, source and destination must match. */
16972 if ((MEM_P (operands[0])
16973 || MEM_P (operands[1]))
16974 && ! rtx_equal_p (operands[0], operands[1]))
16975 return false;
16976 return true;
16977 }
16978
16979 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
16980 are ok, keeping in mind the possible movddup alternative. */
16981
16982 bool
16983 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
16984 {
16985 if (MEM_P (operands[0]))
16986 return rtx_equal_p (operands[0], operands[1 + high]);
16987 if (MEM_P (operands[1]) && MEM_P (operands[2]))
16988 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
16989 return true;
16990 }
16991
16992 /* Post-reload splitter for converting an SF or DFmode value in an
16993 SSE register into an unsigned SImode. */
16994
16995 void
16996 ix86_split_convert_uns_si_sse (rtx operands[])
16997 {
16998 enum machine_mode vecmode;
16999 rtx value, large, zero_or_two31, input, two31, x;
17000
17001 large = operands[1];
17002 zero_or_two31 = operands[2];
17003 input = operands[3];
17004 two31 = operands[4];
17005 vecmode = GET_MODE (large);
17006 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17007
17008 /* Load up the value into the low element. We must ensure that the other
17009 elements are valid floats -- zero is the easiest such value. */
17010 if (MEM_P (input))
17011 {
17012 if (vecmode == V4SFmode)
17013 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17014 else
17015 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17016 }
17017 else
17018 {
17019 input = gen_rtx_REG (vecmode, REGNO (input));
17020 emit_move_insn (value, CONST0_RTX (vecmode));
17021 if (vecmode == V4SFmode)
17022 emit_insn (gen_sse_movss (value, value, input));
17023 else
17024 emit_insn (gen_sse2_movsd (value, value, input));
17025 }
17026
17027 emit_move_insn (large, two31);
17028 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17029
17030 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17031 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17032
17033 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17034 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17035
17036 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17037 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17038
17039 large = gen_rtx_REG (V4SImode, REGNO (large));
17040 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17041
17042 x = gen_rtx_REG (V4SImode, REGNO (value));
17043 if (vecmode == V4SFmode)
17044 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17045 else
17046 emit_insn (gen_sse2_cvttpd2dq (x, value));
17047 value = x;
17048
17049 emit_insn (gen_xorv4si3 (value, value, large));
17050 }
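/* Worked example (illustrative): converting the SFmode value 3e9, which
   does not fit in a signed SImode.  The comparison above produces an
   all-ones mask (3e9 >= 2^31), so 2^31 is subtracted from the value,
   the truncating conversion yields 852516352 (0x32d05e00), the shifted
   mask contributes 0x80000000, and the final xor restores
   0xb2d05e00 == 3000000000.  For inputs below 2^31 the mask is zero and
   the code degenerates to a plain signed truncation.  */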
17051
17052 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17053 Expects the 64-bit DImode to be supplied in a pair of integral
17054 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17055 -mfpmath=sse, !optimize_size only. */
17056
17057 void
17058 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17059 {
17060 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17061 rtx int_xmm, fp_xmm;
17062 rtx biases, exponents;
17063 rtx x;
17064
17065 int_xmm = gen_reg_rtx (V4SImode);
17066 if (TARGET_INTER_UNIT_MOVES)
17067 emit_insn (gen_movdi_to_sse (int_xmm, input));
17068 else if (TARGET_SSE_SPLIT_REGS)
17069 {
17070 emit_clobber (int_xmm);
17071 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17072 }
17073 else
17074 {
17075 x = gen_reg_rtx (V2DImode);
17076 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17077 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17078 }
17079
17080 x = gen_rtx_CONST_VECTOR (V4SImode,
17081 gen_rtvec (4, GEN_INT (0x43300000UL),
17082 GEN_INT (0x45300000UL),
17083 const0_rtx, const0_rtx));
17084 exponents = validize_mem (force_const_mem (V4SImode, x));
17085
17086 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17087 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17088
17089 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17090 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17091 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17092 (0x1.0p84 + double(fp_value_hi_xmm)).
17093 Note these exponents differ by 32. */
17094
17095 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17096
17097 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17098 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17099 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17100 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17101 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17102 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17103 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17104 biases = validize_mem (force_const_mem (V2DFmode, biases));
17105 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17106
17107 /* Add the upper and lower DFmode values together. */
17108 if (TARGET_SSE3)
17109 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17110 else
17111 {
17112 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17113 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17114 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17115 }
17116
17117 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17118 }
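/* Worked example (illustrative): for input 2^32 + 5 the low and high
   halves are 5 and 1.  Pairing them with the exponent words above forms
   the doubles 0x1.0p52 + 5 and 0x1.0p84 + 1*2^32; subtracting the bias
   vector leaves 5.0 and 4294967296.0, and the final add of the two
   halves produces 4294967301.0, the exact unsigned value.  */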
17119
17120 /* Not used, but eases macroization of patterns. */
17121 void
17122 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17123 rtx input ATTRIBUTE_UNUSED)
17124 {
17125 gcc_unreachable ();
17126 }
17127
17128 /* Convert an unsigned SImode value into a DFmode. Only currently used
17129 for SSE, but applicable anywhere. */
17130
17131 void
17132 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17133 {
17134 REAL_VALUE_TYPE TWO31r;
17135 rtx x, fp;
17136
17137 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17138 NULL, 1, OPTAB_DIRECT);
17139
17140 fp = gen_reg_rtx (DFmode);
17141 emit_insn (gen_floatsidf2 (fp, x));
17142
17143 real_ldexp (&TWO31r, &dconst1, 31);
17144 x = const_double_from_real_value (TWO31r, DFmode);
17145
17146 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17147 if (x != target)
17148 emit_move_insn (target, x);
17149 }
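/* Worked example (illustrative): for input 3000000000 (0xb2d05e00) the
   PLUS above flips the sign bit, giving the signed value 852516352;
   floatsidf converts it to 852516352.0, and adding 0x1.0p31 back yields
   3000000000.0.  Inputs below 2^31 become negative after the PLUS, and
   the same addition of 2^31 restores them as well.  */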
17150
17151 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17152 32-bit mode; otherwise we have a direct convert instruction. */
17153
17154 void
17155 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17156 {
17157 REAL_VALUE_TYPE TWO32r;
17158 rtx fp_lo, fp_hi, x;
17159
17160 fp_lo = gen_reg_rtx (DFmode);
17161 fp_hi = gen_reg_rtx (DFmode);
17162
17163 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17164
17165 real_ldexp (&TWO32r, &dconst1, 32);
17166 x = const_double_from_real_value (TWO32r, DFmode);
17167 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17168
17169 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17170
17171 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17172 0, OPTAB_DIRECT);
17173 if (x != target)
17174 emit_move_insn (target, x);
17175 }
17176
17177 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17178 For x86_32, -mfpmath=sse, !optimize_size only. */
17179 void
17180 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17181 {
17182 REAL_VALUE_TYPE ONE16r;
17183 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17184
17185 real_ldexp (&ONE16r, &dconst1, 16);
17186 x = const_double_from_real_value (ONE16r, SFmode);
17187 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17188 NULL, 0, OPTAB_DIRECT);
17189 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17190 NULL, 0, OPTAB_DIRECT);
17191 fp_hi = gen_reg_rtx (SFmode);
17192 fp_lo = gen_reg_rtx (SFmode);
17193 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17194 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17195 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17196 0, OPTAB_DIRECT);
17197 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17198 0, OPTAB_DIRECT);
17199 if (!rtx_equal_p (target, fp_hi))
17200 emit_move_insn (target, fp_hi);
17201 }
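/* Worked example (illustrative): for input 0xdeadbeef the halves are
   0xdead == 57005 and 0xbeef == 48879; both convert to SFmode exactly,
   57005.0 * 65536.0 == 3735879680.0 is still exact, and the final add
   rounds 3735928559 once to the nearest SFmode value, which is the best
   a single-precision result can do.  */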
17202
17203 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17204 a vector of unsigned ints VAL to vector of floats TARGET. */
17205
17206 void
17207 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17208 {
17209 rtx tmp[8];
17210 REAL_VALUE_TYPE TWO16r;
17211 enum machine_mode intmode = GET_MODE (val);
17212 enum machine_mode fltmode = GET_MODE (target);
17213 rtx (*cvt) (rtx, rtx);
17214
17215 if (intmode == V4SImode)
17216 cvt = gen_floatv4siv4sf2;
17217 else
17218 cvt = gen_floatv8siv8sf2;
17219 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17220 tmp[0] = force_reg (intmode, tmp[0]);
17221 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17222 OPTAB_DIRECT);
17223 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17224 NULL_RTX, 1, OPTAB_DIRECT);
17225 tmp[3] = gen_reg_rtx (fltmode);
17226 emit_insn (cvt (tmp[3], tmp[1]));
17227 tmp[4] = gen_reg_rtx (fltmode);
17228 emit_insn (cvt (tmp[4], tmp[2]));
17229 real_ldexp (&TWO16r, &dconst1, 16);
17230 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17231 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17232 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17233 OPTAB_DIRECT);
17234 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17235 OPTAB_DIRECT);
17236 if (tmp[7] != target)
17237 emit_move_insn (target, tmp[7]);
17238 }
17239
17240 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17241 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17242 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17243 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17244
17245 rtx
17246 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17247 {
17248 REAL_VALUE_TYPE TWO31r;
17249 rtx two31r, tmp[4];
17250 enum machine_mode mode = GET_MODE (val);
17251 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17252 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17253 rtx (*cmp) (rtx, rtx, rtx, rtx);
17254 int i;
17255
17256 for (i = 0; i < 3; i++)
17257 tmp[i] = gen_reg_rtx (mode);
17258 real_ldexp (&TWO31r, &dconst1, 31);
17259 two31r = const_double_from_real_value (TWO31r, scalarmode);
17260 two31r = ix86_build_const_vector (mode, 1, two31r);
17261 two31r = force_reg (mode, two31r);
17262 switch (mode)
17263 {
17264 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17265 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17266 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17267 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17268 default: gcc_unreachable ();
17269 }
17270 tmp[3] = gen_rtx_LE (mode, two31r, val);
17271 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17272 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17273 0, OPTAB_DIRECT);
17274 if (intmode == V4SImode || TARGET_AVX2)
17275 *xorp = expand_simple_binop (intmode, ASHIFT,
17276 gen_lowpart (intmode, tmp[0]),
17277 GEN_INT (31), NULL_RTX, 0,
17278 OPTAB_DIRECT);
17279 else
17280 {
17281 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17282 two31 = ix86_build_const_vector (intmode, 1, two31);
17283 *xorp = expand_simple_binop (intmode, AND,
17284 gen_lowpart (intmode, tmp[0]),
17285 two31, NULL_RTX, 0,
17286 OPTAB_DIRECT);
17287 }
17288 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17289 0, OPTAB_DIRECT);
17290 }
17291
17292 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17293 then replicate the value for all elements of the vector
17294 register. */
17295
17296 rtx
17297 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17298 {
17299 int i, n_elt;
17300 rtvec v;
17301 enum machine_mode scalar_mode;
17302
17303 switch (mode)
17304 {
17305 case V32QImode:
17306 case V16QImode:
17307 case V16HImode:
17308 case V8HImode:
17309 case V8SImode:
17310 case V4SImode:
17311 case V4DImode:
17312 case V2DImode:
17313 gcc_assert (vect);
17314 case V8SFmode:
17315 case V4SFmode:
17316 case V4DFmode:
17317 case V2DFmode:
17318 n_elt = GET_MODE_NUNITS (mode);
17319 v = rtvec_alloc (n_elt);
17320 scalar_mode = GET_MODE_INNER (mode);
17321
17322 RTVEC_ELT (v, 0) = value;
17323
17324 for (i = 1; i < n_elt; ++i)
17325 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17326
17327 return gen_rtx_CONST_VECTOR (mode, v);
17328
17329 default:
17330 gcc_unreachable ();
17331 }
17332 }
17333
17334 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17335 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17336 for an SSE register. If VECT is true, then replicate the mask for
17337 all elements of the vector register. If INVERT is true, then create
17338 a mask excluding the sign bit. */
17339
17340 rtx
17341 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17342 {
17343 enum machine_mode vec_mode, imode;
17344 HOST_WIDE_INT hi, lo;
17345 int shift = 63;
17346 rtx v;
17347 rtx mask;
17348
17349 /* Find the sign bit, sign extended to 2*HWI. */
17350 switch (mode)
17351 {
17352 case V8SImode:
17353 case V4SImode:
17354 case V8SFmode:
17355 case V4SFmode:
17356 vec_mode = mode;
17357 mode = GET_MODE_INNER (mode);
17358 imode = SImode;
17359 lo = 0x80000000, hi = lo < 0;
17360 break;
17361
17362 case V4DImode:
17363 case V2DImode:
17364 case V4DFmode:
17365 case V2DFmode:
17366 vec_mode = mode;
17367 mode = GET_MODE_INNER (mode);
17368 imode = DImode;
17369 if (HOST_BITS_PER_WIDE_INT >= 64)
17370 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17371 else
17372 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17373 break;
17374
17375 case TImode:
17376 case TFmode:
17377 vec_mode = VOIDmode;
17378 if (HOST_BITS_PER_WIDE_INT >= 64)
17379 {
17380 imode = TImode;
17381 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17382 }
17383 else
17384 {
17385 rtvec vec;
17386
17387 imode = DImode;
17388 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17389
17390 if (invert)
17391 {
17392 lo = ~lo, hi = ~hi;
17393 v = constm1_rtx;
17394 }
17395 else
17396 v = const0_rtx;
17397
17398 mask = immed_double_const (lo, hi, imode);
17399
17400 vec = gen_rtvec (2, v, mask);
17401 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17402 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17403
17404 return v;
17405 }
17406 break;
17407
17408 default:
17409 gcc_unreachable ();
17410 }
17411
17412 if (invert)
17413 lo = ~lo, hi = ~hi;
17414
17415 /* Force this value into the low part of a fp vector constant. */
17416 mask = immed_double_const (lo, hi, imode);
17417 mask = gen_lowpart (mode, mask);
17418
17419 if (vec_mode == VOIDmode)
17420 return force_reg (mode, mask);
17421
17422 v = ix86_build_const_vector (vec_mode, vect, mask);
17423 return force_reg (vec_mode, v);
17424 }
17425
17426 /* Generate code for floating point ABS or NEG. */
17427
17428 void
17429 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17430 rtx operands[])
17431 {
17432 rtx mask, set, dst, src;
17433 bool use_sse = false;
17434 bool vector_mode = VECTOR_MODE_P (mode);
17435 enum machine_mode vmode = mode;
17436
17437 if (vector_mode)
17438 use_sse = true;
17439 else if (mode == TFmode)
17440 use_sse = true;
17441 else if (TARGET_SSE_MATH)
17442 {
17443 use_sse = SSE_FLOAT_MODE_P (mode);
17444 if (mode == SFmode)
17445 vmode = V4SFmode;
17446 else if (mode == DFmode)
17447 vmode = V2DFmode;
17448 }
17449
17450 /* NEG and ABS performed with SSE use bitwise mask operations.
17451 Create the appropriate mask now. */
17452 if (use_sse)
17453 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17454 else
17455 mask = NULL_RTX;
17456
17457 dst = operands[0];
17458 src = operands[1];
17459
17460 set = gen_rtx_fmt_e (code, mode, src);
17461 set = gen_rtx_SET (VOIDmode, dst, set);
17462
17463 if (mask)
17464 {
17465 rtx use, clob;
17466 rtvec par;
17467
17468 use = gen_rtx_USE (VOIDmode, mask);
17469 if (vector_mode)
17470 par = gen_rtvec (2, set, use);
17471 else
17472 {
17473 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17474 par = gen_rtvec (3, set, use, clob);
17475 }
17476 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
17477 }
17478 else
17479 emit_insn (set);
17480 }
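/* Illustration (a sketch of what the mask is for): with SSE, NEG is
   eventually performed as an XOR with the sign-bit mask and ABS as an
   AND with the inverted mask; e.g. for DFmode the masks are
   0x8000000000000000 and 0x7fffffffffffffff respectively, which is why
   ix86_build_signbit_mask is called with INVERT == (code == ABS)
   above.  */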
17481
17482 /* Expand a copysign operation. Special case operand 0 being a constant. */
17483
17484 void
17485 ix86_expand_copysign (rtx operands[])
17486 {
17487 enum machine_mode mode, vmode;
17488 rtx dest, op0, op1, mask, nmask;
17489
17490 dest = operands[0];
17491 op0 = operands[1];
17492 op1 = operands[2];
17493
17494 mode = GET_MODE (dest);
17495
17496 if (mode == SFmode)
17497 vmode = V4SFmode;
17498 else if (mode == DFmode)
17499 vmode = V2DFmode;
17500 else
17501 vmode = mode;
17502
17503 if (GET_CODE (op0) == CONST_DOUBLE)
17504 {
17505 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
17506
17507 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
17508 op0 = simplify_unary_operation (ABS, mode, op0, mode);
17509
17510 if (mode == SFmode || mode == DFmode)
17511 {
17512 if (op0 == CONST0_RTX (mode))
17513 op0 = CONST0_RTX (vmode);
17514 else
17515 {
17516 rtx v = ix86_build_const_vector (vmode, false, op0);
17517
17518 op0 = force_reg (vmode, v);
17519 }
17520 }
17521 else if (op0 != CONST0_RTX (mode))
17522 op0 = force_reg (mode, op0);
17523
17524 mask = ix86_build_signbit_mask (vmode, 0, 0);
17525
17526 if (mode == SFmode)
17527 copysign_insn = gen_copysignsf3_const;
17528 else if (mode == DFmode)
17529 copysign_insn = gen_copysigndf3_const;
17530 else
17531 copysign_insn = gen_copysigntf3_const;
17532
17533 emit_insn (copysign_insn (dest, op0, op1, mask));
17534 }
17535 else
17536 {
17537 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
17538
17539 nmask = ix86_build_signbit_mask (vmode, 0, 1);
17540 mask = ix86_build_signbit_mask (vmode, 0, 0);
17541
17542 if (mode == SFmode)
17543 copysign_insn = gen_copysignsf3_var;
17544 else if (mode == DFmode)
17545 copysign_insn = gen_copysigndf3_var;
17546 else
17547 copysign_insn = gen_copysigntf3_var;
17548
17549 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
17550 }
17551 }
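/* Illustration (not from the original sources): both the constant and
   the variable cases below implement copysign as bit operations,
   roughly  result = (|op0| & ~signmask) | (op1 & signmask)  with
   signmask the sign-bit mask built above; e.g. copysign (3.0, -0.0)
   keeps the magnitude bits of 3.0 and takes the sign bit from -0.0,
   giving -3.0.  */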
17552
17553 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
17554 be a constant, and so has already been expanded into a vector constant. */
17555
17556 void
17557 ix86_split_copysign_const (rtx operands[])
17558 {
17559 enum machine_mode mode, vmode;
17560 rtx dest, op0, mask, x;
17561
17562 dest = operands[0];
17563 op0 = operands[1];
17564 mask = operands[3];
17565
17566 mode = GET_MODE (dest);
17567 vmode = GET_MODE (mask);
17568
17569 dest = simplify_gen_subreg (vmode, dest, mode, 0);
17570 x = gen_rtx_AND (vmode, dest, mask);
17571 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17572
17573 if (op0 != CONST0_RTX (vmode))
17574 {
17575 x = gen_rtx_IOR (vmode, dest, op0);
17576 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17577 }
17578 }
17579
17580 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
17581 so we have to do two masks. */
17582
17583 void
17584 ix86_split_copysign_var (rtx operands[])
17585 {
17586 enum machine_mode mode, vmode;
17587 rtx dest, scratch, op0, op1, mask, nmask, x;
17588
17589 dest = operands[0];
17590 scratch = operands[1];
17591 op0 = operands[2];
17592 op1 = operands[3];
17593 nmask = operands[4];
17594 mask = operands[5];
17595
17596 mode = GET_MODE (dest);
17597 vmode = GET_MODE (mask);
17598
17599 if (rtx_equal_p (op0, op1))
17600 {
17601 /* Shouldn't happen often (it's useless, obviously), but when it does
17602 we'd generate incorrect code if we continue below. */
17603 emit_move_insn (dest, op0);
17604 return;
17605 }
17606
17607 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
17608 {
17609 gcc_assert (REGNO (op1) == REGNO (scratch));
17610
17611 x = gen_rtx_AND (vmode, scratch, mask);
17612 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17613
17614 dest = mask;
17615 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17616 x = gen_rtx_NOT (vmode, dest);
17617 x = gen_rtx_AND (vmode, x, op0);
17618 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17619 }
17620 else
17621 {
17622 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
17623 {
17624 x = gen_rtx_AND (vmode, scratch, mask);
17625 }
17626 else /* alternative 2,4 */
17627 {
17628 gcc_assert (REGNO (mask) == REGNO (scratch));
17629 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
17630 x = gen_rtx_AND (vmode, scratch, op1);
17631 }
17632 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
17633
17634 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
17635 {
17636 dest = simplify_gen_subreg (vmode, op0, mode, 0);
17637 x = gen_rtx_AND (vmode, dest, nmask);
17638 }
17639 else /* alternative 3,4 */
17640 {
17641 gcc_assert (REGNO (nmask) == REGNO (dest));
17642 dest = nmask;
17643 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
17644 x = gen_rtx_AND (vmode, dest, op0);
17645 }
17646 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17647 }
17648
17649 x = gen_rtx_IOR (vmode, dest, scratch);
17650 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
17651 }
17652
17653 /* Return TRUE or FALSE depending on whether the first SET in INSN
17654 has source and destination with matching CC modes, and that the
17655 CC mode is at least as constrained as REQ_MODE. */
17656
17657 bool
17658 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
17659 {
17660 rtx set;
17661 enum machine_mode set_mode;
17662
17663 set = PATTERN (insn);
17664 if (GET_CODE (set) == PARALLEL)
17665 set = XVECEXP (set, 0, 0);
17666 gcc_assert (GET_CODE (set) == SET);
17667 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
17668
17669 set_mode = GET_MODE (SET_DEST (set));
17670 switch (set_mode)
17671 {
17672 case CCNOmode:
17673 if (req_mode != CCNOmode
17674 && (req_mode != CCmode
17675 || XEXP (SET_SRC (set), 1) != const0_rtx))
17676 return false;
17677 break;
17678 case CCmode:
17679 if (req_mode == CCGCmode)
17680 return false;
17681 /* FALLTHRU */
17682 case CCGCmode:
17683 if (req_mode == CCGOCmode || req_mode == CCNOmode)
17684 return false;
17685 /* FALLTHRU */
17686 case CCGOCmode:
17687 if (req_mode == CCZmode)
17688 return false;
17689 /* FALLTHRU */
17690 case CCZmode:
17691 break;
17692
17693 case CCAmode:
17694 case CCCmode:
17695 case CCOmode:
17696 case CCSmode:
17697 if (set_mode != req_mode)
17698 return false;
17699 break;
17700
17701 default:
17702 gcc_unreachable ();
17703 }
17704
17705 return GET_MODE (SET_SRC (set)) == set_mode;
17706 }
17707
17708 /* Generate insn patterns to do an integer compare of OPERANDS. */
17709
17710 static rtx
17711 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
17712 {
17713 enum machine_mode cmpmode;
17714 rtx tmp, flags;
17715
17716 cmpmode = SELECT_CC_MODE (code, op0, op1);
17717 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
17718
17719 /* This is very simple, but making the interface the same as in the
17720 FP case makes the rest of the code easier. */
17721 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
17722 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
17723
17724 /* Return the test that should be put into the flags user, i.e.
17725 the bcc, scc, or cmov instruction. */
17726 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
17727 }
17728
17729 /* Figure out whether to use ordered or unordered fp comparisons.
17730 Return the appropriate mode to use. */
17731
17732 enum machine_mode
17733 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
17734 {
17735 /* ??? In order to make all comparisons reversible, we do all comparisons
17736 non-trapping when compiling for IEEE. Once gcc is able to distinguish
17737 all forms of trapping and nontrapping comparisons, we can make inequality
17738 comparisons trapping again, since it results in better code when using
17739 FCOM based compares. */
17740 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
17741 }
17742
17743 enum machine_mode
17744 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
17745 {
17746 enum machine_mode mode = GET_MODE (op0);
17747
17748 if (SCALAR_FLOAT_MODE_P (mode))
17749 {
17750 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17751 return ix86_fp_compare_mode (code);
17752 }
17753
17754 switch (code)
17755 {
17756 /* Only zero flag is needed. */
17757 case EQ: /* ZF=0 */
17758 case NE: /* ZF!=0 */
17759 return CCZmode;
17760 /* Codes needing carry flag. */
17761 case GEU: /* CF=0 */
17762 case LTU: /* CF=1 */
17763 /* Detect overflow checks. They need just the carry flag. */
17764 if (GET_CODE (op0) == PLUS
17765 && rtx_equal_p (op1, XEXP (op0, 0)))
17766 return CCCmode;
17767 else
17768 return CCmode;
17769 case GTU: /* CF=0 & ZF=0 */
17770 case LEU: /* CF=1 | ZF=1 */
17771 /* Detect overflow checks. They need just the carry flag. */
17772 if (GET_CODE (op0) == MINUS
17773 && rtx_equal_p (op1, XEXP (op0, 0)))
17774 return CCCmode;
17775 else
17776 return CCmode;
17777 /* Codes possibly doable only with sign flag when
17778 comparing against zero. */
17779 case GE: /* SF=OF or SF=0 */
17780 case LT: /* SF<>OF or SF=1 */
17781 if (op1 == const0_rtx)
17782 return CCGOCmode;
17783 else
17784 /* For other cases Carry flag is not required. */
17785 return CCGCmode;
17786 /* Codes doable only with the sign flag when comparing
17787 against zero, but we miss a jump instruction for it,
17788 so we need to use relational tests against overflow,
17789 which thus needs to be zero. */
17790 case GT: /* ZF=0 & SF=OF */
17791 case LE: /* ZF=1 | SF<>OF */
17792 if (op1 == const0_rtx)
17793 return CCNOmode;
17794 else
17795 return CCGCmode;
17796 /* The strcmp pattern does (use flags), and combine may ask us for the
17797 proper mode. */
17798 case USE:
17799 return CCmode;
17800 default:
17801 gcc_unreachable ();
17802 }
17803 }
17804
17805 /* Return the fixed registers used for condition codes. */
17806
17807 static bool
17808 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
17809 {
17810 *p1 = FLAGS_REG;
17811 *p2 = FPSR_REG;
17812 return true;
17813 }
17814
17815 /* If two condition code modes are compatible, return a condition code
17816 mode which is compatible with both. Otherwise, return
17817 VOIDmode. */
17818
17819 static enum machine_mode
17820 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
17821 {
17822 if (m1 == m2)
17823 return m1;
17824
17825 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
17826 return VOIDmode;
17827
17828 if ((m1 == CCGCmode && m2 == CCGOCmode)
17829 || (m1 == CCGOCmode && m2 == CCGCmode))
17830 return CCGCmode;
17831
17832 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
17833 return m2;
17834 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
17835 return m1;
17836
17837 switch (m1)
17838 {
17839 default:
17840 gcc_unreachable ();
17841
17842 case CCmode:
17843 case CCGCmode:
17844 case CCGOCmode:
17845 case CCNOmode:
17846 case CCAmode:
17847 case CCCmode:
17848 case CCOmode:
17849 case CCSmode:
17850 case CCZmode:
17851 switch (m2)
17852 {
17853 default:
17854 return VOIDmode;
17855
17856 case CCmode:
17857 case CCGCmode:
17858 case CCGOCmode:
17859 case CCNOmode:
17860 case CCAmode:
17861 case CCCmode:
17862 case CCOmode:
17863 case CCSmode:
17864 case CCZmode:
17865 return CCmode;
17866 }
17867
17868 case CCFPmode:
17869 case CCFPUmode:
17870 /* These are only compatible with themselves, which we already
17871 checked above. */
17872 return VOIDmode;
17873 }
17874 }
17875
17876
17877 /* Return a comparison we can do that is equivalent to
17878 swap_condition (code), apart possibly from orderedness.
17879 But never change orderedness if TARGET_IEEE_FP, returning
17880 UNKNOWN in that case if necessary. */
17881
17882 static enum rtx_code
17883 ix86_fp_swap_condition (enum rtx_code code)
17884 {
17885 switch (code)
17886 {
17887 case GT: /* GTU - CF=0 & ZF=0 */
17888 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
17889 case GE: /* GEU - CF=0 */
17890 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
17891 case UNLT: /* LTU - CF=1 */
17892 return TARGET_IEEE_FP ? UNKNOWN : GT;
17893 case UNLE: /* LEU - CF=1 | ZF=1 */
17894 return TARGET_IEEE_FP ? UNKNOWN : GE;
17895 default:
17896 return swap_condition (code);
17897 }
17898 }
17899
17900 /* Return cost of comparison CODE using the best strategy for performance.
17901 All following functions use the number of instructions as a cost metric.
17902 In the future this should be tweaked to compute bytes for optimize_size and
17903 take into account the performance of various instructions on various CPUs. */
17904
17905 static int
17906 ix86_fp_comparison_cost (enum rtx_code code)
17907 {
17908 int arith_cost;
17909
17910 /* The cost of code using bit-twiddling on %ah. */
17911 switch (code)
17912 {
17913 case UNLE:
17914 case UNLT:
17915 case LTGT:
17916 case GT:
17917 case GE:
17918 case UNORDERED:
17919 case ORDERED:
17920 case UNEQ:
17921 arith_cost = 4;
17922 break;
17923 case LT:
17924 case NE:
17925 case EQ:
17926 case UNGE:
17927 arith_cost = TARGET_IEEE_FP ? 5 : 4;
17928 break;
17929 case LE:
17930 case UNGT:
17931 arith_cost = TARGET_IEEE_FP ? 6 : 4;
17932 break;
17933 default:
17934 gcc_unreachable ();
17935 }
17936
17937 switch (ix86_fp_comparison_strategy (code))
17938 {
17939 case IX86_FPCMP_COMI:
17940 return arith_cost > 4 ? 3 : 2;
17941 case IX86_FPCMP_SAHF:
17942 return arith_cost > 4 ? 4 : 3;
17943 default:
17944 return arith_cost;
17945 }
17946 }
17947
17948 /* Return the strategy to use for floating-point comparisons. We assume that
17949 fcomi is always preferable where available, since that is also true when
17950 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
17951
17952 enum ix86_fpcmp_strategy
17953 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
17954 {
17955 /* Do fcomi/sahf based test when profitable. */
17956
17957 if (TARGET_CMOVE)
17958 return IX86_FPCMP_COMI;
17959
17960 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
17961 return IX86_FPCMP_SAHF;
17962
17963 return IX86_FPCMP_ARITH;
17964 }
17965
17966 /* Swap, force into registers, or otherwise massage the two operands
17967 to a fp comparison. The operands are updated in place; the new
17968 comparison code is returned. */
17969
17970 static enum rtx_code
17971 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
17972 {
17973 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
17974 rtx op0 = *pop0, op1 = *pop1;
17975 enum machine_mode op_mode = GET_MODE (op0);
17976 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
17977
17978 /* All of the unordered compare instructions only work on registers.
17979 The same is true of the fcomi compare instructions. The XFmode
17980 compare instructions require registers except when comparing
17981 against zero or when converting operand 1 from fixed point to
17982 floating point. */
17983
17984 if (!is_sse
17985 && (fpcmp_mode == CCFPUmode
17986 || (op_mode == XFmode
17987 && ! (standard_80387_constant_p (op0) == 1
17988 || standard_80387_constant_p (op1) == 1)
17989 && GET_CODE (op1) != FLOAT)
17990 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
17991 {
17992 op0 = force_reg (op_mode, op0);
17993 op1 = force_reg (op_mode, op1);
17994 }
17995 else
17996 {
17997 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
17998 things around if they appear profitable, otherwise force op0
17999 into a register. */
18000
18001 if (standard_80387_constant_p (op0) == 0
18002 || (MEM_P (op0)
18003 && ! (standard_80387_constant_p (op1) == 0
18004 || MEM_P (op1))))
18005 {
18006 enum rtx_code new_code = ix86_fp_swap_condition (code);
18007 if (new_code != UNKNOWN)
18008 {
18009 rtx tmp;
18010 tmp = op0, op0 = op1, op1 = tmp;
18011 code = new_code;
18012 }
18013 }
18014
18015 if (!REG_P (op0))
18016 op0 = force_reg (op_mode, op0);
18017
18018 if (CONSTANT_P (op1))
18019 {
18020 int tmp = standard_80387_constant_p (op1);
18021 if (tmp == 0)
18022 op1 = validize_mem (force_const_mem (op_mode, op1));
18023 else if (tmp == 1)
18024 {
18025 if (TARGET_CMOVE)
18026 op1 = force_reg (op_mode, op1);
18027 }
18028 else
18029 op1 = force_reg (op_mode, op1);
18030 }
18031 }
18032
18033 /* Try to rearrange the comparison to make it cheaper. */
18034 if (ix86_fp_comparison_cost (code)
18035 > ix86_fp_comparison_cost (swap_condition (code))
18036 && (REG_P (op1) || can_create_pseudo_p ()))
18037 {
18038 rtx tmp;
18039 tmp = op0, op0 = op1, op1 = tmp;
18040 code = swap_condition (code);
18041 if (!REG_P (op0))
18042 op0 = force_reg (op_mode, op0);
18043 }
18044
18045 *pop0 = op0;
18046 *pop1 = op1;
18047 return code;
18048 }
18049
18050 /* Convert comparison codes we use to represent FP comparison to integer
18051 code that will result in proper branch. Return UNKNOWN if no such code
18052 is available. */
18053
18054 enum rtx_code
18055 ix86_fp_compare_code_to_integer (enum rtx_code code)
18056 {
18057 switch (code)
18058 {
18059 case GT:
18060 return GTU;
18061 case GE:
18062 return GEU;
18063 case ORDERED:
18064 case UNORDERED:
18065 return code;
18066 break;
18067 case UNEQ:
18068 return EQ;
18069 break;
18070 case UNLT:
18071 return LTU;
18072 break;
18073 case UNLE:
18074 return LEU;
18075 break;
18076 case LTGT:
18077 return NE;
18078 break;
18079 default:
18080 return UNKNOWN;
18081 }
18082 }
18083
18084 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18085
18086 static rtx
18087 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18088 {
18089 enum machine_mode fpcmp_mode, intcmp_mode;
18090 rtx tmp, tmp2;
18091
18092 fpcmp_mode = ix86_fp_compare_mode (code);
18093 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18094
18095 /* Do fcomi/sahf based test when profitable. */
18096 switch (ix86_fp_comparison_strategy (code))
18097 {
18098 case IX86_FPCMP_COMI:
18099 intcmp_mode = fpcmp_mode;
18100 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18101 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18102 tmp);
18103 emit_insn (tmp);
18104 break;
18105
18106 case IX86_FPCMP_SAHF:
18107 intcmp_mode = fpcmp_mode;
18108 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18109 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18110 tmp);
18111
18112 if (!scratch)
18113 scratch = gen_reg_rtx (HImode);
18114 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18115 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18116 break;
18117
18118 case IX86_FPCMP_ARITH:
18119 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18120 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18121 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18122 if (!scratch)
18123 scratch = gen_reg_rtx (HImode);
18124 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18125
18126 /* In the unordered case, we have to check C2 for NaNs, which
18127 doesn't combine into anything convenient.
18128 So do some bit twiddling on the value we've got in AH to come
18129 up with an appropriate set of condition codes. */
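/* Sketch of the status-word layout this relies on (added for reference,
   not from the original comment): after fnstsw, AH bit 0 is C0 (0x01),
   bit 2 is C2 (0x04) and bit 6 is C3 (0x40).  fcom leaves all three
   clear for op0 > op1, sets C0 for op0 < op1, C3 for equality and all
   three for an unordered result, which is why masks such as 0x45, 0x44,
   0x40, 0x05 and 0x04 appear in the tests below.  */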
18130
18131 intcmp_mode = CCNOmode;
18132 switch (code)
18133 {
18134 case GT:
18135 case UNGT:
18136 if (code == GT || !TARGET_IEEE_FP)
18137 {
18138 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18139 code = EQ;
18140 }
18141 else
18142 {
18143 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18144 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18145 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18146 intcmp_mode = CCmode;
18147 code = GEU;
18148 }
18149 break;
18150 case LT:
18151 case UNLT:
18152 if (code == LT && TARGET_IEEE_FP)
18153 {
18154 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18155 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18156 intcmp_mode = CCmode;
18157 code = EQ;
18158 }
18159 else
18160 {
18161 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18162 code = NE;
18163 }
18164 break;
18165 case GE:
18166 case UNGE:
18167 if (code == GE || !TARGET_IEEE_FP)
18168 {
18169 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18170 code = EQ;
18171 }
18172 else
18173 {
18174 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18175 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18176 code = NE;
18177 }
18178 break;
18179 case LE:
18180 case UNLE:
18181 if (code == LE && TARGET_IEEE_FP)
18182 {
18183 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18184 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18185 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18186 intcmp_mode = CCmode;
18187 code = LTU;
18188 }
18189 else
18190 {
18191 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18192 code = NE;
18193 }
18194 break;
18195 case EQ:
18196 case UNEQ:
18197 if (code == EQ && TARGET_IEEE_FP)
18198 {
18199 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18200 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18201 intcmp_mode = CCmode;
18202 code = EQ;
18203 }
18204 else
18205 {
18206 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18207 code = NE;
18208 }
18209 break;
18210 case NE:
18211 case LTGT:
18212 if (code == NE && TARGET_IEEE_FP)
18213 {
18214 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18215 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18216 GEN_INT (0x40)));
18217 code = NE;
18218 }
18219 else
18220 {
18221 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18222 code = EQ;
18223 }
18224 break;
18225
18226 case UNORDERED:
18227 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18228 code = NE;
18229 break;
18230 case ORDERED:
18231 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18232 code = EQ;
18233 break;
18234
18235 default:
18236 gcc_unreachable ();
18237 }
18238 break;
18239
18240 default:
18241 gcc_unreachable ();
18242 }
18243
18244 /* Return the test that should be put into the flags user, i.e.
18245 the bcc, scc, or cmov instruction. */
18246 return gen_rtx_fmt_ee (code, VOIDmode,
18247 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18248 const0_rtx);
18249 }
18250
18251 static rtx
18252 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18253 {
18254 rtx ret;
18255
18256 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18257 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18258
18259 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18260 {
18261 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18262 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18263 }
18264 else
18265 ret = ix86_expand_int_compare (code, op0, op1);
18266
18267 return ret;
18268 }
18269
18270 void
18271 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18272 {
18273 enum machine_mode mode = GET_MODE (op0);
18274 rtx tmp;
18275
18276 switch (mode)
18277 {
18278 case SFmode:
18279 case DFmode:
18280 case XFmode:
18281 case QImode:
18282 case HImode:
18283 case SImode:
18284 simple:
18285 tmp = ix86_expand_compare (code, op0, op1);
18286 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18287 gen_rtx_LABEL_REF (VOIDmode, label),
18288 pc_rtx);
18289 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18290 return;
18291
18292 case DImode:
18293 if (TARGET_64BIT)
18294 goto simple;
18295 case TImode:
18296 /* Expand a double-word branch into multiple compare+branch. */
18297 {
18298 rtx lo[2], hi[2], label2;
18299 enum rtx_code code1, code2, code3;
18300 enum machine_mode submode;
18301
18302 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18303 {
18304 tmp = op0, op0 = op1, op1 = tmp;
18305 code = swap_condition (code);
18306 }
18307
18308 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18309 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18310
18311 submode = mode == DImode ? SImode : DImode;
18312
18313 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18314 avoid two branches. This costs one extra insn, so disable when
18315 optimizing for size. */
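/* Illustrative sketch (not part of the original comment): for double-word
   values A and B,
       A == B   <==>   ((hi(A) ^ hi(B)) | (lo(A) ^ lo(B))) == 0,
   so one IOR of two XORs feeds a single word-mode compare-and-branch.  */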
18316
18317 if ((code == EQ || code == NE)
18318 && (!optimize_insn_for_size_p ()
18319 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18320 {
18321 rtx xor0, xor1;
18322
18323 xor1 = hi[0];
18324 if (hi[1] != const0_rtx)
18325 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18326 NULL_RTX, 0, OPTAB_WIDEN);
18327
18328 xor0 = lo[0];
18329 if (lo[1] != const0_rtx)
18330 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18331 NULL_RTX, 0, OPTAB_WIDEN);
18332
18333 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18334 NULL_RTX, 0, OPTAB_WIDEN);
18335
18336 ix86_expand_branch (code, tmp, const0_rtx, label);
18337 return;
18338 }
18339
18340 /* Otherwise, if we are doing less-than or greater-or-equal-than,
18341 op1 is a constant and its low word is zero, then we can just
18342 examine the high word. Similarly for a low word of -1 and
18343 less-or-equal-than or greater-than. */
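/* Sketch with made-up constants: for a DImode A < 0x500000000 on a 32-bit
   target, lo(op1) == 0 makes the low-word test lo(A) <u 0 impossible, so
   the branch reduces to hi(A) < 5 alone; with lo(op1) == -1 the low-word
   test for LE/GT is always satisfied, with the same effect.  */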
18344
18345 if (CONST_INT_P (hi[1]))
18346 switch (code)
18347 {
18348 case LT: case LTU: case GE: case GEU:
18349 if (lo[1] == const0_rtx)
18350 {
18351 ix86_expand_branch (code, hi[0], hi[1], label);
18352 return;
18353 }
18354 break;
18355 case LE: case LEU: case GT: case GTU:
18356 if (lo[1] == constm1_rtx)
18357 {
18358 ix86_expand_branch (code, hi[0], hi[1], label);
18359 return;
18360 }
18361 break;
18362 default:
18363 break;
18364 }
18365
18366 /* Otherwise, we need two or three jumps. */
18367
18368 label2 = gen_label_rtx ();
18369
18370 code1 = code;
18371 code2 = swap_condition (code);
18372 code3 = unsigned_condition (code);
18373
18374 switch (code)
18375 {
18376 case LT: case GT: case LTU: case GTU:
18377 break;
18378
18379 case LE: code1 = LT; code2 = GT; break;
18380 case GE: code1 = GT; code2 = LT; break;
18381 case LEU: code1 = LTU; code2 = GTU; break;
18382 case GEU: code1 = GTU; code2 = LTU; break;
18383
18384 case EQ: code1 = UNKNOWN; code2 = NE; break;
18385 case NE: code2 = UNKNOWN; break;
18386
18387 default:
18388 gcc_unreachable ();
18389 }
18390
18391 /*
18392 * a < b =>
18393 * if (hi(a) < hi(b)) goto true;
18394 * if (hi(a) > hi(b)) goto false;
18395 * if (lo(a) < lo(b)) goto true;
18396 * false:
18397 */
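/* A concrete instance (sketch only): signed double-word A <= B selects
       code1 = LT, code2 = GT, code3 = LEU, i.e.
       if (hi(A) < hi(B)) goto label;
       if (hi(A) > hi(B)) goto label2;
       if (lo(A) <=u lo(B)) goto label;
     label2:
   matching the code1/code2/code3 selection above.  */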
18398
18399 if (code1 != UNKNOWN)
18400 ix86_expand_branch (code1, hi[0], hi[1], label);
18401 if (code2 != UNKNOWN)
18402 ix86_expand_branch (code2, hi[0], hi[1], label2);
18403
18404 ix86_expand_branch (code3, lo[0], lo[1], label);
18405
18406 if (code2 != UNKNOWN)
18407 emit_label (label2);
18408 return;
18409 }
18410
18411 default:
18412 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18413 goto simple;
18414 }
18415 }
18416
18417 /* Split branch based on floating point condition. */
18418 void
18419 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18420 rtx target1, rtx target2, rtx tmp, rtx pushed)
18421 {
18422 rtx condition;
18423 rtx i;
18424
18425 if (target2 != pc_rtx)
18426 {
18427 rtx tmp = target2;
18428 code = reverse_condition_maybe_unordered (code);
18429 target2 = target1;
18430 target1 = tmp;
18431 }
18432
18433 condition = ix86_expand_fp_compare (code, op1, op2,
18434 tmp);
18435
18436 /* Remove pushed operand from stack. */
18437 if (pushed)
18438 ix86_free_from_memory (GET_MODE (pushed));
18439
18440 i = emit_jump_insn (gen_rtx_SET
18441 (VOIDmode, pc_rtx,
18442 gen_rtx_IF_THEN_ELSE (VOIDmode,
18443 condition, target1, target2)));
18444 if (split_branch_probability >= 0)
18445 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18446 }
18447
18448 void
18449 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18450 {
18451 rtx ret;
18452
18453 gcc_assert (GET_MODE (dest) == QImode);
18454
18455 ret = ix86_expand_compare (code, op0, op1);
18456 PUT_MODE (ret, QImode);
18457 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
18458 }
18459
18460 /* Expand a comparison setting or clearing the carry flag. Return true when
18461 successful and set *POP to the resulting comparison. */
18462 static bool
18463 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
18464 {
18465 enum machine_mode mode =
18466 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
18467
18468 /* Do not handle double-mode compares that go through a special path. */
18469 if (mode == (TARGET_64BIT ? TImode : DImode))
18470 return false;
18471
18472 if (SCALAR_FLOAT_MODE_P (mode))
18473 {
18474 rtx compare_op, compare_seq;
18475
18476 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18477
18478 /* Shortcut: the following common codes never translate
18479 into carry flag compares. */
18480 if (code == EQ || code == NE || code == UNEQ || code == LTGT
18481 || code == ORDERED || code == UNORDERED)
18482 return false;
18483
18484 /* These comparisons require the zero flag; swap the operands so they don't. */
18485 if ((code == GT || code == UNLE || code == LE || code == UNGT)
18486 && !TARGET_IEEE_FP)
18487 {
18488 rtx tmp = op0;
18489 op0 = op1;
18490 op1 = tmp;
18491 code = swap_condition (code);
18492 }
18493
18494 /* Try to expand the comparison and verify that we end up with a
18495 carry-flag-based comparison. This fails only when we decide to
18496 expand the comparison using arithmetic, which is not a common
18497 scenario. */
18498 start_sequence ();
18499 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18500 compare_seq = get_insns ();
18501 end_sequence ();
18502
18503 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
18504 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
18505 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
18506 else
18507 code = GET_CODE (compare_op);
18508
18509 if (code != LTU && code != GEU)
18510 return false;
18511
18512 emit_insn (compare_seq);
18513 *pop = compare_op;
18514 return true;
18515 }
18516
18517 if (!INTEGRAL_MODE_P (mode))
18518 return false;
18519
18520 switch (code)
18521 {
18522 case LTU:
18523 case GEU:
18524 break;
18525
18526 /* Convert a==0 into (unsigned)a<1. */
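/* Sketch of why this works: the unsigned compare a <u 1 sets the carry
   flag exactly when a == 0, so "cmp $1, a ; sbb dest, dest" yields
   dest = (a == 0) ? -1 : 0 without any branch.  */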
18527 case EQ:
18528 case NE:
18529 if (op1 != const0_rtx)
18530 return false;
18531 op1 = const1_rtx;
18532 code = (code == EQ ? LTU : GEU);
18533 break;
18534
18535 /* Convert a>b into b<a or a>=b+1. */
18536 case GTU:
18537 case LEU:
18538 if (CONST_INT_P (op1))
18539 {
18540 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
18541 /* Bail out on overflow. We could still swap the operands, but that
18542 would force loading the constant into a register. */
18543 if (op1 == const0_rtx
18544 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
18545 return false;
18546 code = (code == GTU ? GEU : LTU);
18547 }
18548 else
18549 {
18550 rtx tmp = op1;
18551 op1 = op0;
18552 op0 = tmp;
18553 code = (code == GTU ? LTU : GEU);
18554 }
18555 break;
18556
18557 /* Convert a>=0 into (unsigned)a<0x80000000. */
18558 case LT:
18559 case GE:
18560 if (mode == DImode || op1 != const0_rtx)
18561 return false;
18562 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18563 code = (code == LT ? GEU : LTU);
18564 break;
18565 case LE:
18566 case GT:
18567 if (mode == DImode || op1 != constm1_rtx)
18568 return false;
18569 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
18570 code = (code == LE ? GEU : LTU);
18571 break;
18572
18573 default:
18574 return false;
18575 }
18576 /* Swapping operands may cause a constant to appear as the first operand. */
18577 if (!nonimmediate_operand (op0, VOIDmode))
18578 {
18579 if (!can_create_pseudo_p ())
18580 return false;
18581 op0 = force_reg (mode, op0);
18582 }
18583 *pop = ix86_expand_compare (code, op0, op1);
18584 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
18585 return true;
18586 }
18587
18588 bool
18589 ix86_expand_int_movcc (rtx operands[])
18590 {
18591 enum rtx_code code = GET_CODE (operands[1]), compare_code;
18592 rtx compare_seq, compare_op;
18593 enum machine_mode mode = GET_MODE (operands[0]);
18594 bool sign_bit_compare_p = false;
18595 rtx op0 = XEXP (operands[1], 0);
18596 rtx op1 = XEXP (operands[1], 1);
18597
18598 start_sequence ();
18599 compare_op = ix86_expand_compare (code, op0, op1);
18600 compare_seq = get_insns ();
18601 end_sequence ();
18602
18603 compare_code = GET_CODE (compare_op);
18604
18605 if ((op1 == const0_rtx && (code == GE || code == LT))
18606 || (op1 == constm1_rtx && (code == GT || code == LE)))
18607 sign_bit_compare_p = true;
18608
18609 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
18610 HImode insns, we'd be swallowed in word prefix ops. */
18611
18612 if ((mode != HImode || TARGET_FAST_PREFIX)
18613 && (mode != (TARGET_64BIT ? TImode : DImode))
18614 && CONST_INT_P (operands[2])
18615 && CONST_INT_P (operands[3]))
18616 {
18617 rtx out = operands[0];
18618 HOST_WIDE_INT ct = INTVAL (operands[2]);
18619 HOST_WIDE_INT cf = INTVAL (operands[3]);
18620 HOST_WIDE_INT diff;
18621
18622 diff = ct - cf;
18623 /* Sign bit compares are better done using shifts than by using
18624 sbb. */
18625 if (sign_bit_compare_p
18626 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18627 {
18628 /* Detect overlap between destination and compare sources. */
18629 rtx tmp = out;
18630
18631 if (!sign_bit_compare_p)
18632 {
18633 rtx flags;
18634 bool fpcmp = false;
18635
18636 compare_code = GET_CODE (compare_op);
18637
18638 flags = XEXP (compare_op, 0);
18639
18640 if (GET_MODE (flags) == CCFPmode
18641 || GET_MODE (flags) == CCFPUmode)
18642 {
18643 fpcmp = true;
18644 compare_code
18645 = ix86_fp_compare_code_to_integer (compare_code);
18646 }
18647
18648 /* To simplify the rest of the code, restrict to the GEU case. */
18649 if (compare_code == LTU)
18650 {
18651 HOST_WIDE_INT tmp = ct;
18652 ct = cf;
18653 cf = tmp;
18654 compare_code = reverse_condition (compare_code);
18655 code = reverse_condition (code);
18656 }
18657 else
18658 {
18659 if (fpcmp)
18660 PUT_CODE (compare_op,
18661 reverse_condition_maybe_unordered
18662 (GET_CODE (compare_op)));
18663 else
18664 PUT_CODE (compare_op,
18665 reverse_condition (GET_CODE (compare_op)));
18666 }
18667 diff = ct - cf;
18668
18669 if (reg_overlap_mentioned_p (out, op0)
18670 || reg_overlap_mentioned_p (out, op1))
18671 tmp = gen_reg_rtx (mode);
18672
18673 if (mode == DImode)
18674 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
18675 else
18676 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
18677 flags, compare_op));
18678 }
18679 else
18680 {
18681 if (code == GT || code == GE)
18682 code = reverse_condition (code);
18683 else
18684 {
18685 HOST_WIDE_INT tmp = ct;
18686 ct = cf;
18687 cf = tmp;
18688 diff = ct - cf;
18689 }
18690 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
18691 }
18692
18693 if (diff == 1)
18694 {
18695 /*
18696 * cmpl op0,op1
18697 * sbbl dest,dest
18698 * [addl dest, ct]
18699 *
18700 * Size 5 - 8.
18701 */
18702 if (ct)
18703 tmp = expand_simple_binop (mode, PLUS,
18704 tmp, GEN_INT (ct),
18705 copy_rtx (tmp), 1, OPTAB_DIRECT);
18706 }
18707 else if (cf == -1)
18708 {
18709 /*
18710 * cmpl op0,op1
18711 * sbbl dest,dest
18712 * orl $ct, dest
18713 *
18714 * Size 8.
18715 */
18716 tmp = expand_simple_binop (mode, IOR,
18717 tmp, GEN_INT (ct),
18718 copy_rtx (tmp), 1, OPTAB_DIRECT);
18719 }
18720 else if (diff == -1 && ct)
18721 {
18722 /*
18723 * cmpl op0,op1
18724 * sbbl dest,dest
18725 * notl dest
18726 * [addl dest, cf]
18727 *
18728 * Size 8 - 11.
18729 */
18730 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18731 if (cf)
18732 tmp = expand_simple_binop (mode, PLUS,
18733 copy_rtx (tmp), GEN_INT (cf),
18734 copy_rtx (tmp), 1, OPTAB_DIRECT);
18735 }
18736 else
18737 {
18738 /*
18739 * cmpl op0,op1
18740 * sbbl dest,dest
18741 * [notl dest]
18742 * andl cf - ct, dest
18743 * [addl dest, ct]
18744 *
18745 * Size 8 - 11.
18746 */
18747
18748 if (cf == 0)
18749 {
18750 cf = ct;
18751 ct = 0;
18752 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
18753 }
18754
18755 tmp = expand_simple_binop (mode, AND,
18756 copy_rtx (tmp),
18757 gen_int_mode (cf - ct, mode),
18758 copy_rtx (tmp), 1, OPTAB_DIRECT);
18759 if (ct)
18760 tmp = expand_simple_binop (mode, PLUS,
18761 copy_rtx (tmp), GEN_INT (ct),
18762 copy_rtx (tmp), 1, OPTAB_DIRECT);
18763 }
18764
18765 if (!rtx_equal_p (tmp, out))
18766 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
18767
18768 return true;
18769 }
18770
18771 if (diff < 0)
18772 {
18773 enum machine_mode cmp_mode = GET_MODE (op0);
18774
18775 HOST_WIDE_INT tmp;
18776 tmp = ct, ct = cf, cf = tmp;
18777 diff = -diff;
18778
18779 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18780 {
18781 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18782
18783 /* We may be reversing an unordered compare to a normal compare, which
18784 is not valid in general (we may convert a non-trapping condition
18785 into a trapping one); however, on i386 we currently emit all
18786 comparisons unordered. */
18787 compare_code = reverse_condition_maybe_unordered (compare_code);
18788 code = reverse_condition_maybe_unordered (code);
18789 }
18790 else
18791 {
18792 compare_code = reverse_condition (compare_code);
18793 code = reverse_condition (code);
18794 }
18795 }
18796
18797 compare_code = UNKNOWN;
18798 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
18799 && CONST_INT_P (op1))
18800 {
18801 if (op1 == const0_rtx
18802 && (code == LT || code == GE))
18803 compare_code = code;
18804 else if (op1 == constm1_rtx)
18805 {
18806 if (code == LE)
18807 compare_code = LT;
18808 else if (code == GT)
18809 compare_code = GE;
18810 }
18811 }
18812
18813 /* Optimize dest = (op0 < 0) ? -1 : cf. */
18814 if (compare_code != UNKNOWN
18815 && GET_MODE (op0) == GET_MODE (out)
18816 && (cf == -1 || ct == -1))
18817 {
18818 /* If the lea code below could be used, only optimize
18819 if it results in a 2-insn sequence. */
18820
18821 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
18822 || diff == 3 || diff == 5 || diff == 9)
18823 || (compare_code == LT && ct == -1)
18824 || (compare_code == GE && cf == -1))
18825 {
18826 /*
18827 * notl op1 (if necessary)
18828 * sarl $31, op1
18829 * orl cf, op1
18830 */
18831 if (ct != -1)
18832 {
18833 cf = ct;
18834 ct = -1;
18835 code = reverse_condition (code);
18836 }
18837
18838 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18839
18840 out = expand_simple_binop (mode, IOR,
18841 out, GEN_INT (cf),
18842 out, 1, OPTAB_DIRECT);
18843 if (out != operands[0])
18844 emit_move_insn (operands[0], out);
18845
18846 return true;
18847 }
18848 }
18849
18850
18851 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
18852 || diff == 3 || diff == 5 || diff == 9)
18853 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
18854 && (mode != DImode
18855 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
18856 {
18857 /*
18858 * xorl dest,dest
18859 * cmpl op1,op2
18860 * setcc dest
18861 * lea cf(dest*(ct-cf)),dest
18862 *
18863 * Size 14.
18864 *
18865 * This also catches the degenerate setcc-only case.
18866 */
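/* Illustrative example with made-up constants: for ct = 7, cf = 2
   (diff = 5) the tail of the sequence is a single
       leal 2(%eax,%eax,4), %eax
   i.e. dest = cf + cond * (ct - cf).  */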
18867
18868 rtx tmp;
18869 int nops;
18870
18871 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18872
18873 nops = 0;
18874 /* On x86_64 the lea instruction operates on Pmode, so we need
18875 to do the arithmetic in the proper mode to match. */
18876 if (diff == 1)
18877 tmp = copy_rtx (out);
18878 else
18879 {
18880 rtx out1;
18881 out1 = copy_rtx (out);
18882 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
18883 nops++;
18884 if (diff & 1)
18885 {
18886 tmp = gen_rtx_PLUS (mode, tmp, out1);
18887 nops++;
18888 }
18889 }
18890 if (cf != 0)
18891 {
18892 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
18893 nops++;
18894 }
18895 if (!rtx_equal_p (tmp, out))
18896 {
18897 if (nops == 1)
18898 out = force_operand (tmp, copy_rtx (out));
18899 else
18900 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
18901 }
18902 if (!rtx_equal_p (out, operands[0]))
18903 emit_move_insn (operands[0], copy_rtx (out));
18904
18905 return true;
18906 }
18907
18908 /*
18909 * General case: Jumpful:
18910 * xorl dest,dest cmpl op1, op2
18911 * cmpl op1, op2 movl ct, dest
18912 * setcc dest jcc 1f
18913 * decl dest movl cf, dest
18914 * andl (cf-ct),dest 1:
18915 * addl ct,dest
18916 *
18917 * Size 20. Size 14.
18918 *
18919 * This is reasonably steep, but branch mispredict costs are
18920 * high on modern cpus, so consider failing only if optimizing
18921 * for space.
18922 */
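/* Sketch of what the branch-free sequence computes (illustrative C):
       t = cond ? 1 : 0;      setcc
       t = t - 1;             decl: 1 -> 0, 0 -> -1
       t = t & (cf - ct);     keep the difference only when cond is false
       t = t + ct;            t == cond ? ct : cf
   so dest gets ct or cf with no branch.  */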
18923
18924 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18925 && BRANCH_COST (optimize_insn_for_speed_p (),
18926 false) >= 2)
18927 {
18928 if (cf == 0)
18929 {
18930 enum machine_mode cmp_mode = GET_MODE (op0);
18931
18932 cf = ct;
18933 ct = 0;
18934
18935 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18936 {
18937 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18938
18939 /* We may be reversing an unordered compare to a normal compare,
18940 which is not valid in general (we may convert a non-trapping
18941 condition into a trapping one); however, on i386 we currently
18942 emit all comparisons unordered. */
18943 code = reverse_condition_maybe_unordered (code);
18944 }
18945 else
18946 {
18947 code = reverse_condition (code);
18948 if (compare_code != UNKNOWN)
18949 compare_code = reverse_condition (compare_code);
18950 }
18951 }
18952
18953 if (compare_code != UNKNOWN)
18954 {
18955 /* notl op1 (if needed)
18956 sarl $31, op1
18957 andl (cf-ct), op1
18958 addl ct, op1
18959
18960 For x < 0 (resp. x <= -1) there will be no notl,
18961 so if possible swap the constants to get rid of the
18962 complement.
18963 True/false will be -1/0 while code below (store flag
18964 followed by decrement) is 0/-1, so the constants need
18965 to be exchanged once more. */
18966
18967 if (compare_code == GE || !cf)
18968 {
18969 code = reverse_condition (code);
18970 compare_code = LT;
18971 }
18972 else
18973 {
18974 HOST_WIDE_INT tmp = cf;
18975 cf = ct;
18976 ct = tmp;
18977 }
18978
18979 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18980 }
18981 else
18982 {
18983 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18984
18985 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
18986 constm1_rtx,
18987 copy_rtx (out), 1, OPTAB_DIRECT);
18988 }
18989
18990 out = expand_simple_binop (mode, AND, copy_rtx (out),
18991 gen_int_mode (cf - ct, mode),
18992 copy_rtx (out), 1, OPTAB_DIRECT);
18993 if (ct)
18994 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
18995 copy_rtx (out), 1, OPTAB_DIRECT);
18996 if (!rtx_equal_p (out, operands[0]))
18997 emit_move_insn (operands[0], copy_rtx (out));
18998
18999 return true;
19000 }
19001 }
19002
19003 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19004 {
19005 /* Try a few things more with specific constants and a variable. */
19006
19007 optab op;
19008 rtx var, orig_out, out, tmp;
19009
19010 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19011 return false;
19012
19013 /* If one of the two operands is an interesting constant, load a
19014 constant with the code above and mask the variable in with a logical operation. */
19015
19016 if (CONST_INT_P (operands[2]))
19017 {
19018 var = operands[3];
19019 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19020 operands[3] = constm1_rtx, op = and_optab;
19021 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19022 operands[3] = const0_rtx, op = ior_optab;
19023 else
19024 return false;
19025 }
19026 else if (CONST_INT_P (operands[3]))
19027 {
19028 var = operands[2];
19029 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19030 operands[2] = constm1_rtx, op = and_optab;
19031 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19032 operands[2] = const0_rtx, op = ior_optab;
19033 else
19034 return false;
19035 }
19036 else
19037 return false;
19038
19039 orig_out = operands[0];
19040 tmp = gen_reg_rtx (mode);
19041 operands[0] = tmp;
19042
19043 /* Recurse to get the constant loaded. */
19044 if (ix86_expand_int_movcc (operands) == 0)
19045 return false;
19046
19047 /* Mask in the interesting variable. */
19048 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19049 OPTAB_WIDEN);
19050 if (!rtx_equal_p (out, orig_out))
19051 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19052
19053 return true;
19054 }
19055
19056 /*
19057 * For comparison with above,
19058 *
19059 * movl cf,dest
19060 * movl ct,tmp
19061 * cmpl op1,op2
19062 * cmovcc tmp,dest
19063 *
19064 * Size 15.
19065 */
19066
19067 if (! nonimmediate_operand (operands[2], mode))
19068 operands[2] = force_reg (mode, operands[2]);
19069 if (! nonimmediate_operand (operands[3], mode))
19070 operands[3] = force_reg (mode, operands[3]);
19071
19072 if (! register_operand (operands[2], VOIDmode)
19073 && (mode == QImode
19074 || ! register_operand (operands[3], VOIDmode)))
19075 operands[2] = force_reg (mode, operands[2]);
19076
19077 if (mode == QImode
19078 && ! register_operand (operands[3], VOIDmode))
19079 operands[3] = force_reg (mode, operands[3]);
19080
19081 emit_insn (compare_seq);
19082 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19083 gen_rtx_IF_THEN_ELSE (mode,
19084 compare_op, operands[2],
19085 operands[3])));
19086 return true;
19087 }
19088
19089 /* Swap, force into registers, or otherwise massage the two operands
19090 to an sse comparison with a mask result. Thus we differ a bit from
19091 ix86_prepare_fp_compare_args which expects to produce a flags result.
19092
19093 The DEST operand exists to help determine whether to commute commutative
19094 operators. The POP0/POP1 operands are updated in place. The new
19095 comparison code is returned, or UNKNOWN if not implementable. */
19096
19097 static enum rtx_code
19098 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19099 rtx *pop0, rtx *pop1)
19100 {
19101 rtx tmp;
19102
19103 switch (code)
19104 {
19105 case LTGT:
19106 case UNEQ:
19107 /* AVX supports all the needed comparisons. */
19108 if (TARGET_AVX)
19109 break;
19110 /* We have no LTGT as an operator. We could implement it with
19111 NE & ORDERED, but this requires an extra temporary. It's
19112 not clear that it's worth it. */
19113 return UNKNOWN;
19114
19115 case LT:
19116 case LE:
19117 case UNGT:
19118 case UNGE:
19119 /* These are supported directly. */
19120 break;
19121
19122 case EQ:
19123 case NE:
19124 case UNORDERED:
19125 case ORDERED:
19126 /* AVX has 3 operand comparisons, no need to swap anything. */
19127 if (TARGET_AVX)
19128 break;
19129 /* For commutative operators, try to canonicalize the destination
19130 operand to be first in the comparison - this helps reload to
19131 avoid extra moves. */
19132 if (!dest || !rtx_equal_p (dest, *pop1))
19133 break;
19134 /* FALLTHRU */
19135
19136 case GE:
19137 case GT:
19138 case UNLE:
19139 case UNLT:
19140 /* These are not supported directly before AVX, and furthermore
19141 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19142 comparison operands to transform into something that is
19143 supported. */
19144 tmp = *pop0;
19145 *pop0 = *pop1;
19146 *pop1 = tmp;
19147 code = swap_condition (code);
19148 break;
19149
19150 default:
19151 gcc_unreachable ();
19152 }
19153
19154 return code;
19155 }
19156
19157 /* Detect conditional moves that exactly match min/max operational
19158 semantics. Note that this is IEEE safe, as long as we don't
19159 interchange the operands.
19160
19161 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19162 and TRUE if the operation is successful and instructions are emitted. */
19163
19164 static bool
19165 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19166 rtx cmp_op1, rtx if_true, rtx if_false)
19167 {
19168 enum machine_mode mode;
19169 bool is_min;
19170 rtx tmp;
19171
19172 if (code == LT)
19173 ;
19174 else if (code == UNGE)
19175 {
19176 tmp = if_true;
19177 if_true = if_false;
19178 if_false = tmp;
19179 }
19180 else
19181 return false;
19182
19183 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19184 is_min = true;
19185 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19186 is_min = false;
19187 else
19188 return false;
19189
19190 mode = GET_MODE (dest);
19191
19192 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19193 but MODE may be a vector mode and thus not appropriate. */
19194 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19195 {
19196 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19197 rtvec v;
19198
19199 if_true = force_reg (mode, if_true);
19200 v = gen_rtvec (2, if_true, if_false);
19201 tmp = gen_rtx_UNSPEC (mode, v, u);
19202 }
19203 else
19204 {
19205 code = is_min ? SMIN : SMAX;
19206 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19207 }
19208
19209 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19210 return true;
19211 }
19212
19213 /* Expand an SSE vector comparison. Return the register with the result. */
19214
19215 static rtx
19216 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19217 rtx op_true, rtx op_false)
19218 {
19219 enum machine_mode mode = GET_MODE (dest);
19220 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19221 rtx x;
19222
19223 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19224 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19225 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19226
19227 if (optimize
19228 || reg_overlap_mentioned_p (dest, op_true)
19229 || reg_overlap_mentioned_p (dest, op_false))
19230 dest = gen_reg_rtx (mode);
19231
19232 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19233 if (cmp_mode != mode)
19234 {
19235 x = force_reg (cmp_mode, x);
19236 convert_move (dest, x, false);
19237 }
19238 else
19239 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19240
19241 return dest;
19242 }
19243
19244 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19245 operations. This is used for both scalar and vector conditional moves. */
19246
19247 static void
19248 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19249 {
19250 enum machine_mode mode = GET_MODE (dest);
19251 rtx t2, t3, x;
19252
19253 if (vector_all_ones_operand (op_true, mode)
19254 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19255 {
19256 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19257 }
19258 else if (op_false == CONST0_RTX (mode))
19259 {
19260 op_true = force_reg (mode, op_true);
19261 x = gen_rtx_AND (mode, cmp, op_true);
19262 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19263 }
19264 else if (op_true == CONST0_RTX (mode))
19265 {
19266 op_false = force_reg (mode, op_false);
19267 x = gen_rtx_NOT (mode, cmp);
19268 x = gen_rtx_AND (mode, x, op_false);
19269 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19270 }
19271 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19272 {
19273 op_false = force_reg (mode, op_false);
19274 x = gen_rtx_IOR (mode, cmp, op_false);
19275 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19276 }
19277 else if (TARGET_XOP)
19278 {
19279 op_true = force_reg (mode, op_true);
19280
19281 if (!nonimmediate_operand (op_false, mode))
19282 op_false = force_reg (mode, op_false);
19283
19284 emit_insn (gen_rtx_SET (mode, dest,
19285 gen_rtx_IF_THEN_ELSE (mode, cmp,
19286 op_true,
19287 op_false)));
19288 }
19289 else
19290 {
19291 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19292
19293 if (!nonimmediate_operand (op_true, mode))
19294 op_true = force_reg (mode, op_true);
19295
19296 op_false = force_reg (mode, op_false);
19297
19298 switch (mode)
19299 {
19300 case V4SFmode:
19301 if (TARGET_SSE4_1)
19302 gen = gen_sse4_1_blendvps;
19303 break;
19304 case V2DFmode:
19305 if (TARGET_SSE4_1)
19306 gen = gen_sse4_1_blendvpd;
19307 break;
19308 case V16QImode:
19309 case V8HImode:
19310 case V4SImode:
19311 case V2DImode:
19312 if (TARGET_SSE4_1)
19313 {
19314 gen = gen_sse4_1_pblendvb;
19315 dest = gen_lowpart (V16QImode, dest);
19316 op_false = gen_lowpart (V16QImode, op_false);
19317 op_true = gen_lowpart (V16QImode, op_true);
19318 cmp = gen_lowpart (V16QImode, cmp);
19319 }
19320 break;
19321 case V8SFmode:
19322 if (TARGET_AVX)
19323 gen = gen_avx_blendvps256;
19324 break;
19325 case V4DFmode:
19326 if (TARGET_AVX)
19327 gen = gen_avx_blendvpd256;
19328 break;
19329 case V32QImode:
19330 case V16HImode:
19331 case V8SImode:
19332 case V4DImode:
19333 if (TARGET_AVX2)
19334 {
19335 gen = gen_avx2_pblendvb;
19336 dest = gen_lowpart (V32QImode, dest);
19337 op_false = gen_lowpart (V32QImode, op_false);
19338 op_true = gen_lowpart (V32QImode, op_true);
19339 cmp = gen_lowpart (V32QImode, cmp);
19340 }
19341 break;
19342 default:
19343 break;
19344 }
19345
19346 if (gen != NULL)
19347 emit_insn (gen (dest, op_false, op_true, cmp));
19348 else
19349 {
19350 op_true = force_reg (mode, op_true);
19351
19352 t2 = gen_reg_rtx (mode);
19353 if (optimize)
19354 t3 = gen_reg_rtx (mode);
19355 else
19356 t3 = dest;
19357
19358 x = gen_rtx_AND (mode, op_true, cmp);
19359 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19360
19361 x = gen_rtx_NOT (mode, cmp);
19362 x = gen_rtx_AND (mode, x, op_false);
19363 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19364
19365 x = gen_rtx_IOR (mode, t3, t2);
19366 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19367 }
19368 }
19369 }
19370
19371 /* Expand a floating-point conditional move. Return true if successful. */
19372
19373 bool
19374 ix86_expand_fp_movcc (rtx operands[])
19375 {
19376 enum machine_mode mode = GET_MODE (operands[0]);
19377 enum rtx_code code = GET_CODE (operands[1]);
19378 rtx tmp, compare_op;
19379 rtx op0 = XEXP (operands[1], 0);
19380 rtx op1 = XEXP (operands[1], 1);
19381
19382 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19383 {
19384 enum machine_mode cmode;
19385
19386 /* Since we have no cmove for SSE registers, don't force bad register
19387 allocation just to gain access to it. Deny movcc when the
19388 comparison mode doesn't match the move mode. */
19389 cmode = GET_MODE (op0);
19390 if (cmode == VOIDmode)
19391 cmode = GET_MODE (op1);
19392 if (cmode != mode)
19393 return false;
19394
19395 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19396 if (code == UNKNOWN)
19397 return false;
19398
19399 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19400 operands[2], operands[3]))
19401 return true;
19402
19403 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19404 operands[2], operands[3]);
19405 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19406 return true;
19407 }
19408
19409 /* The floating point conditional move instructions don't directly
19410 support conditions resulting from a signed integer comparison. */
19411
19412 compare_op = ix86_expand_compare (code, op0, op1);
19413 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19414 {
19415 tmp = gen_reg_rtx (QImode);
19416 ix86_expand_setcc (tmp, code, op0, op1);
19417
19418 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19419 }
19420
19421 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19422 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19423 operands[2], operands[3])));
19424
19425 return true;
19426 }
19427
19428 /* Expand a floating-point vector conditional move; a vcond operation
19429 rather than a movcc operation. */
19430
19431 bool
19432 ix86_expand_fp_vcond (rtx operands[])
19433 {
19434 enum rtx_code code = GET_CODE (operands[3]);
19435 rtx cmp;
19436
19437 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19438 &operands[4], &operands[5]);
19439 if (code == UNKNOWN)
19440 {
19441 rtx temp;
19442 switch (GET_CODE (operands[3]))
19443 {
19444 case LTGT:
19445 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19446 operands[5], operands[0], operands[0]);
19447 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
19448 operands[5], operands[1], operands[2]);
19449 code = AND;
19450 break;
19451 case UNEQ:
19452 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
19453 operands[5], operands[0], operands[0]);
19454 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
19455 operands[5], operands[1], operands[2]);
19456 code = IOR;
19457 break;
19458 default:
19459 gcc_unreachable ();
19460 }
19461 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
19462 OPTAB_DIRECT);
19463 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19464 return true;
19465 }
19466
19467 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
19468 operands[5], operands[1], operands[2]))
19469 return true;
19470
19471 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
19472 operands[1], operands[2]);
19473 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
19474 return true;
19475 }
19476
19477 /* Expand a signed/unsigned integral vector conditional move. */
19478
19479 bool
19480 ix86_expand_int_vcond (rtx operands[])
19481 {
19482 enum machine_mode data_mode = GET_MODE (operands[0]);
19483 enum machine_mode mode = GET_MODE (operands[4]);
19484 enum rtx_code code = GET_CODE (operands[3]);
19485 bool negate = false;
19486 rtx x, cop0, cop1;
19487
19488 cop0 = operands[4];
19489 cop1 = operands[5];
19490
19491 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
19492 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
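/* Sketch: for 32-bit elements an arithmetic shift right by 31 replicates
   the sign bit, giving -1 for negative elements and 0 otherwise, while a
   logical shift right by 31 leaves just 0 or 1 per element.  */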
19493 if ((code == LT || code == GE)
19494 && data_mode == mode
19495 && cop1 == CONST0_RTX (mode)
19496 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
19497 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
19498 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
19499 && (GET_MODE_SIZE (data_mode) == 16
19500 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
19501 {
19502 rtx negop = operands[2 - (code == LT)];
19503 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
19504 if (negop == CONST1_RTX (data_mode))
19505 {
19506 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
19507 operands[0], 1, OPTAB_DIRECT);
19508 if (res != operands[0])
19509 emit_move_insn (operands[0], res);
19510 return true;
19511 }
19512 else if (GET_MODE_INNER (data_mode) != DImode
19513 && vector_all_ones_operand (negop, data_mode))
19514 {
19515 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
19516 operands[0], 0, OPTAB_DIRECT);
19517 if (res != operands[0])
19518 emit_move_insn (operands[0], res);
19519 return true;
19520 }
19521 }
19522
19523 if (!nonimmediate_operand (cop1, mode))
19524 cop1 = force_reg (mode, cop1);
19525 if (!general_operand (operands[1], data_mode))
19526 operands[1] = force_reg (data_mode, operands[1]);
19527 if (!general_operand (operands[2], data_mode))
19528 operands[2] = force_reg (data_mode, operands[2]);
19529
19530 /* XOP supports all of the comparisons on all 128-bit vector int types. */
19531 if (TARGET_XOP
19532 && (mode == V16QImode || mode == V8HImode
19533 || mode == V4SImode || mode == V2DImode))
19534 ;
19535 else
19536 {
19537 /* Canonicalize the comparison to EQ, GT, GTU. */
19538 switch (code)
19539 {
19540 case EQ:
19541 case GT:
19542 case GTU:
19543 break;
19544
19545 case NE:
19546 case LE:
19547 case LEU:
19548 code = reverse_condition (code);
19549 negate = true;
19550 break;
19551
19552 case GE:
19553 case GEU:
19554 code = reverse_condition (code);
19555 negate = true;
19556 /* FALLTHRU */
19557
19558 case LT:
19559 case LTU:
19560 code = swap_condition (code);
19561 x = cop0, cop0 = cop1, cop1 = x;
19562 break;
19563
19564 default:
19565 gcc_unreachable ();
19566 }
19567
19568 /* Only SSE4.1/SSE4.2 supports V2DImode. */
19569 if (mode == V2DImode)
19570 {
19571 switch (code)
19572 {
19573 case EQ:
19574 /* SSE4.1 supports EQ. */
19575 if (!TARGET_SSE4_1)
19576 return false;
19577 break;
19578
19579 case GT:
19580 case GTU:
19581 /* SSE4.2 supports GT/GTU. */
19582 if (!TARGET_SSE4_2)
19583 return false;
19584 break;
19585
19586 default:
19587 gcc_unreachable ();
19588 }
19589 }
19590
19591 /* Unsigned parallel compare is not supported by the hardware.
19592 Play some tricks to turn this into a signed comparison
19593 against 0. */
19594 if (code == GTU)
19595 {
19596 cop0 = force_reg (mode, cop0);
19597
19598 switch (mode)
19599 {
19600 case V8SImode:
19601 case V4DImode:
19602 case V4SImode:
19603 case V2DImode:
19604 {
19605 rtx t1, t2, mask;
19606 rtx (*gen_sub3) (rtx, rtx, rtx);
19607
19608 switch (mode)
19609 {
19610 case V8SImode: gen_sub3 = gen_subv8si3; break;
19611 case V4DImode: gen_sub3 = gen_subv4di3; break;
19612 case V4SImode: gen_sub3 = gen_subv4si3; break;
19613 case V2DImode: gen_sub3 = gen_subv2di3; break;
19614 default:
19615 gcc_unreachable ();
19616 }
19617 /* Subtract (-(INT MAX) - 1) from both operands to make
19618 them signed. */
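/* Sketch: for unsigned x, y and bias B = 0x80..0 (the sign-bit mask),
   x >u y  <==>  (x - B) >s (y - B), so flipping the sign bit turns the
   unsupported unsigned compare into the signed GT the hardware has.  */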
19619 mask = ix86_build_signbit_mask (mode, true, false);
19620 t1 = gen_reg_rtx (mode);
19621 emit_insn (gen_sub3 (t1, cop0, mask));
19622
19623 t2 = gen_reg_rtx (mode);
19624 emit_insn (gen_sub3 (t2, cop1, mask));
19625
19626 cop0 = t1;
19627 cop1 = t2;
19628 code = GT;
19629 }
19630 break;
19631
19632 case V32QImode:
19633 case V16HImode:
19634 case V16QImode:
19635 case V8HImode:
19636 /* Perform a parallel unsigned saturating subtraction. */
19637 x = gen_reg_rtx (mode);
19638 emit_insn (gen_rtx_SET (VOIDmode, x,
19639 gen_rtx_US_MINUS (mode, cop0, cop1)));
19640
19641 cop0 = x;
19642 cop1 = CONST0_RTX (mode);
19643 code = EQ;
19644 negate = !negate;
19645 break;
19646
19647 default:
19648 gcc_unreachable ();
19649 }
19650 }
19651 }
19652
19653 /* Allow the comparison to be done in one mode, but the movcc to
19654 happen in another mode. */
19655 if (data_mode == mode)
19656 {
19657 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
19658 operands[1+negate], operands[2-negate]);
19659 }
19660 else
19661 {
19662 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
19663 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
19664 code, cop0, cop1,
19665 operands[1+negate], operands[2-negate]);
19666 x = gen_lowpart (data_mode, x);
19667 }
19668
19669 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
19670 operands[2-negate]);
19671 return true;
19672 }
19673
19674 /* Expand a variable vector permutation. */
19675
19676 void
19677 ix86_expand_vec_perm (rtx operands[])
19678 {
19679 rtx target = operands[0];
19680 rtx op0 = operands[1];
19681 rtx op1 = operands[2];
19682 rtx mask = operands[3];
19683 rtx t1, t2, t3, t4, vt, vt2, vec[32];
19684 enum machine_mode mode = GET_MODE (op0);
19685 enum machine_mode maskmode = GET_MODE (mask);
19686 int w, e, i;
19687 bool one_operand_shuffle = rtx_equal_p (op0, op1);
19688
19689 /* Number of elements in the vector. */
19690 w = GET_MODE_NUNITS (mode);
19691 e = GET_MODE_UNIT_SIZE (mode);
19692 gcc_assert (w <= 32);
19693
19694 if (TARGET_AVX2)
19695 {
19696 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
19697 {
19698 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
19699 a constant shuffle operand. With a tiny bit of effort we can
19700 use VPERMD instead. A re-interpretation stall for V4DFmode is
19701 unfortunate but there's no avoiding it.
19702 Similarly, for V16HImode we don't have instructions for variable
19703 shuffling, while for V32QImode we can, after preparing suitable
19704 masks, use vpshufb; vpshufb; vpermq; vpor. */
19705
19706 if (mode == V16HImode)
19707 {
19708 maskmode = mode = V32QImode;
19709 w = 32;
19710 e = 1;
19711 }
19712 else
19713 {
19714 maskmode = mode = V8SImode;
19715 w = 8;
19716 e = 4;
19717 }
19718 t1 = gen_reg_rtx (maskmode);
19719
19720 /* Replicate the low bits of the V4DImode mask into V8SImode:
19721 mask = { A B C D }
19722 t1 = { A A B B C C D D }. */
19723 for (i = 0; i < w / 2; ++i)
19724 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
19725 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19726 vt = force_reg (maskmode, vt);
19727 mask = gen_lowpart (maskmode, mask);
19728 if (maskmode == V8SImode)
19729 emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
19730 else
19731 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
19732
19733 /* Multiply the shuffle indices by two. */
19734 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
19735 OPTAB_DIRECT);
19736
19737 /* Add one to the odd shuffle indices:
19738 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
19739 for (i = 0; i < w / 2; ++i)
19740 {
19741 vec[i * 2] = const0_rtx;
19742 vec[i * 2 + 1] = const1_rtx;
19743 }
19744 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19745 vt = force_const_mem (maskmode, vt);
19746 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
19747 OPTAB_DIRECT);
19748
19749 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
19750 operands[3] = mask = t1;
19751 target = gen_lowpart (mode, target);
19752 op0 = gen_lowpart (mode, op0);
19753 op1 = gen_lowpart (mode, op1);
19754 }
19755
19756 switch (mode)
19757 {
19758 case V8SImode:
19759 /* The VPERMD and VPERMPS instructions already properly ignore
19760 the high bits of the shuffle elements. No need for us to
19761 perform an AND ourselves. */
19762 if (one_operand_shuffle)
19763 emit_insn (gen_avx2_permvarv8si (target, mask, op0));
19764 else
19765 {
19766 t1 = gen_reg_rtx (V8SImode);
19767 t2 = gen_reg_rtx (V8SImode);
19768 emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
19769 emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
19770 goto merge_two;
19771 }
19772 return;
19773
19774 case V8SFmode:
19775 mask = gen_lowpart (V8SFmode, mask);
19776 if (one_operand_shuffle)
19777 emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
19778 else
19779 {
19780 t1 = gen_reg_rtx (V8SFmode);
19781 t2 = gen_reg_rtx (V8SFmode);
19782 emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
19783 emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
19784 goto merge_two;
19785 }
19786 return;
19787
19788 case V4SImode:
19789 /* By combining the two 128-bit input vectors into one 256-bit
19790 input vector, we can use VPERMD and VPERMPS for the full
19791 two-operand shuffle. */
19792 t1 = gen_reg_rtx (V8SImode);
19793 t2 = gen_reg_rtx (V8SImode);
19794 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
19795 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
19796 emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
19797 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
19798 return;
19799
19800 case V4SFmode:
19801 t1 = gen_reg_rtx (V8SFmode);
19802 t2 = gen_reg_rtx (V8SFmode);
19803 mask = gen_lowpart (V4SFmode, mask);
19804 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
19805 emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
19806 emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
19807 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
19808 return;
19809
19810 case V32QImode:
19811 t1 = gen_reg_rtx (V32QImode);
19812 t2 = gen_reg_rtx (V32QImode);
19813 t3 = gen_reg_rtx (V32QImode);
19814 vt2 = GEN_INT (128);
19815 for (i = 0; i < 32; i++)
19816 vec[i] = vt2;
19817 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19818 vt = force_reg (V32QImode, vt);
19819 for (i = 0; i < 32; i++)
19820 vec[i] = i < 16 ? vt2 : const0_rtx;
19821 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
19822 vt2 = force_reg (V32QImode, vt2);
19823 /* From the mask, create two adjusted masks, which contain the same
19824 bits as mask in the low 7 bits of each vector element.
19825 The first mask will have the most significant bit clear
19826 if it requests an element from the same 128-bit lane
19827 and the MSB set if it requests an element from the other 128-bit lane.
19828 The second mask will have the opposite values of the MSB,
19829 and additionally will have its 128-bit lanes swapped.
19830 E.g. a { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
19831 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
19832 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
19833 stands for the other 12 bytes. */
19834 /* The bit that says whether an element is from the same lane or the
19835 other lane is bit 4, so shift it up by 3 to the MSB position. */
19836 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
19837 gen_lowpart (V4DImode, mask),
19838 GEN_INT (3)));
19839 /* Clear MSB bits from the mask just in case it had them set. */
19840 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
19841 /* After this t1 will have the MSB set for elements from the other lane. */
19842 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
19843 /* Clear bits other than MSB. */
19844 emit_insn (gen_andv32qi3 (t1, t1, vt));
19845 /* Or in the lower bits from mask into t3. */
19846 emit_insn (gen_iorv32qi3 (t3, t1, t2));
19847 /* And invert MSB bits in t1, so MSB is set for elements from the same
19848 lane. */
19849 emit_insn (gen_xorv32qi3 (t1, t1, vt));
19850 /* Swap 128-bit lanes in t3. */
19851 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19852 gen_lowpart (V4DImode, t3),
19853 const2_rtx, GEN_INT (3),
19854 const0_rtx, const1_rtx));
19855 /* And or in the lower bits from mask into t1. */
19856 emit_insn (gen_iorv32qi3 (t1, t1, t2));
19857 if (one_operand_shuffle)
19858 {
19859 /* Each of these shuffles will put 0s in places where an
19860 element from the other 128-bit lane is needed; otherwise
19861 it will shuffle in the requested value. */
19862 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
19863 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
19864 /* For t3 the 128-bit lanes are swapped again. */
19865 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19866 gen_lowpart (V4DImode, t3),
19867 const2_rtx, GEN_INT (3),
19868 const0_rtx, const1_rtx));
19869 /* And oring both together leads to the result. */
19870 emit_insn (gen_iorv32qi3 (target, t1, t3));
19871 return;
19872 }
19873
19874 t4 = gen_reg_rtx (V32QImode);
19875 /* Similar to the one_operand_shuffle code above,
19876 just repeated twice, once for each operand. The merge_two:
19877 code will merge the two results together. */
19878 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
19879 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
19880 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
19881 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
19882 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
19883 gen_lowpart (V4DImode, t4),
19884 const2_rtx, GEN_INT (3),
19885 const0_rtx, const1_rtx));
19886 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
19887 gen_lowpart (V4DImode, t3),
19888 const2_rtx, GEN_INT (3),
19889 const0_rtx, const1_rtx));
19890 emit_insn (gen_iorv32qi3 (t4, t2, t4));
19891 emit_insn (gen_iorv32qi3 (t3, t1, t3));
19892 t1 = t4;
19893 t2 = t3;
19894 goto merge_two;
19895
19896 default:
19897 gcc_assert (GET_MODE_SIZE (mode) <= 16);
19898 break;
19899 }
19900 }
19901
19902 if (TARGET_XOP)
19903 {
19904 /* The XOP VPPERM insn supports three inputs. By ignoring the
19905 one_operand_shuffle special case, we avoid creating another
19906 set of constant vectors in memory. */
19907 one_operand_shuffle = false;
19908
19909 /* mask = mask & {2*w-1, ...} */
19910 vt = GEN_INT (2*w - 1);
19911 }
19912 else
19913 {
19914 /* mask = mask & {w-1, ...} */
19915 vt = GEN_INT (w - 1);
19916 }
19917
19918 for (i = 0; i < w; i++)
19919 vec[i] = vt;
19920 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
19921 mask = expand_simple_binop (maskmode, AND, mask, vt,
19922 NULL_RTX, 0, OPTAB_DIRECT);
19923
19924 /* For non-QImode operations, convert the word permutation control
19925 into a byte permutation control. */
19926 if (mode != V16QImode)
19927 {
19928 mask = expand_simple_binop (maskmode, ASHIFT, mask,
19929 GEN_INT (exact_log2 (e)),
19930 NULL_RTX, 0, OPTAB_DIRECT);
19931
19932 /* Convert mask to vector of chars. */
19933 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
19934
19935 /* Replicate each of the input bytes into byte positions:
19936 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
19937 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
19938 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
19939 for (i = 0; i < 16; ++i)
19940 vec[i] = GEN_INT (i/e * e);
19941 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
19942 vt = force_const_mem (V16QImode, vt);
19943 if (TARGET_XOP)
19944 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
19945 else
19946 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
19947
19948 /* Convert it into the byte positions by doing
19949 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
19950 for (i = 0; i < 16; ++i)
19951 vec[i] = GEN_INT (i % e);
19952 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
19953 vt = force_const_mem (V16QImode, vt);
19954 emit_insn (gen_addv16qi3 (mask, mask, vt));
19955 }
19956
19957 /* The actual shuffle operations all operate on V16QImode. */
19958 op0 = gen_lowpart (V16QImode, op0);
19959 op1 = gen_lowpart (V16QImode, op1);
19960 target = gen_lowpart (V16QImode, target);
19961
19962 if (TARGET_XOP)
19963 {
19964 emit_insn (gen_xop_pperm (target, op0, op1, mask));
19965 }
19966 else if (one_operand_shuffle)
19967 {
19968 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
19969 }
19970 else
19971 {
19972 rtx xops[6];
19973 bool ok;
19974
19975 /* Shuffle the two input vectors independently. */
19976 t1 = gen_reg_rtx (V16QImode);
19977 t2 = gen_reg_rtx (V16QImode);
19978 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
19979 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
19980
19981 merge_two:
19982 /* Then merge them together. The key is whether any given control
19983 element contained a bit set that indicates the second word. */
19984 mask = operands[3];
19985 vt = GEN_INT (w);
19986 if (maskmode == V2DImode && !TARGET_SSE4_1)
19987 {
19988 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
19989 more shuffle to convert the V2DI input mask into a V4SI
19990 input mask. At that point the masking that expand_int_vcond
19991 performs will work as desired. */
19992 rtx t3 = gen_reg_rtx (V4SImode);
19993 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
19994 const0_rtx, const0_rtx,
19995 const2_rtx, const2_rtx));
19996 mask = t3;
19997 maskmode = V4SImode;
19998 e = w = 4;
19999 }
20000
20001 for (i = 0; i < w; i++)
20002 vec[i] = vt;
20003 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20004 vt = force_reg (maskmode, vt);
20005 mask = expand_simple_binop (maskmode, AND, mask, vt,
20006 NULL_RTX, 0, OPTAB_DIRECT);
20007
20008 xops[0] = gen_lowpart (mode, operands[0]);
20009 xops[1] = gen_lowpart (mode, t2);
20010 xops[2] = gen_lowpart (mode, t1);
20011 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20012 xops[4] = mask;
20013 xops[5] = vt;
20014 ok = ix86_expand_int_vcond (xops);
20015 gcc_assert (ok);
20016 }
20017 }
20018
20019 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
20020 true if we should do zero extension, else sign extension. HIGH_P is
20021 true if we want the N/2 high elements, else the low elements. */
20022
20023 void
20024 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
20025 {
20026 enum machine_mode imode = GET_MODE (operands[1]);
20027 rtx tmp, dest;
20028
20029 if (TARGET_SSE4_1)
20030 {
20031 rtx (*unpack)(rtx, rtx);
20032 rtx (*extract)(rtx, rtx) = NULL;
20033 enum machine_mode halfmode = BLKmode;
20034
20035 switch (imode)
20036 {
20037 case V32QImode:
20038 if (unsigned_p)
20039 unpack = gen_avx2_zero_extendv16qiv16hi2;
20040 else
20041 unpack = gen_avx2_sign_extendv16qiv16hi2;
20042 halfmode = V16QImode;
20043 extract
20044 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20045 break;
20046 case V16HImode:
20047 if (unsigned_p)
20048 unpack = gen_avx2_zero_extendv8hiv8si2;
20049 else
20050 unpack = gen_avx2_sign_extendv8hiv8si2;
20051 halfmode = V8HImode;
20052 extract
20053 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20054 break;
20055 case V8SImode:
20056 if (unsigned_p)
20057 unpack = gen_avx2_zero_extendv4siv4di2;
20058 else
20059 unpack = gen_avx2_sign_extendv4siv4di2;
20060 halfmode = V4SImode;
20061 extract
20062 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20063 break;
20064 case V16QImode:
20065 if (unsigned_p)
20066 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20067 else
20068 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20069 break;
20070 case V8HImode:
20071 if (unsigned_p)
20072 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20073 else
20074 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20075 break;
20076 case V4SImode:
20077 if (unsigned_p)
20078 unpack = gen_sse4_1_zero_extendv2siv2di2;
20079 else
20080 unpack = gen_sse4_1_sign_extendv2siv2di2;
20081 break;
20082 default:
20083 gcc_unreachable ();
20084 }
20085
20086 if (GET_MODE_SIZE (imode) == 32)
20087 {
20088 tmp = gen_reg_rtx (halfmode);
20089 emit_insn (extract (tmp, operands[1]));
20090 }
20091 else if (high_p)
20092 {
20093 /* Shift higher 8 bytes to lower 8 bytes. */
20094 tmp = gen_reg_rtx (imode);
20095 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20096 gen_lowpart (V1TImode, operands[1]),
20097 GEN_INT (64)));
20098 }
20099 else
20100 tmp = operands[1];
20101
20102 emit_insn (unpack (operands[0], tmp));
20103 }
20104 else
20105 {
20106 rtx (*unpack)(rtx, rtx, rtx);
20107
20108 switch (imode)
20109 {
20110 case V16QImode:
20111 if (high_p)
20112 unpack = gen_vec_interleave_highv16qi;
20113 else
20114 unpack = gen_vec_interleave_lowv16qi;
20115 break;
20116 case V8HImode:
20117 if (high_p)
20118 unpack = gen_vec_interleave_highv8hi;
20119 else
20120 unpack = gen_vec_interleave_lowv8hi;
20121 break;
20122 case V4SImode:
20123 if (high_p)
20124 unpack = gen_vec_interleave_highv4si;
20125 else
20126 unpack = gen_vec_interleave_lowv4si;
20127 break;
20128 default:
20129 gcc_unreachable ();
20130 }
20131
20132 dest = gen_lowpart (imode, operands[0]);
20133
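/* Interleave the input with either zero (for zero extension) or with its
sign mask, computed as 0 > OP1, so that the upper half of each widened
element receives the proper extension. */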
20134 if (unsigned_p)
20135 tmp = force_reg (imode, CONST0_RTX (imode));
20136 else
20137 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20138 operands[1], pc_rtx, pc_rtx);
20139
20140 emit_insn (unpack (dest, operands[1], tmp));
20141 }
20142 }
20143
20144 /* Expand conditional increment or decrement using adc/sbb instructions.
20145 The default case using setcc followed by a conditional move can be
20146 done by generic code. */
20147 bool
20148 ix86_expand_int_addcc (rtx operands[])
20149 {
20150 enum rtx_code code = GET_CODE (operands[1]);
20151 rtx flags;
20152 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20153 rtx compare_op;
20154 rtx val = const0_rtx;
20155 bool fpcmp = false;
20156 enum machine_mode mode;
20157 rtx op0 = XEXP (operands[1], 0);
20158 rtx op1 = XEXP (operands[1], 1);
20159
20160 if (operands[3] != const1_rtx
20161 && operands[3] != constm1_rtx)
20162 return false;
20163 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20164 return false;
20165 code = GET_CODE (compare_op);
20166
20167 flags = XEXP (compare_op, 0);
20168
20169 if (GET_MODE (flags) == CCFPmode
20170 || GET_MODE (flags) == CCFPUmode)
20171 {
20172 fpcmp = true;
20173 code = ix86_fp_compare_code_to_integer (code);
20174 }
20175
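/* Only the LTU (carry set) condition maps directly onto adc/sbb. For any
other condition, reverse the comparison so the carry flag is set when the
original condition is false, and compensate by using -1 as the constant
operand. */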
20176 if (code != LTU)
20177 {
20178 val = constm1_rtx;
20179 if (fpcmp)
20180 PUT_CODE (compare_op,
20181 reverse_condition_maybe_unordered
20182 (GET_CODE (compare_op)));
20183 else
20184 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20185 }
20186
20187 mode = GET_MODE (operands[0]);
20188
20189 /* Construct either adc or sbb insn. */
20190 if ((code == LTU) == (operands[3] == constm1_rtx))
20191 {
20192 switch (mode)
20193 {
20194 case QImode:
20195 insn = gen_subqi3_carry;
20196 break;
20197 case HImode:
20198 insn = gen_subhi3_carry;
20199 break;
20200 case SImode:
20201 insn = gen_subsi3_carry;
20202 break;
20203 case DImode:
20204 insn = gen_subdi3_carry;
20205 break;
20206 default:
20207 gcc_unreachable ();
20208 }
20209 }
20210 else
20211 {
20212 switch (mode)
20213 {
20214 case QImode:
20215 insn = gen_addqi3_carry;
20216 break;
20217 case HImode:
20218 insn = gen_addhi3_carry;
20219 break;
20220 case SImode:
20221 insn = gen_addsi3_carry;
20222 break;
20223 case DImode:
20224 insn = gen_adddi3_carry;
20225 break;
20226 default:
20227 gcc_unreachable ();
20228 }
20229 }
20230 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20231
20232 return true;
20233 }
20234
20235
20236 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
20237 but works for floating point parameters and non-offsettable memories.
20238 For pushes, it returns just stack offsets; the values will be saved
20239 in the right order. Maximally four parts are generated. */
20240
20241 static int
20242 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20243 {
20244 int size;
20245
20246 if (!TARGET_64BIT)
20247 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20248 else
20249 size = (GET_MODE_SIZE (mode) + 4) / 8;
20250
20251 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20252 gcc_assert (size >= 2 && size <= 4);
20253
20254 /* Optimize constant pool references to immediates. This is used by fp
20255 moves, which force all constants to memory to allow combining. */
20256 if (MEM_P (operand) && MEM_READONLY_P (operand))
20257 {
20258 rtx tmp = maybe_get_pool_constant (operand);
20259 if (tmp)
20260 operand = tmp;
20261 }
20262
20263 if (MEM_P (operand) && !offsettable_memref_p (operand))
20264 {
20265 /* The only non-offsettable memories we handle are pushes. */
20266 int ok = push_operand (operand, VOIDmode);
20267
20268 gcc_assert (ok);
20269
20270 operand = copy_rtx (operand);
20271 PUT_MODE (operand, Pmode);
20272 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20273 return size;
20274 }
20275
20276 if (GET_CODE (operand) == CONST_VECTOR)
20277 {
20278 enum machine_mode imode = int_mode_for_mode (mode);
20279 /* Caution: if we looked through a constant pool memory above,
20280 the operand may actually have a different mode now. That's
20281 ok, since we want to pun this all the way back to an integer. */
20282 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20283 gcc_assert (operand != NULL);
20284 mode = imode;
20285 }
20286
20287 if (!TARGET_64BIT)
20288 {
20289 if (mode == DImode)
20290 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20291 else
20292 {
20293 int i;
20294
20295 if (REG_P (operand))
20296 {
20297 gcc_assert (reload_completed);
20298 for (i = 0; i < size; i++)
20299 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20300 }
20301 else if (offsettable_memref_p (operand))
20302 {
20303 operand = adjust_address (operand, SImode, 0);
20304 parts[0] = operand;
20305 for (i = 1; i < size; i++)
20306 parts[i] = adjust_address (operand, SImode, 4 * i);
20307 }
20308 else if (GET_CODE (operand) == CONST_DOUBLE)
20309 {
20310 REAL_VALUE_TYPE r;
20311 long l[4];
20312
20313 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20314 switch (mode)
20315 {
20316 case TFmode:
20317 real_to_target (l, &r, mode);
20318 parts[3] = gen_int_mode (l[3], SImode);
20319 parts[2] = gen_int_mode (l[2], SImode);
20320 break;
20321 case XFmode:
20322 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
20323 parts[2] = gen_int_mode (l[2], SImode);
20324 break;
20325 case DFmode:
20326 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20327 break;
20328 default:
20329 gcc_unreachable ();
20330 }
20331 parts[1] = gen_int_mode (l[1], SImode);
20332 parts[0] = gen_int_mode (l[0], SImode);
20333 }
20334 else
20335 gcc_unreachable ();
20336 }
20337 }
20338 else
20339 {
20340 if (mode == TImode)
20341 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20342 if (mode == XFmode || mode == TFmode)
20343 {
20344 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20345 if (REG_P (operand))
20346 {
20347 gcc_assert (reload_completed);
20348 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20349 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20350 }
20351 else if (offsettable_memref_p (operand))
20352 {
20353 operand = adjust_address (operand, DImode, 0);
20354 parts[0] = operand;
20355 parts[1] = adjust_address (operand, upper_mode, 8);
20356 }
20357 else if (GET_CODE (operand) == CONST_DOUBLE)
20358 {
20359 REAL_VALUE_TYPE r;
20360 long l[4];
20361
20362 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20363 real_to_target (l, &r, mode);
20364
20365 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20366 if (HOST_BITS_PER_WIDE_INT >= 64)
20367 parts[0]
20368 = gen_int_mode
20369 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20370 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20371 DImode);
20372 else
20373 parts[0] = immed_double_const (l[0], l[1], DImode);
20374
20375 if (upper_mode == SImode)
20376 parts[1] = gen_int_mode (l[2], SImode);
20377 else if (HOST_BITS_PER_WIDE_INT >= 64)
20378 parts[1]
20379 = gen_int_mode
20380 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20381 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20382 DImode);
20383 else
20384 parts[1] = immed_double_const (l[2], l[3], DImode);
20385 }
20386 else
20387 gcc_unreachable ();
20388 }
20389 }
20390
20391 return size;
20392 }
20393
20394 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20395 All required insns are emitted here. The destination parts are placed
20396 into slots 2-5 of OPERANDS and the source parts into slots 6-9, and the
20397 moves are emitted in an order that does not clobber still-needed sources. */
20398
20399 void
20400 ix86_split_long_move (rtx operands[])
20401 {
20402 rtx part[2][4];
20403 int nparts, i, j;
20404 int push = 0;
20405 int collisions = 0;
20406 enum machine_mode mode = GET_MODE (operands[0]);
20407 bool collisionparts[4];
20408
20409 /* The DFmode expanders may ask us to move a double.
20410 For a 64bit target this is a single move. By hiding this fact
20411 here we simplify the i386.md splitters. */
20412 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20413 {
20414 /* Optimize constant pool references to immediates. This is used by
20415 fp moves, which force all constants to memory to allow combining. */
20416
20417 if (MEM_P (operands[1])
20418 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20419 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20420 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20421 if (push_operand (operands[0], VOIDmode))
20422 {
20423 operands[0] = copy_rtx (operands[0]);
20424 PUT_MODE (operands[0], Pmode);
20425 }
20426 else
20427 operands[0] = gen_lowpart (DImode, operands[0]);
20428 operands[1] = gen_lowpart (DImode, operands[1]);
20429 emit_move_insn (operands[0], operands[1]);
20430 return;
20431 }
20432
20433 /* The only non-offsettable memory we handle is push. */
20434 if (push_operand (operands[0], VOIDmode))
20435 push = 1;
20436 else
20437 gcc_assert (!MEM_P (operands[0])
20438 || offsettable_memref_p (operands[0]));
20439
20440 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20441 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20442
20443 /* When emitting push, take care for source operands on the stack. */
20444 if (push && MEM_P (operands[1])
20445 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20446 {
20447 rtx src_base = XEXP (part[1][nparts - 1], 0);
20448
20449 /* Compensate for the stack decrement by 4. */
20450 if (!TARGET_64BIT && nparts == 3
20451 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
20452 src_base = plus_constant (src_base, 4);
20453
20454 /* src_base refers to the stack pointer and is
20455 automatically decreased by emitted push. */
20456 for (i = 0; i < nparts; i++)
20457 part[1][i] = change_address (part[1][i],
20458 GET_MODE (part[1][i]), src_base);
20459 }
20460
20461 /* We need to do copy in the right order in case an address register
20462 of the source overlaps the destination. */
20463 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
20464 {
20465 rtx tmp;
20466
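/* Record which destination parts overlap the source address, so we know
whether simply reordering the part moves is enough. */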
20467 for (i = 0; i < nparts; i++)
20468 {
20469 collisionparts[i]
20470 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
20471 if (collisionparts[i])
20472 collisions++;
20473 }
20474
20475 /* Collision in the middle part can be handled by reordering. */
20476 if (collisions == 1 && nparts == 3 && collisionparts [1])
20477 {
20478 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20479 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20480 }
20481 else if (collisions == 1
20482 && nparts == 4
20483 && (collisionparts [1] || collisionparts [2]))
20484 {
20485 if (collisionparts [1])
20486 {
20487 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
20488 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
20489 }
20490 else
20491 {
20492 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
20493 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
20494 }
20495 }
20496
20497 /* If there are more collisions, we can't handle it by reordering.
20498 Do an lea to the last part and use only one colliding move. */
20499 else if (collisions > 1)
20500 {
20501 rtx base;
20502
20503 collisions = 1;
20504
20505 base = part[0][nparts - 1];
20506
20507 /* Handle the case when the last part isn't valid for lea.
20508 Happens in 64-bit mode storing the 12-byte XFmode. */
20509 if (GET_MODE (base) != Pmode)
20510 base = gen_rtx_REG (Pmode, REGNO (base));
20511
20512 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
20513 part[1][0] = replace_equiv_address (part[1][0], base);
20514 for (i = 1; i < nparts; i++)
20515 {
20516 tmp = plus_constant (base, UNITS_PER_WORD * i);
20517 part[1][i] = replace_equiv_address (part[1][i], tmp);
20518 }
20519 }
20520 }
20521
20522 if (push)
20523 {
20524 if (!TARGET_64BIT)
20525 {
20526 if (nparts == 3)
20527 {
20528 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
20529 emit_insn (gen_addsi3 (stack_pointer_rtx,
20530 stack_pointer_rtx, GEN_INT (-4)));
20531 emit_move_insn (part[0][2], part[1][2]);
20532 }
20533 else if (nparts == 4)
20534 {
20535 emit_move_insn (part[0][3], part[1][3]);
20536 emit_move_insn (part[0][2], part[1][2]);
20537 }
20538 }
20539 else
20540 {
20541 /* In 64bit mode we don't have a 32bit push available. In case this is a
20542 register, it is OK - we will just use the larger counterpart. We also
20543 retype the memory - this comes from an attempt to avoid the REX prefix
20544 on moving the second half of a TFmode value. */
20545 if (GET_MODE (part[1][1]) == SImode)
20546 {
20547 switch (GET_CODE (part[1][1]))
20548 {
20549 case MEM:
20550 part[1][1] = adjust_address (part[1][1], DImode, 0);
20551 break;
20552
20553 case REG:
20554 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
20555 break;
20556
20557 default:
20558 gcc_unreachable ();
20559 }
20560
20561 if (GET_MODE (part[1][0]) == SImode)
20562 part[1][0] = part[1][1];
20563 }
20564 }
20565 emit_move_insn (part[0][1], part[1][1]);
20566 emit_move_insn (part[0][0], part[1][0]);
20567 return;
20568 }
20569
20570 /* Choose correct order to not overwrite the source before it is copied. */
20571 if ((REG_P (part[0][0])
20572 && REG_P (part[1][1])
20573 && (REGNO (part[0][0]) == REGNO (part[1][1])
20574 || (nparts == 3
20575 && REGNO (part[0][0]) == REGNO (part[1][2]))
20576 || (nparts == 4
20577 && REGNO (part[0][0]) == REGNO (part[1][3]))))
20578 || (collisions > 0
20579 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
20580 {
20581 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
20582 {
20583 operands[2 + i] = part[0][j];
20584 operands[6 + i] = part[1][j];
20585 }
20586 }
20587 else
20588 {
20589 for (i = 0; i < nparts; i++)
20590 {
20591 operands[2 + i] = part[0][i];
20592 operands[6 + i] = part[1][i];
20593 }
20594 }
20595
20596 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
20597 if (optimize_insn_for_size_p ())
20598 {
20599 for (j = 0; j < nparts - 1; j++)
20600 if (CONST_INT_P (operands[6 + j])
20601 && operands[6 + j] != const0_rtx
20602 && REG_P (operands[2 + j]))
20603 for (i = j; i < nparts - 1; i++)
20604 if (CONST_INT_P (operands[7 + i])
20605 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
20606 operands[7 + i] = operands[2 + j];
20607 }
20608
20609 for (i = 0; i < nparts; i++)
20610 emit_move_insn (operands[2 + i], operands[6 + i]);
20611
20612 return;
20613 }
20614
20615 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
20616 left shift by a constant, either using a single shift or
20617 a sequence of add instructions. */
20618
20619 static void
20620 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
20621 {
20622 rtx (*insn)(rtx, rtx, rtx);
20623
20624 if (count == 1
20625 || (count * ix86_cost->add <= ix86_cost->shift_const
20626 && !optimize_insn_for_size_p ()))
20627 {
20628 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
20629 while (count-- > 0)
20630 emit_insn (insn (operand, operand, operand));
20631 }
20632 else
20633 {
20634 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20635 emit_insn (insn (operand, operand, GEN_INT (count)));
20636 }
20637 }
20638
20639 void
20640 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
20641 {
20642 rtx (*gen_ashl3)(rtx, rtx, rtx);
20643 rtx (*gen_shld)(rtx, rtx, rtx);
20644 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20645
20646 rtx low[2], high[2];
20647 int count;
20648
20649 if (CONST_INT_P (operands[2]))
20650 {
20651 split_double_mode (mode, operands, 2, low, high);
20652 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20653
20654 if (count >= half_width)
20655 {
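/* A shift by half_width or more moves the low input word into the high
result word and clears the low result; any remaining count is applied to
the high word only. */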
20656 emit_move_insn (high[0], low[1]);
20657 emit_move_insn (low[0], const0_rtx);
20658
20659 if (count > half_width)
20660 ix86_expand_ashl_const (high[0], count - half_width, mode);
20661 }
20662 else
20663 {
20664 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20665
20666 if (!rtx_equal_p (operands[0], operands[1]))
20667 emit_move_insn (operands[0], operands[1]);
20668
20669 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
20670 ix86_expand_ashl_const (low[0], count, mode);
20671 }
20672 return;
20673 }
20674
20675 split_double_mode (mode, operands, 1, low, high);
20676
20677 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
20678
20679 if (operands[1] == const1_rtx)
20680 {
20681 /* Assuming we've chosen QImode-capable registers, 1 << N
20682 can be done with two 32/64-bit shifts, no branches, no cmoves. */
20683 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
20684 {
20685 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
20686
20687 ix86_expand_clear (low[0]);
20688 ix86_expand_clear (high[0]);
20689 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
20690
20691 d = gen_lowpart (QImode, low[0]);
20692 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20693 s = gen_rtx_EQ (QImode, flags, const0_rtx);
20694 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20695
20696 d = gen_lowpart (QImode, high[0]);
20697 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
20698 s = gen_rtx_NE (QImode, flags, const0_rtx);
20699 emit_insn (gen_rtx_SET (VOIDmode, d, s));
20700 }
20701
20702 /* Otherwise, we can get the same results by manually performing
20703 a bit extract operation on bit 5/6, and then performing the two
20704 shifts. The two methods of getting 0/1 into low/high are exactly
20705 the same size. Avoiding the shift in the bit extract case helps
20706 pentium4 a bit; no one else seems to care much either way. */
20707 else
20708 {
20709 enum machine_mode half_mode;
20710 rtx (*gen_lshr3)(rtx, rtx, rtx);
20711 rtx (*gen_and3)(rtx, rtx, rtx);
20712 rtx (*gen_xor3)(rtx, rtx, rtx);
20713 HOST_WIDE_INT bits;
20714 rtx x;
20715
20716 if (mode == DImode)
20717 {
20718 half_mode = SImode;
20719 gen_lshr3 = gen_lshrsi3;
20720 gen_and3 = gen_andsi3;
20721 gen_xor3 = gen_xorsi3;
20722 bits = 5;
20723 }
20724 else
20725 {
20726 half_mode = DImode;
20727 gen_lshr3 = gen_lshrdi3;
20728 gen_and3 = gen_anddi3;
20729 gen_xor3 = gen_xordi3;
20730 bits = 6;
20731 }
20732
20733 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
20734 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
20735 else
20736 x = gen_lowpart (half_mode, operands[2]);
20737 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
20738
20739 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
20740 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
20741 emit_move_insn (low[0], high[0]);
20742 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
20743 }
20744
20745 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20746 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
20747 return;
20748 }
20749
20750 if (operands[1] == constm1_rtx)
20751 {
20752 /* For -1 << N, we can avoid the shld instruction, because we
20753 know that we're shifting 0...31/63 ones into a -1. */
20754 emit_move_insn (low[0], constm1_rtx);
20755 if (optimize_insn_for_size_p ())
20756 emit_move_insn (high[0], low[0]);
20757 else
20758 emit_move_insn (high[0], constm1_rtx);
20759 }
20760 else
20761 {
20762 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
20763
20764 if (!rtx_equal_p (operands[0], operands[1]))
20765 emit_move_insn (operands[0], operands[1]);
20766
20767 split_double_mode (mode, operands, 1, low, high);
20768 emit_insn (gen_shld (high[0], low[0], operands[2]));
20769 }
20770
20771 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
20772
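/* The shift count is variable; the adjustment pattern below fixes up the
result for counts of half_width or more, using either a cmove with the
cleared scratch register or a conditional branch. */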
20773 if (TARGET_CMOVE && scratch)
20774 {
20775 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20776 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20777
20778 ix86_expand_clear (scratch);
20779 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
20780 }
20781 else
20782 {
20783 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20784 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20785
20786 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
20787 }
20788 }
20789
20790 void
20791 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
20792 {
20793 rtx (*gen_ashr3)(rtx, rtx, rtx)
20794 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
20795 rtx (*gen_shrd)(rtx, rtx, rtx);
20796 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20797
20798 rtx low[2], high[2];
20799 int count;
20800
20801 if (CONST_INT_P (operands[2]))
20802 {
20803 split_double_mode (mode, operands, 2, low, high);
20804 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20805
20806 if (count == GET_MODE_BITSIZE (mode) - 1)
20807 {
20808 emit_move_insn (high[0], high[1]);
20809 emit_insn (gen_ashr3 (high[0], high[0],
20810 GEN_INT (half_width - 1)));
20811 emit_move_insn (low[0], high[0]);
20812
20813 }
20814 else if (count >= half_width)
20815 {
20816 emit_move_insn (low[0], high[1]);
20817 emit_move_insn (high[0], low[0]);
20818 emit_insn (gen_ashr3 (high[0], high[0],
20819 GEN_INT (half_width - 1)));
20820
20821 if (count > half_width)
20822 emit_insn (gen_ashr3 (low[0], low[0],
20823 GEN_INT (count - half_width)));
20824 }
20825 else
20826 {
20827 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20828
20829 if (!rtx_equal_p (operands[0], operands[1]))
20830 emit_move_insn (operands[0], operands[1]);
20831
20832 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20833 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
20834 }
20835 }
20836 else
20837 {
20838 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20839
20840 if (!rtx_equal_p (operands[0], operands[1]))
20841 emit_move_insn (operands[0], operands[1]);
20842
20843 split_double_mode (mode, operands, 1, low, high);
20844
20845 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20846 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
20847
20848 if (TARGET_CMOVE && scratch)
20849 {
20850 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20851 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20852
20853 emit_move_insn (scratch, high[0]);
20854 emit_insn (gen_ashr3 (scratch, scratch,
20855 GEN_INT (half_width - 1)));
20856 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20857 scratch));
20858 }
20859 else
20860 {
20861 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
20862 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
20863
20864 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
20865 }
20866 }
20867 }
20868
20869 void
20870 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
20871 {
20872 rtx (*gen_lshr3)(rtx, rtx, rtx)
20873 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
20874 rtx (*gen_shrd)(rtx, rtx, rtx);
20875 int half_width = GET_MODE_BITSIZE (mode) >> 1;
20876
20877 rtx low[2], high[2];
20878 int count;
20879
20880 if (CONST_INT_P (operands[2]))
20881 {
20882 split_double_mode (mode, operands, 2, low, high);
20883 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
20884
20885 if (count >= half_width)
20886 {
20887 emit_move_insn (low[0], high[1]);
20888 ix86_expand_clear (high[0]);
20889
20890 if (count > half_width)
20891 emit_insn (gen_lshr3 (low[0], low[0],
20892 GEN_INT (count - half_width)));
20893 }
20894 else
20895 {
20896 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20897
20898 if (!rtx_equal_p (operands[0], operands[1]))
20899 emit_move_insn (operands[0], operands[1]);
20900
20901 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
20902 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
20903 }
20904 }
20905 else
20906 {
20907 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
20908
20909 if (!rtx_equal_p (operands[0], operands[1]))
20910 emit_move_insn (operands[0], operands[1]);
20911
20912 split_double_mode (mode, operands, 1, low, high);
20913
20914 emit_insn (gen_shrd (low[0], high[0], operands[2]));
20915 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
20916
20917 if (TARGET_CMOVE && scratch)
20918 {
20919 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
20920 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
20921
20922 ix86_expand_clear (scratch);
20923 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
20924 scratch));
20925 }
20926 else
20927 {
20928 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
20929 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
20930
20931 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
20932 }
20933 }
20934 }
20935
20936 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
20937 static void
20938 predict_jump (int prob)
20939 {
20940 rtx insn = get_last_insn ();
20941 gcc_assert (JUMP_P (insn));
20942 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
20943 }
20944
20945 /* Helper function for the string operations below. Test whether VARIABLE
20946 is aligned to VALUE bytes. If so, jump to the returned label. */
20947 static rtx
20948 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
20949 {
20950 rtx label = gen_label_rtx ();
20951 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
20952 if (GET_MODE (variable) == DImode)
20953 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
20954 else
20955 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
20956 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
20957 1, label);
20958 if (epilogue)
20959 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20960 else
20961 predict_jump (REG_BR_PROB_BASE * 90 / 100);
20962 return label;
20963 }
20964
20965 /* Decrease COUNTREG by VALUE. */
20966 static void
20967 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
20968 {
20969 rtx (*gen_add)(rtx, rtx, rtx)
20970 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
20971
20972 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
20973 }
20974
20975 /* Zero extend possibly SImode EXP to Pmode register. */
20976 rtx
20977 ix86_zero_extend_to_Pmode (rtx exp)
20978 {
20979 rtx r;
20980 if (GET_MODE (exp) == VOIDmode)
20981 return force_reg (Pmode, exp);
20982 if (GET_MODE (exp) == Pmode)
20983 return copy_to_mode_reg (Pmode, exp);
20984 r = gen_reg_rtx (Pmode);
20985 emit_insn (gen_zero_extendsidi2 (r, exp));
20986 return r;
20987 }
20988
20989 /* Divide COUNTREG by SCALE. */
20990 static rtx
20991 scale_counter (rtx countreg, int scale)
20992 {
20993 rtx sc;
20994
20995 if (scale == 1)
20996 return countreg;
20997 if (CONST_INT_P (countreg))
20998 return GEN_INT (INTVAL (countreg) / scale);
20999 gcc_assert (REG_P (countreg));
21000
21001 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21002 GEN_INT (exact_log2 (scale)),
21003 NULL, 1, OPTAB_DIRECT);
21004 return sc;
21005 }
21006
21007 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21008 DImode for constant loop counts. */
21009
21010 static enum machine_mode
21011 counter_mode (rtx count_exp)
21012 {
21013 if (GET_MODE (count_exp) != VOIDmode)
21014 return GET_MODE (count_exp);
21015 if (!CONST_INT_P (count_exp))
21016 return Pmode;
21017 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21018 return DImode;
21019 return SImode;
21020 }
21021
21022 /* When SRCPTR is non-NULL, output a simple loop that moves the memory
21023 pointed to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21024 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
21025 the equivalent loop that sets memory to VALUE (assumed to be in MODE).
21026
21027 The size is rounded down to a whole number of chunks moved at once.
21028 SRCMEM and DESTMEM provide the MEM rtxen to feed proper aliasing info. */
21029
21030
21031 static void
21032 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21033 rtx destptr, rtx srcptr, rtx value,
21034 rtx count, enum machine_mode mode, int unroll,
21035 int expected_size)
21036 {
21037 rtx out_label, top_label, iter, tmp;
21038 enum machine_mode iter_mode = counter_mode (count);
21039 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21040 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21041 rtx size;
21042 rtx x_addr;
21043 rtx y_addr;
21044 int i;
21045
21046 top_label = gen_label_rtx ();
21047 out_label = gen_label_rtx ();
21048 iter = gen_reg_rtx (iter_mode);
21049
21050 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21051 NULL, 1, OPTAB_DIRECT);
21052 /* Those two should combine. */
21053 if (piece_size == const1_rtx)
21054 {
21055 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21056 true, out_label);
21057 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21058 }
21059 emit_move_insn (iter, const0_rtx);
21060
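/* Main loop: ITER advances from 0 to SIZE in steps of PIECE_SIZE, and each
iteration copies or stores UNROLL chunks of MODE at DESTPTR + ITER (and
SRCPTR + ITER when copying). */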
21061 emit_label (top_label);
21062
21063 tmp = convert_modes (Pmode, iter_mode, iter, true);
21064 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21065 destmem = change_address (destmem, mode, x_addr);
21066
21067 if (srcmem)
21068 {
21069 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21070 srcmem = change_address (srcmem, mode, y_addr);
21071
21072 /* When unrolling for chips that reorder memory reads and writes,
21073 we can save registers by using a single temporary.
21074 Also, using 4 temporaries is overkill in 32bit mode. */
21075 if (!TARGET_64BIT && 0)
21076 {
21077 for (i = 0; i < unroll; i++)
21078 {
21079 if (i)
21080 {
21081 destmem =
21082 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21083 srcmem =
21084 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21085 }
21086 emit_move_insn (destmem, srcmem);
21087 }
21088 }
21089 else
21090 {
21091 rtx tmpreg[4];
21092 gcc_assert (unroll <= 4);
21093 for (i = 0; i < unroll; i++)
21094 {
21095 tmpreg[i] = gen_reg_rtx (mode);
21096 if (i)
21097 {
21098 srcmem =
21099 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21100 }
21101 emit_move_insn (tmpreg[i], srcmem);
21102 }
21103 for (i = 0; i < unroll; i++)
21104 {
21105 if (i)
21106 {
21107 destmem =
21108 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21109 }
21110 emit_move_insn (destmem, tmpreg[i]);
21111 }
21112 }
21113 }
21114 else
21115 for (i = 0; i < unroll; i++)
21116 {
21117 if (i)
21118 destmem =
21119 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21120 emit_move_insn (destmem, value);
21121 }
21122
21123 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21124 true, OPTAB_LIB_WIDEN);
21125 if (tmp != iter)
21126 emit_move_insn (iter, tmp);
21127
21128 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21129 true, top_label);
21130 if (expected_size != -1)
21131 {
21132 expected_size /= GET_MODE_SIZE (mode) * unroll;
21133 if (expected_size == 0)
21134 predict_jump (0);
21135 else if (expected_size > REG_BR_PROB_BASE)
21136 predict_jump (REG_BR_PROB_BASE - 1);
21137 else
21138 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21139 }
21140 else
21141 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21142 iter = ix86_zero_extend_to_Pmode (iter);
21143 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21144 true, OPTAB_LIB_WIDEN);
21145 if (tmp != destptr)
21146 emit_move_insn (destptr, tmp);
21147 if (srcptr)
21148 {
21149 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21150 true, OPTAB_LIB_WIDEN);
21151 if (tmp != srcptr)
21152 emit_move_insn (srcptr, tmp);
21153 }
21154 emit_label (out_label);
21155 }
21156
21157 /* Output a "rep; mov" instruction.
21158 Arguments have the same meaning as for the previous function. */
21159 static void
21160 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21161 rtx destptr, rtx srcptr,
21162 rtx count,
21163 enum machine_mode mode)
21164 {
21165 rtx destexp;
21166 rtx srcexp;
21167 rtx countreg;
21168 HOST_WIDE_INT rounded_count;
21169
21170 /* If the size is known to be divisible by 4, it is shorter to use the wider rep movs. */
21171 if (mode == QImode && CONST_INT_P (count)
21172 && !(INTVAL (count) & 3))
21173 mode = SImode;
21174
21175 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21176 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21177 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21178 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21179 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
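/* For the rep_mov pattern, DESTEXP and SRCEXP give the final values of the
destination and source pointers, i.e. the pointer plus the number of bytes
moved. */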
21180 if (mode != QImode)
21181 {
21182 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21183 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21184 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21185 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21186 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21187 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21188 }
21189 else
21190 {
21191 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21192 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21193 }
21194 if (CONST_INT_P (count))
21195 {
21196 rounded_count = (INTVAL (count)
21197 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21198 destmem = shallow_copy_rtx (destmem);
21199 srcmem = shallow_copy_rtx (srcmem);
21200 set_mem_size (destmem, rounded_count);
21201 set_mem_size (srcmem, rounded_count);
21202 }
21203 else
21204 {
21205 if (MEM_SIZE_KNOWN_P (destmem))
21206 clear_mem_size (destmem);
21207 if (MEM_SIZE_KNOWN_P (srcmem))
21208 clear_mem_size (srcmem);
21209 }
21210 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21211 destexp, srcexp));
21212 }
21213
21214 /* Output a "rep; stos" instruction.
21215 Arguments have the same meaning as for the previous function. */
21216 static void
21217 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21218 rtx count, enum machine_mode mode,
21219 rtx orig_value)
21220 {
21221 rtx destexp;
21222 rtx countreg;
21223 HOST_WIDE_INT rounded_count;
21224
21225 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21226 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21227 value = force_reg (mode, gen_lowpart (mode, value));
21228 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21229 if (mode != QImode)
21230 {
21231 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21232 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21233 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21234 }
21235 else
21236 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21237 if (orig_value == const0_rtx && CONST_INT_P (count))
21238 {
21239 rounded_count = (INTVAL (count)
21240 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21241 destmem = shallow_copy_rtx (destmem);
21242 set_mem_size (destmem, rounded_count);
21243 }
21244 else if (MEM_SIZE_KNOWN_P (destmem))
21245 clear_mem_size (destmem);
21246 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21247 }
21248
21249 static void
21250 emit_strmov (rtx destmem, rtx srcmem,
21251 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21252 {
21253 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21254 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21255 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21256 }
21257
21258 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
21259 static void
21260 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21261 rtx destptr, rtx srcptr, rtx count, int max_size)
21262 {
21263 rtx src, dest;
21264 if (CONST_INT_P (count))
21265 {
21266 HOST_WIDE_INT countval = INTVAL (count);
21267 int offset = 0;
21268
21269 if ((countval & 0x10) && max_size > 16)
21270 {
21271 if (TARGET_64BIT)
21272 {
21273 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21274 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21275 }
21276 else
21277 gcc_unreachable ();
21278 offset += 16;
21279 }
21280 if ((countval & 0x08) && max_size > 8)
21281 {
21282 if (TARGET_64BIT)
21283 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21284 else
21285 {
21286 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21287 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21288 }
21289 offset += 8;
21290 }
21291 if ((countval & 0x04) && max_size > 4)
21292 {
21293 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21294 offset += 4;
21295 }
21296 if ((countval & 0x02) && max_size > 2)
21297 {
21298 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21299 offset += 2;
21300 }
21301 if ((countval & 0x01) && max_size > 1)
21302 {
21303 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21304 offset += 1;
21305 }
21306 return;
21307 }
21308 if (max_size > 8)
21309 {
21310 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21311 count, 1, OPTAB_DIRECT);
21312 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21313 count, QImode, 1, 4);
21314 return;
21315 }
21316
21317 /* When single stringop insns are available, we can cheaply increase the dest
21318 and src pointers. Otherwise we save code size by maintaining an offset
21319 (zero is readily available from the preceding rep operation) and using
21320 x86 addressing modes. */
21321 if (TARGET_SINGLE_STRINGOP)
21322 {
21323 if (max_size > 4)
21324 {
21325 rtx label = ix86_expand_aligntest (count, 4, true);
21326 src = change_address (srcmem, SImode, srcptr);
21327 dest = change_address (destmem, SImode, destptr);
21328 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21329 emit_label (label);
21330 LABEL_NUSES (label) = 1;
21331 }
21332 if (max_size > 2)
21333 {
21334 rtx label = ix86_expand_aligntest (count, 2, true);
21335 src = change_address (srcmem, HImode, srcptr);
21336 dest = change_address (destmem, HImode, destptr);
21337 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21338 emit_label (label);
21339 LABEL_NUSES (label) = 1;
21340 }
21341 if (max_size > 1)
21342 {
21343 rtx label = ix86_expand_aligntest (count, 1, true);
21344 src = change_address (srcmem, QImode, srcptr);
21345 dest = change_address (destmem, QImode, destptr);
21346 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21347 emit_label (label);
21348 LABEL_NUSES (label) = 1;
21349 }
21350 }
21351 else
21352 {
21353 rtx offset = force_reg (Pmode, const0_rtx);
21354 rtx tmp;
21355
21356 if (max_size > 4)
21357 {
21358 rtx label = ix86_expand_aligntest (count, 4, true);
21359 src = change_address (srcmem, SImode, srcptr);
21360 dest = change_address (destmem, SImode, destptr);
21361 emit_move_insn (dest, src);
21362 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21363 true, OPTAB_LIB_WIDEN);
21364 if (tmp != offset)
21365 emit_move_insn (offset, tmp);
21366 emit_label (label);
21367 LABEL_NUSES (label) = 1;
21368 }
21369 if (max_size > 2)
21370 {
21371 rtx label = ix86_expand_aligntest (count, 2, true);
21372 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21373 src = change_address (srcmem, HImode, tmp);
21374 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21375 dest = change_address (destmem, HImode, tmp);
21376 emit_move_insn (dest, src);
21377 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21378 true, OPTAB_LIB_WIDEN);
21379 if (tmp != offset)
21380 emit_move_insn (offset, tmp);
21381 emit_label (label);
21382 LABEL_NUSES (label) = 1;
21383 }
21384 if (max_size > 1)
21385 {
21386 rtx label = ix86_expand_aligntest (count, 1, true);
21387 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21388 src = change_address (srcmem, QImode, tmp);
21389 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21390 dest = change_address (destmem, QImode, tmp);
21391 emit_move_insn (dest, src);
21392 emit_label (label);
21393 LABEL_NUSES (label) = 1;
21394 }
21395 }
21396 }
21397
21398 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21399 static void
21400 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21401 rtx count, int max_size)
21402 {
21403 count =
21404 expand_simple_binop (counter_mode (count), AND, count,
21405 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21406 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21407 gen_lowpart (QImode, value), count, QImode,
21408 1, max_size / 2);
21409 }
21410
21411 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21412 static void
21413 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21414 {
21415 rtx dest;
21416
21417 if (CONST_INT_P (count))
21418 {
21419 HOST_WIDE_INT countval = INTVAL (count);
21420 int offset = 0;
21421
21422 if ((countval & 0x10) && max_size > 16)
21423 {
21424 if (TARGET_64BIT)
21425 {
21426 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21427 emit_insn (gen_strset (destptr, dest, value));
21428 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21429 emit_insn (gen_strset (destptr, dest, value));
21430 }
21431 else
21432 gcc_unreachable ();
21433 offset += 16;
21434 }
21435 if ((countval & 0x08) && max_size > 8)
21436 {
21437 if (TARGET_64BIT)
21438 {
21439 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21440 emit_insn (gen_strset (destptr, dest, value));
21441 }
21442 else
21443 {
21444 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21445 emit_insn (gen_strset (destptr, dest, value));
21446 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21447 emit_insn (gen_strset (destptr, dest, value));
21448 }
21449 offset += 8;
21450 }
21451 if ((countval & 0x04) && max_size > 4)
21452 {
21453 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21454 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21455 offset += 4;
21456 }
21457 if ((countval & 0x02) && max_size > 2)
21458 {
21459 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
21460 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21461 offset += 2;
21462 }
21463 if ((countval & 0x01) && max_size > 1)
21464 {
21465 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
21466 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21467 offset += 1;
21468 }
21469 return;
21470 }
21471 if (max_size > 32)
21472 {
21473 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
21474 return;
21475 }
21476 if (max_size > 16)
21477 {
21478 rtx label = ix86_expand_aligntest (count, 16, true);
21479 if (TARGET_64BIT)
21480 {
21481 dest = change_address (destmem, DImode, destptr);
21482 emit_insn (gen_strset (destptr, dest, value));
21483 emit_insn (gen_strset (destptr, dest, value));
21484 }
21485 else
21486 {
21487 dest = change_address (destmem, SImode, destptr);
21488 emit_insn (gen_strset (destptr, dest, value));
21489 emit_insn (gen_strset (destptr, dest, value));
21490 emit_insn (gen_strset (destptr, dest, value));
21491 emit_insn (gen_strset (destptr, dest, value));
21492 }
21493 emit_label (label);
21494 LABEL_NUSES (label) = 1;
21495 }
21496 if (max_size > 8)
21497 {
21498 rtx label = ix86_expand_aligntest (count, 8, true);
21499 if (TARGET_64BIT)
21500 {
21501 dest = change_address (destmem, DImode, destptr);
21502 emit_insn (gen_strset (destptr, dest, value));
21503 }
21504 else
21505 {
21506 dest = change_address (destmem, SImode, destptr);
21507 emit_insn (gen_strset (destptr, dest, value));
21508 emit_insn (gen_strset (destptr, dest, value));
21509 }
21510 emit_label (label);
21511 LABEL_NUSES (label) = 1;
21512 }
21513 if (max_size > 4)
21514 {
21515 rtx label = ix86_expand_aligntest (count, 4, true);
21516 dest = change_address (destmem, SImode, destptr);
21517 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
21518 emit_label (label);
21519 LABEL_NUSES (label) = 1;
21520 }
21521 if (max_size > 2)
21522 {
21523 rtx label = ix86_expand_aligntest (count, 2, true);
21524 dest = change_address (destmem, HImode, destptr);
21525 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
21526 emit_label (label);
21527 LABEL_NUSES (label) = 1;
21528 }
21529 if (max_size > 1)
21530 {
21531 rtx label = ix86_expand_aligntest (count, 1, true);
21532 dest = change_address (destmem, QImode, destptr);
21533 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
21534 emit_label (label);
21535 LABEL_NUSES (label) = 1;
21536 }
21537 }
21538
21539 /* Copy enough bytes from SRC to DEST to align DEST, known to be aligned by
21540 ALIGN, to DESIRED_ALIGNMENT. */
21541 static void
21542 expand_movmem_prologue (rtx destmem, rtx srcmem,
21543 rtx destptr, rtx srcptr, rtx count,
21544 int align, int desired_alignment)
21545 {
21546 if (align <= 1 && desired_alignment > 1)
21547 {
21548 rtx label = ix86_expand_aligntest (destptr, 1, false);
21549 srcmem = change_address (srcmem, QImode, srcptr);
21550 destmem = change_address (destmem, QImode, destptr);
21551 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21552 ix86_adjust_counter (count, 1);
21553 emit_label (label);
21554 LABEL_NUSES (label) = 1;
21555 }
21556 if (align <= 2 && desired_alignment > 2)
21557 {
21558 rtx label = ix86_expand_aligntest (destptr, 2, false);
21559 srcmem = change_address (srcmem, HImode, srcptr);
21560 destmem = change_address (destmem, HImode, destptr);
21561 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21562 ix86_adjust_counter (count, 2);
21563 emit_label (label);
21564 LABEL_NUSES (label) = 1;
21565 }
21566 if (align <= 4 && desired_alignment > 4)
21567 {
21568 rtx label = ix86_expand_aligntest (destptr, 4, false);
21569 srcmem = change_address (srcmem, SImode, srcptr);
21570 destmem = change_address (destmem, SImode, destptr);
21571 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
21572 ix86_adjust_counter (count, 4);
21573 emit_label (label);
21574 LABEL_NUSES (label) = 1;
21575 }
21576 gcc_assert (desired_alignment <= 8);
21577 }
21578
21579 /* Copy enough bytes from SRC to DST to align DST to DESIRED_ALIGN.
21580 ALIGN_BYTES is how many bytes need to be copied. */
21581 static rtx
21582 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
21583 int desired_align, int align_bytes)
21584 {
21585 rtx src = *srcp;
21586 rtx orig_dst = dst;
21587 rtx orig_src = src;
21588 int off = 0;
21589 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
21590 if (src_align_bytes >= 0)
21591 src_align_bytes = desired_align - src_align_bytes;
21592 if (align_bytes & 1)
21593 {
21594 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21595 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
21596 off = 1;
21597 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21598 }
21599 if (align_bytes & 2)
21600 {
21601 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21602 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
21603 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21604 set_mem_align (dst, 2 * BITS_PER_UNIT);
21605 if (src_align_bytes >= 0
21606 && (src_align_bytes & 1) == (align_bytes & 1)
21607 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
21608 set_mem_align (src, 2 * BITS_PER_UNIT);
21609 off = 2;
21610 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21611 }
21612 if (align_bytes & 4)
21613 {
21614 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21615 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
21616 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21617 set_mem_align (dst, 4 * BITS_PER_UNIT);
21618 if (src_align_bytes >= 0)
21619 {
21620 unsigned int src_align = 0;
21621 if ((src_align_bytes & 3) == (align_bytes & 3))
21622 src_align = 4;
21623 else if ((src_align_bytes & 1) == (align_bytes & 1))
21624 src_align = 2;
21625 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21626 set_mem_align (src, src_align * BITS_PER_UNIT);
21627 }
21628 off = 4;
21629 emit_insn (gen_strmov (destreg, dst, srcreg, src));
21630 }
21631 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21632 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
21633 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21634 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21635 if (src_align_bytes >= 0)
21636 {
21637 unsigned int src_align = 0;
21638 if ((src_align_bytes & 7) == (align_bytes & 7))
21639 src_align = 8;
21640 else if ((src_align_bytes & 3) == (align_bytes & 3))
21641 src_align = 4;
21642 else if ((src_align_bytes & 1) == (align_bytes & 1))
21643 src_align = 2;
21644 if (src_align > (unsigned int) desired_align)
21645 src_align = desired_align;
21646 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
21647 set_mem_align (src, src_align * BITS_PER_UNIT);
21648 }
21649 if (MEM_SIZE_KNOWN_P (orig_dst))
21650 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21651 if (MEM_SIZE_KNOWN_P (orig_src))
21652 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
21653 *srcp = src;
21654 return dst;
21655 }
21656
21657 /* Set enough of DEST to align DEST, known to be aligned by ALIGN, to
21658 DESIRED_ALIGNMENT. */
21659 static void
21660 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
21661 int align, int desired_alignment)
21662 {
21663 if (align <= 1 && desired_alignment > 1)
21664 {
21665 rtx label = ix86_expand_aligntest (destptr, 1, false);
21666 destmem = change_address (destmem, QImode, destptr);
21667 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
21668 ix86_adjust_counter (count, 1);
21669 emit_label (label);
21670 LABEL_NUSES (label) = 1;
21671 }
21672 if (align <= 2 && desired_alignment > 2)
21673 {
21674 rtx label = ix86_expand_aligntest (destptr, 2, false);
21675 destmem = change_address (destmem, HImode, destptr);
21676 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
21677 ix86_adjust_counter (count, 2);
21678 emit_label (label);
21679 LABEL_NUSES (label) = 1;
21680 }
21681 if (align <= 4 && desired_alignment > 4)
21682 {
21683 rtx label = ix86_expand_aligntest (destptr, 4, false);
21684 destmem = change_address (destmem, SImode, destptr);
21685 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
21686 ix86_adjust_counter (count, 4);
21687 emit_label (label);
21688 LABEL_NUSES (label) = 1;
21689 }
21690 gcc_assert (desired_alignment <= 8);
21691 }
21692
21693 /* Set enough of DST to align DST to DESIRED_ALIGN.
21694 ALIGN_BYTES is how many bytes need to be stored. */
21695 static rtx
21696 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
21697 int desired_align, int align_bytes)
21698 {
21699 int off = 0;
21700 rtx orig_dst = dst;
21701 if (align_bytes & 1)
21702 {
21703 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
21704 off = 1;
21705 emit_insn (gen_strset (destreg, dst,
21706 gen_lowpart (QImode, value)));
21707 }
21708 if (align_bytes & 2)
21709 {
21710 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
21711 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
21712 set_mem_align (dst, 2 * BITS_PER_UNIT);
21713 off = 2;
21714 emit_insn (gen_strset (destreg, dst,
21715 gen_lowpart (HImode, value)));
21716 }
21717 if (align_bytes & 4)
21718 {
21719 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
21720 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
21721 set_mem_align (dst, 4 * BITS_PER_UNIT);
21722 off = 4;
21723 emit_insn (gen_strset (destreg, dst,
21724 gen_lowpart (SImode, value)));
21725 }
21726 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
21727 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
21728 set_mem_align (dst, desired_align * BITS_PER_UNIT);
21729 if (MEM_SIZE_KNOWN_P (orig_dst))
21730 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
21731 return dst;
21732 }
21733
21734 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
21735 static enum stringop_alg
21736 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
21737 int *dynamic_check)
21738 {
21739 const struct stringop_algs * algs;
21740 bool optimize_for_speed;
21741 /* Algorithms using the rep prefix want at least edi and ecx;
21742 additionally, memset wants eax and memcpy wants esi. Don't
21743 consider such algorithms if the user has appropriated those
21744 registers for their own purposes. */
21745 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
21746 || (memset
21747 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
21748
21749 #define ALG_USABLE_P(alg) (rep_prefix_usable \
21750 || (alg != rep_prefix_1_byte \
21751 && alg != rep_prefix_4_byte \
21752 && alg != rep_prefix_8_byte))
21753 const struct processor_costs *cost;
21754
21755 /* Even if the string operation call is cold, we still might spend a lot
21756 of time processing large blocks. */
21757 if (optimize_function_for_size_p (cfun)
21758 || (optimize_insn_for_size_p ()
21759 && expected_size != -1 && expected_size < 256))
21760 optimize_for_speed = false;
21761 else
21762 optimize_for_speed = true;
21763
21764 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
21765
21766 *dynamic_check = -1;
21767 if (memset)
21768 algs = &cost->memset[TARGET_64BIT != 0];
21769 else
21770 algs = &cost->memcpy[TARGET_64BIT != 0];
21771 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
21772 return ix86_stringop_alg;
21773 /* rep; movq or rep; movl is the smallest variant. */
21774 else if (!optimize_for_speed)
21775 {
21776 if (!count || (count & 3))
21777 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
21778 else
21779 return rep_prefix_usable ? rep_prefix_4_byte : loop;
21780 }
21781 /* Very tiny blocks are best handled via the loop; REP is expensive to set up.
21782 */
21783 else if (expected_size != -1 && expected_size < 4)
21784 return loop_1_byte;
21785 else if (expected_size != -1)
21786 {
21787 unsigned int i;
21788 enum stringop_alg alg = libcall;
21789 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21790 {
21791 /* We get here if the algorithms that were not libcall-based
21792 were rep-prefix based and we are unable to use rep prefixes
21793 based on global register usage. Break out of the loop and
21794 use the heuristic below. */
21795 if (algs->size[i].max == 0)
21796 break;
21797 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
21798 {
21799 enum stringop_alg candidate = algs->size[i].alg;
21800
21801 if (candidate != libcall && ALG_USABLE_P (candidate))
21802 alg = candidate;
21803 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
21804 last non-libcall inline algorithm. */
21805 if (TARGET_INLINE_ALL_STRINGOPS)
21806 {
21807 /* When the current size is best copied by a libcall,
21808 but we are still forced to inline, run the heuristic below
21809 that will pick code for medium-sized blocks. */
21810 if (alg != libcall)
21811 return alg;
21812 break;
21813 }
21814 else if (ALG_USABLE_P (candidate))
21815 return candidate;
21816 }
21817 }
21818 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
21819 }
21820 /* When asked to inline the call anyway, try to pick a meaningful choice.
21821 We look for the maximal size of block that is faster to copy by hand and
21822 take blocks of at most that size, guessing that the average size will
21823 be roughly half of the block.
21824 
21825 If this turns out to be bad, we might simply specify the preferred
21826 choice in ix86_costs. */
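/* Illustrative example (figures derived from the code below): if the
   largest non-libcall entry in the cost table covers blocks up to 4096
   bytes, the recursive call below re-runs decide_alg with expected_size
   4096/2 = 2048 to pick an inline algorithm for a typical block, and
   with -minline-stringops-dynamically *dynamic_check is set to 4096 so
   that larger blocks still end up in the library call.  */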
21827 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21828 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
21829 {
21830 int max = -1;
21831 enum stringop_alg alg;
21832 int i;
21833 bool any_alg_usable_p = true;
21834
21835 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
21836 {
21837 enum stringop_alg candidate = algs->size[i].alg;
21838 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
21839
21840 if (candidate != libcall && candidate
21841 && ALG_USABLE_P (candidate))
21842 max = algs->size[i].max;
21843 }
21844 /* If there aren't any usable algorithms, then recursing on
21845 smaller sizes isn't going to find anything. Just return the
21846 simple byte-at-a-time copy loop. */
21847 if (!any_alg_usable_p)
21848 {
21849 /* Pick something reasonable. */
21850 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21851 *dynamic_check = 128;
21852 return loop_1_byte;
21853 }
21854 if (max == -1)
21855 max = 4096;
21856 alg = decide_alg (count, max / 2, memset, dynamic_check);
21857 gcc_assert (*dynamic_check == -1);
21858 gcc_assert (alg != libcall);
21859 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
21860 *dynamic_check = max;
21861 return alg;
21862 }
21863 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
21864 #undef ALG_USABLE_P
21865 }
21866
21867 /* Decide on alignment. We know that the operand is already aligned to ALIGN
21868 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
21869 static int
21870 decide_alignment (int align,
21871 enum stringop_alg alg,
21872 int expected_size)
21873 {
21874 int desired_align = 0;
21875 switch (alg)
21876 {
21877 case no_stringop:
21878 gcc_unreachable ();
21879 case loop:
21880 case unrolled_loop:
21881 desired_align = GET_MODE_SIZE (Pmode);
21882 break;
21883 case rep_prefix_8_byte:
21884 desired_align = 8;
21885 break;
21886 case rep_prefix_4_byte:
21887 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
21888 copying the whole cache line at once. */
21889 if (TARGET_PENTIUMPRO)
21890 desired_align = 8;
21891 else
21892 desired_align = 4;
21893 break;
21894 case rep_prefix_1_byte:
21895 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
21896 copying the whole cache line at once. */
21897 if (TARGET_PENTIUMPRO)
21898 desired_align = 8;
21899 else
21900 desired_align = 1;
21901 break;
21902 case loop_1_byte:
21903 desired_align = 1;
21904 break;
21905 case libcall:
21906 return 0;
21907 }
21908
21909 if (optimize_size)
21910 desired_align = 1;
21911 if (desired_align < align)
21912 desired_align = align;
21913 if (expected_size != -1 && expected_size < 4)
21914 desired_align = align;
21915 return desired_align;
21916 }
21917
21918 /* Return the smallest power of 2 greater than VAL. */
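/* For example, smallest_pow2_greater_than (0) == 1, (4) == 8 and (31) == 32;
   the result is always strictly greater than VAL.  */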
21919 static int
21920 smallest_pow2_greater_than (int val)
21921 {
21922 int ret = 1;
21923 while (ret <= val)
21924 ret <<= 1;
21925 return ret;
21926 }
21927
21928 /* Expand string move (memcpy) operation. Use i386 string operations
21929 when profitable. expand_setmem contains similar code. The code
21930 depends upon architecture, block size and alignment, but always has
21931 the same overall structure:
21932
21933 1) Prologue guard: Conditional that jumps up to epilogues for small
21934 blocks that can be handled by the epilogue alone. This is faster
21935 but also needed for correctness, since the prologue assumes the block
21936 is larger than the desired alignment.
21937
21938 Optional dynamic check for size and libcall for large
21939 blocks is emitted here too, with -minline-stringops-dynamically.
21940
21941 2) Prologue: copy first few bytes in order to get destination
21942 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
21943 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
21944 copied. We emit either a jump tree on power of two sized
21945 blocks, or a byte loop.
21946
21947 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
21948 with specified algorithm.
21949
21950 4) Epilogue: code copying tail of the block that is too small to be
21951 handled by main body (or up to size guarded by prologue guard). */
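/* Illustrative walk-through (numbers derived from the code below): a 100
   byte copy with known 4 byte alignment using the unrolled_loop algorithm
   in 64-bit mode gives size_needed = 32 and desired_align = 8; the
   prologue copies up to 4 bytes to reach 8 byte alignment, the main loop
   then moves 32 byte chunks, and the epilogue handles the remaining tail
   of up to 31 bytes.  */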
21952
21953 bool
21954 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
21955 rtx expected_align_exp, rtx expected_size_exp)
21956 {
21957 rtx destreg;
21958 rtx srcreg;
21959 rtx label = NULL;
21960 rtx tmp;
21961 rtx jump_around_label = NULL;
21962 HOST_WIDE_INT align = 1;
21963 unsigned HOST_WIDE_INT count = 0;
21964 HOST_WIDE_INT expected_size = -1;
21965 int size_needed = 0, epilogue_size_needed;
21966 int desired_align = 0, align_bytes = 0;
21967 enum stringop_alg alg;
21968 int dynamic_check;
21969 bool need_zero_guard = false;
21970
21971 if (CONST_INT_P (align_exp))
21972 align = INTVAL (align_exp);
21973 /* i386 can do misaligned access at a reasonably increased cost. */
21974 if (CONST_INT_P (expected_align_exp)
21975 && INTVAL (expected_align_exp) > align)
21976 align = INTVAL (expected_align_exp);
21977 /* ALIGN is the minimum of destination and source alignment, but we care here
21978 just about destination alignment. */
21979 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
21980 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
21981
21982 if (CONST_INT_P (count_exp))
21983 count = expected_size = INTVAL (count_exp);
21984 if (CONST_INT_P (expected_size_exp) && count == 0)
21985 expected_size = INTVAL (expected_size_exp);
21986
21987 /* Make sure we don't need to care about overflow later on. */
21988 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
21989 return false;
21990
21991 /* Step 0: Decide on preferred algorithm, desired alignment and
21992 size of chunks to be copied by main loop. */
21993
21994 alg = decide_alg (count, expected_size, false, &dynamic_check);
21995 desired_align = decide_alignment (align, alg, expected_size);
21996
21997 if (!TARGET_ALIGN_STRINGOPS)
21998 align = desired_align;
21999
22000 if (alg == libcall)
22001 return false;
22002 gcc_assert (alg != no_stringop);
22003 if (!count)
22004 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22005 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22006 srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
22007 switch (alg)
22008 {
22009 case libcall:
22010 case no_stringop:
22011 gcc_unreachable ();
22012 case loop:
22013 need_zero_guard = true;
22014 size_needed = GET_MODE_SIZE (Pmode);
22015 break;
22016 case unrolled_loop:
22017 need_zero_guard = true;
22018 size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
22019 break;
22020 case rep_prefix_8_byte:
22021 size_needed = 8;
22022 break;
22023 case rep_prefix_4_byte:
22024 size_needed = 4;
22025 break;
22026 case rep_prefix_1_byte:
22027 size_needed = 1;
22028 break;
22029 case loop_1_byte:
22030 need_zero_guard = true;
22031 size_needed = 1;
22032 break;
22033 }
22034
22035 epilogue_size_needed = size_needed;
22036
22037 /* Step 1: Prologue guard. */
22038
22039 /* Alignment code needs count to be in register. */
22040 if (CONST_INT_P (count_exp) && desired_align > align)
22041 {
22042 if (INTVAL (count_exp) > desired_align
22043 && INTVAL (count_exp) > size_needed)
22044 {
22045 align_bytes
22046 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22047 if (align_bytes <= 0)
22048 align_bytes = 0;
22049 else
22050 align_bytes = desired_align - align_bytes;
22051 }
22052 if (align_bytes == 0)
22053 count_exp = force_reg (counter_mode (count_exp), count_exp);
22054 }
22055 gcc_assert (desired_align >= 1 && align >= 1);
22056
22057 /* Ensure that alignment prologue won't copy past end of block. */
22058 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22059 {
22060 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22061 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22062 Make sure it is a power of 2. */
22063 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22064
22065 if (count)
22066 {
22067 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22068 {
22069 /* If main algorithm works on QImode, no epilogue is needed.
22070 For small sizes just don't align anything. */
22071 if (size_needed == 1)
22072 desired_align = align;
22073 else
22074 goto epilogue;
22075 }
22076 }
22077 else
22078 {
22079 label = gen_label_rtx ();
22080 emit_cmp_and_jump_insns (count_exp,
22081 GEN_INT (epilogue_size_needed),
22082 LTU, 0, counter_mode (count_exp), 1, label);
22083 if (expected_size == -1 || expected_size < epilogue_size_needed)
22084 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22085 else
22086 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22087 }
22088 }
22089
22090 /* Emit code to decide at runtime whether a library call or inline code
22091 should be used. */
22092 if (dynamic_check != -1)
22093 {
22094 if (CONST_INT_P (count_exp))
22095 {
22096 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22097 {
22098 emit_block_move_via_libcall (dst, src, count_exp, false);
22099 count_exp = const0_rtx;
22100 goto epilogue;
22101 }
22102 }
22103 else
22104 {
22105 rtx hot_label = gen_label_rtx ();
22106 jump_around_label = gen_label_rtx ();
22107 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22108 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22109 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22110 emit_block_move_via_libcall (dst, src, count_exp, false);
22111 emit_jump (jump_around_label);
22112 emit_label (hot_label);
22113 }
22114 }
22115
22116 /* Step 2: Alignment prologue. */
22117
22118 if (desired_align > align)
22119 {
22120 if (align_bytes == 0)
22121 {
22122 /* Except for the first move in the epilogue, we no longer know
22123 the constant offset in the aliasing info. It does not seem worth
22124 the pain to maintain it for the first move, so throw away
22125 the info early. */
22126 src = change_address (src, BLKmode, srcreg);
22127 dst = change_address (dst, BLKmode, destreg);
22128 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22129 desired_align);
22130 }
22131 else
22132 {
22133 /* If we know how many bytes need to be stored before dst is
22134 sufficiently aligned, maintain aliasing info accurately. */
22135 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22136 desired_align, align_bytes);
22137 count_exp = plus_constant (count_exp, -align_bytes);
22138 count -= align_bytes;
22139 }
22140 if (need_zero_guard
22141 && (count < (unsigned HOST_WIDE_INT) size_needed
22142 || (align_bytes == 0
22143 && count < ((unsigned HOST_WIDE_INT) size_needed
22144 + desired_align - align))))
22145 {
22146 /* It is possible that we copied enough so the main loop will not
22147 execute. */
22148 gcc_assert (size_needed > 1);
22149 if (label == NULL_RTX)
22150 label = gen_label_rtx ();
22151 emit_cmp_and_jump_insns (count_exp,
22152 GEN_INT (size_needed),
22153 LTU, 0, counter_mode (count_exp), 1, label);
22154 if (expected_size == -1
22155 || expected_size < (desired_align - align) / 2 + size_needed)
22156 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22157 else
22158 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22159 }
22160 }
22161 if (label && size_needed == 1)
22162 {
22163 emit_label (label);
22164 LABEL_NUSES (label) = 1;
22165 label = NULL;
22166 epilogue_size_needed = 1;
22167 }
22168 else if (label == NULL_RTX)
22169 epilogue_size_needed = size_needed;
22170
22171 /* Step 3: Main loop. */
22172
22173 switch (alg)
22174 {
22175 case libcall:
22176 case no_stringop:
22177 gcc_unreachable ();
22178 case loop_1_byte:
22179 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22180 count_exp, QImode, 1, expected_size);
22181 break;
22182 case loop:
22183 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22184 count_exp, Pmode, 1, expected_size);
22185 break;
22186 case unrolled_loop:
22187 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22188 registers for 4 temporaries anyway. */
22189 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22190 count_exp, Pmode, TARGET_64BIT ? 4 : 2,
22191 expected_size);
22192 break;
22193 case rep_prefix_8_byte:
22194 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22195 DImode);
22196 break;
22197 case rep_prefix_4_byte:
22198 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22199 SImode);
22200 break;
22201 case rep_prefix_1_byte:
22202 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22203 QImode);
22204 break;
22205 }
22206 /* Properly adjust the offsets of the src and dest memory for aliasing. */
22207 if (CONST_INT_P (count_exp))
22208 {
22209 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22210 (count / size_needed) * size_needed);
22211 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22212 (count / size_needed) * size_needed);
22213 }
22214 else
22215 {
22216 src = change_address (src, BLKmode, srcreg);
22217 dst = change_address (dst, BLKmode, destreg);
22218 }
22219
22220 /* Step 4: Epilogue to copy the remaining bytes. */
22221 epilogue:
22222 if (label)
22223 {
22224 /* When the main loop is done, COUNT_EXP might hold the original count,
22225 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22226 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22227 bytes. Compensate if needed. */
22228
22229 if (size_needed < epilogue_size_needed)
22230 {
22231 tmp =
22232 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22233 GEN_INT (size_needed - 1), count_exp, 1,
22234 OPTAB_DIRECT);
22235 if (tmp != count_exp)
22236 emit_move_insn (count_exp, tmp);
22237 }
22238 emit_label (label);
22239 LABEL_NUSES (label) = 1;
22240 }
22241
22242 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22243 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22244 epilogue_size_needed);
22245 if (jump_around_label)
22246 emit_label (jump_around_label);
22247 return true;
22248 }
22249
22250 /* Helper function for memset. For the QImode value 0xXY produce
22251 0xXYXYXYXY of the width specified by MODE. This is essentially
22252 a * 0x01010101, but we can do slightly better than
22253 synth_mult by unwinding the sequence by hand on CPUs with
22254 slow multiply. */
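/* For example, on the shift/or path below (SImode): starting from
   reg = 0x000000XY, "reg |= reg << 8" gives 0x0000XYXY and
   "reg |= reg << 16" gives 0xXYXYXYXY; DImode adds a final
   "reg |= reg << 32" to obtain 0xXYXYXYXYXYXYXYXY.  */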
22255 static rtx
22256 promote_duplicated_reg (enum machine_mode mode, rtx val)
22257 {
22258 enum machine_mode valmode = GET_MODE (val);
22259 rtx tmp;
22260 int nops = mode == DImode ? 3 : 2;
22261
22262 gcc_assert (mode == SImode || mode == DImode);
22263 if (val == const0_rtx)
22264 return copy_to_mode_reg (mode, const0_rtx);
22265 if (CONST_INT_P (val))
22266 {
22267 HOST_WIDE_INT v = INTVAL (val) & 255;
22268
22269 v |= v << 8;
22270 v |= v << 16;
22271 if (mode == DImode)
22272 v |= (v << 16) << 16;
22273 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22274 }
22275
22276 if (valmode == VOIDmode)
22277 valmode = QImode;
22278 if (valmode != QImode)
22279 val = gen_lowpart (QImode, val);
22280 if (mode == QImode)
22281 return val;
22282 if (!TARGET_PARTIAL_REG_STALL)
22283 nops--;
22284 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22285 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22286 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22287 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22288 {
22289 rtx reg = convert_modes (mode, QImode, val, true);
22290 tmp = promote_duplicated_reg (mode, const1_rtx);
22291 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22292 OPTAB_DIRECT);
22293 }
22294 else
22295 {
22296 rtx reg = convert_modes (mode, QImode, val, true);
22297
22298 if (!TARGET_PARTIAL_REG_STALL)
22299 if (mode == SImode)
22300 emit_insn (gen_movsi_insv_1 (reg, reg));
22301 else
22302 emit_insn (gen_movdi_insv_1 (reg, reg));
22303 else
22304 {
22305 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22306 NULL, 1, OPTAB_DIRECT);
22307 reg =
22308 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22309 }
22310 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22311 NULL, 1, OPTAB_DIRECT);
22312 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22313 if (mode == SImode)
22314 return reg;
22315 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22316 NULL, 1, OPTAB_DIRECT);
22317 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22318 return reg;
22319 }
22320 }
22321
22322 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
22323 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
22324 getting alignment from ALIGN to DESIRED_ALIGN. */
22325 static rtx
22326 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22327 {
22328 rtx promoted_val;
22329
22330 if (TARGET_64BIT
22331 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22332 promoted_val = promote_duplicated_reg (DImode, val);
22333 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22334 promoted_val = promote_duplicated_reg (SImode, val);
22335 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22336 promoted_val = promote_duplicated_reg (HImode, val);
22337 else
22338 promoted_val = val;
22339
22340 return promoted_val;
22341 }
22342
22343 /* Expand string set operation (memset). Use i386 string operations when
22344 profitable. See the ix86_expand_movmem comment for an explanation of the
22345 individual steps performed. */
22346 bool
22347 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22348 rtx expected_align_exp, rtx expected_size_exp)
22349 {
22350 rtx destreg;
22351 rtx label = NULL;
22352 rtx tmp;
22353 rtx jump_around_label = NULL;
22354 HOST_WIDE_INT align = 1;
22355 unsigned HOST_WIDE_INT count = 0;
22356 HOST_WIDE_INT expected_size = -1;
22357 int size_needed = 0, epilogue_size_needed;
22358 int desired_align = 0, align_bytes = 0;
22359 enum stringop_alg alg;
22360 rtx promoted_val = NULL;
22361 bool force_loopy_epilogue = false;
22362 int dynamic_check;
22363 bool need_zero_guard = false;
22364
22365 if (CONST_INT_P (align_exp))
22366 align = INTVAL (align_exp);
22367 /* i386 can do misaligned access at a reasonably increased cost. */
22368 if (CONST_INT_P (expected_align_exp)
22369 && INTVAL (expected_align_exp) > align)
22370 align = INTVAL (expected_align_exp);
22371 if (CONST_INT_P (count_exp))
22372 count = expected_size = INTVAL (count_exp);
22373 if (CONST_INT_P (expected_size_exp) && count == 0)
22374 expected_size = INTVAL (expected_size_exp);
22375
22376 /* Make sure we don't need to care about overflow later on. */
22377 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22378 return false;
22379
22380 /* Step 0: Decide on preferred algorithm, desired alignment and
22381 size of chunks to be copied by main loop. */
22382
22383 alg = decide_alg (count, expected_size, true, &dynamic_check);
22384 desired_align = decide_alignment (align, alg, expected_size);
22385
22386 if (!TARGET_ALIGN_STRINGOPS)
22387 align = desired_align;
22388
22389 if (alg == libcall)
22390 return false;
22391 gcc_assert (alg != no_stringop);
22392 if (!count)
22393 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22394 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
22395 switch (alg)
22396 {
22397 case libcall:
22398 case no_stringop:
22399 gcc_unreachable ();
22400 case loop:
22401 need_zero_guard = true;
22402 size_needed = GET_MODE_SIZE (Pmode);
22403 break;
22404 case unrolled_loop:
22405 need_zero_guard = true;
22406 size_needed = GET_MODE_SIZE (Pmode) * 4;
22407 break;
22408 case rep_prefix_8_byte:
22409 size_needed = 8;
22410 break;
22411 case rep_prefix_4_byte:
22412 size_needed = 4;
22413 break;
22414 case rep_prefix_1_byte:
22415 size_needed = 1;
22416 break;
22417 case loop_1_byte:
22418 need_zero_guard = true;
22419 size_needed = 1;
22420 break;
22421 }
22422 epilogue_size_needed = size_needed;
22423
22424 /* Step 1: Prologue guard. */
22425
22426 /* Alignment code needs count to be in register. */
22427 if (CONST_INT_P (count_exp) && desired_align > align)
22428 {
22429 if (INTVAL (count_exp) > desired_align
22430 && INTVAL (count_exp) > size_needed)
22431 {
22432 align_bytes
22433 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22434 if (align_bytes <= 0)
22435 align_bytes = 0;
22436 else
22437 align_bytes = desired_align - align_bytes;
22438 }
22439 if (align_bytes == 0)
22440 {
22441 enum machine_mode mode = SImode;
22442 if (TARGET_64BIT && (count & ~0xffffffff))
22443 mode = DImode;
22444 count_exp = force_reg (mode, count_exp);
22445 }
22446 }
22447 /* Do the cheap promotion to allow better CSE across the
22448 main loop and epilogue (i.e. one load of the big constant in
22449 front of all code). */
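/* E.g. for memset (p, 0x2a, n) the promoted value is 0x2a2a2a2a (SImode)
   or 0x2a2a2a2a2a2a2a2a (DImode), loaded once here so that both the main
   loop and the epilogue can reuse it.  */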
22450 if (CONST_INT_P (val_exp))
22451 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22452 desired_align, align);
22453 /* Ensure that alignment prologue won't copy past end of block. */
22454 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22455 {
22456 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22457 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22458 Make sure it is power of 2. */
22459 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22460
22461 /* To improve performance of small blocks, we jump around the VAL
22462 promoting code. This means that if the promoted VAL is not constant,
22463 we might not use it in the epilogue and have to use the byte
22464 loop variant. */
22465 if (epilogue_size_needed > 2 && !promoted_val)
22466 force_loopy_epilogue = true;
22467 if (count)
22468 {
22469 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22470 {
22471 /* If main algorithm works on QImode, no epilogue is needed.
22472 For small sizes just don't align anything. */
22473 if (size_needed == 1)
22474 desired_align = align;
22475 else
22476 goto epilogue;
22477 }
22478 }
22479 else
22480 {
22481 label = gen_label_rtx ();
22482 emit_cmp_and_jump_insns (count_exp,
22483 GEN_INT (epilogue_size_needed),
22484 LTU, 0, counter_mode (count_exp), 1, label);
22485 if (expected_size == -1 || expected_size <= epilogue_size_needed)
22486 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22487 else
22488 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22489 }
22490 }
22491 if (dynamic_check != -1)
22492 {
22493 rtx hot_label = gen_label_rtx ();
22494 jump_around_label = gen_label_rtx ();
22495 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22496 LEU, 0, counter_mode (count_exp), 1, hot_label);
22497 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22498 set_storage_via_libcall (dst, count_exp, val_exp, false);
22499 emit_jump (jump_around_label);
22500 emit_label (hot_label);
22501 }
22502
22503 /* Step 2: Alignment prologue. */
22504
22505 /* Do the expensive promotion once we branched off the small blocks. */
22506 if (!promoted_val)
22507 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
22508 desired_align, align);
22509 gcc_assert (desired_align >= 1 && align >= 1);
22510
22511 if (desired_align > align)
22512 {
22513 if (align_bytes == 0)
22514 {
22515 /* Except for the first move in the epilogue, we no longer know
22516 the constant offset in the aliasing info. It does not seem worth
22517 the pain to maintain it for the first move, so throw away
22518 the info early. */
22519 dst = change_address (dst, BLKmode, destreg);
22520 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
22521 desired_align);
22522 }
22523 else
22524 {
22525 /* If we know how many bytes need to be stored before dst is
22526 sufficiently aligned, maintain aliasing info accurately. */
22527 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
22528 desired_align, align_bytes);
22529 count_exp = plus_constant (count_exp, -align_bytes);
22530 count -= align_bytes;
22531 }
22532 if (need_zero_guard
22533 && (count < (unsigned HOST_WIDE_INT) size_needed
22534 || (align_bytes == 0
22535 && count < ((unsigned HOST_WIDE_INT) size_needed
22536 + desired_align - align))))
22537 {
22538 /* It is possible that we copied enough so the main loop will not
22539 execute. */
22540 gcc_assert (size_needed > 1);
22541 if (label == NULL_RTX)
22542 label = gen_label_rtx ();
22543 emit_cmp_and_jump_insns (count_exp,
22544 GEN_INT (size_needed),
22545 LTU, 0, counter_mode (count_exp), 1, label);
22546 if (expected_size == -1
22547 || expected_size < (desired_align - align) / 2 + size_needed)
22548 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22549 else
22550 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22551 }
22552 }
22553 if (label && size_needed == 1)
22554 {
22555 emit_label (label);
22556 LABEL_NUSES (label) = 1;
22557 label = NULL;
22558 promoted_val = val_exp;
22559 epilogue_size_needed = 1;
22560 }
22561 else if (label == NULL_RTX)
22562 epilogue_size_needed = size_needed;
22563
22564 /* Step 3: Main loop. */
22565
22566 switch (alg)
22567 {
22568 case libcall:
22569 case no_stringop:
22570 gcc_unreachable ();
22571 case loop_1_byte:
22572 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22573 count_exp, QImode, 1, expected_size);
22574 break;
22575 case loop:
22576 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22577 count_exp, Pmode, 1, expected_size);
22578 break;
22579 case unrolled_loop:
22580 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
22581 count_exp, Pmode, 4, expected_size);
22582 break;
22583 case rep_prefix_8_byte:
22584 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22585 DImode, val_exp);
22586 break;
22587 case rep_prefix_4_byte:
22588 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22589 SImode, val_exp);
22590 break;
22591 case rep_prefix_1_byte:
22592 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
22593 QImode, val_exp);
22594 break;
22595 }
22596 /* Properly adjust the offset of the dest memory for aliasing. */
22597 if (CONST_INT_P (count_exp))
22598 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22599 (count / size_needed) * size_needed);
22600 else
22601 dst = change_address (dst, BLKmode, destreg);
22602
22603 /* Step 4: Epilogue to copy the remaining bytes. */
22604
22605 if (label)
22606 {
22607 /* When the main loop is done, COUNT_EXP might hold the original count,
22608 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22609 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22610 bytes. Compensate if needed. */
22611
22612 if (size_needed < epilogue_size_needed)
22613 {
22614 tmp =
22615 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22616 GEN_INT (size_needed - 1), count_exp, 1,
22617 OPTAB_DIRECT);
22618 if (tmp != count_exp)
22619 emit_move_insn (count_exp, tmp);
22620 }
22621 emit_label (label);
22622 LABEL_NUSES (label) = 1;
22623 }
22624 epilogue:
22625 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22626 {
22627 if (force_loopy_epilogue)
22628 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
22629 epilogue_size_needed);
22630 else
22631 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
22632 epilogue_size_needed);
22633 }
22634 if (jump_around_label)
22635 emit_label (jump_around_label);
22636 return true;
22637 }
22638
22639 /* Expand the appropriate insns for doing strlen if not just doing
22640 repnz; scasb
22641
22642 out = result, initialized with the start address
22643 align_rtx = alignment of the address.
22644 scratch = scratch register, initialized with the start address when
22645 not aligned, otherwise undefined
22646
22647 This is just the body. It needs the initializations mentioned above and
22648 some address computing at the end. These things are done in i386.md. */
22649
22650 static void
22651 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
22652 {
22653 int align;
22654 rtx tmp;
22655 rtx align_2_label = NULL_RTX;
22656 rtx align_3_label = NULL_RTX;
22657 rtx align_4_label = gen_label_rtx ();
22658 rtx end_0_label = gen_label_rtx ();
22659 rtx mem;
22660 rtx tmpreg = gen_reg_rtx (SImode);
22661 rtx scratch = gen_reg_rtx (SImode);
22662 rtx cmp;
22663
22664 align = 0;
22665 if (CONST_INT_P (align_rtx))
22666 align = INTVAL (align_rtx);
22667
22668 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
22669
22670 /* Is there a known alignment and is it less than 4? */
22671 if (align < 4)
22672 {
22673 rtx scratch1 = gen_reg_rtx (Pmode);
22674 emit_move_insn (scratch1, out);
22675 /* Is there a known alignment and is it not 2? */
22676 if (align != 2)
22677 {
22678 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
22679 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
22680
22681 /* Leave just the 3 lower bits. */
22682 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
22683 NULL_RTX, 0, OPTAB_WIDEN);
22684
22685 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22686 Pmode, 1, align_4_label);
22687 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
22688 Pmode, 1, align_2_label);
22689 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
22690 Pmode, 1, align_3_label);
22691 }
22692 else
22693 {
22694 /* Since the alignment is 2, we have to check 2 or 0 bytes;
22695 check if it is aligned to 4 bytes. */
22696
22697 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
22698 NULL_RTX, 0, OPTAB_WIDEN);
22699
22700 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
22701 Pmode, 1, align_4_label);
22702 }
22703
22704 mem = change_address (src, QImode, out);
22705
22706 /* Now compare the bytes. */
22707
22708 /* Compare the first n unaligned bytes on a byte-per-byte basis. */
22709 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
22710 QImode, 1, end_0_label);
22711
22712 /* Increment the address. */
22713 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22714
22715 /* Not needed with an alignment of 2 */
22716 if (align != 2)
22717 {
22718 emit_label (align_2_label);
22719
22720 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22721 end_0_label);
22722
22723 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22724
22725 emit_label (align_3_label);
22726 }
22727
22728 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
22729 end_0_label);
22730
22731 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
22732 }
22733
22734 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
22735 align this loop; it only makes the program larger and does not help to
22736 speed it up. */
22737 emit_label (align_4_label);
22738
22739 mem = change_address (src, SImode, out);
22740 emit_move_insn (scratch, mem);
22741 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
22742
22743 /* This formula yields a nonzero result iff one of the bytes is zero.
22744 This saves three branches inside the loop and many cycles. */
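/* The computation below is (x - 0x01010101) & ~x & 0x80808080.  As an
   illustration, x = 0x12005678 gives x - 0x01010101 = 0x10FF5577 and
   ~x = 0xEDFFA987; the combined AND leaves 0x00800000, flagging the zero
   byte, while for x = 0x12345678 the result is 0 and the loop continues.  */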
22745
22746 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
22747 emit_insn (gen_one_cmplsi2 (scratch, scratch));
22748 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
22749 emit_insn (gen_andsi3 (tmpreg, tmpreg,
22750 gen_int_mode (0x80808080, SImode)));
22751 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
22752 align_4_label);
22753
22754 if (TARGET_CMOVE)
22755 {
22756 rtx reg = gen_reg_rtx (SImode);
22757 rtx reg2 = gen_reg_rtx (Pmode);
22758 emit_move_insn (reg, tmpreg);
22759 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
22760
22761 /* If zero is not in the first two bytes, move two bytes forward. */
22762 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22763 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22764 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22765 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
22766 gen_rtx_IF_THEN_ELSE (SImode, tmp,
22767 reg,
22768 tmpreg)));
22769 /* Emit lea manually to avoid clobbering of flags. */
22770 emit_insn (gen_rtx_SET (SImode, reg2,
22771 gen_rtx_PLUS (Pmode, out, const2_rtx)));
22772
22773 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22774 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
22775 emit_insn (gen_rtx_SET (VOIDmode, out,
22776 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
22777 reg2,
22778 out)));
22779 }
22780 else
22781 {
22782 rtx end_2_label = gen_label_rtx ();
22783 /* Is zero in the first two bytes? */
22784
22785 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
22786 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
22787 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
22788 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22789 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
22790 pc_rtx);
22791 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
22792 JUMP_LABEL (tmp) = end_2_label;
22793
22794 /* Not in the first two. Move two bytes forward. */
22795 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
22796 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
22797
22798 emit_label (end_2_label);
22799
22800 }
22801
22802 /* Avoid branch in fixing the byte. */
22803 tmpreg = gen_lowpart (QImode, tmpreg);
22804 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
22805 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
22806 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
22807 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
22808
22809 emit_label (end_0_label);
22810 }
22811
22812 /* Expand strlen. */
22813
22814 bool
22815 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
22816 {
22817 rtx addr, scratch1, scratch2, scratch3, scratch4;
22818
22819 /* The generic case of the strlen expander is long. Avoid expanding
22820 it unless TARGET_INLINE_ALL_STRINGOPS. */
22821
22822 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22823 && !TARGET_INLINE_ALL_STRINGOPS
22824 && !optimize_insn_for_size_p ()
22825 && (!CONST_INT_P (align) || INTVAL (align) < 4))
22826 return false;
22827
22828 addr = force_reg (Pmode, XEXP (src, 0));
22829 scratch1 = gen_reg_rtx (Pmode);
22830
22831 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
22832 && !optimize_insn_for_size_p ())
22833 {
22834 /* Well, it seems that some optimizer does not combine a call like
22835 foo(strlen(bar), strlen(bar));
22836 when the move and the subtraction are done here. It does calculate
22837 the length just once when these instructions are done inside of
22838 output_strlen_unroll(). But I think that since &bar[strlen(bar)] is
22839 often used and I use one fewer register for the lifetime of
22840 output_strlen_unroll(), this is better. */
22841
22842 emit_move_insn (out, addr);
22843
22844 ix86_expand_strlensi_unroll_1 (out, src, align);
22845
22846 /* strlensi_unroll_1 returns the address of the zero at the end of
22847 the string, like memchr(), so compute the length by subtracting
22848 the start address. */
22849 emit_insn (ix86_gen_sub3 (out, out, addr));
22850 }
22851 else
22852 {
22853 rtx unspec;
22854
22855 /* Can't use this if the user has appropriated eax, ecx, or edi. */
22856 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
22857 return false;
22858
22859 scratch2 = gen_reg_rtx (Pmode);
22860 scratch3 = gen_reg_rtx (Pmode);
22861 scratch4 = force_reg (Pmode, constm1_rtx);
22862
22863 emit_move_insn (scratch3, addr);
22864 eoschar = force_reg (QImode, eoschar);
22865
22866 src = replace_equiv_address_nv (src, scratch3);
22867
22868 /* If .md starts supporting :P, this can be done in .md. */
22869 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
22870 scratch4), UNSPEC_SCAS);
22871 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
22872 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
22873 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
22874 }
22875 return true;
22876 }
22877
22878 /* For a given symbol (function), construct code to compute the address of its
22879 PLT entry in the large x86-64 PIC model. */
22880 rtx
22881 construct_plt_address (rtx symbol)
22882 {
22883 rtx tmp = gen_reg_rtx (Pmode);
22884 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
22885
22886 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
22887 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
22888
22889 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
22890 emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
22891 return tmp;
22892 }
22893
22894 rtx
22895 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
22896 rtx callarg2,
22897 rtx pop, bool sibcall)
22898 {
22899 /* We need to represent that SI and DI registers are clobbered
22900 by SYSV calls. */
22901 static int clobbered_registers[] = {
22902 XMM6_REG, XMM7_REG, XMM8_REG,
22903 XMM9_REG, XMM10_REG, XMM11_REG,
22904 XMM12_REG, XMM13_REG, XMM14_REG,
22905 XMM15_REG, SI_REG, DI_REG
22906 };
22907 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
22908 rtx use = NULL, call;
22909 unsigned int vec_len;
22910
22911 if (pop == const0_rtx)
22912 pop = NULL;
22913 gcc_assert (!TARGET_64BIT || !pop);
22914
22915 if (TARGET_MACHO && !TARGET_64BIT)
22916 {
22917 #if TARGET_MACHO
22918 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
22919 fnaddr = machopic_indirect_call_target (fnaddr);
22920 #endif
22921 }
22922 else
22923 {
22924 /* Static functions and indirect calls don't need the pic register. */
22925 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
22926 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22927 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
22928 use_reg (&use, pic_offset_table_rtx);
22929 }
22930
22931 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
22932 {
22933 rtx al = gen_rtx_REG (QImode, AX_REG);
22934 emit_move_insn (al, callarg2);
22935 use_reg (&use, al);
22936 }
22937
22938 if (ix86_cmodel == CM_LARGE_PIC
22939 && MEM_P (fnaddr)
22940 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
22941 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
22942 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
22943 else if (sibcall
22944 ? !sibcall_insn_operand (XEXP (fnaddr, 0), Pmode)
22945 : !call_insn_operand (XEXP (fnaddr, 0), Pmode))
22946 {
22947 fnaddr = XEXP (fnaddr, 0);
22948 if (GET_MODE (fnaddr) != Pmode)
22949 fnaddr = convert_to_mode (Pmode, fnaddr, 1);
22950 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (Pmode, fnaddr));
22951 }
22952
22953 vec_len = 0;
22954 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
22955 if (retval)
22956 call = gen_rtx_SET (VOIDmode, retval, call);
22957 vec[vec_len++] = call;
22958
22959 if (pop)
22960 {
22961 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
22962 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
22963 vec[vec_len++] = pop;
22964 }
22965
22966 if (TARGET_64BIT_MS_ABI
22967 && (!callarg2 || INTVAL (callarg2) != -2))
22968 {
22969 unsigned i;
22970
22971 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
22972 UNSPEC_MS_TO_SYSV_CALL);
22973
22974 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
22975 vec[vec_len++]
22976 = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
22977 ? TImode : DImode,
22978 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
22979 ? TImode : DImode,
22980 clobbered_registers[i]));
22981 }
22982
22983 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
22984 if (TARGET_VZEROUPPER)
22985 {
22986 int avx256;
22987 if (cfun->machine->callee_pass_avx256_p)
22988 {
22989 if (cfun->machine->callee_return_avx256_p)
22990 avx256 = callee_return_pass_avx256;
22991 else
22992 avx256 = callee_pass_avx256;
22993 }
22994 else if (cfun->machine->callee_return_avx256_p)
22995 avx256 = callee_return_avx256;
22996 else
22997 avx256 = call_no_avx256;
22998
22999 if (reload_completed)
23000 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
23001 else
23002 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode,
23003 gen_rtvec (1, GEN_INT (avx256)),
23004 UNSPEC_CALL_NEEDS_VZEROUPPER);
23005 }
23006
23007 if (vec_len > 1)
23008 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23009 call = emit_call_insn (call);
23010 if (use)
23011 CALL_INSN_FUNCTION_USAGE (call) = use;
23012
23013 return call;
23014 }
23015
23016 void
23017 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
23018 {
23019 rtx pat = PATTERN (insn);
23020 rtvec vec = XVEC (pat, 0);
23021 int len = GET_NUM_ELEM (vec) - 1;
23022
23023 /* Strip off the last entry of the parallel. */
23024 gcc_assert (GET_CODE (RTVEC_ELT (vec, len)) == UNSPEC);
23025 gcc_assert (XINT (RTVEC_ELT (vec, len), 1) == UNSPEC_CALL_NEEDS_VZEROUPPER);
23026 if (len == 1)
23027 pat = RTVEC_ELT (vec, 0);
23028 else
23029 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (len, &RTVEC_ELT (vec, 0)));
23030
23031 emit_insn (gen_avx_vzeroupper (vzeroupper));
23032 emit_call_insn (pat);
23033 }
23034
23035 /* Output the assembly for a call instruction. */
23036
23037 const char *
23038 ix86_output_call_insn (rtx insn, rtx call_op)
23039 {
23040 bool direct_p = constant_call_address_operand (call_op, Pmode);
23041 bool seh_nop_p = false;
23042 const char *xasm;
23043
23044 if (SIBLING_CALL_P (insn))
23045 {
23046 if (direct_p)
23047 xasm = "jmp\t%P0";
23048 /* SEH epilogue detection requires the indirect branch case
23049 to include REX.W. */
23050 else if (TARGET_SEH)
23051 xasm = "rex.W jmp %A0";
23052 else
23053 xasm = "jmp\t%A0";
23054
23055 output_asm_insn (xasm, &call_op);
23056 return "";
23057 }
23058
23059 /* SEH unwinding can require an extra nop to be emitted in several
23060 circumstances. Determine if we have one of those. */
23061 if (TARGET_SEH)
23062 {
23063 rtx i;
23064
23065 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23066 {
23067 /* If we get to another real insn, we don't need the nop. */
23068 if (INSN_P (i))
23069 break;
23070
23071 /* If we get to the epilogue note, prevent a catch region from
23072 being adjacent to the standard epilogue sequence. With non-call
23073 exceptions, we'll have done this during epilogue emission. */
23074 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23075 && !flag_non_call_exceptions
23076 && !can_throw_internal (insn))
23077 {
23078 seh_nop_p = true;
23079 break;
23080 }
23081 }
23082
23083 /* If we didn't find a real insn following the call, prevent the
23084 unwinder from looking into the next function. */
23085 if (i == NULL)
23086 seh_nop_p = true;
23087 }
23088
23089 if (direct_p)
23090 xasm = "call\t%P0";
23091 else
23092 xasm = "call\t%A0";
23093
23094 output_asm_insn (xasm, &call_op);
23095
23096 if (seh_nop_p)
23097 return "nop";
23098
23099 return "";
23100 }
23101 \f
23102 /* Clear stack slot assignments remembered from previous functions.
23103 This is called from INIT_EXPANDERS once before RTL is emitted for each
23104 function. */
23105
23106 static struct machine_function *
23107 ix86_init_machine_status (void)
23108 {
23109 struct machine_function *f;
23110
23111 f = ggc_alloc_cleared_machine_function ();
23112 f->use_fast_prologue_epilogue_nregs = -1;
23113 f->tls_descriptor_call_expanded_p = 0;
23114 f->call_abi = ix86_abi;
23115
23116 return f;
23117 }
23118
23119 /* Return a MEM corresponding to a stack slot with mode MODE.
23120 Allocate a new slot if necessary.
23121
23122 The RTL for a function can have several slots available: N is
23123 which slot to use. */
23124
23125 rtx
23126 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23127 {
23128 struct stack_local_entry *s;
23129
23130 gcc_assert (n < MAX_386_STACK_LOCALS);
23131
23132 /* Virtual slot is valid only before vregs are instantiated. */
23133 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
23134
23135 for (s = ix86_stack_locals; s; s = s->next)
23136 if (s->mode == mode && s->n == n)
23137 return validize_mem (copy_rtx (s->rtl));
23138
23139 s = ggc_alloc_stack_local_entry ();
23140 s->n = n;
23141 s->mode = mode;
23142 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23143
23144 s->next = ix86_stack_locals;
23145 ix86_stack_locals = s;
23146 return validize_mem (s->rtl);
23147 }
23148 \f
23149 /* Calculate the length of the memory address in the instruction encoding.
23150 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23151 or other prefixes. */
23152
23153 int
23154 memory_address_length (rtx addr)
23155 {
23156 struct ix86_address parts;
23157 rtx base, index, disp;
23158 int len;
23159 int ok;
23160
23161 if (GET_CODE (addr) == PRE_DEC
23162 || GET_CODE (addr) == POST_INC
23163 || GET_CODE (addr) == PRE_MODIFY
23164 || GET_CODE (addr) == POST_MODIFY)
23165 return 0;
23166
23167 ok = ix86_decompose_address (addr, &parts);
23168 gcc_assert (ok);
23169
23170 if (parts.base && GET_CODE (parts.base) == SUBREG)
23171 parts.base = SUBREG_REG (parts.base);
23172 if (parts.index && GET_CODE (parts.index) == SUBREG)
23173 parts.index = SUBREG_REG (parts.index);
23174
23175 base = parts.base;
23176 index = parts.index;
23177 disp = parts.disp;
23178
23179 /* Add length of addr32 prefix. */
23180 len = (GET_CODE (addr) == ZERO_EXTEND
23181 || GET_CODE (addr) == AND);
23182
23183 /* Rule of thumb:
23184 - esp as the base always wants an index,
23185 - ebp as the base always wants a displacement,
23186 - r12 as the base always wants an index,
23187 - r13 as the base always wants a displacement. */
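/* A few worked cases (modrm, opcode and other prefixes excluded, as
   above): (%eax) adds nothing, so len is 0; (%esp) needs a SIB byte and
   (%ebp) a disp8, so both give 1; 16(%eax,%ebx,2) needs disp8 plus SIB,
   giving 2; a bare 32-bit displacement gives 4.  */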
23188
23189 /* Register Indirect. */
23190 if (base && !index && !disp)
23191 {
23192 /* esp (for its index) and ebp (for its displacement) need
23193 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23194 code. */
23195 if (REG_P (addr)
23196 && (addr == arg_pointer_rtx
23197 || addr == frame_pointer_rtx
23198 || REGNO (addr) == SP_REG
23199 || REGNO (addr) == BP_REG
23200 || REGNO (addr) == R12_REG
23201 || REGNO (addr) == R13_REG))
23202 len = 1;
23203 }
23204
23205 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23206 is not disp32, but disp32(%rip), so for disp32 a
23207 SIB byte is needed, unless print_operand_address
23208 optimizes it into disp32(%rip) or (%rip) is implied
23209 by UNSPEC. */
23210 else if (disp && !base && !index)
23211 {
23212 len = 4;
23213 if (TARGET_64BIT)
23214 {
23215 rtx symbol = disp;
23216
23217 if (GET_CODE (disp) == CONST)
23218 symbol = XEXP (disp, 0);
23219 if (GET_CODE (symbol) == PLUS
23220 && CONST_INT_P (XEXP (symbol, 1)))
23221 symbol = XEXP (symbol, 0);
23222
23223 if (GET_CODE (symbol) != LABEL_REF
23224 && (GET_CODE (symbol) != SYMBOL_REF
23225 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23226 && (GET_CODE (symbol) != UNSPEC
23227 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23228 && XINT (symbol, 1) != UNSPEC_PCREL
23229 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23230 len += 1;
23231 }
23232 }
23233
23234 else
23235 {
23236 /* Find the length of the displacement constant. */
23237 if (disp)
23238 {
23239 if (base && satisfies_constraint_K (disp))
23240 len = 1;
23241 else
23242 len = 4;
23243 }
23244 /* ebp always wants a displacement. Similarly r13. */
23245 else if (base && REG_P (base)
23246 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23247 len = 1;
23248
23249 /* An index requires the two-byte modrm form.... */
23250 if (index
23251 /* ...like esp (or r12), which always wants an index. */
23252 || base == arg_pointer_rtx
23253 || base == frame_pointer_rtx
23254 || (base && REG_P (base)
23255 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23256 len += 1;
23257 }
23258
23259 switch (parts.seg)
23260 {
23261 case SEG_FS:
23262 case SEG_GS:
23263 len += 1;
23264 break;
23265 default:
23266 break;
23267 }
23268
23269 return len;
23270 }
23271
23272 /* Compute the default value for the "length_immediate" attribute. When
23273 SHORTFORM is set, expect that the insn has an 8-bit immediate alternative. */
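/* For instance, an add with a short-form alternative and the immediate 3
   is counted as 1 byte because 3 fits in -128..127, while the immediate
   1000 falls through to the MODE_SI case and is counted as 4 bytes.  */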
23274 int
23275 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23276 {
23277 int len = 0;
23278 int i;
23279 extract_insn_cached (insn);
23280 for (i = recog_data.n_operands - 1; i >= 0; --i)
23281 if (CONSTANT_P (recog_data.operand[i]))
23282 {
23283 enum attr_mode mode = get_attr_mode (insn);
23284
23285 gcc_assert (!len);
23286 if (shortform && CONST_INT_P (recog_data.operand[i]))
23287 {
23288 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23289 switch (mode)
23290 {
23291 case MODE_QI:
23292 len = 1;
23293 continue;
23294 case MODE_HI:
23295 ival = trunc_int_for_mode (ival, HImode);
23296 break;
23297 case MODE_SI:
23298 ival = trunc_int_for_mode (ival, SImode);
23299 break;
23300 default:
23301 break;
23302 }
23303 if (IN_RANGE (ival, -128, 127))
23304 {
23305 len = 1;
23306 continue;
23307 }
23308 }
23309 switch (mode)
23310 {
23311 case MODE_QI:
23312 len = 1;
23313 break;
23314 case MODE_HI:
23315 len = 2;
23316 break;
23317 case MODE_SI:
23318 len = 4;
23319 break;
23320 /* Immediates for DImode instructions are encoded as 32bit sign extended values. */
23321 case MODE_DI:
23322 len = 4;
23323 break;
23324 default:
23325 fatal_insn ("unknown insn mode", insn);
23326 }
23327 }
23328 return len;
23329 }
23330 /* Compute default value for "length_address" attribute. */
23331 int
23332 ix86_attr_length_address_default (rtx insn)
23333 {
23334 int i;
23335
23336 if (get_attr_type (insn) == TYPE_LEA)
23337 {
23338 rtx set = PATTERN (insn), addr;
23339
23340 if (GET_CODE (set) == PARALLEL)
23341 set = XVECEXP (set, 0, 0);
23342
23343 gcc_assert (GET_CODE (set) == SET);
23344
23345 addr = SET_SRC (set);
23346 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
23347 {
23348 if (GET_CODE (addr) == ZERO_EXTEND)
23349 addr = XEXP (addr, 0);
23350 if (GET_CODE (addr) == SUBREG)
23351 addr = SUBREG_REG (addr);
23352 }
23353
23354 return memory_address_length (addr);
23355 }
23356
23357 extract_insn_cached (insn);
23358 for (i = recog_data.n_operands - 1; i >= 0; --i)
23359 if (MEM_P (recog_data.operand[i]))
23360 {
23361 constrain_operands_cached (reload_completed);
23362 if (which_alternative != -1)
23363 {
23364 const char *constraints = recog_data.constraints[i];
23365 int alt = which_alternative;
23366
23367 while (*constraints == '=' || *constraints == '+')
23368 constraints++;
23369 while (alt-- > 0)
23370 while (*constraints++ != ',')
23371 ;
23372 /* Skip ignored operands. */
23373 if (*constraints == 'X')
23374 continue;
23375 }
23376 return memory_address_length (XEXP (recog_data.operand[i], 0));
23377 }
23378 return 0;
23379 }
23380
23381 /* Compute default value for "length_vex" attribute. It includes
23382 2 or 3 byte VEX prefix and 1 opcode byte. */
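/* E.g. an insn in the 0f opcode map without VEX.W, using only non-extended
   registers and no extended register in a memory address, is counted as
   2 + 1 (2 byte VEX prefix plus opcode byte); VEX.W, a non-0f opcode map,
   a DImode general register operand or an extended register mentioned in a
   memory operand all force the 3 + 1 form.  */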
23383
23384 int
23385 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23386 {
23387 int i;
23388
23389 /* Only the 0f opcode map can use the 2 byte VEX prefix, and the VEX W
23390 bit requires the 3 byte VEX prefix. */
23391 if (!has_0f_opcode || has_vex_w)
23392 return 3 + 1;
23393
23394 /* We can always use the 2 byte VEX prefix in 32-bit mode. */
23395 if (!TARGET_64BIT)
23396 return 2 + 1;
23397
23398 extract_insn_cached (insn);
23399
23400 for (i = recog_data.n_operands - 1; i >= 0; --i)
23401 if (REG_P (recog_data.operand[i]))
23402 {
23403 /* REX.W bit uses 3 byte VEX prefix. */
23404 if (GET_MODE (recog_data.operand[i]) == DImode
23405 && GENERAL_REG_P (recog_data.operand[i]))
23406 return 3 + 1;
23407 }
23408 else
23409 {
23410 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23411 if (MEM_P (recog_data.operand[i])
23412 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23413 return 3 + 1;
23414 }
23415
23416 return 2 + 1;
23417 }
23418 \f
23419 /* Return the maximum number of instructions a cpu can issue. */
23420
23421 static int
23422 ix86_issue_rate (void)
23423 {
23424 switch (ix86_tune)
23425 {
23426 case PROCESSOR_PENTIUM:
23427 case PROCESSOR_ATOM:
23428 case PROCESSOR_K6:
23429 return 2;
23430
23431 case PROCESSOR_PENTIUMPRO:
23432 case PROCESSOR_PENTIUM4:
23433 case PROCESSOR_CORE2_32:
23434 case PROCESSOR_CORE2_64:
23435 case PROCESSOR_COREI7_32:
23436 case PROCESSOR_COREI7_64:
23437 case PROCESSOR_ATHLON:
23438 case PROCESSOR_K8:
23439 case PROCESSOR_AMDFAM10:
23440 case PROCESSOR_NOCONA:
23441 case PROCESSOR_GENERIC32:
23442 case PROCESSOR_GENERIC64:
23443 case PROCESSOR_BDVER1:
23444 case PROCESSOR_BDVER2:
23445 case PROCESSOR_BTVER1:
23446 return 3;
23447
23448 default:
23449 return 1;
23450 }
23451 }
23452
23453 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
23454 by DEP_INSN and nothing else set by DEP_INSN. */
23455
23456 static bool
23457 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23458 {
23459 rtx set, set2;
23460
23461 /* Simplify the test for uninteresting insns. */
23462 if (insn_type != TYPE_SETCC
23463 && insn_type != TYPE_ICMOV
23464 && insn_type != TYPE_FCMOV
23465 && insn_type != TYPE_IBR)
23466 return false;
23467
23468 if ((set = single_set (dep_insn)) != 0)
23469 {
23470 set = SET_DEST (set);
23471 set2 = NULL_RTX;
23472 }
23473 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23474 && XVECLEN (PATTERN (dep_insn), 0) == 2
23475 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23476 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23477 {
23478 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23479 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23480 }
23481 else
23482 return false;
23483
23484 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23485 return false;
23486
23487 /* This test is true if the dependent insn reads the flags but
23488 not any other potentially set register. */
23489 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23490 return false;
23491
23492 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23493 return false;
23494
23495 return true;
23496 }
23497
23498 /* Return true iff USE_INSN has a memory address with operands set by
23499 SET_INSN. */
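/* Example: if SET_INSN is "addl $4, %eax" and USE_INSN is "movl (%eax), %edx",
   the load address depends on the freshly computed %eax, so this returns
   true; ix86_adjust_cost then charges the extra AGI cycle on Pentium.  */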
23500
23501 bool
23502 ix86_agi_dependent (rtx set_insn, rtx use_insn)
23503 {
23504 int i;
23505 extract_insn_cached (use_insn);
23506 for (i = recog_data.n_operands - 1; i >= 0; --i)
23507 if (MEM_P (recog_data.operand[i]))
23508 {
23509 rtx addr = XEXP (recog_data.operand[i], 0);
23510 return modified_in_p (addr, set_insn) != 0;
23511 }
23512 return false;
23513 }
23514
23515 static int
23516 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
23517 {
23518 enum attr_type insn_type, dep_insn_type;
23519 enum attr_memory memory;
23520 rtx set, set2;
23521 int dep_insn_code_number;
23522
23523 /* Anti and output dependencies have zero cost on all CPUs. */
23524 if (REG_NOTE_KIND (link) != 0)
23525 return 0;
23526
23527 dep_insn_code_number = recog_memoized (dep_insn);
23528
23529 /* If we can't recognize the insns, we can't really do anything. */
23530 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
23531 return cost;
23532
23533 insn_type = get_attr_type (insn);
23534 dep_insn_type = get_attr_type (dep_insn);
23535
23536 switch (ix86_tune)
23537 {
23538 case PROCESSOR_PENTIUM:
23539 /* Address Generation Interlock adds a cycle of latency. */
23540 if (insn_type == TYPE_LEA)
23541 {
23542 rtx addr = PATTERN (insn);
23543
23544 if (GET_CODE (addr) == PARALLEL)
23545 addr = XVECEXP (addr, 0, 0);
23546
23547 gcc_assert (GET_CODE (addr) == SET);
23548
23549 addr = SET_SRC (addr);
23550 if (modified_in_p (addr, dep_insn))
23551 cost += 1;
23552 }
23553 else if (ix86_agi_dependent (dep_insn, insn))
23554 cost += 1;
23555
23556 /* ??? Compares pair with jump/setcc. */
23557 if (ix86_flags_dependent (insn, dep_insn, insn_type))
23558 cost = 0;
23559
23560 /* Floating point stores require value to be ready one cycle earlier. */
23561 if (insn_type == TYPE_FMOV
23562 && get_attr_memory (insn) == MEMORY_STORE
23563 && !ix86_agi_dependent (dep_insn, insn))
23564 cost += 1;
23565 break;
23566
23567 case PROCESSOR_PENTIUMPRO:
23568 memory = get_attr_memory (insn);
23569
23570 /* INT->FP conversion is expensive. */
23571 if (get_attr_fp_int_src (dep_insn))
23572 cost += 5;
23573
23574 /* There is one cycle extra latency between an FP op and a store. */
23575 if (insn_type == TYPE_FMOV
23576 && (set = single_set (dep_insn)) != NULL_RTX
23577 && (set2 = single_set (insn)) != NULL_RTX
23578 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
23579 && MEM_P (SET_DEST (set2)))
23580 cost += 1;
23581
23582 /* Show the ability of the reorder buffer to hide the latency of a load by
23583 executing it in parallel with the previous instruction when the
23584 previous instruction is not needed to compute the address. */
23585 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23586 && !ix86_agi_dependent (dep_insn, insn))
23587 {
23588 /* Claim moves take one cycle, as the core can issue one load
23589 at a time and the next load can start a cycle later. */
23590 if (dep_insn_type == TYPE_IMOV
23591 || dep_insn_type == TYPE_FMOV)
23592 cost = 1;
23593 else if (cost > 1)
23594 cost--;
23595 }
23596 break;
23597
23598 case PROCESSOR_K6:
23599 memory = get_attr_memory (insn);
23600
23601 /* The esp dependency is resolved before the instruction is really
23602 finished. */
23603 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
23604 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
23605 return 1;
23606
23607 /* INT->FP conversion is expensive. */
23608 if (get_attr_fp_int_src (dep_insn))
23609 cost += 5;
23610
23611 /* Model the ability of the reorder buffer to hide load latency by
23612 executing the load in parallel with the previous instruction when
23613 that instruction is not needed to compute the address. */
23614 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23615 && !ix86_agi_dependent (dep_insn, insn))
23616 {
23617 /* Claim moves to take one cycle, as the core can issue one load
23618 at a time and the next load can start a cycle later. */
23619 if (dep_insn_type == TYPE_IMOV
23620 || dep_insn_type == TYPE_FMOV)
23621 cost = 1;
23622 else if (cost > 2)
23623 cost -= 2;
23624 else
23625 cost = 1;
23626 }
23627 break;
23628
23629 case PROCESSOR_ATHLON:
23630 case PROCESSOR_K8:
23631 case PROCESSOR_AMDFAM10:
23632 case PROCESSOR_BDVER1:
23633 case PROCESSOR_BDVER2:
23634 case PROCESSOR_BTVER1:
23635 case PROCESSOR_ATOM:
23636 case PROCESSOR_GENERIC32:
23637 case PROCESSOR_GENERIC64:
23638 memory = get_attr_memory (insn);
23639
23640 /* Model the ability of the reorder buffer to hide load latency by
23641 executing the load in parallel with the previous instruction when
23642 that instruction is not needed to compute the address. */
23643 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
23644 && !ix86_agi_dependent (dep_insn, insn))
23645 {
23646 enum attr_unit unit = get_attr_unit (insn);
23647 int loadcost = 3;
23648
23649 /* Because of the difference between the length of integer and
23650 floating unit pipeline preparation stages, the memory operands
23651 for floating point are cheaper.
23652
23653 ??? For Athlon the difference is most probably 2. */
23654 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
23655 loadcost = 3;
23656 else
23657 loadcost = TARGET_ATHLON ? 2 : 0;
23658
23659 if (cost >= loadcost)
23660 cost -= loadcost;
23661 else
23662 cost = 0;
23663 }
23664
23665 default:
23666 break;
23667 }
23668
23669 return cost;
23670 }
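
/* For illustration, a rough worked example of the adjustment above (the
   numbers only show the flow, not exact pipeline data): on PROCESSOR_K8,
   if INSN is an integer load with a scheduling cost of 5 from DEP_INSN
   and DEP_INSN does not feed the load's address, the integer loadcost of
   3 is subtracted and the adjusted cost is 2; if the same dependence did
   feed the address (an AGI dependence), the cost is left untouched.  */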
23671
23672 /* How many alternative schedules to try. This should be as wide as the
23673 scheduling freedom in the DFA, but no wider. Making this value too
23674 large results in extra work for the scheduler. */
23675
23676 static int
23677 ia32_multipass_dfa_lookahead (void)
23678 {
23679 switch (ix86_tune)
23680 {
23681 case PROCESSOR_PENTIUM:
23682 return 2;
23683
23684 case PROCESSOR_PENTIUMPRO:
23685 case PROCESSOR_K6:
23686 return 1;
23687
23688 case PROCESSOR_CORE2_32:
23689 case PROCESSOR_CORE2_64:
23690 case PROCESSOR_COREI7_32:
23691 case PROCESSOR_COREI7_64:
23692 /* Generally, we want haifa-sched:max_issue() to look ahead as far
23693 as the number of instructions that can be executed in one cycle,
23694 i.e., issue_rate. It is unclear why tuning for other CPUs does not do the same. */
23695 return ix86_issue_rate ();
23696
23697 default:
23698 return 0;
23699 }
23700 }
23701
23702 \f
23703
23704 /* Model decoder of Core 2/i7.
23705 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
23706 track the instruction fetch block boundaries and make sure that long
23707 (9+ bytes) instructions are assigned to D0. */
23708
23709 /* Maximum length of an insn that can be handled by
23710 a secondary decoder unit. '8' for Core 2/i7. */
23711 static int core2i7_secondary_decoder_max_insn_size;
23712
23713 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
23714 '16' for Core 2/i7. */
23715 static int core2i7_ifetch_block_size;
23716
23717 /* Maximum number of instructions decoder can handle per cycle.
23718 '6' for Core 2/i7. */
23719 static int core2i7_ifetch_block_max_insns;
23720
23721 typedef struct ix86_first_cycle_multipass_data_ *
23722 ix86_first_cycle_multipass_data_t;
23723 typedef const struct ix86_first_cycle_multipass_data_ *
23724 const_ix86_first_cycle_multipass_data_t;
23725
23726 /* A variable to store target state across calls to max_issue within
23727 one cycle. */
23728 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
23729 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
23730
23731 /* Initialize DATA. */
23732 static void
23733 core2i7_first_cycle_multipass_init (void *_data)
23734 {
23735 ix86_first_cycle_multipass_data_t data
23736 = (ix86_first_cycle_multipass_data_t) _data;
23737
23738 data->ifetch_block_len = 0;
23739 data->ifetch_block_n_insns = 0;
23740 data->ready_try_change = NULL;
23741 data->ready_try_change_size = 0;
23742 }
23743
23744 /* Advancing the cycle; reset ifetch block counts. */
23745 static void
23746 core2i7_dfa_post_advance_cycle (void)
23747 {
23748 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
23749
23750 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23751
23752 data->ifetch_block_len = 0;
23753 data->ifetch_block_n_insns = 0;
23754 }
23755
23756 static int min_insn_size (rtx);
23757
23758 /* Filter out insns from ready_try that the core will not be able to issue
23759 on the current cycle due to decoder restrictions. */
23760 static void
23761 core2i7_first_cycle_multipass_filter_ready_try
23762 (const_ix86_first_cycle_multipass_data_t data,
23763 char *ready_try, int n_ready, bool first_cycle_insn_p)
23764 {
23765 while (n_ready--)
23766 {
23767 rtx insn;
23768 int insn_size;
23769
23770 if (ready_try[n_ready])
23771 continue;
23772
23773 insn = get_ready_element (n_ready);
23774 insn_size = min_insn_size (insn);
23775
23776 if (/* If this insn is too long for a secondary decoder ... */
23777 (!first_cycle_insn_p
23778 && insn_size > core2i7_secondary_decoder_max_insn_size)
23779 /* ... or it would not fit into the ifetch block ... */
23780 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
23781 /* ... or the decoder is full already ... */
23782 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
23783 /* ... mask the insn out. */
23784 {
23785 ready_try[n_ready] = 1;
23786
23787 if (data->ready_try_change)
23788 SET_BIT (data->ready_try_change, n_ready);
23789 }
23790 }
23791 }
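
/* A small worked example of the filtering above (illustrative only, using
   the Core 2/i7 parameters set up in ix86_sched_init_global below: block
   size 16, secondary decoder limit 8, at most 6 insns per block): if 10
   bytes and 3 insns have already been assigned to the current ifetch
   block, a 7-byte candidate is masked out because 10 + 7 > 16, and a
   9-byte candidate that would not be the first insn issued this cycle is
   masked out regardless, because 9 > 8 exceeds what a secondary decoder
   can handle.  */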
23792
23793 /* Prepare for a new round of multipass lookahead scheduling. */
23794 static void
23795 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
23796 bool first_cycle_insn_p)
23797 {
23798 ix86_first_cycle_multipass_data_t data
23799 = (ix86_first_cycle_multipass_data_t) _data;
23800 const_ix86_first_cycle_multipass_data_t prev_data
23801 = ix86_first_cycle_multipass_data;
23802
23803 /* Restore the state from the end of the previous round. */
23804 data->ifetch_block_len = prev_data->ifetch_block_len;
23805 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
23806
23807 /* Filter instructions that cannot be issued on current cycle due to
23808 decoder restrictions. */
23809 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23810 first_cycle_insn_p);
23811 }
23812
23813 /* INSN is being issued in current solution. Account for its impact on
23814 the decoder model. */
23815 static void
23816 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
23817 rtx insn, const void *_prev_data)
23818 {
23819 ix86_first_cycle_multipass_data_t data
23820 = (ix86_first_cycle_multipass_data_t) _data;
23821 const_ix86_first_cycle_multipass_data_t prev_data
23822 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
23823
23824 int insn_size = min_insn_size (insn);
23825
23826 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
23827 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
23828 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
23829 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
23830
23831 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
23832 if (!data->ready_try_change)
23833 {
23834 data->ready_try_change = sbitmap_alloc (n_ready);
23835 data->ready_try_change_size = n_ready;
23836 }
23837 else if (data->ready_try_change_size < n_ready)
23838 {
23839 data->ready_try_change = sbitmap_resize (data->ready_try_change,
23840 n_ready, 0);
23841 data->ready_try_change_size = n_ready;
23842 }
23843 sbitmap_zero (data->ready_try_change);
23844
23845 /* Filter out insns from ready_try that the core will not be able to issue
23846 on the current cycle due to decoder restrictions. */
23847 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
23848 false);
23849 }
23850
23851 /* Revert the effect on ready_try. */
23852 static void
23853 core2i7_first_cycle_multipass_backtrack (const void *_data,
23854 char *ready_try,
23855 int n_ready ATTRIBUTE_UNUSED)
23856 {
23857 const_ix86_first_cycle_multipass_data_t data
23858 = (const_ix86_first_cycle_multipass_data_t) _data;
23859 unsigned int i = 0;
23860 sbitmap_iterator sbi;
23861
23862 gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
23863 EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
23864 {
23865 ready_try[i] = 0;
23866 }
23867 }
23868
23869 /* Save the result of multipass lookahead scheduling for the next round. */
23870 static void
23871 core2i7_first_cycle_multipass_end (const void *_data)
23872 {
23873 const_ix86_first_cycle_multipass_data_t data
23874 = (const_ix86_first_cycle_multipass_data_t) _data;
23875 ix86_first_cycle_multipass_data_t next_data
23876 = ix86_first_cycle_multipass_data;
23877
23878 if (data != NULL)
23879 {
23880 next_data->ifetch_block_len = data->ifetch_block_len;
23881 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
23882 }
23883 }
23884
23885 /* Deallocate target data. */
23886 static void
23887 core2i7_first_cycle_multipass_fini (void *_data)
23888 {
23889 ix86_first_cycle_multipass_data_t data
23890 = (ix86_first_cycle_multipass_data_t) _data;
23891
23892 if (data->ready_try_change)
23893 {
23894 sbitmap_free (data->ready_try_change);
23895 data->ready_try_change = NULL;
23896 data->ready_try_change_size = 0;
23897 }
23898 }
23899
23900 /* Prepare for scheduling pass. */
23901 static void
23902 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
23903 int verbose ATTRIBUTE_UNUSED,
23904 int max_uid ATTRIBUTE_UNUSED)
23905 {
23906 /* Install scheduling hooks for current CPU. Some of these hooks are used
23907 in time-critical parts of the scheduler, so we only set them up when
23908 they are actually used. */
23909 switch (ix86_tune)
23910 {
23911 case PROCESSOR_CORE2_32:
23912 case PROCESSOR_CORE2_64:
23913 case PROCESSOR_COREI7_32:
23914 case PROCESSOR_COREI7_64:
23915 targetm.sched.dfa_post_advance_cycle
23916 = core2i7_dfa_post_advance_cycle;
23917 targetm.sched.first_cycle_multipass_init
23918 = core2i7_first_cycle_multipass_init;
23919 targetm.sched.first_cycle_multipass_begin
23920 = core2i7_first_cycle_multipass_begin;
23921 targetm.sched.first_cycle_multipass_issue
23922 = core2i7_first_cycle_multipass_issue;
23923 targetm.sched.first_cycle_multipass_backtrack
23924 = core2i7_first_cycle_multipass_backtrack;
23925 targetm.sched.first_cycle_multipass_end
23926 = core2i7_first_cycle_multipass_end;
23927 targetm.sched.first_cycle_multipass_fini
23928 = core2i7_first_cycle_multipass_fini;
23929
23930 /* Set decoder parameters. */
23931 core2i7_secondary_decoder_max_insn_size = 8;
23932 core2i7_ifetch_block_size = 16;
23933 core2i7_ifetch_block_max_insns = 6;
23934 break;
23935
23936 default:
23937 targetm.sched.dfa_post_advance_cycle = NULL;
23938 targetm.sched.first_cycle_multipass_init = NULL;
23939 targetm.sched.first_cycle_multipass_begin = NULL;
23940 targetm.sched.first_cycle_multipass_issue = NULL;
23941 targetm.sched.first_cycle_multipass_backtrack = NULL;
23942 targetm.sched.first_cycle_multipass_end = NULL;
23943 targetm.sched.first_cycle_multipass_fini = NULL;
23944 break;
23945 }
23946 }
23947
23948 \f
23949 /* Compute the alignment given to a constant that is being placed in memory.
23950 EXP is the constant and ALIGN is the alignment that the object would
23951 ordinarily have.
23952 The value of this function is used instead of that alignment to align
23953 the object. */
23954
23955 int
23956 ix86_constant_alignment (tree exp, int align)
23957 {
23958 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
23959 || TREE_CODE (exp) == INTEGER_CST)
23960 {
23961 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
23962 return 64;
23963 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
23964 return 128;
23965 }
23966 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
23967 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
23968 return BITS_PER_WORD;
23969
23970 return align;
23971 }
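
/* For example (illustrative), the constant pool entry for the literal in

	double f (void) { return 1.5; }

   is a REAL_CST in DFmode, so the function above raises its alignment to
   64 bits even though the incoming ALIGN on ia32 may be only 32; likewise
   a string constant whose TREE_STRING_LENGTH is at least 31 is
   word-aligned unless we are optimizing for size.  */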
23972
23973 /* Compute the alignment for a static variable.
23974 TYPE is the data type, and ALIGN is the alignment that
23975 the object would ordinarily have. The value of this function is used
23976 instead of that alignment to align the object. */
23977
23978 int
23979 ix86_data_alignment (tree type, int align)
23980 {
23981 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
23982
23983 if (AGGREGATE_TYPE_P (type)
23984 && TYPE_SIZE (type)
23985 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23986 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
23987 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
23988 && align < max_align)
23989 align = max_align;
23990
23991 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
23992 to a 16-byte boundary. */
23993 if (TARGET_64BIT)
23994 {
23995 if (AGGREGATE_TYPE_P (type)
23996 && TYPE_SIZE (type)
23997 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23998 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
23999 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24000 return 128;
24001 }
24002
24003 if (TREE_CODE (type) == ARRAY_TYPE)
24004 {
24005 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24006 return 64;
24007 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24008 return 128;
24009 }
24010 else if (TREE_CODE (type) == COMPLEX_TYPE)
24011 {
24012
24013 if (TYPE_MODE (type) == DCmode && align < 64)
24014 return 64;
24015 if ((TYPE_MODE (type) == XCmode
24016 || TYPE_MODE (type) == TCmode) && align < 128)
24017 return 128;
24018 }
24019 else if ((TREE_CODE (type) == RECORD_TYPE
24020 || TREE_CODE (type) == UNION_TYPE
24021 || TREE_CODE (type) == QUAL_UNION_TYPE)
24022 && TYPE_FIELDS (type))
24023 {
24024 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24025 return 64;
24026 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24027 return 128;
24028 }
24029 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24030 || TREE_CODE (type) == INTEGER_TYPE)
24031 {
24032 if (TYPE_MODE (type) == DFmode && align < 64)
24033 return 64;
24034 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24035 return 128;
24036 }
24037
24038 return align;
24039 }
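
/* Two illustrative cases for the function above: on x86-64,

	static char buf[20];

   is an aggregate whose size (160 bits) is at least 128, so it is given
   128-bit alignment per the ABI rule quoted in ix86_local_alignment;
   and any static object of 32 bytes (256 bits) or more is raised to
   256-bit alignment by the max_align heuristic when we are not
   optimizing for size.  */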
24040
24041 /* Compute the alignment for a local variable or a stack slot. EXP is
24042 the data type or decl itself, MODE is the widest mode available and
24043 ALIGN is the alignment that the object would ordinarily have. The
24044 value of this macro is used instead of that alignment to align the
24045 object. */
24046
24047 unsigned int
24048 ix86_local_alignment (tree exp, enum machine_mode mode,
24049 unsigned int align)
24050 {
24051 tree type, decl;
24052
24053 if (exp && DECL_P (exp))
24054 {
24055 type = TREE_TYPE (exp);
24056 decl = exp;
24057 }
24058 else
24059 {
24060 type = exp;
24061 decl = NULL;
24062 }
24063
24064 /* Don't do dynamic stack realignment for long long objects with
24065 -mpreferred-stack-boundary=2. */
24066 if (!TARGET_64BIT
24067 && align == 64
24068 && ix86_preferred_stack_boundary < 64
24069 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24070 && (!type || !TYPE_USER_ALIGN (type))
24071 && (!decl || !DECL_USER_ALIGN (decl)))
24072 align = 32;
24073
24074 /* If TYPE is NULL, we are allocating a stack slot for caller-save
24075 register in MODE. We will return the largest alignment of XF
24076 and DF. */
24077 if (!type)
24078 {
24079 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24080 align = GET_MODE_ALIGNMENT (DFmode);
24081 return align;
24082 }
24083
24084 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24085 to a 16-byte boundary. The exact wording is:
24086
24087 An array uses the same alignment as its elements, except that a local or
24088 global array variable of length at least 16 bytes or
24089 a C99 variable-length array variable always has alignment of at least 16 bytes.
24090
24091 This was added to allow use of aligned SSE instructions on arrays. The
24092 rule is meant for static storage (where the compiler cannot do the
24093 analysis by itself). We follow it for automatic variables only when
24094 convenient: we fully control everything in the function being compiled,
24095 and functions from other units cannot rely on the alignment.
24096
24097 Exclude the va_list type. It is the common case of a local array where
24098 we cannot benefit from the alignment. */
24099 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24100 && TARGET_SSE)
24101 {
24102 if (AGGREGATE_TYPE_P (type)
24103 && (va_list_type_node == NULL_TREE
24104 || (TYPE_MAIN_VARIANT (type)
24105 != TYPE_MAIN_VARIANT (va_list_type_node)))
24106 && TYPE_SIZE (type)
24107 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24108 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24109 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24110 return 128;
24111 }
24112 if (TREE_CODE (type) == ARRAY_TYPE)
24113 {
24114 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24115 return 64;
24116 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24117 return 128;
24118 }
24119 else if (TREE_CODE (type) == COMPLEX_TYPE)
24120 {
24121 if (TYPE_MODE (type) == DCmode && align < 64)
24122 return 64;
24123 if ((TYPE_MODE (type) == XCmode
24124 || TYPE_MODE (type) == TCmode) && align < 128)
24125 return 128;
24126 }
24127 else if ((TREE_CODE (type) == RECORD_TYPE
24128 || TREE_CODE (type) == UNION_TYPE
24129 || TREE_CODE (type) == QUAL_UNION_TYPE)
24130 && TYPE_FIELDS (type))
24131 {
24132 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24133 return 64;
24134 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24135 return 128;
24136 }
24137 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24138 || TREE_CODE (type) == INTEGER_TYPE)
24139 {
24140
24141 if (TYPE_MODE (type) == DFmode && align < 64)
24142 return 64;
24143 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24144 return 128;
24145 }
24146 return align;
24147 }
24148
24149 /* Compute the minimum required alignment for dynamic stack realignment
24150 purposes for a local variable, parameter or a stack slot. EXP is
24151 the data type or decl itself, MODE is its mode and ALIGN is the
24152 alignment that the object would ordinarily have. */
24153
24154 unsigned int
24155 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24156 unsigned int align)
24157 {
24158 tree type, decl;
24159
24160 if (exp && DECL_P (exp))
24161 {
24162 type = TREE_TYPE (exp);
24163 decl = exp;
24164 }
24165 else
24166 {
24167 type = exp;
24168 decl = NULL;
24169 }
24170
24171 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
24172 return align;
24173
24174 /* Don't do dynamic stack realignment for long long objects with
24175 -mpreferred-stack-boundary=2. */
24176 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
24177 && (!type || !TYPE_USER_ALIGN (type))
24178 && (!decl || !DECL_USER_ALIGN (decl)))
24179 return 32;
24180
24181 return align;
24182 }
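
/* For illustration: with -m32 -mpreferred-stack-boundary=2, a plain local

	long long x;

   is DImode with no user-specified alignment, so the function above
   reports a minimum alignment of only 32 bits and the variable by itself
   does not force dynamic stack realignment; declaring it with
   __attribute__ ((aligned (8))) keeps the full 64-bit requirement.  */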
24183 \f
24184 /* Find a location for the static chain incoming to a nested function.
24185 This is a register, unless all free registers are used by arguments. */
24186
24187 static rtx
24188 ix86_static_chain (const_tree fndecl, bool incoming_p)
24189 {
24190 unsigned regno;
24191
24192 if (!DECL_STATIC_CHAIN (fndecl))
24193 return NULL;
24194
24195 if (TARGET_64BIT)
24196 {
24197 /* We always use R10 in 64-bit mode. */
24198 regno = R10_REG;
24199 }
24200 else
24201 {
24202 tree fntype;
24203 unsigned int ccvt;
24204
24205 /* By default in 32-bit mode we use ECX to pass the static chain. */
24206 regno = CX_REG;
24207
24208 fntype = TREE_TYPE (fndecl);
24209 ccvt = ix86_get_callcvt (fntype);
24210 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
24211 {
24212 /* Fastcall functions use ecx/edx for arguments, which leaves
24213 us with EAX for the static chain.
24214 Thiscall functions use ecx for arguments, which also
24215 leaves us with EAX for the static chain. */
24216 regno = AX_REG;
24217 }
24218 else if (ix86_function_regparm (fntype, fndecl) == 3)
24219 {
24220 /* For regparm 3, we have no free call-clobbered registers in
24221 which to store the static chain. In order to implement this,
24222 we have the trampoline push the static chain to the stack.
24223 However, we can't push a value below the return address when
24224 we call the nested function directly, so we have to use an
24225 alternate entry point. For this we use ESI, and have the
24226 alternate entry point push ESI, so that things appear the
24227 same once we're executing the nested function. */
24228 if (incoming_p)
24229 {
24230 if (fndecl == current_function_decl)
24231 ix86_static_chain_on_stack = true;
24232 return gen_frame_mem (SImode,
24233 plus_constant (arg_pointer_rtx, -8));
24234 }
24235 regno = SI_REG;
24236 }
24237 }
24238
24239 return gen_rtx_REG (Pmode, regno);
24240 }
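
/* A quick illustrative summary of the choices made above:

     64-bit:				%r10
     32-bit, default conventions:	%ecx
     32-bit, fastcall or thiscall:	%eax
     32-bit, regparm (3):		no free register; on entry the chain
					lives in a stack slot at
					arg_pointer - 8, and %esi is used at
					the alternate entry point.

   The regparm (3) case is what drives ix86_static_chain_on_stack.  */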
24241
24242 /* Emit RTL insns to initialize the variable parts of a trampoline.
24243 FNDECL is the decl of the target address; M_TRAMP is a MEM for
24244 the trampoline, and CHAIN_VALUE is an RTX for the static chain
24245 to be passed to the target function. */
24246
24247 static void
24248 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
24249 {
24250 rtx mem, fnaddr;
24251 int opcode;
24252 int offset = 0;
24253
24254 fnaddr = XEXP (DECL_RTL (fndecl), 0);
24255
24256 if (TARGET_64BIT)
24257 {
24258 int size;
24259
24260 /* Load the function address into r11. Try to load the address using
24261 the shorter movl instead of movabs. We may want to support
24262 movq for kernel mode, but the kernel does not use trampolines at
24263 the moment. */
24264 if (x86_64_zext_immediate_operand (fnaddr, VOIDmode))
24265 {
24266 fnaddr = copy_to_mode_reg (DImode, fnaddr);
24267
24268 mem = adjust_address (m_tramp, HImode, offset);
24269 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
24270
24271 mem = adjust_address (m_tramp, SImode, offset + 2);
24272 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
24273 offset += 6;
24274 }
24275 else
24276 {
24277 mem = adjust_address (m_tramp, HImode, offset);
24278 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
24279
24280 mem = adjust_address (m_tramp, DImode, offset + 2);
24281 emit_move_insn (mem, fnaddr);
24282 offset += 10;
24283 }
24284
24285 /* Load static chain using movabs to r10. Use the
24286 shorter movl instead of movabs for x32. */
24287 if (TARGET_X32)
24288 {
24289 opcode = 0xba41;
24290 size = 6;
24291 }
24292 else
24293 {
24294 opcode = 0xba49;
24295 size = 10;
24296 }
24297
24298 mem = adjust_address (m_tramp, HImode, offset);
24299 emit_move_insn (mem, gen_int_mode (opcode, HImode));
24300
24301 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
24302 emit_move_insn (mem, chain_value);
24303 offset += size;
24304
24305 /* Jump to r11; the last (unused) byte is a nop, only there to
24306 pad the write out to a single 32-bit store. */
24307 mem = adjust_address (m_tramp, SImode, offset);
24308 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
24309 offset += 4;
24310 }
24311 else
24312 {
24313 rtx disp, chain;
24314
24315 /* Depending on the static chain location, either load a register
24316 with a constant, or push the constant to the stack. All of the
24317 instructions are the same size. */
24318 chain = ix86_static_chain (fndecl, true);
24319 if (REG_P (chain))
24320 {
24321 switch (REGNO (chain))
24322 {
24323 case AX_REG:
24324 opcode = 0xb8; break;
24325 case CX_REG:
24326 opcode = 0xb9; break;
24327 default:
24328 gcc_unreachable ();
24329 }
24330 }
24331 else
24332 opcode = 0x68;
24333
24334 mem = adjust_address (m_tramp, QImode, offset);
24335 emit_move_insn (mem, gen_int_mode (opcode, QImode));
24336
24337 mem = adjust_address (m_tramp, SImode, offset + 1);
24338 emit_move_insn (mem, chain_value);
24339 offset += 5;
24340
24341 mem = adjust_address (m_tramp, QImode, offset);
24342 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
24343
24344 mem = adjust_address (m_tramp, SImode, offset + 1);
24345
24346 /* Compute offset from the end of the jmp to the target function.
24347 In the case in which the trampoline stores the static chain on
24348 the stack, we need to skip the first insn which pushes the
24349 (call-saved) register static chain; this push is 1 byte. */
24350 offset += 5;
24351 disp = expand_binop (SImode, sub_optab, fnaddr,
24352 plus_constant (XEXP (m_tramp, 0),
24353 offset - (MEM_P (chain) ? 1 : 0)),
24354 NULL_RTX, 1, OPTAB_DIRECT);
24355 emit_move_insn (mem, disp);
24356 }
24357
24358 gcc_assert (offset <= TRAMPOLINE_SIZE);
24359
24360 #ifdef HAVE_ENABLE_EXECUTE_STACK
24361 #ifdef CHECK_EXECUTE_STACK_ENABLED
24362 if (CHECK_EXECUTE_STACK_ENABLED)
24363 #endif
24364 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
24365 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
24366 #endif
24367 }
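
/* For illustration, the 64-bit trampoline emitted above (full 64-bit
   function address, not the x32 / zero-extended movl variants) ends up as
   the 24-byte sequence

	49 bb <8-byte fnaddr>	movabs	$fnaddr, %r11
	49 ba <8-byte chain>	movabs	$chain, %r10
	49 ff e3		jmp	*%r11
	90			nop	(pads the last write to 4 bytes)

   while the 32-bit variant is 10 bytes: b9 <chain32> (mov $chain, %ecx),
   b8 (%eax) or 68 (push), followed by e9 <rel32>, a jmp to the target
   whose displacement skips the function's one-byte push insn when the
   trampoline itself pushed the chain on the stack.  */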
24368 \f
24369 /* The following file contains several enumerations and data structures
24370 built from the definitions in i386-builtin-types.def. */
24371
24372 #include "i386-builtin-types.inc"
24373
24374 /* Table for the ix86 builtin non-function types. */
24375 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
24376
24377 /* Retrieve an element from the above table, building some of
24378 the types lazily. */
24379
24380 static tree
24381 ix86_get_builtin_type (enum ix86_builtin_type tcode)
24382 {
24383 unsigned int index;
24384 tree type, itype;
24385
24386 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
24387
24388 type = ix86_builtin_type_tab[(int) tcode];
24389 if (type != NULL)
24390 return type;
24391
24392 gcc_assert (tcode > IX86_BT_LAST_PRIM);
24393 if (tcode <= IX86_BT_LAST_VECT)
24394 {
24395 enum machine_mode mode;
24396
24397 index = tcode - IX86_BT_LAST_PRIM - 1;
24398 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
24399 mode = ix86_builtin_type_vect_mode[index];
24400
24401 type = build_vector_type_for_mode (itype, mode);
24402 }
24403 else
24404 {
24405 int quals;
24406
24407 index = tcode - IX86_BT_LAST_VECT - 1;
24408 if (tcode <= IX86_BT_LAST_PTR)
24409 quals = TYPE_UNQUALIFIED;
24410 else
24411 quals = TYPE_QUAL_CONST;
24412
24413 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
24414 if (quals != TYPE_UNQUALIFIED)
24415 itype = build_qualified_type (itype, quals);
24416
24417 type = build_pointer_type (itype);
24418 }
24419
24420 ix86_builtin_type_tab[(int) tcode] = type;
24421 return type;
24422 }
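
/* For example (an illustrative sketch, assuming the FLOAT primitive maps
   to float_type_node): a vector type code whose recorded base is FLOAT
   and whose recorded mode is V4SFmode is materialized as

	build_vector_type_for_mode (float_type_node, V4SFmode)

   i.e. a __m128-style vector of four floats, and the result is cached in
   ix86_builtin_type_tab; a pointer code beyond IX86_BT_LAST_PTR gets a
   const-qualified pointee before build_pointer_type is called.  */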
24423
24424 /* Table for the ix86 builtin function types. */
24425 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
24426
24427 /* Retrieve an element from the above table, building some of
24428 the types lazily. */
24429
24430 static tree
24431 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
24432 {
24433 tree type;
24434
24435 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
24436
24437 type = ix86_builtin_func_type_tab[(int) tcode];
24438 if (type != NULL)
24439 return type;
24440
24441 if (tcode <= IX86_BT_LAST_FUNC)
24442 {
24443 unsigned start = ix86_builtin_func_start[(int) tcode];
24444 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
24445 tree rtype, atype, args = void_list_node;
24446 unsigned i;
24447
24448 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
24449 for (i = after - 1; i > start; --i)
24450 {
24451 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
24452 args = tree_cons (NULL, atype, args);
24453 }
24454
24455 type = build_function_type (rtype, args);
24456 }
24457 else
24458 {
24459 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
24460 enum ix86_builtin_func_type icode;
24461
24462 icode = ix86_builtin_func_alias_base[index];
24463 type = ix86_get_builtin_func_type (icode);
24464 }
24465
24466 ix86_builtin_func_type_tab[(int) tcode] = type;
24467 return type;
24468 }
24469
24470
24471 /* Codes for all the SSE/MMX builtins. */
24472 enum ix86_builtins
24473 {
24474 IX86_BUILTIN_ADDPS,
24475 IX86_BUILTIN_ADDSS,
24476 IX86_BUILTIN_DIVPS,
24477 IX86_BUILTIN_DIVSS,
24478 IX86_BUILTIN_MULPS,
24479 IX86_BUILTIN_MULSS,
24480 IX86_BUILTIN_SUBPS,
24481 IX86_BUILTIN_SUBSS,
24482
24483 IX86_BUILTIN_CMPEQPS,
24484 IX86_BUILTIN_CMPLTPS,
24485 IX86_BUILTIN_CMPLEPS,
24486 IX86_BUILTIN_CMPGTPS,
24487 IX86_BUILTIN_CMPGEPS,
24488 IX86_BUILTIN_CMPNEQPS,
24489 IX86_BUILTIN_CMPNLTPS,
24490 IX86_BUILTIN_CMPNLEPS,
24491 IX86_BUILTIN_CMPNGTPS,
24492 IX86_BUILTIN_CMPNGEPS,
24493 IX86_BUILTIN_CMPORDPS,
24494 IX86_BUILTIN_CMPUNORDPS,
24495 IX86_BUILTIN_CMPEQSS,
24496 IX86_BUILTIN_CMPLTSS,
24497 IX86_BUILTIN_CMPLESS,
24498 IX86_BUILTIN_CMPNEQSS,
24499 IX86_BUILTIN_CMPNLTSS,
24500 IX86_BUILTIN_CMPNLESS,
24501 IX86_BUILTIN_CMPNGTSS,
24502 IX86_BUILTIN_CMPNGESS,
24503 IX86_BUILTIN_CMPORDSS,
24504 IX86_BUILTIN_CMPUNORDSS,
24505
24506 IX86_BUILTIN_COMIEQSS,
24507 IX86_BUILTIN_COMILTSS,
24508 IX86_BUILTIN_COMILESS,
24509 IX86_BUILTIN_COMIGTSS,
24510 IX86_BUILTIN_COMIGESS,
24511 IX86_BUILTIN_COMINEQSS,
24512 IX86_BUILTIN_UCOMIEQSS,
24513 IX86_BUILTIN_UCOMILTSS,
24514 IX86_BUILTIN_UCOMILESS,
24515 IX86_BUILTIN_UCOMIGTSS,
24516 IX86_BUILTIN_UCOMIGESS,
24517 IX86_BUILTIN_UCOMINEQSS,
24518
24519 IX86_BUILTIN_CVTPI2PS,
24520 IX86_BUILTIN_CVTPS2PI,
24521 IX86_BUILTIN_CVTSI2SS,
24522 IX86_BUILTIN_CVTSI642SS,
24523 IX86_BUILTIN_CVTSS2SI,
24524 IX86_BUILTIN_CVTSS2SI64,
24525 IX86_BUILTIN_CVTTPS2PI,
24526 IX86_BUILTIN_CVTTSS2SI,
24527 IX86_BUILTIN_CVTTSS2SI64,
24528
24529 IX86_BUILTIN_MAXPS,
24530 IX86_BUILTIN_MAXSS,
24531 IX86_BUILTIN_MINPS,
24532 IX86_BUILTIN_MINSS,
24533
24534 IX86_BUILTIN_LOADUPS,
24535 IX86_BUILTIN_STOREUPS,
24536 IX86_BUILTIN_MOVSS,
24537
24538 IX86_BUILTIN_MOVHLPS,
24539 IX86_BUILTIN_MOVLHPS,
24540 IX86_BUILTIN_LOADHPS,
24541 IX86_BUILTIN_LOADLPS,
24542 IX86_BUILTIN_STOREHPS,
24543 IX86_BUILTIN_STORELPS,
24544
24545 IX86_BUILTIN_MASKMOVQ,
24546 IX86_BUILTIN_MOVMSKPS,
24547 IX86_BUILTIN_PMOVMSKB,
24548
24549 IX86_BUILTIN_MOVNTPS,
24550 IX86_BUILTIN_MOVNTQ,
24551
24552 IX86_BUILTIN_LOADDQU,
24553 IX86_BUILTIN_STOREDQU,
24554
24555 IX86_BUILTIN_PACKSSWB,
24556 IX86_BUILTIN_PACKSSDW,
24557 IX86_BUILTIN_PACKUSWB,
24558
24559 IX86_BUILTIN_PADDB,
24560 IX86_BUILTIN_PADDW,
24561 IX86_BUILTIN_PADDD,
24562 IX86_BUILTIN_PADDQ,
24563 IX86_BUILTIN_PADDSB,
24564 IX86_BUILTIN_PADDSW,
24565 IX86_BUILTIN_PADDUSB,
24566 IX86_BUILTIN_PADDUSW,
24567 IX86_BUILTIN_PSUBB,
24568 IX86_BUILTIN_PSUBW,
24569 IX86_BUILTIN_PSUBD,
24570 IX86_BUILTIN_PSUBQ,
24571 IX86_BUILTIN_PSUBSB,
24572 IX86_BUILTIN_PSUBSW,
24573 IX86_BUILTIN_PSUBUSB,
24574 IX86_BUILTIN_PSUBUSW,
24575
24576 IX86_BUILTIN_PAND,
24577 IX86_BUILTIN_PANDN,
24578 IX86_BUILTIN_POR,
24579 IX86_BUILTIN_PXOR,
24580
24581 IX86_BUILTIN_PAVGB,
24582 IX86_BUILTIN_PAVGW,
24583
24584 IX86_BUILTIN_PCMPEQB,
24585 IX86_BUILTIN_PCMPEQW,
24586 IX86_BUILTIN_PCMPEQD,
24587 IX86_BUILTIN_PCMPGTB,
24588 IX86_BUILTIN_PCMPGTW,
24589 IX86_BUILTIN_PCMPGTD,
24590
24591 IX86_BUILTIN_PMADDWD,
24592
24593 IX86_BUILTIN_PMAXSW,
24594 IX86_BUILTIN_PMAXUB,
24595 IX86_BUILTIN_PMINSW,
24596 IX86_BUILTIN_PMINUB,
24597
24598 IX86_BUILTIN_PMULHUW,
24599 IX86_BUILTIN_PMULHW,
24600 IX86_BUILTIN_PMULLW,
24601
24602 IX86_BUILTIN_PSADBW,
24603 IX86_BUILTIN_PSHUFW,
24604
24605 IX86_BUILTIN_PSLLW,
24606 IX86_BUILTIN_PSLLD,
24607 IX86_BUILTIN_PSLLQ,
24608 IX86_BUILTIN_PSRAW,
24609 IX86_BUILTIN_PSRAD,
24610 IX86_BUILTIN_PSRLW,
24611 IX86_BUILTIN_PSRLD,
24612 IX86_BUILTIN_PSRLQ,
24613 IX86_BUILTIN_PSLLWI,
24614 IX86_BUILTIN_PSLLDI,
24615 IX86_BUILTIN_PSLLQI,
24616 IX86_BUILTIN_PSRAWI,
24617 IX86_BUILTIN_PSRADI,
24618 IX86_BUILTIN_PSRLWI,
24619 IX86_BUILTIN_PSRLDI,
24620 IX86_BUILTIN_PSRLQI,
24621
24622 IX86_BUILTIN_PUNPCKHBW,
24623 IX86_BUILTIN_PUNPCKHWD,
24624 IX86_BUILTIN_PUNPCKHDQ,
24625 IX86_BUILTIN_PUNPCKLBW,
24626 IX86_BUILTIN_PUNPCKLWD,
24627 IX86_BUILTIN_PUNPCKLDQ,
24628
24629 IX86_BUILTIN_SHUFPS,
24630
24631 IX86_BUILTIN_RCPPS,
24632 IX86_BUILTIN_RCPSS,
24633 IX86_BUILTIN_RSQRTPS,
24634 IX86_BUILTIN_RSQRTPS_NR,
24635 IX86_BUILTIN_RSQRTSS,
24636 IX86_BUILTIN_RSQRTF,
24637 IX86_BUILTIN_SQRTPS,
24638 IX86_BUILTIN_SQRTPS_NR,
24639 IX86_BUILTIN_SQRTSS,
24640
24641 IX86_BUILTIN_UNPCKHPS,
24642 IX86_BUILTIN_UNPCKLPS,
24643
24644 IX86_BUILTIN_ANDPS,
24645 IX86_BUILTIN_ANDNPS,
24646 IX86_BUILTIN_ORPS,
24647 IX86_BUILTIN_XORPS,
24648
24649 IX86_BUILTIN_EMMS,
24650 IX86_BUILTIN_LDMXCSR,
24651 IX86_BUILTIN_STMXCSR,
24652 IX86_BUILTIN_SFENCE,
24653
24654 /* 3DNow! Original */
24655 IX86_BUILTIN_FEMMS,
24656 IX86_BUILTIN_PAVGUSB,
24657 IX86_BUILTIN_PF2ID,
24658 IX86_BUILTIN_PFACC,
24659 IX86_BUILTIN_PFADD,
24660 IX86_BUILTIN_PFCMPEQ,
24661 IX86_BUILTIN_PFCMPGE,
24662 IX86_BUILTIN_PFCMPGT,
24663 IX86_BUILTIN_PFMAX,
24664 IX86_BUILTIN_PFMIN,
24665 IX86_BUILTIN_PFMUL,
24666 IX86_BUILTIN_PFRCP,
24667 IX86_BUILTIN_PFRCPIT1,
24668 IX86_BUILTIN_PFRCPIT2,
24669 IX86_BUILTIN_PFRSQIT1,
24670 IX86_BUILTIN_PFRSQRT,
24671 IX86_BUILTIN_PFSUB,
24672 IX86_BUILTIN_PFSUBR,
24673 IX86_BUILTIN_PI2FD,
24674 IX86_BUILTIN_PMULHRW,
24675
24676 /* 3DNow! Athlon Extensions */
24677 IX86_BUILTIN_PF2IW,
24678 IX86_BUILTIN_PFNACC,
24679 IX86_BUILTIN_PFPNACC,
24680 IX86_BUILTIN_PI2FW,
24681 IX86_BUILTIN_PSWAPDSI,
24682 IX86_BUILTIN_PSWAPDSF,
24683
24684 /* SSE2 */
24685 IX86_BUILTIN_ADDPD,
24686 IX86_BUILTIN_ADDSD,
24687 IX86_BUILTIN_DIVPD,
24688 IX86_BUILTIN_DIVSD,
24689 IX86_BUILTIN_MULPD,
24690 IX86_BUILTIN_MULSD,
24691 IX86_BUILTIN_SUBPD,
24692 IX86_BUILTIN_SUBSD,
24693
24694 IX86_BUILTIN_CMPEQPD,
24695 IX86_BUILTIN_CMPLTPD,
24696 IX86_BUILTIN_CMPLEPD,
24697 IX86_BUILTIN_CMPGTPD,
24698 IX86_BUILTIN_CMPGEPD,
24699 IX86_BUILTIN_CMPNEQPD,
24700 IX86_BUILTIN_CMPNLTPD,
24701 IX86_BUILTIN_CMPNLEPD,
24702 IX86_BUILTIN_CMPNGTPD,
24703 IX86_BUILTIN_CMPNGEPD,
24704 IX86_BUILTIN_CMPORDPD,
24705 IX86_BUILTIN_CMPUNORDPD,
24706 IX86_BUILTIN_CMPEQSD,
24707 IX86_BUILTIN_CMPLTSD,
24708 IX86_BUILTIN_CMPLESD,
24709 IX86_BUILTIN_CMPNEQSD,
24710 IX86_BUILTIN_CMPNLTSD,
24711 IX86_BUILTIN_CMPNLESD,
24712 IX86_BUILTIN_CMPORDSD,
24713 IX86_BUILTIN_CMPUNORDSD,
24714
24715 IX86_BUILTIN_COMIEQSD,
24716 IX86_BUILTIN_COMILTSD,
24717 IX86_BUILTIN_COMILESD,
24718 IX86_BUILTIN_COMIGTSD,
24719 IX86_BUILTIN_COMIGESD,
24720 IX86_BUILTIN_COMINEQSD,
24721 IX86_BUILTIN_UCOMIEQSD,
24722 IX86_BUILTIN_UCOMILTSD,
24723 IX86_BUILTIN_UCOMILESD,
24724 IX86_BUILTIN_UCOMIGTSD,
24725 IX86_BUILTIN_UCOMIGESD,
24726 IX86_BUILTIN_UCOMINEQSD,
24727
24728 IX86_BUILTIN_MAXPD,
24729 IX86_BUILTIN_MAXSD,
24730 IX86_BUILTIN_MINPD,
24731 IX86_BUILTIN_MINSD,
24732
24733 IX86_BUILTIN_ANDPD,
24734 IX86_BUILTIN_ANDNPD,
24735 IX86_BUILTIN_ORPD,
24736 IX86_BUILTIN_XORPD,
24737
24738 IX86_BUILTIN_SQRTPD,
24739 IX86_BUILTIN_SQRTSD,
24740
24741 IX86_BUILTIN_UNPCKHPD,
24742 IX86_BUILTIN_UNPCKLPD,
24743
24744 IX86_BUILTIN_SHUFPD,
24745
24746 IX86_BUILTIN_LOADUPD,
24747 IX86_BUILTIN_STOREUPD,
24748 IX86_BUILTIN_MOVSD,
24749
24750 IX86_BUILTIN_LOADHPD,
24751 IX86_BUILTIN_LOADLPD,
24752
24753 IX86_BUILTIN_CVTDQ2PD,
24754 IX86_BUILTIN_CVTDQ2PS,
24755
24756 IX86_BUILTIN_CVTPD2DQ,
24757 IX86_BUILTIN_CVTPD2PI,
24758 IX86_BUILTIN_CVTPD2PS,
24759 IX86_BUILTIN_CVTTPD2DQ,
24760 IX86_BUILTIN_CVTTPD2PI,
24761
24762 IX86_BUILTIN_CVTPI2PD,
24763 IX86_BUILTIN_CVTSI2SD,
24764 IX86_BUILTIN_CVTSI642SD,
24765
24766 IX86_BUILTIN_CVTSD2SI,
24767 IX86_BUILTIN_CVTSD2SI64,
24768 IX86_BUILTIN_CVTSD2SS,
24769 IX86_BUILTIN_CVTSS2SD,
24770 IX86_BUILTIN_CVTTSD2SI,
24771 IX86_BUILTIN_CVTTSD2SI64,
24772
24773 IX86_BUILTIN_CVTPS2DQ,
24774 IX86_BUILTIN_CVTPS2PD,
24775 IX86_BUILTIN_CVTTPS2DQ,
24776
24777 IX86_BUILTIN_MOVNTI,
24778 IX86_BUILTIN_MOVNTI64,
24779 IX86_BUILTIN_MOVNTPD,
24780 IX86_BUILTIN_MOVNTDQ,
24781
24782 IX86_BUILTIN_MOVQ128,
24783
24784 /* SSE2 MMX */
24785 IX86_BUILTIN_MASKMOVDQU,
24786 IX86_BUILTIN_MOVMSKPD,
24787 IX86_BUILTIN_PMOVMSKB128,
24788
24789 IX86_BUILTIN_PACKSSWB128,
24790 IX86_BUILTIN_PACKSSDW128,
24791 IX86_BUILTIN_PACKUSWB128,
24792
24793 IX86_BUILTIN_PADDB128,
24794 IX86_BUILTIN_PADDW128,
24795 IX86_BUILTIN_PADDD128,
24796 IX86_BUILTIN_PADDQ128,
24797 IX86_BUILTIN_PADDSB128,
24798 IX86_BUILTIN_PADDSW128,
24799 IX86_BUILTIN_PADDUSB128,
24800 IX86_BUILTIN_PADDUSW128,
24801 IX86_BUILTIN_PSUBB128,
24802 IX86_BUILTIN_PSUBW128,
24803 IX86_BUILTIN_PSUBD128,
24804 IX86_BUILTIN_PSUBQ128,
24805 IX86_BUILTIN_PSUBSB128,
24806 IX86_BUILTIN_PSUBSW128,
24807 IX86_BUILTIN_PSUBUSB128,
24808 IX86_BUILTIN_PSUBUSW128,
24809
24810 IX86_BUILTIN_PAND128,
24811 IX86_BUILTIN_PANDN128,
24812 IX86_BUILTIN_POR128,
24813 IX86_BUILTIN_PXOR128,
24814
24815 IX86_BUILTIN_PAVGB128,
24816 IX86_BUILTIN_PAVGW128,
24817
24818 IX86_BUILTIN_PCMPEQB128,
24819 IX86_BUILTIN_PCMPEQW128,
24820 IX86_BUILTIN_PCMPEQD128,
24821 IX86_BUILTIN_PCMPGTB128,
24822 IX86_BUILTIN_PCMPGTW128,
24823 IX86_BUILTIN_PCMPGTD128,
24824
24825 IX86_BUILTIN_PMADDWD128,
24826
24827 IX86_BUILTIN_PMAXSW128,
24828 IX86_BUILTIN_PMAXUB128,
24829 IX86_BUILTIN_PMINSW128,
24830 IX86_BUILTIN_PMINUB128,
24831
24832 IX86_BUILTIN_PMULUDQ,
24833 IX86_BUILTIN_PMULUDQ128,
24834 IX86_BUILTIN_PMULHUW128,
24835 IX86_BUILTIN_PMULHW128,
24836 IX86_BUILTIN_PMULLW128,
24837
24838 IX86_BUILTIN_PSADBW128,
24839 IX86_BUILTIN_PSHUFHW,
24840 IX86_BUILTIN_PSHUFLW,
24841 IX86_BUILTIN_PSHUFD,
24842
24843 IX86_BUILTIN_PSLLDQI128,
24844 IX86_BUILTIN_PSLLWI128,
24845 IX86_BUILTIN_PSLLDI128,
24846 IX86_BUILTIN_PSLLQI128,
24847 IX86_BUILTIN_PSRAWI128,
24848 IX86_BUILTIN_PSRADI128,
24849 IX86_BUILTIN_PSRLDQI128,
24850 IX86_BUILTIN_PSRLWI128,
24851 IX86_BUILTIN_PSRLDI128,
24852 IX86_BUILTIN_PSRLQI128,
24853
24854 IX86_BUILTIN_PSLLDQ128,
24855 IX86_BUILTIN_PSLLW128,
24856 IX86_BUILTIN_PSLLD128,
24857 IX86_BUILTIN_PSLLQ128,
24858 IX86_BUILTIN_PSRAW128,
24859 IX86_BUILTIN_PSRAD128,
24860 IX86_BUILTIN_PSRLW128,
24861 IX86_BUILTIN_PSRLD128,
24862 IX86_BUILTIN_PSRLQ128,
24863
24864 IX86_BUILTIN_PUNPCKHBW128,
24865 IX86_BUILTIN_PUNPCKHWD128,
24866 IX86_BUILTIN_PUNPCKHDQ128,
24867 IX86_BUILTIN_PUNPCKHQDQ128,
24868 IX86_BUILTIN_PUNPCKLBW128,
24869 IX86_BUILTIN_PUNPCKLWD128,
24870 IX86_BUILTIN_PUNPCKLDQ128,
24871 IX86_BUILTIN_PUNPCKLQDQ128,
24872
24873 IX86_BUILTIN_CLFLUSH,
24874 IX86_BUILTIN_MFENCE,
24875 IX86_BUILTIN_LFENCE,
24876 IX86_BUILTIN_PAUSE,
24877
24878 IX86_BUILTIN_BSRSI,
24879 IX86_BUILTIN_BSRDI,
24880 IX86_BUILTIN_RDPMC,
24881 IX86_BUILTIN_RDTSC,
24882 IX86_BUILTIN_RDTSCP,
24883 IX86_BUILTIN_ROLQI,
24884 IX86_BUILTIN_ROLHI,
24885 IX86_BUILTIN_RORQI,
24886 IX86_BUILTIN_RORHI,
24887
24888 /* SSE3. */
24889 IX86_BUILTIN_ADDSUBPS,
24890 IX86_BUILTIN_HADDPS,
24891 IX86_BUILTIN_HSUBPS,
24892 IX86_BUILTIN_MOVSHDUP,
24893 IX86_BUILTIN_MOVSLDUP,
24894 IX86_BUILTIN_ADDSUBPD,
24895 IX86_BUILTIN_HADDPD,
24896 IX86_BUILTIN_HSUBPD,
24897 IX86_BUILTIN_LDDQU,
24898
24899 IX86_BUILTIN_MONITOR,
24900 IX86_BUILTIN_MWAIT,
24901
24902 /* SSSE3. */
24903 IX86_BUILTIN_PHADDW,
24904 IX86_BUILTIN_PHADDD,
24905 IX86_BUILTIN_PHADDSW,
24906 IX86_BUILTIN_PHSUBW,
24907 IX86_BUILTIN_PHSUBD,
24908 IX86_BUILTIN_PHSUBSW,
24909 IX86_BUILTIN_PMADDUBSW,
24910 IX86_BUILTIN_PMULHRSW,
24911 IX86_BUILTIN_PSHUFB,
24912 IX86_BUILTIN_PSIGNB,
24913 IX86_BUILTIN_PSIGNW,
24914 IX86_BUILTIN_PSIGND,
24915 IX86_BUILTIN_PALIGNR,
24916 IX86_BUILTIN_PABSB,
24917 IX86_BUILTIN_PABSW,
24918 IX86_BUILTIN_PABSD,
24919
24920 IX86_BUILTIN_PHADDW128,
24921 IX86_BUILTIN_PHADDD128,
24922 IX86_BUILTIN_PHADDSW128,
24923 IX86_BUILTIN_PHSUBW128,
24924 IX86_BUILTIN_PHSUBD128,
24925 IX86_BUILTIN_PHSUBSW128,
24926 IX86_BUILTIN_PMADDUBSW128,
24927 IX86_BUILTIN_PMULHRSW128,
24928 IX86_BUILTIN_PSHUFB128,
24929 IX86_BUILTIN_PSIGNB128,
24930 IX86_BUILTIN_PSIGNW128,
24931 IX86_BUILTIN_PSIGND128,
24932 IX86_BUILTIN_PALIGNR128,
24933 IX86_BUILTIN_PABSB128,
24934 IX86_BUILTIN_PABSW128,
24935 IX86_BUILTIN_PABSD128,
24936
24937 /* AMDFAM10 - SSE4A New Instructions. */
24938 IX86_BUILTIN_MOVNTSD,
24939 IX86_BUILTIN_MOVNTSS,
24940 IX86_BUILTIN_EXTRQI,
24941 IX86_BUILTIN_EXTRQ,
24942 IX86_BUILTIN_INSERTQI,
24943 IX86_BUILTIN_INSERTQ,
24944
24945 /* SSE4.1. */
24946 IX86_BUILTIN_BLENDPD,
24947 IX86_BUILTIN_BLENDPS,
24948 IX86_BUILTIN_BLENDVPD,
24949 IX86_BUILTIN_BLENDVPS,
24950 IX86_BUILTIN_PBLENDVB128,
24951 IX86_BUILTIN_PBLENDW128,
24952
24953 IX86_BUILTIN_DPPD,
24954 IX86_BUILTIN_DPPS,
24955
24956 IX86_BUILTIN_INSERTPS128,
24957
24958 IX86_BUILTIN_MOVNTDQA,
24959 IX86_BUILTIN_MPSADBW128,
24960 IX86_BUILTIN_PACKUSDW128,
24961 IX86_BUILTIN_PCMPEQQ,
24962 IX86_BUILTIN_PHMINPOSUW128,
24963
24964 IX86_BUILTIN_PMAXSB128,
24965 IX86_BUILTIN_PMAXSD128,
24966 IX86_BUILTIN_PMAXUD128,
24967 IX86_BUILTIN_PMAXUW128,
24968
24969 IX86_BUILTIN_PMINSB128,
24970 IX86_BUILTIN_PMINSD128,
24971 IX86_BUILTIN_PMINUD128,
24972 IX86_BUILTIN_PMINUW128,
24973
24974 IX86_BUILTIN_PMOVSXBW128,
24975 IX86_BUILTIN_PMOVSXBD128,
24976 IX86_BUILTIN_PMOVSXBQ128,
24977 IX86_BUILTIN_PMOVSXWD128,
24978 IX86_BUILTIN_PMOVSXWQ128,
24979 IX86_BUILTIN_PMOVSXDQ128,
24980
24981 IX86_BUILTIN_PMOVZXBW128,
24982 IX86_BUILTIN_PMOVZXBD128,
24983 IX86_BUILTIN_PMOVZXBQ128,
24984 IX86_BUILTIN_PMOVZXWD128,
24985 IX86_BUILTIN_PMOVZXWQ128,
24986 IX86_BUILTIN_PMOVZXDQ128,
24987
24988 IX86_BUILTIN_PMULDQ128,
24989 IX86_BUILTIN_PMULLD128,
24990
24991 IX86_BUILTIN_ROUNDSD,
24992 IX86_BUILTIN_ROUNDSS,
24993
24994 IX86_BUILTIN_ROUNDPD,
24995 IX86_BUILTIN_ROUNDPS,
24996
24997 IX86_BUILTIN_FLOORPD,
24998 IX86_BUILTIN_CEILPD,
24999 IX86_BUILTIN_TRUNCPD,
25000 IX86_BUILTIN_RINTPD,
25001 IX86_BUILTIN_ROUNDPD_AZ,
25002
25003 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25004 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25005 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25006
25007 IX86_BUILTIN_FLOORPS,
25008 IX86_BUILTIN_CEILPS,
25009 IX86_BUILTIN_TRUNCPS,
25010 IX86_BUILTIN_RINTPS,
25011 IX86_BUILTIN_ROUNDPS_AZ,
25012
25013 IX86_BUILTIN_FLOORPS_SFIX,
25014 IX86_BUILTIN_CEILPS_SFIX,
25015 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25016
25017 IX86_BUILTIN_PTESTZ,
25018 IX86_BUILTIN_PTESTC,
25019 IX86_BUILTIN_PTESTNZC,
25020
25021 IX86_BUILTIN_VEC_INIT_V2SI,
25022 IX86_BUILTIN_VEC_INIT_V4HI,
25023 IX86_BUILTIN_VEC_INIT_V8QI,
25024 IX86_BUILTIN_VEC_EXT_V2DF,
25025 IX86_BUILTIN_VEC_EXT_V2DI,
25026 IX86_BUILTIN_VEC_EXT_V4SF,
25027 IX86_BUILTIN_VEC_EXT_V4SI,
25028 IX86_BUILTIN_VEC_EXT_V8HI,
25029 IX86_BUILTIN_VEC_EXT_V2SI,
25030 IX86_BUILTIN_VEC_EXT_V4HI,
25031 IX86_BUILTIN_VEC_EXT_V16QI,
25032 IX86_BUILTIN_VEC_SET_V2DI,
25033 IX86_BUILTIN_VEC_SET_V4SF,
25034 IX86_BUILTIN_VEC_SET_V4SI,
25035 IX86_BUILTIN_VEC_SET_V8HI,
25036 IX86_BUILTIN_VEC_SET_V4HI,
25037 IX86_BUILTIN_VEC_SET_V16QI,
25038
25039 IX86_BUILTIN_VEC_PACK_SFIX,
25040 IX86_BUILTIN_VEC_PACK_SFIX256,
25041
25042 /* SSE4.2. */
25043 IX86_BUILTIN_CRC32QI,
25044 IX86_BUILTIN_CRC32HI,
25045 IX86_BUILTIN_CRC32SI,
25046 IX86_BUILTIN_CRC32DI,
25047
25048 IX86_BUILTIN_PCMPESTRI128,
25049 IX86_BUILTIN_PCMPESTRM128,
25050 IX86_BUILTIN_PCMPESTRA128,
25051 IX86_BUILTIN_PCMPESTRC128,
25052 IX86_BUILTIN_PCMPESTRO128,
25053 IX86_BUILTIN_PCMPESTRS128,
25054 IX86_BUILTIN_PCMPESTRZ128,
25055 IX86_BUILTIN_PCMPISTRI128,
25056 IX86_BUILTIN_PCMPISTRM128,
25057 IX86_BUILTIN_PCMPISTRA128,
25058 IX86_BUILTIN_PCMPISTRC128,
25059 IX86_BUILTIN_PCMPISTRO128,
25060 IX86_BUILTIN_PCMPISTRS128,
25061 IX86_BUILTIN_PCMPISTRZ128,
25062
25063 IX86_BUILTIN_PCMPGTQ,
25064
25065 /* AES instructions */
25066 IX86_BUILTIN_AESENC128,
25067 IX86_BUILTIN_AESENCLAST128,
25068 IX86_BUILTIN_AESDEC128,
25069 IX86_BUILTIN_AESDECLAST128,
25070 IX86_BUILTIN_AESIMC128,
25071 IX86_BUILTIN_AESKEYGENASSIST128,
25072
25073 /* PCLMUL instruction */
25074 IX86_BUILTIN_PCLMULQDQ128,
25075
25076 /* AVX */
25077 IX86_BUILTIN_ADDPD256,
25078 IX86_BUILTIN_ADDPS256,
25079 IX86_BUILTIN_ADDSUBPD256,
25080 IX86_BUILTIN_ADDSUBPS256,
25081 IX86_BUILTIN_ANDPD256,
25082 IX86_BUILTIN_ANDPS256,
25083 IX86_BUILTIN_ANDNPD256,
25084 IX86_BUILTIN_ANDNPS256,
25085 IX86_BUILTIN_BLENDPD256,
25086 IX86_BUILTIN_BLENDPS256,
25087 IX86_BUILTIN_BLENDVPD256,
25088 IX86_BUILTIN_BLENDVPS256,
25089 IX86_BUILTIN_DIVPD256,
25090 IX86_BUILTIN_DIVPS256,
25091 IX86_BUILTIN_DPPS256,
25092 IX86_BUILTIN_HADDPD256,
25093 IX86_BUILTIN_HADDPS256,
25094 IX86_BUILTIN_HSUBPD256,
25095 IX86_BUILTIN_HSUBPS256,
25096 IX86_BUILTIN_MAXPD256,
25097 IX86_BUILTIN_MAXPS256,
25098 IX86_BUILTIN_MINPD256,
25099 IX86_BUILTIN_MINPS256,
25100 IX86_BUILTIN_MULPD256,
25101 IX86_BUILTIN_MULPS256,
25102 IX86_BUILTIN_ORPD256,
25103 IX86_BUILTIN_ORPS256,
25104 IX86_BUILTIN_SHUFPD256,
25105 IX86_BUILTIN_SHUFPS256,
25106 IX86_BUILTIN_SUBPD256,
25107 IX86_BUILTIN_SUBPS256,
25108 IX86_BUILTIN_XORPD256,
25109 IX86_BUILTIN_XORPS256,
25110 IX86_BUILTIN_CMPSD,
25111 IX86_BUILTIN_CMPSS,
25112 IX86_BUILTIN_CMPPD,
25113 IX86_BUILTIN_CMPPS,
25114 IX86_BUILTIN_CMPPD256,
25115 IX86_BUILTIN_CMPPS256,
25116 IX86_BUILTIN_CVTDQ2PD256,
25117 IX86_BUILTIN_CVTDQ2PS256,
25118 IX86_BUILTIN_CVTPD2PS256,
25119 IX86_BUILTIN_CVTPS2DQ256,
25120 IX86_BUILTIN_CVTPS2PD256,
25121 IX86_BUILTIN_CVTTPD2DQ256,
25122 IX86_BUILTIN_CVTPD2DQ256,
25123 IX86_BUILTIN_CVTTPS2DQ256,
25124 IX86_BUILTIN_EXTRACTF128PD256,
25125 IX86_BUILTIN_EXTRACTF128PS256,
25126 IX86_BUILTIN_EXTRACTF128SI256,
25127 IX86_BUILTIN_VZEROALL,
25128 IX86_BUILTIN_VZEROUPPER,
25129 IX86_BUILTIN_VPERMILVARPD,
25130 IX86_BUILTIN_VPERMILVARPS,
25131 IX86_BUILTIN_VPERMILVARPD256,
25132 IX86_BUILTIN_VPERMILVARPS256,
25133 IX86_BUILTIN_VPERMILPD,
25134 IX86_BUILTIN_VPERMILPS,
25135 IX86_BUILTIN_VPERMILPD256,
25136 IX86_BUILTIN_VPERMILPS256,
25137 IX86_BUILTIN_VPERMIL2PD,
25138 IX86_BUILTIN_VPERMIL2PS,
25139 IX86_BUILTIN_VPERMIL2PD256,
25140 IX86_BUILTIN_VPERMIL2PS256,
25141 IX86_BUILTIN_VPERM2F128PD256,
25142 IX86_BUILTIN_VPERM2F128PS256,
25143 IX86_BUILTIN_VPERM2F128SI256,
25144 IX86_BUILTIN_VBROADCASTSS,
25145 IX86_BUILTIN_VBROADCASTSD256,
25146 IX86_BUILTIN_VBROADCASTSS256,
25147 IX86_BUILTIN_VBROADCASTPD256,
25148 IX86_BUILTIN_VBROADCASTPS256,
25149 IX86_BUILTIN_VINSERTF128PD256,
25150 IX86_BUILTIN_VINSERTF128PS256,
25151 IX86_BUILTIN_VINSERTF128SI256,
25152 IX86_BUILTIN_LOADUPD256,
25153 IX86_BUILTIN_LOADUPS256,
25154 IX86_BUILTIN_STOREUPD256,
25155 IX86_BUILTIN_STOREUPS256,
25156 IX86_BUILTIN_LDDQU256,
25157 IX86_BUILTIN_MOVNTDQ256,
25158 IX86_BUILTIN_MOVNTPD256,
25159 IX86_BUILTIN_MOVNTPS256,
25160 IX86_BUILTIN_LOADDQU256,
25161 IX86_BUILTIN_STOREDQU256,
25162 IX86_BUILTIN_MASKLOADPD,
25163 IX86_BUILTIN_MASKLOADPS,
25164 IX86_BUILTIN_MASKSTOREPD,
25165 IX86_BUILTIN_MASKSTOREPS,
25166 IX86_BUILTIN_MASKLOADPD256,
25167 IX86_BUILTIN_MASKLOADPS256,
25168 IX86_BUILTIN_MASKSTOREPD256,
25169 IX86_BUILTIN_MASKSTOREPS256,
25170 IX86_BUILTIN_MOVSHDUP256,
25171 IX86_BUILTIN_MOVSLDUP256,
25172 IX86_BUILTIN_MOVDDUP256,
25173
25174 IX86_BUILTIN_SQRTPD256,
25175 IX86_BUILTIN_SQRTPS256,
25176 IX86_BUILTIN_SQRTPS_NR256,
25177 IX86_BUILTIN_RSQRTPS256,
25178 IX86_BUILTIN_RSQRTPS_NR256,
25179
25180 IX86_BUILTIN_RCPPS256,
25181
25182 IX86_BUILTIN_ROUNDPD256,
25183 IX86_BUILTIN_ROUNDPS256,
25184
25185 IX86_BUILTIN_FLOORPD256,
25186 IX86_BUILTIN_CEILPD256,
25187 IX86_BUILTIN_TRUNCPD256,
25188 IX86_BUILTIN_RINTPD256,
25189 IX86_BUILTIN_ROUNDPD_AZ256,
25190
25191 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
25192 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
25193 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
25194
25195 IX86_BUILTIN_FLOORPS256,
25196 IX86_BUILTIN_CEILPS256,
25197 IX86_BUILTIN_TRUNCPS256,
25198 IX86_BUILTIN_RINTPS256,
25199 IX86_BUILTIN_ROUNDPS_AZ256,
25200
25201 IX86_BUILTIN_FLOORPS_SFIX256,
25202 IX86_BUILTIN_CEILPS_SFIX256,
25203 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
25204
25205 IX86_BUILTIN_UNPCKHPD256,
25206 IX86_BUILTIN_UNPCKLPD256,
25207 IX86_BUILTIN_UNPCKHPS256,
25208 IX86_BUILTIN_UNPCKLPS256,
25209
25210 IX86_BUILTIN_SI256_SI,
25211 IX86_BUILTIN_PS256_PS,
25212 IX86_BUILTIN_PD256_PD,
25213 IX86_BUILTIN_SI_SI256,
25214 IX86_BUILTIN_PS_PS256,
25215 IX86_BUILTIN_PD_PD256,
25216
25217 IX86_BUILTIN_VTESTZPD,
25218 IX86_BUILTIN_VTESTCPD,
25219 IX86_BUILTIN_VTESTNZCPD,
25220 IX86_BUILTIN_VTESTZPS,
25221 IX86_BUILTIN_VTESTCPS,
25222 IX86_BUILTIN_VTESTNZCPS,
25223 IX86_BUILTIN_VTESTZPD256,
25224 IX86_BUILTIN_VTESTCPD256,
25225 IX86_BUILTIN_VTESTNZCPD256,
25226 IX86_BUILTIN_VTESTZPS256,
25227 IX86_BUILTIN_VTESTCPS256,
25228 IX86_BUILTIN_VTESTNZCPS256,
25229 IX86_BUILTIN_PTESTZ256,
25230 IX86_BUILTIN_PTESTC256,
25231 IX86_BUILTIN_PTESTNZC256,
25232
25233 IX86_BUILTIN_MOVMSKPD256,
25234 IX86_BUILTIN_MOVMSKPS256,
25235
25236 /* AVX2 */
25237 IX86_BUILTIN_MPSADBW256,
25238 IX86_BUILTIN_PABSB256,
25239 IX86_BUILTIN_PABSW256,
25240 IX86_BUILTIN_PABSD256,
25241 IX86_BUILTIN_PACKSSDW256,
25242 IX86_BUILTIN_PACKSSWB256,
25243 IX86_BUILTIN_PACKUSDW256,
25244 IX86_BUILTIN_PACKUSWB256,
25245 IX86_BUILTIN_PADDB256,
25246 IX86_BUILTIN_PADDW256,
25247 IX86_BUILTIN_PADDD256,
25248 IX86_BUILTIN_PADDQ256,
25249 IX86_BUILTIN_PADDSB256,
25250 IX86_BUILTIN_PADDSW256,
25251 IX86_BUILTIN_PADDUSB256,
25252 IX86_BUILTIN_PADDUSW256,
25253 IX86_BUILTIN_PALIGNR256,
25254 IX86_BUILTIN_AND256I,
25255 IX86_BUILTIN_ANDNOT256I,
25256 IX86_BUILTIN_PAVGB256,
25257 IX86_BUILTIN_PAVGW256,
25258 IX86_BUILTIN_PBLENDVB256,
25259 IX86_BUILTIN_PBLENDVW256,
25260 IX86_BUILTIN_PCMPEQB256,
25261 IX86_BUILTIN_PCMPEQW256,
25262 IX86_BUILTIN_PCMPEQD256,
25263 IX86_BUILTIN_PCMPEQQ256,
25264 IX86_BUILTIN_PCMPGTB256,
25265 IX86_BUILTIN_PCMPGTW256,
25266 IX86_BUILTIN_PCMPGTD256,
25267 IX86_BUILTIN_PCMPGTQ256,
25268 IX86_BUILTIN_PHADDW256,
25269 IX86_BUILTIN_PHADDD256,
25270 IX86_BUILTIN_PHADDSW256,
25271 IX86_BUILTIN_PHSUBW256,
25272 IX86_BUILTIN_PHSUBD256,
25273 IX86_BUILTIN_PHSUBSW256,
25274 IX86_BUILTIN_PMADDUBSW256,
25275 IX86_BUILTIN_PMADDWD256,
25276 IX86_BUILTIN_PMAXSB256,
25277 IX86_BUILTIN_PMAXSW256,
25278 IX86_BUILTIN_PMAXSD256,
25279 IX86_BUILTIN_PMAXUB256,
25280 IX86_BUILTIN_PMAXUW256,
25281 IX86_BUILTIN_PMAXUD256,
25282 IX86_BUILTIN_PMINSB256,
25283 IX86_BUILTIN_PMINSW256,
25284 IX86_BUILTIN_PMINSD256,
25285 IX86_BUILTIN_PMINUB256,
25286 IX86_BUILTIN_PMINUW256,
25287 IX86_BUILTIN_PMINUD256,
25288 IX86_BUILTIN_PMOVMSKB256,
25289 IX86_BUILTIN_PMOVSXBW256,
25290 IX86_BUILTIN_PMOVSXBD256,
25291 IX86_BUILTIN_PMOVSXBQ256,
25292 IX86_BUILTIN_PMOVSXWD256,
25293 IX86_BUILTIN_PMOVSXWQ256,
25294 IX86_BUILTIN_PMOVSXDQ256,
25295 IX86_BUILTIN_PMOVZXBW256,
25296 IX86_BUILTIN_PMOVZXBD256,
25297 IX86_BUILTIN_PMOVZXBQ256,
25298 IX86_BUILTIN_PMOVZXWD256,
25299 IX86_BUILTIN_PMOVZXWQ256,
25300 IX86_BUILTIN_PMOVZXDQ256,
25301 IX86_BUILTIN_PMULDQ256,
25302 IX86_BUILTIN_PMULHRSW256,
25303 IX86_BUILTIN_PMULHUW256,
25304 IX86_BUILTIN_PMULHW256,
25305 IX86_BUILTIN_PMULLW256,
25306 IX86_BUILTIN_PMULLD256,
25307 IX86_BUILTIN_PMULUDQ256,
25308 IX86_BUILTIN_POR256,
25309 IX86_BUILTIN_PSADBW256,
25310 IX86_BUILTIN_PSHUFB256,
25311 IX86_BUILTIN_PSHUFD256,
25312 IX86_BUILTIN_PSHUFHW256,
25313 IX86_BUILTIN_PSHUFLW256,
25314 IX86_BUILTIN_PSIGNB256,
25315 IX86_BUILTIN_PSIGNW256,
25316 IX86_BUILTIN_PSIGND256,
25317 IX86_BUILTIN_PSLLDQI256,
25318 IX86_BUILTIN_PSLLWI256,
25319 IX86_BUILTIN_PSLLW256,
25320 IX86_BUILTIN_PSLLDI256,
25321 IX86_BUILTIN_PSLLD256,
25322 IX86_BUILTIN_PSLLQI256,
25323 IX86_BUILTIN_PSLLQ256,
25324 IX86_BUILTIN_PSRAWI256,
25325 IX86_BUILTIN_PSRAW256,
25326 IX86_BUILTIN_PSRADI256,
25327 IX86_BUILTIN_PSRAD256,
25328 IX86_BUILTIN_PSRLDQI256,
25329 IX86_BUILTIN_PSRLWI256,
25330 IX86_BUILTIN_PSRLW256,
25331 IX86_BUILTIN_PSRLDI256,
25332 IX86_BUILTIN_PSRLD256,
25333 IX86_BUILTIN_PSRLQI256,
25334 IX86_BUILTIN_PSRLQ256,
25335 IX86_BUILTIN_PSUBB256,
25336 IX86_BUILTIN_PSUBW256,
25337 IX86_BUILTIN_PSUBD256,
25338 IX86_BUILTIN_PSUBQ256,
25339 IX86_BUILTIN_PSUBSB256,
25340 IX86_BUILTIN_PSUBSW256,
25341 IX86_BUILTIN_PSUBUSB256,
25342 IX86_BUILTIN_PSUBUSW256,
25343 IX86_BUILTIN_PUNPCKHBW256,
25344 IX86_BUILTIN_PUNPCKHWD256,
25345 IX86_BUILTIN_PUNPCKHDQ256,
25346 IX86_BUILTIN_PUNPCKHQDQ256,
25347 IX86_BUILTIN_PUNPCKLBW256,
25348 IX86_BUILTIN_PUNPCKLWD256,
25349 IX86_BUILTIN_PUNPCKLDQ256,
25350 IX86_BUILTIN_PUNPCKLQDQ256,
25351 IX86_BUILTIN_PXOR256,
25352 IX86_BUILTIN_MOVNTDQA256,
25353 IX86_BUILTIN_VBROADCASTSS_PS,
25354 IX86_BUILTIN_VBROADCASTSS_PS256,
25355 IX86_BUILTIN_VBROADCASTSD_PD256,
25356 IX86_BUILTIN_VBROADCASTSI256,
25357 IX86_BUILTIN_PBLENDD256,
25358 IX86_BUILTIN_PBLENDD128,
25359 IX86_BUILTIN_PBROADCASTB256,
25360 IX86_BUILTIN_PBROADCASTW256,
25361 IX86_BUILTIN_PBROADCASTD256,
25362 IX86_BUILTIN_PBROADCASTQ256,
25363 IX86_BUILTIN_PBROADCASTB128,
25364 IX86_BUILTIN_PBROADCASTW128,
25365 IX86_BUILTIN_PBROADCASTD128,
25366 IX86_BUILTIN_PBROADCASTQ128,
25367 IX86_BUILTIN_VPERMVARSI256,
25368 IX86_BUILTIN_VPERMDF256,
25369 IX86_BUILTIN_VPERMVARSF256,
25370 IX86_BUILTIN_VPERMDI256,
25371 IX86_BUILTIN_VPERMTI256,
25372 IX86_BUILTIN_VEXTRACT128I256,
25373 IX86_BUILTIN_VINSERT128I256,
25374 IX86_BUILTIN_MASKLOADD,
25375 IX86_BUILTIN_MASKLOADQ,
25376 IX86_BUILTIN_MASKLOADD256,
25377 IX86_BUILTIN_MASKLOADQ256,
25378 IX86_BUILTIN_MASKSTORED,
25379 IX86_BUILTIN_MASKSTOREQ,
25380 IX86_BUILTIN_MASKSTORED256,
25381 IX86_BUILTIN_MASKSTOREQ256,
25382 IX86_BUILTIN_PSLLVV4DI,
25383 IX86_BUILTIN_PSLLVV2DI,
25384 IX86_BUILTIN_PSLLVV8SI,
25385 IX86_BUILTIN_PSLLVV4SI,
25386 IX86_BUILTIN_PSRAVV8SI,
25387 IX86_BUILTIN_PSRAVV4SI,
25388 IX86_BUILTIN_PSRLVV4DI,
25389 IX86_BUILTIN_PSRLVV2DI,
25390 IX86_BUILTIN_PSRLVV8SI,
25391 IX86_BUILTIN_PSRLVV4SI,
25392
25393 IX86_BUILTIN_GATHERSIV2DF,
25394 IX86_BUILTIN_GATHERSIV4DF,
25395 IX86_BUILTIN_GATHERDIV2DF,
25396 IX86_BUILTIN_GATHERDIV4DF,
25397 IX86_BUILTIN_GATHERSIV4SF,
25398 IX86_BUILTIN_GATHERSIV8SF,
25399 IX86_BUILTIN_GATHERDIV4SF,
25400 IX86_BUILTIN_GATHERDIV8SF,
25401 IX86_BUILTIN_GATHERSIV2DI,
25402 IX86_BUILTIN_GATHERSIV4DI,
25403 IX86_BUILTIN_GATHERDIV2DI,
25404 IX86_BUILTIN_GATHERDIV4DI,
25405 IX86_BUILTIN_GATHERSIV4SI,
25406 IX86_BUILTIN_GATHERSIV8SI,
25407 IX86_BUILTIN_GATHERDIV4SI,
25408 IX86_BUILTIN_GATHERDIV8SI,
25409
25410 /* Alternate 4 element gather for the vectorizer where
25411 all operands are 32-byte wide. */
25412 IX86_BUILTIN_GATHERALTSIV4DF,
25413 IX86_BUILTIN_GATHERALTDIV8SF,
25414 IX86_BUILTIN_GATHERALTSIV4DI,
25415 IX86_BUILTIN_GATHERALTDIV8SI,
25416
25417 /* TFmode support builtins. */
25418 IX86_BUILTIN_INFQ,
25419 IX86_BUILTIN_HUGE_VALQ,
25420 IX86_BUILTIN_FABSQ,
25421 IX86_BUILTIN_COPYSIGNQ,
25422
25423 /* Vectorizer support builtins. */
25424 IX86_BUILTIN_CPYSGNPS,
25425 IX86_BUILTIN_CPYSGNPD,
25426 IX86_BUILTIN_CPYSGNPS256,
25427 IX86_BUILTIN_CPYSGNPD256,
25428
25429 /* FMA4 instructions. */
25430 IX86_BUILTIN_VFMADDSS,
25431 IX86_BUILTIN_VFMADDSD,
25432 IX86_BUILTIN_VFMADDPS,
25433 IX86_BUILTIN_VFMADDPD,
25434 IX86_BUILTIN_VFMADDPS256,
25435 IX86_BUILTIN_VFMADDPD256,
25436 IX86_BUILTIN_VFMADDSUBPS,
25437 IX86_BUILTIN_VFMADDSUBPD,
25438 IX86_BUILTIN_VFMADDSUBPS256,
25439 IX86_BUILTIN_VFMADDSUBPD256,
25440
25441 /* FMA3 instructions. */
25442 IX86_BUILTIN_VFMADDSS3,
25443 IX86_BUILTIN_VFMADDSD3,
25444
25445 /* XOP instructions. */
25446 IX86_BUILTIN_VPCMOV,
25447 IX86_BUILTIN_VPCMOV_V2DI,
25448 IX86_BUILTIN_VPCMOV_V4SI,
25449 IX86_BUILTIN_VPCMOV_V8HI,
25450 IX86_BUILTIN_VPCMOV_V16QI,
25451 IX86_BUILTIN_VPCMOV_V4SF,
25452 IX86_BUILTIN_VPCMOV_V2DF,
25453 IX86_BUILTIN_VPCMOV256,
25454 IX86_BUILTIN_VPCMOV_V4DI256,
25455 IX86_BUILTIN_VPCMOV_V8SI256,
25456 IX86_BUILTIN_VPCMOV_V16HI256,
25457 IX86_BUILTIN_VPCMOV_V32QI256,
25458 IX86_BUILTIN_VPCMOV_V8SF256,
25459 IX86_BUILTIN_VPCMOV_V4DF256,
25460
25461 IX86_BUILTIN_VPPERM,
25462
25463 IX86_BUILTIN_VPMACSSWW,
25464 IX86_BUILTIN_VPMACSWW,
25465 IX86_BUILTIN_VPMACSSWD,
25466 IX86_BUILTIN_VPMACSWD,
25467 IX86_BUILTIN_VPMACSSDD,
25468 IX86_BUILTIN_VPMACSDD,
25469 IX86_BUILTIN_VPMACSSDQL,
25470 IX86_BUILTIN_VPMACSSDQH,
25471 IX86_BUILTIN_VPMACSDQL,
25472 IX86_BUILTIN_VPMACSDQH,
25473 IX86_BUILTIN_VPMADCSSWD,
25474 IX86_BUILTIN_VPMADCSWD,
25475
25476 IX86_BUILTIN_VPHADDBW,
25477 IX86_BUILTIN_VPHADDBD,
25478 IX86_BUILTIN_VPHADDBQ,
25479 IX86_BUILTIN_VPHADDWD,
25480 IX86_BUILTIN_VPHADDWQ,
25481 IX86_BUILTIN_VPHADDDQ,
25482 IX86_BUILTIN_VPHADDUBW,
25483 IX86_BUILTIN_VPHADDUBD,
25484 IX86_BUILTIN_VPHADDUBQ,
25485 IX86_BUILTIN_VPHADDUWD,
25486 IX86_BUILTIN_VPHADDUWQ,
25487 IX86_BUILTIN_VPHADDUDQ,
25488 IX86_BUILTIN_VPHSUBBW,
25489 IX86_BUILTIN_VPHSUBWD,
25490 IX86_BUILTIN_VPHSUBDQ,
25491
25492 IX86_BUILTIN_VPROTB,
25493 IX86_BUILTIN_VPROTW,
25494 IX86_BUILTIN_VPROTD,
25495 IX86_BUILTIN_VPROTQ,
25496 IX86_BUILTIN_VPROTB_IMM,
25497 IX86_BUILTIN_VPROTW_IMM,
25498 IX86_BUILTIN_VPROTD_IMM,
25499 IX86_BUILTIN_VPROTQ_IMM,
25500
25501 IX86_BUILTIN_VPSHLB,
25502 IX86_BUILTIN_VPSHLW,
25503 IX86_BUILTIN_VPSHLD,
25504 IX86_BUILTIN_VPSHLQ,
25505 IX86_BUILTIN_VPSHAB,
25506 IX86_BUILTIN_VPSHAW,
25507 IX86_BUILTIN_VPSHAD,
25508 IX86_BUILTIN_VPSHAQ,
25509
25510 IX86_BUILTIN_VFRCZSS,
25511 IX86_BUILTIN_VFRCZSD,
25512 IX86_BUILTIN_VFRCZPS,
25513 IX86_BUILTIN_VFRCZPD,
25514 IX86_BUILTIN_VFRCZPS256,
25515 IX86_BUILTIN_VFRCZPD256,
25516
25517 IX86_BUILTIN_VPCOMEQUB,
25518 IX86_BUILTIN_VPCOMNEUB,
25519 IX86_BUILTIN_VPCOMLTUB,
25520 IX86_BUILTIN_VPCOMLEUB,
25521 IX86_BUILTIN_VPCOMGTUB,
25522 IX86_BUILTIN_VPCOMGEUB,
25523 IX86_BUILTIN_VPCOMFALSEUB,
25524 IX86_BUILTIN_VPCOMTRUEUB,
25525
25526 IX86_BUILTIN_VPCOMEQUW,
25527 IX86_BUILTIN_VPCOMNEUW,
25528 IX86_BUILTIN_VPCOMLTUW,
25529 IX86_BUILTIN_VPCOMLEUW,
25530 IX86_BUILTIN_VPCOMGTUW,
25531 IX86_BUILTIN_VPCOMGEUW,
25532 IX86_BUILTIN_VPCOMFALSEUW,
25533 IX86_BUILTIN_VPCOMTRUEUW,
25534
25535 IX86_BUILTIN_VPCOMEQUD,
25536 IX86_BUILTIN_VPCOMNEUD,
25537 IX86_BUILTIN_VPCOMLTUD,
25538 IX86_BUILTIN_VPCOMLEUD,
25539 IX86_BUILTIN_VPCOMGTUD,
25540 IX86_BUILTIN_VPCOMGEUD,
25541 IX86_BUILTIN_VPCOMFALSEUD,
25542 IX86_BUILTIN_VPCOMTRUEUD,
25543
25544 IX86_BUILTIN_VPCOMEQUQ,
25545 IX86_BUILTIN_VPCOMNEUQ,
25546 IX86_BUILTIN_VPCOMLTUQ,
25547 IX86_BUILTIN_VPCOMLEUQ,
25548 IX86_BUILTIN_VPCOMGTUQ,
25549 IX86_BUILTIN_VPCOMGEUQ,
25550 IX86_BUILTIN_VPCOMFALSEUQ,
25551 IX86_BUILTIN_VPCOMTRUEUQ,
25552
25553 IX86_BUILTIN_VPCOMEQB,
25554 IX86_BUILTIN_VPCOMNEB,
25555 IX86_BUILTIN_VPCOMLTB,
25556 IX86_BUILTIN_VPCOMLEB,
25557 IX86_BUILTIN_VPCOMGTB,
25558 IX86_BUILTIN_VPCOMGEB,
25559 IX86_BUILTIN_VPCOMFALSEB,
25560 IX86_BUILTIN_VPCOMTRUEB,
25561
25562 IX86_BUILTIN_VPCOMEQW,
25563 IX86_BUILTIN_VPCOMNEW,
25564 IX86_BUILTIN_VPCOMLTW,
25565 IX86_BUILTIN_VPCOMLEW,
25566 IX86_BUILTIN_VPCOMGTW,
25567 IX86_BUILTIN_VPCOMGEW,
25568 IX86_BUILTIN_VPCOMFALSEW,
25569 IX86_BUILTIN_VPCOMTRUEW,
25570
25571 IX86_BUILTIN_VPCOMEQD,
25572 IX86_BUILTIN_VPCOMNED,
25573 IX86_BUILTIN_VPCOMLTD,
25574 IX86_BUILTIN_VPCOMLED,
25575 IX86_BUILTIN_VPCOMGTD,
25576 IX86_BUILTIN_VPCOMGED,
25577 IX86_BUILTIN_VPCOMFALSED,
25578 IX86_BUILTIN_VPCOMTRUED,
25579
25580 IX86_BUILTIN_VPCOMEQQ,
25581 IX86_BUILTIN_VPCOMNEQ,
25582 IX86_BUILTIN_VPCOMLTQ,
25583 IX86_BUILTIN_VPCOMLEQ,
25584 IX86_BUILTIN_VPCOMGTQ,
25585 IX86_BUILTIN_VPCOMGEQ,
25586 IX86_BUILTIN_VPCOMFALSEQ,
25587 IX86_BUILTIN_VPCOMTRUEQ,
25588
25589 /* LWP instructions. */
25590 IX86_BUILTIN_LLWPCB,
25591 IX86_BUILTIN_SLWPCB,
25592 IX86_BUILTIN_LWPVAL32,
25593 IX86_BUILTIN_LWPVAL64,
25594 IX86_BUILTIN_LWPINS32,
25595 IX86_BUILTIN_LWPINS64,
25596
25597 IX86_BUILTIN_CLZS,
25598
25599 /* BMI instructions. */
25600 IX86_BUILTIN_BEXTR32,
25601 IX86_BUILTIN_BEXTR64,
25602 IX86_BUILTIN_CTZS,
25603
25604 /* TBM instructions. */
25605 IX86_BUILTIN_BEXTRI32,
25606 IX86_BUILTIN_BEXTRI64,
25607
25608 /* BMI2 instructions. */
25609 IX86_BUILTIN_BZHI32,
25610 IX86_BUILTIN_BZHI64,
25611 IX86_BUILTIN_PDEP32,
25612 IX86_BUILTIN_PDEP64,
25613 IX86_BUILTIN_PEXT32,
25614 IX86_BUILTIN_PEXT64,
25615
25616 /* FSGSBASE instructions. */
25617 IX86_BUILTIN_RDFSBASE32,
25618 IX86_BUILTIN_RDFSBASE64,
25619 IX86_BUILTIN_RDGSBASE32,
25620 IX86_BUILTIN_RDGSBASE64,
25621 IX86_BUILTIN_WRFSBASE32,
25622 IX86_BUILTIN_WRFSBASE64,
25623 IX86_BUILTIN_WRGSBASE32,
25624 IX86_BUILTIN_WRGSBASE64,
25625
25626 /* RDRND instructions. */
25627 IX86_BUILTIN_RDRAND16_STEP,
25628 IX86_BUILTIN_RDRAND32_STEP,
25629 IX86_BUILTIN_RDRAND64_STEP,
25630
25631 /* F16C instructions. */
25632 IX86_BUILTIN_CVTPH2PS,
25633 IX86_BUILTIN_CVTPH2PS256,
25634 IX86_BUILTIN_CVTPS2PH,
25635 IX86_BUILTIN_CVTPS2PH256,
25636
25637    /* CFString built-in for Darwin.  */
25638 IX86_BUILTIN_CFSTRING,
25639
25640 IX86_BUILTIN_MAX
25641 };
25642
25643 /* Table for the ix86 builtin decls. */
25644 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
25645
25646 /* Table of all of the builtin functions that are possible with different ISAs,
25647    but are waiting to be built until a function is declared to use that
25648    ISA.  */
25649 struct builtin_isa {
25650 const char *name; /* function name */
25651 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
25652 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
25653 bool const_p; /* true if the declaration is constant */
25654   bool set_and_not_built_p;		/* true if deferred: recorded but not yet built */
25655 };
25656
25657 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
25658
25659
25660 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
25661 of which isa_flags to use in the ix86_builtins_isa array. Stores the
25662 function decl in the ix86_builtins array. Returns the function decl or
25663 NULL_TREE, if the builtin was not added.
25664
25665 If the front end has a special hook for builtin functions, delay adding
25666 builtin functions that aren't in the current ISA until the ISA is changed
25667    with function specific optimization.  Doing so can save about 300K for the
25668 default compiler. When the builtin is expanded, check at that time whether
25669 it is valid.
25670
25671    If the front end doesn't have a special hook, record all builtins, even
25672    those that aren't in the current ISA, in case the user uses function
25673    specific options for a different ISA, so that we don't get scope errors
25674    if a builtin is added in the middle of a function scope.  */
25675
25676 static inline tree
25677 def_builtin (HOST_WIDE_INT mask, const char *name,
25678 enum ix86_builtin_func_type tcode,
25679 enum ix86_builtins code)
25680 {
25681 tree decl = NULL_TREE;
25682
25683 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
25684 {
25685 ix86_builtins_isa[(int) code].isa = mask;
25686
25687 mask &= ~OPTION_MASK_ISA_64BIT;
25688 if (mask == 0
25689 || (mask & ix86_isa_flags) != 0
25690 || (lang_hooks.builtin_function
25691 == lang_hooks.builtin_function_ext_scope))
25692
25693 {
25694 tree type = ix86_get_builtin_func_type (tcode);
25695 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
25696 NULL, NULL_TREE);
25697 ix86_builtins[(int) code] = decl;
25698 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
25699 }
25700 else
25701 {
25702 ix86_builtins[(int) code] = NULL_TREE;
25703 ix86_builtins_isa[(int) code].tcode = tcode;
25704 ix86_builtins_isa[(int) code].name = name;
25705 ix86_builtins_isa[(int) code].const_p = false;
25706 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
25707 }
25708 }
25709
25710 return decl;
25711 }
25712
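/* Illustration only (the real registrations are driven from the bdesc_*
   tables below): a single registration through this interface looks like

     def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_loadupd",
                  V2DF_FTYPE_PCDOUBLE, IX86_BUILTIN_LOADUPD);

   With SSE2 enabled on the command line the decl is built immediately;
   otherwise only the ix86_builtins_isa entry is recorded and the decl is
   built later by ix86_add_new_builtins.  */
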
25713 /* Like def_builtin, but also marks the function decl "const". */
25714
25715 static inline tree
25716 def_builtin_const (HOST_WIDE_INT mask, const char *name,
25717 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
25718 {
25719 tree decl = def_builtin (mask, name, tcode, code);
25720 if (decl)
25721 TREE_READONLY (decl) = 1;
25722 else
25723 ix86_builtins_isa[(int) code].const_p = true;
25724
25725 return decl;
25726 }
25727
25728 /* Add any new builtin functions for a given ISA that may not have been
25729 declared. This saves a bit of space compared to adding all of the
25730    declarations to the tree, even when they are not used.  */
25731
25732 static void
25733 ix86_add_new_builtins (HOST_WIDE_INT isa)
25734 {
25735 int i;
25736
25737 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
25738 {
25739 if ((ix86_builtins_isa[i].isa & isa) != 0
25740 && ix86_builtins_isa[i].set_and_not_built_p)
25741 {
25742 tree decl, type;
25743
25744 /* Don't define the builtin again. */
25745 ix86_builtins_isa[i].set_and_not_built_p = false;
25746
25747 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
25748 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
25749 type, i, BUILT_IN_MD, NULL,
25750 NULL_TREE);
25751
25752 ix86_builtins[i] = decl;
25753 if (ix86_builtins_isa[i].const_p)
25754 TREE_READONLY (decl) = 1;
25755 }
25756 }
25757 }
25758
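/* Illustrative sketch of when the above is used (hypothetical function
   name; the exact call path is simplified): compiling with plain -msse2,
   the AVX builtins are only recorded by def_builtin, not built.  A later
   definition such as

     __attribute__((target ("avx")))
     void use_avx (void) { __builtin_ia32_vzeroupper (); }

   switches the active ISA via the target attribute machinery, which calls
   ix86_add_new_builtins with the new isa flags so that the deferred AVX
   decls become visible at that point.  */
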
25759 /* Bits for builtin_description.flag. */
25760
25761 /* Set when we don't support the comparison natively, and should
25762 swap_comparison in order to support it. */
25763 #define BUILTIN_DESC_SWAP_OPERANDS 1
25764
25765 struct builtin_description
25766 {
25767 const HOST_WIDE_INT mask;
25768 const enum insn_code icode;
25769 const char *const name;
25770 const enum ix86_builtins code;
25771 const enum rtx_code comparison;
25772 const int flag;
25773 };
25774
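/* For orientation, one real entry from bdesc_args below, annotated field
   by field (the flag field is table specific: usually a function type,
   but a flags value or CC mode in some tables):

     { OPTION_MASK_ISA_SSE,            mask: ISA that must be enabled
       CODE_FOR_addv4sf3,              icode: insn pattern to expand to
       "__builtin_ia32_addps",         name: user-visible builtin name
       IX86_BUILTIN_ADDPS,             code: enum ix86_builtins value
       UNKNOWN,                        comparison: rtx code for compares
       (int) V4SF_FTYPE_V4SF_V4SF }    flag: here, the function type.  */
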
25775 static const struct builtin_description bdesc_comi[] =
25776 {
25777 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
25778 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
25779 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
25780 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
25781 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
25782 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
25783 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
25784 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
25785 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
25786 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
25787 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
25788 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
25789 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
25790 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
25791 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
25792 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
25793 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
25794 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
25795 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
25796 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
25797 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
25798 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
25799 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
25800 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
25801 };
25802
25803 static const struct builtin_description bdesc_pcmpestr[] =
25804 {
25805 /* SSE4.2 */
25806 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
25807 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
25808 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
25809 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
25810 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
25811 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
25812 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
25813 };
25814
25815 static const struct builtin_description bdesc_pcmpistr[] =
25816 {
25817 /* SSE4.2 */
25818 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
25819 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
25820 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
25821 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
25822 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
25823 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
25824 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
25825 };
25826
25827 /* Special builtins with variable number of arguments. */
25828 static const struct builtin_description bdesc_special_args[] =
25829 {
25830 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
25831 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
25832 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
25833
25834 /* MMX */
25835 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25836
25837 /* 3DNow! */
25838 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
25839
25840 /* SSE */
25841 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25842 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25843 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25844
25845 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25846 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
25847 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25848 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
25849
25850 /* SSE or 3DNow!A */
25851 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25852 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
25853
25854 /* SSE2 */
25855 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25856 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
25857 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25858 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
25859 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25860 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
25861 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
25862 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
25863 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
25864 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25865
25866 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25867 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
25868
25869 /* SSE3 */
25870 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
25871
25872 /* SSE4.1 */
25873 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
25874
25875 /* SSE4A */
25876 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
25877 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
25878
25879 /* AVX */
25880 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
25881 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
25882
25883 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
25884 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25885 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25886 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
25887 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
25888
25889 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
25890 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
25891 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25892 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25893 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25894 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
25895 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
25896
25897 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
25898 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
25899 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
25900
25901 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
25902 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
25903 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
25904 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
25905 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
25906 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
25907 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
25908 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
25909
25910 /* AVX2 */
25911 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
25912 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
25913 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
25914 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
25915 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
25916 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
25917 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
25918 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
25919 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
25920
25921 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
25922 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
25923 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
25924 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
25925 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
25926 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
25927
25928 /* FSGSBASE */
25929 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25930 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25931 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
25932 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
25933 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25934 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25935 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
25936 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
25937 };
25938
25939 /* Builtins with variable number of arguments. */
25940 static const struct builtin_description bdesc_args[] =
25941 {
25942 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
25943 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
25944 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
25945 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
25946 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
25947 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
25948 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
25949
25950 /* MMX */
25951 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25952 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25953 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25954 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25955 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25956 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25957
25958 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25959 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25960 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25961 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25962 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25963 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25964 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25965 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25966
25967 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25968 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25969
25970 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25971 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25972 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25973 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25974
25975 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25976 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25977 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25978 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25979 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25980 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25981
25982 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25983 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
25984 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
25985 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
25986 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
25987 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
25988
25989 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
25990 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
25991 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
25992
25993 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
25994
25995 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
25996 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
25997 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
25998 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
25999 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26000 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26001
26002 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26003 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26004 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26005 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26006 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26007 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26008
26009 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26010 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26011 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26012 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26013
26014 /* 3DNow! */
26015 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26016 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26017 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26018 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26019
26020 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26021 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26022 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26023 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26024 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26025 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26026 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26027 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26028 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26029 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26030 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26031 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26032 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26033 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26034 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26035
26036 /* 3DNow!A */
26037 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26038 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26039 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26040 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26041 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26042 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26043
26044 /* SSE */
26045 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26046 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26047 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26048 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26049 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26050 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26051 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26052 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26053 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26054 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26055 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26056 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26057
26058 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26059
26060 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26061 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26062 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26063 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26064 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26065 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26066 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26067 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26068
26069 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26070 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26071 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26072 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26073 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26074 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26075 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26076 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26077 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26078 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26079 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
26080 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26081 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26082 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26083 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26084 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26085 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26086 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26087 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26088 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26089 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26090 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26091
26092 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26093 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26094 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26095 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26096
26097 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26098 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26099 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26100 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26101
26102 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26103
26104 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26105 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26106 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26107 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26108 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26109
26110 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
26111 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
26112 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, V4SF_FTYPE_V4SF_DI },
26113
26114 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
26115
26116 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26117 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26118 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
26119
26120   /* SSE MMX or 3DNow!A */
26121 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26122 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26123 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26124
26125 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26126 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26127 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26128 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26129
26130 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
26131 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
26132
26133 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
26134
26135 /* SSE2 */
26136 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26137
26138 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
26139 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
26140 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26141 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
26142 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
26143
26144 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26145 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26146 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
26147 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
26148 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
26149
26150 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
26151
26152 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26153 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
26154 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26155 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
26156
26157 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26158 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
26159 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26160
26161 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26162 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26163 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26164 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26165 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26166 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26167 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26168 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26169
26170 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26171 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26172 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26173 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26174 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
26175 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26176 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26177 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26178 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26179 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26180 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
26181 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26182 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
26183 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
26184 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
26185 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26186 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
26187 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
26188 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
26189 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
26190
26191 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26192 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26193 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26194 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26195
26196 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26197 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26198 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26199 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26200
26201 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26202
26203 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26204 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26205 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26206
26207 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26208
26209 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26210 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26211 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26212 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26213 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26214 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26215 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26216 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26217
26218 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26219 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26220 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26221 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26222 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26223 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26224 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26225 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26226
26227 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26228 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26229
26230 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26231 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26232 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26233 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26234
26235 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26236 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26237
26238 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26239 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26240 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26241 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26242 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26243 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26244
26245 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26246 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26247 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26248 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26249
26250 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26251 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26252 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26253 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26254 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26255 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26256 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26257 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26258
26259 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26260 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26261 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
26262
26263 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26264 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
26265
26266 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
26267 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26268
26269 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
26270
26271 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
26272 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
26273 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
26274 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
26275
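  /* The *_COUNT function types used by the shift entries below let the
     count operand be passed either as a plain integer (immediate or
     register) or as a vector register, while the *_INT_CONVERT types mark
     operands that are re-interpreted in the insn pattern's wider mode
     (e.g. the V2DI argument of pslldq is handled as a single V1TI
     value).  */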
26276 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26277 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26278 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26279 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26280 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26281 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26282 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26283
26284 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
26285 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26286 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26287 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
26288 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26289 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26290 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
26291
26292 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
26293 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
26294 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
26295 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
26296
26297 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
26298 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26299 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
26300
26301 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
26302
26303 { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
26304 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
26305
26306 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26307
26308 /* SSE2 MMX */
26309 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26310 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
26311
26312 /* SSE3 */
26313 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26314 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26315
26316 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26317 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26318 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26319 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26320 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26321 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
26322
26323 /* SSSE3 */
26324 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26325 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
26326 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26327 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
26328 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26329 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26330
26331 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26332 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26333 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26334 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26335 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26336 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26337 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26338 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26339 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26340 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26341 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26342 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26343 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
26344 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
26345 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26346 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26347 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26348 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26349 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26350 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26351 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26352 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26353 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26354 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26355
26356 /* SSSE3. */
26357 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
26358 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
26359
26360 /* SSE4.1 */
26361 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26362 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26363 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
26364 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
26365 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26366 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26367 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26368 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
26369 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
26370 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
26371
26372 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26373 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26374 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26375 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26376 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26377 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26378 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
26379 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
26380 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
26381 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
26382 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
26383 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
26384 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26385
26386 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
26387 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26388 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26389 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26390 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26391 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26392 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
26393 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26394 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26395 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
26396 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
26397 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26398
26399 /* SSE4.1 */
26400 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26401 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26402 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26403 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26404
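  /* In the floor/ceil/trunc/rint entries below the comparison slot does not
     hold an RTX comparison code; it carries the rounding-mode immediate
     (ROUND_FLOOR, ROUND_CEIL, ROUND_TRUNC, ROUND_MXCSR), which the expander
     passes through as the constant rounding operand of the round insn.  */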
26405 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
26406 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
26407 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
26408 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
26409
26410 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26411 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
26412
26413 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
26414 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
26415
26416 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
26417 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
26418 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
26419 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
26420
26421 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
26422 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
26423
26424 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26425 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
26426
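  /* For the ptest entries (and the vtestz/vtestc/vtestnzc entries in the
     AVX section below) the comparison code selects which condition of the
     PTEST/VTESTP[SD] flag result is returned: EQ tests ZF (ptestz), LTU
     tests CF (ptestc), and GTU tests that both ZF and CF are clear
     (ptestnzc).  */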
26427 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26428 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26429 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
26430
26431 /* SSE4.2 */
26432 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26433 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
26434 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
26435 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26436 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26437
26438 /* SSE4A */
26439 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
26440 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
26441 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
26442 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26443
26444 /* AES */
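  /* The AES and PCLMUL rows below have a null name: no user-visible builtin
     is created from these entries (the __builtin_ia32_aes* and
     __builtin_ia32_pclmulqdq128 declarations are made separately, with
     their own ISA masks); the rows only supply the data used when
     expanding those builtins.  */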
26445 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
26446 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26447
26448 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26449 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26450 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26451 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26452
26453 /* PCLMUL */
26454 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
26455
26456 /* AVX */
26457 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26458 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26459 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26460 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26461 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26462 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26463 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26464 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26465 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26466 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26467 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26468 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26469 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26470 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26471 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26472 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26473 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26474 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26475 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26476 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26477 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26478 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26479 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26480 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26481 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26482 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26483
26484 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
26485 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
26486 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
26487 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
26488
26489 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26490 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26491 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
26492 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
26493 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26494 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26495 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26496 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26497 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26498 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
26499 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26500 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26501 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26502 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
26503 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
26504 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
26505 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
26506 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
26507 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
26508 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26509 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
26510 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26511 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
26512 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26513 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
26514 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
26515 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26516 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
26517 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
26518 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26519 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26520 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
26521 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
26522 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
26523
26524 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26525 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26526 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26527
26528 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26529 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26530 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26531 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26532 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26533
26534 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26535
26536 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26537 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
26538
26539 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
26540 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
26541 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
26542 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
26543
26544 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
26545 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26546
26547 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26548 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
26549
26550 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
26551 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
26552 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
26553 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
26554
26555 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
26556 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
26557
26558 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
26559 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
26560
26561 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26562 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26563 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26564 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26565
26566 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26567 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26568 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26569 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
26570 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
26571 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
26572
26573 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26574 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26575 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
26576 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26577 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
26579 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26580 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26581 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
26582 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26583 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26584 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
26585 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26586 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26587 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
26588
26589 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
26590 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
26591
26592 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26593 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
26594
26595 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
26596
26597 /* AVX2 */
26598 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
26599 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
26600 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
26601 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
26602 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26603 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26604 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
26605 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
26606 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26607 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26608 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26609 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26610 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26611 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26612 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26613 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26614 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
26615 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26616 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26617 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26618 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26619 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
26620 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
26621 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26622 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26623 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26624 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26625 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26626 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26627 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26628 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26629 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26630 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26631 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26632 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26633 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26634 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26635 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26636 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
26637 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26638 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26639 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26640 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26641 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26642 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26643 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26644 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26645 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26646 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26647 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26648 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26649 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
26650 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26651 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26652 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26653 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26654 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26655 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26656 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
26657 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
26658 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
26659 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
26660 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
26661 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
26662 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26663 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26664 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26665 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26666 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26667 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26668 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
26669 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26670 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
26671 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26672 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
26673 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26674 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
26675 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26676 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26677 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26678 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26679 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26680 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26681 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26682 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26683 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26684 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26685 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26686 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26687 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26688 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26689 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
26690 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
26691 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
26692 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
26693 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
26694 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
26695 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
26696 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26697 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26698 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26699 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26700 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26701 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26702 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26703 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26704 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26705 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26706 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26707 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26708 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
26709 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
26710 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26711 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26712 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26713 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26714 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
26715 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
26716 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26717 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
26718 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
26719 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
26720 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
26721 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
26722 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
26723 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
26724 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
26725 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
26726 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
26727 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26728 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
26729 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
26730 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
26731 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
26732 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
26733 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
26734 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26735 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26736 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26737 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26738 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26739 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26740 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
26741 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
26742 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
26743 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
26744
26745 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26746
26747 /* BMI */
26748 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26749 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26750 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
26751
26752 /* TBM */
26753 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26754 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26755
26756 /* F16C */
26757 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
26758 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
26759 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
26760 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
26761
26762 /* BMI2 */
26763 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26764 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26765 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26766 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26767 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
26768 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
26769 };
26770
26771 /* FMA4 and XOP. */
26772 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
26773 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
26774 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
26775 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
26776 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
26777 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
26778 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
26779 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
26780 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
26781 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
26782 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
26783 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
26784 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
26785 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
26786 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
26787 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
26788 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
26789 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
26790 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
26791 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
26792 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
26793 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
26794 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
26795 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
26796 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
26797 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
26798 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
26799 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
26800 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
26801 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
26802 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
26803 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
26804 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
26805 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
26806 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
26807 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
26808 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
26809 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
26810 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
26811 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
26812 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
26813 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
26814 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
26815 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
26816 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
26817 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
26818 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
26819 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
26820 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
26821 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
26822 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
26823 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
26824
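/* A note on the MULTI_ARG_* shorthands above: each one simply renames an
   ix86_builtin_func_type.  For example, MULTI_ARG_3_SF is
   V4SF_FTYPE_V4SF_V4SF_V4SF, i.e. a builtin taking three V4SF operands and
   returning a V4SF.  The _CMP and _TF forms are used by the vpcom*
   comparison entries below, whose comparison code (EQ, LT, ..., or the
   PCOM_TRUE/PCOM_FALSE selectors) is carried in the table's comparison
   field rather than in the type name.  */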
26825 static const struct builtin_description bdesc_multi_arg[] =
26826 {
26827 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
26828 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
26829 UNKNOWN, (int)MULTI_ARG_3_SF },
26830 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
26831 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
26832 UNKNOWN, (int)MULTI_ARG_3_DF },
26833
26834 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
26835 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
26836 UNKNOWN, (int)MULTI_ARG_3_SF },
26837 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
26838 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
26839 UNKNOWN, (int)MULTI_ARG_3_DF },
26840
26841 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
26842 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
26843 UNKNOWN, (int)MULTI_ARG_3_SF },
26844 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
26845 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
26846 UNKNOWN, (int)MULTI_ARG_3_DF },
26847 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
26848 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
26849 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26850 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
26851 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
26852 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26853
26854 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
26855 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
26856 UNKNOWN, (int)MULTI_ARG_3_SF },
26857 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
26858 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
26859 UNKNOWN, (int)MULTI_ARG_3_DF },
26860 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
26861 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
26862 UNKNOWN, (int)MULTI_ARG_3_SF2 },
26863 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
26864 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
26865 UNKNOWN, (int)MULTI_ARG_3_DF2 },
26866
26867 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
26868 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
26869 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
26870 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
26871 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
26872 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
26873 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
26874
26875 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26876 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
26877 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
26878 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
26879 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
26880 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
26881 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
26882
26883 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
26884
26885 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26886 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
26887 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26888 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26889 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26890 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
26891 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26892 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26893 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26894 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
26895 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26896 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
26897
26898 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26899 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
26900 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
26901 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
26902 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
26903 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
26904 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
26905 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
26906 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26907 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
26908 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
26909 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
26910 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
26911 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
26912 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
26913 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
26914
26915 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
26916 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
26917 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
26918 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
26919 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
26920 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
26921
26922 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26923 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26924 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26926 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26928 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
26930 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
26931 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26932 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
26933 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26934 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
26935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
26936 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
26937
26938 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
26939 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26940 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
26941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
26942 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
26943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
26944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
26945
26946 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
26947 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
26948 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
26949 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
26950 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
26951 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
26952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
26953
26954 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
26955 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
26956 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
26957 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
26958 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
26959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
26960 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
26961
26962 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
26963 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
26964 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
26965 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
26966 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
26967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
26968 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
26969
26970 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
26971 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
26972 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
26973 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
26974 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
26975 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
26976 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
26977
26978 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
26979 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
26980 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
26981 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
26982 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
26983 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
26984 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
26985
26986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
26987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
26988 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
26989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
26990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
26991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
26992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
26993
26994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
26995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
26996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
26997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
26998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
26999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
27000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
27001
27002 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27003 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27004 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27010
27011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27013 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27015 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27019
27020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27021 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27022 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27023 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27024
27025 };
27026 \f
27027 /* TM vector builtins. */
27028
27029 /* Reuse the existing x86-specific `struct builtin_description' because
27030    we're lazy.  Add casts to make them fit.  */
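/* For instance, the first entry below ties "__builtin__ITM_WM64" to the
   generic BUILT_IN_TM_STORE_M64 code (cast to enum ix86_builtins so it fits
   in this table), gives it the VOID_FTYPE_PV2SI_V2SI signature, and gates it
   on MMX, so the transactional 64-bit vector store is only created when the
   MMX builtins are available.  */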
27031 static const struct builtin_description bdesc_tm[] =
27032 {
27033 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27034 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27035 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27036 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27037 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27038 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27039 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27040
27041 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27042 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27043 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27044 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27045 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27046 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27047 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27048
27049 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27050 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27051 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27052 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27053 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27054 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27055 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27056
27057 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27058 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27059 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27060 };
27061
27062 /* TM callbacks. */
27063
27064 /* Return the builtin decl needed to load a vector of TYPE. */
27065
27066 static tree
27067 ix86_builtin_tm_load (tree type)
27068 {
27069 if (TREE_CODE (type) == VECTOR_TYPE)
27070 {
27071 switch (tree_low_cst (TYPE_SIZE (type), 1))
27072 {
27073 case 64:
27074 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27075 case 128:
27076 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27077 case 256:
27078 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27079 }
27080 }
27081 return NULL_TREE;
27082 }
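/* For example, a 128-bit vector type such as V4SF has TYPE_SIZE 128, so the
   BUILT_IN_TM_LOAD_M128 decl is returned; non-vector types and vector sizes
   other than 64, 128 and 256 fall through and yield NULL_TREE.  */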
27083
27084 /* Return the builtin decl needed to store a vector of TYPE. */
27085
27086 static tree
27087 ix86_builtin_tm_store (tree type)
27088 {
27089 if (TREE_CODE (type) == VECTOR_TYPE)
27090 {
27091 switch (tree_low_cst (TYPE_SIZE (type), 1))
27092 {
27093 case 64:
27094 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27095 case 128:
27096 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27097 case 256:
27098 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27099 }
27100 }
27101 return NULL_TREE;
27102 }
27103 \f
27104 /* Initialize the transactional memory vector load/store builtins. */
27105
27106 static void
27107 ix86_init_tm_builtins (void)
27108 {
27109 enum ix86_builtin_func_type ftype;
27110 const struct builtin_description *d;
27111 size_t i;
27112 tree decl;
27113 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
27114 tree attrs_log, attrs_type_log;
27115
27116 if (!flag_tm)
27117 return;
27118
27119 /* If there are no builtins defined, we must be compiling in a
27120 language without trans-mem support. */
27121 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
27122 return;
27123
27124 /* Use whatever attributes a normal TM load has. */
27125 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
27126 attrs_load = DECL_ATTRIBUTES (decl);
27127 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27128 /* Use whatever attributes a normal TM store has. */
27129 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
27130 attrs_store = DECL_ATTRIBUTES (decl);
27131 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27132 /* Use whatever attributes a normal TM log has. */
27133 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
27134 attrs_log = DECL_ATTRIBUTES (decl);
27135 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
27136
27137 for (i = 0, d = bdesc_tm;
27138 i < ARRAY_SIZE (bdesc_tm);
27139 i++, d++)
27140 {
27141 if ((d->mask & ix86_isa_flags) != 0
27142 || (lang_hooks.builtin_function
27143 == lang_hooks.builtin_function_ext_scope))
27144 {
27145 tree type, attrs, attrs_type;
27146 enum built_in_function code = (enum built_in_function) d->code;
27147
27148 ftype = (enum ix86_builtin_func_type) d->flag;
27149 type = ix86_get_builtin_func_type (ftype);
27150
27151 if (BUILTIN_TM_LOAD_P (code))
27152 {
27153 attrs = attrs_load;
27154 attrs_type = attrs_type_load;
27155 }
27156 else if (BUILTIN_TM_STORE_P (code))
27157 {
27158 attrs = attrs_store;
27159 attrs_type = attrs_type_store;
27160 }
27161 else
27162 {
27163 attrs = attrs_log;
27164 attrs_type = attrs_type_log;
27165 }
27166 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
27167 /* The builtin without the prefix for
27168 calling it directly. */
27169 d->name + strlen ("__builtin_"),
27170 attrs);
27171 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
27172 set the TYPE_ATTRIBUTES. */
27173 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
27174
27175 set_builtin_decl (code, decl, false);
27176 }
27177 }
27178 }
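/* Note that add_builtin_function above is also handed the name with the
   "__builtin_" prefix stripped, so e.g. "__builtin__ITM_WM256" is registered
   with the library name "_ITM_WM256" and, per the comment in the loop, the
   entry point can also be called directly under that name.  */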
27179
27180 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
27181    not in the current target ISA, to allow the user to compile particular
27182    modules with target-specific options that differ from the command-line
27183    options.  */
27184 static void
27185 ix86_init_mmx_sse_builtins (void)
27186 {
27187 const struct builtin_description * d;
27188 enum ix86_builtin_func_type ftype;
27189 size_t i;
27190
27191 /* Add all special builtins with variable number of operands. */
27192 for (i = 0, d = bdesc_special_args;
27193 i < ARRAY_SIZE (bdesc_special_args);
27194 i++, d++)
27195 {
27196 if (d->name == 0)
27197 continue;
27198
27199 ftype = (enum ix86_builtin_func_type) d->flag;
27200 def_builtin (d->mask, d->name, ftype, d->code);
27201 }
27202
27203 /* Add all builtins with variable number of operands. */
27204 for (i = 0, d = bdesc_args;
27205 i < ARRAY_SIZE (bdesc_args);
27206 i++, d++)
27207 {
27208 if (d->name == 0)
27209 continue;
27210
27211 ftype = (enum ix86_builtin_func_type) d->flag;
27212 def_builtin_const (d->mask, d->name, ftype, d->code);
27213 }
27214
27215 /* pcmpestr[im] insns. */
27216 for (i = 0, d = bdesc_pcmpestr;
27217 i < ARRAY_SIZE (bdesc_pcmpestr);
27218 i++, d++)
27219 {
27220 if (d->code == IX86_BUILTIN_PCMPESTRM128)
27221 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
27222 else
27223 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
27224 def_builtin_const (d->mask, d->name, ftype, d->code);
27225 }
27226
27227 /* pcmpistr[im] insns. */
27228 for (i = 0, d = bdesc_pcmpistr;
27229 i < ARRAY_SIZE (bdesc_pcmpistr);
27230 i++, d++)
27231 {
27232 if (d->code == IX86_BUILTIN_PCMPISTRM128)
27233 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
27234 else
27235 ftype = INT_FTYPE_V16QI_V16QI_INT;
27236 def_builtin_const (d->mask, d->name, ftype, d->code);
27237 }
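/* In the two loops above, only the *M128 variants (IX86_BUILTIN_PCMPESTRM128
   and IX86_BUILTIN_PCMPISTRM128) are given a V16QI_FTYPE_* signature and
   return the 128-bit result mask; the remaining pcmpestr/pcmpistr entries
   get an INT_FTYPE_* signature and return a plain int.  */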
27238
27239 /* comi/ucomi insns. */
27240 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27241 {
27242 if (d->mask == OPTION_MASK_ISA_SSE2)
27243 ftype = INT_FTYPE_V2DF_V2DF;
27244 else
27245 ftype = INT_FTYPE_V4SF_V4SF;
27246 def_builtin_const (d->mask, d->name, ftype, d->code);
27247 }
27248
27249 /* SSE */
27250 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
27251 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
27252 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
27253 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
27254
27255 /* SSE or 3DNow!A */
27256 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27257 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
27258 IX86_BUILTIN_MASKMOVQ);
27259
27260 /* SSE2 */
27261 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
27262 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
27263
27264 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
27265 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
27266 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
27267 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
27268
27269 /* SSE3. */
27270 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
27271 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
27272 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
27273 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
27274
27275 /* AES */
27276 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
27277 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
27278 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
27279 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
27280 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
27281 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
27282 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
27283 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
27284 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
27285 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
27286 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
27287 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
27288
27289 /* PCLMUL */
27290 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
27291 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
27292
27293 /* RDRND */
27294 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
27295 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
27296 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
27297 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
27298 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
27299 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
27300 IX86_BUILTIN_RDRAND64_STEP);
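/* Judging by their INT_FTYPE_PUSHORT / INT_FTYPE_PUNSIGNED /
   INT_FTYPE_PULONGLONG signatures, the *_step builtins store a random value
   through their pointer argument and return an int status, roughly:

       unsigned int r;
       if (__builtin_ia32_rdrand32_step (&r))
         ...use r...

   (illustrative sketch only; the 64-bit variant additionally requires a
   64-bit target, as the OPTION_MASK_ISA_64BIT gate above shows).  */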
27301
27302 /* AVX2 */
27303 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
27304 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
27305 IX86_BUILTIN_GATHERSIV2DF);
27306
27307 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
27308 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
27309 IX86_BUILTIN_GATHERSIV4DF);
27310
27311 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
27312 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
27313 IX86_BUILTIN_GATHERDIV2DF);
27314
27315 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
27316 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
27317 IX86_BUILTIN_GATHERDIV4DF);
27318
27319 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
27320 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
27321 IX86_BUILTIN_GATHERSIV4SF);
27322
27323 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
27324 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
27325 IX86_BUILTIN_GATHERSIV8SF);
27326
27327 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
27328 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
27329 IX86_BUILTIN_GATHERDIV4SF);
27330
27331 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
27332 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
27333 IX86_BUILTIN_GATHERDIV8SF);
27334
27335 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
27336 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
27337 IX86_BUILTIN_GATHERSIV2DI);
27338
27339 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
27340 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
27341 IX86_BUILTIN_GATHERSIV4DI);
27342
27343 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
27344 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
27345 IX86_BUILTIN_GATHERDIV2DI);
27346
27347 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
27348 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
27349 IX86_BUILTIN_GATHERDIV4DI);
27350
27351 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
27352 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
27353 IX86_BUILTIN_GATHERSIV4SI);
27354
27355 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
27356 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
27357 IX86_BUILTIN_GATHERSIV8SI);
27358
27359 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
27360 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
27361 IX86_BUILTIN_GATHERDIV4SI);
27362
27363 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
27364 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
27365 IX86_BUILTIN_GATHERDIV8SI);
27366
27367   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
27368 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
27369 IX86_BUILTIN_GATHERALTSIV4DF);
27370
27371   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
27372 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
27373 IX86_BUILTIN_GATHERALTDIV8SF);
27374
27375   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
27376 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
27377 IX86_BUILTIN_GATHERALTSIV4DI);
27378
27379   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
27380 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
27381 IX86_BUILTIN_GATHERALTDIV8SI);
27382
27383 /* MMX access to the vec_init patterns. */
27384 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
27385 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
27386
27387 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
27388 V4HI_FTYPE_HI_HI_HI_HI,
27389 IX86_BUILTIN_VEC_INIT_V4HI);
27390
27391 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
27392 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
27393 IX86_BUILTIN_VEC_INIT_V8QI);
27394
27395 /* Access to the vec_extract patterns. */
27396 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
27397 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
27398 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
27399 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
27400 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
27401 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
27402 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
27403 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
27404 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
27405 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
27406
27407 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27408 "__builtin_ia32_vec_ext_v4hi",
27409 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
27410
27411 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
27412 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
27413
27414 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
27415 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
27416
27417 /* Access to the vec_set patterns. */
27418 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
27419 "__builtin_ia32_vec_set_v2di",
27420 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
27421
27422 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
27423 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
27424
27425 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
27426 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
27427
27428 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
27429 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
27430
27431 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
27432 "__builtin_ia32_vec_set_v4hi",
27433 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
27434
27435 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
27436 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
27437
27438 /* Add FMA4 and XOP multi-arg builtin instructions. */
27439 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27440 {
27441 if (d->name == 0)
27442 continue;
27443
27444 ftype = (enum ix86_builtin_func_type) d->flag;
27445 def_builtin_const (d->mask, d->name, ftype, d->code);
27446 }
27447 }
27448
27449 /* Internal method for ix86_init_builtins. */
27450
27451 static void
27452 ix86_init_builtins_va_builtins_abi (void)
27453 {
27454 tree ms_va_ref, sysv_va_ref;
27455 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
27456 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
27457 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
27458 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
27459
27460 if (!TARGET_64BIT)
27461 return;
27462 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
27463 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
27464 ms_va_ref = build_reference_type (ms_va_list_type_node);
27465 sysv_va_ref =
27466 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
27467
27468 fnvoid_va_end_ms =
27469 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27470 fnvoid_va_start_ms =
27471 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
27472 fnvoid_va_end_sysv =
27473 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
27474 fnvoid_va_start_sysv =
27475 build_varargs_function_type_list (void_type_node, sysv_va_ref,
27476 NULL_TREE);
27477 fnvoid_va_copy_ms =
27478 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
27479 NULL_TREE);
27480 fnvoid_va_copy_sysv =
27481 build_function_type_list (void_type_node, sysv_va_ref,
27482 sysv_va_ref, NULL_TREE);
27483
27484 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
27485 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
27486 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
27487 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
27488 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
27489 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
27490 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
27491 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27492 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
27493 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27494 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
27495 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
27496 }
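
/* Illustrative sketch (assumed user-level usage, not part of this
   file): the __builtin_ms_va_* and __builtin_sysv_va_* functions
   registered above let 64-bit code handle the "other" ABI's varargs
   explicitly, e.g.:

     __attribute__ ((ms_abi)) int
     ms_sum (int n, ...)
     {
       __builtin_ms_va_list ap;
       int i, s = 0;

       __builtin_ms_va_start (ap, n);
       for (i = 0; i < n; i++)
	 s += __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
       return s;
     }

   The __builtin_ms_va_list type name is an assumption here; the
   builtins themselves are only registered for TARGET_64BIT, as the
   early return above shows.  */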
27497
27498 static void
27499 ix86_init_builtin_types (void)
27500 {
27501 tree float128_type_node, float80_type_node;
27502
27503 /* The __float80 type. */
27504 float80_type_node = long_double_type_node;
27505 if (TYPE_MODE (float80_type_node) != XFmode)
27506 {
27507 /* long double is not the 80-bit extended type here; make a distinct one. */
27508 float80_type_node = make_node (REAL_TYPE);
27509
27510 TYPE_PRECISION (float80_type_node) = 80;
27511 layout_type (float80_type_node);
27512 }
27513 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
27514
27515 /* The __float128 type. */
27516 float128_type_node = make_node (REAL_TYPE);
27517 TYPE_PRECISION (float128_type_node) = 128;
27518 layout_type (float128_type_node);
27519 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
27520
27521 /* This macro is built by i386-builtin-types.awk. */
27522 DEFINE_BUILTIN_PRIMITIVE_TYPES;
27523 }
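
/* Illustrative sketch (assumed user-level usage, not part of this
   file): once registered, the two type nodes above are directly
   usable from C source on x86 targets, e.g.:

     __float80  ext  = 1.0L / 3.0L;
     __float128 quad = 1.0Q / 3.0Q;

   The 'Q' constant suffix and any libquadmath support are provided
   elsewhere; this function only creates and registers the types.  */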
27524
27525 static void
27526 ix86_init_builtins (void)
27527 {
27528 tree t;
27529
27530 ix86_init_builtin_types ();
27531
27532 /* TFmode support builtins. */
27533 def_builtin_const (0, "__builtin_infq",
27534 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
27535 def_builtin_const (0, "__builtin_huge_valq",
27536 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
27537
27538 /* We will expand them to a normal call if SSE2 isn't available, since
27539 they are used by libgcc. */
27540 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
27541 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
27542 BUILT_IN_MD, "__fabstf2", NULL_TREE);
27543 TREE_READONLY (t) = 1;
27544 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
27545
27546 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
27547 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
27548 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
27549 TREE_READONLY (t) = 1;
27550 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
27551
27552 ix86_init_tm_builtins ();
27553 ix86_init_mmx_sse_builtins ();
27554
27555 if (TARGET_LP64)
27556 ix86_init_builtins_va_builtins_abi ();
27557
27558 #ifdef SUBTARGET_INIT_BUILTINS
27559 SUBTARGET_INIT_BUILTINS;
27560 #endif
27561 }
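
/* Illustrative sketch (assumed user-level usage, not part of this
   file) of the TFmode builtins defined above:

     __float128 x = __builtin_infq ();
     __float128 y = __builtin_fabsq (-2.0Q);
     __float128 z = __builtin_copysignq (y, x);

   When SSE2 is not available, __builtin_fabsq and
   __builtin_copysignq fall back to the __fabstf2 and __copysigntf3
   library routines named above.  */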
27562
27563 /* Return the ix86 builtin for CODE. */
27564
27565 static tree
27566 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
27567 {
27568 if (code >= IX86_BUILTIN_MAX)
27569 return error_mark_node;
27570
27571 return ix86_builtins[code];
27572 }
27573
27574 /* Errors in the source file can cause expand_expr to return const0_rtx
27575 where we expect a vector. To avoid crashing, use one of the vector
27576 clear instructions. */
27577 static rtx
27578 safe_vector_operand (rtx x, enum machine_mode mode)
27579 {
27580 if (x == const0_rtx)
27581 x = CONST0_RTX (mode);
27582 return x;
27583 }
27584
27585 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
27586
27587 static rtx
27588 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
27589 {
27590 rtx pat;
27591 tree arg0 = CALL_EXPR_ARG (exp, 0);
27592 tree arg1 = CALL_EXPR_ARG (exp, 1);
27593 rtx op0 = expand_normal (arg0);
27594 rtx op1 = expand_normal (arg1);
27595 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27596 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27597 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
27598
27599 if (VECTOR_MODE_P (mode0))
27600 op0 = safe_vector_operand (op0, mode0);
27601 if (VECTOR_MODE_P (mode1))
27602 op1 = safe_vector_operand (op1, mode1);
27603
27604 if (optimize || !target
27605 || GET_MODE (target) != tmode
27606 || !insn_data[icode].operand[0].predicate (target, tmode))
27607 target = gen_reg_rtx (tmode);
27608
27609 if (GET_MODE (op1) == SImode && mode1 == TImode)
27610 {
27611 rtx x = gen_reg_rtx (V4SImode);
27612 emit_insn (gen_sse2_loadd (x, op1));
27613 op1 = gen_lowpart (TImode, x);
27614 }
27615
27616 if (!insn_data[icode].operand[1].predicate (op0, mode0))
27617 op0 = copy_to_mode_reg (mode0, op0);
27618 if (!insn_data[icode].operand[2].predicate (op1, mode1))
27619 op1 = copy_to_mode_reg (mode1, op1);
27620
27621 pat = GEN_FCN (icode) (target, op0, op1);
27622 if (! pat)
27623 return 0;
27624
27625 emit_insn (pat);
27626
27627 return target;
27628 }
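
/* Illustrative sketch (assumed header-level usage, not part of this
   file): most two-operand SSE intrinsics funnel through the binop
   expander above, e.g.:

     #include <xmmintrin.h>

     __m128
     add_example (__m128 a, __m128 b)
     {
       return _mm_add_ps (a, b);
     }

   _mm_add_ps is assumed to map to __builtin_ia32_addps, whose
   V4SF_FTYPE_V4SF_V4SF signature takes the nargs == 2 path in
   ix86_expand_args_builtin and lands here.  */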
27629
27630 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
27631
27632 static rtx
27633 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
27634 enum ix86_builtin_func_type m_type,
27635 enum rtx_code sub_code)
27636 {
27637 rtx pat;
27638 int i;
27639 int nargs;
27640 bool comparison_p = false;
27641 bool tf_p = false;
27642 bool last_arg_constant = false;
27643 int num_memory = 0;
27644 struct {
27645 rtx op;
27646 enum machine_mode mode;
27647 } args[4];
27648
27649 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27650
27651 switch (m_type)
27652 {
27653 case MULTI_ARG_4_DF2_DI_I:
27654 case MULTI_ARG_4_DF2_DI_I1:
27655 case MULTI_ARG_4_SF2_SI_I:
27656 case MULTI_ARG_4_SF2_SI_I1:
27657 nargs = 4;
27658 last_arg_constant = true;
27659 break;
27660
27661 case MULTI_ARG_3_SF:
27662 case MULTI_ARG_3_DF:
27663 case MULTI_ARG_3_SF2:
27664 case MULTI_ARG_3_DF2:
27665 case MULTI_ARG_3_DI:
27666 case MULTI_ARG_3_SI:
27667 case MULTI_ARG_3_SI_DI:
27668 case MULTI_ARG_3_HI:
27669 case MULTI_ARG_3_HI_SI:
27670 case MULTI_ARG_3_QI:
27671 case MULTI_ARG_3_DI2:
27672 case MULTI_ARG_3_SI2:
27673 case MULTI_ARG_3_HI2:
27674 case MULTI_ARG_3_QI2:
27675 nargs = 3;
27676 break;
27677
27678 case MULTI_ARG_2_SF:
27679 case MULTI_ARG_2_DF:
27680 case MULTI_ARG_2_DI:
27681 case MULTI_ARG_2_SI:
27682 case MULTI_ARG_2_HI:
27683 case MULTI_ARG_2_QI:
27684 nargs = 2;
27685 break;
27686
27687 case MULTI_ARG_2_DI_IMM:
27688 case MULTI_ARG_2_SI_IMM:
27689 case MULTI_ARG_2_HI_IMM:
27690 case MULTI_ARG_2_QI_IMM:
27691 nargs = 2;
27692 last_arg_constant = true;
27693 break;
27694
27695 case MULTI_ARG_1_SF:
27696 case MULTI_ARG_1_DF:
27697 case MULTI_ARG_1_SF2:
27698 case MULTI_ARG_1_DF2:
27699 case MULTI_ARG_1_DI:
27700 case MULTI_ARG_1_SI:
27701 case MULTI_ARG_1_HI:
27702 case MULTI_ARG_1_QI:
27703 case MULTI_ARG_1_SI_DI:
27704 case MULTI_ARG_1_HI_DI:
27705 case MULTI_ARG_1_HI_SI:
27706 case MULTI_ARG_1_QI_DI:
27707 case MULTI_ARG_1_QI_SI:
27708 case MULTI_ARG_1_QI_HI:
27709 nargs = 1;
27710 break;
27711
27712 case MULTI_ARG_2_DI_CMP:
27713 case MULTI_ARG_2_SI_CMP:
27714 case MULTI_ARG_2_HI_CMP:
27715 case MULTI_ARG_2_QI_CMP:
27716 nargs = 2;
27717 comparison_p = true;
27718 break;
27719
27720 case MULTI_ARG_2_SF_TF:
27721 case MULTI_ARG_2_DF_TF:
27722 case MULTI_ARG_2_DI_TF:
27723 case MULTI_ARG_2_SI_TF:
27724 case MULTI_ARG_2_HI_TF:
27725 case MULTI_ARG_2_QI_TF:
27726 nargs = 2;
27727 tf_p = true;
27728 break;
27729
27730 default:
27731 gcc_unreachable ();
27732 }
27733
27734 if (optimize || !target
27735 || GET_MODE (target) != tmode
27736 || !insn_data[icode].operand[0].predicate (target, tmode))
27737 target = gen_reg_rtx (tmode);
27738
27739 gcc_assert (nargs <= 4);
27740
27741 for (i = 0; i < nargs; i++)
27742 {
27743 tree arg = CALL_EXPR_ARG (exp, i);
27744 rtx op = expand_normal (arg);
27745 int adjust = (comparison_p) ? 1 : 0;
27746 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
27747
27748 if (last_arg_constant && i == nargs - 1)
27749 {
27750 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
27751 {
27752 enum insn_code new_icode = icode;
27753 switch (icode)
27754 {
27755 case CODE_FOR_xop_vpermil2v2df3:
27756 case CODE_FOR_xop_vpermil2v4sf3:
27757 case CODE_FOR_xop_vpermil2v4df3:
27758 case CODE_FOR_xop_vpermil2v8sf3:
27759 error ("the last argument must be a 2-bit immediate");
27760 return gen_reg_rtx (tmode);
27761 case CODE_FOR_xop_rotlv2di3:
27762 new_icode = CODE_FOR_rotlv2di3;
27763 goto xop_rotl;
27764 case CODE_FOR_xop_rotlv4si3:
27765 new_icode = CODE_FOR_rotlv4si3;
27766 goto xop_rotl;
27767 case CODE_FOR_xop_rotlv8hi3:
27768 new_icode = CODE_FOR_rotlv8hi3;
27769 goto xop_rotl;
27770 case CODE_FOR_xop_rotlv16qi3:
27771 new_icode = CODE_FOR_rotlv16qi3;
27772 xop_rotl:
27773 if (CONST_INT_P (op))
27774 {
27775 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
27776 op = GEN_INT (INTVAL (op) & mask);
27777 gcc_checking_assert
27778 (insn_data[icode].operand[i + 1].predicate (op, mode));
27779 }
27780 else
27781 {
27782 gcc_checking_assert
27783 (nargs == 2
27784 && insn_data[new_icode].operand[0].mode == tmode
27785 && insn_data[new_icode].operand[1].mode == tmode
27786 && insn_data[new_icode].operand[2].mode == mode
27787 && insn_data[new_icode].operand[0].predicate
27788 == insn_data[icode].operand[0].predicate
27789 && insn_data[new_icode].operand[1].predicate
27790 == insn_data[icode].operand[1].predicate);
27791 icode = new_icode;
27792 goto non_constant;
27793 }
27794 break;
27795 default:
27796 gcc_unreachable ();
27797 }
27798 }
27799 }
27800 else
27801 {
27802 non_constant:
27803 if (VECTOR_MODE_P (mode))
27804 op = safe_vector_operand (op, mode);
27805
27806 /* If we aren't optimizing, only allow one memory operand to be
27807 generated. */
27808 if (memory_operand (op, mode))
27809 num_memory++;
27810
27811 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
27812
27813 if (optimize
27814 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
27815 || num_memory > 1)
27816 op = force_reg (mode, op);
27817 }
27818
27819 args[i].op = op;
27820 args[i].mode = mode;
27821 }
27822
27823 switch (nargs)
27824 {
27825 case 1:
27826 pat = GEN_FCN (icode) (target, args[0].op);
27827 break;
27828
27829 case 2:
27830 if (tf_p)
27831 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
27832 GEN_INT ((int)sub_code));
27833 else if (! comparison_p)
27834 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
27835 else
27836 {
27837 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
27838 args[0].op,
27839 args[1].op);
27840
27841 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
27842 }
27843 break;
27844
27845 case 3:
27846 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
27847 break;
27848
27849 case 4:
27850 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
27851 break;
27852
27853 default:
27854 gcc_unreachable ();
27855 }
27856
27857 if (! pat)
27858 return 0;
27859
27860 emit_insn (pat);
27861 return target;
27862 }
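
/* Illustrative sketch (assumed header-level usage, not part of this
   file): the multi-arg expander above handles the FMA4/XOP builtins,
   e.g. the three-operand multiply-add reached from fma4intrin.h
   (compiled with -mfma4):

     #include <x86intrin.h>

     __m128
     fma_example (__m128 a, __m128 b, __m128 c)
     {
       return _mm_macc_ps (a, b, c);
     }

   _mm_macc_ps is assumed to map to __builtin_ia32_vfmaddps, which is
   expected to be a MULTI_ARG_3_SF entry and so take the nargs == 3
   path above.  */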
27863
27864 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
27865 insns with vec_merge. */
27866
27867 static rtx
27868 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
27869 rtx target)
27870 {
27871 rtx pat;
27872 tree arg0 = CALL_EXPR_ARG (exp, 0);
27873 rtx op1, op0 = expand_normal (arg0);
27874 enum machine_mode tmode = insn_data[icode].operand[0].mode;
27875 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
27876
27877 if (optimize || !target
27878 || GET_MODE (target) != tmode
27879 || !insn_data[icode].operand[0].predicate (target, tmode))
27880 target = gen_reg_rtx (tmode);
27881
27882 if (VECTOR_MODE_P (mode0))
27883 op0 = safe_vector_operand (op0, mode0);
27884
27885 if ((optimize && !register_operand (op0, mode0))
27886 || !insn_data[icode].operand[1].predicate (op0, mode0))
27887 op0 = copy_to_mode_reg (mode0, op0);
27888
27889 op1 = op0;
27890 if (!insn_data[icode].operand[2].predicate (op1, mode0))
27891 op1 = copy_to_mode_reg (mode0, op1);
27892
27893 pat = GEN_FCN (icode) (target, op0, op1);
27894 if (! pat)
27895 return 0;
27896 emit_insn (pat);
27897 return target;
27898 }
27899
27900 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
27901
27902 static rtx
27903 ix86_expand_sse_compare (const struct builtin_description *d,
27904 tree exp, rtx target, bool swap)
27905 {
27906 rtx pat;
27907 tree arg0 = CALL_EXPR_ARG (exp, 0);
27908 tree arg1 = CALL_EXPR_ARG (exp, 1);
27909 rtx op0 = expand_normal (arg0);
27910 rtx op1 = expand_normal (arg1);
27911 rtx op2;
27912 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
27913 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
27914 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
27915 enum rtx_code comparison = d->comparison;
27916
27917 if (VECTOR_MODE_P (mode0))
27918 op0 = safe_vector_operand (op0, mode0);
27919 if (VECTOR_MODE_P (mode1))
27920 op1 = safe_vector_operand (op1, mode1);
27921
27922 /* Swap operands if we have a comparison that isn't available in
27923 hardware. */
27924 if (swap)
27925 {
27926 rtx tmp = gen_reg_rtx (mode1);
27927 emit_move_insn (tmp, op1);
27928 op1 = op0;
27929 op0 = tmp;
27930 }
27931
27932 if (optimize || !target
27933 || GET_MODE (target) != tmode
27934 || !insn_data[d->icode].operand[0].predicate (target, tmode))
27935 target = gen_reg_rtx (tmode);
27936
27937 if ((optimize && !register_operand (op0, mode0))
27938 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
27939 op0 = copy_to_mode_reg (mode0, op0);
27940 if ((optimize && !register_operand (op1, mode1))
27941 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
27942 op1 = copy_to_mode_reg (mode1, op1);
27943
27944 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
27945 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
27946 if (! pat)
27947 return 0;
27948 emit_insn (pat);
27949 return target;
27950 }
27951
27952 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
27953
27954 static rtx
27955 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
27956 rtx target)
27957 {
27958 rtx pat;
27959 tree arg0 = CALL_EXPR_ARG (exp, 0);
27960 tree arg1 = CALL_EXPR_ARG (exp, 1);
27961 rtx op0 = expand_normal (arg0);
27962 rtx op1 = expand_normal (arg1);
27963 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
27964 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
27965 enum rtx_code comparison = d->comparison;
27966
27967 if (VECTOR_MODE_P (mode0))
27968 op0 = safe_vector_operand (op0, mode0);
27969 if (VECTOR_MODE_P (mode1))
27970 op1 = safe_vector_operand (op1, mode1);
27971
27972 /* Swap operands if we have a comparison that isn't available in
27973 hardware. */
27974 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
27975 {
27976 rtx tmp = op1;
27977 op1 = op0;
27978 op0 = tmp;
27979 }
27980
27981 target = gen_reg_rtx (SImode);
27982 emit_move_insn (target, const0_rtx);
27983 target = gen_rtx_SUBREG (QImode, target, 0);
27984
27985 if ((optimize && !register_operand (op0, mode0))
27986 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
27987 op0 = copy_to_mode_reg (mode0, op0);
27988 if ((optimize && !register_operand (op1, mode1))
27989 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
27990 op1 = copy_to_mode_reg (mode1, op1);
27991
27992 pat = GEN_FCN (d->icode) (op0, op1);
27993 if (! pat)
27994 return 0;
27995 emit_insn (pat);
27996 emit_insn (gen_rtx_SET (VOIDmode,
27997 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
27998 gen_rtx_fmt_ee (comparison, QImode,
27999 SET_DEST (pat),
28000 const0_rtx)));
28001
28002 return SUBREG_REG (target);
28003 }
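
/* Illustrative sketch (assumed header-level usage, not part of this
   file): the comi expander above backs the scalar compare intrinsics
   in xmmintrin.h/emmintrin.h, e.g.:

     #include <xmmintrin.h>

     int
     first_equal (__m128 a, __m128 b)
     {
       return _mm_comieq_ss (a, b);
     }

   _mm_comieq_ss is assumed to map to __builtin_ia32_comieq; the
   flags result of the comparison insn is turned into the 0/1 value
   by the STRICT_LOW_PART sequence emitted above.  */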
28004
28005 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
28006
28007 static rtx
28008 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
28009 rtx target)
28010 {
28011 rtx pat;
28012 tree arg0 = CALL_EXPR_ARG (exp, 0);
28013 rtx op1, op0 = expand_normal (arg0);
28014 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28015 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28016
28017 if (optimize || target == 0
28018 || GET_MODE (target) != tmode
28019 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28020 target = gen_reg_rtx (tmode);
28021
28022 if (VECTOR_MODE_P (mode0))
28023 op0 = safe_vector_operand (op0, mode0);
28024
28025 if ((optimize && !register_operand (op0, mode0))
28026 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28027 op0 = copy_to_mode_reg (mode0, op0);
28028
28029 op1 = GEN_INT (d->comparison);
28030
28031 pat = GEN_FCN (d->icode) (target, op0, op1);
28032 if (! pat)
28033 return 0;
28034 emit_insn (pat);
28035 return target;
28036 }
28037
28038 static rtx
28039 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
28040 tree exp, rtx target)
28041 {
28042 rtx pat;
28043 tree arg0 = CALL_EXPR_ARG (exp, 0);
28044 tree arg1 = CALL_EXPR_ARG (exp, 1);
28045 rtx op0 = expand_normal (arg0);
28046 rtx op1 = expand_normal (arg1);
28047 rtx op2;
28048 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
28049 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
28050 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
28051
28052 if (optimize || target == 0
28053 || GET_MODE (target) != tmode
28054 || !insn_data[d->icode].operand[0].predicate (target, tmode))
28055 target = gen_reg_rtx (tmode);
28056
28057 op0 = safe_vector_operand (op0, mode0);
28058 op1 = safe_vector_operand (op1, mode1);
28059
28060 if ((optimize && !register_operand (op0, mode0))
28061 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28062 op0 = copy_to_mode_reg (mode0, op0);
28063 if ((optimize && !register_operand (op1, mode1))
28064 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28065 op1 = copy_to_mode_reg (mode1, op1);
28066
28067 op2 = GEN_INT (d->comparison);
28068
28069 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
28070 if (! pat)
28071 return 0;
28072 emit_insn (pat);
28073 return target;
28074 }
28075
28076 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
28077
28078 static rtx
28079 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
28080 rtx target)
28081 {
28082 rtx pat;
28083 tree arg0 = CALL_EXPR_ARG (exp, 0);
28084 tree arg1 = CALL_EXPR_ARG (exp, 1);
28085 rtx op0 = expand_normal (arg0);
28086 rtx op1 = expand_normal (arg1);
28087 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
28088 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
28089 enum rtx_code comparison = d->comparison;
28090
28091 if (VECTOR_MODE_P (mode0))
28092 op0 = safe_vector_operand (op0, mode0);
28093 if (VECTOR_MODE_P (mode1))
28094 op1 = safe_vector_operand (op1, mode1);
28095
28096 target = gen_reg_rtx (SImode);
28097 emit_move_insn (target, const0_rtx);
28098 target = gen_rtx_SUBREG (QImode, target, 0);
28099
28100 if ((optimize && !register_operand (op0, mode0))
28101 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
28102 op0 = copy_to_mode_reg (mode0, op0);
28103 if ((optimize && !register_operand (op1, mode1))
28104 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
28105 op1 = copy_to_mode_reg (mode1, op1);
28106
28107 pat = GEN_FCN (d->icode) (op0, op1);
28108 if (! pat)
28109 return 0;
28110 emit_insn (pat);
28111 emit_insn (gen_rtx_SET (VOIDmode,
28112 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28113 gen_rtx_fmt_ee (comparison, QImode,
28114 SET_DEST (pat),
28115 const0_rtx)));
28116
28117 return SUBREG_REG (target);
28118 }
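
/* Illustrative sketch (assumed header-level usage, not part of this
   file): the ptest expander above is reached from the SSE4.1/AVX
   test intrinsics, e.g.:

     #include <smmintrin.h>

     int
     all_zero (__m128i v)
     {
       return _mm_testz_si128 (v, v);
     }

   _mm_testz_si128 is assumed to map to __builtin_ia32_ptestz128;
   the EQ/NE/etc. flavour is carried in d->comparison and applied to
   the flags register by the code above.  */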
28119
28120 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
28121
28122 static rtx
28123 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
28124 tree exp, rtx target)
28125 {
28126 rtx pat;
28127 tree arg0 = CALL_EXPR_ARG (exp, 0);
28128 tree arg1 = CALL_EXPR_ARG (exp, 1);
28129 tree arg2 = CALL_EXPR_ARG (exp, 2);
28130 tree arg3 = CALL_EXPR_ARG (exp, 3);
28131 tree arg4 = CALL_EXPR_ARG (exp, 4);
28132 rtx scratch0, scratch1;
28133 rtx op0 = expand_normal (arg0);
28134 rtx op1 = expand_normal (arg1);
28135 rtx op2 = expand_normal (arg2);
28136 rtx op3 = expand_normal (arg3);
28137 rtx op4 = expand_normal (arg4);
28138 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
28139
28140 tmode0 = insn_data[d->icode].operand[0].mode;
28141 tmode1 = insn_data[d->icode].operand[1].mode;
28142 modev2 = insn_data[d->icode].operand[2].mode;
28143 modei3 = insn_data[d->icode].operand[3].mode;
28144 modev4 = insn_data[d->icode].operand[4].mode;
28145 modei5 = insn_data[d->icode].operand[5].mode;
28146 modeimm = insn_data[d->icode].operand[6].mode;
28147
28148 if (VECTOR_MODE_P (modev2))
28149 op0 = safe_vector_operand (op0, modev2);
28150 if (VECTOR_MODE_P (modev4))
28151 op2 = safe_vector_operand (op2, modev4);
28152
28153 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28154 op0 = copy_to_mode_reg (modev2, op0);
28155 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
28156 op1 = copy_to_mode_reg (modei3, op1);
28157 if ((optimize && !register_operand (op2, modev4))
28158 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
28159 op2 = copy_to_mode_reg (modev4, op2);
28160 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
28161 op3 = copy_to_mode_reg (modei5, op3);
28162
28163 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
28164 {
28165 error ("the fifth argument must be an 8-bit immediate");
28166 return const0_rtx;
28167 }
28168
28169 if (d->code == IX86_BUILTIN_PCMPESTRI128)
28170 {
28171 if (optimize || !target
28172 || GET_MODE (target) != tmode0
28173 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28174 target = gen_reg_rtx (tmode0);
28175
28176 scratch1 = gen_reg_rtx (tmode1);
28177
28178 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
28179 }
28180 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
28181 {
28182 if (optimize || !target
28183 || GET_MODE (target) != tmode1
28184 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28185 target = gen_reg_rtx (tmode1);
28186
28187 scratch0 = gen_reg_rtx (tmode0);
28188
28189 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
28190 }
28191 else
28192 {
28193 gcc_assert (d->flag);
28194
28195 scratch0 = gen_reg_rtx (tmode0);
28196 scratch1 = gen_reg_rtx (tmode1);
28197
28198 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
28199 }
28200
28201 if (! pat)
28202 return 0;
28203
28204 emit_insn (pat);
28205
28206 if (d->flag)
28207 {
28208 target = gen_reg_rtx (SImode);
28209 emit_move_insn (target, const0_rtx);
28210 target = gen_rtx_SUBREG (QImode, target, 0);
28211
28212 emit_insn
28213 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28214 gen_rtx_fmt_ee (EQ, QImode,
28215 gen_rtx_REG ((enum machine_mode) d->flag,
28216 FLAGS_REG),
28217 const0_rtx)));
28218 return SUBREG_REG (target);
28219 }
28220 else
28221 return target;
28222 }
28223
28224
28225 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
28226
28227 static rtx
28228 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
28229 tree exp, rtx target)
28230 {
28231 rtx pat;
28232 tree arg0 = CALL_EXPR_ARG (exp, 0);
28233 tree arg1 = CALL_EXPR_ARG (exp, 1);
28234 tree arg2 = CALL_EXPR_ARG (exp, 2);
28235 rtx scratch0, scratch1;
28236 rtx op0 = expand_normal (arg0);
28237 rtx op1 = expand_normal (arg1);
28238 rtx op2 = expand_normal (arg2);
28239 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
28240
28241 tmode0 = insn_data[d->icode].operand[0].mode;
28242 tmode1 = insn_data[d->icode].operand[1].mode;
28243 modev2 = insn_data[d->icode].operand[2].mode;
28244 modev3 = insn_data[d->icode].operand[3].mode;
28245 modeimm = insn_data[d->icode].operand[4].mode;
28246
28247 if (VECTOR_MODE_P (modev2))
28248 op0 = safe_vector_operand (op0, modev2);
28249 if (VECTOR_MODE_P (modev3))
28250 op1 = safe_vector_operand (op1, modev3);
28251
28252 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
28253 op0 = copy_to_mode_reg (modev2, op0);
28254 if ((optimize && !register_operand (op1, modev3))
28255 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
28256 op1 = copy_to_mode_reg (modev3, op1);
28257
28258 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
28259 {
28260 error ("the third argument must be an 8-bit immediate");
28261 return const0_rtx;
28262 }
28263
28264 if (d->code == IX86_BUILTIN_PCMPISTRI128)
28265 {
28266 if (optimize || !target
28267 || GET_MODE (target) != tmode0
28268 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
28269 target = gen_reg_rtx (tmode0);
28270
28271 scratch1 = gen_reg_rtx (tmode1);
28272
28273 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
28274 }
28275 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
28276 {
28277 if (optimize || !target
28278 || GET_MODE (target) != tmode1
28279 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
28280 target = gen_reg_rtx (tmode1);
28281
28282 scratch0 = gen_reg_rtx (tmode0);
28283
28284 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
28285 }
28286 else
28287 {
28288 gcc_assert (d->flag);
28289
28290 scratch0 = gen_reg_rtx (tmode0);
28291 scratch1 = gen_reg_rtx (tmode1);
28292
28293 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
28294 }
28295
28296 if (! pat)
28297 return 0;
28298
28299 emit_insn (pat);
28300
28301 if (d->flag)
28302 {
28303 target = gen_reg_rtx (SImode);
28304 emit_move_insn (target, const0_rtx);
28305 target = gen_rtx_SUBREG (QImode, target, 0);
28306
28307 emit_insn
28308 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
28309 gen_rtx_fmt_ee (EQ, QImode,
28310 gen_rtx_REG ((enum machine_mode) d->flag,
28311 FLAGS_REG),
28312 const0_rtx)));
28313 return SUBREG_REG (target);
28314 }
28315 else
28316 return target;
28317 }
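
/* Illustrative sketch (assumed header-level usage, not part of this
   file): the pcmpistr expander above backs the SSE4.2 string
   intrinsics, e.g.:

     #include <nmmintrin.h>

     int
     first_match (__m128i a, __m128i b)
     {
       return _mm_cmpistri (a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH);
     }

   _mm_cmpistri is assumed to map to __builtin_ia32_pcmpistri128
   (IX86_BUILTIN_PCMPISTRI128 above); the mode argument must be an
   8-bit immediate, matching the error check in the expander.  */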
28318
28319 /* Subroutine of ix86_expand_builtin to take care of insns with
28320 variable number of operands. */
28321
28322 static rtx
28323 ix86_expand_args_builtin (const struct builtin_description *d,
28324 tree exp, rtx target)
28325 {
28326 rtx pat, real_target;
28327 unsigned int i, nargs;
28328 unsigned int nargs_constant = 0;
28329 int num_memory = 0;
28330 struct
28331 {
28332 rtx op;
28333 enum machine_mode mode;
28334 } args[4];
28335 bool last_arg_count = false;
28336 enum insn_code icode = d->icode;
28337 const struct insn_data_d *insn_p = &insn_data[icode];
28338 enum machine_mode tmode = insn_p->operand[0].mode;
28339 enum machine_mode rmode = VOIDmode;
28340 bool swap = false;
28341 enum rtx_code comparison = d->comparison;
28342
28343 switch ((enum ix86_builtin_func_type) d->flag)
28344 {
28345 case V2DF_FTYPE_V2DF_ROUND:
28346 case V4DF_FTYPE_V4DF_ROUND:
28347 case V4SF_FTYPE_V4SF_ROUND:
28348 case V8SF_FTYPE_V8SF_ROUND:
28349 case V4SI_FTYPE_V4SF_ROUND:
28350 case V8SI_FTYPE_V8SF_ROUND:
28351 return ix86_expand_sse_round (d, exp, target);
28352 case V4SI_FTYPE_V2DF_V2DF_ROUND:
28353 case V8SI_FTYPE_V4DF_V4DF_ROUND:
28354 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
28355 case INT_FTYPE_V8SF_V8SF_PTEST:
28356 case INT_FTYPE_V4DI_V4DI_PTEST:
28357 case INT_FTYPE_V4DF_V4DF_PTEST:
28358 case INT_FTYPE_V4SF_V4SF_PTEST:
28359 case INT_FTYPE_V2DI_V2DI_PTEST:
28360 case INT_FTYPE_V2DF_V2DF_PTEST:
28361 return ix86_expand_sse_ptest (d, exp, target);
28362 case FLOAT128_FTYPE_FLOAT128:
28363 case FLOAT_FTYPE_FLOAT:
28364 case INT_FTYPE_INT:
28365 case UINT64_FTYPE_INT:
28366 case UINT16_FTYPE_UINT16:
28367 case INT64_FTYPE_INT64:
28368 case INT64_FTYPE_V4SF:
28369 case INT64_FTYPE_V2DF:
28370 case INT_FTYPE_V16QI:
28371 case INT_FTYPE_V8QI:
28372 case INT_FTYPE_V8SF:
28373 case INT_FTYPE_V4DF:
28374 case INT_FTYPE_V4SF:
28375 case INT_FTYPE_V2DF:
28376 case INT_FTYPE_V32QI:
28377 case V16QI_FTYPE_V16QI:
28378 case V8SI_FTYPE_V8SF:
28379 case V8SI_FTYPE_V4SI:
28380 case V8HI_FTYPE_V8HI:
28381 case V8HI_FTYPE_V16QI:
28382 case V8QI_FTYPE_V8QI:
28383 case V8SF_FTYPE_V8SF:
28384 case V8SF_FTYPE_V8SI:
28385 case V8SF_FTYPE_V4SF:
28386 case V8SF_FTYPE_V8HI:
28387 case V4SI_FTYPE_V4SI:
28388 case V4SI_FTYPE_V16QI:
28389 case V4SI_FTYPE_V4SF:
28390 case V4SI_FTYPE_V8SI:
28391 case V4SI_FTYPE_V8HI:
28392 case V4SI_FTYPE_V4DF:
28393 case V4SI_FTYPE_V2DF:
28394 case V4HI_FTYPE_V4HI:
28395 case V4DF_FTYPE_V4DF:
28396 case V4DF_FTYPE_V4SI:
28397 case V4DF_FTYPE_V4SF:
28398 case V4DF_FTYPE_V2DF:
28399 case V4SF_FTYPE_V4SF:
28400 case V4SF_FTYPE_V4SI:
28401 case V4SF_FTYPE_V8SF:
28402 case V4SF_FTYPE_V4DF:
28403 case V4SF_FTYPE_V8HI:
28404 case V4SF_FTYPE_V2DF:
28405 case V2DI_FTYPE_V2DI:
28406 case V2DI_FTYPE_V16QI:
28407 case V2DI_FTYPE_V8HI:
28408 case V2DI_FTYPE_V4SI:
28409 case V2DF_FTYPE_V2DF:
28410 case V2DF_FTYPE_V4SI:
28411 case V2DF_FTYPE_V4DF:
28412 case V2DF_FTYPE_V4SF:
28413 case V2DF_FTYPE_V2SI:
28414 case V2SI_FTYPE_V2SI:
28415 case V2SI_FTYPE_V4SF:
28416 case V2SI_FTYPE_V2SF:
28417 case V2SI_FTYPE_V2DF:
28418 case V2SF_FTYPE_V2SF:
28419 case V2SF_FTYPE_V2SI:
28420 case V32QI_FTYPE_V32QI:
28421 case V32QI_FTYPE_V16QI:
28422 case V16HI_FTYPE_V16HI:
28423 case V16HI_FTYPE_V8HI:
28424 case V8SI_FTYPE_V8SI:
28425 case V16HI_FTYPE_V16QI:
28426 case V8SI_FTYPE_V16QI:
28427 case V4DI_FTYPE_V16QI:
28428 case V8SI_FTYPE_V8HI:
28429 case V4DI_FTYPE_V8HI:
28430 case V4DI_FTYPE_V4SI:
28431 case V4DI_FTYPE_V2DI:
28432 nargs = 1;
28433 break;
28434 case V4SF_FTYPE_V4SF_VEC_MERGE:
28435 case V2DF_FTYPE_V2DF_VEC_MERGE:
28436 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
28437 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
28438 case V16QI_FTYPE_V16QI_V16QI:
28439 case V16QI_FTYPE_V8HI_V8HI:
28440 case V8QI_FTYPE_V8QI_V8QI:
28441 case V8QI_FTYPE_V4HI_V4HI:
28442 case V8HI_FTYPE_V8HI_V8HI:
28443 case V8HI_FTYPE_V16QI_V16QI:
28444 case V8HI_FTYPE_V4SI_V4SI:
28445 case V8SF_FTYPE_V8SF_V8SF:
28446 case V8SF_FTYPE_V8SF_V8SI:
28447 case V4SI_FTYPE_V4SI_V4SI:
28448 case V4SI_FTYPE_V8HI_V8HI:
28449 case V4SI_FTYPE_V4SF_V4SF:
28450 case V4SI_FTYPE_V2DF_V2DF:
28451 case V4HI_FTYPE_V4HI_V4HI:
28452 case V4HI_FTYPE_V8QI_V8QI:
28453 case V4HI_FTYPE_V2SI_V2SI:
28454 case V4DF_FTYPE_V4DF_V4DF:
28455 case V4DF_FTYPE_V4DF_V4DI:
28456 case V4SF_FTYPE_V4SF_V4SF:
28457 case V4SF_FTYPE_V4SF_V4SI:
28458 case V4SF_FTYPE_V4SF_V2SI:
28459 case V4SF_FTYPE_V4SF_V2DF:
28460 case V4SF_FTYPE_V4SF_DI:
28461 case V4SF_FTYPE_V4SF_SI:
28462 case V2DI_FTYPE_V2DI_V2DI:
28463 case V2DI_FTYPE_V16QI_V16QI:
28464 case V2DI_FTYPE_V4SI_V4SI:
28465 case V2DI_FTYPE_V2DI_V16QI:
28466 case V2DI_FTYPE_V2DF_V2DF:
28467 case V2SI_FTYPE_V2SI_V2SI:
28468 case V2SI_FTYPE_V4HI_V4HI:
28469 case V2SI_FTYPE_V2SF_V2SF:
28470 case V2DF_FTYPE_V2DF_V2DF:
28471 case V2DF_FTYPE_V2DF_V4SF:
28472 case V2DF_FTYPE_V2DF_V2DI:
28473 case V2DF_FTYPE_V2DF_DI:
28474 case V2DF_FTYPE_V2DF_SI:
28475 case V2SF_FTYPE_V2SF_V2SF:
28476 case V1DI_FTYPE_V1DI_V1DI:
28477 case V1DI_FTYPE_V8QI_V8QI:
28478 case V1DI_FTYPE_V2SI_V2SI:
28479 case V32QI_FTYPE_V16HI_V16HI:
28480 case V16HI_FTYPE_V8SI_V8SI:
28481 case V32QI_FTYPE_V32QI_V32QI:
28482 case V16HI_FTYPE_V32QI_V32QI:
28483 case V16HI_FTYPE_V16HI_V16HI:
28484 case V8SI_FTYPE_V4DF_V4DF:
28485 case V8SI_FTYPE_V8SI_V8SI:
28486 case V8SI_FTYPE_V16HI_V16HI:
28487 case V4DI_FTYPE_V4DI_V4DI:
28488 case V4DI_FTYPE_V8SI_V8SI:
28489 if (comparison == UNKNOWN)
28490 return ix86_expand_binop_builtin (icode, exp, target);
28491 nargs = 2;
28492 break;
28493 case V4SF_FTYPE_V4SF_V4SF_SWAP:
28494 case V2DF_FTYPE_V2DF_V2DF_SWAP:
28495 gcc_assert (comparison != UNKNOWN);
28496 nargs = 2;
28497 swap = true;
28498 break;
28499 case V16HI_FTYPE_V16HI_V8HI_COUNT:
28500 case V16HI_FTYPE_V16HI_SI_COUNT:
28501 case V8SI_FTYPE_V8SI_V4SI_COUNT:
28502 case V8SI_FTYPE_V8SI_SI_COUNT:
28503 case V4DI_FTYPE_V4DI_V2DI_COUNT:
28504 case V4DI_FTYPE_V4DI_INT_COUNT:
28505 case V8HI_FTYPE_V8HI_V8HI_COUNT:
28506 case V8HI_FTYPE_V8HI_SI_COUNT:
28507 case V4SI_FTYPE_V4SI_V4SI_COUNT:
28508 case V4SI_FTYPE_V4SI_SI_COUNT:
28509 case V4HI_FTYPE_V4HI_V4HI_COUNT:
28510 case V4HI_FTYPE_V4HI_SI_COUNT:
28511 case V2DI_FTYPE_V2DI_V2DI_COUNT:
28512 case V2DI_FTYPE_V2DI_SI_COUNT:
28513 case V2SI_FTYPE_V2SI_V2SI_COUNT:
28514 case V2SI_FTYPE_V2SI_SI_COUNT:
28515 case V1DI_FTYPE_V1DI_V1DI_COUNT:
28516 case V1DI_FTYPE_V1DI_SI_COUNT:
28517 nargs = 2;
28518 last_arg_count = true;
28519 break;
28520 case UINT64_FTYPE_UINT64_UINT64:
28521 case UINT_FTYPE_UINT_UINT:
28522 case UINT_FTYPE_UINT_USHORT:
28523 case UINT_FTYPE_UINT_UCHAR:
28524 case UINT16_FTYPE_UINT16_INT:
28525 case UINT8_FTYPE_UINT8_INT:
28526 nargs = 2;
28527 break;
28528 case V2DI_FTYPE_V2DI_INT_CONVERT:
28529 nargs = 2;
28530 rmode = V1TImode;
28531 nargs_constant = 1;
28532 break;
28533 case V4DI_FTYPE_V4DI_INT_CONVERT:
28534 nargs = 2;
28535 rmode = V2TImode;
28536 nargs_constant = 1;
28537 break;
28538 case V8HI_FTYPE_V8HI_INT:
28539 case V8HI_FTYPE_V8SF_INT:
28540 case V8HI_FTYPE_V4SF_INT:
28541 case V8SF_FTYPE_V8SF_INT:
28542 case V4SI_FTYPE_V4SI_INT:
28543 case V4SI_FTYPE_V8SI_INT:
28544 case V4HI_FTYPE_V4HI_INT:
28545 case V4DF_FTYPE_V4DF_INT:
28546 case V4SF_FTYPE_V4SF_INT:
28547 case V4SF_FTYPE_V8SF_INT:
28548 case V2DI_FTYPE_V2DI_INT:
28549 case V2DF_FTYPE_V2DF_INT:
28550 case V2DF_FTYPE_V4DF_INT:
28551 case V16HI_FTYPE_V16HI_INT:
28552 case V8SI_FTYPE_V8SI_INT:
28553 case V4DI_FTYPE_V4DI_INT:
28554 case V2DI_FTYPE_V4DI_INT:
28555 nargs = 2;
28556 nargs_constant = 1;
28557 break;
28558 case V16QI_FTYPE_V16QI_V16QI_V16QI:
28559 case V8SF_FTYPE_V8SF_V8SF_V8SF:
28560 case V4DF_FTYPE_V4DF_V4DF_V4DF:
28561 case V4SF_FTYPE_V4SF_V4SF_V4SF:
28562 case V2DF_FTYPE_V2DF_V2DF_V2DF:
28563 case V32QI_FTYPE_V32QI_V32QI_V32QI:
28564 nargs = 3;
28565 break;
28566 case V32QI_FTYPE_V32QI_V32QI_INT:
28567 case V16HI_FTYPE_V16HI_V16HI_INT:
28568 case V16QI_FTYPE_V16QI_V16QI_INT:
28569 case V4DI_FTYPE_V4DI_V4DI_INT:
28570 case V8HI_FTYPE_V8HI_V8HI_INT:
28571 case V8SI_FTYPE_V8SI_V8SI_INT:
28572 case V8SI_FTYPE_V8SI_V4SI_INT:
28573 case V8SF_FTYPE_V8SF_V8SF_INT:
28574 case V8SF_FTYPE_V8SF_V4SF_INT:
28575 case V4SI_FTYPE_V4SI_V4SI_INT:
28576 case V4DF_FTYPE_V4DF_V4DF_INT:
28577 case V4DF_FTYPE_V4DF_V2DF_INT:
28578 case V4SF_FTYPE_V4SF_V4SF_INT:
28579 case V2DI_FTYPE_V2DI_V2DI_INT:
28580 case V4DI_FTYPE_V4DI_V2DI_INT:
28581 case V2DF_FTYPE_V2DF_V2DF_INT:
28582 nargs = 3;
28583 nargs_constant = 1;
28584 break;
28585 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
28586 nargs = 3;
28587 rmode = V4DImode;
28588 nargs_constant = 1;
28589 break;
28590 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
28591 nargs = 3;
28592 rmode = V2DImode;
28593 nargs_constant = 1;
28594 break;
28595 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
28596 nargs = 3;
28597 rmode = DImode;
28598 nargs_constant = 1;
28599 break;
28600 case V2DI_FTYPE_V2DI_UINT_UINT:
28601 nargs = 3;
28602 nargs_constant = 2;
28603 break;
28604 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
28605 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
28606 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
28607 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
28608 nargs = 4;
28609 nargs_constant = 1;
28610 break;
28611 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
28612 nargs = 4;
28613 nargs_constant = 2;
28614 break;
28615 default:
28616 gcc_unreachable ();
28617 }
28618
28619 gcc_assert (nargs <= ARRAY_SIZE (args));
28620
28621 if (comparison != UNKNOWN)
28622 {
28623 gcc_assert (nargs == 2);
28624 return ix86_expand_sse_compare (d, exp, target, swap);
28625 }
28626
28627 if (rmode == VOIDmode || rmode == tmode)
28628 {
28629 if (optimize
28630 || target == 0
28631 || GET_MODE (target) != tmode
28632 || !insn_p->operand[0].predicate (target, tmode))
28633 target = gen_reg_rtx (tmode);
28634 real_target = target;
28635 }
28636 else
28637 {
28638 target = gen_reg_rtx (rmode);
28639 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
28640 }
28641
28642 for (i = 0; i < nargs; i++)
28643 {
28644 tree arg = CALL_EXPR_ARG (exp, i);
28645 rtx op = expand_normal (arg);
28646 enum machine_mode mode = insn_p->operand[i + 1].mode;
28647 bool match = insn_p->operand[i + 1].predicate (op, mode);
28648
28649 if (last_arg_count && (i + 1) == nargs)
28650 {
28651 /* SIMD shift insns take either an 8-bit immediate or a
28652 register as the count, but the builtin functions take an int.
28653 If the count doesn't match, put it in a register. */
28654 if (!match)
28655 {
28656 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
28657 if (!insn_p->operand[i + 1].predicate (op, mode))
28658 op = copy_to_reg (op);
28659 }
28660 }
28661 else if ((nargs - i) <= nargs_constant)
28662 {
28663 if (!match)
28664 switch (icode)
28665 {
28666 case CODE_FOR_avx2_inserti128:
28667 case CODE_FOR_avx2_extracti128:
28668 error ("the last argument must be an 1-bit immediate");
28669 return const0_rtx;
28670
28671 case CODE_FOR_sse4_1_roundsd:
28672 case CODE_FOR_sse4_1_roundss:
28673
28674 case CODE_FOR_sse4_1_roundpd:
28675 case CODE_FOR_sse4_1_roundps:
28676 case CODE_FOR_avx_roundpd256:
28677 case CODE_FOR_avx_roundps256:
28678
28679 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
28680 case CODE_FOR_sse4_1_roundps_sfix:
28681 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
28682 case CODE_FOR_avx_roundps_sfix256:
28683
28684 case CODE_FOR_sse4_1_blendps:
28685 case CODE_FOR_avx_blendpd256:
28686 case CODE_FOR_avx_vpermilv4df:
28687 error ("the last argument must be a 4-bit immediate");
28688 return const0_rtx;
28689
28690 case CODE_FOR_sse4_1_blendpd:
28691 case CODE_FOR_avx_vpermilv2df:
28692 case CODE_FOR_xop_vpermil2v2df3:
28693 case CODE_FOR_xop_vpermil2v4sf3:
28694 case CODE_FOR_xop_vpermil2v4df3:
28695 case CODE_FOR_xop_vpermil2v8sf3:
28696 error ("the last argument must be a 2-bit immediate");
28697 return const0_rtx;
28698
28699 case CODE_FOR_avx_vextractf128v4df:
28700 case CODE_FOR_avx_vextractf128v8sf:
28701 case CODE_FOR_avx_vextractf128v8si:
28702 case CODE_FOR_avx_vinsertf128v4df:
28703 case CODE_FOR_avx_vinsertf128v8sf:
28704 case CODE_FOR_avx_vinsertf128v8si:
28705 error ("the last argument must be a 1-bit immediate");
28706 return const0_rtx;
28707
28708 case CODE_FOR_avx_vmcmpv2df3:
28709 case CODE_FOR_avx_vmcmpv4sf3:
28710 case CODE_FOR_avx_cmpv2df3:
28711 case CODE_FOR_avx_cmpv4sf3:
28712 case CODE_FOR_avx_cmpv4df3:
28713 case CODE_FOR_avx_cmpv8sf3:
28714 error ("the last argument must be a 5-bit immediate");
28715 return const0_rtx;
28716
28717 default:
28718 switch (nargs_constant)
28719 {
28720 case 2:
28721 if ((nargs - i) == nargs_constant)
28722 {
28723 error ("the next to last argument must be an 8-bit immediate");
28724 break;
28725 }
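	      /* FALLTHRU */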
28726 case 1:
28727 error ("the last argument must be an 8-bit immediate");
28728 break;
28729 default:
28730 gcc_unreachable ();
28731 }
28732 return const0_rtx;
28733 }
28734 }
28735 else
28736 {
28737 if (VECTOR_MODE_P (mode))
28738 op = safe_vector_operand (op, mode);
28739
28740 /* If we aren't optimizing, only allow one memory operand to
28741 be generated. */
28742 if (memory_operand (op, mode))
28743 num_memory++;
28744
28745 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
28746 {
28747 if (optimize || !match || num_memory > 1)
28748 op = copy_to_mode_reg (mode, op);
28749 }
28750 else
28751 {
28752 op = copy_to_reg (op);
28753 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
28754 }
28755 }
28756
28757 args[i].op = op;
28758 args[i].mode = mode;
28759 }
28760
28761 switch (nargs)
28762 {
28763 case 1:
28764 pat = GEN_FCN (icode) (real_target, args[0].op);
28765 break;
28766 case 2:
28767 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
28768 break;
28769 case 3:
28770 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28771 args[2].op);
28772 break;
28773 case 4:
28774 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
28775 args[2].op, args[3].op);
28776 break;
28777 default:
28778 gcc_unreachable ();
28779 }
28780
28781 if (! pat)
28782 return 0;
28783
28784 emit_insn (pat);
28785 return target;
28786 }
28787
28788 /* Subroutine of ix86_expand_builtin to take care of special insns
28789 with variable number of operands. */
28790
28791 static rtx
28792 ix86_expand_special_args_builtin (const struct builtin_description *d,
28793 tree exp, rtx target)
28794 {
28795 tree arg;
28796 rtx pat, op;
28797 unsigned int i, nargs, arg_adjust, memory;
28798 struct
28799 {
28800 rtx op;
28801 enum machine_mode mode;
28802 } args[3];
28803 enum insn_code icode = d->icode;
28804 bool last_arg_constant = false;
28805 const struct insn_data_d *insn_p = &insn_data[icode];
28806 enum machine_mode tmode = insn_p->operand[0].mode;
28807 enum { load, store } klass;
28808
28809 switch ((enum ix86_builtin_func_type) d->flag)
28810 {
28811 case VOID_FTYPE_VOID:
28812 if (icode == CODE_FOR_avx_vzeroupper)
28813 target = GEN_INT (vzeroupper_intrinsic);
28814 emit_insn (GEN_FCN (icode) (target));
28815 return 0;
28816 case VOID_FTYPE_UINT64:
28817 case VOID_FTYPE_UNSIGNED:
28818 nargs = 0;
28819 klass = store;
28820 memory = 0;
28821 break;
28822 case UINT64_FTYPE_VOID:
28823 case UNSIGNED_FTYPE_VOID:
28824 nargs = 0;
28825 klass = load;
28826 memory = 0;
28827 break;
28828 case UINT64_FTYPE_PUNSIGNED:
28829 case V2DI_FTYPE_PV2DI:
28830 case V4DI_FTYPE_PV4DI:
28831 case V32QI_FTYPE_PCCHAR:
28832 case V16QI_FTYPE_PCCHAR:
28833 case V8SF_FTYPE_PCV4SF:
28834 case V8SF_FTYPE_PCFLOAT:
28835 case V4SF_FTYPE_PCFLOAT:
28836 case V4DF_FTYPE_PCV2DF:
28837 case V4DF_FTYPE_PCDOUBLE:
28838 case V2DF_FTYPE_PCDOUBLE:
28839 case VOID_FTYPE_PVOID:
28840 nargs = 1;
28841 klass = load;
28842 memory = 0;
28843 break;
28844 case VOID_FTYPE_PV2SF_V4SF:
28845 case VOID_FTYPE_PV4DI_V4DI:
28846 case VOID_FTYPE_PV2DI_V2DI:
28847 case VOID_FTYPE_PCHAR_V32QI:
28848 case VOID_FTYPE_PCHAR_V16QI:
28849 case VOID_FTYPE_PFLOAT_V8SF:
28850 case VOID_FTYPE_PFLOAT_V4SF:
28851 case VOID_FTYPE_PDOUBLE_V4DF:
28852 case VOID_FTYPE_PDOUBLE_V2DF:
28853 case VOID_FTYPE_PLONGLONG_LONGLONG:
28854 case VOID_FTYPE_PULONGLONG_ULONGLONG:
28855 case VOID_FTYPE_PINT_INT:
28856 nargs = 1;
28857 klass = store;
28858 /* Reserve memory operand for target. */
28859 memory = ARRAY_SIZE (args);
28860 break;
28861 case V4SF_FTYPE_V4SF_PCV2SF:
28862 case V2DF_FTYPE_V2DF_PCDOUBLE:
28863 nargs = 2;
28864 klass = load;
28865 memory = 1;
28866 break;
28867 case V8SF_FTYPE_PCV8SF_V8SI:
28868 case V4DF_FTYPE_PCV4DF_V4DI:
28869 case V4SF_FTYPE_PCV4SF_V4SI:
28870 case V2DF_FTYPE_PCV2DF_V2DI:
28871 case V8SI_FTYPE_PCV8SI_V8SI:
28872 case V4DI_FTYPE_PCV4DI_V4DI:
28873 case V4SI_FTYPE_PCV4SI_V4SI:
28874 case V2DI_FTYPE_PCV2DI_V2DI:
28875 nargs = 2;
28876 klass = load;
28877 memory = 0;
28878 break;
28879 case VOID_FTYPE_PV8SF_V8SI_V8SF:
28880 case VOID_FTYPE_PV4DF_V4DI_V4DF:
28881 case VOID_FTYPE_PV4SF_V4SI_V4SF:
28882 case VOID_FTYPE_PV2DF_V2DI_V2DF:
28883 case VOID_FTYPE_PV8SI_V8SI_V8SI:
28884 case VOID_FTYPE_PV4DI_V4DI_V4DI:
28885 case VOID_FTYPE_PV4SI_V4SI_V4SI:
28886 case VOID_FTYPE_PV2DI_V2DI_V2DI:
28887 nargs = 2;
28888 klass = store;
28889 /* Reserve memory operand for target. */
28890 memory = ARRAY_SIZE (args);
28891 break;
28892 case VOID_FTYPE_UINT_UINT_UINT:
28893 case VOID_FTYPE_UINT64_UINT_UINT:
28894 case UCHAR_FTYPE_UINT_UINT_UINT:
28895 case UCHAR_FTYPE_UINT64_UINT_UINT:
28896 nargs = 3;
28897 klass = load;
28898 memory = ARRAY_SIZE (args);
28899 last_arg_constant = true;
28900 break;
28901 default:
28902 gcc_unreachable ();
28903 }
28904
28905 gcc_assert (nargs <= ARRAY_SIZE (args));
28906
28907 if (klass == store)
28908 {
28909 arg = CALL_EXPR_ARG (exp, 0);
28910 op = expand_normal (arg);
28911 gcc_assert (target == 0);
28912 if (memory)
28913 {
28914 if (GET_MODE (op) != Pmode)
28915 op = convert_to_mode (Pmode, op, 1);
28916 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
28917 }
28918 else
28919 target = force_reg (tmode, op);
28920 arg_adjust = 1;
28921 }
28922 else
28923 {
28924 arg_adjust = 0;
28925 if (optimize
28926 || target == 0
28927 || GET_MODE (target) != tmode
28928 || !insn_p->operand[0].predicate (target, tmode))
28929 target = gen_reg_rtx (tmode);
28930 }
28931
28932 for (i = 0; i < nargs; i++)
28933 {
28934 enum machine_mode mode = insn_p->operand[i + 1].mode;
28935 bool match;
28936
28937 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
28938 op = expand_normal (arg);
28939 match = insn_p->operand[i + 1].predicate (op, mode);
28940
28941 if (last_arg_constant && (i + 1) == nargs)
28942 {
28943 if (!match)
28944 {
28945 if (icode == CODE_FOR_lwp_lwpvalsi3
28946 || icode == CODE_FOR_lwp_lwpinssi3
28947 || icode == CODE_FOR_lwp_lwpvaldi3
28948 || icode == CODE_FOR_lwp_lwpinsdi3)
28949 error ("the last argument must be a 32-bit immediate");
28950 else
28951 error ("the last argument must be an 8-bit immediate");
28952 return const0_rtx;
28953 }
28954 }
28955 else
28956 {
28957 if (i == memory)
28958 {
28959 /* This must be the memory operand. */
28960 if (GET_MODE (op) != Pmode)
28961 op = convert_to_mode (Pmode, op, 1);
28962 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
28963 gcc_assert (GET_MODE (op) == mode
28964 || GET_MODE (op) == VOIDmode);
28965 }
28966 else
28967 {
28968 /* This must be a register. */
28969 if (VECTOR_MODE_P (mode))
28970 op = safe_vector_operand (op, mode);
28971
28972 gcc_assert (GET_MODE (op) == mode
28973 || GET_MODE (op) == VOIDmode);
28974 op = copy_to_mode_reg (mode, op);
28975 }
28976 }
28977
28978 args[i].op = op;
28979 args[i].mode = mode;
28980 }
28981
28982 switch (nargs)
28983 {
28984 case 0:
28985 pat = GEN_FCN (icode) (target);
28986 break;
28987 case 1:
28988 pat = GEN_FCN (icode) (target, args[0].op);
28989 break;
28990 case 2:
28991 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
28992 break;
28993 case 3:
28994 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
28995 break;
28996 default:
28997 gcc_unreachable ();
28998 }
28999
29000 if (! pat)
29001 return 0;
29002 emit_insn (pat);
29003 return klass == store ? 0 : target;
29004 }
29005
29006 /* Return the integer constant in ARG. Constrain it to be in the range
29007 of the subparts of VEC_TYPE; issue an error if not. */
29008
29009 static int
29010 get_element_number (tree vec_type, tree arg)
29011 {
29012 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
29013
29014 if (!host_integerp (arg, 1)
29015 || (elt = tree_low_cst (arg, 1), elt > max))
29016 {
29017 error ("selector must be an integer constant in the range 0..%wi", max);
29018 return 0;
29019 }
29020
29021 return elt;
29022 }
29023
29024 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29025 ix86_expand_vector_init. We DO have language-level syntax for this, in
29026 the form of (type){ init-list }. Except that since we can't place emms
29027 instructions from inside the compiler, we can't allow the use of MMX
29028 registers unless the user explicitly asks for it. So we do *not* define
29029 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
29030 we have builtins invoked by mmintrin.h that give us license to emit
29031 these sorts of instructions. */
29032
29033 static rtx
29034 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
29035 {
29036 enum machine_mode tmode = TYPE_MODE (type);
29037 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
29038 int i, n_elt = GET_MODE_NUNITS (tmode);
29039 rtvec v = rtvec_alloc (n_elt);
29040
29041 gcc_assert (VECTOR_MODE_P (tmode));
29042 gcc_assert (call_expr_nargs (exp) == n_elt);
29043
29044 for (i = 0; i < n_elt; ++i)
29045 {
29046 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
29047 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
29048 }
29049
29050 if (!target || !register_operand (target, tmode))
29051 target = gen_reg_rtx (tmode);
29052
29053 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
29054 return target;
29055 }
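
/* Illustrative sketch (assumed header-level usage, not part of this
   file): the vec_init builtins expanded above are how mmintrin.h
   builds MMX values without mmx.md exposing vec_init patterns:

     #include <mmintrin.h>

     __m64
     pair (int hi, int lo)
     {
       return _mm_set_pi32 (hi, lo);
     }

   _mm_set_pi32 is assumed to call __builtin_ia32_vec_init_v2si,
   which reaches this function via IX86_BUILTIN_VEC_INIT_V2SI.  */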
29056
29057 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29058 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
29059 had a language-level syntax for referencing vector elements. */
29060
29061 static rtx
29062 ix86_expand_vec_ext_builtin (tree exp, rtx target)
29063 {
29064 enum machine_mode tmode, mode0;
29065 tree arg0, arg1;
29066 int elt;
29067 rtx op0;
29068
29069 arg0 = CALL_EXPR_ARG (exp, 0);
29070 arg1 = CALL_EXPR_ARG (exp, 1);
29071
29072 op0 = expand_normal (arg0);
29073 elt = get_element_number (TREE_TYPE (arg0), arg1);
29074
29075 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29076 mode0 = TYPE_MODE (TREE_TYPE (arg0));
29077 gcc_assert (VECTOR_MODE_P (mode0));
29078
29079 op0 = force_reg (mode0, op0);
29080
29081 if (optimize || !target || !register_operand (target, tmode))
29082 target = gen_reg_rtx (tmode);
29083
29084 ix86_expand_vector_extract (true, target, op0, elt);
29085
29086 return target;
29087 }
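
/* Illustrative sketch (assumed header-level usage, not part of this
   file): the vec_ext builtins expanded above back element-extract
   intrinsics such as _mm_extract_epi16:

     #include <emmintrin.h>

     int
     third_halfword (__m128i v)
     {
       return _mm_extract_epi16 (v, 2);
     }

   _mm_extract_epi16 is assumed to map to __builtin_ia32_vec_ext_v8hi;
   the element selector must be a constant in range, as enforced by
   get_element_number above.  */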
29088
29089 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
29090 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
29091 a language-level syntax for referencing vector elements. */
29092
29093 static rtx
29094 ix86_expand_vec_set_builtin (tree exp)
29095 {
29096 enum machine_mode tmode, mode1;
29097 tree arg0, arg1, arg2;
29098 int elt;
29099 rtx op0, op1, target;
29100
29101 arg0 = CALL_EXPR_ARG (exp, 0);
29102 arg1 = CALL_EXPR_ARG (exp, 1);
29103 arg2 = CALL_EXPR_ARG (exp, 2);
29104
29105 tmode = TYPE_MODE (TREE_TYPE (arg0));
29106 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
29107 gcc_assert (VECTOR_MODE_P (tmode));
29108
29109 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
29110 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
29111 elt = get_element_number (TREE_TYPE (arg0), arg2);
29112
29113 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
29114 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
29115
29116 op0 = force_reg (tmode, op0);
29117 op1 = force_reg (mode1, op1);
29118
29119 /* OP0 is the source of these builtin functions and shouldn't be
29120 modified. Create a copy, use it and return it as target. */
29121 target = gen_reg_rtx (tmode);
29122 emit_move_insn (target, op0);
29123 ix86_expand_vector_set (true, target, op1, elt);
29124
29125 return target;
29126 }
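
/* Illustrative sketch (assumed header-level usage, not part of this
   file): the vec_set builtins expanded above back element-insert
   intrinsics such as _mm_insert_epi16:

     #include <emmintrin.h>

     __m128i
     set_third_halfword (__m128i v, int x)
     {
       return _mm_insert_epi16 (v, x, 2);
     }

   _mm_insert_epi16 is assumed to map to __builtin_ia32_vec_set_v8hi;
   note the expander copies OP0 into a fresh register so the source
   vector itself is never modified.  */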
29127
29128 /* Expand an expression EXP that calls a built-in function,
29129 with result going to TARGET if that's convenient
29130 (and in mode MODE if that's convenient).
29131 SUBTARGET may be used as the target for computing one of EXP's operands.
29132 IGNORE is nonzero if the value is to be ignored. */
29133
29134 static rtx
29135 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
29136 enum machine_mode mode ATTRIBUTE_UNUSED,
29137 int ignore ATTRIBUTE_UNUSED)
29138 {
29139 const struct builtin_description *d;
29140 size_t i;
29141 enum insn_code icode;
29142 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
29143 tree arg0, arg1, arg2, arg3, arg4;
29144 rtx op0, op1, op2, op3, op4, pat;
29145 enum machine_mode mode0, mode1, mode2, mode3, mode4;
29146 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
29147
29148 /* Determine whether the builtin function is available under the current ISA.
29149 Originally the builtin was not created if it wasn't applicable to the
29150 current ISA based on the command-line switches. With function-specific
29151 options, we need to check in the context of the function making the call
29152 whether it is supported. */
29153 if (ix86_builtins_isa[fcode].isa
29154 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
29155 {
29156 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
29157 NULL, (enum fpmath_unit) 0, false);
29158
29159 if (!opts)
29160 error ("%qE needs unknown isa option", fndecl);
29161 else
29162 {
29163 gcc_assert (opts != NULL);
29164 error ("%qE needs isa option %s", fndecl, opts);
29165 free (opts);
29166 }
29167 return const0_rtx;
29168 }
29169
29170 switch (fcode)
29171 {
29172 case IX86_BUILTIN_MASKMOVQ:
29173 case IX86_BUILTIN_MASKMOVDQU:
29174 icode = (fcode == IX86_BUILTIN_MASKMOVQ
29175 ? CODE_FOR_mmx_maskmovq
29176 : CODE_FOR_sse2_maskmovdqu);
29177 /* Note the arg order is different from the operand order. */
29178 arg1 = CALL_EXPR_ARG (exp, 0);
29179 arg2 = CALL_EXPR_ARG (exp, 1);
29180 arg0 = CALL_EXPR_ARG (exp, 2);
29181 op0 = expand_normal (arg0);
29182 op1 = expand_normal (arg1);
29183 op2 = expand_normal (arg2);
29184 mode0 = insn_data[icode].operand[0].mode;
29185 mode1 = insn_data[icode].operand[1].mode;
29186 mode2 = insn_data[icode].operand[2].mode;
29187
29188 if (GET_MODE (op0) != Pmode)
29189 op0 = convert_to_mode (Pmode, op0, 1);
29190 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
29191
29192 if (!insn_data[icode].operand[0].predicate (op0, mode0))
29193 op0 = copy_to_mode_reg (mode0, op0);
29194 if (!insn_data[icode].operand[1].predicate (op1, mode1))
29195 op1 = copy_to_mode_reg (mode1, op1);
29196 if (!insn_data[icode].operand[2].predicate (op2, mode2))
29197 op2 = copy_to_mode_reg (mode2, op2);
29198 pat = GEN_FCN (icode) (op0, op1, op2);
29199 if (! pat)
29200 return 0;
29201 emit_insn (pat);
29202 return 0;
29203
29204 case IX86_BUILTIN_LDMXCSR:
29205 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
29206 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29207 emit_move_insn (target, op0);
29208 emit_insn (gen_sse_ldmxcsr (target));
29209 return 0;
29210
29211 case IX86_BUILTIN_STMXCSR:
29212 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
29213 emit_insn (gen_sse_stmxcsr (target));
29214 return copy_to_mode_reg (SImode, target);
29215
29216 case IX86_BUILTIN_CLFLUSH:
29217 arg0 = CALL_EXPR_ARG (exp, 0);
29218 op0 = expand_normal (arg0);
29219 icode = CODE_FOR_sse2_clflush;
29220 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29221 {
29222 if (GET_MODE (op0) != Pmode)
29223 op0 = convert_to_mode (Pmode, op0, 1);
29224 op0 = force_reg (Pmode, op0);
29225 }
29226
29227 emit_insn (gen_sse2_clflush (op0));
29228 return 0;
29229
29230 case IX86_BUILTIN_MONITOR:
29231 arg0 = CALL_EXPR_ARG (exp, 0);
29232 arg1 = CALL_EXPR_ARG (exp, 1);
29233 arg2 = CALL_EXPR_ARG (exp, 2);
29234 op0 = expand_normal (arg0);
29235 op1 = expand_normal (arg1);
29236 op2 = expand_normal (arg2);
29237 if (!REG_P (op0))
29238 {
29239 if (GET_MODE (op0) != Pmode)
29240 op0 = convert_to_mode (Pmode, op0, 1);
29241 op0 = force_reg (Pmode, op0);
29242 }
29243 if (!REG_P (op1))
29244 op1 = copy_to_mode_reg (SImode, op1);
29245 if (!REG_P (op2))
29246 op2 = copy_to_mode_reg (SImode, op2);
29247 emit_insn (ix86_gen_monitor (op0, op1, op2));
29248 return 0;
29249
29250 case IX86_BUILTIN_MWAIT:
29251 arg0 = CALL_EXPR_ARG (exp, 0);
29252 arg1 = CALL_EXPR_ARG (exp, 1);
29253 op0 = expand_normal (arg0);
29254 op1 = expand_normal (arg1);
29255 if (!REG_P (op0))
29256 op0 = copy_to_mode_reg (SImode, op0);
29257 if (!REG_P (op1))
29258 op1 = copy_to_mode_reg (SImode, op1);
29259 emit_insn (gen_sse3_mwait (op0, op1));
29260 return 0;
29261
29262 case IX86_BUILTIN_VEC_INIT_V2SI:
29263 case IX86_BUILTIN_VEC_INIT_V4HI:
29264 case IX86_BUILTIN_VEC_INIT_V8QI:
29265 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
29266
29267 case IX86_BUILTIN_VEC_EXT_V2DF:
29268 case IX86_BUILTIN_VEC_EXT_V2DI:
29269 case IX86_BUILTIN_VEC_EXT_V4SF:
29270 case IX86_BUILTIN_VEC_EXT_V4SI:
29271 case IX86_BUILTIN_VEC_EXT_V8HI:
29272 case IX86_BUILTIN_VEC_EXT_V2SI:
29273 case IX86_BUILTIN_VEC_EXT_V4HI:
29274 case IX86_BUILTIN_VEC_EXT_V16QI:
29275 return ix86_expand_vec_ext_builtin (exp, target);
29276
29277 case IX86_BUILTIN_VEC_SET_V2DI:
29278 case IX86_BUILTIN_VEC_SET_V4SF:
29279 case IX86_BUILTIN_VEC_SET_V4SI:
29280 case IX86_BUILTIN_VEC_SET_V8HI:
29281 case IX86_BUILTIN_VEC_SET_V4HI:
29282 case IX86_BUILTIN_VEC_SET_V16QI:
29283 return ix86_expand_vec_set_builtin (exp);
29284
29285 case IX86_BUILTIN_INFQ:
29286 case IX86_BUILTIN_HUGE_VALQ:
29287 {
29288 REAL_VALUE_TYPE inf;
29289 rtx tmp;
29290
29291 real_inf (&inf);
29292 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
29293
29294 tmp = validize_mem (force_const_mem (mode, tmp));
29295
29296 if (target == 0)
29297 target = gen_reg_rtx (mode);
29298
29299 emit_move_insn (target, tmp);
29300 return target;
29301 }
29302
29303 case IX86_BUILTIN_LLWPCB:
29304 arg0 = CALL_EXPR_ARG (exp, 0);
29305 op0 = expand_normal (arg0);
29306 icode = CODE_FOR_lwp_llwpcb;
29307 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
29308 {
29309 if (GET_MODE (op0) != Pmode)
29310 op0 = convert_to_mode (Pmode, op0, 1);
29311 op0 = force_reg (Pmode, op0);
29312 }
29313 emit_insn (gen_lwp_llwpcb (op0));
29314 return 0;
29315
29316 case IX86_BUILTIN_SLWPCB:
29317 icode = CODE_FOR_lwp_slwpcb;
29318 if (!target
29319 || !insn_data[icode].operand[0].predicate (target, Pmode))
29320 target = gen_reg_rtx (Pmode);
29321 emit_insn (gen_lwp_slwpcb (target));
29322 return target;
29323
29324 case IX86_BUILTIN_BEXTRI32:
29325 case IX86_BUILTIN_BEXTRI64:
29326 arg0 = CALL_EXPR_ARG (exp, 0);
29327 arg1 = CALL_EXPR_ARG (exp, 1);
29328 op0 = expand_normal (arg0);
29329 op1 = expand_normal (arg1);
29330 icode = (fcode == IX86_BUILTIN_BEXTRI32
29331 ? CODE_FOR_tbm_bextri_si
29332 : CODE_FOR_tbm_bextri_di);
29333 if (!CONST_INT_P (op1))
29334 {
29335 error ("last argument must be an immediate");
29336 return const0_rtx;
29337 }
29338 else
29339 {
29340 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
29341 unsigned char lsb_index = INTVAL (op1) & 0xFF;
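  /* The control word packs the field length in bits 15:8 and the start bit
     in bits 7:0; e.g. an immediate of 0x0804 would select an 8-bit field
     starting at bit 4 (illustrative value).  */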
29342 op1 = GEN_INT (length);
29343 op2 = GEN_INT (lsb_index);
29344 pat = GEN_FCN (icode) (target, op0, op1, op2);
29345 if (pat)
29346 emit_insn (pat);
29347 return target;
29348 }
29349
29350 case IX86_BUILTIN_RDRAND16_STEP:
29351 icode = CODE_FOR_rdrandhi_1;
29352 mode0 = HImode;
29353 goto rdrand_step;
29354
29355 case IX86_BUILTIN_RDRAND32_STEP:
29356 icode = CODE_FOR_rdrandsi_1;
29357 mode0 = SImode;
29358 goto rdrand_step;
29359
29360 case IX86_BUILTIN_RDRAND64_STEP:
29361 icode = CODE_FOR_rdranddi_1;
29362 mode0 = DImode;
29363
29364 rdrand_step:
29365 op0 = gen_reg_rtx (mode0);
29366 emit_insn (GEN_FCN (icode) (op0));
29367
29368 arg0 = CALL_EXPR_ARG (exp, 0);
29369 op1 = expand_normal (arg0);
29370 if (!address_operand (op1, VOIDmode))
29371 {
29372 op1 = convert_memory_address (Pmode, op1);
29373 op1 = copy_addr_to_reg (op1);
29374 }
29375 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
29376
29377 op1 = gen_reg_rtx (SImode);
29378 emit_move_insn (op1, CONST1_RTX (SImode));
29379
29380 /* Emit SImode conditional move. */
29381 if (mode0 == HImode)
29382 {
29383 op2 = gen_reg_rtx (SImode);
29384 emit_insn (gen_zero_extendhisi2 (op2, op0));
29385 }
29386 else if (mode0 == SImode)
29387 op2 = op0;
29388 else
29389 op2 = gen_rtx_SUBREG (SImode, op0, 0);
29390
29391 if (target == 0)
29392 target = gen_reg_rtx (SImode);
29393
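  /* rdrand clears CF and (per the ISA) zeroes its destination on failure,
     so selecting the zero-extended result when the carry is clear and the
     constant 1 otherwise yields 0 on failure and 1 on success without a
     separate setcc.  */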
29394 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
29395 const0_rtx);
29396 emit_insn (gen_rtx_SET (VOIDmode, target,
29397 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
29398 return target;
29399
29400 case IX86_BUILTIN_GATHERSIV2DF:
29401 icode = CODE_FOR_avx2_gathersiv2df;
29402 goto gather_gen;
29403 case IX86_BUILTIN_GATHERSIV4DF:
29404 icode = CODE_FOR_avx2_gathersiv4df;
29405 goto gather_gen;
29406 case IX86_BUILTIN_GATHERDIV2DF:
29407 icode = CODE_FOR_avx2_gatherdiv2df;
29408 goto gather_gen;
29409 case IX86_BUILTIN_GATHERDIV4DF:
29410 icode = CODE_FOR_avx2_gatherdiv4df;
29411 goto gather_gen;
29412 case IX86_BUILTIN_GATHERSIV4SF:
29413 icode = CODE_FOR_avx2_gathersiv4sf;
29414 goto gather_gen;
29415 case IX86_BUILTIN_GATHERSIV8SF:
29416 icode = CODE_FOR_avx2_gathersiv8sf;
29417 goto gather_gen;
29418 case IX86_BUILTIN_GATHERDIV4SF:
29419 icode = CODE_FOR_avx2_gatherdiv4sf;
29420 goto gather_gen;
29421 case IX86_BUILTIN_GATHERDIV8SF:
29422 icode = CODE_FOR_avx2_gatherdiv8sf;
29423 goto gather_gen;
29424 case IX86_BUILTIN_GATHERSIV2DI:
29425 icode = CODE_FOR_avx2_gathersiv2di;
29426 goto gather_gen;
29427 case IX86_BUILTIN_GATHERSIV4DI:
29428 icode = CODE_FOR_avx2_gathersiv4di;
29429 goto gather_gen;
29430 case IX86_BUILTIN_GATHERDIV2DI:
29431 icode = CODE_FOR_avx2_gatherdiv2di;
29432 goto gather_gen;
29433 case IX86_BUILTIN_GATHERDIV4DI:
29434 icode = CODE_FOR_avx2_gatherdiv4di;
29435 goto gather_gen;
29436 case IX86_BUILTIN_GATHERSIV4SI:
29437 icode = CODE_FOR_avx2_gathersiv4si;
29438 goto gather_gen;
29439 case IX86_BUILTIN_GATHERSIV8SI:
29440 icode = CODE_FOR_avx2_gathersiv8si;
29441 goto gather_gen;
29442 case IX86_BUILTIN_GATHERDIV4SI:
29443 icode = CODE_FOR_avx2_gatherdiv4si;
29444 goto gather_gen;
29445 case IX86_BUILTIN_GATHERDIV8SI:
29446 icode = CODE_FOR_avx2_gatherdiv8si;
29447 goto gather_gen;
29448 case IX86_BUILTIN_GATHERALTSIV4DF:
29449 icode = CODE_FOR_avx2_gathersiv4df;
29450 goto gather_gen;
29451 case IX86_BUILTIN_GATHERALTDIV8SF:
29452 icode = CODE_FOR_avx2_gatherdiv8sf;
29453 goto gather_gen;
29454 case IX86_BUILTIN_GATHERALTSIV4DI:
29455 icode = CODE_FOR_avx2_gathersiv4di;
29456 goto gather_gen;
29457 case IX86_BUILTIN_GATHERALTDIV8SI:
29458 icode = CODE_FOR_avx2_gatherdiv8si;
29459 goto gather_gen;
29460
29461 gather_gen:
29462 arg0 = CALL_EXPR_ARG (exp, 0);
29463 arg1 = CALL_EXPR_ARG (exp, 1);
29464 arg2 = CALL_EXPR_ARG (exp, 2);
29465 arg3 = CALL_EXPR_ARG (exp, 3);
29466 arg4 = CALL_EXPR_ARG (exp, 4);
29467 op0 = expand_normal (arg0);
29468 op1 = expand_normal (arg1);
29469 op2 = expand_normal (arg2);
29470 op3 = expand_normal (arg3);
29471 op4 = expand_normal (arg4);
29472 /* Note the arg order is different from the operand order. */
29473 mode0 = insn_data[icode].operand[1].mode;
29474 mode2 = insn_data[icode].operand[3].mode;
29475 mode3 = insn_data[icode].operand[4].mode;
29476 mode4 = insn_data[icode].operand[5].mode;
29477
29478 if (target == NULL_RTX
29479 || GET_MODE (target) != insn_data[icode].operand[0].mode)
29480 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
29481 else
29482 subtarget = target;
29483
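  /* The GATHERALT siv4df/siv4di builtins take a V8SI index vector although
     the underlying insn only uses four indices, so narrow the index operand
     to its low half first.  */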
29484 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
29485 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
29486 {
29487 rtx half = gen_reg_rtx (V4SImode);
29488 if (!nonimmediate_operand (op2, V8SImode))
29489 op2 = copy_to_mode_reg (V8SImode, op2);
29490 emit_insn (gen_vec_extract_lo_v8si (half, op2));
29491 op2 = half;
29492 }
29493 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
29494 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
29495 {
29496 rtx (*gen) (rtx, rtx);
29497 rtx half = gen_reg_rtx (mode0);
29498 if (mode0 == V4SFmode)
29499 gen = gen_vec_extract_lo_v8sf;
29500 else
29501 gen = gen_vec_extract_lo_v8si;
29502 if (!nonimmediate_operand (op0, GET_MODE (op0)))
29503 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
29504 emit_insn (gen (half, op0));
29505 op0 = half;
29506 if (!nonimmediate_operand (op3, GET_MODE (op3)))
29507 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
29508 emit_insn (gen (half, op3));
29509 op3 = half;
29510 }
29511
29512 /* Force the memory operand to use only a base register here; we
29513 don't want to do this for the memory operands of other builtin
29514 functions. */
29515 if (GET_MODE (op1) != Pmode)
29516 op1 = convert_to_mode (Pmode, op1, 1);
29517 op1 = force_reg (Pmode, op1);
29518
29519 if (!insn_data[icode].operand[1].predicate (op0, mode0))
29520 op0 = copy_to_mode_reg (mode0, op0);
29521 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
29522 op1 = copy_to_mode_reg (Pmode, op1);
29523 if (!insn_data[icode].operand[3].predicate (op2, mode2))
29524 op2 = copy_to_mode_reg (mode2, op2);
29525 if (!insn_data[icode].operand[4].predicate (op3, mode3))
29526 op3 = copy_to_mode_reg (mode3, op3);
29527 if (!insn_data[icode].operand[5].predicate (op4, mode4))
29528 {
29529 error ("last argument must be scale 1, 2, 4, 8");
29530 return const0_rtx;
29531 }
29532
29533 /* Optimize. If mask is known to have all high bits set,
29534 replace op0 with pc_rtx to signal that the instruction
29535 overwrites the whole destination and doesn't use its
29536 previous contents. */
29537 if (optimize)
29538 {
29539 if (TREE_CODE (arg3) == VECTOR_CST)
29540 {
29541 tree elt;
29542 unsigned int negative = 0;
29543 for (elt = TREE_VECTOR_CST_ELTS (arg3);
29544 elt; elt = TREE_CHAIN (elt))
29545 {
29546 tree cst = TREE_VALUE (elt);
29547 if (TREE_CODE (cst) == INTEGER_CST
29548 && tree_int_cst_sign_bit (cst))
29549 negative++;
29550 else if (TREE_CODE (cst) == REAL_CST
29551 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
29552 negative++;
29553 }
29554 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
29555 op0 = pc_rtx;
29556 }
29557 else if (TREE_CODE (arg3) == SSA_NAME)
29558 {
29559 /* Recognize also when mask is like:
29560 __v2df src = _mm_setzero_pd ();
29561 __v2df mask = _mm_cmpeq_pd (src, src);
29562 or
29563 __v8sf src = _mm256_setzero_ps ();
29564 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
29565 as that is a cheaper way to load all ones into
29566 a register than having to load a constant from
29567 memory. */
29568 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
29569 if (is_gimple_call (def_stmt))
29570 {
29571 tree fndecl = gimple_call_fndecl (def_stmt);
29572 if (fndecl
29573 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29574 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
29575 {
29576 case IX86_BUILTIN_CMPPD:
29577 case IX86_BUILTIN_CMPPS:
29578 case IX86_BUILTIN_CMPPD256:
29579 case IX86_BUILTIN_CMPPS256:
29580 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
29581 break;
29582 /* FALLTHRU */
29583 case IX86_BUILTIN_CMPEQPD:
29584 case IX86_BUILTIN_CMPEQPS:
29585 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
29586 && initializer_zerop (gimple_call_arg (def_stmt,
29587 1)))
29588 op0 = pc_rtx;
29589 break;
29590 default:
29591 break;
29592 }
29593 }
29594 }
29595 }
29596
29597 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
29598 if (! pat)
29599 return const0_rtx;
29600 emit_insn (pat);
29601
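  /* With 64-bit indices only four elements are gathered, so for the div8sf
     and div8si builtins only the low half of the 256-bit destination is
     meaningful; extract it as the result.  */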
29602 if (fcode == IX86_BUILTIN_GATHERDIV8SF
29603 || fcode == IX86_BUILTIN_GATHERDIV8SI)
29604 {
29605 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
29606 ? V4SFmode : V4SImode;
29607 if (target == NULL_RTX)
29608 target = gen_reg_rtx (tmode);
29609 if (tmode == V4SFmode)
29610 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
29611 else
29612 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
29613 }
29614 else
29615 target = subtarget;
29616
29617 return target;
29618
29619 default:
29620 break;
29621 }
29622
29623 for (i = 0, d = bdesc_special_args;
29624 i < ARRAY_SIZE (bdesc_special_args);
29625 i++, d++)
29626 if (d->code == fcode)
29627 return ix86_expand_special_args_builtin (d, exp, target);
29628
29629 for (i = 0, d = bdesc_args;
29630 i < ARRAY_SIZE (bdesc_args);
29631 i++, d++)
29632 if (d->code == fcode)
29633 switch (fcode)
29634 {
29635 case IX86_BUILTIN_FABSQ:
29636 case IX86_BUILTIN_COPYSIGNQ:
29637 if (!TARGET_SSE2)
29638 /* Emit a normal call if SSE2 isn't available. */
29639 return expand_call (exp, target, ignore);
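	  /* FALLTHRU */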
29640 default:
29641 return ix86_expand_args_builtin (d, exp, target);
29642 }
29643
29644 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29645 if (d->code == fcode)
29646 return ix86_expand_sse_comi (d, exp, target);
29647
29648 for (i = 0, d = bdesc_pcmpestr;
29649 i < ARRAY_SIZE (bdesc_pcmpestr);
29650 i++, d++)
29651 if (d->code == fcode)
29652 return ix86_expand_sse_pcmpestr (d, exp, target);
29653
29654 for (i = 0, d = bdesc_pcmpistr;
29655 i < ARRAY_SIZE (bdesc_pcmpistr);
29656 i++, d++)
29657 if (d->code == fcode)
29658 return ix86_expand_sse_pcmpistr (d, exp, target);
29659
29660 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29661 if (d->code == fcode)
29662 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
29663 (enum ix86_builtin_func_type)
29664 d->flag, d->comparison);
29665
29666 gcc_unreachable ();
29667 }
29668
29669 /* Returns a function decl for a vectorized version of the builtin function
29670 FNDECL with the result vector type TYPE_OUT and the argument vector type
29671 TYPE_IN, or NULL_TREE if it is not available. */
29672
29673 static tree
29674 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
29675 tree type_in)
29676 {
29677 enum machine_mode in_mode, out_mode;
29678 int in_n, out_n;
29679 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
29680
29681 if (TREE_CODE (type_out) != VECTOR_TYPE
29682 || TREE_CODE (type_in) != VECTOR_TYPE
29683 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
29684 return NULL_TREE;
29685
29686 out_mode = TYPE_MODE (TREE_TYPE (type_out));
29687 out_n = TYPE_VECTOR_SUBPARTS (type_out);
29688 in_mode = TYPE_MODE (TREE_TYPE (type_in));
29689 in_n = TYPE_VECTOR_SUBPARTS (type_in);
29690
29691 switch (fn)
29692 {
29693 case BUILT_IN_SQRT:
29694 if (out_mode == DFmode && in_mode == DFmode)
29695 {
29696 if (out_n == 2 && in_n == 2)
29697 return ix86_builtins[IX86_BUILTIN_SQRTPD];
29698 else if (out_n == 4 && in_n == 4)
29699 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
29700 }
29701 break;
29702
29703 case BUILT_IN_SQRTF:
29704 if (out_mode == SFmode && in_mode == SFmode)
29705 {
29706 if (out_n == 4 && in_n == 4)
29707 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
29708 else if (out_n == 8 && in_n == 8)
29709 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
29710 }
29711 break;
29712
29713 case BUILT_IN_IFLOOR:
29714 case BUILT_IN_LFLOOR:
29715 case BUILT_IN_LLFLOOR:
29716 /* The round insn does not trap on denormals. */
29717 if (flag_trapping_math || !TARGET_ROUND)
29718 break;
29719
29720 if (out_mode == SImode && in_mode == DFmode)
29721 {
29722 if (out_n == 4 && in_n == 2)
29723 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
29724 else if (out_n == 8 && in_n == 4)
29725 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
29726 }
29727 break;
29728
29729 case BUILT_IN_IFLOORF:
29730 case BUILT_IN_LFLOORF:
29731 case BUILT_IN_LLFLOORF:
29732 /* The round insn does not trap on denormals. */
29733 if (flag_trapping_math || !TARGET_ROUND)
29734 break;
29735
29736 if (out_mode == SImode && in_mode == SFmode)
29737 {
29738 if (out_n == 4 && in_n == 4)
29739 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
29740 else if (out_n == 8 && in_n == 8)
29741 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
29742 }
29743 break;
29744
29745 case BUILT_IN_ICEIL:
29746 case BUILT_IN_LCEIL:
29747 case BUILT_IN_LLCEIL:
29748 /* The round insn does not trap on denormals. */
29749 if (flag_trapping_math || !TARGET_ROUND)
29750 break;
29751
29752 if (out_mode == SImode && in_mode == DFmode)
29753 {
29754 if (out_n == 4 && in_n == 2)
29755 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
29756 else if (out_n == 8 && in_n == 4)
29757 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
29758 }
29759 break;
29760
29761 case BUILT_IN_ICEILF:
29762 case BUILT_IN_LCEILF:
29763 case BUILT_IN_LLCEILF:
29764 /* The round insn does not trap on denormals. */
29765 if (flag_trapping_math || !TARGET_ROUND)
29766 break;
29767
29768 if (out_mode == SImode && in_mode == SFmode)
29769 {
29770 if (out_n == 4 && in_n == 4)
29771 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
29772 else if (out_n == 8 && in_n == 8)
29773 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
29774 }
29775 break;
29776
29777 case BUILT_IN_IRINT:
29778 case BUILT_IN_LRINT:
29779 case BUILT_IN_LLRINT:
29780 if (out_mode == SImode && in_mode == DFmode)
29781 {
29782 if (out_n == 4 && in_n == 2)
29783 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
29784 else if (out_n == 8 && in_n == 4)
29785 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
29786 }
29787 break;
29788
29789 case BUILT_IN_IRINTF:
29790 case BUILT_IN_LRINTF:
29791 case BUILT_IN_LLRINTF:
29792 if (out_mode == SImode && in_mode == SFmode)
29793 {
29794 if (out_n == 4 && in_n == 4)
29795 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
29796 else if (out_n == 8 && in_n == 8)
29797 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
29798 }
29799 break;
29800
29801 case BUILT_IN_IROUND:
29802 case BUILT_IN_LROUND:
29803 case BUILT_IN_LLROUND:
29804 /* The round insn does not trap on denormals. */
29805 if (flag_trapping_math || !TARGET_ROUND)
29806 break;
29807
29808 if (out_mode == SImode && in_mode == DFmode)
29809 {
29810 if (out_n == 4 && in_n == 2)
29811 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
29812 else if (out_n == 8 && in_n == 4)
29813 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
29814 }
29815 break;
29816
29817 case BUILT_IN_IROUNDF:
29818 case BUILT_IN_LROUNDF:
29819 case BUILT_IN_LLROUNDF:
29820 /* The round insn does not trap on denormals. */
29821 if (flag_trapping_math || !TARGET_ROUND)
29822 break;
29823
29824 if (out_mode == SImode && in_mode == SFmode)
29825 {
29826 if (out_n == 4 && in_n == 4)
29827 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
29828 else if (out_n == 8 && in_n == 8)
29829 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
29830 }
29831 break;
29832
29833 case BUILT_IN_COPYSIGN:
29834 if (out_mode == DFmode && in_mode == DFmode)
29835 {
29836 if (out_n == 2 && in_n == 2)
29837 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
29838 else if (out_n == 4 && in_n == 4)
29839 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
29840 }
29841 break;
29842
29843 case BUILT_IN_COPYSIGNF:
29844 if (out_mode == SFmode && in_mode == SFmode)
29845 {
29846 if (out_n == 4 && in_n == 4)
29847 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
29848 else if (out_n == 8 && in_n == 8)
29849 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
29850 }
29851 break;
29852
29853 case BUILT_IN_FLOOR:
29854 /* The round insn does not trap on denormals. */
29855 if (flag_trapping_math || !TARGET_ROUND)
29856 break;
29857
29858 if (out_mode == DFmode && in_mode == DFmode)
29859 {
29860 if (out_n == 2 && in_n == 2)
29861 return ix86_builtins[IX86_BUILTIN_FLOORPD];
29862 else if (out_n == 4 && in_n == 4)
29863 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
29864 }
29865 break;
29866
29867 case BUILT_IN_FLOORF:
29868 /* The round insn does not trap on denormals. */
29869 if (flag_trapping_math || !TARGET_ROUND)
29870 break;
29871
29872 if (out_mode == SFmode && in_mode == SFmode)
29873 {
29874 if (out_n == 4 && in_n == 4)
29875 return ix86_builtins[IX86_BUILTIN_FLOORPS];
29876 else if (out_n == 8 && in_n == 8)
29877 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
29878 }
29879 break;
29880
29881 case BUILT_IN_CEIL:
29882 /* The round insn does not trap on denormals. */
29883 if (flag_trapping_math || !TARGET_ROUND)
29884 break;
29885
29886 if (out_mode == DFmode && in_mode == DFmode)
29887 {
29888 if (out_n == 2 && in_n == 2)
29889 return ix86_builtins[IX86_BUILTIN_CEILPD];
29890 else if (out_n == 4 && in_n == 4)
29891 return ix86_builtins[IX86_BUILTIN_CEILPD256];
29892 }
29893 break;
29894
29895 case BUILT_IN_CEILF:
29896 /* The round insn does not trap on denormals. */
29897 if (flag_trapping_math || !TARGET_ROUND)
29898 break;
29899
29900 if (out_mode == SFmode && in_mode == SFmode)
29901 {
29902 if (out_n == 4 && in_n == 4)
29903 return ix86_builtins[IX86_BUILTIN_CEILPS];
29904 else if (out_n == 8 && in_n == 8)
29905 return ix86_builtins[IX86_BUILTIN_CEILPS256];
29906 }
29907 break;
29908
29909 case BUILT_IN_TRUNC:
29910 /* The round insn does not trap on denormals. */
29911 if (flag_trapping_math || !TARGET_ROUND)
29912 break;
29913
29914 if (out_mode == DFmode && in_mode == DFmode)
29915 {
29916 if (out_n == 2 && in_n == 2)
29917 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
29918 else if (out_n == 4 && in_n == 4)
29919 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
29920 }
29921 break;
29922
29923 case BUILT_IN_TRUNCF:
29924 /* The round insn does not trap on denormals. */
29925 if (flag_trapping_math || !TARGET_ROUND)
29926 break;
29927
29928 if (out_mode == SFmode && in_mode == SFmode)
29929 {
29930 if (out_n == 4 && in_n == 4)
29931 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
29932 else if (out_n == 8 && in_n == 8)
29933 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
29934 }
29935 break;
29936
29937 case BUILT_IN_RINT:
29938 /* The round insn does not trap on denormals. */
29939 if (flag_trapping_math || !TARGET_ROUND)
29940 break;
29941
29942 if (out_mode == DFmode && in_mode == DFmode)
29943 {
29944 if (out_n == 2 && in_n == 2)
29945 return ix86_builtins[IX86_BUILTIN_RINTPD];
29946 else if (out_n == 4 && in_n == 4)
29947 return ix86_builtins[IX86_BUILTIN_RINTPD256];
29948 }
29949 break;
29950
29951 case BUILT_IN_RINTF:
29952 /* The round insn does not trap on denormals. */
29953 if (flag_trapping_math || !TARGET_ROUND)
29954 break;
29955
29956 if (out_mode == SFmode && in_mode == SFmode)
29957 {
29958 if (out_n == 4 && in_n == 4)
29959 return ix86_builtins[IX86_BUILTIN_RINTPS];
29960 else if (out_n == 8 && in_n == 8)
29961 return ix86_builtins[IX86_BUILTIN_RINTPS256];
29962 }
29963 break;
29964
29965 case BUILT_IN_ROUND:
29966 /* The round insn does not trap on denormals. */
29967 if (flag_trapping_math || !TARGET_ROUND)
29968 break;
29969
29970 if (out_mode == DFmode && in_mode == DFmode)
29971 {
29972 if (out_n == 2 && in_n == 2)
29973 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
29974 else if (out_n == 4 && in_n == 4)
29975 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
29976 }
29977 break;
29978
29979 case BUILT_IN_ROUNDF:
29980 /* The round insn does not trap on denormals. */
29981 if (flag_trapping_math || !TARGET_ROUND)
29982 break;
29983
29984 if (out_mode == SFmode && in_mode == SFmode)
29985 {
29986 if (out_n == 4 && in_n == 4)
29987 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
29988 else if (out_n == 8 && in_n == 8)
29989 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
29990 }
29991 break;
29992
29993 case BUILT_IN_FMA:
29994 if (out_mode == DFmode && in_mode == DFmode)
29995 {
29996 if (out_n == 2 && in_n == 2)
29997 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
29998 if (out_n == 4 && in_n == 4)
29999 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
30000 }
30001 break;
30002
30003 case BUILT_IN_FMAF:
30004 if (out_mode == SFmode && in_mode == SFmode)
30005 {
30006 if (out_n == 4 && in_n == 4)
30007 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
30008 if (out_n == 8 && in_n == 8)
30009 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
30010 }
30011 break;
30012
30013 default:
30014 break;
30015 }
30016
30017 /* Dispatch to a handler for a vectorization library. */
30018 if (ix86_veclib_handler)
30019 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
30020 type_in);
30021
30022 return NULL_TREE;
30023 }
30024
30025 /* Handler for an SVML-style interface to
30026 a library with vectorized intrinsics. */
30027
30028 static tree
30029 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
30030 {
30031 char name[20];
30032 tree fntype, new_fndecl, args;
30033 unsigned arity;
30034 const char *bname;
30035 enum machine_mode el_mode, in_mode;
30036 int n, in_n;
30037
30038 /* The SVML is suitable for unsafe math only. */
30039 if (!flag_unsafe_math_optimizations)
30040 return NULL_TREE;
30041
30042 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30043 n = TYPE_VECTOR_SUBPARTS (type_out);
30044 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30045 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30046 if (el_mode != in_mode
30047 || n != in_n)
30048 return NULL_TREE;
30049
30050 switch (fn)
30051 {
30052 case BUILT_IN_EXP:
30053 case BUILT_IN_LOG:
30054 case BUILT_IN_LOG10:
30055 case BUILT_IN_POW:
30056 case BUILT_IN_TANH:
30057 case BUILT_IN_TAN:
30058 case BUILT_IN_ATAN:
30059 case BUILT_IN_ATAN2:
30060 case BUILT_IN_ATANH:
30061 case BUILT_IN_CBRT:
30062 case BUILT_IN_SINH:
30063 case BUILT_IN_SIN:
30064 case BUILT_IN_ASINH:
30065 case BUILT_IN_ASIN:
30066 case BUILT_IN_COSH:
30067 case BUILT_IN_COS:
30068 case BUILT_IN_ACOSH:
30069 case BUILT_IN_ACOS:
30070 if (el_mode != DFmode || n != 2)
30071 return NULL_TREE;
30072 break;
30073
30074 case BUILT_IN_EXPF:
30075 case BUILT_IN_LOGF:
30076 case BUILT_IN_LOG10F:
30077 case BUILT_IN_POWF:
30078 case BUILT_IN_TANHF:
30079 case BUILT_IN_TANF:
30080 case BUILT_IN_ATANF:
30081 case BUILT_IN_ATAN2F:
30082 case BUILT_IN_ATANHF:
30083 case BUILT_IN_CBRTF:
30084 case BUILT_IN_SINHF:
30085 case BUILT_IN_SINF:
30086 case BUILT_IN_ASINHF:
30087 case BUILT_IN_ASINF:
30088 case BUILT_IN_COSHF:
30089 case BUILT_IN_COSF:
30090 case BUILT_IN_ACOSHF:
30091 case BUILT_IN_ACOSF:
30092 if (el_mode != SFmode || n != 4)
30093 return NULL_TREE;
30094 break;
30095
30096 default:
30097 return NULL_TREE;
30098 }
30099
30100 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30101
30102 if (fn == BUILT_IN_LOGF)
30103 strcpy (name, "vmlsLn4");
30104 else if (fn == BUILT_IN_LOG)
30105 strcpy (name, "vmldLn2");
30106 else if (n == 4)
30107 {
30108 sprintf (name, "vmls%s", bname+10);
30109 name[strlen (name)-1] = '4';
30110 }
30111 else
30112 sprintf (name, "vmld%s2", bname+10);
30113
30114 /* Uppercase the first letter of the math function name. */
30115 name[4] &= ~0x20;
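  /* The code above produces e.g. "vmlsSin4" for BUILT_IN_SINF and
     "vmldSin2" for BUILT_IN_SIN; BUILT_IN_LOGF and BUILT_IN_LOG are
     special-cased as "vmlsLn4" and "vmldLn2".  */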
30116
30117 arity = 0;
30118 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30119 args;
30120 args = TREE_CHAIN (args))
30121 arity++;
30122
30123 if (arity == 1)
30124 fntype = build_function_type_list (type_out, type_in, NULL);
30125 else
30126 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30127
30128 /* Build a function declaration for the vectorized function. */
30129 new_fndecl = build_decl (BUILTINS_LOCATION,
30130 FUNCTION_DECL, get_identifier (name), fntype);
30131 TREE_PUBLIC (new_fndecl) = 1;
30132 DECL_EXTERNAL (new_fndecl) = 1;
30133 DECL_IS_NOVOPS (new_fndecl) = 1;
30134 TREE_READONLY (new_fndecl) = 1;
30135
30136 return new_fndecl;
30137 }
30138
30139 /* Handler for an ACML-style interface to
30140 a library with vectorized intrinsics. */
30141
30142 static tree
30143 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
30144 {
30145 char name[20] = "__vr.._";
30146 tree fntype, new_fndecl, args;
30147 unsigned arity;
30148 const char *bname;
30149 enum machine_mode el_mode, in_mode;
30150 int n, in_n;
30151
30152 /* The ACML is 64-bit only and suitable for unsafe math only, as
30153 it does not correctly support parts of IEEE (such as denormals)
30154 with the required precision. */
30155 if (!TARGET_64BIT
30156 || !flag_unsafe_math_optimizations)
30157 return NULL_TREE;
30158
30159 el_mode = TYPE_MODE (TREE_TYPE (type_out));
30160 n = TYPE_VECTOR_SUBPARTS (type_out);
30161 in_mode = TYPE_MODE (TREE_TYPE (type_in));
30162 in_n = TYPE_VECTOR_SUBPARTS (type_in);
30163 if (el_mode != in_mode
30164 || n != in_n)
30165 return NULL_TREE;
30166
30167 switch (fn)
30168 {
30169 case BUILT_IN_SIN:
30170 case BUILT_IN_COS:
30171 case BUILT_IN_EXP:
30172 case BUILT_IN_LOG:
30173 case BUILT_IN_LOG2:
30174 case BUILT_IN_LOG10:
30175 name[4] = 'd';
30176 name[5] = '2';
30177 if (el_mode != DFmode
30178 || n != 2)
30179 return NULL_TREE;
30180 break;
30181
30182 case BUILT_IN_SINF:
30183 case BUILT_IN_COSF:
30184 case BUILT_IN_EXPF:
30185 case BUILT_IN_POWF:
30186 case BUILT_IN_LOGF:
30187 case BUILT_IN_LOG2F:
30188 case BUILT_IN_LOG10F:
30189 name[4] = 's';
30190 name[5] = '4';
30191 if (el_mode != SFmode
30192 || n != 4)
30193 return NULL_TREE;
30194 break;
30195
30196 default:
30197 return NULL_TREE;
30198 }
30199
30200 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
30201 sprintf (name + 7, "%s", bname+10);
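  /* This yields ACML-style names, e.g. "__vrd2_sin" for BUILT_IN_SIN and
     "__vrs4_expf" for BUILT_IN_EXPF.  */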
30202
30203 arity = 0;
30204 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
30205 args;
30206 args = TREE_CHAIN (args))
30207 arity++;
30208
30209 if (arity == 1)
30210 fntype = build_function_type_list (type_out, type_in, NULL);
30211 else
30212 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
30213
30214 /* Build a function declaration for the vectorized function. */
30215 new_fndecl = build_decl (BUILTINS_LOCATION,
30216 FUNCTION_DECL, get_identifier (name), fntype);
30217 TREE_PUBLIC (new_fndecl) = 1;
30218 DECL_EXTERNAL (new_fndecl) = 1;
30219 DECL_IS_NOVOPS (new_fndecl) = 1;
30220 TREE_READONLY (new_fndecl) = 1;
30221
30222 return new_fndecl;
30223 }
30224
30225 /* Returns a decl of a function that implements a gather load with
30226 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE,
30227 or NULL_TREE if it is not available. */
30228
30229 static tree
30230 ix86_vectorize_builtin_gather (const_tree mem_vectype,
30231 const_tree index_type, int scale)
30232 {
30233 bool si;
30234 enum ix86_builtins code;
30235
30236 if (! TARGET_AVX2)
30237 return NULL_TREE;
30238
30239 if ((TREE_CODE (index_type) != INTEGER_TYPE
30240 && !POINTER_TYPE_P (index_type))
30241 || (TYPE_MODE (index_type) != SImode
30242 && TYPE_MODE (index_type) != DImode))
30243 return NULL_TREE;
30244
30245 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
30246 return NULL_TREE;
30247
30248 /* v*gather* insn sign extends index to pointer mode. */
30249 if (TYPE_PRECISION (index_type) < POINTER_SIZE
30250 && TYPE_UNSIGNED (index_type))
30251 return NULL_TREE;
30252
30253 if (scale <= 0
30254 || scale > 8
30255 || (scale & (scale - 1)) != 0)
30256 return NULL_TREE;
30257
30258 si = TYPE_MODE (index_type) == SImode;
30259 switch (TYPE_MODE (mem_vectype))
30260 {
30261 case V2DFmode:
30262 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
30263 break;
30264 case V4DFmode:
30265 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
30266 break;
30267 case V2DImode:
30268 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
30269 break;
30270 case V4DImode:
30271 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
30272 break;
30273 case V4SFmode:
30274 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
30275 break;
30276 case V8SFmode:
30277 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
30278 break;
30279 case V4SImode:
30280 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
30281 break;
30282 case V8SImode:
30283 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
30284 break;
30285 default:
30286 return NULL_TREE;
30287 }
30288
30289 return ix86_builtins[code];
30290 }
30291
30292 /* Returns a decl of a target-specific builtin that implements the
30293 reciprocal of the function, or NULL_TREE if not available. */
30294
30295 static tree
30296 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
30297 bool sqrt ATTRIBUTE_UNUSED)
30298 {
30299 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
30300 && flag_finite_math_only && !flag_trapping_math
30301 && flag_unsafe_math_optimizations))
30302 return NULL_TREE;
30303
30304 if (md_fn)
30305 /* Machine dependent builtins. */
30306 switch (fn)
30307 {
30308 /* Vectorized version of sqrt to rsqrt conversion. */
30309 case IX86_BUILTIN_SQRTPS_NR:
30310 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
30311
30312 case IX86_BUILTIN_SQRTPS_NR256:
30313 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
30314
30315 default:
30316 return NULL_TREE;
30317 }
30318 else
30319 /* Normal builtins. */
30320 switch (fn)
30321 {
30322 /* Sqrt to rsqrt conversion. */
30323 case BUILT_IN_SQRTF:
30324 return ix86_builtins[IX86_BUILTIN_RSQRTF];
30325
30326 default:
30327 return NULL_TREE;
30328 }
30329 }
30330 \f
30331 /* Helper for avx_vpermilps256_operand et al. This is also used by
30332 the expansion functions to turn the parallel back into a mask.
30333 The return value is 0 for no match and the imm8+1 for a match. */
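/* For instance, in V4DFmode the identity parallel (0 1 2 3) encodes as
   imm8 0b1010 (bits 0-1 select within the low lane, bits 2-3 within the
   high lane), so the function returns 0b1010 + 1.  */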
30334
30335 int
30336 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
30337 {
30338 unsigned i, nelt = GET_MODE_NUNITS (mode);
30339 unsigned mask = 0;
30340 unsigned char ipar[8];
30341
30342 if (XVECLEN (par, 0) != (int) nelt)
30343 return 0;
30344
30345 /* Validate that all of the elements are constants, and not totally
30346 out of range. Copy the data into an integral array to make the
30347 subsequent checks easier. */
30348 for (i = 0; i < nelt; ++i)
30349 {
30350 rtx er = XVECEXP (par, 0, i);
30351 unsigned HOST_WIDE_INT ei;
30352
30353 if (!CONST_INT_P (er))
30354 return 0;
30355 ei = INTVAL (er);
30356 if (ei >= nelt)
30357 return 0;
30358 ipar[i] = ei;
30359 }
30360
30361 switch (mode)
30362 {
30363 case V4DFmode:
30364 /* In the 256-bit DFmode case, we can only move elements within
30365 a 128-bit lane. */
30366 for (i = 0; i < 2; ++i)
30367 {
30368 if (ipar[i] >= 2)
30369 return 0;
30370 mask |= ipar[i] << i;
30371 }
30372 for (i = 2; i < 4; ++i)
30373 {
30374 if (ipar[i] < 2)
30375 return 0;
30376 mask |= (ipar[i] - 2) << i;
30377 }
30378 break;
30379
30380 case V8SFmode:
30381 /* In the 256-bit SFmode case, we have full freedom of movement
30382 within the low 128-bit lane, but the high 128-bit lane must
30383 mirror the exact same pattern. */
30384 for (i = 0; i < 4; ++i)
30385 if (ipar[i] + 4 != ipar[i + 4])
30386 return 0;
30387 nelt = 4;
30388 /* FALLTHRU */
30389
30390 case V2DFmode:
30391 case V4SFmode:
30392 /* In the 128-bit case, we have full freedom in the placement of
30393 the elements from the source operand. */
30394 for (i = 0; i < nelt; ++i)
30395 mask |= ipar[i] << (i * (nelt / 2));
30396 break;
30397
30398 default:
30399 gcc_unreachable ();
30400 }
30401
30402 /* Make sure success has a non-zero value by adding one. */
30403 return mask + 1;
30404 }
30405
30406 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
30407 the expansion functions to turn the parallel back into a mask.
30408 The return value is 0 for no match and the imm8+1 for a match. */
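/* For instance, in V8SFmode the parallel (0 1 2 3 8 9 10 11), which takes
   the low 128-bit lane of each operand, encodes as imm8 0x20 and the
   function returns 0x21.  */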
30409
30410 int
30411 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
30412 {
30413 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
30414 unsigned mask = 0;
30415 unsigned char ipar[8];
30416
30417 if (XVECLEN (par, 0) != (int) nelt)
30418 return 0;
30419
30420 /* Validate that all of the elements are constants, and not totally
30421 out of range. Copy the data into an integral array to make the
30422 subsequent checks easier. */
30423 for (i = 0; i < nelt; ++i)
30424 {
30425 rtx er = XVECEXP (par, 0, i);
30426 unsigned HOST_WIDE_INT ei;
30427
30428 if (!CONST_INT_P (er))
30429 return 0;
30430 ei = INTVAL (er);
30431 if (ei >= 2 * nelt)
30432 return 0;
30433 ipar[i] = ei;
30434 }
30435
30436 /* Validate that each half of the permute selects consecutive elements. */
30437 for (i = 0; i < nelt2 - 1; ++i)
30438 if (ipar[i] + 1 != ipar[i + 1])
30439 return 0;
30440 for (i = nelt2; i < nelt - 1; ++i)
30441 if (ipar[i] + 1 != ipar[i + 1])
30442 return 0;
30443
30444 /* Reconstruct the mask. */
30445 for (i = 0; i < 2; ++i)
30446 {
30447 unsigned e = ipar[i * nelt2];
30448 if (e % nelt2)
30449 return 0;
30450 e /= nelt2;
30451 mask |= e << (i * 4);
30452 }
30453
30454 /* Make sure success has a non-zero value by adding one. */
30455 return mask + 1;
30456 }
30457 \f
30458 /* Store OPERAND to the memory after reload is completed. This means
30459 that we can't easily use assign_stack_local. */
30460 rtx
30461 ix86_force_to_memory (enum machine_mode mode, rtx operand)
30462 {
30463 rtx result;
30464
30465 gcc_assert (reload_completed);
30466 if (ix86_using_red_zone ())
30467 {
30468 result = gen_rtx_MEM (mode,
30469 gen_rtx_PLUS (Pmode,
30470 stack_pointer_rtx,
30471 GEN_INT (-RED_ZONE_SIZE)));
30472 emit_move_insn (result, operand);
30473 }
30474 else if (TARGET_64BIT)
30475 {
30476 switch (mode)
30477 {
30478 case HImode:
30479 case SImode:
30480 operand = gen_lowpart (DImode, operand);
30481 /* FALLTHRU */
30482 case DImode:
30483 emit_insn (
30484 gen_rtx_SET (VOIDmode,
30485 gen_rtx_MEM (DImode,
30486 gen_rtx_PRE_DEC (DImode,
30487 stack_pointer_rtx)),
30488 operand));
30489 break;
30490 default:
30491 gcc_unreachable ();
30492 }
30493 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30494 }
30495 else
30496 {
30497 switch (mode)
30498 {
30499 case DImode:
30500 {
30501 rtx operands[2];
30502 split_double_mode (mode, &operand, 1, operands, operands + 1);
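	    /* operands[1] is the most significant half; push it first so that
	       the least significant half ends up at the lower address.  */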
30503 emit_insn (
30504 gen_rtx_SET (VOIDmode,
30505 gen_rtx_MEM (SImode,
30506 gen_rtx_PRE_DEC (Pmode,
30507 stack_pointer_rtx)),
30508 operands[1]));
30509 emit_insn (
30510 gen_rtx_SET (VOIDmode,
30511 gen_rtx_MEM (SImode,
30512 gen_rtx_PRE_DEC (Pmode,
30513 stack_pointer_rtx)),
30514 operands[0]));
30515 }
30516 break;
30517 case HImode:
30518 /* Store HImodes as SImodes. */
30519 operand = gen_lowpart (SImode, operand);
30520 /* FALLTHRU */
30521 case SImode:
30522 emit_insn (
30523 gen_rtx_SET (VOIDmode,
30524 gen_rtx_MEM (GET_MODE (operand),
30525 gen_rtx_PRE_DEC (SImode,
30526 stack_pointer_rtx)),
30527 operand));
30528 break;
30529 default:
30530 gcc_unreachable ();
30531 }
30532 result = gen_rtx_MEM (mode, stack_pointer_rtx);
30533 }
30534 return result;
30535 }
30536
30537 /* Free operand from the memory. */
30538 void
30539 ix86_free_from_memory (enum machine_mode mode)
30540 {
30541 if (!ix86_using_red_zone ())
30542 {
30543 int size;
30544
30545 if (mode == DImode || TARGET_64BIT)
30546 size = 8;
30547 else
30548 size = 4;
30549 /* Use LEA to deallocate stack space. In peephole2 it will be converted
30550 to a pop or add instruction if registers are available. */
30551 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
30552 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
30553 GEN_INT (size))));
30554 }
30555 }
30556
30557 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
30558
30559 Put float CONST_DOUBLE in the constant pool instead of fp regs.
30560 QImode must go into class Q_REGS.
30561 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
30562 movdf to do mem-to-mem moves through integer regs. */
30563
30564 static reg_class_t
30565 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
30566 {
30567 enum machine_mode mode = GET_MODE (x);
30568
30569 /* We're only allowed to return a subclass of CLASS. Many of the
30570 following checks fail for NO_REGS, so eliminate that early. */
30571 if (regclass == NO_REGS)
30572 return NO_REGS;
30573
30574 /* All classes can load zeros. */
30575 if (x == CONST0_RTX (mode))
30576 return regclass;
30577
30578 /* Force constants into memory if we are loading a (nonzero) constant into
30579 an MMX or SSE register. This is because there are no MMX/SSE instructions
30580 to load from a constant. */
30581 if (CONSTANT_P (x)
30582 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
30583 return NO_REGS;
30584
30585 /* Prefer SSE regs only, if we can use them for math. */
30586 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
30587 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
30588
30589 /* Floating-point constants need more complex checks. */
30590 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
30591 {
30592 /* General regs can load everything. */
30593 if (reg_class_subset_p (regclass, GENERAL_REGS))
30594 return regclass;
30595
30596 /* Floats can load 0 and 1 plus some others. Note that we eliminated
30597 zero above. We only want to wind up preferring 80387 registers if
30598 we plan on doing computation with them. */
30599 if (TARGET_80387
30600 && standard_80387_constant_p (x) > 0)
30601 {
30602 /* Limit class to non-sse. */
30603 if (regclass == FLOAT_SSE_REGS)
30604 return FLOAT_REGS;
30605 if (regclass == FP_TOP_SSE_REGS)
30606 return FP_TOP_REG;
30607 if (regclass == FP_SECOND_SSE_REGS)
30608 return FP_SECOND_REG;
30609 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
30610 return regclass;
30611 }
30612
30613 return NO_REGS;
30614 }
30615
30616 /* Generally when we see PLUS here, it's the function invariant
30617 (plus soft-fp const_int), which can only be computed into general
30618 regs. */
30619 if (GET_CODE (x) == PLUS)
30620 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
30621
30622 /* QImode constants are easy to load, but non-constant QImode data
30623 must go into Q_REGS. */
30624 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
30625 {
30626 if (reg_class_subset_p (regclass, Q_REGS))
30627 return regclass;
30628 if (reg_class_subset_p (Q_REGS, regclass))
30629 return Q_REGS;
30630 return NO_REGS;
30631 }
30632
30633 return regclass;
30634 }
30635
30636 /* Discourage putting floating-point values in SSE registers unless
30637 SSE math is being used, and likewise for the 387 registers. */
30638 static reg_class_t
30639 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
30640 {
30641 enum machine_mode mode = GET_MODE (x);
30642
30643 /* Restrict the output reload class to the register bank that we are doing
30644 math on. If we would like not to return a subset of CLASS, reject this
30645 alternative: if reload cannot do this, it will still use its choice. */
30647 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
30648 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
30649
30650 if (X87_FLOAT_MODE_P (mode))
30651 {
30652 if (regclass == FP_TOP_SSE_REGS)
30653 return FP_TOP_REG;
30654 else if (regclass == FP_SECOND_SSE_REGS)
30655 return FP_SECOND_REG;
30656 else
30657 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
30658 }
30659
30660 return regclass;
30661 }
30662
30663 static reg_class_t
30664 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
30665 enum machine_mode mode, secondary_reload_info *sri)
30666 {
30667 /* Double-word spills from general registers to non-offsettable memory
30668 references (zero-extended addresses) require special handling. */
30669 if (TARGET_64BIT
30670 && MEM_P (x)
30671 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
30672 && rclass == GENERAL_REGS
30673 && !offsettable_memref_p (x))
30674 {
30675 sri->icode = (in_p
30676 ? CODE_FOR_reload_noff_load
30677 : CODE_FOR_reload_noff_store);
30678 /* Add the cost of moving address to a temporary. */
30679 sri->extra_cost = 1;
30680
30681 return NO_REGS;
30682 }
30683
30684 /* QImode spills from non-QI registers require
30685 intermediate register on 32bit targets. */
30686 if (!TARGET_64BIT
30687 && !in_p && mode == QImode
30688 && (rclass == GENERAL_REGS
30689 || rclass == LEGACY_REGS
30690 || rclass == INDEX_REGS))
30691 {
30692 int regno;
30693
30694 if (REG_P (x))
30695 regno = REGNO (x);
30696 else
30697 regno = -1;
30698
30699 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
30700 regno = true_regnum (x);
30701
30702 /* Return Q_REGS if the operand is in memory. */
30703 if (regno == -1)
30704 return Q_REGS;
30705 }
30706
30707 /* This condition handles corner case where an expression involving
30708 pointers gets vectorized. We're trying to use the address of a
30709 stack slot as a vector initializer.
30710
30711 (set (reg:V2DI 74 [ vect_cst_.2 ])
30712 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
30713
30714 Eventually frame gets turned into sp+offset like this:
30715
30716 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30717 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30718 (const_int 392 [0x188]))))
30719
30720 That later gets turned into:
30721
30722 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30723 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
30724 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
30725
30726 We'll have the following reload recorded:
30727
30728 Reload 0: reload_in (DI) =
30729 (plus:DI (reg/f:DI 7 sp)
30730 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
30731 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30732 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
30733 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
30734 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
30735 reload_reg_rtx: (reg:V2DI 22 xmm1)
30736
30737 Which isn't going to work since SSE instructions can't handle scalar
30738 additions. Returning GENERAL_REGS forces the addition into integer
30739 register and reload can handle subsequent reloads without problems. */
30740
30741 if (in_p && GET_CODE (x) == PLUS
30742 && SSE_CLASS_P (rclass)
30743 && SCALAR_INT_MODE_P (mode))
30744 return GENERAL_REGS;
30745
30746 return NO_REGS;
30747 }
30748
30749 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
30750
30751 static bool
30752 ix86_class_likely_spilled_p (reg_class_t rclass)
30753 {
30754 switch (rclass)
30755 {
30756 case AREG:
30757 case DREG:
30758 case CREG:
30759 case BREG:
30760 case AD_REGS:
30761 case SIREG:
30762 case DIREG:
30763 case SSE_FIRST_REG:
30764 case FP_TOP_REG:
30765 case FP_SECOND_REG:
30766 return true;
30767
30768 default:
30769 break;
30770 }
30771
30772 return false;
30773 }
30774
30775 /* If we are copying between general and FP registers, we need a memory
30776 location. The same is true for SSE and MMX registers.
30777
30778 To optimize register_move_cost performance, allow inline variant.
30779
30780 The macro can't work reliably when one of the CLASSES is a class containing
30781 registers from multiple units (SSE, MMX, integer). We avoid this by never
30782 combining those units in a single alternative in the machine description.
30783 Ensure that this constraint holds to avoid unexpected surprises.
30784
30785 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
30786 enforce these sanity checks. */
30787
30788 static inline bool
30789 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30790 enum machine_mode mode, int strict)
30791 {
30792 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
30793 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
30794 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
30795 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
30796 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
30797 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
30798 {
30799 gcc_assert (!strict);
30800 return true;
30801 }
30802
30803 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
30804 return true;
30805
30806 /* ??? This is a lie. We do have moves between mmx/general, and for
30807 mmx/sse2. But by saying we need secondary memory we discourage the
30808 register allocator from using the mmx registers unless needed. */
30809 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
30810 return true;
30811
30812 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
30813 {
30814 /* SSE1 doesn't have any direct moves from other classes. */
30815 if (!TARGET_SSE2)
30816 return true;
30817
30818 /* If the target says that inter-unit moves are more expensive
30819 than moving through memory, then don't generate them. */
30820 if (!TARGET_INTER_UNIT_MOVES)
30821 return true;
30822
30823 /* Between SSE and general, we have moves no larger than word size. */
30824 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
30825 return true;
30826 }
30827
30828 return false;
30829 }
30830
30831 bool
30832 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
30833 enum machine_mode mode, int strict)
30834 {
30835 return inline_secondary_memory_needed (class1, class2, mode, strict);
30836 }
30837
30838 /* Implement the TARGET_CLASS_MAX_NREGS hook.
30839
30840 On the 80386, this is the size of MODE in words,
30841 except in the FP regs, where a single reg is always enough. */
30842
30843 static unsigned char
30844 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
30845 {
30846 if (MAYBE_INTEGER_CLASS_P (rclass))
30847 {
30848 if (mode == XFmode)
30849 return (TARGET_64BIT ? 2 : 3);
30850 else if (mode == XCmode)
30851 return (TARGET_64BIT ? 4 : 6);
30852 else
30853 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
30854 }
30855 else
30856 {
30857 if (COMPLEX_MODE_P (mode))
30858 return 2;
30859 else
30860 return 1;
30861 }
30862 }
30863
30864 /* Return true if the registers in CLASS cannot represent the change from
30865 modes FROM to TO. */
30866
30867 bool
30868 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
30869 enum reg_class regclass)
30870 {
30871 if (from == to)
30872 return false;
30873
30874 /* x87 registers can't do subreg at all, as all values are reformatted
30875 to extended precision. */
30876 if (MAYBE_FLOAT_CLASS_P (regclass))
30877 return true;
30878
30879 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
30880 {
30881 /* Vector registers do not support QI or HImode loads. If we don't
30882 disallow a change to these modes, reload will assume it's ok to
30883 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
30884 the vec_dupv4hi pattern. */
30885 if (GET_MODE_SIZE (from) < 4)
30886 return true;
30887
30888 /* Vector registers do not support subreg with nonzero offsets, which
30889 are otherwise valid for integer registers. Since we can't see
30890 whether we have a nonzero offset from here, prohibit all
30891 nonparadoxical subregs changing size. */
30892 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
30893 return true;
30894 }
30895
30896 return false;
30897 }
30898
30899 /* Return the cost of moving data of mode M between a
30900 register and memory. A value of 2 is the default; this cost is
30901 relative to those in `REGISTER_MOVE_COST'.
30902
30903 This function is used extensively by register_move_cost that is used to
30904 build tables at startup. Make it inline in this case.
30905 When IN is 2, return maximum of in and out move cost.
30906
30907 If moving between registers and memory is more expensive than
30908 between two registers, you should define this macro to express the
30909 relative cost.
30910
30911 Also model the increased cost of moving QImode registers in
30912 non-Q_REGS classes.
30913 */
30914 static inline int
30915 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
30916 int in)
30917 {
30918 int cost;
30919 if (FLOAT_CLASS_P (regclass))
30920 {
30921 int index;
30922 switch (mode)
30923 {
30924 case SFmode:
30925 index = 0;
30926 break;
30927 case DFmode:
30928 index = 1;
30929 break;
30930 case XFmode:
30931 index = 2;
30932 break;
30933 default:
30934 return 100;
30935 }
30936 if (in == 2)
30937 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
30938 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
30939 }
30940 if (SSE_CLASS_P (regclass))
30941 {
30942 int index;
30943 switch (GET_MODE_SIZE (mode))
30944 {
30945 case 4:
30946 index = 0;
30947 break;
30948 case 8:
30949 index = 1;
30950 break;
30951 case 16:
30952 index = 2;
30953 break;
30954 default:
30955 return 100;
30956 }
30957 if (in == 2)
30958 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
30959 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
30960 }
30961 if (MMX_CLASS_P (regclass))
30962 {
30963 int index;
30964 switch (GET_MODE_SIZE (mode))
30965 {
30966 case 4:
30967 index = 0;
30968 break;
30969 case 8:
30970 index = 1;
30971 break;
30972 default:
30973 return 100;
30974 }
30975 if (in == 2)
30976 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
30977 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
30978 }
30979 switch (GET_MODE_SIZE (mode))
30980 {
30981 case 1:
30982 if (Q_CLASS_P (regclass) || TARGET_64BIT)
30983 {
30984 if (!in)
30985 return ix86_cost->int_store[0];
30986 if (TARGET_PARTIAL_REG_DEPENDENCY
30987 && optimize_function_for_speed_p (cfun))
30988 cost = ix86_cost->movzbl_load;
30989 else
30990 cost = ix86_cost->int_load[0];
30991 if (in == 2)
30992 return MAX (cost, ix86_cost->int_store[0]);
30993 return cost;
30994 }
30995 else
30996 {
30997 if (in == 2)
30998 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
30999 if (in)
31000 return ix86_cost->movzbl_load;
31001 else
31002 return ix86_cost->int_store[0] + 4;
31003 }
31004 break;
31005 case 2:
31006 if (in == 2)
31007 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
31008 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
31009 default:
31010 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
31011 if (mode == TFmode)
31012 mode = XFmode;
31013 if (in == 2)
31014 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
31015 else if (in)
31016 cost = ix86_cost->int_load[2];
31017 else
31018 cost = ix86_cost->int_store[2];
31019 return (cost * (((int) GET_MODE_SIZE (mode)
31020 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
31021 }
31022 }
31023
31024 static int
31025 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
31026 bool in)
31027 {
31028 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
31029 }
31030
31031
31032 /* Return the cost of moving data from a register in class CLASS1 to
31033 one in class CLASS2.
31034
31035 It is not required that the cost always equal 2 when FROM is the same as TO;
31036 on some machines it is expensive to move between registers if they are not
31037 general registers. */
31038
31039 static int
31040 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
31041 reg_class_t class2_i)
31042 {
31043 enum reg_class class1 = (enum reg_class) class1_i;
31044 enum reg_class class2 = (enum reg_class) class2_i;
31045
31046 /* In case we require secondary memory, compute the cost of the store
31047 followed by the load. To avoid bad register allocation choices, we need
31048 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
31049
31050 if (inline_secondary_memory_needed (class1, class2, mode, 0))
31051 {
31052 int cost = 1;
31053
31054 cost += inline_memory_move_cost (mode, class1, 2);
31055 cost += inline_memory_move_cost (mode, class2, 2);
31056
31057 /* When copying from a general purpose register we may emit multiple
31058 stores followed by a single load, causing a memory size mismatch stall.
31059 Count this as an arbitrarily high cost of 20. */
31060 if (targetm.class_max_nregs (class1, mode)
31061 > targetm.class_max_nregs (class2, mode))
31062 cost += 20;
31063
31064 /* In the case of FP/MMX moves, the registers actually overlap, and we
31065 have to switch modes in order to treat them differently. */
31066 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
31067 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
31068 cost += 20;
31069
31070 return cost;
31071 }
31072
31073 /* Moves between SSE/MMX and integer unit are expensive. */
31074 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
31075 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
31076
31077 /* ??? By keeping the returned value relatively high, we limit the number
31078 of moves between integer and MMX/SSE registers for all targets.
31079 Additionally, a high value prevents a problem with x86_modes_tieable_p(),
31080 where integer modes in MMX/SSE registers are not tieable
31081 because of missing QImode and HImode moves to, from or between
31082 MMX/SSE registers. */
31083 return MAX (8, ix86_cost->mmxsse_to_integer);
31084
31085 if (MAYBE_FLOAT_CLASS_P (class1))
31086 return ix86_cost->fp_move;
31087 if (MAYBE_SSE_CLASS_P (class1))
31088 return ix86_cost->sse_move;
31089 if (MAYBE_MMX_CLASS_P (class1))
31090 return ix86_cost->mmx_move;
31091 return 2;
31092 }
31093
31094 /* Return TRUE if hard register REGNO can hold a value of machine-mode
31095 MODE. */
31096
31097 bool
31098 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
31099 {
31100 /* Flags and only flags can hold CCmode values. */
31101 if (CC_REGNO_P (regno))
31102 return GET_MODE_CLASS (mode) == MODE_CC;
31103 if (GET_MODE_CLASS (mode) == MODE_CC
31104 || GET_MODE_CLASS (mode) == MODE_RANDOM
31105 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
31106 return false;
31107 if (FP_REGNO_P (regno))
31108 return VALID_FP_MODE_P (mode);
31109 if (SSE_REGNO_P (regno))
31110 {
31111 /* We implement the move patterns for all vector modes into and
31112 out of SSE registers, even when no operation instructions
31113 are available. OImode move is available only when AVX is
31114 enabled. */
31115 return ((TARGET_AVX && mode == OImode)
31116 || VALID_AVX256_REG_MODE (mode)
31117 || VALID_SSE_REG_MODE (mode)
31118 || VALID_SSE2_REG_MODE (mode)
31119 || VALID_MMX_REG_MODE (mode)
31120 || VALID_MMX_REG_MODE_3DNOW (mode));
31121 }
31122 if (MMX_REGNO_P (regno))
31123 {
31124 /* We implement the move patterns for 3DNOW modes even in MMX mode,
31125 so if the register is available at all, then we can move data of
31126 the given mode into or out of it. */
31127 return (VALID_MMX_REG_MODE (mode)
31128 || VALID_MMX_REG_MODE_3DNOW (mode));
31129 }
31130
31131 if (mode == QImode)
31132 {
31133 /* Take care with QImode values - they can be in non-QI regs,
31134 	 but then they do cause partial register stalls. */
31135 if (regno <= BX_REG || TARGET_64BIT)
31136 return true;
31137 if (!TARGET_PARTIAL_REG_STALL)
31138 return true;
31139 return !can_create_pseudo_p ();
31140 }
31141 /* We handle both integers and floats in the general purpose registers. */
31142 else if (VALID_INT_MODE_P (mode))
31143 return true;
31144 else if (VALID_FP_MODE_P (mode))
31145 return true;
31146 else if (VALID_DFP_MODE_P (mode))
31147 return true;
31148 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
31149 on to use that value in smaller contexts, this can easily force a
31150 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
31151 supporting DImode, allow it. */
31152 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
31153 return true;
31154
31155 return false;
31156 }
31157
31158 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
31159 tieable integer mode. */
31160
31161 static bool
31162 ix86_tieable_integer_mode_p (enum machine_mode mode)
31163 {
31164 switch (mode)
31165 {
31166 case HImode:
31167 case SImode:
31168 return true;
31169
31170 case QImode:
31171 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
31172
31173 case DImode:
31174 return TARGET_64BIT;
31175
31176 default:
31177 return false;
31178 }
31179 }
31180
31181 /* Return true if MODE1 is accessible in a register that can hold MODE2
31182 without copying. That is, all register classes that can hold MODE2
31183 can also hold MODE1. */
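/* Note that this relation is not symmetric: for example, SFmode ties with
   DFmode (every register class that can hold DFmode can also hold SFmode),
   while DFmode does not tie with SFmode.  */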
31184
31185 bool
31186 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
31187 {
31188 if (mode1 == mode2)
31189 return true;
31190
31191 if (ix86_tieable_integer_mode_p (mode1)
31192 && ix86_tieable_integer_mode_p (mode2))
31193 return true;
31194
31195 /* MODE2 being XFmode implies fp stack or general regs, which means we
31196 can tie any smaller floating point modes to it. Note that we do not
31197 tie this with TFmode. */
31198 if (mode2 == XFmode)
31199 return mode1 == SFmode || mode1 == DFmode;
31200
31201 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
31202 that we can tie it with SFmode. */
31203 if (mode2 == DFmode)
31204 return mode1 == SFmode;
31205
31206 /* If MODE2 is only appropriate for an SSE register, then tie with
31207 any other mode acceptable to SSE registers. */
31208 if (GET_MODE_SIZE (mode2) == 16
31209 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
31210 return (GET_MODE_SIZE (mode1) == 16
31211 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
31212
31213 /* If MODE2 is appropriate for an MMX register, then tie
31214 with any other mode acceptable to MMX registers. */
31215 if (GET_MODE_SIZE (mode2) == 8
31216 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
31217 return (GET_MODE_SIZE (mode1) == 8
31218 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
31219
31220 return false;
31221 }
31222
31223 /* Compute a (partial) cost for rtx X. Return true if the complete
31224 cost has been computed, and false if subexpressions should be
31225 scanned. In either case, *TOTAL contains the cost result. */
31226
31227 static bool
31228 ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
31229 bool speed)
31230 {
31231 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
31232 enum machine_mode mode = GET_MODE (x);
31233 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
31234
31235 switch (code)
31236 {
31237 case CONST_INT:
31238 case CONST:
31239 case LABEL_REF:
31240 case SYMBOL_REF:
31241 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
31242 *total = 3;
31243 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
31244 *total = 2;
31245 else if (flag_pic && SYMBOLIC_CONST (x)
31246 && (!TARGET_64BIT
31247 || (GET_CODE (x) != LABEL_REF
31248 && (GET_CODE (x) != SYMBOL_REF
31249 || !SYMBOL_REF_LOCAL_P (x)))))
31250 *total = 1;
31251 else
31252 *total = 0;
31253 return true;
31254
31255 case CONST_DOUBLE:
31256 if (mode == VOIDmode)
31257 *total = 0;
31258 else
31259 switch (standard_80387_constant_p (x))
31260 {
31261 case 1: /* 0.0 */
31262 *total = 1;
31263 break;
31264 default: /* Other constants */
31265 *total = 2;
31266 break;
31267 case 0:
31268 case -1:
31269 /* Start with (MEM (SYMBOL_REF)), since that's where
31270 it'll probably end up. Add a penalty for size. */
31271 *total = (COSTS_N_INSNS (1)
31272 + (flag_pic != 0 && !TARGET_64BIT)
31273 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
31274 break;
31275 }
31276 return true;
31277
31278 case ZERO_EXTEND:
31279 /* The zero extension is often completely free on x86_64, so make
31280 it as cheap as possible. */
31281 if (TARGET_64BIT && mode == DImode
31282 && GET_MODE (XEXP (x, 0)) == SImode)
31283 *total = 1;
31284 else if (TARGET_ZERO_EXTEND_WITH_AND)
31285 *total = cost->add;
31286 else
31287 *total = cost->movzx;
31288 return false;
31289
31290 case SIGN_EXTEND:
31291 *total = cost->movsx;
31292 return false;
31293
31294 case ASHIFT:
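      /* A left shift by 1 can be done with an add, and shifts by 2 or 3 can
	 use the lea scale factors 4 and 8 when lea is no more expensive than
	 a constant shift.  */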
31295 if (CONST_INT_P (XEXP (x, 1))
31296 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
31297 {
31298 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31299 if (value == 1)
31300 {
31301 *total = cost->add;
31302 return false;
31303 }
31304 if ((value == 2 || value == 3)
31305 && cost->lea <= cost->shift_const)
31306 {
31307 *total = cost->lea;
31308 return false;
31309 }
31310 }
31311 /* FALLTHRU */
31312
31313 case ROTATE:
31314 case ASHIFTRT:
31315 case LSHIFTRT:
31316 case ROTATERT:
31317 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
31318 {
31319 if (CONST_INT_P (XEXP (x, 1)))
31320 {
31321 if (INTVAL (XEXP (x, 1)) > 32)
31322 *total = cost->shift_const + COSTS_N_INSNS (2);
31323 else
31324 *total = cost->shift_const * 2;
31325 }
31326 else
31327 {
31328 if (GET_CODE (XEXP (x, 1)) == AND)
31329 *total = cost->shift_var * 2;
31330 else
31331 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
31332 }
31333 }
31334 else
31335 {
31336 if (CONST_INT_P (XEXP (x, 1)))
31337 *total = cost->shift_const;
31338 else
31339 *total = cost->shift_var;
31340 }
31341 return false;
31342
31343 case FMA:
31344 {
31345 rtx sub;
31346
31347 gcc_assert (FLOAT_MODE_P (mode));
31348 gcc_assert (TARGET_FMA || TARGET_FMA4);
31349
31350 /* ??? SSE scalar/vector cost should be used here. */
31351 /* ??? Bald assumption that fma has the same cost as fmul. */
31352 *total = cost->fmul;
31353 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
31354
31355 /* A negate in op0 or op2 is free: FMS, FNMA, FNMS. */
31356 sub = XEXP (x, 0);
31357 if (GET_CODE (sub) == NEG)
31358 sub = XEXP (sub, 0);
31359 *total += rtx_cost (sub, FMA, 0, speed);
31360
31361 sub = XEXP (x, 2);
31362 if (GET_CODE (sub) == NEG)
31363 sub = XEXP (sub, 0);
31364 *total += rtx_cost (sub, FMA, 2, speed);
31365 return true;
31366 }
31367
31368 case MULT:
31369 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31370 {
31371 /* ??? SSE scalar cost should be used here. */
31372 *total = cost->fmul;
31373 return false;
31374 }
31375 else if (X87_FLOAT_MODE_P (mode))
31376 {
31377 *total = cost->fmul;
31378 return false;
31379 }
31380 else if (FLOAT_MODE_P (mode))
31381 {
31382 /* ??? SSE vector cost should be used here. */
31383 *total = cost->fmul;
31384 return false;
31385 }
31386 else
31387 {
31388 rtx op0 = XEXP (x, 0);
31389 rtx op1 = XEXP (x, 1);
31390 int nbits;
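	  /* The integer multiply cost is modelled as mult_init plus mult_bit
	     for each set bit of a constant multiplier (e.g. a multiply by
	     10 == 0b1010 counts two bits); a non-constant multiplier is
	     arbitrarily charged as if 7 bits were set.  */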
31391 if (CONST_INT_P (XEXP (x, 1)))
31392 {
31393 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
31394 for (nbits = 0; value != 0; value &= value - 1)
31395 nbits++;
31396 }
31397 else
31398 /* This is arbitrary. */
31399 nbits = 7;
31400
31401 /* Compute costs correctly for widening multiplication. */
31402 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
31403 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
31404 == GET_MODE_SIZE (mode))
31405 {
31406 int is_mulwiden = 0;
31407 enum machine_mode inner_mode = GET_MODE (op0);
31408
31409 if (GET_CODE (op0) == GET_CODE (op1))
31410 is_mulwiden = 1, op1 = XEXP (op1, 0);
31411 else if (CONST_INT_P (op1))
31412 {
31413 if (GET_CODE (op0) == SIGN_EXTEND)
31414 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
31415 == INTVAL (op1);
31416 else
31417 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
31418 }
31419
31420 if (is_mulwiden)
31421 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
31422 }
31423
31424 *total = (cost->mult_init[MODE_INDEX (mode)]
31425 + nbits * cost->mult_bit
31426 + rtx_cost (op0, outer_code, opno, speed)
31427 + rtx_cost (op1, outer_code, opno, speed));
31428
31429 return true;
31430 }
31431
31432 case DIV:
31433 case UDIV:
31434 case MOD:
31435 case UMOD:
31436 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31437 /* ??? SSE cost should be used here. */
31438 *total = cost->fdiv;
31439 else if (X87_FLOAT_MODE_P (mode))
31440 *total = cost->fdiv;
31441 else if (FLOAT_MODE_P (mode))
31442 /* ??? SSE vector cost should be used here. */
31443 *total = cost->fdiv;
31444 else
31445 *total = cost->divide[MODE_INDEX (mode)];
31446 return false;
31447
31448 case PLUS:
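      /* Address-style forms such as (plus (plus (mult reg {2,4,8}) reg)
	 const) and (plus (mult reg {2,4,8}) operand) can each be computed
	 with a single lea, so cost them as one lea plus their operands.  */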
31449 if (GET_MODE_CLASS (mode) == MODE_INT
31450 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
31451 {
31452 if (GET_CODE (XEXP (x, 0)) == PLUS
31453 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
31454 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
31455 && CONSTANT_P (XEXP (x, 1)))
31456 {
31457 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
31458 if (val == 2 || val == 4 || val == 8)
31459 {
31460 *total = cost->lea;
31461 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31462 outer_code, opno, speed);
31463 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
31464 outer_code, opno, speed);
31465 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31466 return true;
31467 }
31468 }
31469 else if (GET_CODE (XEXP (x, 0)) == MULT
31470 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
31471 {
31472 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
31473 if (val == 2 || val == 4 || val == 8)
31474 {
31475 *total = cost->lea;
31476 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31477 outer_code, opno, speed);
31478 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31479 return true;
31480 }
31481 }
31482 else if (GET_CODE (XEXP (x, 0)) == PLUS)
31483 {
31484 *total = cost->lea;
31485 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
31486 outer_code, opno, speed);
31487 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
31488 outer_code, opno, speed);
31489 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
31490 return true;
31491 }
31492 }
31493 /* FALLTHRU */
31494
31495 case MINUS:
31496 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31497 {
31498 /* ??? SSE cost should be used here. */
31499 *total = cost->fadd;
31500 return false;
31501 }
31502 else if (X87_FLOAT_MODE_P (mode))
31503 {
31504 *total = cost->fadd;
31505 return false;
31506 }
31507 else if (FLOAT_MODE_P (mode))
31508 {
31509 /* ??? SSE vector cost should be used here. */
31510 *total = cost->fadd;
31511 return false;
31512 }
31513 /* FALLTHRU */
31514
31515 case AND:
31516 case IOR:
31517 case XOR:
31518 if (!TARGET_64BIT && mode == DImode)
31519 {
31520 *total = (cost->add * 2
31521 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
31522 << (GET_MODE (XEXP (x, 0)) != DImode))
31523 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
31524 << (GET_MODE (XEXP (x, 1)) != DImode)));
31525 return true;
31526 }
31527 /* FALLTHRU */
31528
31529 case NEG:
31530 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31531 {
31532 /* ??? SSE cost should be used here. */
31533 *total = cost->fchs;
31534 return false;
31535 }
31536 else if (X87_FLOAT_MODE_P (mode))
31537 {
31538 *total = cost->fchs;
31539 return false;
31540 }
31541 else if (FLOAT_MODE_P (mode))
31542 {
31543 /* ??? SSE vector cost should be used here. */
31544 *total = cost->fchs;
31545 return false;
31546 }
31547 /* FALLTHRU */
31548
31549 case NOT:
31550 if (!TARGET_64BIT && mode == DImode)
31551 *total = cost->add * 2;
31552 else
31553 *total = cost->add;
31554 return false;
31555
31556 case COMPARE:
31557 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
31558 && XEXP (XEXP (x, 0), 1) == const1_rtx
31559 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
31560 && XEXP (x, 1) == const0_rtx)
31561 {
31562 /* This kind of construct is implemented using test[bwl].
31563 Treat it as if we had an AND. */
31564 *total = (cost->add
31565 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
31566 + rtx_cost (const1_rtx, outer_code, opno, speed));
31567 return true;
31568 }
31569 return false;
31570
31571 case FLOAT_EXTEND:
31572 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
31573 *total = 0;
31574 return false;
31575
31576 case ABS:
31577 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31578 /* ??? SSE cost should be used here. */
31579 *total = cost->fabs;
31580 else if (X87_FLOAT_MODE_P (mode))
31581 *total = cost->fabs;
31582 else if (FLOAT_MODE_P (mode))
31583 /* ??? SSE vector cost should be used here. */
31584 *total = cost->fabs;
31585 return false;
31586
31587 case SQRT:
31588 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
31589 /* ??? SSE cost should be used here. */
31590 *total = cost->fsqrt;
31591 else if (X87_FLOAT_MODE_P (mode))
31592 *total = cost->fsqrt;
31593 else if (FLOAT_MODE_P (mode))
31594 /* ??? SSE vector cost should be used here. */
31595 *total = cost->fsqrt;
31596 return false;
31597
31598 case UNSPEC:
31599 if (XINT (x, 1) == UNSPEC_TP)
31600 *total = 0;
31601 return false;
31602
31603 case VEC_SELECT:
31604 case VEC_CONCAT:
31605 case VEC_MERGE:
31606 case VEC_DUPLICATE:
31607 /* ??? Assume all of these vector manipulation patterns are
31608 recognizable. In which case they all pretty much have the
31609 same cost. */
31610 *total = COSTS_N_INSNS (1);
31611 return true;
31612
31613 default:
31614 return false;
31615 }
31616 }
31617
31618 #if TARGET_MACHO
31619
31620 static int current_machopic_label_num;
31621
31622 /* Given a symbol name and its associated stub, write out the
31623 definition of the stub. */
31624
31625 void
31626 machopic_output_stub (FILE *file, const char *symb, const char *stub)
31627 {
31628 unsigned int length;
31629 char *binder_name, *symbol_name, lazy_ptr_name[32];
31630 int label = ++current_machopic_label_num;
31631
31632 /* For 64-bit we shouldn't get here. */
31633 gcc_assert (!TARGET_64BIT);
31634
31635 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
31636 symb = targetm.strip_name_encoding (symb);
31637
31638 length = strlen (stub);
31639 binder_name = XALLOCAVEC (char, length + 32);
31640 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
31641
31642 length = strlen (symb);
31643 symbol_name = XALLOCAVEC (char, length + 32);
31644 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
31645
31646 sprintf (lazy_ptr_name, "L%d$lz", label);
31647
31648 if (MACHOPIC_ATT_STUB)
31649 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
31650 else if (MACHOPIC_PURE)
31651 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
31652 else
31653 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
31654
31655 fprintf (file, "%s:\n", stub);
31656 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31657
31658 if (MACHOPIC_ATT_STUB)
31659 {
31660 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
31661 }
31662 else if (MACHOPIC_PURE)
31663 {
31664 /* PIC stub. */
31665 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31666 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
31667 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
31668 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
31669 label, lazy_ptr_name, label);
31670 fprintf (file, "\tjmp\t*%%ecx\n");
31671 }
31672 else
31673 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
31674
31675 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
31676 it needs no stub-binding-helper. */
31677 if (MACHOPIC_ATT_STUB)
31678 return;
31679
31680 fprintf (file, "%s:\n", binder_name);
31681
31682 if (MACHOPIC_PURE)
31683 {
31684 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
31685 fprintf (file, "\tpushl\t%%ecx\n");
31686 }
31687 else
31688 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
31689
31690 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
31691
31692 /* N.B. Keep the correspondence of these
31693 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
31694 old-pic/new-pic/non-pic stubs; altering this will break
31695 compatibility with existing dylibs. */
31696 if (MACHOPIC_PURE)
31697 {
31698 /* 25-byte PIC stub using "CALL get_pc_thunk". */
31699 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
31700 }
31701 else
31702 /* 16-byte -mdynamic-no-pic stub. */
31703 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
31704
31705 fprintf (file, "%s:\n", lazy_ptr_name);
31706 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
31707 fprintf (file, ASM_LONG "%s\n", binder_name);
31708 }
31709 #endif /* TARGET_MACHO */
31710
31711 /* Order the registers for the register allocator. */
31712
31713 void
31714 x86_order_regs_for_local_alloc (void)
31715 {
31716 int pos = 0;
31717 int i;
31718
31719 /* First allocate the local general purpose registers. */
31720 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31721 if (GENERAL_REGNO_P (i) && call_used_regs[i])
31722 reg_alloc_order [pos++] = i;
31723
31724 /* Global general purpose registers. */
31725 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
31726 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
31727 reg_alloc_order [pos++] = i;
31728
31729 /* x87 registers come first in case we are doing FP math
31730 using them. */
31731 if (!TARGET_SSE_MATH)
31732 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31733 reg_alloc_order [pos++] = i;
31734
31735 /* SSE registers. */
31736 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
31737 reg_alloc_order [pos++] = i;
31738 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
31739 reg_alloc_order [pos++] = i;
31740
31741 /* x87 registers. */
31742 if (TARGET_SSE_MATH)
31743 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
31744 reg_alloc_order [pos++] = i;
31745
31746 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
31747 reg_alloc_order [pos++] = i;
31748
31749 /* Initialize the rest of the array, as we do not allocate some registers
31750 at all. */
31751 while (pos < FIRST_PSEUDO_REGISTER)
31752 reg_alloc_order [pos++] = 0;
31753 }
31754
31755 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
31756 in struct attribute_spec.handler. */
31757 static tree
31758 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
31759 tree args,
31760 int flags ATTRIBUTE_UNUSED,
31761 bool *no_add_attrs)
31762 {
31763 if (TREE_CODE (*node) != FUNCTION_TYPE
31764 && TREE_CODE (*node) != METHOD_TYPE
31765 && TREE_CODE (*node) != FIELD_DECL
31766 && TREE_CODE (*node) != TYPE_DECL)
31767 {
31768 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31769 name);
31770 *no_add_attrs = true;
31771 return NULL_TREE;
31772 }
31773 if (TARGET_64BIT)
31774 {
31775 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
31776 name);
31777 *no_add_attrs = true;
31778 return NULL_TREE;
31779 }
31780 if (is_attribute_p ("callee_pop_aggregate_return", name))
31781 {
31782 tree cst;
31783
31784 cst = TREE_VALUE (args);
31785 if (TREE_CODE (cst) != INTEGER_CST)
31786 {
31787 warning (OPT_Wattributes,
31788 "%qE attribute requires an integer constant argument",
31789 name);
31790 *no_add_attrs = true;
31791 }
31792 else if (compare_tree_int (cst, 0) != 0
31793 && compare_tree_int (cst, 1) != 0)
31794 {
31795 warning (OPT_Wattributes,
31796 "argument to %qE attribute is neither zero, nor one",
31797 name);
31798 *no_add_attrs = true;
31799 }
31800
31801 return NULL_TREE;
31802 }
31803
31804 return NULL_TREE;
31805 }
31806
31807 /* Handle an "ms_abi" or "sysv_abi" attribute; arguments as in
31808 struct attribute_spec.handler. */
31809 static tree
31810 ix86_handle_abi_attribute (tree *node, tree name,
31811 tree args ATTRIBUTE_UNUSED,
31812 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31813 {
31814 if (TREE_CODE (*node) != FUNCTION_TYPE
31815 && TREE_CODE (*node) != METHOD_TYPE
31816 && TREE_CODE (*node) != FIELD_DECL
31817 && TREE_CODE (*node) != TYPE_DECL)
31818 {
31819 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31820 name);
31821 *no_add_attrs = true;
31822 return NULL_TREE;
31823 }
31824
31825 /* Can combine regparm with all attributes but fastcall. */
31826 if (is_attribute_p ("ms_abi", name))
31827 {
31828 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
31829 {
31830 error ("ms_abi and sysv_abi attributes are not compatible");
31831 }
31832
31833 return NULL_TREE;
31834 }
31835 else if (is_attribute_p ("sysv_abi", name))
31836 {
31837 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
31838 {
31839 error ("ms_abi and sysv_abi attributes are not compatible");
31840 }
31841
31842 return NULL_TREE;
31843 }
31844
31845 return NULL_TREE;
31846 }
31847
31848 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
31849 struct attribute_spec.handler. */
31850 static tree
31851 ix86_handle_struct_attribute (tree *node, tree name,
31852 tree args ATTRIBUTE_UNUSED,
31853 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31854 {
31855 tree *type = NULL;
31856 if (DECL_P (*node))
31857 {
31858 if (TREE_CODE (*node) == TYPE_DECL)
31859 type = &TREE_TYPE (*node);
31860 }
31861 else
31862 type = node;
31863
31864 if (!(type && (TREE_CODE (*type) == RECORD_TYPE
31865 || TREE_CODE (*type) == UNION_TYPE)))
31866 {
31867 warning (OPT_Wattributes, "%qE attribute ignored",
31868 name);
31869 *no_add_attrs = true;
31870 }
31871
31872 else if ((is_attribute_p ("ms_struct", name)
31873 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
31874 || ((is_attribute_p ("gcc_struct", name)
31875 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
31876 {
31877 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
31878 name);
31879 *no_add_attrs = true;
31880 }
31881
31882 return NULL_TREE;
31883 }
31884
31885 static tree
31886 ix86_handle_fndecl_attribute (tree *node, tree name,
31887 tree args ATTRIBUTE_UNUSED,
31888 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
31889 {
31890 if (TREE_CODE (*node) != FUNCTION_DECL)
31891 {
31892 warning (OPT_Wattributes, "%qE attribute only applies to functions",
31893 name);
31894 *no_add_attrs = true;
31895 }
31896 return NULL_TREE;
31897 }
31898
31899 static bool
31900 ix86_ms_bitfield_layout_p (const_tree record_type)
31901 {
31902 return ((TARGET_MS_BITFIELD_LAYOUT
31903 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
31904 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
31905 }
31906
31907 /* Return an expression indicating where the `this' parameter is
31908 located on entry to FUNCTION. */
31909
31910 static rtx
31911 x86_this_parameter (tree function)
31912 {
31913 tree type = TREE_TYPE (function);
31914 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
31915 int nregs;
31916
31917 if (TARGET_64BIT)
31918 {
31919 const int *parm_regs;
31920
31921 if (ix86_function_type_abi (type) == MS_ABI)
31922 parm_regs = x86_64_ms_abi_int_parameter_registers;
31923 else
31924 parm_regs = x86_64_int_parameter_registers;
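	  /* If the function returns an aggregate in memory, the hidden
	     return-slot pointer takes the first integer parameter register,
	     so `this' is found in the second one.  */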
31925 return gen_rtx_REG (DImode, parm_regs[aggr]);
31926 }
31927
31928 nregs = ix86_function_regparm (type, function);
31929
31930 if (nregs > 0 && !stdarg_p (type))
31931 {
31932 int regno;
31933 unsigned int ccvt = ix86_get_callcvt (type);
31934
31935 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
31936 regno = aggr ? DX_REG : CX_REG;
31937 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
31938 {
31939 regno = CX_REG;
31940 if (aggr)
31941 return gen_rtx_MEM (SImode,
31942 plus_constant (stack_pointer_rtx, 4));
31943 }
31944 else
31945 {
31946 regno = AX_REG;
31947 if (aggr)
31948 {
31949 regno = DX_REG;
31950 if (nregs == 1)
31951 return gen_rtx_MEM (SImode,
31952 plus_constant (stack_pointer_rtx, 4));
31953 }
31954 }
31955 return gen_rtx_REG (SImode, regno);
31956 }
31957
31958 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
31959 }
31960
31961 /* Determine whether x86_output_mi_thunk can succeed. */
31962
31963 static bool
31964 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
31965 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
31966 HOST_WIDE_INT vcall_offset, const_tree function)
31967 {
31968 /* 64-bit can handle anything. */
31969 if (TARGET_64BIT)
31970 return true;
31971
31972 /* For 32-bit, everything's fine if we have one free register. */
31973 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
31974 return true;
31975
31976 /* Need a free register for vcall_offset. */
31977 if (vcall_offset)
31978 return false;
31979
31980 /* Need a free register for GOT references. */
31981 if (flag_pic && !targetm.binds_local_p (function))
31982 return false;
31983
31984 /* Otherwise ok. */
31985 return true;
31986 }
31987
31988 /* Output the assembler code for a thunk function. THUNK_DECL is the
31989 declaration for the thunk function itself, FUNCTION is the decl for
31990 the target function. DELTA is an immediate constant offset to be
31991 added to THIS. If VCALL_OFFSET is nonzero, the word at
31992 *(*this + vcall_offset) should be added to THIS. */
31993
31994 static void
31995 x86_output_mi_thunk (FILE *file,
31996 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
31997 HOST_WIDE_INT vcall_offset, tree function)
31998 {
31999 rtx this_param = x86_this_parameter (function);
32000 rtx this_reg, tmp, fnaddr;
32001
32002 emit_note (NOTE_INSN_PROLOGUE_END);
32003
32004 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
32005 pull it in now and let DELTA benefit. */
32006 if (REG_P (this_param))
32007 this_reg = this_param;
32008 else if (vcall_offset)
32009 {
32010 /* Put the this parameter into %eax. */
32011 this_reg = gen_rtx_REG (Pmode, AX_REG);
32012 emit_move_insn (this_reg, this_param);
32013 }
32014 else
32015 this_reg = NULL_RTX;
32016
32017 /* Adjust the this parameter by a fixed constant. */
32018 if (delta)
32019 {
32020 rtx delta_rtx = GEN_INT (delta);
32021 rtx delta_dst = this_reg ? this_reg : this_param;
32022
32023 if (TARGET_64BIT)
32024 {
32025 if (!x86_64_general_operand (delta_rtx, Pmode))
32026 {
32027 tmp = gen_rtx_REG (Pmode, R10_REG);
32028 emit_move_insn (tmp, delta_rtx);
32029 delta_rtx = tmp;
32030 }
32031 }
32032
32033 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
32034 }
32035
32036 /* Adjust the this parameter by a value stored in the vtable. */
32037 if (vcall_offset)
32038 {
32039 rtx vcall_addr, vcall_mem, this_mem;
32040 unsigned int tmp_regno;
32041
32042 if (TARGET_64BIT)
32043 tmp_regno = R10_REG;
32044 else
32045 {
32046 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
32047 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
32048 tmp_regno = AX_REG;
32049 else
32050 tmp_regno = CX_REG;
32051 }
32052 tmp = gen_rtx_REG (Pmode, tmp_regno);
32053
32054 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
32055 if (Pmode != ptr_mode)
32056 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
32057 emit_move_insn (tmp, this_mem);
32058
32059 /* Adjust the this parameter. */
32060 vcall_addr = plus_constant (tmp, vcall_offset);
32061 if (TARGET_64BIT
32062 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
32063 {
32064 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
32065 emit_move_insn (tmp2, GEN_INT (vcall_offset));
32066 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
32067 }
32068
32069 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
32070 if (Pmode != ptr_mode)
32071 emit_insn (gen_addsi_1_zext (this_reg,
32072 gen_rtx_REG (ptr_mode,
32073 REGNO (this_reg)),
32074 vcall_mem));
32075 else
32076 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
32077 }
32078
32079 /* If necessary, drop THIS back to its stack slot. */
32080 if (this_reg && this_reg != this_param)
32081 emit_move_insn (this_param, this_reg);
32082
32083 fnaddr = XEXP (DECL_RTL (function), 0);
32084 if (TARGET_64BIT)
32085 {
32086 if (!flag_pic || targetm.binds_local_p (function)
32087 || cfun->machine->call_abi == MS_ABI)
32088 ;
32089 else
32090 {
32091 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
32092 tmp = gen_rtx_CONST (Pmode, tmp);
32093 fnaddr = gen_rtx_MEM (Pmode, tmp);
32094 }
32095 }
32096 else
32097 {
32098 if (!flag_pic || targetm.binds_local_p (function))
32099 ;
32100 #if TARGET_MACHO
32101 else if (TARGET_MACHO)
32102 {
32103 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
32104 fnaddr = XEXP (fnaddr, 0);
32105 }
32106 #endif /* TARGET_MACHO */
32107 else
32108 {
32109 tmp = gen_rtx_REG (Pmode, CX_REG);
32110 output_set_got (tmp, NULL_RTX);
32111
32112 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
32113 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
32114 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
32115 }
32116 }
32117
32118 /* Our sibling call patterns do not allow memories, because we have no
32119 predicate that can distinguish between frame and non-frame memory.
32120 For our purposes here, we can get away with (ab)using a jump pattern,
32121 because we're going to do no optimization. */
32122 if (MEM_P (fnaddr))
32123 emit_jump_insn (gen_indirect_jump (fnaddr));
32124 else
32125 {
32126 tmp = gen_rtx_MEM (QImode, fnaddr);
32127 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
32128 tmp = emit_call_insn (tmp);
32129 SIBLING_CALL_P (tmp) = 1;
32130 }
32131 emit_barrier ();
32132
32133 /* Emit just enough of rest_of_compilation to get the insns emitted.
32134 Note that use_thunk calls assemble_start_function et al. */
32135 tmp = get_insns ();
32136 insn_locators_alloc ();
32137 shorten_branches (tmp);
32138 final_start_function (tmp, file, 1);
32139 final (tmp, file, 1);
32140 final_end_function ();
32141 }
32142
32143 static void
32144 x86_file_start (void)
32145 {
32146 default_file_start ();
32147 #if TARGET_MACHO
32148 darwin_file_start ();
32149 #endif
32150 if (X86_FILE_START_VERSION_DIRECTIVE)
32151 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
32152 if (X86_FILE_START_FLTUSED)
32153 fputs ("\t.global\t__fltused\n", asm_out_file);
32154 if (ix86_asm_dialect == ASM_INTEL)
32155 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
32156 }
32157
32158 int
32159 x86_field_alignment (tree field, int computed)
32160 {
32161 enum machine_mode mode;
32162 tree type = TREE_TYPE (field);
32163
32164 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
32165 return computed;
32166 mode = TYPE_MODE (strip_array_types (type));
32167 if (mode == DFmode || mode == DCmode
32168 || GET_MODE_CLASS (mode) == MODE_INT
32169 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
32170 return MIN (32, computed);
32171 return computed;
32172 }
32173
32174 /* Output assembler code to FILE to increment profiler label # LABELNO
32175 for profiling a function entry. */
32176 void
32177 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
32178 {
32179 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
32180 : MCOUNT_NAME);
32181
32182 if (TARGET_64BIT)
32183 {
32184 #ifndef NO_PROFILE_COUNTERS
32185 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
32186 #endif
32187
32188 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
32189 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
32190 else
32191 fprintf (file, "\tcall\t%s\n", mcount_name);
32192 }
32193 else if (flag_pic)
32194 {
32195 #ifndef NO_PROFILE_COUNTERS
32196 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
32197 LPREFIX, labelno);
32198 #endif
32199 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
32200 }
32201 else
32202 {
32203 #ifndef NO_PROFILE_COUNTERS
32204 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
32205 LPREFIX, labelno);
32206 #endif
32207 fprintf (file, "\tcall\t%s\n", mcount_name);
32208 }
32209 }
32210
32211 /* We don't have exact information about the insn sizes, but we may assume
32212 quite safely that we are informed about all 1 byte insns and memory
32213 address sizes. This is enough to eliminate unnecessary padding in
32214 99% of cases. */
32215
32216 static int
32217 min_insn_size (rtx insn)
32218 {
32219 int l = 0, len;
32220
32221 if (!INSN_P (insn) || !active_insn_p (insn))
32222 return 0;
32223
32224 /* Discard alignments we've emitted, as well as jump table data. */
32225 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
32226 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
32227 return 0;
32228 if (JUMP_TABLE_DATA_P (insn))
32229 return 0;
32230
32231 /* Important case - calls are always 5 bytes.
32232 It is common to have many calls in a row. */
32233 if (CALL_P (insn)
32234 && symbolic_reference_mentioned_p (PATTERN (insn))
32235 && !SIBLING_CALL_P (insn))
32236 return 5;
32237 len = get_attr_length (insn);
32238 if (len <= 1)
32239 return 1;
32240
32241 /* For normal instructions we rely on get_attr_length being exact,
32242 with a few exceptions. */
32243 if (!JUMP_P (insn))
32244 {
32245 enum attr_type type = get_attr_type (insn);
32246
32247 switch (type)
32248 {
32249 case TYPE_MULTI:
32250 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
32251 || asm_noperands (PATTERN (insn)) >= 0)
32252 return 0;
32253 break;
32254 case TYPE_OTHER:
32255 case TYPE_FCMP:
32256 break;
32257 default:
32258 /* Otherwise trust get_attr_length. */
32259 return len;
32260 }
32261
32262 l = get_attr_length_address (insn);
32263 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
32264 l = 4;
32265 }
32266 if (l)
32267 return 1+l;
32268 else
32269 return 2;
32270 }
32271
32272 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32273
32274 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
32275 16 byte window. */
32276
32277 static void
32278 ix86_avoid_jump_mispredicts (void)
32279 {
32280 rtx insn, start = get_insns ();
32281 int nbytes = 0, njumps = 0;
32282 int isjump = 0;
32283
32284 /* Look for all minimal intervals of instructions containing 4 jumps.
32285 The intervals are bounded by START and INSN.  NBYTES is the total
32286 size of the instructions in the interval, including INSN and not
32287 including START.  When NBYTES is smaller than 16 bytes, it is possible
32288 that the end of START and the end of INSN fall into the same 16-byte page.
32289 
32290 The smallest offset in the page at which INSN can start is the case where
32291 START ends at offset 0.  The offset of INSN is then NBYTES - sizeof (INSN).
32292 We add a p2align to the 16-byte window with maxskip
32293 15 - NBYTES + sizeof (INSN).  */
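  /* In other words, whenever a fourth jump could otherwise fall into the
     same 16-byte window as the three preceding jumps, padding is emitted
     in front of it so that it starts in the next window.  */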
32294 for (insn = start; insn; insn = NEXT_INSN (insn))
32295 {
32296 int min_size;
32297
32298 if (LABEL_P (insn))
32299 {
32300 int align = label_to_alignment (insn);
32301 int max_skip = label_to_max_skip (insn);
32302
32303 if (max_skip > 15)
32304 max_skip = 15;
32305 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
32306 already in the current 16 byte page, because otherwise
32307 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
32308 bytes to reach 16 byte boundary. */
32309 if (align <= 0
32310 || (align <= 3 && max_skip != (1 << align) - 1))
32311 max_skip = 0;
32312 if (dump_file)
32313 fprintf (dump_file, "Label %i with max_skip %i\n",
32314 INSN_UID (insn), max_skip);
32315 if (max_skip)
32316 {
32317 while (nbytes + max_skip >= 16)
32318 {
32319 start = NEXT_INSN (start);
32320 if ((JUMP_P (start)
32321 && GET_CODE (PATTERN (start)) != ADDR_VEC
32322 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32323 || CALL_P (start))
32324 njumps--, isjump = 1;
32325 else
32326 isjump = 0;
32327 nbytes -= min_insn_size (start);
32328 }
32329 }
32330 continue;
32331 }
32332
32333 min_size = min_insn_size (insn);
32334 nbytes += min_size;
32335 if (dump_file)
32336 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
32337 INSN_UID (insn), min_size);
32338 if ((JUMP_P (insn)
32339 && GET_CODE (PATTERN (insn)) != ADDR_VEC
32340 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
32341 || CALL_P (insn))
32342 njumps++;
32343 else
32344 continue;
32345
32346 while (njumps > 3)
32347 {
32348 start = NEXT_INSN (start);
32349 if ((JUMP_P (start)
32350 && GET_CODE (PATTERN (start)) != ADDR_VEC
32351 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
32352 || CALL_P (start))
32353 njumps--, isjump = 1;
32354 else
32355 isjump = 0;
32356 nbytes -= min_insn_size (start);
32357 }
32358 gcc_assert (njumps >= 0);
32359 if (dump_file)
32360 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
32361 INSN_UID (start), INSN_UID (insn), nbytes);
32362
32363 if (njumps == 3 && isjump && nbytes < 16)
32364 {
32365 int padsize = 15 - nbytes + min_insn_size (insn);
32366
32367 if (dump_file)
32368 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
32369 INSN_UID (insn), padsize);
32370 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
32371 }
32372 }
32373 }
32374 #endif
32375
32376 /* AMD Athlon works faster
32377 when RET is not the destination of a conditional jump nor directly preceded
32378 by another jump instruction.  We avoid the penalty by inserting a NOP just
32379 before the RET instruction in such cases. */
32380 static void
32381 ix86_pad_returns (void)
32382 {
32383 edge e;
32384 edge_iterator ei;
32385
32386 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32387 {
32388 basic_block bb = e->src;
32389 rtx ret = BB_END (bb);
32390 rtx prev;
32391 bool replace = false;
32392
32393 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
32394 || optimize_bb_for_size_p (bb))
32395 continue;
32396 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
32397 if (active_insn_p (prev) || LABEL_P (prev))
32398 break;
32399 if (prev && LABEL_P (prev))
32400 {
32401 edge e;
32402 edge_iterator ei;
32403
32404 FOR_EACH_EDGE (e, ei, bb->preds)
32405 if (EDGE_FREQUENCY (e) && e->src->index >= 0
32406 && !(e->flags & EDGE_FALLTHRU))
32407 replace = true;
32408 }
32409 if (!replace)
32410 {
32411 prev = prev_active_insn (ret);
32412 if (prev
32413 && ((JUMP_P (prev) && any_condjump_p (prev))
32414 || CALL_P (prev)))
32415 replace = true;
32416 /* Empty functions get a branch mispredict even when
32417 the jump destination is not visible to us. */
32418 if (!prev && !optimize_function_for_size_p (cfun))
32419 replace = true;
32420 }
32421 if (replace)
32422 {
32423 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
32424 delete_insn (ret);
32425 }
32426 }
32427 }
32428
32429 /* Count the minimum number of instructions in BB. Return 4 if the
32430 number of instructions >= 4. */
32431
32432 static int
32433 ix86_count_insn_bb (basic_block bb)
32434 {
32435 rtx insn;
32436 int insn_count = 0;
32437
32438 /* Count number of instructions in this block. Return 4 if the number
32439 of instructions >= 4. */
32440 FOR_BB_INSNS (bb, insn)
32441 {
32442 /* This only happens in exit blocks. */
32443 if (JUMP_P (insn)
32444 && ANY_RETURN_P (PATTERN (insn)))
32445 break;
32446
32447 if (NONDEBUG_INSN_P (insn)
32448 && GET_CODE (PATTERN (insn)) != USE
32449 && GET_CODE (PATTERN (insn)) != CLOBBER)
32450 {
32451 insn_count++;
32452 if (insn_count >= 4)
32453 return insn_count;
32454 }
32455 }
32456
32457 return insn_count;
32458 }
32459
32460
32461 /* Count the minimum number of instructions in code path in BB.
32462 Return 4 if the number of instructions >= 4. */
32463
32464 static int
32465 ix86_count_insn (basic_block bb)
32466 {
32467 edge e;
32468 edge_iterator ei;
32469 int min_prev_count;
32470
32471 /* Only bother counting instructions along paths with no
32472 more than 2 basic blocks between entry and exit. Given
32473 that BB has an edge to exit, determine if a predecessor
32474 of BB has an edge from entry. If so, compute the number
32475 of instructions in the predecessor block. If there
32476 happen to be multiple such blocks, compute the minimum. */
32477 min_prev_count = 4;
32478 FOR_EACH_EDGE (e, ei, bb->preds)
32479 {
32480 edge prev_e;
32481 edge_iterator prev_ei;
32482
32483 if (e->src == ENTRY_BLOCK_PTR)
32484 {
32485 min_prev_count = 0;
32486 break;
32487 }
32488 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
32489 {
32490 if (prev_e->src == ENTRY_BLOCK_PTR)
32491 {
32492 int count = ix86_count_insn_bb (e->src);
32493 if (count < min_prev_count)
32494 min_prev_count = count;
32495 break;
32496 }
32497 }
32498 }
32499
32500 if (min_prev_count < 4)
32501 min_prev_count += ix86_count_insn_bb (bb);
32502
32503 return min_prev_count;
32504 }
32505
32506 /* Pad short functions to 4 instructions. */
32507
32508 static void
32509 ix86_pad_short_function (void)
32510 {
32511 edge e;
32512 edge_iterator ei;
32513
32514 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
32515 {
32516 rtx ret = BB_END (e->src);
32517 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
32518 {
32519 int insn_count = ix86_count_insn (e->src);
32520
32521 /* Pad short function. */
32522 if (insn_count < 4)
32523 {
32524 rtx insn = ret;
32525
32526 /* Find epilogue. */
32527 while (insn
32528 && (!NOTE_P (insn)
32529 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
32530 insn = PREV_INSN (insn);
32531
32532 if (!insn)
32533 insn = ret;
32534
32535 /* Two NOPs count as one instruction. */
32536 insn_count = 2 * (4 - insn_count);
32537 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
32538 }
32539 }
32540 }
32541 }
32542
32543 /* Implement machine specific optimizations.  We implement padding of returns
32544 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
32545 static void
32546 ix86_reorg (void)
32547 {
32548 /* We are freeing block_for_insn in the toplev to keep compatibility
32549 with old MDEP_REORGS that are not CFG based. Recompute it now. */
32550 compute_bb_for_insn ();
32551
32552 /* Run the vzeroupper optimization if needed. */
32553 if (TARGET_VZEROUPPER)
32554 move_or_delete_vzeroupper ();
32555
32556 if (optimize && optimize_function_for_speed_p (cfun))
32557 {
32558 if (TARGET_PAD_SHORT_FUNCTION)
32559 ix86_pad_short_function ();
32560 else if (TARGET_PAD_RETURNS)
32561 ix86_pad_returns ();
32562 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
32563 if (TARGET_FOUR_JUMP_LIMIT)
32564 ix86_avoid_jump_mispredicts ();
32565 #endif
32566 }
32567 }
32568
32569 /* Return nonzero when a QImode register that must be represented via a REX
32570 prefix is used. */
32571 bool
32572 x86_extended_QIreg_mentioned_p (rtx insn)
32573 {
32574 int i;
32575 extract_insn_cached (insn);
32576 for (i = 0; i < recog_data.n_operands; i++)
32577 if (REG_P (recog_data.operand[i])
32578 && REGNO (recog_data.operand[i]) > BX_REG)
32579 return true;
32580 return false;
32581 }
32582
32583 /* Return nonzero when P points to a register encoded via a REX prefix.
32584 Called via for_each_rtx. */
32585 static int
32586 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
32587 {
32588 unsigned int regno;
32589 if (!REG_P (*p))
32590 return 0;
32591 regno = REGNO (*p);
32592 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
32593 }
32594
32595 /* Return true when INSN mentions a register that must be encoded using a
32596 REX prefix. */
32597 bool
32598 x86_extended_reg_mentioned_p (rtx insn)
32599 {
32600 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
32601 extended_reg_mentioned_1, NULL);
32602 }
32603
32604 /* If profitable, negate (without causing overflow) integer constant
32605 of mode MODE at location LOC. Return true in this case. */
32606 bool
32607 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
32608 {
32609 HOST_WIDE_INT val;
32610
32611 if (!CONST_INT_P (*loc))
32612 return false;
32613
32614 switch (mode)
32615 {
32616 case DImode:
32617 /* DImode x86_64 constants must fit in 32 bits. */
32618 gcc_assert (x86_64_immediate_operand (*loc, mode));
32619
32620 mode = SImode;
32621 break;
32622
32623 case SImode:
32624 case HImode:
32625 case QImode:
32626 break;
32627
32628 default:
32629 gcc_unreachable ();
32630 }
32631
32632 /* Avoid overflows. */
32633 if (mode_signbit_p (mode, *loc))
32634 return false;
32635
32636 val = INTVAL (*loc);
32637
32638 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
32639 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
32640 if ((val < 0 && val != -128)
32641 || val == 128)
32642 {
32643 *loc = GEN_INT (-val);
32644 return true;
32645 }
32646
32647 return false;
32648 }
32649
32650 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
32651 optabs would emit if we didn't have TFmode patterns. */
32652
32653 void
32654 x86_emit_floatuns (rtx operands[2])
32655 {
32656 rtx neglab, donelab, i0, i1, f0, in, out;
32657 enum machine_mode mode, inmode;
32658
32659 inmode = GET_MODE (operands[1]);
32660 gcc_assert (inmode == SImode || inmode == DImode);
32661
32662 out = operands[0];
32663 in = force_reg (inmode, operands[1]);
32664 mode = GET_MODE (out);
32665 neglab = gen_label_rtx ();
32666 donelab = gen_label_rtx ();
32667 f0 = gen_reg_rtx (mode);
32668
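  /* If IN is non-negative when interpreted as signed, a plain signed
     conversion is already correct.  Otherwise convert IN/2, with the
     discarded low bit ORed back in so the final rounding is unaffected,
     and then double the result.  */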
32669 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
32670
32671 expand_float (out, in, 0);
32672
32673 emit_jump_insn (gen_jump (donelab));
32674 emit_barrier ();
32675
32676 emit_label (neglab);
32677
32678 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
32679 1, OPTAB_DIRECT);
32680 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
32681 1, OPTAB_DIRECT);
32682 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
32683
32684 expand_float (f0, i0, 0);
32685
32686 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
32687
32688 emit_label (donelab);
32689 }
32690 \f
32691 /* AVX2 supports 32-byte integer vector operations, so the longest vector
32692 we are faced with is V32QImode. */
32693 #define MAX_VECT_LEN 32
32694
32695 struct expand_vec_perm_d
32696 {
32697 rtx target, op0, op1;
32698 unsigned char perm[MAX_VECT_LEN];
32699 enum machine_mode vmode;
32700 unsigned char nelt;
32701 bool testing_p;
32702 };
32703
32704 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
32705 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
32706
32707 /* Get a vector mode of the same size as the original but with elements
32708 twice as wide. This is only guaranteed to apply to integral vectors. */
32709
32710 static inline enum machine_mode
32711 get_mode_wider_vector (enum machine_mode o)
32712 {
32713 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
32714 enum machine_mode n = GET_MODE_WIDER_MODE (o);
32715 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
32716 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
32717 return n;
32718 }
32719
32720 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32721 with all elements equal to VAR. Return true if successful. */
32722
32723 static bool
32724 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
32725 rtx target, rtx val)
32726 {
32727 bool ok;
32728
32729 switch (mode)
32730 {
32731 case V2SImode:
32732 case V2SFmode:
32733 if (!mmx_ok)
32734 return false;
32735 /* FALLTHRU */
32736
32737 case V4DFmode:
32738 case V4DImode:
32739 case V8SFmode:
32740 case V8SImode:
32741 case V2DFmode:
32742 case V2DImode:
32743 case V4SFmode:
32744 case V4SImode:
32745 {
32746 rtx insn, dup;
32747
32748 /* First attempt to recognize VAL as-is. */
32749 dup = gen_rtx_VEC_DUPLICATE (mode, val);
32750 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
32751 if (recog_memoized (insn) < 0)
32752 {
32753 rtx seq;
32754 /* If that fails, force VAL into a register. */
32755
32756 start_sequence ();
32757 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
32758 seq = get_insns ();
32759 end_sequence ();
32760 if (seq)
32761 emit_insn_before (seq, insn);
32762
32763 ok = recog_memoized (insn) >= 0;
32764 gcc_assert (ok);
32765 }
32766 }
32767 return true;
32768
32769 case V4HImode:
32770 if (!mmx_ok)
32771 return false;
32772 if (TARGET_SSE || TARGET_3DNOW_A)
32773 {
32774 rtx x;
32775
32776 val = gen_lowpart (SImode, val);
32777 x = gen_rtx_TRUNCATE (HImode, val);
32778 x = gen_rtx_VEC_DUPLICATE (mode, x);
32779 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32780 return true;
32781 }
32782 goto widen;
32783
32784 case V8QImode:
32785 if (!mmx_ok)
32786 return false;
32787 goto widen;
32788
32789 case V8HImode:
32790 if (TARGET_SSE2)
32791 {
32792 struct expand_vec_perm_d dperm;
32793 rtx tmp1, tmp2;
32794
32795 permute:
32796 memset (&dperm, 0, sizeof (dperm));
32797 dperm.target = target;
32798 dperm.vmode = mode;
32799 dperm.nelt = GET_MODE_NUNITS (mode);
32800 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
32801
32802 /* Extend to SImode using a paradoxical SUBREG. */
32803 tmp1 = gen_reg_rtx (SImode);
32804 emit_move_insn (tmp1, gen_lowpart (SImode, val));
32805
32806 /* Insert the SImode value as low element of a V4SImode vector. */
32807 tmp2 = gen_lowpart (V4SImode, dperm.op0);
32808 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
32809
32810 ok = (expand_vec_perm_1 (&dperm)
32811 || expand_vec_perm_broadcast_1 (&dperm));
32812 gcc_assert (ok);
32813 return ok;
32814 }
32815 goto widen;
32816
32817 case V16QImode:
32818 if (TARGET_SSE2)
32819 goto permute;
32820 goto widen;
32821
32822 widen:
32823 /* Replicate the value once into the next wider mode and recurse. */
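	 /* For example, a V8QImode broadcast of byte B first builds the
	    HImode value (B << 8) | B and then broadcasts that value as
	    V4HImode.  */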
32824 {
32825 enum machine_mode smode, wsmode, wvmode;
32826 rtx x;
32827
32828 smode = GET_MODE_INNER (mode);
32829 wvmode = get_mode_wider_vector (mode);
32830 wsmode = GET_MODE_INNER (wvmode);
32831
32832 val = convert_modes (wsmode, smode, val, true);
32833 x = expand_simple_binop (wsmode, ASHIFT, val,
32834 GEN_INT (GET_MODE_BITSIZE (smode)),
32835 NULL_RTX, 1, OPTAB_LIB_WIDEN);
32836 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
32837
32838 x = gen_lowpart (wvmode, target);
32839 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
32840 gcc_assert (ok);
32841 return ok;
32842 }
32843
32844 case V16HImode:
32845 case V32QImode:
32846 {
32847 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
32848 rtx x = gen_reg_rtx (hvmode);
32849
32850 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
32851 gcc_assert (ok);
32852
32853 x = gen_rtx_VEC_CONCAT (mode, x, x);
32854 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32855 }
32856 return true;
32857
32858 default:
32859 return false;
32860 }
32861 }
32862
32863 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
32864 whose ONE_VAR element is VAR, and other elements are zero. Return true
32865 if successful. */
32866
32867 static bool
32868 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
32869 rtx target, rtx var, int one_var)
32870 {
32871 enum machine_mode vsimode;
32872 rtx new_target;
32873 rtx x, tmp;
32874 bool use_vector_set = false;
32875
32876 switch (mode)
32877 {
32878 case V2DImode:
32879 /* For SSE4.1, we normally use vector set. But if the second
32880 element is zero and inter-unit moves are OK, we use movq
32881 instead. */
32882 use_vector_set = (TARGET_64BIT
32883 && TARGET_SSE4_1
32884 && !(TARGET_INTER_UNIT_MOVES
32885 && one_var == 0));
32886 break;
32887 case V16QImode:
32888 case V4SImode:
32889 case V4SFmode:
32890 use_vector_set = TARGET_SSE4_1;
32891 break;
32892 case V8HImode:
32893 use_vector_set = TARGET_SSE2;
32894 break;
32895 case V4HImode:
32896 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
32897 break;
32898 case V32QImode:
32899 case V16HImode:
32900 case V8SImode:
32901 case V8SFmode:
32902 case V4DFmode:
32903 use_vector_set = TARGET_AVX;
32904 break;
32905 case V4DImode:
32906 /* Use ix86_expand_vector_set in 64bit mode only. */
32907 use_vector_set = TARGET_AVX && TARGET_64BIT;
32908 break;
32909 default:
32910 break;
32911 }
32912
32913 if (use_vector_set)
32914 {
32915 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
32916 var = force_reg (GET_MODE_INNER (mode), var);
32917 ix86_expand_vector_set (mmx_ok, target, var, one_var);
32918 return true;
32919 }
32920
32921 switch (mode)
32922 {
32923 case V2SFmode:
32924 case V2SImode:
32925 if (!mmx_ok)
32926 return false;
32927 /* FALLTHRU */
32928
32929 case V2DFmode:
32930 case V2DImode:
32931 if (one_var != 0)
32932 return false;
32933 var = force_reg (GET_MODE_INNER (mode), var);
32934 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
32935 emit_insn (gen_rtx_SET (VOIDmode, target, x));
32936 return true;
32937
32938 case V4SFmode:
32939 case V4SImode:
32940 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
32941 new_target = gen_reg_rtx (mode);
32942 else
32943 new_target = target;
32944 var = force_reg (GET_MODE_INNER (mode), var);
32945 x = gen_rtx_VEC_DUPLICATE (mode, var);
32946 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
32947 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
32948 if (one_var != 0)
32949 {
32950 /* We need to shuffle the value to the correct position, so
32951 create a new pseudo to store the intermediate result. */
32952
32953 /* With SSE2, we can use the integer shuffle insns. */
32954 if (mode != V4SFmode && TARGET_SSE2)
32955 {
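	      /* VAR currently sits in element 0 and every other element is
		 zero, so the shuffle selects element 0 into position ONE_VAR
		 and the zero from element 1 everywhere else.  */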
32956 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
32957 const1_rtx,
32958 GEN_INT (one_var == 1 ? 0 : 1),
32959 GEN_INT (one_var == 2 ? 0 : 1),
32960 GEN_INT (one_var == 3 ? 0 : 1)));
32961 if (target != new_target)
32962 emit_move_insn (target, new_target);
32963 return true;
32964 }
32965
32966 /* Otherwise convert the intermediate result to V4SFmode and
32967 use the SSE1 shuffle instructions. */
32968 if (mode != V4SFmode)
32969 {
32970 tmp = gen_reg_rtx (V4SFmode);
32971 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
32972 }
32973 else
32974 tmp = new_target;
32975
32976 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
32977 const1_rtx,
32978 GEN_INT (one_var == 1 ? 0 : 1),
32979 GEN_INT (one_var == 2 ? 0+4 : 1+4),
32980 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
32981
32982 if (mode != V4SFmode)
32983 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
32984 else if (tmp != target)
32985 emit_move_insn (target, tmp);
32986 }
32987 else if (target != new_target)
32988 emit_move_insn (target, new_target);
32989 return true;
32990
32991 case V8HImode:
32992 case V16QImode:
32993 vsimode = V4SImode;
32994 goto widen;
32995 case V4HImode:
32996 case V8QImode:
32997 if (!mmx_ok)
32998 return false;
32999 vsimode = V2SImode;
33000 goto widen;
33001 widen:
33002 if (one_var != 0)
33003 return false;
33004
33005 /* Zero extend the variable element to SImode and recurse. */
33006 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
33007
33008 x = gen_reg_rtx (vsimode);
33009 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
33010 var, one_var))
33011 gcc_unreachable ();
33012
33013 emit_move_insn (target, gen_lowpart (mode, x));
33014 return true;
33015
33016 default:
33017 return false;
33018 }
33019 }
33020
33021 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
33022 consisting of the values in VALS. It is known that all elements
33023 except ONE_VAR are constants. Return true if successful. */
33024
33025 static bool
33026 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
33027 rtx target, rtx vals, int one_var)
33028 {
33029 rtx var = XVECEXP (vals, 0, one_var);
33030 enum machine_mode wmode;
33031 rtx const_vec, x;
33032
33033 const_vec = copy_rtx (vals);
33034 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
33035 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
33036
33037 switch (mode)
33038 {
33039 case V2DFmode:
33040 case V2DImode:
33041 case V2SFmode:
33042 case V2SImode:
33043 /* For the two element vectors, it's just as easy to use
33044 the general case. */
33045 return false;
33046
33047 case V4DImode:
33048 /* Use ix86_expand_vector_set in 64bit mode only. */
33049 if (!TARGET_64BIT)
33050 return false;
33051 case V4DFmode:
33052 case V8SFmode:
33053 case V8SImode:
33054 case V16HImode:
33055 case V32QImode:
33056 case V4SFmode:
33057 case V4SImode:
33058 case V8HImode:
33059 case V4HImode:
33060 break;
33061
33062 case V16QImode:
33063 if (TARGET_SSE4_1)
33064 break;
33065 wmode = V8HImode;
33066 goto widen;
33067 case V8QImode:
33068 wmode = V4HImode;
33069 goto widen;
33070 widen:
33071 /* There's no way to set one QImode entry easily. Combine
33072 the variable value with its adjacent constant value, and
33073 promote to an HImode set. */
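/* For example, with one_var == 5 in a V16QImode vector, the variable
   byte and the constant byte at index 4 are packed into one HImode
   value: the odd index puts the variable in the high byte (shifted
   left by 8), the constant is masked to its low 8 bits, and the
   combined value is inserted as HImode element one_var >> 1 == 2.  */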
33074 x = XVECEXP (vals, 0, one_var ^ 1);
33075 if (one_var & 1)
33076 {
33077 var = convert_modes (HImode, QImode, var, true);
33078 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
33079 NULL_RTX, 1, OPTAB_LIB_WIDEN);
33080 x = GEN_INT (INTVAL (x) & 0xff);
33081 }
33082 else
33083 {
33084 var = convert_modes (HImode, QImode, var, true);
33085 x = gen_int_mode (INTVAL (x) << 8, HImode);
33086 }
33087 if (x != const0_rtx)
33088 var = expand_simple_binop (HImode, IOR, var, x, var,
33089 1, OPTAB_LIB_WIDEN);
33090
33091 x = gen_reg_rtx (wmode);
33092 emit_move_insn (x, gen_lowpart (wmode, const_vec));
33093 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
33094
33095 emit_move_insn (target, gen_lowpart (mode, x));
33096 return true;
33097
33098 default:
33099 return false;
33100 }
33101
33102 emit_move_insn (target, const_vec);
33103 ix86_expand_vector_set (mmx_ok, target, var, one_var);
33104 return true;
33105 }
33106
33107 /* A subroutine of ix86_expand_vector_init_general. Use vector
33108 concatenate to handle the most general case: all values variable,
33109 and none identical. */
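/* For instance, a V8SFmode vector is assembled bottom-up: adjacent
   element pairs are first built into four V2SFmode registers, those
   are concatenated into two V4SFmode halves, and the halves into the
   V8SFmode target (the n == 2, 4 and 8 cases below).  */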
33110
33111 static void
33112 ix86_expand_vector_init_concat (enum machine_mode mode,
33113 rtx target, rtx *ops, int n)
33114 {
33115 enum machine_mode cmode, hmode = VOIDmode;
33116 rtx first[8], second[4];
33117 rtvec v;
33118 int i, j;
33119
33120 switch (n)
33121 {
33122 case 2:
33123 switch (mode)
33124 {
33125 case V8SImode:
33126 cmode = V4SImode;
33127 break;
33128 case V8SFmode:
33129 cmode = V4SFmode;
33130 break;
33131 case V4DImode:
33132 cmode = V2DImode;
33133 break;
33134 case V4DFmode:
33135 cmode = V2DFmode;
33136 break;
33137 case V4SImode:
33138 cmode = V2SImode;
33139 break;
33140 case V4SFmode:
33141 cmode = V2SFmode;
33142 break;
33143 case V2DImode:
33144 cmode = DImode;
33145 break;
33146 case V2SImode:
33147 cmode = SImode;
33148 break;
33149 case V2DFmode:
33150 cmode = DFmode;
33151 break;
33152 case V2SFmode:
33153 cmode = SFmode;
33154 break;
33155 default:
33156 gcc_unreachable ();
33157 }
33158
33159 if (!register_operand (ops[1], cmode))
33160 ops[1] = force_reg (cmode, ops[1]);
33161 if (!register_operand (ops[0], cmode))
33162 ops[0] = force_reg (cmode, ops[0]);
33163 emit_insn (gen_rtx_SET (VOIDmode, target,
33164 gen_rtx_VEC_CONCAT (mode, ops[0],
33165 ops[1])));
33166 break;
33167
33168 case 4:
33169 switch (mode)
33170 {
33171 case V4DImode:
33172 cmode = V2DImode;
33173 break;
33174 case V4DFmode:
33175 cmode = V2DFmode;
33176 break;
33177 case V4SImode:
33178 cmode = V2SImode;
33179 break;
33180 case V4SFmode:
33181 cmode = V2SFmode;
33182 break;
33183 default:
33184 gcc_unreachable ();
33185 }
33186 goto half;
33187
33188 case 8:
33189 switch (mode)
33190 {
33191 case V8SImode:
33192 cmode = V2SImode;
33193 hmode = V4SImode;
33194 break;
33195 case V8SFmode:
33196 cmode = V2SFmode;
33197 hmode = V4SFmode;
33198 break;
33199 default:
33200 gcc_unreachable ();
33201 }
33202 goto half;
33203
33204 half:
33205 /* FIXME: We process inputs backward to help RA. PR 36222. */
33206 i = n - 1;
33207 j = (n >> 1) - 1;
33208 for (; i > 0; i -= 2, j--)
33209 {
33210 first[j] = gen_reg_rtx (cmode);
33211 v = gen_rtvec (2, ops[i - 1], ops[i]);
33212 ix86_expand_vector_init (false, first[j],
33213 gen_rtx_PARALLEL (cmode, v));
33214 }
33215
33216 n >>= 1;
33217 if (n > 2)
33218 {
33219 gcc_assert (hmode != VOIDmode);
33220 for (i = j = 0; i < n; i += 2, j++)
33221 {
33222 second[j] = gen_reg_rtx (hmode);
33223 ix86_expand_vector_init_concat (hmode, second [j],
33224 &first [i], 2);
33225 }
33226 n >>= 1;
33227 ix86_expand_vector_init_concat (mode, target, second, n);
33228 }
33229 else
33230 ix86_expand_vector_init_concat (mode, target, first, n);
33231 break;
33232
33233 default:
33234 gcc_unreachable ();
33235 }
33236 }
33237
33238 /* A subroutine of ix86_expand_vector_init_general. Use vector
33239 interleave to handle the most general case: all values variable,
33240 and none identical. */
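/* Roughly: each pair of scalar operands is packed into the two lowest
   elements of a fresh vector, and successive interleave-low steps
   (V8HImode -> V4SImode -> V2DImode in the V16QImode case) merge
   those vectors until a single result remains.  */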
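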
33241
33242 static void
33243 ix86_expand_vector_init_interleave (enum machine_mode mode,
33244 rtx target, rtx *ops, int n)
33245 {
33246 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
33247 int i, j;
33248 rtx op0, op1;
33249 rtx (*gen_load_even) (rtx, rtx, rtx);
33250 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
33251 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
33252
33253 switch (mode)
33254 {
33255 case V8HImode:
33256 gen_load_even = gen_vec_setv8hi;
33257 gen_interleave_first_low = gen_vec_interleave_lowv4si;
33258 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33259 inner_mode = HImode;
33260 first_imode = V4SImode;
33261 second_imode = V2DImode;
33262 third_imode = VOIDmode;
33263 break;
33264 case V16QImode:
33265 gen_load_even = gen_vec_setv16qi;
33266 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
33267 gen_interleave_second_low = gen_vec_interleave_lowv4si;
33268 inner_mode = QImode;
33269 first_imode = V8HImode;
33270 second_imode = V4SImode;
33271 third_imode = V2DImode;
33272 break;
33273 default:
33274 gcc_unreachable ();
33275 }
33276
33277 for (i = 0; i < n; i++)
33278 {
33279 /* Extend the odd element to SImode using a paradoxical SUBREG. */
33280 op0 = gen_reg_rtx (SImode);
33281 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
33282
33283 /* Insert the SImode value as low element of V4SImode vector. */
33284 op1 = gen_reg_rtx (V4SImode);
33285 op0 = gen_rtx_VEC_MERGE (V4SImode,
33286 gen_rtx_VEC_DUPLICATE (V4SImode,
33287 op0),
33288 CONST0_RTX (V4SImode),
33289 const1_rtx);
33290 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
33291
33292 /* Cast the V4SImode vector back to a vector in original mode. */
33293 op0 = gen_reg_rtx (mode);
33294 emit_move_insn (op0, gen_lowpart (mode, op1));
33295
33296 /* Load even elements into the second position. */
33297 emit_insn (gen_load_even (op0,
33298 force_reg (inner_mode,
33299 ops [i + i + 1]),
33300 const1_rtx));
33301
33302 /* Cast vector to FIRST_IMODE vector. */
33303 ops[i] = gen_reg_rtx (first_imode);
33304 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
33305 }
33306
33307 /* Interleave low FIRST_IMODE vectors. */
33308 for (i = j = 0; i < n; i += 2, j++)
33309 {
33310 op0 = gen_reg_rtx (first_imode);
33311 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
33312
33313 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
33314 ops[j] = gen_reg_rtx (second_imode);
33315 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
33316 }
33317
33318 /* Interleave low SECOND_IMODE vectors. */
33319 switch (second_imode)
33320 {
33321 case V4SImode:
33322 for (i = j = 0; i < n / 2; i += 2, j++)
33323 {
33324 op0 = gen_reg_rtx (second_imode);
33325 emit_insn (gen_interleave_second_low (op0, ops[i],
33326 ops[i + 1]));
33327
33328 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
33329 vector. */
33330 ops[j] = gen_reg_rtx (third_imode);
33331 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
33332 }
33333 second_imode = V2DImode;
33334 gen_interleave_second_low = gen_vec_interleave_lowv2di;
33335 /* FALLTHRU */
33336
33337 case V2DImode:
33338 op0 = gen_reg_rtx (second_imode);
33339 emit_insn (gen_interleave_second_low (op0, ops[0],
33340 ops[1]));
33341
33342 /* Cast the SECOND_IMODE vector back to a vector in the original
33343 mode. */
33344 emit_insn (gen_rtx_SET (VOIDmode, target,
33345 gen_lowpart (mode, op0)));
33346 break;
33347
33348 default:
33349 gcc_unreachable ();
33350 }
33351 }
33352
33353 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
33354 all values variable, and none identical. */
33355
33356 static void
33357 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
33358 rtx target, rtx vals)
33359 {
33360 rtx ops[32], op0, op1;
33361 enum machine_mode half_mode = VOIDmode;
33362 int n, i;
33363
33364 switch (mode)
33365 {
33366 case V2SFmode:
33367 case V2SImode:
33368 if (!mmx_ok && !TARGET_SSE)
33369 break;
33370 /* FALLTHRU */
33371
33372 case V8SFmode:
33373 case V8SImode:
33374 case V4DFmode:
33375 case V4DImode:
33376 case V4SFmode:
33377 case V4SImode:
33378 case V2DFmode:
33379 case V2DImode:
33380 n = GET_MODE_NUNITS (mode);
33381 for (i = 0; i < n; i++)
33382 ops[i] = XVECEXP (vals, 0, i);
33383 ix86_expand_vector_init_concat (mode, target, ops, n);
33384 return;
33385
33386 case V32QImode:
33387 half_mode = V16QImode;
33388 goto half;
33389
33390 case V16HImode:
33391 half_mode = V8HImode;
33392 goto half;
33393
33394 half:
33395 n = GET_MODE_NUNITS (mode);
33396 for (i = 0; i < n; i++)
33397 ops[i] = XVECEXP (vals, 0, i);
33398 op0 = gen_reg_rtx (half_mode);
33399 op1 = gen_reg_rtx (half_mode);
33400 ix86_expand_vector_init_interleave (half_mode, op0, ops,
33401 n >> 2);
33402 ix86_expand_vector_init_interleave (half_mode, op1,
33403 &ops [n >> 1], n >> 2);
33404 emit_insn (gen_rtx_SET (VOIDmode, target,
33405 gen_rtx_VEC_CONCAT (mode, op0, op1)));
33406 return;
33407
33408 case V16QImode:
33409 if (!TARGET_SSE4_1)
33410 break;
33411 /* FALLTHRU */
33412
33413 case V8HImode:
33414 if (!TARGET_SSE2)
33415 break;
33416
33417 /* Don't use ix86_expand_vector_init_interleave if we can't
33418 move from GPR to SSE register directly. */
33419 if (!TARGET_INTER_UNIT_MOVES)
33420 break;
33421
33422 n = GET_MODE_NUNITS (mode);
33423 for (i = 0; i < n; i++)
33424 ops[i] = XVECEXP (vals, 0, i);
33425 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
33426 return;
33427
33428 case V4HImode:
33429 case V8QImode:
33430 break;
33431
33432 default:
33433 gcc_unreachable ();
33434 }
33435
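/* General fallback: pack the scalar elements into word_mode integers
   (elements are OR-ed in from the highest index down, so the
   lowest-indexed element ends up in the least significant bits) and
   then assemble the vector from those words.  */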
33436 {
33437 int i, j, n_elts, n_words, n_elt_per_word;
33438 enum machine_mode inner_mode;
33439 rtx words[4], shift;
33440
33441 inner_mode = GET_MODE_INNER (mode);
33442 n_elts = GET_MODE_NUNITS (mode);
33443 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
33444 n_elt_per_word = n_elts / n_words;
33445 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
33446
33447 for (i = 0; i < n_words; ++i)
33448 {
33449 rtx word = NULL_RTX;
33450
33451 for (j = 0; j < n_elt_per_word; ++j)
33452 {
33453 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
33454 elt = convert_modes (word_mode, inner_mode, elt, true);
33455
33456 if (j == 0)
33457 word = elt;
33458 else
33459 {
33460 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
33461 word, 1, OPTAB_LIB_WIDEN);
33462 word = expand_simple_binop (word_mode, IOR, word, elt,
33463 word, 1, OPTAB_LIB_WIDEN);
33464 }
33465 }
33466
33467 words[i] = word;
33468 }
33469
33470 if (n_words == 1)
33471 emit_move_insn (target, gen_lowpart (mode, words[0]));
33472 else if (n_words == 2)
33473 {
33474 rtx tmp = gen_reg_rtx (mode);
33475 emit_clobber (tmp);
33476 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
33477 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
33478 emit_move_insn (target, tmp);
33479 }
33480 else if (n_words == 4)
33481 {
33482 rtx tmp = gen_reg_rtx (V4SImode);
33483 gcc_assert (word_mode == SImode);
33484 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
33485 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
33486 emit_move_insn (target, gen_lowpart (mode, tmp));
33487 }
33488 else
33489 gcc_unreachable ();
33490 }
33491 }
33492
33493 /* Initialize vector TARGET via VALS. Suppress the use of MMX
33494 instructions unless MMX_OK is true. */
33495
33496 void
33497 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
33498 {
33499 enum machine_mode mode = GET_MODE (target);
33500 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33501 int n_elts = GET_MODE_NUNITS (mode);
33502 int n_var = 0, one_var = -1;
33503 bool all_same = true, all_const_zero = true;
33504 int i;
33505 rtx x;
33506
33507 for (i = 0; i < n_elts; ++i)
33508 {
33509 x = XVECEXP (vals, 0, i);
33510 if (!(CONST_INT_P (x)
33511 || GET_CODE (x) == CONST_DOUBLE
33512 || GET_CODE (x) == CONST_FIXED))
33513 n_var++, one_var = i;
33514 else if (x != CONST0_RTX (inner_mode))
33515 all_const_zero = false;
33516 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
33517 all_same = false;
33518 }
33519
33520 /* Constants are best loaded from the constant pool. */
33521 if (n_var == 0)
33522 {
33523 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
33524 return;
33525 }
33526
33527 /* If all values are identical, broadcast the value. */
33528 if (all_same
33529 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
33530 XVECEXP (vals, 0, 0)))
33531 return;
33532
33533 /* Values where only one field is non-constant are best loaded from
33534 the pool and overwritten via move later. */
33535 if (n_var == 1)
33536 {
33537 if (all_const_zero
33538 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
33539 XVECEXP (vals, 0, one_var),
33540 one_var))
33541 return;
33542
33543 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
33544 return;
33545 }
33546
33547 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
33548 }
33549
33550 void
33551 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
33552 {
33553 enum machine_mode mode = GET_MODE (target);
33554 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33555 enum machine_mode half_mode;
33556 bool use_vec_merge = false;
33557 rtx tmp;
33558 static rtx (*gen_extract[6][2]) (rtx, rtx)
33559 = {
33560 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
33561 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
33562 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
33563 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
33564 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
33565 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
33566 };
33567 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
33568 = {
33569 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
33570 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
33571 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
33572 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
33573 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
33574 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
33575 };
33576 int i, j, n;
33577
33578 switch (mode)
33579 {
33580 case V2SFmode:
33581 case V2SImode:
33582 if (mmx_ok)
33583 {
33584 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33585 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
33586 if (elt == 0)
33587 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33588 else
33589 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33590 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33591 return;
33592 }
33593 break;
33594
33595 case V2DImode:
33596 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
33597 if (use_vec_merge)
33598 break;
33599
33600 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
33601 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
33602 if (elt == 0)
33603 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
33604 else
33605 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
33606 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33607 return;
33608
33609 case V2DFmode:
33610 {
33611 rtx op0, op1;
33612
33613 /* For the two element vectors, we implement a VEC_CONCAT with
33614 the extraction of the other element. */
33615
33616 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
33617 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
33618
33619 if (elt == 0)
33620 op0 = val, op1 = tmp;
33621 else
33622 op0 = tmp, op1 = val;
33623
33624 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
33625 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33626 }
33627 return;
33628
33629 case V4SFmode:
33630 use_vec_merge = TARGET_SSE4_1;
33631 if (use_vec_merge)
33632 break;
33633
33634 switch (elt)
33635 {
33636 case 0:
33637 use_vec_merge = true;
33638 break;
33639
33640 case 1:
33641 /* tmp = target = A B C D */
33642 tmp = copy_to_reg (target);
33643 /* target = A A B B */
33644 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
33645 /* target = X A B B */
33646 ix86_expand_vector_set (false, target, val, 0);
33647 /* target = A X C D */
33648 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33649 const1_rtx, const0_rtx,
33650 GEN_INT (2+4), GEN_INT (3+4)));
33651 return;
33652
33653 case 2:
33654 /* tmp = target = A B C D */
33655 tmp = copy_to_reg (target);
33656 /* tmp = X B C D */
33657 ix86_expand_vector_set (false, tmp, val, 0);
33658 /* target = A B X D */
33659 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33660 const0_rtx, const1_rtx,
33661 GEN_INT (0+4), GEN_INT (3+4)));
33662 return;
33663
33664 case 3:
33665 /* tmp = target = A B C D */
33666 tmp = copy_to_reg (target);
33667 /* tmp = X B C D */
33668 ix86_expand_vector_set (false, tmp, val, 0);
33669 /* target = A B C X */
33670 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
33671 const0_rtx, const1_rtx,
33672 GEN_INT (2+4), GEN_INT (0+4)));
33673 return;
33674
33675 default:
33676 gcc_unreachable ();
33677 }
33678 break;
33679
33680 case V4SImode:
33681 use_vec_merge = TARGET_SSE4_1;
33682 if (use_vec_merge)
33683 break;
33684
33685 /* Element 0 handled by vec_merge below. */
33686 if (elt == 0)
33687 {
33688 use_vec_merge = true;
33689 break;
33690 }
33691
33692 if (TARGET_SSE2)
33693 {
33694 /* With SSE2, use integer shuffles to swap element 0 and ELT,
33695 store into element 0, then shuffle them back. */
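/* E.g. for elt == 2 the permutation below is { 2, 1, 0, 3 }, which
   exchanges elements 0 and 2; since such a swap is its own inverse,
   emitting the same pshufd again after the element 0 store restores
   the original order.  */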
33696
33697 rtx order[4];
33698
33699 order[0] = GEN_INT (elt);
33700 order[1] = const1_rtx;
33701 order[2] = const2_rtx;
33702 order[3] = GEN_INT (3);
33703 order[elt] = const0_rtx;
33704
33705 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33706 order[1], order[2], order[3]));
33707
33708 ix86_expand_vector_set (false, target, val, 0);
33709
33710 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
33711 order[1], order[2], order[3]));
33712 }
33713 else
33714 {
33715 /* For SSE1, we have to reuse the V4SF code. */
33716 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
33717 gen_lowpart (SFmode, val), elt);
33718 }
33719 return;
33720
33721 case V8HImode:
33722 use_vec_merge = TARGET_SSE2;
33723 break;
33724 case V4HImode:
33725 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33726 break;
33727
33728 case V16QImode:
33729 use_vec_merge = TARGET_SSE4_1;
33730 break;
33731
33732 case V8QImode:
33733 break;
33734
33735 case V32QImode:
33736 half_mode = V16QImode;
33737 j = 0;
33738 n = 16;
33739 goto half;
33740
33741 case V16HImode:
33742 half_mode = V8HImode;
33743 j = 1;
33744 n = 8;
33745 goto half;
33746
33747 case V8SImode:
33748 half_mode = V4SImode;
33749 j = 2;
33750 n = 4;
33751 goto half;
33752
33753 case V4DImode:
33754 half_mode = V2DImode;
33755 j = 3;
33756 n = 2;
33757 goto half;
33758
33759 case V8SFmode:
33760 half_mode = V4SFmode;
33761 j = 4;
33762 n = 4;
33763 goto half;
33764
33765 case V4DFmode:
33766 half_mode = V2DFmode;
33767 j = 5;
33768 n = 2;
33769 goto half;
33770
33771 half:
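/* E.g. inserting into element 6 of a V8SImode vector: n == 4, so
   i == 1 and elt becomes 2; the high V4SImode half is extracted,
   its element 2 is replaced, and the half is written back.  */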
33772 /* Compute offset. */
33773 i = elt / n;
33774 elt %= n;
33775
33776 gcc_assert (i <= 1);
33777
33778 /* Extract the half. */
33779 tmp = gen_reg_rtx (half_mode);
33780 emit_insn (gen_extract[j][i] (tmp, target));
33781
33782 /* Put val in tmp at elt. */
33783 ix86_expand_vector_set (false, tmp, val, elt);
33784
33785 /* Put it back. */
33786 emit_insn (gen_insert[j][i] (target, target, tmp));
33787 return;
33788
33789 default:
33790 break;
33791 }
33792
33793 if (use_vec_merge)
33794 {
33795 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
33796 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
33797 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
33798 }
33799 else
33800 {
33801 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
33802
33803 emit_move_insn (mem, target);
33804
33805 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
33806 emit_move_insn (tmp, val);
33807
33808 emit_move_insn (target, mem);
33809 }
33810 }
33811
33812 void
33813 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
33814 {
33815 enum machine_mode mode = GET_MODE (vec);
33816 enum machine_mode inner_mode = GET_MODE_INNER (mode);
33817 bool use_vec_extr = false;
33818 rtx tmp;
33819
33820 switch (mode)
33821 {
33822 case V2SImode:
33823 case V2SFmode:
33824 if (!mmx_ok)
33825 break;
33826 /* FALLTHRU */
33827
33828 case V2DFmode:
33829 case V2DImode:
33830 use_vec_extr = true;
33831 break;
33832
33833 case V4SFmode:
33834 use_vec_extr = TARGET_SSE4_1;
33835 if (use_vec_extr)
33836 break;
33837
33838 switch (elt)
33839 {
33840 case 0:
33841 tmp = vec;
33842 break;
33843
33844 case 1:
33845 case 3:
33846 tmp = gen_reg_rtx (mode);
33847 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
33848 GEN_INT (elt), GEN_INT (elt),
33849 GEN_INT (elt+4), GEN_INT (elt+4)));
33850 break;
33851
33852 case 2:
33853 tmp = gen_reg_rtx (mode);
33854 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
33855 break;
33856
33857 default:
33858 gcc_unreachable ();
33859 }
33860 vec = tmp;
33861 use_vec_extr = true;
33862 elt = 0;
33863 break;
33864
33865 case V4SImode:
33866 use_vec_extr = TARGET_SSE4_1;
33867 if (use_vec_extr)
33868 break;
33869
33870 if (TARGET_SSE2)
33871 {
33872 switch (elt)
33873 {
33874 case 0:
33875 tmp = vec;
33876 break;
33877
33878 case 1:
33879 case 3:
33880 tmp = gen_reg_rtx (mode);
33881 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
33882 GEN_INT (elt), GEN_INT (elt),
33883 GEN_INT (elt), GEN_INT (elt)));
33884 break;
33885
33886 case 2:
33887 tmp = gen_reg_rtx (mode);
33888 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
33889 break;
33890
33891 default:
33892 gcc_unreachable ();
33893 }
33894 vec = tmp;
33895 use_vec_extr = true;
33896 elt = 0;
33897 }
33898 else
33899 {
33900 /* For SSE1, we have to reuse the V4SF code. */
33901 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
33902 gen_lowpart (V4SFmode, vec), elt);
33903 return;
33904 }
33905 break;
33906
33907 case V8HImode:
33908 use_vec_extr = TARGET_SSE2;
33909 break;
33910 case V4HImode:
33911 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
33912 break;
33913
33914 case V16QImode:
33915 use_vec_extr = TARGET_SSE4_1;
33916 break;
33917
33918 case V8SFmode:
33919 if (TARGET_AVX)
33920 {
33921 tmp = gen_reg_rtx (V4SFmode);
33922 if (elt < 4)
33923 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
33924 else
33925 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
33926 ix86_expand_vector_extract (false, target, tmp, elt & 3);
33927 return;
33928 }
33929 break;
33930
33931 case V4DFmode:
33932 if (TARGET_AVX)
33933 {
33934 tmp = gen_reg_rtx (V2DFmode);
33935 if (elt < 2)
33936 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
33937 else
33938 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
33939 ix86_expand_vector_extract (false, target, tmp, elt & 1);
33940 return;
33941 }
33942 break;
33943
33944 case V32QImode:
33945 if (TARGET_AVX)
33946 {
33947 tmp = gen_reg_rtx (V16QImode);
33948 if (elt < 16)
33949 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
33950 else
33951 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
33952 ix86_expand_vector_extract (false, target, tmp, elt & 15);
33953 return;
33954 }
33955 break;
33956
33957 case V16HImode:
33958 if (TARGET_AVX)
33959 {
33960 tmp = gen_reg_rtx (V8HImode);
33961 if (elt < 8)
33962 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
33963 else
33964 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
33965 ix86_expand_vector_extract (false, target, tmp, elt & 7);
33966 return;
33967 }
33968 break;
33969
33970 case V8SImode:
33971 if (TARGET_AVX)
33972 {
33973 tmp = gen_reg_rtx (V4SImode);
33974 if (elt < 4)
33975 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
33976 else
33977 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
33978 ix86_expand_vector_extract (false, target, tmp, elt & 3);
33979 return;
33980 }
33981 break;
33982
33983 case V4DImode:
33984 if (TARGET_AVX)
33985 {
33986 tmp = gen_reg_rtx (V2DImode);
33987 if (elt < 2)
33988 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
33989 else
33990 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
33991 ix86_expand_vector_extract (false, target, tmp, elt & 1);
33992 return;
33993 }
33994 break;
33995
33996 case V8QImode:
33997 /* ??? Could extract the appropriate HImode element and shift. */
33998 default:
33999 break;
34000 }
34001
34002 if (use_vec_extr)
34003 {
34004 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
34005 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
34006
34007 /* Let the rtl optimizers know about the zero extension performed. */
34008 if (inner_mode == QImode || inner_mode == HImode)
34009 {
34010 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
34011 target = gen_lowpart (SImode, target);
34012 }
34013
34014 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
34015 }
34016 else
34017 {
34018 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
34019
34020 emit_move_insn (mem, vec);
34021
34022 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
34023 emit_move_insn (target, tmp);
34024 }
34025 }
34026
34027 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
34028 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
34029 The upper bits of DEST are undefined, though they shouldn't cause
34030 exceptions (some bits from src or all zeros are ok). */
34031
34032 static void
34033 emit_reduc_half (rtx dest, rtx src, int i)
34034 {
34035 rtx tem;
34036 switch (GET_MODE (src))
34037 {
34038 case V4SFmode:
34039 if (i == 128)
34040 tem = gen_sse_movhlps (dest, src, src);
34041 else
34042 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
34043 GEN_INT (1 + 4), GEN_INT (1 + 4));
34044 break;
34045 case V2DFmode:
34046 tem = gen_vec_interleave_highv2df (dest, src, src);
34047 break;
34048 case V16QImode:
34049 case V8HImode:
34050 case V4SImode:
34051 case V2DImode:
34052 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
34053 gen_lowpart (V1TImode, src),
34054 GEN_INT (i / 2));
34055 break;
34056 case V8SFmode:
34057 if (i == 256)
34058 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
34059 else
34060 tem = gen_avx_shufps256 (dest, src, src,
34061 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
34062 break;
34063 case V4DFmode:
34064 if (i == 256)
34065 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
34066 else
34067 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
34068 break;
34069 case V32QImode:
34070 case V16HImode:
34071 case V8SImode:
34072 case V4DImode:
34073 if (i == 256)
34074 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
34075 gen_lowpart (V4DImode, src),
34076 gen_lowpart (V4DImode, src),
34077 const1_rtx);
34078 else
34079 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
34080 gen_lowpart (V2TImode, src),
34081 GEN_INT (i / 2));
34082 break;
34083 default:
34084 gcc_unreachable ();
34085 }
34086 emit_insn (tem);
34087 }
34088
34089 /* Expand a vector reduction. FN is the binary pattern to reduce;
34090 DEST is the destination; IN is the input vector. */
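/* For a V4SImode input the loop below runs twice: first the upper
   64 bits of IN are brought down and combined with the lower 64 bits
   using FN, then the upper 32 bits of that intermediate result are
   combined with its lower 32 bits, leaving the reduction in element 0
   of DEST.  */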
34091
34092 void
34093 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
34094 {
34095 rtx half, dst, vec = in;
34096 enum machine_mode mode = GET_MODE (in);
34097 int i;
34098
34099 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
34100 if (TARGET_SSE4_1
34101 && mode == V8HImode
34102 && fn == gen_uminv8hi3)
34103 {
34104 emit_insn (gen_sse4_1_phminposuw (dest, in));
34105 return;
34106 }
34107
34108 for (i = GET_MODE_BITSIZE (mode);
34109 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
34110 i >>= 1)
34111 {
34112 half = gen_reg_rtx (mode);
34113 emit_reduc_half (half, vec, i);
34114 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
34115 dst = dest;
34116 else
34117 dst = gen_reg_rtx (mode);
34118 emit_insn (fn (dst, half, vec));
34119 vec = dst;
34120 }
34121 }
34122 \f
34123 /* Target hook for scalar_mode_supported_p. */
34124 static bool
34125 ix86_scalar_mode_supported_p (enum machine_mode mode)
34126 {
34127 if (DECIMAL_FLOAT_MODE_P (mode))
34128 return default_decimal_float_supported_p ();
34129 else if (mode == TFmode)
34130 return true;
34131 else
34132 return default_scalar_mode_supported_p (mode);
34133 }
34134
34135 /* Implements target hook vector_mode_supported_p. */
34136 static bool
34137 ix86_vector_mode_supported_p (enum machine_mode mode)
34138 {
34139 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34140 return true;
34141 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34142 return true;
34143 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34144 return true;
34145 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
34146 return true;
34147 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
34148 return true;
34149 return false;
34150 }
34151
34152 /* Target hook for c_mode_for_suffix. */
34153 static enum machine_mode
34154 ix86_c_mode_for_suffix (char suffix)
34155 {
34156 if (suffix == 'q')
34157 return TFmode;
34158 if (suffix == 'w')
34159 return XFmode;
34160
34161 return VOIDmode;
34162 }
34163
34164 /* Worker function for TARGET_MD_ASM_CLOBBERS.
34165
34166 We do this in the new i386 backend to maintain source compatibility
34167 with the old cc0-based compiler. */
34168
34169 static tree
34170 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
34171 tree inputs ATTRIBUTE_UNUSED,
34172 tree clobbers)
34173 {
34174 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
34175 clobbers);
34176 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
34177 clobbers);
34178 return clobbers;
34179 }
34180
34181 /* Implements target vector targetm.asm.encode_section_info. */
34182
34183 static void ATTRIBUTE_UNUSED
34184 ix86_encode_section_info (tree decl, rtx rtl, int first)
34185 {
34186 default_encode_section_info (decl, rtl, first);
34187
34188 if (TREE_CODE (decl) == VAR_DECL
34189 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
34190 && ix86_in_large_data_p (decl))
34191 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
34192 }
34193
34194 /* Worker function for REVERSE_CONDITION. */
34195
34196 enum rtx_code
34197 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
34198 {
34199 return (mode != CCFPmode && mode != CCFPUmode
34200 ? reverse_condition (code)
34201 : reverse_condition_maybe_unordered (code));
34202 }
34203
34204 /* Output code to perform an x87 FP register move, from OPERANDS[1]
34205 to OPERANDS[0]. */
34206
34207 const char *
34208 output_387_reg_move (rtx insn, rtx *operands)
34209 {
34210 if (REG_P (operands[0]))
34211 {
34212 if (REG_P (operands[1])
34213 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34214 {
34215 if (REGNO (operands[0]) == FIRST_STACK_REG)
34216 return output_387_ffreep (operands, 0);
34217 return "fstp\t%y0";
34218 }
34219 if (STACK_TOP_P (operands[0]))
34220 return "fld%Z1\t%y1";
34221 return "fst\t%y0";
34222 }
34223 else if (MEM_P (operands[0]))
34224 {
34225 gcc_assert (REG_P (operands[1]));
34226 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
34227 return "fstp%Z0\t%y0";
34228 else
34229 {
34230 /* There is no non-popping store to memory for XFmode.
34231 So if we need one, follow the store with a load. */
34232 if (GET_MODE (operands[0]) == XFmode)
34233 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
34234 else
34235 return "fst%Z0\t%y0";
34236 }
34237 }
34238 else
34239 gcc_unreachable();
34240 }
34241
34242 /* Output code to perform a conditional jump to LABEL, if C2 flag in
34243 FP status register is set. */
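/* The status word is fetched with fnstsw; when sahf cannot be used,
   C2 (bit 10 of the FPU status word, i.e. mask 0x04 of its high byte)
   is tested directly below.  */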
34244
34245 void
34246 ix86_emit_fp_unordered_jump (rtx label)
34247 {
34248 rtx reg = gen_reg_rtx (HImode);
34249 rtx temp;
34250
34251 emit_insn (gen_x86_fnstsw_1 (reg));
34252
34253 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
34254 {
34255 emit_insn (gen_x86_sahf_1 (reg));
34256
34257 temp = gen_rtx_REG (CCmode, FLAGS_REG);
34258 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
34259 }
34260 else
34261 {
34262 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
34263
34264 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
34265 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
34266 }
34267
34268 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
34269 gen_rtx_LABEL_REF (VOIDmode, label),
34270 pc_rtx);
34271 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
34272
34273 emit_jump_insn (temp);
34274 predict_jump (REG_BR_PROB_BASE * 10 / 100);
34275 }
34276
34277 /* Output code to perform a log1p XFmode calculation. */
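/* The threshold 0.2928... used below is 1 - sqrt(2)/2, the limit of
   the argument range the x87 fyl2xp1 instruction accepts; within that
   range op0 = log(2) * log2(1 + op1) is computed directly via
   fyl2xp1, otherwise the sequence falls back to fyl2x on 1 + op1.  */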
34278
34279 void ix86_emit_i387_log1p (rtx op0, rtx op1)
34280 {
34281 rtx label1 = gen_label_rtx ();
34282 rtx label2 = gen_label_rtx ();
34283
34284 rtx tmp = gen_reg_rtx (XFmode);
34285 rtx tmp2 = gen_reg_rtx (XFmode);
34286 rtx test;
34287
34288 emit_insn (gen_absxf2 (tmp, op1));
34289 test = gen_rtx_GE (VOIDmode, tmp,
34290 CONST_DOUBLE_FROM_REAL_VALUE (
34291 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
34292 XFmode));
34293 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
34294
34295 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34296 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
34297 emit_jump (label2);
34298
34299 emit_label (label1);
34300 emit_move_insn (tmp, CONST1_RTX (XFmode));
34301 emit_insn (gen_addxf3 (tmp, op1, tmp));
34302 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
34303 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
34304
34305 emit_label (label2);
34306 }
34307
34308 /* Emit code for round calculation. */
34309 void ix86_emit_i387_round (rtx op0, rtx op1)
34310 {
34311 enum machine_mode inmode = GET_MODE (op1);
34312 enum machine_mode outmode = GET_MODE (op0);
34313 rtx e1, e2, res, tmp, tmp1, half;
34314 rtx scratch = gen_reg_rtx (HImode);
34315 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
34316 rtx jump_label = gen_label_rtx ();
34317 rtx insn;
34318 rtx (*gen_abs) (rtx, rtx);
34319 rtx (*gen_neg) (rtx, rtx);
34320
34321 switch (inmode)
34322 {
34323 case SFmode:
34324 gen_abs = gen_abssf2;
34325 break;
34326 case DFmode:
34327 gen_abs = gen_absdf2;
34328 break;
34329 case XFmode:
34330 gen_abs = gen_absxf2;
34331 break;
34332 default:
34333 gcc_unreachable ();
34334 }
34335
34336 switch (outmode)
34337 {
34338 case SFmode:
34339 gen_neg = gen_negsf2;
34340 break;
34341 case DFmode:
34342 gen_neg = gen_negdf2;
34343 break;
34344 case XFmode:
34345 gen_neg = gen_negxf2;
34346 break;
34347 case HImode:
34348 gen_neg = gen_neghi2;
34349 break;
34350 case SImode:
34351 gen_neg = gen_negsi2;
34352 break;
34353 case DImode:
34354 gen_neg = gen_negdi2;
34355 break;
34356 default:
34357 gcc_unreachable ();
34358 }
34359
34360 e1 = gen_reg_rtx (inmode);
34361 e2 = gen_reg_rtx (inmode);
34362 res = gen_reg_rtx (outmode);
34363
34364 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
34365
34366 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
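/* For example, op1 = -2.3 gives floor (2.3 + 0.5) = 2, negated to -2
   because fxam reports a set sign bit; a halfway value such as 2.5
   gives floor (3.0) = 3, i.e. ties round away from zero.  */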
34367
34368 /* scratch = fxam(op1) */
34369 emit_insn (gen_rtx_SET (VOIDmode, scratch,
34370 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
34371 UNSPEC_FXAM)));
34372 /* e1 = fabs(op1) */
34373 emit_insn (gen_abs (e1, op1));
34374
34375 /* e2 = e1 + 0.5 */
34376 half = force_reg (inmode, half);
34377 emit_insn (gen_rtx_SET (VOIDmode, e2,
34378 gen_rtx_PLUS (inmode, e1, half)));
34379
34380 /* res = floor(e2) */
34381 if (inmode != XFmode)
34382 {
34383 tmp1 = gen_reg_rtx (XFmode);
34384
34385 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
34386 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
34387 }
34388 else
34389 tmp1 = e2;
34390
34391 switch (outmode)
34392 {
34393 case SFmode:
34394 case DFmode:
34395 {
34396 rtx tmp0 = gen_reg_rtx (XFmode);
34397
34398 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
34399
34400 emit_insn (gen_rtx_SET (VOIDmode, res,
34401 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
34402 UNSPEC_TRUNC_NOOP)));
34403 }
34404 break;
34405 case XFmode:
34406 emit_insn (gen_frndintxf2_floor (res, tmp1));
34407 break;
34408 case HImode:
34409 emit_insn (gen_lfloorxfhi2 (res, tmp1));
34410 break;
34411 case SImode:
34412 emit_insn (gen_lfloorxfsi2 (res, tmp1));
34413 break;
34414 case DImode:
34415 emit_insn (gen_lfloorxfdi2 (res, tmp1));
34416 break;
34417 default:
34418 gcc_unreachable ();
34419 }
34420
34421 /* flags = signbit(a) */
34422 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
34423
34424 /* if (flags) then res = -res */
34425 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
34426 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
34427 gen_rtx_LABEL_REF (VOIDmode, jump_label),
34428 pc_rtx);
34429 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34430 predict_jump (REG_BR_PROB_BASE * 50 / 100);
34431 JUMP_LABEL (insn) = jump_label;
34432
34433 emit_insn (gen_neg (res, res));
34434
34435 emit_label (jump_label);
34436 LABEL_NUSES (jump_label) = 1;
34437
34438 emit_move_insn (op0, res);
34439 }
34440
34441 /* Output code to perform a Newton-Raphson approximation of a single precision
34442 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
34443
34444 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
34445 {
34446 rtx x0, x1, e0, e1;
34447
34448 x0 = gen_reg_rtx (mode);
34449 e0 = gen_reg_rtx (mode);
34450 e1 = gen_reg_rtx (mode);
34451 x1 = gen_reg_rtx (mode);
34452
34453 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
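/* This is one Newton-Raphson step for 1/b: with x0 = rcp(b),
   x1 = x0 * (2 - b*x0) = 2*x0 - b*x0*x0 roughly doubles the number
   of correct bits in the reciprocal estimate before the final
   multiplication by a.  */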
34454
34455 b = force_reg (mode, b);
34456
34457 /* x0 = rcp(b) estimate */
34458 emit_insn (gen_rtx_SET (VOIDmode, x0,
34459 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
34460 UNSPEC_RCP)));
34461 /* e0 = x0 * b */
34462 emit_insn (gen_rtx_SET (VOIDmode, e0,
34463 gen_rtx_MULT (mode, x0, b)));
34464
34465 /* e0 = x0 * e0 */
34466 emit_insn (gen_rtx_SET (VOIDmode, e0,
34467 gen_rtx_MULT (mode, x0, e0)));
34468
34469 /* e1 = x0 + x0 */
34470 emit_insn (gen_rtx_SET (VOIDmode, e1,
34471 gen_rtx_PLUS (mode, x0, x0)));
34472
34473 /* x1 = e1 - e0 */
34474 emit_insn (gen_rtx_SET (VOIDmode, x1,
34475 gen_rtx_MINUS (mode, e1, e0)));
34476
34477 /* res = a * x1 */
34478 emit_insn (gen_rtx_SET (VOIDmode, res,
34479 gen_rtx_MULT (mode, a, x1)));
34480 }
34481
34482 /* Output code to perform a Newton-Raphson approximation of a
34483 single precision floating point [reciprocal] square root. */
34484
34485 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
34486 bool recip)
34487 {
34488 rtx x0, e0, e1, e2, e3, mthree, mhalf;
34489 REAL_VALUE_TYPE r;
34490
34491 x0 = gen_reg_rtx (mode);
34492 e0 = gen_reg_rtx (mode);
34493 e1 = gen_reg_rtx (mode);
34494 e2 = gen_reg_rtx (mode);
34495 e3 = gen_reg_rtx (mode);
34496
34497 real_from_integer (&r, VOIDmode, -3, -1, 0);
34498 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34499
34500 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
34501 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
34502
34503 if (VECTOR_MODE_P (mode))
34504 {
34505 mthree = ix86_build_const_vector (mode, true, mthree);
34506 mhalf = ix86_build_const_vector (mode, true, mhalf);
34507 }
34508
34509 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
34510 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
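/* These are one Newton-Raphson step for 1/sqrt(a): with x0 =
   rsqrtss(a), x1 = 0.5 * x0 * (3 - a*x0*x0), written above with the
   signs folded into the -0.5 factor; multiplying by a turns the
   refined reciprocal square root into sqrt(a).  */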
34511
34512 a = force_reg (mode, a);
34513
34514 /* x0 = rsqrt(a) estimate */
34515 emit_insn (gen_rtx_SET (VOIDmode, x0,
34516 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
34517 UNSPEC_RSQRT)));
34518
34519 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent a NaN for sqrt (0.0). */
34520 if (!recip)
34521 {
34522 rtx zero, mask;
34523
34524 zero = gen_reg_rtx (mode);
34525 mask = gen_reg_rtx (mode);
34526
34527 zero = force_reg (mode, CONST0_RTX(mode));
34528 emit_insn (gen_rtx_SET (VOIDmode, mask,
34529 gen_rtx_NE (mode, zero, a)));
34530
34531 emit_insn (gen_rtx_SET (VOIDmode, x0,
34532 gen_rtx_AND (mode, x0, mask)));
34533 }
34534
34535 /* e0 = x0 * a */
34536 emit_insn (gen_rtx_SET (VOIDmode, e0,
34537 gen_rtx_MULT (mode, x0, a)));
34538 /* e1 = e0 * x0 */
34539 emit_insn (gen_rtx_SET (VOIDmode, e1,
34540 gen_rtx_MULT (mode, e0, x0)));
34541
34542 /* e2 = e1 - 3. */
34543 mthree = force_reg (mode, mthree);
34544 emit_insn (gen_rtx_SET (VOIDmode, e2,
34545 gen_rtx_PLUS (mode, e1, mthree)));
34546
34547 mhalf = force_reg (mode, mhalf);
34548 if (recip)
34549 /* e3 = -.5 * x0 */
34550 emit_insn (gen_rtx_SET (VOIDmode, e3,
34551 gen_rtx_MULT (mode, x0, mhalf)));
34552 else
34553 /* e3 = -.5 * e0 */
34554 emit_insn (gen_rtx_SET (VOIDmode, e3,
34555 gen_rtx_MULT (mode, e0, mhalf)));
34556 /* ret = e2 * e3 */
34557 emit_insn (gen_rtx_SET (VOIDmode, res,
34558 gen_rtx_MULT (mode, e2, e3)));
34559 }
34560
34561 #ifdef TARGET_SOLARIS
34562 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
34563
34564 static void
34565 i386_solaris_elf_named_section (const char *name, unsigned int flags,
34566 tree decl)
34567 {
34568 /* With Binutils 2.15, the "@unwind" marker must be specified on
34569 every occurrence of the ".eh_frame" section, not just the first
34570 one. */
34571 if (TARGET_64BIT
34572 && strcmp (name, ".eh_frame") == 0)
34573 {
34574 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
34575 flags & SECTION_WRITE ? "aw" : "a");
34576 return;
34577 }
34578
34579 #ifndef USE_GAS
34580 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
34581 {
34582 solaris_elf_asm_comdat_section (name, flags, decl);
34583 return;
34584 }
34585 #endif
34586
34587 default_elf_asm_named_section (name, flags, decl);
34588 }
34589 #endif /* TARGET_SOLARIS */
34590
34591 /* Return the mangling of TYPE if it is an extended fundamental type. */
34592
34593 static const char *
34594 ix86_mangle_type (const_tree type)
34595 {
34596 type = TYPE_MAIN_VARIANT (type);
34597
34598 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
34599 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
34600 return NULL;
34601
34602 switch (TYPE_MODE (type))
34603 {
34604 case TFmode:
34605 /* __float128 is "g". */
34606 return "g";
34607 case XFmode:
34608 /* "long double" or __float80 is "e". */
34609 return "e";
34610 default:
34611 return NULL;
34612 }
34613 }
34614
34615 /* For 32-bit code we can save PIC register setup by using
34616 __stack_chk_fail_local hidden function instead of calling
34617 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
34618 register, so it is better to call __stack_chk_fail directly. */
34619
34620 static tree ATTRIBUTE_UNUSED
34621 ix86_stack_protect_fail (void)
34622 {
34623 return TARGET_64BIT
34624 ? default_external_stack_protect_fail ()
34625 : default_hidden_stack_protect_fail ();
34626 }
34627
34628 /* Select a format to encode pointers in exception handling data. CODE
34629 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
34630 true if the symbol may be affected by dynamic relocations.
34631
34632 ??? All x86 object file formats are capable of representing this.
34633 After all, the relocation needed is the same as for the call insn.
34634 Whether or not a particular assembler allows us to enter such, I
34635 guess we'll have to see. */
34636 int
34637 asm_preferred_eh_data_format (int code, int global)
34638 {
34639 if (flag_pic)
34640 {
34641 int type = DW_EH_PE_sdata8;
34642 if (!TARGET_64BIT
34643 || ix86_cmodel == CM_SMALL_PIC
34644 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
34645 type = DW_EH_PE_sdata4;
34646 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
34647 }
34648 if (ix86_cmodel == CM_SMALL
34649 || (ix86_cmodel == CM_MEDIUM && code))
34650 return DW_EH_PE_udata4;
34651 return DW_EH_PE_absptr;
34652 }
34653 \f
34654 /* Expand copysign from SIGN to the positive value ABS_VALUE
34655 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
34656 the sign-bit. */
34657 static void
34658 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
34659 {
34660 enum machine_mode mode = GET_MODE (sign);
34661 rtx sgn = gen_reg_rtx (mode);
34662 if (mask == NULL_RTX)
34663 {
34664 enum machine_mode vmode;
34665
34666 if (mode == SFmode)
34667 vmode = V4SFmode;
34668 else if (mode == DFmode)
34669 vmode = V2DFmode;
34670 else
34671 vmode = mode;
34672
34673 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
34674 if (!VECTOR_MODE_P (mode))
34675 {
34676 /* We need to generate a scalar mode mask in this case. */
34677 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34678 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34679 mask = gen_reg_rtx (mode);
34680 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34681 }
34682 }
34683 else
34684 mask = gen_rtx_NOT (mode, mask);
34685 emit_insn (gen_rtx_SET (VOIDmode, sgn,
34686 gen_rtx_AND (mode, mask, sign)));
34687 emit_insn (gen_rtx_SET (VOIDmode, result,
34688 gen_rtx_IOR (mode, abs_value, sgn)));
34689 }
34690
34691 /* Expand fabs (OP0) and return a new rtx that holds the result. The
34692 mask for masking out the sign-bit is stored in *SMASK, if that is
34693 non-null. */
34694 static rtx
34695 ix86_expand_sse_fabs (rtx op0, rtx *smask)
34696 {
34697 enum machine_mode vmode, mode = GET_MODE (op0);
34698 rtx xa, mask;
34699
34700 xa = gen_reg_rtx (mode);
34701 if (mode == SFmode)
34702 vmode = V4SFmode;
34703 else if (mode == DFmode)
34704 vmode = V2DFmode;
34705 else
34706 vmode = mode;
34707 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
34708 if (!VECTOR_MODE_P (mode))
34709 {
34710 /* We need to generate a scalar mode mask in this case. */
34711 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
34712 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
34713 mask = gen_reg_rtx (mode);
34714 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
34715 }
34716 emit_insn (gen_rtx_SET (VOIDmode, xa,
34717 gen_rtx_AND (mode, op0, mask)));
34718
34719 if (smask)
34720 *smask = mask;
34721
34722 return xa;
34723 }
34724
34725 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
34726 swapping the operands if SWAP_OPERANDS is true. The expanded
34727 code is a forward jump to a newly created label in case the
34728 comparison is true. The generated label rtx is returned. */
34729 static rtx
34730 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
34731 bool swap_operands)
34732 {
34733 rtx label, tmp;
34734
34735 if (swap_operands)
34736 {
34737 tmp = op0;
34738 op0 = op1;
34739 op1 = tmp;
34740 }
34741
34742 label = gen_label_rtx ();
34743 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
34744 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34745 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
34746 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
34747 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
34748 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
34749 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
34750 JUMP_LABEL (tmp) = label;
34751
34752 return label;
34753 }
34754
34755 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
34756 using comparison code CODE. Operands are swapped for the comparison if
34757 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
34758 static rtx
34759 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
34760 bool swap_operands)
34761 {
34762 rtx (*insn)(rtx, rtx, rtx, rtx);
34763 enum machine_mode mode = GET_MODE (op0);
34764 rtx mask = gen_reg_rtx (mode);
34765
34766 if (swap_operands)
34767 {
34768 rtx tmp = op0;
34769 op0 = op1;
34770 op1 = tmp;
34771 }
34772
34773 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
34774
34775 emit_insn (insn (mask, op0, op1,
34776 gen_rtx_fmt_ee (code, mode, op0, op1)));
34777 return mask;
34778 }
34779
34780 /* Generate and return a rtx of mode MODE for 2**n where n is the number
34781 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
34782 static rtx
34783 ix86_gen_TWO52 (enum machine_mode mode)
34784 {
34785 REAL_VALUE_TYPE TWO52r;
34786 rtx TWO52;
34787
34788 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
34789 TWO52 = const_double_from_real_value (TWO52r, mode);
34790 TWO52 = force_reg (mode, TWO52);
34791
34792 return TWO52;
34793 }
34794
34795 /* Expand SSE sequence for computing lround from OP1 storing
34796 into OP0. */
34797 void
34798 ix86_expand_lround (rtx op0, rtx op1)
34799 {
34800 /* C code for the stuff we're doing below:
34801 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
34802 return (long)tmp;
34803 */
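/* nextafter (0.5, 0.0) rather than plain 0.5 keeps an operand just
   below a halfway point from being pushed across it by rounding in
   the addition: e.g. for the largest double smaller than 0.5, adding
   0.5 would round (to nearest) to exactly 1.0 and yield lround == 1
   instead of 0, while adding the slightly smaller constant stays
   below 1.0.  */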
34804 enum machine_mode mode = GET_MODE (op1);
34805 const struct real_format *fmt;
34806 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
34807 rtx adj;
34808
34809 /* load nextafter (0.5, 0.0) */
34810 fmt = REAL_MODE_FORMAT (mode);
34811 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
34812 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
34813
34814 /* adj = copysign (0.5, op1) */
34815 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
34816 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
34817
34818 /* adj = op1 + adj */
34819 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
34820
34821 /* op0 = (imode)adj */
34822 expand_fix (op0, adj, 0);
34823 }
34824
34825 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
34826 into OPERAND0. */
34827 void
34828 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
34829 {
34830 /* C code for the stuff we're doing below (for do_floor):
34831 xi = (long)op1;
34832 xi -= (double)xi > op1 ? 1 : 0;
34833 return xi;
34834 */
34835 enum machine_mode fmode = GET_MODE (op1);
34836 enum machine_mode imode = GET_MODE (op0);
34837 rtx ireg, freg, label, tmp;
34838
34839 /* reg = (long)op1 */
34840 ireg = gen_reg_rtx (imode);
34841 expand_fix (ireg, op1, 0);
34842
34843 /* freg = (double)reg */
34844 freg = gen_reg_rtx (fmode);
34845 expand_float (freg, ireg, 0);
34846
34847 /* ireg = (freg > op1) ? ireg - 1 : ireg */
34848 label = ix86_expand_sse_compare_and_jump (UNLE,
34849 freg, op1, !do_floor);
34850 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
34851 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
34852 emit_move_insn (ireg, tmp);
34853
34854 emit_label (label);
34855 LABEL_NUSES (label) = 1;
34856
34857 emit_move_insn (op0, ireg);
34858 }
34859
34860 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
34861 result in OPERAND0. */
34862 void
34863 ix86_expand_rint (rtx operand0, rtx operand1)
34864 {
34865 /* C code for the stuff we're doing below:
34866 xa = fabs (operand1);
34867 if (!isless (xa, 2**52))
34868 return operand1;
34869 xa = xa + 2**52 - 2**52;
34870 return copysign (xa, operand1);
34871 */
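/* The xa + 2**52 - 2**52 idiom works because, once xa is known to be
   below 2**52, adding 2**52 leaves no mantissa bits for a fraction,
   so the addition itself rounds xa to an integer in the current
   rounding mode; subtracting 2**52 then recovers that integer
   exactly.  For instance 3.7 + 2**52 rounds to 2**52 + 4, and
   subtracting gives 4.0.  */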
34872 enum machine_mode mode = GET_MODE (operand0);
34873 rtx res, xa, label, TWO52, mask;
34874
34875 res = gen_reg_rtx (mode);
34876 emit_move_insn (res, operand1);
34877
34878 /* xa = abs (operand1) */
34879 xa = ix86_expand_sse_fabs (res, &mask);
34880
34881 /* if (!isless (xa, TWO52)) goto label; */
34882 TWO52 = ix86_gen_TWO52 (mode);
34883 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34884
34885 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34886 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34887
34888 ix86_sse_copysign_to_positive (res, xa, res, mask);
34889
34890 emit_label (label);
34891 LABEL_NUSES (label) = 1;
34892
34893 emit_move_insn (operand0, res);
34894 }
34895
34896 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
34897 into OPERAND0. */
34898 void
34899 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
34900 {
34901 /* C code for the stuff we expand below.
34902 double xa = fabs (x), x2;
34903 if (!isless (xa, TWO52))
34904 return x;
34905 xa = xa + TWO52 - TWO52;
34906 x2 = copysign (xa, x);
34907 Compensate. Floor:
34908 if (x2 > x)
34909 x2 -= 1;
34910 Compensate. Ceil:
34911 if (x2 < x)
34912 x2 -= -1;
34913 return x2;
34914 */
34915 enum machine_mode mode = GET_MODE (operand0);
34916 rtx xa, TWO52, tmp, label, one, res, mask;
34917
34918 TWO52 = ix86_gen_TWO52 (mode);
34919
34920 /* Temporary for holding the result, initialized to the input
34921 operand to ease control flow. */
34922 res = gen_reg_rtx (mode);
34923 emit_move_insn (res, operand1);
34924
34925 /* xa = abs (operand1) */
34926 xa = ix86_expand_sse_fabs (res, &mask);
34927
34928 /* if (!isless (xa, TWO52)) goto label; */
34929 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34930
34931 /* xa = xa + TWO52 - TWO52; */
34932 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
34933 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
34934
34935 /* xa = copysign (xa, operand1) */
34936 ix86_sse_copysign_to_positive (xa, xa, res, mask);
34937
34938 /* generate 1.0 or -1.0 */
34939 one = force_reg (mode,
34940 const_double_from_real_value (do_floor
34941 ? dconst1 : dconstm1, mode));
34942
34943 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
34944 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
34945 emit_insn (gen_rtx_SET (VOIDmode, tmp,
34946 gen_rtx_AND (mode, one, tmp)));
34947 /* We always need to subtract here to preserve signed zero. */
34948 tmp = expand_simple_binop (mode, MINUS,
34949 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
34950 emit_move_insn (res, tmp);
34951
34952 emit_label (label);
34953 LABEL_NUSES (label) = 1;
34954
34955 emit_move_insn (operand0, res);
34956 }
34957
34958 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
34959 into OPERAND0. */
34960 void
34961 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
34962 {
34963 /* C code for the stuff we expand below.
34964 double xa = fabs (x), x2;
34965 if (!isless (xa, TWO52))
34966 return x;
34967 x2 = (double)(long)x;
34968 Compensate. Floor:
34969 if (x2 > x)
34970 x2 -= 1;
34971 Compensate. Ceil:
34972 if (x2 < x)
34973 x2 += 1;
34974 if (HONOR_SIGNED_ZEROS (mode))
34975 return copysign (x2, x);
34976 return x2;
34977 */
34978 enum machine_mode mode = GET_MODE (operand0);
34979 rtx xa, xi, TWO52, tmp, label, one, res, mask;
34980
34981 TWO52 = ix86_gen_TWO52 (mode);
34982
34983 /* Temporary for holding the result, initialized to the input
34984 operand to ease control flow. */
34985 res = gen_reg_rtx (mode);
34986 emit_move_insn (res, operand1);
34987
34988 /* xa = abs (operand1) */
34989 xa = ix86_expand_sse_fabs (res, &mask);
34990
34991 /* if (!isless (xa, TWO52)) goto label; */
34992 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
34993
34994 /* xa = (double)(long)x */
34995 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
34996 expand_fix (xi, res, 0);
34997 expand_float (xa, xi, 0);
34998
34999 /* generate 1.0 */
35000 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35001
35002 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
35003 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
35004 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35005 gen_rtx_AND (mode, one, tmp)));
35006 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
35007 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35008 emit_move_insn (res, tmp);
35009
35010 if (HONOR_SIGNED_ZEROS (mode))
35011 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35012
35013 emit_label (label);
35014 LABEL_NUSES (label) = 1;
35015
35016 emit_move_insn (operand0, res);
35017 }
35018
35019 /* Expand SSE sequence for computing round from OPERAND1 storing
35020 into OPERAND0. Sequence that works without relying on DImode truncation
35021 via cvttsd2siq that is only available on 64bit targets. */
35022 void
35023 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
35024 {
35025 /* C code for the stuff we expand below.
35026 double xa = fabs (x), xa2, x2;
35027 if (!isless (xa, TWO52))
35028 return x;
35029 Using the absolute value and copying back sign makes
35030 -0.0 -> -0.0 correct.
35031 xa2 = xa + TWO52 - TWO52;
35032 Compensate.
35033 dxa = xa2 - xa;
35034 if (dxa <= -0.5)
35035 xa2 += 1;
35036 else if (dxa > 0.5)
35037 xa2 -= 1;
35038 x2 = copysign (xa2, x);
35039 return x2;
35040 */
35041 enum machine_mode mode = GET_MODE (operand0);
35042 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
35043
35044 TWO52 = ix86_gen_TWO52 (mode);
35045
35046 /* Temporary for holding the result, initialized to the input
35047 operand to ease control flow. */
35048 res = gen_reg_rtx (mode);
35049 emit_move_insn (res, operand1);
35050
35051 /* xa = abs (operand1) */
35052 xa = ix86_expand_sse_fabs (res, &mask);
35053
35054 /* if (!isless (xa, TWO52)) goto label; */
35055 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35056
35057 /* xa2 = xa + TWO52 - TWO52; */
35058 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35059 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
35060
35061 /* dxa = xa2 - xa; */
35062 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
35063
35064 /* generate 0.5, 1.0 and -0.5 */
35065 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
35066 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
35067 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
35068 0, OPTAB_DIRECT);
35069
35070 /* Compensate. */
35071 tmp = gen_reg_rtx (mode);
35072 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
35073 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
35074 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35075 gen_rtx_AND (mode, one, tmp)));
35076 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35077 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
35078 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
35079 emit_insn (gen_rtx_SET (VOIDmode, tmp,
35080 gen_rtx_AND (mode, one, tmp)));
35081 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
35082
35083 /* res = copysign (xa2, operand1) */
35084 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
35085
35086 emit_label (label);
35087 LABEL_NUSES (label) = 1;
35088
35089 emit_move_insn (operand0, res);
35090 }
35091
35092 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35093 into OPERAND0. */
35094 void
35095 ix86_expand_trunc (rtx operand0, rtx operand1)
35096 {
35097 /* C code for SSE variant we expand below.
35098 double xa = fabs (x), x2;
35099 if (!isless (xa, TWO52))
35100 return x;
35101 x2 = (double)(long)x;
35102 if (HONOR_SIGNED_ZEROS (mode))
35103 return copysign (x2, x);
35104 return x2;
35105 */
35106 enum machine_mode mode = GET_MODE (operand0);
35107 rtx xa, xi, TWO52, label, res, mask;
35108
35109 TWO52 = ix86_gen_TWO52 (mode);
35110
35111 /* Temporary for holding the result, initialized to the input
35112 operand to ease control flow. */
35113 res = gen_reg_rtx (mode);
35114 emit_move_insn (res, operand1);
35115
35116 /* xa = abs (operand1) */
35117 xa = ix86_expand_sse_fabs (res, &mask);
35118
35119 /* if (!isless (xa, TWO52)) goto label; */
35120 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35121
35122 /* x = (double)(long)x */
35123 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35124 expand_fix (xi, res, 0);
35125 expand_float (res, xi, 0);
35126
35127 if (HONOR_SIGNED_ZEROS (mode))
35128 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
35129
35130 emit_label (label);
35131 LABEL_NUSES (label) = 1;
35132
35133 emit_move_insn (operand0, res);
35134 }
35135
35136 /* Expand SSE sequence for computing trunc from OPERAND1 storing
35137 into OPERAND0 without relying on 64-bit-only DImode truncation (cvttsd2siq). */
35138 void
35139 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
35140 {
35141 enum machine_mode mode = GET_MODE (operand0);
35142 rtx xa, mask, TWO52, label, one, res, smask, tmp;
35143
35144 /* C code for SSE variant we expand below.
35145 double xa = fabs (x), xa2, x2;
35146 if (!isless (xa, TWO52))
35147 return x;
35148 xa2 = xa + TWO52 - TWO52;
35149 Compensate:
35150 if (xa2 > xa)
35151 xa2 -= 1.0;
35152 x2 = copysign (xa2, x);
35153 return x2;
35154 */
35155
35156 TWO52 = ix86_gen_TWO52 (mode);
35157
35158 /* Temporary for holding the result, initialized to the input
35159 operand to ease control flow. */
35160 res = gen_reg_rtx (mode);
35161 emit_move_insn (res, operand1);
35162
35163 /* xa = abs (operand1) */
35164 xa = ix86_expand_sse_fabs (res, &smask);
35165
35166 /* if (!isless (xa, TWO52)) goto label; */
35167 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35168
35169 /* res = xa + TWO52 - TWO52; */
35170 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
35171 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
35172 emit_move_insn (res, tmp);
35173
35174 /* generate 1.0 */
35175 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
35176
35177 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
35178 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
35179 emit_insn (gen_rtx_SET (VOIDmode, mask,
35180 gen_rtx_AND (mode, mask, one)));
35181 tmp = expand_simple_binop (mode, MINUS,
35182 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
35183 emit_move_insn (res, tmp);
35184
35185 /* res = copysign (res, operand1) */
35186 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
35187
35188 emit_label (label);
35189 LABEL_NUSES (label) = 1;
35190
35191 emit_move_insn (operand0, res);
35192 }
35193
35194 /* Expand SSE sequence for computing round from OPERAND1 storing
35195 into OPERAND0. */
35196 void
35197 ix86_expand_round (rtx operand0, rtx operand1)
35198 {
35199 /* C code for the stuff we're doing below:
35200 double xa = fabs (x);
35201 if (!isless (xa, TWO52))
35202 return x;
35203 xa = (double)(long)(xa + nextafter (0.5, 0.0));
35204 return copysign (xa, x);
35205 */
35206 enum machine_mode mode = GET_MODE (operand0);
35207 rtx res, TWO52, xa, label, xi, half, mask;
35208 const struct real_format *fmt;
35209 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35210
35211 /* Temporary for holding the result, initialized to the input
35212 operand to ease control flow. */
35213 res = gen_reg_rtx (mode);
35214 emit_move_insn (res, operand1);
35215
35216 TWO52 = ix86_gen_TWO52 (mode);
35217 xa = ix86_expand_sse_fabs (res, &mask);
35218 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
35219
35220 /* load nextafter (0.5, 0.0) */
35221 fmt = REAL_MODE_FORMAT (mode);
35222 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35223 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35224
35225 /* xa = xa + 0.5 */
35226 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
35227 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
35228
35229 /* xa = (double)(int64_t)xa */
35230 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
35231 expand_fix (xi, xa, 0);
35232 expand_float (xa, xi, 0);
35233
35234 /* res = copysign (xa, operand1) */
35235 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
35236
35237 emit_label (label);
35238 LABEL_NUSES (label) = 1;
35239
35240 emit_move_insn (operand0, res);
35241 }
35242
35243 /* Expand SSE sequence for computing round
35244 from OP1 storing into OP0 using sse4 round insn. */
35245 void
35246 ix86_expand_round_sse4 (rtx op0, rtx op1)
35247 {
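/* Illustrative scalar sketch of what is emitted below (added for
   exposition; not from the original sources):

     double half = nextafter (0.5, 0.0);
     return trunc (x + copysign (half, x));

   using nextafter (0.5, 0.0) instead of 0.5, presumably so that the
   addition cannot round an argument just below 0.5 up to 1.0.  */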
35248 enum machine_mode mode = GET_MODE (op0);
35249 rtx e1, e2, res, half;
35250 const struct real_format *fmt;
35251 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
35252 rtx (*gen_copysign) (rtx, rtx, rtx);
35253 rtx (*gen_round) (rtx, rtx, rtx);
35254
35255 switch (mode)
35256 {
35257 case SFmode:
35258 gen_copysign = gen_copysignsf3;
35259 gen_round = gen_sse4_1_roundsf2;
35260 break;
35261 case DFmode:
35262 gen_copysign = gen_copysigndf3;
35263 gen_round = gen_sse4_1_rounddf2;
35264 break;
35265 default:
35266 gcc_unreachable ();
35267 }
35268
35269 /* round (a) = trunc (a + copysign (0.5, a)) */
35270
35271 /* load nextafter (0.5, 0.0) */
35272 fmt = REAL_MODE_FORMAT (mode);
35273 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
35274 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
35275 half = const_double_from_real_value (pred_half, mode);
35276
35277 /* e1 = copysign (0.5, op1) */
35278 e1 = gen_reg_rtx (mode);
35279 emit_insn (gen_copysign (e1, half, op1));
35280
35281 /* e2 = op1 + e1 */
35282 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
35283
35284 /* res = trunc (e2) */
35285 res = gen_reg_rtx (mode);
35286 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
35287
35288 emit_move_insn (op0, res);
35289 }
35290 \f
35291
35292 /* Table of valid machine attributes. */
35293 static const struct attribute_spec ix86_attribute_table[] =
35294 {
35295 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
35296 affects_type_identity } */
35297 /* Stdcall attribute says callee is responsible for popping arguments
35298 if they are not variable. */
35299 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35300 true },
35301 /* Fastcall attribute says callee is responsible for popping arguments
35302 if they are not variable. */
35303 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35304 true },
35305 /* Thiscall attribute says callee is responsible for popping arguments
35306 if they are not variable. */
35307 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35308 true },
35309 /* Cdecl attribute says the callee is a normal C declaration */
35310 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35311 true },
35312 /* Regparm attribute specifies how many integer arguments are to be
35313 passed in registers. */
35314 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
35315 true },
35316 /* Sseregparm attribute says we are using x86_64 calling conventions
35317 for FP arguments. */
35318 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
35319 true },
35320 /* The transactional memory builtins are implicitly regparm or fastcall
35321 depending on the ABI. Override the generic do-nothing attribute that
35322 these builtins were declared with. */
35323 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
35324 true },
35325 /* force_align_arg_pointer says this function realigns the stack at entry. */
35326 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
35327 false, true, true, ix86_handle_cconv_attribute, false },
35328 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
35329 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
35330 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
35331 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
35332 false },
35333 #endif
35334 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35335 false },
35336 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
35337 false },
35338 #ifdef SUBTARGET_ATTRIBUTE_TABLE
35339 SUBTARGET_ATTRIBUTE_TABLE,
35340 #endif
35341 /* ms_abi and sysv_abi calling convention function attributes. */
35342 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35343 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
35344 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
35345 false },
35346 { "callee_pop_aggregate_return", 1, 1, false, true, true,
35347 ix86_handle_callee_pop_aggregate_return, true },
35348 /* End element. */
35349 { NULL, 0, 0, false, false, false, NULL, false }
35350 };
35351
35352 /* Implement targetm.vectorize.builtin_vectorization_cost. */
35353 static int
35354 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
35355 tree vectype ATTRIBUTE_UNUSED,
35356 int misalign ATTRIBUTE_UNUSED)
35357 {
35358 switch (type_of_cost)
35359 {
35360 case scalar_stmt:
35361 return ix86_cost->scalar_stmt_cost;
35362
35363 case scalar_load:
35364 return ix86_cost->scalar_load_cost;
35365
35366 case scalar_store:
35367 return ix86_cost->scalar_store_cost;
35368
35369 case vector_stmt:
35370 return ix86_cost->vec_stmt_cost;
35371
35372 case vector_load:
35373 return ix86_cost->vec_align_load_cost;
35374
35375 case vector_store:
35376 return ix86_cost->vec_store_cost;
35377
35378 case vec_to_scalar:
35379 return ix86_cost->vec_to_scalar_cost;
35380
35381 case scalar_to_vec:
35382 return ix86_cost->scalar_to_vec_cost;
35383
35384 case unaligned_load:
35385 case unaligned_store:
35386 return ix86_cost->vec_unalign_load_cost;
35387
35388 case cond_branch_taken:
35389 return ix86_cost->cond_taken_branch_cost;
35390
35391 case cond_branch_not_taken:
35392 return ix86_cost->cond_not_taken_branch_cost;
35393
35394 case vec_perm:
35395 case vec_promote_demote:
35396 return ix86_cost->vec_stmt_cost;
35397
35398 default:
35399 gcc_unreachable ();
35400 }
35401 }
35402
35403 /* Construct (set target (vec_select op0 (parallel perm))) and
35404 return true if that's a valid instruction in the active ISA. */
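/* For example (illustration only): for a V4SImode TARGET and
   PERM = { 2 3 0 1 } this emits
     (set target (vec_select:V4SI op0 (parallel [2 3 0 1])))
   and keeps the insn only if recog recognizes it (e.g. as pshufd).  */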
35405
35406 static bool
35407 expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
35408 {
35409 rtx rperm[MAX_VECT_LEN], x;
35410 unsigned i;
35411
35412 for (i = 0; i < nelt; ++i)
35413 rperm[i] = GEN_INT (perm[i]);
35414
35415 x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
35416 x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
35417 x = gen_rtx_SET (VOIDmode, target, x);
35418
35419 x = emit_insn (x);
35420 if (recog_memoized (x) < 0)
35421 {
35422 remove_insn (x);
35423 return false;
35424 }
35425 return true;
35426 }
35427
35428 /* Similar, but generate a vec_concat from op0 and op1 as well. */
35429
35430 static bool
35431 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
35432 const unsigned char *perm, unsigned nelt)
35433 {
35434 enum machine_mode v2mode;
35435 rtx x;
35436
35437 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
35438 x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
35439 return expand_vselect (target, x, perm, nelt);
35440 }
35441
35442 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35443 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
35444
35445 static bool
35446 expand_vec_perm_blend (struct expand_vec_perm_d *d)
35447 {
35448 enum machine_mode vmode = d->vmode;
35449 unsigned i, mask, nelt = d->nelt;
35450 rtx target, op0, op1, x;
35451 rtx rperm[32], vperm;
35452
35453 if (d->op0 == d->op1)
35454 return false;
35455 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
35456 ;
35457 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
35458 ;
35459 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
35460 ;
35461 else
35462 return false;
35463
35464 /* This is a blend, not a permute. Elements must stay in their
35465 respective lanes. */
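/* Hypothetical example (for illustration): for V4SFmode,
   d->perm = { 0, 5, 2, 7 } takes elements 1 and 3 from op1 and the rest
   from op0, so it is a blend with mask 0b1010 (e.g. blendps), whereas
   { 1, 5, 2, 7 } is not, since element 0 would change position.  */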
35466 for (i = 0; i < nelt; ++i)
35467 {
35468 unsigned e = d->perm[i];
35469 if (!(e == i || e == i + nelt))
35470 return false;
35471 }
35472
35473 if (d->testing_p)
35474 return true;
35475
35476 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
35477 decision should be extracted elsewhere, so that we only try that
35478 sequence once all budget==3 options have been tried. */
35479 target = d->target;
35480 op0 = d->op0;
35481 op1 = d->op1;
35482 mask = 0;
35483
35484 switch (vmode)
35485 {
35486 case V4DFmode:
35487 case V8SFmode:
35488 case V2DFmode:
35489 case V4SFmode:
35490 case V8HImode:
35491 case V8SImode:
35492 for (i = 0; i < nelt; ++i)
35493 mask |= (d->perm[i] >= nelt) << i;
35494 break;
35495
35496 case V2DImode:
35497 for (i = 0; i < 2; ++i)
35498 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
35499 vmode = V8HImode;
35500 goto do_subreg;
35501
35502 case V4SImode:
35503 for (i = 0; i < 4; ++i)
35504 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35505 vmode = V8HImode;
35506 goto do_subreg;
35507
35508 case V16QImode:
35509 /* See if bytes move in pairs so we can use pblendw with
35510 an immediate argument, rather than pblendvb with a vector
35511 argument. */
35512 for (i = 0; i < 16; i += 2)
35513 if (d->perm[i] + 1 != d->perm[i + 1])
35514 {
35515 use_pblendvb:
35516 for (i = 0; i < nelt; ++i)
35517 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
35518
35519 finish_pblendvb:
35520 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
35521 vperm = force_reg (vmode, vperm);
35522
35523 if (GET_MODE_SIZE (vmode) == 16)
35524 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
35525 else
35526 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
35527 return true;
35528 }
35529
35530 for (i = 0; i < 8; ++i)
35531 mask |= (d->perm[i * 2] >= 16) << i;
35532 vmode = V8HImode;
35533 /* FALLTHRU */
35534
35535 do_subreg:
35536 target = gen_lowpart (vmode, target);
35537 op0 = gen_lowpart (vmode, op0);
35538 op1 = gen_lowpart (vmode, op1);
35539 break;
35540
35541 case V32QImode:
35542 /* See if bytes move in pairs. If not, vpblendvb must be used. */
35543 for (i = 0; i < 32; i += 2)
35544 if (d->perm[i] + 1 != d->perm[i + 1])
35545 goto use_pblendvb;
35546 /* See if bytes move in quadruplets. If yes, vpblendd
35547 with immediate can be used. */
35548 for (i = 0; i < 32; i += 4)
35549 if (d->perm[i] + 2 != d->perm[i + 2])
35550 break;
35551 if (i < 32)
35552 {
35553 /* See if bytes move the same in both lanes. If yes,
35554 vpblendw with immediate can be used. */
35555 for (i = 0; i < 16; i += 2)
35556 if (d->perm[i] + 16 != d->perm[i + 16])
35557 goto use_pblendvb;
35558
35559 /* Use vpblendw. */
35560 for (i = 0; i < 16; ++i)
35561 mask |= (d->perm[i * 2] >= 32) << i;
35562 vmode = V16HImode;
35563 goto do_subreg;
35564 }
35565
35566 /* Use vpblendd. */
35567 for (i = 0; i < 8; ++i)
35568 mask |= (d->perm[i * 4] >= 32) << i;
35569 vmode = V8SImode;
35570 goto do_subreg;
35571
35572 case V16HImode:
35573 /* See if words move in pairs. If yes, vpblendd can be used. */
35574 for (i = 0; i < 16; i += 2)
35575 if (d->perm[i] + 1 != d->perm[i + 1])
35576 break;
35577 if (i < 16)
35578 {
35579 /* See if words move the same in both lanes. If not,
35580 vpblendvb must be used. */
35581 for (i = 0; i < 8; i++)
35582 if (d->perm[i] + 8 != d->perm[i + 8])
35583 {
35584 /* Use vpblendvb. */
35585 for (i = 0; i < 32; ++i)
35586 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
35587
35588 vmode = V32QImode;
35589 nelt = 32;
35590 target = gen_lowpart (vmode, target);
35591 op0 = gen_lowpart (vmode, op0);
35592 op1 = gen_lowpart (vmode, op1);
35593 goto finish_pblendvb;
35594 }
35595
35596 /* Use vpblendw. */
35597 for (i = 0; i < 16; ++i)
35598 mask |= (d->perm[i] >= 16) << i;
35599 break;
35600 }
35601
35602 /* Use vpblendd. */
35603 for (i = 0; i < 8; ++i)
35604 mask |= (d->perm[i * 2] >= 16) << i;
35605 vmode = V8SImode;
35606 goto do_subreg;
35607
35608 case V4DImode:
35609 /* Use vpblendd. */
35610 for (i = 0; i < 4; ++i)
35611 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
35612 vmode = V8SImode;
35613 goto do_subreg;
35614
35615 default:
35616 gcc_unreachable ();
35617 }
35618
35619 /* This matches five different patterns with the different modes. */
35620 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
35621 x = gen_rtx_SET (VOIDmode, target, x);
35622 emit_insn (x);
35623
35624 return true;
35625 }
35626
35627 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35628 in terms of the variable form of vpermilps.
35629
35630 Note that we will have already failed the immediate input vpermilps,
35631 which requires that the high and low part shuffle be identical; the
35632 variable form doesn't require that. */
35633
35634 static bool
35635 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
35636 {
35637 rtx rperm[8], vperm;
35638 unsigned i;
35639
35640 if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
35641 return false;
35642
35643 /* We can only permute within the 128-bit lane. */
35644 for (i = 0; i < 8; ++i)
35645 {
35646 unsigned e = d->perm[i];
35647 if (i < 4 ? e >= 4 : e < 4)
35648 return false;
35649 }
35650
35651 if (d->testing_p)
35652 return true;
35653
35654 for (i = 0; i < 8; ++i)
35655 {
35656 unsigned e = d->perm[i];
35657
35658 /* Within each 128-bit lane, the elements of op0 are numbered
35659 from 0 and the elements of op1 are numbered from 4. */
35660 if (e >= 8 + 4)
35661 e -= 8;
35662 else if (e >= 4)
35663 e -= 4;
35664
35665 rperm[i] = GEN_INT (e);
35666 }
35667
35668 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
35669 vperm = force_reg (V8SImode, vperm);
35670 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
35671
35672 return true;
35673 }
35674
35675 /* Return true if permutation D can be performed as VMODE permutation
35676 instead. */
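/* Hypothetical example (illustration only): the V16QImode permutation
   { 4 5 6 7  0 1 2 3  12 13 14 15  8 9 10 11 } moves whole 4-byte
   chunks, so it can also be performed as the V4SImode permutation
   { 1 0 3 2 }.  */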
35677
35678 static bool
35679 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
35680 {
35681 unsigned int i, j, chunk;
35682
35683 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
35684 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
35685 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
35686 return false;
35687
35688 if (GET_MODE_NUNITS (vmode) >= d->nelt)
35689 return true;
35690
35691 chunk = d->nelt / GET_MODE_NUNITS (vmode);
35692 for (i = 0; i < d->nelt; i += chunk)
35693 if (d->perm[i] & (chunk - 1))
35694 return false;
35695 else
35696 for (j = 1; j < chunk; ++j)
35697 if (d->perm[i] + j != d->perm[i + j])
35698 return false;
35699
35700 return true;
35701 }
35702
35703 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35704 in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128. */
35705
35706 static bool
35707 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
35708 {
35709 unsigned i, nelt, eltsz, mask;
35710 unsigned char perm[32];
35711 enum machine_mode vmode = V16QImode;
35712 rtx rperm[32], vperm, target, op0, op1;
35713
35714 nelt = d->nelt;
35715
35716 if (d->op0 != d->op1)
35717 {
35718 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
35719 {
35720 if (TARGET_AVX2
35721 && valid_perm_using_mode_p (V2TImode, d))
35722 {
35723 if (d->testing_p)
35724 return true;
35725
35726 /* Use vperm2i128 insn. The pattern uses
35727 V4DImode instead of V2TImode. */
35728 target = gen_lowpart (V4DImode, d->target);
35729 op0 = gen_lowpart (V4DImode, d->op0);
35730 op1 = gen_lowpart (V4DImode, d->op1);
35731 rperm[0]
35732 = GEN_INT ((d->perm[0] / (nelt / 2))
35733 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
35734 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
35735 return true;
35736 }
35737 return false;
35738 }
35739 }
35740 else
35741 {
35742 if (GET_MODE_SIZE (d->vmode) == 16)
35743 {
35744 if (!TARGET_SSSE3)
35745 return false;
35746 }
35747 else if (GET_MODE_SIZE (d->vmode) == 32)
35748 {
35749 if (!TARGET_AVX2)
35750 return false;
35751
35752 /* V4DImode should be already handled through
35753 expand_vselect by vpermq instruction. */
35754 gcc_assert (d->vmode != V4DImode);
35755
35756 vmode = V32QImode;
35757 if (d->vmode == V8SImode
35758 || d->vmode == V16HImode
35759 || d->vmode == V32QImode)
35760 {
35761 /* First see if vpermq can be used for
35762 V8SImode/V16HImode/V32QImode. */
35763 if (valid_perm_using_mode_p (V4DImode, d))
35764 {
35765 for (i = 0; i < 4; i++)
35766 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
35767 if (d->testing_p)
35768 return true;
35769 return expand_vselect (gen_lowpart (V4DImode, d->target),
35770 gen_lowpart (V4DImode, d->op0),
35771 perm, 4);
35772 }
35773
35774 /* Next see if vpermd can be used. */
35775 if (valid_perm_using_mode_p (V8SImode, d))
35776 vmode = V8SImode;
35777 }
35778
35779 if (vmode == V32QImode)
35780 {
35781 /* vpshufb only works within 128-bit lanes; it is not
35782 possible to shuffle bytes in between the lanes. */
35783 for (i = 0; i < nelt; ++i)
35784 if ((d->perm[i] ^ i) & (nelt / 2))
35785 return false;
35786 }
35787 }
35788 else
35789 return false;
35790 }
35791
35792 if (d->testing_p)
35793 return true;
35794
35795 if (vmode == V8SImode)
35796 for (i = 0; i < 8; ++i)
35797 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
35798 else
35799 {
35800 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
35801 if (d->op0 != d->op1)
35802 mask = 2 * nelt - 1;
35803 else if (vmode == V16QImode)
35804 mask = nelt - 1;
35805 else
35806 mask = nelt / 2 - 1;
35807
35808 for (i = 0; i < nelt; ++i)
35809 {
35810 unsigned j, e = d->perm[i] & mask;
35811 for (j = 0; j < eltsz; ++j)
35812 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
35813 }
35814 }
35815
35816 vperm = gen_rtx_CONST_VECTOR (vmode,
35817 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
35818 vperm = force_reg (vmode, vperm);
35819
35820 target = gen_lowpart (vmode, d->target);
35821 op0 = gen_lowpart (vmode, d->op0);
35822 if (d->op0 == d->op1)
35823 {
35824 if (vmode == V16QImode)
35825 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
35826 else if (vmode == V32QImode)
35827 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
35828 else
35829 emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
35830 }
35831 else
35832 {
35833 op1 = gen_lowpart (vmode, d->op1);
35834 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
35835 }
35836
35837 return true;
35838 }
35839
35840 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
35841 in a single instruction. */
35842
35843 static bool
35844 expand_vec_perm_1 (struct expand_vec_perm_d *d)
35845 {
35846 unsigned i, nelt = d->nelt;
35847 unsigned char perm2[MAX_VECT_LEN];
35848
35849 /* Check plain VEC_SELECT first, because AVX has instructions that could
35850 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
35851 input where SEL+CONCAT may not. */
35852 if (d->op0 == d->op1)
35853 {
35854 int mask = nelt - 1;
35855 bool identity_perm = true;
35856 bool broadcast_perm = true;
35857
35858 for (i = 0; i < nelt; i++)
35859 {
35860 perm2[i] = d->perm[i] & mask;
35861 if (perm2[i] != i)
35862 identity_perm = false;
35863 if (perm2[i])
35864 broadcast_perm = false;
35865 }
35866
35867 if (identity_perm)
35868 {
35869 if (!d->testing_p)
35870 emit_move_insn (d->target, d->op0);
35871 return true;
35872 }
35873 else if (broadcast_perm && TARGET_AVX2)
35874 {
35875 /* Use vpbroadcast{b,w,d}. */
35876 rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
35877 switch (d->vmode)
35878 {
35879 case V32QImode:
35880 op = gen_lowpart (V16QImode, op);
35881 gen = gen_avx2_pbroadcastv32qi;
35882 break;
35883 case V16HImode:
35884 op = gen_lowpart (V8HImode, op);
35885 gen = gen_avx2_pbroadcastv16hi;
35886 break;
35887 case V8SImode:
35888 op = gen_lowpart (V4SImode, op);
35889 gen = gen_avx2_pbroadcastv8si;
35890 break;
35891 case V16QImode:
35892 gen = gen_avx2_pbroadcastv16qi;
35893 break;
35894 case V8HImode:
35895 gen = gen_avx2_pbroadcastv8hi;
35896 break;
35897 /* For other modes, prefer the other shuffles this function creates. */
35898 default: break;
35899 }
35900 if (gen != NULL)
35901 {
35902 if (!d->testing_p)
35903 emit_insn (gen (d->target, op));
35904 return true;
35905 }
35906 }
35907
35908 if (expand_vselect (d->target, d->op0, perm2, nelt))
35909 return true;
35910
35911 /* There are plenty of patterns in sse.md that are written for
35912 SEL+CONCAT and are not replicated for a single op. Perhaps
35913 that should be changed, to avoid the nastiness here. */
35914
35915 /* Recognize interleave style patterns, which means incrementing
35916 every other permutation operand. */
35917 for (i = 0; i < nelt; i += 2)
35918 {
35919 perm2[i] = d->perm[i] & mask;
35920 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
35921 }
35922 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35923 return true;
35924
35925 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
35926 if (nelt >= 4)
35927 {
35928 for (i = 0; i < nelt; i += 4)
35929 {
35930 perm2[i + 0] = d->perm[i + 0] & mask;
35931 perm2[i + 1] = d->perm[i + 1] & mask;
35932 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
35933 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
35934 }
35935
35936 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
35937 return true;
35938 }
35939 }
35940
35941 /* Finally, try the fully general two operand permute. */
35942 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
35943 return true;
35944
35945 /* Recognize interleave style patterns with reversed operands. */
35946 if (d->op0 != d->op1)
35947 {
35948 for (i = 0; i < nelt; ++i)
35949 {
35950 unsigned e = d->perm[i];
35951 if (e >= nelt)
35952 e -= nelt;
35953 else
35954 e += nelt;
35955 perm2[i] = e;
35956 }
35957
35958 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
35959 return true;
35960 }
35961
35962 /* Try the SSE4.1 blend variable merge instructions. */
35963 if (expand_vec_perm_blend (d))
35964 return true;
35965
35966 /* Try one of the AVX vpermil variable permutations. */
35967 if (expand_vec_perm_vpermil (d))
35968 return true;
35969
35970 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
35971 vpshufb, vpermd or vpermq variable permutation. */
35972 if (expand_vec_perm_pshufb (d))
35973 return true;
35974
35975 return false;
35976 }
35977
35978 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
35979 in terms of a pair of pshuflw + pshufhw instructions. */
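/* For instance (illustrative example only): for V8HImode with
   d->perm = { 3 1 2 0 5 7 6 4 } the low four elements come from the low
   quadword and the high four from the high quadword, so we can first
   emit pshuflw with { 3 1 2 0 4 5 6 7 } and then pshufhw with
   { 0 1 2 3 5 7 6 4 }.  */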
35980
35981 static bool
35982 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
35983 {
35984 unsigned char perm2[MAX_VECT_LEN];
35985 unsigned i;
35986 bool ok;
35987
35988 if (d->vmode != V8HImode || d->op0 != d->op1)
35989 return false;
35990
35991 /* The two permutations only operate in 64-bit lanes. */
35992 for (i = 0; i < 4; ++i)
35993 if (d->perm[i] >= 4)
35994 return false;
35995 for (i = 4; i < 8; ++i)
35996 if (d->perm[i] < 4)
35997 return false;
35998
35999 if (d->testing_p)
36000 return true;
36001
36002 /* Emit the pshuflw. */
36003 memcpy (perm2, d->perm, 4);
36004 for (i = 4; i < 8; ++i)
36005 perm2[i] = i;
36006 ok = expand_vselect (d->target, d->op0, perm2, 8);
36007 gcc_assert (ok);
36008
36009 /* Emit the pshufhw. */
36010 memcpy (perm2 + 4, d->perm + 4, 4);
36011 for (i = 0; i < 4; ++i)
36012 perm2[i] = i;
36013 ok = expand_vselect (d->target, d->target, perm2, 8);
36014 gcc_assert (ok);
36015
36016 return true;
36017 }
36018
36019 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36020 the permutation using the SSSE3 palignr instruction. This succeeds
36021 when all of the elements in PERM fit within one vector and we merely
36022 need to shift them down so that a single vector permutation has a
36023 chance to succeed. */
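/* Hypothetical example (illustration only): for V8HImode with
   d->perm = { 3 4 5 6 7 8 9 10 }, min is 3, so a palignr shifts the
   concatenated operands down by 3 elements and the remaining permutation
   becomes the in-order { 0 1 2 3 4 5 6 7 }, which needs no further
   shuffle.  */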
36024
36025 static bool
36026 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
36027 {
36028 unsigned i, nelt = d->nelt;
36029 unsigned min, max;
36030 bool in_order, ok;
36031 rtx shift;
36032
36033 /* Even with AVX, palignr only operates on 128-bit vectors. */
36034 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36035 return false;
36036
36037 min = nelt, max = 0;
36038 for (i = 0; i < nelt; ++i)
36039 {
36040 unsigned e = d->perm[i];
36041 if (e < min)
36042 min = e;
36043 if (e > max)
36044 max = e;
36045 }
36046 if (min == 0 || max - min >= nelt)
36047 return false;
36048
36049 /* Given that we have SSSE3, we know we'll be able to implement the
36050 single operand permutation after the palignr with pshufb. */
36051 if (d->testing_p)
36052 return true;
36053
36054 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
36055 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
36056 gen_lowpart (TImode, d->op1),
36057 gen_lowpart (TImode, d->op0), shift));
36058
36059 d->op0 = d->op1 = d->target;
36060
36061 in_order = true;
36062 for (i = 0; i < nelt; ++i)
36063 {
36064 unsigned e = d->perm[i] - min;
36065 if (e != i)
36066 in_order = false;
36067 d->perm[i] = e;
36068 }
36069
36070 /* Test for the degenerate case where the alignment by itself
36071 produces the desired permutation. */
36072 if (in_order)
36073 return true;
36074
36075 ok = expand_vec_perm_1 (d);
36076 gcc_assert (ok);
36077
36078 return ok;
36079 }
36080
36081 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
36082
36083 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36084 a two vector permutation into a single vector permutation by using
36085 an interleave operation to merge the vectors. */
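/* Hypothetical example (illustration only): for V4SImode with
   d->perm = { 1 5 0 4 } every element comes from the low half of one of
   the operands, so a punpckldq first forms
   { op0[0] op1[0] op0[1] op1[1] } and the remaining shuffle is the
   single-operand permutation { 2 3 0 1 }.  */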
36086
36087 static bool
36088 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
36089 {
36090 struct expand_vec_perm_d dremap, dfinal;
36091 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
36092 unsigned HOST_WIDE_INT contents;
36093 unsigned char remap[2 * MAX_VECT_LEN];
36094 rtx seq;
36095 bool ok, same_halves = false;
36096
36097 if (GET_MODE_SIZE (d->vmode) == 16)
36098 {
36099 if (d->op0 == d->op1)
36100 return false;
36101 }
36102 else if (GET_MODE_SIZE (d->vmode) == 32)
36103 {
36104 if (!TARGET_AVX)
36105 return false;
36106 /* For 32-byte modes allow even d->op0 == d->op1.
36107 The lack of cross-lane shuffling in some instructions
36108 might prevent a single insn shuffle. */
36109 dfinal = *d;
36110 dfinal.testing_p = true;
36111 /* If expand_vec_perm_interleave3 can expand this into
36112 a 3 insn sequence, give up and let it be expanded as
36113 a 3 insn sequence. While that is one insn longer,
36114 it doesn't need a memory operand, and in the common
36115 case where the interleave low and interleave high
36116 permutations with the same operands are adjacent, it
36117 needs only 4 insns for both after CSE. */
36118 if (expand_vec_perm_interleave3 (&dfinal))
36119 return false;
36120 }
36121 else
36122 return false;
36123
36124 /* Examine from whence the elements come. */
36125 contents = 0;
36126 for (i = 0; i < nelt; ++i)
36127 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
36128
36129 memset (remap, 0xff, sizeof (remap));
36130 dremap = *d;
36131
36132 if (GET_MODE_SIZE (d->vmode) == 16)
36133 {
36134 unsigned HOST_WIDE_INT h1, h2, h3, h4;
36135
36136 /* Split the two input vectors into 4 halves. */
36137 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
36138 h2 = h1 << nelt2;
36139 h3 = h2 << nelt2;
36140 h4 = h3 << nelt2;
36141
36142 /* If the elements are all from the low halves, use interleave low,
36143 and similarly for interleave high. If the elements are from mis-matched
36144 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
36145 if ((contents & (h1 | h3)) == contents)
36146 {
36147 /* punpckl* */
36148 for (i = 0; i < nelt2; ++i)
36149 {
36150 remap[i] = i * 2;
36151 remap[i + nelt] = i * 2 + 1;
36152 dremap.perm[i * 2] = i;
36153 dremap.perm[i * 2 + 1] = i + nelt;
36154 }
36155 if (!TARGET_SSE2 && d->vmode == V4SImode)
36156 dremap.vmode = V4SFmode;
36157 }
36158 else if ((contents & (h2 | h4)) == contents)
36159 {
36160 /* punpckh* */
36161 for (i = 0; i < nelt2; ++i)
36162 {
36163 remap[i + nelt2] = i * 2;
36164 remap[i + nelt + nelt2] = i * 2 + 1;
36165 dremap.perm[i * 2] = i + nelt2;
36166 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
36167 }
36168 if (!TARGET_SSE2 && d->vmode == V4SImode)
36169 dremap.vmode = V4SFmode;
36170 }
36171 else if ((contents & (h1 | h4)) == contents)
36172 {
36173 /* shufps */
36174 for (i = 0; i < nelt2; ++i)
36175 {
36176 remap[i] = i;
36177 remap[i + nelt + nelt2] = i + nelt2;
36178 dremap.perm[i] = i;
36179 dremap.perm[i + nelt2] = i + nelt + nelt2;
36180 }
36181 if (nelt != 4)
36182 {
36183 /* shufpd */
36184 dremap.vmode = V2DImode;
36185 dremap.nelt = 2;
36186 dremap.perm[0] = 0;
36187 dremap.perm[1] = 3;
36188 }
36189 }
36190 else if ((contents & (h2 | h3)) == contents)
36191 {
36192 /* shufps */
36193 for (i = 0; i < nelt2; ++i)
36194 {
36195 remap[i + nelt2] = i;
36196 remap[i + nelt] = i + nelt2;
36197 dremap.perm[i] = i + nelt2;
36198 dremap.perm[i + nelt2] = i + nelt;
36199 }
36200 if (nelt != 4)
36201 {
36202 /* shufpd */
36203 dremap.vmode = V2DImode;
36204 dremap.nelt = 2;
36205 dremap.perm[0] = 1;
36206 dremap.perm[1] = 2;
36207 }
36208 }
36209 else
36210 return false;
36211 }
36212 else
36213 {
36214 unsigned int nelt4 = nelt / 4, nzcnt = 0;
36215 unsigned HOST_WIDE_INT q[8];
36216 unsigned int nonzero_halves[4];
36217
36218 /* Split the two input vectors into 8 quarters. */
36219 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
36220 for (i = 1; i < 8; ++i)
36221 q[i] = q[0] << (nelt4 * i);
36222 for (i = 0; i < 4; ++i)
36223 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
36224 {
36225 nonzero_halves[nzcnt] = i;
36226 ++nzcnt;
36227 }
36228
36229 if (nzcnt == 1)
36230 {
36231 gcc_assert (d->op0 == d->op1);
36232 nonzero_halves[1] = nonzero_halves[0];
36233 same_halves = true;
36234 }
36235 else if (d->op0 == d->op1)
36236 {
36237 gcc_assert (nonzero_halves[0] == 0);
36238 gcc_assert (nonzero_halves[1] == 1);
36239 }
36240
36241 if (nzcnt <= 2)
36242 {
36243 if (d->perm[0] / nelt2 == nonzero_halves[1])
36244 {
36245 /* Attempt to increase the likelihood that the dfinal
36246 shuffle will be intra-lane. */
36247 char tmph = nonzero_halves[0];
36248 nonzero_halves[0] = nonzero_halves[1];
36249 nonzero_halves[1] = tmph;
36250 }
36251
36252 /* vperm2f128 or vperm2i128. */
36253 for (i = 0; i < nelt2; ++i)
36254 {
36255 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
36256 remap[i + nonzero_halves[0] * nelt2] = i;
36257 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
36258 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
36259 }
36260
36261 if (d->vmode != V8SFmode
36262 && d->vmode != V4DFmode
36263 && d->vmode != V8SImode)
36264 {
36265 dremap.vmode = V8SImode;
36266 dremap.nelt = 8;
36267 for (i = 0; i < 4; ++i)
36268 {
36269 dremap.perm[i] = i + nonzero_halves[0] * 4;
36270 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
36271 }
36272 }
36273 }
36274 else if (d->op0 == d->op1)
36275 return false;
36276 else if (TARGET_AVX2
36277 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
36278 {
36279 /* vpunpckl* */
36280 for (i = 0; i < nelt4; ++i)
36281 {
36282 remap[i] = i * 2;
36283 remap[i + nelt] = i * 2 + 1;
36284 remap[i + nelt2] = i * 2 + nelt2;
36285 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
36286 dremap.perm[i * 2] = i;
36287 dremap.perm[i * 2 + 1] = i + nelt;
36288 dremap.perm[i * 2 + nelt2] = i + nelt2;
36289 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
36290 }
36291 }
36292 else if (TARGET_AVX2
36293 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
36294 {
36295 /* vpunpckh* */
36296 for (i = 0; i < nelt4; ++i)
36297 {
36298 remap[i + nelt4] = i * 2;
36299 remap[i + nelt + nelt4] = i * 2 + 1;
36300 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
36301 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
36302 dremap.perm[i * 2] = i + nelt4;
36303 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
36304 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
36305 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
36306 }
36307 }
36308 else
36309 return false;
36310 }
36311
36312 /* Use the remapping array set up above to move the elements from their
36313 swizzled locations into their final destinations. */
36314 dfinal = *d;
36315 for (i = 0; i < nelt; ++i)
36316 {
36317 unsigned e = remap[d->perm[i]];
36318 gcc_assert (e < nelt);
36319 /* If same_halves is true, both halves of the remapped vector are the
36320 same. Avoid cross-lane accesses if possible. */
36321 if (same_halves && i >= nelt2)
36322 {
36323 gcc_assert (e < nelt2);
36324 dfinal.perm[i] = e + nelt2;
36325 }
36326 else
36327 dfinal.perm[i] = e;
36328 }
36329 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
36330 dfinal.op1 = dfinal.op0;
36331 dremap.target = dfinal.op0;
36332
36333 /* Test if the final remap can be done with a single insn. For V4SFmode or
36334 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
36335 start_sequence ();
36336 ok = expand_vec_perm_1 (&dfinal);
36337 seq = get_insns ();
36338 end_sequence ();
36339
36340 if (!ok)
36341 return false;
36342
36343 if (d->testing_p)
36344 return true;
36345
36346 if (dremap.vmode != dfinal.vmode)
36347 {
36348 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
36349 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
36350 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
36351 }
36352
36353 ok = expand_vec_perm_1 (&dremap);
36354 gcc_assert (ok);
36355
36356 emit_insn (seq);
36357 return true;
36358 }
36359
36360 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36361 a single vector cross-lane permutation into vpermq followed
36362 by any of the single insn permutations. */
36363
36364 static bool
36365 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
36366 {
36367 struct expand_vec_perm_d dremap, dfinal;
36368 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
36369 unsigned contents[2];
36370 bool ok;
36371
36372 if (!(TARGET_AVX2
36373 && (d->vmode == V32QImode || d->vmode == V16HImode)
36374 && d->op0 == d->op1))
36375 return false;
36376
36377 contents[0] = 0;
36378 contents[1] = 0;
36379 for (i = 0; i < nelt2; ++i)
36380 {
36381 contents[0] |= 1u << (d->perm[i] / nelt4);
36382 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
36383 }
36384
36385 for (i = 0; i < 2; ++i)
36386 {
36387 unsigned int cnt = 0;
36388 for (j = 0; j < 4; ++j)
36389 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
36390 return false;
36391 }
36392
36393 if (d->testing_p)
36394 return true;
36395
36396 dremap = *d;
36397 dremap.vmode = V4DImode;
36398 dremap.nelt = 4;
36399 dremap.target = gen_reg_rtx (V4DImode);
36400 dremap.op0 = gen_lowpart (V4DImode, d->op0);
36401 dremap.op1 = dremap.op0;
36402 for (i = 0; i < 2; ++i)
36403 {
36404 unsigned int cnt = 0;
36405 for (j = 0; j < 4; ++j)
36406 if ((contents[i] & (1u << j)) != 0)
36407 dremap.perm[2 * i + cnt++] = j;
36408 for (; cnt < 2; ++cnt)
36409 dremap.perm[2 * i + cnt] = 0;
36410 }
36411
36412 dfinal = *d;
36413 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
36414 dfinal.op1 = dfinal.op0;
36415 for (i = 0, j = 0; i < nelt; ++i)
36416 {
36417 if (i == nelt2)
36418 j = 2;
36419 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
36420 if ((d->perm[i] / nelt4) == dremap.perm[j])
36421 ;
36422 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
36423 dfinal.perm[i] |= nelt4;
36424 else
36425 gcc_unreachable ();
36426 }
36427
36428 ok = expand_vec_perm_1 (&dremap);
36429 gcc_assert (ok);
36430
36431 ok = expand_vec_perm_1 (&dfinal);
36432 gcc_assert (ok);
36433
36434 return true;
36435 }
36436
36437 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
36438 a two vector permutation using 2 intra-lane interleave insns
36439 and cross-lane shuffle for 32-byte vectors. */
36440
36441 static bool
36442 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
36443 {
36444 unsigned i, nelt;
36445 rtx (*gen) (rtx, rtx, rtx);
36446
36447 if (d->op0 == d->op1)
36448 return false;
36449 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
36450 ;
36451 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
36452 ;
36453 else
36454 return false;
36455
36456 nelt = d->nelt;
36457 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
36458 return false;
36459 for (i = 0; i < nelt; i += 2)
36460 if (d->perm[i] != d->perm[0] + i / 2
36461 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
36462 return false;
36463
36464 if (d->testing_p)
36465 return true;
36466
36467 switch (d->vmode)
36468 {
36469 case V32QImode:
36470 if (d->perm[0])
36471 gen = gen_vec_interleave_highv32qi;
36472 else
36473 gen = gen_vec_interleave_lowv32qi;
36474 break;
36475 case V16HImode:
36476 if (d->perm[0])
36477 gen = gen_vec_interleave_highv16hi;
36478 else
36479 gen = gen_vec_interleave_lowv16hi;
36480 break;
36481 case V8SImode:
36482 if (d->perm[0])
36483 gen = gen_vec_interleave_highv8si;
36484 else
36485 gen = gen_vec_interleave_lowv8si;
36486 break;
36487 case V4DImode:
36488 if (d->perm[0])
36489 gen = gen_vec_interleave_highv4di;
36490 else
36491 gen = gen_vec_interleave_lowv4di;
36492 break;
36493 case V8SFmode:
36494 if (d->perm[0])
36495 gen = gen_vec_interleave_highv8sf;
36496 else
36497 gen = gen_vec_interleave_lowv8sf;
36498 break;
36499 case V4DFmode:
36500 if (d->perm[0])
36501 gen = gen_vec_interleave_highv4df;
36502 else
36503 gen = gen_vec_interleave_lowv4df;
36504 break;
36505 default:
36506 gcc_unreachable ();
36507 }
36508
36509 emit_insn (gen (d->target, d->op0, d->op1));
36510 return true;
36511 }
36512
36513 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
36514 permutation with two pshufb insns and an ior. We should have already
36515 failed all two instruction sequences. */
36516
36517 static bool
36518 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
36519 {
36520 rtx rperm[2][16], vperm, l, h, op, m128;
36521 unsigned int i, nelt, eltsz;
36522
36523 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
36524 return false;
36525 gcc_assert (d->op0 != d->op1);
36526
36527 nelt = d->nelt;
36528 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36529
36530 /* Generate two permutation masks. If the required element is within
36531 the given vector it is shuffled into the proper lane. If the required
36532 element is in the other vector, force a zero into the lane by setting
36533 bit 7 in the permutation mask. */
36534 m128 = GEN_INT (-128);
36535 for (i = 0; i < nelt; ++i)
36536 {
36537 unsigned j, e = d->perm[i];
36538 unsigned which = (e >= nelt);
36539 if (e >= nelt)
36540 e -= nelt;
36541
36542 for (j = 0; j < eltsz; ++j)
36543 {
36544 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
36545 rperm[1-which][i*eltsz + j] = m128;
36546 }
36547 }
36548
36549 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
36550 vperm = force_reg (V16QImode, vperm);
36551
36552 l = gen_reg_rtx (V16QImode);
36553 op = gen_lowpart (V16QImode, d->op0);
36554 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
36555
36556 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
36557 vperm = force_reg (V16QImode, vperm);
36558
36559 h = gen_reg_rtx (V16QImode);
36560 op = gen_lowpart (V16QImode, d->op1);
36561 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
36562
36563 op = gen_lowpart (V16QImode, d->target);
36564 emit_insn (gen_iorv16qi3 (op, l, h));
36565
36566 return true;
36567 }
36568
36569 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
36570 with two vpshufb insns, vpermq and vpor. We should have already failed
36571 all two or three instruction sequences. */
36572
36573 static bool
36574 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
36575 {
36576 rtx rperm[2][32], vperm, l, h, hp, op, m128;
36577 unsigned int i, nelt, eltsz;
36578
36579 if (!TARGET_AVX2
36580 || d->op0 != d->op1
36581 || (d->vmode != V32QImode && d->vmode != V16HImode))
36582 return false;
36583
36584 if (d->testing_p)
36585 return true;
36586
36587 nelt = d->nelt;
36588 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36589
36590 /* Generate two permutation masks. If the required element is within
36591 the same lane, it is shuffled in. If the required element comes from
36592 the other lane, force a zero by setting bit 7 in the permutation mask.
36593 The other mask has non-negative elements where the element is
36594 requested from the other lane, but the element is also moved to the
36595 other lane, so that the result of vpshufb can have its two V2TImode
36596 halves swapped. */
36597 m128 = GEN_INT (-128);
36598 for (i = 0; i < nelt; ++i)
36599 {
36600 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36601 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
36602
36603 for (j = 0; j < eltsz; ++j)
36604 {
36605 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
36606 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
36607 }
36608 }
36609
36610 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36611 vperm = force_reg (V32QImode, vperm);
36612
36613 h = gen_reg_rtx (V32QImode);
36614 op = gen_lowpart (V32QImode, d->op0);
36615 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36616
36617 /* Swap the 128-bit lanes of h into hp. */
36618 hp = gen_reg_rtx (V4DImode);
36619 op = gen_lowpart (V4DImode, h);
36620 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
36621 const1_rtx));
36622
36623 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36624 vperm = force_reg (V32QImode, vperm);
36625
36626 l = gen_reg_rtx (V32QImode);
36627 op = gen_lowpart (V32QImode, d->op0);
36628 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36629
36630 op = gen_lowpart (V32QImode, d->target);
36631 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
36632
36633 return true;
36634 }
36635
36636 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
36637 and extract-odd permutations of two V32QImode or V16HImode operands
36638 with two vpshufb insns, vpor and vpermq. We should have already
36639 failed all two or three instruction sequences. */
36640
36641 static bool
36642 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
36643 {
36644 rtx rperm[2][32], vperm, l, h, ior, op, m128;
36645 unsigned int i, nelt, eltsz;
36646
36647 if (!TARGET_AVX2
36648 || d->op0 == d->op1
36649 || (d->vmode != V32QImode && d->vmode != V16HImode))
36650 return false;
36651
36652 for (i = 0; i < d->nelt; ++i)
36653 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
36654 return false;
36655
36656 if (d->testing_p)
36657 return true;
36658
36659 nelt = d->nelt;
36660 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
36661
36662 /* Generate two permutation masks. In the first permutation mask
36663 the first quarter will contain indexes for the first half
36664 of op0, the second quarter will contain bit 7 set, the third quarter
36665 will contain indexes for the second half of op0 and the
36666 last quarter bit 7 set. In the second permutation mask
36667 the first quarter will contain bit 7 set, the second quarter
36668 indexes for the first half of op1, the third quarter bit 7 set
36669 and the last quarter indexes for the second half of op1.
36670 E.g. for a V32QImode extract-even, the first mask will be:
36671 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
36672 (all values masked with 0xf except for -128) and the second mask
36673 for extract-even will be
36674 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
36675 m128 = GEN_INT (-128);
36676 for (i = 0; i < nelt; ++i)
36677 {
36678 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
36679 unsigned which = d->perm[i] >= nelt;
36680 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
36681
36682 for (j = 0; j < eltsz; ++j)
36683 {
36684 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
36685 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
36686 }
36687 }
36688
36689 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
36690 vperm = force_reg (V32QImode, vperm);
36691
36692 l = gen_reg_rtx (V32QImode);
36693 op = gen_lowpart (V32QImode, d->op0);
36694 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
36695
36696 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
36697 vperm = force_reg (V32QImode, vperm);
36698
36699 h = gen_reg_rtx (V32QImode);
36700 op = gen_lowpart (V32QImode, d->op1);
36701 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
36702
36703 ior = gen_reg_rtx (V32QImode);
36704 emit_insn (gen_iorv32qi3 (ior, l, h));
36705
36706 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
36707 op = gen_lowpart (V4DImode, d->target);
36708 ior = gen_lowpart (V4DImode, ior);
36709 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
36710 const1_rtx, GEN_INT (3)));
36711
36712 return true;
36713 }
36714
36715 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
36716 and extract-odd permutations. */
36717
36718 static bool
36719 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
36720 {
36721 rtx t1, t2, t3;
36722
36723 switch (d->vmode)
36724 {
36725 case V4DFmode:
36726 t1 = gen_reg_rtx (V4DFmode);
36727 t2 = gen_reg_rtx (V4DFmode);
36728
36729 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36730 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
36731 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
36732
36733 /* Now an unpck[lh]pd will produce the result required. */
36734 if (odd)
36735 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
36736 else
36737 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
36738 emit_insn (t3);
36739 break;
36740
36741 case V8SFmode:
36742 {
36743 int mask = odd ? 0xdd : 0x88;
36744
36745 t1 = gen_reg_rtx (V8SFmode);
36746 t2 = gen_reg_rtx (V8SFmode);
36747 t3 = gen_reg_rtx (V8SFmode);
36748
36749 /* Shuffle within the 128-bit lanes to produce:
36750 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
36751 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
36752 GEN_INT (mask)));
36753
36754 /* Shuffle the lanes around to produce:
36755 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
36756 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
36757 GEN_INT (0x3)));
36758
36759 /* Shuffle within the 128-bit lanes to produce:
36760 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
36761 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
36762
36763 /* Shuffle within the 128-bit lanes to produce:
36764 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
36765 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
36766
36767 /* Shuffle the lanes around to produce:
36768 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
36769 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
36770 GEN_INT (0x20)));
36771 }
36772 break;
36773
36774 case V2DFmode:
36775 case V4SFmode:
36776 case V2DImode:
36777 case V4SImode:
36778 /* These are always directly implementable by expand_vec_perm_1. */
36779 gcc_unreachable ();
36780
36781 case V8HImode:
36782 if (TARGET_SSSE3)
36783 return expand_vec_perm_pshufb2 (d);
36784 else
36785 {
36786 /* We need 2*log2(N)-1 operations to achieve odd/even
36787 with interleave. */
36788 t1 = gen_reg_rtx (V8HImode);
36789 t2 = gen_reg_rtx (V8HImode);
36790 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
36791 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
36792 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
36793 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
36794 if (odd)
36795 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
36796 else
36797 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
36798 emit_insn (t3);
36799 }
36800 break;
36801
36802 case V16QImode:
36803 if (TARGET_SSSE3)
36804 return expand_vec_perm_pshufb2 (d);
36805 else
36806 {
36807 t1 = gen_reg_rtx (V16QImode);
36808 t2 = gen_reg_rtx (V16QImode);
36809 t3 = gen_reg_rtx (V16QImode);
36810 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
36811 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
36812 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
36813 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
36814 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
36815 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
36816 if (odd)
36817 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
36818 else
36819 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
36820 emit_insn (t3);
36821 }
36822 break;
36823
36824 case V16HImode:
36825 case V32QImode:
36826 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
36827
36828 case V4DImode:
36829 if (!TARGET_AVX2)
36830 {
36831 struct expand_vec_perm_d d_copy = *d;
36832 d_copy.vmode = V4DFmode;
36833 d_copy.target = gen_lowpart (V4DFmode, d->target);
36834 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
36835 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
36836 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36837 }
36838
36839 t1 = gen_reg_rtx (V4DImode);
36840 t2 = gen_reg_rtx (V4DImode);
36841
36842 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
36843 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
36844 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
36845
36846 /* Now a vpunpck[lh]qdq will produce the result required. */
36847 if (odd)
36848 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
36849 else
36850 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
36851 emit_insn (t3);
36852 break;
36853
36854 case V8SImode:
36855 if (!TARGET_AVX2)
36856 {
36857 struct expand_vec_perm_d d_copy = *d;
36858 d_copy.vmode = V8SFmode;
36859 d_copy.target = gen_lowpart (V8SFmode, d->target);
36860 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
36861 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
36862 return expand_vec_perm_even_odd_1 (&d_copy, odd);
36863 }
36864
36865 t1 = gen_reg_rtx (V8SImode);
36866 t2 = gen_reg_rtx (V8SImode);
36867
36868 /* Shuffle the lanes around into
36869 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
36870 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
36871 gen_lowpart (V4DImode, d->op0),
36872 gen_lowpart (V4DImode, d->op1),
36873 GEN_INT (0x20)));
36874 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
36875 gen_lowpart (V4DImode, d->op0),
36876 gen_lowpart (V4DImode, d->op1),
36877 GEN_INT (0x31)));
36878
36879 /* Swap the 2nd and 3rd position in each lane into
36880 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
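/* The immediate 2*4 + 1*16 + 3*64 == 0xd8 selects source dwords in the
   order { 0 2 1 3 } within each 128-bit lane. */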
36881 emit_insn (gen_avx2_pshufdv3 (t1, t1,
36882 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36883 emit_insn (gen_avx2_pshufdv3 (t2, t2,
36884 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
36885
36886 /* Now a vpunpck[lh]qdq will produce
36887 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
36888 if (odd)
36889 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
36890 gen_lowpart (V4DImode, t1),
36891 gen_lowpart (V4DImode, t2));
36892 else
36893 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
36894 gen_lowpart (V4DImode, t1),
36895 gen_lowpart (V4DImode, t2));
36896 emit_insn (t3);
36897 break;
36898
36899 default:
36900 gcc_unreachable ();
36901 }
36902
36903 return true;
36904 }
36905
36906 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
36907 extract-even and extract-odd permutations. */
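/* For example, with V4SImode operands the even extraction is the
   permutation { 0 2 4 6 } of the two concatenated inputs and the odd
   extraction is { 1 3 5 7 }. */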
36908
36909 static bool
36910 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
36911 {
36912 unsigned i, odd, nelt = d->nelt;
36913
36914 odd = d->perm[0];
36915 if (odd != 0 && odd != 1)
36916 return false;
36917
36918 for (i = 1; i < nelt; ++i)
36919 if (d->perm[i] != 2 * i + odd)
36920 return false;
36921
36922 return expand_vec_perm_even_odd_1 (d, odd);
36923 }
36924
36925 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
36926 permutations. We assume that expand_vec_perm_1 has already failed. */
36927
36928 static bool
36929 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
36930 {
36931 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
36932 enum machine_mode vmode = d->vmode;
36933 unsigned char perm2[4];
36934 rtx op0 = d->op0;
36935 bool ok;
36936
36937 switch (vmode)
36938 {
36939 case V4DFmode:
36940 case V8SFmode:
36941 /* These are special-cased in sse.md so that we can optionally
36942 use the vbroadcast instruction. They expand to two insns
36943 if the input happens to be in a register. */
36944 gcc_unreachable ();
36945
36946 case V2DFmode:
36947 case V2DImode:
36948 case V4SFmode:
36949 case V4SImode:
36950 /* These are always implementable using standard shuffle patterns. */
36951 gcc_unreachable ();
36952
36953 case V8HImode:
36954 case V16QImode:
36955 /* These can be implemented via interleave. We save one insn by
36956 stopping once we have promoted to V4SImode and then using pshufd. */
36957 do
36958 {
36959 rtx dest;
36960 rtx (*gen) (rtx, rtx, rtx)
36961 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
36962 : gen_vec_interleave_lowv8hi;
36963
36964 if (elt >= nelt2)
36965 {
36966 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
36967 : gen_vec_interleave_highv8hi;
36968 elt -= nelt2;
36969 }
36970 nelt2 /= 2;
36971
36972 dest = gen_reg_rtx (vmode);
36973 emit_insn (gen (dest, op0, op0));
36974 vmode = get_mode_wider_vector (vmode);
36975 op0 = gen_lowpart (vmode, dest);
36976 }
36977 while (vmode != V4SImode);
36978
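/* At this point OP0 is a V4SImode value and ELT indexes the dword that
   holds the value to broadcast; splat it with a single pshufd via
   expand_vselect. */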
36979 memset (perm2, elt, 4);
36980 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
36981 gcc_assert (ok);
36982 return true;
36983
36984 case V32QImode:
36985 case V16HImode:
36986 case V8SImode:
36987 case V4DImode:
36988 /* For AVX2, broadcasts of the first element should be handled by
36989 expand_vec_perm_1 using vpbroadcast* or vpermq. */
36990 gcc_assert (!TARGET_AVX2 || d->perm[0]);
36991 return false;
36992
36993 default:
36994 gcc_unreachable ();
36995 }
36996 }
36997
36998 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
36999 broadcast permutations. */
37000
37001 static bool
37002 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
37003 {
37004 unsigned i, elt, nelt = d->nelt;
37005
37006 if (d->op0 != d->op1)
37007 return false;
37008
37009 elt = d->perm[0];
37010 for (i = 1; i < nelt; ++i)
37011 if (d->perm[i] != elt)
37012 return false;
37013
37014 return expand_vec_perm_broadcast_1 (d);
37015 }
37016
37017 /* Implement arbitrary permutation of two V32QImode and V16HImode operands
37018 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
37019 all the shorter instruction sequences. */
37020
37021 static bool
37022 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
37023 {
37024 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
37025 unsigned int i, nelt, eltsz;
37026 bool used[4];
37027
37028 if (!TARGET_AVX2
37029 || d->op0 == d->op1
37030 || (d->vmode != V32QImode && d->vmode != V16HImode))
37031 return false;
37032
37033 if (d->testing_p)
37034 return true;
37035
37036 nelt = d->nelt;
37037 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
37038
37039 /* Generate 4 permutation masks. If the required element is within
37040 the same lane, it is shuffled in. If the required element is from
37041 the other lane, force a zero by setting bit 7 in the permutation mask.
37042 In the other mask the entries are non-negative when the element is
37043 requested from the other lane, and they are also placed in the other
37044 lane, so that the result of vpshufb can have its two V2TImode halves
37045 swapped. */
37046 m128 = GEN_INT (-128);
37047 for (i = 0; i < 32; ++i)
37048 {
37049 rperm[0][i] = m128;
37050 rperm[1][i] = m128;
37051 rperm[2][i] = m128;
37052 rperm[3][i] = m128;
37053 }
37054 used[0] = false;
37055 used[1] = false;
37056 used[2] = false;
37057 used[3] = false;
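/* Classify each element into one of the four masks: bit 1 of WHICH
   selects between the two input operands, bit 0 is set when the element
   must cross into the other 128-bit lane. */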
37058 for (i = 0; i < nelt; ++i)
37059 {
37060 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
37061 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
37062 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
37063
37064 for (j = 0; j < eltsz; ++j)
37065 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
37066 used[which] = true;
37067 }
37068
37069 for (i = 0; i < 2; ++i)
37070 {
37071 if (!used[2 * i + 1])
37072 {
37073 h[i] = NULL_RTX;
37074 continue;
37075 }
37076 vperm = gen_rtx_CONST_VECTOR (V32QImode,
37077 gen_rtvec_v (32, rperm[2 * i + 1]));
37078 vperm = force_reg (V32QImode, vperm);
37079 h[i] = gen_reg_rtx (V32QImode);
37080 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37081 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
37082 }
37083
37084 /* Swap the 128-bit lanes of h[X]. */
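/* The { 2 3 0 1 } qword selection used below swaps the two 128-bit halves. */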
37085 for (i = 0; i < 2; ++i)
37086 {
37087 if (h[i] == NULL_RTX)
37088 continue;
37089 op = gen_reg_rtx (V4DImode);
37090 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
37091 const2_rtx, GEN_INT (3), const0_rtx,
37092 const1_rtx));
37093 h[i] = gen_lowpart (V32QImode, op);
37094 }
37095
37096 for (i = 0; i < 2; ++i)
37097 {
37098 if (!used[2 * i])
37099 {
37100 l[i] = NULL_RTX;
37101 continue;
37102 }
37103 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
37104 vperm = force_reg (V32QImode, vperm);
37105 l[i] = gen_reg_rtx (V32QImode);
37106 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
37107 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
37108 }
37109
37110 for (i = 0; i < 2; ++i)
37111 {
37112 if (h[i] && l[i])
37113 {
37114 op = gen_reg_rtx (V32QImode);
37115 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
37116 l[i] = op;
37117 }
37118 else if (h[i])
37119 l[i] = h[i];
37120 }
37121
37122 gcc_assert (l[0] && l[1]);
37123 op = gen_lowpart (V32QImode, d->target);
37124 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
37125 return true;
37126 }
37127
37128 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
37129 With all of the interface bits taken care of, perform the expansion
37130 in D and return true on success. */
37131
37132 static bool
37133 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
37134 {
37135 /* Try a single instruction expansion. */
37136 if (expand_vec_perm_1 (d))
37137 return true;
37138
37139 /* Try sequences of two instructions. */
37140
37141 if (expand_vec_perm_pshuflw_pshufhw (d))
37142 return true;
37143
37144 if (expand_vec_perm_palignr (d))
37145 return true;
37146
37147 if (expand_vec_perm_interleave2 (d))
37148 return true;
37149
37150 if (expand_vec_perm_broadcast (d))
37151 return true;
37152
37153 if (expand_vec_perm_vpermq_perm_1 (d))
37154 return true;
37155
37156 /* Try sequences of three instructions. */
37157
37158 if (expand_vec_perm_pshufb2 (d))
37159 return true;
37160
37161 if (expand_vec_perm_interleave3 (d))
37162 return true;
37163
37164 /* Try sequences of four instructions. */
37165
37166 if (expand_vec_perm_vpshufb2_vpermq (d))
37167 return true;
37168
37169 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
37170 return true;
37171
37172 /* ??? Look for narrow permutations whose element orderings would
37173 allow the promotion to a wider mode. */
37174
37175 /* ??? Look for sequences of interleave or a wider permute that place
37176 the data into the correct lanes for a half-vector shuffle like
37177 pshuf[lh]w or vpermilps. */
37178
37179 /* ??? Look for sequences of interleave that produce the desired results.
37180 The combinatorics of punpck[lh] get pretty ugly... */
37181
37182 if (expand_vec_perm_even_odd (d))
37183 return true;
37184
37185 /* Even longer sequences. */
37186 if (expand_vec_perm_vpshufb4_vpermq2 (d))
37187 return true;
37188
37189 return false;
37190 }
37191
37192 bool
37193 ix86_expand_vec_perm_const (rtx operands[4])
37194 {
37195 struct expand_vec_perm_d d;
37196 unsigned char perm[MAX_VECT_LEN];
37197 int i, nelt, which;
37198 rtx sel;
37199
37200 d.target = operands[0];
37201 d.op0 = operands[1];
37202 d.op1 = operands[2];
37203 sel = operands[3];
37204
37205 d.vmode = GET_MODE (d.target);
37206 gcc_assert (VECTOR_MODE_P (d.vmode));
37207 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37208 d.testing_p = false;
37209
37210 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
37211 gcc_assert (XVECLEN (sel, 0) == nelt);
37212 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
37213
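/* Record in WHICH whether any element is taken from the first operand
   (bit 0) and/or from the second operand (bit 1). */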
37214 for (i = which = 0; i < nelt; ++i)
37215 {
37216 rtx e = XVECEXP (sel, 0, i);
37217 int ei = INTVAL (e) & (2 * nelt - 1);
37218
37219 which |= (ei < nelt ? 1 : 2);
37220 d.perm[i] = ei;
37221 perm[i] = ei;
37222 }
37223
37224 switch (which)
37225 {
37226 default:
37227 gcc_unreachable ();
37228
37229 case 3:
37230 if (!rtx_equal_p (d.op0, d.op1))
37231 break;
37232
37233 /* The elements of PERM do not suggest that only the first operand
37234 is used, but both operands are identical. Allow easier matching
37235 of the permutation by folding the permutation into the single
37236 input vector. */
37237 for (i = 0; i < nelt; ++i)
37238 if (d.perm[i] >= nelt)
37239 d.perm[i] -= nelt;
37240 /* FALLTHRU */
37241
37242 case 1:
37243 d.op1 = d.op0;
37244 break;
37245
37246 case 2:
37247 for (i = 0; i < nelt; ++i)
37248 d.perm[i] -= nelt;
37249 d.op0 = d.op1;
37250 break;
37251 }
37252
37253 if (ix86_expand_vec_perm_const_1 (&d))
37254 return true;
37255
37256 /* If the mask says both arguments are needed, but they are the same,
37257 the above tried to expand with d.op0 == d.op1. If that didn't work,
37258 retry with d.op0 != d.op1 as that is what testing has been done with. */
37259 if (which == 3 && d.op0 == d.op1)
37260 {
37261 rtx seq;
37262 bool ok;
37263
37264 memcpy (d.perm, perm, sizeof (perm));
37265 d.op1 = gen_reg_rtx (d.vmode);
37266 start_sequence ();
37267 ok = ix86_expand_vec_perm_const_1 (&d);
37268 seq = get_insns ();
37269 end_sequence ();
37270 if (ok)
37271 {
37272 emit_move_insn (d.op1, d.op0);
37273 emit_insn (seq);
37274 return true;
37275 }
37276 }
37277
37278 return false;
37279 }
37280
37281 /* Implement targetm.vectorize.vec_perm_const_ok. */
37282
37283 static bool
37284 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
37285 const unsigned char *sel)
37286 {
37287 struct expand_vec_perm_d d;
37288 unsigned int i, nelt, which;
37289 bool ret, one_vec;
37290
37291 d.vmode = vmode;
37292 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37293 d.testing_p = true;
37294
37295 /* Given sufficient ISA support we can just return true here
37296 for selected vector modes. */
37297 if (GET_MODE_SIZE (d.vmode) == 16)
37298 {
37299 /* All implementable with a single vpperm insn. */
37300 if (TARGET_XOP)
37301 return true;
37302 /* All implementable with 2 pshufb + 1 ior. */
37303 if (TARGET_SSSE3)
37304 return true;
37305 /* All implementable with shufpd or unpck[lh]pd. */
37306 if (d.nelt == 2)
37307 return true;
37308 }
37309
37310 /* Extract the values from the vector CST into the permutation
37311 array in D. */
37312 memcpy (d.perm, sel, nelt);
37313 for (i = which = 0; i < nelt; ++i)
37314 {
37315 unsigned char e = d.perm[i];
37316 gcc_assert (e < 2 * nelt);
37317 which |= (e < nelt ? 1 : 2);
37318 }
37319
37320 /* For all elements from second vector, fold the elements to first. */
37321 if (which == 2)
37322 for (i = 0; i < nelt; ++i)
37323 d.perm[i] -= nelt;
37324
37325 /* Check whether the mask can be applied to the vector type. */
37326 one_vec = (which != 3);
37327
37328 /* Implementable with shufps or pshufd. */
37329 if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
37330 return true;
37331
37332 /* Otherwise we have to go through the motions and see if we can
37333 figure out how to generate the requested permutation. */
37334 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
37335 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
37336 if (!one_vec)
37337 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
37338
37339 start_sequence ();
37340 ret = ix86_expand_vec_perm_const_1 (&d);
37341 end_sequence ();
37342
37343 return ret;
37344 }
37345
37346 void
37347 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
37348 {
37349 struct expand_vec_perm_d d;
37350 unsigned i, nelt;
37351
37352 d.target = targ;
37353 d.op0 = op0;
37354 d.op1 = op1;
37355 d.vmode = GET_MODE (targ);
37356 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
37357 d.testing_p = false;
37358
37359 for (i = 0; i < nelt; ++i)
37360 d.perm[i] = i * 2 + odd;
37361
37362 /* We'll either be able to implement the permutation directly... */
37363 if (expand_vec_perm_1 (&d))
37364 return;
37365
37366 /* ... or we use the special-case patterns. */
37367 expand_vec_perm_even_odd_1 (&d, odd);
37368 }
37369
37370 /* Expand an insert into a vector register through pinsr insn.
37371 Return true if successful. */
37372
37373 bool
37374 ix86_expand_pinsr (rtx *operands)
37375 {
37376 rtx dst = operands[0];
37377 rtx src = operands[3];
37378
37379 unsigned int size = INTVAL (operands[1]);
37380 unsigned int pos = INTVAL (operands[2]);
37381
37382 if (GET_CODE (dst) == SUBREG)
37383 {
37384 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
37385 dst = SUBREG_REG (dst);
37386 }
37387
37388 if (GET_CODE (src) == SUBREG)
37389 src = SUBREG_REG (src);
37390
37391 switch (GET_MODE (dst))
37392 {
37393 case V16QImode:
37394 case V8HImode:
37395 case V4SImode:
37396 case V2DImode:
37397 {
37398 enum machine_mode srcmode, dstmode;
37399 rtx (*pinsr)(rtx, rtx, rtx, rtx);
37400
37401 srcmode = mode_for_size (size, MODE_INT, 0);
37402
37403 switch (srcmode)
37404 {
37405 case QImode:
37406 if (!TARGET_SSE4_1)
37407 return false;
37408 dstmode = V16QImode;
37409 pinsr = gen_sse4_1_pinsrb;
37410 break;
37411
37412 case HImode:
37413 if (!TARGET_SSE2)
37414 return false;
37415 dstmode = V8HImode;
37416 pinsr = gen_sse2_pinsrw;
37417 break;
37418
37419 case SImode:
37420 if (!TARGET_SSE4_1)
37421 return false;
37422 dstmode = V4SImode;
37423 pinsr = gen_sse4_1_pinsrd;
37424 break;
37425
37426 case DImode:
37427 gcc_assert (TARGET_64BIT);
37428 if (!TARGET_SSE4_1)
37429 return false;
37430 dstmode = V2DImode;
37431 pinsr = gen_sse4_1_pinsrq;
37432 break;
37433
37434 default:
37435 return false;
37436 }
37437
37438 dst = gen_lowpart (dstmode, dst);
37439 src = gen_lowpart (srcmode, src);
37440
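/* Convert the bit position into an element index; the pinsr patterns
   expect a one-hot element selector, hence GEN_INT (1 << pos) below. */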
37441 pos /= size;
37442
37443 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
37444 return true;
37445 }
37446
37447 default:
37448 return false;
37449 }
37450 }
37451 \f
37452 /* This function returns the calling abi specific va_list type node.
37453 It returns the FNDECL specific va_list type. */
37454
37455 static tree
37456 ix86_fn_abi_va_list (tree fndecl)
37457 {
37458 if (!TARGET_64BIT)
37459 return va_list_type_node;
37460 gcc_assert (fndecl != NULL_TREE);
37461
37462 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
37463 return ms_va_list_type_node;
37464 else
37465 return sysv_va_list_type_node;
37466 }
37467
37468 /* Returns the canonical va_list type specified by TYPE. If there
37469 is no valid TYPE provided, it returns NULL_TREE. */
37470
37471 static tree
37472 ix86_canonical_va_list_type (tree type)
37473 {
37474 tree wtype, htype;
37475
37476 /* Resolve references and pointers to va_list type. */
37477 if (TREE_CODE (type) == MEM_REF)
37478 type = TREE_TYPE (type);
37479 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
37480 type = TREE_TYPE (type);
37481 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
37482 type = TREE_TYPE (type);
37483
37484 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
37485 {
37486 wtype = va_list_type_node;
37487 gcc_assert (wtype != NULL_TREE);
37488 htype = type;
37489 if (TREE_CODE (wtype) == ARRAY_TYPE)
37490 {
37491 /* If va_list is an array type, the argument may have decayed
37492 to a pointer type, e.g. by being passed to another function.
37493 In that case, unwrap both types so that we can compare the
37494 underlying records. */
37495 if (TREE_CODE (htype) == ARRAY_TYPE
37496 || POINTER_TYPE_P (htype))
37497 {
37498 wtype = TREE_TYPE (wtype);
37499 htype = TREE_TYPE (htype);
37500 }
37501 }
37502 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37503 return va_list_type_node;
37504 wtype = sysv_va_list_type_node;
37505 gcc_assert (wtype != NULL_TREE);
37506 htype = type;
37507 if (TREE_CODE (wtype) == ARRAY_TYPE)
37508 {
37509 /* If va_list is an array type, the argument may have decayed
37510 to a pointer type, e.g. by being passed to another function.
37511 In that case, unwrap both types so that we can compare the
37512 underlying records. */
37513 if (TREE_CODE (htype) == ARRAY_TYPE
37514 || POINTER_TYPE_P (htype))
37515 {
37516 wtype = TREE_TYPE (wtype);
37517 htype = TREE_TYPE (htype);
37518 }
37519 }
37520 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37521 return sysv_va_list_type_node;
37522 wtype = ms_va_list_type_node;
37523 gcc_assert (wtype != NULL_TREE);
37524 htype = type;
37525 if (TREE_CODE (wtype) == ARRAY_TYPE)
37526 {
37527 /* If va_list is an array type, the argument may have decayed
37528 to a pointer type, e.g. by being passed to another function.
37529 In that case, unwrap both types so that we can compare the
37530 underlying records. */
37531 if (TREE_CODE (htype) == ARRAY_TYPE
37532 || POINTER_TYPE_P (htype))
37533 {
37534 wtype = TREE_TYPE (wtype);
37535 htype = TREE_TYPE (htype);
37536 }
37537 }
37538 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
37539 return ms_va_list_type_node;
37540 return NULL_TREE;
37541 }
37542 return std_canonical_va_list_type (type);
37543 }
37544
37545 /* Iterate through the target-specific builtin types for va_list.
37546 IDX denotes the iterator, *PTREE is set to the result type of
37547 the va_list builtin, and *PNAME to its internal type.
37548 Returns zero if there is no element for this index, otherwise
37549 IDX should be increased upon the next call.
37550 Note, do not iterate a base builtin's name like __builtin_va_list.
37551 Used from c_common_nodes_and_builtins. */
37552
37553 static int
37554 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
37555 {
37556 if (TARGET_64BIT)
37557 {
37558 switch (idx)
37559 {
37560 default:
37561 break;
37562
37563 case 0:
37564 *ptree = ms_va_list_type_node;
37565 *pname = "__builtin_ms_va_list";
37566 return 1;
37567
37568 case 1:
37569 *ptree = sysv_va_list_type_node;
37570 *pname = "__builtin_sysv_va_list";
37571 return 1;
37572 }
37573 }
37574
37575 return 0;
37576 }
37577
37578 #undef TARGET_SCHED_DISPATCH
37579 #define TARGET_SCHED_DISPATCH has_dispatch
37580 #undef TARGET_SCHED_DISPATCH_DO
37581 #define TARGET_SCHED_DISPATCH_DO do_dispatch
37582 #undef TARGET_SCHED_REASSOCIATION_WIDTH
37583 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
37584
37585 /* The size of the dispatch window is the total number of bytes of
37586 object code allowed in a window. */
37587 #define DISPATCH_WINDOW_SIZE 16
37588
37589 /* Number of dispatch windows considered for scheduling. */
37590 #define MAX_DISPATCH_WINDOWS 3
37591
37592 /* Maximum number of instructions in a window. */
37593 #define MAX_INSN 4
37594
37595 /* Maximum number of immediate operands in a window. */
37596 #define MAX_IMM 4
37597
37598 /* Maximum number of immediate bits allowed in a window. */
37599 #define MAX_IMM_SIZE 128
37600
37601 /* Maximum number of 32 bit immediates allowed in a window. */
37602 #define MAX_IMM_32 4
37603
37604 /* Maximum number of 64 bit immediates allowed in a window. */
37605 #define MAX_IMM_64 2
37606
37607 /* Maximum total of loads or prefetches allowed in a window. */
37608 #define MAX_LOAD 2
37609
37610 /* Maximum total of stores allowed in a window. */
37611 #define MAX_STORE 1
37612
37613 #undef BIG
37614 #define BIG 100
37615
37616
37617 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
37618 enum dispatch_group {
37619 disp_no_group = 0,
37620 disp_load,
37621 disp_store,
37622 disp_load_store,
37623 disp_prefetch,
37624 disp_imm,
37625 disp_imm_32,
37626 disp_imm_64,
37627 disp_branch,
37628 disp_cmp,
37629 disp_jcc,
37630 disp_last
37631 };
37632
37633 /* Number of allowable groups in a dispatch window. It is an array
37634 indexed by dispatch_group enum. 100 is used as a big number,
37635 because the number of these kinds of operations does not have any
37636 effect in the dispatch window, but we need them for other reasons in
37637 the table. */
37638 static unsigned int num_allowable_groups[disp_last] = {
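/* no_group, load, store, load_store, prefetch, imm, imm_32, imm_64, branch, cmp, jcc. */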
37639 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
37640 };
37641
37642 char group_name[disp_last + 1][16] = {
37643 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
37644 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
37645 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
37646 };
37647
37648 /* Instruction path. */
37649 enum insn_path {
37650 no_path = 0,
37651 path_single, /* Single micro op. */
37652 path_double, /* Double micro op. */
37653 path_multi, /* Instructions with more than 2 micro ops. */
37654 last_path
37655 };
37656
37657 /* sched_insn_info defines a window to the instructions scheduled in
37658 the basic block. It contains a pointer to the insn_info table and
37659 the instruction scheduled.
37660
37661 Windows are allocated for each basic block and are linked
37662 together. */
37663 typedef struct sched_insn_info_s {
37664 rtx insn;
37665 enum dispatch_group group;
37666 enum insn_path path;
37667 int byte_len;
37668 int imm_bytes;
37669 } sched_insn_info;
37670
37671 /* Linked list of dispatch windows. This is a two-way list of
37672 dispatch windows of a basic block. It contains information about
37673 the number of uops in the window and the total number of
37674 instructions and of bytes in the object code for this dispatch
37675 window. */
37676 typedef struct dispatch_windows_s {
37677 int num_insn; /* Number of insn in the window. */
37678 int num_uops; /* Number of uops in the window. */
37679 int window_size; /* Number of bytes in the window. */
37680 int window_num; /* Window number, either 0 or 1. */
37681 int num_imm; /* Number of immediates in the window. */
37682 int num_imm_32; /* Number of 32 bit immediates in the window. */
37683 int num_imm_64; /* Number of 64 bit immediates in the window. */
37684 int imm_size; /* Total size of immediates in the window, in bytes. */
37685 int num_loads; /* Total memory loads in the window. */
37686 int num_stores; /* Total memory stores in the window. */
37687 int violation; /* Violation exists in window. */
37688 sched_insn_info *window; /* Pointer to the window. */
37689 struct dispatch_windows_s *next;
37690 struct dispatch_windows_s *prev;
37691 } dispatch_windows;
37692
37693 /* Immediate values used in an insn. */
37694 typedef struct imm_info_s
37695 {
37696 int imm;
37697 int imm32;
37698 int imm64;
37699 } imm_info;
37700
37701 static dispatch_windows *dispatch_window_list;
37702 static dispatch_windows *dispatch_window_list1;
37703
37704 /* Get dispatch group of insn. */
37705
37706 static enum dispatch_group
37707 get_mem_group (rtx insn)
37708 {
37709 enum attr_memory memory;
37710
37711 if (INSN_CODE (insn) < 0)
37712 return disp_no_group;
37713 memory = get_attr_memory (insn);
37714 if (memory == MEMORY_STORE)
37715 return disp_store;
37716
37717 if (memory == MEMORY_LOAD)
37718 return disp_load;
37719
37720 if (memory == MEMORY_BOTH)
37721 return disp_load_store;
37722
37723 return disp_no_group;
37724 }
37725
37726 /* Return true if insn is a compare instruction. */
37727
37728 static bool
37729 is_cmp (rtx insn)
37730 {
37731 enum attr_type type;
37732
37733 type = get_attr_type (insn);
37734 return (type == TYPE_TEST
37735 || type == TYPE_ICMP
37736 || type == TYPE_FCMP
37737 || GET_CODE (PATTERN (insn)) == COMPARE);
37738 }
37739
37740 /* Return true if a dispatch violation was encountered. */
37741
37742 static bool
37743 dispatch_violation (void)
37744 {
37745 if (dispatch_window_list->next)
37746 return dispatch_window_list->next->violation;
37747 return dispatch_window_list->violation;
37748 }
37749
37750 /* Return true if insn is a branch instruction. */
37751
37752 static bool
37753 is_branch (rtx insn)
37754 {
37755 return (CALL_P (insn) || JUMP_P (insn));
37756 }
37757
37758 /* Return true if insn is a prefetch instruction. */
37759
37760 static bool
37761 is_prefetch (rtx insn)
37762 {
37763 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
37764 }
37765
37766 /* This function initializes a dispatch window and the list container holding a
37767 pointer to the window. */
37768
37769 static void
37770 init_window (int window_num)
37771 {
37772 int i;
37773 dispatch_windows *new_list;
37774
37775 if (window_num == 0)
37776 new_list = dispatch_window_list;
37777 else
37778 new_list = dispatch_window_list1;
37779
37780 new_list->num_insn = 0;
37781 new_list->num_uops = 0;
37782 new_list->window_size = 0;
37783 new_list->next = NULL;
37784 new_list->prev = NULL;
37785 new_list->window_num = window_num;
37786 new_list->num_imm = 0;
37787 new_list->num_imm_32 = 0;
37788 new_list->num_imm_64 = 0;
37789 new_list->imm_size = 0;
37790 new_list->num_loads = 0;
37791 new_list->num_stores = 0;
37792 new_list->violation = false;
37793
37794 for (i = 0; i < MAX_INSN; i++)
37795 {
37796 new_list->window[i].insn = NULL;
37797 new_list->window[i].group = disp_no_group;
37798 new_list->window[i].path = no_path;
37799 new_list->window[i].byte_len = 0;
37800 new_list->window[i].imm_bytes = 0;
37801 }
37802 return;
37803 }
37804
37805 /* This function allocates and initializes a dispatch window and the
37806 list container holding a pointer to the window. */
37807
37808 static dispatch_windows *
37809 allocate_window (void)
37810 {
37811 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
37812 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
37813
37814 return new_list;
37815 }
37816
37817 /* This routine initializes the dispatch scheduling information. It
37818 initiates building dispatch scheduler tables and constructs the
37819 first dispatch window. */
37820
37821 static void
37822 init_dispatch_sched (void)
37823 {
37824 /* Allocate a dispatch list and a window. */
37825 dispatch_window_list = allocate_window ();
37826 dispatch_window_list1 = allocate_window ();
37827 init_window (0);
37828 init_window (1);
37829 }
37830
37831 /* This function returns true if a branch is detected. End of a basic block
37832 does not have to be a branch, but here we assume only branches end a
37833 window. */
37834
37835 static bool
37836 is_end_basic_block (enum dispatch_group group)
37837 {
37838 return group == disp_branch;
37839 }
37840
37841 /* This function is called when the end of a window processing is reached. */
37842
37843 static void
37844 process_end_window (void)
37845 {
37846 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
37847 if (dispatch_window_list->next)
37848 {
37849 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
37850 gcc_assert (dispatch_window_list->window_size
37851 + dispatch_window_list1->window_size <= 48);
37852 init_window (1);
37853 }
37854 init_window (0);
37855 }
37856
37857 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
37858 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
37859 for 48 bytes of instructions. Note that these windows are not dispatch
37860 windows whose sizes are DISPATCH_WINDOW_SIZE. */
37861
37862 static dispatch_windows *
37863 allocate_next_window (int window_num)
37864 {
37865 if (window_num == 0)
37866 {
37867 if (dispatch_window_list->next)
37868 init_window (1);
37869 init_window (0);
37870 return dispatch_window_list;
37871 }
37872
37873 dispatch_window_list->next = dispatch_window_list1;
37874 dispatch_window_list1->prev = dispatch_window_list;
37875
37876 return dispatch_window_list1;
37877 }
37878
37879 /* Increment the number of immediate operands of an instruction. */
37880
37881 static int
37882 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
37883 {
37884 if (*in_rtx == 0)
37885 return 0;
37886
37887 switch (GET_CODE (*in_rtx))
37888 {
37889 case CONST:
37890 case SYMBOL_REF:
37891 case CONST_INT:
37892 (imm_values->imm)++;
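/* Immediates accepted by x86_64_immediate_operand in SImode count as
   32-bit immediates; everything else is counted as a 64-bit immediate. */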
37893 if (x86_64_immediate_operand (*in_rtx, SImode))
37894 (imm_values->imm32)++;
37895 else
37896 (imm_values->imm64)++;
37897 break;
37898
37899 case CONST_DOUBLE:
37900 (imm_values->imm)++;
37901 (imm_values->imm64)++;
37902 break;
37903
37904 case CODE_LABEL:
37905 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
37906 {
37907 (imm_values->imm)++;
37908 (imm_values->imm32)++;
37909 }
37910 break;
37911
37912 default:
37913 break;
37914 }
37915
37916 return 0;
37917 }
37918
37919 /* Compute number of immediate operands of an instruction. */
37920
37921 static void
37922 find_constant (rtx in_rtx, imm_info *imm_values)
37923 {
37924 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
37925 (rtx_function) find_constant_1, (void *) imm_values);
37926 }
37927
37928 /* Return total size of immediate operands of an instruction along with number
37929 of corresponding immediate-operands. It initializes its parameters to zero
37930 before calling FIND_CONSTANT.
37931 INSN is the input instruction. IMM is the total of immediates.
37932 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
37933 bit immediates. */
37934
37935 static int
37936 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
37937 {
37938 imm_info imm_values = {0, 0, 0};
37939
37940 find_constant (insn, &imm_values);
37941 *imm = imm_values.imm;
37942 *imm32 = imm_values.imm32;
37943 *imm64 = imm_values.imm64;
37944 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
37945 }
37946
37947 /* This function indicates whether an instruction has any immediate
37948 operands. */
37949
37950 static bool
37951 has_immediate (rtx insn)
37952 {
37953 int num_imm_operand;
37954 int num_imm32_operand;
37955 int num_imm64_operand;
37956
37957 if (insn)
37958 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
37959 &num_imm64_operand);
37960 return false;
37961 }
37962
37963 /* Return the decode path (single, double or multi uop) for an instruction. */
37964
37965 static enum insn_path
37966 get_insn_path (rtx insn)
37967 {
37968 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
37969
37970 if ((int)path == 0)
37971 return path_single;
37972
37973 if ((int)path == 1)
37974 return path_double;
37975
37976 return path_multi;
37977 }
37978
37979 /* Return insn dispatch group. */
37980
37981 static enum dispatch_group
37982 get_insn_group (rtx insn)
37983 {
37984 enum dispatch_group group = get_mem_group (insn);
37985 if (group)
37986 return group;
37987
37988 if (is_branch (insn))
37989 return disp_branch;
37990
37991 if (is_cmp (insn))
37992 return disp_cmp;
37993
37994 if (has_immediate (insn))
37995 return disp_imm;
37996
37997 if (is_prefetch (insn))
37998 return disp_prefetch;
37999
38000 return disp_no_group;
38001 }
38002
38003 /* Count number of GROUP restricted instructions in a dispatch
38004 window WINDOW_LIST. */
38005
38006 static int
38007 count_num_restricted (rtx insn, dispatch_windows *window_list)
38008 {
38009 enum dispatch_group group = get_insn_group (insn);
38010 int imm_size;
38011 int num_imm_operand;
38012 int num_imm32_operand;
38013 int num_imm64_operand;
38014
38015 if (group == disp_no_group)
38016 return 0;
38017
38018 if (group == disp_imm)
38019 {
38020 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38021 &num_imm64_operand);
38022 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
38023 || num_imm_operand + window_list->num_imm > MAX_IMM
38024 || (num_imm32_operand > 0
38025 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
38026 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
38027 || (num_imm64_operand > 0
38028 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
38029 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
38030 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
38031 && num_imm64_operand > 0
38032 && ((window_list->num_imm_64 > 0
38033 && window_list->num_insn >= 2)
38034 || window_list->num_insn >= 3)))
38035 return BIG;
38036
38037 return 1;
38038 }
38039
38040 if ((group == disp_load_store
38041 && (window_list->num_loads >= MAX_LOAD
38042 || window_list->num_stores >= MAX_STORE))
38043 || ((group == disp_load
38044 || group == disp_prefetch)
38045 && window_list->num_loads >= MAX_LOAD)
38046 || (group == disp_store
38047 && window_list->num_stores >= MAX_STORE))
38048 return BIG;
38049
38050 return 1;
38051 }
38052
38053 /* This function returns true if insn satisfies dispatch rules on the
38054 last window scheduled. */
38055
38056 static bool
38057 fits_dispatch_window (rtx insn)
38058 {
38059 dispatch_windows *window_list = dispatch_window_list;
38060 dispatch_windows *window_list_next = dispatch_window_list->next;
38061 unsigned int num_restrict;
38062 enum dispatch_group group = get_insn_group (insn);
38063 enum insn_path path = get_insn_path (insn);
38064 int sum;
38065
38066 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
38067 instructions should be given the lowest priority in the
38068 scheduling process in Haifa scheduler to make sure they will be
38069 scheduled in the same dispatch window as the reference to them. */
38070 if (group == disp_jcc || group == disp_cmp)
38071 return false;
38072
38073 /* Check nonrestricted. */
38074 if (group == disp_no_group || group == disp_branch)
38075 return true;
38076
38077 /* Get last dispatch window. */
38078 if (window_list_next)
38079 window_list = window_list_next;
38080
38081 if (window_list->window_num == 1)
38082 {
38083 sum = window_list->prev->window_size + window_list->window_size;
38084
38085 if (sum == 32
38086 || (min_insn_size (insn) + sum) >= 48)
38087 /* Window 1 is full. Go for next window. */
38088 return true;
38089 }
38090
38091 num_restrict = count_num_restricted (insn, window_list);
38092
38093 if (num_restrict > num_allowable_groups[group])
38094 return false;
38095
38096 /* See if it fits in the first window. */
38097 if (window_list->window_num == 0)
38098 {
38099 /* The first window should have only single and double path
38100 uops. */
38101 if (path == path_double
38102 && (window_list->num_uops + 2) > MAX_INSN)
38103 return false;
38104 else if (path != path_single)
38105 return false;
38106 }
38107 return true;
38108 }
38109
38110 /* Add an instruction INSN with NUM_UOPS micro-operations to the
38111 dispatch window WINDOW_LIST. */
38112
38113 static void
38114 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
38115 {
38116 int byte_len = min_insn_size (insn);
38117 int num_insn = window_list->num_insn;
38118 int imm_size;
38119 sched_insn_info *window = window_list->window;
38120 enum dispatch_group group = get_insn_group (insn);
38121 enum insn_path path = get_insn_path (insn);
38122 int num_imm_operand;
38123 int num_imm32_operand;
38124 int num_imm64_operand;
38125
38126 if (!window_list->violation && group != disp_cmp
38127 && !fits_dispatch_window (insn))
38128 window_list->violation = true;
38129
38130 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38131 &num_imm64_operand);
38132
38133 /* Initialize window with new instruction. */
38134 window[num_insn].insn = insn;
38135 window[num_insn].byte_len = byte_len;
38136 window[num_insn].group = group;
38137 window[num_insn].path = path;
38138 window[num_insn].imm_bytes = imm_size;
38139
38140 window_list->window_size += byte_len;
38141 window_list->num_insn = num_insn + 1;
38142 window_list->num_uops = window_list->num_uops + num_uops;
38143 window_list->imm_size += imm_size;
38144 window_list->num_imm += num_imm_operand;
38145 window_list->num_imm_32 += num_imm32_operand;
38146 window_list->num_imm_64 += num_imm64_operand;
38147
38148 if (group == disp_store)
38149 window_list->num_stores += 1;
38150 else if (group == disp_load
38151 || group == disp_prefetch)
38152 window_list->num_loads += 1;
38153 else if (group == disp_load_store)
38154 {
38155 window_list->num_stores += 1;
38156 window_list->num_loads += 1;
38157 }
38158 }
38159
38160 /* Adds a scheduled instruction, INSN, to the current dispatch window.
38161 If the total bytes of instructions or the number of instructions in
38162 the window exceed the allowable limits, it allocates a new window. */
38163
38164 static void
38165 add_to_dispatch_window (rtx insn)
38166 {
38167 int byte_len;
38168 dispatch_windows *window_list;
38169 dispatch_windows *next_list;
38170 dispatch_windows *window0_list;
38171 enum insn_path path;
38172 enum dispatch_group insn_group;
38173 bool insn_fits;
38174 int num_insn;
38175 int num_uops;
38176 int window_num;
38177 int insn_num_uops;
38178 int sum;
38179
38180 if (INSN_CODE (insn) < 0)
38181 return;
38182
38183 byte_len = min_insn_size (insn);
38184 window_list = dispatch_window_list;
38185 next_list = window_list->next;
38186 path = get_insn_path (insn);
38187 insn_group = get_insn_group (insn);
38188
38189 /* Get the last dispatch window. */
38190 if (next_list)
38191 window_list = dispatch_window_list->next;
38192
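/* Translate the decode path into a uop count; for path_multi the enum
   value (3) is used as the count. */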
38193 if (path == path_single)
38194 insn_num_uops = 1;
38195 else if (path == path_double)
38196 insn_num_uops = 2;
38197 else
38198 insn_num_uops = (int) path;
38199
38200 /* If current window is full, get a new window.
38201 Window number zero is full if MAX_INSN uops are scheduled in it.
38202 Window number one is full if window zero's bytes plus window
38203 one's bytes reach 32, if adding the bytes of the new instruction
38204 would push the total to 48 or more, or if it already has MAX_INSN
38205 instructions in it. */
38206 num_insn = window_list->num_insn;
38207 num_uops = window_list->num_uops;
38208 window_num = window_list->window_num;
38209 insn_fits = fits_dispatch_window (insn);
38210
38211 if (num_insn >= MAX_INSN
38212 || num_uops + insn_num_uops > MAX_INSN
38213 || !(insn_fits))
38214 {
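/* The current window cannot accept this insn; toggle to the other
   window (0 <-> 1) and continue filling there. */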
38215 window_num = ~window_num & 1;
38216 window_list = allocate_next_window (window_num);
38217 }
38218
38219 if (window_num == 0)
38220 {
38221 add_insn_window (insn, window_list, insn_num_uops);
38222 if (window_list->num_insn >= MAX_INSN
38223 && insn_group == disp_branch)
38224 {
38225 process_end_window ();
38226 return;
38227 }
38228 }
38229 else if (window_num == 1)
38230 {
38231 window0_list = window_list->prev;
38232 sum = window0_list->window_size + window_list->window_size;
38233 if (sum == 32
38234 || (byte_len + sum) >= 48)
38235 {
38236 process_end_window ();
38237 window_list = dispatch_window_list;
38238 }
38239
38240 add_insn_window (insn, window_list, insn_num_uops);
38241 }
38242 else
38243 gcc_unreachable ();
38244
38245 if (is_end_basic_block (insn_group))
38246 {
38247 /* End of basic block reached; do the end-of-basic-block processing. */
38248 process_end_window ();
38249 return;
38250 }
38251 }
38252
38253 /* Print the dispatch window, WINDOW_NUM, to FILE. */
38254
38255 DEBUG_FUNCTION static void
38256 debug_dispatch_window_file (FILE *file, int window_num)
38257 {
38258 dispatch_windows *list;
38259 int i;
38260
38261 if (window_num == 0)
38262 list = dispatch_window_list;
38263 else
38264 list = dispatch_window_list1;
38265
38266 fprintf (file, "Window #%d:\n", list->window_num);
38267 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
38268 list->num_insn, list->num_uops, list->window_size);
38269 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38270 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
38271
38272 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
38273 list->num_stores);
38274 fprintf (file, " insn info:\n");
38275
38276 for (i = 0; i < MAX_INSN; i++)
38277 {
38278 if (!list->window[i].insn)
38279 break;
38280 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
38281 i, group_name[list->window[i].group],
38282 i, (void *)list->window[i].insn,
38283 i, list->window[i].path,
38284 i, list->window[i].byte_len,
38285 i, list->window[i].imm_bytes);
38286 }
38287 }
38288
38289 /* Print to stdout a dispatch window. */
38290
38291 DEBUG_FUNCTION void
38292 debug_dispatch_window (int window_num)
38293 {
38294 debug_dispatch_window_file (stdout, window_num);
38295 }
38296
38297 /* Print INSN dispatch information to FILE. */
38298
38299 DEBUG_FUNCTION static void
38300 debug_insn_dispatch_info_file (FILE *file, rtx insn)
38301 {
38302 int byte_len;
38303 enum insn_path path;
38304 enum dispatch_group group;
38305 int imm_size;
38306 int num_imm_operand;
38307 int num_imm32_operand;
38308 int num_imm64_operand;
38309
38310 if (INSN_CODE (insn) < 0)
38311 return;
38312
38313 byte_len = min_insn_size (insn);
38314 path = get_insn_path (insn);
38315 group = get_insn_group (insn);
38316 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
38317 &num_imm64_operand);
38318
38319 fprintf (file, " insn info:\n");
38320 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
38321 group_name[group], path, byte_len);
38322 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
38323 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
38324 }
38325
38326 /* Print to STDOUT the status of the ready list with respect to
38327 dispatch windows. */
38328
38329 DEBUG_FUNCTION void
38330 debug_ready_dispatch (void)
38331 {
38332 int i;
38333 int no_ready = number_in_ready ();
38334
38335 fprintf (stdout, "Number of ready: %d\n", no_ready);
38336
38337 for (i = 0; i < no_ready; i++)
38338 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
38339 }
38340
38341 /* This routine is the driver of the dispatch scheduler. */
38342
38343 static void
38344 do_dispatch (rtx insn, int mode)
38345 {
38346 if (mode == DISPATCH_INIT)
38347 init_dispatch_sched ();
38348 else if (mode == ADD_TO_DISPATCH_WINDOW)
38349 add_to_dispatch_window (insn);
38350 }
38351
38352 /* Return TRUE if Dispatch Scheduling is supported. */
38353
38354 static bool
38355 has_dispatch (rtx insn, int action)
38356 {
38357 if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
38358 && flag_dispatch_scheduler)
38359 switch (action)
38360 {
38361 default:
38362 return false;
38363
38364 case IS_DISPATCH_ON:
38365 return true;
38366 break;
38367
38368 case IS_CMP:
38369 return is_cmp (insn);
38370
38371 case DISPATCH_VIOLATION:
38372 return dispatch_violation ();
38373
38374 case FITS_DISPATCH_WINDOW:
38375 return fits_dispatch_window (insn);
38376 }
38377
38378 return false;
38379 }
38380
38381 /* Implementation of reassociation_width target hook used by
38382 reassoc phase to identify parallelism level in reassociated
38383 tree. The statement's tree_code is passed in OPC. The arguments' type
38384 is passed in MODE.
38385
38386 Currently parallel reassociation is enabled for Atom
38387 processors only and we set reassociation width to be 2
38388 because Atom may issue up to 2 instructions per cycle.
38389
38390 Return value should be fixed if parallel reassociation is
38391 enabled for other processors. */
38392
38393 static int
38394 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
38395 enum machine_mode mode)
38396 {
38397 int res = 1;
38398
38399 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
38400 res = 2;
38401 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
38402 res = 2;
38403
38404 return res;
38405 }
38406
38407 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
38408 place emms and femms instructions. */
38409
38410 static enum machine_mode
38411 ix86_preferred_simd_mode (enum machine_mode mode)
38412 {
38413 if (!TARGET_SSE)
38414 return word_mode;
38415
38416 switch (mode)
38417 {
38418 case QImode:
38419 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
38420 case HImode:
38421 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
38422 case SImode:
38423 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
38424 case DImode:
38425 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
38426
38427 case SFmode:
38428 if (TARGET_AVX && !TARGET_PREFER_AVX128)
38429 return V8SFmode;
38430 else
38431 return V4SFmode;
38432
38433 case DFmode:
38434 if (!TARGET_VECTORIZE_DOUBLE)
38435 return word_mode;
38436 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
38437 return V4DFmode;
38438 else if (TARGET_SSE2)
38439 return V2DFmode;
38440 /* FALLTHRU */
38441
38442 default:
38443 return word_mode;
38444 }
38445 }
38446
38447 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
38448 vectors. */
38449
38450 static unsigned int
38451 ix86_autovectorize_vector_sizes (void)
38452 {
38453 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
38454 }
38455
38456 /* Initialize the GCC target structure. */
38457 #undef TARGET_RETURN_IN_MEMORY
38458 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
38459
38460 #undef TARGET_LEGITIMIZE_ADDRESS
38461 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
38462
38463 #undef TARGET_ATTRIBUTE_TABLE
38464 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
38465 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38466 # undef TARGET_MERGE_DECL_ATTRIBUTES
38467 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
38468 #endif
38469
38470 #undef TARGET_COMP_TYPE_ATTRIBUTES
38471 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
38472
38473 #undef TARGET_INIT_BUILTINS
38474 #define TARGET_INIT_BUILTINS ix86_init_builtins
38475 #undef TARGET_BUILTIN_DECL
38476 #define TARGET_BUILTIN_DECL ix86_builtin_decl
38477 #undef TARGET_EXPAND_BUILTIN
38478 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
38479
38480 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
38481 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
38482 ix86_builtin_vectorized_function
38483
38484 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
38485 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
38486
38487 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
38488 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
38489
38490 #undef TARGET_VECTORIZE_BUILTIN_GATHER
38491 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
38492
38493 #undef TARGET_BUILTIN_RECIPROCAL
38494 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
38495
38496 #undef TARGET_ASM_FUNCTION_EPILOGUE
38497 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
38498
38499 #undef TARGET_ENCODE_SECTION_INFO
38500 #ifndef SUBTARGET_ENCODE_SECTION_INFO
38501 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
38502 #else
38503 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
38504 #endif
38505
38506 #undef TARGET_ASM_OPEN_PAREN
38507 #define TARGET_ASM_OPEN_PAREN ""
38508 #undef TARGET_ASM_CLOSE_PAREN
38509 #define TARGET_ASM_CLOSE_PAREN ""
38510
38511 #undef TARGET_ASM_BYTE_OP
38512 #define TARGET_ASM_BYTE_OP ASM_BYTE
38513
38514 #undef TARGET_ASM_ALIGNED_HI_OP
38515 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
38516 #undef TARGET_ASM_ALIGNED_SI_OP
38517 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
38518 #ifdef ASM_QUAD
38519 #undef TARGET_ASM_ALIGNED_DI_OP
38520 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
38521 #endif
38522
38523 #undef TARGET_PROFILE_BEFORE_PROLOGUE
38524 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
38525
38526 #undef TARGET_ASM_UNALIGNED_HI_OP
38527 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
38528 #undef TARGET_ASM_UNALIGNED_SI_OP
38529 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
38530 #undef TARGET_ASM_UNALIGNED_DI_OP
38531 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
38532
38533 #undef TARGET_PRINT_OPERAND
38534 #define TARGET_PRINT_OPERAND ix86_print_operand
38535 #undef TARGET_PRINT_OPERAND_ADDRESS
38536 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
38537 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
38538 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
38539 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
38540 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
38541
38542 #undef TARGET_SCHED_INIT_GLOBAL
38543 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
38544 #undef TARGET_SCHED_ADJUST_COST
38545 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
38546 #undef TARGET_SCHED_ISSUE_RATE
38547 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
38548 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
38549 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
38550 ia32_multipass_dfa_lookahead
38551
38552 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
38553 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
38554
38555 #ifdef HAVE_AS_TLS
38556 #undef TARGET_HAVE_TLS
38557 #define TARGET_HAVE_TLS true
38558 #endif
38559 #undef TARGET_CANNOT_FORCE_CONST_MEM
38560 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
38561 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
38562 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
38563
38564 #undef TARGET_DELEGITIMIZE_ADDRESS
38565 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
38566
38567 #undef TARGET_MS_BITFIELD_LAYOUT_P
38568 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
38569
38570 #if TARGET_MACHO
38571 #undef TARGET_BINDS_LOCAL_P
38572 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
38573 #endif
38574 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38575 #undef TARGET_BINDS_LOCAL_P
38576 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
38577 #endif
38578
38579 #undef TARGET_ASM_OUTPUT_MI_THUNK
38580 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
38581 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
38582 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
38583
38584 #undef TARGET_ASM_FILE_START
38585 #define TARGET_ASM_FILE_START x86_file_start
38586
38587 #undef TARGET_OPTION_OVERRIDE
38588 #define TARGET_OPTION_OVERRIDE ix86_option_override
38589
38590 #undef TARGET_REGISTER_MOVE_COST
38591 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
38592 #undef TARGET_MEMORY_MOVE_COST
38593 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
38594 #undef TARGET_RTX_COSTS
38595 #define TARGET_RTX_COSTS ix86_rtx_costs
38596 #undef TARGET_ADDRESS_COST
38597 #define TARGET_ADDRESS_COST ix86_address_cost
38598
38599 #undef TARGET_FIXED_CONDITION_CODE_REGS
38600 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
38601 #undef TARGET_CC_MODES_COMPATIBLE
38602 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
38603
38604 #undef TARGET_MACHINE_DEPENDENT_REORG
38605 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
38606
38607 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
38608 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
38609
38610 #undef TARGET_BUILD_BUILTIN_VA_LIST
38611 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
38612
38613 #undef TARGET_ENUM_VA_LIST_P
38614 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
38615
38616 #undef TARGET_FN_ABI_VA_LIST
38617 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
38618
38619 #undef TARGET_CANONICAL_VA_LIST_TYPE
38620 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
38621
38622 #undef TARGET_EXPAND_BUILTIN_VA_START
38623 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
38624
38625 #undef TARGET_MD_ASM_CLOBBERS
38626 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
38627
38628 #undef TARGET_PROMOTE_PROTOTYPES
38629 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
38630 #undef TARGET_STRUCT_VALUE_RTX
38631 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
38632 #undef TARGET_SETUP_INCOMING_VARARGS
38633 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
38634 #undef TARGET_MUST_PASS_IN_STACK
38635 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
38636 #undef TARGET_FUNCTION_ARG_ADVANCE
38637 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
38638 #undef TARGET_FUNCTION_ARG
38639 #define TARGET_FUNCTION_ARG ix86_function_arg
38640 #undef TARGET_FUNCTION_ARG_BOUNDARY
38641 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
38642 #undef TARGET_PASS_BY_REFERENCE
38643 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
38644 #undef TARGET_INTERNAL_ARG_POINTER
38645 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
38646 #undef TARGET_UPDATE_STACK_BOUNDARY
38647 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
38648 #undef TARGET_GET_DRAP_RTX
38649 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
38650 #undef TARGET_STRICT_ARGUMENT_NAMING
38651 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
38652 #undef TARGET_STATIC_CHAIN
38653 #define TARGET_STATIC_CHAIN ix86_static_chain
38654 #undef TARGET_TRAMPOLINE_INIT
38655 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
38656 #undef TARGET_RETURN_POPS_ARGS
38657 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
38658
38659 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
38660 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
38661
38662 #undef TARGET_SCALAR_MODE_SUPPORTED_P
38663 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
38664
38665 #undef TARGET_VECTOR_MODE_SUPPORTED_P
38666 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
38667
38668 #undef TARGET_C_MODE_FOR_SUFFIX
38669 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
38670
38671 #ifdef HAVE_AS_TLS
38672 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
38673 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
38674 #endif
38675
38676 #ifdef SUBTARGET_INSERT_ATTRIBUTES
38677 #undef TARGET_INSERT_ATTRIBUTES
38678 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
38679 #endif
38680
38681 #undef TARGET_MANGLE_TYPE
38682 #define TARGET_MANGLE_TYPE ix86_mangle_type
38683
38684 #if !TARGET_MACHO
38685 #undef TARGET_STACK_PROTECT_FAIL
38686 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
38687 #endif
38688
38689 #undef TARGET_FUNCTION_VALUE
38690 #define TARGET_FUNCTION_VALUE ix86_function_value
38691
38692 #undef TARGET_FUNCTION_VALUE_REGNO_P
38693 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
38694
38695 #undef TARGET_PROMOTE_FUNCTION_MODE
38696 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
38697
38698 #undef TARGET_SECONDARY_RELOAD
38699 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
38700
38701 #undef TARGET_CLASS_MAX_NREGS
38702 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
38703
38704 #undef TARGET_PREFERRED_RELOAD_CLASS
38705 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
38706 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
38707 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
38708 #undef TARGET_CLASS_LIKELY_SPILLED_P
38709 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
38710
38711 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
38712 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
38713 ix86_builtin_vectorization_cost
38714 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
38715 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
38716 ix86_vectorize_vec_perm_const_ok
38717 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
38718 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
38719 ix86_preferred_simd_mode
38720 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
38721 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
38722 ix86_autovectorize_vector_sizes
38723
38724 #undef TARGET_SET_CURRENT_FUNCTION
38725 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
38726
38727 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
38728 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
38729
38730 #undef TARGET_OPTION_SAVE
38731 #define TARGET_OPTION_SAVE ix86_function_specific_save
38732
38733 #undef TARGET_OPTION_RESTORE
38734 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
38735
38736 #undef TARGET_OPTION_PRINT
38737 #define TARGET_OPTION_PRINT ix86_function_specific_print
38738
38739 #undef TARGET_CAN_INLINE_P
38740 #define TARGET_CAN_INLINE_P ix86_can_inline_p
38741
38742 #undef TARGET_EXPAND_TO_RTL_HOOK
38743 #define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
38744
38745 #undef TARGET_LEGITIMATE_ADDRESS_P
38746 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
38747
38748 #undef TARGET_LEGITIMATE_CONSTANT_P
38749 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
38750
38751 #undef TARGET_FRAME_POINTER_REQUIRED
38752 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
38753
38754 #undef TARGET_CAN_ELIMINATE
38755 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
38756
38757 #undef TARGET_EXTRA_LIVE_ON_ENTRY
38758 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
38759
38760 #undef TARGET_ASM_CODE_END
38761 #define TARGET_ASM_CODE_END ix86_code_end
38762
38763 #undef TARGET_CONDITIONAL_REGISTER_USAGE
38764 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
38765
38766 #if TARGET_MACHO
38767 #undef TARGET_INIT_LIBFUNCS
38768 #define TARGET_INIT_LIBFUNCS darwin_rename_builtins
38769 #endif
38770
38771 struct gcc_target targetm = TARGET_INITIALIZER;
38772 \f
38773 #include "gt-i386.h"